Files
the-biergarten-app/pipeline/src/main.cpp
2026-04-01 19:33:50 -04:00

155 lines
5.5 KiB
C++

/**
* @file main.cpp
* @brief Entry point for the brewery data pipeline
*
* Pipeline Overview:
* This is the main data processing pipeline that:
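* 1. Downloads the geographic JSON data from GitHub (cached after first run)
* 2. Initializes an in-memory SQLite database
* 3. Loads world city data from the JSON file (50k+ cities)
* 4. Initializes the brewery generation system (currently mocked)
* 5. Prints an overview of the loaded countries, states, and cities
* 6. Demonstrates brewery generation for sample cities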
*
* Architecture:
* ┌─────────────┐
* │ JSON File │ (countries+states+cities.json - 50k+ cities)
* └──────┬──────┘
* │
* ▼
* ┌─────────────────────┐
* │ JsonLoader::Load │ Parse and validate JSON
* └──────┬──────────────┘
* │
* ▼
* ┌─────────────────────┐
* │ SQLite Database │ Store cities in-memory
* └──────┬──────────────┘
* │
* ▼
* ┌─────────────────────┐
* │ BreweryGenerator │ Mock generation (hash-based)
* │ .GenerateBrewery() │ Future: LLM-based generation
* └─────────────────────┘
*
* Command Line Arguments:
* - argv[1]: Path to GGUF model file (default: ./model.gguf)
* - argv[2]: Path to cache directory for JSON downloads (default: /tmp)
* - argv[3]: Git commit hash for reproducible data version (default: c5eb7772)
*
* The pipeline automatically downloads the geographic data from GitHub on first
* run and caches it locally to avoid repeated network calls.
*
* Example Usage - Auto-download (stable 2026-03-28 build):
* @code
* ./brewery-pipeline ./llama-7b.gguf
* @endcode
*
* Example Usage - Custom cache directory and data version (`main` instead of
* the pinned default commit):
* @code
* ./brewery-pipeline ./llama-7b.gguf /tmp main
* @endcode
*
* Exit Codes:
* - 0: Pipeline completed successfully
* - 1: Pipeline failed (exception caught)
*/
#include "data_downloader.h"
#include "database.h"
#include "generator.h"
#include "json_loader.h"
#include <algorithm>
#include <cstddef>
#include <curl/curl.h>
#include <iostream>
#include <stdexcept>
#include <string>
namespace {

/**
 * @brief Scope guard that pairs curl_global_init() with curl_global_cleanup().
 *
 * main() declares this as the first local of its try block so that it is
 * destroyed last, i.e. only after every other local that might still use
 * libcurl has been destroyed.
 */
class CurlGlobalGuard {
public:
  /// @throws std::runtime_error if curl_global_init() reports a failure.
  CurlGlobalGuard() {
    // libcurl documents any non-zero return as "do not use other curl
    // functions", so continuing would be unsafe.
    if (curl_global_init(CURL_GLOBAL_DEFAULT) != 0) {
      throw std::runtime_error("curl_global_init failed");
    }
  }

  ~CurlGlobalGuard() { curl_global_cleanup(); }

  CurlGlobalGuard(const CurlGlobalGuard &) = delete;
  CurlGlobalGuard &operator=(const CurlGlobalGuard &) = delete;
};

} // namespace

/**
 * @brief Runs the brewery data pipeline end to end.
 *
 * Steps, in order: initialize libcurl, download (or reuse) the geographic
 * JSON file, initialize the database, load the JSON into it, load the
 * generator model, print an overview of the loaded data, and generate
 * breweries for up to five cities.
 *
 * @param argc Number of command-line arguments.
 * @param argv argv[1] = model path (default "./model.gguf"),
 *             argv[2] = cache directory (default "/tmp"),
 *             argv[3] = data version passed to the downloader
 *                       (default "c5eb7772").
 * @return 0 on success, 1 if any exception escapes the pipeline.
 */
int main(int argc, char *argv[]) {
  try {
    // Must be the first local: locals are destroyed in reverse order, so
    // curl_global_cleanup() now runs after `downloader` and the other
    // pipeline objects are gone rather than while they are still alive.
    const CurlGlobalGuard curlGuard;

    // Parse command-line arguments, falling back to the documented defaults.
    std::string modelPath = argc > 1 ? argv[1] : "./model.gguf";
    std::string cacheDir = argc > 2 ? argv[2] : "/tmp";
    std::string commit =
        argc > 3 ? argv[3] : "c5eb7772"; // Default: stable 2026-03-28

    // Construct cache path for downloaded JSON
    std::string jsonPath = cacheDir + "/countries+states+cities.json";

    // Step 0: Download geographic data from GitHub (cached locally).
    // The commit argument pins the data version for reproducibility.
    std::cout << "\n[Pipeline] Downloading geographic data from GitHub...\n";
    DataDownloader downloader;
    downloader.DownloadCountriesDatabase(jsonPath, commit);

    // Step 1: Initialize the database.
    SqliteDatabase db;
    std::cout << "Initializing in-memory SQLite database...\n";
    db.Initialize();

    // Step 2: Load the downloaded JSON file into the database.
    JsonLoader::LoadWorldCities(jsonPath, db);

    // Step 3: Initialize brewery generator.
    // NOTE(review): older comments describe a hash-based mock; the type used
    // here is LlamaBreweryGenerator with LoadModel() - confirm which is
    // current.
    std::cout << "Initializing brewery generator...\n";
    LlamaBreweryGenerator generator;
    generator.LoadModel(modelPath);

    // Step 4: Query geographic data from database
    std::cout << "\n=== GEOGRAPHIC DATA OVERVIEW ===\n";
    auto countries = db.QueryCountries(50);
    auto states = db.QueryStates(50);
    auto cities = db.QueryCities();

    // NOTE(review): a limit of 0 presumably means "no limit" - verify against
    // SqliteDatabase before relying on these totals.
    std::cout << "\nTotal records loaded:";
    std::cout << "\n Countries: " << db.QueryCountries(0).size();
    std::cout << "\n States: " << db.QueryStates(0).size();
    std::cout << "\n Cities: " << cities.size() << "\n";

    // Display the queried countries (at most 50 were requested).
    std::cout << "\n--- 50 COUNTRIES ---\n";
    for (size_t i = 0; i < countries.size(); i++) {
      std::cout << (i + 1) << ". " << countries[i].iso2 << " ("
                << countries[i].iso3 << ") " << countries[i].name << "\n";
    }

    // Display the queried states (at most 50 were requested).
    std::cout << "\n--- 50 STATES ---\n";
    for (size_t i = 0; i < states.size(); i++) {
      std::cout << (i + 1) << ". " << states[i].iso2 << ": " << states[i].name
                << "\n";
    }

    // Display the first 50 cities; the bound is computed once, not per
    // iteration.
    std::cout << "\n--- 50 CITIES ---\n";
    const size_t listedCities = std::min(size_t(50), cities.size());
    for (size_t i = 0; i < listedCities; i++) {
      std::cout << (i + 1) << ". " << cities[i].second << "\n";
    }

    // Step 5: Demonstrate brewery generation on sample cities
    std::cout << "\n=== SAMPLE BREWERY GENERATION ===\n\n";
    const size_t sampleCount = std::min(size_t(5), cities.size());
    for (size_t i = 0; i < sampleCount; i++) {
      // Only the name is needed; the id half of the pair is not used here.
      const auto &cityName = cities[i].second;
      auto brewery = generator.GenerateBrewery(cityName, i);
      std::cout << " " << cityName << ": " << brewery.name << "\n";
      std::cout << "" << brewery.description << "\n";
    }

    std::cout << "\n✓ Pipeline completed successfully\n";
    return 0;
  } catch (const std::exception &e) {
    std::cerr << "✗ Pipeline failed: " << e.what() << "\n";
    return 1;
  } catch (...) {
    // Without a matching handler the process would terminate, possibly
    // without unwinding (and therefore without libcurl cleanup).
    std::cerr << "✗ Pipeline failed: unknown exception\n";
    return 1;
  }
}