/**
 * @file main.cpp
 * @brief Entry point for the brewery data pipeline
 *
 * Pipeline Overview:
 * This is the main data processing pipeline that:
 *   0. Downloads (or reuses a cached copy of) the geographic JSON data
 *   1. Initializes an in-memory SQLite database
 *   2. Loads world city data from the JSON file (50k+ cities)
 *   3. Initializes the brewery generator and loads its model file
 *   4. Prints an overview of the loaded geographic data
 *   5. Demonstrates brewery generation for sample cities
 *
 * Architecture:
 *   ┌─────────────────────────────┐
 *   │ JSON File                   │  countries+states+cities.json
 *   └──────┬──────────────────────┘
 *          │
 *          ▼
 *   ┌─────────────────────────────┐
 *   │ JsonLoader::LoadWorldCities │  Parse JSON and populate the database
 *   └──────┬──────────────────────┘
 *          │
 *          ▼
 *   ┌─────────────────────────────┐
 *   │ SQLite Database             │  Store cities in-memory
 *   └──────┬──────────────────────┘
 *          │
 *          ▼
 *   ┌─────────────────────────────┐
 *   │ LlamaBreweryGenerator       │  LoadModel(), then
 *   │   .GenerateBrewery()        │  one brewery per sample city
 *   └─────────────────────────────┘
 *
 * NOTE(review): earlier revisions of this header described the generator as a
 * mock (hash-based) implementation. The code below instantiates
 * LlamaBreweryGenerator and loads a model file; confirm against generator.h
 * which of the two is current.
 *
 * Command Line Arguments:
 *   - argv[1]: Path to GGUF model file (default: ./model.gguf)
 *   - argv[2]: Path to cache directory for JSON downloads (default: /tmp)
 *   - argv[3]: Git commit hash for reproducible data version
 *              (default: c5eb7772)
 *
 * The pipeline automatically downloads the geographic data from GitHub on
 * first run and caches it locally to avoid repeated network calls.
 *
 * Example Usage - Auto-download (stable 2026-03-28 build):
 * @code
 * ./brewery-pipeline ./llama-7b.gguf
 * @endcode
 *
 * Example Usage - Custom commit:
 * @code
 * ./brewery-pipeline ./llama-7b.gguf /tmp main
 * @endcode
 *
 * Exit Codes:
 *   - 0: Pipeline completed successfully
 *   - 1: Pipeline failed (exception caught)
 */

#include "data_downloader.h"
#include "database.h"
#include "generator.h"
#include "json_loader.h"

#include <curl/curl.h>

#include <algorithm>
#include <cstddef>
#include <exception>
#include <iostream>
#include <stdexcept>
#include <string>

namespace {

/**
 * @brief Scoped owner of libcurl's process-wide state.
 *
 * The constructor calls curl_global_init() and throws if it reports an error;
 * the destructor calls curl_global_cleanup(). Declaring the guard before any
 * other pipeline object means it is destroyed after them, so global cleanup
 * never runs while one of those objects is still alive.
 *
 * @throws std::runtime_error from the constructor when curl_global_init()
 *         does not return CURLE_OK.
 */
class CurlGlobalGuard {
public:
  CurlGlobalGuard() {
    const CURLcode rc = curl_global_init(CURL_GLOBAL_DEFAULT);
    if (rc != CURLE_OK) {
      throw std::runtime_error(std::string("curl_global_init failed: ") +
                               curl_easy_strerror(rc));
    }
  }

  ~CurlGlobalGuard() { curl_global_cleanup(); }

  // Exactly one cleanup per successful init: the guard must not be copied.
  CurlGlobalGuard(const CurlGlobalGuard &) = delete;
  CurlGlobalGuard &operator=(const CurlGlobalGuard &) = delete;
};

} // namespace

/**
 * @brief Runs the pipeline end to end and reports the outcome.
 *
 * @param argc Number of command-line arguments.
 * @param argv Argument vector; see the file header for the meaning of
 *             argv[1]..argv[3] and their defaults.
 * @return 0 when every step completed, 1 when any step threw.
 */
int main(int argc, char *argv[]) {
  try {
    // Must be the first object in this scope so that it is the last one
    // destroyed (after the downloader, database and generator).
    const CurlGlobalGuard curlGuard;

    // Parse command-line arguments, falling back to the documented defaults.
    const std::string modelPath = argc > 1 ? argv[1] : "./model.gguf";
    const std::string cacheDir = argc > 2 ? argv[2] : "/tmp";
    const std::string commit =
        argc > 3 ? argv[3] : "c5eb7772"; // Default: stable 2026-03-28

    // Construct cache path for downloaded JSON
    const std::string jsonPath = cacheDir + "/countries+states+cities.json";

    // Step 0: Download geographic data from GitHub (cached locally)
    // On first run, downloads 45MB JSON. On subsequent runs, uses cached file.
    // Commit hash allows pinning to specific data versions for
    // reproducibility.
    std::cout << "\n[Pipeline] Downloading geographic data from GitHub...\n";
    DataDownloader downloader;
    downloader.DownloadCountriesDatabase(jsonPath, commit);

    // Step 1: Initialize empty in-memory database
    SqliteDatabase db;
    std::cout << "Initializing in-memory SQLite database...\n";
    db.Initialize();

    // Step 2: Load world city data from the JSON file into the database
    JsonLoader::LoadWorldCities(jsonPath, db);

    // Step 3: Initialize brewery generator and load the model file
    // NOTE(review): older comments called this a mock implementation using
    // deterministic hashing; confirm against generator.h.
    std::cout << "Initializing brewery generator...\n";
    LlamaBreweryGenerator generator;
    generator.LoadModel(modelPath);

    // Step 4: Query geographic data from database
    std::cout << "\n=== GEOGRAPHIC DATA OVERVIEW ===\n";

    const auto countries = db.QueryCountries(50);
    const auto states = db.QueryStates(50);
    const auto cities = db.QueryCities();

    // The totals re-query with an argument of 0, presumably meaning
    // "no limit" -- TODO confirm against database.h.
    std::cout << "\nTotal records loaded:";
    std::cout << "\n Countries: " << db.QueryCountries(0).size();
    std::cout << "\n States: " << db.QueryStates(0).size();
    std::cout << "\n Cities: " << cities.size() << "\n";

    // Display the queried countries (at most 50 were requested)
    std::cout << "\n--- 50 COUNTRIES ---\n";
    for (std::size_t i = 0; i < countries.size(); ++i) {
      std::cout << (i + 1) << ". " << countries[i].iso2 << " ("
                << countries[i].iso3 << ") " << countries[i].name << "\n";
    }

    // Display the queried states (at most 50 were requested)
    std::cout << "\n--- 50 STATES ---\n";
    for (std::size_t i = 0; i < states.size(); ++i) {
      std::cout << (i + 1) << ". " << states[i].iso2 << ": " << states[i].name
                << "\n";
    }

    // Display the first 50 cities (fewer if fewer were loaded)
    std::cout << "\n--- 50 CITIES ---\n";
    const std::size_t cityDisplayCount =
        std::min<std::size_t>(50, cities.size());
    for (std::size_t i = 0; i < cityDisplayCount; ++i) {
      std::cout << (i + 1) << ". " << cities[i].second << "\n";
    }

    // Step 5: Demonstrate brewery generation on sample cities
    std::cout << "\n=== SAMPLE BREWERY GENERATION ===\n\n";
    const std::size_t sampleCount = std::min<std::size_t>(5, cities.size());
    for (std::size_t i = 0; i < sampleCount; ++i) {
      const auto &[cityId, cityName] = cities[i];
      // The loop index is forwarded as the second argument; its meaning is
      // defined by GenerateBrewery (presumably a seed -- confirm).
      auto brewery = generator.GenerateBrewery(cityName, i);
      std::cout << " " << cityName << ": " << brewery.name << "\n";
      std::cout << " → " << brewery.description << "\n";
    }

    std::cout << "\n✓ Pipeline completed successfully\n";
    return 0;
  } catch (const std::exception &e) {
    std::cerr << "✗ Pipeline failed: " << e.what() << "\n";
    return 1;
  } catch (...) {
    // Anything not derived from std::exception would otherwise terminate the
    // process without producing the documented exit code.
    std::cerr << "✗ Pipeline failed: unknown error\n";
    return 1;
  }
}