From 52e23333044738f0b99cdc44d25ed50e389d58ff Mon Sep 17 00:00:00 2001 From: Aaron Po Date: Thu, 2 Apr 2026 18:27:01 -0400 Subject: [PATCH] Reorganize directory structure --- pipeline/CMakeLists.txt | 25 +-- pipeline/includes/application_options.h | 2 + pipeline/includes/biergarten_data_generator.h | 111 ++++++++++ .../{ => data_generation}/data_downloader.h | 2 +- .../{ => data_generation}/data_generator.h | 0 .../{ => data_generation}/llama_generator.h | 2 +- .../{ => data_generation}/mock_generator.h | 2 +- pipeline/includes/{ => database}/database.h | 0 .../{ => json_handling}/json_loader.h | 4 +- .../{ => json_handling}/stream_parser.h | 2 +- .../{ => web_client}/curl_web_client.h | 2 +- .../includes/{ => web_client}/web_client.h | 0 .../{ => wikipedia}/wikipedia_service.h | 2 +- pipeline/src/biergarten_data_generator.cpp | 132 ++++++++++++ .../{ => data_generation}/data_downloader.cpp | 4 +- .../{ => data_generation}/llama_generator.cpp | 2 +- .../{ => data_generation}/mock_generator.cpp | 2 +- pipeline/src/{ => database}/database.cpp | 2 +- .../src/{ => json_handling}/json_loader.cpp | 4 +- .../src/{ => json_handling}/stream_parser.cpp | 4 +- pipeline/src/main.cpp | 193 +++++------------- .../src/{ => web_client}/curl_web_client.cpp | 2 +- .../src/{ => wikipedia}/wikipedia_service.cpp | 2 +- 23 files changed, 330 insertions(+), 171 deletions(-) create mode 100644 pipeline/includes/application_options.h create mode 100644 pipeline/includes/biergarten_data_generator.h rename pipeline/includes/{ => data_generation}/data_downloader.h (95%) rename pipeline/includes/{ => data_generation}/data_generator.h (100%) rename pipeline/includes/{ => data_generation}/llama_generator.h (96%) rename pipeline/includes/{ => data_generation}/mock_generator.h (95%) rename pipeline/includes/{ => database}/database.h (100%) rename pipeline/includes/{ => json_handling}/json_loader.h (80%) rename pipeline/includes/{ => json_handling}/stream_parser.h (97%) rename pipeline/includes/{ => web_client}/curl_web_client.h (95%) rename pipeline/includes/{ => web_client}/web_client.h (100%) rename pipeline/includes/{ => wikipedia}/wikipedia_service.h (95%) create mode 100644 pipeline/src/biergarten_data_generator.cpp rename pipeline/src/{ => data_generation}/data_downloader.cpp (94%) rename pipeline/src/{ => data_generation}/llama_generator.cpp (99%) rename pipeline/src/{ => data_generation}/mock_generator.cpp (99%) rename pipeline/src/{ => database}/database.cpp (99%) rename pipeline/src/{ => json_handling}/json_loader.cpp (96%) rename pipeline/src/{ => json_handling}/stream_parser.cpp (99%) rename pipeline/src/{ => web_client}/curl_web_client.cpp (99%) rename pipeline/src/{ => wikipedia}/wikipedia_service.cpp (98%) diff --git a/pipeline/CMakeLists.txt b/pipeline/CMakeLists.txt index a33b9b9..2527116 100644 --- a/pipeline/CMakeLists.txt +++ b/pipeline/CMakeLists.txt @@ -78,14 +78,15 @@ endif() # Main Executable # ----------------------------------------------------------------------------- set(PIPELINE_SOURCES - src/curl_web_client.cpp - src/data_downloader.cpp - src/database.cpp - src/json_loader.cpp - src/llama_generator.cpp - src/mock_generator.cpp - src/stream_parser.cpp - src/wikipedia_service.cpp + src/biergarten_data_generator.cpp + src/web_client/curl_web_client.cpp + src/data_generation/data_downloader.cpp + src/database/database.cpp + src/json_handling/json_loader.cpp + src/data_generation/llama_generator.cpp + src/data_generation/mock_generator.cpp + src/json_handling/stream_parser.cpp + src/wikipedia/wikipedia_service.cpp src/main.cpp ) @@ -118,10 +119,10 @@ if(ENABLE_CLANG_FORMAT_TARGETS) find_program(CLANG_FORMAT_EXE NAMES clang-format) if(CLANG_FORMAT_EXE) file(GLOB_RECURSE FORMAT_SOURCES CONFIGURE_DEPENDS - ${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/*.cc - ${CMAKE_CURRENT_SOURCE_DIR}/includes/*.h - ${CMAKE_CURRENT_SOURCE_DIR}/includes/*.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/**/*.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/**/*.cc + ${CMAKE_CURRENT_SOURCE_DIR}/includes/**/*.h + ${CMAKE_CURRENT_SOURCE_DIR}/includes/**/*.hpp ) add_custom_target(format diff --git a/pipeline/includes/application_options.h b/pipeline/includes/application_options.h new file mode 100644 index 0000000..3f59c93 --- /dev/null +++ b/pipeline/includes/application_options.h @@ -0,0 +1,2 @@ +#pragma once + diff --git a/pipeline/includes/biergarten_data_generator.h b/pipeline/includes/biergarten_data_generator.h new file mode 100644 index 0000000..b2c75d3 --- /dev/null +++ b/pipeline/includes/biergarten_data_generator.h @@ -0,0 +1,111 @@ +#pragma once + +#include +#include +#include +#include + +#include "application_options.h" +#include "data_generation/data_generator.h" +#include "database/database.h" +#include "web_client/web_client.h" +#include "wikipedia/wikipedia_service.h" + + +/** + * @brief Program options for the Biergarten pipeline application. + */ +struct ApplicationOptions { + /// @brief Path to the LLM model file (gguf format). + std::string modelPath; + + /// @brief Directory for cached JSON and database files. + std::string cacheDir; + + /// @brief LLM sampling temperature (0.0 to 1.0, higher = more random). + float temperature = 0.8f; + + /// @brief LLM nucleus sampling top-p parameter (0.0 to 1.0, higher = more random). + float topP = 0.92f; + + /// @brief Random seed for sampling (-1 for random, otherwise non-negative). + int seed = -1; + + /// @brief Git commit hash for database consistency. + std::string commit = "c5eb7772"; +}; + + +/** + * @brief Main data generator class for the Biergarten pipeline. + * + * This class encapsulates the core logic for generating brewery data. + * It handles database initialization, data loading/downloading, and brewery generation. + */ +class BiergartenDataGenerator { +public: + /** + * @brief Construct a BiergartenDataGenerator with injected dependencies. + * + * @param options Application configuration options. + * @param webClient HTTP client for downloading data. + * @param database SQLite database instance. + */ + BiergartenDataGenerator(const ApplicationOptions &options, + std::shared_ptr webClient, + SqliteDatabase &database); + + /** + * @brief Run the data generation pipeline. + * + * Performs the following steps: + * 1. Initialize database + * 2. Download geographic data if needed + * 3. Initialize the generator (LLM or Mock) + * 4. Generate brewery data for sample cities + * + * @return 0 on success, 1 on failure. + */ + int Run(); + +private: + /// @brief Immutable application options. + const ApplicationOptions options_; + + /// @brief Shared HTTP client dependency. + std::shared_ptr webClient_; + + /// @brief Database dependency. + SqliteDatabase &database_; + + /** + * @brief Initialize the data generator based on options. + * + * Creates either a MockGenerator (if no model path) or LlamaGenerator. + * + * @return A unique_ptr to the initialized generator. + */ + std::unique_ptr InitializeGenerator(); + + /** + * @brief Download and load geographic data if not cached. + */ + void LoadGeographicData(); + + /** + * @brief Generate sample breweries for demonstration. + */ + void GenerateSampleBreweries(); + + /** + * @brief Helper struct to store generated brewery data. + */ + struct GeneratedBrewery { + int cityId; + std::string cityName; + BreweryResult brewery; + }; + + /// @brief Stores generated brewery data. + std::vector generatedBreweries_; +}; diff --git a/pipeline/includes/data_downloader.h b/pipeline/includes/data_generation/data_downloader.h similarity index 95% rename from pipeline/includes/data_downloader.h rename to pipeline/includes/data_generation/data_downloader.h index def79ec..a7c8148 100644 --- a/pipeline/includes/data_downloader.h +++ b/pipeline/includes/data_generation/data_downloader.h @@ -5,7 +5,7 @@ #include #include -#include "web_client.h" +#include "web_client/web_client.h" /// @brief Downloads and caches source geography JSON payloads. class DataDownloader { diff --git a/pipeline/includes/data_generator.h b/pipeline/includes/data_generation/data_generator.h similarity index 100% rename from pipeline/includes/data_generator.h rename to pipeline/includes/data_generation/data_generator.h diff --git a/pipeline/includes/llama_generator.h b/pipeline/includes/data_generation/llama_generator.h similarity index 96% rename from pipeline/includes/llama_generator.h rename to pipeline/includes/data_generation/llama_generator.h index a5d8c06..5b375b0 100644 --- a/pipeline/includes/llama_generator.h +++ b/pipeline/includes/data_generation/llama_generator.h @@ -3,7 +3,7 @@ #include #include -#include "data_generator.h" +#include "data_generation/data_generator.h" struct llama_model; struct llama_context; diff --git a/pipeline/includes/mock_generator.h b/pipeline/includes/data_generation/mock_generator.h similarity index 95% rename from pipeline/includes/mock_generator.h rename to pipeline/includes/data_generation/mock_generator.h index ca3f1d7..e4497ad 100644 --- a/pipeline/includes/mock_generator.h +++ b/pipeline/includes/data_generation/mock_generator.h @@ -1,6 +1,6 @@ #pragma once -#include "data_generator.h" +#include "data_generation/data_generator.h" #include #include diff --git a/pipeline/includes/database.h b/pipeline/includes/database/database.h similarity index 100% rename from pipeline/includes/database.h rename to pipeline/includes/database/database.h diff --git a/pipeline/includes/json_loader.h b/pipeline/includes/json_handling/json_loader.h similarity index 80% rename from pipeline/includes/json_loader.h rename to pipeline/includes/json_handling/json_loader.h index b85b863..10f1135 100644 --- a/pipeline/includes/json_loader.h +++ b/pipeline/includes/json_handling/json_loader.h @@ -1,7 +1,7 @@ #pragma once -#include "database.h" -#include "stream_parser.h" +#include "database/database.h" +#include "json_handling/stream_parser.h" #include /// @brief Loads world-city JSON data into SQLite through streaming parsing. diff --git a/pipeline/includes/stream_parser.h b/pipeline/includes/json_handling/stream_parser.h similarity index 97% rename from pipeline/includes/stream_parser.h rename to pipeline/includes/json_handling/stream_parser.h index 5977189..6552d1d 100644 --- a/pipeline/includes/stream_parser.h +++ b/pipeline/includes/json_handling/stream_parser.h @@ -1,6 +1,6 @@ #pragma once -#include "database.h" +#include "database/database.h" #include #include diff --git a/pipeline/includes/curl_web_client.h b/pipeline/includes/web_client/curl_web_client.h similarity index 95% rename from pipeline/includes/curl_web_client.h rename to pipeline/includes/web_client/curl_web_client.h index ae60cf6..51f50c0 100644 --- a/pipeline/includes/curl_web_client.h +++ b/pipeline/includes/web_client/curl_web_client.h @@ -1,6 +1,6 @@ #pragma once -#include "web_client.h" +#include "web_client/web_client.h" #include // RAII for curl_global_init/cleanup. diff --git a/pipeline/includes/web_client.h b/pipeline/includes/web_client/web_client.h similarity index 100% rename from pipeline/includes/web_client.h rename to pipeline/includes/web_client/web_client.h diff --git a/pipeline/includes/wikipedia_service.h b/pipeline/includes/wikipedia/wikipedia_service.h similarity index 95% rename from pipeline/includes/wikipedia_service.h rename to pipeline/includes/wikipedia/wikipedia_service.h index 55c1e32..343300f 100644 --- a/pipeline/includes/wikipedia_service.h +++ b/pipeline/includes/wikipedia/wikipedia_service.h @@ -5,7 +5,7 @@ #include #include -#include "web_client.h" +#include "web_client/web_client.h" /// @brief Provides cached Wikipedia summary lookups for city and country pairs. class WikipediaService { diff --git a/pipeline/src/biergarten_data_generator.cpp b/pipeline/src/biergarten_data_generator.cpp new file mode 100644 index 0000000..7fa8427 --- /dev/null +++ b/pipeline/src/biergarten_data_generator.cpp @@ -0,0 +1,132 @@ +#include "biergarten_data_generator.h" + +#include +#include +#include + +#include + +#include "data_generation/data_downloader.h" +#include "json_handling/json_loader.h" +#include "data_generation/llama_generator.h" +#include "data_generation/mock_generator.h" +#include "wikipedia/wikipedia_service.h" + +BiergartenDataGenerator::BiergartenDataGenerator( + const ApplicationOptions &options, + std::shared_ptr webClient, + SqliteDatabase &database) + : options_(options), webClient_(webClient), database_(database) {} + +std::unique_ptr BiergartenDataGenerator::InitializeGenerator() { + spdlog::info("Initializing brewery generator..."); + + std::unique_ptr generator; + if (options_.modelPath.empty()) { + generator = std::make_unique(); + spdlog::info("[Generator] Using MockGenerator (no model path provided)"); + } else { + auto llamaGenerator = std::make_unique(); + llamaGenerator->setSamplingOptions(options_.temperature, options_.topP, + options_.seed); + spdlog::info( + "[Generator] Using LlamaGenerator: {} (temperature={}, top-p={}, " + "seed={})", + options_.modelPath, options_.temperature, options_.topP, + options_.seed); + generator = std::move(llamaGenerator); + } + generator->load(options_.modelPath); + + return generator; +} + +void BiergartenDataGenerator::LoadGeographicData() { + std::string jsonPath = options_.cacheDir + "/countries+states+cities.json"; + std::string dbPath = options_.cacheDir + "/biergarten-pipeline.db"; + + bool hasJsonCache = std::filesystem::exists(jsonPath); + bool hasDbCache = std::filesystem::exists(dbPath); + + spdlog::info("Initializing SQLite database at {}...", dbPath); + database_.Initialize(dbPath); + + if (hasDbCache && hasJsonCache) { + spdlog::info("[Pipeline] Cache hit: skipping download and parse"); + } else { + spdlog::info("\n[Pipeline] Downloading geographic data from GitHub..."); + DataDownloader downloader(webClient_); + downloader.DownloadCountriesDatabase(jsonPath, options_.commit); + + JsonLoader::LoadWorldCities(jsonPath, database_); + } +} + +void BiergartenDataGenerator::GenerateSampleBreweries() { + auto generator = InitializeGenerator(); + WikipediaService wikipediaService(webClient_); + + spdlog::info("\n=== GEOGRAPHIC DATA OVERVIEW ==="); + + auto countries = database_.QueryCountries(50); + auto states = database_.QueryStates(50); + auto cities = database_.QueryCities(); + + // Build a quick map of country id -> name for per-city lookups. + auto allCountries = database_.QueryCountries(0); + std::unordered_map countryMap; + for (const auto &c : allCountries) + countryMap[c.id] = c.name; + + spdlog::info("\nTotal records loaded:"); + spdlog::info(" Countries: {}", database_.QueryCountries(0).size()); + spdlog::info(" States: {}", database_.QueryStates(0).size()); + spdlog::info(" Cities: {}", cities.size()); + + generatedBreweries_.clear(); + const size_t sampleCount = std::min(size_t(30), cities.size()); + + spdlog::info("\n=== SAMPLE BREWERY GENERATION ==="); + for (size_t i = 0; i < sampleCount; i++) { + const auto &city = cities[i]; + const int cityId = city.id; + const std::string cityName = city.name; + + std::string localCountry; + const auto countryIt = countryMap.find(city.countryId); + if (countryIt != countryMap.end()) { + localCountry = countryIt->second; + } + + const std::string regionContext = + wikipediaService.GetSummary(cityName, localCountry); + spdlog::debug("[Pipeline] Region context for {}: {}", cityName, + regionContext); + + auto brewery = + generator->generateBrewery(cityName, localCountry, regionContext); + generatedBreweries_.push_back({cityId, cityName, brewery}); + } + + spdlog::info("\n=== GENERATED DATA DUMP ==="); + for (size_t i = 0; i < generatedBreweries_.size(); i++) { + const auto &entry = generatedBreweries_[i]; + spdlog::info("{}. city_id={} city=\"{}\"", i + 1, entry.cityId, + entry.cityName); + spdlog::info(" brewery_name=\"{}\"", entry.brewery.name); + spdlog::info(" brewery_description=\"{}\"", entry.brewery.description); + } +} + +int BiergartenDataGenerator::Run() { + try { + LoadGeographicData(); + GenerateSampleBreweries(); + + spdlog::info("\nOK: Pipeline completed successfully"); + return 0; + } catch (const std::exception &e) { + spdlog::error("ERROR: Pipeline failed: {}", e.what()); + return 1; + } +} diff --git a/pipeline/src/data_downloader.cpp b/pipeline/src/data_generation/data_downloader.cpp similarity index 94% rename from pipeline/src/data_downloader.cpp rename to pipeline/src/data_generation/data_downloader.cpp index 7d141f0..a859d11 100644 --- a/pipeline/src/data_downloader.cpp +++ b/pipeline/src/data_generation/data_downloader.cpp @@ -1,5 +1,5 @@ -#include "data_downloader.h" -#include "web_client.h" +#include "data_generation/data_downloader.h" +#include "web_client/web_client.h" #include #include #include diff --git a/pipeline/src/llama_generator.cpp b/pipeline/src/data_generation/llama_generator.cpp similarity index 99% rename from pipeline/src/llama_generator.cpp rename to pipeline/src/data_generation/llama_generator.cpp index f736852..7c7f414 100644 --- a/pipeline/src/llama_generator.cpp +++ b/pipeline/src/data_generation/llama_generator.cpp @@ -11,7 +11,7 @@ #include #include -#include "llama_generator.h" +#include "data_generation/llama_generator.h" namespace { diff --git a/pipeline/src/mock_generator.cpp b/pipeline/src/data_generation/mock_generator.cpp similarity index 99% rename from pipeline/src/mock_generator.cpp rename to pipeline/src/data_generation/mock_generator.cpp index 126bc66..f777cb7 100644 --- a/pipeline/src/mock_generator.cpp +++ b/pipeline/src/data_generation/mock_generator.cpp @@ -1,4 +1,4 @@ -#include "mock_generator.h" +#include "data_generation/mock_generator.h" #include #include diff --git a/pipeline/src/database.cpp b/pipeline/src/database/database.cpp similarity index 99% rename from pipeline/src/database.cpp rename to pipeline/src/database/database.cpp index a8749d2..fcdc60a 100644 --- a/pipeline/src/database.cpp +++ b/pipeline/src/database/database.cpp @@ -1,4 +1,4 @@ -#include "database.h" +#include "database/database.h" #include #include diff --git a/pipeline/src/json_loader.cpp b/pipeline/src/json_handling/json_loader.cpp similarity index 96% rename from pipeline/src/json_loader.cpp rename to pipeline/src/json_handling/json_loader.cpp index 6a7a966..41f7076 100644 --- a/pipeline/src/json_loader.cpp +++ b/pipeline/src/json_handling/json_loader.cpp @@ -2,8 +2,8 @@ #include -#include "json_loader.h" -#include "stream_parser.h" +#include "json_handling/json_loader.h" +#include "json_handling/stream_parser.h" void JsonLoader::LoadWorldCities(const std::string &jsonPath, SqliteDatabase &db) { diff --git a/pipeline/src/stream_parser.cpp b/pipeline/src/json_handling/stream_parser.cpp similarity index 99% rename from pipeline/src/stream_parser.cpp rename to pipeline/src/json_handling/stream_parser.cpp index abf722d..cdb87e8 100644 --- a/pipeline/src/stream_parser.cpp +++ b/pipeline/src/json_handling/stream_parser.cpp @@ -5,8 +5,8 @@ #include #include -#include "database.h" -#include "stream_parser.h" +#include "database/database.h" +#include "json_handling/stream_parser.h" class CityRecordHandler { friend class boost::json::basic_parser; diff --git a/pipeline/src/main.cpp b/pipeline/src/main.cpp index 3f3c458..72ce570 100644 --- a/pipeline/src/main.cpp +++ b/pipeline/src/main.cpp @@ -1,163 +1,76 @@ -#include -#include #include #include -#include -#include #include #include -#include "curl_web_client.h" -#include "data_downloader.h" -#include "data_generator.h" -#include "database.h" -#include "json_loader.h" -#include "llama_generator.h" -#include "mock_generator.h" -#include "wikipedia_service.h" +#include "application_options.h" +#include "biergarten_data_generator.h" +#include "web_client/curl_web_client.h" +#include "database/database.h" namespace po = boost::program_options; +/** + * @brief Parse command-line arguments into ApplicationOptions. + * + * @param argc Command-line argument count. + * @param argv Command-line arguments. + * @param options Output ApplicationOptions struct. + * @return true if parsing succeeded and help was not requested, false otherwise. + */ +bool ParseArguments(int argc, char **argv, ApplicationOptions &options) { + po::options_description desc("Pipeline Options"); + desc.add_options()("help,h", "Produce help message")( + "model,m", po::value()->default_value(""), + "Path to LLM model (gguf)")( + "cache-dir,c", po::value()->default_value("/tmp"), + "Directory for cached JSON")( + "temperature", po::value()->default_value(0.8f), + "Sampling temperature (higher = more random)")( + "top-p", po::value()->default_value(0.92f), + "Nucleus sampling top-p in (0,1] (higher = more random)")( + "seed", po::value()->default_value(-1), + "Sampler seed: -1 for random, otherwise non-negative integer")( + "commit", po::value()->default_value("c5eb7772"), + "Git commit hash for DB consistency"); + + po::variables_map vm; + po::store(po::parse_command_line(argc, argv, desc), vm); + po::notify(vm); + + if (vm.count("help")) { + std::cout << desc << "\n"; + return false; + } + + options.modelPath = vm["model"].as(); + options.cacheDir = vm["cache-dir"].as(); + options.temperature = vm["temperature"].as(); + options.topP = vm["top-p"].as(); + options.seed = vm["seed"].as(); + options.commit = vm["commit"].as(); + + return true; +} + int main(int argc, char *argv[]) { try { const CurlGlobalState curl_state; - po::options_description desc("Pipeline Options"); - desc.add_options()("help,h", "Produce help message")( - "model,m", po::value()->default_value(""), - "Path to LLM model (gguf)")( - "cache-dir,c", po::value()->default_value("/tmp"), - "Directory for cached JSON")( - "temperature", po::value()->default_value(0.8f), - "Sampling temperature (higher = more random)")( - "top-p", po::value()->default_value(0.92f), - "Nucleus sampling top-p in (0,1] (higher = more random)")( - "seed", po::value()->default_value(-1), - "Sampler seed: -1 for random, otherwise non-negative integer")( - "commit", po::value()->default_value("c5eb7772"), - "Git commit hash for DB consistency"); - - po::variables_map vm; - po::store(po::parse_command_line(argc, argv, desc), vm); - po::notify(vm); - - if (vm.count("help")) { - std::cout << desc << "\n"; + ApplicationOptions options; + if (!ParseArguments(argc, argv, options)) { return 0; } - std::string modelPath = vm["model"].as(); - std::string cacheDir = vm["cache-dir"].as(); - float temperature = vm["temperature"].as(); - float topP = vm["top-p"].as(); - int seed = vm["seed"].as(); - std::string commit = vm["commit"].as(); - - std::string jsonPath = cacheDir + "/countries+states+cities.json"; - std::string dbPath = cacheDir + "/biergarten-pipeline.db"; - - bool hasJsonCache = std::filesystem::exists(jsonPath); - bool hasDbCache = std::filesystem::exists(dbPath); - auto webClient = std::make_shared(); + SqliteDatabase database; - SqliteDatabase db; - - spdlog::info("Initializing SQLite database at {}...", dbPath); - db.Initialize(dbPath); - - if (hasDbCache && hasJsonCache) { - spdlog::info("[Pipeline] Cache hit: skipping download and parse"); - } else { - spdlog::info("\n[Pipeline] Downloading geographic data from GitHub..."); - DataDownloader downloader(webClient); - downloader.DownloadCountriesDatabase(jsonPath, commit); - - JsonLoader::LoadWorldCities(jsonPath, db); - } - - spdlog::info("Initializing brewery generator..."); - std::unique_ptr generator; - if (modelPath.empty()) { - generator = std::make_unique(); - spdlog::info("[Generator] Using MockGenerator (no model path provided)"); - } else { - auto llamaGenerator = std::make_unique(); - llamaGenerator->setSamplingOptions(temperature, topP, seed); - spdlog::info( - "[Generator] Using LlamaGenerator: {} (temperature={}, top-p={}, " - "seed={})", - modelPath, temperature, topP, seed); - generator = std::move(llamaGenerator); - } - generator->load(modelPath); - - WikipediaService wikipediaService(webClient); - - spdlog::info("\n=== GEOGRAPHIC DATA OVERVIEW ==="); - - auto countries = db.QueryCountries(50); - auto states = db.QueryStates(50); - auto cities = db.QueryCities(); - - // Build a quick map of country id -> name for per-city lookups. - auto allCountries = db.QueryCountries(0); - std::unordered_map countryMap; - for (const auto &c : allCountries) - countryMap[c.id] = c.name; - - spdlog::info("\nTotal records loaded:"); - spdlog::info(" Countries: {}", db.QueryCountries(0).size()); - spdlog::info(" States: {}", db.QueryStates(0).size()); - spdlog::info(" Cities: {}", cities.size()); - - struct GeneratedBrewery { - int cityId; - std::string cityName; - BreweryResult brewery; - }; - - std::vector generatedBreweries; - const size_t sampleCount = std::min(size_t(30), cities.size()); - - spdlog::info("\n=== SAMPLE BREWERY GENERATION ==="); - for (size_t i = 0; i < sampleCount; i++) { - const auto &city = cities[i]; - const int cityId = city.id; - const std::string cityName = city.name; - - std::string localCountry; - const auto countryIt = countryMap.find(city.countryId); - if (countryIt != countryMap.end()) { - localCountry = countryIt->second; - } - - const std::string regionContext = - wikipediaService.GetSummary(cityName, localCountry); - spdlog::debug("[Pipeline] Region context for {}: {}", cityName, - regionContext); - - auto brewery = - generator->generateBrewery(cityName, localCountry, regionContext); - generatedBreweries.push_back({cityId, cityName, brewery}); - } - - spdlog::info("\n=== GENERATED DATA DUMP ==="); - for (size_t i = 0; i < generatedBreweries.size(); i++) { - const auto &entry = generatedBreweries[i]; - spdlog::info("{}. city_id={} city=\"{}\"", i + 1, entry.cityId, - entry.cityName); - spdlog::info(" brewery_name=\"{}\"", entry.brewery.name); - spdlog::info(" brewery_description=\"{}\"", entry.brewery.description); - } - - spdlog::info("\nOK: Pipeline completed successfully"); - - return 0; + BiergartenDataGenerator generator(options, webClient, database); + return generator.Run(); } catch (const std::exception &e) { - spdlog::error("ERROR: Pipeline failed: {}", e.what()); + spdlog::error("ERROR: Application failed: {}", e.what()); return 1; } } diff --git a/pipeline/src/curl_web_client.cpp b/pipeline/src/web_client/curl_web_client.cpp similarity index 99% rename from pipeline/src/curl_web_client.cpp rename to pipeline/src/web_client/curl_web_client.cpp index 2201dec..df08780 100644 --- a/pipeline/src/curl_web_client.cpp +++ b/pipeline/src/web_client/curl_web_client.cpp @@ -1,4 +1,4 @@ -#include "curl_web_client.h" +#include "web_client/curl_web_client.h" #include #include #include diff --git a/pipeline/src/wikipedia_service.cpp b/pipeline/src/wikipedia/wikipedia_service.cpp similarity index 98% rename from pipeline/src/wikipedia_service.cpp rename to pipeline/src/wikipedia/wikipedia_service.cpp index 29b3092..e1bd82d 100644 --- a/pipeline/src/wikipedia_service.cpp +++ b/pipeline/src/wikipedia/wikipedia_service.cpp @@ -1,4 +1,4 @@ -#include "wikipedia_service.h" +#include "wikipedia/wikipedia_service.h" #include #include