diff --git a/pipeline/CMakeLists.txt b/pipeline/CMakeLists.txt index 4872066..0ad101f 100644 --- a/pipeline/CMakeLists.txt +++ b/pipeline/CMakeLists.txt @@ -89,7 +89,27 @@ FetchContent_MakeAvailable(spdlog) # ============================================================================= set(SOURCES src/main.cpp - src/biergarten_data_generator.cpp + # BiergartenDataGenerator methods + src/biergarten_data_generator/constructor.cpp + src/biergarten_data_generator/run.cpp + src/biergarten_data_generator/initialize_generator.cpp + src/biergarten_data_generator/query_cities_with_countries.cpp + src/biergarten_data_generator/enrich_with_wikipedia.cpp + src/biergarten_data_generator/generate_breweries.cpp + src/biergarten_data_generator/log_results.cpp + # WikipediaService methods + src/wikipedia/constructor.cpp + src/wikipedia/get_summary.cpp + src/wikipedia/fetch_extract.cpp + # CURLWebClient and CurlGlobalState methods + src/web_client/curl_global_state_constructor.cpp + src/web_client/curl_global_state_destructor.cpp + src/web_client/curl_web_client_constructor.cpp + src/web_client/curl_web_client_destructor.cpp + src/web_client/curl_web_client_download_to_file.cpp + src/web_client/curl_web_client_get.cpp + src/web_client/curl_web_client_url_encode.cpp + # Data generation modules src/data_generation/llama/destructor.cpp src/data_generation/llama/generate_brewery.cpp src/data_generation/llama/generate_user.cpp @@ -104,8 +124,6 @@ set(SOURCES src/data_generation/mock/generate_user.cpp src/data_generation/mock/load.cpp src/json_handling/json_loader.cpp - src/web_client/curl_web_client.cpp - src/wikipedia/wikipedia_service.cpp ) # ============================================================================= # 5. Target diff --git a/pipeline/includes/biergarten_data_generator.h b/pipeline/includes/biergarten_data_generator.h index 7cab868..f3ab31a 100644 --- a/pipeline/includes/biergarten_data_generator.h +++ b/pipeline/includes/biergarten_data_generator.h @@ -27,9 +27,6 @@ struct ApplicationOptions { /// model_path. bool use_mocked = false; - /// @brief Directory for cached JSON and database files. - std::string cache_dir; - /// @brief LLM sampling temperature (0.0 to 1.0, higher = more random). float temperature = 0.8f; @@ -43,10 +40,6 @@ struct ApplicationOptions { /// @brief Random seed for sampling (-1 for random, otherwise non-negative). int seed = -1; - - /// @brief Git commit hash for database consistency (always pinned to - /// c5eb7772). - std::string commit = "c5eb7772"; }; /** @@ -63,8 +56,8 @@ class BiergartenDataGenerator { * @param options Application configuration options. * @param web_client HTTP client for downloading data. */ - BiergartenDataGenerator(const ApplicationOptions& options, - std::shared_ptr web_client); + BiergartenDataGenerator(ApplicationOptions options, + std::unique_ptr web_client); /** * @brief Run the data generation pipeline. @@ -74,9 +67,9 @@ class BiergartenDataGenerator { * 2. Initialize the generator (LLM or Mock) * 3. Generate brewery data for sampled cities * - * @return 0 on success, 1 on failure. + * @return true if successful, false if not */ - int Run(); + bool Run(); private: /// @brief Immutable application options. @@ -100,14 +93,14 @@ class BiergartenDataGenerator { * * @return A unique_ptr to the initialized generator. */ - std::unique_ptr InitializeGenerator(); + std::unique_ptr InitializeGenerator() const; /** * @brief Load locations from JSON and sample cities. * * @return Vector of sampled locations capped at 30 entries. */ - std::vector QueryCitiesWithCountries(); + static std::vector QueryCitiesWithCountries(); /** * @brief Enrich cities with Wikipedia summaries. diff --git a/pipeline/includes/wikipedia/wikipedia_service.h b/pipeline/includes/wikipedia/wikipedia_service.h index 4a21962..5c5aa1b 100644 --- a/pipeline/includes/wikipedia/wikipedia_service.h +++ b/pipeline/includes/wikipedia/wikipedia_service.h @@ -24,7 +24,7 @@ class WikipediaService { std::string_view country); private: - std::string FetchExtract(std::string_view query); + std::string FetchExtract(std::string_view query) const; std::shared_ptr client_; std::unordered_map cache_; }; diff --git a/pipeline/src/biergarten_data_generator.cpp b/pipeline/src/biergarten_data_generator.cpp deleted file mode 100644 index b00730c..0000000 --- a/pipeline/src/biergarten_data_generator.cpp +++ /dev/null @@ -1,168 +0,0 @@ -/** - * @file biergarten_data_generator.cpp - * @brief Orchestrates end-to-end pipeline execution for city sampling, - * Wikipedia enrichment, generator initialization, and brewery result output. - */ - -#include "biergarten_data_generator.h" - -#include - -#include -#include -#include -#include -#include - -#include "data_generation/llama_generator.h" -#include "data_generation/mock_generator.h" -#include "json_handling/json_loader.h" -#include "wikipedia/wikipedia_service.h" - -BiergartenDataGenerator::BiergartenDataGenerator( - const ApplicationOptions& options, std::shared_ptr web_client) - : options_(options), webClient_(std::move(web_client)) {} - -auto BiergartenDataGenerator::InitializeGenerator() - -> std::unique_ptr { - spdlog::info("Initializing brewery generator..."); - - std::unique_ptr generator; - if (options_.model_path.empty()) { - generator = std::make_unique(); - spdlog::info("[Generator] Using MockGenerator (no model path provided)"); - } else { - auto llama_generator = std::make_unique(); - llama_generator->SetSamplingOptions(options_.temperature, options_.top_p, - options_.seed); - llama_generator->SetContextSize(options_.n_ctx); - spdlog::info( - "[Generator] Using LlamaGenerator: {} (temperature={}, top-p={}, " - "n_ctx={}, seed={})", - options_.model_path, options_.temperature, options_.top_p, - options_.n_ctx, options_.seed); - generator = std::move(llama_generator); - } - generator->Load(options_.model_path); - - return generator; -} - -auto BiergartenDataGenerator::QueryCitiesWithCountries() - -> std::vector { - spdlog::info("\n=== GEOGRAPHIC DATA OVERVIEW ==="); - - std::filesystem::path locations_path = "locations.json"; - if (!std::filesystem::exists(locations_path)) { - const std::filesystem::path cache_path = - std::filesystem::path(options_.cache_dir) / "locations.json"; - if (std::filesystem::exists(cache_path)) { - locations_path = cache_path; - } - } - - auto all_locations = JsonLoader::LoadLocations(locations_path.string()); - spdlog::info(" Locations available: {}", all_locations.size()); - - const size_t sample_count = std::min(4, all_locations.size()); - std::vector sampled_locations; - sampled_locations.reserve(sample_count); - - std::random_device random_generator; - std::sample(all_locations.begin(), all_locations.end(), - std::back_inserter(sampled_locations), sample_count, - random_generator); - - spdlog::info(" Sampled locations: {}", sampled_locations.size()); - return sampled_locations; -} - -auto BiergartenDataGenerator::EnrichWithWikipedia( - const std::vector& cities) -> std::vector { - std::vector enriched; - enriched.reserve(cities.size()); - - std::vector> pending; - pending.reserve(cities.size()); - - for (const auto& city : cities) { - pending.push_back( - std::async(std::launch::async, [web_client = webClient_, city]() { - WikipediaService wikipedia_service(web_client); - const std::string region_context = - wikipedia_service.GetSummary(city.city, city.country); - spdlog::debug("[Pipeline] Region context for {}: {}", city.city, - region_context); - return EnrichedCity{city, region_context}; - })); - } - - for (auto& task : pending) { - enriched.push_back(task.get()); - } - - return enriched; -} - -void BiergartenDataGenerator::GenerateBreweries( - DataGenerator& generator, const std::vector& cities) { - spdlog::info("\n=== SAMPLE BREWERY GENERATION ==="); - generatedBreweries_.clear(); - - size_t skipped_count = 0; - - for (const auto& enriched_city : cities) { - try { - auto brewery = generator.GenerateBrewery( - enriched_city.location.city, enriched_city.location.country, - enriched_city.region_context); - generatedBreweries_.push_back({enriched_city.location, brewery}); - } catch (const std::exception& e) { - ++skipped_count; - spdlog::warn( - "[Pipeline] Skipping city '{}' ({}): brewery generation failed: " - "{}", - enriched_city.location.city, enriched_city.location.country, - e.what()); - } - } - - if (skipped_count > 0) { - spdlog::warn( - "[Pipeline] Skipped {} city/cities due to generation " - "errors", - skipped_count); - } -} - -void BiergartenDataGenerator::LogResults() const { - spdlog::info("\n=== GENERATED DATA DUMP ==="); - size_t index = 1; - for (const auto& entry : generatedBreweries_) { - spdlog::info( - "{}. city=\"{}\" country=\"{}\" state=\"{}\" " - "iso3166_2={} lat={} lon={}", - index, entry.location.city, entry.location.country, - entry.location.state_province, entry.location.iso3166_2, - entry.location.latitude, entry.location.longitude); - spdlog::info(" brewery_name=\"{}\"", entry.brewery.name); - spdlog::info(" brewery_description=\"{}\"", entry.brewery.description); - ++index; - } -} - -auto BiergartenDataGenerator::Run() -> int { - try { - auto generator = InitializeGenerator(); - auto cities = QueryCitiesWithCountries(); - auto enriched = EnrichWithWikipedia(cities); - GenerateBreweries(*generator, enriched); - LogResults(); - - spdlog::info("\nOK: Pipeline completed successfully"); - return 0; - } catch (const std::exception& e) { - spdlog::error("ERROR: Pipeline failed: {}", e.what()); - return 1; - } -} diff --git a/pipeline/src/biergarten_data_generator/constructor.cpp b/pipeline/src/biergarten_data_generator/constructor.cpp new file mode 100644 index 0000000..51f3e10 --- /dev/null +++ b/pipeline/src/biergarten_data_generator/constructor.cpp @@ -0,0 +1,12 @@ +/** + * @file biergarten_data_generator/constructor.cpp + * @brief BiergartenDataGenerator constructor implementation. + */ + +#include + +#include "biergarten_data_generator.h" + +BiergartenDataGenerator::BiergartenDataGenerator( + ApplicationOptions options, std::unique_ptr web_client) + : options_(std::move(options)), webClient_(std::move(web_client)) {} diff --git a/pipeline/src/biergarten_data_generator/enrich_with_wikipedia.cpp b/pipeline/src/biergarten_data_generator/enrich_with_wikipedia.cpp new file mode 100644 index 0000000..82d9e85 --- /dev/null +++ b/pipeline/src/biergarten_data_generator/enrich_with_wikipedia.cpp @@ -0,0 +1,69 @@ +/** + * @file biergarten_data_generator/enrich_with_wikipedia.cpp + * @brief BiergartenDataGenerator::EnrichWithWikipedia() implementation. + */ + +#include + +#include +#include +#include + +#include "biergarten_data_generator.h" +#include "wikipedia/wikipedia_service.h" + +namespace { + +auto TryGetRegionContext(const std::shared_ptr& web_client, + const Location* city_ptr, + std::atomic* skipped_enrichment_count) noexcept + -> std::optional { + try { + WikipediaService wikipedia_service(web_client); + return wikipedia_service.GetSummary(city_ptr->city, city_ptr->country); + } catch (...) { + skipped_enrichment_count->fetch_add(1, std::memory_order_relaxed); + return std::nullopt; + } +} + +} // namespace + +auto BiergartenDataGenerator::EnrichWithWikipedia( + const std::vector& cities) -> std::vector { + std::vector enriched; + enriched.reserve(cities.size()); + + std::atomic skipped_enrichment_count = 0; + std::vector>> pending; + pending.reserve(cities.size()); + + for (const auto& city : cities) { + const Location* city_ptr = &city; + pending.push_back(std::async(std::launch::async, TryGetRegionContext, + webClient_, city_ptr, + &skipped_enrichment_count)); + } + + auto city_it = cities.cbegin(); + for (auto& task : pending) { + auto maybe_region_context = task.get(); + if (maybe_region_context.has_value()) { + spdlog::debug("[Pipeline] Region context for {}: {}", city_it->city, + *maybe_region_context); + enriched.push_back( + EnrichedCity{.location = *city_it, + .region_context = std::move(*maybe_region_context)}); + } + ++city_it; + } + + if (skipped_enrichment_count.load(std::memory_order_relaxed) > 0) { + spdlog::warn( + "[Pipeline] Skipped {} city/cities due to Wikipedia enrichment " + "errors", + skipped_enrichment_count.load(std::memory_order_relaxed)); + } + + return enriched; +} diff --git a/pipeline/src/biergarten_data_generator/generate_breweries.cpp b/pipeline/src/biergarten_data_generator/generate_breweries.cpp new file mode 100644 index 0000000..904b04a --- /dev/null +++ b/pipeline/src/biergarten_data_generator/generate_breweries.cpp @@ -0,0 +1,40 @@ +/** + * @file biergarten_data_generator/generate_breweries.cpp + * @brief BiergartenDataGenerator::GenerateBreweries() implementation. + */ + +#include + +#include "biergarten_data_generator.h" + +void BiergartenDataGenerator::GenerateBreweries( + DataGenerator& generator, const std::vector& cities) { + spdlog::info("\n=== SAMPLE BREWERY GENERATION ==="); + generatedBreweries_.clear(); + + size_t skipped_count = 0; + + for (const auto& enriched_city : cities) { + try { + auto brewery = generator.GenerateBrewery( + enriched_city.location.city, enriched_city.location.country, + enriched_city.region_context); + generatedBreweries_.push_back(GeneratedBrewery{ + .location = enriched_city.location, .brewery = brewery}); + } catch (const std::exception& e) { + ++skipped_count; + spdlog::warn( + "[Pipeline] Skipping city '{}' ({}): brewery generation failed: " + "{}", + enriched_city.location.city, enriched_city.location.country, + e.what()); + } + } + + if (skipped_count > 0) { + spdlog::warn( + "[Pipeline] Skipped {} city/cities due to generation " + "errors", + skipped_count); + } +} diff --git a/pipeline/src/biergarten_data_generator/initialize_generator.cpp b/pipeline/src/biergarten_data_generator/initialize_generator.cpp new file mode 100644 index 0000000..00b5c46 --- /dev/null +++ b/pipeline/src/biergarten_data_generator/initialize_generator.cpp @@ -0,0 +1,35 @@ +/** + * @file biergarten_data_generator/initialize_generator.cpp + * @brief BiergartenDataGenerator::InitializeGenerator() implementation. + */ + +#include + +#include "biergarten_data_generator.h" +#include "data_generation/llama_generator.h" +#include "data_generation/mock_generator.h" + +auto BiergartenDataGenerator::InitializeGenerator() const + -> std::unique_ptr { + spdlog::info("Initializing brewery generator..."); + + std::unique_ptr generator; + if (options_.model_path.empty()) { + generator = std::make_unique(); + spdlog::info("[Generator] Using MockGenerator (no model path provided)"); + } else { + auto llama_generator = std::make_unique(); + llama_generator->SetSamplingOptions(options_.temperature, options_.top_p, + options_.seed); + llama_generator->SetContextSize(options_.n_ctx); + spdlog::info( + "[Generator] Using LlamaGenerator: {} (temperature={}, top-p={}, " + "n_ctx={}, seed={})", + options_.model_path, options_.temperature, options_.top_p, + options_.n_ctx, options_.seed); + generator = std::move(llama_generator); + } + generator->Load(options_.model_path); + + return generator; +} diff --git a/pipeline/src/biergarten_data_generator/log_results.cpp b/pipeline/src/biergarten_data_generator/log_results.cpp new file mode 100644 index 0000000..041fb3a --- /dev/null +++ b/pipeline/src/biergarten_data_generator/log_results.cpp @@ -0,0 +1,23 @@ +/** + * @file biergarten_data_generator/log_results.cpp + * @brief BiergartenDataGenerator::LogResults() implementation. + */ + +#include + +#include "biergarten_data_generator.h" + +void BiergartenDataGenerator::LogResults() const { + spdlog::info("\n=== GENERATED DATA DUMP ==="); + size_t index = 1; + for (const auto& [location, brewery] : generatedBreweries_) { + spdlog::info( + "{}. city=\"{}\" country=\"{}\" state=\"{}\" " + "iso3166_2={} lat={} lon={}", + index, location.city, location.country, location.state_province, + location.iso3166_2, location.latitude, location.longitude); + spdlog::info(" brewery_name=\"{}\"", brewery.name); + spdlog::info(" brewery_description=\"{}\"", brewery.description); + ++index; + } +} diff --git a/pipeline/src/biergarten_data_generator/query_cities_with_countries.cpp b/pipeline/src/biergarten_data_generator/query_cities_with_countries.cpp new file mode 100644 index 0000000..8b3972c --- /dev/null +++ b/pipeline/src/biergarten_data_generator/query_cities_with_countries.cpp @@ -0,0 +1,40 @@ +/** + * @file biergarten_data_generator/query_cities_with_countries.cpp + * @brief BiergartenDataGenerator::QueryCitiesWithCountries() implementation. + */ + +#include + +#include +#include +#include + +#include "biergarten_data_generator.h" +#include "json_handling/json_loader.h" + +static constexpr unsigned int brewery_amount = 4; + +auto BiergartenDataGenerator::QueryCitiesWithCountries() + -> std::vector { + spdlog::info("\n=== GEOGRAPHIC DATA OVERVIEW ==="); + + const std::filesystem::path locations_path = "locations.json"; + + auto all_locations = JsonLoader::LoadLocations(locations_path.string()); + spdlog::info(" Locations available: {}", all_locations.size()); + + const size_t sample_count = + std::min(brewery_amount, all_locations.size()); + const auto sample_count_signed = + static_cast>( + sample_count); + std::vector sampled_locations; + sampled_locations.reserve(sample_count); + + std::random_device random_generator; + std::ranges::sample(all_locations, std::back_inserter(sampled_locations), + sample_count_signed, random_generator); + + spdlog::info(" Sampled locations: {}", sampled_locations.size()); + return sampled_locations; +} diff --git a/pipeline/src/biergarten_data_generator/run.cpp b/pipeline/src/biergarten_data_generator/run.cpp new file mode 100644 index 0000000..e5ca88c --- /dev/null +++ b/pipeline/src/biergarten_data_generator/run.cpp @@ -0,0 +1,22 @@ +/** + * @file biergarten_data_generator/run.cpp + * @brief BiergartenDataGenerator::Run() implementation. + */ + +#include + +#include "biergarten_data_generator.h" + +auto BiergartenDataGenerator::Run() -> bool { + try { + const std::unique_ptr generator = InitializeGenerator(); + const std::vector cities = QueryCitiesWithCountries(); + const std::vector enriched = EnrichWithWikipedia(cities); + this->GenerateBreweries(*generator, enriched); + this->LogResults(); + return true; + } catch (const std::exception& e) { + spdlog::error("Pipeline execution failed with error: {}", e.what()); + return false; + } +} diff --git a/pipeline/src/main.cpp b/pipeline/src/main.cpp index 8258adb..2a9496c 100644 --- a/pipeline/src/main.cpp +++ b/pipeline/src/main.cpp @@ -7,13 +7,14 @@ #include #include -#include +#include #include +#include #include "biergarten_data_generator.h" #include "web_client/curl_web_client.h" -namespace po = boost::program_options; +namespace prog_opts = boost::program_options; /** * @brief Parse command-line arguments into ApplicationOptions. @@ -23,123 +24,119 @@ namespace po = boost::program_options; * @param options Output ApplicationOptions struct. * @return true if parsing succeeded and should proceed, false otherwise. */ -bool ParseArguments(int argc, char** argv, ApplicationOptions& options) { - // If no arguments provided, display usage and exit - if (argc == 1) { - std::cout << "Biergarten Pipeline - Geographic Data Pipeline with " - "Brewery Generation\n\n"; - std::cout << "Usage: biergarten-pipeline [options]\n\n"; - std::cout << "Options:\n"; - std::cout << " --mocked Use mocked generator for " - "brewery/user data\n"; - std::cout << " --model, -m PATH Path to LLM model file (gguf) for " - "generation\n"; - std::cout << " --cache-dir, -c DIR Directory for cached JSON (default: " - "/tmp)\n"; - std::cout << " --temperature TEMP LLM sampling temperature 0.0-1.0 " - "(default: 0.8)\n"; - std::cout << " --top-p VALUE Nucleus sampling parameter 0.0-1.0 " - "(default: 0.92)\n"; - std::cout << " --n-ctx SIZE Context window size in tokens " - "(default: 4096)\n"; - std::cout << " --seed SEED Random seed: -1 for random " - "(default: -1)\n"; - std::cout << " --help, -h Show this help message\n\n"; - std::cout << "Note: --mocked and --model are mutually exclusive. Exactly " - "one must be provided.\n"; - return false; - } - - po::options_description desc("Pipeline Options"); - desc.add_options()("help,h", "Produce help message")( - "mocked", po::bool_switch(), - "Use mocked generator for brewery/user data")( - "model,m", po::value()->default_value(""), - "Path to LLM model (gguf)")( - "cache-dir,c", po::value()->default_value("/tmp"), - "Directory for cached JSON")( - "temperature", po::value()->default_value(0.8f), - "Sampling temperature (higher = more random)")( - "top-p", po::value()->default_value(0.92f), - "Nucleus sampling top-p in (0,1] (higher = more random)")( - "n-ctx", po::value()->default_value(8192), - "Context window size in tokens (1-32768)")( - "seed", po::value()->default_value(-1), +auto ParseArguments(const int argc, char** argv, + ApplicationOptions& options) noexcept -> bool { + prog_opts::options_description desc("Pipeline Options"); + desc.add_options() + ("help,h", "Produce help message") + ("mocked", + prog_opts::bool_switch(), + "Use mocked generator for brewery/user data") + ("model,m", + prog_opts::value()->default_value(""), + "Path to LLM model (gguf)") + ("temperature", + prog_opts::value()->default_value(0.8f), + "Sampling temperature (higher = more random)") + ("top-p", + prog_opts::value()->default_value(0.92f), + "Nucleus sampling top-p in (0,1] (higher = more random)") + ("n-ctx", + prog_opts::value()->default_value(8192), + "Context window size in tokens (1-32768)") + ("seed", + prog_opts::value()->default_value(-1), "Sampler seed: -1 for random, otherwise non-negative integer"); - po::variables_map vm; - po::store(po::parse_command_line(argc, argv, desc), vm); - po::notify(vm); - - if (vm.count("help")) { - std::cout << desc << "\n"; + // Handle the "no arguments" or "help" case + if (argc == 1) { + spdlog::info("Biergarten Pipeline"); + std::stringstream ss; + ss << "\nUsage: biergarten-pipeline [options]\n\n" << desc; + spdlog::info(ss.str()); return false; } - // Check for mutually exclusive --mocked and --model flags - bool use_mocked = vm["mocked"].as(); - std::string model_path = vm["model"].as(); + try { + prog_opts::variables_map vm; + prog_opts::store(prog_opts::parse_command_line(argc, argv, desc), vm); + prog_opts::notify(vm); - if (use_mocked && !model_path.empty()) { - spdlog::error("ERROR: --mocked and --model are mutually exclusive"); - return false; - } - - if (!use_mocked && model_path.empty()) { - spdlog::error("ERROR: Either --mocked or --model must be specified"); - return false; - } - - // Warn if sampling parameters are provided with --mocked - if (use_mocked) { - bool hasTemperature = vm["temperature"].defaulted() == false; - bool hasTopP = vm["top-p"].defaulted() == false; - bool hasSeed = vm["seed"].defaulted() == false; - - if (hasTemperature || hasTopP || hasSeed) { - spdlog::warn( - "WARNING: Sampling parameters (--temperature, --top-p, --seed) " - "are ignored when using --mocked"); + if (vm.contains("help")) { + std::stringstream ss; + ss << "\n" << desc; + spdlog::info(ss.str()); + return false; } + + const auto use_mocked = vm["mocked"].as(); + const auto model_path = vm["model"].as(); + + if (use_mocked && !model_path.empty()) { + spdlog::error( + "Invalid arguments: --mocked and --model are mutually exclusive"); + return false; + } + + if (!use_mocked && model_path.empty()) { + spdlog::error( + "Invalid arguments: Either --mocked or --model must be specified"); + return false; + } + + const bool has_llm_params = !vm["temperature"].defaulted() || + !vm["top-p"].defaulted() || + !vm["seed"].defaulted(); + + if (use_mocked && has_llm_params) { + spdlog::warn( + "Sampling parameters (--temperature, --top-p, --seed) are" + " ignored when using --mocked"); + } + + options.use_mocked = use_mocked; + options.model_path = model_path; + options.temperature = vm["temperature"].as(); + options.top_p = vm["top-p"].as(); + options.n_ctx = vm["n-ctx"].as(); + options.seed = vm["seed"].as(); + + return true; + } catch (const std::exception& exception) { + spdlog::error("Failed to parse command-line arguments: {}", + exception.what()); + return false; + } catch (...) { + spdlog::error("Failed to parse command-line arguments: unknown error"); + return false; } - - options.use_mocked = use_mocked; - options.model_path = model_path; - options.cache_dir = vm["cache-dir"].as(); - options.temperature = vm["temperature"].as(); - options.top_p = vm["top-p"].as(); - options.n_ctx = vm["n-ctx"].as(); - options.seed = vm["seed"].as(); - // commit is always pinned to c5eb7772 - - return true; } -int main(int argc, char* argv[]) { +auto main(const int argc, char** argv) noexcept -> int { try { const CurlGlobalState curl_state; + spdlog::set_pattern("[%Y-%m-%d %H:%M:%S.%e] [%^%l%$] %v"); ApplicationOptions options; if (!ParseArguments(argc, argv, options)) { return 0; } - auto webClient = std::make_shared(); + auto webClient = std::make_unique(); + BiergartenDataGenerator generator(options, std::move(webClient)); - BiergartenDataGenerator generator(options, webClient); - return generator.Run(); - - } catch (const std::exception& e) { - const std::string message = e.what() ? e.what() : ""; - - if (message.find("LlamaGenerator: malformed brewery response") != - std::string::npos) { - spdlog::warn("WARNING: Non-fatal LLM failure after retries: {}", - message); - return 0; + if (!generator.Run()) { + spdlog::error("Pipeline execution failed"); + return 1; } - spdlog::error("ERROR: Application failed: {}", e.what()); + spdlog::info("Pipeline executed successfully"); + return 0; + } catch (const std::exception& exception) { + spdlog::critical("Unhandled fatal error in main: {}", exception.what()); + return 1; + } catch (...) { + spdlog::critical("Unhandled fatal non-standard exception in main"); return 1; } -} +} \ No newline at end of file diff --git a/pipeline/src/web_client/curl_global_state_constructor.cpp b/pipeline/src/web_client/curl_global_state_constructor.cpp new file mode 100644 index 0000000..6f2f317 --- /dev/null +++ b/pipeline/src/web_client/curl_global_state_constructor.cpp @@ -0,0 +1,17 @@ +/** + * @file web_client/curl_global_state_constructor.cpp + * @brief CurlGlobalState constructor implementation. + */ + +#include + +#include + +#include "web_client/curl_web_client.h" + +CurlGlobalState::CurlGlobalState() { + if (curl_global_init(CURL_GLOBAL_DEFAULT) != CURLE_OK) { + throw std::runtime_error( + "[CURLWebClient] Failed to initialize libcurl globally"); + } +} diff --git a/pipeline/src/web_client/curl_global_state_destructor.cpp b/pipeline/src/web_client/curl_global_state_destructor.cpp new file mode 100644 index 0000000..3783715 --- /dev/null +++ b/pipeline/src/web_client/curl_global_state_destructor.cpp @@ -0,0 +1,10 @@ +/** + * @file web_client/curl_global_state_destructor.cpp + * @brief CurlGlobalState destructor implementation. + */ + +#include + +#include "web_client/curl_web_client.h" + +CurlGlobalState::~CurlGlobalState() { curl_global_cleanup(); } diff --git a/pipeline/src/web_client/curl_web_client_constructor.cpp b/pipeline/src/web_client/curl_web_client_constructor.cpp new file mode 100644 index 0000000..8874b42 --- /dev/null +++ b/pipeline/src/web_client/curl_web_client_constructor.cpp @@ -0,0 +1,8 @@ +/** + * @file web_client/curl_web_client_constructor.cpp + * @brief CURLWebClient constructor implementation. + */ + +#include "web_client/curl_web_client.h" + +CURLWebClient::CURLWebClient() {} diff --git a/pipeline/src/web_client/curl_web_client_destructor.cpp b/pipeline/src/web_client/curl_web_client_destructor.cpp new file mode 100644 index 0000000..342bcf7 --- /dev/null +++ b/pipeline/src/web_client/curl_web_client_destructor.cpp @@ -0,0 +1,8 @@ +/** + * @file web_client/curl_web_client_destructor.cpp + * @brief CURLWebClient destructor implementation. + */ + +#include "web_client/curl_web_client.h" + +CURLWebClient::~CURLWebClient() {} diff --git a/pipeline/src/web_client/curl_web_client.cpp b/pipeline/src/web_client/curl_web_client_download_to_file.cpp similarity index 55% rename from pipeline/src/web_client/curl_web_client.cpp rename to pipeline/src/web_client/curl_web_client_download_to_file.cpp index 056f458..4c74ee4 100644 --- a/pipeline/src/web_client/curl_web_client.cpp +++ b/pipeline/src/web_client/curl_web_client_download_to_file.cpp @@ -1,11 +1,8 @@ /** - * @file web_client/curl_web_client.cpp - * @brief Implements libcurl-backed HTTP utilities, including GET requests, - * file downloads, URL encoding, and RAII global curl lifecycle handling. + * @file web_client/curl_web_client_download_to_file.cpp + * @brief CURLWebClient::DownloadToFile() implementation. */ -#include "web_client/curl_web_client.h" - #include #include @@ -14,34 +11,9 @@ #include #include -CurlGlobalState::CurlGlobalState() { - if (curl_global_init(CURL_GLOBAL_DEFAULT) != CURLE_OK) { - throw std::runtime_error( - "[CURLWebClient] Failed to initialize libcurl globally"); - } -} - -CurlGlobalState::~CurlGlobalState() { curl_global_cleanup(); } +#include "web_client/curl_web_client.h" namespace { -// curl write callback that appends response data into a std::string -size_t WriteCallbackString(void* contents, size_t size, size_t nmemb, - void* userp) { - size_t realsize = size * nmemb; - auto* s = static_cast(userp); - s->append(static_cast(contents), realsize); - return realsize; -} - -// curl write callback that writes to a file stream -size_t WriteCallbackFile(void* contents, size_t size, size_t nmemb, - void* userp) { - size_t realsize = size * nmemb; - auto* outFile = static_cast(userp); - outFile->write(static_cast(contents), realsize); - return realsize; -} - // RAII wrapper for CURL handle using unique_ptr using CurlHandle = std::unique_ptr; @@ -64,12 +36,17 @@ void set_common_get_options(CURL* curl, const std::string& url, curl_easy_setopt(curl, CURLOPT_TIMEOUT, total_timeout); curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, "gzip"); } + +// curl write callback that writes to a file stream +size_t WriteCallbackFile(void* contents, size_t size, size_t nmemb, + void* userp) { + size_t realsize = size * nmemb; + auto* outFile = static_cast(userp); + outFile->write(static_cast(contents), realsize); + return realsize; +} } // namespace -CURLWebClient::CURLWebClient() {} - -CURLWebClient::~CURLWebClient() {} - void CURLWebClient::DownloadToFile(const std::string& url, const std::string& file_path) { auto curl = create_handle(); @@ -105,43 +82,3 @@ void CURLWebClient::DownloadToFile(const std::string& url, throw std::runtime_error(ss.str()); } } - -std::string CURLWebClient::Get(const std::string& url) { - auto curl = create_handle(); - - std::string response_string; - set_common_get_options(curl.get(), url, 10L, 20L); - curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, WriteCallbackString); - curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &response_string); - - CURLcode res = curl_easy_perform(curl.get()); - - if (res != CURLE_OK) { - std::string error = - std::string("[CURLWebClient] GET failed: ") + curl_easy_strerror(res); - throw std::runtime_error(error); - } - - long httpCode = 0; - curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &httpCode); - - if (httpCode != 200) { - std::stringstream ss; - ss << "[CURLWebClient] HTTP error " << httpCode << " for URL " << url; - throw std::runtime_error(ss.str()); - } - - return response_string; -} - -std::string CURLWebClient::UrlEncode(const std::string& value) { - // A NULL handle is fine for UTF-8 encoding according to libcurl docs. - char* output = curl_easy_escape(nullptr, value.c_str(), 0); - - if (output) { - std::string result(output); - curl_free(output); - return result; - } - throw std::runtime_error("[CURLWebClient] curl_easy_escape failed"); -} diff --git a/pipeline/src/web_client/curl_web_client_get.cpp b/pipeline/src/web_client/curl_web_client_get.cpp new file mode 100644 index 0000000..cff7830 --- /dev/null +++ b/pipeline/src/web_client/curl_web_client_get.cpp @@ -0,0 +1,75 @@ +/** + * @file web_client/curl_web_client_get.cpp + * @brief CURLWebClient::Get() implementation. + */ + +#include + +#include +#include +#include +#include + +#include "web_client/curl_web_client.h" + +namespace { +// RAII wrapper for CURL handle using unique_ptr +using CurlHandle = std::unique_ptr; + +CurlHandle create_handle() { + CURL* handle = curl_easy_init(); + if (!handle) { + throw std::runtime_error( + "[CURLWebClient] Failed to initialize libcurl handle"); + } + return CurlHandle(handle, &curl_easy_cleanup); +} + +void set_common_get_options(CURL* curl, const std::string& url, + long connect_timeout, long total_timeout) { + curl_easy_setopt(curl, CURLOPT_URL, url.c_str()); + curl_easy_setopt(curl, CURLOPT_USERAGENT, "biergarten-pipeline/0.1.0"); + curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); + curl_easy_setopt(curl, CURLOPT_MAXREDIRS, 5L); + curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, connect_timeout); + curl_easy_setopt(curl, CURLOPT_TIMEOUT, total_timeout); + curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, "gzip"); +} + +// curl write callback that appends response data into a std::string +size_t WriteCallbackString(void* contents, size_t size, size_t nmemb, + void* userp) { + size_t realsize = size * nmemb; + auto* s = static_cast(userp); + s->append(static_cast(contents), realsize); + return realsize; +} +} // namespace + +std::string CURLWebClient::Get(const std::string& url) { + auto curl = create_handle(); + + std::string response_string; + set_common_get_options(curl.get(), url, 10L, 20L); + curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, WriteCallbackString); + curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &response_string); + + CURLcode res = curl_easy_perform(curl.get()); + + if (res != CURLE_OK) { + std::string error = + std::string("[CURLWebClient] GET failed: ") + curl_easy_strerror(res); + throw std::runtime_error(error); + } + + long httpCode = 0; + curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &httpCode); + + if (httpCode != 200) { + std::stringstream ss; + ss << "[CURLWebClient] HTTP error " << httpCode << " for URL " << url; + throw std::runtime_error(ss.str()); + } + + return response_string; +} diff --git a/pipeline/src/web_client/curl_web_client_url_encode.cpp b/pipeline/src/web_client/curl_web_client_url_encode.cpp new file mode 100644 index 0000000..2f6fca9 --- /dev/null +++ b/pipeline/src/web_client/curl_web_client_url_encode.cpp @@ -0,0 +1,23 @@ +/** + * @file web_client/curl_web_client_url_encode.cpp + * @brief CURLWebClient::UrlEncode() implementation. + */ + +#include + +#include +#include + +#include "web_client/curl_web_client.h" + +std::string CURLWebClient::UrlEncode(const std::string& value) { + // A NULL handle is fine for UTF-8 encoding according to libcurl docs. + char* output = curl_easy_escape(nullptr, value.c_str(), 0); + + if (output) { + std::string result(output); + curl_free(output); + return result; + } + throw std::runtime_error("[CURLWebClient] curl_easy_escape failed"); +} diff --git a/pipeline/src/wikipedia/constructor.cpp b/pipeline/src/wikipedia/constructor.cpp new file mode 100644 index 0000000..8884375 --- /dev/null +++ b/pipeline/src/wikipedia/constructor.cpp @@ -0,0 +1,11 @@ +/** + * @file wikipedia/constructor.cpp + * @brief WikipediaService constructor implementation. + */ + +#include + +#include "wikipedia/wikipedia_service.h" + +WikipediaService::WikipediaService(std::shared_ptr client) + : client_(std::move(client)) {} diff --git a/pipeline/src/wikipedia/fetch_extract.cpp b/pipeline/src/wikipedia/fetch_extract.cpp new file mode 100644 index 0000000..6b4259e --- /dev/null +++ b/pipeline/src/wikipedia/fetch_extract.cpp @@ -0,0 +1,51 @@ +/** + * @file wikipedia/fetch_extract.cpp + * @brief WikipediaService::FetchExtract() implementation. + */ + +#include + +#include +#include +#include + +#include "wikipedia/wikipedia_service.h" + +auto WikipediaService::FetchExtract(std::string_view query) const + -> std::string { + const std::string encoded = client_->UrlEncode(std::string(query)); + const std::string url = + "https://en.wikipedia.org/w/api.php?action=query&titles=" + encoded + + "&prop=extracts&explaintext=1&format=json"; + + const std::string body = client_->Get(url); + + boost::system::error_code ec; + boost::json::value doc = boost::json::parse(body, ec); + + if (!ec && doc.is_object()) { + try { + auto& pages = doc.at("query").at("pages").get_object(); + if (!pages.empty()) { + auto& page = pages.begin()->value().get_object(); + if (page.contains("extract") && page.at("extract").is_string()) { + std::string extract(page.at("extract").as_string().c_str()); + spdlog::debug("WikipediaService fetched {} chars for '{}'", + extract.size(), query); + return extract; + } + } + } catch (const std::exception& e) { + spdlog::warn( + "WikipediaService: failed to parse response structure for '{}': " + "{}", + query, e.what()); + return {}; + } + } else if (ec) { + spdlog::warn("WikipediaService: JSON parse error for '{}': {}", query, + ec.message()); + } + + return {}; +} diff --git a/pipeline/src/wikipedia/get_summary.cpp b/pipeline/src/wikipedia/get_summary.cpp new file mode 100644 index 0000000..550229a --- /dev/null +++ b/pipeline/src/wikipedia/get_summary.cpp @@ -0,0 +1,55 @@ +/** + * @file wikipedia/get_summary.cpp + * @brief WikipediaService::GetSummary() implementation. + */ + +#include + +#include + +#include "wikipedia/wikipedia_service.h" + +auto WikipediaService::GetSummary(std::string_view city, + std::string_view country) -> std::string { + const std::string key = std::string(city) + "|" + std::string(country); + const auto cacheIt = cache_.find(key); + if (cacheIt != cache_.end()) { + return cacheIt->second; + } + + std::string result; + + if (!client_) { + cache_.emplace(key, result); + return result; + } + + std::string regionQuery(city); + if (!country.empty()) { + regionQuery += ", "; + regionQuery += country; + } + + const std::string beerQuery = "beer in " + std::string(country); + + try { + const std::string regionExtract = FetchExtract(regionQuery); + const std::string beerExtract = FetchExtract(beerQuery); + + if (!regionExtract.empty()) { + result += regionExtract; + } + if (!beerExtract.empty()) { + if (!result.empty()) { + result += "\n\n"; + } + result += beerExtract; + } + } catch (const std::runtime_error& e) { + spdlog::debug("WikipediaService lookup failed for '{}': {}", regionQuery, + e.what()); + } + + cache_.emplace(key, result); + return result; +} diff --git a/pipeline/src/wikipedia/wikipedia_service.cpp b/pipeline/src/wikipedia/wikipedia_service.cpp deleted file mode 100644 index 62f7673..0000000 --- a/pipeline/src/wikipedia/wikipedia_service.cpp +++ /dev/null @@ -1,95 +0,0 @@ -/** - * @file wikipedia/wikipedia_service.cpp - * @brief Implements Wikipedia extract retrieval and caching for city/country - * queries, including response parsing and resilient error handling. - */ - -#include "wikipedia/wikipedia_service.h" - -#include - -#include - -WikipediaService::WikipediaService(std::shared_ptr client) - : client_(std::move(client)) {} - -std::string WikipediaService::FetchExtract(std::string_view query) { - const std::string encoded = client_->UrlEncode(std::string(query)); - const std::string url = - "https://en.wikipedia.org/w/api.php?action=query&titles=" + encoded + - "&prop=extracts&explaintext=1&format=json"; - - const std::string body = client_->Get(url); - - boost::system::error_code ec; - boost::json::value doc = boost::json::parse(body, ec); - - if (!ec && doc.is_object()) { - try { - auto& pages = doc.at("query").at("pages").get_object(); - if (!pages.empty()) { - auto& page = pages.begin()->value().get_object(); - if (page.contains("extract") && page.at("extract").is_string()) { - std::string extract(page.at("extract").as_string().c_str()); - spdlog::debug("WikipediaService fetched {} chars for '{}'", - extract.size(), query); - return extract; - } - } - } catch (const std::exception& e) { - spdlog::warn( - "WikipediaService: failed to parse response structure for '{}': " - "{}", - query, e.what()); - return {}; - } - } else if (ec) { - spdlog::warn("WikipediaService: JSON parse error for '{}': {}", query, - ec.message()); - } - - return {}; -} - -std::string WikipediaService::GetSummary(std::string_view city, - std::string_view country) { - const std::string key = std::string(city) + "|" + std::string(country); - const auto cacheIt = cache_.find(key); - if (cacheIt != cache_.end()) { - return cacheIt->second; - } - - std::string result; - - if (!client_) { - cache_.emplace(key, result); - return result; - } - - std::string regionQuery(city); - if (!country.empty()) { - regionQuery += ", "; - regionQuery += country; - } - - const std::string beerQuery = "beer in " + std::string(country); - - try { - const std::string regionExtract = FetchExtract(regionQuery); - const std::string beerExtract = FetchExtract(beerQuery); - - if (!regionExtract.empty()) { - result += regionExtract; - } - if (!beerExtract.empty()) { - if (!result.empty()) result += "\n\n"; - result += beerExtract; - } - } catch (const std::runtime_error& e) { - spdlog::debug("WikipediaService lookup failed for '{}': {}", regionQuery, - e.what()); - } - - cache_.emplace(key, result); - return result; -}