From 2ee7b3d2a2468d4de02ac9ea038c78d2222804b1 Mon Sep 17 00:00:00 2001 From: Aaron Po Date: Thu, 14 May 2026 19:15:51 -0400 Subject: [PATCH] Add timeout to wikipedia enrichment to avoid breaking rate limits, add mock enrichment (#224) * Add timeout for enrichment, refactor json deserialization * Add location count to application options and as a cli arg * Add mock enrichment process --- tooling/pipeline/CMakeLists.txt | 9 +- .../includes/biergarten_data_generator.h | 9 +- tooling/pipeline/includes/data_model/models.h | 10 +- .../services/enrichment/mock_enrichment.h | 17 +++ .../services/enrichment/wikipedia_service.h | 4 +- .../includes/web_client/http_web_client.h | 2 +- .../pipeline/includes/web_client/web_client.h | 2 +- .../application_options/parse_arguments.cc | 12 +- .../biergarten_data_generator.cc | 6 +- .../query_cities_with_countries.cc | 6 +- .../src/biergarten_data_generator/run.cc | 4 +- .../data_generation/llama/llama_generator.cc | 2 +- tooling/pipeline/src/main.cc | 33 ++++-- .../enrichment/wikipedia/fetch_extract.cc | 112 ++++++++++++++++++ .../enrichment/wikipedia/get_summary.cc | 58 +++++++++ .../wikipedia/wikipedia_service.cc | 3 +- .../src/services/wikipedia/fetch_extract.cc | 61 ---------- .../src/services/wikipedia/get_summary.cc | 47 -------- .../src/web_client/http_web_client.cc | 11 +- 19 files changed, 261 insertions(+), 147 deletions(-) create mode 100644 tooling/pipeline/includes/services/enrichment/mock_enrichment.h create mode 100644 tooling/pipeline/src/services/enrichment/wikipedia/fetch_extract.cc create mode 100644 tooling/pipeline/src/services/enrichment/wikipedia/get_summary.cc rename tooling/pipeline/src/services/{ => enrichment}/wikipedia/wikipedia_service.cc (69%) delete mode 100644 tooling/pipeline/src/services/wikipedia/fetch_extract.cc delete mode 100644 tooling/pipeline/src/services/wikipedia/get_summary.cc diff --git a/tooling/pipeline/CMakeLists.txt b/tooling/pipeline/CMakeLists.txt index 9aea17d..abdf592 100644 --- 
a/tooling/pipeline/CMakeLists.txt +++ b/tooling/pipeline/CMakeLists.txt @@ -137,7 +137,8 @@ set(HTTPLIB_REQUIRE_OPENSSL ON CACHE BOOL "Require OpenSSL for cpp-httplib" FORC FetchContent_MakeAvailable(cpp-httplib) # 5. Executable & Sources -add_executable(${PROJECT_NAME}) +add_executable(${PROJECT_NAME} + includes/services/enrichment/mock_enrichment.h) # --- Entry point --- target_sources(${PROJECT_NAME} PRIVATE @@ -194,9 +195,9 @@ endif() # --- services: wikipedia --- target_sources(${PROJECT_NAME} PRIVATE - src/services/wikipedia/wikipedia_service.cc - src/services/wikipedia/fetch_extract.cc - src/services/wikipedia/get_summary.cc + src/services/enrichment/wikipedia/wikipedia_service.cc + src/services/enrichment/wikipedia/fetch_extract.cc + src/services/enrichment/wikipedia/get_summary.cc ) # --- services: sqlite --- diff --git a/tooling/pipeline/includes/biergarten_data_generator.h b/tooling/pipeline/includes/biergarten_data_generator.h index 1396213..e74ba02 100644 --- a/tooling/pipeline/includes/biergarten_data_generator.h +++ b/tooling/pipeline/includes/biergarten_data_generator.h @@ -12,8 +12,8 @@ #include "data_generation/data_generator.h" #include "data_model/generated_models.h" -#include "services/enrichment/enrichment_service.h" #include "services/database/export_service.h" +#include "services/enrichment/enrichment_service.h" /** * @brief Main data generator class for the Biergarten pipeline. @@ -32,7 +32,8 @@ class BiergartenDataGenerator { */ BiergartenDataGenerator(std::unique_ptr context_service, std::unique_ptr generator, - std::unique_ptr exporter); + std::unique_ptr exporter, + const ApplicationOptions& application_options); /** * @brief Run the data generation pipeline. @@ -56,12 +57,14 @@ class BiergartenDataGenerator { /// @brief Storage backend for generated brewery records. std::unique_ptr exporter_; + const ApplicationOptions application_options_; + /** * @brief Load locations from JSON and sample cities. 
* * @return Vector of sampled locations capped at 50 entries. */ - static std::vector QueryCitiesWithCountries(); + std::vector QueryCitiesWithCountries(); /** * @brief Generate breweries for enriched cities. diff --git a/tooling/pipeline/includes/data_model/models.h b/tooling/pipeline/includes/data_model/models.h index f08cf41..9346b01 100644 --- a/tooling/pipeline/includes/data_model/models.h +++ b/tooling/pipeline/includes/data_model/models.h @@ -83,6 +83,9 @@ struct SamplingOptions { /// @brief Random seed (-1 for random, otherwise non-negative). int seed = -1; + + /// @brief Number of layers to offload to GPU. + int n_gpu_layers = 0; }; /** @@ -95,8 +98,7 @@ struct GeneratorOptions { /// @brief Use mocked generator instead of actual LLM inference. bool use_mocked = false; - /// @brief Number of layers to offload to GPU. - int n_gpu_layers = 0; + /// @brief Specific sampling parameters for this generator. /// If nullopt, the application should use global defaults. @@ -116,6 +118,10 @@ struct PipelineOptions { /// @brief Path for application logs. std::filesystem::path log_path; + + /// @brief Number of locations to sample from the dataset + /// More locations -> more users/more breweries + uint32_t location_count = 10; }; /** diff --git a/tooling/pipeline/includes/services/enrichment/mock_enrichment.h b/tooling/pipeline/includes/services/enrichment/mock_enrichment.h new file mode 100644 index 0000000..0eae2ba --- /dev/null +++ b/tooling/pipeline/includes/services/enrichment/mock_enrichment.h @@ -0,0 +1,17 @@ +// +// Created by aaronpo on 13/05/2026. 
+// + +#ifndef BIERGARTEN_PIPELINE_INCLUDES_SERVICES_ENRICHMENT_MOCK_ENRICHMENT_H_ +#define BIERGARTEN_PIPELINE_INCLUDES_SERVICES_ENRICHMENT_MOCK_ENRICHMENT_H_ +#include + +#include "enrichment_service.h" + +class MockEnrichmentService final : public IEnrichmentService { + public: + std::string GetLocationContext(const Location& /*loc*/) override { + return {}; + } +}; +#endif // BIERGARTEN_PIPELINE_INCLUDES_SERVICES_ENRICHMENT_MOCK_ENRICHMENT_H_ diff --git a/tooling/pipeline/includes/services/enrichment/wikipedia_service.h b/tooling/pipeline/includes/services/enrichment/wikipedia_service.h index 696cf24..c51ff28 100644 --- a/tooling/pipeline/includes/services/enrichment/wikipedia_service.h +++ b/tooling/pipeline/includes/services/enrichment/wikipedia_service.h @@ -15,10 +15,10 @@ #include "web_client/web_client.h" /// @brief Provides Wikipedia summary lookups backed by cached raw extracts. -class WikipediaService final : public IEnrichmentService { +class WikipediaEnrichmentService final : public IEnrichmentService { public: /// @brief Creates a new Wikipedia service with the provided web client. - explicit WikipediaService(std::unique_ptr client); + explicit WikipediaEnrichmentService(std::unique_ptr client); /// @brief Returns the Wikipedia-derived context for a location. [[nodiscard]] std::string GetLocationContext(const Location& loc) override; diff --git a/tooling/pipeline/includes/web_client/http_web_client.h b/tooling/pipeline/includes/web_client/http_web_client.h index 778d5d3..a38beba 100644 --- a/tooling/pipeline/includes/web_client/http_web_client.h +++ b/tooling/pipeline/includes/web_client/http_web_client.h @@ -42,7 +42,7 @@ public: * @param value Raw string to encode. * @return Percent-encoded string safe for use in a URL. 
*/ - std::string UrlEncode(const std::string& value) override; + std::string EncodeURL(const std::string& value) override; }; diff --git a/tooling/pipeline/includes/web_client/web_client.h b/tooling/pipeline/includes/web_client/web_client.h index bb16323..641eb12 100644 --- a/tooling/pipeline/includes/web_client/web_client.h +++ b/tooling/pipeline/includes/web_client/web_client.h @@ -30,7 +30,7 @@ class WebClient { * @param value Raw string value. * @return Encoded value safe for URL usage. */ - virtual std::string UrlEncode(const std::string& value) = 0; + virtual std::string EncodeURL(const std::string& value) = 0; }; #endif // BIERGARTEN_PIPELINE_INCLUDES_WEB_CLIENT_WEB_CLIENT_H_ diff --git a/tooling/pipeline/src/application_options/parse_arguments.cc b/tooling/pipeline/src/application_options/parse_arguments.cc index b06c1b7..b2995d1 100644 --- a/tooling/pipeline/src/application_options/parse_arguments.cc +++ b/tooling/pipeline/src/application_options/parse_arguments.cc @@ -30,6 +30,8 @@ std::optional ParseArguments(const int argc, char** argv) { "Context window size in tokens"); opt("seed", prog_opts::value()->default_value(sampling_defaults.seed), "Sampler seed: -1 for random, otherwise non-negative integer"); + opt("n-gpu-layers", prog_opts::value()->default_value(0), + "Number of layers to offload to GPU"); }; // --mocked and --model are mutually exclusive; validation is enforced below @@ -50,8 +52,7 @@ std::optional ParseArguments(const int argc, char** argv) { opt("prompt-dir", prog_opts::value()->default_value(""), "Directory containing named prompt files (e.g. BREWERY_GENERATION.md)." 
" Required when not using --mocked."); - opt("n-gpu-layers", prog_opts::value()->default_value(0), - "Number of layers to offload to GPU"); + opt("location-count", prog_opts::value()->default_value(10)); }; add_sampling_options(); @@ -84,6 +85,8 @@ std::optional ParseArguments(const int argc, char** argv) { options.pipeline.output_path = var_map["output"].as(); options.pipeline.log_path = var_map["log-path"].as(); options.pipeline.prompt_dir = var_map["prompt-dir"].as(); + options.pipeline.location_count = + var_map["location-count"].as(); const bool use_mocked = var_map["mocked"].as(); const std::string model_path = var_map["model"].as(); @@ -113,7 +116,7 @@ std::optional ParseArguments(const int argc, char** argv) { options.generator.use_mocked = use_mocked; options.generator.model_path = model_path; - options.generator.n_gpu_layers = n_gpu_layers; + // options.generator.n_gpu_layers = n_gpu_layers; // Only populate sampling config when the user explicitly overrides at // least one value. 
Leaving it as std::nullopt lets LlamaGenerator fall @@ -122,7 +125,7 @@ std::optional ParseArguments(const int argc, char** argv) { const bool user_provided_sampling = !var_map["temperature"].defaulted() || !var_map["top-p"].defaulted() || !var_map["top-k"].defaulted() || !var_map["n-ctx"].defaulted() || - !var_map["seed"].defaulted(); + !var_map["seed"].defaulted() || !var_map["n-gpu-layers"].defaulted(); if (user_provided_sampling) { // Warn but do not fail — the run is still valid, the flags are just @@ -136,6 +139,7 @@ std::optional ParseArguments(const int argc, char** argv) { sampling.top_k = var_map["top-k"].as(); sampling.n_ctx = var_map["n-ctx"].as(); sampling.seed = var_map["seed"].as(); + sampling.n_gpu_layers = var_map["n-gpu-layers"].as(); options.generator.sampling = sampling; } diff --git a/tooling/pipeline/src/biergarten_data_generator/biergarten_data_generator.cc b/tooling/pipeline/src/biergarten_data_generator/biergarten_data_generator.cc index 033795d..71875b3 100644 --- a/tooling/pipeline/src/biergarten_data_generator/biergarten_data_generator.cc +++ b/tooling/pipeline/src/biergarten_data_generator/biergarten_data_generator.cc @@ -10,7 +10,9 @@ BiergartenDataGenerator::BiergartenDataGenerator( std::unique_ptr context_service, std::unique_ptr generator, - std::unique_ptr exporter) + std::unique_ptr exporter, + const ApplicationOptions &app_options) : context_service_(std::move(context_service)), generator_(std::move(generator)), - exporter_(std::move(exporter)) {} + exporter_(std::move(exporter)), + application_options_(app_options) {} diff --git a/tooling/pipeline/src/biergarten_data_generator/query_cities_with_countries.cc b/tooling/pipeline/src/biergarten_data_generator/query_cities_with_countries.cc index 5cf60b6..c17654f 100644 --- a/tooling/pipeline/src/biergarten_data_generator/query_cities_with_countries.cc +++ b/tooling/pipeline/src/biergarten_data_generator/query_cities_with_countries.cc @@ -13,8 +13,6 @@ #include 
"biergarten_data_generator.h" #include "json_handling/json_loader.h" -static constexpr size_t kBreweryAmount = 50; - std::vector BiergartenDataGenerator::QueryCitiesWithCountries() { spdlog::info("\n=== GEOGRAPHIC DATA OVERVIEW ==="); @@ -23,7 +21,9 @@ std::vector BiergartenDataGenerator::QueryCitiesWithCountries() { auto all_locations = JsonLoader::LoadLocations(locations_path); spdlog::info(" Locations available: {}", all_locations.size()); - const size_t sample_count = std::min(kBreweryAmount, all_locations.size()); + const size_t sample_count = std::min( + static_cast(application_options_.pipeline.location_count), + all_locations.size()); const auto sample_count_signed = static_cast>( diff --git a/tooling/pipeline/src/biergarten_data_generator/run.cc b/tooling/pipeline/src/biergarten_data_generator/run.cc index 82ebbfc..4ee2b46 100644 --- a/tooling/pipeline/src/biergarten_data_generator/run.cc +++ b/tooling/pipeline/src/biergarten_data_generator/run.cc @@ -21,8 +21,8 @@ bool BiergartenDataGenerator::Run() { for (auto& city : cities) { try { std::string region_context = context_service_->GetLocationContext(city); - spdlog::debug("[Pipeline] Context for '{}' ({}) gathered:\n{}", - city.city, city.country, region_context); + // spdlog::debug("[Pipeline] Context for '{}' ({}) gathered:\n{}", + // city.city, city.iso3166_2, region_context); enriched.push_back( EnrichedCity{.location = std::move(city), diff --git a/tooling/pipeline/src/data_generation/llama/llama_generator.cc b/tooling/pipeline/src/data_generation/llama/llama_generator.cc index 72a888e..d780f2f 100644 --- a/tooling/pipeline/src/data_generation/llama/llama_generator.cc +++ b/tooling/pipeline/src/data_generation/llama/llama_generator.cc @@ -89,7 +89,7 @@ LlamaGenerator::LlamaGenerator( } n_ctx_ = sampling.n_ctx; - n_gpu_layers_ = options.generator.n_gpu_layers; + n_gpu_layers_ = sampling.n_gpu_layers; this->Load(model_path); } diff --git a/tooling/pipeline/src/main.cc b/tooling/pipeline/src/main.cc 
index 3b2a3ce..cd204c9 100644 --- a/tooling/pipeline/src/main.cc +++ b/tooling/pipeline/src/main.cc @@ -8,11 +8,9 @@ #include #include - #include #include #include - #include #include "biergarten_data_generator.h" @@ -21,12 +19,13 @@ #include "data_generation/prompt_formatting/gemma4_jinja_prompt_formatter.h" #include "data_model/models.h" #include "llama_backend_state.h" -#include "services/enrichment/enrichment_service.h" #include "services/database/export_service.h" -#include "services/prompting/prompt_directory.h" #include "services/database/sqlite_export_service.h" #include "services/datetime/timer.h" +#include "services/enrichment/enrichment_service.h" +#include "services/enrichment/mock_enrichment.h" #include "services/enrichment/wikipedia_service.h" +#include "services/prompting/prompt_directory.h" #include "web_client/http_web_client.h" namespace di = boost::di; @@ -43,7 +42,9 @@ int main(const int argc, char** argv) { spdlog::set_level(spdlog::level::debug); #endif - const auto parsed_options = ParseArguments(argc, argv); + const std::optional parsed_options = + ParseArguments(argc, argv); + if (!parsed_options.has_value()) { return 0; } @@ -65,15 +66,23 @@ int main(const int argc, char** argv) { } const auto injector = di::make_injector( - di::bind().to(), di::bind().to(options), - di::bind().to(), + di::bind().to(model_path), + di::bind().to(), di::bind().to(), di::bind().to(), - di::bind().to(model_path), + di::bind().to( + [options](const auto& inj) -> std::unique_ptr { + if (options.generator.use_mocked) { + return std::make_unique(); + } + + return std::make_unique( + inj.template create>()); + }), di::bind().to( [options, model_path, sampling, &prompt_directory]( - const auto& inj) -> std::unique_ptr { + const auto& inj) -> std::unique_ptr { if (options.generator.use_mocked) { spdlog::info( "[Generator] Using MockGenerator (no model path provided)"); @@ -89,9 +98,11 @@ int main(const int argc, char** argv) { options, model_path, inj.template 
create>(), std::move(prompt_directory)); - })); + }) - auto generator = + ); + + const auto generator = injector.create>(); if (!generator->Run()) { diff --git a/tooling/pipeline/src/services/enrichment/wikipedia/fetch_extract.cc b/tooling/pipeline/src/services/enrichment/wikipedia/fetch_extract.cc new file mode 100644 index 0000000..94f2f1e --- /dev/null +++ b/tooling/pipeline/src/services/enrichment/wikipedia/fetch_extract.cc @@ -0,0 +1,112 @@ +/** + * @file wikipedia/fetch_extract.cc + */ + +#include + +#include +#include +#include +#include +#include +#include + +#include "services/enrichment/wikipedia_service.h" + +using namespace boost; + +std::string WikipediaEnrichmentService::FetchExtract(std::string_view query) { + + const std::string cache_key(query); + + // 1. Cache Lookup + if (const auto cache_it = this->extract_cache_.find(cache_key); + cache_it != this->extract_cache_.end()) { + spdlog::debug("Wikipedia: Cache hit for {}!", cache_key); + return cache_it->second; + } + + const std::string encoded = this->client_->EncodeURL(cache_key); + const std::string url = std::format( + "https://en.wikipedia.org/w/" + "api.php?action=query&titles={}&prop=extracts&explaintext=1&format=json", + encoded); + + + const std::string body = this->client_->Get(url); + { + using namespace std::literals::chrono_literals; + std::this_thread::sleep_for(1s); + } + + // 2. Parse JSON + system::error_code ec; + json::value doc = json::parse(body, ec); + + if (ec) { + spdlog::warn("WikipediaService: JSON parse error for '{}': {}", query, + ec.message()); + return {}; + } + + // 3. Safe Extraction + const json::object* obj = doc.if_object(); + if (obj == nullptr) { + spdlog::warn("WikipediaService: Expected root object for '{}'", query); + return {}; + } + + const json::value* query_ptr = obj->if_contains("query"); + const json::value* pages_ptr = + ((query_ptr != nullptr) && query_ptr->is_object()) + ? 
query_ptr->get_object().if_contains("pages") + : nullptr; + + if ((pages_ptr == nullptr) || !pages_ptr->is_object()) { + spdlog::warn("WikipediaService: Missing query.pages for '{}'", query); + return {}; + } + + const json::object& pages = pages_ptr->get_object(); + + if (pages.empty()) { + spdlog::warn("WikipediaService: No pages returned for '{}'", query); + this->extract_cache_.emplace(cache_key, ""); + return {}; + } + + // Wikipedia returns the page under a dynamic ID key; we just want the first + // one + const json::value& page_val = pages.begin()->value(); + + if (!page_val.is_object()) { + spdlog::warn("WikipediaService: Unexpected page format for '{}'", query); + return {}; + } + + const json::object& page = page_val.get_object(); + + // Handle 404/Missing status + if (page.contains("missing")) { + spdlog::warn("WikipediaService: Page '{}' does not exist", query); + this->extract_cache_.emplace(cache_key, ""); + return {}; + } + + const json::value* extract_ptr = page.if_contains("extract"); + + if ((extract_ptr == nullptr) || !extract_ptr->is_string()) { + spdlog::warn("WikipediaService: No extract string found for '{}'", query); + this->extract_cache_.emplace(cache_key, ""); + return {}; + } + + // 4. Success + std::string extract(extract_ptr->as_string()); + spdlog::info("WikipediaService: Fetched {} chars for '{}'", extract.size(), + query); + + this->extract_cache_.insert_or_assign(cache_key, extract); + + return extract; +} \ No newline at end of file diff --git a/tooling/pipeline/src/services/enrichment/wikipedia/get_summary.cc b/tooling/pipeline/src/services/enrichment/wikipedia/get_summary.cc new file mode 100644 index 0000000..d19a420 --- /dev/null +++ b/tooling/pipeline/src/services/enrichment/wikipedia/get_summary.cc @@ -0,0 +1,58 @@ +/** + * @file wikipedia/get_summary.cc + * @brief WikipediaService::GetLocationContext() implementation. 
+ */ + +#include + +#include +#include +#include +#include + +#include "services/enrichment/wikipedia_service.h" + +std::string WikipediaEnrichmentService::GetLocationContext(const Location& loc) { + using namespace std::literals::chrono_literals; + if (!this->client_) { + spdlog::warn("Client is nullptr."); + return {}; + } + + std::string result; + + // std::string region_query(loc.city); + // if (!loc.country.empty()) { + // region_query += loc.state_province, + // region_query += ", "; + // region_query += loc.country; + // } + + constexpr std::string_view brewing_query = "brewing"; + const std::string location_query = + std::format("{}, {}", loc.city, loc.iso3166_2); + const std::string beer_query = std::format("beer in {}", loc.country); + + auto append_extract = [&result](const std::string& extract) -> void { + if (extract.empty()) { + return; + } + if (!result.empty()) { + result += "\n\n"; + } + result += extract; + }; + + try { + append_extract(FetchExtract(brewing_query)); + append_extract(FetchExtract(beer_query)); + spdlog::info("Done fetching for {}. 
Sleeping for 10 seconds.", + location_query); + std::this_thread::sleep_for(10s); + + } catch (const std::runtime_error& e) { + spdlog::debug("WikipediaService lookup failed for '{}': {}", location_query, + e.what()); + } + return result; +} diff --git a/tooling/pipeline/src/services/wikipedia/wikipedia_service.cc b/tooling/pipeline/src/services/enrichment/wikipedia/wikipedia_service.cc similarity index 69% rename from tooling/pipeline/src/services/wikipedia/wikipedia_service.cc rename to tooling/pipeline/src/services/enrichment/wikipedia/wikipedia_service.cc index 4142a5c..dbccd5e 100644 --- a/tooling/pipeline/src/services/wikipedia/wikipedia_service.cc +++ b/tooling/pipeline/src/services/enrichment/wikipedia/wikipedia_service.cc @@ -7,5 +7,6 @@ #include -WikipediaService::WikipediaService(std::unique_ptr client) +WikipediaEnrichmentService::WikipediaEnrichmentService( + std::unique_ptr client) : client_(std::move(client)) {} diff --git a/tooling/pipeline/src/services/wikipedia/fetch_extract.cc b/tooling/pipeline/src/services/wikipedia/fetch_extract.cc deleted file mode 100644 index 748ed36..0000000 --- a/tooling/pipeline/src/services/wikipedia/fetch_extract.cc +++ /dev/null @@ -1,61 +0,0 @@ -/** - * @file wikipedia/fetch_extract.cc - * @brief WikipediaService::FetchExtract() implementation. 
- */ - -#include - -#include -#include -#include - -#include "services/enrichment/wikipedia_service.h" - -std::string WikipediaService::FetchExtract(std::string_view query) { - const std::string cache_key(query); - const auto cache_it = this->extract_cache_.find(cache_key); - if (cache_it != this->extract_cache_.end()) { - return cache_it->second; - } - - const std::string encoded = this->client_->UrlEncode(cache_key); - const std::string url = - "https://en.wikipedia.org/w/api.php?action=query&titles=" + encoded + - "&prop=extracts&explaintext=1&format=json"; - - const std::string body = this->client_->Get(url); - - boost::system::error_code parse_error; - boost::json::value doc = boost::json::parse(body, parse_error); - - if (!parse_error && doc.is_object()) { - try { - auto& pages = doc.at("query").at("pages").get_object(); - if (!pages.empty()) { - auto& page = pages.begin()->value().get_object(); - if (page.contains("extract") && page.at("extract").is_string()) { - const std::string_view extract_view = page.at("extract").as_string(); - std::string extract(extract_view); - - spdlog::debug("WikipediaService fetched {} chars for '{}'", - extract.size(), query); - - this->extract_cache_.emplace(cache_key, extract); - return extract; - } - } - this->extract_cache_.emplace(cache_key, std::string{}); - } catch (const std::exception& e) { - spdlog::warn( - "WikipediaService: failed to parse response structure for '{}': " - "{}", - query, e.what()); - return {}; - } - } else if (parse_error) { - spdlog::warn("WikipediaService: JSON parse error for '{}': {}", query, - parse_error.message()); - } - - return {}; -} diff --git a/tooling/pipeline/src/services/wikipedia/get_summary.cc b/tooling/pipeline/src/services/wikipedia/get_summary.cc deleted file mode 100644 index 16fc7b6..0000000 --- a/tooling/pipeline/src/services/wikipedia/get_summary.cc +++ /dev/null @@ -1,47 +0,0 @@ -/** - * @file wikipedia/get_summary.cc - * @brief WikipediaService::GetLocationContext() 
implementation. - */ - -#include - -#include - -#include "services/enrichment/wikipedia_service.h" - -std::string WikipediaService::GetLocationContext(const Location& loc) { - if (!client_) { - return {}; - } - - std::string result; - - std::string region_query(loc.city); - if (!loc.country.empty()) { - region_query += ", "; - region_query += loc.country; - } - - const std::string beer_query = "beer in " + loc.country; - const std::string city_beer_query = "beer in " + loc.city; - - auto append_extract = [&result](const std::string& extract) -> void { - if (extract.empty()) { - return; - } - if (!result.empty()) { - result += "\n\n"; - } - result += extract; - }; - - try { - append_extract(FetchExtract(region_query)); - append_extract(FetchExtract(beer_query)); - append_extract(FetchExtract(city_beer_query)); - } catch (const std::runtime_error& e) { - spdlog::debug("WikipediaService lookup failed for '{}': {}", region_query, - e.what()); - } - return result; -} diff --git a/tooling/pipeline/src/web_client/http_web_client.cc b/tooling/pipeline/src/web_client/http_web_client.cc index aba30cf..4653102 100644 --- a/tooling/pipeline/src/web_client/http_web_client.cc +++ b/tooling/pipeline/src/web_client/http_web_client.cc @@ -12,6 +12,8 @@ #include #include +#include "spdlog/spdlog.h" + namespace { constexpr time_t kConnectionTimeoutSeconds = 5; constexpr time_t kReadTimeoutSeconds = 10; @@ -38,8 +40,12 @@ std::string HttpWebClient::Get(const std::string& url) { client.set_follow_location(true); client.set_connection_timeout(kConnectionTimeoutSeconds); client.set_read_timeout(kReadTimeoutSeconds); + client.set_default_headers({ + {"Accept", "application/json"}, + {"User-Agent", "biergarten-pipeline/1.0"} + }); - const auto result = client.Get(path); + const httplib::Result result = client.Get(path); if (!result) { throw std::runtime_error( @@ -48,6 +54,7 @@ std::string HttpWebClient::Get(const std::string& url) { } if (result->status < kSuccessMin || result->status >= 
kSuccessMax) { + spdlog::error("[HttpWebClient] Request failed for URL: " + url); throw std::runtime_error( "[HttpWebClient] HTTP " + std::to_string(result->status) + " for URL: " + url); @@ -56,6 +63,6 @@ std::string HttpWebClient::Get(const std::string& url) { return result->body; } -std::string HttpWebClient::UrlEncode(const std::string& value) { +std::string HttpWebClient::EncodeURL(const std::string& value) { return httplib::encode_uri_component(value); } \ No newline at end of file