mirror of
https://github.com/aaronpo97/the-biergarten-app.git
synced 2026-05-31 17:53:59 +00:00
Create one method per file
This commit is contained in:
@@ -89,7 +89,27 @@ FetchContent_MakeAvailable(spdlog)
|
||||
# =============================================================================
|
||||
set(SOURCES
|
||||
src/main.cpp
|
||||
src/biergarten_data_generator.cpp
|
||||
# BiergartenDataGenerator methods
|
||||
src/biergarten_data_generator/constructor.cpp
|
||||
src/biergarten_data_generator/run.cpp
|
||||
src/biergarten_data_generator/initialize_generator.cpp
|
||||
src/biergarten_data_generator/query_cities_with_countries.cpp
|
||||
src/biergarten_data_generator/enrich_with_wikipedia.cpp
|
||||
src/biergarten_data_generator/generate_breweries.cpp
|
||||
src/biergarten_data_generator/log_results.cpp
|
||||
# WikipediaService methods
|
||||
src/wikipedia/constructor.cpp
|
||||
src/wikipedia/get_summary.cpp
|
||||
src/wikipedia/fetch_extract.cpp
|
||||
# CURLWebClient and CurlGlobalState methods
|
||||
src/web_client/curl_global_state_constructor.cpp
|
||||
src/web_client/curl_global_state_destructor.cpp
|
||||
src/web_client/curl_web_client_constructor.cpp
|
||||
src/web_client/curl_web_client_destructor.cpp
|
||||
src/web_client/curl_web_client_download_to_file.cpp
|
||||
src/web_client/curl_web_client_get.cpp
|
||||
src/web_client/curl_web_client_url_encode.cpp
|
||||
# Data generation modules
|
||||
src/data_generation/llama/destructor.cpp
|
||||
src/data_generation/llama/generate_brewery.cpp
|
||||
src/data_generation/llama/generate_user.cpp
|
||||
@@ -104,8 +124,6 @@ set(SOURCES
|
||||
src/data_generation/mock/generate_user.cpp
|
||||
src/data_generation/mock/load.cpp
|
||||
src/json_handling/json_loader.cpp
|
||||
src/web_client/curl_web_client.cpp
|
||||
src/wikipedia/wikipedia_service.cpp
|
||||
)
|
||||
# =============================================================================
|
||||
# 5. Target
|
||||
|
||||
@@ -27,9 +27,6 @@ struct ApplicationOptions {
|
||||
/// model_path.
|
||||
bool use_mocked = false;
|
||||
|
||||
/// @brief Directory for cached JSON and database files.
|
||||
std::string cache_dir;
|
||||
|
||||
/// @brief LLM sampling temperature (0.0 to 1.0, higher = more random).
|
||||
float temperature = 0.8f;
|
||||
|
||||
@@ -43,10 +40,6 @@ struct ApplicationOptions {
|
||||
|
||||
/// @brief Random seed for sampling (-1 for random, otherwise non-negative).
|
||||
int seed = -1;
|
||||
|
||||
/// @brief Git commit hash for database consistency (always pinned to
|
||||
/// c5eb7772).
|
||||
std::string commit = "c5eb7772";
|
||||
};
|
||||
|
||||
/**
|
||||
@@ -63,8 +56,8 @@ class BiergartenDataGenerator {
|
||||
* @param options Application configuration options.
|
||||
* @param web_client HTTP client for downloading data.
|
||||
*/
|
||||
BiergartenDataGenerator(const ApplicationOptions& options,
|
||||
std::shared_ptr<WebClient> web_client);
|
||||
BiergartenDataGenerator(ApplicationOptions options,
|
||||
std::unique_ptr<WebClient> web_client);
|
||||
|
||||
/**
|
||||
* @brief Run the data generation pipeline.
|
||||
@@ -74,9 +67,9 @@ class BiergartenDataGenerator {
|
||||
* 2. Initialize the generator (LLM or Mock)
|
||||
* 3. Generate brewery data for sampled cities
|
||||
*
|
||||
* @return 0 on success, 1 on failure.
|
||||
* @return true if successful, false if not
|
||||
*/
|
||||
int Run();
|
||||
bool Run();
|
||||
|
||||
private:
|
||||
/// @brief Immutable application options.
|
||||
@@ -100,14 +93,14 @@ class BiergartenDataGenerator {
|
||||
*
|
||||
* @return A unique_ptr to the initialized generator.
|
||||
*/
|
||||
std::unique_ptr<DataGenerator> InitializeGenerator();
|
||||
std::unique_ptr<DataGenerator> InitializeGenerator() const;
|
||||
|
||||
/**
|
||||
* @brief Load locations from JSON and sample cities.
|
||||
*
|
||||
* @return Vector of sampled locations capped at 30 entries.
|
||||
*/
|
||||
std::vector<Location> QueryCitiesWithCountries();
|
||||
static std::vector<Location> QueryCitiesWithCountries();
|
||||
|
||||
/**
|
||||
* @brief Enrich cities with Wikipedia summaries.
|
||||
|
||||
@@ -24,7 +24,7 @@ class WikipediaService {
|
||||
std::string_view country);
|
||||
|
||||
private:
|
||||
std::string FetchExtract(std::string_view query);
|
||||
std::string FetchExtract(std::string_view query) const;
|
||||
std::shared_ptr<WebClient> client_;
|
||||
std::unordered_map<std::string, std::string> cache_;
|
||||
};
|
||||
|
||||
@@ -1,168 +0,0 @@
|
||||
/**
|
||||
* @file biergarten_data_generator.cpp
|
||||
* @brief Orchestrates end-to-end pipeline execution for city sampling,
|
||||
* Wikipedia enrichment, generator initialization, and brewery result output.
|
||||
*/
|
||||
|
||||
#include "biergarten_data_generator.h"
|
||||
|
||||
#include <spdlog/spdlog.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <filesystem>
|
||||
#include <future>
|
||||
#include <iterator>
|
||||
#include <random>
|
||||
|
||||
#include "data_generation/llama_generator.h"
|
||||
#include "data_generation/mock_generator.h"
|
||||
#include "json_handling/json_loader.h"
|
||||
#include "wikipedia/wikipedia_service.h"
|
||||
|
||||
BiergartenDataGenerator::BiergartenDataGenerator(
|
||||
const ApplicationOptions& options, std::shared_ptr<WebClient> web_client)
|
||||
: options_(options), webClient_(std::move(web_client)) {}
|
||||
|
||||
auto BiergartenDataGenerator::InitializeGenerator()
|
||||
-> std::unique_ptr<DataGenerator> {
|
||||
spdlog::info("Initializing brewery generator...");
|
||||
|
||||
std::unique_ptr<DataGenerator> generator;
|
||||
if (options_.model_path.empty()) {
|
||||
generator = std::make_unique<MockGenerator>();
|
||||
spdlog::info("[Generator] Using MockGenerator (no model path provided)");
|
||||
} else {
|
||||
auto llama_generator = std::make_unique<LlamaGenerator>();
|
||||
llama_generator->SetSamplingOptions(options_.temperature, options_.top_p,
|
||||
options_.seed);
|
||||
llama_generator->SetContextSize(options_.n_ctx);
|
||||
spdlog::info(
|
||||
"[Generator] Using LlamaGenerator: {} (temperature={}, top-p={}, "
|
||||
"n_ctx={}, seed={})",
|
||||
options_.model_path, options_.temperature, options_.top_p,
|
||||
options_.n_ctx, options_.seed);
|
||||
generator = std::move(llama_generator);
|
||||
}
|
||||
generator->Load(options_.model_path);
|
||||
|
||||
return generator;
|
||||
}
|
||||
|
||||
auto BiergartenDataGenerator::QueryCitiesWithCountries()
|
||||
-> std::vector<Location> {
|
||||
spdlog::info("\n=== GEOGRAPHIC DATA OVERVIEW ===");
|
||||
|
||||
std::filesystem::path locations_path = "locations.json";
|
||||
if (!std::filesystem::exists(locations_path)) {
|
||||
const std::filesystem::path cache_path =
|
||||
std::filesystem::path(options_.cache_dir) / "locations.json";
|
||||
if (std::filesystem::exists(cache_path)) {
|
||||
locations_path = cache_path;
|
||||
}
|
||||
}
|
||||
|
||||
auto all_locations = JsonLoader::LoadLocations(locations_path.string());
|
||||
spdlog::info(" Locations available: {}", all_locations.size());
|
||||
|
||||
const size_t sample_count = std::min<size_t>(4, all_locations.size());
|
||||
std::vector<Location> sampled_locations;
|
||||
sampled_locations.reserve(sample_count);
|
||||
|
||||
std::random_device random_generator;
|
||||
std::sample(all_locations.begin(), all_locations.end(),
|
||||
std::back_inserter(sampled_locations), sample_count,
|
||||
random_generator);
|
||||
|
||||
spdlog::info(" Sampled locations: {}", sampled_locations.size());
|
||||
return sampled_locations;
|
||||
}
|
||||
|
||||
auto BiergartenDataGenerator::EnrichWithWikipedia(
|
||||
const std::vector<Location>& cities) -> std::vector<EnrichedCity> {
|
||||
std::vector<EnrichedCity> enriched;
|
||||
enriched.reserve(cities.size());
|
||||
|
||||
std::vector<std::future<EnrichedCity>> pending;
|
||||
pending.reserve(cities.size());
|
||||
|
||||
for (const auto& city : cities) {
|
||||
pending.push_back(
|
||||
std::async(std::launch::async, [web_client = webClient_, city]() {
|
||||
WikipediaService wikipedia_service(web_client);
|
||||
const std::string region_context =
|
||||
wikipedia_service.GetSummary(city.city, city.country);
|
||||
spdlog::debug("[Pipeline] Region context for {}: {}", city.city,
|
||||
region_context);
|
||||
return EnrichedCity{city, region_context};
|
||||
}));
|
||||
}
|
||||
|
||||
for (auto& task : pending) {
|
||||
enriched.push_back(task.get());
|
||||
}
|
||||
|
||||
return enriched;
|
||||
}
|
||||
|
||||
void BiergartenDataGenerator::GenerateBreweries(
|
||||
DataGenerator& generator, const std::vector<EnrichedCity>& cities) {
|
||||
spdlog::info("\n=== SAMPLE BREWERY GENERATION ===");
|
||||
generatedBreweries_.clear();
|
||||
|
||||
size_t skipped_count = 0;
|
||||
|
||||
for (const auto& enriched_city : cities) {
|
||||
try {
|
||||
auto brewery = generator.GenerateBrewery(
|
||||
enriched_city.location.city, enriched_city.location.country,
|
||||
enriched_city.region_context);
|
||||
generatedBreweries_.push_back({enriched_city.location, brewery});
|
||||
} catch (const std::exception& e) {
|
||||
++skipped_count;
|
||||
spdlog::warn(
|
||||
"[Pipeline] Skipping city '{}' ({}): brewery generation failed: "
|
||||
"{}",
|
||||
enriched_city.location.city, enriched_city.location.country,
|
||||
e.what());
|
||||
}
|
||||
}
|
||||
|
||||
if (skipped_count > 0) {
|
||||
spdlog::warn(
|
||||
"[Pipeline] Skipped {} city/cities due to generation "
|
||||
"errors",
|
||||
skipped_count);
|
||||
}
|
||||
}
|
||||
|
||||
void BiergartenDataGenerator::LogResults() const {
|
||||
spdlog::info("\n=== GENERATED DATA DUMP ===");
|
||||
size_t index = 1;
|
||||
for (const auto& entry : generatedBreweries_) {
|
||||
spdlog::info(
|
||||
"{}. city=\"{}\" country=\"{}\" state=\"{}\" "
|
||||
"iso3166_2={} lat={} lon={}",
|
||||
index, entry.location.city, entry.location.country,
|
||||
entry.location.state_province, entry.location.iso3166_2,
|
||||
entry.location.latitude, entry.location.longitude);
|
||||
spdlog::info(" brewery_name=\"{}\"", entry.brewery.name);
|
||||
spdlog::info(" brewery_description=\"{}\"", entry.brewery.description);
|
||||
++index;
|
||||
}
|
||||
}
|
||||
|
||||
auto BiergartenDataGenerator::Run() -> int {
|
||||
try {
|
||||
auto generator = InitializeGenerator();
|
||||
auto cities = QueryCitiesWithCountries();
|
||||
auto enriched = EnrichWithWikipedia(cities);
|
||||
GenerateBreweries(*generator, enriched);
|
||||
LogResults();
|
||||
|
||||
spdlog::info("\nOK: Pipeline completed successfully");
|
||||
return 0;
|
||||
} catch (const std::exception& e) {
|
||||
spdlog::error("ERROR: Pipeline failed: {}", e.what());
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
12
pipeline/src/biergarten_data_generator/constructor.cpp
Normal file
12
pipeline/src/biergarten_data_generator/constructor.cpp
Normal file
@@ -0,0 +1,12 @@
|
||||
/**
|
||||
* @file biergarten_data_generator/constructor.cpp
|
||||
* @brief BiergartenDataGenerator constructor implementation.
|
||||
*/
|
||||
|
||||
#include <utility>
|
||||
|
||||
#include "biergarten_data_generator.h"
|
||||
|
||||
BiergartenDataGenerator::BiergartenDataGenerator(
|
||||
ApplicationOptions options, std::unique_ptr<WebClient> web_client)
|
||||
: options_(std::move(options)), webClient_(std::move(web_client)) {}
|
||||
@@ -0,0 +1,69 @@
|
||||
/**
|
||||
* @file biergarten_data_generator/enrich_with_wikipedia.cpp
|
||||
* @brief BiergartenDataGenerator::EnrichWithWikipedia() implementation.
|
||||
*/
|
||||
|
||||
#include <spdlog/spdlog.h>
|
||||
|
||||
#include <atomic>
|
||||
#include <future>
|
||||
#include <optional>
|
||||
|
||||
#include "biergarten_data_generator.h"
|
||||
#include "wikipedia/wikipedia_service.h"
|
||||
|
||||
namespace {
|
||||
|
||||
auto TryGetRegionContext(const std::shared_ptr<WebClient>& web_client,
|
||||
const Location* city_ptr,
|
||||
std::atomic<size_t>* skipped_enrichment_count) noexcept
|
||||
-> std::optional<std::string> {
|
||||
try {
|
||||
WikipediaService wikipedia_service(web_client);
|
||||
return wikipedia_service.GetSummary(city_ptr->city, city_ptr->country);
|
||||
} catch (...) {
|
||||
skipped_enrichment_count->fetch_add(1, std::memory_order_relaxed);
|
||||
return std::nullopt;
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
auto BiergartenDataGenerator::EnrichWithWikipedia(
|
||||
const std::vector<Location>& cities) -> std::vector<EnrichedCity> {
|
||||
std::vector<EnrichedCity> enriched;
|
||||
enriched.reserve(cities.size());
|
||||
|
||||
std::atomic<size_t> skipped_enrichment_count = 0;
|
||||
std::vector<std::future<std::optional<std::string>>> pending;
|
||||
pending.reserve(cities.size());
|
||||
|
||||
for (const auto& city : cities) {
|
||||
const Location* city_ptr = &city;
|
||||
pending.push_back(std::async(std::launch::async, TryGetRegionContext,
|
||||
webClient_, city_ptr,
|
||||
&skipped_enrichment_count));
|
||||
}
|
||||
|
||||
auto city_it = cities.cbegin();
|
||||
for (auto& task : pending) {
|
||||
auto maybe_region_context = task.get();
|
||||
if (maybe_region_context.has_value()) {
|
||||
spdlog::debug("[Pipeline] Region context for {}: {}", city_it->city,
|
||||
*maybe_region_context);
|
||||
enriched.push_back(
|
||||
EnrichedCity{.location = *city_it,
|
||||
.region_context = std::move(*maybe_region_context)});
|
||||
}
|
||||
++city_it;
|
||||
}
|
||||
|
||||
if (skipped_enrichment_count.load(std::memory_order_relaxed) > 0) {
|
||||
spdlog::warn(
|
||||
"[Pipeline] Skipped {} city/cities due to Wikipedia enrichment "
|
||||
"errors",
|
||||
skipped_enrichment_count.load(std::memory_order_relaxed));
|
||||
}
|
||||
|
||||
return enriched;
|
||||
}
|
||||
@@ -0,0 +1,40 @@
|
||||
/**
|
||||
* @file biergarten_data_generator/generate_breweries.cpp
|
||||
* @brief BiergartenDataGenerator::GenerateBreweries() implementation.
|
||||
*/
|
||||
|
||||
#include <spdlog/spdlog.h>
|
||||
|
||||
#include "biergarten_data_generator.h"
|
||||
|
||||
void BiergartenDataGenerator::GenerateBreweries(
|
||||
DataGenerator& generator, const std::vector<EnrichedCity>& cities) {
|
||||
spdlog::info("\n=== SAMPLE BREWERY GENERATION ===");
|
||||
generatedBreweries_.clear();
|
||||
|
||||
size_t skipped_count = 0;
|
||||
|
||||
for (const auto& enriched_city : cities) {
|
||||
try {
|
||||
auto brewery = generator.GenerateBrewery(
|
||||
enriched_city.location.city, enriched_city.location.country,
|
||||
enriched_city.region_context);
|
||||
generatedBreweries_.push_back(GeneratedBrewery{
|
||||
.location = enriched_city.location, .brewery = brewery});
|
||||
} catch (const std::exception& e) {
|
||||
++skipped_count;
|
||||
spdlog::warn(
|
||||
"[Pipeline] Skipping city '{}' ({}): brewery generation failed: "
|
||||
"{}",
|
||||
enriched_city.location.city, enriched_city.location.country,
|
||||
e.what());
|
||||
}
|
||||
}
|
||||
|
||||
if (skipped_count > 0) {
|
||||
spdlog::warn(
|
||||
"[Pipeline] Skipped {} city/cities due to generation "
|
||||
"errors",
|
||||
skipped_count);
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,35 @@
|
||||
/**
|
||||
* @file biergarten_data_generator/initialize_generator.cpp
|
||||
* @brief BiergartenDataGenerator::InitializeGenerator() implementation.
|
||||
*/
|
||||
|
||||
#include <spdlog/spdlog.h>
|
||||
|
||||
#include "biergarten_data_generator.h"
|
||||
#include "data_generation/llama_generator.h"
|
||||
#include "data_generation/mock_generator.h"
|
||||
|
||||
auto BiergartenDataGenerator::InitializeGenerator() const
|
||||
-> std::unique_ptr<DataGenerator> {
|
||||
spdlog::info("Initializing brewery generator...");
|
||||
|
||||
std::unique_ptr<DataGenerator> generator;
|
||||
if (options_.model_path.empty()) {
|
||||
generator = std::make_unique<MockGenerator>();
|
||||
spdlog::info("[Generator] Using MockGenerator (no model path provided)");
|
||||
} else {
|
||||
auto llama_generator = std::make_unique<LlamaGenerator>();
|
||||
llama_generator->SetSamplingOptions(options_.temperature, options_.top_p,
|
||||
options_.seed);
|
||||
llama_generator->SetContextSize(options_.n_ctx);
|
||||
spdlog::info(
|
||||
"[Generator] Using LlamaGenerator: {} (temperature={}, top-p={}, "
|
||||
"n_ctx={}, seed={})",
|
||||
options_.model_path, options_.temperature, options_.top_p,
|
||||
options_.n_ctx, options_.seed);
|
||||
generator = std::move(llama_generator);
|
||||
}
|
||||
generator->Load(options_.model_path);
|
||||
|
||||
return generator;
|
||||
}
|
||||
23
pipeline/src/biergarten_data_generator/log_results.cpp
Normal file
23
pipeline/src/biergarten_data_generator/log_results.cpp
Normal file
@@ -0,0 +1,23 @@
|
||||
/**
|
||||
* @file biergarten_data_generator/log_results.cpp
|
||||
* @brief BiergartenDataGenerator::LogResults() implementation.
|
||||
*/
|
||||
|
||||
#include <spdlog/spdlog.h>
|
||||
|
||||
#include "biergarten_data_generator.h"
|
||||
|
||||
void BiergartenDataGenerator::LogResults() const {
|
||||
spdlog::info("\n=== GENERATED DATA DUMP ===");
|
||||
size_t index = 1;
|
||||
for (const auto& [location, brewery] : generatedBreweries_) {
|
||||
spdlog::info(
|
||||
"{}. city=\"{}\" country=\"{}\" state=\"{}\" "
|
||||
"iso3166_2={} lat={} lon={}",
|
||||
index, location.city, location.country, location.state_province,
|
||||
location.iso3166_2, location.latitude, location.longitude);
|
||||
spdlog::info(" brewery_name=\"{}\"", brewery.name);
|
||||
spdlog::info(" brewery_description=\"{}\"", brewery.description);
|
||||
++index;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,40 @@
|
||||
/**
|
||||
* @file biergarten_data_generator/query_cities_with_countries.cpp
|
||||
* @brief BiergartenDataGenerator::QueryCitiesWithCountries() implementation.
|
||||
*/
|
||||
|
||||
#include <spdlog/spdlog.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <filesystem>
|
||||
#include <random>
|
||||
|
||||
#include "biergarten_data_generator.h"
|
||||
#include "json_handling/json_loader.h"
|
||||
|
||||
static constexpr unsigned int brewery_amount = 4;
|
||||
|
||||
auto BiergartenDataGenerator::QueryCitiesWithCountries()
|
||||
-> std::vector<Location> {
|
||||
spdlog::info("\n=== GEOGRAPHIC DATA OVERVIEW ===");
|
||||
|
||||
const std::filesystem::path locations_path = "locations.json";
|
||||
|
||||
auto all_locations = JsonLoader::LoadLocations(locations_path.string());
|
||||
spdlog::info(" Locations available: {}", all_locations.size());
|
||||
|
||||
const size_t sample_count =
|
||||
std::min<size_t>(brewery_amount, all_locations.size());
|
||||
const auto sample_count_signed =
|
||||
static_cast<std::iter_difference_t<decltype(all_locations.cbegin())>>(
|
||||
sample_count);
|
||||
std::vector<Location> sampled_locations;
|
||||
sampled_locations.reserve(sample_count);
|
||||
|
||||
std::random_device random_generator;
|
||||
std::ranges::sample(all_locations, std::back_inserter(sampled_locations),
|
||||
sample_count_signed, random_generator);
|
||||
|
||||
spdlog::info(" Sampled locations: {}", sampled_locations.size());
|
||||
return sampled_locations;
|
||||
}
|
||||
22
pipeline/src/biergarten_data_generator/run.cpp
Normal file
22
pipeline/src/biergarten_data_generator/run.cpp
Normal file
@@ -0,0 +1,22 @@
|
||||
/**
|
||||
* @file biergarten_data_generator/run.cpp
|
||||
* @brief BiergartenDataGenerator::Run() implementation.
|
||||
*/
|
||||
|
||||
#include <spdlog/spdlog.h>
|
||||
|
||||
#include "biergarten_data_generator.h"
|
||||
|
||||
auto BiergartenDataGenerator::Run() -> bool {
|
||||
try {
|
||||
const std::unique_ptr<DataGenerator> generator = InitializeGenerator();
|
||||
const std::vector<Location> cities = QueryCitiesWithCountries();
|
||||
const std::vector<EnrichedCity> enriched = EnrichWithWikipedia(cities);
|
||||
this->GenerateBreweries(*generator, enriched);
|
||||
this->LogResults();
|
||||
return true;
|
||||
} catch (const std::exception& e) {
|
||||
spdlog::error("Pipeline execution failed with error: {}", e.what());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@@ -7,13 +7,14 @@
|
||||
#include <spdlog/spdlog.h>
|
||||
|
||||
#include <boost/program_options.hpp>
|
||||
#include <iostream>
|
||||
#include <exception>
|
||||
#include <memory>
|
||||
#include <string>
|
||||
|
||||
#include "biergarten_data_generator.h"
|
||||
#include "web_client/curl_web_client.h"
|
||||
|
||||
namespace po = boost::program_options;
|
||||
namespace prog_opts = boost::program_options;
|
||||
|
||||
/**
|
||||
* @brief Parse command-line arguments into ApplicationOptions.
|
||||
@@ -23,123 +24,119 @@ namespace po = boost::program_options;
|
||||
* @param options Output ApplicationOptions struct.
|
||||
* @return true if parsing succeeded and should proceed, false otherwise.
|
||||
*/
|
||||
bool ParseArguments(int argc, char** argv, ApplicationOptions& options) {
|
||||
// If no arguments provided, display usage and exit
|
||||
if (argc == 1) {
|
||||
std::cout << "Biergarten Pipeline - Geographic Data Pipeline with "
|
||||
"Brewery Generation\n\n";
|
||||
std::cout << "Usage: biergarten-pipeline [options]\n\n";
|
||||
std::cout << "Options:\n";
|
||||
std::cout << " --mocked Use mocked generator for "
|
||||
"brewery/user data\n";
|
||||
std::cout << " --model, -m PATH Path to LLM model file (gguf) for "
|
||||
"generation\n";
|
||||
std::cout << " --cache-dir, -c DIR Directory for cached JSON (default: "
|
||||
"/tmp)\n";
|
||||
std::cout << " --temperature TEMP LLM sampling temperature 0.0-1.0 "
|
||||
"(default: 0.8)\n";
|
||||
std::cout << " --top-p VALUE Nucleus sampling parameter 0.0-1.0 "
|
||||
"(default: 0.92)\n";
|
||||
std::cout << " --n-ctx SIZE Context window size in tokens "
|
||||
"(default: 4096)\n";
|
||||
std::cout << " --seed SEED Random seed: -1 for random "
|
||||
"(default: -1)\n";
|
||||
std::cout << " --help, -h Show this help message\n\n";
|
||||
std::cout << "Note: --mocked and --model are mutually exclusive. Exactly "
|
||||
"one must be provided.\n";
|
||||
return false;
|
||||
}
|
||||
|
||||
po::options_description desc("Pipeline Options");
|
||||
desc.add_options()("help,h", "Produce help message")(
|
||||
"mocked", po::bool_switch(),
|
||||
"Use mocked generator for brewery/user data")(
|
||||
"model,m", po::value<std::string>()->default_value(""),
|
||||
"Path to LLM model (gguf)")(
|
||||
"cache-dir,c", po::value<std::string>()->default_value("/tmp"),
|
||||
"Directory for cached JSON")(
|
||||
"temperature", po::value<float>()->default_value(0.8f),
|
||||
"Sampling temperature (higher = more random)")(
|
||||
"top-p", po::value<float>()->default_value(0.92f),
|
||||
"Nucleus sampling top-p in (0,1] (higher = more random)")(
|
||||
"n-ctx", po::value<uint32_t>()->default_value(8192),
|
||||
"Context window size in tokens (1-32768)")(
|
||||
"seed", po::value<int>()->default_value(-1),
|
||||
auto ParseArguments(const int argc, char** argv,
|
||||
ApplicationOptions& options) noexcept -> bool {
|
||||
prog_opts::options_description desc("Pipeline Options");
|
||||
desc.add_options()
|
||||
("help,h", "Produce help message")
|
||||
("mocked",
|
||||
prog_opts::bool_switch(),
|
||||
"Use mocked generator for brewery/user data")
|
||||
("model,m",
|
||||
prog_opts::value<std::string>()->default_value(""),
|
||||
"Path to LLM model (gguf)")
|
||||
("temperature",
|
||||
prog_opts::value<float>()->default_value(0.8f),
|
||||
"Sampling temperature (higher = more random)")
|
||||
("top-p",
|
||||
prog_opts::value<float>()->default_value(0.92f),
|
||||
"Nucleus sampling top-p in (0,1] (higher = more random)")
|
||||
("n-ctx",
|
||||
prog_opts::value<uint32_t>()->default_value(8192),
|
||||
"Context window size in tokens (1-32768)")
|
||||
("seed",
|
||||
prog_opts::value<int>()->default_value(-1),
|
||||
"Sampler seed: -1 for random, otherwise non-negative integer");
|
||||
|
||||
po::variables_map vm;
|
||||
po::store(po::parse_command_line(argc, argv, desc), vm);
|
||||
po::notify(vm);
|
||||
|
||||
if (vm.count("help")) {
|
||||
std::cout << desc << "\n";
|
||||
// Handle the "no arguments" or "help" case
|
||||
if (argc == 1) {
|
||||
spdlog::info("Biergarten Pipeline");
|
||||
std::stringstream ss;
|
||||
ss << "\nUsage: biergarten-pipeline [options]\n\n" << desc;
|
||||
spdlog::info(ss.str());
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check for mutually exclusive --mocked and --model flags
|
||||
bool use_mocked = vm["mocked"].as<bool>();
|
||||
std::string model_path = vm["model"].as<std::string>();
|
||||
try {
|
||||
prog_opts::variables_map vm;
|
||||
prog_opts::store(prog_opts::parse_command_line(argc, argv, desc), vm);
|
||||
prog_opts::notify(vm);
|
||||
|
||||
if (vm.contains("help")) {
|
||||
std::stringstream ss;
|
||||
ss << "\n" << desc;
|
||||
spdlog::info(ss.str());
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto use_mocked = vm["mocked"].as<bool>();
|
||||
const auto model_path = vm["model"].as<std::string>();
|
||||
|
||||
if (use_mocked && !model_path.empty()) {
|
||||
spdlog::error("ERROR: --mocked and --model are mutually exclusive");
|
||||
spdlog::error(
|
||||
"Invalid arguments: --mocked and --model are mutually exclusive");
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!use_mocked && model_path.empty()) {
|
||||
spdlog::error("ERROR: Either --mocked or --model must be specified");
|
||||
spdlog::error(
|
||||
"Invalid arguments: Either --mocked or --model must be specified");
|
||||
return false;
|
||||
}
|
||||
|
||||
// Warn if sampling parameters are provided with --mocked
|
||||
if (use_mocked) {
|
||||
bool hasTemperature = vm["temperature"].defaulted() == false;
|
||||
bool hasTopP = vm["top-p"].defaulted() == false;
|
||||
bool hasSeed = vm["seed"].defaulted() == false;
|
||||
const bool has_llm_params = !vm["temperature"].defaulted() ||
|
||||
!vm["top-p"].defaulted() ||
|
||||
!vm["seed"].defaulted();
|
||||
|
||||
if (hasTemperature || hasTopP || hasSeed) {
|
||||
if (use_mocked && has_llm_params) {
|
||||
spdlog::warn(
|
||||
"WARNING: Sampling parameters (--temperature, --top-p, --seed) "
|
||||
"are ignored when using --mocked");
|
||||
}
|
||||
"Sampling parameters (--temperature, --top-p, --seed) are"
|
||||
" ignored when using --mocked");
|
||||
}
|
||||
|
||||
options.use_mocked = use_mocked;
|
||||
options.model_path = model_path;
|
||||
options.cache_dir = vm["cache-dir"].as<std::string>();
|
||||
options.temperature = vm["temperature"].as<float>();
|
||||
options.top_p = vm["top-p"].as<float>();
|
||||
options.n_ctx = vm["n-ctx"].as<uint32_t>();
|
||||
options.seed = vm["seed"].as<int>();
|
||||
// commit is always pinned to c5eb7772
|
||||
|
||||
return true;
|
||||
} catch (const std::exception& exception) {
|
||||
spdlog::error("Failed to parse command-line arguments: {}",
|
||||
exception.what());
|
||||
return false;
|
||||
} catch (...) {
|
||||
spdlog::error("Failed to parse command-line arguments: unknown error");
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[]) {
|
||||
auto main(const int argc, char** argv) noexcept -> int {
|
||||
try {
|
||||
const CurlGlobalState curl_state;
|
||||
spdlog::set_pattern("[%Y-%m-%d %H:%M:%S.%e] [%^%l%$] %v");
|
||||
|
||||
ApplicationOptions options;
|
||||
if (!ParseArguments(argc, argv, options)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
auto webClient = std::make_shared<CURLWebClient>();
|
||||
auto webClient = std::make_unique<CURLWebClient>();
|
||||
BiergartenDataGenerator generator(options, std::move(webClient));
|
||||
|
||||
BiergartenDataGenerator generator(options, webClient);
|
||||
return generator.Run();
|
||||
|
||||
} catch (const std::exception& e) {
|
||||
const std::string message = e.what() ? e.what() : "";
|
||||
|
||||
if (message.find("LlamaGenerator: malformed brewery response") !=
|
||||
std::string::npos) {
|
||||
spdlog::warn("WARNING: Non-fatal LLM failure after retries: {}",
|
||||
message);
|
||||
return 0;
|
||||
if (!generator.Run()) {
|
||||
spdlog::error("Pipeline execution failed");
|
||||
return 1;
|
||||
}
|
||||
|
||||
spdlog::error("ERROR: Application failed: {}", e.what());
|
||||
spdlog::info("Pipeline executed successfully");
|
||||
return 0;
|
||||
} catch (const std::exception& exception) {
|
||||
spdlog::critical("Unhandled fatal error in main: {}", exception.what());
|
||||
return 1;
|
||||
} catch (...) {
|
||||
spdlog::critical("Unhandled fatal non-standard exception in main");
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
17
pipeline/src/web_client/curl_global_state_constructor.cpp
Normal file
17
pipeline/src/web_client/curl_global_state_constructor.cpp
Normal file
@@ -0,0 +1,17 @@
|
||||
/**
|
||||
* @file web_client/curl_global_state_constructor.cpp
|
||||
* @brief CurlGlobalState constructor implementation.
|
||||
*/
|
||||
|
||||
#include <curl/curl.h>
|
||||
|
||||
#include <stdexcept>
|
||||
|
||||
#include "web_client/curl_web_client.h"
|
||||
|
||||
CurlGlobalState::CurlGlobalState() {
|
||||
if (curl_global_init(CURL_GLOBAL_DEFAULT) != CURLE_OK) {
|
||||
throw std::runtime_error(
|
||||
"[CURLWebClient] Failed to initialize libcurl globally");
|
||||
}
|
||||
}
|
||||
10
pipeline/src/web_client/curl_global_state_destructor.cpp
Normal file
10
pipeline/src/web_client/curl_global_state_destructor.cpp
Normal file
@@ -0,0 +1,10 @@
|
||||
/**
|
||||
* @file web_client/curl_global_state_destructor.cpp
|
||||
* @brief CurlGlobalState destructor implementation.
|
||||
*/
|
||||
|
||||
#include <curl/curl.h>
|
||||
|
||||
#include "web_client/curl_web_client.h"
|
||||
|
||||
CurlGlobalState::~CurlGlobalState() { curl_global_cleanup(); }
|
||||
8
pipeline/src/web_client/curl_web_client_constructor.cpp
Normal file
8
pipeline/src/web_client/curl_web_client_constructor.cpp
Normal file
@@ -0,0 +1,8 @@
|
||||
/**
|
||||
* @file web_client/curl_web_client_constructor.cpp
|
||||
* @brief CURLWebClient constructor implementation.
|
||||
*/
|
||||
|
||||
#include "web_client/curl_web_client.h"
|
||||
|
||||
CURLWebClient::CURLWebClient() {}
|
||||
8
pipeline/src/web_client/curl_web_client_destructor.cpp
Normal file
8
pipeline/src/web_client/curl_web_client_destructor.cpp
Normal file
@@ -0,0 +1,8 @@
|
||||
/**
|
||||
* @file web_client/curl_web_client_destructor.cpp
|
||||
* @brief CURLWebClient destructor implementation.
|
||||
*/
|
||||
|
||||
#include "web_client/curl_web_client.h"
|
||||
|
||||
CURLWebClient::~CURLWebClient() {}
|
||||
@@ -1,11 +1,8 @@
|
||||
/**
|
||||
* @file web_client/curl_web_client.cpp
|
||||
* @brief Implements libcurl-backed HTTP utilities, including GET requests,
|
||||
* file downloads, URL encoding, and RAII global curl lifecycle handling.
|
||||
* @file web_client/curl_web_client_download_to_file.cpp
|
||||
* @brief CURLWebClient::DownloadToFile() implementation.
|
||||
*/
|
||||
|
||||
#include "web_client/curl_web_client.h"
|
||||
|
||||
#include <curl/curl.h>
|
||||
|
||||
#include <cstdio>
|
||||
@@ -14,34 +11,9 @@
|
||||
#include <sstream>
|
||||
#include <stdexcept>
|
||||
|
||||
CurlGlobalState::CurlGlobalState() {
|
||||
if (curl_global_init(CURL_GLOBAL_DEFAULT) != CURLE_OK) {
|
||||
throw std::runtime_error(
|
||||
"[CURLWebClient] Failed to initialize libcurl globally");
|
||||
}
|
||||
}
|
||||
|
||||
CurlGlobalState::~CurlGlobalState() { curl_global_cleanup(); }
|
||||
#include "web_client/curl_web_client.h"
|
||||
|
||||
namespace {
|
||||
// curl write callback that appends response data into a std::string
|
||||
size_t WriteCallbackString(void* contents, size_t size, size_t nmemb,
|
||||
void* userp) {
|
||||
size_t realsize = size * nmemb;
|
||||
auto* s = static_cast<std::string*>(userp);
|
||||
s->append(static_cast<char*>(contents), realsize);
|
||||
return realsize;
|
||||
}
|
||||
|
||||
// curl write callback that writes to a file stream
|
||||
size_t WriteCallbackFile(void* contents, size_t size, size_t nmemb,
|
||||
void* userp) {
|
||||
size_t realsize = size * nmemb;
|
||||
auto* outFile = static_cast<std::ofstream*>(userp);
|
||||
outFile->write(static_cast<char*>(contents), realsize);
|
||||
return realsize;
|
||||
}
|
||||
|
||||
// RAII wrapper for CURL handle using unique_ptr
|
||||
using CurlHandle = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
|
||||
|
||||
@@ -64,12 +36,17 @@ void set_common_get_options(CURL* curl, const std::string& url,
|
||||
curl_easy_setopt(curl, CURLOPT_TIMEOUT, total_timeout);
|
||||
curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, "gzip");
|
||||
}
|
||||
|
||||
// curl write callback that writes to a file stream.
//
// `userp` must point to the std::ofstream registered through
// CURLOPT_WRITEDATA. Returns the number of bytes consumed (size * nmemb) on
// success. If the stream is in a failed state after the write (unopened file,
// disk full, I/O error) it returns 0, which differs from the requested size
// and makes libcurl abort the transfer instead of silently dropping data.
size_t WriteCallbackFile(void* contents, size_t size, size_t nmemb,
                         void* userp) {
  const size_t realsize = size * nmemb;
  auto* outFile = static_cast<std::ofstream*>(userp);
  outFile->write(static_cast<const char*>(contents),
                 static_cast<std::streamsize>(realsize));
  if (!*outFile) {
    return 0;
  }
  return realsize;
}
|
||||
} // namespace
|
||||
|
||||
// The constructor performs no work of its own; members are
// default-initialised.
CURLWebClient::CURLWebClient() = default;
|
||||
|
||||
// The destructor has no work of its own; it is defined out of line and left
// to the compiler-generated behaviour.
CURLWebClient::~CURLWebClient() = default;
|
||||
|
||||
void CURLWebClient::DownloadToFile(const std::string& url,
|
||||
const std::string& file_path) {
|
||||
auto curl = create_handle();
|
||||
@@ -105,43 +82,3 @@ void CURLWebClient::DownloadToFile(const std::string& url,
|
||||
throw std::runtime_error(ss.str());
|
||||
}
|
||||
}
|
||||
|
||||
std::string CURLWebClient::Get(const std::string& url) {
|
||||
auto curl = create_handle();
|
||||
|
||||
std::string response_string;
|
||||
set_common_get_options(curl.get(), url, 10L, 20L);
|
||||
curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, WriteCallbackString);
|
||||
curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &response_string);
|
||||
|
||||
CURLcode res = curl_easy_perform(curl.get());
|
||||
|
||||
if (res != CURLE_OK) {
|
||||
std::string error =
|
||||
std::string("[CURLWebClient] GET failed: ") + curl_easy_strerror(res);
|
||||
throw std::runtime_error(error);
|
||||
}
|
||||
|
||||
long httpCode = 0;
|
||||
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &httpCode);
|
||||
|
||||
if (httpCode != 200) {
|
||||
std::stringstream ss;
|
||||
ss << "[CURLWebClient] HTTP error " << httpCode << " for URL " << url;
|
||||
throw std::runtime_error(ss.str());
|
||||
}
|
||||
|
||||
return response_string;
|
||||
}
|
||||
|
||||
std::string CURLWebClient::UrlEncode(const std::string& value) {
|
||||
// A NULL handle is fine for UTF-8 encoding according to libcurl docs.
|
||||
char* output = curl_easy_escape(nullptr, value.c_str(), 0);
|
||||
|
||||
if (output) {
|
||||
std::string result(output);
|
||||
curl_free(output);
|
||||
return result;
|
||||
}
|
||||
throw std::runtime_error("[CURLWebClient] curl_easy_escape failed");
|
||||
}
|
||||
75
pipeline/src/web_client/curl_web_client_get.cpp
Normal file
75
pipeline/src/web_client/curl_web_client_get.cpp
Normal file
@@ -0,0 +1,75 @@
|
||||
/**
|
||||
* @file web_client/curl_web_client_get.cpp
|
||||
* @brief CURLWebClient::Get() implementation.
|
||||
*/
|
||||
|
||||
#include <curl/curl.h>
|
||||
|
||||
#include <memory>
|
||||
#include <sstream>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
|
||||
#include "web_client/curl_web_client.h"
|
||||
|
||||
namespace {
|
||||
// RAII wrapper for CURL handle using unique_ptr
|
||||
using CurlHandle = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
|
||||
|
||||
// Creates an easy handle owned by a CurlHandle whose deleter is
// curl_easy_cleanup. Throws std::runtime_error if curl_easy_init() returns
// null.
CurlHandle create_handle() {
  CurlHandle owned(curl_easy_init(), &curl_easy_cleanup);
  if (owned == nullptr) {
    throw std::runtime_error(
        "[CURLWebClient] Failed to initialize libcurl handle");
  }
  return owned;
}
|
||||
|
||||
// Applies the options shared by GET-style transfers to `curl`: target URL,
// user agent, redirect policy, gzip content encoding and the two timeouts
// supplied by the caller.
void set_common_get_options(CURL* curl, const std::string& url,
                            long connect_timeout, long total_timeout) {
  // Request target and client identification.
  curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
  curl_easy_setopt(curl, CURLOPT_USERAGENT, "biergarten-pipeline/0.1.0");
  curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, "gzip");

  // Follow redirects, capped at five hops.
  curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
  curl_easy_setopt(curl, CURLOPT_MAXREDIRS, 5L);

  // Caller-supplied limits for connecting and for the whole transfer.
  curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, connect_timeout);
  curl_easy_setopt(curl, CURLOPT_TIMEOUT, total_timeout);
}
|
||||
|
||||
// curl write callback that appends response data into a std::string.
//
// `userp` must point to the std::string registered through
// CURLOPT_WRITEDATA. Returns the number of bytes consumed (size * nmemb).
// Returning a different value makes libcurl abort the transfer with a write
// error, which is how an append failure is reported: a C++ exception must not
// unwind through libcurl's C call frames.
size_t WriteCallbackString(void* contents, size_t size, size_t nmemb,
                           void* userp) {
  const size_t realsize = size * nmemb;
  if (realsize == 0) {
    return 0;  // nothing to append; 0 == realsize, so this is not an error
  }
  auto* s = static_cast<std::string*>(userp);
  try {
    s->append(static_cast<const char*>(contents), realsize);
  } catch (...) {
    // Allocation/length failure: signal a short write instead of throwing.
    return 0;
  }
  return realsize;
}
|
||||
} // namespace
|
||||
|
||||
std::string CURLWebClient::Get(const std::string& url) {
|
||||
auto curl = create_handle();
|
||||
|
||||
std::string response_string;
|
||||
set_common_get_options(curl.get(), url, 10L, 20L);
|
||||
curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, WriteCallbackString);
|
||||
curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &response_string);
|
||||
|
||||
CURLcode res = curl_easy_perform(curl.get());
|
||||
|
||||
if (res != CURLE_OK) {
|
||||
std::string error =
|
||||
std::string("[CURLWebClient] GET failed: ") + curl_easy_strerror(res);
|
||||
throw std::runtime_error(error);
|
||||
}
|
||||
|
||||
long httpCode = 0;
|
||||
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &httpCode);
|
||||
|
||||
if (httpCode != 200) {
|
||||
std::stringstream ss;
|
||||
ss << "[CURLWebClient] HTTP error " << httpCode << " for URL " << url;
|
||||
throw std::runtime_error(ss.str());
|
||||
}
|
||||
|
||||
return response_string;
|
||||
}
|
||||
23
pipeline/src/web_client/curl_web_client_url_encode.cpp
Normal file
23
pipeline/src/web_client/curl_web_client_url_encode.cpp
Normal file
@@ -0,0 +1,23 @@
|
||||
/**
|
||||
* @file web_client/curl_web_client_url_encode.cpp
|
||||
* @brief CURLWebClient::UrlEncode() implementation.
|
||||
*/
|
||||
|
||||
#include <curl/curl.h>
|
||||
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
|
||||
#include "web_client/curl_web_client.h"
|
||||
|
||||
std::string CURLWebClient::UrlEncode(const std::string& value) {
|
||||
// A NULL handle is fine for UTF-8 encoding according to libcurl docs.
|
||||
char* output = curl_easy_escape(nullptr, value.c_str(), 0);
|
||||
|
||||
if (output) {
|
||||
std::string result(output);
|
||||
curl_free(output);
|
||||
return result;
|
||||
}
|
||||
throw std::runtime_error("[CURLWebClient] curl_easy_escape failed");
|
||||
}
|
||||
11
pipeline/src/wikipedia/constructor.cpp
Normal file
11
pipeline/src/wikipedia/constructor.cpp
Normal file
@@ -0,0 +1,11 @@
|
||||
/**
|
||||
* @file wikipedia/constructor.cpp
|
||||
* @brief WikipediaService constructor implementation.
|
||||
*/
|
||||
|
||||
#include <utility>
|
||||
|
||||
#include "wikipedia/wikipedia_service.h"
|
||||
|
||||
WikipediaService::WikipediaService(std::shared_ptr<WebClient> client)
|
||||
: client_(std::move(client)) {}
|
||||
51
pipeline/src/wikipedia/fetch_extract.cpp
Normal file
51
pipeline/src/wikipedia/fetch_extract.cpp
Normal file
@@ -0,0 +1,51 @@
|
||||
/**
|
||||
* @file wikipedia/fetch_extract.cpp
|
||||
* @brief WikipediaService::FetchExtract() implementation.
|
||||
*/
|
||||
|
||||
#include <spdlog/spdlog.h>
|
||||
|
||||
#include <boost/json.hpp>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
|
||||
#include "wikipedia/wikipedia_service.h"
|
||||
|
||||
auto WikipediaService::FetchExtract(std::string_view query) const
|
||||
-> std::string {
|
||||
const std::string encoded = client_->UrlEncode(std::string(query));
|
||||
const std::string url =
|
||||
"https://en.wikipedia.org/w/api.php?action=query&titles=" + encoded +
|
||||
"&prop=extracts&explaintext=1&format=json";
|
||||
|
||||
const std::string body = client_->Get(url);
|
||||
|
||||
boost::system::error_code ec;
|
||||
boost::json::value doc = boost::json::parse(body, ec);
|
||||
|
||||
if (!ec && doc.is_object()) {
|
||||
try {
|
||||
auto& pages = doc.at("query").at("pages").get_object();
|
||||
if (!pages.empty()) {
|
||||
auto& page = pages.begin()->value().get_object();
|
||||
if (page.contains("extract") && page.at("extract").is_string()) {
|
||||
std::string extract(page.at("extract").as_string().c_str());
|
||||
spdlog::debug("WikipediaService fetched {} chars for '{}'",
|
||||
extract.size(), query);
|
||||
return extract;
|
||||
}
|
||||
}
|
||||
} catch (const std::exception& e) {
|
||||
spdlog::warn(
|
||||
"WikipediaService: failed to parse response structure for '{}': "
|
||||
"{}",
|
||||
query, e.what());
|
||||
return {};
|
||||
}
|
||||
} else if (ec) {
|
||||
spdlog::warn("WikipediaService: JSON parse error for '{}': {}", query,
|
||||
ec.message());
|
||||
}
|
||||
|
||||
return {};
|
||||
}
|
||||
55
pipeline/src/wikipedia/get_summary.cpp
Normal file
55
pipeline/src/wikipedia/get_summary.cpp
Normal file
@@ -0,0 +1,55 @@
|
||||
/**
|
||||
* @file wikipedia/get_summary.cpp
|
||||
* @brief WikipediaService::GetSummary() implementation.
|
||||
*/
|
||||
|
||||
#include <spdlog/spdlog.h>
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "wikipedia/wikipedia_service.h"
|
||||
|
||||
auto WikipediaService::GetSummary(std::string_view city,
|
||||
std::string_view country) -> std::string {
|
||||
const std::string key = std::string(city) + "|" + std::string(country);
|
||||
const auto cacheIt = cache_.find(key);
|
||||
if (cacheIt != cache_.end()) {
|
||||
return cacheIt->second;
|
||||
}
|
||||
|
||||
std::string result;
|
||||
|
||||
if (!client_) {
|
||||
cache_.emplace(key, result);
|
||||
return result;
|
||||
}
|
||||
|
||||
std::string regionQuery(city);
|
||||
if (!country.empty()) {
|
||||
regionQuery += ", ";
|
||||
regionQuery += country;
|
||||
}
|
||||
|
||||
const std::string beerQuery = "beer in " + std::string(country);
|
||||
|
||||
try {
|
||||
const std::string regionExtract = FetchExtract(regionQuery);
|
||||
const std::string beerExtract = FetchExtract(beerQuery);
|
||||
|
||||
if (!regionExtract.empty()) {
|
||||
result += regionExtract;
|
||||
}
|
||||
if (!beerExtract.empty()) {
|
||||
if (!result.empty()) {
|
||||
result += "\n\n";
|
||||
}
|
||||
result += beerExtract;
|
||||
}
|
||||
} catch (const std::runtime_error& e) {
|
||||
spdlog::debug("WikipediaService lookup failed for '{}': {}", regionQuery,
|
||||
e.what());
|
||||
}
|
||||
|
||||
cache_.emplace(key, result);
|
||||
return result;
|
||||
}
|
||||
@@ -1,95 +0,0 @@
|
||||
/**
|
||||
* @file wikipedia/wikipedia_service.cpp
|
||||
* @brief Implements Wikipedia extract retrieval and caching for city/country
|
||||
* queries, including response parsing and resilient error handling.
|
||||
*/
|
||||
|
||||
#include "wikipedia/wikipedia_service.h"
|
||||
|
||||
#include <spdlog/spdlog.h>
|
||||
|
||||
#include <boost/json.hpp>
|
||||
|
||||
WikipediaService::WikipediaService(std::shared_ptr<WebClient> client)
|
||||
: client_(std::move(client)) {}
|
||||
|
||||
std::string WikipediaService::FetchExtract(std::string_view query) {
|
||||
const std::string encoded = client_->UrlEncode(std::string(query));
|
||||
const std::string url =
|
||||
"https://en.wikipedia.org/w/api.php?action=query&titles=" + encoded +
|
||||
"&prop=extracts&explaintext=1&format=json";
|
||||
|
||||
const std::string body = client_->Get(url);
|
||||
|
||||
boost::system::error_code ec;
|
||||
boost::json::value doc = boost::json::parse(body, ec);
|
||||
|
||||
if (!ec && doc.is_object()) {
|
||||
try {
|
||||
auto& pages = doc.at("query").at("pages").get_object();
|
||||
if (!pages.empty()) {
|
||||
auto& page = pages.begin()->value().get_object();
|
||||
if (page.contains("extract") && page.at("extract").is_string()) {
|
||||
std::string extract(page.at("extract").as_string().c_str());
|
||||
spdlog::debug("WikipediaService fetched {} chars for '{}'",
|
||||
extract.size(), query);
|
||||
return extract;
|
||||
}
|
||||
}
|
||||
} catch (const std::exception& e) {
|
||||
spdlog::warn(
|
||||
"WikipediaService: failed to parse response structure for '{}': "
|
||||
"{}",
|
||||
query, e.what());
|
||||
return {};
|
||||
}
|
||||
} else if (ec) {
|
||||
spdlog::warn("WikipediaService: JSON parse error for '{}': {}", query,
|
||||
ec.message());
|
||||
}
|
||||
|
||||
return {};
|
||||
}
|
||||
|
||||
std::string WikipediaService::GetSummary(std::string_view city,
|
||||
std::string_view country) {
|
||||
const std::string key = std::string(city) + "|" + std::string(country);
|
||||
const auto cacheIt = cache_.find(key);
|
||||
if (cacheIt != cache_.end()) {
|
||||
return cacheIt->second;
|
||||
}
|
||||
|
||||
std::string result;
|
||||
|
||||
if (!client_) {
|
||||
cache_.emplace(key, result);
|
||||
return result;
|
||||
}
|
||||
|
||||
std::string regionQuery(city);
|
||||
if (!country.empty()) {
|
||||
regionQuery += ", ";
|
||||
regionQuery += country;
|
||||
}
|
||||
|
||||
const std::string beerQuery = "beer in " + std::string(country);
|
||||
|
||||
try {
|
||||
const std::string regionExtract = FetchExtract(regionQuery);
|
||||
const std::string beerExtract = FetchExtract(beerQuery);
|
||||
|
||||
if (!regionExtract.empty()) {
|
||||
result += regionExtract;
|
||||
}
|
||||
if (!beerExtract.empty()) {
|
||||
if (!result.empty()) result += "\n\n";
|
||||
result += beerExtract;
|
||||
}
|
||||
} catch (const std::runtime_error& e) {
|
||||
spdlog::debug("WikipediaService lookup failed for '{}': {}", regionQuery,
|
||||
e.what());
|
||||
}
|
||||
|
||||
cache_.emplace(key, result);
|
||||
return result;
|
||||
}
|
||||
Reference in New Issue
Block a user