Create one method per file

This commit is contained in:
Aaron Po
2026-04-09 17:19:04 -04:00
parent b31be494d7
commit d7a31b5264
23 changed files with 635 additions and 454 deletions

View File

@@ -1,168 +0,0 @@
/**
* @file biergarten_data_generator.cpp
* @brief Orchestrates end-to-end pipeline execution for city sampling,
* Wikipedia enrichment, generator initialization, and brewery result output.
*/
#include "biergarten_data_generator.h"
#include <spdlog/spdlog.h>
#include <algorithm>
#include <filesystem>
#include <future>
#include <iterator>
#include <random>
#include "data_generation/llama_generator.h"
#include "data_generation/mock_generator.h"
#include "json_handling/json_loader.h"
#include "wikipedia/wikipedia_service.h"
/**
 * @brief Builds the pipeline orchestrator.
 * @param options Application options used to initialize options_.
 * @param web_client Shared HTTP client; moved into webClient_.
 */
BiergartenDataGenerator::BiergartenDataGenerator(
    const ApplicationOptions& options, std::shared_ptr<WebClient> web_client)
    : options_(options),
      webClient_(std::move(web_client)) {
}
/**
 * @brief Creates and loads the brewery generator selected by options_.
 *
 * An empty options_.model_path selects MockGenerator. Otherwise a
 * LlamaGenerator is configured with the sampling options and context size
 * taken from options_. In both cases Load() is called with
 * options_.model_path before the generator is returned.
 *
 * @return The loaded generator.
 */
auto BiergartenDataGenerator::InitializeGenerator()
    -> std::unique_ptr<DataGenerator> {
  spdlog::info("Initializing brewery generator...");

  const bool has_model = !options_.model_path.empty();
  std::unique_ptr<DataGenerator> result;

  if (has_model) {
    auto llama = std::make_unique<LlamaGenerator>();
    llama->SetSamplingOptions(options_.temperature, options_.top_p,
                              options_.seed);
    llama->SetContextSize(options_.n_ctx);
    spdlog::info(
        "[Generator] Using LlamaGenerator: {} (temperature={}, top-p={}, "
        "n_ctx={}, seed={})",
        options_.model_path, options_.temperature, options_.top_p,
        options_.n_ctx, options_.seed);
    result = std::move(llama);
  } else {
    result = std::make_unique<MockGenerator>();
    spdlog::info("[Generator] Using MockGenerator (no model path provided)");
  }

  // Load() runs for both generator kinds, with an empty path for the mock.
  result->Load(options_.model_path);
  return result;
}
/**
 * @brief Loads the known locations and returns a small random sample.
 *
 * Reads "locations.json" from the working directory; when that file does not
 * exist and a copy exists under options_.cache_dir, the cached copy is read
 * instead. At most four locations are sampled without replacement.
 *
 * @return The sampled locations (fewer than four when fewer are available).
 */
auto BiergartenDataGenerator::QueryCitiesWithCountries()
    -> std::vector<Location> {
  // Upper bound on how many locations are sampled per run.
  constexpr size_t kMaxSampledLocations = 4;

  spdlog::info("\n=== GEOGRAPHIC DATA OVERVIEW ===");

  std::filesystem::path locations_path = "locations.json";
  if (!std::filesystem::exists(locations_path)) {
    const std::filesystem::path cache_path =
        std::filesystem::path(options_.cache_dir) / "locations.json";
    if (std::filesystem::exists(cache_path)) {
      locations_path = cache_path;
    }
  }

  auto all_locations = JsonLoader::LoadLocations(locations_path.string());
  spdlog::info(" Locations available: {}", all_locations.size());

  const size_t sample_count =
      std::min<size_t>(kMaxSampledLocations, all_locations.size());
  std::vector<Location> sampled_locations;
  sampled_locations.reserve(sample_count);

  // random_device is used only to seed; the engine supplies the draws, so
  // sampling does not query the entropy source once per candidate element.
  std::mt19937 engine(std::random_device{}());
  std::sample(all_locations.begin(), all_locations.end(),
              std::back_inserter(sampled_locations), sample_count, engine);

  spdlog::info(" Sampled locations: {}", sampled_locations.size());
  return sampled_locations;
}
/**
 * @brief Looks up a Wikipedia region context for every city concurrently.
 *
 * One task per city is started with std::launch::async. Each task owns a copy
 * of the city and of webClient_, builds its own WikipediaService, and returns
 * the city paired with the summary text. Results are collected in input order;
 * an exception thrown by a task is rethrown by the corresponding get().
 *
 * @param cities Cities to enrich.
 * @return One EnrichedCity per input city, in the same order.
 */
auto BiergartenDataGenerator::EnrichWithWikipedia(
    const std::vector<Location>& cities) -> std::vector<EnrichedCity> {
  std::vector<EnrichedCity> results;
  results.reserve(cities.size());

  std::vector<std::future<EnrichedCity>> tasks;
  tasks.reserve(cities.size());

  for (const Location& location : cities) {
    auto lookup = [client = webClient_, location]() {
      WikipediaService service(client);
      const std::string context =
          service.GetSummary(location.city, location.country);
      spdlog::debug("[Pipeline] Region context for {}: {}", location.city,
                    context);
      return EnrichedCity{location, context};
    };
    tasks.push_back(std::async(std::launch::async, std::move(lookup)));
  }

  for (std::future<EnrichedCity>& task : tasks) {
    results.push_back(task.get());
  }
  return results;
}
/**
 * @brief Generates one brewery per enriched city and stores the results.
 *
 * Clears generatedBreweries_, then calls generator.GenerateBrewery() for each
 * city. A city whose generation throws std::exception is skipped with a
 * warning; a summary warning is logged when at least one city was skipped.
 *
 * @param generator Generator used to produce each brewery.
 * @param cities Cities with their region context.
 */
void BiergartenDataGenerator::GenerateBreweries(
    DataGenerator& generator, const std::vector<EnrichedCity>& cities) {
  spdlog::info("\n=== SAMPLE BREWERY GENERATION ===");
  generatedBreweries_.clear();

  size_t skipped_count = 0;
  for (const auto& enriched_city : cities) {
    const auto& location = enriched_city.location;
    try {
      // Build the entry straight from the call result so the generated
      // brewery is not copied out of a named local first.
      generatedBreweries_.push_back(
          {location,
           generator.GenerateBrewery(location.city, location.country,
                                     enriched_city.region_context)});
    } catch (const std::exception& e) {
      ++skipped_count;
      spdlog::warn(
          "[Pipeline] Skipping city '{}' ({}): brewery generation failed: "
          "{}",
          location.city, location.country, e.what());
    }
  }

  if (skipped_count > 0) {
    spdlog::warn(
        "[Pipeline] Skipped {} city/cities due to generation "
        "errors",
        skipped_count);
  }
}
/**
 * @brief Logs every generated brewery with its location at info level.
 *
 * Entries are numbered starting at 1 in the order they are stored in
 * generatedBreweries_.
 */
void BiergartenDataGenerator::LogResults() const {
  spdlog::info("\n=== GENERATED DATA DUMP ===");

  size_t ordinal = 0;
  for (const auto& entry : generatedBreweries_) {
    ++ordinal;
    const auto& place = entry.location;
    spdlog::info(
        "{}. city=\"{}\" country=\"{}\" state=\"{}\" "
        "iso3166_2={} lat={} lon={}",
        ordinal, place.city, place.country, place.state_province,
        place.iso3166_2, place.latitude, place.longitude);
    spdlog::info(" brewery_name=\"{}\"", entry.brewery.name);
    spdlog::info(" brewery_description=\"{}\"", entry.brewery.description);
  }
}
/**
 * @brief Runs the whole pipeline: generator setup, city sampling, Wikipedia
 * enrichment, brewery generation, and result logging.
 *
 * @return 0 when every stage completes; 1 when a std::exception escapes a
 * stage (the error is logged).
 */
auto BiergartenDataGenerator::Run() -> int {
  int exit_code = 1;
  try {
    auto generator = InitializeGenerator();
    auto sampled = QueryCitiesWithCountries();
    auto enriched = EnrichWithWikipedia(sampled);
    GenerateBreweries(*generator, enriched);
    LogResults();
    spdlog::info("\nOK: Pipeline completed successfully");
    exit_code = 0;
  } catch (const std::exception& e) {
    spdlog::error("ERROR: Pipeline failed: {}", e.what());
  }
  return exit_code;
}

View File

@@ -0,0 +1,12 @@
/**
* @file biergarten_data_generator/constructor.cpp
* @brief BiergartenDataGenerator constructor implementation.
*/
#include <utility>
#include "biergarten_data_generator.h"
/**
 * @brief Builds the pipeline orchestrator.
 * @param options Application options; moved into options_.
 * @param web_client HTTP client whose ownership is transferred into
 * webClient_.
 */
BiergartenDataGenerator::BiergartenDataGenerator(
    ApplicationOptions options, std::unique_ptr<WebClient> web_client)
    : options_(std::move(options)),
      webClient_(std::move(web_client)) {
}

View File

@@ -0,0 +1,69 @@
/**
* @file biergarten_data_generator/enrich_with_wikipedia.cpp
* @brief BiergartenDataGenerator::EnrichWithWikipedia() implementation.
*/
#include <spdlog/spdlog.h>
#include <atomic>
#include <future>
#include <optional>
#include "biergarten_data_generator.h"
#include "wikipedia/wikipedia_service.h"
namespace {
auto TryGetRegionContext(const std::shared_ptr<WebClient>& web_client,
const Location* city_ptr,
std::atomic<size_t>* skipped_enrichment_count) noexcept
-> std::optional<std::string> {
try {
WikipediaService wikipedia_service(web_client);
return wikipedia_service.GetSummary(city_ptr->city, city_ptr->country);
} catch (...) {
skipped_enrichment_count->fetch_add(1, std::memory_order_relaxed);
return std::nullopt;
}
}
} // namespace
/**
 * @brief Looks up a Wikipedia region context for every city concurrently.
 *
 * One TryGetRegionContext task per city is started with std::launch::async.
 * Cities whose lookup yields no value are left out of the result; when any
 * were left out, the total is logged as a warning.
 *
 * @param cities Cities to enrich; each task reads its city through a pointer
 * into this vector.
 * @return The successfully enriched cities, in input order.
 */
auto BiergartenDataGenerator::EnrichWithWikipedia(
    const std::vector<Location>& cities) -> std::vector<EnrichedCity> {
  using RegionContext = std::optional<std::string>;

  std::vector<EnrichedCity> results;
  results.reserve(cities.size());

  // Declared before the futures so it is still alive while they are destroyed.
  std::atomic<size_t> failures = 0;
  std::vector<std::future<RegionContext>> tasks;
  tasks.reserve(cities.size());

  for (size_t i = 0; i < cities.size(); ++i) {
    tasks.push_back(std::async(std::launch::async, TryGetRegionContext,
                               webClient_, &cities[i], &failures));
  }

  // tasks[i] belongs to cities[i]; walk both by the same index.
  for (size_t i = 0; i < tasks.size(); ++i) {
    RegionContext context = tasks[i].get();
    if (!context.has_value()) {
      continue;
    }
    const Location& city = cities[i];
    spdlog::debug("[Pipeline] Region context for {}: {}", city.city, *context);
    results.push_back(EnrichedCity{.location = city,
                                   .region_context = std::move(*context)});
  }

  const size_t failed = failures.load(std::memory_order_relaxed);
  if (failed > 0) {
    spdlog::warn(
        "[Pipeline] Skipped {} city/cities due to Wikipedia enrichment "
        "errors",
        failed);
  }
  return results;
}

View File

@@ -0,0 +1,40 @@
/**
* @file biergarten_data_generator/generate_breweries.cpp
* @brief BiergartenDataGenerator::GenerateBreweries() implementation.
*/
#include <spdlog/spdlog.h>
#include "biergarten_data_generator.h"
/**
 * @brief Generates one brewery per enriched city and stores the results.
 *
 * Clears generatedBreweries_, then calls generator.GenerateBrewery() for each
 * city. A city whose generation throws std::exception is skipped with a
 * warning; a summary warning is logged when at least one city was skipped.
 *
 * @param generator Generator used to produce each brewery.
 * @param cities Cities with their region context.
 */
void BiergartenDataGenerator::GenerateBreweries(
    DataGenerator& generator, const std::vector<EnrichedCity>& cities) {
  spdlog::info("\n=== SAMPLE BREWERY GENERATION ===");
  generatedBreweries_.clear();

  size_t skipped_count = 0;
  for (const auto& enriched_city : cities) {
    const auto& location = enriched_city.location;
    try {
      // Initialize the entry straight from the call result so the generated
      // brewery is not copied out of a named local first.
      generatedBreweries_.push_back(GeneratedBrewery{
          .location = location,
          .brewery = generator.GenerateBrewery(
              location.city, location.country, enriched_city.region_context)});
    } catch (const std::exception& e) {
      ++skipped_count;
      spdlog::warn(
          "[Pipeline] Skipping city '{}' ({}): brewery generation failed: "
          "{}",
          location.city, location.country, e.what());
    }
  }

  if (skipped_count > 0) {
    spdlog::warn(
        "[Pipeline] Skipped {} city/cities due to generation "
        "errors",
        skipped_count);
  }
}

View File

@@ -0,0 +1,35 @@
/**
* @file biergarten_data_generator/initialize_generator.cpp
* @brief BiergartenDataGenerator::InitializeGenerator() implementation.
*/
#include <spdlog/spdlog.h>
#include "biergarten_data_generator.h"
#include "data_generation/llama_generator.h"
#include "data_generation/mock_generator.h"
/**
 * @brief Creates and loads the brewery generator selected by options_.
 *
 * An empty options_.model_path selects MockGenerator. Otherwise a
 * LlamaGenerator is configured with the sampling options and context size
 * taken from options_. In both cases Load() is called with
 * options_.model_path before the generator is returned.
 *
 * @return The loaded generator.
 */
auto BiergartenDataGenerator::InitializeGenerator() const
    -> std::unique_ptr<DataGenerator> {
  spdlog::info("Initializing brewery generator...");

  const bool has_model = !options_.model_path.empty();
  std::unique_ptr<DataGenerator> result;

  if (has_model) {
    auto llama = std::make_unique<LlamaGenerator>();
    llama->SetSamplingOptions(options_.temperature, options_.top_p,
                              options_.seed);
    llama->SetContextSize(options_.n_ctx);
    spdlog::info(
        "[Generator] Using LlamaGenerator: {} (temperature={}, top-p={}, "
        "n_ctx={}, seed={})",
        options_.model_path, options_.temperature, options_.top_p,
        options_.n_ctx, options_.seed);
    result = std::move(llama);
  } else {
    result = std::make_unique<MockGenerator>();
    spdlog::info("[Generator] Using MockGenerator (no model path provided)");
  }

  // Load() runs for both generator kinds, with an empty path for the mock.
  result->Load(options_.model_path);
  return result;
}

View File

@@ -0,0 +1,23 @@
/**
* @file biergarten_data_generator/log_results.cpp
* @brief BiergartenDataGenerator::LogResults() implementation.
*/
#include <spdlog/spdlog.h>
#include "biergarten_data_generator.h"
/**
 * @brief Logs every generated brewery with its location at info level.
 *
 * Entries are numbered starting at 1 in the order they are stored in
 * generatedBreweries_.
 */
void BiergartenDataGenerator::LogResults() const {
  spdlog::info("\n=== GENERATED DATA DUMP ===");

  size_t ordinal = 0;
  for (const auto& entry : generatedBreweries_) {
    ++ordinal;
    const auto& place = entry.location;
    spdlog::info(
        "{}. city=\"{}\" country=\"{}\" state=\"{}\" "
        "iso3166_2={} lat={} lon={}",
        ordinal, place.city, place.country, place.state_province,
        place.iso3166_2, place.latitude, place.longitude);
    spdlog::info(" brewery_name=\"{}\"", entry.brewery.name);
    spdlog::info(" brewery_description=\"{}\"", entry.brewery.description);
  }
}

View File

@@ -0,0 +1,40 @@
/**
* @file biergarten_data_generator/query_cities_with_countries.cpp
* @brief BiergartenDataGenerator::QueryCitiesWithCountries() implementation.
*/
#include <spdlog/spdlog.h>
#include <algorithm>
#include <filesystem>
#include <random>
#include "biergarten_data_generator.h"
#include "json_handling/json_loader.h"
// Upper bound on how many locations are sampled per pipeline run.
static constexpr unsigned int brewery_amount = 4;

/**
 * @brief Loads the known locations and returns a small random sample.
 *
 * Reads "locations.json" through JsonLoader and samples up to brewery_amount
 * locations without replacement.
 *
 * @return The sampled locations (fewer than brewery_amount when fewer are
 * available).
 */
auto BiergartenDataGenerator::QueryCitiesWithCountries()
    -> std::vector<Location> {
  spdlog::info("\n=== GEOGRAPHIC DATA OVERVIEW ===");

  const std::filesystem::path locations_path = "locations.json";
  auto all_locations = JsonLoader::LoadLocations(locations_path.string());
  spdlog::info(" Locations available: {}", all_locations.size());

  const size_t sample_count =
      std::min<size_t>(brewery_amount, all_locations.size());
  // std::ranges::sample takes the count as the range's difference type.
  const auto sample_count_signed =
      static_cast<std::iter_difference_t<decltype(all_locations.cbegin())>>(
          sample_count);

  std::vector<Location> sampled_locations;
  sampled_locations.reserve(sample_count);

  // random_device is used only to seed; the engine supplies the draws, so
  // sampling does not query the entropy source once per candidate element.
  std::mt19937 engine(std::random_device{}());
  std::ranges::sample(all_locations, std::back_inserter(sampled_locations),
                      sample_count_signed, engine);

  spdlog::info(" Sampled locations: {}", sampled_locations.size());
  return sampled_locations;
}

View File

@@ -0,0 +1,22 @@
/**
* @file biergarten_data_generator/run.cpp
* @brief BiergartenDataGenerator::Run() implementation.
*/
#include <spdlog/spdlog.h>
#include "biergarten_data_generator.h"
auto BiergartenDataGenerator::Run() -> bool {
try {
const std::unique_ptr<DataGenerator> generator = InitializeGenerator();
const std::vector<Location> cities = QueryCitiesWithCountries();
const std::vector<EnrichedCity> enriched = EnrichWithWikipedia(cities);
this->GenerateBreweries(*generator, enriched);
this->LogResults();
return true;
} catch (const std::exception& e) {
spdlog::error("Pipeline execution failed with error: {}", e.what());
return false;
}
}

View File

@@ -7,13 +7,14 @@
#include <spdlog/spdlog.h>
#include <boost/program_options.hpp>
#include <iostream>
#include <exception>
#include <memory>
#include <string>
#include "biergarten_data_generator.h"
#include "web_client/curl_web_client.h"
namespace po = boost::program_options;
namespace prog_opts = boost::program_options;
/**
* @brief Parse command-line arguments into ApplicationOptions.
@@ -23,123 +24,119 @@ namespace po = boost::program_options;
* @param options Output ApplicationOptions struct.
* @return true if parsing succeeded and should proceed, false otherwise.
*/
bool ParseArguments(int argc, char** argv, ApplicationOptions& options) {
// If no arguments provided, display usage and exit
if (argc == 1) {
std::cout << "Biergarten Pipeline - Geographic Data Pipeline with "
"Brewery Generation\n\n";
std::cout << "Usage: biergarten-pipeline [options]\n\n";
std::cout << "Options:\n";
std::cout << " --mocked Use mocked generator for "
"brewery/user data\n";
std::cout << " --model, -m PATH Path to LLM model file (gguf) for "
"generation\n";
std::cout << " --cache-dir, -c DIR Directory for cached JSON (default: "
"/tmp)\n";
std::cout << " --temperature TEMP LLM sampling temperature 0.0-1.0 "
"(default: 0.8)\n";
std::cout << " --top-p VALUE Nucleus sampling parameter 0.0-1.0 "
"(default: 0.92)\n";
std::cout << " --n-ctx SIZE Context window size in tokens "
"(default: 4096)\n";
std::cout << " --seed SEED Random seed: -1 for random "
"(default: -1)\n";
std::cout << " --help, -h Show this help message\n\n";
std::cout << "Note: --mocked and --model are mutually exclusive. Exactly "
"one must be provided.\n";
return false;
}
po::options_description desc("Pipeline Options");
desc.add_options()("help,h", "Produce help message")(
"mocked", po::bool_switch(),
"Use mocked generator for brewery/user data")(
"model,m", po::value<std::string>()->default_value(""),
"Path to LLM model (gguf)")(
"cache-dir,c", po::value<std::string>()->default_value("/tmp"),
"Directory for cached JSON")(
"temperature", po::value<float>()->default_value(0.8f),
"Sampling temperature (higher = more random)")(
"top-p", po::value<float>()->default_value(0.92f),
"Nucleus sampling top-p in (0,1] (higher = more random)")(
"n-ctx", po::value<uint32_t>()->default_value(8192),
"Context window size in tokens (1-32768)")(
"seed", po::value<int>()->default_value(-1),
auto ParseArguments(const int argc, char** argv,
ApplicationOptions& options) noexcept -> bool {
prog_opts::options_description desc("Pipeline Options");
desc.add_options()
("help,h", "Produce help message")
("mocked",
prog_opts::bool_switch(),
"Use mocked generator for brewery/user data")
("model,m",
prog_opts::value<std::string>()->default_value(""),
"Path to LLM model (gguf)")
("temperature",
prog_opts::value<float>()->default_value(0.8f),
"Sampling temperature (higher = more random)")
("top-p",
prog_opts::value<float>()->default_value(0.92f),
"Nucleus sampling top-p in (0,1] (higher = more random)")
("n-ctx",
prog_opts::value<uint32_t>()->default_value(8192),
"Context window size in tokens (1-32768)")
("seed",
prog_opts::value<int>()->default_value(-1),
"Sampler seed: -1 for random, otherwise non-negative integer");
po::variables_map vm;
po::store(po::parse_command_line(argc, argv, desc), vm);
po::notify(vm);
if (vm.count("help")) {
std::cout << desc << "\n";
// Handle the "no arguments" or "help" case
if (argc == 1) {
spdlog::info("Biergarten Pipeline");
std::stringstream ss;
ss << "\nUsage: biergarten-pipeline [options]\n\n" << desc;
spdlog::info(ss.str());
return false;
}
// Check for mutually exclusive --mocked and --model flags
bool use_mocked = vm["mocked"].as<bool>();
std::string model_path = vm["model"].as<std::string>();
try {
prog_opts::variables_map vm;
prog_opts::store(prog_opts::parse_command_line(argc, argv, desc), vm);
prog_opts::notify(vm);
if (use_mocked && !model_path.empty()) {
spdlog::error("ERROR: --mocked and --model are mutually exclusive");
return false;
}
if (!use_mocked && model_path.empty()) {
spdlog::error("ERROR: Either --mocked or --model must be specified");
return false;
}
// Warn if sampling parameters are provided with --mocked
if (use_mocked) {
bool hasTemperature = vm["temperature"].defaulted() == false;
bool hasTopP = vm["top-p"].defaulted() == false;
bool hasSeed = vm["seed"].defaulted() == false;
if (hasTemperature || hasTopP || hasSeed) {
spdlog::warn(
"WARNING: Sampling parameters (--temperature, --top-p, --seed) "
"are ignored when using --mocked");
if (vm.contains("help")) {
std::stringstream ss;
ss << "\n" << desc;
spdlog::info(ss.str());
return false;
}
const auto use_mocked = vm["mocked"].as<bool>();
const auto model_path = vm["model"].as<std::string>();
if (use_mocked && !model_path.empty()) {
spdlog::error(
"Invalid arguments: --mocked and --model are mutually exclusive");
return false;
}
if (!use_mocked && model_path.empty()) {
spdlog::error(
"Invalid arguments: Either --mocked or --model must be specified");
return false;
}
const bool has_llm_params = !vm["temperature"].defaulted() ||
!vm["top-p"].defaulted() ||
!vm["seed"].defaulted();
if (use_mocked && has_llm_params) {
spdlog::warn(
"Sampling parameters (--temperature, --top-p, --seed) are"
" ignored when using --mocked");
}
options.use_mocked = use_mocked;
options.model_path = model_path;
options.temperature = vm["temperature"].as<float>();
options.top_p = vm["top-p"].as<float>();
options.n_ctx = vm["n-ctx"].as<uint32_t>();
options.seed = vm["seed"].as<int>();
return true;
} catch (const std::exception& exception) {
spdlog::error("Failed to parse command-line arguments: {}",
exception.what());
return false;
} catch (...) {
spdlog::error("Failed to parse command-line arguments: unknown error");
return false;
}
options.use_mocked = use_mocked;
options.model_path = model_path;
options.cache_dir = vm["cache-dir"].as<std::string>();
options.temperature = vm["temperature"].as<float>();
options.top_p = vm["top-p"].as<float>();
options.n_ctx = vm["n-ctx"].as<uint32_t>();
options.seed = vm["seed"].as<int>();
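// NOTE(review): original note read "commit is always pinned to c5eb7772"; nothing in ParseArguments pins a commit — confirm what this refers to or remove it.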
return true;
}
int main(int argc, char* argv[]) {
auto main(const int argc, char** argv) noexcept -> int {
try {
const CurlGlobalState curl_state;
spdlog::set_pattern("[%Y-%m-%d %H:%M:%S.%e] [%^%l%$] %v");
ApplicationOptions options;
if (!ParseArguments(argc, argv, options)) {
return 0;
}
auto webClient = std::make_shared<CURLWebClient>();
auto webClient = std::make_unique<CURLWebClient>();
BiergartenDataGenerator generator(options, std::move(webClient));
BiergartenDataGenerator generator(options, webClient);
return generator.Run();
} catch (const std::exception& e) {
const std::string message = e.what() ? e.what() : "";
if (message.find("LlamaGenerator: malformed brewery response") !=
std::string::npos) {
spdlog::warn("WARNING: Non-fatal LLM failure after retries: {}",
message);
return 0;
if (!generator.Run()) {
spdlog::error("Pipeline execution failed");
return 1;
}
spdlog::error("ERROR: Application failed: {}", e.what());
spdlog::info("Pipeline executed successfully");
return 0;
} catch (const std::exception& exception) {
spdlog::critical("Unhandled fatal error in main: {}", exception.what());
return 1;
} catch (...) {
spdlog::critical("Unhandled fatal non-standard exception in main");
return 1;
}
}
}

View File

@@ -0,0 +1,17 @@
/**
* @file web_client/curl_global_state_constructor.cpp
* @brief CurlGlobalState constructor implementation.
*/
#include <curl/curl.h>
#include <stdexcept>
#include "web_client/curl_web_client.h"
/**
 * @brief Initializes libcurl's global state with CURL_GLOBAL_DEFAULT.
 * @throws std::runtime_error when curl_global_init() does not return
 * CURLE_OK.
 */
CurlGlobalState::CurlGlobalState() {
  const CURLcode init_result = curl_global_init(CURL_GLOBAL_DEFAULT);
  if (init_result == CURLE_OK) {
    return;
  }
  throw std::runtime_error(
      "[CURLWebClient] Failed to initialize libcurl globally");
}

View File

@@ -0,0 +1,10 @@
/**
* @file web_client/curl_global_state_destructor.cpp
* @brief CurlGlobalState destructor implementation.
*/
#include <curl/curl.h>
#include "web_client/curl_web_client.h"
/**
 * @brief Releases libcurl's global state by calling curl_global_cleanup().
 */
CurlGlobalState::~CurlGlobalState() {
  curl_global_cleanup();
}

View File

@@ -0,0 +1,8 @@
/**
* @file web_client/curl_web_client_constructor.cpp
* @brief CURLWebClient constructor implementation.
*/
#include "web_client/curl_web_client.h"
/**
 * @brief Constructs the client; the constructor performs no work of its own.
 */
CURLWebClient::CURLWebClient() = default;

View File

@@ -0,0 +1,8 @@
/**
* @file web_client/curl_web_client_destructor.cpp
* @brief CURLWebClient destructor implementation.
*/
#include "web_client/curl_web_client.h"
/**
 * @brief Destroys the client; the destructor performs no work of its own.
 */
CURLWebClient::~CURLWebClient() = default;

View File

@@ -1,11 +1,8 @@
/**
* @file web_client/curl_web_client.cpp
* @brief Implements libcurl-backed HTTP utilities, including GET requests,
* file downloads, URL encoding, and RAII global curl lifecycle handling.
* @file web_client/curl_web_client_download_to_file.cpp
* @brief CURLWebClient::DownloadToFile() implementation.
*/
#include "web_client/curl_web_client.h"
#include <curl/curl.h>
#include <cstdio>
@@ -14,34 +11,9 @@
#include <sstream>
#include <stdexcept>
/**
 * @brief Initializes libcurl's global state with CURL_GLOBAL_DEFAULT.
 * @throws std::runtime_error when curl_global_init() does not return
 * CURLE_OK.
 */
CurlGlobalState::CurlGlobalState() {
  const CURLcode init_result = curl_global_init(CURL_GLOBAL_DEFAULT);
  if (init_result == CURLE_OK) {
    return;
  }
  throw std::runtime_error(
      "[CURLWebClient] Failed to initialize libcurl globally");
}
/**
 * @brief Releases libcurl's global state by calling curl_global_cleanup().
 */
CurlGlobalState::~CurlGlobalState() {
  curl_global_cleanup();
}
#include "web_client/curl_web_client.h"
namespace {
// curl write callback that appends response data into a std::string.
//
// `userp` must point to the destination std::string. Returns the number of
// bytes consumed (size * nmemb). If appending throws (for example on
// allocation failure), the exception is swallowed here so it cannot unwind
// through libcurl's C frames, and 0 is returned; libcurl treats a return value
// that differs from the chunk size as a write error and aborts the transfer.
size_t WriteCallbackString(void* contents, size_t size, size_t nmemb,
                           void* userp) {
  const size_t realsize = size * nmemb;
  auto* s = static_cast<std::string*>(userp);
  try {
    s->append(static_cast<const char*>(contents), realsize);
  } catch (...) {
    return 0;
  }
  return realsize;
}
// curl write callback that writes to a file stream.
//
// `userp` must point to the destination std::ofstream. Returns the number of
// bytes consumed (size * nmemb) when the stream accepted the chunk. If the
// stream is in a failed state after the write, 0 is returned; libcurl treats a
// return value that differs from the chunk size as a write error and aborts
// the transfer instead of continuing with a truncated file.
size_t WriteCallbackFile(void* contents, size_t size, size_t nmemb,
                         void* userp) {
  const size_t realsize = size * nmemb;
  auto* outFile = static_cast<std::ofstream*>(userp);
  outFile->write(static_cast<const char*>(contents),
                 static_cast<std::streamsize>(realsize));
  if (outFile->fail()) {
    return 0;
  }
  return realsize;
}
// RAII wrapper for CURL handle using unique_ptr
using CurlHandle = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
@@ -64,12 +36,17 @@ void set_common_get_options(CURL* curl, const std::string& url,
curl_easy_setopt(curl, CURLOPT_TIMEOUT, total_timeout);
curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, "gzip");
}
// curl write callback that writes to a file stream.
//
// `userp` must point to the destination std::ofstream. Returns the number of
// bytes consumed (size * nmemb) when the stream accepted the chunk. If the
// stream is in a failed state after the write, 0 is returned; libcurl treats a
// return value that differs from the chunk size as a write error and aborts
// the transfer instead of continuing with a truncated file.
size_t WriteCallbackFile(void* contents, size_t size, size_t nmemb,
                         void* userp) {
  const size_t realsize = size * nmemb;
  auto* outFile = static_cast<std::ofstream*>(userp);
  outFile->write(static_cast<const char*>(contents),
                 static_cast<std::streamsize>(realsize));
  if (outFile->fail()) {
    return 0;
  }
  return realsize;
}
} // namespace
/**
 * @brief Constructs the client; the constructor performs no work of its own.
 */
CURLWebClient::CURLWebClient() = default;
/**
 * @brief Destroys the client; the destructor performs no work of its own.
 */
CURLWebClient::~CURLWebClient() = default;
void CURLWebClient::DownloadToFile(const std::string& url,
const std::string& file_path) {
auto curl = create_handle();
@@ -105,43 +82,3 @@ void CURLWebClient::DownloadToFile(const std::string& url,
throw std::runtime_error(ss.str());
}
}
std::string CURLWebClient::Get(const std::string& url) {
auto curl = create_handle();
std::string response_string;
set_common_get_options(curl.get(), url, 10L, 20L);
curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, WriteCallbackString);
curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &response_string);
CURLcode res = curl_easy_perform(curl.get());
if (res != CURLE_OK) {
std::string error =
std::string("[CURLWebClient] GET failed: ") + curl_easy_strerror(res);
throw std::runtime_error(error);
}
long httpCode = 0;
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &httpCode);
if (httpCode != 200) {
std::stringstream ss;
ss << "[CURLWebClient] HTTP error " << httpCode << " for URL " << url;
throw std::runtime_error(ss.str());
}
return response_string;
}
/**
 * @brief Percent-encodes a string with curl_easy_escape().
 *
 * The length argument is 0, so libcurl measures the input itself.
 *
 * @param value Text to encode.
 * @return The encoded text.
 * @throws std::runtime_error when curl_easy_escape() returns null.
 */
std::string CURLWebClient::UrlEncode(const std::string& value) {
  // A NULL handle is fine for UTF-8 encoding according to libcurl docs.
  char* output = curl_easy_escape(nullptr, value.c_str(), 0);
  if (output == nullptr) {
    throw std::runtime_error("[CURLWebClient] curl_easy_escape failed");
  }

  std::string result;
  try {
    result.assign(output);
  } catch (...) {
    // Copying can throw; release libcurl's buffer before propagating.
    curl_free(output);
    throw;
  }
  curl_free(output);
  return result;
}

View File

@@ -0,0 +1,75 @@
/**
* @file web_client/curl_web_client_get.cpp
* @brief CURLWebClient::Get() implementation.
*/
#include <curl/curl.h>
#include <memory>
#include <sstream>
#include <stdexcept>
#include <string>
#include "web_client/curl_web_client.h"
namespace {
// RAII wrapper for CURL handle using unique_ptr
using CurlHandle = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
// Creates a curl easy handle wrapped in CurlHandle, which releases it with
// curl_easy_cleanup. Throws std::runtime_error when curl_easy_init() returns
// null.
CurlHandle create_handle() {
  if (CURL* raw = curl_easy_init()) {
    return CurlHandle(raw, &curl_easy_cleanup);
  }
  throw std::runtime_error(
      "[CURLWebClient] Failed to initialize libcurl handle");
}
// Applies the options shared by every GET-style request to `curl`: target URL,
// user agent, redirect following (at most 5 hops), gzip response encoding, and
// the given connect/total timeouts. Return codes of curl_easy_setopt are not
// inspected.
void set_common_get_options(CURL* curl, const std::string& url,
                            long connect_timeout, long total_timeout) {
  // Request identity.
  curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
  curl_easy_setopt(curl, CURLOPT_USERAGENT, "biergarten-pipeline/0.1.0");
  curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, "gzip");

  // Redirect policy.
  curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
  curl_easy_setopt(curl, CURLOPT_MAXREDIRS, 5L);

  // Time limits supplied by the caller.
  curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, connect_timeout);
  curl_easy_setopt(curl, CURLOPT_TIMEOUT, total_timeout);
}
// curl write callback that appends response data into a std::string.
//
// `userp` must point to the destination std::string. Returns the number of
// bytes consumed (size * nmemb). If appending throws (for example on
// allocation failure), the exception is swallowed here so it cannot unwind
// through libcurl's C frames, and 0 is returned; libcurl treats a return value
// that differs from the chunk size as a write error and aborts the transfer.
size_t WriteCallbackString(void* contents, size_t size, size_t nmemb,
                           void* userp) {
  const size_t realsize = size * nmemb;
  auto* s = static_cast<std::string*>(userp);
  try {
    s->append(static_cast<const char*>(contents), realsize);
  } catch (...) {
    return 0;
  }
  return realsize;
}
} // namespace
std::string CURLWebClient::Get(const std::string& url) {
auto curl = create_handle();
std::string response_string;
set_common_get_options(curl.get(), url, 10L, 20L);
curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, WriteCallbackString);
curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &response_string);
CURLcode res = curl_easy_perform(curl.get());
if (res != CURLE_OK) {
std::string error =
std::string("[CURLWebClient] GET failed: ") + curl_easy_strerror(res);
throw std::runtime_error(error);
}
long httpCode = 0;
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &httpCode);
if (httpCode != 200) {
std::stringstream ss;
ss << "[CURLWebClient] HTTP error " << httpCode << " for URL " << url;
throw std::runtime_error(ss.str());
}
return response_string;
}

View File

@@ -0,0 +1,23 @@
/**
* @file web_client/curl_web_client_url_encode.cpp
* @brief CURLWebClient::UrlEncode() implementation.
*/
#include <curl/curl.h>
#include <stdexcept>
#include <string>
#include "web_client/curl_web_client.h"
/**
 * @brief Percent-encodes a string with curl_easy_escape().
 *
 * The length argument is 0, so libcurl measures the input itself.
 *
 * @param value Text to encode.
 * @return The encoded text.
 * @throws std::runtime_error when curl_easy_escape() returns null.
 */
std::string CURLWebClient::UrlEncode(const std::string& value) {
  // A NULL handle is fine for UTF-8 encoding according to libcurl docs.
  char* output = curl_easy_escape(nullptr, value.c_str(), 0);
  if (output == nullptr) {
    throw std::runtime_error("[CURLWebClient] curl_easy_escape failed");
  }

  std::string result;
  try {
    result.assign(output);
  } catch (...) {
    // Copying can throw; release libcurl's buffer before propagating.
    curl_free(output);
    throw;
  }
  curl_free(output);
  return result;
}

View File

@@ -0,0 +1,11 @@
/**
* @file wikipedia/constructor.cpp
* @brief WikipediaService constructor implementation.
*/
#include <utility>
#include "wikipedia/wikipedia_service.h"
/**
 * @brief Creates a service that issues its requests through @p client.
 * @param client Shared web client; moved into client_. GetSummary() checks it
 * for null before fetching.
 */
WikipediaService::WikipediaService(std::shared_ptr<WebClient> client)
    : client_(std::move(client)) {
}

View File

@@ -0,0 +1,51 @@
/**
* @file wikipedia/fetch_extract.cpp
* @brief WikipediaService::FetchExtract() implementation.
*/
#include <spdlog/spdlog.h>
#include <boost/json.hpp>
#include <string>
#include <string_view>
#include "wikipedia/wikipedia_service.h"
/**
 * @brief Fetches the plain-text extract of one Wikipedia page.
 *
 * Queries the English Wikipedia API for @p query and returns the "extract"
 * string of the first page in the response. JSON parse errors and unexpected
 * response shapes are logged as warnings and produce an empty string.
 * Exceptions from the web client (UrlEncode/Get) are not caught here.
 *
 * @param query Page title to look up.
 * @return The extract text, or an empty string when none could be read.
 */
auto WikipediaService::FetchExtract(std::string_view query) const
    -> std::string {
  const std::string encoded = client_->UrlEncode(std::string(query));
  const std::string url =
      "https://en.wikipedia.org/w/api.php?action=query&titles=" + encoded +
      "&prop=extracts&explaintext=1&format=json";
  const std::string body = client_->Get(url);

  boost::system::error_code ec;
  boost::json::value doc = boost::json::parse(body, ec);
  if (ec) {
    spdlog::warn("WikipediaService: JSON parse error for '{}': {}", query,
                 ec.message());
    return {};
  }
  if (!doc.is_object()) {
    return {};
  }

  try {
    // as_object() is the checked accessor: it throws when the value is not an
    // object, which the handler below turns into a warning. The unchecked
    // get_object() would instead violate its precondition on such input.
    auto& pages = doc.at("query").at("pages").as_object();
    if (pages.empty()) {
      return {};
    }
    auto& page = pages.begin()->value().as_object();
    if (!page.contains("extract") || !page.at("extract").is_string()) {
      return {};
    }
    // Copy by explicit length so the text is not cut at an embedded NUL.
    const auto& text = page.at("extract").as_string();
    std::string extract(text.data(), text.size());
    spdlog::debug("WikipediaService fetched {} chars for '{}'", extract.size(),
                  query);
    return extract;
  } catch (const std::exception& e) {
    spdlog::warn(
        "WikipediaService: failed to parse response structure for '{}': "
        "{}",
        query, e.what());
    return {};
  }
}

View File

@@ -0,0 +1,55 @@
/**
* @file wikipedia/get_summary.cpp
* @brief WikipediaService::GetSummary() implementation.
*/
#include <spdlog/spdlog.h>
#include <string>
#include "wikipedia/wikipedia_service.h"
auto WikipediaService::GetSummary(std::string_view city,
std::string_view country) -> std::string {
const std::string key = std::string(city) + "|" + std::string(country);
const auto cacheIt = cache_.find(key);
if (cacheIt != cache_.end()) {
return cacheIt->second;
}
std::string result;
if (!client_) {
cache_.emplace(key, result);
return result;
}
std::string regionQuery(city);
if (!country.empty()) {
regionQuery += ", ";
regionQuery += country;
}
const std::string beerQuery = "beer in " + std::string(country);
try {
const std::string regionExtract = FetchExtract(regionQuery);
const std::string beerExtract = FetchExtract(beerQuery);
if (!regionExtract.empty()) {
result += regionExtract;
}
if (!beerExtract.empty()) {
if (!result.empty()) {
result += "\n\n";
}
result += beerExtract;
}
} catch (const std::runtime_error& e) {
spdlog::debug("WikipediaService lookup failed for '{}': {}", regionQuery,
e.what());
}
cache_.emplace(key, result);
return result;
}

View File

@@ -1,95 +0,0 @@
/**
* @file wikipedia/wikipedia_service.cpp
* @brief Implements Wikipedia extract retrieval and caching for city/country
* queries, including response parsing and resilient error handling.
*/
#include "wikipedia/wikipedia_service.h"
#include <spdlog/spdlog.h>
#include <boost/json.hpp>
/**
 * @brief Creates a service that issues its requests through @p client.
 * @param client Shared web client; moved into client_. GetSummary() checks it
 * for null before fetching.
 */
WikipediaService::WikipediaService(std::shared_ptr<WebClient> client)
    : client_(std::move(client)) {
}
/**
 * @brief Fetches the plain-text extract of one Wikipedia page.
 *
 * Queries the English Wikipedia API for @p query and returns the "extract"
 * string of the first page in the response. JSON parse errors and unexpected
 * response shapes are logged as warnings and produce an empty string.
 * Exceptions from the web client (UrlEncode/Get) are not caught here.
 *
 * @param query Page title to look up.
 * @return The extract text, or an empty string when none could be read.
 */
std::string WikipediaService::FetchExtract(std::string_view query) {
  const std::string encoded = client_->UrlEncode(std::string(query));
  const std::string url =
      "https://en.wikipedia.org/w/api.php?action=query&titles=" + encoded +
      "&prop=extracts&explaintext=1&format=json";
  const std::string body = client_->Get(url);

  boost::system::error_code ec;
  boost::json::value doc = boost::json::parse(body, ec);
  if (ec) {
    spdlog::warn("WikipediaService: JSON parse error for '{}': {}", query,
                 ec.message());
    return {};
  }
  if (!doc.is_object()) {
    return {};
  }

  try {
    // as_object() is the checked accessor: it throws when the value is not an
    // object, which the handler below turns into a warning. The unchecked
    // get_object() would instead violate its precondition on such input.
    auto& pages = doc.at("query").at("pages").as_object();
    if (pages.empty()) {
      return {};
    }
    auto& page = pages.begin()->value().as_object();
    if (!page.contains("extract") || !page.at("extract").is_string()) {
      return {};
    }
    // Copy by explicit length so the text is not cut at an embedded NUL.
    const auto& text = page.at("extract").as_string();
    std::string extract(text.data(), text.size());
    spdlog::debug("WikipediaService fetched {} chars for '{}'", extract.size(),
                  query);
    return extract;
  } catch (const std::exception& e) {
    spdlog::warn(
        "WikipediaService: failed to parse response structure for '{}': "
        "{}",
        query, e.what());
    return {};
  }
}
std::string WikipediaService::GetSummary(std::string_view city,
std::string_view country) {
const std::string key = std::string(city) + "|" + std::string(country);
const auto cacheIt = cache_.find(key);
if (cacheIt != cache_.end()) {
return cacheIt->second;
}
std::string result;
if (!client_) {
cache_.emplace(key, result);
return result;
}
std::string regionQuery(city);
if (!country.empty()) {
regionQuery += ", ";
regionQuery += country;
}
const std::string beerQuery = "beer in " + std::string(country);
try {
const std::string regionExtract = FetchExtract(regionQuery);
const std::string beerExtract = FetchExtract(beerQuery);
if (!regionExtract.empty()) {
result += regionExtract;
}
if (!beerExtract.empty()) {
if (!result.empty()) result += "\n\n";
result += beerExtract;
}
} catch (const std::runtime_error& e) {
spdlog::debug("WikipediaService lookup failed for '{}': {}", regionQuery,
e.what());
}
cache_.emplace(key, result);
return result;
}