diff --git a/pipeline/CMakeLists.txt b/pipeline/CMakeLists.txt index 0ad101f..94117ef 100644 --- a/pipeline/CMakeLists.txt +++ b/pipeline/CMakeLists.txt @@ -1,5 +1,9 @@ cmake_minimum_required(VERSION 3.24) project(biergarten-pipeline) + +# Boost.DI still declares a very old minimum CMake version, which newer CMake +# releases reject unless a policy version floor is provided. +set(CMAKE_POLICY_VERSION_MINIMUM 3.5 CACHE STRING "" FORCE) # ============================================================================= # 1. GPU Detection # ============================================================================= @@ -71,6 +75,16 @@ FetchContent_Declare( GIT_TAG b8711 ) FetchContent_MakeAvailable(llama-cpp) +# --- boost-ext/di ------------------------------------------------------------- +FetchContent_Declare( + boost-di + GIT_REPOSITORY https://github.com/boost-ext/di.git + GIT_TAG v1.3.0 +) +FetchContent_MakeAvailable(boost-di) +if(TARGET Boost.DI AND NOT TARGET boost::di) + add_library(boost::di ALIAS Boost.DI) +endif() # --- Boost (JSON + program_options) ------------------------------------------ FetchContent_Declare( boost @@ -92,15 +106,13 @@ set(SOURCES # BiergartenDataGenerator methods src/biergarten_data_generator/constructor.cpp src/biergarten_data_generator/run.cpp - src/biergarten_data_generator/initialize_generator.cpp src/biergarten_data_generator/query_cities_with_countries.cpp - src/biergarten_data_generator/enrich_with_wikipedia.cpp src/biergarten_data_generator/generate_breweries.cpp src/biergarten_data_generator/log_results.cpp # WikipediaService methods - src/wikipedia/constructor.cpp - src/wikipedia/get_summary.cpp - src/wikipedia/fetch_extract.cpp + src/services/wikipedia/constructor.cpp + src/services/wikipedia/get_summary.cpp + src/services/wikipedia/fetch_extract.cpp # CURLWebClient and CurlGlobalState methods src/web_client/curl_global_state_constructor.cpp src/web_client/curl_global_state_destructor.cpp @@ -111,18 +123,17 @@ set(SOURCES 
src/web_client/curl_web_client_url_encode.cpp # Data generation modules src/data_generation/llama/destructor.cpp + src/data_generation/llama/constructor.cpp src/data_generation/llama/generate_brewery.cpp src/data_generation/llama/generate_user.cpp src/data_generation/llama/helpers.cpp src/data_generation/llama/infer.cpp src/data_generation/llama/load.cpp src/data_generation/llama/load_brewery_prompt.cpp - src/data_generation/llama/set_sampling_options.cpp src/data_generation/mock/data.cpp src/data_generation/mock/deterministic_hash.cpp src/data_generation/mock/generate_brewery.cpp src/data_generation/mock/generate_user.cpp - src/data_generation/mock/load.cpp src/json_handling/json_loader.cpp ) # ============================================================================= @@ -138,6 +149,7 @@ target_include_directories(${PROJECT_NAME} PRIVATE ) target_link_libraries(${PROJECT_NAME} PRIVATE llama + boost::di boost_json boost_program_options spdlog::spdlog diff --git a/pipeline/README.md b/pipeline/README.md index 91ecbeb..55de349 100644 --- a/pipeline/README.md +++ b/pipeline/README.md @@ -1,28 +1,29 @@ # Biergarten Pipeline -Biergarten Pipeline is a C++23 command-line tool that reads a local city list, looks up a short Wikipedia summary for each sampled city, and generates brewery names and descriptions. The current code samples up to four locations per run, then uses either a local GGUF model or the mock generator to produce the output. +Biergarten Pipeline is a C++23 command-line tool that reads a local city list, resolves contextual enrichment for each sampled city through an injected service, and generates brewery names and descriptions. The current code samples up to four locations per run, then uses either a local GGUF model or the mock generator to produce the output. 
## Pipeline -| Stage | What happens | -| -------- | ------------------------------------------------------------------------------ | -| Load | Reads `locations.json` and picks up to four city/country pairs. | -| Enrich | Fetches a short Wikipedia summary for each city in parallel with `std::async`. | -| Generate | Passes the city, country, and summary to the active generator. | -| Log | Writes the generated breweries and any warnings through `spdlog`. | +| Stage | What happens | +| -------- | ----------------------------------------------------------------------- | +| Load | Reads `locations.json` and picks up to four city/country pairs. | +| Enrich | Calls the injected enrichment service for each sampled city. | +| Generate | Passes the city, country, and gathered context to the active generator. | +| Log | Writes the generated breweries and any warnings through `spdlog`. | -If one Wikipedia lookup fails, the pipeline skips that city and keeps going. +If an enrichment lookup throws, the pipeline skips that city and keeps going. If the lookup returns an empty string, the city stays in the pipeline and is still passed to the generator. ## Core Components -| Component | Role | -| ----------------------- | ---------------------------------------------------------- | -| BiergartenDataGenerator | Orchestrates loading, enrichment, generation, and logging. | -| WikipediaService | Fetches city summaries from Wikipedia. | -| LlamaGenerator | Runs local GGUF inference and validates output. | -| MockGenerator | Produces deterministic fallback data without a model. | -| JsonLoader | Parses the local `locations.json` file. | -| CURLWebClient | Handles HTTP requests to Wikipedia. | +| Component | Role | +| ----------------------- | ---------------------------------------------------------------------- | +| BiergartenDataGenerator | Orchestrates loading, enrichment lookup, generation, and logging. | +| IEnrichmentService | Abstraction for location-context providers. 
| +| WikipediaService | Default enrichment provider backed by Wikipedia and in-memory caching. | +| LlamaGenerator | Runs local GGUF inference and validates output. | +| MockGenerator | Produces deterministic fallback data without a model. | +| JsonLoader | Parses the local `locations.json` file. | +| CURLWebClient | Handles HTTP requests to Wikipedia. | ## Build @@ -33,7 +34,7 @@ If one Wikipedia lookup fails, the pipeline skips that city and keeps going. | libcurl | Required for Wikipedia requests. | | Optional GPU tooling | CUDA on NVIDIA, HIP/ROCm on supported AMD systems, Metal on Apple Silicon. | -Boost, spdlog, and llama.cpp are fetched by CMake. On Apple Silicon, Metal is enabled automatically. On Linux, the build looks for CUDA or HIP/ROCm when the matching toolkit is present. Windows is not supported. +Boost, Boost.DI, spdlog, and llama.cpp are fetched by CMake. On Apple Silicon, Metal is enabled automatically. On Linux, the build looks for CUDA or HIP/ROCm when the matching toolkit is present. Windows is not supported. ```bash cmake -S . -B build @@ -61,7 +62,7 @@ Run the executable from the build directory so the copied `locations.json` is av | `--seed` | Random seed. Default: `-1`. | | `--help, -h` | Prints usage. | -`--mocked` and `--model` are mutually exclusive. If neither is set, the program exits with an error. The sampling flags only matter when a model is loaded. +`--mocked` and `--model` are mutually exclusive. If neither is set, the program exits with an error. The sampling flags only matter when a model is loaded. Enrichment runs sequentially, one city at a time, and an empty context string is allowed. 
## Layout diff --git a/pipeline/biergarten_pipeline.puml b/pipeline/biergarten_pipeline.puml index 8f27545..16878a4 100644 --- a/pipeline/biergarten_pipeline.puml +++ b/pipeline/biergarten_pipeline.puml @@ -1,12 +1,12 @@ @startuml -title Biergarten Pipeline - Class Diagram +title Biergarten Pipeline - Class and Composition Diagram left to right direction skinparam shadowing false skinparam classAttributeIconSize 0 skinparam packageStyle rectangle -package "Entry point" { +package "Composition root" { class Main <> { +main(argc: int, argv: char**): int } @@ -15,6 +15,14 @@ package "Entry point" { +CurlGlobalState() +~CurlGlobalState() } + + note right of Main + Binds with Boost.DI: + - WebClient -> CURLWebClient + - IEnrichmentService -> WikipediaService + - DataGenerator -> MockGenerator or LlamaGenerator + - LlamaGenerator receives ApplicationOptions and model_path directly + end note } package "Core orchestration" { @@ -28,16 +36,19 @@ package "Core orchestration" { } class BiergartenDataGenerator { - -options_: ApplicationOptions - -webClient_: std::shared_ptr - +BiergartenDataGenerator(options: ApplicationOptions, web_client: std::unique_ptr) + -context_service_: std::shared_ptr + -generator_: std::unique_ptr + +BiergartenDataGenerator(context_service: std::shared_ptr, generator: std::unique_ptr) +Run(): bool - -InitializeGenerator(): std::unique_ptr -QueryCitiesWithCountries(): std::vector - -EnrichWithWikipedia(cities: std::vector): std::vector - -GenerateBreweries(generator: DataGenerator&, cities: std::vector): void + -GenerateBreweries(cities: std::vector): void -LogResults(): void } + + class EnrichedCity <> { + +location: Location + +region_context: std::string + } } package "Shared models" { @@ -56,21 +67,17 @@ package "Shared models" { package "Generation" { interface DataGenerator { - +Load(model_path: std::string): void +GenerateBrewery(city_name: std::string, country_name: std::string, region_context: std::string): BreweryResult 
+GenerateUser(locale: std::string): UserResult } class MockGenerator { - +Load(model_path: std::string): void +GenerateBrewery(city_name: std::string, country_name: std::string, region_context: std::string): BreweryResult +GenerateUser(locale: std::string): UserResult } class LlamaGenerator { - +SetSamplingOptions(temperature: float, top_p: float, seed: int = -1): void - +SetContextSize(n_ctx: uint32_t): void - +Load(model_path: std::string): void + +LlamaGenerator(options: ApplicationOptions, model_path: std::string) +GenerateBrewery(city_name: std::string, country_name: std::string, region_context: std::string): BreweryResult +GenerateUser(locale: std::string): UserResult } @@ -93,9 +100,13 @@ package "HTTP" { } package "Wikipedia" { + interface IEnrichmentService { + +GetLocationContext(loc: Location): std::string + } + class WikipediaService { +WikipediaService(client: std::shared_ptr) - +GetSummary(city: std::string_view, country: std::string_view): std::string + +GetLocationContext(loc: Location): std::string } class JsonLoader { @@ -106,27 +117,30 @@ package "Wikipedia" { Main --> CurlGlobalState Main --> ApplicationOptions Main --> BiergartenDataGenerator -Main --> CURLWebClient +Main ..> IEnrichmentService : DI binding +Main ..> DataGenerator : DI factory +Main ..> CURLWebClient : DI binding -BiergartenDataGenerator *-- ApplicationOptions : options_ -BiergartenDataGenerator --> WebClient : shared_ptr +BiergartenDataGenerator *-- EnrichedCity BiergartenDataGenerator ..> JsonLoader : LoadLocations() -BiergartenDataGenerator ..> WikipediaService : enrich cities -BiergartenDataGenerator ..> DataGenerator : initialize generator +BiergartenDataGenerator --> IEnrichmentService : context lookup +BiergartenDataGenerator --> DataGenerator : brewery generation BiergartenDataGenerator ..> Location BiergartenDataGenerator ..> BreweryResult DataGenerator <|.. MockGenerator DataGenerator <|.. LlamaGenerator WebClient <|.. CURLWebClient +IEnrichmentService <|.. 
WikipediaService WikipediaService --> WebClient : shared_ptr note right of BiergartenDataGenerator Current behavior: samples up to four locations per run. -Wikipedia enrichment runs asynchronously per sampled city. -If a lookup fails, that city is skipped. +Enrichment runs once per sampled city. +If a lookup throws, that city is skipped. +Empty context is retained and still passed to the generator. end note @enduml diff --git a/pipeline/includes/biergarten_data_generator.h b/pipeline/includes/biergarten_data_generator.h index 034dcc9..4b23143 100644 --- a/pipeline/includes/biergarten_data_generator.h +++ b/pipeline/includes/biergarten_data_generator.h @@ -6,14 +6,14 @@ * @brief Core orchestration class for pipeline data generation. */ +#include #include #include #include #include "data_generation/data_generator.h" #include "data_model/location.h" -#include "web_client/web_client.h" -#include "wikipedia/wikipedia_service.h" +#include "services/enrichment_service.h" /** * @brief Program options for the Biergarten pipeline application. @@ -53,18 +53,18 @@ class BiergartenDataGenerator { /** * @brief Construct a BiergartenDataGenerator with injected dependencies. * - * @param options Application configuration options. - * @param web_client HTTP client for downloading data. + * @param context_service Context provider for sampled locations. + * @param generator Brewery and user data generator. */ - BiergartenDataGenerator(const ApplicationOptions& options, - std::shared_ptr web_client); + BiergartenDataGenerator(std::shared_ptr context_service, + std::unique_ptr generator); /** * @brief Run the data generation pipeline. * * Performs the following steps: * 1. Load curated locations from JSON - * 2. Initialize the generator (LLM or Mock) + * 2. Resolve context for each city using the injected context service * 3. 
Generate brewery data for sampled cities * * @return true if successful, false if not @@ -72,11 +72,11 @@ class BiergartenDataGenerator { bool Run(); private: - /// @brief Immutable application options. - const ApplicationOptions options_; + /// @brief Shared context provider dependency. + std::shared_ptr context_service_; - /// @brief Shared HTTP client dependency. - std::shared_ptr webClient_; + /// @brief Generator dependency selected in the composition root. + std::unique_ptr generator_; /** * @brief Enriched city data with Wikipedia context. @@ -86,15 +86,6 @@ class BiergartenDataGenerator { std::string region_context; }; - /** - * @brief Initialize the data generator based on options. - * - * Creates either a MockGenerator (if no model path) or LlamaGenerator. - * - * @return A unique_ptr to the initialized generator. - */ - std::unique_ptr InitializeGenerator() const; - /** * @brief Load locations from JSON and sample cities. * @@ -102,23 +93,12 @@ class BiergartenDataGenerator { */ static std::vector QueryCitiesWithCountries(); - /** - * @brief Enrich cities with Wikipedia summaries. - * - * @param cities Vector of sampled locations. - * @return Vector of enriched city data with context. - */ - std::vector EnrichWithWikipedia( - const std::vector& cities); - /** * @brief Generate breweries for enriched cities. * - * @param generator The data generator instance. * @param cities Vector of enriched city data. */ - void GenerateBreweries(DataGenerator& generator, - const std::vector& cities); + void GenerateBreweries(const std::vector& cities); /** * @brief Log the generated brewery results. diff --git a/pipeline/includes/data_generation/data_generator.h b/pipeline/includes/data_generation/data_generator.h index 3d22d85..b338574 100644 --- a/pipeline/includes/data_generation/data_generator.h +++ b/pipeline/includes/data_generation/data_generator.h @@ -38,13 +38,6 @@ class DataGenerator { /// @brief Virtual destructor for polymorphic cleanup. 
virtual ~DataGenerator() = default; - /** - * @brief Loads and initializes generator resources. - * - * @param model_path Path to model assets. Implementations may ignore this. - */ - virtual void Load(const std::string& model_path) = 0; - /** * @brief Generates brewery data for a location. * diff --git a/pipeline/includes/data_generation/llama_generator.h b/pipeline/includes/data_generation/llama_generator.h index 5e100d1..c817e53 100644 --- a/pipeline/includes/data_generation/llama_generator.h +++ b/pipeline/includes/data_generation/llama_generator.h @@ -11,6 +11,8 @@ #include "data_generation/data_generator.h" +struct ApplicationOptions; + struct llama_model; struct llama_context; @@ -19,35 +21,19 @@ struct llama_context; */ class LlamaGenerator final : public DataGenerator { public: - /// @brief Constructs a generator with default sampling and context settings. - LlamaGenerator() = default; + /** + * @brief Constructs a generator using parsed application options and loads + * the configured model immediately. + * + * @param options Parsed application options. + * @param model_path Filesystem path to GGUF model assets. + */ + LlamaGenerator(const ApplicationOptions& options, + const std::string& model_path); /// @brief Releases model/context resources. ~LlamaGenerator() override; - /** - * @brief Configures sampling parameters for generation. - * - * @param temperature Sampling temperature. - * @param top_p Nucleus sampling threshold. - * @param seed Seed for sampling; use -1 for random seed. - */ - void SetSamplingOptions(float temperature, float top_p, int seed = -1); - - /** - * @brief Sets context window size used during model load. - * - * @param n_ctx Context size in tokens. - */ - void SetContextSize(uint32_t n_ctx); - - /** - * @brief Loads model and prepares inference context. - * - * @param model_path Filesystem path to GGUF model. - */ - void Load(const std::string& model_path) override; - /** * @brief Generates brewery data for a specific location. 
* @@ -69,6 +55,13 @@ class LlamaGenerator final : public DataGenerator { UserResult GenerateUser(const std::string& locale) override; private: + /** + * @brief Loads model and prepares inference context. + * + * @param model_path Filesystem path to GGUF model. + */ + void Load(const std::string& model_path); + /** * @brief Infers text from a user prompt. * diff --git a/pipeline/includes/data_generation/mock_generator.h b/pipeline/includes/data_generation/mock_generator.h index be87b5c..fddab8d 100644 --- a/pipeline/includes/data_generation/mock_generator.h +++ b/pipeline/includes/data_generation/mock_generator.h @@ -16,13 +16,6 @@ */ class MockGenerator final : public DataGenerator { public: - /** - * @brief Initializes the mock generator. - * - * @param model_path Unused for mock generation. - */ - void Load(const std::string& model_path) override; - /** * @brief Generates deterministic brewery data for a location. * diff --git a/pipeline/includes/services/enrichment_service.h b/pipeline/includes/services/enrichment_service.h new file mode 100644 index 0000000..7c60305 --- /dev/null +++ b/pipeline/includes/services/enrichment_service.h @@ -0,0 +1,30 @@ +#ifndef BIERGARTEN_PIPELINE_SERVICES_ENRICHMENT_SERVICE_H_ +#define BIERGARTEN_PIPELINE_SERVICES_ENRICHMENT_SERVICE_H_ + +/** + * @file services/enrichment_service.h + * @brief Abstraction for resolving contextual enrichment for a location. + */ + +#include + +#include "data_model/location.h" + +/** + * @brief Interface for services that can enrich a location with context. + */ +class IEnrichmentService { + public: + /// @brief Virtual destructor for polymorphic cleanup. + virtual ~IEnrichmentService() = default; + + /** + * @brief Resolves contextual enrichment for a location. + * + * @param loc Location to enrich. + * @return Context text, or an empty string if unavailable. 
+ */ + virtual std::string GetLocationContext(const Location& loc) = 0; +}; + +#endif // BIERGARTEN_PIPELINE_SERVICES_ENRICHMENT_SERVICE_H_ \ No newline at end of file diff --git a/pipeline/includes/wikipedia/wikipedia_service.h b/pipeline/includes/services/wikipedia_service.h similarity index 72% rename from pipeline/includes/wikipedia/wikipedia_service.h rename to pipeline/includes/services/wikipedia_service.h index 1fa9a28..88e6286 100644 --- a/pipeline/includes/wikipedia/wikipedia_service.h +++ b/pipeline/includes/services/wikipedia_service.h @@ -2,7 +2,7 @@ #define BIERGARTEN_PIPELINE_WIKIPEDIA_SERVICE_H_ /** - * @file wikipedia/wikipedia_service.h + * @file services/wikipedia_service.h * @brief Wikipedia summary retrieval service with in-memory caching. */ @@ -11,17 +11,17 @@ #include #include +#include "services/enrichment_service.h" #include "web_client/web_client.h" /// @brief Provides cached Wikipedia summary lookups for city and country pairs. -class WikipediaService { +class WikipediaService final : public IEnrichmentService { public: /// @brief Creates a new Wikipedia service with the provided web client. explicit WikipediaService(std::shared_ptr client); - /// @brief Returns the Wikipedia summary extract for city and country. - [[nodiscard]] std::string GetSummary(std::string_view city, - std::string_view country); + /// @brief Returns the Wikipedia-derived context for a location. 
+ [[nodiscard]] std::string GetLocationContext(const Location& loc) override; private: std::string FetchExtract(std::string_view query) const; diff --git a/pipeline/src/biergarten_data_generator/constructor.cpp b/pipeline/src/biergarten_data_generator/constructor.cpp index b1f3d86..d90365b 100644 --- a/pipeline/src/biergarten_data_generator/constructor.cpp +++ b/pipeline/src/biergarten_data_generator/constructor.cpp @@ -8,6 +8,7 @@ #include "biergarten_data_generator.h" BiergartenDataGenerator::BiergartenDataGenerator( - ApplicationOptions const& options, std::shared_ptr web_client) - : options_(options), webClient_(std::move(web_client)) { -} \ No newline at end of file + std::shared_ptr context_service, + std::unique_ptr generator) + : context_service_(std::move(context_service)), + generator_(std::move(generator)) {} diff --git a/pipeline/src/biergarten_data_generator/enrich_with_wikipedia.cpp b/pipeline/src/biergarten_data_generator/enrich_with_wikipedia.cpp deleted file mode 100644 index 0c984c3..0000000 --- a/pipeline/src/biergarten_data_generator/enrich_with_wikipedia.cpp +++ /dev/null @@ -1,65 +0,0 @@ -/** - * @file biergarten_data_generator/enrich_with_wikipedia.cpp - * @brief BiergartenDataGenerator::EnrichWithWikipedia() implementation. - */ - -#include - -#include -#include -#include - -#include "biergarten_data_generator.h" -#include "wikipedia/wikipedia_service.h" - -static auto TryGetRegionContext( - const std::shared_ptr& web_client, const Location* city_ptr, - std::atomic* skipped_enrichment_count) noexcept - -> std::optional { - try { - WikipediaService wikipedia_service(web_client); - return wikipedia_service.GetSummary(city_ptr->city, city_ptr->country); - } catch (...) 
{ - skipped_enrichment_count->fetch_add(1, std::memory_order_relaxed); - return std::nullopt; - } -} - -auto BiergartenDataGenerator::EnrichWithWikipedia( - const std::vector& cities) -> std::vector { - std::vector enriched; - enriched.reserve(cities.size()); - - std::atomic skipped_enrichment_count = 0; - std::vector>> pending; - pending.reserve(cities.size()); - - for (const auto& city : cities) { - const Location* city_ptr = &city; - pending.push_back(std::async(std::launch::async, TryGetRegionContext, - webClient_, city_ptr, - &skipped_enrichment_count)); - } - - auto city_it = cities.cbegin(); - for (auto& task : pending) { - auto maybe_region_context = task.get(); - if (maybe_region_context.has_value()) { - spdlog::debug("[Pipeline] Region context for {}: {}", city_it->city, - *maybe_region_context); - enriched.push_back( - EnrichedCity{.location = *city_it, - .region_context = std::move(*maybe_region_context)}); - } - ++city_it; - } - - if (skipped_enrichment_count.load(std::memory_order_relaxed) > 0) { - spdlog::warn( - "[Pipeline] Skipped {} city/cities due to Wikipedia enrichment " - "errors", - skipped_enrichment_count.load(std::memory_order_relaxed)); - } - - return enriched; -} diff --git a/pipeline/src/biergarten_data_generator/generate_breweries.cpp b/pipeline/src/biergarten_data_generator/generate_breweries.cpp index 904b04a..59a8385 100644 --- a/pipeline/src/biergarten_data_generator/generate_breweries.cpp +++ b/pipeline/src/biergarten_data_generator/generate_breweries.cpp @@ -8,7 +8,7 @@ #include "biergarten_data_generator.h" void BiergartenDataGenerator::GenerateBreweries( - DataGenerator& generator, const std::vector& cities) { + const std::vector& cities) { spdlog::info("\n=== SAMPLE BREWERY GENERATION ==="); generatedBreweries_.clear(); @@ -16,7 +16,7 @@ void BiergartenDataGenerator::GenerateBreweries( for (const auto& enriched_city : cities) { try { - auto brewery = generator.GenerateBrewery( + auto brewery = generator_->GenerateBrewery( 
enriched_city.location.city, enriched_city.location.country, enriched_city.region_context); generatedBreweries_.push_back(GeneratedBrewery{ diff --git a/pipeline/src/biergarten_data_generator/initialize_generator.cpp b/pipeline/src/biergarten_data_generator/initialize_generator.cpp deleted file mode 100644 index 00b5c46..0000000 --- a/pipeline/src/biergarten_data_generator/initialize_generator.cpp +++ /dev/null @@ -1,35 +0,0 @@ -/** - * @file biergarten_data_generator/initialize_generator.cpp - * @brief BiergartenDataGenerator::InitializeGenerator() implementation. - */ - -#include - -#include "biergarten_data_generator.h" -#include "data_generation/llama_generator.h" -#include "data_generation/mock_generator.h" - -auto BiergartenDataGenerator::InitializeGenerator() const - -> std::unique_ptr { - spdlog::info("Initializing brewery generator..."); - - std::unique_ptr generator; - if (options_.model_path.empty()) { - generator = std::make_unique(); - spdlog::info("[Generator] Using MockGenerator (no model path provided)"); - } else { - auto llama_generator = std::make_unique(); - llama_generator->SetSamplingOptions(options_.temperature, options_.top_p, - options_.seed); - llama_generator->SetContextSize(options_.n_ctx); - spdlog::info( - "[Generator] Using LlamaGenerator: {} (temperature={}, top-p={}, " - "n_ctx={}, seed={})", - options_.model_path, options_.temperature, options_.top_p, - options_.n_ctx, options_.seed); - generator = std::move(llama_generator); - } - generator->Load(options_.model_path); - - return generator; -} diff --git a/pipeline/src/biergarten_data_generator/run.cpp b/pipeline/src/biergarten_data_generator/run.cpp index e5ca88c..f8f2595 100644 --- a/pipeline/src/biergarten_data_generator/run.cpp +++ b/pipeline/src/biergarten_data_generator/run.cpp @@ -9,10 +9,35 @@ auto BiergartenDataGenerator::Run() -> bool { try { - const std::unique_ptr generator = InitializeGenerator(); const std::vector cities = QueryCitiesWithCountries(); - const 
std::vector enriched = EnrichWithWikipedia(cities); - this->GenerateBreweries(*generator, enriched); + std::vector enriched; + enriched.reserve(cities.size()); + + size_t skipped_count = 0; + for (const auto& city : cities) { + try { + const std::string region_context = + context_service_->GetLocationContext(city); + spdlog::info("[Pipeline] Context for '{}' ({}) gathered:\n{}", + city.city, city.country, region_context); + + enriched.push_back(EnrichedCity{.location = city, + .region_context = region_context}); + } catch (const std::exception& exception) { + ++skipped_count; + spdlog::warn( + "[Pipeline] Skipping city '{}' ({}): context lookup failed: {}", + city.city, city.country, exception.what()); + } + } + + if (skipped_count > 0) { + spdlog::warn( + "[Pipeline] Skipped {} city/cities due to context lookup errors", + skipped_count); + } + + this->GenerateBreweries(enriched); this->LogResults(); return true; } catch (const std::exception& e) { diff --git a/pipeline/src/data_generation/llama/constructor.cpp b/pipeline/src/data_generation/llama/constructor.cpp new file mode 100644 index 0000000..c1400cb --- /dev/null +++ b/pipeline/src/data_generation/llama/constructor.cpp @@ -0,0 +1,53 @@ +/** + * @file data_generation/llama/constructor.cpp + * @brief LlamaGenerator constructor implementation. 
+ */ + +#include + +#include +#include + +#include "biergarten_data_generator.h" +#include "data_generation/llama_generator.h" + +LlamaGenerator::LlamaGenerator(const ApplicationOptions& options, + const std::string& model_path) { + if (model_path.empty()) { + throw std::runtime_error("LlamaGenerator: model path must not be empty"); + } + + if (options.temperature < 0.0F) { + throw std::runtime_error( + "LlamaGenerator: sampling temperature must be >= 0"); + } + + if (options.top_p <= 0.0F || options.top_p > 1.0F) { + throw std::runtime_error( + "LlamaGenerator: sampling top-p must be in (0, 1]"); + } + + if (options.seed < -1) { + throw std::runtime_error( + "LlamaGenerator: seed must be >= 0, or -1 for random"); + } + + if (options.n_ctx == 0 || options.n_ctx > 32768) { + throw std::runtime_error( + "LlamaGenerator: context size must be in range [1, 32768]"); + } + + sampling_temperature_ = options.temperature; + sampling_top_p_ = options.top_p; + sampling_seed_ = (options.seed < 0) + ? static_cast(LLAMA_DEFAULT_SEED) + : static_cast(options.seed); + n_ctx_ = options.n_ctx; + + try { + Load(model_path); + } catch (...) { + llama_backend_free(); + throw; + } +} diff --git a/pipeline/src/data_generation/llama/load.cpp b/pipeline/src/data_generation/llama/load.cpp index 97bd4b4..2b4204e 100644 --- a/pipeline/src/data_generation/llama/load.cpp +++ b/pipeline/src/data_generation/llama/load.cpp @@ -1,7 +1,7 @@ /** * @file data_generation/llama/load.cpp * @brief Initializes llama backend, loads model weights, creates inference - * context, and resets prior resources during model reload. + * context, and resets prior resources during model initialization. 
*/ #include @@ -13,12 +13,6 @@ #include "llama.h" void LlamaGenerator::Load(const std::string& model_path) { - /** - * Validate input and clean up any previously loaded model/context - */ - if (model_path.empty()) - throw std::runtime_error("LlamaGenerator: model path must not be empty"); - if (context_ != nullptr) { llama_free(context_); context_ = nullptr; diff --git a/pipeline/src/data_generation/llama/set_sampling_options.cpp b/pipeline/src/data_generation/llama/set_sampling_options.cpp deleted file mode 100644 index b16c3aa..0000000 --- a/pipeline/src/data_generation/llama/set_sampling_options.cpp +++ /dev/null @@ -1,64 +0,0 @@ -/** - * @file data_generation/llama/set_sampling_options.cpp - * @brief Validates and stores sampling temperature, top-p, seed, and context - * size configuration used by subsequent LlamaGenerator inference calls. - */ - -#include - -#include "data_generation/llama_generator.h" -#include "llama.h" - -void LlamaGenerator::SetSamplingOptions(float temperature, float top_p, - int seed) { - /** - * Validate temperature: controls randomness in output distribution - * 0.0 = deterministic (always pick highest probability token) - * Higher values = more random/diverse output - */ - if (temperature < 0.0f) { - throw std::runtime_error( - "LlamaGenerator: sampling temperature must be >= 0"); - } - - /** - * Validate top-p (nucleus sampling): only sample from top cumulative - * probability e.g., top-p=0.9 means sample from tokens that make up 90% of - * probability mass - */ - if (!(top_p > 0.0f && top_p <= 1.0f)) { - throw std::runtime_error( - "LlamaGenerator: sampling top-p must be in (0, 1]"); - } - - /** - * Validate seed: for reproducible results (-1 uses random seed) - */ - if (seed < -1) { - throw std::runtime_error( - "LlamaGenerator: seed must be >= 0, or -1 for random"); - } - - /** - * Store sampling parameters for use during token generation - */ - sampling_temperature_ = temperature; - sampling_top_p_ = top_p; - sampling_seed_ = 
(seed < 0) ? static_cast(LLAMA_DEFAULT_SEED) - : static_cast(seed); -} - -void LlamaGenerator::SetContextSize(uint32_t n_ctx) { - /** - * Validate context size: must be positive and reasonable for the model - */ - if (n_ctx == 0 || n_ctx > 32768) { - throw std::runtime_error( - "LlamaGenerator: context size must be in range [1, 32768]"); - } - - /** - * Store context size for use during model loading - */ - n_ctx_ = n_ctx; -} diff --git a/pipeline/src/data_generation/mock/load.cpp b/pipeline/src/data_generation/mock/load.cpp deleted file mode 100644 index 5011249..0000000 --- a/pipeline/src/data_generation/mock/load.cpp +++ /dev/null @@ -1,15 +0,0 @@ -/** - * @file data_generation/mock/load.cpp - * @brief Provides MockGenerator initialization behavior, which is a no-op load - * path that logs readiness without model resources. - */ - -#include - -#include - -#include "data_generation/mock_generator.h" - -void MockGenerator::Load(const std::string& /*modelPath*/) { - spdlog::info("[MockGenerator] No model needed"); -} diff --git a/pipeline/src/main.cpp b/pipeline/src/main.cpp index cc48dee..9f53346 100644 --- a/pipeline/src/main.cpp +++ b/pipeline/src/main.cpp @@ -6,15 +6,22 @@ #include +#include #include #include #include +#include #include #include "biergarten_data_generator.h" +#include "data_generation/llama_generator.h" +#include "data_generation/mock_generator.h" +#include "services/enrichment_service.h" +#include "services/wikipedia_service.h" #include "web_client/curl_web_client.h" namespace prog_opts = boost::program_options; +namespace di = boost::di; /** * @brief Parse command-line arguments into ApplicationOptions. 
@@ -44,26 +51,27 @@ auto ParseArguments(const int argc, char** argv, // Handle the "no arguments" or "help" case if (argc == 1) { spdlog::info("Biergarten Pipeline"); - std::stringstream ss; - ss << "\nUsage: biergarten-pipeline [options]\n\n" << desc; - spdlog::info(ss.str()); + std::stringstream usage_stream; + usage_stream << "\nUsage: biergarten-pipeline [options]\n\n" << desc; + spdlog::info(usage_stream.str()); return false; } try { - prog_opts::variables_map vm; - prog_opts::store(prog_opts::parse_command_line(argc, argv, desc), vm); - prog_opts::notify(vm); + prog_opts::variables_map variables_map; + prog_opts::store(prog_opts::parse_command_line(argc, argv, desc), + variables_map); + prog_opts::notify(variables_map); - if (vm.contains("help")) { - std::stringstream ss; - ss << "\n" << desc; - spdlog::info(ss.str()); + if (variables_map.contains("help")) { + std::stringstream help_stream; + help_stream << "\n" << desc; + spdlog::info(help_stream.str()); return false; } - const auto use_mocked = vm["mocked"].as(); - const auto model_path = vm["model"].as(); + const auto use_mocked = variables_map["mocked"].as(); + const auto model_path = variables_map["model"].as(); if (use_mocked && !model_path.empty()) { spdlog::error( @@ -77,9 +85,9 @@ auto ParseArguments(const int argc, char** argv, return false; } - const bool has_llm_params = !vm["temperature"].defaulted() || - !vm["top-p"].defaulted() || - !vm["seed"].defaulted(); + const bool has_llm_params = !variables_map["temperature"].defaulted() || + !variables_map["top-p"].defaulted() || + !variables_map["seed"].defaulted(); if (use_mocked && has_llm_params) { spdlog::warn( @@ -89,10 +97,10 @@ auto ParseArguments(const int argc, char** argv, options.use_mocked = use_mocked; options.model_path = model_path; - options.temperature = vm["temperature"].as(); - options.top_p = vm["top-p"].as(); - options.n_ctx = vm["n-ctx"].as(); - options.seed = vm["seed"].as(); + options.temperature = 
variables_map["temperature"].as(); + options.top_p = variables_map["top-p"].as(); + options.n_ctx = variables_map["n-ctx"].as(); + options.seed = variables_map["seed"].as(); return true; } catch (const std::exception& exception) { @@ -115,8 +123,29 @@ auto main(const int argc, char** argv) noexcept -> int { return 0; } - auto webClient = std::make_shared(); - BiergartenDataGenerator generator(options, std::move(webClient)); + const auto injector = di::make_injector( + di::bind().to(), + di::bind().to(options), + di::bind().to(), + di::bind().to(options.model_path), + di::bind().to([options](const auto& injector) + -> std::unique_ptr { + if (options.use_mocked) { + spdlog::info( + "[Generator] Using MockGenerator (no model path provided)"); + return std::make_unique(); + } + + spdlog::info( + "[Generator] Using LlamaGenerator: {} (temperature={}, " + "top-p={}, " + "n_ctx={}, seed={})", + options.model_path, options.temperature, options.top_p, + options.n_ctx, options.seed); + return injector.template create>(); + })); + + auto generator = injector.create(); if (!generator.Run()) { spdlog::error("Pipeline execution failed"); diff --git a/pipeline/src/wikipedia/constructor.cpp b/pipeline/src/services/wikipedia/constructor.cpp similarity index 84% rename from pipeline/src/wikipedia/constructor.cpp rename to pipeline/src/services/wikipedia/constructor.cpp index 8884375..bb2fc66 100644 --- a/pipeline/src/wikipedia/constructor.cpp +++ b/pipeline/src/services/wikipedia/constructor.cpp @@ -5,7 +5,7 @@ #include -#include "wikipedia/wikipedia_service.h" +#include "services/wikipedia_service.h" WikipediaService::WikipediaService(std::shared_ptr client) : client_(std::move(client)) {} diff --git a/pipeline/src/wikipedia/fetch_extract.cpp b/pipeline/src/services/wikipedia/fetch_extract.cpp similarity index 97% rename from pipeline/src/wikipedia/fetch_extract.cpp rename to pipeline/src/services/wikipedia/fetch_extract.cpp index 6b4259e..a233df3 100644 --- 
a/pipeline/src/wikipedia/fetch_extract.cpp +++ b/pipeline/src/services/wikipedia/fetch_extract.cpp @@ -9,7 +9,7 @@ #include #include -#include "wikipedia/wikipedia_service.h" +#include "services/wikipedia_service.h" auto WikipediaService::FetchExtract(std::string_view query) const -> std::string { diff --git a/pipeline/src/services/wikipedia/get_summary.cpp b/pipeline/src/services/wikipedia/get_summary.cpp new file mode 100644 index 0000000..25a45d9 --- /dev/null +++ b/pipeline/src/services/wikipedia/get_summary.cpp @@ -0,0 +1,54 @@ +/** + * @file wikipedia/get_summary.cpp + * @brief WikipediaService::GetLocationContext() implementation. + */ + +#include + +#include + +#include "services/wikipedia_service.h" + +auto WikipediaService::GetLocationContext(const Location& loc) -> std::string { + const std::string cache_key = loc.city + "|" + loc.country; + const auto cache_it = cache_.find(cache_key); + if (cache_it != cache_.end()) { + return cache_it->second; + } + + std::string result; + + if (!client_) { + cache_.emplace(cache_key, result); + return result; + } + + std::string region_query(loc.city); + if (!loc.country.empty()) { + region_query += ", "; + region_query += loc.country; + } + + const std::string beer_query = "beer in " + loc.country; + + try { + const std::string region_extract = FetchExtract(region_query); + const std::string beer_extract = FetchExtract(beer_query); + + if (!region_extract.empty()) { + result += region_extract; + } + if (!beer_extract.empty()) { + if (!result.empty()) { + result += "\n\n"; + } + result += beer_extract; + } + } catch (const std::runtime_error& e) { + spdlog::debug("WikipediaService lookup failed for '{}': {}", region_query, + e.what()); + } + + cache_.emplace(cache_key, result); + return result; +} diff --git a/pipeline/src/wikipedia/get_summary.cpp b/pipeline/src/wikipedia/get_summary.cpp deleted file mode 100644 index 550229a..0000000 --- a/pipeline/src/wikipedia/get_summary.cpp +++ /dev/null @@ -1,55 +0,0 @@ -/** - 
* @file wikipedia/get_summary.cpp - * @brief WikipediaService::GetSummary() implementation. - */ - -#include - -#include - -#include "wikipedia/wikipedia_service.h" - -auto WikipediaService::GetSummary(std::string_view city, - std::string_view country) -> std::string { - const std::string key = std::string(city) + "|" + std::string(country); - const auto cacheIt = cache_.find(key); - if (cacheIt != cache_.end()) { - return cacheIt->second; - } - - std::string result; - - if (!client_) { - cache_.emplace(key, result); - return result; - } - - std::string regionQuery(city); - if (!country.empty()) { - regionQuery += ", "; - regionQuery += country; - } - - const std::string beerQuery = "beer in " + std::string(country); - - try { - const std::string regionExtract = FetchExtract(regionQuery); - const std::string beerExtract = FetchExtract(beerQuery); - - if (!regionExtract.empty()) { - result += regionExtract; - } - if (!beerExtract.empty()) { - if (!result.empty()) { - result += "\n\n"; - } - result += beerExtract; - } - } catch (const std::runtime_error& e) { - spdlog::debug("WikipediaService lookup failed for '{}': {}", regionQuery, - e.what()); - } - - cache_.emplace(key, result); - return result; -}