Refactor BiergartenDataGenerator to use dependency injection container

This commit is contained in:
Aaron Po
2026-04-09 20:33:48 -04:00
parent 5d93d76e99
commit 824f5b2b4f
23 changed files with 332 additions and 394 deletions

View File

@@ -1,5 +1,9 @@
cmake_minimum_required(VERSION 3.24) cmake_minimum_required(VERSION 3.24)
project(biergarten-pipeline) project(biergarten-pipeline)
# Boost.DI still declares a very old minimum CMake version, which newer CMake
# releases reject unless a policy version floor is provided.
set(CMAKE_POLICY_VERSION_MINIMUM 3.5 CACHE STRING "" FORCE)
# ============================================================================= # =============================================================================
# 1. GPU Detection # 1. GPU Detection
# ============================================================================= # =============================================================================
@@ -71,6 +75,16 @@ FetchContent_Declare(
GIT_TAG b8711 GIT_TAG b8711
) )
FetchContent_MakeAvailable(llama-cpp) FetchContent_MakeAvailable(llama-cpp)
# --- boost-ext/di -------------------------------------------------------------
FetchContent_Declare(
boost-di
GIT_REPOSITORY https://github.com/boost-ext/di.git
GIT_TAG v1.3.0
)
FetchContent_MakeAvailable(boost-di)
if(TARGET Boost.DI AND NOT TARGET boost::di)
add_library(boost::di ALIAS Boost.DI)
endif()
# --- Boost (JSON + program_options) ------------------------------------------ # --- Boost (JSON + program_options) ------------------------------------------
FetchContent_Declare( FetchContent_Declare(
boost boost
@@ -92,15 +106,13 @@ set(SOURCES
# BiergartenDataGenerator methods # BiergartenDataGenerator methods
src/biergarten_data_generator/constructor.cpp src/biergarten_data_generator/constructor.cpp
src/biergarten_data_generator/run.cpp src/biergarten_data_generator/run.cpp
src/biergarten_data_generator/initialize_generator.cpp
src/biergarten_data_generator/query_cities_with_countries.cpp src/biergarten_data_generator/query_cities_with_countries.cpp
src/biergarten_data_generator/enrich_with_wikipedia.cpp
src/biergarten_data_generator/generate_breweries.cpp src/biergarten_data_generator/generate_breweries.cpp
src/biergarten_data_generator/log_results.cpp src/biergarten_data_generator/log_results.cpp
# WikipediaService methods # WikipediaService methods
src/wikipedia/constructor.cpp src/services/wikipedia/constructor.cpp
src/wikipedia/get_summary.cpp src/services/wikipedia/get_summary.cpp
src/wikipedia/fetch_extract.cpp src/services/wikipedia/fetch_extract.cpp
# CURLWebClient and CurlGlobalState methods # CURLWebClient and CurlGlobalState methods
src/web_client/curl_global_state_constructor.cpp src/web_client/curl_global_state_constructor.cpp
src/web_client/curl_global_state_destructor.cpp src/web_client/curl_global_state_destructor.cpp
@@ -111,18 +123,17 @@ set(SOURCES
src/web_client/curl_web_client_url_encode.cpp src/web_client/curl_web_client_url_encode.cpp
# Data generation modules # Data generation modules
src/data_generation/llama/destructor.cpp src/data_generation/llama/destructor.cpp
src/data_generation/llama/constructor.cpp
src/data_generation/llama/generate_brewery.cpp src/data_generation/llama/generate_brewery.cpp
src/data_generation/llama/generate_user.cpp src/data_generation/llama/generate_user.cpp
src/data_generation/llama/helpers.cpp src/data_generation/llama/helpers.cpp
src/data_generation/llama/infer.cpp src/data_generation/llama/infer.cpp
src/data_generation/llama/load.cpp src/data_generation/llama/load.cpp
src/data_generation/llama/load_brewery_prompt.cpp src/data_generation/llama/load_brewery_prompt.cpp
src/data_generation/llama/set_sampling_options.cpp
src/data_generation/mock/data.cpp src/data_generation/mock/data.cpp
src/data_generation/mock/deterministic_hash.cpp src/data_generation/mock/deterministic_hash.cpp
src/data_generation/mock/generate_brewery.cpp src/data_generation/mock/generate_brewery.cpp
src/data_generation/mock/generate_user.cpp src/data_generation/mock/generate_user.cpp
src/data_generation/mock/load.cpp
src/json_handling/json_loader.cpp src/json_handling/json_loader.cpp
) )
# ============================================================================= # =============================================================================
@@ -138,6 +149,7 @@ target_include_directories(${PROJECT_NAME} PRIVATE
) )
target_link_libraries(${PROJECT_NAME} PRIVATE target_link_libraries(${PROJECT_NAME} PRIVATE
llama llama
boost::di
boost_json boost_json
boost_program_options boost_program_options
spdlog::spdlog spdlog::spdlog

View File

@@ -1,28 +1,29 @@
# Biergarten Pipeline # Biergarten Pipeline
Biergarten Pipeline is a C++23 command-line tool that reads a local city list, looks up a short Wikipedia summary for each sampled city, and generates brewery names and descriptions. The current code samples up to four locations per run, then uses either a local GGUF model or the mock generator to produce the output. Biergarten Pipeline is a C++23 command-line tool that reads a local city list, resolves contextual enrichment for each sampled city through an injected service, and generates brewery names and descriptions. The current code samples up to four locations per run, then uses either a local GGUF model or the mock generator to produce the output.
## Pipeline ## Pipeline
| Stage | What happens | | Stage | What happens |
| -------- | ------------------------------------------------------------------------------ | | -------- | ----------------------------------------------------------------------- |
| Load | Reads `locations.json` and picks up to four city/country pairs. | | Load | Reads `locations.json` and picks up to four city/country pairs. |
| Enrich | Fetches a short Wikipedia summary for each city in parallel with `std::async`. | | Enrich | Calls the injected enrichment service for each sampled city. |
| Generate | Passes the city, country, and summary to the active generator. | | Generate | Passes the city, country, and gathered context to the active generator. |
| Log | Writes the generated breweries and any warnings through `spdlog`. | | Log | Writes the generated breweries and any warnings through `spdlog`. |
If one Wikipedia lookup fails, the pipeline skips that city and keeps going. If an enrichment lookup throws, the pipeline skips that city and keeps going. If the lookup returns an empty string, the city stays in the pipeline and is still passed to the generator.
## Core Components ## Core Components
| Component | Role | | Component | Role |
| ----------------------- | ---------------------------------------------------------- | | ----------------------- | ---------------------------------------------------------------------- |
| BiergartenDataGenerator | Orchestrates loading, enrichment, generation, and logging. | | BiergartenDataGenerator | Orchestrates loading, enrichment lookup, generation, and logging. |
| WikipediaService | Fetches city summaries from Wikipedia. | | IEnrichmentService | Abstraction for location-context providers. |
| LlamaGenerator | Runs local GGUF inference and validates output. | | WikipediaService | Default enrichment provider backed by Wikipedia and in-memory caching. |
| MockGenerator | Produces deterministic fallback data without a model. | | LlamaGenerator | Runs local GGUF inference and validates output. |
| JsonLoader | Parses the local `locations.json` file. | | MockGenerator | Produces deterministic fallback data without a model. |
| CURLWebClient | Handles HTTP requests to Wikipedia. | | JsonLoader | Parses the local `locations.json` file. |
| CURLWebClient | Handles HTTP requests to Wikipedia. |
## Build ## Build
@@ -33,7 +34,7 @@ If one Wikipedia lookup fails, the pipeline skips that city and keeps going.
| libcurl | Required for Wikipedia requests. | | libcurl | Required for Wikipedia requests. |
| Optional GPU tooling | CUDA on NVIDIA, HIP/ROCm on supported AMD systems, Metal on Apple Silicon. | | Optional GPU tooling | CUDA on NVIDIA, HIP/ROCm on supported AMD systems, Metal on Apple Silicon. |
Boost, spdlog, and llama.cpp are fetched by CMake. On Apple Silicon, Metal is enabled automatically. On Linux, the build looks for CUDA or HIP/ROCm when the matching toolkit is present. Windows is not supported. Boost, Boost.DI, spdlog, and llama.cpp are fetched by CMake. On Apple Silicon, Metal is enabled automatically. On Linux, the build looks for CUDA or HIP/ROCm when the matching toolkit is present. Windows is not supported.
```bash ```bash
cmake -S . -B build cmake -S . -B build
@@ -61,7 +62,7 @@ Run the executable from the build directory so the copied `locations.json` is av
| `--seed` | Random seed. Default: `-1`. | | `--seed` | Random seed. Default: `-1`. |
| `--help, -h` | Prints usage. | | `--help, -h` | Prints usage. |
`--mocked` and `--model` are mutually exclusive. If neither is set, the program exits with an error. The sampling flags only matter when a model is loaded. `--mocked` and `--model` are mutually exclusive. If neither is set, the program exits with an error. The sampling flags only matter when a model is loaded. Enrichment lookups run sequentially, one sampled city at a time, and a lookup that returns an empty context string is accepted.
## Layout ## Layout

View File

@@ -1,12 +1,12 @@
@startuml @startuml
title Biergarten Pipeline - Class Diagram title Biergarten Pipeline - Class and Composition Diagram
left to right direction left to right direction
skinparam shadowing false skinparam shadowing false
skinparam classAttributeIconSize 0 skinparam classAttributeIconSize 0
skinparam packageStyle rectangle skinparam packageStyle rectangle
package "Entry point" { package "Composition root" {
class Main <<entrypoint>> { class Main <<entrypoint>> {
+main(argc: int, argv: char**): int +main(argc: int, argv: char**): int
} }
@@ -15,6 +15,14 @@ package "Entry point" {
+CurlGlobalState() +CurlGlobalState()
+~CurlGlobalState() +~CurlGlobalState()
} }
note right of Main
Binds with Boost.DI:
- WebClient -> CURLWebClient
- IEnrichmentService -> WikipediaService
- DataGenerator -> MockGenerator or LlamaGenerator
- LlamaGenerator receives ApplicationOptions and model_path directly
end note
} }
package "Core orchestration" { package "Core orchestration" {
@@ -28,16 +36,19 @@ package "Core orchestration" {
} }
class BiergartenDataGenerator { class BiergartenDataGenerator {
-options_: ApplicationOptions -context_service_: std::shared_ptr<IEnrichmentService>
-webClient_: std::shared_ptr<WebClient> -generator_: std::unique_ptr<DataGenerator>
+BiergartenDataGenerator(options: ApplicationOptions, web_client: std::unique_ptr<WebClient>) +BiergartenDataGenerator(context_service: std::shared_ptr<IEnrichmentService>, generator: std::unique_ptr<DataGenerator>)
+Run(): bool +Run(): bool
-InitializeGenerator(): std::unique_ptr<DataGenerator>
-QueryCitiesWithCountries(): std::vector<Location> -QueryCitiesWithCountries(): std::vector<Location>
-EnrichWithWikipedia(cities: std::vector<Location>): std::vector<EnrichedCity> -GenerateBreweries(cities: std::vector<EnrichedCity>): void
-GenerateBreweries(generator: DataGenerator&, cities: std::vector<EnrichedCity>): void
-LogResults(): void -LogResults(): void
} }
class EnrichedCity <<struct>> {
+location: Location
+region_context: std::string
}
} }
package "Shared models" { package "Shared models" {
@@ -56,21 +67,17 @@ package "Shared models" {
package "Generation" { package "Generation" {
interface DataGenerator { interface DataGenerator {
+Load(model_path: std::string): void
+GenerateBrewery(city_name: std::string, country_name: std::string, region_context: std::string): BreweryResult +GenerateBrewery(city_name: std::string, country_name: std::string, region_context: std::string): BreweryResult
+GenerateUser(locale: std::string): UserResult +GenerateUser(locale: std::string): UserResult
} }
class MockGenerator { class MockGenerator {
+Load(model_path: std::string): void
+GenerateBrewery(city_name: std::string, country_name: std::string, region_context: std::string): BreweryResult +GenerateBrewery(city_name: std::string, country_name: std::string, region_context: std::string): BreweryResult
+GenerateUser(locale: std::string): UserResult +GenerateUser(locale: std::string): UserResult
} }
class LlamaGenerator { class LlamaGenerator {
+SetSamplingOptions(temperature: float, top_p: float, seed: int = -1): void +LlamaGenerator(options: ApplicationOptions, model_path: std::string)
+SetContextSize(n_ctx: uint32_t): void
+Load(model_path: std::string): void
+GenerateBrewery(city_name: std::string, country_name: std::string, region_context: std::string): BreweryResult +GenerateBrewery(city_name: std::string, country_name: std::string, region_context: std::string): BreweryResult
+GenerateUser(locale: std::string): UserResult +GenerateUser(locale: std::string): UserResult
} }
@@ -93,9 +100,13 @@ package "HTTP" {
} }
package "Wikipedia" { package "Wikipedia" {
interface IEnrichmentService {
+GetLocationContext(loc: Location): std::string
}
class WikipediaService { class WikipediaService {
+WikipediaService(client: std::shared_ptr<WebClient>) +WikipediaService(client: std::shared_ptr<WebClient>)
+GetSummary(city: std::string_view, country: std::string_view): std::string +GetLocationContext(loc: Location): std::string
} }
class JsonLoader { class JsonLoader {
@@ -106,27 +117,30 @@ package "Wikipedia" {
Main --> CurlGlobalState Main --> CurlGlobalState
Main --> ApplicationOptions Main --> ApplicationOptions
Main --> BiergartenDataGenerator Main --> BiergartenDataGenerator
Main --> CURLWebClient Main ..> IEnrichmentService : DI binding
Main ..> DataGenerator : DI factory
Main ..> CURLWebClient : DI binding
BiergartenDataGenerator *-- ApplicationOptions : options_ BiergartenDataGenerator *-- EnrichedCity
BiergartenDataGenerator --> WebClient : shared_ptr
BiergartenDataGenerator ..> JsonLoader : LoadLocations() BiergartenDataGenerator ..> JsonLoader : LoadLocations()
BiergartenDataGenerator ..> WikipediaService : enrich cities BiergartenDataGenerator --> IEnrichmentService : context lookup
BiergartenDataGenerator ..> DataGenerator : initialize generator BiergartenDataGenerator --> DataGenerator : brewery generation
BiergartenDataGenerator ..> Location BiergartenDataGenerator ..> Location
BiergartenDataGenerator ..> BreweryResult BiergartenDataGenerator ..> BreweryResult
DataGenerator <|.. MockGenerator DataGenerator <|.. MockGenerator
DataGenerator <|.. LlamaGenerator DataGenerator <|.. LlamaGenerator
WebClient <|.. CURLWebClient WebClient <|.. CURLWebClient
IEnrichmentService <|.. WikipediaService
WikipediaService --> WebClient : shared_ptr WikipediaService --> WebClient : shared_ptr
note right of BiergartenDataGenerator note right of BiergartenDataGenerator
Current behavior: Current behavior:
samples up to four locations per run. samples up to four locations per run.
Wikipedia enrichment runs asynchronously per sampled city. Enrichment runs once per sampled city.
If a lookup fails, that city is skipped. If a lookup throws, that city is skipped.
Empty context is retained and still passed to the generator.
end note end note
@enduml @enduml

View File

@@ -6,14 +6,14 @@
* @brief Core orchestration class for pipeline data generation. * @brief Core orchestration class for pipeline data generation.
*/ */
#include <cstdint>
#include <memory> #include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
#include "data_generation/data_generator.h" #include "data_generation/data_generator.h"
#include "data_model/location.h" #include "data_model/location.h"
#include "web_client/web_client.h" #include "services/enrichment_service.h"
#include "wikipedia/wikipedia_service.h"
/** /**
* @brief Program options for the Biergarten pipeline application. * @brief Program options for the Biergarten pipeline application.
@@ -53,18 +53,18 @@ class BiergartenDataGenerator {
/** /**
* @brief Construct a BiergartenDataGenerator with injected dependencies. * @brief Construct a BiergartenDataGenerator with injected dependencies.
* *
* @param options Application configuration options. * @param context_service Context provider for sampled locations.
* @param web_client HTTP client for downloading data. * @param generator Brewery and user data generator.
*/ */
BiergartenDataGenerator(const ApplicationOptions& options, BiergartenDataGenerator(std::shared_ptr<IEnrichmentService> context_service,
std::shared_ptr<WebClient> web_client); std::unique_ptr<DataGenerator> generator);
/** /**
* @brief Run the data generation pipeline. * @brief Run the data generation pipeline.
* *
* Performs the following steps: * Performs the following steps:
* 1. Load curated locations from JSON * 1. Load curated locations from JSON
* 2. Initialize the generator (LLM or Mock) * 2. Resolve context for each city using the injected context service
* 3. Generate brewery data for sampled cities * 3. Generate brewery data for sampled cities
* *
* @return true if successful, false if not * @return true if successful, false if not
@@ -72,11 +72,11 @@ class BiergartenDataGenerator {
bool Run(); bool Run();
private: private:
/// @brief Immutable application options. /// @brief Shared context provider dependency.
const ApplicationOptions options_; std::shared_ptr<IEnrichmentService> context_service_;
/// @brief Shared HTTP client dependency. /// @brief Generator dependency selected in the composition root.
std::shared_ptr<WebClient> webClient_; std::unique_ptr<DataGenerator> generator_;
/** /**
* @brief Enriched city data with Wikipedia context. * @brief Enriched city data with Wikipedia context.
@@ -86,15 +86,6 @@ class BiergartenDataGenerator {
std::string region_context; std::string region_context;
}; };
/**
* @brief Initialize the data generator based on options.
*
* Creates either a MockGenerator (if no model path) or LlamaGenerator.
*
* @return A unique_ptr to the initialized generator.
*/
std::unique_ptr<DataGenerator> InitializeGenerator() const;
/** /**
* @brief Load locations from JSON and sample cities. * @brief Load locations from JSON and sample cities.
* *
@@ -102,23 +93,12 @@ class BiergartenDataGenerator {
*/ */
static std::vector<Location> QueryCitiesWithCountries(); static std::vector<Location> QueryCitiesWithCountries();
/**
* @brief Enrich cities with Wikipedia summaries.
*
* @param cities Vector of sampled locations.
* @return Vector of enriched city data with context.
*/
std::vector<EnrichedCity> EnrichWithWikipedia(
const std::vector<Location>& cities);
/** /**
* @brief Generate breweries for enriched cities. * @brief Generate breweries for enriched cities.
* *
* @param generator The data generator instance.
* @param cities Vector of enriched city data. * @param cities Vector of enriched city data.
*/ */
void GenerateBreweries(DataGenerator& generator, void GenerateBreweries(const std::vector<EnrichedCity>& cities);
const std::vector<EnrichedCity>& cities);
/** /**
* @brief Log the generated brewery results. * @brief Log the generated brewery results.

View File

@@ -38,13 +38,6 @@ class DataGenerator {
/// @brief Virtual destructor for polymorphic cleanup. /// @brief Virtual destructor for polymorphic cleanup.
virtual ~DataGenerator() = default; virtual ~DataGenerator() = default;
/**
* @brief Loads and initializes generator resources.
*
* @param model_path Path to model assets. Implementations may ignore this.
*/
virtual void Load(const std::string& model_path) = 0;
/** /**
* @brief Generates brewery data for a location. * @brief Generates brewery data for a location.
* *

View File

@@ -11,6 +11,8 @@
#include "data_generation/data_generator.h" #include "data_generation/data_generator.h"
struct ApplicationOptions;
struct llama_model; struct llama_model;
struct llama_context; struct llama_context;
@@ -19,35 +21,19 @@ struct llama_context;
*/ */
class LlamaGenerator final : public DataGenerator { class LlamaGenerator final : public DataGenerator {
public: public:
/// @brief Constructs a generator with default sampling and context settings. /**
LlamaGenerator() = default; * @brief Constructs a generator using parsed application options and loads
* the configured model immediately.
*
* @param options Parsed application options.
* @param model_path Filesystem path to GGUF model assets.
*/
LlamaGenerator(const ApplicationOptions& options,
const std::string& model_path);
/// @brief Releases model/context resources. /// @brief Releases model/context resources.
~LlamaGenerator() override; ~LlamaGenerator() override;
/**
* @brief Configures sampling parameters for generation.
*
* @param temperature Sampling temperature.
* @param top_p Nucleus sampling threshold.
* @param seed Seed for sampling; use -1 for random seed.
*/
void SetSamplingOptions(float temperature, float top_p, int seed = -1);
/**
* @brief Sets context window size used during model load.
*
* @param n_ctx Context size in tokens.
*/
void SetContextSize(uint32_t n_ctx);
/**
* @brief Loads model and prepares inference context.
*
* @param model_path Filesystem path to GGUF model.
*/
void Load(const std::string& model_path) override;
/** /**
* @brief Generates brewery data for a specific location. * @brief Generates brewery data for a specific location.
* *
@@ -69,6 +55,13 @@ class LlamaGenerator final : public DataGenerator {
UserResult GenerateUser(const std::string& locale) override; UserResult GenerateUser(const std::string& locale) override;
private: private:
/**
* @brief Loads model and prepares inference context.
*
* @param model_path Filesystem path to GGUF model.
*/
void Load(const std::string& model_path);
/** /**
* @brief Infers text from a user prompt. * @brief Infers text from a user prompt.
* *

View File

@@ -16,13 +16,6 @@
*/ */
class MockGenerator final : public DataGenerator { class MockGenerator final : public DataGenerator {
public: public:
/**
* @brief Initializes the mock generator.
*
* @param model_path Unused for mock generation.
*/
void Load(const std::string& model_path) override;
/** /**
* @brief Generates deterministic brewery data for a location. * @brief Generates deterministic brewery data for a location.
* *

View File

@@ -0,0 +1,30 @@
#ifndef BIERGARTEN_PIPELINE_SERVICES_ENRICHMENT_SERVICE_H_
#define BIERGARTEN_PIPELINE_SERVICES_ENRICHMENT_SERVICE_H_
/**
* @file services/enrichment_service.h
* @brief Abstraction for resolving contextual enrichment for a location.
*/
#include <string>
#include "data_model/location.h"
/**
 * @brief Contract for any provider that can supply descriptive context text
 *        for a Location.
 *
 * The interface is purely abstract: it holds no state and declares a single
 * lookup operation that concrete providers implement.
 */
class IEnrichmentService {
 public:
  /// @brief Defaulted virtual destructor so implementations can be destroyed
  /// through a pointer to this interface.
  virtual ~IEnrichmentService() = default;

  /**
   * @brief Looks up contextual enrichment text for the given location.
   *
   * @param loc Location to enrich.
   * @return Context text, or an empty string if unavailable.
   */
  virtual auto GetLocationContext(const Location& loc) -> std::string = 0;
};
#endif // BIERGARTEN_PIPELINE_SERVICES_ENRICHMENT_SERVICE_H_

View File

@@ -2,7 +2,7 @@
#define BIERGARTEN_PIPELINE_WIKIPEDIA_SERVICE_H_ #define BIERGARTEN_PIPELINE_WIKIPEDIA_SERVICE_H_
/** /**
* @file wikipedia/wikipedia_service.h * @file services/wikipedia_service.h
* @brief Wikipedia summary retrieval service with in-memory caching. * @brief Wikipedia summary retrieval service with in-memory caching.
*/ */
@@ -11,17 +11,17 @@
#include <string_view> #include <string_view>
#include <unordered_map> #include <unordered_map>
#include "services/enrichment_service.h"
#include "web_client/web_client.h" #include "web_client/web_client.h"
/// @brief Provides cached Wikipedia summary lookups for city and country pairs. /// @brief Provides cached Wikipedia summary lookups for city and country pairs.
class WikipediaService { class WikipediaService final : public IEnrichmentService {
public: public:
/// @brief Creates a new Wikipedia service with the provided web client. /// @brief Creates a new Wikipedia service with the provided web client.
explicit WikipediaService(std::shared_ptr<WebClient> client); explicit WikipediaService(std::shared_ptr<WebClient> client);
/// @brief Returns the Wikipedia summary extract for city and country. /// @brief Returns the Wikipedia-derived context for a location.
[[nodiscard]] std::string GetSummary(std::string_view city, [[nodiscard]] std::string GetLocationContext(const Location& loc) override;
std::string_view country);
private: private:
std::string FetchExtract(std::string_view query) const; std::string FetchExtract(std::string_view query) const;

View File

@@ -8,6 +8,7 @@
#include "biergarten_data_generator.h" #include "biergarten_data_generator.h"
BiergartenDataGenerator::BiergartenDataGenerator( BiergartenDataGenerator::BiergartenDataGenerator(
ApplicationOptions const& options, std::shared_ptr<WebClient> web_client) std::shared_ptr<IEnrichmentService> context_service,
: options_(options), webClient_(std::move(web_client)) { std::unique_ptr<DataGenerator> generator)
} : context_service_(std::move(context_service)),
generator_(std::move(generator)) {}

View File

@@ -1,65 +0,0 @@
/**
* @file biergarten_data_generator/enrich_with_wikipedia.cpp
* @brief BiergartenDataGenerator::EnrichWithWikipedia() implementation.
*/
#include <spdlog/spdlog.h>
#include <atomic>
#include <future>
#include <optional>
#include "biergarten_data_generator.h"
#include "wikipedia/wikipedia_service.h"
/**
 * @brief Fetches the Wikipedia summary for one city without letting any
 *        exception escape.
 *
 * A WikipediaService is constructed for this single call using the supplied
 * web client.
 *
 * @param web_client Client handed to the WikipediaService constructor.
 * @param city_ptr Location whose `city` and `country` are looked up.
 * @param skipped_enrichment_count Counter incremented once when the lookup
 *        throws.
 * @return The summary text on success, or std::nullopt if anything threw.
 */
static auto TryGetRegionContext(
    const std::shared_ptr<WebClient>& web_client, const Location* city_ptr,
    std::atomic<size_t>* skipped_enrichment_count) noexcept
    -> std::optional<std::string> {
  // Starts out empty; only filled in when the lookup completes.
  std::optional<std::string> region_context;
  try {
    WikipediaService wikipedia_service(web_client);
    region_context =
        wikipedia_service.GetSummary(city_ptr->city, city_ptr->country);
  } catch (...) {
    // Deliberate best-effort: record the failure and report "no context".
    skipped_enrichment_count->fetch_add(1, std::memory_order_relaxed);
    region_context.reset();
  }
  return region_context;
}
/**
 * @brief Looks up a region context for every city and returns the cities
 *        whose lookup succeeded.
 *
 * One std::async task (std::launch::async) is started per city; results are
 * then collected in the same order as the input. Cities whose lookup produced
 * no value are omitted, and a single warning reports how many were skipped.
 *
 * @param cities Sampled locations to enrich.
 * @return Enriched entries, in input order, for the successful lookups.
 */
auto BiergartenDataGenerator::EnrichWithWikipedia(
    const std::vector<Location>& cities) -> std::vector<EnrichedCity> {
  std::vector<EnrichedCity> enriched;
  enriched.reserve(cities.size());

  // Declared before `pending` so it outlives the futures that point at it.
  std::atomic<size_t> skipped_enrichment_count = 0;

  std::vector<std::future<std::optional<std::string>>> pending;
  pending.reserve(cities.size());
  for (size_t index = 0; index < cities.size(); ++index) {
    const Location* city_ptr = &cities[index];
    pending.push_back(std::async(std::launch::async, TryGetRegionContext,
                                 webClient_, city_ptr,
                                 &skipped_enrichment_count));
  }

  // pending[index] is the task that was launched for cities[index].
  for (size_t index = 0; index < pending.size(); ++index) {
    auto maybe_region_context = pending[index].get();
    if (!maybe_region_context.has_value()) {
      continue;
    }
    const Location& city = cities[index];
    spdlog::debug("[Pipeline] Region context for {}: {}", city.city,
                  *maybe_region_context);
    enriched.push_back(
        EnrichedCity{.location = city,
                     .region_context = std::move(*maybe_region_context)});
  }

  // Every task has been joined above, so the counter is final here.
  const size_t skipped =
      skipped_enrichment_count.load(std::memory_order_relaxed);
  if (skipped > 0) {
    spdlog::warn(
        "[Pipeline] Skipped {} city/cities due to Wikipedia enrichment "
        "errors",
        skipped);
  }

  return enriched;
}

View File

@@ -8,7 +8,7 @@
#include "biergarten_data_generator.h" #include "biergarten_data_generator.h"
void BiergartenDataGenerator::GenerateBreweries( void BiergartenDataGenerator::GenerateBreweries(
DataGenerator& generator, const std::vector<EnrichedCity>& cities) { const std::vector<EnrichedCity>& cities) {
spdlog::info("\n=== SAMPLE BREWERY GENERATION ==="); spdlog::info("\n=== SAMPLE BREWERY GENERATION ===");
generatedBreweries_.clear(); generatedBreweries_.clear();
@@ -16,7 +16,7 @@ void BiergartenDataGenerator::GenerateBreweries(
for (const auto& enriched_city : cities) { for (const auto& enriched_city : cities) {
try { try {
auto brewery = generator.GenerateBrewery( auto brewery = generator_->GenerateBrewery(
enriched_city.location.city, enriched_city.location.country, enriched_city.location.city, enriched_city.location.country,
enriched_city.region_context); enriched_city.region_context);
generatedBreweries_.push_back(GeneratedBrewery{ generatedBreweries_.push_back(GeneratedBrewery{

View File

@@ -1,35 +0,0 @@
/**
* @file biergarten_data_generator/initialize_generator.cpp
* @brief BiergartenDataGenerator::InitializeGenerator() implementation.
*/
#include <spdlog/spdlog.h>
#include "biergarten_data_generator.h"
#include "data_generation/llama_generator.h"
#include "data_generation/mock_generator.h"
/**
 * @brief Builds and loads the generator selected by the application options.
 *
 * An empty `options_.model_path` selects MockGenerator; otherwise a
 * LlamaGenerator is configured with the sampling options and context size
 * from `options_`. In both cases `Load(options_.model_path)` is called before
 * the generator is returned.
 *
 * @return Owning pointer to the loaded generator.
 */
auto BiergartenDataGenerator::InitializeGenerator() const
    -> std::unique_ptr<DataGenerator> {
  spdlog::info("Initializing brewery generator...");

  const auto& model_path = options_.model_path;

  if (model_path.empty()) {
    std::unique_ptr<DataGenerator> mock_generator =
        std::make_unique<MockGenerator>();
    spdlog::info("[Generator] Using MockGenerator (no model path provided)");
    mock_generator->Load(model_path);
    return mock_generator;
  }

  auto llama_generator = std::make_unique<LlamaGenerator>();
  llama_generator->SetSamplingOptions(options_.temperature, options_.top_p,
                                      options_.seed);
  llama_generator->SetContextSize(options_.n_ctx);
  spdlog::info(
      "[Generator] Using LlamaGenerator: {} (temperature={}, top-p={}, "
      "n_ctx={}, seed={})",
      model_path, options_.temperature, options_.top_p, options_.n_ctx,
      options_.seed);

  // Hand ownership to the base-class pointer before loading, as callers only
  // ever see the DataGenerator interface.
  std::unique_ptr<DataGenerator> generator = std::move(llama_generator);
  generator->Load(model_path);
  return generator;
}

View File

@@ -9,10 +9,35 @@
auto BiergartenDataGenerator::Run() -> bool { auto BiergartenDataGenerator::Run() -> bool {
try { try {
const std::unique_ptr<DataGenerator> generator = InitializeGenerator();
const std::vector<Location> cities = QueryCitiesWithCountries(); const std::vector<Location> cities = QueryCitiesWithCountries();
const std::vector<EnrichedCity> enriched = EnrichWithWikipedia(cities); std::vector<EnrichedCity> enriched;
this->GenerateBreweries(*generator, enriched); enriched.reserve(cities.size());
size_t skipped_count = 0;
for (const auto& city : cities) {
try {
const std::string region_context =
context_service_->GetLocationContext(city);
spdlog::info("[Pipeline] Context for '{}' ({}) gathered:\n{}",
city.city, city.country, region_context);
enriched.push_back(EnrichedCity{.location = city,
.region_context = region_context});
} catch (const std::exception& exception) {
++skipped_count;
spdlog::warn(
"[Pipeline] Skipping city '{}' ({}): context lookup failed: {}",
city.city, city.country, exception.what());
}
}
if (skipped_count > 0) {
spdlog::warn(
"[Pipeline] Skipped {} city/cities due to context lookup errors",
skipped_count);
}
this->GenerateBreweries(enriched);
this->LogResults(); this->LogResults();
return true; return true;
} catch (const std::exception& e) { } catch (const std::exception& e) {

View File

@@ -0,0 +1,53 @@
/**
* @file data_generation/llama/constructor.cpp
* @brief LlamaGenerator constructor implementation.
*/
#include <llama.h>
#include <stdexcept>
#include <string>
#include "biergarten_data_generator.h"
#include "data_generation/llama_generator.h"
/**
 * @brief Validates the supplied options, stores the sampling/context
 *        configuration, and loads the model.
 *
 * @param options Parsed application options; `temperature`, `top_p`, `seed`
 *        and `n_ctx` are read from it.
 * @param model_path Filesystem path to the GGUF model; must not be empty.
 *
 * @throws std::runtime_error if the model path is empty, the temperature is
 *         negative or NaN, top-p is outside (0, 1] or NaN, the seed is below
 *         -1, or the context size is outside [1, 32768].
 * @throws Any exception raised by Load() is rethrown after
 *         llama_backend_free() has been called.
 */
LlamaGenerator::LlamaGenerator(const ApplicationOptions& options,
                               const std::string& model_path) {
  if (model_path.empty()) {
    throw std::runtime_error("LlamaGenerator: model path must not be empty");
  }

  // The floating-point checks are written as negated "is valid" tests so that
  // NaN, for which every ordered comparison is false, is rejected instead of
  // slipping through a pair of "is out of range" comparisons.
  if (!(options.temperature >= 0.0F)) {
    throw std::runtime_error(
        "LlamaGenerator: sampling temperature must be >= 0");
  }

  if (!(options.top_p > 0.0F && options.top_p <= 1.0F)) {
    throw std::runtime_error(
        "LlamaGenerator: sampling top-p must be in (0, 1]");
  }

  if (options.seed < -1) {
    throw std::runtime_error(
        "LlamaGenerator: seed must be >= 0, or -1 for random");
  }

  if (options.n_ctx == 0 || options.n_ctx > 32768) {
    throw std::runtime_error(
        "LlamaGenerator: context size must be in range [1, 32768]");
  }

  sampling_temperature_ = options.temperature;
  sampling_top_p_ = options.top_p;
  // After the range check above, the only negative seed left is -1, which
  // selects llama's default seed.
  sampling_seed_ = (options.seed < 0)
                       ? static_cast<uint32_t>(LLAMA_DEFAULT_SEED)
                       : static_cast<uint32_t>(options.seed);
  n_ctx_ = options.n_ctx;

  try {
    Load(model_path);
  } catch (...) {
    // The destructor does not run when a constructor throws, so the backend
    // is released here before propagating the failure.
    // NOTE(review): this assumes Load() frees any model/context it created
    // before throwing - confirm against load.cpp.
    llama_backend_free();
    throw;
  }
}

View File

@@ -1,7 +1,7 @@
/** /**
* @file data_generation/llama/load.cpp * @file data_generation/llama/load.cpp
* @brief Initializes llama backend, loads model weights, creates inference * @brief Initializes llama backend, loads model weights, creates inference
* context, and resets prior resources during model reload. * context, and resets prior resources during model initialization.
*/ */
#include <spdlog/spdlog.h> #include <spdlog/spdlog.h>
@@ -13,12 +13,6 @@
#include "llama.h" #include "llama.h"
void LlamaGenerator::Load(const std::string& model_path) { void LlamaGenerator::Load(const std::string& model_path) {
/**
* Validate input and clean up any previously loaded model/context
*/
if (model_path.empty())
throw std::runtime_error("LlamaGenerator: model path must not be empty");
if (context_ != nullptr) { if (context_ != nullptr) {
llama_free(context_); llama_free(context_);
context_ = nullptr; context_ = nullptr;

View File

@@ -1,64 +0,0 @@
/**
* @file data_generation/llama/set_sampling_options.cpp
* @brief Validates and stores sampling temperature, top-p, seed, and context
* size configuration used by subsequent LlamaGenerator inference calls.
*/
#include <stdexcept>
#include "data_generation/llama_generator.h"
#include "llama.h"
/**
 * @brief Validates and stores the sampling parameters.
 *
 * @param temperature Sampling temperature; must be >= 0 (0.0 = deterministic,
 *        higher values = more random/diverse output).
 * @param top_p Nucleus-sampling threshold; must be in (0, 1].
 * @param seed Seed for reproducible results; must be >= 0, or -1 to use
 *        LLAMA_DEFAULT_SEED.
 * @throws std::runtime_error if any argument is out of range. NaN is treated
 *         as out of range for both floating-point arguments.
 */
void LlamaGenerator::SetSamplingOptions(float temperature, float top_p,
                                        int seed) {
  /**
   * Both floating-point checks use a negated "in range" test: ordered
   * comparisons with NaN are always false, so `temperature < 0.0f` alone
   * would let NaN through.
   */
  if (!(temperature >= 0.0f)) {
    throw std::runtime_error(
        "LlamaGenerator: sampling temperature must be >= 0");
  }
  if (!(top_p > 0.0f && top_p <= 1.0f)) {
    throw std::runtime_error(
        "LlamaGenerator: sampling top-p must be in (0, 1]");
  }
  if (seed < -1) {
    throw std::runtime_error(
        "LlamaGenerator: seed must be >= 0, or -1 for random");
  }

  /**
   * Store sampling parameters for use during token generation
   */
  sampling_temperature_ = temperature;
  sampling_top_p_ = top_p;
  sampling_seed_ = (seed < 0) ? static_cast<uint32_t>(LLAMA_DEFAULT_SEED)
                              : static_cast<uint32_t>(seed);
}
/**
 * @brief Validates and stores the context size used during model loading.
 *
 * @param n_ctx Requested context size; accepted range is [1, 32768].
 * @throws std::runtime_error if @p n_ctx is outside that range.
 */
void LlamaGenerator::SetContextSize(uint32_t n_ctx) {
  constexpr uint32_t kMinContextSize = 1;
  constexpr uint32_t kMaxContextSize = 32768;

  const bool within_limits =
      n_ctx >= kMinContextSize && n_ctx <= kMaxContextSize;
  if (!within_limits) {
    throw std::runtime_error(
        "LlamaGenerator: context size must be in range [1, 32768]");
  }

  n_ctx_ = n_ctx;
}

View File

@@ -1,15 +0,0 @@
/**
* @file data_generation/mock/load.cpp
* @brief Provides MockGenerator initialization behavior, which is a no-op load
* path that logs readiness without model resources.
*/
#include <spdlog/spdlog.h>
#include <string>
#include "data_generation/mock_generator.h"
/**
 * @brief No-op load step for the mock generator.
 *
 * The path argument is ignored; the call only logs that no model is needed.
 */
void MockGenerator::Load([[maybe_unused]] const std::string& model_path) {
  spdlog::info("[MockGenerator] No model needed");
}

View File

@@ -6,15 +6,22 @@
#include <spdlog/spdlog.h> #include <spdlog/spdlog.h>
#include <boost/di.hpp>
#include <boost/program_options.hpp> #include <boost/program_options.hpp>
#include <exception> #include <exception>
#include <memory> #include <memory>
#include <sstream>
#include <string> #include <string>
#include "biergarten_data_generator.h" #include "biergarten_data_generator.h"
#include "data_generation/llama_generator.h"
#include "data_generation/mock_generator.h"
#include "services/enrichment_service.h"
#include "services/wikipedia_service.h"
#include "web_client/curl_web_client.h" #include "web_client/curl_web_client.h"
namespace prog_opts = boost::program_options; namespace prog_opts = boost::program_options;
namespace di = boost::di;
/** /**
* @brief Parse command-line arguments into ApplicationOptions. * @brief Parse command-line arguments into ApplicationOptions.
@@ -44,26 +51,27 @@ auto ParseArguments(const int argc, char** argv,
// Handle the "no arguments" or "help" case // Handle the "no arguments" or "help" case
if (argc == 1) { if (argc == 1) {
spdlog::info("Biergarten Pipeline"); spdlog::info("Biergarten Pipeline");
std::stringstream ss; std::stringstream usage_stream;
ss << "\nUsage: biergarten-pipeline [options]\n\n" << desc; usage_stream << "\nUsage: biergarten-pipeline [options]\n\n" << desc;
spdlog::info(ss.str()); spdlog::info(usage_stream.str());
return false; return false;
} }
try { try {
prog_opts::variables_map vm; prog_opts::variables_map variables_map;
prog_opts::store(prog_opts::parse_command_line(argc, argv, desc), vm); prog_opts::store(prog_opts::parse_command_line(argc, argv, desc),
prog_opts::notify(vm); variables_map);
prog_opts::notify(variables_map);
if (vm.contains("help")) { if (variables_map.contains("help")) {
std::stringstream ss; std::stringstream help_stream;
ss << "\n" << desc; help_stream << "\n" << desc;
spdlog::info(ss.str()); spdlog::info(help_stream.str());
return false; return false;
} }
const auto use_mocked = vm["mocked"].as<bool>(); const auto use_mocked = variables_map["mocked"].as<bool>();
const auto model_path = vm["model"].as<std::string>(); const auto model_path = variables_map["model"].as<std::string>();
if (use_mocked && !model_path.empty()) { if (use_mocked && !model_path.empty()) {
spdlog::error( spdlog::error(
@@ -77,9 +85,9 @@ auto ParseArguments(const int argc, char** argv,
return false; return false;
} }
const bool has_llm_params = !vm["temperature"].defaulted() || const bool has_llm_params = !variables_map["temperature"].defaulted() ||
!vm["top-p"].defaulted() || !variables_map["top-p"].defaulted() ||
!vm["seed"].defaulted(); !variables_map["seed"].defaulted();
if (use_mocked && has_llm_params) { if (use_mocked && has_llm_params) {
spdlog::warn( spdlog::warn(
@@ -89,10 +97,10 @@ auto ParseArguments(const int argc, char** argv,
options.use_mocked = use_mocked; options.use_mocked = use_mocked;
options.model_path = model_path; options.model_path = model_path;
options.temperature = vm["temperature"].as<float>(); options.temperature = variables_map["temperature"].as<float>();
options.top_p = vm["top-p"].as<float>(); options.top_p = variables_map["top-p"].as<float>();
options.n_ctx = vm["n-ctx"].as<uint32_t>(); options.n_ctx = variables_map["n-ctx"].as<uint32_t>();
options.seed = vm["seed"].as<int>(); options.seed = variables_map["seed"].as<int>();
return true; return true;
} catch (const std::exception& exception) { } catch (const std::exception& exception) {
@@ -115,8 +123,29 @@ auto main(const int argc, char** argv) noexcept -> int {
return 0; return 0;
} }
auto webClient = std::make_shared<CURLWebClient>(); const auto injector = di::make_injector(
BiergartenDataGenerator generator(options, std::move(webClient)); di::bind<WebClient>().to<CURLWebClient>(),
di::bind<ApplicationOptions>().to(options),
di::bind<IEnrichmentService>().to<WikipediaService>(),
di::bind<std::string>().to(options.model_path),
di::bind<DataGenerator>().to([options](const auto& injector)
-> std::unique_ptr<DataGenerator> {
if (options.use_mocked) {
spdlog::info(
"[Generator] Using MockGenerator (no model path provided)");
return std::make_unique<MockGenerator>();
}
spdlog::info(
"[Generator] Using LlamaGenerator: {} (temperature={}, "
"top-p={}, "
"n_ctx={}, seed={})",
options.model_path, options.temperature, options.top_p,
options.n_ctx, options.seed);
return injector.template create<std::unique_ptr<LlamaGenerator>>();
}));
auto generator = injector.create<BiergartenDataGenerator>();
if (!generator.Run()) { if (!generator.Run()) {
spdlog::error("Pipeline execution failed"); spdlog::error("Pipeline execution failed");

View File

@@ -5,7 +5,7 @@
#include <utility> #include <utility>
#include "wikipedia/wikipedia_service.h" #include "services/wikipedia_service.h"
WikipediaService::WikipediaService(std::shared_ptr<WebClient> client) WikipediaService::WikipediaService(std::shared_ptr<WebClient> client)
: client_(std::move(client)) {} : client_(std::move(client)) {}

View File

@@ -9,7 +9,7 @@
#include <string> #include <string>
#include <string_view> #include <string_view>
#include "wikipedia/wikipedia_service.h" #include "services/wikipedia_service.h"
auto WikipediaService::FetchExtract(std::string_view query) const auto WikipediaService::FetchExtract(std::string_view query) const
-> std::string { -> std::string {

View File

@@ -0,0 +1,54 @@
/**
* @file wikipedia/get_summary.cpp
* @brief WikipediaService::GetLocationContext() implementation.
*/
#include <spdlog/spdlog.h>
#include <string>
#include "services/wikipedia_service.h"
auto WikipediaService::GetLocationContext(const Location& loc) -> std::string {
const std::string cache_key = loc.city + "|" + loc.country;
const auto cache_it = cache_.find(cache_key);
if (cache_it != cache_.end()) {
return cache_it->second;
}
std::string result;
if (!client_) {
cache_.emplace(cache_key, result);
return result;
}
std::string region_query(loc.city);
if (!loc.country.empty()) {
region_query += ", ";
region_query += loc.country;
}
const std::string beer_query = "beer in " + loc.country;
try {
const std::string region_extract = FetchExtract(region_query);
const std::string beer_extract = FetchExtract(beer_query);
if (!region_extract.empty()) {
result += region_extract;
}
if (!beer_extract.empty()) {
if (!result.empty()) {
result += "\n\n";
}
result += beer_extract;
}
} catch (const std::runtime_error& e) {
spdlog::debug("WikipediaService lookup failed for '{}': {}", region_query,
e.what());
}
cache_.emplace(cache_key, result);
return result;
}

View File

@@ -1,55 +0,0 @@
/**
* @file wikipedia/get_summary.cpp
* @brief WikipediaService::GetSummary() implementation.
*/
#include <spdlog/spdlog.h>
#include <string>
#include "wikipedia/wikipedia_service.h"
auto WikipediaService::GetSummary(std::string_view city,
std::string_view country) -> std::string {
const std::string key = std::string(city) + "|" + std::string(country);
const auto cacheIt = cache_.find(key);
if (cacheIt != cache_.end()) {
return cacheIt->second;
}
std::string result;
if (!client_) {
cache_.emplace(key, result);
return result;
}
std::string regionQuery(city);
if (!country.empty()) {
regionQuery += ", ";
regionQuery += country;
}
const std::string beerQuery = "beer in " + std::string(country);
try {
const std::string regionExtract = FetchExtract(regionQuery);
const std::string beerExtract = FetchExtract(beerQuery);
if (!regionExtract.empty()) {
result += regionExtract;
}
if (!beerExtract.empty()) {
if (!result.empty()) {
result += "\n\n";
}
result += beerExtract;
}
} catch (const std::runtime_error& e) {
spdlog::debug("WikipediaService lookup failed for '{}': {}", regionQuery,
e.what());
}
cache_.emplace(key, result);
return result;
}