From bcfde856fec9bb8365ee5cdbdabca85d19a060f4 Mon Sep 17 00:00:00 2001 From: Aaron Po Date: Sat, 11 Apr 2026 13:21:50 -0400 Subject: [PATCH] Split data models into dedicated headers --- pipeline/.gitignore | 1 + pipeline/biergarten_pipeline.puml | 61 ++++++++++++------- pipeline/includes/biergarten_data_generator.h | 50 +-------------- .../includes/data_generation/data_generator.h | 36 +---------- .../data_generation/llama_generator.h | 3 +- .../includes/data_model/application_options.h | 42 +++++++++++++ .../includes/data_model/brewery_location.h | 22 +++++++ pipeline/includes/data_model/brewery_result.h | 22 +++++++ pipeline/includes/data_model/enriched_city.h | 21 +++++++ .../includes/data_model/generated_brewery.h | 20 ++++++ .../includes/data_model/generation_models.h | 13 ++++ .../includes/data_model/pipeline_models.h | 12 ++++ pipeline/includes/data_model/user_result.h | 22 +++++++ pipeline/includes/web_client/web_client.h | 3 +- .../data_generation/llama/llama_generator.cpp | 2 +- pipeline/src/main.cpp | 1 + 16 files changed, 223 insertions(+), 108 deletions(-) create mode 100644 pipeline/includes/data_model/application_options.h create mode 100644 pipeline/includes/data_model/brewery_location.h create mode 100644 pipeline/includes/data_model/brewery_result.h create mode 100644 pipeline/includes/data_model/enriched_city.h create mode 100644 pipeline/includes/data_model/generated_brewery.h create mode 100644 pipeline/includes/data_model/generation_models.h create mode 100644 pipeline/includes/data_model/pipeline_models.h create mode 100644 pipeline/includes/data_model/user_result.h diff --git a/pipeline/.gitignore b/pipeline/.gitignore index 931567f..c87bb55 100644 --- a/pipeline/.gitignore +++ b/pipeline/.gitignore @@ -3,3 +3,4 @@ build data models *.gguf +BiergartenPipeline.png diff --git a/pipeline/biergarten_pipeline.puml b/pipeline/biergarten_pipeline.puml index 6b34a71..5c7a148 100644 --- a/pipeline/biergarten_pipeline.puml +++ b/pipeline/biergarten_pipeline.puml @@ -26,6 +26,19 @@ package "Composition root" { } package "Core orchestration" { + class BiergartenDataGenerator { + -context_service_: std::shared_ptr + -generator_: std::unique_ptr + -generated_breweries_: std::vector + +BiergartenDataGenerator(context_service: std::shared_ptr, generator: std::unique_ptr) + +Run(): bool + -QueryCitiesWithCountries(): std::vector + -GenerateBreweries(cities: std::vector): void + -LogResults(): void + } +} + +package "Data models" { class ApplicationOptions <> { +model_path: std::string +use_mocked: bool @@ -36,29 +49,20 @@ package "Core orchestration" { +seed: int } - class BiergartenDataGenerator { - -context_service_: std::shared_ptr - -generator_: std::unique_ptr - +BiergartenDataGenerator(context_service: std::shared_ptr, generator: std::unique_ptr) - +Run(): bool - -QueryCitiesWithCountries(): std::vector - -GenerateBreweries(cities: std::vector): void - -LogResults(): void - } - - class EnrichedCity <> { - +location: Location - +region_context: std::string - } -} - -package "Shared models" { class BreweryLocation <> { +city_name: std::string_view +country_name: std::string_view } - class Location + class Location <> { + +city: std::string + +state_province: std::string + +iso3166_2: std::string + +country: std::string + +iso3166_1: std::string + +latitude: double + +longitude: double + } class BreweryResult <> { +name: std::string @@ -69,6 +73,16 @@ package "Shared models" { +username: std::string +bio: std::string } + + class EnrichedCity <> { + +location: Location + +region_context: std::string + } + + class GeneratedBrewery <> { + +location: Location + +brewery: BreweryResult + } } package "Generation" { @@ -105,6 +119,12 @@ package "HTTP" { } } +package "JSON handling" { + class JsonLoader { + {static} +LoadLocations(filepath: std::string): std::vector + } +} + package "Wikipedia" { interface IEnrichmentService { +GetLocationContext(loc: Location): std::string @@ -114,10 +134,6 @@ package "Wikipedia" { +WikipediaService(client: std::shared_ptr) +GetLocationContext(loc: Location): std::string } - - class JsonLoader { - {static} +LoadLocations(filepath: std::string): std::vector - } } Main --> CurlGlobalState @@ -128,6 +144,7 @@ Main ..> DataGenerator : DI factory Main ..> CURLWebClient : DI binding BiergartenDataGenerator *-- EnrichedCity +BiergartenDataGenerator *-- GeneratedBrewery BiergartenDataGenerator ..> JsonLoader : LoadLocations() BiergartenDataGenerator --> IEnrichmentService : context lookup BiergartenDataGenerator --> DataGenerator : brewery generation diff --git a/pipeline/includes/biergarten_data_generator.h b/pipeline/includes/biergarten_data_generator.h index 6a9e4e9..b4ef399 100644 --- a/pipeline/includes/biergarten_data_generator.h +++ b/pipeline/includes/biergarten_data_generator.h @@ -6,45 +6,15 @@ * @brief Core orchestration class for pipeline data generation. */ -#include #include -#include #include #include "data_generation/data_generator.h" +#include "data_model/enriched_city.h" +#include "data_model/generated_brewery.h" #include "data_model/location.h" #include "services/enrichment_service.h" -/** - * @brief Program options for the Biergarten pipeline application. - */ -struct ApplicationOptions { - /// @brief Path to the LLM model file (gguf format); mutually exclusive with - /// use_mocked. - std::string model_path; - - /// @brief Use mocked generator instead of LLM; mutually exclusive with - /// model_path. - bool use_mocked = false; - - /// @brief LLM sampling temperature (0.0 to 1.0, higher = more random). - float temperature = 1.0F; - - /// @brief LLM nucleus sampling top-p parameter (0.0 to 1.0, higher = more - /// random). - float top_p = 0.95F; - - /// @brief LLM top-k sampling parameter. - uint32_t top_k = 64; - - /// @brief Context window size (tokens) for LLM inference. Higher values - /// support longer prompts but use more memory. - uint32_t n_ctx = 2048; - - /// @brief Random seed for sampling (-1 for random, otherwise non-negative). - int seed = -1; -}; - /** * @brief Main data generator class for the Biergarten pipeline. * @@ -81,14 +51,6 @@ class BiergartenDataGenerator { /// @brief Generator dependency selected in the composition root. std::unique_ptr generator_; - /** - * @brief Enriched city data with Wikipedia context. - */ - struct EnrichedCity { - Location location; - std::string region_context; - }; - /** * @brief Load locations from JSON and sample cities. * @@ -108,14 +70,6 @@ class BiergartenDataGenerator { */ void LogResults() const; - /** - * @brief Helper struct to store generated brewery data. - */ - struct GeneratedBrewery { - Location location; - BreweryResult brewery; - }; - /// @brief Stores generated brewery data. std::vector generated_breweries_; }; diff --git a/pipeline/includes/data_generation/data_generator.h b/pipeline/includes/data_generation/data_generator.h index 0f60bfc..48e96ca 100644 --- a/pipeline/includes/data_generation/data_generator.h +++ b/pipeline/includes/data_generation/data_generator.h @@ -7,40 +7,10 @@ */ #include -#include -/** - * @brief Non-owning brewery location input. - */ -struct BreweryLocation { - /// @brief City name. - std::string_view city_name; - - /// @brief Country name. - std::string_view country_name; -}; - -/** - * @brief Generated brewery payload. - */ -struct BreweryResult { - /// @brief Brewery display name. - std::string name; - - /// @brief Brewery description text. - std::string description; -}; - -/** - * @brief Generated user profile payload. - */ -struct UserResult { - /// @brief Username handle. - std::string username; - - /// @brief Short user biography. - std::string bio; -}; +#include "data_model/brewery_location.h" +#include "data_model/brewery_result.h" +#include "data_model/user_result.h" /** * @brief Interface for data generator implementations. diff --git a/pipeline/includes/data_generation/llama_generator.h b/pipeline/includes/data_generation/llama_generator.h index fac2742..8969bd5 100644 --- a/pipeline/includes/data_generation/llama_generator.h +++ b/pipeline/includes/data_generation/llama_generator.h @@ -12,8 +12,7 @@ #include #include "data_generation/data_generator.h" - -struct ApplicationOptions; +#include "data_model/application_options.h" struct llama_model; struct llama_context; diff --git a/pipeline/includes/data_model/application_options.h b/pipeline/includes/data_model/application_options.h new file mode 100644 index 0000000..0cc7cc0 --- /dev/null +++ b/pipeline/includes/data_model/application_options.h @@ -0,0 +1,42 @@ +#ifndef BIERGARTEN_PIPELINE_INCLUDES_DATA_MODEL_APPLICATION_OPTIONS_H_ +#define BIERGARTEN_PIPELINE_INCLUDES_DATA_MODEL_APPLICATION_OPTIONS_H_ + +/** + * @file data_model/application_options.h + * @brief Program options for the Biergarten pipeline application. + */ + +#include +#include + +/** + * @brief Program options for the Biergarten pipeline application. + */ +struct ApplicationOptions { + /// @brief Path to the LLM model file (gguf format); mutually exclusive with + /// use_mocked. + std::string model_path; + + /// @brief Use mocked generator instead of LLM; mutually exclusive with + /// model_path. + bool use_mocked = false; + + /// @brief LLM sampling temperature (0.0 to 1.0, higher = more random). + float temperature = 1.0F; + + /// @brief LLM nucleus sampling top-p parameter (0.0 to 1.0, higher = more + /// random). + float top_p = 0.95F; + + /// @brief LLM top-k sampling parameter. + uint32_t top_k = 64; + + /// @brief Context window size (tokens) for LLM inference. Higher values + /// support longer prompts but use more memory. + uint32_t n_ctx = 2048; + + /// @brief Random seed for sampling (-1 for random, otherwise non-negative). + int seed = -1; +}; + +#endif // BIERGARTEN_PIPELINE_INCLUDES_DATA_MODEL_APPLICATION_OPTIONS_H_ diff --git a/pipeline/includes/data_model/brewery_location.h b/pipeline/includes/data_model/brewery_location.h new file mode 100644 index 0000000..27f9b1b --- /dev/null +++ b/pipeline/includes/data_model/brewery_location.h @@ -0,0 +1,22 @@ +#ifndef BIERGARTEN_PIPELINE_INCLUDES_DATA_MODEL_BREWERY_LOCATION_H_ +#define BIERGARTEN_PIPELINE_INCLUDES_DATA_MODEL_BREWERY_LOCATION_H_ + +/** + * @file data_model/brewery_location.h + * @brief Non-owning brewery location input. + */ + +#include + +/** + * @brief Non-owning brewery location input. + */ +struct BreweryLocation { + /// @brief City name. + std::string_view city_name; + + /// @brief Country name. + std::string_view country_name; +}; + +#endif // BIERGARTEN_PIPELINE_INCLUDES_DATA_MODEL_BREWERY_LOCATION_H_ diff --git a/pipeline/includes/data_model/brewery_result.h b/pipeline/includes/data_model/brewery_result.h new file mode 100644 index 0000000..d40c009 --- /dev/null +++ b/pipeline/includes/data_model/brewery_result.h @@ -0,0 +1,22 @@ +#ifndef BIERGARTEN_PIPELINE_INCLUDES_DATA_MODEL_BREWERY_RESULT_H_ +#define BIERGARTEN_PIPELINE_INCLUDES_DATA_MODEL_BREWERY_RESULT_H_ + +/** + * @file data_model/brewery_result.h + * @brief Generated brewery payload. + */ + +#include + +/** + * @brief Generated brewery payload. + */ +struct BreweryResult { + /// @brief Brewery display name. + std::string name; + + /// @brief Brewery description text. + std::string description; +}; + +#endif // BIERGARTEN_PIPELINE_INCLUDES_DATA_MODEL_BREWERY_RESULT_H_ diff --git a/pipeline/includes/data_model/enriched_city.h b/pipeline/includes/data_model/enriched_city.h new file mode 100644 index 0000000..613fba4 --- /dev/null +++ b/pipeline/includes/data_model/enriched_city.h @@ -0,0 +1,21 @@ +#ifndef BIERGARTEN_PIPELINE_INCLUDES_DATA_MODEL_ENRICHED_CITY_H_ +#define BIERGARTEN_PIPELINE_INCLUDES_DATA_MODEL_ENRICHED_CITY_H_ + +/** + * @file data_model/enriched_city.h + * @brief Enriched city data with Wikipedia context. + */ + +#include + +#include "data_model/location.h" + +/** + * @brief Enriched city data with Wikipedia context. + */ +struct EnrichedCity { + Location location; + std::string region_context; +}; + +#endif // BIERGARTEN_PIPELINE_INCLUDES_DATA_MODEL_ENRICHED_CITY_H_ diff --git a/pipeline/includes/data_model/generated_brewery.h b/pipeline/includes/data_model/generated_brewery.h new file mode 100644 index 0000000..0414888 --- /dev/null +++ b/pipeline/includes/data_model/generated_brewery.h @@ -0,0 +1,20 @@ +#ifndef BIERGARTEN_PIPELINE_INCLUDES_DATA_MODEL_GENERATED_BREWERY_H_ +#define BIERGARTEN_PIPELINE_INCLUDES_DATA_MODEL_GENERATED_BREWERY_H_ + +/** + * @file data_model/generated_brewery.h + * @brief Helper struct to store generated brewery data. + */ + +#include "data_model/brewery_result.h" +#include "data_model/location.h" + +/** + * @brief Helper struct to store generated brewery data. + */ +struct GeneratedBrewery { + Location location; + BreweryResult brewery; +}; + +#endif // BIERGARTEN_PIPELINE_INCLUDES_DATA_MODEL_GENERATED_BREWERY_H_ diff --git a/pipeline/includes/data_model/generation_models.h b/pipeline/includes/data_model/generation_models.h new file mode 100644 index 0000000..3897f93 --- /dev/null +++ b/pipeline/includes/data_model/generation_models.h @@ -0,0 +1,13 @@ +#ifndef BIERGARTEN_PIPELINE_INCLUDES_DATA_MODEL_GENERATION_MODELS_H_ +#define BIERGARTEN_PIPELINE_INCLUDES_DATA_MODEL_GENERATION_MODELS_H_ + +/** + * @file data_model/generation_models.h + * @brief Convenience include for shared generation payload models. + */ + +#include "data_model/brewery_location.h" +#include "data_model/brewery_result.h" +#include "data_model/user_result.h" + +#endif // BIERGARTEN_PIPELINE_INCLUDES_DATA_MODEL_GENERATION_MODELS_H_ diff --git a/pipeline/includes/data_model/pipeline_models.h b/pipeline/includes/data_model/pipeline_models.h new file mode 100644 index 0000000..373db7a --- /dev/null +++ b/pipeline/includes/data_model/pipeline_models.h @@ -0,0 +1,12 @@ +#ifndef BIERGARTEN_PIPELINE_INCLUDES_DATA_MODEL_PIPELINE_MODELS_H_ +#define BIERGARTEN_PIPELINE_INCLUDES_DATA_MODEL_PIPELINE_MODELS_H_ + +/** + * @file data_model/pipeline_models.h + * @brief Convenience include for pipeline-specific data models. + */ + +#include "data_model/enriched_city.h" +#include "data_model/generated_brewery.h" + +#endif // BIERGARTEN_PIPELINE_INCLUDES_DATA_MODEL_PIPELINE_MODELS_H_ diff --git a/pipeline/includes/data_model/user_result.h b/pipeline/includes/data_model/user_result.h new file mode 100644 index 0000000..fa018c8 --- /dev/null +++ b/pipeline/includes/data_model/user_result.h @@ -0,0 +1,22 @@ +#ifndef BIERGARTEN_PIPELINE_INCLUDES_DATA_MODEL_USER_RESULT_H_ +#define BIERGARTEN_PIPELINE_INCLUDES_DATA_MODEL_USER_RESULT_H_ + +/** + * @file data_model/user_result.h + * @brief Generated user profile payload. + */ + +#include + +/** + * @brief Generated user profile payload. + */ +struct UserResult { + /// @brief Username handle. + std::string username; + + /// @brief Short user biography. + std::string bio; +}; + +#endif // BIERGARTEN_PIPELINE_INCLUDES_DATA_MODEL_USER_RESULT_H_ diff --git a/pipeline/includes/web_client/web_client.h b/pipeline/includes/web_client/web_client.h index 1ef0bbf..affe12f 100644 --- a/pipeline/includes/web_client/web_client.h +++ b/pipeline/includes/web_client/web_client.h @@ -1,5 +1,4 @@ -#ifndef -BIERGARTEN_PIPELINE_INCLUDES_WEB_CLIENT_WEB_CLIENT_H_ +#ifndef BIERGARTEN_PIPELINE_INCLUDES_WEB_CLIENT_WEB_CLIENT_H_ #define BIERGARTEN_PIPELINE_INCLUDES_WEB_CLIENT_WEB_CLIENT_H_ /** diff --git a/pipeline/src/data_generation/llama/llama_generator.cpp b/pipeline/src/data_generation/llama/llama_generator.cpp index ed39fc4..8bf036b 100644 --- a/pipeline/src/data_generation/llama/llama_generator.cpp +++ b/pipeline/src/data_generation/llama/llama_generator.cpp @@ -9,7 +9,7 @@ #include #include -#include "biergarten_data_generator.h" +#include "data_model/application_options.h" #include "llama.h" LlamaGenerator::LlamaGenerator(const ApplicationOptions& options, diff --git a/pipeline/src/main.cpp b/pipeline/src/main.cpp index 4ee58b7..cf3e353 100644 --- a/pipeline/src/main.cpp +++ b/pipeline/src/main.cpp @@ -16,6 +16,7 @@ #include "biergarten_data_generator.h" #include "data_generation/llama_generator.h" #include "data_generation/mock_generator.h" +#include "data_model/application_options.h" #include "llama_backend_state.h" #include "services/enrichment_service.h" #include "services/wikipedia_service.h"