Create biergarten brewery pipeline project (#199)

This commit is contained in:
Aaron Po
2026-04-18 19:19:14 -04:00
committed by GitHub
parent fd3c172e35
commit 898cc8971b
59 changed files with 5638 additions and 0 deletions

View File

@@ -0,0 +1,128 @@
@startuml
skinparam style strictuml
skinparam defaultFontName "DM Sans"
skinparam defaultFontSize 14
skinparam titleFontName "Volkhov"
skinparam titleFontSize 20
skinparam backgroundColor #FAFCF9
skinparam defaultFontColor #28342A
skinparam titleFontColor #28342A
skinparam ArrowColor #628A5B
skinparam NoteBackgroundColor #EAF0E8
skinparam NoteBorderColor #547461
skinparam ActivityBackgroundColor #FAFCF9
skinparam ActivityBorderColor #547461
skinparam ActivityDiamondBackgroundColor #FAFCF9
skinparam ActivityDiamondBorderColor #628A5B
skinparam ActivityBarColor #628A5B
skinparam SwimlaneBorderColor transparent
skinparam SwimlaneBorderThickness 0
title The Biergarten Data Pipeline
|#F2F6F0|main.cc|
start
:ParseArguments(argc, argv);
note right
Validates --mocked, --model,
--temperature, --top-p, etc.
end note
if (Are arguments valid?) then (no)
:spdlog::error usage info;
stop
else (yes)
endif
:Init CurlGlobalState & LlamaBackendState;
:di::make_injector(...);
note right
Binds CURLWebClient, WikipediaService,
Gemma4JinjaPromptFormatter, and
either MockGenerator or LlamaGenerator
end note
:injector.create<BiergartenDataGenerator>();
:BiergartenDataGenerator::Run();
|#EAF0E8|BiergartenDataGenerator|
:QueryCitiesWithCountries();
|#E2EBDC|JsonLoader|
:JsonLoader::LoadLocations("locations.json");
:std::ranges::sample(all_locations, 50);
|#EAF0E8|BiergartenDataGenerator|
while (For each sampled Location?) is (Remaining cities)
|#DCE8D8|WikipediaService|
:GetLocationContext(loc);
:FetchExtract("City, Country");
:FetchExtract("beer in Country");
:FetchExtract("beer in City");
note right: Backed by CURLWebClient::Get
|#EAF0E8|BiergartenDataGenerator|
if (Lookup failed?) then (yes)
:spdlog::warn "context lookup failed";
else (no)
:Store EnrichedCity{Location, region_context};
endif
endwhile (Done)
:GenerateBreweries(enriched_cities);
|#E5EDE1|DataGenerator|
while (For each EnrichedCity?) is (Remaining cities)
if (Generator Mode) then (MockGenerator)
:DeterministicHash(location);
:Select from kBreweryAdjectives, kBreweryNouns,\nkBreweryDescriptions;
:Format BreweryResult;
else (LlamaGenerator)
:PrepareRegionContext(region_context);
:LoadBrewerySystemPrompt("prompts/system.md");
:Format user_prompt;
:Attempt = 0;
repeat
:Infer(system_prompt, user_prompt, max_tokens, kBreweryJsonGrammar);
note right
Uses Gemma4JinjaPromptFormatter,
llama_tokenize, and llama_sampler_sample
end note
:ValidateBreweryJson(raw, brewery);
if (Is JSON Valid?) then (yes)
break
else (no)
if (Error == "incomplete JSON") then (yes)
:max_tokens += 700;
endif
:Update user_prompt with validation error;
:Attempt++;
endif
repeat while (Attempt < 3?) is (yes)
if (Still Invalid?) then (yes)
:throw std::runtime_error;
else (no)
:Return BreweryResult;
endif
endif
|#EAF0E8|BiergartenDataGenerator|
if (Exception thrown?) then (yes)
:spdlog::warn "brewery generation failed";
else (no)
:Store GeneratedBrewery;
endif
|#E5EDE1|DataGenerator|
endwhile (Done)
|#EAF0E8|BiergartenDataGenerator|
:LogResults();
note right: spdlog::info dump of generated JSON fields
|#F2F6F0|main.cc|
:Return 0;
stop
@enduml

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,112 @@
@startuml
skinparam style strictuml
skinparam defaultFontName "DM Sans"
skinparam defaultFontSize 14
skinparam titleFontName "Volkhov"
skinparam titleFontSize 20
skinparam backgroundColor #FAFCF9
skinparam defaultFontColor #28342A
skinparam titleFontColor #28342A
skinparam ArrowColor #628A5B
skinparam class {
BackgroundColor #FAFCF9
HeaderBackgroundColor #EAF0E8
BorderColor #547461
ArrowColor #628A5B
FontColor #28342A
}
skinparam note {
BackgroundColor #EAF0E8
BorderColor #547461
FontColor #28342A
}
title The Biergarten Data Pipeline - Class Diagram
class BiergartenDataGenerator {
- context_service_ : std::unique_ptr<IEnrichmentService>
- generator_ : std::unique_ptr<DataGenerator>
- generated_breweries_ : std::vector<GeneratedBrewery>
+ Run() : bool
- QueryCitiesWithCountries() : std::vector<Location>
- GenerateBreweries(cities : std::span<const EnrichedCity>) : void
- LogResults() : void
}
interface IEnrichmentService <<interface>> {
+ GetLocationContext(loc : const Location&) : std::string
}
class WikipediaService {
- client_ : std::unique_ptr<WebClient>
- extract_cache_ : std::unordered_map<std::string, std::string>
+ GetLocationContext(loc : const Location&) : std::string
- FetchExtract(query : std::string_view) : std::string
}
interface WebClient <<interface>> {
+ Get(url : const std::string&) : std::string
+ UrlEncode(value : const std::string&) : std::string
}
class CURLWebClient {
+ Get(url : const std::string&) : std::string
+ UrlEncode(value : const std::string&) : std::string
}
interface DataGenerator <<interface>> {
+ GenerateBrewery(location : const Location&, region_context : const std::string&) : BreweryResult
+ GenerateUser(locale : const std::string&) : UserResult
}
class MockGenerator {
+ GenerateBrewery(...) : BreweryResult
+ GenerateUser(...) : UserResult
- DeterministicHash(location : const Location&) : size_t
}
class LlamaGenerator {
- model_ : ModelHandle
- context_ : ContextHandle
- prompt_formatter_ : std::unique_ptr<IPromptFormatter>
- rng_ : std::mt19937
+ GenerateBrewery(...) : BreweryResult
+ GenerateUser(...) : UserResult
- Load(model_path : const std::string&) : void
- Infer(...) : std::string
- InferFormatted(...) : std::string
- LoadBrewerySystemPrompt(...) : std::string
}
interface IPromptFormatter <<interface>> {
+ Format(system_prompt : std::string_view, user_prompt : std::string_view) : std::string
}
class Gemma4JinjaPromptFormatter {
+ Format(system_prompt : std::string_view, user_prompt : std::string_view) : std::string
}
class JsonLoader {
+ {static} LoadLocations(filepath : const std::filesystem::path&) : std::vector<Location>
}
' Structural Relationships / Dependency Injection
BiergartenDataGenerator *-- IEnrichmentService : owns
BiergartenDataGenerator *-- DataGenerator : owns
IEnrichmentService <|.. WikipediaService : implements
WikipediaService *-- WebClient : owns
WebClient <|.. CURLWebClient : implements
DataGenerator <|.. MockGenerator : implements
DataGenerator <|.. LlamaGenerator : implements
LlamaGenerator *-- IPromptFormatter : uses
IPromptFormatter <|.. Gemma4JinjaPromptFormatter : implements
BiergartenDataGenerator ..> JsonLoader : uses
@enduml

File diff suppressed because one or more lines are too long