mirror of
https://github.com/aaronpo97/the-biergarten-app.git
synced 2026-06-01 01:54:00 +00:00
1416 lines
52 KiB
Diff
1416 lines
52 KiB
Diff
diff --git a/pipeline/CMakeLists.txt b/pipeline/CMakeLists.txt
|
|
index 02f769b..0c0aab1 100644
|
|
--- a/pipeline/CMakeLists.txt
|
|
+++ b/pipeline/CMakeLists.txt
|
|
@@ -60,6 +60,28 @@ endif()
|
|
# Require system Boost for JSON and Program Options to speed up build times
|
|
find_package(Boost REQUIRED COMPONENTS json program_options)
|
|
|
|
# SQLite is built from the upstream amalgamation so no system SQLite package
# is needed. The archive is pinned by URL and verified against URL_HASH.
FetchContent_Declare(
  sqlite_amalgamation
  URL https://www.sqlite.org/2026/sqlite-amalgamation-3530000.zip
  URL_HASH SHA3_256=c2325c53b3b41761469f91cfb078e96882ac5d85bac10c11b0bd8f253b031e5b
)

# FetchContent_MakeAvailable replaces the manual
# FetchContent_GetProperties / FetchContent_Populate sequence, whose
# single-argument Populate form is deprecated by policy CMP0169 (CMake 3.30+).
# The amalgamation archive contains only sources (no CMakeLists.txt), so this
# call just downloads and unpacks it and sets sqlite_amalgamation_SOURCE_DIR.
FetchContent_MakeAvailable(sqlite_amalgamation)

# Only define the target when no other part of the build already provides one.
if(NOT TARGET sqlite3)
  add_library(sqlite3 STATIC
    ${sqlite_amalgamation_SOURCE_DIR}/sqlite3.c
  )
  target_include_directories(sqlite3 PUBLIC
    ${sqlite_amalgamation_SOURCE_DIR}
  )
  # PUBLIC so consumers see the same threading mode the library was built with.
  target_compile_definitions(sqlite3 PUBLIC
    SQLITE_THREADSAFE=1
  )
endif()
|
|
+
|
|
FetchContent_Declare(
|
|
llama-cpp
|
|
GIT_REPOSITORY https://github.com/ggml-org/llama.cpp.git
|
|
@@ -97,6 +119,16 @@ set(SOURCES
|
|
src/services/wikipedia/wikipedia_service.cc
|
|
src/services/wikipedia/get_summary.cc
|
|
src/services/wikipedia/fetch_extract.cc
|
|
+ src/services/sqlite/sqlite_export_service.cc
|
|
+ src/services/sqlite/build_database_path.cc
|
|
+ src/services/sqlite/build_location_key.cc
|
|
+ src/services/sqlite/initialize_schema.cc
|
|
+ src/services/sqlite/prepare_statements.cc
|
|
+ src/services/sqlite/initialize.cc
|
|
+ src/services/sqlite/process_record.cc
|
|
+ src/services/sqlite/finalize_statements.cc
|
|
+ src/services/sqlite/rollback_and_close_no_throw.cc
|
|
+ src/services/sqlite/finalize.cc
|
|
src/web_client/curl_global_state.cc
|
|
src/web_client/curl_web_client_get.cc
|
|
src/web_client/curl_web_client_url_encode.cc
|
|
@@ -129,6 +161,7 @@ target_link_libraries(${PROJECT_NAME} PRIVATE
|
|
Boost::json
|
|
Boost::program_options
|
|
spdlog::spdlog
|
|
+ sqlite3
|
|
CURL::libcurl
|
|
)
|
|
|
|
diff --git a/pipeline/README.md b/pipeline/README.md
|
|
index 5463382..103d9dd 100644
|
|
--- a/pipeline/README.md
|
|
+++ b/pipeline/README.md
|
|
@@ -6,7 +6,7 @@ A C++20 command-line pipeline that samples city records from local JSON, enriche
|
|
|
|
## Table of Contents
|
|
|
|
-- [How It Fits the Main App](#how-it-fits-the-main-app)
|
|
+- [How It Fits The Main App](#how-it-fits-the-main-app)
|
|
- [Tech Stack](#tech-stack)
|
|
- [Build](#build)
|
|
- [Model](#model)
|
|
@@ -26,7 +26,7 @@ A C++20 command-line pipeline that samples city records from local JSON, enriche
|
|
|
|
---
|
|
|
|
-## How It Fits the Main App
|
|
+## How It Fits The Main App
|
|
|
|
The pipeline is a data ingestion layer. It sits outside the web app runtime and produces seed records the app imports at startup or during a dedicated seed step.
|
|
|
|
@@ -46,17 +46,19 @@ The pipeline is a data ingestion layer. It sits outside the web app runtime and
|
|
- Boost.JSON, Boost.ProgramOptions, Boost.DI
|
|
- spdlog
|
|
- libcurl
|
|
+- SQLite amalgamation fetched and compiled via CMake FetchContent
|
|
- llama.cpp
|
|
|
|
-The build fetches Boost.DI, spdlog, and llama.cpp via CMake. Metal is enabled on Apple Silicon; CUDA or HIP/ROCm is detected on Linux when the toolkit is present.
|
|
+The build fetches Boost.DI, spdlog, llama.cpp, and SQLite via CMake. Metal is enabled on Apple Silicon; CUDA or HIP/ROCm is detected on Linux when the toolkit is present.
|
|
|
|
-> **Code Style:** Modern C++20 throughout — RAII for ownership, `std::unique_ptr` for injected dependencies, `std::optional` for parse outcomes, `std::span` for read-only views over generated city data, structured bindings in pipeline loops. Formatting follows the Google C++ Style Guide via `.clang-format` with a narrow column limit and two-space indentation.
|
|
+> **Code Style:** Modern C++20 throughout - RAII for ownership, `std::unique_ptr` for injected dependencies, `std::optional` for parse outcomes, `std::span` for read-only views over generated city data, structured bindings in pipeline loops. Formatting follows the Google C++ Style Guide via `.clang-format` with a narrow column limit and two-space indentation.
|
|
|
|
---
|
|
|
|
## Build
|
|
|
|
Requirements: C++20 compiler, CMake 3.24+, libcurl, Boost (JSON and ProgramOptions).
|
|
+SQLite is fetched from the upstream amalgamation, so no system SQLite package is required.
|
|
|
|
```bash
|
|
cmake -S . -B build
|
|
@@ -80,7 +82,7 @@ curl -L \
|
|
|
|
## Run
|
|
|
|
-Run from `build/` so the copied `locations.json` and `prompts/` are available.
|
|
+Run from `build/` so the copied `locations.json` and `prompts/` are available. Each run also writes a fresh dated SQLite file such as `biergarten_seed_2026-04-19T15-30-45.123456Z.sqlite` into the working directory.
|
|
|
|
```bash
|
|
./biergarten-pipeline --mocked
|
|
@@ -102,7 +104,7 @@ Run from `build/` so the copied `locations.json` and `prompts/` are available.
|
|
|
|
`--mocked` and `--model` are mutually exclusive. Omitting both exits with an error before the pipeline starts. Sampling flags are ignored when `--mocked` is set.
|
|
|
|
-The post-build step copies `prompts/` into `build/prompts/`. Rebuild after editing [prompts/system.md](prompts/system.md).
|
|
+The post-build step copies `prompts/` into `build/prompts/`. Rebuild after editing `prompts/system.md`.
|
|
|
|
---
|
|
|
|
@@ -110,23 +112,25 @@ The post-build step copies `prompts/` into `build/prompts/`. Rebuild after editi
|
|
|
|
### Pipeline Stages
|
|
|
|
-| Stage | Implementation |
|
|
-| -------- | -------------------------------------------------------------------------------------------------------------- |
|
|
-| Load | `JsonLoader::LoadLocations()` reads `locations.json` into typed `Location` records. |
|
|
-| Sample | `BiergartenDataGenerator::QueryCitiesWithCountries()` samples up to 50 locations per run. |
|
|
-| Enrich | `WikipediaService` fetches city and beer context. Keeps going when a lookup fails. |
|
|
-| Generate | `MockGenerator` or `LlamaGenerator` produces brewery names and descriptions in English and the local language. |
|
|
-| Log | `spdlog` writes results and warnings to the console. |
|
|
+| Stage | Implementation |
|
|
+| -------- | --------------------------------------------------------------------------------------------------------------------------------------- |
|
|
+| Load | `JsonLoader::LoadLocations()` reads `locations.json` into typed `Location` records. |
|
|
+| Sample | `BiergartenDataGenerator::QueryCitiesWithCountries()` samples up to 50 locations per run. |
|
|
+| Enrich | `WikipediaService` fetches city and beer context. Keeps going when a lookup fails. |
|
|
+| Generate | `MockGenerator` or `LlamaGenerator` produces brewery names and descriptions in English and the local language. |
|
|
+| Store | `SqliteExportService` writes each successful brewery into a fresh dated `.sqlite` database with normalized location and brewery tables. |
|
|
+| Log | `spdlog` writes results and warnings to the console. |
|
|
|
|
If enrichment or generation fails for a city, that city is skipped and the pipeline continues.
|
|
|
|
### Key Components
|
|
|
|
-- `src/main.cc` — argument parsing and Boost.DI composition root.
|
|
-- `JsonLoader` — validates curated location input.
|
|
-- `WikipediaService` — queries Wikipedia extracts, caches results, returns empty context on failure.
|
|
-- `LlamaGenerator` — formats prompts for Gemma 4, validates JSON output, retries malformed responses up to three times. If output looks truncated, the retry raises the token budget before trying again.
|
|
-- `MockGenerator` — stable hash-based output so the same city input always produces the same brewery.
|
|
+- `src/main.cc` - argument parsing and Boost.DI composition root.
|
|
+- `JsonLoader` - validates curated location input.
|
|
+- `WikipediaService` - queries Wikipedia extracts, caches results, returns empty context on failure.
|
|
+- `LlamaGenerator` - formats prompts for Gemma 4, validates JSON output, retries malformed responses up to three times. If output looks truncated, the retry raises the token budget before trying again.
|
|
+- `MockGenerator` - stable hash-based output so the same city input always produces the same brewery.
|
|
+- `SqliteExportService` - creates a dated SQLite file per run and persists each successful brewery into normalized tables.
|
|
- Brewery payloads include English and local-language name and description fields.
|
|
|
|
### Runtime Behaviour
|
|
@@ -139,11 +143,11 @@ If enrichment or generation fails for a city, that city is skipped and the pipel
|
|
|
|
`MockGenerator` uses stable hashes for repeatable output in demos and Storybook runs.
|
|
|
|
-### Process Flow — Activity Diagram
|
|
+### Process Flow - Activity Diagram
|
|
|
|

|
|
|
|
-### Architectural Overview — Class Diagram
|
|
+### Architectural Overview - Class Diagram
|
|
|
|

|
|
|
|
@@ -151,7 +155,7 @@ If enrichment or generation fails for a city, that city is skipped and the pipel
|
|
|
|
## Generated Output
|
|
|
|
-Each successful run stores a `GeneratedBrewery` pair with the source location and a `BreweryResult` payload.
|
|
+Each successful run stores a `GeneratedBrewery` pair with the source location and a `BreweryResult` payload. The same generated records are also written to a fresh SQLite export file named with the current UTC timestamp.
|
|
|
|
| Field | Meaning |
|
|
| ------------------- | ------------------------------------------ |
|
|
@@ -255,7 +259,7 @@ For languages such as Welsh (Wales), Maori (Aotearoa/New Zealand), or Sicilian (
|
|
|
|
## Tested Hardware
|
|
|
|
-### ARM macOS — M1 Pro
|
|
+### ARM macOS - M1 Pro
|
|
|
|
| | |
|
|
| --------- | --------------------------------- |
|
|
@@ -266,7 +270,7 @@ For languages such as Welsh (Wales), Maori (Aotearoa/New Zealand), or Sicilian (
|
|
| Model | Gemma 4 E4B |
|
|
| Inference | llama.cpp with Metal |
|
|
|
|
-### x86_64 Linux — NVIDIA RTX 2000
|
|
+### x86_64 Linux - NVIDIA RTX 2000
|
|
|
|
| | |
|
|
| --------- | ------------------------------ |
|
|
@@ -293,11 +297,12 @@ For languages such as Welsh (Wales), Maori (Aotearoa/New Zealand), or Sicilian (
|
|
|
|
## Code Tour
|
|
|
|
-- `src/main.cc` — argument parsing and DI composition root.
|
|
-- `src/biergarten_data_generator/` — orchestration, sampling, logging.
|
|
-- `src/services/wikipedia/` — enrichment service and cache.
|
|
-- `src/data_generation/llama/` — local inference, prompt loading, output validation.
|
|
-- `src/data_generation/mock/` — deterministic fallback.
|
|
+- `src/main.cc` - argument parsing and DI composition root.
|
|
+- `src/biergarten_data_generator/` - orchestration, sampling, logging, and export.
|
|
+- `src/services/wikipedia/` - enrichment service and cache.
|
|
+- `src/services/sqlite/` - SQLite export implementation.
|
|
+- `src/data_generation/llama/` - local inference, prompt loading, output validation.
|
|
+- `src/data_generation/mock/` - deterministic fallback.
|
|
|
|
---
|
|
|
|
@@ -312,11 +317,7 @@ For languages such as Welsh (Wales), Maori (Aotearoa/New Zealand), or Sicilian (
|
|
|
|
## Next Steps
|
|
|
|
-The pipeline currently produces city-aware brewery records. The next passes add SQLite output and additional fixture types so the app can exercise the full brewery domain without live data.
|
|
-
|
|
-### SQLite Output _(Highest Importance)_
|
|
-
|
|
-Write generated records to a SQLite database for downstream OLTP seeding. Normalized schema with foreign keys between locations and breweries. Output replaces the current log-only result so the pipeline functions as a proper ingestion layer.
|
|
+The pipeline currently produces city-aware brewery records and dated SQLite exports. The next passes add additional fixture types so the app can exercise the full brewery domain without live data.
|
|
|
|
### Testing _(Very High Importance)_
|
|
|
|
@@ -336,7 +337,7 @@ Generate user profiles with stable names, bios, locale hints, and preference sig
|
|
|
|
### Check-In System
|
|
|
|
-Produce timestamped check-in events between users and breweries. Use a J-curve activity profile — a small set of users accounts for most check-ins, the rest appear occasionally. Add bursty behaviour around weekends and travel periods.
|
|
+Produce timestamped check-in events between users and breweries. Use a J-curve activity profile - a small set of users accounts for most check-ins, the rest appear occasionally. Add bursty behaviour around weekends and travel periods.
|
|
|
|
### Beer Ratings
|
|
|
|
diff --git a/pipeline/diagrams/activity-diagram.puml b/pipeline/diagrams/activity-diagram.puml
|
|
index bc95437..87caefb 100644
|
|
--- a/pipeline/diagrams/activity-diagram.puml
|
|
+++ b/pipeline/diagrams/activity-diagram.puml
|
|
@@ -15,19 +15,14 @@ skinparam ActivityBorderColor #547461
|
|
skinparam ActivityDiamondBackgroundColor #FAFCF9
|
|
skinparam ActivityDiamondBorderColor #628A5B
|
|
skinparam ActivityBarColor #628A5B
|
|
-skinparam SwimlaneBorderColor transparent
|
|
-skinparam SwimlaneBorderThickness 0
|
|
+skinparam SwimlaneBorderColor #547461
|
|
+skinparam SwimlaneBorderThickness 0.3
|
|
|
|
-title The Biergarten Data Pipeline
|
|
+title The Biergarten Data Pipeline (Streaming Architecture)
|
|
|
|
|#F2F6F0|main.cc|
|
|
start
|
|
:ParseArguments(argc, argv);
|
|
-note right
|
|
- Validates --mocked, --model,
|
|
- --temperature, --top-p, etc.
|
|
-end note
|
|
-
|
|
if (Are arguments valid?) then (no)
|
|
:spdlog::error usage info;
|
|
stop
|
|
@@ -36,13 +31,22 @@ endif
|
|
|
|
:Init CurlGlobalState & LlamaBackendState;
|
|
:di::make_injector(...);
|
|
+:injector.create<std::unique_ptr<BiergartenDataGenerator>>();
|
|
+:BiergartenDataGenerator::Run();
|
|
+
|
|
+|#EAF0E8|BiergartenDataGenerator|
|
|
+:Initialize SQLite export;
|
|
+
|
|
+|#E0EAE0|SqliteExportService|
|
|
+:GetUtcTimestamp() from SystemDateTimeProvider;
|
|
+:Initialize();
|
|
note right
|
|
- Binds CURLWebClient, WikipediaService,
|
|
- Gemma4JinjaPromptFormatter, and
|
|
- either MockGenerator or LlamaGenerator
|
|
+ Builds a fresh biergarten_seed_<UTC datetime>.sqlite filename
|
|
+ Appends a numeric suffix if the timestamp already exists
|
|
+ Opens DB Connection
|
|
+ Executes Schema DDL
|
|
+ Begins Transaction
|
|
end note
|
|
-:injector.create<BiergartenDataGenerator>();
|
|
-:BiergartenDataGenerator::Run();
|
|
|
|
|#EAF0E8|BiergartenDataGenerator|
|
|
:QueryCitiesWithCountries();
|
|
@@ -55,71 +59,64 @@ end note
|
|
while (For each sampled Location?) is (Remaining cities)
|
|
|#DCE8D8|WikipediaService|
|
|
:GetLocationContext(loc);
|
|
- :FetchExtract("City, Country");
|
|
- :FetchExtract("beer in Country");
|
|
- :FetchExtract("beer in City");
|
|
- note right: Backed by CURLWebClient::Get
|
|
-
|
|
+ :FetchExtracts(City, Country, Beer);
|
|
|#EAF0E8|BiergartenDataGenerator|
|
|
- if (Lookup failed?) then (yes)
|
|
- :spdlog::warn "context lookup failed";
|
|
- else (no)
|
|
- :Store EnrichedCity{Location, region_context};
|
|
- endif
|
|
+ :Store EnrichedCity{Location, region_context};
|
|
endwhile (Done)
|
|
|
|
+|#EAF0E8|BiergartenDataGenerator|
|
|
:GenerateBreweries(enriched_cities);
|
|
|
|
|#E5EDE1|DataGenerator|
|
|
while (For each EnrichedCity?) is (Remaining cities)
|
|
if (Generator Mode) then (MockGenerator)
|
|
- :DeterministicHash(location);
|
|
- :Select from kBreweryAdjectives, kBreweryNouns,\nkBreweryDescriptions;
|
|
- :Format BreweryResult;
|
|
+ :DeterministicHash & Format;
|
|
else (LlamaGenerator)
|
|
- :PrepareRegionContext(region_context);
|
|
+ :PrepareRegionContext;
|
|
:LoadBrewerySystemPrompt("prompts/system.md");
|
|
- :Format user_prompt;
|
|
- :Attempt = 0;
|
|
repeat
|
|
:Infer(system_prompt, user_prompt, max_tokens, kBreweryJsonGrammar);
|
|
- note right
|
|
- Uses Gemma4JinjaPromptFormatter,
|
|
- llama_tokenize, and llama_sampler_sample
|
|
- end note
|
|
:ValidateBreweryJson(raw, brewery);
|
|
-
|
|
if (Is JSON Valid?) then (yes)
|
|
break
|
|
else (no)
|
|
- if (Error == "incomplete JSON") then (yes)
|
|
- :max_tokens += 700;
|
|
- endif
|
|
- :Update user_prompt with validation error;
|
|
:Attempt++;
|
|
endif
|
|
-
|
|
repeat while (Attempt < 3?) is (yes)
|
|
+ endif
|
|
|
|
- if (Still Invalid?) then (yes)
|
|
- :throw std::runtime_error;
|
|
+ |#EAF0E8|BiergartenDataGenerator|
|
|
+ if (Generation successful?) then (yes)
|
|
+ |#E0EAE0|SqliteExportService|
|
|
+ :ProcessRecord(GeneratedBrewery);
|
|
+ if (Location in cache?) then (yes)
|
|
+ :Reuse location_id;
|
|
else (no)
|
|
- :Return BreweryResult;
|
|
+ :Insert Location & Cache ID;
|
|
endif
|
|
- endif
|
|
+ :Insert Brewery (FK: location_id);
|
|
|
|
- |#EAF0E8|BiergartenDataGenerator|
|
|
- if (Exception thrown?) then (yes)
|
|
- :spdlog::warn "brewery generation failed";
|
|
+ if (Exception caught during insert?) then (yes)
|
|
+ |#EAF0E8|BiergartenDataGenerator|
|
|
+ :spdlog::warn "Failed to stream record to SQLite export";
|
|
+ note right
|
|
+ Data loss is prevented per-record.
|
|
+ The pipeline continues running.
|
|
+ end note
|
|
+ else (no)
|
|
+ endif
|
|
else (no)
|
|
- :Store GeneratedBrewery;
|
|
+ :spdlog::warn "Generation failed, skipping...";
|
|
endif
|
|
|#E5EDE1|DataGenerator|
|
|
endwhile (Done)
|
|
|
|
-|#EAF0E8|BiergartenDataGenerator|
|
|
-:LogResults();
|
|
-note right: spdlog::info dump of generated JSON fields
|
|
+|#E0EAE0|SqliteExportService|
|
|
+:Finalize();
|
|
+note right
|
|
+ Commits Transaction
|
|
+ Closes Database Connection
|
|
+end note
|
|
|
|
|#F2F6F0|main.cc|
|
|
:Return 0;
|
|
diff --git a/pipeline/diagrams/class-diagram.puml b/pipeline/diagrams/class-diagram.puml
|
|
index 9046b96..74acc97 100644
|
|
--- a/pipeline/diagrams/class-diagram.puml
|
|
+++ b/pipeline/diagrams/class-diagram.puml
|
|
@@ -28,6 +28,7 @@ title The Biergarten Data Pipeline - Class Diagram
|
|
class BiergartenDataGenerator {
|
|
- context_service_ : std::unique_ptr<IEnrichmentService>
|
|
- generator_ : std::unique_ptr<DataGenerator>
|
|
+ - exporter_ : std::unique_ptr<IExportService>
|
|
- generated_breweries_ : std::vector<GeneratedBrewery>
|
|
+ Run() : bool
|
|
- QueryCitiesWithCountries() : std::vector<Location>
|
|
@@ -92,9 +93,39 @@ class JsonLoader {
|
|
+ {static} LoadLocations(filepath : const std::filesystem::path&) : std::vector<Location>
|
|
}
|
|
|
|
+interface IExportService <<interface>> {
|
|
+ + Initialize() : void
|
|
+ + ProcessRecord(brewery : const GeneratedBrewery&) : void
|
|
+ + Finalize() : void
|
|
+}
|
|
+
|
|
+class SqliteExportService {
|
|
+ - date_time_provider_ : std::unique_ptr<IDateTimeProvider>
|
|
+ - run_timestamp_utc_ : std::string
|
|
+ - database_path_ : std::filesystem::path
|
|
+ - db_handle_ : sqlite3*
|
|
+ - insert_location_stmt_ : sqlite3_stmt*
|
|
+ - insert_brewery_stmt_ : sqlite3_stmt*
|
|
+ - transaction_open_ : bool
|
|
+ - location_cache_ : std::unordered_map<std::string, sqlite3_int64>
|
|
+ + Initialize() : void
|
|
+ + ProcessRecord(brewery : const GeneratedBrewery&) : void
|
|
+ + Finalize() : void
|
|
+ - InitializeSchema() : void
|
|
+}
|
|
+
|
|
+interface IDateTimeProvider <<interface>> {
|
|
+ + GetUtcTimestamp() : std::string
|
|
+}
|
|
+
|
|
+class SystemDateTimeProvider {
|
|
+ + GetUtcTimestamp() : std::string
|
|
+}
|
|
+
|
|
' Structural Relationships / Dependency Injection
|
|
BiergartenDataGenerator *-- IEnrichmentService : owns
|
|
BiergartenDataGenerator *-- DataGenerator : owns
|
|
+BiergartenDataGenerator *-- IExportService : owns
|
|
|
|
IEnrichmentService <|.. WikipediaService : implements
|
|
WikipediaService *-- WebClient : owns
|
|
@@ -109,4 +140,9 @@ LlamaGenerator *-- IPromptFormatter : uses
|
|
IPromptFormatter <|.. Gemma4JinjaPromptFormatter : implements
|
|
|
|
BiergartenDataGenerator ..> JsonLoader : uses
|
|
+
|
|
+IExportService <|.. SqliteExportService : implements
|
|
+SqliteExportService *-- IDateTimeProvider : owns
|
|
+IDateTimeProvider <|.. SystemDateTimeProvider : implements
|
|
+
|
|
@enduml
|
|
diff --git a/pipeline/includes/biergarten_data_generator.h b/pipeline/includes/biergarten_data_generator.h
|
|
index 260ea29..5663cd6 100644
|
|
--- a/pipeline/includes/biergarten_data_generator.h
|
|
+++ b/pipeline/includes/biergarten_data_generator.h
|
|
@@ -14,6 +14,7 @@
|
|
#include "data_model/enriched_city.h"
|
|
#include "data_model/generated_brewery.h"
|
|
#include "data_model/location.h"
|
|
+#include "services/export_service.h"
|
|
#include "services/enrichment_service.h"
|
|
|
|
/**
|
|
@@ -28,10 +29,12 @@ class BiergartenDataGenerator {
|
|
* @brief Construct a BiergartenDataGenerator with injected dependencies.
|
|
*
|
|
* @param context_service Context provider for sampled locations.
|
|
- * @param generator Brewery and user data generator.
|
|
+ * @param generator Brewery and user data generator.
|
|
+ * @param exporter Storage backend for generated brewery data.
|
|
*/
|
|
BiergartenDataGenerator(std::unique_ptr<IEnrichmentService> context_service,
|
|
- std::unique_ptr<DataGenerator> generator);
|
|
+ std::unique_ptr<DataGenerator> generator,
|
|
+ std::unique_ptr<IExportService> exporter);
|
|
|
|
/**
|
|
* @brief Run the data generation pipeline.
|
|
@@ -52,6 +55,9 @@ class BiergartenDataGenerator {
|
|
/// @brief Generator dependency selected in the composition root.
|
|
std::unique_ptr<DataGenerator> generator_;
|
|
|
|
+ /// @brief Storage backend for generated brewery records.
|
|
+ std::unique_ptr<IExportService> exporter_;
|
|
+
|
|
/**
|
|
* @brief Load locations from JSON and sample cities.
|
|
*
|
|
diff --git a/pipeline/includes/services/date_time_provider.h b/pipeline/includes/services/date_time_provider.h
|
|
new file mode 100644
|
|
index 0000000..80c99d7
|
|
--- /dev/null
|
|
+++ b/pipeline/includes/services/date_time_provider.h
|
|
@@ -0,0 +1,68 @@
|
|
+#ifndef BIERGARTEN_PIPELINE_INCLUDES_SERVICES_DATE_TIME_PROVIDER_H_
|
|
+#define BIERGARTEN_PIPELINE_INCLUDES_SERVICES_DATE_TIME_PROVIDER_H_
|
|
+
|
|
+/**
|
|
+ * @file services/date_time_provider.h
|
|
+ * @brief Abstraction for UTC timestamp generation.
|
|
+ */
|
|
+
|
|
+#include <chrono>
|
|
+#include <ctime>
|
|
+#include <iomanip>
|
|
+#include <sstream>
|
|
+#include <stdexcept>
|
|
+#include <string>
|
|
+
|
|
/**
 * @brief Interface for UTC timestamp providers.
 *
 * Implementations are used polymorphically; the type is neither copyable nor
 * movable.
 */
class IDateTimeProvider {
 public:
  IDateTimeProvider() = default;

  /// @brief Virtual destructor for polymorphic cleanup.
  virtual ~IDateTimeProvider() = default;

  IDateTimeProvider(const IDateTimeProvider&) = delete;
  IDateTimeProvider& operator=(const IDateTimeProvider&) = delete;
  IDateTimeProvider(IDateTimeProvider&&) = delete;
  IDateTimeProvider& operator=(IDateTimeProvider&&) = delete;

  /**
   * @brief Returns the current UTC timestamp in a file-safe format.
   *
   * @return UTC timestamp string.
   */
  virtual std::string GetUtcTimestamp() = 0;
};

/**
 * @brief Timestamp provider backed by the system clock.
 */
class SystemDateTimeProvider final : public IDateTimeProvider {
 public:
  /**
   * @brief Formats the current system time as
   *        `YYYY-MM-DDTHH-MM-SS.uuuuuuZ` (UTC, microsecond precision).
   *
   * @return The formatted timestamp.
   * @throws std::runtime_error if the calendar conversion fails.
   */
  std::string GetUtcTimestamp() override {
    constexpr int kFractionalSecondWidth = 6;

    const auto now = std::chrono::system_clock::now();

    // Truncate to whole seconds explicitly: to_time_t() is allowed to round,
    // which could make the seconds field disagree with the fraction below.
    const auto whole_seconds = std::chrono::floor<std::chrono::seconds>(now);
    const std::time_t now_time =
        std::chrono::system_clock::to_time_t(whole_seconds);

    // Convert into caller-owned storage. std::gmtime() returns a pointer to
    // shared static data and is therefore not safe to call concurrently.
    std::tm utc_time{};
#if defined(_WIN32)
    const bool converted = gmtime_s(&utc_time, &now_time) == 0;
#else
    const bool converted = gmtime_r(&now_time, &utc_time) != nullptr;
#endif
    if (!converted) {
      throw std::runtime_error("Failed to format UTC timestamp");
    }

    // Sub-second remainder relative to the truncated time point, so it is
    // always in the range [0, 1s).
    const auto fractional_seconds =
        std::chrono::duration_cast<std::chrono::microseconds>(now -
                                                              whole_seconds);

    std::ostringstream output;
    output << std::put_time(&utc_time, "%Y-%m-%dT%H-%M-%S");
    output << '.' << std::setw(kFractionalSecondWidth) << std::setfill('0')
           << fractional_seconds.count() << 'Z';
    return output.str();
  }
};
|
|
+
|
|
+#endif // BIERGARTEN_PIPELINE_INCLUDES_SERVICES_DATE_TIME_PROVIDER_H_
|
|
diff --git a/pipeline/includes/services/export_service.h b/pipeline/includes/services/export_service.h
|
|
new file mode 100644
|
|
index 0000000..a45c483
|
|
--- /dev/null
|
|
+++ b/pipeline/includes/services/export_service.h
|
|
@@ -0,0 +1,40 @@
|
|
+#ifndef BIERGARTEN_PIPELINE_INCLUDES_SERVICES_EXPORT_SERVICE_H_
|
|
+#define BIERGARTEN_PIPELINE_INCLUDES_SERVICES_EXPORT_SERVICE_H_
|
|
+
|
|
+/**
|
|
+ * @file services/export_service.h
|
|
+ * @brief Abstraction for persisting generated brewery data.
|
|
+ */
|
|
+
|
|
+#include "data_model/generated_brewery.h"
|
|
+
|
|
+/**
|
|
+ * @brief Interface for services that persist generated brewery records.
|
|
+ */
|
|
+class IExportService {
|
|
+ public:
|
|
+ IExportService() = default;
|
|
+
|
|
+ /// @brief Virtual destructor for polymorphic cleanup.
|
|
+ virtual ~IExportService() = default;
|
|
+
|
|
+ IExportService(const IExportService&) = delete;
|
|
+ IExportService& operator=(const IExportService&) = delete;
|
|
+ IExportService(IExportService&&) = delete;
|
|
+ IExportService& operator=(IExportService&&) = delete;
|
|
+
|
|
+ /// @brief Prepares the export destination for a new run.
|
|
+ virtual void Initialize() = 0;
|
|
+
|
|
+ /**
|
|
+ * @brief Persists one generated brewery record.
|
|
+ *
|
|
+ * @param brewery Generated brewery payload to store.
|
|
+ */
|
|
+ virtual void ProcessRecord(const GeneratedBrewery& brewery) = 0;
|
|
+
|
|
+ /// @brief Finalizes the export destination.
|
|
+ virtual void Finalize() = 0;
|
|
+};
|
|
+
|
|
+#endif // BIERGARTEN_PIPELINE_INCLUDES_SERVICES_EXPORT_SERVICE_H_
|
|
diff --git a/pipeline/includes/services/sqlite_export_service.h b/pipeline/includes/services/sqlite_export_service.h
|
|
new file mode 100644
|
|
index 0000000..957c82a
|
|
--- /dev/null
|
|
+++ b/pipeline/includes/services/sqlite_export_service.h
|
|
@@ -0,0 +1,57 @@
|
|
+#ifndef BIERGARTEN_PIPELINE_INCLUDES_SERVICES_SQLITE_EXPORT_SERVICE_H_
|
|
+#define BIERGARTEN_PIPELINE_INCLUDES_SERVICES_SQLITE_EXPORT_SERVICE_H_
|
|
+
|
|
+/**
|
|
+ * @file services/sqlite_export_service.h
|
|
+ * @brief SQLite-backed export service for generated brewery data.
|
|
+ */
|
|
+
|
|
+#include <filesystem>
|
|
+#include <memory>
|
|
+#include <string>
|
|
+#include <unordered_map>
|
|
+
|
|
+#include "services/date_time_provider.h"
|
|
+#include "services/export_service.h"
|
|
+#include "services/sqlite_export_service_helpers.h"
|
|
+
|
|
+/**
|
|
+ * @brief Persists generated brewery records into a fresh SQLite database.
|
|
+ */
|
|
+class SqliteExportService final : public IExportService {
|
|
+ public:
|
|
+ SqliteExportService();
|
|
+ ~SqliteExportService() override;
|
|
+
|
|
+ SqliteExportService(const SqliteExportService&) = delete;
|
|
+ SqliteExportService& operator=(const SqliteExportService&) = delete;
|
|
+ SqliteExportService(SqliteExportService&&) = delete;
|
|
+ SqliteExportService& operator=(SqliteExportService&&) = delete;
|
|
+
|
|
+ void Initialize() override;
|
|
+ void ProcessRecord(const GeneratedBrewery& brewery) override;
|
|
+ void Finalize() override;
|
|
+
|
|
+ private:
|
|
+ using SqliteDatabaseHandle = sqlite_export_service_internal::SqliteDatabaseHandle;
|
|
+ using SqliteStatementHandle = sqlite_export_service_internal::SqliteStatementHandle;
|
|
+
|
|
+ void InitializeSchema();
|
|
+ void PrepareStatements();
|
|
+ void RollbackAndCloseNoThrow() noexcept;
|
|
+ void FinalizeStatements() noexcept;
|
|
+
|
|
+ [[nodiscard]] std::filesystem::path BuildDatabasePath() const;
|
|
+ [[nodiscard]] static std::string BuildLocationKey(const Location& location);
|
|
+
|
|
+ std::unique_ptr<IDateTimeProvider> date_time_provider_;
|
|
+ std::string run_timestamp_utc_;
|
|
+ std::filesystem::path database_path_;
|
|
+ SqliteDatabaseHandle db_handle_;
|
|
+ SqliteStatementHandle insert_location_stmt_;
|
|
+ SqliteStatementHandle insert_brewery_stmt_;
|
|
+ bool transaction_open_ = false;
|
|
+ std::unordered_map<std::string, sqlite3_int64> location_cache_;
|
|
+};
|
|
+
|
|
+#endif // BIERGARTEN_PIPELINE_INCLUDES_SERVICES_SQLITE_EXPORT_SERVICE_H_
|
|
diff --git a/pipeline/includes/services/sqlite_export_service_helpers.h b/pipeline/includes/services/sqlite_export_service_helpers.h
|
|
new file mode 100644
|
|
index 0000000..acdf890
|
|
--- /dev/null
|
|
+++ b/pipeline/includes/services/sqlite_export_service_helpers.h
|
|
@@ -0,0 +1,250 @@
|
|
+#ifndef BIERGARTEN_PIPELINE_INCLUDES_SERVICES_SQLITE_EXPORT_SERVICE_HELPERS_H_
|
|
+#define BIERGARTEN_PIPELINE_INCLUDES_SERVICES_SQLITE_EXPORT_SERVICE_HELPERS_H_
|
|
+
|
|
+/**
|
|
+ * @file services/sqlite_export_service_helpers.h
|
|
+ * @brief Internal SQLite export helpers shared across per-method translation
|
|
+ * units.
|
|
+ */
|
|
+
|
|
+#include <sqlite3.h>
|
|
+
|
|
+#include <boost/json.hpp>
|
|
+#include <cstddef>
|
|
+#include <cstring>
|
|
+#include <filesystem>
|
|
+#include <limits>
|
|
+#include <memory>
|
|
+#include <stdexcept>
|
|
+#include <string>
|
|
+#include <string_view>
|
|
+#include <vector>
|
|
+
|
|
+namespace sqlite_export_service_internal {
|
|
+
|
|
+struct SqliteDatabaseDeleter {
|
|
+  // sqlite3_close() refuses with SQLITE_BUSY (and leaks the connection) while
|
|
+  // statements are still live; sqlite3_close_v2() defers the close instead.
|
|
+  void operator()(sqlite3* handle) const noexcept {
|
|
+    sqlite3_close_v2(handle);  // documented as a harmless no-op for nullptr
|
|
+  }
|
|
+};
|
|
+
|
|
+struct SqliteStatementDeleter {
|
|
+ void operator()(sqlite3_stmt* statement) const noexcept {
|
|
+ if (statement != nullptr) {
|
|
+ sqlite3_finalize(statement);
|
|
+ }
|
|
+ }
|
|
+};
|
|
+
|
|
+using SqliteDatabaseHandle = std::unique_ptr<sqlite3, SqliteDatabaseDeleter>;
|
|
+using SqliteStatementHandle =
|
|
+ std::unique_ptr<sqlite3_stmt, SqliteStatementDeleter>;
|
|
+
|
|
+inline constexpr std::string_view kCreateLocationsTableSql = R"sql(
|
|
+
|
|
+CREATE TABLE IF NOT EXISTS locations (
|
|
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
+ city TEXT NOT NULL,
|
|
+ state_province TEXT NOT NULL,
|
|
+ iso3166_2 TEXT NOT NULL,
|
|
+ country TEXT NOT NULL,
|
|
+ iso3166_1 TEXT NOT NULL,
|
|
+ local_languages_json TEXT NOT NULL,
|
|
+ latitude REAL NOT NULL,
|
|
+ longitude REAL NOT NULL,
|
|
+ UNIQUE(city, state_province, iso3166_2, country, latitude, longitude)
|
|
+);
|
|
+
|
|
+)sql";
|
|
+
|
|
+inline constexpr std::string_view kCreateBreweriesTableSql = R"sql(
|
|
+
|
|
+CREATE TABLE IF NOT EXISTS breweries (
|
|
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
+ location_id INTEGER NOT NULL,
|
|
+ name_en TEXT NOT NULL,
|
|
+ description_en TEXT NOT NULL,
|
|
+ name_local TEXT NOT NULL,
|
|
+ description_local TEXT NOT NULL,
|
|
+ FOREIGN KEY(location_id) REFERENCES locations(id) ON DELETE CASCADE
|
|
+);
|
|
+
|
|
+CREATE INDEX IF NOT EXISTS idx_breweries_location_id ON breweries(location_id);
|
|
+
|
|
+)sql";
|
|
+
|
|
+inline constexpr std::string_view kInsertLocationSql = R"sql(
|
|
+INSERT INTO locations (
|
|
+ city,
|
|
+ state_province,
|
|
+ iso3166_2,
|
|
+ country,
|
|
+ iso3166_1,
|
|
+ local_languages_json,
|
|
+ latitude,
|
|
+ longitude
|
|
+) VALUES (?, ?, ?, ?, ?, ?, ?, ?);
|
|
+)sql";
|
|
+
|
|
+inline constexpr std::string_view kInsertBrewerySql = R"sql(
|
|
+INSERT INTO breweries (
|
|
+ location_id,
|
|
+ name_en,
|
|
+ description_en,
|
|
+ name_local,
|
|
+ description_local
|
|
+) VALUES (?, ?, ?, ?, ?);
|
|
+)sql";
|
|
+
|
|
+inline constexpr int kLocationCityBindIndex = 1;
|
|
+inline constexpr int kLocationStateProvinceBindIndex = 2;
|
|
+inline constexpr int kLocationIso31662BindIndex = 3;
|
|
+inline constexpr int kLocationCountryBindIndex = 4;
|
|
+inline constexpr int kLocationIso31661BindIndex = 5;
|
|
+inline constexpr int kLocationLanguagesBindIndex = 6;
|
|
+inline constexpr int kLocationLatitudeBindIndex = 7;
|
|
+inline constexpr int kLocationLongitudeBindIndex = 8;
|
|
+
|
|
+inline constexpr int kBreweryLocationIdBindIndex = 1;
|
|
+inline constexpr int kBreweryEnglishNameBindIndex = 2;
|
|
+inline constexpr int kBreweryEnglishDescriptionBindIndex = 3;
|
|
+inline constexpr int kBreweryLocalNameBindIndex = 4;
|
|
+inline constexpr int kBreweryLocalDescriptionBindIndex = 5;
|
|
+
|
|
+inline void ThrowSqliteError(sqlite3* db_handle, std::string_view action) {
|
|
+ const std::string message =
|
|
+ db_handle != nullptr ? sqlite3_errmsg(db_handle) : "unknown SQLite error";
|
|
+ throw std::runtime_error(std::string(action) + ": " + message);
|
|
+}
|
|
+
|
|
+inline SqliteDatabaseHandle OpenDatabase(const std::filesystem::path& path) {
|
|
+ sqlite3* raw_handle = nullptr;
|
|
+ const std::string path_string = path.string();
|
|
+ const int result = sqlite3_open(path_string.c_str(), &raw_handle);
|
|
+ SqliteDatabaseHandle handle(raw_handle);
|
|
+ if (result != SQLITE_OK) {
|
|
+ const std::string message = raw_handle != nullptr
|
|
+ ? sqlite3_errmsg(raw_handle)
|
|
+ : "unknown SQLite error";
|
|
+ throw std::runtime_error("Failed to open SQLite export database: " +
|
|
+ message);
|
|
+ }
|
|
+
|
|
+ return handle;
|
|
+}
|
|
+
|
|
+inline void ExecSql(const SqliteDatabaseHandle& db_handle,
|
|
+ std::string_view sql, const char* action) {
|
|
+ char* error_message = nullptr;
|
|
+ const std::string sql_text(sql);
|
|
+ const int result = sqlite3_exec(db_handle.get(), sql_text.c_str(), nullptr,
|
|
+ nullptr, &error_message);
|
|
+ if (result != SQLITE_OK) {
|
|
+ const std::string message = error_message != nullptr
|
|
+ ? error_message
|
|
+ : sqlite3_errmsg(db_handle.get());
|
|
+ sqlite3_free(error_message);
|
|
+ throw std::runtime_error(std::string(action) + ": " + message);
|
|
+ }
|
|
+}
|
|
+
|
|
+inline SqliteStatementHandle PrepareStatement(
|
|
+ const SqliteDatabaseHandle& db_handle, std::string_view sql,
|
|
+ const char* action) {
|
|
+ sqlite3_stmt* raw_statement = nullptr;
|
|
+ const std::string sql_text(sql);
|
|
+ const int result = sqlite3_prepare_v2(db_handle.get(), sql_text.c_str(), -1,
|
|
+ &raw_statement, nullptr);
|
|
+ SqliteStatementHandle statement(raw_statement);
|
|
+ if (result != SQLITE_OK) {
|
|
+ ThrowSqliteError(db_handle.get(), action);
|
|
+ }
|
|
+
|
|
+ return statement;
|
|
+}
|
|
+
|
|
+inline void ResetStatement(SqliteStatementHandle& statement) {
|
|
+ if (statement != nullptr) {
|
|
+ sqlite3_reset(statement.get());
|
|
+ sqlite3_clear_bindings(statement.get());
|
|
+ }
|
|
+}
|
|
+
|
|
+inline void DeleteCharArray(void* data) noexcept {
|
|
+ delete[] static_cast<char*>(data);
|
|
+}
|
|
+// Binds a copy of `value` as text parameter `index`; throws on failure.
|
|
+inline void BindText(const SqliteStatementHandle& statement, int index,
|
|
+                     std::string_view value, const char* action) {
|
|
+  const auto byte_count = value.size();
|
|
+  if (byte_count > static_cast<std::size_t>(std::numeric_limits<int>::max())) {
|
|
+    throw std::runtime_error(std::string(action) + ": text value too large");
|
|
+  }
|
|
+  // SQLite takes ownership of this copy and frees it through DeleteCharArray.
|
|
+  auto buffer = std::make_unique<char[]>(byte_count + 1);
|
|
+  std::memcpy(buffer.get(), value.data(), byte_count);
|
|
+  buffer[byte_count] = '\0';
|
|
+  // Ownership passes at the call itself, whether or not the bind succeeds.
|
|
+  char* raw_buffer = buffer.release();
|
|
+  const int result = sqlite3_bind_text(statement.get(), index, raw_buffer,
|
|
+                                       static_cast<int>(byte_count),
|
|
+                                       DeleteCharArray);
|
|
+  if (result != SQLITE_OK) {
|
|
+    // SQLite has already run DeleteCharArray here; a second free is UB.
|
|
+    ThrowSqliteError(sqlite3_db_handle(statement.get()), action);
|
|
+  }
|
|
+}
|
|
+
|
|
+inline void BindDouble(const SqliteStatementHandle& statement, int index,
|
|
+ double value, std::string_view action) {
|
|
+ const int result = sqlite3_bind_double(statement.get(), index, value);
|
|
+ if (result != SQLITE_OK) {
|
|
+ ThrowSqliteError(sqlite3_db_handle(statement.get()), action);
|
|
+ }
|
|
+}
|
|
+
|
|
+inline void BindInt64(const SqliteStatementHandle& statement, int index,
|
|
+ sqlite3_int64 value, std::string_view action) {
|
|
+ const int result = sqlite3_bind_int64(statement.get(), index, value);
|
|
+ if (result != SQLITE_OK) {
|
|
+ ThrowSqliteError(sqlite3_db_handle(statement.get()), action);
|
|
+ }
|
|
+}
|
|
+
|
|
+inline void StepStatement(const SqliteDatabaseHandle& db_handle,
|
|
+ const SqliteStatementHandle& statement,
|
|
+ std::string_view action) {
|
|
+ const int result = sqlite3_step(statement.get());
|
|
+ if (result != SQLITE_DONE) {
|
|
+ ThrowSqliteError(db_handle.get(), action);
|
|
+ }
|
|
+}
|
|
+
|
|
+inline sqlite3_int64 LastInsertRowId(const SqliteDatabaseHandle& db_handle) {
|
|
+ return sqlite3_last_insert_rowid(db_handle.get());
|
|
+}
|
|
+
|
|
+inline void RollbackTransactionNoThrow(
|
|
+ const SqliteDatabaseHandle& db_handle) noexcept {
|
|
+ if (!db_handle) {
|
|
+ return;
|
|
+ }
|
|
+
|
|
+ sqlite3_exec(db_handle.get(), "ROLLBACK;", nullptr, nullptr, nullptr);
|
|
+}
|
|
+
|
|
+inline std::string SerializeLocalLanguages(
|
|
+ const std::vector<std::string>& local_languages) {
|
|
+ boost::json::array array;
|
|
+ array.reserve(local_languages.size());
|
|
+ for (const auto& language : local_languages) {
|
|
+ array.emplace_back(language);
|
|
+ }
|
|
+ return boost::json::serialize(array);
|
|
+}
|
|
+
|
|
+} // namespace sqlite_export_service_internal
|
|
+
|
|
+#endif // BIERGARTEN_PIPELINE_INCLUDES_SERVICES_SQLITE_EXPORT_SERVICE_HELPERS_H_
|
|
diff --git a/pipeline/src/biergarten_data_generator/biergarten_data_generator.cc b/pipeline/src/biergarten_data_generator/biergarten_data_generator.cc
|
|
index b3dd072..033795d 100644
|
|
--- a/pipeline/src/biergarten_data_generator/biergarten_data_generator.cc
|
|
+++ b/pipeline/src/biergarten_data_generator/biergarten_data_generator.cc
|
|
@@ -9,6 +9,8 @@
|
|
|
|
BiergartenDataGenerator::BiergartenDataGenerator(
|
|
std::unique_ptr<IEnrichmentService> context_service,
|
|
- std::unique_ptr<DataGenerator> generator)
|
|
+ std::unique_ptr<DataGenerator> generator,
|
|
+ std::unique_ptr<IExportService> exporter)
|
|
: context_service_(std::move(context_service)),
|
|
- generator_(std::move(generator)) {}
|
|
+ generator_(std::move(generator)),
|
|
+ exporter_(std::move(exporter)) {}
|
|
diff --git a/pipeline/src/biergarten_data_generator/generate_breweries.cc b/pipeline/src/biergarten_data_generator/generate_breweries.cc
|
|
index 93b2c85..934c3dd 100644
|
|
--- a/pipeline/src/biergarten_data_generator/generate_breweries.cc
|
|
+++ b/pipeline/src/biergarten_data_generator/generate_breweries.cc
|
|
@@ -13,6 +13,7 @@ void BiergartenDataGenerator::GenerateBreweries(
|
|
|
|
generated_breweries_.clear();
|
|
size_t skipped_count = 0;
|
|
+ size_t export_failed_count = 0;
|
|
|
|
for (const auto& [location, region_context] : cities) {
|
|
try {
|
|
@@ -22,6 +23,17 @@ void BiergartenDataGenerator::GenerateBreweries(
|
|
const GeneratedBrewery gen{.location = location, .brewery = brewery};
|
|
|
|
generated_breweries_.push_back(gen);
|
|
+
|
|
+ try {
|
|
+ exporter_->ProcessRecord(gen);
|
|
+ } catch (const std::exception& export_exception) {
|
|
+ ++export_failed_count;
|
|
+
|
|
+ spdlog::warn(
|
|
+ "[Pipeline] Generated brewery for '{}' ({}) but SQLite export "
|
|
+ "failed: {}",
|
|
+ location.city, location.country, export_exception.what());
|
|
+ }
|
|
} catch (const std::exception& e) {
|
|
++skipped_count;
|
|
|
|
@@ -36,4 +48,11 @@ void BiergartenDataGenerator::GenerateBreweries(
|
|
spdlog::warn("[Pipeline] Skipped {} city/cities due to generation errors",
|
|
skipped_count);
|
|
}
|
|
+
|
|
+ if (export_failed_count > 0) {
|
|
+ spdlog::warn(
|
|
+ "[Pipeline] Failed to export {} generated brewery/breweries to "
|
|
+ "SQLite",
|
|
+ export_failed_count);
|
|
+ }
|
|
}
|
|
diff --git a/pipeline/src/biergarten_data_generator/run.cc b/pipeline/src/biergarten_data_generator/run.cc
|
|
index 48f91ac..7ae3d06 100644
|
|
--- a/pipeline/src/biergarten_data_generator/run.cc
|
|
+++ b/pipeline/src/biergarten_data_generator/run.cc
|
|
@@ -11,6 +11,8 @@
|
|
|
|
bool BiergartenDataGenerator::Run() {
|
|
try {
|
|
+ exporter_->Initialize();
|
|
+
|
|
std::vector<Location> cities = QueryCitiesWithCountries();
|
|
std::vector<EnrichedCity> enriched;
|
|
enriched.reserve(cities.size());
|
|
@@ -40,6 +42,7 @@ bool BiergartenDataGenerator::Run() {
|
|
}
|
|
|
|
this->GenerateBreweries(enriched);
|
|
+ exporter_->Finalize();
|
|
this->LogResults();
|
|
return true;
|
|
} catch (const std::exception& e) {
|
|
diff --git a/pipeline/src/main.cc b/pipeline/src/main.cc
|
|
index 896eb14..1a5cf81 100644
|
|
--- a/pipeline/src/main.cc
|
|
+++ b/pipeline/src/main.cc
|
|
@@ -20,8 +20,10 @@
|
|
#include "data_generation/mock_generator.h"
|
|
#include "data_generation/prompt_formatting/gemma4_jinja_prompt_formatter.h"
|
|
#include "data_model/application_options.h"
|
|
+#include "services/export_service.h"
|
|
#include "llama_backend_state.h"
|
|
#include "services/enrichment_service.h"
|
|
+#include "services/sqlite_export_service.h"
|
|
#include "services/wikipedia_service.h"
|
|
#include "web_client/curl_web_client.h"
|
|
|
|
@@ -160,6 +162,7 @@ int main(const int argc, char** argv) {
|
|
di::bind<WebClient>().to<CURLWebClient>(),
|
|
di::bind<ApplicationOptions>().to(options),
|
|
di::bind<IEnrichmentService>().to<WikipediaService>(),
|
|
+ di::bind<IExportService>().to<SqliteExportService>(),
|
|
di::bind<IPromptFormatter>().to<Gemma4JinjaPromptFormatter>(),
|
|
di::bind<std::string>().to(options.model_path),
|
|
di::bind<DataGenerator>().to(
|
|
@@ -178,9 +181,9 @@ int main(const int argc, char** argv) {
|
|
return inj.template create<std::unique_ptr<LlamaGenerator>>();
|
|
}));
|
|
|
|
- auto generator = injector.create<BiergartenDataGenerator>();
|
|
+ auto generator = injector.create<std::unique_ptr<BiergartenDataGenerator>>();
|
|
|
|
- if (!generator.Run()) {
|
|
+ if (!generator->Run()) {
|
|
spdlog::error("Pipeline execution failed");
|
|
return 1;
|
|
}
|
|
diff --git a/pipeline/src/services/sqlite/build_database_path.cc b/pipeline/src/services/sqlite/build_database_path.cc
|
|
new file mode 100644
|
|
index 0000000..13101f0
|
|
--- /dev/null
|
|
+++ b/pipeline/src/services/sqlite/build_database_path.cc
|
|
@@ -0,0 +1,25 @@
|
|
+/**
|
|
+ * @file services/sqlite/build_database_path.cc
|
|
+ * @brief SqliteExportService::BuildDatabasePath() implementation.
|
|
+ */
|
|
+
|
|
+#include "services/sqlite_export_service.h"
|
|
+
|
|
+#include <filesystem>
|
|
+#include <string>
|
|
+
|
|
+std::filesystem::path SqliteExportService::BuildDatabasePath() const {
|
|
+  const auto base_filename =
|
|
+      std::filesystem::path("biergarten_seed_" + run_timestamp_utc_ + ".sqlite");
|
|
+  std::filesystem::path candidate =
|
|
+      std::filesystem::current_path() / base_filename;
|
|
+  // Name taken? Probe "-1", "-2", ... until unused (check-then-use, not atomic).
|
|
+  for (int suffix = 1; std::filesystem::exists(candidate); ++suffix) {
|
|
+    candidate = std::filesystem::current_path() /
|
|
+                std::filesystem::path("biergarten_seed_" + run_timestamp_utc_ +
|
|
+                                      "-" + std::to_string(suffix) +
|
|
+                                      ".sqlite");
|
|
+  }
|
|
+
|
|
+  return candidate;
|
|
+}
|
|
diff --git a/pipeline/src/services/sqlite/build_location_key.cc b/pipeline/src/services/sqlite/build_location_key.cc
|
|
new file mode 100644
|
|
index 0000000..085c0a7
|
|
--- /dev/null
|
|
+++ b/pipeline/src/services/sqlite/build_location_key.cc
|
|
@@ -0,0 +1,26 @@
|
|
+/**
|
|
+ * @file services/sqlite/build_location_key.cc
|
|
+ * @brief SqliteExportService::BuildLocationKey() implementation.
|
|
+ */
|
|
+
|
|
+#include "services/sqlite_export_service.h"
|
|
+#include "services/sqlite_export_service_helpers.h"
|
|
+
|
|
+#include <iomanip>
|
|
+#include <sstream>
|
|
+
|
|
+constexpr int kLocationPrecision = 17;
|
|
+
|
|
+std::string SqliteExportService::BuildLocationKey(const Location& location) {
|
|
+ std::ostringstream key_stream;
|
|
+ key_stream << location.city << '\n'
|
|
+ << location.state_province << '\n'
|
|
+ << location.iso3166_2 << '\n'
|
|
+ << location.country << '\n'
|
|
+ << location.iso3166_1 << '\n'
|
|
+ << std::setprecision(kLocationPrecision) << location.latitude << '\n'
|
|
+ << std::setprecision(kLocationPrecision) << location.longitude << '\n'
|
|
+ << sqlite_export_service_internal::SerializeLocalLanguages(
|
|
+ location.local_languages);
|
|
+ return key_stream.str();
|
|
+}
|
|
diff --git a/pipeline/src/services/sqlite/finalize.cc b/pipeline/src/services/sqlite/finalize.cc
|
|
new file mode 100644
|
|
index 0000000..d47f41f
|
|
--- /dev/null
|
|
+++ b/pipeline/src/services/sqlite/finalize.cc
|
|
@@ -0,0 +1,30 @@
|
|
+/**
|
|
+ * @file services/sqlite/finalize.cc
|
|
+ * @brief SqliteExportService::Finalize() implementation.
|
|
+ */
|
|
+
|
|
+#include "services/sqlite_export_service.h"
|
|
+#include "services/sqlite_export_service_helpers.h"
|
|
+
|
|
+#include <stdexcept>
|
|
+
|
|
+void SqliteExportService::Finalize() {
|
|
+  if (db_handle_ == nullptr) {
|
|
+    return;
|
|
+  }
|
|
+  // Commit the open transaction and close; on any failure roll back and rethrow.
|
|
+  try {
|
|
+    FinalizeStatements();
|
|
+    if (transaction_open_) {
|
|
+      sqlite_export_service_internal::ExecSql(
|
|
+          db_handle_, "COMMIT;", "Failed to commit SQLite transaction");
|
|
+      transaction_open_ = false;
|
|
+    }
|
|
+    // Committed: release the connection and drop the cached location ids.
|
|
+    db_handle_.reset();
|
|
+    location_cache_.clear();
|
|
+  } catch (...) {
|
|
+    RollbackAndCloseNoThrow();
|
|
+    throw;
|
|
+  }
|
|
+}
|
|
diff --git a/pipeline/src/services/sqlite/finalize_statements.cc b/pipeline/src/services/sqlite/finalize_statements.cc
|
|
new file mode 100644
|
|
index 0000000..15b29a6
|
|
--- /dev/null
|
|
+++ b/pipeline/src/services/sqlite/finalize_statements.cc
|
|
@@ -0,0 +1,11 @@
|
|
+/**
|
|
+ * @file services/sqlite/finalize_statements.cc
|
|
+ * @brief SqliteExportService::FinalizeStatements() implementation.
|
|
+ */
|
|
+
|
|
+#include "services/sqlite_export_service.h"
|
|
+
|
|
+void SqliteExportService::FinalizeStatements() noexcept {
|
|
+ insert_brewery_stmt_.reset();
|
|
+ insert_location_stmt_.reset();
|
|
+}
|
|
diff --git a/pipeline/src/services/sqlite/initialize.cc b/pipeline/src/services/sqlite/initialize.cc
|
|
new file mode 100644
|
|
index 0000000..4ffb235
|
|
--- /dev/null
|
|
+++ b/pipeline/src/services/sqlite/initialize.cc
|
|
@@ -0,0 +1,39 @@
|
|
+/**
|
|
+ * @file services/sqlite/initialize.cc
|
|
+ * @brief SqliteExportService::Initialize() implementation.
|
|
+ */
|
|
+
|
|
+#include "services/sqlite_export_service.h"
|
|
+#include "services/sqlite_export_service_helpers.h"
|
|
+
|
|
+#include <filesystem>
|
|
+#include <memory>
|
|
+#include <stdexcept>
|
|
+#include <string>
|
|
+
|
|
+void SqliteExportService::Initialize() {
|
|
+  if (db_handle_ != nullptr) {
|
|
+    throw std::runtime_error("SQLite export service is already initialized");
|
|
+  }
|
|
+  // Stamp this run and derive the output database path from that timestamp.
|
|
+  run_timestamp_utc_ = date_time_provider_->GetUtcTimestamp();
|
|
+  database_path_ = BuildDatabasePath();
|
|
+  std::filesystem::create_directories(database_path_.parent_path());
|
|
+
|
|
+  db_handle_ = sqlite_export_service_internal::OpenDatabase(database_path_);
|
|
+  // The connection is open now: any failure below must roll back and close it.
|
|
+  try {
|
|
+    sqlite_export_service_internal::ExecSql(
|
|
+        db_handle_, "PRAGMA foreign_keys = ON;",
|
|
+        "Failed to enable SQLite foreign keys");
|
|
+    InitializeSchema();
|
|
+    PrepareStatements();
|
|
+    sqlite_export_service_internal::ExecSql(
|
|
+        db_handle_, "BEGIN IMMEDIATE TRANSACTION;",
|
|
+        "Failed to begin SQLite transaction");
|
|
+    transaction_open_ = true;
|
|
+  } catch (...) {
|
|
+    RollbackAndCloseNoThrow();
|
|
+    throw;
|
|
+  }
|
|
+}
|
|
diff --git a/pipeline/src/services/sqlite/initialize_schema.cc b/pipeline/src/services/sqlite/initialize_schema.cc
|
|
new file mode 100644
|
|
index 0000000..e9e22ef
|
|
--- /dev/null
|
|
+++ b/pipeline/src/services/sqlite/initialize_schema.cc
|
|
@@ -0,0 +1,16 @@
|
|
+/**
|
|
+ * @file services/sqlite/initialize_schema.cc
|
|
+ * @brief SqliteExportService::InitializeSchema() implementation.
|
|
+ */
|
|
+
|
|
+#include "services/sqlite_export_service.h"
|
|
+#include "services/sqlite_export_service_helpers.h"
|
|
+
|
|
+void SqliteExportService::InitializeSchema() {
|
|
+ sqlite_export_service_internal::ExecSql(
|
|
+ db_handle_, sqlite_export_service_internal::kCreateLocationsTableSql,
|
|
+ "Failed to create SQLite locations table");
|
|
+ sqlite_export_service_internal::ExecSql(
|
|
+ db_handle_, sqlite_export_service_internal::kCreateBreweriesTableSql,
|
|
+ "Failed to create SQLite breweries table");
|
|
+}
|
|
diff --git a/pipeline/src/services/sqlite/prepare_statements.cc b/pipeline/src/services/sqlite/prepare_statements.cc
|
|
new file mode 100644
|
|
index 0000000..ee61e4a
|
|
--- /dev/null
|
|
+++ b/pipeline/src/services/sqlite/prepare_statements.cc
|
|
@@ -0,0 +1,16 @@
|
|
+/**
|
|
+ * @file services/sqlite/prepare_statements.cc
|
|
+ * @brief SqliteExportService::PrepareStatements() implementation.
|
|
+ */
|
|
+
|
|
+#include "services/sqlite_export_service.h"
|
|
+#include "services/sqlite_export_service_helpers.h"
|
|
+
|
|
+void SqliteExportService::PrepareStatements() {
|
|
+ insert_location_stmt_ = sqlite_export_service_internal::PrepareStatement(
|
|
+ db_handle_, sqlite_export_service_internal::kInsertLocationSql,
|
|
+ "Failed to prepare SQLite location insert statement");
|
|
+ insert_brewery_stmt_ = sqlite_export_service_internal::PrepareStatement(
|
|
+ db_handle_, sqlite_export_service_internal::kInsertBrewerySql,
|
|
+ "Failed to prepare SQLite brewery insert statement");
|
|
+}
|
|
diff --git a/pipeline/src/services/sqlite/process_record.cc b/pipeline/src/services/sqlite/process_record.cc
|
|
new file mode 100644
|
|
index 0000000..fa57c66
|
|
--- /dev/null
|
|
+++ b/pipeline/src/services/sqlite/process_record.cc
|
|
@@ -0,0 +1,123 @@
|
|
+/**
|
|
+ * @file services/sqlite/process_record.cc
|
|
+ * @brief SqliteExportService::ProcessRecord() implementation.
|
|
+ */
|
|
+
|
|
+#include "services/sqlite_export_service.h"
|
|
+#include "services/sqlite_export_service_helpers.h"
|
|
+
|
|
+#include <stdexcept>
|
|
+#include <string>
|
|
+
|
|
+/**
|
|
+ * Inserts one generated brewery, creating its location row on first use.
|
|
+ *
|
|
+ * Location ids are cached under BuildLocationKey(), so each distinct location
|
|
+ * is inserted once and later breweries reuse the cached row id.
|
|
+ *
|
|
+ * @throws std::runtime_error if the service is not initialized or if a bind
|
|
+ *         or insert fails; the statements are rewound on the next call.
|
|
+ */
|
|
+void SqliteExportService::ProcessRecord(const GeneratedBrewery& brewery) {
|
|
+  if (db_handle_ == nullptr || !transaction_open_) {
|
|
+    throw std::runtime_error("SQLite export service is not initialized");
|
|
+  }
|
|
+
|
|
+  // The pipeline logs an export failure and keeps feeding records. A bind or
|
|
+  // step that threw on an earlier record skipped the trailing reset below, and
|
|
+  // SQLite rejects binds on a statement that was stepped but not reset
|
|
+  // (SQLITE_MISUSE). Rewind both statements up front so one bad record cannot
|
|
+  // make every later record fail as well.
|
|
+  sqlite_export_service_internal::ResetStatement(insert_location_stmt_);
|
|
+  sqlite_export_service_internal::ResetStatement(insert_brewery_stmt_);
|
|
+
|
|
+  const std::string location_key = BuildLocationKey(brewery.location);
|
|
+  const auto cached_location = location_cache_.find(location_key);
|
|
+  sqlite3_int64 location_id = 0;
|
|
+
|
|
+  if (cached_location != location_cache_.end()) {
|
|
+    location_id = cached_location->second;
|
|
+  } else {
|
|
+    const std::string local_languages_json =
|
|
+        sqlite_export_service_internal::SerializeLocalLanguages(
|
|
+            brewery.location.local_languages);
|
|
+
|
|
+    sqlite_export_service_internal::BindText(
|
|
+        insert_location_stmt_,
|
|
+        sqlite_export_service_internal::kLocationCityBindIndex,
|
|
+        brewery.location.city, "Failed to bind SQLite location city");
|
|
+    sqlite_export_service_internal::BindText(
|
|
+        insert_location_stmt_,
|
|
+        sqlite_export_service_internal::kLocationStateProvinceBindIndex,
|
|
+        brewery.location.state_province,
|
|
+        "Failed to bind SQLite location state/province");
|
|
+    sqlite_export_service_internal::BindText(
|
|
+        insert_location_stmt_,
|
|
+        sqlite_export_service_internal::kLocationIso31662BindIndex,
|
|
+        brewery.location.iso3166_2,
|
|
+        "Failed to bind SQLite location ISO 3166-2 code");
|
|
+    sqlite_export_service_internal::BindText(
|
|
+        insert_location_stmt_,
|
|
+        sqlite_export_service_internal::kLocationCountryBindIndex,
|
|
+        brewery.location.country,
|
|
+        "Failed to bind SQLite location country");
|
|
+    sqlite_export_service_internal::BindText(
|
|
+        insert_location_stmt_,
|
|
+        sqlite_export_service_internal::kLocationIso31661BindIndex,
|
|
+        brewery.location.iso3166_1,
|
|
+        "Failed to bind SQLite location ISO 3166-1 code");
|
|
+    sqlite_export_service_internal::BindText(
|
|
+        insert_location_stmt_,
|
|
+        sqlite_export_service_internal::kLocationLanguagesBindIndex,
|
|
+        local_languages_json, "Failed to bind SQLite location languages");
|
|
+    sqlite_export_service_internal::BindDouble(
|
|
+        insert_location_stmt_,
|
|
+        sqlite_export_service_internal::kLocationLatitudeBindIndex,
|
|
+        brewery.location.latitude,
|
|
+        "Failed to bind SQLite location latitude");
|
|
+    sqlite_export_service_internal::BindDouble(
|
|
+        insert_location_stmt_,
|
|
+        sqlite_export_service_internal::kLocationLongitudeBindIndex,
|
|
+        brewery.location.longitude,
|
|
+        "Failed to bind SQLite location longitude");
|
|
+
|
|
+    sqlite_export_service_internal::StepStatement(
|
|
+        db_handle_, insert_location_stmt_,
|
|
+        "Failed to insert SQLite location row");
|
|
+
|
|
+    location_id = sqlite_export_service_internal::LastInsertRowId(db_handle_);
|
|
+    location_cache_.emplace(location_key, location_id);
|
|
+    sqlite_export_service_internal::ResetStatement(insert_location_stmt_);
|
|
+  }
|
|
+
|
|
+  sqlite_export_service_internal::BindInt64(
|
|
+      insert_brewery_stmt_,
|
|
+      sqlite_export_service_internal::kBreweryLocationIdBindIndex, location_id,
|
|
+      "Failed to bind SQLite brewery location id");
|
|
+  sqlite_export_service_internal::BindText(
|
|
+      insert_brewery_stmt_,
|
|
+      sqlite_export_service_internal::kBreweryEnglishNameBindIndex,
|
|
+      brewery.brewery.name_en,
|
|
+      "Failed to bind SQLite brewery English name");
|
|
+  sqlite_export_service_internal::BindText(
|
|
+      insert_brewery_stmt_,
|
|
+      sqlite_export_service_internal::kBreweryEnglishDescriptionBindIndex,
|
|
+      brewery.brewery.description_en,
|
|
+      "Failed to bind SQLite brewery English description");
|
|
+  sqlite_export_service_internal::BindText(
|
|
+      insert_brewery_stmt_,
|
|
+      sqlite_export_service_internal::kBreweryLocalNameBindIndex,
|
|
+      brewery.brewery.name_local,
|
|
+      "Failed to bind SQLite brewery local name");
|
|
+  sqlite_export_service_internal::BindText(
|
|
+      insert_brewery_stmt_,
|
|
+      sqlite_export_service_internal::kBreweryLocalDescriptionBindIndex,
|
|
+      brewery.brewery.description_local,
|
|
+      "Failed to bind SQLite brewery local description");
|
|
+
|
|
+  sqlite_export_service_internal::StepStatement(
|
|
+      db_handle_, insert_brewery_stmt_,
|
|
+      "Failed to insert SQLite brewery row");
|
|
+
|
|
+  sqlite_export_service_internal::ResetStatement(insert_brewery_stmt_);
|
|
+}
|
|
diff --git a/pipeline/src/services/sqlite/rollback_and_close_no_throw.cc b/pipeline/src/services/sqlite/rollback_and_close_no_throw.cc
|
|
new file mode 100644
|
|
index 0000000..c90a395
|
|
--- /dev/null
|
|
+++ b/pipeline/src/services/sqlite/rollback_and_close_no_throw.cc
|
|
@@ -0,0 +1,21 @@
|
|
+/**
|
|
+ * @file services/sqlite/rollback_and_close_no_throw.cc
|
|
+ * @brief SqliteExportService::RollbackAndCloseNoThrow() implementation.
|
|
+ */
|
|
+
|
|
+#include "services/sqlite_export_service.h"
|
|
+
|
|
+void SqliteExportService::RollbackAndCloseNoThrow() noexcept {
|
|
+  if (db_handle_ == nullptr) {
|
|
+    return;
|
|
+  }
|
|
+  // Only roll back if a transaction was actually opened on this connection.
|
|
+  if (transaction_open_) {
|
|
+    sqlite_export_service_internal::RollbackTransactionNoThrow(db_handle_);
|
|
+    transaction_open_ = false;
|
|
+  }
|
|
+  // Finalize the statements before closing the connection they belong to.
|
|
+  FinalizeStatements();
|
|
+  db_handle_.reset();
|
|
+  location_cache_.clear();
|
|
+}
|
|
diff --git a/pipeline/src/services/sqlite/sqlite_export_service.cc b/pipeline/src/services/sqlite/sqlite_export_service.cc
|
|
new file mode 100644
|
|
index 0000000..377c917
|
|
--- /dev/null
|
|
+++ b/pipeline/src/services/sqlite/sqlite_export_service.cc
|
|
@@ -0,0 +1,17 @@
|
|
+/**
|
|
+ * @file services/sqlite/sqlite_export_service.cc
|
|
+ * @brief SqliteExportService constructor and destructor implementation.
|
|
+ */
|
|
+
|
|
+#include "services/sqlite_export_service.h"
|
|
+
|
|
+#include <memory>
|
|
+
|
|
+SqliteExportService::SqliteExportService()
|
|
+ : date_time_provider_(std::make_unique<SystemDateTimeProvider>()) {}
|
|
+
|
|
+SqliteExportService::~SqliteExportService() {
|
|
+ if (db_handle_ != nullptr) {
|
|
+ RollbackAndCloseNoThrow();
|
|
+ }
|
|
+}
|
|
\ No newline at end of file
|