mirror of
https://github.com/aaronpo97/the-biergarten-app.git
synced 2026-06-01 01:54:00 +00:00
Compare commits
5 Commits
b31be494d7
...
b53f9e5582
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b53f9e5582 | ||
|
|
824f5b2b4f | ||
|
|
5d93d76e99 | ||
|
|
028786b8b5 | ||
|
|
d7a31b5264 |
@@ -1,5 +1,9 @@
|
|||||||
cmake_minimum_required(VERSION 3.24)
|
cmake_minimum_required(VERSION 3.24)
|
||||||
project(biergarten-pipeline)
|
project(biergarten-pipeline)
|
||||||
|
|
||||||
|
# Boost.DI still declares a very old minimum CMake version, which newer CMake
|
||||||
|
# releases reject unless a policy version floor is provided.
|
||||||
|
set(CMAKE_POLICY_VERSION_MINIMUM 3.5 CACHE STRING "" FORCE)
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# 1. GPU Detection
|
# 1. GPU Detection
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
@@ -71,6 +75,16 @@ FetchContent_Declare(
|
|||||||
GIT_TAG b8711
|
GIT_TAG b8711
|
||||||
)
|
)
|
||||||
FetchContent_MakeAvailable(llama-cpp)
|
FetchContent_MakeAvailable(llama-cpp)
|
||||||
|
# --- boost-ext/di -------------------------------------------------------------
|
||||||
|
FetchContent_Declare(
|
||||||
|
boost-di
|
||||||
|
GIT_REPOSITORY https://github.com/boost-ext/di.git
|
||||||
|
GIT_TAG v1.3.0
|
||||||
|
)
|
||||||
|
FetchContent_MakeAvailable(boost-di)
|
||||||
|
if(TARGET Boost.DI AND NOT TARGET boost::di)
|
||||||
|
add_library(boost::di ALIAS Boost.DI)
|
||||||
|
endif()
|
||||||
# --- Boost (JSON + program_options) ------------------------------------------
|
# --- Boost (JSON + program_options) ------------------------------------------
|
||||||
FetchContent_Declare(
|
FetchContent_Declare(
|
||||||
boost
|
boost
|
||||||
@@ -89,23 +103,39 @@ FetchContent_MakeAvailable(spdlog)
|
|||||||
# =============================================================================
|
# =============================================================================
|
||||||
set(SOURCES
|
set(SOURCES
|
||||||
src/main.cpp
|
src/main.cpp
|
||||||
src/biergarten_data_generator.cpp
|
# BiergartenDataGenerator methods
|
||||||
|
src/biergarten_data_generator/constructor.cpp
|
||||||
|
src/biergarten_data_generator/run.cpp
|
||||||
|
src/biergarten_data_generator/query_cities_with_countries.cpp
|
||||||
|
src/biergarten_data_generator/generate_breweries.cpp
|
||||||
|
src/biergarten_data_generator/log_results.cpp
|
||||||
|
# WikipediaService methods
|
||||||
|
src/services/wikipedia/constructor.cpp
|
||||||
|
src/services/wikipedia/get_summary.cpp
|
||||||
|
src/services/wikipedia/fetch_extract.cpp
|
||||||
|
# CURLWebClient and CurlGlobalState methods
|
||||||
|
src/web_client/curl_global_state_constructor.cpp
|
||||||
|
src/web_client/curl_global_state_destructor.cpp
|
||||||
|
src/web_client/curl_web_client_constructor.cpp
|
||||||
|
src/web_client/curl_web_client_destructor.cpp
|
||||||
|
src/web_client/curl_web_client_download_to_file.cpp
|
||||||
|
src/web_client/curl_web_client_get.cpp
|
||||||
|
src/web_client/curl_web_client_utils.cpp
|
||||||
|
src/web_client/curl_web_client_url_encode.cpp
|
||||||
|
# Data generation modules
|
||||||
src/data_generation/llama/destructor.cpp
|
src/data_generation/llama/destructor.cpp
|
||||||
|
src/data_generation/llama/constructor.cpp
|
||||||
src/data_generation/llama/generate_brewery.cpp
|
src/data_generation/llama/generate_brewery.cpp
|
||||||
src/data_generation/llama/generate_user.cpp
|
src/data_generation/llama/generate_user.cpp
|
||||||
src/data_generation/llama/helpers.cpp
|
src/data_generation/llama/helpers.cpp
|
||||||
src/data_generation/llama/infer.cpp
|
src/data_generation/llama/infer.cpp
|
||||||
src/data_generation/llama/load.cpp
|
src/data_generation/llama/load.cpp
|
||||||
src/data_generation/llama/load_brewery_prompt.cpp
|
src/data_generation/llama/load_brewery_prompt.cpp
|
||||||
src/data_generation/llama/set_sampling_options.cpp
|
|
||||||
src/data_generation/mock/data.cpp
|
src/data_generation/mock/data.cpp
|
||||||
src/data_generation/mock/deterministic_hash.cpp
|
src/data_generation/mock/deterministic_hash.cpp
|
||||||
src/data_generation/mock/generate_brewery.cpp
|
src/data_generation/mock/generate_brewery.cpp
|
||||||
src/data_generation/mock/generate_user.cpp
|
src/data_generation/mock/generate_user.cpp
|
||||||
src/data_generation/mock/load.cpp
|
|
||||||
src/json_handling/json_loader.cpp
|
src/json_handling/json_loader.cpp
|
||||||
src/web_client/curl_web_client.cpp
|
|
||||||
src/wikipedia/wikipedia_service.cpp
|
|
||||||
)
|
)
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
# 5. Target
|
# 5. Target
|
||||||
@@ -120,6 +150,7 @@ target_include_directories(${PROJECT_NAME} PRIVATE
|
|||||||
)
|
)
|
||||||
target_link_libraries(${PROJECT_NAME} PRIVATE
|
target_link_libraries(${PROJECT_NAME} PRIVATE
|
||||||
llama
|
llama
|
||||||
|
boost::di
|
||||||
boost_json
|
boost_json
|
||||||
boost_program_options
|
boost_program_options
|
||||||
spdlog::spdlog
|
spdlog::spdlog
|
||||||
|
|||||||
@@ -1,34 +1,10 @@
|
|||||||
# Biergarten Pipeline
|
# Biergarten Pipeline
|
||||||
|
|
||||||
A C++23 tool for processing geographic data and generating brewery metadata. It utilizes a local city manifest, parallel Wikipedia enrichment via `std::async`, and local LLM inference via llama.cpp.
|
Biergarten Pipeline is a C++23 command-line tool that reads a local city list, resolves contextual enrichment for each sampled city through an injected service, and generates brewery names and descriptions. The current code samples up to four locations per run, then uses either a local GGUF model or the mock generator to produce the output.
|
||||||
|
|
||||||
## Overview
|
|
||||||
|
|
||||||
The pipeline runs in four stages:
|
|
||||||
|
|
||||||
- **Query**: Loads and samples from a local `locations.json` file.
|
|
||||||
- **Enrich**: Fetches regional and cultural context from Wikipedia in parallel using `std::async`.
|
|
||||||
- **Generate**: Creates authentic brewery names and descriptions using a local GGUF model or a deterministic mock.
|
|
||||||
- **Log**: Outputs results and metadata summaries via spdlog.
|
|
||||||
|
|
||||||
## Implementation Details
|
|
||||||
|
|
||||||
### Concurrency
|
|
||||||
|
|
||||||
- **Async Enrichment**: Wikipedia API lookups are parallelized using `std::async`. Each city is processed in its own thread to hide network latency.
|
|
||||||
- **RAII**: Resource management for libcurl handles and llama.cpp weights is handled via constructors/destructors to ensure clean teardown.
|
|
||||||
|
|
||||||
### LLM Logic
|
|
||||||
|
|
||||||
- **Retries**: Includes a 3-attempt loop with automated error correction. If the model returns invalid JSON, the specific error is fed back into the next prompt.
|
|
||||||
- **Context Injection**: Wikipedia summaries are injected into the LLM system prompt to ensure descriptions are grounded in actual regional beer culture.
|
|
||||||
- **Sampling**: Temperature, top-p, and seeds are configurable via the CLI.
|
|
||||||
|
|
||||||
## Hardware & GPU Config
|
## Hardware & GPU Config
|
||||||
|
|
||||||
### Test Machines
|
### x86/64 Linux, NVIDIA RTX 2000
|
||||||
|
|
||||||
#### x86/64 Linux, NVIDIA RTX 2000
|
|
||||||
|
|
||||||
- **Host**: ThinkPad P1 Gen 7 (Fedora 43)
|
- **Host**: ThinkPad P1 Gen 7 (Fedora 43)
|
||||||
- **CPU**: Intel Core Ultra 7 155H
|
- **CPU**: Intel Core Ultra 7 155H
|
||||||
@@ -37,7 +13,7 @@ The pipeline runs in four stages:
|
|||||||
- **Model**: Qwen3-8B-Q6-K
|
- **Model**: Qwen3-8B-Q6-K
|
||||||
- **Inference**: llama.cpp with CUDA 12.x support
|
- **Inference**: llama.cpp with CUDA 12.x support
|
||||||
|
|
||||||
#### ARM MacOS, M1 Pro
|
### ARM MacOS, M1 Pro
|
||||||
|
|
||||||
- **Host**: MacBook Pro 14" (2021)
|
- **Host**: MacBook Pro 14" (2021)
|
||||||
- **CPU**: Apple M1 Pro (8-core)
|
- **CPU**: Apple M1 Pro (8-core)
|
||||||
@@ -46,55 +22,73 @@ The pipeline runs in four stages:
|
|||||||
- **Model**: Qwen3-8B-Q6-K
|
- **Model**: Qwen3-8B-Q6-K
|
||||||
- **Inference**: llama.cpp with Metal (MPS) support
|
- **Inference**: llama.cpp with Metal (MPS) support
|
||||||
|
|
||||||
### GPU Build Flags
|
## Pipeline
|
||||||
|
|
||||||
```bash
|
| Stage | What happens |
|
||||||
cmake -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=89 ..
|
| -------- | ----------------------------------------------------------------------- |
|
||||||
cmake --build . --config Release
|
| Load | Reads `locations.json` and picks up to four city/country pairs. |
|
||||||
```
|
| Enrich | Calls the injected enrichment service for each sampled city. |
|
||||||
|
| Generate | Passes the city, country, and gathered context to the active generator. |
|
||||||
|
| Log | Writes the generated breweries and any warnings through `spdlog`. |
|
||||||
|
|
||||||
```zsh
|
If an enrichment lookup throws, the pipeline skips that city and keeps going. If the lookup returns an empty string, the city stays in the pipeline and is still passed to the generator.
|
||||||
cmake ..
|
|
||||||
cmake --build .
|
|
||||||
```
|
|
||||||
|
|
||||||
## Core Components
|
## Core Components
|
||||||
|
|
||||||
| Component | Function |
|
| Component | Role |
|
||||||
| ----------------------- | ----------------------------------------------------------------- |
|
| ----------------------- | ---------------------------------------------------------------------- |
|
||||||
| BiergartenDataGenerator | Orchestrates the sampling, enrichment, and generation stages. |
|
| BiergartenDataGenerator | Orchestrates loading, enrichment lookup, generation, and logging. |
|
||||||
| WikipediaService | Fetches and caches summaries for cities and regional beer styles. |
|
| IEnrichmentService | Abstraction for location-context providers. |
|
||||||
| LlamaGenerator | Handles local GGUF inference and output validation. |
|
| WikipediaService | Default enrichment provider backed by Wikipedia and in-memory caching. |
|
||||||
| JsonLoader | Parses the local `locations.json` file into internal structures. |
|
| LlamaGenerator | Runs local GGUF inference and validates output. |
|
||||||
| CURLWebClient | libcurl wrapper for parallel Wikipedia API requests. |
|
| MockGenerator | Produces deterministic fallback data without a model. |
|
||||||
|
| JsonLoader | Parses the local `locations.json` file. |
|
||||||
|
| CURLWebClient | Handles HTTP requests to Wikipedia. |
|
||||||
|
|
||||||
## CLI Options
|
## Build
|
||||||
|
|
||||||
```
|
| Requirement | Notes |
|
||||||
./biergarten-pipeline --model ./path/to/model.gguf [options]
|
| -------------------- | -------------------------------------------------------------------------- |
|
||||||
```
|
| C++23 compiler | GCC 13+ or Clang 16+ are good starting points. |
|
||||||
|
| CMake | Version 3.24 or newer. |
|
||||||
|
| libcurl | Required for Wikipedia requests. |
|
||||||
|
| Optional GPU tooling | CUDA on NVIDIA, HIP/ROCm on supported AMD systems, Metal on Apple Silicon. |
|
||||||
|
|
||||||
| Flag | Description |
|
Boost, Boost.DI, spdlog, and llama.cpp are fetched by CMake. On Apple Silicon, Metal is enabled automatically. On Linux, the build looks for CUDA or HIP/ROCm when the matching toolkit is present. Windows is not supported.
|
||||||
| --------------- | ----------------------------------------------- |
|
|
||||||
| `--mocked` | Use deterministic mock data instead of an LLM. |
|
|
||||||
| `--model`, `-m` | Path to the GGUF file. |
|
|
||||||
| `--temperature` | Model temperature (0.0 - 1.0). |
|
|
||||||
| `--n-ctx` | Context window size (default: 8192). |
|
|
||||||
| `--cache-dir` | Directory containing the `locations.json` file. |
|
|
||||||
|
|
||||||
## Building
|
|
||||||
|
|
||||||
### Requirements
|
|
||||||
|
|
||||||
- C++23 compiler (GCC 13+ / Clang 16+)
|
|
||||||
- CMake 3.20+
|
|
||||||
- Boost (JSON, Program_options), libcurl
|
|
||||||
- CUDA Toolkit 12.x (optional for GPU)
|
|
||||||
|
|
||||||
### Steps
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
mkdir build && cd build
|
cmake -S . -B build
|
||||||
cmake ..
|
cmake --build build
|
||||||
cmake --build . -j$(nproc)
|
|
||||||
```
|
```
|
||||||
|
|
||||||
|
If the dependency build fails on macOS, check the repo build notes.
|
||||||
|
|
||||||
|
## Run
|
||||||
|
|
||||||
|
Run the executable from the build directory so the copied `locations.json` is available.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./biergarten-pipeline --mocked
|
||||||
|
./biergarten-pipeline --model /path/to/model.gguf --temperature 0.8 --top-p 0.92 --n-ctx 8192 --seed -1
|
||||||
|
```
|
||||||
|
|
||||||
|
| Flag | Purpose |
|
||||||
|
| --------------- | -------------------------------------------- |
|
||||||
|
| `--mocked` | Uses the mock generator instead of a model. |
|
||||||
|
| `--model`, `-m` | Path to a GGUF model file.                   |
|
||||||
|
| `--temperature` | Sampling temperature. Default: `0.8`. |
|
||||||
|
| `--top-p` | Nucleus sampling parameter. Default: `0.92`. |
|
||||||
|
| `--n-ctx` | Context window size. Default: `8192`. |
|
||||||
|
| `--seed` | Random seed. Default: `-1`. |
|
||||||
|
| `--help`, `-h`  | Prints usage.                                |
|
||||||
|
|
||||||
|
`--mocked` and `--model` are mutually exclusive. If neither is set, the program exits with an error. The sampling flags only matter when a model is loaded. Enrichment lookups run sequentially, and an empty context is allowed: the city is still passed to the generator.
|
||||||
|
|
||||||
|
## Layout
|
||||||
|
|
||||||
|
| Path | Use |
|
||||||
|
| ---------------- | ------------------------------------------- |
|
||||||
|
| `includes/` | Public headers. |
|
||||||
|
| `src/` | Implementation files. |
|
||||||
|
| `locations.json` | Input city list copied into the build tree. |
|
||||||
|
| `prompts/` | Prompt text used by the model path. |
|
||||||
|
|||||||
146
pipeline/biergarten_pipeline.puml
Normal file
146
pipeline/biergarten_pipeline.puml
Normal file
@@ -0,0 +1,146 @@
|
|||||||
|
@startuml
|
||||||
|
title Biergarten Pipeline - Class and Composition Diagram
|
||||||
|
|
||||||
|
left to right direction
|
||||||
|
skinparam shadowing false
|
||||||
|
skinparam classAttributeIconSize 0
|
||||||
|
skinparam packageStyle rectangle
|
||||||
|
|
||||||
|
package "Composition root" {
|
||||||
|
class Main <<entrypoint>> {
|
||||||
|
+main(argc: int, argv: char**): int
|
||||||
|
}
|
||||||
|
|
||||||
|
class CurlGlobalState {
|
||||||
|
+CurlGlobalState()
|
||||||
|
+~CurlGlobalState()
|
||||||
|
}
|
||||||
|
|
||||||
|
note right of Main
|
||||||
|
Binds with Boost.DI:
|
||||||
|
- WebClient -> CURLWebClient
|
||||||
|
- IEnrichmentService -> WikipediaService
|
||||||
|
- DataGenerator -> MockGenerator or LlamaGenerator
|
||||||
|
- LlamaGenerator receives ApplicationOptions and model_path directly
|
||||||
|
end note
|
||||||
|
}
|
||||||
|
|
||||||
|
package "Core orchestration" {
|
||||||
|
class ApplicationOptions <<struct>> {
|
||||||
|
+model_path: std::string
|
||||||
|
+use_mocked: bool
|
||||||
|
+temperature: float
|
||||||
|
+top_p: float
|
||||||
|
+n_ctx: uint32_t
|
||||||
|
+seed: int
|
||||||
|
}
|
||||||
|
|
||||||
|
class BiergartenDataGenerator {
|
||||||
|
-context_service_: std::shared_ptr<IEnrichmentService>
|
||||||
|
-generator_: std::unique_ptr<DataGenerator>
|
||||||
|
+BiergartenDataGenerator(context_service: std::shared_ptr<IEnrichmentService>, generator: std::unique_ptr<DataGenerator>)
|
||||||
|
+Run(): bool
|
||||||
|
-QueryCitiesWithCountries(): std::vector<Location>
|
||||||
|
-GenerateBreweries(cities: std::vector<EnrichedCity>): void
|
||||||
|
-LogResults(): void
|
||||||
|
}
|
||||||
|
|
||||||
|
class EnrichedCity <<struct>> {
|
||||||
|
+location: Location
|
||||||
|
+region_context: std::string
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
package "Shared models" {
|
||||||
|
class Location
|
||||||
|
|
||||||
|
class BreweryResult <<struct>> {
|
||||||
|
+name: std::string
|
||||||
|
+description: std::string
|
||||||
|
}
|
||||||
|
|
||||||
|
class UserResult <<struct>> {
|
||||||
|
+username: std::string
|
||||||
|
+bio: std::string
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
package "Generation" {
|
||||||
|
interface DataGenerator {
|
||||||
|
+GenerateBrewery(city_name: std::string, country_name: std::string, region_context: std::string): BreweryResult
|
||||||
|
+GenerateUser(locale: std::string): UserResult
|
||||||
|
}
|
||||||
|
|
||||||
|
class MockGenerator {
|
||||||
|
+GenerateBrewery(city_name: std::string, country_name: std::string, region_context: std::string): BreweryResult
|
||||||
|
+GenerateUser(locale: std::string): UserResult
|
||||||
|
}
|
||||||
|
|
||||||
|
class LlamaGenerator {
|
||||||
|
+LlamaGenerator(options: ApplicationOptions, model_path: std::string)
|
||||||
|
+GenerateBrewery(city_name: std::string, country_name: std::string, region_context: std::string): BreweryResult
|
||||||
|
+GenerateUser(locale: std::string): UserResult
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
package "HTTP" {
|
||||||
|
interface WebClient {
|
||||||
|
+DownloadToFile(url: std::string, file_path: std::string): void
|
||||||
|
+Get(url: std::string): std::string
|
||||||
|
+UrlEncode(value: std::string): std::string
|
||||||
|
}
|
||||||
|
|
||||||
|
class CURLWebClient {
|
||||||
|
+CURLWebClient()
|
||||||
|
+~CURLWebClient()
|
||||||
|
+DownloadToFile(url: std::string, file_path: std::string): void
|
||||||
|
+Get(url: std::string): std::string
|
||||||
|
+UrlEncode(value: std::string): std::string
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
package "Wikipedia" {
|
||||||
|
interface IEnrichmentService {
|
||||||
|
+GetLocationContext(loc: Location): std::string
|
||||||
|
}
|
||||||
|
|
||||||
|
class WikipediaService {
|
||||||
|
+WikipediaService(client: std::shared_ptr<WebClient>)
|
||||||
|
+GetLocationContext(loc: Location): std::string
|
||||||
|
}
|
||||||
|
|
||||||
|
class JsonLoader {
|
||||||
|
{static} +LoadLocations(filepath: std::string): std::vector<Location>
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Main --> CurlGlobalState
|
||||||
|
Main --> ApplicationOptions
|
||||||
|
Main --> BiergartenDataGenerator
|
||||||
|
Main ..> IEnrichmentService : DI binding
|
||||||
|
Main ..> DataGenerator : DI factory
|
||||||
|
Main ..> CURLWebClient : DI binding
|
||||||
|
|
||||||
|
BiergartenDataGenerator *-- EnrichedCity
|
||||||
|
BiergartenDataGenerator ..> JsonLoader : LoadLocations()
|
||||||
|
BiergartenDataGenerator --> IEnrichmentService : context lookup
|
||||||
|
BiergartenDataGenerator --> DataGenerator : brewery generation
|
||||||
|
BiergartenDataGenerator ..> Location
|
||||||
|
BiergartenDataGenerator ..> BreweryResult
|
||||||
|
|
||||||
|
DataGenerator <|.. MockGenerator
|
||||||
|
DataGenerator <|.. LlamaGenerator
|
||||||
|
WebClient <|.. CURLWebClient
|
||||||
|
IEnrichmentService <|.. WikipediaService
|
||||||
|
|
||||||
|
WikipediaService --> WebClient : shared_ptr
|
||||||
|
|
||||||
|
note right of BiergartenDataGenerator
|
||||||
|
Current behavior:
|
||||||
|
samples up to four locations per run.
|
||||||
|
Enrichment runs once per sampled city.
|
||||||
|
If a lookup throws, that city is skipped.
|
||||||
|
Empty context is retained and still passed to the generator.
|
||||||
|
end note
|
||||||
|
|
||||||
|
@enduml
|
||||||
@@ -6,14 +6,14 @@
|
|||||||
* @brief Core orchestration class for pipeline data generation.
|
* @brief Core orchestration class for pipeline data generation.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#include <cstdint>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
#include "data_generation/data_generator.h"
|
#include "data_generation/data_generator.h"
|
||||||
#include "data_model/location.h"
|
#include "data_model/location.h"
|
||||||
#include "web_client/web_client.h"
|
#include "services/enrichment_service.h"
|
||||||
#include "wikipedia/wikipedia_service.h"
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Program options for the Biergarten pipeline application.
|
* @brief Program options for the Biergarten pipeline application.
|
||||||
@@ -27,9 +27,6 @@ struct ApplicationOptions {
|
|||||||
/// model_path.
|
/// model_path.
|
||||||
bool use_mocked = false;
|
bool use_mocked = false;
|
||||||
|
|
||||||
/// @brief Directory for cached JSON and database files.
|
|
||||||
std::string cache_dir;
|
|
||||||
|
|
||||||
/// @brief LLM sampling temperature (0.0 to 1.0, higher = more random).
|
/// @brief LLM sampling temperature (0.0 to 1.0, higher = more random).
|
||||||
float temperature = 0.8f;
|
float temperature = 0.8f;
|
||||||
|
|
||||||
@@ -43,10 +40,6 @@ struct ApplicationOptions {
|
|||||||
|
|
||||||
/// @brief Random seed for sampling (-1 for random, otherwise non-negative).
|
/// @brief Random seed for sampling (-1 for random, otherwise non-negative).
|
||||||
int seed = -1;
|
int seed = -1;
|
||||||
|
|
||||||
/// @brief Git commit hash for database consistency (always pinned to
|
|
||||||
/// c5eb7772).
|
|
||||||
std::string commit = "c5eb7772";
|
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -60,30 +53,30 @@ class BiergartenDataGenerator {
|
|||||||
/**
|
/**
|
||||||
* @brief Construct a BiergartenDataGenerator with injected dependencies.
|
* @brief Construct a BiergartenDataGenerator with injected dependencies.
|
||||||
*
|
*
|
||||||
* @param options Application configuration options.
|
* @param context_service Context provider for sampled locations.
|
||||||
* @param web_client HTTP client for downloading data.
|
* @param generator Brewery and user data generator.
|
||||||
*/
|
*/
|
||||||
BiergartenDataGenerator(const ApplicationOptions& options,
|
BiergartenDataGenerator(std::shared_ptr<IEnrichmentService> context_service,
|
||||||
std::shared_ptr<WebClient> web_client);
|
std::unique_ptr<DataGenerator> generator);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Run the data generation pipeline.
|
* @brief Run the data generation pipeline.
|
||||||
*
|
*
|
||||||
* Performs the following steps:
|
* Performs the following steps:
|
||||||
* 1. Load curated locations from JSON
|
* 1. Load curated locations from JSON
|
||||||
* 2. Initialize the generator (LLM or Mock)
|
* 2. Resolve context for each city using the injected context service
|
||||||
* 3. Generate brewery data for sampled cities
|
* 3. Generate brewery data for sampled cities
|
||||||
*
|
*
|
||||||
* @return 0 on success, 1 on failure.
|
* @return true on success, false on failure.
|
||||||
*/
|
*/
|
||||||
int Run();
|
bool Run();
|
||||||
|
|
||||||
private:
|
private:
|
||||||
/// @brief Immutable application options.
|
/// @brief Shared context provider dependency.
|
||||||
const ApplicationOptions options_;
|
std::shared_ptr<IEnrichmentService> context_service_;
|
||||||
|
|
||||||
/// @brief Shared HTTP client dependency.
|
/// @brief Generator dependency selected in the composition root.
|
||||||
std::shared_ptr<WebClient> webClient_;
|
std::unique_ptr<DataGenerator> generator_;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Enriched city data with Wikipedia context.
|
* @brief Enriched city data with Wikipedia context.
|
||||||
@@ -93,39 +86,19 @@ class BiergartenDataGenerator {
|
|||||||
std::string region_context;
|
std::string region_context;
|
||||||
};
|
};
|
||||||
|
|
||||||
/**
|
|
||||||
* @brief Initialize the data generator based on options.
|
|
||||||
*
|
|
||||||
* Creates either a MockGenerator (if no model path) or LlamaGenerator.
|
|
||||||
*
|
|
||||||
* @return A unique_ptr to the initialized generator.
|
|
||||||
*/
|
|
||||||
std::unique_ptr<DataGenerator> InitializeGenerator();
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Load locations from JSON and sample cities.
|
* @brief Load locations from JSON and sample cities.
|
||||||
*
|
*
|
||||||
* @return Vector of sampled locations, capped at four entries.
|
* @return Vector of sampled locations, capped at four entries.
|
||||||
*/
|
*/
|
||||||
std::vector<Location> QueryCitiesWithCountries();
|
static std::vector<Location> QueryCitiesWithCountries();
|
||||||
|
|
||||||
/**
|
|
||||||
* @brief Enrich cities with Wikipedia summaries.
|
|
||||||
*
|
|
||||||
* @param cities Vector of sampled locations.
|
|
||||||
* @return Vector of enriched city data with context.
|
|
||||||
*/
|
|
||||||
std::vector<EnrichedCity> EnrichWithWikipedia(
|
|
||||||
const std::vector<Location>& cities);
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Generate breweries for enriched cities.
|
* @brief Generate breweries for enriched cities.
|
||||||
*
|
*
|
||||||
* @param generator The data generator instance.
|
|
||||||
* @param cities Vector of enriched city data.
|
* @param cities Vector of enriched city data.
|
||||||
*/
|
*/
|
||||||
void GenerateBreweries(DataGenerator& generator,
|
void GenerateBreweries(const std::vector<EnrichedCity>& cities);
|
||||||
const std::vector<EnrichedCity>& cities);
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Log the generated brewery results.
|
* @brief Log the generated brewery results.
|
||||||
|
|||||||
@@ -38,13 +38,6 @@ class DataGenerator {
|
|||||||
/// @brief Virtual destructor for polymorphic cleanup.
|
/// @brief Virtual destructor for polymorphic cleanup.
|
||||||
virtual ~DataGenerator() = default;
|
virtual ~DataGenerator() = default;
|
||||||
|
|
||||||
/**
|
|
||||||
* @brief Loads and initializes generator resources.
|
|
||||||
*
|
|
||||||
* @param model_path Path to model assets. Implementations may ignore this.
|
|
||||||
*/
|
|
||||||
virtual void Load(const std::string& model_path) = 0;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Generates brewery data for a location.
|
* @brief Generates brewery data for a location.
|
||||||
*
|
*
|
||||||
|
|||||||
@@ -7,10 +7,13 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
|
#include <random>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
#include "data_generation/data_generator.h"
|
#include "data_generation/data_generator.h"
|
||||||
|
|
||||||
|
struct ApplicationOptions;
|
||||||
|
|
||||||
struct llama_model;
|
struct llama_model;
|
||||||
struct llama_context;
|
struct llama_context;
|
||||||
|
|
||||||
@@ -19,35 +22,19 @@ struct llama_context;
|
|||||||
*/
|
*/
|
||||||
class LlamaGenerator final : public DataGenerator {
|
class LlamaGenerator final : public DataGenerator {
|
||||||
public:
|
public:
|
||||||
/// @brief Constructs a generator with default sampling and context settings.
|
/**
|
||||||
LlamaGenerator() = default;
|
* @brief Constructs a generator using parsed application options and loads
|
||||||
|
* the configured model immediately.
|
||||||
|
*
|
||||||
|
* @param options Parsed application options.
|
||||||
|
* @param model_path Filesystem path to GGUF model assets.
|
||||||
|
*/
|
||||||
|
LlamaGenerator(const ApplicationOptions& options,
|
||||||
|
const std::string& model_path);
|
||||||
|
|
||||||
/// @brief Releases model/context resources.
|
/// @brief Releases model/context resources.
|
||||||
~LlamaGenerator() override;
|
~LlamaGenerator() override;
|
||||||
|
|
||||||
/**
|
|
||||||
* @brief Configures sampling parameters for generation.
|
|
||||||
*
|
|
||||||
* @param temperature Sampling temperature.
|
|
||||||
* @param top_p Nucleus sampling threshold.
|
|
||||||
* @param seed Seed for sampling; use -1 for random seed.
|
|
||||||
*/
|
|
||||||
void SetSamplingOptions(float temperature, float top_p, int seed = -1);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @brief Sets context window size used during model load.
|
|
||||||
*
|
|
||||||
* @param n_ctx Context size in tokens.
|
|
||||||
*/
|
|
||||||
void SetContextSize(uint32_t n_ctx);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @brief Loads model and prepares inference context.
|
|
||||||
*
|
|
||||||
* @param model_path Filesystem path to GGUF model.
|
|
||||||
*/
|
|
||||||
void Load(const std::string& model_path) override;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Generates brewery data for a specific location.
|
* @brief Generates brewery data for a specific location.
|
||||||
*
|
*
|
||||||
@@ -69,6 +56,13 @@ class LlamaGenerator final : public DataGenerator {
|
|||||||
UserResult GenerateUser(const std::string& locale) override;
|
UserResult GenerateUser(const std::string& locale) override;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
/**
|
||||||
|
* @brief Loads model and prepares inference context.
|
||||||
|
*
|
||||||
|
* @param model_path Filesystem path to GGUF model.
|
||||||
|
*/
|
||||||
|
void Load(const std::string& model_path);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Infers text from a user prompt.
|
* @brief Infers text from a user prompt.
|
||||||
*
|
*
|
||||||
@@ -121,7 +115,7 @@ class LlamaGenerator final : public DataGenerator {
|
|||||||
llama_context* context_ = nullptr;
|
llama_context* context_ = nullptr;
|
||||||
float sampling_temperature_ = 0.8f;
|
float sampling_temperature_ = 0.8f;
|
||||||
float sampling_top_p_ = 0.92f;
|
float sampling_top_p_ = 0.92f;
|
||||||
uint32_t sampling_seed_ = 0xFFFFFFFFu;
|
std::mt19937 rng_;
|
||||||
uint32_t n_ctx_ = 8192;
|
uint32_t n_ctx_ = 8192;
|
||||||
std::string brewery_system_prompt_;
|
std::string brewery_system_prompt_;
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -21,7 +21,7 @@ typedef int llama_token;
|
|||||||
* @return Processed region context.
|
* @return Processed region context.
|
||||||
*/
|
*/
|
||||||
std::string PrepareRegionContextPublic(std::string_view region_context,
|
std::string PrepareRegionContextPublic(std::string_view region_context,
|
||||||
std::size_t max_chars = 700);
|
std::size_t max_chars = 2000);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Parses a response expected to contain two logical lines.
|
* @brief Parses a response expected to contain two logical lines.
|
||||||
|
|||||||
@@ -16,13 +16,6 @@
|
|||||||
*/
|
*/
|
||||||
class MockGenerator final : public DataGenerator {
|
class MockGenerator final : public DataGenerator {
|
||||||
public:
|
public:
|
||||||
/**
|
|
||||||
* @brief Initializes the mock generator.
|
|
||||||
*
|
|
||||||
* @param model_path Unused for mock generation.
|
|
||||||
*/
|
|
||||||
void Load(const std::string& model_path) override;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Generates deterministic brewery data for a location.
|
* @brief Generates deterministic brewery data for a location.
|
||||||
*
|
*
|
||||||
|
|||||||
32
pipeline/includes/llama_backend_state.h
Normal file
32
pipeline/includes/llama_backend_state.h
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
#ifndef BIERGARTEN_PIPELINE_LLAMA_BACKEND_STATE_H_
|
||||||
|
#define BIERGARTEN_PIPELINE_LLAMA_BACKEND_STATE_H_
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @file llama_backend_state.h
|
||||||
|
* @brief RAII guard for llama.cpp backend process lifetime.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <llama.h>
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief RAII wrapper for llama_backend_init and llama_backend_free.
|
||||||
|
*
|
||||||
|
* Create one instance in application startup before using llama.cpp and keep
|
||||||
|
* it alive for application lifetime.
|
||||||
|
*/
|
||||||
|
class LlamaBackendState {
|
||||||
|
public:
|
||||||
|
/// @brief Initializes global llama backend state.
|
||||||
|
LlamaBackendState() { llama_backend_init(); }
|
||||||
|
|
||||||
|
/// @brief Cleans up global llama backend state.
|
||||||
|
~LlamaBackendState() { llama_backend_free(); }
|
||||||
|
|
||||||
|
/// @brief Non-copyable type.
|
||||||
|
LlamaBackendState(const LlamaBackendState&) = delete;
|
||||||
|
|
||||||
|
/// @brief Non-copy-assignable type.
|
||||||
|
LlamaBackendState& operator=(const LlamaBackendState&) = delete;
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif // BIERGARTEN_PIPELINE_LLAMA_BACKEND_STATE_H_
|
||||||
30
pipeline/includes/services/enrichment_service.h
Normal file
30
pipeline/includes/services/enrichment_service.h
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
#ifndef BIERGARTEN_PIPELINE_SERVICES_ENRICHMENT_SERVICE_H_
|
||||||
|
#define BIERGARTEN_PIPELINE_SERVICES_ENRICHMENT_SERVICE_H_
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @file services/enrichment_service.h
|
||||||
|
* @brief Abstraction for resolving contextual enrichment for a location.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
#include "data_model/location.h"
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Interface for services that can enrich a location with context.
|
||||||
|
*/
|
||||||
|
class IEnrichmentService {
|
||||||
|
public:
|
||||||
|
/// @brief Virtual destructor for polymorphic cleanup.
|
||||||
|
virtual ~IEnrichmentService() = default;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Resolves contextual enrichment for a location.
|
||||||
|
*
|
||||||
|
* @param loc Location to enrich.
|
||||||
|
* @return Context text, or an empty string if unavailable.
|
||||||
|
*/
|
||||||
|
virtual std::string GetLocationContext(const Location& loc) = 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif // BIERGARTEN_PIPELINE_SERVICES_ENRICHMENT_SERVICE_H_
|
||||||
@@ -1,8 +1,8 @@
|
|||||||
#ifndef BIERGARTEN_PIPELINE_WIKIPEDIA_WIKIPEDIA_SERVICE_H_
|
#ifndef BIERGARTEN_PIPELINE_WIKIPEDIA_SERVICE_H_
|
||||||
#define BIERGARTEN_PIPELINE_WIKIPEDIA_WIKIPEDIA_SERVICE_H_
|
#define BIERGARTEN_PIPELINE_WIKIPEDIA_SERVICE_H_
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @file wikipedia/wikipedia_service.h
|
* @file services/wikipedia_service.h
|
||||||
* @brief Wikipedia summary retrieval service with in-memory caching.
|
* @brief Wikipedia summary retrieval service with in-memory caching.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
@@ -11,22 +11,23 @@
|
|||||||
#include <string_view>
|
#include <string_view>
|
||||||
#include <unordered_map>
|
#include <unordered_map>
|
||||||
|
|
||||||
|
#include "services/enrichment_service.h"
|
||||||
#include "web_client/web_client.h"
|
#include "web_client/web_client.h"
|
||||||
|
|
||||||
/// @brief Provides cached Wikipedia summary lookups for city and country pairs.
|
/// @brief Provides cached Wikipedia summary lookups for city and country pairs.
|
||||||
class WikipediaService {
|
class WikipediaService final : public IEnrichmentService {
|
||||||
public:
|
public:
|
||||||
/// @brief Creates a new Wikipedia service with the provided web client.
|
/// @brief Creates a new Wikipedia service with the provided web client.
|
||||||
explicit WikipediaService(std::shared_ptr<WebClient> client);
|
explicit WikipediaService(std::shared_ptr<WebClient> client);
|
||||||
|
|
||||||
/// @brief Returns the Wikipedia summary extract for city and country.
|
/// @brief Returns the Wikipedia-derived context for a location.
|
||||||
[[nodiscard]] std::string GetSummary(std::string_view city,
|
[[nodiscard]] std::string GetLocationContext(const Location& loc) override;
|
||||||
std::string_view country);
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
std::string FetchExtract(std::string_view query);
|
std::string FetchExtract(std::string_view query);
|
||||||
std::shared_ptr<WebClient> client_;
|
std::shared_ptr<WebClient> client_;
|
||||||
std::unordered_map<std::string, std::string> cache_;
|
std::unordered_map<std::string, std::string> cache_;
|
||||||
|
std::unordered_map<std::string, std::string> extract_cache_;
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif // BIERGARTEN_PIPELINE_WIKIPEDIA_WIKIPEDIA_SERVICE_H_
|
#endif // BIERGARTEN_PIPELINE_WIKIPEDIA_SERVICE_H_
|
||||||
@@ -1,168 +0,0 @@
|
|||||||
/**
|
|
||||||
* @file biergarten_data_generator.cpp
|
|
||||||
* @brief Orchestrates end-to-end pipeline execution for city sampling,
|
|
||||||
* Wikipedia enrichment, generator initialization, and brewery result output.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "biergarten_data_generator.h"
|
|
||||||
|
|
||||||
#include <spdlog/spdlog.h>
|
|
||||||
|
|
||||||
#include <algorithm>
|
|
||||||
#include <filesystem>
|
|
||||||
#include <future>
|
|
||||||
#include <iterator>
|
|
||||||
#include <random>
|
|
||||||
|
|
||||||
#include "data_generation/llama_generator.h"
|
|
||||||
#include "data_generation/mock_generator.h"
|
|
||||||
#include "json_handling/json_loader.h"
|
|
||||||
#include "wikipedia/wikipedia_service.h"
|
|
||||||
|
|
||||||
BiergartenDataGenerator::BiergartenDataGenerator(
|
|
||||||
const ApplicationOptions& options, std::shared_ptr<WebClient> web_client)
|
|
||||||
: options_(options), webClient_(std::move(web_client)) {}
|
|
||||||
|
|
||||||
auto BiergartenDataGenerator::InitializeGenerator()
|
|
||||||
-> std::unique_ptr<DataGenerator> {
|
|
||||||
spdlog::info("Initializing brewery generator...");
|
|
||||||
|
|
||||||
std::unique_ptr<DataGenerator> generator;
|
|
||||||
if (options_.model_path.empty()) {
|
|
||||||
generator = std::make_unique<MockGenerator>();
|
|
||||||
spdlog::info("[Generator] Using MockGenerator (no model path provided)");
|
|
||||||
} else {
|
|
||||||
auto llama_generator = std::make_unique<LlamaGenerator>();
|
|
||||||
llama_generator->SetSamplingOptions(options_.temperature, options_.top_p,
|
|
||||||
options_.seed);
|
|
||||||
llama_generator->SetContextSize(options_.n_ctx);
|
|
||||||
spdlog::info(
|
|
||||||
"[Generator] Using LlamaGenerator: {} (temperature={}, top-p={}, "
|
|
||||||
"n_ctx={}, seed={})",
|
|
||||||
options_.model_path, options_.temperature, options_.top_p,
|
|
||||||
options_.n_ctx, options_.seed);
|
|
||||||
generator = std::move(llama_generator);
|
|
||||||
}
|
|
||||||
generator->Load(options_.model_path);
|
|
||||||
|
|
||||||
return generator;
|
|
||||||
}
|
|
||||||
|
|
||||||
auto BiergartenDataGenerator::QueryCitiesWithCountries()
|
|
||||||
-> std::vector<Location> {
|
|
||||||
spdlog::info("\n=== GEOGRAPHIC DATA OVERVIEW ===");
|
|
||||||
|
|
||||||
std::filesystem::path locations_path = "locations.json";
|
|
||||||
if (!std::filesystem::exists(locations_path)) {
|
|
||||||
const std::filesystem::path cache_path =
|
|
||||||
std::filesystem::path(options_.cache_dir) / "locations.json";
|
|
||||||
if (std::filesystem::exists(cache_path)) {
|
|
||||||
locations_path = cache_path;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
auto all_locations = JsonLoader::LoadLocations(locations_path.string());
|
|
||||||
spdlog::info(" Locations available: {}", all_locations.size());
|
|
||||||
|
|
||||||
const size_t sample_count = std::min<size_t>(4, all_locations.size());
|
|
||||||
std::vector<Location> sampled_locations;
|
|
||||||
sampled_locations.reserve(sample_count);
|
|
||||||
|
|
||||||
std::random_device random_generator;
|
|
||||||
std::sample(all_locations.begin(), all_locations.end(),
|
|
||||||
std::back_inserter(sampled_locations), sample_count,
|
|
||||||
random_generator);
|
|
||||||
|
|
||||||
spdlog::info(" Sampled locations: {}", sampled_locations.size());
|
|
||||||
return sampled_locations;
|
|
||||||
}
|
|
||||||
|
|
||||||
auto BiergartenDataGenerator::EnrichWithWikipedia(
|
|
||||||
const std::vector<Location>& cities) -> std::vector<EnrichedCity> {
|
|
||||||
std::vector<EnrichedCity> enriched;
|
|
||||||
enriched.reserve(cities.size());
|
|
||||||
|
|
||||||
std::vector<std::future<EnrichedCity>> pending;
|
|
||||||
pending.reserve(cities.size());
|
|
||||||
|
|
||||||
for (const auto& city : cities) {
|
|
||||||
pending.push_back(
|
|
||||||
std::async(std::launch::async, [web_client = webClient_, city]() {
|
|
||||||
WikipediaService wikipedia_service(web_client);
|
|
||||||
const std::string region_context =
|
|
||||||
wikipedia_service.GetSummary(city.city, city.country);
|
|
||||||
spdlog::debug("[Pipeline] Region context for {}: {}", city.city,
|
|
||||||
region_context);
|
|
||||||
return EnrichedCity{city, region_context};
|
|
||||||
}));
|
|
||||||
}
|
|
||||||
|
|
||||||
for (auto& task : pending) {
|
|
||||||
enriched.push_back(task.get());
|
|
||||||
}
|
|
||||||
|
|
||||||
return enriched;
|
|
||||||
}
|
|
||||||
|
|
||||||
void BiergartenDataGenerator::GenerateBreweries(
|
|
||||||
DataGenerator& generator, const std::vector<EnrichedCity>& cities) {
|
|
||||||
spdlog::info("\n=== SAMPLE BREWERY GENERATION ===");
|
|
||||||
generatedBreweries_.clear();
|
|
||||||
|
|
||||||
size_t skipped_count = 0;
|
|
||||||
|
|
||||||
for (const auto& enriched_city : cities) {
|
|
||||||
try {
|
|
||||||
auto brewery = generator.GenerateBrewery(
|
|
||||||
enriched_city.location.city, enriched_city.location.country,
|
|
||||||
enriched_city.region_context);
|
|
||||||
generatedBreweries_.push_back({enriched_city.location, brewery});
|
|
||||||
} catch (const std::exception& e) {
|
|
||||||
++skipped_count;
|
|
||||||
spdlog::warn(
|
|
||||||
"[Pipeline] Skipping city '{}' ({}): brewery generation failed: "
|
|
||||||
"{}",
|
|
||||||
enriched_city.location.city, enriched_city.location.country,
|
|
||||||
e.what());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (skipped_count > 0) {
|
|
||||||
spdlog::warn(
|
|
||||||
"[Pipeline] Skipped {} city/cities due to generation "
|
|
||||||
"errors",
|
|
||||||
skipped_count);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void BiergartenDataGenerator::LogResults() const {
|
|
||||||
spdlog::info("\n=== GENERATED DATA DUMP ===");
|
|
||||||
size_t index = 1;
|
|
||||||
for (const auto& entry : generatedBreweries_) {
|
|
||||||
spdlog::info(
|
|
||||||
"{}. city=\"{}\" country=\"{}\" state=\"{}\" "
|
|
||||||
"iso3166_2={} lat={} lon={}",
|
|
||||||
index, entry.location.city, entry.location.country,
|
|
||||||
entry.location.state_province, entry.location.iso3166_2,
|
|
||||||
entry.location.latitude, entry.location.longitude);
|
|
||||||
spdlog::info(" brewery_name=\"{}\"", entry.brewery.name);
|
|
||||||
spdlog::info(" brewery_description=\"{}\"", entry.brewery.description);
|
|
||||||
++index;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
auto BiergartenDataGenerator::Run() -> int {
|
|
||||||
try {
|
|
||||||
auto generator = InitializeGenerator();
|
|
||||||
auto cities = QueryCitiesWithCountries();
|
|
||||||
auto enriched = EnrichWithWikipedia(cities);
|
|
||||||
GenerateBreweries(*generator, enriched);
|
|
||||||
LogResults();
|
|
||||||
|
|
||||||
spdlog::info("\nOK: Pipeline completed successfully");
|
|
||||||
return 0;
|
|
||||||
} catch (const std::exception& e) {
|
|
||||||
spdlog::error("ERROR: Pipeline failed: {}", e.what());
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
14
pipeline/src/biergarten_data_generator/constructor.cpp
Normal file
14
pipeline/src/biergarten_data_generator/constructor.cpp
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
/**
|
||||||
|
* @file biergarten_data_generator/constructor.cpp
|
||||||
|
* @brief BiergartenDataGenerator constructor implementation.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <utility>
|
||||||
|
|
||||||
|
#include "biergarten_data_generator.h"
|
||||||
|
|
||||||
|
BiergartenDataGenerator::BiergartenDataGenerator(
|
||||||
|
std::shared_ptr<IEnrichmentService> context_service,
|
||||||
|
std::unique_ptr<DataGenerator> generator)
|
||||||
|
: context_service_(std::move(context_service)),
|
||||||
|
generator_(std::move(generator)) {}
|
||||||
@@ -0,0 +1,40 @@
|
|||||||
|
/**
|
||||||
|
* @file biergarten_data_generator/generate_breweries.cpp
|
||||||
|
* @brief BiergartenDataGenerator::GenerateBreweries() implementation.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <spdlog/spdlog.h>
|
||||||
|
|
||||||
|
#include "biergarten_data_generator.h"
|
||||||
|
|
||||||
|
void BiergartenDataGenerator::GenerateBreweries(
|
||||||
|
const std::vector<EnrichedCity>& cities) {
|
||||||
|
spdlog::info("\n=== SAMPLE BREWERY GENERATION ===");
|
||||||
|
generatedBreweries_.clear();
|
||||||
|
|
||||||
|
size_t skipped_count = 0;
|
||||||
|
|
||||||
|
for (const auto& enriched_city : cities) {
|
||||||
|
try {
|
||||||
|
auto brewery = generator_->GenerateBrewery(
|
||||||
|
enriched_city.location.city, enriched_city.location.country,
|
||||||
|
enriched_city.region_context);
|
||||||
|
generatedBreweries_.push_back(GeneratedBrewery{
|
||||||
|
.location = enriched_city.location, .brewery = brewery});
|
||||||
|
} catch (const std::exception& e) {
|
||||||
|
++skipped_count;
|
||||||
|
spdlog::warn(
|
||||||
|
"[Pipeline] Skipping city '{}' ({}): brewery generation failed: "
|
||||||
|
"{}",
|
||||||
|
enriched_city.location.city, enriched_city.location.country,
|
||||||
|
e.what());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (skipped_count > 0) {
|
||||||
|
spdlog::warn(
|
||||||
|
"[Pipeline] Skipped {} city/cities due to generation "
|
||||||
|
"errors",
|
||||||
|
skipped_count);
|
||||||
|
}
|
||||||
|
}
|
||||||
23
pipeline/src/biergarten_data_generator/log_results.cpp
Normal file
23
pipeline/src/biergarten_data_generator/log_results.cpp
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
/**
|
||||||
|
* @file biergarten_data_generator/log_results.cpp
|
||||||
|
* @brief BiergartenDataGenerator::LogResults() implementation.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <spdlog/spdlog.h>
|
||||||
|
|
||||||
|
#include "biergarten_data_generator.h"
|
||||||
|
|
||||||
|
void BiergartenDataGenerator::LogResults() const {
|
||||||
|
spdlog::info("\n=== GENERATED DATA DUMP ===");
|
||||||
|
size_t index = 1;
|
||||||
|
for (const auto& [location, brewery] : generatedBreweries_) {
|
||||||
|
spdlog::info(
|
||||||
|
"{}. city=\"{}\" country=\"{}\" state=\"{}\" "
|
||||||
|
"iso3166_2={} lat={} lon={}",
|
||||||
|
index, location.city, location.country, location.state_province,
|
||||||
|
location.iso3166_2, location.latitude, location.longitude);
|
||||||
|
spdlog::info(" brewery_name=\"{}\"", brewery.name);
|
||||||
|
spdlog::info(" brewery_description=\"{}\"", brewery.description);
|
||||||
|
++index;
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,40 @@
|
|||||||
|
/**
|
||||||
|
* @file biergarten_data_generator/query_cities_with_countries.cpp
|
||||||
|
* @brief BiergartenDataGenerator::QueryCitiesWithCountries() implementation.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <spdlog/spdlog.h>
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
|
#include <filesystem>
|
||||||
|
#include <random>
|
||||||
|
|
||||||
|
#include "biergarten_data_generator.h"
|
||||||
|
#include "json_handling/json_loader.h"
|
||||||
|
|
||||||
|
static constexpr unsigned int brewery_amount = 4;
|
||||||
|
|
||||||
|
auto BiergartenDataGenerator::QueryCitiesWithCountries()
|
||||||
|
-> std::vector<Location> {
|
||||||
|
spdlog::info("\n=== GEOGRAPHIC DATA OVERVIEW ===");
|
||||||
|
|
||||||
|
const std::filesystem::path locations_path = "locations.json";
|
||||||
|
|
||||||
|
auto all_locations = JsonLoader::LoadLocations(locations_path.string());
|
||||||
|
spdlog::info(" Locations available: {}", all_locations.size());
|
||||||
|
|
||||||
|
const size_t sample_count =
|
||||||
|
std::min<size_t>(brewery_amount, all_locations.size());
|
||||||
|
const auto sample_count_signed =
|
||||||
|
static_cast<std::iter_difference_t<decltype(all_locations.cbegin())>>(
|
||||||
|
sample_count);
|
||||||
|
std::vector<Location> sampled_locations;
|
||||||
|
sampled_locations.reserve(sample_count);
|
||||||
|
|
||||||
|
std::random_device random_generator;
|
||||||
|
std::ranges::sample(all_locations, std::back_inserter(sampled_locations),
|
||||||
|
sample_count_signed, random_generator);
|
||||||
|
|
||||||
|
spdlog::info(" Sampled locations: {}", sampled_locations.size());
|
||||||
|
return sampled_locations;
|
||||||
|
}
|
||||||
47
pipeline/src/biergarten_data_generator/run.cpp
Normal file
47
pipeline/src/biergarten_data_generator/run.cpp
Normal file
@@ -0,0 +1,47 @@
|
|||||||
|
/**
|
||||||
|
* @file biergarten_data_generator/run.cpp
|
||||||
|
* @brief BiergartenDataGenerator::Run() implementation.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <spdlog/spdlog.h>
|
||||||
|
|
||||||
|
#include "biergarten_data_generator.h"
|
||||||
|
|
||||||
|
auto BiergartenDataGenerator::Run() -> bool {
|
||||||
|
try {
|
||||||
|
const std::vector<Location> cities = QueryCitiesWithCountries();
|
||||||
|
std::vector<EnrichedCity> enriched;
|
||||||
|
enriched.reserve(cities.size());
|
||||||
|
|
||||||
|
size_t skipped_count = 0;
|
||||||
|
for (const auto& city : cities) {
|
||||||
|
try {
|
||||||
|
const std::string region_context =
|
||||||
|
context_service_->GetLocationContext(city);
|
||||||
|
spdlog::info("[Pipeline] Context for '{}' ({}) gathered:\n{}",
|
||||||
|
city.city, city.country, region_context);
|
||||||
|
|
||||||
|
enriched.push_back(EnrichedCity{.location = city,
|
||||||
|
.region_context = region_context});
|
||||||
|
} catch (const std::exception& exception) {
|
||||||
|
++skipped_count;
|
||||||
|
spdlog::warn(
|
||||||
|
"[Pipeline] Skipping city '{}' ({}): context lookup failed: {}",
|
||||||
|
city.city, city.country, exception.what());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (skipped_count > 0) {
|
||||||
|
spdlog::warn(
|
||||||
|
"[Pipeline] Skipped {} city/cities due to context lookup errors",
|
||||||
|
skipped_count);
|
||||||
|
}
|
||||||
|
|
||||||
|
this->GenerateBreweries(enriched);
|
||||||
|
this->LogResults();
|
||||||
|
return true;
|
||||||
|
} catch (const std::exception& e) {
|
||||||
|
spdlog::error("Pipeline execution failed with error: {}", e.what());
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
51
pipeline/src/data_generation/llama/constructor.cpp
Normal file
51
pipeline/src/data_generation/llama/constructor.cpp
Normal file
@@ -0,0 +1,51 @@
|
|||||||
|
/**
|
||||||
|
* @file data_generation/llama/constructor.cpp
|
||||||
|
* @brief LlamaGenerator constructor implementation.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <random>
|
||||||
|
#include <stdexcept>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
#include "biergarten_data_generator.h"
|
||||||
|
#include "data_generation/llama_generator.h"
|
||||||
|
|
||||||
|
LlamaGenerator::LlamaGenerator(const ApplicationOptions& options,
|
||||||
|
const std::string& model_path)
|
||||||
|
: rng_() {
|
||||||
|
if (model_path.empty()) {
|
||||||
|
throw std::runtime_error("LlamaGenerator: model path must not be empty");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (options.temperature < 0.0F) {
|
||||||
|
throw std::runtime_error(
|
||||||
|
"LlamaGenerator: sampling temperature must be >= 0");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (options.top_p <= 0.0F || options.top_p > 1.0F) {
|
||||||
|
throw std::runtime_error(
|
||||||
|
"LlamaGenerator: sampling top-p must be in (0, 1]");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (options.seed < -1) {
|
||||||
|
throw std::runtime_error(
|
||||||
|
"LlamaGenerator: seed must be >= 0, or -1 for random");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (options.n_ctx == 0 || options.n_ctx > 32768) {
|
||||||
|
throw std::runtime_error(
|
||||||
|
"LlamaGenerator: context size must be in range [1, 32768]");
|
||||||
|
}
|
||||||
|
|
||||||
|
sampling_temperature_ = options.temperature;
|
||||||
|
sampling_top_p_ = options.top_p;
|
||||||
|
if (options.seed == -1) {
|
||||||
|
std::random_device random_device;
|
||||||
|
rng_.seed(random_device());
|
||||||
|
} else {
|
||||||
|
rng_.seed(static_cast<uint32_t>(options.seed));
|
||||||
|
}
|
||||||
|
n_ctx_ = options.n_ctx;
|
||||||
|
|
||||||
|
Load(model_path);
|
||||||
|
}
|
||||||
@@ -23,9 +23,4 @@ LlamaGenerator::~LlamaGenerator() {
|
|||||||
llama_model_free(model_);
|
llama_model_free(model_);
|
||||||
model_ = nullptr;
|
model_ = nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Clean up the backend (GPU/CPU acceleration resources)
|
|
||||||
*/
|
|
||||||
llama_backend_free();
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -16,12 +16,10 @@
|
|||||||
#include "data_generation/llama_generator.h"
|
#include "data_generation/llama_generator.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
namespace {
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* String trimming: removes leading and trailing whitespace
|
* String trimming: removes leading and trailing whitespace
|
||||||
*/
|
*/
|
||||||
std::string Trim(std::string value) {
|
static std::string Trim(std::string value) {
|
||||||
auto not_space = [](unsigned char ch) { return !std::isspace(ch); };
|
auto not_space = [](unsigned char ch) { return !std::isspace(ch); };
|
||||||
|
|
||||||
value.erase(value.begin(),
|
value.erase(value.begin(),
|
||||||
@@ -36,7 +34,7 @@ std::string Trim(std::string value) {
|
|||||||
* Normalize whitespace: collapses multiple spaces/tabs/newlines into single
|
* Normalize whitespace: collapses multiple spaces/tabs/newlines into single
|
||||||
* spaces
|
* spaces
|
||||||
*/
|
*/
|
||||||
std::string CondenseWhitespace(std::string text) {
|
static std::string CondenseWhitespace(std::string text) {
|
||||||
std::string out;
|
std::string out;
|
||||||
out.reserve(text.size());
|
out.reserve(text.size());
|
||||||
|
|
||||||
@@ -61,8 +59,8 @@ std::string CondenseWhitespace(std::string text) {
|
|||||||
* Truncate region context to fit within max length while preserving word
|
* Truncate region context to fit within max length while preserving word
|
||||||
* boundaries
|
* boundaries
|
||||||
*/
|
*/
|
||||||
std::string PrepareRegionContext(std::string_view region_context,
|
static std::string PrepareRegionContext(std::string_view region_context,
|
||||||
std::size_t max_chars) {
|
std::size_t max_chars) {
|
||||||
std::string normalized = CondenseWhitespace(std::string(region_context));
|
std::string normalized = CondenseWhitespace(std::string(region_context));
|
||||||
if (normalized.size() <= max_chars) {
|
if (normalized.size() <= max_chars) {
|
||||||
return normalized;
|
return normalized;
|
||||||
@@ -81,7 +79,7 @@ std::string PrepareRegionContext(std::string_view region_context,
|
|||||||
/**
|
/**
|
||||||
* Remove common bullet points, numbers, and field labels added by LLM in output
|
* Remove common bullet points, numbers, and field labels added by LLM in output
|
||||||
*/
|
*/
|
||||||
std::string StripCommonPrefix(std::string line) {
|
static std::string StripCommonPrefix(std::string line) {
|
||||||
line = Trim(std::move(line));
|
line = Trim(std::move(line));
|
||||||
|
|
||||||
if (!line.empty() && (line[0] == '-' || line[0] == '*')) {
|
if (!line.empty() && (line[0] == '-' || line[0] == '*')) {
|
||||||
@@ -126,7 +124,7 @@ std::string StripCommonPrefix(std::string line) {
|
|||||||
* Parse two-line response from LLM: normalize line endings, strip formatting,
|
* Parse two-line response from LLM: normalize line endings, strip formatting,
|
||||||
* filter spurious output, and combine remaining lines if needed
|
* filter spurious output, and combine remaining lines if needed
|
||||||
*/
|
*/
|
||||||
std::pair<std::string, std::string> ParseTwoLineResponse(
|
static std::pair<std::string, std::string> ParseTwoLineResponse(
|
||||||
const std::string& raw, const std::string& error_message) {
|
const std::string& raw, const std::string& error_message) {
|
||||||
std::string normalized = raw;
|
std::string normalized = raw;
|
||||||
std::replace(normalized.begin(), normalized.end(), '\r', '\n');
|
std::replace(normalized.begin(), normalized.end(), '\r', '\n');
|
||||||
@@ -177,8 +175,8 @@ std::pair<std::string, std::string> ParseTwoLineResponse(
|
|||||||
/**
|
/**
|
||||||
* Apply model's chat template to user-only prompt, formatting it for the model
|
* Apply model's chat template to user-only prompt, formatting it for the model
|
||||||
*/
|
*/
|
||||||
std::string ToChatPrompt(const llama_model* model,
|
static std::string ToChatPrompt(const llama_model* model,
|
||||||
const std::string& user_prompt) {
|
const std::string& user_prompt) {
|
||||||
const char* tmpl = llama_model_chat_template(model, nullptr);
|
const char* tmpl = llama_model_chat_template(model, nullptr);
|
||||||
if (tmpl == nullptr) {
|
if (tmpl == nullptr) {
|
||||||
return user_prompt;
|
return user_prompt;
|
||||||
@@ -214,9 +212,9 @@ std::string ToChatPrompt(const llama_model* model,
|
|||||||
* Apply model's chat template to system+user prompt pair, formatting for the
|
* Apply model's chat template to system+user prompt pair, formatting for the
|
||||||
* model
|
* model
|
||||||
*/
|
*/
|
||||||
std::string ToChatPrompt(const llama_model* model,
|
static std::string ToChatPrompt(const llama_model* model,
|
||||||
const std::string& system_prompt,
|
const std::string& system_prompt,
|
||||||
const std::string& user_prompt) {
|
const std::string& user_prompt) {
|
||||||
const char* tmpl = llama_model_chat_template(model, nullptr);
|
const char* tmpl = llama_model_chat_template(model, nullptr);
|
||||||
if (tmpl == nullptr) {
|
if (tmpl == nullptr) {
|
||||||
return system_prompt + "\n\n" + user_prompt;
|
return system_prompt + "\n\n" + user_prompt;
|
||||||
@@ -249,8 +247,8 @@ std::string ToChatPrompt(const llama_model* model,
|
|||||||
return std::string(buffer.data(), static_cast<std::size_t>(required));
|
return std::string(buffer.data(), static_cast<std::size_t>(required));
|
||||||
}
|
}
|
||||||
|
|
||||||
void AppendTokenPiece(const llama_vocab* vocab, llama_token token,
|
static void AppendTokenPiece(const llama_vocab* vocab, llama_token token,
|
||||||
std::string& output) {
|
std::string& output) {
|
||||||
std::array<char, 256> buffer{};
|
std::array<char, 256> buffer{};
|
||||||
int32_t bytes =
|
int32_t bytes =
|
||||||
llama_token_to_piece(vocab, token, buffer.data(),
|
llama_token_to_piece(vocab, token, buffer.data(),
|
||||||
@@ -273,7 +271,8 @@ void AppendTokenPiece(const llama_vocab* vocab, llama_token token,
|
|||||||
output.append(buffer.data(), static_cast<std::size_t>(bytes));
|
output.append(buffer.data(), static_cast<std::size_t>(bytes));
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ExtractFirstJsonObject(const std::string& text, std::string& json_out) {
|
static bool ExtractFirstJsonObject(const std::string& text,
|
||||||
|
std::string& json_out) {
|
||||||
std::size_t start = std::string::npos;
|
std::size_t start = std::string::npos;
|
||||||
int depth = 0;
|
int depth = 0;
|
||||||
bool in_string = false;
|
bool in_string = false;
|
||||||
@@ -321,8 +320,9 @@ bool ExtractFirstJsonObject(const std::string& text, std::string& json_out) {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string ValidateBreweryJson(const std::string& raw, std::string& name_out,
|
static std::string ValidateBreweryJson(const std::string& raw,
|
||||||
std::string& description_out) {
|
std::string& name_out,
|
||||||
|
std::string& description_out) {
|
||||||
auto validate_object = [&](const boost::json::value& jv,
|
auto validate_object = [&](const boost::json::value& jv,
|
||||||
std::string& error_out) -> bool {
|
std::string& error_out) -> bool {
|
||||||
if (!jv.is_object()) {
|
if (!jv.is_object()) {
|
||||||
@@ -403,8 +403,6 @@ std::string ValidateBreweryJson(const std::string& raw, std::string& name_out,
|
|||||||
return {};
|
return {};
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace
|
|
||||||
|
|
||||||
// Forward declarations for helper functions exposed to other translation units
|
// Forward declarations for helper functions exposed to other translation units
|
||||||
std::string PrepareRegionContextPublic(std::string_view region_context,
|
std::string PrepareRegionContextPublic(std::string_view region_context,
|
||||||
std::size_t max_chars) {
|
std::size_t max_chars) {
|
||||||
|
|||||||
@@ -145,8 +145,7 @@ std::string LlamaGenerator::InferFormatted(const std::string& formatted_prompt,
|
|||||||
* Distribution sampler: selects actual token using configured seed for
|
* Distribution sampler: selects actual token using configured seed for
|
||||||
* reproducibility
|
* reproducibility
|
||||||
*/
|
*/
|
||||||
llama_sampler_chain_add(sampler.get(),
|
llama_sampler_chain_add(sampler.get(), llama_sampler_init_dist(rng_()));
|
||||||
llama_sampler_init_dist(sampling_seed_));
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* TOKEN GENERATION LOOP
|
* TOKEN GENERATION LOOP
|
||||||
@@ -187,10 +186,5 @@ std::string LlamaGenerator::InferFormatted(const std::string& formatted_prompt,
|
|||||||
for (const llama_token token : generated_tokens)
|
for (const llama_token token : generated_tokens)
|
||||||
AppendTokenPiecePublic(vocab, token, output);
|
AppendTokenPiecePublic(vocab, token, output);
|
||||||
|
|
||||||
/**
|
|
||||||
* Advance seed for next generation to improve output diversity
|
|
||||||
*/
|
|
||||||
sampling_seed_ = (sampling_seed_ == 0xFFFFFFFFu) ? 0 : sampling_seed_ + 1;
|
|
||||||
|
|
||||||
return output;
|
return output;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,11 +1,12 @@
|
|||||||
/**
|
/**
|
||||||
* @file data_generation/llama/load.cpp
|
* @file data_generation/llama/load.cpp
|
||||||
* @brief Initializes llama backend, loads model weights, creates inference
|
* @brief Loads model weights, creates inference
|
||||||
* context, and resets prior resources during model reload.
|
* context, and resets prior resources during model initialization.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <spdlog/spdlog.h>
|
#include <spdlog/spdlog.h>
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
#include <stdexcept>
|
#include <stdexcept>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
@@ -13,12 +14,6 @@
|
|||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
void LlamaGenerator::Load(const std::string& model_path) {
|
void LlamaGenerator::Load(const std::string& model_path) {
|
||||||
/**
|
|
||||||
* Validate input and clean up any previously loaded model/context
|
|
||||||
*/
|
|
||||||
if (model_path.empty())
|
|
||||||
throw std::runtime_error("LlamaGenerator: model path must not be empty");
|
|
||||||
|
|
||||||
if (context_ != nullptr) {
|
if (context_ != nullptr) {
|
||||||
llama_free(context_);
|
llama_free(context_);
|
||||||
context_ = nullptr;
|
context_ = nullptr;
|
||||||
@@ -28,11 +23,6 @@ void LlamaGenerator::Load(const std::string& model_path) {
|
|||||||
model_ = nullptr;
|
model_ = nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Initialize the llama backend (one-time setup for GPU/CPU acceleration)
|
|
||||||
*/
|
|
||||||
llama_backend_init();
|
|
||||||
|
|
||||||
llama_model_params model_params = llama_model_default_params();
|
llama_model_params model_params = llama_model_default_params();
|
||||||
model_ = llama_model_load_from_file(model_path.c_str(), model_params);
|
model_ = llama_model_load_from_file(model_path.c_str(), model_params);
|
||||||
if (model_ == nullptr) {
|
if (model_ == nullptr) {
|
||||||
@@ -42,7 +32,7 @@ void LlamaGenerator::Load(const std::string& model_path) {
|
|||||||
|
|
||||||
llama_context_params context_params = llama_context_default_params();
|
llama_context_params context_params = llama_context_default_params();
|
||||||
context_params.n_ctx = n_ctx_;
|
context_params.n_ctx = n_ctx_;
|
||||||
context_params.n_batch = n_ctx_; // Set batch size equal to context window
|
context_params.n_batch = std::min(n_ctx_, static_cast<uint32_t>(512));
|
||||||
|
|
||||||
context_ = llama_init_from_model(model_, context_params);
|
context_ = llama_init_from_model(model_, context_params);
|
||||||
if (context_ == nullptr) {
|
if (context_ == nullptr) {
|
||||||
|
|||||||
@@ -1,64 +0,0 @@
|
|||||||
/**
|
|
||||||
* @file data_generation/llama/set_sampling_options.cpp
|
|
||||||
* @brief Validates and stores sampling temperature, top-p, seed, and context
|
|
||||||
* size configuration used by subsequent LlamaGenerator inference calls.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <stdexcept>
|
|
||||||
|
|
||||||
#include "data_generation/llama_generator.h"
|
|
||||||
#include "llama.h"
|
|
||||||
|
|
||||||
void LlamaGenerator::SetSamplingOptions(float temperature, float top_p,
|
|
||||||
int seed) {
|
|
||||||
/**
|
|
||||||
* Validate temperature: controls randomness in output distribution
|
|
||||||
* 0.0 = deterministic (always pick highest probability token)
|
|
||||||
* Higher values = more random/diverse output
|
|
||||||
*/
|
|
||||||
if (temperature < 0.0f) {
|
|
||||||
throw std::runtime_error(
|
|
||||||
"LlamaGenerator: sampling temperature must be >= 0");
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Validate top-p (nucleus sampling): only sample from top cumulative
|
|
||||||
* probability e.g., top-p=0.9 means sample from tokens that make up 90% of
|
|
||||||
* probability mass
|
|
||||||
*/
|
|
||||||
if (!(top_p > 0.0f && top_p <= 1.0f)) {
|
|
||||||
throw std::runtime_error(
|
|
||||||
"LlamaGenerator: sampling top-p must be in (0, 1]");
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Validate seed: for reproducible results (-1 uses random seed)
|
|
||||||
*/
|
|
||||||
if (seed < -1) {
|
|
||||||
throw std::runtime_error(
|
|
||||||
"LlamaGenerator: seed must be >= 0, or -1 for random");
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Store sampling parameters for use during token generation
|
|
||||||
*/
|
|
||||||
sampling_temperature_ = temperature;
|
|
||||||
sampling_top_p_ = top_p;
|
|
||||||
sampling_seed_ = (seed < 0) ? static_cast<uint32_t>(LLAMA_DEFAULT_SEED)
|
|
||||||
: static_cast<uint32_t>(seed);
|
|
||||||
}
|
|
||||||
|
|
||||||
void LlamaGenerator::SetContextSize(uint32_t n_ctx) {
|
|
||||||
/**
|
|
||||||
* Validate context size: must be positive and reasonable for the model
|
|
||||||
*/
|
|
||||||
if (n_ctx == 0 || n_ctx > 32768) {
|
|
||||||
throw std::runtime_error(
|
|
||||||
"LlamaGenerator: context size must be in range [1, 32768]");
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Store context size for use during model loading
|
|
||||||
*/
|
|
||||||
n_ctx_ = n_ctx;
|
|
||||||
}
|
|
||||||
@@ -1,15 +0,0 @@
|
|||||||
/**
|
|
||||||
* @file data_generation/mock/load.cpp
|
|
||||||
* @brief Provides MockGenerator initialization behavior, which is a no-op load
|
|
||||||
* path that logs readiness without model resources.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <spdlog/spdlog.h>
|
|
||||||
|
|
||||||
#include <string>
|
|
||||||
|
|
||||||
#include "data_generation/mock_generator.h"
|
|
||||||
|
|
||||||
void MockGenerator::Load(const std::string& /*modelPath*/) {
|
|
||||||
spdlog::info("[MockGenerator] No model needed");
|
|
||||||
}
|
|
||||||
@@ -13,10 +13,8 @@
|
|||||||
#include <sstream>
|
#include <sstream>
|
||||||
#include <stdexcept>
|
#include <stdexcept>
|
||||||
|
|
||||||
namespace {
|
static auto ReadRequiredString(const boost::json::object& object,
|
||||||
|
const char* key) -> std::string {
|
||||||
auto ReadRequiredString(const boost::json::object& object, const char* key)
|
|
||||||
-> std::string {
|
|
||||||
const boost::json::value* value = object.if_contains(key);
|
const boost::json::value* value = object.if_contains(key);
|
||||||
if (value == nullptr || !value->is_string()) {
|
if (value == nullptr || !value->is_string()) {
|
||||||
throw std::runtime_error(
|
throw std::runtime_error(
|
||||||
@@ -25,8 +23,8 @@ auto ReadRequiredString(const boost::json::object& object, const char* key)
|
|||||||
return std::string(value->as_string().c_str());
|
return std::string(value->as_string().c_str());
|
||||||
}
|
}
|
||||||
|
|
||||||
auto ReadRequiredNumber(const boost::json::object& object, const char* key)
|
static auto ReadRequiredNumber(const boost::json::object& object,
|
||||||
-> double {
|
const char* key) -> double {
|
||||||
const boost::json::value* value = object.if_contains(key);
|
const boost::json::value* value = object.if_contains(key);
|
||||||
if (value == nullptr || !value->is_number()) {
|
if (value == nullptr || !value->is_number()) {
|
||||||
throw std::runtime_error(
|
throw std::runtime_error(
|
||||||
@@ -35,8 +33,6 @@ auto ReadRequiredNumber(const boost::json::object& object, const char* key)
|
|||||||
return value->to_number<double>();
|
return value->to_number<double>();
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace
|
|
||||||
|
|
||||||
auto JsonLoader::LoadLocations(const std::string& filepath)
|
auto JsonLoader::LoadLocations(const std::string& filepath)
|
||||||
-> std::vector<Location> {
|
-> std::vector<Location> {
|
||||||
std::ifstream input(filepath);
|
std::ifstream input(filepath);
|
||||||
|
|||||||
@@ -6,14 +6,23 @@
|
|||||||
|
|
||||||
#include <spdlog/spdlog.h>
|
#include <spdlog/spdlog.h>
|
||||||
|
|
||||||
|
#include <boost/di.hpp>
|
||||||
#include <boost/program_options.hpp>
|
#include <boost/program_options.hpp>
|
||||||
#include <iostream>
|
#include <exception>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
|
#include <sstream>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
#include "biergarten_data_generator.h"
|
#include "biergarten_data_generator.h"
|
||||||
|
#include "data_generation/llama_generator.h"
|
||||||
|
#include "data_generation/mock_generator.h"
|
||||||
|
#include "llama_backend_state.h"
|
||||||
|
#include "services/enrichment_service.h"
|
||||||
|
#include "services/wikipedia_service.h"
|
||||||
#include "web_client/curl_web_client.h"
|
#include "web_client/curl_web_client.h"
|
||||||
|
|
||||||
namespace po = boost::program_options;
|
namespace prog_opts = boost::program_options;
|
||||||
|
namespace di = boost::di;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Parse command-line arguments into ApplicationOptions.
|
* @brief Parse command-line arguments into ApplicationOptions.
|
||||||
@@ -23,123 +32,135 @@ namespace po = boost::program_options;
|
|||||||
* @param options Output ApplicationOptions struct.
|
* @param options Output ApplicationOptions struct.
|
||||||
* @return true if parsing succeeded and should proceed, false otherwise.
|
* @return true if parsing succeeded and should proceed, false otherwise.
|
||||||
*/
|
*/
|
||||||
bool ParseArguments(int argc, char** argv, ApplicationOptions& options) {
|
auto ParseArguments(const int argc, char** argv,
|
||||||
// If no arguments provided, display usage and exit
|
ApplicationOptions& options) noexcept -> bool {
|
||||||
if (argc == 1) {
|
prog_opts::options_description desc("Pipeline Options");
|
||||||
std::cout << "Biergarten Pipeline - Geographic Data Pipeline with "
|
|
||||||
"Brewery Generation\n\n";
|
|
||||||
std::cout << "Usage: biergarten-pipeline [options]\n\n";
|
|
||||||
std::cout << "Options:\n";
|
|
||||||
std::cout << " --mocked Use mocked generator for "
|
|
||||||
"brewery/user data\n";
|
|
||||||
std::cout << " --model, -m PATH Path to LLM model file (gguf) for "
|
|
||||||
"generation\n";
|
|
||||||
std::cout << " --cache-dir, -c DIR Directory for cached JSON (default: "
|
|
||||||
"/tmp)\n";
|
|
||||||
std::cout << " --temperature TEMP LLM sampling temperature 0.0-1.0 "
|
|
||||||
"(default: 0.8)\n";
|
|
||||||
std::cout << " --top-p VALUE Nucleus sampling parameter 0.0-1.0 "
|
|
||||||
"(default: 0.92)\n";
|
|
||||||
std::cout << " --n-ctx SIZE Context window size in tokens "
|
|
||||||
"(default: 4096)\n";
|
|
||||||
std::cout << " --seed SEED Random seed: -1 for random "
|
|
||||||
"(default: -1)\n";
|
|
||||||
std::cout << " --help, -h Show this help message\n\n";
|
|
||||||
std::cout << "Note: --mocked and --model are mutually exclusive. Exactly "
|
|
||||||
"one must be provided.\n";
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
po::options_description desc("Pipeline Options");
|
|
||||||
desc.add_options()("help,h", "Produce help message")(
|
desc.add_options()("help,h", "Produce help message")(
|
||||||
"mocked", po::bool_switch(),
|
"mocked", prog_opts::bool_switch(),
|
||||||
"Use mocked generator for brewery/user data")(
|
"Use mocked generator for brewery/user data")(
|
||||||
"model,m", po::value<std::string>()->default_value(""),
|
"model,m", prog_opts::value<std::string>()->default_value(""),
|
||||||
"Path to LLM model (gguf)")(
|
"Path to LLM model (gguf)")(
|
||||||
"cache-dir,c", po::value<std::string>()->default_value("/tmp"),
|
"temperature", prog_opts::value<float>()->default_value(0.8f),
|
||||||
"Directory for cached JSON")(
|
|
||||||
"temperature", po::value<float>()->default_value(0.8f),
|
|
||||||
"Sampling temperature (higher = more random)")(
|
"Sampling temperature (higher = more random)")(
|
||||||
"top-p", po::value<float>()->default_value(0.92f),
|
"top-p", prog_opts::value<float>()->default_value(0.92f),
|
||||||
"Nucleus sampling top-p in (0,1] (higher = more random)")(
|
"Nucleus sampling top-p in (0,1] (higher = more random)")(
|
||||||
"n-ctx", po::value<uint32_t>()->default_value(8192),
|
"n-ctx", prog_opts::value<uint32_t>()->default_value(8192),
|
||||||
"Context window size in tokens (1-32768)")(
|
"Context window size in tokens (1-32768)")(
|
||||||
"seed", po::value<int>()->default_value(-1),
|
"seed", prog_opts::value<int>()->default_value(-1),
|
||||||
"Sampler seed: -1 for random, otherwise non-negative integer");
|
"Sampler seed: -1 for random, otherwise non-negative integer");
|
||||||
|
|
||||||
po::variables_map vm;
|
// Handle the "no arguments" or "help" case
|
||||||
po::store(po::parse_command_line(argc, argv, desc), vm);
|
if (argc == 1) {
|
||||||
po::notify(vm);
|
spdlog::info("Biergarten Pipeline");
|
||||||
|
std::stringstream usage_stream;
|
||||||
if (vm.count("help")) {
|
usage_stream << "\nUsage: biergarten-pipeline [options]\n\n" << desc;
|
||||||
std::cout << desc << "\n";
|
spdlog::info(usage_stream.str());
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check for mutually exclusive --mocked and --model flags
|
try {
|
||||||
bool use_mocked = vm["mocked"].as<bool>();
|
prog_opts::variables_map variables_map;
|
||||||
std::string model_path = vm["model"].as<std::string>();
|
prog_opts::store(prog_opts::parse_command_line(argc, argv, desc),
|
||||||
|
variables_map);
|
||||||
|
prog_opts::notify(variables_map);
|
||||||
|
|
||||||
if (use_mocked && !model_path.empty()) {
|
if (variables_map.contains("help")) {
|
||||||
spdlog::error("ERROR: --mocked and --model are mutually exclusive");
|
std::stringstream help_stream;
|
||||||
return false;
|
help_stream << "\n" << desc;
|
||||||
}
|
spdlog::info(help_stream.str());
|
||||||
|
return false;
|
||||||
if (!use_mocked && model_path.empty()) {
|
|
||||||
spdlog::error("ERROR: Either --mocked or --model must be specified");
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Warn if sampling parameters are provided with --mocked
|
|
||||||
if (use_mocked) {
|
|
||||||
bool hasTemperature = vm["temperature"].defaulted() == false;
|
|
||||||
bool hasTopP = vm["top-p"].defaulted() == false;
|
|
||||||
bool hasSeed = vm["seed"].defaulted() == false;
|
|
||||||
|
|
||||||
if (hasTemperature || hasTopP || hasSeed) {
|
|
||||||
spdlog::warn(
|
|
||||||
"WARNING: Sampling parameters (--temperature, --top-p, --seed) "
|
|
||||||
"are ignored when using --mocked");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const auto use_mocked = variables_map["mocked"].as<bool>();
|
||||||
|
const auto model_path = variables_map["model"].as<std::string>();
|
||||||
|
|
||||||
|
if (use_mocked && !model_path.empty()) {
|
||||||
|
spdlog::error(
|
||||||
|
"Invalid arguments: --mocked and --model are mutually exclusive");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!use_mocked && model_path.empty()) {
|
||||||
|
spdlog::error(
|
||||||
|
"Invalid arguments: Either --mocked or --model must be specified");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
const bool has_llm_params = !variables_map["temperature"].defaulted() ||
|
||||||
|
!variables_map["top-p"].defaulted() ||
|
||||||
|
!variables_map["seed"].defaulted();
|
||||||
|
|
||||||
|
if (use_mocked && has_llm_params) {
|
||||||
|
spdlog::warn(
|
||||||
|
"Sampling parameters (--temperature, --top-p, --seed) are"
|
||||||
|
" ignored when using --mocked");
|
||||||
|
}
|
||||||
|
|
||||||
|
options.use_mocked = use_mocked;
|
||||||
|
options.model_path = model_path;
|
||||||
|
options.temperature = variables_map["temperature"].as<float>();
|
||||||
|
options.top_p = variables_map["top-p"].as<float>();
|
||||||
|
options.n_ctx = variables_map["n-ctx"].as<uint32_t>();
|
||||||
|
options.seed = variables_map["seed"].as<int>();
|
||||||
|
|
||||||
|
return true;
|
||||||
|
} catch (const std::exception& exception) {
|
||||||
|
spdlog::error("Failed to parse command-line arguments: {}",
|
||||||
|
exception.what());
|
||||||
|
return false;
|
||||||
|
} catch (...) {
|
||||||
|
spdlog::error("Failed to parse command-line arguments: unknown error");
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
options.use_mocked = use_mocked;
|
|
||||||
options.model_path = model_path;
|
|
||||||
options.cache_dir = vm["cache-dir"].as<std::string>();
|
|
||||||
options.temperature = vm["temperature"].as<float>();
|
|
||||||
options.top_p = vm["top-p"].as<float>();
|
|
||||||
options.n_ctx = vm["n-ctx"].as<uint32_t>();
|
|
||||||
options.seed = vm["seed"].as<int>();
|
|
||||||
// commit is always pinned to c5eb7772
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
int main(int argc, char* argv[]) {
|
auto main(const int argc, char** argv) noexcept -> int {
|
||||||
try {
|
try {
|
||||||
const CurlGlobalState curl_state;
|
const CurlGlobalState curl_state;
|
||||||
|
const LlamaBackendState llama_backend_state;
|
||||||
|
spdlog::set_pattern("[%Y-%m-%d %H:%M:%S.%e] [%^%l%$] %v");
|
||||||
|
|
||||||
ApplicationOptions options;
|
ApplicationOptions options;
|
||||||
if (!ParseArguments(argc, argv, options)) {
|
if (!ParseArguments(argc, argv, options)) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
auto webClient = std::make_shared<CURLWebClient>();
|
const auto injector = di::make_injector(
|
||||||
|
di::bind<WebClient>().to<CURLWebClient>(),
|
||||||
|
di::bind<ApplicationOptions>().to(options),
|
||||||
|
di::bind<IEnrichmentService>().to<WikipediaService>(),
|
||||||
|
di::bind<std::string>().to(options.model_path),
|
||||||
|
di::bind<DataGenerator>().to([options](const auto& injector)
|
||||||
|
-> std::unique_ptr<DataGenerator> {
|
||||||
|
if (options.use_mocked) {
|
||||||
|
spdlog::info(
|
||||||
|
"[Generator] Using MockGenerator (no model path provided)");
|
||||||
|
return std::make_unique<MockGenerator>();
|
||||||
|
}
|
||||||
|
|
||||||
BiergartenDataGenerator generator(options, webClient);
|
spdlog::info(
|
||||||
return generator.Run();
|
"[Generator] Using LlamaGenerator: {} (temperature={}, "
|
||||||
|
"top-p={}, "
|
||||||
|
"n_ctx={}, seed={})",
|
||||||
|
options.model_path, options.temperature, options.top_p,
|
||||||
|
options.n_ctx, options.seed);
|
||||||
|
return injector.template create<std::unique_ptr<LlamaGenerator>>();
|
||||||
|
}));
|
||||||
|
|
||||||
} catch (const std::exception& e) {
|
auto generator = injector.create<BiergartenDataGenerator>();
|
||||||
const std::string message = e.what() ? e.what() : "";
|
|
||||||
|
|
||||||
if (message.find("LlamaGenerator: malformed brewery response") !=
|
if (!generator.Run()) {
|
||||||
std::string::npos) {
|
spdlog::error("Pipeline execution failed");
|
||||||
spdlog::warn("WARNING: Non-fatal LLM failure after retries: {}",
|
return 1;
|
||||||
message);
|
|
||||||
return 0;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
spdlog::error("ERROR: Application failed: {}", e.what());
|
spdlog::info("Pipeline executed successfully");
|
||||||
|
return 0;
|
||||||
|
} catch (const std::exception& exception) {
|
||||||
|
spdlog::critical("Unhandled fatal error in main: {}", exception.what());
|
||||||
|
return 1;
|
||||||
|
} catch (...) {
|
||||||
|
spdlog::critical("Unhandled fatal non-standard exception in main");
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
11
pipeline/src/services/wikipedia/constructor.cpp
Normal file
11
pipeline/src/services/wikipedia/constructor.cpp
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
/**
|
||||||
|
* @file wikipedia/constructor.cpp
|
||||||
|
* @brief WikipediaService constructor implementation.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <utility>
|
||||||
|
|
||||||
|
#include "services/wikipedia_service.h"
|
||||||
|
|
||||||
|
WikipediaService::WikipediaService(std::shared_ptr<WebClient> client)
|
||||||
|
: client_(std::move(client)) {}
|
||||||
58
pipeline/src/services/wikipedia/fetch_extract.cpp
Normal file
58
pipeline/src/services/wikipedia/fetch_extract.cpp
Normal file
@@ -0,0 +1,58 @@
|
|||||||
|
/**
|
||||||
|
* @file wikipedia/fetch_extract.cpp
|
||||||
|
* @brief WikipediaService::FetchExtract() implementation.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <spdlog/spdlog.h>
|
||||||
|
|
||||||
|
#include <boost/json.hpp>
|
||||||
|
#include <string>
|
||||||
|
#include <string_view>
|
||||||
|
|
||||||
|
#include "services/wikipedia_service.h"
|
||||||
|
|
||||||
|
auto WikipediaService::FetchExtract(std::string_view query) -> std::string {
|
||||||
|
const std::string cache_key(query);
|
||||||
|
const auto cache_it = this->extract_cache_.find(cache_key);
|
||||||
|
if (cache_it != this->extract_cache_.end()) {
|
||||||
|
return cache_it->second;
|
||||||
|
}
|
||||||
|
|
||||||
|
const std::string encoded = this->client_->UrlEncode(cache_key);
|
||||||
|
const std::string url =
|
||||||
|
"https://en.wikipedia.org/w/api.php?action=query&titles=" + encoded +
|
||||||
|
"&prop=extracts&explaintext=1&format=json";
|
||||||
|
|
||||||
|
const std::string body = this->client_->Get(url);
|
||||||
|
|
||||||
|
boost::system::error_code parse_error;
|
||||||
|
boost::json::value doc = boost::json::parse(body, parse_error);
|
||||||
|
|
||||||
|
if (!parse_error && doc.is_object()) {
|
||||||
|
try {
|
||||||
|
auto& pages = doc.at("query").at("pages").get_object();
|
||||||
|
if (!pages.empty()) {
|
||||||
|
auto& page = pages.begin()->value().get_object();
|
||||||
|
if (page.contains("extract") && page.at("extract").is_string()) {
|
||||||
|
std::string extract(page.at("extract").as_string().c_str());
|
||||||
|
spdlog::debug("WikipediaService fetched {} chars for '{}'",
|
||||||
|
extract.size(), query);
|
||||||
|
this->extract_cache_.emplace(cache_key, extract);
|
||||||
|
return extract;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
this->extract_cache_.emplace(cache_key, std::string{});
|
||||||
|
} catch (const std::exception& e) {
|
||||||
|
spdlog::warn(
|
||||||
|
"WikipediaService: failed to parse response structure for '{}': "
|
||||||
|
"{}",
|
||||||
|
query, e.what());
|
||||||
|
return {};
|
||||||
|
}
|
||||||
|
} else if (parse_error) {
|
||||||
|
spdlog::warn("WikipediaService: JSON parse error for '{}': {}", query,
|
||||||
|
parse_error.message());
|
||||||
|
}
|
||||||
|
|
||||||
|
return {};
|
||||||
|
}
|
||||||
56
pipeline/src/services/wikipedia/get_summary.cpp
Normal file
56
pipeline/src/services/wikipedia/get_summary.cpp
Normal file
@@ -0,0 +1,56 @@
|
|||||||
|
/**
|
||||||
|
* @file wikipedia/get_summary.cpp
|
||||||
|
* @brief WikipediaService::GetLocationContext() implementation.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <spdlog/spdlog.h>
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
#include "services/wikipedia_service.h"
|
||||||
|
|
||||||
|
auto WikipediaService::GetLocationContext(const Location& loc) -> std::string {
|
||||||
|
const std::string cache_key = loc.city + "|" + loc.country;
|
||||||
|
const auto cache_it = cache_.find(cache_key);
|
||||||
|
if (cache_it != cache_.end()) {
|
||||||
|
return cache_it->second;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string result;
|
||||||
|
|
||||||
|
if (!client_) {
|
||||||
|
cache_.emplace(cache_key, result);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string region_query(loc.city);
|
||||||
|
if (!loc.country.empty()) {
|
||||||
|
region_query += ", ";
|
||||||
|
region_query += loc.country;
|
||||||
|
}
|
||||||
|
|
||||||
|
const std::string beer_query = "beer in " + loc.country;
|
||||||
|
const std::string city_beer_query = "beer in " + loc.city;
|
||||||
|
|
||||||
|
auto append_extract = [&result](const std::string& extract) -> void {
|
||||||
|
if (extract.empty()) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (!result.empty()) {
|
||||||
|
result += "\n\n";
|
||||||
|
}
|
||||||
|
result += extract;
|
||||||
|
};
|
||||||
|
|
||||||
|
try {
|
||||||
|
append_extract(FetchExtract(region_query));
|
||||||
|
append_extract(FetchExtract(beer_query));
|
||||||
|
append_extract(FetchExtract(city_beer_query));
|
||||||
|
} catch (const std::runtime_error& e) {
|
||||||
|
spdlog::debug("WikipediaService lookup failed for '{}': {}", region_query,
|
||||||
|
e.what());
|
||||||
|
}
|
||||||
|
|
||||||
|
cache_.emplace(cache_key, result);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
17
pipeline/src/web_client/curl_global_state_constructor.cpp
Normal file
17
pipeline/src/web_client/curl_global_state_constructor.cpp
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
/**
|
||||||
|
* @file web_client/curl_global_state_constructor.cpp
|
||||||
|
* @brief CurlGlobalState constructor implementation.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <curl/curl.h>
|
||||||
|
|
||||||
|
#include <stdexcept>
|
||||||
|
|
||||||
|
#include "web_client/curl_web_client.h"
|
||||||
|
|
||||||
|
/**
 * @brief Performs libcurl's process-wide initialisation.
 * @throws std::runtime_error if curl_global_init() does not return CURLE_OK.
 */
CurlGlobalState::CurlGlobalState() {
  const CURLcode init_status = curl_global_init(CURL_GLOBAL_DEFAULT);
  if (init_status == CURLE_OK) {
    return;
  }
  throw std::runtime_error(
      "[CURLWebClient] Failed to initialize libcurl globally");
}
|
||||||
10
pipeline/src/web_client/curl_global_state_destructor.cpp
Normal file
10
pipeline/src/web_client/curl_global_state_destructor.cpp
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
/**
|
||||||
|
* @file web_client/curl_global_state_destructor.cpp
|
||||||
|
* @brief CurlGlobalState destructor implementation.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <curl/curl.h>
|
||||||
|
|
||||||
|
#include "web_client/curl_web_client.h"
|
||||||
|
|
||||||
|
/**
 * @brief Releases libcurl's global state by calling curl_global_cleanup().
 */
CurlGlobalState::~CurlGlobalState() {
  curl_global_cleanup();
}
|
||||||
@@ -1,147 +0,0 @@
|
|||||||
/**
|
|
||||||
* @file web_client/curl_web_client.cpp
|
|
||||||
* @brief Implements libcurl-backed HTTP utilities, including GET requests,
|
|
||||||
* file downloads, URL encoding, and RAII global curl lifecycle handling.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "web_client/curl_web_client.h"
|
|
||||||
|
|
||||||
#include <curl/curl.h>
|
|
||||||
|
|
||||||
#include <cstdio>
|
|
||||||
#include <fstream>
|
|
||||||
#include <memory>
|
|
||||||
#include <sstream>
|
|
||||||
#include <stdexcept>
|
|
||||||
|
|
||||||
CurlGlobalState::CurlGlobalState() {
|
|
||||||
if (curl_global_init(CURL_GLOBAL_DEFAULT) != CURLE_OK) {
|
|
||||||
throw std::runtime_error(
|
|
||||||
"[CURLWebClient] Failed to initialize libcurl globally");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
CurlGlobalState::~CurlGlobalState() { curl_global_cleanup(); }
|
|
||||||
|
|
||||||
namespace {
|
|
||||||
// curl write callback that appends response data into a std::string
|
|
||||||
size_t WriteCallbackString(void* contents, size_t size, size_t nmemb,
|
|
||||||
void* userp) {
|
|
||||||
size_t realsize = size * nmemb;
|
|
||||||
auto* s = static_cast<std::string*>(userp);
|
|
||||||
s->append(static_cast<char*>(contents), realsize);
|
|
||||||
return realsize;
|
|
||||||
}
|
|
||||||
|
|
||||||
// curl write callback that writes to a file stream
|
|
||||||
size_t WriteCallbackFile(void* contents, size_t size, size_t nmemb,
|
|
||||||
void* userp) {
|
|
||||||
size_t realsize = size * nmemb;
|
|
||||||
auto* outFile = static_cast<std::ofstream*>(userp);
|
|
||||||
outFile->write(static_cast<char*>(contents), realsize);
|
|
||||||
return realsize;
|
|
||||||
}
|
|
||||||
|
|
||||||
// RAII wrapper for CURL handle using unique_ptr
|
|
||||||
using CurlHandle = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
|
|
||||||
|
|
||||||
CurlHandle create_handle() {
|
|
||||||
CURL* handle = curl_easy_init();
|
|
||||||
if (!handle) {
|
|
||||||
throw std::runtime_error(
|
|
||||||
"[CURLWebClient] Failed to initialize libcurl handle");
|
|
||||||
}
|
|
||||||
return CurlHandle(handle, &curl_easy_cleanup);
|
|
||||||
}
|
|
||||||
|
|
||||||
void set_common_get_options(CURL* curl, const std::string& url,
|
|
||||||
long connect_timeout, long total_timeout) {
|
|
||||||
curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
|
|
||||||
curl_easy_setopt(curl, CURLOPT_USERAGENT, "biergarten-pipeline/0.1.0");
|
|
||||||
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
|
|
||||||
curl_easy_setopt(curl, CURLOPT_MAXREDIRS, 5L);
|
|
||||||
curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, connect_timeout);
|
|
||||||
curl_easy_setopt(curl, CURLOPT_TIMEOUT, total_timeout);
|
|
||||||
curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, "gzip");
|
|
||||||
}
|
|
||||||
} // namespace
|
|
||||||
|
|
||||||
CURLWebClient::CURLWebClient() {}
|
|
||||||
|
|
||||||
CURLWebClient::~CURLWebClient() {}
|
|
||||||
|
|
||||||
void CURLWebClient::DownloadToFile(const std::string& url,
|
|
||||||
const std::string& file_path) {
|
|
||||||
auto curl = create_handle();
|
|
||||||
|
|
||||||
std::ofstream outFile(file_path, std::ios::binary);
|
|
||||||
if (!outFile.is_open()) {
|
|
||||||
throw std::runtime_error(
|
|
||||||
"[CURLWebClient] Cannot open file for writing: " + file_path);
|
|
||||||
}
|
|
||||||
|
|
||||||
set_common_get_options(curl.get(), url, 30L, 300L);
|
|
||||||
curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, WriteCallbackFile);
|
|
||||||
curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA,
|
|
||||||
static_cast<void*>(&outFile));
|
|
||||||
|
|
||||||
CURLcode res = curl_easy_perform(curl.get());
|
|
||||||
outFile.close();
|
|
||||||
|
|
||||||
if (res != CURLE_OK) {
|
|
||||||
std::remove(file_path.c_str());
|
|
||||||
std::string error = std::string("[CURLWebClient] Download failed: ") +
|
|
||||||
curl_easy_strerror(res);
|
|
||||||
throw std::runtime_error(error);
|
|
||||||
}
|
|
||||||
|
|
||||||
long httpCode = 0;
|
|
||||||
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &httpCode);
|
|
||||||
|
|
||||||
if (httpCode != 200) {
|
|
||||||
std::remove(file_path.c_str());
|
|
||||||
std::stringstream ss;
|
|
||||||
ss << "[CURLWebClient] HTTP error " << httpCode << " for URL " << url;
|
|
||||||
throw std::runtime_error(ss.str());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
std::string CURLWebClient::Get(const std::string& url) {
|
|
||||||
auto curl = create_handle();
|
|
||||||
|
|
||||||
std::string response_string;
|
|
||||||
set_common_get_options(curl.get(), url, 10L, 20L);
|
|
||||||
curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, WriteCallbackString);
|
|
||||||
curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &response_string);
|
|
||||||
|
|
||||||
CURLcode res = curl_easy_perform(curl.get());
|
|
||||||
|
|
||||||
if (res != CURLE_OK) {
|
|
||||||
std::string error =
|
|
||||||
std::string("[CURLWebClient] GET failed: ") + curl_easy_strerror(res);
|
|
||||||
throw std::runtime_error(error);
|
|
||||||
}
|
|
||||||
|
|
||||||
long httpCode = 0;
|
|
||||||
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &httpCode);
|
|
||||||
|
|
||||||
if (httpCode != 200) {
|
|
||||||
std::stringstream ss;
|
|
||||||
ss << "[CURLWebClient] HTTP error " << httpCode << " for URL " << url;
|
|
||||||
throw std::runtime_error(ss.str());
|
|
||||||
}
|
|
||||||
|
|
||||||
return response_string;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::string CURLWebClient::UrlEncode(const std::string& value) {
|
|
||||||
// A NULL handle is fine for UTF-8 encoding according to libcurl docs.
|
|
||||||
char* output = curl_easy_escape(nullptr, value.c_str(), 0);
|
|
||||||
|
|
||||||
if (output) {
|
|
||||||
std::string result(output);
|
|
||||||
curl_free(output);
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
throw std::runtime_error("[CURLWebClient] curl_easy_escape failed");
|
|
||||||
}
|
|
||||||
8
pipeline/src/web_client/curl_web_client_constructor.cpp
Normal file
8
pipeline/src/web_client/curl_web_client_constructor.cpp
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
/**
|
||||||
|
* @file web_client/curl_web_client_constructor.cpp
|
||||||
|
* @brief CURLWebClient constructor implementation.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "web_client/curl_web_client.h"
|
||||||
|
|
||||||
|
/** @brief Default construction; no per-instance setup is performed here. */
CURLWebClient::CURLWebClient() = default;
|
||||||
8
pipeline/src/web_client/curl_web_client_destructor.cpp
Normal file
8
pipeline/src/web_client/curl_web_client_destructor.cpp
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
/**
|
||||||
|
* @file web_client/curl_web_client_destructor.cpp
|
||||||
|
* @brief CURLWebClient destructor implementation.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "web_client/curl_web_client.h"
|
||||||
|
|
||||||
|
/** @brief Default destruction; no per-instance teardown is performed here. */
CURLWebClient::~CURLWebClient() = default;
|
||||||
59
pipeline/src/web_client/curl_web_client_download_to_file.cpp
Normal file
59
pipeline/src/web_client/curl_web_client_download_to_file.cpp
Normal file
@@ -0,0 +1,59 @@
|
|||||||
|
/**
|
||||||
|
* @file web_client/curl_web_client_download_to_file.cpp
|
||||||
|
* @brief CURLWebClient::DownloadToFile() implementation.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <curl/curl.h>
|
||||||
|
|
||||||
|
#include <cstdio>
|
||||||
|
#include <fstream>
|
||||||
|
#include <sstream>
|
||||||
|
#include <stdexcept>
|
||||||
|
|
||||||
|
#include "curl_web_client_utils.h"
|
||||||
|
#include "web_client/curl_web_client.h"
|
||||||
|
|
||||||
|
/**
 * @brief libcurl write callback that forwards a received chunk to a file
 *        stream.
 *
 * @param contents Pointer to the received bytes.
 * @param size     Element size reported by libcurl.
 * @param nmemb    Element count; the chunk is size * nmemb bytes long.
 * @param userp    The std::ofstream* registered through CURLOPT_WRITEDATA.
 * @return The chunk length when the stream accepted the data, otherwise 0.
 *         libcurl treats a return value that differs from the chunk length as
 *         a write error and aborts the transfer, so a failed disk write is no
 *         longer reported as success.
 */
static size_t WriteCallbackFile(void* contents, size_t size, size_t nmemb,
                                void* userp) {
  const size_t chunk_bytes = size * nmemb;
  auto* out_file = static_cast<std::ofstream*>(userp);
  out_file->write(static_cast<const char*>(contents),
                  static_cast<std::streamsize>(chunk_bytes));
  return out_file->fail() ? 0 : chunk_bytes;
}
|
||||||
|
|
||||||
|
void CURLWebClient::DownloadToFile(const std::string& url,
|
||||||
|
const std::string& file_path) {
|
||||||
|
auto curl = create_handle();
|
||||||
|
|
||||||
|
std::ofstream outFile(file_path, std::ios::binary);
|
||||||
|
if (!outFile.is_open()) {
|
||||||
|
throw std::runtime_error(
|
||||||
|
"[CURLWebClient] Cannot open file for writing: " + file_path);
|
||||||
|
}
|
||||||
|
|
||||||
|
set_common_get_options(curl.get(), url, {30L, 300L});
|
||||||
|
curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, WriteCallbackFile);
|
||||||
|
curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA,
|
||||||
|
static_cast<void*>(&outFile));
|
||||||
|
|
||||||
|
CURLcode res = curl_easy_perform(curl.get());
|
||||||
|
outFile.close();
|
||||||
|
|
||||||
|
if (res != CURLE_OK) {
|
||||||
|
std::remove(file_path.c_str());
|
||||||
|
std::string error = std::string("[CURLWebClient] Download failed: ") +
|
||||||
|
curl_easy_strerror(res);
|
||||||
|
throw std::runtime_error(error);
|
||||||
|
}
|
||||||
|
|
||||||
|
long httpCode = 0;
|
||||||
|
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &httpCode);
|
||||||
|
|
||||||
|
if (httpCode != 200) {
|
||||||
|
std::remove(file_path.c_str());
|
||||||
|
std::stringstream ss;
|
||||||
|
ss << "[CURLWebClient] HTTP error " << httpCode << " for URL " << url;
|
||||||
|
throw std::runtime_error(ss.str());
|
||||||
|
}
|
||||||
|
}
|
||||||
50
pipeline/src/web_client/curl_web_client_get.cpp
Normal file
50
pipeline/src/web_client/curl_web_client_get.cpp
Normal file
@@ -0,0 +1,50 @@
|
|||||||
|
/**
|
||||||
|
* @file web_client/curl_web_client_get.cpp
|
||||||
|
* @brief CURLWebClient::Get() implementation.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <curl/curl.h>
|
||||||
|
|
||||||
|
#include <sstream>
|
||||||
|
#include <stdexcept>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
#include "curl_web_client_utils.h"
|
||||||
|
#include "web_client/curl_web_client.h"
|
||||||
|
|
||||||
|
/**
 * @brief libcurl write callback that appends response data to a std::string.
 *
 * @param contents  Pointer to the bytes delivered by libcurl.
 * @param size      Size of one element.
 * @param nmemb     Number of elements in @p contents.
 * @param userp     The std::string* registered through CURLOPT_WRITEDATA.
 * @return Number of bytes consumed (size * nmemb) on success, or 0 when the
 *         data could not be appended. libcurl treats a count that differs
 *         from the delivered size as a write error and aborts the transfer.
 */
static size_t WriteCallbackString(void* contents, size_t size, size_t nmemb,
                                  void* userp) {
  const size_t realsize = size * nmemb;
  auto* s = static_cast<std::string*>(userp);
  try {
    s->append(static_cast<const char*>(contents), realsize);
  } catch (...) {
    // This function is invoked from libcurl's C code; an exception (e.g.
    // std::bad_alloc) must not unwind through those frames. Report the
    // failure through the return value instead.
    return 0;
  }
  return realsize;
}
|
||||||
|
|
||||||
|
std::string CURLWebClient::Get(const std::string& url) {
|
||||||
|
auto curl = create_handle();
|
||||||
|
|
||||||
|
std::string response_string;
|
||||||
|
set_common_get_options(curl.get(), url, {10L, 20L});
|
||||||
|
curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, WriteCallbackString);
|
||||||
|
curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &response_string);
|
||||||
|
|
||||||
|
CURLcode res = curl_easy_perform(curl.get());
|
||||||
|
|
||||||
|
if (res != CURLE_OK) {
|
||||||
|
std::string error =
|
||||||
|
std::string("[CURLWebClient] GET failed: ") + curl_easy_strerror(res);
|
||||||
|
throw std::runtime_error(error);
|
||||||
|
}
|
||||||
|
|
||||||
|
long httpCode = 0;
|
||||||
|
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &httpCode);
|
||||||
|
|
||||||
|
if (httpCode != 200) {
|
||||||
|
std::stringstream ss;
|
||||||
|
ss << "[CURLWebClient] HTTP error " << httpCode << " for URL " << url;
|
||||||
|
throw std::runtime_error(ss.str());
|
||||||
|
}
|
||||||
|
|
||||||
|
return response_string;
|
||||||
|
}
|
||||||
23
pipeline/src/web_client/curl_web_client_url_encode.cpp
Normal file
23
pipeline/src/web_client/curl_web_client_url_encode.cpp
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
/**
|
||||||
|
* @file web_client/curl_web_client_url_encode.cpp
|
||||||
|
* @brief CURLWebClient::UrlEncode() implementation.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <curl/curl.h>
|
||||||
|
|
||||||
|
#include <stdexcept>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
#include "web_client/curl_web_client.h"
|
||||||
|
|
||||||
|
std::string CURLWebClient::UrlEncode(const std::string& value) {
|
||||||
|
// A NULL handle is fine for UTF-8 encoding according to libcurl docs.
|
||||||
|
char* output = curl_easy_escape(nullptr, value.c_str(), 0);
|
||||||
|
|
||||||
|
if (output) {
|
||||||
|
std::string result(output);
|
||||||
|
curl_free(output);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
throw std::runtime_error("[CURLWebClient] curl_easy_escape failed");
|
||||||
|
}
|
||||||
28
pipeline/src/web_client/curl_web_client_utils.cpp
Normal file
28
pipeline/src/web_client/curl_web_client_utils.cpp
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
/**
|
||||||
|
* @file web_client/curl_web_client_utils.cpp
|
||||||
|
* @brief Shared CURLWebClient helper implementations.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "curl_web_client_utils.h"
|
||||||
|
|
||||||
|
#include <stdexcept>
|
||||||
|
|
||||||
|
/**
 * @brief Creates a libcurl easy handle wrapped in a CurlHandle.
 *
 * @return A non-null CurlHandle that calls curl_easy_cleanup() when it is
 *         destroyed.
 * @throws std::runtime_error if curl_easy_init() returns a null handle.
 */
auto create_handle() -> CurlHandle {
  // Wrap immediately; a null unique_ptr never invokes its deleter.
  CurlHandle owned(curl_easy_init(), &curl_easy_cleanup);
  if (!owned) {
    throw std::runtime_error(
        "[CURLWebClient] Failed to initialize libcurl handle");
  }
  return owned;
}
|
||||||
|
|
||||||
|
/**
 * @brief Applies the options shared by every CURLWebClient GET request.
 *
 * @param curl      Easy handle to configure.
 * @param url       Address the handle will request.
 * @param timeouts  Connect and total transfer timeouts for the request.
 */
auto set_common_get_options(CURL* curl, const std::string& url,
                            CurlTimeouts timeouts) -> void {
  // Request identity.
  curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
  curl_easy_setopt(curl, CURLOPT_USERAGENT, "biergarten-pipeline/0.1.0");
  curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, "gzip");

  // Redirect policy: follow, but at most five hops.
  curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
  curl_easy_setopt(curl, CURLOPT_MAXREDIRS, 5L);

  // Time limits supplied by the caller.
  curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, timeouts.connect_timeout);
  curl_easy_setopt(curl, CURLOPT_TIMEOUT, timeouts.total_timeout);
}
|
||||||
26
pipeline/src/web_client/curl_web_client_utils.h
Normal file
26
pipeline/src/web_client/curl_web_client_utils.h
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
#ifndef BIERGARTEN_PIPELINE_WEB_CLIENT_CURL_WEB_CLIENT_UTILS_H_
|
||||||
|
#define BIERGARTEN_PIPELINE_WEB_CLIENT_CURL_WEB_CLIENT_UTILS_H_
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @file web_client/curl_web_client_utils.h
|
||||||
|
* @brief Shared helpers for CURLWebClient request setup.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <curl/curl.h>
|
||||||
|
|
||||||
|
#include <memory>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
/**
 * @brief Owning pointer type for a libcurl easy handle.
 *
 * The deleter is a function pointer with curl_easy_cleanup's type, so it
 * must be supplied when the pointer is constructed (create_handle() passes
 * &curl_easy_cleanup).
 */
using CurlHandle = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
|
||||||
|
|
||||||
|
/**
 * @brief Timeout pair passed to set_common_get_options().
 *
 * Aggregate; callers initialise it positionally as {connect, total}. The
 * values are forwarded unchanged to libcurl, which interprets both options
 * in seconds.
 */
struct CurlTimeouts {
  /// Forwarded to CURLOPT_CONNECTTIMEOUT.
  long connect_timeout;
  /// Forwarded to CURLOPT_TIMEOUT (limit for the whole transfer).
  long total_timeout;
};
|
||||||
|
|
||||||
|
/**
 * @brief Creates a new libcurl easy handle owned by the returned CurlHandle.
 * @throws std::runtime_error if libcurl cannot create the handle.
 */
CurlHandle create_handle();
|
||||||
|
|
||||||
|
/**
 * @brief Configures @p curl with the options common to all GET requests:
 *        URL, user agent, redirect handling, timeouts and accepted encoding.
 *
 * @param curl      Easy handle to configure.
 * @param url       Address the handle will request.
 * @param timeouts  Connect and total transfer timeouts.
 */
void set_common_get_options(
    CURL* curl,
    const std::string& url,
    CurlTimeouts timeouts);
|
||||||
|
|
||||||
|
#endif // BIERGARTEN_PIPELINE_WEB_CLIENT_CURL_WEB_CLIENT_UTILS_H_
|
||||||
@@ -1,95 +0,0 @@
|
|||||||
/**
|
|
||||||
* @file wikipedia/wikipedia_service.cpp
|
|
||||||
* @brief Implements Wikipedia extract retrieval and caching for city/country
|
|
||||||
* queries, including response parsing and resilient error handling.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "wikipedia/wikipedia_service.h"
|
|
||||||
|
|
||||||
#include <spdlog/spdlog.h>
|
|
||||||
|
|
||||||
#include <boost/json.hpp>
|
|
||||||
|
|
||||||
WikipediaService::WikipediaService(std::shared_ptr<WebClient> client)
|
|
||||||
: client_(std::move(client)) {}
|
|
||||||
|
|
||||||
std::string WikipediaService::FetchExtract(std::string_view query) {
|
|
||||||
const std::string encoded = client_->UrlEncode(std::string(query));
|
|
||||||
const std::string url =
|
|
||||||
"https://en.wikipedia.org/w/api.php?action=query&titles=" + encoded +
|
|
||||||
"&prop=extracts&explaintext=1&format=json";
|
|
||||||
|
|
||||||
const std::string body = client_->Get(url);
|
|
||||||
|
|
||||||
boost::system::error_code ec;
|
|
||||||
boost::json::value doc = boost::json::parse(body, ec);
|
|
||||||
|
|
||||||
if (!ec && doc.is_object()) {
|
|
||||||
try {
|
|
||||||
auto& pages = doc.at("query").at("pages").get_object();
|
|
||||||
if (!pages.empty()) {
|
|
||||||
auto& page = pages.begin()->value().get_object();
|
|
||||||
if (page.contains("extract") && page.at("extract").is_string()) {
|
|
||||||
std::string extract(page.at("extract").as_string().c_str());
|
|
||||||
spdlog::debug("WikipediaService fetched {} chars for '{}'",
|
|
||||||
extract.size(), query);
|
|
||||||
return extract;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} catch (const std::exception& e) {
|
|
||||||
spdlog::warn(
|
|
||||||
"WikipediaService: failed to parse response structure for '{}': "
|
|
||||||
"{}",
|
|
||||||
query, e.what());
|
|
||||||
return {};
|
|
||||||
}
|
|
||||||
} else if (ec) {
|
|
||||||
spdlog::warn("WikipediaService: JSON parse error for '{}': {}", query,
|
|
||||||
ec.message());
|
|
||||||
}
|
|
||||||
|
|
||||||
return {};
|
|
||||||
}
|
|
||||||
|
|
||||||
std::string WikipediaService::GetSummary(std::string_view city,
|
|
||||||
std::string_view country) {
|
|
||||||
const std::string key = std::string(city) + "|" + std::string(country);
|
|
||||||
const auto cacheIt = cache_.find(key);
|
|
||||||
if (cacheIt != cache_.end()) {
|
|
||||||
return cacheIt->second;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::string result;
|
|
||||||
|
|
||||||
if (!client_) {
|
|
||||||
cache_.emplace(key, result);
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::string regionQuery(city);
|
|
||||||
if (!country.empty()) {
|
|
||||||
regionQuery += ", ";
|
|
||||||
regionQuery += country;
|
|
||||||
}
|
|
||||||
|
|
||||||
const std::string beerQuery = "beer in " + std::string(country);
|
|
||||||
|
|
||||||
try {
|
|
||||||
const std::string regionExtract = FetchExtract(regionQuery);
|
|
||||||
const std::string beerExtract = FetchExtract(beerQuery);
|
|
||||||
|
|
||||||
if (!regionExtract.empty()) {
|
|
||||||
result += regionExtract;
|
|
||||||
}
|
|
||||||
if (!beerExtract.empty()) {
|
|
||||||
if (!result.empty()) result += "\n\n";
|
|
||||||
result += beerExtract;
|
|
||||||
}
|
|
||||||
} catch (const std::runtime_error& e) {
|
|
||||||
spdlog::debug("WikipediaService lookup failed for '{}': {}", regionQuery,
|
|
||||||
e.what());
|
|
||||||
}
|
|
||||||
|
|
||||||
cache_.emplace(key, result);
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
Reference in New Issue
Block a user