mirror of
https://github.com/aaronpo97/the-biergarten-app.git
synced 2026-06-01 01:54:00 +00:00
Compare commits
21 Commits
feat/pipel
...
b1ac3a6068
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b1ac3a6068 | ||
|
|
06d329cac5 | ||
|
|
54c403526b | ||
|
|
b8e96a6d45 | ||
|
|
60ee2ecf74 | ||
|
|
e4e16a5084 | ||
|
|
8d306bf691 | ||
|
|
077f6ab4ae | ||
|
|
534403734a | ||
|
|
3af053f0eb | ||
|
|
ba165d8aa7 | ||
|
|
eb9a2767b4 | ||
|
|
29ea47fdb6 | ||
|
|
52e2333304 | ||
|
|
a1f0ca5b20 | ||
|
|
2ea8aa52b4 | ||
|
|
98083ab40c | ||
|
|
ac136f7179 | ||
|
|
280c9c61bd | ||
|
|
248a51b35f | ||
|
|
35aa7bc0df |
5
pipeline/.clang-format
Normal file
5
pipeline/.clang-format
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
---
|
||||||
|
BasedOnStyle: Google
|
||||||
|
ColumnLimit: 80
|
||||||
|
IndentWidth: 3
|
||||||
|
...
|
||||||
17
pipeline/.clang-tidy
Normal file
17
pipeline/.clang-tidy
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
---
# Start from an empty set, enable the families we want, then remove the
# individual checks that are too noisy for this code base.
Checks: >
  -*,
  bugprone-*,
  clang-analyzer-*,
  cppcoreguidelines-*,
  google-*,
  modernize-*,
  performance-*,
  readability-*,
  -cppcoreguidelines-avoid-magic-numbers,
  -cppcoreguidelines-owning-memory,
  -readability-magic-numbers,
  -google-readability-todo
# Header names are matched as the compiler resolved them, which is normally an
# absolute path. The pattern therefore accepts any leading directories instead
# of anchoring "src/" or "includes/" at the very start of the path.
HeaderFilterRegex: "(^|.*/)(src|includes)/.*"
FormatStyle: file
...
|
||||||
5
pipeline/.gitignore
vendored
Normal file
5
pipeline/.gitignore
vendored
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
dist
|
||||||
|
build
|
||||||
|
data
|
||||||
|
models
|
||||||
|
*.gguf
|
||||||
115
pipeline/CMakeLists.txt
Normal file
115
pipeline/CMakeLists.txt
Normal file
@@ -0,0 +1,115 @@
|
|||||||
|
cmake_minimum_required(VERSION 3.24)
project(biergarten-pipeline)

# =============================================================================
# 1. GPU Detection
# =============================================================================
# GGML_CUDA / GGML_METAL are set here so that the llama.cpp FetchContent below
# inherits them as cache variables before its CMakeLists.txt is processed.
if(APPLE)
  # APPLE is true on every macOS host, so the message names the platform
  # instead of claiming a specific CPU family.
  message(STATUS "[biergarten] macOS detected — enabling Metal acceleration.")
  set(GGML_METAL ON CACHE BOOL "Enable Metal on macOS" FORCE)
elseif(UNIX)
  # find_package only proves the CUDA toolkit is installed; it does not probe
  # for a physical GPU, and the messages below say so.
  find_package(CUDAToolkit QUIET)
  if(CUDAToolkit_FOUND)
    message(STATUS "[biergarten] CUDA toolkit found — enabling CUDA acceleration.")
    set(GGML_CUDA ON CACHE BOOL "Enable CUDA for NVIDIA GPUs" FORCE)
    # 'native' resolves to the exact SM version of the present GPU at configure
    # time (e.g. sm_89 for RTX 2000 Ada). Pass -DCMAKE_CUDA_ARCHITECTURES=<list>
    # to override it, e.g. for cross-compilation.
    if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
      set(CMAKE_CUDA_ARCHITECTURES native)
    endif()
  else()
    message(STATUS "[biergarten] CUDA toolkit not found — falling back to CPU.")
  endif()
endif()

# =============================================================================
# 2. Project-wide Settings
# =============================================================================
set(CMAKE_CXX_STANDARD 23)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

# =============================================================================
# 3. Dependencies
# =============================================================================
include(FetchContent)

# --- libcurl ------------------------------------------------------------------
# libcurl is taken from the system and is never fetched. Stop at configure time
# with an actionable message rather than letting the build fail later at link
# time.
find_package(CURL QUIET)
if(NOT CURL_FOUND)
  message(FATAL_ERROR
    "[biergarten] libcurl not found. Install it via your package manager "
    "(e.g. 'sudo dnf install libcurl-devel') or set CURL_ROOT.")
endif()

# --- llama.cpp ----------------------------------------------------------------
# Pinned to a release tag for reproducible builds.
# To update: pick a newer release tag from https://github.com/ggml-org/llama.cpp
# GIT_SHALLOW skips the full history; remove it if GIT_TAG is ever changed to a
# raw commit SHA, which a shallow clone may not be able to check out.
FetchContent_Declare(
  llama-cpp
  GIT_REPOSITORY https://github.com/ggml-org/llama.cpp.git
  GIT_TAG        b8611
  GIT_SHALLOW    TRUE
)
FetchContent_MakeAvailable(llama-cpp)

# --- Boost (JSON + program_options) ------------------------------------------
# Restrict the Boost superproject to the libraries this target links (their
# dependencies are pulled in automatically) instead of configuring all of Boost.
set(BOOST_INCLUDE_LIBRARIES json program_options)
FetchContent_Declare(
  boost
  URL https://github.com/boostorg/boost/releases/download/boost-1.85.0/boost-1.85.0-cmake.tar.gz
)
FetchContent_MakeAvailable(boost)

# --- spdlog -------------------------------------------------------------------
FetchContent_Declare(
  spdlog
  GIT_REPOSITORY https://github.com/gabime/spdlog.git
  GIT_TAG        v1.15.3
  GIT_SHALLOW    TRUE
)
FetchContent_MakeAvailable(spdlog)

# =============================================================================
# 4. Sources
# =============================================================================
set(SOURCES
  src/main.cpp
  src/biergarten_data_generator.cpp
  src/data_generation/llama/destructor.cpp
  src/data_generation/llama/generate_brewery.cpp
  src/data_generation/llama/generate_user.cpp
  src/data_generation/llama/helpers.cpp
  src/data_generation/llama/infer.cpp
  src/data_generation/llama/load.cpp
  src/data_generation/llama/load_brewery_prompt.cpp
  src/data_generation/llama/set_sampling_options.cpp
  src/data_generation/mock/data.cpp
  src/data_generation/mock/deterministic_hash.cpp
  src/data_generation/mock/generate_brewery.cpp
  src/data_generation/mock/generate_user.cpp
  src/data_generation/mock/load.cpp
  src/json_handling/json_loader.cpp
  src/web_client/curl_web_client.cpp
  src/wikipedia/wikipedia_service.cpp
)

# =============================================================================
# 5. Target
# =============================================================================
add_executable(${PROJECT_NAME}
  ${SOURCES}
)

target_include_directories(${PROJECT_NAME} PRIVATE
  includes
  ${llama-cpp_SOURCE_DIR}/include
  ${llama-cpp_SOURCE_DIR}/common
)

# Namespaced targets are used for Boost so that a misspelled or missing target
# is reported at configure time instead of silently becoming a "-l<name>" flag.
target_link_libraries(${PROJECT_NAME} PRIVATE
  llama
  Boost::json
  Boost::program_options
  spdlog::spdlog
  CURL::libcurl
)

# =============================================================================
# 6. Runtime Assets
# =============================================================================
# Make locations.json available in the build directory for runtime relative path
# lookups (e.g. when running from ./build). The CURRENT_* variables keep this
# correct when this directory is included from a parent project.
configure_file(
  ${CMAKE_CURRENT_SOURCE_DIR}/locations.json
  ${CMAKE_CURRENT_BINARY_DIR}/locations.json
  COPYONLY
)
|
||||||
406
pipeline/README.md
Normal file
406
pipeline/README.md
Normal file
@@ -0,0 +1,406 @@
|
|||||||
|
# Biergarten Pipeline
|
||||||
|
|
||||||
|
A high-performance C++23 data pipeline for fetching, parsing, and storing geographic data (countries, states, cities) with brewery metadata generation capabilities. The system supports both mock and LLM-based (llama.cpp) generation modes.
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
The pipeline orchestrates **four key stages**:
|
||||||
|
|
||||||
|
1. **Download** - Fetches `countries+states+cities.json` from a pinned GitHub commit with optional local filesystem caching
|
||||||
|
2. **Parse** - Streams JSON using Boost.JSON's `basic_parser` to extract country/state/city records without loading the entire file into memory
|
||||||
|
3. **Store** - Inserts records into a file-based SQLite database with all operations performed sequentially in a single thread
|
||||||
|
4. **Generate** - Produces brewery metadata or user profiles using either the mock generator or the LLM-based generator (llama.cpp)
|
||||||
|
|
||||||
|
## System Architecture
|
||||||
|
|
||||||
|
### Data Sources and Formats
|
||||||
|
|
||||||
|
- **Hierarchical Structure**: Countries array → states per country → cities per state
|
||||||
|
- **Data Fields**:
|
||||||
|
- `id` (integer)
|
||||||
|
- `name` (string)
|
||||||
|
- `iso2` / `iso3` (ISO country/state codes)
|
||||||
|
- `latitude` / `longitude` (geographic coordinates)
|
||||||
|
- **Source**: [dr5hn/countries-states-cities-database](https://github.com/dr5hn/countries-states-cities-database) on GitHub
|
||||||
|
- **Output**: Structured SQLite file-based database (`biergarten-pipeline.db`) + structured logging via spdlog
|
||||||
|
|
||||||
|
### Concurrency Model
|
||||||
|
|
||||||
|
The pipeline currently operates **single-threaded** with sequential stage execution:
|
||||||
|
|
||||||
|
1. **Download Phase**: Main thread blocks while downloading the source JSON file (if not in cache)
|
||||||
|
2. **Parse & Store Phase**: Main thread performs streaming JSON parse with immediate SQLite inserts
|
||||||
|
|
||||||
|
**Thread Safety**: While single-threaded, the `SqliteDatabase` component is **mutex-protected** using `std::mutex` (`dbMutex`) for all database operations. This design enables safe future parallelization without code modifications.
|
||||||
|
|
||||||
|
## Core Components
|
||||||
|
|
||||||
|
| Component | Purpose | Thread Safety | Dependencies |
|
||||||
|
| ----------------------------- | ----------------------------------------------------------------------------------------------- | -------------------------------------------- | --------------------------------------------- |
|
||||||
|
| **BiergartenDataGenerator** | Orchestrates pipeline execution; manages lifecycle of downloader, parser, and generator | Single-threaded coordinator | ApplicationOptions, WebClient, SqliteDatabase |
|
||||||
|
| **DataDownloader** | HTTP fetch with curl; optional filesystem cache; ETag support and retries | Blocking I/O; safe for startup | IWebClient, filesystem |
|
||||||
|
| **StreamingJsonParser** | Extends `boost::json::basic_parser`; emits country/state/city via callbacks; tracks parse depth | Single-threaded parse; callbacks thread-safe | Boost.JSON |
|
||||||
|
| **JsonLoader** | Wraps parser; dispatches callbacks for country/state/city; manages WorkQueue lifecycle | Produces to WorkQueue; safe callbacks | StreamingJsonParser, SqliteDatabase |
|
||||||
|
| **SqliteDatabase** | Manages schema initialization; insert/query methods for geographic data | Mutex-guarded all operations | SQLite3 |
|
||||||
|
| **IDataGenerator** (Abstract) | Interface for brewery/user metadata generation | Stateless virtual methods | N/A |
|
||||||
|
| **LlamaGenerator** | LLM-based generation via llama.cpp; configurable sampling (temperature, top-p, seed) | Manages llama_model* and llama_context* | llama.cpp, BreweryResult, UserResult |
|
||||||
|
| **MockGenerator** | Deterministic mock generation using seeded randomization | Stateless; thread-safe | N/A |
|
||||||
|
| **CURLWebClient** | HTTP client adapter; URL encoding; file downloads | cURL library bindings | libcurl |
|
||||||
|
| **WikipediaService** | (Planned) Wikipedia data lookups for enrichment | N/A | IWebClient |
|
||||||
|
|
||||||
|
## Database Schema
|
||||||
|
|
||||||
|
SQLite file-based database with **three core tables** and **indexes for fast lookups**:
|
||||||
|
|
||||||
|
### Countries
|
||||||
|
|
||||||
|
```sql
|
||||||
|
CREATE TABLE countries (
|
||||||
|
id INTEGER PRIMARY KEY,
|
||||||
|
name TEXT NOT NULL,
|
||||||
|
iso2 TEXT,
|
||||||
|
iso3 TEXT
|
||||||
|
);
|
||||||
|
CREATE INDEX idx_countries_iso2 ON countries(iso2);
|
||||||
|
```
|
||||||
|
|
||||||
|
### States
|
||||||
|
|
||||||
|
```sql
|
||||||
|
CREATE TABLE states (
|
||||||
|
id INTEGER PRIMARY KEY,
|
||||||
|
country_id INTEGER NOT NULL,
|
||||||
|
name TEXT NOT NULL,
|
||||||
|
iso2 TEXT,
|
||||||
|
FOREIGN KEY (country_id) REFERENCES countries(id)
|
||||||
|
);
|
||||||
|
CREATE INDEX idx_states_country ON states(country_id);
|
||||||
|
```
|
||||||
|
|
||||||
|
### Cities
|
||||||
|
|
||||||
|
```sql
|
||||||
|
CREATE TABLE cities (
|
||||||
|
id INTEGER PRIMARY KEY,
|
||||||
|
state_id INTEGER NOT NULL,
|
||||||
|
country_id INTEGER NOT NULL,
|
||||||
|
name TEXT NOT NULL,
|
||||||
|
latitude REAL,
|
||||||
|
longitude REAL,
|
||||||
|
FOREIGN KEY (state_id) REFERENCES states(id),
|
||||||
|
FOREIGN KEY (country_id) REFERENCES countries(id)
|
||||||
|
);
|
||||||
|
CREATE INDEX idx_cities_state ON cities(state_id);
|
||||||
|
CREATE INDEX idx_cities_country ON cities(country_id);
|
||||||
|
```
|
||||||
|
|
||||||
|
## Architecture Diagram
|
||||||
|
|
||||||
|
```plantuml
|
||||||
|
@startuml biergarten-pipeline
|
||||||
|
!theme plain
|
||||||
|
skinparam monochrome true
|
||||||
|
skinparam classBackgroundColor #FFFFFF
|
||||||
|
skinparam classBorderColor #000000
|
||||||
|
|
||||||
|
package "Application Layer" {
|
||||||
|
class BiergartenDataGenerator {
|
||||||
|
- options: ApplicationOptions
|
||||||
|
- webClient: IWebClient
|
||||||
|
- database: SqliteDatabase
|
||||||
|
- generator: IDataGenerator
|
||||||
|
--
|
||||||
|
+ Run() : int
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
package "Data Acquisition" {
|
||||||
|
class DataDownloader {
|
||||||
|
- webClient: IWebClient
|
||||||
|
--
|
||||||
|
+ Download(url: string, filePath: string)
|
||||||
|
+ DownloadWithCache(url: string, cachePath: string)
|
||||||
|
}
|
||||||
|
|
||||||
|
interface IWebClient {
|
||||||
|
+ DownloadToFile(url: string, filePath: string)
|
||||||
|
+ Get(url: string) : string
|
||||||
|
+ UrlEncode(value: string) : string
|
||||||
|
}
|
||||||
|
|
||||||
|
class CURLWebClient {
|
||||||
|
- globalState: CurlGlobalState
|
||||||
|
--
|
||||||
|
+ DownloadToFile(url: string, filePath: string)
|
||||||
|
+ Get(url: string) : string
|
||||||
|
+ UrlEncode(value: string) : string
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
package "JSON Processing" {
|
||||||
|
class StreamingJsonParser {
|
||||||
|
- depth: int
|
||||||
|
--
|
||||||
|
+ on_object_begin()
|
||||||
|
+ on_object_end()
|
||||||
|
+ on_array_begin()
|
||||||
|
+ on_array_end()
|
||||||
|
+ on_key(str: string)
|
||||||
|
+ on_string(str: string)
|
||||||
|
+ on_number(value: int)
|
||||||
|
}
|
||||||
|
|
||||||
|
class JsonLoader {
|
||||||
|
--
|
||||||
|
+ LoadWorldCities(jsonPath: string, db: SqliteDatabase)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
package "Data Storage" {
|
||||||
|
class SqliteDatabase {
|
||||||
|
- db: sqlite3*
|
||||||
|
- dbMutex: std::mutex
|
||||||
|
--
|
||||||
|
+ Initialize(dbPath: string)
|
||||||
|
+ InsertCountry(id: int, name: string, iso2: string, iso3: string)
|
||||||
|
+ InsertState(id: int, countryId: int, name: string, iso2: string)
|
||||||
|
+ InsertCity(id: int, stateId: int, countryId: int, name: string, lat: double, lon: double)
|
||||||
|
+ QueryCountries(limit: int) : vector<Country>
|
||||||
|
+ QueryStates(limit: int) : vector<State>
|
||||||
|
+ QueryCities() : vector<City>
|
||||||
|
+ BeginTransaction()
|
||||||
|
+ CommitTransaction()
|
||||||
|
# InitializeSchema()
|
||||||
|
}
|
||||||
|
|
||||||
|
struct Country {
|
||||||
|
id: int
|
||||||
|
name: string
|
||||||
|
iso2: string
|
||||||
|
iso3: string
|
||||||
|
}
|
||||||
|
|
||||||
|
struct State {
|
||||||
|
id: int
|
||||||
|
name: string
|
||||||
|
iso2: string
|
||||||
|
countryId: int
|
||||||
|
}
|
||||||
|
|
||||||
|
struct City {
|
||||||
|
id: int
|
||||||
|
name: string
|
||||||
|
countryId: int
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
package "Data Generation" {
|
||||||
|
interface IDataGenerator {
|
||||||
|
+ load(modelPath: string)
|
||||||
|
+ generateBrewery(cityName: string, countryName: string, regionContext: string) : BreweryResult
|
||||||
|
+ generateUser(locale: string) : UserResult
|
||||||
|
}
|
||||||
|
|
||||||
|
class LlamaGenerator {
|
||||||
|
- model: llama_model*
|
||||||
|
- context: llama_context*
|
||||||
|
- sampling_temperature: float
|
||||||
|
- sampling_top_p: float
|
||||||
|
- sampling_seed: uint32_t
|
||||||
|
--
|
||||||
|
+ load(modelPath: string)
|
||||||
|
+ generateBrewery(...) : BreweryResult
|
||||||
|
+ generateUser(locale: string) : UserResult
|
||||||
|
+ setSamplingOptions(temperature: float, topP: float, seed: int)
|
||||||
|
# infer(prompt: string) : string
|
||||||
|
}
|
||||||
|
|
||||||
|
class MockGenerator {
|
||||||
|
--
|
||||||
|
+ load(modelPath: string)
|
||||||
|
+ generateBrewery(...) : BreweryResult
|
||||||
|
+ generateUser(locale: string) : UserResult
|
||||||
|
}
|
||||||
|
|
||||||
|
struct BreweryResult {
|
||||||
|
name: string
|
||||||
|
description: string
|
||||||
|
}
|
||||||
|
|
||||||
|
struct UserResult {
|
||||||
|
username: string
|
||||||
|
bio: string
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
package "Enrichment (Planned)" {
|
||||||
|
class WikipediaService {
|
||||||
|
- webClient: IWebClient
|
||||||
|
--
|
||||||
|
+ SearchCity(cityName: string, countryName: string) : string
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
' Relationships
|
||||||
|
BiergartenDataGenerator --> DataDownloader
|
||||||
|
BiergartenDataGenerator --> JsonLoader
|
||||||
|
BiergartenDataGenerator --> SqliteDatabase
|
||||||
|
BiergartenDataGenerator --> IDataGenerator
|
||||||
|
|
||||||
|
DataDownloader --> IWebClient
|
||||||
|
CURLWebClient ..|> IWebClient
|
||||||
|
|
||||||
|
JsonLoader --> StreamingJsonParser
|
||||||
|
JsonLoader --> SqliteDatabase
|
||||||
|
|
||||||
|
LlamaGenerator ..|> IDataGenerator
|
||||||
|
MockGenerator ..|> IDataGenerator
|
||||||
|
|
||||||
|
SqliteDatabase --> Country
|
||||||
|
SqliteDatabase --> State
|
||||||
|
SqliteDatabase --> City
|
||||||
|
|
||||||
|
LlamaGenerator --> BreweryResult
|
||||||
|
LlamaGenerator --> UserResult
|
||||||
|
MockGenerator --> BreweryResult
|
||||||
|
MockGenerator --> UserResult
|
||||||
|
|
||||||
|
WikipediaService --> IWebClient
|
||||||
|
|
||||||
|
@enduml
|
||||||
|
```
|
||||||
|
|
||||||
|
## Configuration and Extensibility
|
||||||
|
|
||||||
|
### Command-Line Arguments
|
||||||
|
|
||||||
|
Boost.Program_options provides named CLI arguments. Running without arguments displays usage instructions.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./biergarten-pipeline [options]
|
||||||
|
```
|
||||||
|
|
||||||
|
**Requirement**: Exactly one of `--mocked` or `--model` must be specified.
|
||||||
|
|
||||||
|
| Argument | Short | Type | Purpose |
|
||||||
|
| --------------- | ----- | ------ | --------------------------------------------------------------- |
|
||||||
|
| `--mocked` | - | flag | Use mocked generator for brewery/user data |
|
||||||
|
| `--model` | `-m` | string | Path to LLM model file (gguf); mutually exclusive with --mocked |
|
||||||
|
| `--cache-dir` | `-c` | path | Directory for cached JSON (default: `/tmp`) |
|
||||||
|
| `--temperature` | - | float | LLM sampling temperature 0.0-1.0 (default: `0.8`) |
|
||||||
|
| `--top-p` | - | float | Nucleus sampling parameter 0.0-1.0 (default: `0.92`) |
|
||||||
|
| `--seed` | - | int | Random seed: -1 for random (default: `-1`) |
|
||||||
|
| `--help` | `-h` | flag | Show help message |
|
||||||
|
|
||||||
|
**Note**: The data source is always pinned to commit `c5eb7772` (stable 2026-03-28) and cannot be changed.
|
||||||
|
|
||||||
|
**Note**: When `--mocked` is used, any sampling parameters (`--temperature`, `--top-p`, `--seed`) are ignored with a warning.
|
||||||
|
|
||||||
|
### Usage Examples
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Mocked generator (deterministic, no LLM required)
|
||||||
|
./biergarten-pipeline --mocked
|
||||||
|
|
||||||
|
# With LLM model
|
||||||
|
./biergarten-pipeline --model ./models/llama.gguf --cache-dir /var/cache
|
||||||
|
|
||||||
|
# Mocked with extra parameters provided (will be ignored with warning)
|
||||||
|
./biergarten-pipeline --mocked --temperature 0.5 --top-p 0.8 --seed 42
|
||||||
|
|
||||||
|
# Show help
|
||||||
|
./biergarten-pipeline --help
|
||||||
|
```
|
||||||
|
|
||||||
|
## Building and Running
|
||||||
|
|
||||||
|
### Prerequisites
|
||||||
|
|
||||||
|
- **C++23 compiler** (g++, clang, MSVC)
|
||||||
|
- **CMake** 3.24+
|
||||||
|
- **curl** (for HTTP downloads)
|
||||||
|
- **sqlite3** (database backend)
|
||||||
|
- **Boost** 1.85.0 (fetched via CMake FetchContent; provides Boost.JSON and Boost.Program_options)
|
||||||
|
- **spdlog** v1.15.3 (fetched via CMake FetchContent)
|
||||||
|
- **llama.cpp** (fetched via CMake FetchContent for LLM inference)
|
||||||
|
|
||||||
|
### Build
|
||||||
|
|
||||||
|
```bash
|
||||||
|
mkdir -p build
|
||||||
|
cd build
|
||||||
|
cmake ..
|
||||||
|
cmake --build . --target biergarten-pipeline -- -j
|
||||||
|
```
|
||||||
|
|
||||||
|
### Run
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./build/biergarten-pipeline --mocked
|
||||||
|
```
|
||||||
|
|
||||||
|
**Output**:
|
||||||
|
|
||||||
|
- Console logs with structured spdlog output
|
||||||
|
- Cached JSON file: `/tmp/countries+states+cities.json`
|
||||||
|
- SQLite database: `biergarten-pipeline.db` (in output directory)
|
||||||
|
|
||||||
|
## Code Quality and Static Analysis
|
||||||
|
|
||||||
|
### Formatting
|
||||||
|
|
||||||
|
This project uses **clang-format** with the **Google C++ style guide**:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Apply formatting to all source files
|
||||||
|
cmake --build build --target format
|
||||||
|
|
||||||
|
# Check formatting without modifications
|
||||||
|
cmake --build build --target format-check
|
||||||
|
```
|
||||||
|
|
||||||
|
### Static Analysis
|
||||||
|
|
||||||
|
This project uses **clang-tidy** with configurations for Google, modernize, performance, and bug-prone rules (see `.clang-tidy`).
|
||||||
|
|
||||||
|
Static analysis runs automatically during compilation if `clang-tidy` is available.
|
||||||
|
|
||||||
|
## Code Implementation Summary
|
||||||
|
|
||||||
|
### Key Achievements
|
||||||
|
|
||||||
|
✅ **Full pipeline implementation** - Download → Parse → Store → Generate
|
||||||
|
✅ **Streaming JSON parser** - Memory-efficient processing via Boost.JSON callbacks
|
||||||
|
✅ **Thread-safe SQLite wrapper** - Mutex-protected database for future parallelization
|
||||||
|
✅ **Flexible data generation** - Abstract IDataGenerator interface supporting both mock and LLM modes
|
||||||
|
✅ **Comprehensive CLI** - Boost.Program_options with sensible defaults
|
||||||
|
✅ **Production-grade logging** - spdlog integration for structured output
|
||||||
|
✅ **Build quality** - CMake with clang-format/clang-tidy integration
|
||||||
|
|
||||||
|
### Architecture Patterns
|
||||||
|
|
||||||
|
- **Interface-based design**: `IWebClient`, `IDataGenerator` abstract base classes enable substitution and testing
|
||||||
|
- **Dependency injection**: Components receive dependencies via constructors (BiergartenDataGenerator)
|
||||||
|
- **RAII principle**: SQLite connections and resources managed via destructors
|
||||||
|
- **Callback-driven parsing**: Boost.JSON parser emits events to processing callbacks
|
||||||
|
- **Transaction-scoped inserts**: BeginTransaction/CommitTransaction for batch performance
|
||||||
|
|
||||||
|
### External Dependencies
|
||||||
|
|
||||||
|
| Dependency | Version | Purpose | Type |
|
||||||
|
| ---------- | ------- | ---------------------------------- | ------- |
|
||||||
|
| Boost      | 1.85.0  | JSON parsing, CLI argument parsing | Fetched |
|
||||||
|
| SQLite3 | - | Persistent data storage | System |
|
||||||
|
| libcurl | - | HTTP downloads | System |
|
||||||
|
| spdlog     | v1.15.3 | Structured logging                 | Fetched |
|
||||||
|
| llama.cpp | b8611 | LLM inference engine | Fetched |
|
||||||
|
|
||||||
|
Run `cmake --build build --target format-check` to validate formatting without modifying files.
|
||||||
|
|
||||||
|
clang-tidy runs automatically on the biergarten-pipeline target when available. You can disable it at configure time:
|
||||||
|
|
||||||
|
`cmake -DENABLE_CLANG_TIDY=OFF ..`
|
||||||
|
|
||||||
|
You can also disable format helper targets:
|
||||||
|
|
||||||
|
`cmake -DENABLE_CLANG_FORMAT_TARGETS=OFF ..`
|
||||||
141
pipeline/includes/biergarten_data_generator.h
Normal file
141
pipeline/includes/biergarten_data_generator.h
Normal file
@@ -0,0 +1,141 @@
|
|||||||
|
#ifndef BIERGARTEN_PIPELINE_BIERGARTEN_DATA_GENERATOR_H_
|
||||||
|
#define BIERGARTEN_PIPELINE_BIERGARTEN_DATA_GENERATOR_H_
|
||||||
|
|
||||||
|
#include <cstdint>
#include <memory>
#include <string>
#include <vector>
|
||||||
|
|
||||||
|
#include "data_generation/data_generator.h"
|
||||||
|
#include "models/location.h"
|
||||||
|
#include "web_client/web_client.h"
|
||||||
|
#include "wikipedia/wikipedia_service.h"
|
||||||
|
|
||||||
|
/**
 * @brief Program options for the Biergarten pipeline application.
 *
 * Plain aggregate of settings. Members that have an initializer keep that
 * value unless the caller overwrites it.
 */
struct ApplicationOptions {
   /// @brief Path to the LLM model file (gguf format); mutually exclusive with
   /// use_mocked.
   std::string model_path;

   /// @brief Use mocked generator instead of LLM; mutually exclusive with
   /// model_path.
   bool use_mocked = false;

   /// @brief Directory for cached JSON and database files.
   std::string cache_dir;

   /// @brief LLM sampling temperature (0.0 to 1.0, higher = more random).
   float temperature = 0.8f;

   /// @brief LLM nucleus sampling top-p parameter (0.0 to 1.0, higher = more
   /// random).
   float top_p = 0.92f;

   /// @brief Context window size (tokens) for LLM inference. Higher values
   /// support longer prompts but use more memory.
   ///
   /// Spelled std::uint32_t so the type comes from <cstdint> rather than from
   /// whatever another header happens to leak into the global namespace.
   std::uint32_t n_ctx = 2048;

   /// @brief Random seed for sampling (-1 for random, otherwise non-negative).
   int seed = -1;

   /// @brief Git commit hash for database consistency (always pinned to
   /// c5eb7772).
   std::string commit = "c5eb7772";
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Main data generator class for the Biergarten pipeline.
|
||||||
|
*
|
||||||
|
* This class encapsulates the core logic for generating brewery data.
|
||||||
|
* It handles location loading, city enrichment, and brewery generation.
|
||||||
|
*/
|
||||||
|
class BiergartenDataGenerator {
|
||||||
|
public:
|
||||||
|
/**
|
||||||
|
* @brief Construct a BiergartenDataGenerator with injected dependencies.
|
||||||
|
*
|
||||||
|
* @param options Application configuration options.
|
||||||
|
* @param web_client HTTP client for downloading data.
|
||||||
|
*/
|
||||||
|
BiergartenDataGenerator(const ApplicationOptions& options,
|
||||||
|
std::shared_ptr<WebClient> web_client);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Run the data generation pipeline.
|
||||||
|
*
|
||||||
|
* Performs the following steps:
|
||||||
|
* 1. Load curated locations from JSON
|
||||||
|
* 2. Initialize the generator (LLM or Mock)
|
||||||
|
* 3. Generate brewery data for sampled cities
|
||||||
|
*
|
||||||
|
* @return 0 on success, 1 on failure.
|
||||||
|
*/
|
||||||
|
int Run();
|
||||||
|
|
||||||
|
private:
|
||||||
|
/// @brief Immutable application options.
|
||||||
|
const ApplicationOptions options_;
|
||||||
|
|
||||||
|
/// @brief Shared HTTP client dependency.
|
||||||
|
std::shared_ptr<WebClient> webClient_;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Enriched city data with Wikipedia context.
|
||||||
|
*/
|
||||||
|
struct EnrichedCity {
|
||||||
|
Location location;
|
||||||
|
std::string region_context;
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Initialize the data generator based on options.
|
||||||
|
*
|
||||||
|
* Creates either a MockGenerator (if no model path) or LlamaGenerator.
|
||||||
|
*
|
||||||
|
* @return A unique_ptr to the initialized generator.
|
||||||
|
*/
|
||||||
|
std::unique_ptr<DataGenerator> InitializeGenerator();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Load locations from JSON and sample cities.
|
||||||
|
*
|
||||||
|
* @return Vector of sampled locations capped at 30 entries.
|
||||||
|
*/
|
||||||
|
std::vector<Location> QueryCitiesWithCountries();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Enrich cities with Wikipedia summaries.
|
||||||
|
*
|
||||||
|
* @param cities Vector of sampled locations.
|
||||||
|
* @return Vector of enriched city data with context.
|
||||||
|
*/
|
||||||
|
std::vector<EnrichedCity> EnrichWithWikipedia(
|
||||||
|
const std::vector<Location>& cities);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Generate breweries for enriched cities.
|
||||||
|
*
|
||||||
|
* @param generator The data generator instance.
|
||||||
|
* @param cities Vector of enriched city data.
|
||||||
|
*/
|
||||||
|
void GenerateBreweries(DataGenerator& generator,
|
||||||
|
const std::vector<EnrichedCity>& cities);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Log the generated brewery results.
|
||||||
|
*/
|
||||||
|
void LogResults() const;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Helper struct to store generated brewery data.
|
||||||
|
*/
|
||||||
|
struct GeneratedBrewery {
|
||||||
|
Location location;
|
||||||
|
BreweryResult brewery;
|
||||||
|
};
|
||||||
|
|
||||||
|
/// @brief Stores generated brewery data.
|
||||||
|
std::vector<GeneratedBrewery> generatedBreweries_;
|
||||||
|
};
|
||||||
|
#endif // BIERGARTEN_PIPELINE_BIERGARTEN_DATA_GENERATOR_H_
|
||||||
29
pipeline/includes/data_generation/data_generator.h
Normal file
29
pipeline/includes/data_generation/data_generator.h
Normal file
@@ -0,0 +1,29 @@
|
|||||||
|
#ifndef BIERGARTEN_PIPELINE_DATA_GENERATION_DATA_GENERATOR_H_
|
||||||
|
#define BIERGARTEN_PIPELINE_DATA_GENERATION_DATA_GENERATOR_H_
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
/// @brief Generated brewery record: a name and a free-text description.
struct BreweryResult {
   /// @brief Brewery name.
   std::string name;
   /// @brief Brewery description text.
   std::string description;
};
|
||||||
|
|
||||||
|
/// @brief Generated user record: a username and a short bio.
struct UserResult {
   /// @brief Account name of the generated user.
   std::string username;
   /// @brief Biography text of the generated user.
   std::string bio;
};
|
||||||
|
|
||||||
|
class DataGenerator {
|
||||||
|
public:
|
||||||
|
virtual ~DataGenerator() = default;
|
||||||
|
|
||||||
|
virtual void Load(const std::string& model_path) = 0;
|
||||||
|
|
||||||
|
virtual BreweryResult GenerateBrewery(const std::string& city_name,
|
||||||
|
const std::string& country_name,
|
||||||
|
const std::string& region_context) = 0;
|
||||||
|
|
||||||
|
virtual UserResult GenerateUser(const std::string& locale) = 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif // BIERGARTEN_PIPELINE_DATA_GENERATION_DATA_GENERATOR_H_
|
||||||
51
pipeline/includes/data_generation/llama_generator.h
Normal file
51
pipeline/includes/data_generation/llama_generator.h
Normal file
@@ -0,0 +1,51 @@
|
|||||||
|
#ifndef BIERGARTEN_PIPELINE_DATA_GENERATION_LLAMA_GENERATOR_H_
|
||||||
|
#define BIERGARTEN_PIPELINE_DATA_GENERATION_LLAMA_GENERATOR_H_
|
||||||
|
|
||||||
|
#include <cstdint>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
#include "data_generation/data_generator.h"
|
||||||
|
|
||||||
|
struct llama_model;
|
||||||
|
struct llama_context;
|
||||||
|
|
||||||
|
class LlamaGenerator final : public DataGenerator {
|
||||||
|
public:
|
||||||
|
LlamaGenerator() = default;
|
||||||
|
~LlamaGenerator() override;
|
||||||
|
|
||||||
|
void SetSamplingOptions(float temperature, float top_p, int seed = -1);
|
||||||
|
|
||||||
|
void SetContextSize(uint32_t n_ctx);
|
||||||
|
|
||||||
|
void Load(const std::string& model_path) override;
|
||||||
|
BreweryResult GenerateBrewery(const std::string& city_name,
|
||||||
|
const std::string& country_name,
|
||||||
|
const std::string& region_context) override;
|
||||||
|
UserResult GenerateUser(const std::string& locale) override;
|
||||||
|
|
||||||
|
private:
|
||||||
|
std::string Infer(const std::string& prompt, int max_tokens = 10000);
|
||||||
|
// Overload that allows passing a system message separately so chat-capable
|
||||||
|
// models receive a proper system role instead of having the system text
|
||||||
|
// concatenated into the user prompt (helps avoid revealing internal
|
||||||
|
// reasoning or instructions in model output).
|
||||||
|
std::string Infer(const std::string& system_prompt,
|
||||||
|
const std::string& prompt, int max_tokens = 10000);
|
||||||
|
|
||||||
|
std::string InferFormatted(const std::string& formatted_prompt,
|
||||||
|
int max_tokens = 10000);
|
||||||
|
|
||||||
|
std::string LoadBrewerySystemPrompt(const std::string& prompt_file_path);
|
||||||
|
std::string GetFallbackBreweryPrompt();
|
||||||
|
|
||||||
|
llama_model* model_ = nullptr;
|
||||||
|
llama_context* context_ = nullptr;
|
||||||
|
float sampling_temperature_ = 0.8f;
|
||||||
|
float sampling_top_p_ = 0.92f;
|
||||||
|
uint32_t sampling_seed_ = 0xFFFFFFFFu;
|
||||||
|
uint32_t n_ctx_ = 8192;
|
||||||
|
std::string brewery_system_prompt_;
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif // BIERGARTEN_PIPELINE_DATA_GENERATION_LLAMA_GENERATOR_H_
|
||||||
32
pipeline/includes/data_generation/llama_generator_helpers.h
Normal file
32
pipeline/includes/data_generation/llama_generator_helpers.h
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
#ifndef BIERGARTEN_PIPELINE_DATA_GENERATION_LLAMA_GENERATOR_HELPERS_H_
|
||||||
|
#define BIERGARTEN_PIPELINE_DATA_GENERATION_LLAMA_GENERATOR_HELPERS_H_
|
||||||
|
|
||||||
|
#include <cstddef>
#include <cstdint>
#include <string>
#include <string_view>
#include <utility>
|
||||||
|
|
||||||
|
struct llama_model;
struct llama_vocab;
// Mirrors llama.h, which declares `typedef int32_t llama_token;`. Spelling the
// alias with the same fixed-width type keeps the two declarations compatible
// on platforms where int32_t is not plain `int`.
using llama_token = std::int32_t;

// Helper functions for LlamaGenerator methods.
//
// NOTE(review): the definitions are not part of this header. The one-line
// summaries below are inferred from the signatures and should be confirmed
// against the implementation.

// Presumably normalises `region_context` and limits it to `max_chars`
// characters before it is placed in a prompt — confirm.
std::string PrepareRegionContextPublic(std::string_view region_context,
                                       std::size_t max_chars = 700);

// Presumably splits `raw` into two text lines and reports failures using
// `error_message` — confirm.
std::pair<std::string, std::string> ParseTwoLineResponsePublic(
    const std::string& raw, const std::string& error_message);

// Builds a chat-formatted prompt for `model` from a user message only.
std::string ToChatPromptPublic(const llama_model* model,
                               const std::string& user_prompt);

// Builds a chat-formatted prompt for `model` from a system message plus a
// user message.
std::string ToChatPromptPublic(const llama_model* model,
                               const std::string& system_prompt,
                               const std::string& user_prompt);

// Presumably appends the text piece of `token` (looked up in `vocab`) to
// `output` — confirm.
void AppendTokenPiecePublic(const llama_vocab* vocab, llama_token token,
                            std::string& output);

// Takes raw model output and writes to `name_out` / `description_out`.
// NOTE(review): the returned string is presumably an error message that is
// empty on success — confirm against the implementation before relying on it.
std::string ValidateBreweryJsonPublic(const std::string& raw,
                                      std::string& name_out,
                                      std::string& description_out);
|
||||||
|
|
||||||
|
#endif // BIERGARTEN_PIPELINE_DATA_GENERATION_LLAMA_GENERATOR_HELPERS_H_
|
||||||
28
pipeline/includes/data_generation/mock_generator.h
Normal file
28
pipeline/includes/data_generation/mock_generator.h
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
#ifndef BIERGARTEN_PIPELINE_DATA_GENERATION_MOCK_GENERATOR_H_
|
||||||
|
#define BIERGARTEN_PIPELINE_DATA_GENERATION_MOCK_GENERATOR_H_
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "data_generation/data_generator.h"
|
||||||
|
|
||||||
|
class MockGenerator final : public DataGenerator {
|
||||||
|
public:
|
||||||
|
void Load(const std::string& model_path) override;
|
||||||
|
BreweryResult GenerateBrewery(const std::string& city_name,
|
||||||
|
const std::string& country_name,
|
||||||
|
const std::string& region_context) override;
|
||||||
|
UserResult GenerateUser(const std::string& locale) override;
|
||||||
|
|
||||||
|
private:
|
||||||
|
static std::size_t DeterministicHash(const std::string& a,
|
||||||
|
const std::string& b);
|
||||||
|
|
||||||
|
static const std::vector<std::string> kBreweryAdjectives;
|
||||||
|
static const std::vector<std::string> kBreweryNouns;
|
||||||
|
static const std::vector<std::string> kBreweryDescriptions;
|
||||||
|
static const std::vector<std::string> kUsernames;
|
||||||
|
static const std::vector<std::string> kBios;
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif // BIERGARTEN_PIPELINE_DATA_GENERATION_MOCK_GENERATOR_H_
|
||||||
16
pipeline/includes/json_handling/json_loader.h
Normal file
16
pipeline/includes/json_handling/json_loader.h
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
#ifndef BIERGARTEN_PIPELINE_JSON_HANDLING_JSON_LOADER_H_
|
||||||
|
#define BIERGARTEN_PIPELINE_JSON_HANDLING_JSON_LOADER_H_
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "models/location.h"
|
||||||
|
|
||||||
|
/// @brief Loads curated world locations from a JSON file into memory.
|
||||||
|
class JsonLoader {
|
||||||
|
public:
|
||||||
|
/// @brief Parses a JSON array file and returns all location records.
|
||||||
|
static std::vector<Location> LoadLocations(const std::string& filepath);
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif // BIERGARTEN_PIPELINE_JSON_HANDLING_JSON_LOADER_H_
|
||||||
30
pipeline/includes/web_client/curl_web_client.h
Normal file
30
pipeline/includes/web_client/curl_web_client.h
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
#ifndef BIERGARTEN_PIPELINE_WEB_CLIENT_CURL_WEB_CLIENT_H_
|
||||||
|
#define BIERGARTEN_PIPELINE_WEB_CLIENT_CURL_WEB_CLIENT_H_
|
||||||
|
|
||||||
|
#include <memory>
|
||||||
|
|
||||||
|
#include "web_client/web_client.h"
|
||||||
|
|
||||||
|
// RAII for curl_global_init/cleanup.
// An instance of this class should be created in main() before any curl
// operations and exist for the lifetime of the application.
//
// The guard represents process-wide state, so it can be neither copied nor
// moved; all four operations are deleted explicitly so the special-member set
// is complete.
class CurlGlobalState {
  public:
   CurlGlobalState();
   ~CurlGlobalState();

   CurlGlobalState(const CurlGlobalState&) = delete;
   CurlGlobalState& operator=(const CurlGlobalState&) = delete;
   CurlGlobalState(CurlGlobalState&&) = delete;
   CurlGlobalState& operator=(CurlGlobalState&&) = delete;
};
|
||||||
|
|
||||||
|
class CURLWebClient : public WebClient {
|
||||||
|
public:
|
||||||
|
CURLWebClient();
|
||||||
|
~CURLWebClient() override;
|
||||||
|
|
||||||
|
void DownloadToFile(const std::string& url,
|
||||||
|
const std::string& file_path) override;
|
||||||
|
std::string Get(const std::string& url) override;
|
||||||
|
std::string UrlEncode(const std::string& value) override;
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif // BIERGARTEN_PIPELINE_WEB_CLIENT_CURL_WEB_CLIENT_H_
|
||||||
22
pipeline/includes/web_client/web_client.h
Normal file
22
pipeline/includes/web_client/web_client.h
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
#ifndef BIERGARTEN_PIPELINE_WEB_CLIENT_WEB_CLIENT_H_
|
||||||
|
#define BIERGARTEN_PIPELINE_WEB_CLIENT_WEB_CLIENT_H_
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
/// @brief Abstract HTTP client interface.
///
/// The class is used polymorphically, so copy and move are protected:
/// concrete clients keep their own copy/move semantics, while assigning
/// through a `WebClient&` (which would slice the derived object) no longer
/// compiles.
class WebClient {
  public:
   virtual ~WebClient() = default;

   // Downloads content from a URL to a file. Throws on error.
   virtual void DownloadToFile(const std::string& url,
                               const std::string& file_path) = 0;

   // Performs a GET request and returns the response body as a string. Throws
   // on error.
   virtual std::string Get(const std::string& url) = 0;

   // URL-encodes a string.
   virtual std::string UrlEncode(const std::string& value) = 0;

  protected:
   // Declaring the destructor above makes the special-member set incomplete
   // unless the rest are spelled out; keep them protected so only derived
   // classes can use them.
   WebClient() = default;
   WebClient(const WebClient&) = default;
   WebClient& operator=(const WebClient&) = default;
   WebClient(WebClient&&) noexcept = default;
   WebClient& operator=(WebClient&&) noexcept = default;
};
|
||||||
|
|
||||||
|
#endif // BIERGARTEN_PIPELINE_WEB_CLIENT_WEB_CLIENT_H_
|
||||||
27
pipeline/includes/wikipedia/wikipedia_service.h
Normal file
27
pipeline/includes/wikipedia/wikipedia_service.h
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
#ifndef BIERGARTEN_PIPELINE_WIKIPEDIA_WIKIPEDIA_SERVICE_H_
|
||||||
|
#define BIERGARTEN_PIPELINE_WIKIPEDIA_WIKIPEDIA_SERVICE_H_
|
||||||
|
|
||||||
|
#include <memory>
|
||||||
|
#include <string>
|
||||||
|
#include <string_view>
|
||||||
|
#include <unordered_map>
|
||||||
|
|
||||||
|
#include "web_client/web_client.h"
|
||||||
|
|
||||||
|
/// @brief Provides cached Wikipedia summary lookups for city and country pairs.
|
||||||
|
class WikipediaService {
|
||||||
|
public:
|
||||||
|
/// @brief Creates a new Wikipedia service with the provided web client.
|
||||||
|
explicit WikipediaService(std::shared_ptr<WebClient> client);
|
||||||
|
|
||||||
|
/// @brief Returns the Wikipedia summary extract for city and country.
|
||||||
|
[[nodiscard]] std::string GetSummary(std::string_view city,
|
||||||
|
std::string_view country);
|
||||||
|
|
||||||
|
private:
|
||||||
|
std::string FetchExtract(std::string_view query);
|
||||||
|
std::shared_ptr<WebClient> client_;
|
||||||
|
std::unordered_map<std::string, std::string> cache_;
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif // BIERGARTEN_PIPELINE_WIKIPEDIA_WIKIPEDIA_SERVICE_H_
|
||||||
902
pipeline/locations.json
Normal file
902
pipeline/locations.json
Normal file
@@ -0,0 +1,902 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"city": "Cape Town",
|
||||||
|
"state_province": "Western Cape",
|
||||||
|
"iso3166_2": "ZA-WC",
|
||||||
|
"country": "South Africa",
|
||||||
|
"iso3166_1": "ZA",
|
||||||
|
"latitude": -33.9249,
|
||||||
|
"longitude": 18.4241
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Johannesburg",
|
||||||
|
"state_province": "Gauteng",
|
||||||
|
"iso3166_2": "ZA-GT",
|
||||||
|
"country": "South Africa",
|
||||||
|
"iso3166_1": "ZA",
|
||||||
|
"latitude": -26.2041,
|
||||||
|
"longitude": 28.0473
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Durban",
|
||||||
|
"state_province": "KwaZulu-Natal",
|
||||||
|
"iso3166_2": "ZA-NL",
|
||||||
|
"country": "South Africa",
|
||||||
|
"iso3166_1": "ZA",
|
||||||
|
"latitude": -29.8587,
|
||||||
|
"longitude": 31.0218
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Franschhoek",
|
||||||
|
"state_province": "Western Cape",
|
||||||
|
"iso3166_2": "ZA-WC",
|
||||||
|
"country": "South Africa",
|
||||||
|
"iso3166_1": "ZA",
|
||||||
|
"latitude": -33.9146,
|
||||||
|
"longitude": 19.1198
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Nairobi",
|
||||||
|
"state_province": "Nairobi",
|
||||||
|
"iso3166_2": "KE-30",
|
||||||
|
"country": "Kenya",
|
||||||
|
"iso3166_1": "KE",
|
||||||
|
"latitude": -1.2921,
|
||||||
|
"longitude": 36.8219
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Buenos Aires",
|
||||||
|
"state_province": "Buenos Aires City",
|
||||||
|
"iso3166_2": "AR-C",
|
||||||
|
"country": "Argentina",
|
||||||
|
"iso3166_1": "AR",
|
||||||
|
"latitude": -34.6037,
|
||||||
|
"longitude": -58.3816
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Bariloche",
|
||||||
|
"state_province": "Río Negro",
|
||||||
|
"iso3166_2": "AR-R",
|
||||||
|
"country": "Argentina",
|
||||||
|
"iso3166_1": "AR",
|
||||||
|
"latitude": -41.1335,
|
||||||
|
"longitude": -71.3103
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Bogotá",
|
||||||
|
"state_province": "Bogotá D.C.",
|
||||||
|
"iso3166_2": "CO-DC",
|
||||||
|
"country": "Colombia",
|
||||||
|
"iso3166_1": "CO",
|
||||||
|
"latitude": 4.711,
|
||||||
|
"longitude": -74.0721
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Medellín",
|
||||||
|
"state_province": "Antioquia",
|
||||||
|
"iso3166_2": "CO-ANT",
|
||||||
|
"country": "Colombia",
|
||||||
|
"iso3166_1": "CO",
|
||||||
|
"latitude": 6.2442,
|
||||||
|
"longitude": -75.5812
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "São Paulo",
|
||||||
|
"state_province": "São Paulo",
|
||||||
|
"iso3166_2": "BR-SP",
|
||||||
|
"country": "Brazil",
|
||||||
|
"iso3166_1": "BR",
|
||||||
|
"latitude": -23.5505,
|
||||||
|
"longitude": -46.6333
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Curitiba",
|
||||||
|
"state_province": "Paraná",
|
||||||
|
"iso3166_2": "BR-PR",
|
||||||
|
"country": "Brazil",
|
||||||
|
"iso3166_1": "BR",
|
||||||
|
"latitude": -25.4284,
|
||||||
|
"longitude": -49.2733
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Rio de Janeiro",
|
||||||
|
"state_province": "Rio de Janeiro",
|
||||||
|
"iso3166_2": "BR-RJ",
|
||||||
|
"country": "Brazil",
|
||||||
|
"iso3166_1": "BR",
|
||||||
|
"latitude": -22.9068,
|
||||||
|
"longitude": -43.1729
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Santiago",
|
||||||
|
"state_province": "Santiago Metropolitan",
|
||||||
|
"iso3166_2": "CL-RM",
|
||||||
|
"country": "Chile",
|
||||||
|
"iso3166_1": "CL",
|
||||||
|
"latitude": -33.4489,
|
||||||
|
"longitude": -70.6693
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Valdivia",
|
||||||
|
"state_province": "Los Ríos",
|
||||||
|
"iso3166_2": "CL-LR",
|
||||||
|
"country": "Chile",
|
||||||
|
"iso3166_1": "CL",
|
||||||
|
"latitude": -39.8142,
|
||||||
|
"longitude": -73.2459
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Lima",
|
||||||
|
"state_province": "Lima",
|
||||||
|
"iso3166_2": "PE-LMA",
|
||||||
|
"country": "Peru",
|
||||||
|
"iso3166_1": "PE",
|
||||||
|
"latitude": -12.0464,
|
||||||
|
"longitude": -77.0428
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Tokyo",
|
||||||
|
"state_province": "Tokyo",
|
||||||
|
"iso3166_2": "JP-13",
|
||||||
|
"country": "Japan",
|
||||||
|
"iso3166_1": "JP",
|
||||||
|
"latitude": 35.6762,
|
||||||
|
"longitude": 139.6503
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Osaka",
|
||||||
|
"state_province": "Osaka",
|
||||||
|
"iso3166_2": "JP-27",
|
||||||
|
"country": "Japan",
|
||||||
|
"iso3166_1": "JP",
|
||||||
|
"latitude": 34.6937,
|
||||||
|
"longitude": 135.5023
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Kyoto",
|
||||||
|
"state_province": "Kyoto",
|
||||||
|
"iso3166_2": "JP-26",
|
||||||
|
"country": "Japan",
|
||||||
|
"iso3166_1": "JP",
|
||||||
|
"latitude": 35.0116,
|
||||||
|
"longitude": 135.7681
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Sapporo",
|
||||||
|
"state_province": "Hokkaido",
|
||||||
|
"iso3166_2": "JP-01",
|
||||||
|
"country": "Japan",
|
||||||
|
"iso3166_1": "JP",
|
||||||
|
"latitude": 43.0618,
|
||||||
|
"longitude": 141.3545
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Seoul",
|
||||||
|
"state_province": "Seoul",
|
||||||
|
"iso3166_2": "KR-11",
|
||||||
|
"country": "South Korea",
|
||||||
|
"iso3166_1": "KR",
|
||||||
|
"latitude": 37.5665,
|
||||||
|
"longitude": 126.978
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Busan",
|
||||||
|
"state_province": "Busan",
|
||||||
|
"iso3166_2": "KR-26",
|
||||||
|
"country": "South Korea",
|
||||||
|
"iso3166_1": "KR",
|
||||||
|
"latitude": 35.1796,
|
||||||
|
"longitude": 129.0756
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Ho Chi Minh City",
|
||||||
|
"state_province": "Ho Chi Minh",
|
||||||
|
"iso3166_2": "VN-SG",
|
||||||
|
"country": "Vietnam",
|
||||||
|
"iso3166_1": "VN",
|
||||||
|
"latitude": 10.8231,
|
||||||
|
"longitude": 106.6297
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Hanoi",
|
||||||
|
"state_province": "Hanoi",
|
||||||
|
"iso3166_2": "VN-HN",
|
||||||
|
"country": "Vietnam",
|
||||||
|
"iso3166_1": "VN",
|
||||||
|
"latitude": 21.0285,
|
||||||
|
"longitude": 105.8542
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Da Nang",
|
||||||
|
"state_province": "Da Nang",
|
||||||
|
"iso3166_2": "VN-DN",
|
||||||
|
"country": "Vietnam",
|
||||||
|
"iso3166_1": "VN",
|
||||||
|
"latitude": 16.0544,
|
||||||
|
"longitude": 108.2022
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Bangkok",
|
||||||
|
"state_province": "Bangkok",
|
||||||
|
"iso3166_2": "TH-10",
|
||||||
|
"country": "Thailand",
|
||||||
|
"iso3166_1": "TH",
|
||||||
|
"latitude": 13.7563,
|
||||||
|
"longitude": 100.5018
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Taipei",
|
||||||
|
"state_province": "Taipei",
|
||||||
|
"iso3166_2": "TW-TPE",
|
||||||
|
"country": "Taiwan",
|
||||||
|
"iso3166_1": "TW",
|
||||||
|
"latitude": 25.033,
|
||||||
|
"longitude": 121.5654
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Beijing",
|
||||||
|
"state_province": "Beijing",
|
||||||
|
"iso3166_2": "CN-BJ",
|
||||||
|
"country": "China",
|
||||||
|
"iso3166_1": "CN",
|
||||||
|
"latitude": 39.9042,
|
||||||
|
"longitude": 116.4074
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Shanghai",
|
||||||
|
"state_province": "Shanghai",
|
||||||
|
"iso3166_2": "CN-SH",
|
||||||
|
"country": "China",
|
||||||
|
"iso3166_1": "CN",
|
||||||
|
"latitude": 31.2304,
|
||||||
|
"longitude": 121.4737
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Bengaluru",
|
||||||
|
"state_province": "Karnataka",
|
||||||
|
"iso3166_2": "IN-KA",
|
||||||
|
"country": "India",
|
||||||
|
"iso3166_1": "IN",
|
||||||
|
"latitude": 12.9716,
|
||||||
|
"longitude": 77.5946
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Singapore",
|
||||||
|
"state_province": "Central Singapore",
|
||||||
|
"iso3166_2": "SG-01",
|
||||||
|
"country": "Singapore",
|
||||||
|
"iso3166_1": "SG",
|
||||||
|
"latitude": 1.3521,
|
||||||
|
"longitude": 103.8198
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Melbourne",
|
||||||
|
"state_province": "Victoria",
|
||||||
|
"iso3166_2": "AU-VIC",
|
||||||
|
"country": "Australia",
|
||||||
|
"iso3166_1": "AU",
|
||||||
|
"latitude": -37.8136,
|
||||||
|
"longitude": 144.9631
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Sydney",
|
||||||
|
"state_province": "New South Wales",
|
||||||
|
"iso3166_2": "AU-NSW",
|
||||||
|
"country": "Australia",
|
||||||
|
"iso3166_1": "AU",
|
||||||
|
"latitude": -33.8688,
|
||||||
|
"longitude": 151.2093
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Brisbane",
|
||||||
|
"state_province": "Queensland",
|
||||||
|
"iso3166_2": "AU-QLD",
|
||||||
|
"country": "Australia",
|
||||||
|
"iso3166_1": "AU",
|
||||||
|
"latitude": -27.4705,
|
||||||
|
"longitude": 153.026
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Adelaide",
|
||||||
|
"state_province": "South Australia",
|
||||||
|
"iso3166_2": "AU-SA",
|
||||||
|
"country": "Australia",
|
||||||
|
"iso3166_1": "AU",
|
||||||
|
"latitude": -34.9285,
|
||||||
|
"longitude": 138.6007
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Perth",
|
||||||
|
"state_province": "Western Australia",
|
||||||
|
"iso3166_2": "AU-WA",
|
||||||
|
"country": "Australia",
|
||||||
|
"iso3166_1": "AU",
|
||||||
|
"latitude": -31.9505,
|
||||||
|
"longitude": 115.8605
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Hobart",
|
||||||
|
"state_province": "Tasmania",
|
||||||
|
"iso3166_2": "AU-TAS",
|
||||||
|
"country": "Australia",
|
||||||
|
"iso3166_1": "AU",
|
||||||
|
"latitude": -42.8821,
|
||||||
|
"longitude": 147.3272
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Wellington",
|
||||||
|
"state_province": "Wellington",
|
||||||
|
"iso3166_2": "NZ-WGN",
|
||||||
|
"country": "New Zealand",
|
||||||
|
"iso3166_1": "NZ",
|
||||||
|
"latitude": -41.2865,
|
||||||
|
"longitude": 174.7762
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Auckland",
|
||||||
|
"state_province": "Auckland",
|
||||||
|
"iso3166_2": "NZ-AUK",
|
||||||
|
"country": "New Zealand",
|
||||||
|
"iso3166_1": "NZ",
|
||||||
|
"latitude": -36.8485,
|
||||||
|
"longitude": 174.7633
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Christchurch",
|
||||||
|
"state_province": "Canterbury",
|
||||||
|
"iso3166_2": "NZ-CAN",
|
||||||
|
"country": "New Zealand",
|
||||||
|
"iso3166_1": "NZ",
|
||||||
|
"latitude": -43.532,
|
||||||
|
"longitude": 172.6306
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Nelson",
|
||||||
|
"state_province": "Nelson",
|
||||||
|
"iso3166_2": "NZ-NSN",
|
||||||
|
"country": "New Zealand",
|
||||||
|
"iso3166_1": "NZ",
|
||||||
|
"latitude": -41.2706,
|
||||||
|
"longitude": 173.284
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Munich",
|
||||||
|
"state_province": "Bavaria",
|
||||||
|
"iso3166_2": "DE-BY",
|
||||||
|
"country": "Germany",
|
||||||
|
"iso3166_1": "DE",
|
||||||
|
"latitude": 48.1351,
|
||||||
|
"longitude": 11.582
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Berlin",
|
||||||
|
"state_province": "Berlin",
|
||||||
|
"iso3166_2": "DE-BE",
|
||||||
|
"country": "Germany",
|
||||||
|
"iso3166_1": "DE",
|
||||||
|
"latitude": 52.52,
|
||||||
|
"longitude": 13.405
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Cologne",
|
||||||
|
"state_province": "North Rhine-Westphalia",
|
||||||
|
"iso3166_2": "DE-NW",
|
||||||
|
"country": "Germany",
|
||||||
|
"iso3166_1": "DE",
|
||||||
|
"latitude": 50.9375,
|
||||||
|
"longitude": 6.9603
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Bamberg",
|
||||||
|
"state_province": "Bavaria",
|
||||||
|
"iso3166_2": "DE-BY",
|
||||||
|
"country": "Germany",
|
||||||
|
"iso3166_1": "DE",
|
||||||
|
"latitude": 49.8916,
|
||||||
|
"longitude": 10.8916
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Brussels",
|
||||||
|
"state_province": "Brussels-Capital",
|
||||||
|
"iso3166_2": "BE-BRU",
|
||||||
|
"country": "Belgium",
|
||||||
|
"iso3166_1": "BE",
|
||||||
|
"latitude": 50.8503,
|
||||||
|
"longitude": 4.3517
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Antwerp",
|
||||||
|
"state_province": "Flanders",
|
||||||
|
"iso3166_2": "BE-VLG",
|
||||||
|
"country": "Belgium",
|
||||||
|
"iso3166_1": "BE",
|
||||||
|
"latitude": 51.2194,
|
||||||
|
"longitude": 4.4025
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Bruges",
|
||||||
|
"state_province": "Flanders",
|
||||||
|
"iso3166_2": "BE-VLG",
|
||||||
|
"country": "Belgium",
|
||||||
|
"iso3166_1": "BE",
|
||||||
|
"latitude": 51.2093,
|
||||||
|
"longitude": 3.2247
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "London",
|
||||||
|
"state_province": "England",
|
||||||
|
"iso3166_2": "GB-ENG",
|
||||||
|
"country": "United Kingdom",
|
||||||
|
"iso3166_1": "GB",
|
||||||
|
"latitude": 51.5074,
|
||||||
|
"longitude": -0.1278
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Bristol",
|
||||||
|
"state_province": "England",
|
||||||
|
"iso3166_2": "GB-ENG",
|
||||||
|
"country": "United Kingdom",
|
||||||
|
"iso3166_1": "GB",
|
||||||
|
"latitude": 51.4545,
|
||||||
|
"longitude": -2.5879
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Edinburgh",
|
||||||
|
"state_province": "Scotland",
|
||||||
|
"iso3166_2": "GB-SCT",
|
||||||
|
"country": "United Kingdom",
|
||||||
|
"iso3166_1": "GB",
|
||||||
|
"latitude": 55.9533,
|
||||||
|
"longitude": -3.1883
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Glasgow",
|
||||||
|
"state_province": "Scotland",
|
||||||
|
"iso3166_2": "GB-SCT",
|
||||||
|
"country": "United Kingdom",
|
||||||
|
"iso3166_1": "GB",
|
||||||
|
"latitude": 55.8642,
|
||||||
|
"longitude": -4.2518
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Prague",
|
||||||
|
"state_province": "Prague",
|
||||||
|
"iso3166_2": "CZ-10",
|
||||||
|
"country": "Czechia",
|
||||||
|
"iso3166_1": "CZ",
|
||||||
|
"latitude": 50.0755,
|
||||||
|
"longitude": 14.4378
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Pilsen",
|
||||||
|
"state_province": "Plzeň",
|
||||||
|
"iso3166_2": "CZ-32",
|
||||||
|
"country": "Czechia",
|
||||||
|
"iso3166_1": "CZ",
|
||||||
|
"latitude": 49.7384,
|
||||||
|
"longitude": 13.3736
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Amsterdam",
|
||||||
|
"state_province": "North Holland",
|
||||||
|
"iso3166_2": "NL-NH",
|
||||||
|
"country": "Netherlands",
|
||||||
|
"iso3166_1": "NL",
|
||||||
|
"latitude": 52.3676,
|
||||||
|
"longitude": 4.9041
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Copenhagen",
|
||||||
|
"state_province": "Capital Region",
|
||||||
|
"iso3166_2": "DK-84",
|
||||||
|
"country": "Denmark",
|
||||||
|
"iso3166_1": "DK",
|
||||||
|
"latitude": 55.6761,
|
||||||
|
"longitude": 12.5683
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Warsaw",
|
||||||
|
"state_province": "Masovian",
|
||||||
|
"iso3166_2": "PL-MZ",
|
||||||
|
"country": "Poland",
|
||||||
|
"iso3166_1": "PL",
|
||||||
|
"latitude": 52.2297,
|
||||||
|
"longitude": 21.0122
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Krakow",
|
||||||
|
"state_province": "Lesser Poland",
|
||||||
|
"iso3166_2": "PL-MA",
|
||||||
|
"country": "Poland",
|
||||||
|
"iso3166_1": "PL",
|
||||||
|
"latitude": 50.0647,
|
||||||
|
"longitude": 19.945
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Rome",
|
||||||
|
"state_province": "Lazio",
|
||||||
|
"iso3166_2": "IT-62",
|
||||||
|
"country": "Italy",
|
||||||
|
"iso3166_1": "IT",
|
||||||
|
"latitude": 41.9028,
|
||||||
|
"longitude": 12.4964
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Milan",
|
||||||
|
"state_province": "Lombardy",
|
||||||
|
"iso3166_2": "IT-25",
|
||||||
|
"country": "Italy",
|
||||||
|
"iso3166_1": "IT",
|
||||||
|
"latitude": 45.4642,
|
||||||
|
"longitude": 9.19
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Barcelona",
|
||||||
|
"state_province": "Catalonia",
|
||||||
|
"iso3166_2": "ES-CT",
|
||||||
|
"country": "Spain",
|
||||||
|
"iso3166_1": "ES",
|
||||||
|
"latitude": 41.3851,
|
||||||
|
"longitude": 2.1734
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Madrid",
|
||||||
|
"state_province": "Madrid",
|
||||||
|
"iso3166_2": "ES-MD",
|
||||||
|
"country": "Spain",
|
||||||
|
"iso3166_1": "ES",
|
||||||
|
"latitude": 40.4168,
|
||||||
|
"longitude": -3.7038
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Paris",
|
||||||
|
"state_province": "Île-de-France",
|
||||||
|
"iso3166_2": "FR-IDF",
|
||||||
|
"country": "France",
|
||||||
|
"iso3166_1": "FR",
|
||||||
|
"latitude": 48.8566,
|
||||||
|
"longitude": 2.3522
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Lyon",
|
||||||
|
"state_province": "Auvergne-Rhône-Alpes",
|
||||||
|
"iso3166_2": "FR-ARA",
|
||||||
|
"country": "France",
|
||||||
|
"iso3166_1": "FR",
|
||||||
|
"latitude": 45.764,
|
||||||
|
"longitude": 4.8357
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Stockholm",
|
||||||
|
"state_province": "Stockholm",
|
||||||
|
"iso3166_2": "SE-AB",
|
||||||
|
"country": "Sweden",
|
||||||
|
"iso3166_1": "SE",
|
||||||
|
"latitude": 59.3293,
|
||||||
|
"longitude": 18.0686
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Gothenburg",
|
||||||
|
"state_province": "Västra Götaland",
|
||||||
|
"iso3166_2": "SE-O",
|
||||||
|
"country": "Sweden",
|
||||||
|
"iso3166_1": "SE",
|
||||||
|
"latitude": 57.7089,
|
||||||
|
"longitude": 11.9746
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Oslo",
|
||||||
|
"state_province": "Oslo",
|
||||||
|
"iso3166_2": "NO-03",
|
||||||
|
"country": "Norway",
|
||||||
|
"iso3166_1": "NO",
|
||||||
|
"latitude": 59.9139,
|
||||||
|
"longitude": 10.7522
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Dublin",
|
||||||
|
"state_province": "Leinster",
|
||||||
|
"iso3166_2": "IE-L",
|
||||||
|
"country": "Ireland",
|
||||||
|
"iso3166_1": "IE",
|
||||||
|
"latitude": 53.3498,
|
||||||
|
"longitude": -6.2603
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Vienna",
|
||||||
|
"state_province": "Vienna",
|
||||||
|
"iso3166_2": "AT-9",
|
||||||
|
"country": "Austria",
|
||||||
|
"iso3166_1": "AT",
|
||||||
|
"latitude": 48.2082,
|
||||||
|
"longitude": 16.3738
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Zurich",
|
||||||
|
"state_province": "Zurich",
|
||||||
|
"iso3166_2": "CH-ZH",
|
||||||
|
"country": "Switzerland",
|
||||||
|
"iso3166_1": "CH",
|
||||||
|
"latitude": 47.3769,
|
||||||
|
"longitude": 8.5417
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Tallinn",
|
||||||
|
"state_province": "Harju",
|
||||||
|
"iso3166_2": "EE-37",
|
||||||
|
"country": "Estonia",
|
||||||
|
"iso3166_1": "EE",
|
||||||
|
"latitude": 59.437,
|
||||||
|
"longitude": 24.7536
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Denver",
|
||||||
|
"state_province": "Colorado",
|
||||||
|
"iso3166_2": "US-CO",
|
||||||
|
"country": "United States",
|
||||||
|
"iso3166_1": "US",
|
||||||
|
"latitude": 39.7392,
|
||||||
|
"longitude": -104.9903
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Portland",
|
||||||
|
"state_province": "Oregon",
|
||||||
|
"iso3166_2": "US-OR",
|
||||||
|
"country": "United States",
|
||||||
|
"iso3166_1": "US",
|
||||||
|
"latitude": 45.5152,
|
||||||
|
"longitude": -122.6784
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "San Diego",
|
||||||
|
"state_province": "California",
|
||||||
|
"iso3166_2": "US-CA",
|
||||||
|
"country": "United States",
|
||||||
|
"iso3166_1": "US",
|
||||||
|
"latitude": 32.7157,
|
||||||
|
"longitude": -117.1611
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Asheville",
|
||||||
|
"state_province": "North Carolina",
|
||||||
|
"iso3166_2": "US-NC",
|
||||||
|
"country": "United States",
|
||||||
|
"iso3166_1": "US",
|
||||||
|
"latitude": 35.5951,
|
||||||
|
"longitude": -82.5515
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Grand Rapids",
|
||||||
|
"state_province": "Michigan",
|
||||||
|
"iso3166_2": "US-MI",
|
||||||
|
"country": "United States",
|
||||||
|
"iso3166_1": "US",
|
||||||
|
"latitude": 42.9634,
|
||||||
|
"longitude": -85.6681
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Chicago",
|
||||||
|
"state_province": "Illinois",
|
||||||
|
"iso3166_2": "US-IL",
|
||||||
|
"country": "United States",
|
||||||
|
"iso3166_1": "US",
|
||||||
|
"latitude": 41.8781,
|
||||||
|
"longitude": -87.6298
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Seattle",
|
||||||
|
"state_province": "Washington",
|
||||||
|
"iso3166_2": "US-WA",
|
||||||
|
"country": "United States",
|
||||||
|
"iso3166_1": "US",
|
||||||
|
"latitude": 47.6062,
|
||||||
|
"longitude": -122.3321
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Austin",
|
||||||
|
"state_province": "Texas",
|
||||||
|
"iso3166_2": "US-TX",
|
||||||
|
"country": "United States",
|
||||||
|
"iso3166_1": "US",
|
||||||
|
"latitude": 30.2672,
|
||||||
|
"longitude": -97.7431
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Boston",
|
||||||
|
"state_province": "Massachusetts",
|
||||||
|
"iso3166_2": "US-MA",
|
||||||
|
"country": "United States",
|
||||||
|
"iso3166_1": "US",
|
||||||
|
"latitude": 42.3601,
|
||||||
|
"longitude": -71.0589
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Philadelphia",
|
||||||
|
"state_province": "Pennsylvania",
|
||||||
|
"iso3166_2": "US-PA",
|
||||||
|
"country": "United States",
|
||||||
|
"iso3166_1": "US",
|
||||||
|
"latitude": 39.9526,
|
||||||
|
"longitude": -75.1652
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Brooklyn",
|
||||||
|
"state_province": "New York",
|
||||||
|
"iso3166_2": "US-NY",
|
||||||
|
"country": "United States",
|
||||||
|
"iso3166_1": "US",
|
||||||
|
"latitude": 40.6782,
|
||||||
|
"longitude": -73.9442
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Milwaukee",
|
||||||
|
"state_province": "Wisconsin",
|
||||||
|
"iso3166_2": "US-WI",
|
||||||
|
"country": "United States",
|
||||||
|
"iso3166_1": "US",
|
||||||
|
"latitude": 43.0389,
|
||||||
|
"longitude": -87.9065
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Richmond",
|
||||||
|
"state_province": "Virginia",
|
||||||
|
"iso3166_2": "US-VA",
|
||||||
|
"country": "United States",
|
||||||
|
"iso3166_1": "US",
|
||||||
|
"latitude": 37.5407,
|
||||||
|
"longitude": -77.436
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Cincinnati",
|
||||||
|
"state_province": "Ohio",
|
||||||
|
"iso3166_2": "US-OH",
|
||||||
|
"country": "United States",
|
||||||
|
"iso3166_1": "US",
|
||||||
|
"latitude": 39.1031,
|
||||||
|
"longitude": -84.512
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "St. Louis",
|
||||||
|
"state_province": "Missouri",
|
||||||
|
"iso3166_2": "US-MO",
|
||||||
|
"country": "United States",
|
||||||
|
"iso3166_1": "US",
|
||||||
|
"latitude": 38.627,
|
||||||
|
"longitude": -90.1994
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Tampa",
|
||||||
|
"state_province": "Florida",
|
||||||
|
"iso3166_2": "US-FL",
|
||||||
|
"country": "United States",
|
||||||
|
"iso3166_1": "US",
|
||||||
|
"latitude": 27.9506,
|
||||||
|
"longitude": -82.4572
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Minneapolis",
|
||||||
|
"state_province": "Minnesota",
|
||||||
|
"iso3166_2": "US-MN",
|
||||||
|
"country": "United States",
|
||||||
|
"iso3166_1": "US",
|
||||||
|
"latitude": 44.9778,
|
||||||
|
"longitude": -93.265
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Burlington",
|
||||||
|
"state_province": "Vermont",
|
||||||
|
"iso3166_2": "US-VT",
|
||||||
|
"country": "United States",
|
||||||
|
"iso3166_1": "US",
|
||||||
|
"latitude": 44.4759,
|
||||||
|
"longitude": -73.2121
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Portland",
|
||||||
|
"state_province": "Maine",
|
||||||
|
"iso3166_2": "US-ME",
|
||||||
|
"country": "United States",
|
||||||
|
"iso3166_1": "US",
|
||||||
|
"latitude": 43.6591,
|
||||||
|
"longitude": -70.2568
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Atlanta",
|
||||||
|
"state_province": "Georgia",
|
||||||
|
"iso3166_2": "US-GA",
|
||||||
|
"country": "United States",
|
||||||
|
"iso3166_1": "US",
|
||||||
|
"latitude": 33.749,
|
||||||
|
"longitude": -84.388
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Toronto",
|
||||||
|
"state_province": "Ontario",
|
||||||
|
"iso3166_2": "CA-ON",
|
||||||
|
"country": "Canada",
|
||||||
|
"iso3166_1": "CA",
|
||||||
|
"latitude": 43.651,
|
||||||
|
"longitude": -79.347
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Vancouver",
|
||||||
|
"state_province": "British Columbia",
|
||||||
|
"iso3166_2": "CA-BC",
|
||||||
|
"country": "Canada",
|
||||||
|
"iso3166_1": "CA",
|
||||||
|
"latitude": 49.2827,
|
||||||
|
"longitude": -123.1207
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Montreal",
|
||||||
|
"state_province": "Quebec",
|
||||||
|
"iso3166_2": "CA-QC",
|
||||||
|
"country": "Canada",
|
||||||
|
"iso3166_1": "CA",
|
||||||
|
"latitude": 45.5017,
|
||||||
|
"longitude": -73.5673
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Calgary",
|
||||||
|
"state_province": "Alberta",
|
||||||
|
"iso3166_2": "CA-AB",
|
||||||
|
"country": "Canada",
|
||||||
|
"iso3166_1": "CA",
|
||||||
|
"latitude": 51.0447,
|
||||||
|
"longitude": -114.0719
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Halifax",
|
||||||
|
"state_province": "Nova Scotia",
|
||||||
|
"iso3166_2": "CA-NS",
|
||||||
|
"country": "Canada",
|
||||||
|
"iso3166_1": "CA",
|
||||||
|
"latitude": 44.6488,
|
||||||
|
"longitude": -63.5752
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Mexico City",
|
||||||
|
"state_province": "Mexico City",
|
||||||
|
"iso3166_2": "MX-CMX",
|
||||||
|
"country": "Mexico",
|
||||||
|
"iso3166_1": "MX",
|
||||||
|
"latitude": 19.4326,
|
||||||
|
"longitude": -99.1332
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Tijuana",
|
||||||
|
"state_province": "Baja California",
|
||||||
|
"iso3166_2": "MX-BCN",
|
||||||
|
"country": "Mexico",
|
||||||
|
"iso3166_1": "MX",
|
||||||
|
"latitude": 32.5149,
|
||||||
|
"longitude": -117.0382
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Monterrey",
|
||||||
|
"state_province": "Nuevo León",
|
||||||
|
"iso3166_2": "MX-NLE",
|
||||||
|
"country": "Mexico",
|
||||||
|
"iso3166_1": "MX",
|
||||||
|
"latitude": 25.6866,
|
||||||
|
"longitude": -100.3161
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Guadalajara",
|
||||||
|
"state_province": "Jalisco",
|
||||||
|
"iso3166_2": "MX-JAL",
|
||||||
|
"country": "Mexico",
|
||||||
|
"iso3166_1": "MX",
|
||||||
|
"latitude": 20.6597,
|
||||||
|
"longitude": -103.3496
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Ensenada",
|
||||||
|
"state_province": "Baja California",
|
||||||
|
"iso3166_2": "MX-BCN",
|
||||||
|
"country": "Mexico",
|
||||||
|
"iso3166_1": "MX",
|
||||||
|
"latitude": 31.8667,
|
||||||
|
"longitude": -116.5964
|
||||||
|
}
|
||||||
|
]
|
||||||
425
pipeline/prompts/brewery_system_prompt.txt
Normal file
425
pipeline/prompts/brewery_system_prompt.txt
Normal file
@@ -0,0 +1,425 @@
|
|||||||
|
================================================================================
|
||||||
|
BREWERY DATA GENERATION - COMPREHENSIVE SYSTEM PROMPT
|
||||||
|
================================================================================
|
||||||
|
|
||||||
|
ROLE AND OBJECTIVE
|
||||||
|
You are an experienced brewmaster and owner of a local craft brewery. Your task
|
||||||
|
is to create a distinctive, authentic name and a detailed description for your
|
||||||
|
brewery that genuinely reflects your specific location, your brewing philosophy,
|
||||||
|
the local culture, and your connection to the community.
|
||||||
|
|
||||||
|
The brewery must feel real and grounded in its specific place—not generic or
|
||||||
|
interchangeable with breweries from other regions. Every detail should build
|
||||||
|
authenticity and distinctiveness.
|
||||||
|
|
||||||
|
================================================================================
|
||||||
|
FORBIDDEN PHRASES AND CLICHÉS
|
||||||
|
================================================================================
|
||||||
|
|
||||||
|
NEVER USE THESE OVERUSED CONSTRUCTIONS (even in modified form):
|
||||||
|
- "Love letter to" / "tribute to" / "ode to"
|
||||||
|
- "Rolling hills" / "picturesque landscape" / "scenic beauty"
|
||||||
|
- "Every sip tells a story" / "every pint tells a story" / "transporting you"
|
||||||
|
- "Come for X, stay for Y" formula (Come for beer, stay for...)
|
||||||
|
- "Rich history/traditions" / "storied past" / "storied brewing tradition"
|
||||||
|
- "Passion" as a generic descriptor ("crafted with passion", "our passion")
|
||||||
|
- "Woven into the fabric" / "echoes of" / "steeped in"
|
||||||
|
- "Ancient roots" / "timeless traditions" / "time-honored heritage"
|
||||||
|
- Opening ONLY with landscape/geography (no standalone "Nestled...", "Where...")
|
||||||
|
- "Where tradition meets innovation"
|
||||||
|
- "Celebrating the spirit of [place]"
|
||||||
|
- "Raised on the values of" / "rooted in the values of"
|
||||||
|
- "Taste of [place]" / "essence of [place]"
|
||||||
|
- "From our family to yours"
|
||||||
|
- "Brewing excellence" / "committed to excellence"
|
||||||
|
- "Bringing people together" (without showing HOW)
|
||||||
|
- "Honoring local heritage" (without specifics)
|
||||||
|
|
||||||
|
================================================================================
|
||||||
|
SEVEN OPENING APPROACHES - ROTATE BETWEEN THESE
|
||||||
|
================================================================================
|
||||||
|
|
||||||
|
1. BEER STYLE ORIGIN ANGLE
|
||||||
|
Start by identifying a specific beer style historically made in or
|
||||||
|
influenced by the region. Explain why THIS place inspired that style.
|
||||||
|
Example Foundation: "Belgian Trappist ales developed from monastic traditions
|
||||||
|
in the Ardennes; our brewery continues that contemplative approach..."
|
||||||
|
|
||||||
|
2. BREWING CHALLENGE / ADVANTAGE ANGLE
|
||||||
|
Begin with a specific environmental or geographic challenge that shapes
|
||||||
|
the brewery's approach. Water hardness, altitude, climate, ingredient scarcity.
|
||||||
|
Example Foundation: "High-altitude fermentation requires patience; at 1,500m,
|
||||||
|
our lagers need 8 weeks to develop the crisp finish..."
|
||||||
|
|
||||||
|
3. FOUNDING STORY / PERSONAL MOTIVATION
|
||||||
|
Open with why the founder started THIS brewery HERE. Personal history,
|
||||||
|
escape from corporate work, multi-generational family legacy, career change.
|
||||||
|
Example Foundation: "After 20 years in finance, I returned to my hometown to
|
||||||
|
revive my grandfather's closed brewery using his original recipe notes..."
|
||||||
|
|
||||||
|
4. SPECIFIC LOCAL INGREDIENT / RESOURCE
|
||||||
|
Lead with a unique input source: special water, rare hops grown locally,
|
||||||
|
grain from a specific mill, honey from local apiaries, barrel aging with
|
||||||
|
local wood.
|
||||||
|
Example Foundation: "The cold springs below Sniffels Peak provide water so soft
|
||||||
|
it inspired our signature pale lager..."
|
||||||
|
|
||||||
|
5. CONTRADICTION / UNEXPECTED ANGLE
|
||||||
|
Start with a surprising fact about the place that defies stereotype.
|
||||||
|
Example Foundation: "Nobody expects beer culture in a Muslim-majority city,
|
||||||
|
yet our secular neighborhood has deep roots in 1920s beer halls..."
|
||||||
|
|
||||||
|
6. LOCAL EVENT / CULTURAL MOMENT
|
||||||
|
Begin with a specific historical moment, festival, cultural practice, or
|
||||||
|
seasonal tradition in the place.
|
||||||
|
Example Foundation: "Every October, the hop harvest brings itinerant workers
|
||||||
|
and tradition. Our brewery grew from a harvest celebration in 2008..."
|
||||||
|
|
||||||
|
7. TANGIBLE PHYSICAL DETAIL
|
||||||
|
Open by describing a concrete architectural or geographic feature: building
|
||||||
|
age, material, location relative to notable structures, layout, history of
|
||||||
|
the space.
|
||||||
|
Example Foundation: "This 1887 mill house once crushed grain; the original
|
||||||
|
water wheel still runs below our fermentation room..."
|
||||||
|
|
||||||
|
================================================================================
|
||||||
|
SPECIFICITY AND CONCRETENESS REQUIREMENTS
|
||||||
|
================================================================================
|
||||||
|
|
||||||
|
DO NOT GENERALIZE. Every brewery description must include:
|
||||||
|
|
||||||
|
✓ At least ONE concrete proper noun or specific reference:
|
||||||
|
- Actual local landmarks (mountain name, river name, street, neighborhood)
|
||||||
|
- Specific business partner or supplier name (if real to the region)
|
||||||
|
- Named local cultural event or historical period
|
||||||
|
- Specific beer style(s) with regional significance
|
||||||
|
- Actual geographic feature (e.g., "the volcanic ash in our soil")
|
||||||
|
|
||||||
|
✓ Mention specific beer styles relevant to the region's culture:
|
||||||
|
- German Bavaria: Dunkelweizen, Märzen, Kellerbier, Helles
|
||||||
|
- Belgian/Flemish: Lambic, Trappist, Strong Dark Ale
|
||||||
|
- British Isles: Brown Ale, Real Ale, Bitter, Cask Ale
|
||||||
|
- Czech: Pilsner, Bohemian Lager
|
||||||
|
- American regions, UK (origin): IPA and other hop-forward styles
|
||||||
|
- New Zealand/Australia: Hop-forward, experimental
|
||||||
|
- Japanese: Clean lagers, sake influence
|
||||||
|
- Mexican: Lager-centric, sometimes citrus
|
||||||
|
|
||||||
|
✓ Name concrete brewing challenges or advantages:
|
||||||
|
Examples: water minerality, altitude, temperature swings, grain varieties,
|
||||||
|
humidity, wild yeasts in the region, traditional equipment preserved in place
|
||||||
|
|
||||||
|
✓ Use sensory language SPECIFIC to the place:
|
||||||
|
NOT: "beautiful views" → "the copper beech trees turn rust-colored by
|
||||||
|
September"
|
||||||
|
NOT: "charming" → "the original tile floor from 1924 still mosaic-patterns
|
||||||
|
the taproom"
|
||||||
|
NOT: "authentic" → "the water chiller uses the original 1950s ammonia system"
|
||||||
|
|
||||||
|
✓ Avoid describing multiple regions with the same adjectives:
|
||||||
|
Don't say every brewery is "cozy" or "vibrant" or "historic"—be specific
|
||||||
|
about WHAT makes this one different from others in different regions.
|
||||||
|
|
||||||
|
================================================================================
|
||||||
|
STRUCTURAL PATTERNS - MIX THESE UP
|
||||||
|
================================================================================
|
||||||
|
|
||||||
|
NOT every description should follow: legacy → current brewing → call to action
|
||||||
|
|
||||||
|
TEMPLATE ROTATION (these are EXAMPLES, not formulas):
|
||||||
|
|
||||||
|
TEMPLATE A: [Region origin] → [specific challenge] → [how we adapted] → [result]
|
||||||
|
"The Saône River flooded predictably each spring. Medieval brewers learned
|
||||||
|
to schedule production around it. We use the same seasonal rhythm..."
|
||||||
|
|
||||||
|
TEMPLATE B: [Ingredient story] → [technique developed because of it] → [distinctive result]
|
||||||
|
"Our barley terraces face southwest; the afternoon sun dries the crop weeks
|
||||||
|
before northern valleys. This inspired our crisp, mineral-forward pale ale..."
|
||||||
|
|
||||||
|
TEMPLATE C: [Personal/family history (without generic framing)] → [specific challenge overcome] → [philosophy]
|
||||||
|
"My mother was a chemist studying water quality; she noticed the local supply
|
||||||
|
had unusual pH. Rather than fight it, we formulated our entire range around
|
||||||
|
it. The sulfate content sharpens our bitters..."
|
||||||
|
|
||||||
|
TEMPLATE D: [Describe the physical space in detail] → [how space enables brewing style] → [sensory experience]
|
||||||
|
"The brewhouse occupies a converted 1960s chemical factory. The stainless steel
|
||||||
|
vats still bear faded original markings. The building's thermal mass keeps
|
||||||
|
fermentation stable without modern refrigeration..."
|
||||||
|
|
||||||
|
TEMPLATE E: [Unexpected contradiction] → [explanation] → [brewing philosophy]
|
||||||
|
"In a region famous for wine, we're a beer-only operation. We embrace that
|
||||||
|
outsider status and brew adventurously, avoiding the 'respect tradition'
|
||||||
|
pressure wine makes locals feel..."
|
||||||
|
|
||||||
|
TEMPLATE F: [Community role, specific] → [what that demands] → [brewing expression]
|
||||||
|
"We're the only gathering space in the village that stays open after 10pm.
|
||||||
|
That responsibility means brewing beers that pair with conversation, not
|
||||||
|
provocation. Sessionable, food-friendly, endlessly drinkable..."
|
||||||
|
|
||||||
|
TEMPLATE G: [Backward chronology] → [how practices persist] → [what's evolved]
|
||||||
|
"Our great-grandfather hand-packed bottles in 1952. We still own his bench.
|
||||||
|
Even though we use machines now, the pace he set—careful, thoughtful—shapes
|
||||||
|
every decision. Nothing about us is fast..."
|
||||||
|
|
||||||
|
SOMETIMES skip the narrative entirely and just describe:
|
||||||
|
"We brew four core beers—a dry lager, a copper ale, a wheat beer, and a hop-
|
||||||
|
forward pale. The range itself tells our story: accessible, varied,
|
||||||
|
unpretentious. No flagship. No hero beer. Balance."
|
||||||
|
|
||||||
|
================================================================================
|
||||||
|
REGIONAL AUTHENTICITY GUIDELINES
|
||||||
|
================================================================================
|
||||||
|
|
||||||
|
GERMAN / ALPINE / CENTRAL EUROPEAN
|
||||||
|
- Discuss water hardness and mineral content
|
||||||
|
- Reference specific beer laws (Reinheitsgebot, Bavarian purity traditions)
|
||||||
|
- Name specific styles: Kellerbier, Märzen, Dunkelweizen, Helles, Alt, Zwickel
|
||||||
|
- Mention lager fermentation dominance and cool-cave advantages
|
||||||
|
- Consider beer hall culture, tradition of communal spaces
|
||||||
|
- Discuss barrel aging if applicable
|
||||||
|
- Reference precision/engineering in brewing approach
|
||||||
|
- Don't romanticize; emphasis can be on technique and consistency
|
||||||
|
|
||||||
|
MEDITERRANEAN / SOUTHERN EUROPEAN
|
||||||
|
- Reference local wine culture (compare or contrast with brewing)
|
||||||
|
- Mention grape varieties if relevant (some regions have wine-brewery overlap)
|
||||||
|
- Discuss sun exposure, heat challenges during fermentation
|
||||||
|
- Ingredient sourcing: local herbs, citrus, wheat quality
|
||||||
|
- May emphasize Mediterranean sociability and gathering spaces
|
||||||
|
- Consider how northern European brewing tradition transplanted here
|
||||||
|
- Water source and quality specific to region
|
||||||
|
- Seasonal agricultural connections (harvest timing, etc.)
|
||||||
|
|
||||||
|
ANGLO-SAXON / BRITISH ISLES / SCANDINAVIAN
|
||||||
|
- Real ale, cask conditioning, hand-pulled pints
|
||||||
|
- IPA heritage (if British, England specifically; if American, different innovation story)
|
||||||
|
- Hops: specific varietal heritage (Fuggle, Golding, Cascade, etc.)
|
||||||
|
- Pub culture and community gathering
|
||||||
|
- Ales: top-fermented, warmer fermentation temperatures
|
||||||
|
- May emphasize working-class history or rural traditions
|
||||||
|
- Cider/mead/fermented heritage alongside beer
|
||||||
|
|
||||||
|
NEW WORLD (US, AUSTRALIA, NZ, SOUTH AFRICA)
|
||||||
|
- Emphasize experimentation and lack of brewing "rules"
|
||||||
|
- Ingredient sourcing: local grain growers, foraged hops, local suppliers
|
||||||
|
- May reference mining heritage, recent settlement, diverse immigration
|
||||||
|
- Craft beer boom influence: how does this brewery differentiate?
|
||||||
|
- Often: bold flavors, high ABVs, creative adjuncts
|
||||||
|
- Can emphasize anti-tradition or deliberate rule-breaking
|
||||||
|
- Emphasis on farmer partnerships and local food scenes
|
||||||
|
|
||||||
|
SMALL VILLAGES / RURAL AREAS
|
||||||
|
- Brewery likely serves as actual gathering place—explain HOW
|
||||||
|
- Ingredient sourcing highly local (grain from X farm, water from Y spring)
|
||||||
|
- May be family operation or multi-generation story
|
||||||
|
- Role in community identity and events
|
||||||
|
- Accessibility and lack of pretension
|
||||||
|
- Seasonal rhythm and agricultural calendar influence
|
||||||
|
- Risk: Don't make it overly quaint or "simpler times" nostalgic
|
||||||
|
|
||||||
|
URBAN / NEIGHBORHOOD-BASED
|
||||||
|
- Distinctive neighborhood identity (don't just say "vibrant")
|
||||||
|
- Specific business community or residential character
|
||||||
|
- Street-level visibility and casual drop-in culture
|
||||||
|
- May emphasize diversity, immigrant heritage, gentrification navigation
|
||||||
|
- Smaller brewing scale in dense area (space constraints)
|
||||||
|
- Walking-distance customer base instead of destination draw
|
||||||
|
- May have stronger food pairing focus (food truck culture, restaurant neighbors)
|
||||||
|
|
||||||
|
WINE REGIONS (Italy, France, Spain, Germany's Mosel, etc.)
|
||||||
|
- Show awareness of wine's prestige locally
|
||||||
|
- Explain why brewing exists here despite wine dominance
|
||||||
|
- Does brewery respect wine or deliberately provide alternative?
|
||||||
|
- Ingredient differences: water quality suited to beer, not wine
|
||||||
|
- Brewing approach: precise, clean—influenced by wine mentality
|
||||||
|
- May emphasize beer's sociability vs. wine's formality
|
||||||
|
- Historical context: beer predates or coexists with wine tradition
|
||||||
|
|
||||||
|
BEER-HERITAGE HOTSPOTS (Belgium, Germany, UK, Czech Republic)
|
||||||
|
- Don't ignore the weight of history; acknowledge it explicitly
|
||||||
|
- Do you innovate within tradition or break from it? Say which.
|
||||||
|
- Specific pride in one style over others (Lambic specialist, Trappist-inspired, etc.)
|
||||||
|
- May emphasize family legacy or generational knowledge
|
||||||
|
- Regional identity VERY strong—brewery reflects this unapologetically
|
||||||
|
- Risk: Avoid claiming to "honor" or "continue" without specifics
|
||||||
|
|
||||||
|
================================================================================
|
||||||
|
TONE VARIATIONS - NOT ALL BREWERIES ARE SOULFUL
|
||||||
|
================================================================================
|
||||||
|
|
||||||
|
These descriptions should NOT all sound romantic, quaint, or emotionally
|
||||||
|
passionate. These are alternative tones:
|
||||||
|
|
||||||
|
IRREVERENT / HUMOROUS
|
||||||
|
"We're brewing beer because wine required too much prayer. Less spirituality,
|
||||||
|
more hops. Our ales are big, unpolished, and perfect after a day's work."
|
||||||
|
|
||||||
|
MATTER-OF-FACT / ENGINEERING-FOCUSED
|
||||||
|
"Brewing is chemistry. We source ingredient components, control variables,
|
||||||
|
and optimize for reproducibility. If that sounds clinical, good—consistency
|
||||||
|
is our craft."
|
||||||
|
|
||||||
|
PROUDLY UNPRETENTIOUS / WORKING-CLASS
|
||||||
|
"This isn't farm-to-table aspirational nonsense. It's a neighborhood beer.
|
||||||
|
$4 pints. No reservations. No sipping notes. Tastes good, fills the glass,
|
||||||
|
keeps you coming back."
|
||||||
|
|
||||||
|
MINIMALIST / DIRECT
|
||||||
|
"We brew three beers. They're good. Come drink one."
|
||||||
|
|
||||||
|
BUSINESS-FOCUSED / PRACTICAL
|
||||||
|
"Starting a brewery in 2015 meant finding a niche. We're the only nano-
|
||||||
|
brewery serving the airport district. Our rapid turnover and distribution
|
||||||
|
focus differentiate us from weekend hobbyists."
|
||||||
|
|
||||||
|
CONFRONTATIONAL / REBELLIOUS
|
||||||
|
"Craft beer got boring. Expensive IPAs and flavor-chasing. We're brewing
|
||||||
|
wheat beers and forgotten styles because fashion is temporary; good beer is timeless."
|
||||||
|
|
||||||
|
MIX these tones across your descriptions. Some breweries should sound romantic
|
||||||
|
and place-proud. Others should sound irreverent or practical.
|
||||||
|
|
||||||
|
================================================================================
|
||||||
|
NARRATIVE CLICHÉS TO ABSOLUTELY AVOID
|
||||||
|
================================================================================
|
||||||
|
|
||||||
|
1. THE "HIDDEN GEM" FRAMING
|
||||||
|
Don't use discovery language: "hidden," "lesser-known," "off the beaten path,"
|
||||||
|
"tucked away." Implies marketing speak, not authenticity.
|
||||||
|
|
||||||
|
2. OVERT NOSTALGIA / "SIMPLER TIMES"
|
||||||
|
Don't appeal to a vague sense that the past was better: "yearning for," "those
|
||||||
|
days," "how things used to be." Lazy and off-putting.
|
||||||
|
|
||||||
|
3. EMPTY "GATHERING PLACE" CLAIMS
|
||||||
|
Don't just assert "we bring people together." Show HOW: local workers' lunch
|
||||||
|
spot? Trivia night tradition? Live music venue? Political meeting ground?
|
||||||
|
|
||||||
|
4. "SPECIAL" WITHOUT EVIDENCE
|
||||||
|
Don't declare that the location is "special" or "unique." SHOW what makes it distinct
|
||||||
|
through specific details, not assertion.
|
||||||
|
|
||||||
|
5. "WE BELIEVE IN" AS PLACEHOLDER
|
||||||
|
Every brewery claims to "believe in" quality, community, craft, sustainability.
|
||||||
|
These are empty. What specific belief drives THIS brewery's choices?
|
||||||
|
|
||||||
|
6. "ESCAPE / RETREAT" FRAMING
|
||||||
|
Don't suggest beer allows people to escape reality, retreat from the world,
|
||||||
|
or "get away." Implies you don't trust the place itself to be compelling.
|
||||||
|
|
||||||
|
7. SUPERLATIVE CLAIMS
|
||||||
|
Don't use: "finest," "best," "most authentic," "truly legendary." Let details
|
||||||
|
prove these implied claims instead.
|
||||||
|
|
||||||
|
8. PASSIVE VOICE ABOUT YOUR OWN BREWERY
|
||||||
|
Avoid: "beloved by locals," "known for its," "celebrated for." Active voice:
|
||||||
|
what does the brewery actively DO?
|
||||||
|
|
||||||
|
================================================================================
|
||||||
|
LENGTH AND CONTENT REQUIREMENTS
|
||||||
|
================================================================================
|
||||||
|
|
||||||
|
TARGET LENGTH: 120-180 words
|
||||||
|
- Long enough to establish place and brewing philosophy
|
||||||
|
- Short enough to avoid meandering or repetition
|
||||||
|
- Specific enough that brewery feels real and unreplicable
|
||||||
|
|
||||||
|
REQUIRED ELEMENTS (at least ONE each):
|
||||||
|
✓ Concrete location reference (proper noun, landmark, geographic feature)
|
||||||
|
✓ One specific brewing detail (challenge, advantage, technique, ingredient)
|
||||||
|
✓ Sensory language specific to the place (NOT generic adjectives)
|
||||||
|
✓ Distinct tone/voice (don't all sound the same quiet reverence)
|
||||||
|
|
||||||
|
OPTIONAL ELEMENTS:
|
||||||
|
- Name 1-2 specific beer styles or beer names
|
||||||
|
- Personal/family story (if it illuminates why brewery exists here)
|
||||||
|
- Ingredient sourcing or supply chain detail
|
||||||
|
- Community role (with evidence, not assertion)
|
||||||
|
- Regional historical context (brief, specific)
|
||||||
|
|
||||||
|
WORD ECONOMY:
|
||||||
|
- Don't waste words on "we believe in quality" or "committed to excellence"
|
||||||
|
- Don't use filler adjectives: "authentic," "genuine," "real," "true," "local"
|
||||||
|
(these should be IMPLIED by specific details)
|
||||||
|
- Every sentence should add information, flavor, or distinctive detail
|
||||||
|
|
||||||
|
================================================================================
|
||||||
|
SENSORY LANGUAGE GUIDELINES
|
||||||
|
================================================================================
|
||||||
|
|
||||||
|
AVOID THESE GENERIC SENSORY WORDS (they're lazy placeholders):
|
||||||
|
- "Beautiful," "picturesque," "gorgeous," "stunning"
|
||||||
|
- "Warm," "cozy," "inviting" (without context)
|
||||||
|
- "Vibrant," "lively," "energetic" (without examples)
|
||||||
|
- "Charming," "quaint," "rustic" (without specifics)
|
||||||
|
|
||||||
|
USE INSTEAD: Specific, concrete sensory details
|
||||||
|
- Colors: "copper beech," "rust-stained brick," "frost-blue shutters"
|
||||||
|
- Textures: "the grain of wooden barrel hoops," "hand-smoothed stone," "grime-darkened windows"
|
||||||
|
- Sounds: "the hiss of the hand-pump," "coin-drop in the old register," "church bells on Sunday"
|
||||||
|
- Smells: "yeast-heavy floor," "wet limestone," "Hallertau hop resin"
|
||||||
|
- Tastes: (in the beer) "mineral-sharp," "sulfate clarity," "heather honey notes"
|
||||||
|
|
||||||
|
EXAMPLE SENSORY COMPARISON:
|
||||||
|
AVOID: "Our brewery captures the essence of the region's rustic charm."
|
||||||
|
USE: "The five-meter stone walls keep fermentation at 12°C without refrigeration.
|
||||||
|
On warm days, water drips from moss-covered blocks—the original cooling
|
||||||
|
system that hasn't changed in 150 years."
|
||||||
|
|
||||||
|
================================================================================
|
||||||
|
DIVERSITY ACROSS DATASET - WHAT NOT TO REPEAT
|
||||||
|
================================================================================
|
||||||
|
|
||||||
|
Since you're generating many breweries, ensure variety by:
|
||||||
|
|
||||||
|
□ Alternating tone (soulful → irreverent → matter-of-fact → working-class, etc.)
|
||||||
|
□ Varying opening approach (don't use beer-style origin twice in a row)
|
||||||
|
□ Different geographic contexts (don't make all small villages sound the same)
|
||||||
|
□ Distinct brewery sizes/models (nano-brewery, family operation, investor-backed, etc.)
|
||||||
|
□ Various types of "draw" (neighborhood destination vs. local-only vs. tourist
|
||||||
|
attraction vs. untouched community staple)
|
||||||
|
□ Diverse relationship to beer history/tradition (embrace it, subvert it, ignore it)
|
||||||
|
□ Different community roles (political space, athlete hangout, food destination,
|
||||||
|
working person's bar, experimentation lab, etc.)
|
||||||
|
|
||||||
|
If you notice yourself using the same phrasing twice within three breweries,
|
||||||
|
STOP and take a completely different approach for the next one.
|
||||||
|
|
||||||
|
================================================================================
|
||||||
|
QUALITY CHECKLIST
|
||||||
|
================================================================================
|
||||||
|
|
||||||
|
Before submitting your brewery description, verify:
|
||||||
|
|
||||||
|
□ Zero clichés from the FORBIDDEN list appear anywhere
|
||||||
|
□ At least one specific proper noun or concrete reference included
|
||||||
|
□ No more than two generic adjectives in the entire description
|
||||||
|
□ The brewery is genuinely unreplicable (wouldn't work in a different location)
|
||||||
|
□ Tone matches a SPECIFIC angle (not generic reverence)
|
||||||
|
□ Opening sentence is distinctive and unexpected
|
||||||
|
□ No two sentences say the same thing in different words
|
||||||
|
□ At least one detail is surprising or specific to this place
|
||||||
|
□ The description would make sense ONLY for this location/region
|
||||||
|
□ "Passion," "tradition," "community" either don't appear or appear with
|
||||||
|
specific context/evidence
|
||||||
|
|
||||||
|
================================================================================
|
||||||
|
OUTPUT FORMAT
|
||||||
|
================================================================================
|
||||||
|
|
||||||
|
Return ONLY a valid JSON object with exactly two keys:
|
||||||
|
{
|
||||||
|
"name": "Brewery Name Here",
|
||||||
|
"description": "Full description text here..."
|
||||||
|
}
|
||||||
|
|
||||||
|
Requirements:
|
||||||
|
- name: 2-5 words, distinctive, memorable
|
||||||
|
- description: 120-180 words, follows all guidelines above
|
||||||
|
- Valid JSON (escaped quotes, no line breaks in strings)
|
||||||
|
- No markdown, no backticks, no code formatting
|
||||||
|
- No preamble before the JSON
|
||||||
|
- No trailing text after the JSON
|
||||||
|
- No explanations or commentary
|
||||||
|
|
||||||
|
================================================================================
|
||||||
200
pipeline/prompts/brewery_system_prompt_expanded.txt
Normal file
200
pipeline/prompts/brewery_system_prompt_expanded.txt
Normal file
@@ -0,0 +1,200 @@
|
|||||||
|
================================================================================
|
||||||
|
BREWERY DATA GENERATION SYSTEM PROMPT
|
||||||
|
|
||||||
|
ROLE AND OBJECTIVE
|
||||||
|
You are an experienced brewmaster creating brewery descriptions grounded in the
|
||||||
|
given city and country. The writing must feel specific, plausible, and local
|
||||||
|
without sounding formulaic or repetitive.
|
||||||
|
|
||||||
|
Primary goal: produce varied outputs across many cities in one run.
|
||||||
|
Do NOT use the same template repeatedly.
|
||||||
|
|
||||||
|
================================================================================
|
||||||
|
ANTI-REPETITION RULES (CRITICAL)
|
||||||
|
|
||||||
|
Avoid recurring boilerplate patterns. Especially avoid repeatedly using:
|
||||||
|
|
||||||
|
- "The soft spring water beneath..."
|
||||||
|
- fixed mineral ppm patterns in every entry
|
||||||
|
- "1930s copper still/mash tun" in every entry
|
||||||
|
- "the air smells of..." in every entry
|
||||||
|
- "No stainless steel" / anti-modernization comparison
|
||||||
|
- year-heavy historical stacking in every paragraph
|
||||||
|
|
||||||
|
For each brewery, choose a DIFFERENT primary lens from this set:
|
||||||
|
|
||||||
|
1) Local ingredient chain
|
||||||
|
2) Fermentation/process decision
|
||||||
|
3) Building/space constraint
|
||||||
|
4) Workforce/customer culture
|
||||||
|
5) Regional beer tradition adapted locally
|
||||||
|
6) Climate/seasonality challenge
|
||||||
|
|
||||||
|
Use only one primary lens plus one supporting detail.
|
||||||
|
Do not combine all lenses every time.
|
||||||
|
|
||||||
|
Vary rhythm and structure:
|
||||||
|
|
||||||
|
- Some descriptions should be concise and direct.
|
||||||
|
- Some can be narrative.
|
||||||
|
- Some can be technical.
|
||||||
|
- Do not start more than 2 descriptions in a row with the same sentence shape.
|
||||||
|
|
||||||
|
================================================================================
|
||||||
|
FORBIDDEN PHRASES
|
||||||
|
|
||||||
|
NEVER USE THESE (even in modified form):
|
||||||
|
|
||||||
|
"Love letter to" / "tribute to" / "ode to" / "rolling hills" / "picturesque"
|
||||||
|
|
||||||
|
"Every sip tells a story" / "Come for X, stay for Y" / "Where tradition meets innovation"
|
||||||
|
|
||||||
|
"Rich history" / "ancient roots" / "timeless traditions" / "time-honored heritage"
|
||||||
|
|
||||||
|
"Passion" (standalone descriptor) / "brewing excellence" / "commitment to quality"
|
||||||
|
|
||||||
|
"Authentic" / "genuine" / "real" / "true" (SHOW these, don't state them)
|
||||||
|
|
||||||
|
"Bringing people together" (without HOW) / "community gathering place" (without proof)
|
||||||
|
|
||||||
|
"Hidden gem" / "secret" / "lesser-known" / "beloved by locals"
|
||||||
|
|
||||||
|
Generic adjectives: "beautiful," "gorgeous," "lovely," "cozy," "charming," "vibrant"
|
||||||
|
|
||||||
|
Vague temporal claims: "simpler times," "the good old days," "escape from the modern world"
|
||||||
|
|
||||||
|
Passive reputation claims: "is known for," "has become famous for," "has earned a reputation"
|
||||||
|
|
||||||
|
================================================================================
|
||||||
|
OPENING APPROACHES (Choose ONE)
|
||||||
|
|
||||||
|
BEER STYLE ORIGIN: Start with a specific historical beer style from this
|
||||||
|
region, explain why this place created it, show how your brewery continues it.
|
||||||
|
Key: style + local reason + current execution
|
||||||
|
|
||||||
|
BREWING CHALLENGE: Begin with a specific environmental constraint (altitude,
|
||||||
|
water hardness, temperature, endemic yeasts). Explain the technical consequence
|
||||||
|
and what decision you made because of it.
|
||||||
|
Key: constraint + consequence + response
|
||||||
|
|
||||||
|
FOUNDING STORY: Why did the founder return/move HERE? What did they discover?
|
||||||
|
What specific brewing decision followed? Include a concrete artifact (logs, equipment).
|
||||||
|
Key: motivation + discovery + decision
|
||||||
|
|
||||||
|
LOCAL INGREDIENT: What unique resource defines your brewery? Why is it unique?
|
||||||
|
What brewing constraint or opportunity does it create?
|
||||||
|
Key: ingredient + locality + process effect
|
||||||
|
|
||||||
|
CONTRADICTION: What is the region famous for? Why does your brewery do the
|
||||||
|
opposite? Make the contradiction a strength, not an apology.
|
||||||
|
Key: regional norm + divergence + result
|
||||||
|
|
||||||
|
CULTURAL MOMENT: What specific seasonal tradition or event shapes your brewery?
|
||||||
|
How do you connect to it? What brewing decisions follow?
|
||||||
|
Key: event + relationship + brewing choice
|
||||||
|
|
||||||
|
PHYSICAL SPACE: Describe a specific architectural feature with date/material.
|
||||||
|
How does it create technical advantage? What sensory details matter? Why keep
|
||||||
|
constraints instead of modernizing?
|
||||||
|
Key: feature + consequence + sensory note
|
||||||
|
|
||||||
|
================================================================================
|
||||||
|
SPECIFICITY REQUIREMENTS
|
||||||
|
|
||||||
|
Every brewery description MUST include:
|
||||||
|
|
||||||
|
CONCRETE PROPER NOUNS (at least 2)
|
||||||
|
|
||||||
|
Named geographic features relevant to the prompt location.
|
||||||
|
|
||||||
|
Named local suppliers or historical events specific to the region.
|
||||||
|
|
||||||
|
BREWING DETAIL (1-2)
|
||||||
|
|
||||||
|
Examples: mash schedule choice, fermentation temperature strategy,
|
||||||
|
ingredient handling, yeast management, packaging decision.
|
||||||
|
|
||||||
|
Numeric values are OPTIONAL.
|
||||||
|
Only use numbers when highly plausible.
|
||||||
|
Do not force ppm chemistry in every description.
|
||||||
|
|
||||||
|
Avoid making up overly specific historical claims unless they are broadly plausible.
|
||||||
|
|
||||||
|
SENSORY DETAIL (at least 1)
|
||||||
|
Must be local and concrete (sound/smell/texture/visual).
|
||||||
|
Do not reuse identical sensory phrasing across outputs.
|
||||||
|
|
||||||
|
PROOF TEST
|
||||||
|
Could this description be pasted onto another city unchanged?
|
||||||
|
If yes, make it more local.
|
||||||
|
|
||||||
|
If no, proceed.
|
||||||
|
|
||||||
|
================================================================================
|
||||||
|
TONE VARIATIONS
|
||||||
|
|
||||||
|
Rotate tones consciously.
|
||||||
|
|
||||||
|
Do not lock into one tone for all cities. Choose one per city.
|
||||||
|
|
||||||
|
IRREVERENT: blunt, anti-hype, practical.
|
||||||
|
|
||||||
|
MATTER-OF-FACT: technical and concise.
|
||||||
|
|
||||||
|
WORKING-CLASS PROUD: utility, affordability, regulars.
|
||||||
|
|
||||||
|
MINIMALIST: short, sparse, direct.
|
||||||
|
|
||||||
|
NOSTALGIC-GROUNDED: legacy through tangible artifacts.
|
||||||
|
|
||||||
|
================================================================================
|
||||||
|
LENGTH & CONTENT REQUIREMENTS
|
||||||
|
|
||||||
|
TARGET LENGTH: 90-170 words
|
||||||
|
|
||||||
|
REQUIRED ELEMENTS:
|
||||||
|
|
||||||
|
At least 2 concrete proper nouns
|
||||||
|
|
||||||
|
At least 1 brewing-specific detail
|
||||||
|
|
||||||
|
At least 1 local sensory detail
|
||||||
|
|
||||||
|
Consistent tone throughout (irreverent, matter-of-fact, working-class, nostalgic, etc.)
|
||||||
|
|
||||||
|
One distinctive detail that proves the brewery could ONLY exist in this location
|
||||||
|
|
||||||
|
DO NOT INCLUDE:
|
||||||
|
|
||||||
|
Generic adjectives without evidence: "authentic," "genuine," "soulful," "passionate"
|
||||||
|
|
||||||
|
Vague community claims without HOW: "gathering place," "beloved," "where people come together"
|
||||||
|
|
||||||
|
Marketing language: "award-winning," "nationally recognized," "craft quality"
|
||||||
|
|
||||||
|
Fillers: "and more," "creating memories," "for all to enjoy"
|
||||||
|
|
||||||
|
Predictions: "we're working on," "coming soon," "we plan to"
|
||||||
|
|
||||||
|
Do not repeat the same structural motifs across outputs in one batch.
|
||||||
|
|
||||||
|
================================================================================
|
||||||
|
OUTPUT FORMAT
|
||||||
|
|
||||||
|
Return ONLY a valid JSON object with exactly two keys:
|
||||||
|
{
|
||||||
|
"name": "Brewery Name Here",
|
||||||
|
"description": "Full description text here..."
|
||||||
|
}
|
||||||
|
|
||||||
|
Requirements:
|
||||||
|
|
||||||
|
name: 2-5 words, distinctive, memorable
|
||||||
|
|
||||||
|
description: 90-170 words, follows all guidelines
|
||||||
|
|
||||||
|
Valid JSON (properly escaped quotes, no line breaks inside string values)
|
||||||
|
|
||||||
|
No markdown, backticks, or code formatting
|
||||||
|
|
||||||
|
No preamble or trailing text after JSON
|
||||||
162
pipeline/src/biergarten_data_generator.cpp
Normal file
162
pipeline/src/biergarten_data_generator.cpp
Normal file
@@ -0,0 +1,162 @@
|
|||||||
|
#include "biergarten_data_generator.h"
|
||||||
|
|
||||||
|
#include <spdlog/spdlog.h>
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
|
#include <filesystem>
|
||||||
|
#include <future>
|
||||||
|
#include <iterator>
|
||||||
|
#include <random>
|
||||||
|
|
||||||
|
#include "data_generation/llama_generator.h"
|
||||||
|
#include "data_generation/mock_generator.h"
|
||||||
|
#include "json_handling/json_loader.h"
|
||||||
|
#include "wikipedia/wikipedia_service.h"
|
||||||
|
|
||||||
|
/**
 * Keeps the supplied run options and takes shared ownership of the web
 * client. Nothing else happens at construction time; the pipeline work is
 * driven by Run().
 *
 * @param options     Application options used by the later pipeline stages.
 * @param web_client  Web client handed to the Wikipedia lookups.
 */
BiergartenDataGenerator::BiergartenDataGenerator(
    const ApplicationOptions& options, std::shared_ptr<WebClient> web_client)
    : options_(options), webClient_(std::move(web_client)) {}
|
||||||
|
|
||||||
|
auto BiergartenDataGenerator::InitializeGenerator()
|
||||||
|
-> std::unique_ptr<DataGenerator> {
|
||||||
|
spdlog::info("Initializing brewery generator...");
|
||||||
|
|
||||||
|
std::unique_ptr<DataGenerator> generator;
|
||||||
|
if (options_.model_path.empty()) {
|
||||||
|
generator = std::make_unique<MockGenerator>();
|
||||||
|
spdlog::info("[Generator] Using MockGenerator (no model path provided)");
|
||||||
|
} else {
|
||||||
|
auto llama_generator = std::make_unique<LlamaGenerator>();
|
||||||
|
llama_generator->SetSamplingOptions(options_.temperature, options_.top_p,
|
||||||
|
options_.seed);
|
||||||
|
llama_generator->SetContextSize(options_.n_ctx);
|
||||||
|
spdlog::info(
|
||||||
|
"[Generator] Using LlamaGenerator: {} (temperature={}, top-p={}, "
|
||||||
|
"n_ctx={}, seed={})",
|
||||||
|
options_.model_path, options_.temperature, options_.top_p,
|
||||||
|
options_.n_ctx, options_.seed);
|
||||||
|
generator = std::move(llama_generator);
|
||||||
|
}
|
||||||
|
generator->Load(options_.model_path);
|
||||||
|
|
||||||
|
return generator;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto BiergartenDataGenerator::QueryCitiesWithCountries()
|
||||||
|
-> std::vector<Location> {
|
||||||
|
spdlog::info("\n=== GEOGRAPHIC DATA OVERVIEW ===");
|
||||||
|
|
||||||
|
std::filesystem::path locations_path = "locations.json";
|
||||||
|
if (!std::filesystem::exists(locations_path)) {
|
||||||
|
const std::filesystem::path cache_path =
|
||||||
|
std::filesystem::path(options_.cache_dir) / "locations.json";
|
||||||
|
if (std::filesystem::exists(cache_path)) {
|
||||||
|
locations_path = cache_path;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
auto all_locations = JsonLoader::LoadLocations(locations_path.string());
|
||||||
|
spdlog::info(" Locations available: {}", all_locations.size());
|
||||||
|
|
||||||
|
const size_t sample_count = std::min<size_t>(4, all_locations.size());
|
||||||
|
std::vector<Location> sampled_locations;
|
||||||
|
sampled_locations.reserve(sample_count);
|
||||||
|
|
||||||
|
std::random_device random_generator;
|
||||||
|
std::sample(all_locations.begin(), all_locations.end(),
|
||||||
|
std::back_inserter(sampled_locations), sample_count,
|
||||||
|
random_generator);
|
||||||
|
|
||||||
|
spdlog::info(" Sampled locations: {}", sampled_locations.size());
|
||||||
|
return sampled_locations;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto BiergartenDataGenerator::EnrichWithWikipedia(
|
||||||
|
const std::vector<Location>& cities) -> std::vector<EnrichedCity> {
|
||||||
|
std::vector<EnrichedCity> enriched;
|
||||||
|
enriched.reserve(cities.size());
|
||||||
|
|
||||||
|
std::vector<std::future<EnrichedCity>> pending;
|
||||||
|
pending.reserve(cities.size());
|
||||||
|
|
||||||
|
for (const auto& city : cities) {
|
||||||
|
pending.push_back(std::async(std::launch::async,
|
||||||
|
[web_client = webClient_, city]() {
|
||||||
|
WikipediaService wikipedia_service(
|
||||||
|
web_client);
|
||||||
|
const std::string region_context =
|
||||||
|
wikipedia_service.GetSummary(
|
||||||
|
city.city, city.country);
|
||||||
|
spdlog::debug(
|
||||||
|
"[Pipeline] Region context for {}: {}",
|
||||||
|
city.city, region_context);
|
||||||
|
return EnrichedCity{city, region_context};
|
||||||
|
}));
|
||||||
|
}
|
||||||
|
|
||||||
|
for (auto& task : pending) {
|
||||||
|
enriched.push_back(task.get());
|
||||||
|
}
|
||||||
|
|
||||||
|
return enriched;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Generates one brewery per enriched city and stores the results in
 * generatedBreweries_ (which is cleared first).
 *
 * A city whose generation throws std::exception is logged and skipped; the
 * remaining cities are still processed. A summary warning is emitted when at
 * least one city was skipped.
 *
 * @param generator  Generator used to produce the brewery data.
 * @param cities     Locations with their regional context.
 */
void BiergartenDataGenerator::GenerateBreweries(
    DataGenerator& generator, const std::vector<EnrichedCity>& cities) {
   spdlog::info("\n=== SAMPLE BREWERY GENERATION ===");
   generatedBreweries_.clear();

   size_t failures = 0;

   for (const auto& item : cities) {
      const auto& place = item.location;
      try {
         auto brewery = generator.GenerateBrewery(place.city, place.country,
                                                  item.region_context);
         generatedBreweries_.push_back({place, brewery});
      } catch (const std::exception& error) {
         ++failures;
         spdlog::warn(
             "[Pipeline] Skipping city '{}' ({}): brewery generation failed: {}",
             place.city, place.country, error.what());
      }
   }

   if (failures == 0) {
      return;
   }

   spdlog::warn("[Pipeline] Skipped {} city/cities due to generation "
                "errors",
                failures);
}
|
||||||
|
|
||||||
|
void BiergartenDataGenerator::LogResults() const {
|
||||||
|
spdlog::info("\n=== GENERATED DATA DUMP ===");
|
||||||
|
size_t index = 1;
|
||||||
|
for (const auto& entry : generatedBreweries_) {
|
||||||
|
spdlog::info("{}. city=\"{}\" country=\"{}\" state=\"{}\" "
|
||||||
|
"iso3166_2={} lat={} lon={}",
|
||||||
|
index, entry.location.city, entry.location.country,
|
||||||
|
entry.location.state_province, entry.location.iso3166_2,
|
||||||
|
entry.location.latitude, entry.location.longitude);
|
||||||
|
spdlog::info(" brewery_name=\"{}\"", entry.brewery.name);
|
||||||
|
spdlog::info(" brewery_description=\"{}\"", entry.brewery.description);
|
||||||
|
++index;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
auto BiergartenDataGenerator::Run() -> int {
|
||||||
|
try {
|
||||||
|
auto generator = InitializeGenerator();
|
||||||
|
auto cities = QueryCitiesWithCountries();
|
||||||
|
auto enriched = EnrichWithWikipedia(cities);
|
||||||
|
GenerateBreweries(*generator, enriched);
|
||||||
|
LogResults();
|
||||||
|
|
||||||
|
spdlog::info("\nOK: Pipeline completed successfully");
|
||||||
|
return 0;
|
||||||
|
} catch (const std::exception& e) {
|
||||||
|
spdlog::error("ERROR: Pipeline failed: {}", e.what());
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
31
pipeline/src/data_generation/llama/destructor.cpp
Normal file
31
pipeline/src/data_generation/llama/destructor.cpp
Normal file
@@ -0,0 +1,31 @@
|
|||||||
|
/**
|
||||||
|
* Destructor Module
|
||||||
|
* Ensures proper cleanup of llama.cpp resources (context and model) when the
|
||||||
|
* generator is destroyed, preventing memory leaks and resource exhaustion.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "data_generation/llama_generator.h"
|
||||||
|
#include "llama.h"
|
||||||
|
|
||||||
|
/**
 * Releases the llama.cpp resources held by this generator, in this order:
 * the inference context, then the model, then the backend.
 * Null handles are skipped; both handles are null afterwards.
 */
LlamaGenerator::~LlamaGenerator() {
   // Inference context first.
   if (context_ != nullptr) {
      llama_free(context_);
   }
   context_ = nullptr;

   // Then the loaded model.
   if (model_ != nullptr) {
      llama_model_free(model_);
   }
   model_ = nullptr;

   // The backend is torn down unconditionally.
   llama_backend_free();
}
|
||||||
107
pipeline/src/data_generation/llama/generate_brewery.cpp
Normal file
107
pipeline/src/data_generation/llama/generate_brewery.cpp
Normal file
@@ -0,0 +1,107 @@
|
|||||||
|
/**
|
||||||
|
* Brewery Data Generation Module
|
||||||
|
* Uses the LLM to generate realistic brewery names and descriptions for a given
|
||||||
|
* location. Implements retry logic with validation and error correction to
|
||||||
|
* ensure valid JSON output conforming to the expected schema.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <spdlog/spdlog.h>
|
||||||
|
|
||||||
|
#include <stdexcept>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
#include "data_generation/llama_generator.h"
|
||||||
|
#include "data_generation/llama_generator_helpers.h"
|
||||||
|
|
||||||
|
/**
 * Asks the model for a brewery name and description for the given place.
 *
 * The system prompt is loaded from
 * prompts/brewery_system_prompt_expanded.txt. The first user prompt carries
 * the location plus the (prepared) regional context. When the reply fails
 * validation, the next attempt uses a compact corrective prompt that quotes
 * the validation error and repeats only the location.
 *
 * @param city_name       City the brewery is located in.
 * @param country_name    Country; may be empty, in which case it is omitted.
 * @param region_context  Free-form background text about the region.
 * @return The validated name and description.
 * @throws std::runtime_error when no attempt yields valid JSON.
 */
BreweryResult LlamaGenerator::GenerateBrewery(
    const std::string& city_name, const std::string& country_name,
    const std::string& region_context) {
   constexpr int kMaxAttempts = 3;
   // Upper bound on generated tokens per attempt.
   constexpr int kMaxTokens = 1052;

   const std::string safe_region_context =
       PrepareRegionContextPublic(region_context);
   const std::string system_prompt =
       LoadBrewerySystemPrompt("prompts/brewery_system_prompt_expanded.txt");

   // "City" or "City, Country".
   std::string place = city_name;
   if (!country_name.empty()) {
      place += ", ";
      place += country_name;
   }

   // Initial user prompt: location plus optional regional context.
   std::string prompt =
       "Write a brewery name and place-specific long description for a craft "
       "brewery in ";
   prompt += place;
   if (safe_region_context.empty()) {
      prompt += ".";
   } else {
      prompt += ". Regional context: ";
      prompt += safe_region_context;
   }

   // Retries repeat only the location, not the full regional context.
   const std::string retry_location = "Location: " + place;

   std::string raw;
   std::string last_error;

   for (int attempt_no = 1; attempt_no <= kMaxAttempts; ++attempt_no) {
      raw = Infer(system_prompt, prompt, kMaxTokens);
      spdlog::debug("LlamaGenerator: raw output (attempt {}): {}", attempt_no,
                    raw);

      std::string name;
      std::string description;
      const std::string validation_error =
          ValidateBreweryJsonPublic(raw, name, description);

      if (validation_error.empty()) {
         return {std::move(name), std::move(description)};
      }

      last_error = validation_error;
      spdlog::warn("LlamaGenerator: malformed brewery JSON (attempt {}): {}",
                   attempt_no, validation_error);

      // Compact corrective prompt for the next attempt.
      prompt = "Your previous response was invalid. Error: ";
      prompt += validation_error;
      prompt +=
          "\nReturn ONLY valid JSON with this exact schema: "
          "{\"name\": \"string\", \"description\": \"string\"}."
          "\nDo not include markdown, comments, or extra keys."
          "\n\n";
      prompt += retry_location;
   }

   spdlog::error(
       "LlamaGenerator: malformed brewery response after {} attempts: "
       "{}",
       kMaxAttempts, last_error.empty() ? raw : last_error);
   throw std::runtime_error("LlamaGenerator: malformed brewery response");
}
|
||||||
102
pipeline/src/data_generation/llama/generate_user.cpp
Normal file
102
pipeline/src/data_generation/llama/generate_user.cpp
Normal file
@@ -0,0 +1,102 @@
|
|||||||
|
/**
|
||||||
|
* User Profile Generation Module
|
||||||
|
* Uses the LLM to generate realistic user profiles (username and bio) for craft
|
||||||
|
* beer enthusiasts. Implements retry logic to handle parsing failures and
|
||||||
|
* ensures output adheres to strict format constraints (two lines, specific
|
||||||
|
* character limits).
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <spdlog/spdlog.h>
|
||||||
|
|
||||||
|
#include <algorithm>
#include <cctype>
#include <stdexcept>
#include <string>
|
||||||
|
|
||||||
|
#include "data_generation/llama_generator.h"
|
||||||
|
#include "data_generation/llama_generator_helpers.h"
|
||||||
|
|
||||||
|
/**
 * Asks the model for a craft-beer enthusiast profile (username and bio).
 *
 * The model is instructed to answer with exactly two lines. Each reply is
 * parsed, whitespace is removed from the username, and the bio is capped at
 * 200 bytes. The cap never splits a multi-byte UTF-8 character: the cut is
 * moved back to the start of the character that would otherwise be broken.
 * Up to three attempts are made; a reply that cannot be parsed is logged and
 * the same prompt is retried.
 *
 * @param locale  Locale hint appended to the user prompt.
 * @return The parsed username and bio.
 * @throws std::runtime_error when no attempt yields a usable profile.
 */
UserResult LlamaGenerator::GenerateUser(const std::string& locale) {
   constexpr int kMaxAttempts = 3;
   constexpr int kMaxTokens = 128;
   constexpr std::size_t kMaxBioBytes = 200;

   // System prompt: fixes the output format to keep parsing simple.
   const std::string system_prompt =
       "You generate plausible social media profiles for craft beer "
       "enthusiasts. "
       "Respond with exactly two lines: "
       "the first line is a username (lowercase, no spaces, 8-20 characters), "
       "the second line is a one-sentence bio (20-40 words). "
       "The profile should feel consistent with the locale. "
       "No preamble, no labels.";

   const std::string prompt =
       "Generate a craft beer enthusiast profile. Locale: " + locale;

   std::string raw;
   for (int attempt = 0; attempt < kMaxAttempts; ++attempt) {
      raw = Infer(system_prompt, prompt, kMaxTokens);
      spdlog::debug("LlamaGenerator (user): raw output (attempt {}): {}",
                    attempt + 1, raw);

      try {
         // First line = username, remaining text = bio.
         auto [username, bio] = ParseTwoLineResponsePublic(
             raw, "LlamaGenerator: malformed user response");

         // Usernames must not contain whitespace.
         username.erase(
             std::remove_if(username.begin(), username.end(),
                            [](unsigned char ch) { return std::isspace(ch); }),
             username.end());

         if (username.empty() || bio.empty()) {
            throw std::runtime_error("LlamaGenerator: malformed user response");
         }

         if (bio.size() > kMaxBioBytes) {
            // Bytes of the form 10xxxxxx continue a UTF-8 sequence. If the
            // first dropped byte is one of them, step back to the lead byte
            // so the whole character is dropped instead of half of it.
            std::size_t cut = kMaxBioBytes;
            while (cut > 0 &&
                   (static_cast<unsigned char>(bio[cut]) & 0xC0U) == 0x80U) {
               --cut;
            }
            // Text that is not valid UTF-8 could walk back to zero; fall back
            // to the plain byte cut rather than returning an empty bio.
            if (cut == 0) {
               cut = kMaxBioBytes;
            }
            bio.resize(cut);
         }

         return {std::move(username), std::move(bio)};
      } catch (const std::exception& e) {
         // Parsing failed: log and move on to the next attempt.
         spdlog::warn(
             "LlamaGenerator: malformed user response (attempt {}): {}",
             attempt + 1, e.what());
      }
   }

   spdlog::error(
       "LlamaGenerator: malformed user response after {} attempts: {}",
       kMaxAttempts, raw);
   throw std::runtime_error("LlamaGenerator: malformed user response");
}
|
||||||
441
pipeline/src/data_generation/llama/helpers.cpp
Normal file
441
pipeline/src/data_generation/llama/helpers.cpp
Normal file
@@ -0,0 +1,441 @@
|
|||||||
|
/**
|
||||||
|
* Helper Functions Module
|
||||||
|
* Provides utility functions for text processing, parsing, and chat template
|
||||||
|
* formatting. Functions handle whitespace normalization, response parsing, and
|
||||||
|
* conversion of prompts to proper chat format using the model's built-in
|
||||||
|
* template.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
|
#include <array>
|
||||||
|
#include <boost/json.hpp>
|
||||||
|
#include <cctype>
|
||||||
|
#include <sstream>
|
||||||
|
#include <stdexcept>
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "data_generation/llama_generator.h"
|
||||||
|
#include "llama.h"
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
/**
 * Returns `value` without leading and trailing whitespace (as classified by
 * std::isspace). A string consisting only of whitespace becomes empty.
 */
std::string Trim(std::string value) {
   const auto is_visible = [](unsigned char ch) {
      return std::isspace(ch) == 0;
   };

   const auto first = std::find_if(value.begin(), value.end(), is_visible);
   // Search backwards, but never past `first`, for the last visible byte.
   const auto last = std::find_if(value.rbegin(),
                                  std::string::reverse_iterator(first),
                                  is_visible)
                         .base();

   return std::string(first, last);
}

/**
 * Collapses every run of whitespace (spaces, tabs, newlines, ...) into one
 * space and drops whitespace at both ends.
 */
std::string CondenseWhitespace(std::string text) {
   std::string out;
   out.reserve(text.size());

   // A separator is emitted lazily, right before the next visible byte, so
   // neither leading nor trailing whitespace ever reaches the output.
   bool separator_pending = false;
   for (const unsigned char ch : text) {
      if (std::isspace(ch) != 0) {
         separator_pending = !out.empty();
         continue;
      }
      if (separator_pending) {
         out.push_back(' ');
         separator_pending = false;
      }
      out.push_back(static_cast<char>(ch));
   }

   return out;
}

/**
 * Normalises whitespace in `region_context` and shortens it to roughly
 * `max_chars` bytes.
 *
 * Text that already fits is returned as is. Otherwise the text is cut at
 * `max_chars` bytes, moved back so that a multi-byte UTF-8 character is never
 * split, and then moved back further to the last space if that space lies in
 * the second half of the budget. "..." is appended to a shortened result, so
 * the returned string may be up to three bytes longer than `max_chars`.
 *
 * @param region_context  Raw context text.
 * @param max_chars       Byte budget for the text before the ellipsis.
 * @return The normalised, possibly shortened text.
 */
std::string PrepareRegionContext(std::string_view region_context,
                                 std::size_t max_chars) {
   std::string normalized = CondenseWhitespace(std::string(region_context));
   if (normalized.size() <= max_chars) {
      return normalized;
   }

   // normalized[cut] is the first byte that would be dropped. While it is a
   // UTF-8 continuation byte (10xxxxxx) the cut sits inside a character, so
   // step back to that character's lead byte.
   std::size_t cut = max_chars;
   while (cut > 0 &&
          (static_cast<unsigned char>(normalized[cut]) & 0xC0U) == 0x80U) {
      --cut;
   }
   normalized.resize(cut);

   // Prefer a word boundary, but only if it keeps more than half the budget.
   const std::size_t last_space = normalized.find_last_of(' ');
   if (last_space != std::string::npos && last_space > max_chars / 2) {
      normalized.resize(last_space);
   }

   normalized += "...";
   return normalized;
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Remove common bullet points, numbers, and field labels added by LLM in output
|
||||||
|
*/
|
||||||
|
std::string StripCommonPrefix(std::string line) {
|
||||||
|
line = Trim(std::move(line));
|
||||||
|
|
||||||
|
if (!line.empty() && (line[0] == '-' || line[0] == '*')) {
|
||||||
|
line = Trim(line.substr(1));
|
||||||
|
} else {
|
||||||
|
std::size_t i = 0;
|
||||||
|
while (i < line.size() &&
|
||||||
|
std::isdigit(static_cast<unsigned char>(line[i]))) {
|
||||||
|
++i;
|
||||||
|
}
|
||||||
|
if (i > 0 && i < line.size() && (line[i] == '.' || line[i] == ')')) {
|
||||||
|
line = Trim(line.substr(i + 1));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
auto strip_label = [&line](const std::string& label) {
|
||||||
|
if (line.size() >= label.size()) {
|
||||||
|
bool matches = true;
|
||||||
|
for (std::size_t i = 0; i < label.size(); ++i) {
|
||||||
|
if (std::tolower(static_cast<unsigned char>(line[i])) !=
|
||||||
|
std::tolower(static_cast<unsigned char>(label[i]))) {
|
||||||
|
matches = false;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (matches) {
|
||||||
|
line = Trim(line.substr(label.size()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
strip_label("name:");
|
||||||
|
strip_label("brewery name:");
|
||||||
|
strip_label("description:");
|
||||||
|
strip_label("username:");
|
||||||
|
strip_label("bio:");
|
||||||
|
|
||||||
|
return Trim(std::move(line));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Parse two-line response from LLM: normalize line endings, strip formatting,
|
||||||
|
* filter spurious output, and combine remaining lines if needed
|
||||||
|
*/
|
||||||
|
std::pair<std::string, std::string> ParseTwoLineResponse(
|
||||||
|
const std::string& raw, const std::string& error_message) {
|
||||||
|
std::string normalized = raw;
|
||||||
|
std::replace(normalized.begin(), normalized.end(), '\r', '\n');
|
||||||
|
|
||||||
|
std::vector<std::string> lines;
|
||||||
|
std::stringstream stream(normalized);
|
||||||
|
std::string line;
|
||||||
|
while (std::getline(stream, line)) {
|
||||||
|
line = StripCommonPrefix(std::move(line));
|
||||||
|
if (!line.empty()) lines.push_back(std::move(line));
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<std::string> filtered;
|
||||||
|
for (auto& l : lines) {
|
||||||
|
std::string low = l;
|
||||||
|
std::transform(low.begin(), low.end(), low.begin(), [](unsigned char c) {
|
||||||
|
return static_cast<char>(std::tolower(c));
|
||||||
|
});
|
||||||
|
// Filter known thinking tags like <think>...</think>, but be conservative
|
||||||
|
// to avoid removing legitimate output. Only filter specific known
|
||||||
|
// patterns.
|
||||||
|
if (!l.empty() && l.front() == '<' && low.back() == '>') {
|
||||||
|
// Only filter if it's a known thinking tag: <think>, <reasoning>, etc.
|
||||||
|
if (low.find("think") != std::string::npos ||
|
||||||
|
low.find("reasoning") != std::string::npos ||
|
||||||
|
low.find("reflect") != std::string::npos) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (low.rfind("okay,", 0) == 0 || low.rfind("hmm", 0) == 0) continue;
|
||||||
|
filtered.push_back(std::move(l));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (filtered.size() < 2) throw std::runtime_error(error_message);
|
||||||
|
|
||||||
|
std::string first = Trim(filtered.front());
|
||||||
|
std::string second;
|
||||||
|
for (size_t i = 1; i < filtered.size(); ++i) {
|
||||||
|
if (!second.empty()) second += ' ';
|
||||||
|
second += filtered[i];
|
||||||
|
}
|
||||||
|
second = Trim(std::move(second));
|
||||||
|
|
||||||
|
if (first.empty() || second.empty()) throw std::runtime_error(error_message);
|
||||||
|
return {first, second};
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Apply model's chat template to user-only prompt, formatting it for the model
|
||||||
|
*/
|
||||||
|
std::string ToChatPrompt(const llama_model* model,
|
||||||
|
const std::string& user_prompt) {
|
||||||
|
const char* tmpl = llama_model_chat_template(model, nullptr);
|
||||||
|
if (tmpl == nullptr) {
|
||||||
|
return user_prompt;
|
||||||
|
}
|
||||||
|
|
||||||
|
const llama_chat_message message{"user", user_prompt.c_str()};
|
||||||
|
|
||||||
|
std::vector<char> buffer(
|
||||||
|
std::max<std::size_t>(1024, user_prompt.size() * 4));
|
||||||
|
int32_t required =
|
||||||
|
llama_chat_apply_template(tmpl, &message, 1, true, buffer.data(),
|
||||||
|
static_cast<int32_t>(buffer.size()));
|
||||||
|
|
||||||
|
if (required < 0) {
|
||||||
|
throw std::runtime_error("LlamaGenerator: failed to apply chat template");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (required >= static_cast<int32_t>(buffer.size())) {
|
||||||
|
buffer.resize(static_cast<std::size_t>(required) + 1);
|
||||||
|
required =
|
||||||
|
llama_chat_apply_template(tmpl, &message, 1, true, buffer.data(),
|
||||||
|
static_cast<int32_t>(buffer.size()));
|
||||||
|
if (required < 0) {
|
||||||
|
throw std::runtime_error(
|
||||||
|
"LlamaGenerator: failed to apply chat template");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return std::string(buffer.data(), static_cast<std::size_t>(required));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Apply model's chat template to system+user prompt pair, formatting for the
|
||||||
|
* model
|
||||||
|
*/
|
||||||
|
std::string ToChatPrompt(const llama_model* model,
|
||||||
|
const std::string& system_prompt,
|
||||||
|
const std::string& user_prompt) {
|
||||||
|
const char* tmpl = llama_model_chat_template(model, nullptr);
|
||||||
|
if (tmpl == nullptr) {
|
||||||
|
return system_prompt + "\n\n" + user_prompt;
|
||||||
|
}
|
||||||
|
|
||||||
|
const llama_chat_message messages[2] = {{"system", system_prompt.c_str()},
|
||||||
|
{"user", user_prompt.c_str()}};
|
||||||
|
|
||||||
|
std::vector<char> buffer(std::max<std::size_t>(
|
||||||
|
1024, (system_prompt.size() + user_prompt.size()) * 4));
|
||||||
|
int32_t required =
|
||||||
|
llama_chat_apply_template(tmpl, messages, 2, true, buffer.data(),
|
||||||
|
static_cast<int32_t>(buffer.size()));
|
||||||
|
|
||||||
|
if (required < 0) {
|
||||||
|
throw std::runtime_error("LlamaGenerator: failed to apply chat template");
|
||||||
|
}
|
||||||
|
|
||||||
|
if (required >= static_cast<int32_t>(buffer.size())) {
|
||||||
|
buffer.resize(static_cast<std::size_t>(required) + 1);
|
||||||
|
required =
|
||||||
|
llama_chat_apply_template(tmpl, messages, 2, true, buffer.data(),
|
||||||
|
static_cast<int32_t>(buffer.size()));
|
||||||
|
if (required < 0) {
|
||||||
|
throw std::runtime_error(
|
||||||
|
"LlamaGenerator: failed to apply chat template");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return std::string(buffer.data(), static_cast<std::size_t>(required));
|
||||||
|
}
|
||||||
|
|
||||||
|
void AppendTokenPiece(const llama_vocab* vocab, llama_token token,
|
||||||
|
std::string& output) {
|
||||||
|
std::array<char, 256> buffer{};
|
||||||
|
int32_t bytes =
|
||||||
|
llama_token_to_piece(vocab, token, buffer.data(),
|
||||||
|
static_cast<int32_t>(buffer.size()), 0, true);
|
||||||
|
|
||||||
|
if (bytes < 0) {
|
||||||
|
std::vector<char> dynamic_buffer(static_cast<std::size_t>(-bytes));
|
||||||
|
bytes = llama_token_to_piece(vocab, token, dynamic_buffer.data(),
|
||||||
|
static_cast<int32_t>(dynamic_buffer.size()),
|
||||||
|
0, true);
|
||||||
|
if (bytes < 0) {
|
||||||
|
throw std::runtime_error(
|
||||||
|
"LlamaGenerator: failed to decode sampled token piece");
|
||||||
|
}
|
||||||
|
|
||||||
|
output.append(dynamic_buffer.data(), static_cast<std::size_t>(bytes));
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
output.append(buffer.data(), static_cast<std::size_t>(bytes));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Finds the first balanced `{ ... }` span in `text`.
 *
 * Braces inside double-quoted strings are ignored, and a backslash inside a
 * string escapes the following character. Quote tracking applies to the whole
 * input, including text before the first brace. A '}' seen while no object is
 * open is ignored.
 *
 * @param text      Text that may contain a JSON object among other content.
 * @param json_out  Receives the span on success; untouched otherwise.
 * @return true when a balanced span was found.
 */
bool ExtractFirstJsonObject(const std::string& text, std::string& json_out) {
   std::size_t object_begin = std::string::npos;
   int nesting = 0;
   bool inside_string = false;
   bool after_backslash = false;

   for (std::size_t pos = 0; pos < text.size(); ++pos) {
      const char current = text[pos];

      if (inside_string) {
         if (after_backslash) {
            after_backslash = false;
         } else if (current == '\\') {
            after_backslash = true;
         } else if (current == '"') {
            inside_string = false;
         }
         continue;
      }

      switch (current) {
         case '"':
            inside_string = true;
            break;
         case '{':
            if (nesting == 0) {
               object_begin = pos;
            }
            ++nesting;
            break;
         case '}':
            if (nesting > 0) {
               --nesting;
               if (nesting == 0 && object_begin != std::string::npos) {
                  json_out = text.substr(object_begin, pos - object_begin + 1);
                  return true;
               }
            }
            break;
         default:
            break;
      }
   }

   return false;
}
|
||||||
|
|
||||||
|
std::string ValidateBreweryJson(const std::string& raw, std::string& name_out,
|
||||||
|
std::string& description_out) {
|
||||||
|
auto validate_object = [&](const boost::json::value& jv,
|
||||||
|
std::string& error_out) -> bool {
|
||||||
|
if (!jv.is_object()) {
|
||||||
|
error_out = "JSON root must be an object";
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
const auto& obj = jv.get_object();
|
||||||
|
if (!obj.contains("name") || !obj.at("name").is_string()) {
|
||||||
|
error_out = "JSON field 'name' is missing or not a string";
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!obj.contains("description") || !obj.at("description").is_string()) {
|
||||||
|
error_out = "JSON field 'description' is missing or not a string";
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
name_out = Trim(std::string(obj.at("name").as_string().c_str()));
|
||||||
|
description_out =
|
||||||
|
Trim(std::string(obj.at("description").as_string().c_str()));
|
||||||
|
|
||||||
|
if (name_out.empty()) {
|
||||||
|
error_out = "JSON field 'name' must not be empty";
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (description_out.empty()) {
|
||||||
|
error_out = "JSON field 'description' must not be empty";
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string name_lower = name_out;
|
||||||
|
std::string description_lower = description_out;
|
||||||
|
std::transform(
|
||||||
|
name_lower.begin(), name_lower.end(), name_lower.begin(),
|
||||||
|
[](unsigned char c) { return static_cast<char>(std::tolower(c)); });
|
||||||
|
std::transform(description_lower.begin(), description_lower.end(),
|
||||||
|
description_lower.begin(), [](unsigned char c) {
|
||||||
|
return static_cast<char>(std::tolower(c));
|
||||||
|
});
|
||||||
|
|
||||||
|
if (name_lower == "string" || description_lower == "string") {
|
||||||
|
error_out = "JSON appears to be a schema placeholder, not content";
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
error_out.clear();
|
||||||
|
return true;
|
||||||
|
};
|
||||||
|
|
||||||
|
boost::system::error_code ec;
|
||||||
|
boost::json::value jv = boost::json::parse(raw, ec);
|
||||||
|
std::string validation_error;
|
||||||
|
if (ec) {
|
||||||
|
std::string extracted;
|
||||||
|
if (!ExtractFirstJsonObject(raw, extracted)) {
|
||||||
|
return "JSON parse error: " + ec.message();
|
||||||
|
}
|
||||||
|
|
||||||
|
ec.clear();
|
||||||
|
jv = boost::json::parse(extracted, ec);
|
||||||
|
if (ec) {
|
||||||
|
return "JSON parse error: " + ec.message();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!validate_object(jv, validation_error)) {
|
||||||
|
return validation_error;
|
||||||
|
}
|
||||||
|
|
||||||
|
return {};
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!validate_object(jv, validation_error)) {
|
||||||
|
return validation_error;
|
||||||
|
}
|
||||||
|
|
||||||
|
return {};
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace
|
||||||
|
|
||||||
|
// Forward declarations for helper functions exposed to other translation units
|
||||||
|
std::string PrepareRegionContextPublic(std::string_view region_context,
|
||||||
|
std::size_t max_chars) {
|
||||||
|
return PrepareRegionContext(region_context, max_chars);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::pair<std::string, std::string> ParseTwoLineResponsePublic(
|
||||||
|
const std::string& raw, const std::string& error_message) {
|
||||||
|
return ParseTwoLineResponse(raw, error_message);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string ToChatPromptPublic(const llama_model* model,
|
||||||
|
const std::string& user_prompt) {
|
||||||
|
return ToChatPrompt(model, user_prompt);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string ToChatPromptPublic(const llama_model* model,
|
||||||
|
const std::string& system_prompt,
|
||||||
|
const std::string& user_prompt) {
|
||||||
|
return ToChatPrompt(model, system_prompt, user_prompt);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Externally linked wrapper around AppendTokenPiece.
 *
 * Forwards the vocabulary, the token and the output string (by reference)
 * unchanged; any modification of `output` is done by the helper.
 */
void AppendTokenPiecePublic(const llama_vocab* vocab, llama_token token,
                            std::string& output) {
   AppendTokenPiece(vocab, token, output);
}
|
||||||
|
|
||||||
|
std::string ValidateBreweryJsonPublic(const std::string& raw,
|
||||||
|
std::string& name_out,
|
||||||
|
std::string& description_out) {
|
||||||
|
return ValidateBreweryJson(raw, name_out, description_out);
|
||||||
|
}
|
||||||
196
pipeline/src/data_generation/llama/infer.cpp
Normal file
196
pipeline/src/data_generation/llama/infer.cpp
Normal file
@@ -0,0 +1,196 @@
|
|||||||
|
/**
|
||||||
|
* Text Generation / Inference Module
|
||||||
|
* Core module that performs LLM inference: converts text prompts into tokens,
|
||||||
|
* runs the neural network forward pass, samples the next token, and converts
|
||||||
|
* output tokens back to text. Supports both simple and system+user prompts.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <spdlog/spdlog.h>
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
|
#include <memory>
|
||||||
|
#include <stdexcept>
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "data_generation/llama_generator.h"
|
||||||
|
#include "data_generation/llama_generator_helpers.h"
|
||||||
|
#include "llama.h"
|
||||||
|
|
||||||
|
std::string LlamaGenerator::Infer(const std::string& prompt, int max_tokens) {
|
||||||
|
return InferFormatted(ToChatPromptPublic(model_, prompt), max_tokens);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string LlamaGenerator::Infer(const std::string& system_prompt,
|
||||||
|
const std::string& prompt, int max_tokens) {
|
||||||
|
return InferFormatted(ToChatPromptPublic(model_, system_prompt, prompt),
|
||||||
|
max_tokens);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string LlamaGenerator::InferFormatted(const std::string& formatted_prompt,
|
||||||
|
int max_tokens) {
|
||||||
|
/**
|
||||||
|
* Validate that model and context are loaded
|
||||||
|
*/
|
||||||
|
if (model_ == nullptr || context_ == nullptr)
|
||||||
|
throw std::runtime_error("LlamaGenerator: model not loaded");
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Get vocabulary for tokenization and token-to-text conversion
|
||||||
|
*/
|
||||||
|
const llama_vocab* vocab = llama_model_get_vocab(model_);
|
||||||
|
if (vocab == nullptr)
|
||||||
|
throw std::runtime_error("LlamaGenerator: vocab unavailable");
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Clear KV cache to ensure clean inference state (no residual context)
|
||||||
|
*/
|
||||||
|
llama_memory_clear(llama_get_memory(context_), true);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* TOKENIZATION PHASE
|
||||||
|
* Convert text prompt into token IDs (integers) that the model understands
|
||||||
|
*/
|
||||||
|
std::vector<llama_token> prompt_tokens(formatted_prompt.size() + 8);
|
||||||
|
int32_t token_count = llama_tokenize(
|
||||||
|
vocab, formatted_prompt.c_str(),
|
||||||
|
static_cast<int32_t>(formatted_prompt.size()), prompt_tokens.data(),
|
||||||
|
static_cast<int32_t>(prompt_tokens.size()), true, true);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* If buffer too small, negative return indicates required size
|
||||||
|
*/
|
||||||
|
if (token_count < 0) {
|
||||||
|
prompt_tokens.resize(static_cast<std::size_t>(-token_count));
|
||||||
|
token_count = llama_tokenize(
|
||||||
|
vocab, formatted_prompt.c_str(),
|
||||||
|
static_cast<int32_t>(formatted_prompt.size()), prompt_tokens.data(),
|
||||||
|
static_cast<int32_t>(prompt_tokens.size()), true, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (token_count < 0)
|
||||||
|
throw std::runtime_error("LlamaGenerator: prompt tokenization failed");
|
||||||
|
|
||||||
|
/**
|
||||||
|
* CONTEXT SIZE VALIDATION
|
||||||
|
* Validate and compute effective token budgets based on context window
|
||||||
|
* constraints
|
||||||
|
*/
|
||||||
|
const int32_t n_ctx = static_cast<int32_t>(llama_n_ctx(context_));
|
||||||
|
const int32_t n_batch = static_cast<int32_t>(llama_n_batch(context_));
|
||||||
|
if (n_ctx <= 1 || n_batch <= 0)
|
||||||
|
throw std::runtime_error("LlamaGenerator: invalid context or batch size");
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Clamp generation limit to available context window, reserve space for
|
||||||
|
* output
|
||||||
|
*/
|
||||||
|
const int32_t effective_max_tokens =
|
||||||
|
std::max(1, std::min(max_tokens, n_ctx - 1));
|
||||||
|
/**
|
||||||
|
* Prompt can use remaining context after reserving space for generation
|
||||||
|
*/
|
||||||
|
int32_t prompt_budget = std::min(n_batch, n_ctx - effective_max_tokens);
|
||||||
|
prompt_budget = std::max<int32_t>(1, prompt_budget);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Truncate prompt if necessary to fit within constraints
|
||||||
|
*/
|
||||||
|
prompt_tokens.resize(static_cast<std::size_t>(token_count));
|
||||||
|
if (token_count > prompt_budget) {
|
||||||
|
spdlog::warn(
|
||||||
|
"LlamaGenerator: prompt too long ({} tokens), truncating to {} "
|
||||||
|
"tokens to fit n_batch/n_ctx limits",
|
||||||
|
token_count, prompt_budget);
|
||||||
|
prompt_tokens.resize(static_cast<std::size_t>(prompt_budget));
|
||||||
|
token_count = prompt_budget;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* PROMPT PROCESSING PHASE
|
||||||
|
* Create a batch containing all prompt tokens and feed through the model
|
||||||
|
* This computes internal representations and fills the KV cache
|
||||||
|
*/
|
||||||
|
const llama_batch prompt_batch = llama_batch_get_one(
|
||||||
|
prompt_tokens.data(), static_cast<int32_t>(prompt_tokens.size()));
|
||||||
|
if (llama_decode(context_, prompt_batch) != 0)
|
||||||
|
throw std::runtime_error("LlamaGenerator: prompt decode failed");
|
||||||
|
|
||||||
|
/**
|
||||||
|
* SAMPLER CONFIGURATION PHASE
|
||||||
|
* Set up the probabilistic token selection pipeline (sampler chain)
|
||||||
|
* Samplers are applied in sequence: temperature -> top-p -> distribution
|
||||||
|
*/
|
||||||
|
llama_sampler_chain_params sampler_params =
|
||||||
|
llama_sampler_chain_default_params();
|
||||||
|
using SamplerPtr =
|
||||||
|
std::unique_ptr<llama_sampler, decltype(&llama_sampler_free)>;
|
||||||
|
SamplerPtr sampler(llama_sampler_chain_init(sampler_params),
|
||||||
|
&llama_sampler_free);
|
||||||
|
if (!sampler)
|
||||||
|
throw std::runtime_error("LlamaGenerator: failed to initialize sampler");
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Temperature: scales logits before softmax (controls randomness)
|
||||||
|
*/
|
||||||
|
llama_sampler_chain_add(sampler.get(),
|
||||||
|
llama_sampler_init_temp(sampling_temperature_));
|
||||||
|
/**
|
||||||
|
* Top-P: nucleus sampling - filters to most likely tokens summing to top_p
|
||||||
|
* probability
|
||||||
|
*/
|
||||||
|
llama_sampler_chain_add(sampler.get(),
|
||||||
|
llama_sampler_init_top_p(sampling_top_p_, 1));
|
||||||
|
/**
|
||||||
|
* Distribution sampler: selects actual token using configured seed for
|
||||||
|
* reproducibility
|
||||||
|
*/
|
||||||
|
llama_sampler_chain_add(sampler.get(),
|
||||||
|
llama_sampler_init_dist(sampling_seed_));
|
||||||
|
|
||||||
|
/**
|
||||||
|
* TOKEN GENERATION LOOP
|
||||||
|
* Iteratively generate tokens one at a time until max_tokens or
|
||||||
|
* end-of-sequence
|
||||||
|
*/
|
||||||
|
std::vector<llama_token> generated_tokens;
|
||||||
|
generated_tokens.reserve(static_cast<std::size_t>(effective_max_tokens));
|
||||||
|
|
||||||
|
for (int i = 0; i < effective_max_tokens; ++i) {
|
||||||
|
/**
|
||||||
|
* Sample next token using configured sampler chain and model logits
|
||||||
|
* Index -1 means use the last output position from previous batch
|
||||||
|
*/
|
||||||
|
const llama_token next =
|
||||||
|
llama_sampler_sample(sampler.get(), context_, -1);
|
||||||
|
/**
|
||||||
|
* Stop if model predicts end-of-generation token (EOS/EOT)
|
||||||
|
*/
|
||||||
|
if (llama_vocab_is_eog(vocab, next)) break;
|
||||||
|
generated_tokens.push_back(next);
|
||||||
|
/**
|
||||||
|
* Feed the sampled token back into model for next iteration
|
||||||
|
* (autoregressive)
|
||||||
|
*/
|
||||||
|
llama_token token = next;
|
||||||
|
const llama_batch one_token_batch = llama_batch_get_one(&token, 1);
|
||||||
|
if (llama_decode(context_, one_token_batch) != 0)
|
||||||
|
throw std::runtime_error(
|
||||||
|
"LlamaGenerator: decode failed during generation");
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* DETOKENIZATION PHASE
|
||||||
|
* Convert generated token IDs back to text using vocabulary
|
||||||
|
*/
|
||||||
|
std::string output;
|
||||||
|
for (const llama_token token : generated_tokens)
|
||||||
|
AppendTokenPiecePublic(vocab, token, output);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Advance seed for next generation to improve output diversity
|
||||||
|
*/
|
||||||
|
sampling_seed_ = (sampling_seed_ == 0xFFFFFFFFu) ? 0 : sampling_seed_ + 1;
|
||||||
|
|
||||||
|
return output;
|
||||||
|
}
|
||||||
56
pipeline/src/data_generation/llama/load.cpp
Normal file
56
pipeline/src/data_generation/llama/load.cpp
Normal file
@@ -0,0 +1,56 @@
|
|||||||
|
/**
|
||||||
|
* Model Loading Module
|
||||||
|
* This module handles loading a pre-trained LLM model from disk and
|
||||||
|
* initializing the llama.cpp context for inference. It performs one-time setup
|
||||||
|
* required before any inference operations can be performed.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <spdlog/spdlog.h>
|
||||||
|
|
||||||
|
#include <stdexcept>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
#include "data_generation/llama_generator.h"
|
||||||
|
#include "llama.h"
|
||||||
|
|
||||||
|
void LlamaGenerator::Load(const std::string& model_path) {
|
||||||
|
/**
|
||||||
|
* Validate input and clean up any previously loaded model/context
|
||||||
|
*/
|
||||||
|
if (model_path.empty())
|
||||||
|
throw std::runtime_error("LlamaGenerator: model path must not be empty");
|
||||||
|
|
||||||
|
if (context_ != nullptr) {
|
||||||
|
llama_free(context_);
|
||||||
|
context_ = nullptr;
|
||||||
|
}
|
||||||
|
if (model_ != nullptr) {
|
||||||
|
llama_model_free(model_);
|
||||||
|
model_ = nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Initialize the llama backend (one-time setup for GPU/CPU acceleration)
|
||||||
|
*/
|
||||||
|
llama_backend_init();
|
||||||
|
|
||||||
|
llama_model_params model_params = llama_model_default_params();
|
||||||
|
model_ = llama_model_load_from_file(model_path.c_str(), model_params);
|
||||||
|
if (model_ == nullptr) {
|
||||||
|
throw std::runtime_error(
|
||||||
|
"LlamaGenerator: failed to load model from path: " + model_path);
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_context_params context_params = llama_context_default_params();
|
||||||
|
context_params.n_ctx = n_ctx_;
|
||||||
|
context_params.n_batch = n_ctx_; // Set batch size equal to context window
|
||||||
|
|
||||||
|
context_ = llama_init_from_model(model_, context_params);
|
||||||
|
if (context_ == nullptr) {
|
||||||
|
llama_model_free(model_);
|
||||||
|
model_ = nullptr;
|
||||||
|
throw std::runtime_error("LlamaGenerator: failed to create context");
|
||||||
|
}
|
||||||
|
|
||||||
|
spdlog::info("[LlamaGenerator] Loaded model: {}", model_path);
|
||||||
|
}
|
||||||
74
pipeline/src/data_generation/llama/load_brewery_prompt.cpp
Normal file
74
pipeline/src/data_generation/llama/load_brewery_prompt.cpp
Normal file
@@ -0,0 +1,74 @@
|
|||||||
|
#include <fstream>
|
||||||
|
#include <filesystem>
|
||||||
|
#include <spdlog/spdlog.h>
|
||||||
|
|
||||||
|
#include "data_generation/llama_generator.h"
|
||||||
|
|
||||||
|
namespace fs = std::filesystem;
|
||||||
|
|
||||||
|
std::string LlamaGenerator::LoadBrewerySystemPrompt(
|
||||||
|
const std::string& prompt_file_path) {
|
||||||
|
// Return cached version if already loaded
|
||||||
|
if (!brewery_system_prompt_.empty()) {
|
||||||
|
return brewery_system_prompt_;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Try multiple path locations
|
||||||
|
std::vector<std::string> paths_to_try = {
|
||||||
|
prompt_file_path, // As provided
|
||||||
|
"../" + prompt_file_path, // One level up
|
||||||
|
"../../" + prompt_file_path, // Two levels up
|
||||||
|
};
|
||||||
|
|
||||||
|
for (const auto& path : paths_to_try) {
|
||||||
|
std::ifstream prompt_file(path);
|
||||||
|
if (prompt_file.is_open()) {
|
||||||
|
std::string prompt((std::istreambuf_iterator<char>(prompt_file)),
|
||||||
|
std::istreambuf_iterator<char>());
|
||||||
|
prompt_file.close();
|
||||||
|
|
||||||
|
if (!prompt.empty()) {
|
||||||
|
spdlog::info(
|
||||||
|
"LlamaGenerator: Loaded brewery system prompt from '{}' ({} chars)",
|
||||||
|
path, prompt.length());
|
||||||
|
brewery_system_prompt_ = prompt;
|
||||||
|
return brewery_system_prompt_;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
spdlog::warn(
|
||||||
|
"LlamaGenerator: Could not open brewery system prompt file at any of the "
|
||||||
|
"expected locations. Using fallback inline prompt.");
|
||||||
|
return GetFallbackBreweryPrompt();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fallback: minimal inline prompt if file fails to load
|
||||||
|
std::string LlamaGenerator::GetFallbackBreweryPrompt() {
|
||||||
|
return "You are an experienced brewmaster and owner of a local craft brewery. "
|
||||||
|
"Create a distinctive, authentic name and detailed description that "
|
||||||
|
"genuinely reflects your specific location, brewing philosophy, local "
|
||||||
|
"culture, and community connection. The brewery must feel real and "
|
||||||
|
"grounded—not generic or interchangeable.\n\n"
|
||||||
|
"AVOID REPETITIVE PHRASES - Never use:\n"
|
||||||
|
"Love letter to, tribute to, rolling hills, picturesque, every sip "
|
||||||
|
"tells a story, Come for X stay for Y, rich history, passion, woven "
|
||||||
|
"into, ancient roots, timeless, where tradition meets innovation\n\n"
|
||||||
|
"OPENING APPROACHES - Choose ONE:\n"
|
||||||
|
"1. Start with specific beer style and its regional origins\n"
|
||||||
|
"2. Begin with specific brewing challenge (water, altitude, climate)\n"
|
||||||
|
"3. Open with founding story or personal motivation\n"
|
||||||
|
"4. Lead with specific local ingredient or resource\n"
|
||||||
|
"5. Start with unexpected angle or contradiction\n"
|
||||||
|
"6. Open with local event, tradition, or cultural moment\n"
|
||||||
|
"7. Begin with tangible architectural or geographic detail\n\n"
|
||||||
|
"BE SPECIFIC - Include:\n"
|
||||||
|
"- At least ONE concrete proper noun (landmark, river, neighborhood)\n"
|
||||||
|
"- Specific beer styles relevant to the REGION'S culture\n"
|
||||||
|
"- Concrete brewing challenges or advantages\n"
|
||||||
|
"- Sensory details SPECIFIC to place—not generic adjectives\n\n"
|
||||||
|
"LENGTH: 150-250 words. TONE: Can be soulful, irreverent, "
|
||||||
|
"matter-of-fact, unpretentious, or minimalist.\n\n"
|
||||||
|
"Output ONLY a raw JSON object with keys name and description. "
|
||||||
|
"No markdown, backticks, preamble, or trailing text.";
|
||||||
|
}
|
||||||
65
pipeline/src/data_generation/llama/set_sampling_options.cpp
Normal file
65
pipeline/src/data_generation/llama/set_sampling_options.cpp
Normal file
@@ -0,0 +1,65 @@
|
|||||||
|
/**
|
||||||
|
* Sampling Configuration Module
|
||||||
|
* Configures the hyperparameters that control probabilistic token selection
|
||||||
|
* during text generation. These settings affect the randomness, diversity, and
|
||||||
|
* quality of generated output.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <stdexcept>
|
||||||
|
|
||||||
|
#include "data_generation/llama_generator.h"
|
||||||
|
#include "llama.h"
|
||||||
|
|
||||||
|
/**
 * Validates and stores the sampling parameters used during generation.
 *
 * @param temperature Sampling temperature; must be >= 0 (NaN is rejected).
 * @param top_p Nucleus sampling threshold; must be in (0, 1] (NaN is
 *        rejected).
 * @param seed Sampler seed; -1 selects LLAMA_DEFAULT_SEED, any other value
 *        must be non-negative.
 * @throws std::runtime_error if any argument is out of range. Nothing is
 *         stored in that case.
 */
void LlamaGenerator::SetSamplingOptions(float temperature, float top_p,
                                        int seed) {
   /**
    * Validate temperature: controls randomness in output distribution
    * 0.0 = deterministic (always pick highest probability token)
    * Higher values = more random/diverse output
    * Written as a negated ">=" so that NaN, which fails every comparison,
    * is rejected as well (same form as the top-p check below).
    */
   if (!(temperature >= 0.0f)) {
      throw std::runtime_error(
          "LlamaGenerator: sampling temperature must be >= 0");
   }

   /**
    * Validate top-p (nucleus sampling): only sample from top cumulative
    * probability e.g., top-p=0.9 means sample from tokens that make up 90% of
    * probability mass
    */
   if (!(top_p > 0.0f && top_p <= 1.0f)) {
      throw std::runtime_error(
          "LlamaGenerator: sampling top-p must be in (0, 1]");
   }

   /**
    * Validate seed: for reproducible results (-1 uses random seed)
    */
   if (seed < -1) {
      throw std::runtime_error(
          "LlamaGenerator: seed must be >= 0, or -1 for random");
   }

   /**
    * Store sampling parameters for use during token generation
    */
   sampling_temperature_ = temperature;
   sampling_top_p_ = top_p;
   sampling_seed_ = (seed < 0) ? static_cast<uint32_t>(LLAMA_DEFAULT_SEED)
                               : static_cast<uint32_t>(seed);
}
|
||||||
|
|
||||||
|
/**
 * Stores the context window size that Load will request.
 *
 * @param n_ctx Context size in tokens; accepted range is [1, 32768].
 * @throws std::runtime_error if n_ctx is outside that range.
 */
void LlamaGenerator::SetContextSize(uint32_t n_ctx) {
   const bool within_limits = n_ctx >= 1 && n_ctx <= 32768;
   if (!within_limits) {
      throw std::runtime_error(
          "LlamaGenerator: context size must be in range [1, 32768]");
   }

   n_ctx_ = n_ctx;
}
|
||||||
65
pipeline/src/data_generation/mock/data.cpp
Normal file
65
pipeline/src/data_generation/mock/data.cpp
Normal file
@@ -0,0 +1,65 @@
|
|||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "data_generation/mock_generator.h"
|
||||||
|
|
||||||
|
// Word pool for the adjective part of a mock brewery name (see
// GenerateBrewery).
const std::vector<std::string> MockGenerator::kBreweryAdjectives = {
    "Craft",
    "Heritage",
    "Local",
    "Artisan",
    "Pioneer",
    "Golden",
    "Modern",
    "Classic",
    "Summit",
    "Northern",
    "Riverstone",
    "Barrel",
    "Hinterland",
    "Harbor",
    "Wild",
    "Granite",
    "Copper",
    "Maple",
};
|
||||||
|
|
||||||
|
// Word pool for the trailing noun of a mock brewery name (see
// GenerateBrewery).
const std::vector<std::string> MockGenerator::kBreweryNouns = {
    "Brewing Co.",
    "Brewery",
    "Bier Haus",
    "Taproom",
    "Works",
    "House",
    "Fermentery",
    "Ale Co.",
    "Cellars",
    "Collective",
    "Project",
    "Foundry",
    "Malthouse",
    "Public House",
    "Co-op",
    "Lab",
    "Beer Hall",
    "Guild",
};
|
||||||
|
|
||||||
|
// Base sentences for mock brewery descriptions; GenerateBrewery appends a
// "Based in ..." suffix to the selected entry.
const std::vector<std::string> MockGenerator::kBreweryDescriptions = {
    "Handcrafted pale ales and seasonal IPAs with local ingredients.",
    "Traditional lagers and experimental sours in small batches.",
    "Award-winning stouts and wildly hoppy blonde ales.",
    "Craft brewery specializing in Belgian-style triples and dark porters.",
    "Modern brewery blending tradition with bold experimental flavors.",
    "Neighborhood-focused taproom pouring crisp pilsners and citrusy pale "
    "ales.",
    "Small-batch brewery known for barrel-aged releases and smoky lagers.",
    "Independent brewhouse pairing farmhouse ales with rotating food pop-ups.",
    "Community brewpub making balanced bitters, saisons, and hazy IPAs.",
    "Experimental nanobrewery exploring local yeast and regional grains.",
    "Family-run brewery producing smooth amber ales and robust porters.",
    "Urban brewery crafting clean lagers and bright, fruit-forward sours.",
    "Riverfront brewhouse featuring oak-matured ales and seasonal blends.",
    "Modern taproom focused on sessionable lagers and classic pub styles.",
    "Brewery rooted in tradition with a lineup of malty reds and crisp lagers.",
    "Creative brewery offering rotating collaborations and limited draft-only "
    "pours.",
    "Locally inspired brewery serving approachable ales with bold hop "
    "character.",
    "Destination taproom known for balanced IPAs and cocoa-rich stouts.",
};
|
||||||
|
|
||||||
|
// Username pool for mock users (see GenerateUser).
const std::vector<std::string> MockGenerator::kUsernames = {
    "hopseeker",
    "malttrail",
    "yeastwhisper",
    "lagerlane",
    "barrelbound",
    "foamfinder",
    "taphunter",
    "graingeist",
    "brewscout",
    "aleatlas",
    "caskcompass",
    "hopsandmaps",
    "mashpilot",
    "pintnomad",
    "fermentfriend",
    "stoutsignal",
    "sessionwander",
    "kettlekeeper",
};
|
||||||
|
|
||||||
|
// Bio pool for mock users (see GenerateUser).
const std::vector<std::string> MockGenerator::kBios = {
    "Always chasing balanced IPAs and crisp lagers across local taprooms.",
    "Weekend brewery explorer with a soft spot for dark, roasty stouts.",
    "Documenting tiny brewpubs, fresh pours, and unforgettable beer gardens.",
    "Fan of farmhouse ales, food pairings, and long tasting flights.",
    "Collecting favorite pilsners one city at a time.",
    "Hops-first drinker who still saves room for classic malt-forward styles.",
    "Finding hidden tap lists and sharing the best seasonal releases.",
    "Brewery road-tripper focused on local ingredients and clean fermentation.",
    "Always comparing house lagers and ranking patio pint vibes.",
    "Curious about yeast strains, barrel programs, and cellar experiments.",
    "Believes every neighborhood deserves a great community taproom.",
    "Looking for session beers that taste great from first sip to last.",
    "Belgian ale enthusiast who never skips a new saison.",
    "Hazy IPA critic with deep respect for a perfectly clear pilsner.",
    "Visits breweries for the stories, stays for the flagship pours.",
    "Craft beer fan mapping tasting notes and favorite brew routes.",
    "Always ready to trade recommendations for underrated local breweries.",
    "Keeping a running list of must-try collab releases and tap takeovers.",
};
|
||||||
12
pipeline/src/data_generation/mock/deterministic_hash.cpp
Normal file
12
pipeline/src/data_generation/mock/deterministic_hash.cpp
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
#include <string>
|
||||||
|
|
||||||
|
#include "data_generation/mock_generator.h"
|
||||||
|
|
||||||
|
/**
 * Combines the std::hash values of two strings into one value.
 *
 * The hash of `b` is mixed into the hash of `a` with shifts and a fixed
 * constant, and the result is rotated left by 13 bits. Argument order
 * matters.
 * NOTE(review): std::hash values are implementation-defined, so results are
 * presumably stable only for a given standard library build - confirm whether
 * cross-platform stability is expected.
 */
std::size_t MockGenerator::DeterministicHash(const std::string& a,
                                             const std::string& b) {
   constexpr std::size_t kWordBits = sizeof(std::size_t) * 8;
   constexpr std::size_t kRotation = 13;

   const std::hash<std::string> hasher{};
   const std::size_t hash_a = hasher(a);
   const std::size_t hash_b = hasher(b);

   const std::size_t combined =
       hash_a ^ (hash_b + 0x9e3779b97f4a7c15ULL + (hash_a << 6) + (hash_a >> 2));

   // Left rotation by kRotation bits.
   return (combined << kRotation) | (combined >> (kWordBits - kRotation));
}
|
||||||
24
pipeline/src/data_generation/mock/generate_brewery.cpp
Normal file
24
pipeline/src/data_generation/mock/generate_brewery.cpp
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
#include <string>
|
||||||
|
|
||||||
|
#include "data_generation/mock_generator.h"
|
||||||
|
|
||||||
|
auto MockGenerator::GenerateBrewery(const std::string& city_name,
|
||||||
|
const std::string& country_name,
|
||||||
|
const std::string& /*region_context*/)
|
||||||
|
-> BreweryResult {
|
||||||
|
const std::size_t hash = DeterministicHash(city_name, country_name);
|
||||||
|
|
||||||
|
const std::string& adjective =
|
||||||
|
kBreweryAdjectives.at(hash % kBreweryAdjectives.size());
|
||||||
|
const std::string& noun = kBreweryNouns.at((hash / 7) % kBreweryNouns.size());
|
||||||
|
const std::string& base_description =
|
||||||
|
kBreweryDescriptions.at((hash / 13) % kBreweryDescriptions.size());
|
||||||
|
|
||||||
|
const std::string name = city_name + " " + adjective + " " + noun;
|
||||||
|
const std::string description =
|
||||||
|
base_description + " Based in " + city_name +
|
||||||
|
(country_name.empty() ? std::string(".")
|
||||||
|
: std::string(", ") + country_name + ".");
|
||||||
|
|
||||||
|
return {name, description};
|
||||||
|
}
|
||||||
13
pipeline/src/data_generation/mock/generate_user.cpp
Normal file
13
pipeline/src/data_generation/mock/generate_user.cpp
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
#include <functional>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
#include "data_generation/mock_generator.h"
|
||||||
|
|
||||||
|
/**
 * Builds a mock user for a locale without using a model.
 *
 * The std::hash of the locale string selects one entry from kUsernames and
 * one from kBios.
 */
UserResult MockGenerator::GenerateUser(const std::string& locale) {
   const std::size_t locale_hash = std::hash<std::string>{}(locale);
   const std::size_t username_index = locale_hash % kUsernames.size();
   const std::size_t bio_index = (locale_hash / 11) % kBios.size();

   UserResult user;
   user.username = kUsernames[username_index];
   user.bio = kBios[bio_index];
   return user;
}
|
||||||
9
pipeline/src/data_generation/mock/load.cpp
Normal file
9
pipeline/src/data_generation/mock/load.cpp
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
#include <spdlog/spdlog.h>
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
#include "data_generation/mock_generator.h"
|
||||||
|
|
||||||
|
/**
 * No-op counterpart of a model load: the argument is ignored and only an
 * informational message is logged.
 */
void MockGenerator::Load(const std::string& /*model_path*/) {
   spdlog::info("[MockGenerator] No model needed");
}
|
||||||
83
pipeline/src/json_handling/json_loader.cpp
Normal file
83
pipeline/src/json_handling/json_loader.cpp
Normal file
@@ -0,0 +1,83 @@
|
|||||||
|
#include "json_handling/json_loader.h"
|
||||||
|
|
||||||
|
#include <spdlog/spdlog.h>
|
||||||
|
|
||||||
|
#include <boost/json.hpp>
|
||||||
|
|
||||||
|
#include <fstream>
|
||||||
|
#include <sstream>
|
||||||
|
#include <stdexcept>
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
/**
 * Reads a mandatory string member from a JSON object.
 *
 * @param object JSON object to read from.
 * @param key Member name to look up.
 * @return Copy of the member's string value.
 * @throws std::runtime_error if the member is absent or is not a string.
 */
auto ReadRequiredString(const boost::json::object& object,
                        const char* key) -> std::string {
   const boost::json::value* value = object.if_contains(key);
   if (value == nullptr || !value->is_string()) {
      throw std::runtime_error(std::string("Missing or invalid string field: ") +
                               key);
   }

   // Copy by explicit length: going through c_str() would stop at the first
   // embedded NUL (which JSON permits via "\u0000") and needs an extra scan.
   const boost::json::string& text = value->as_string();
   return std::string(text.data(), text.size());
}
|
||||||
|
|
||||||
|
/**
 * Reads a mandatory numeric member from a JSON object as a double.
 *
 * @param object JSON object to read from.
 * @param key Member name to look up.
 * @return The member's value converted with to_number<double>().
 * @throws std::runtime_error if the member is absent or is not a number.
 */
auto ReadRequiredNumber(const boost::json::object& object, const char* key)
    -> double {
   const boost::json::value* const field = object.if_contains(key);
   const bool is_usable = field != nullptr && field->is_number();
   if (!is_usable) {
      throw std::runtime_error(std::string("Missing or invalid numeric field: ") +
                               key);
   }
   return field->to_number<double>();
}
|
||||||
|
|
||||||
|
} // namespace
|
||||||
|
|
||||||
|
auto JsonLoader::LoadLocations(const std::string& filepath)
|
||||||
|
-> std::vector<Location> {
|
||||||
|
std::ifstream input(filepath);
|
||||||
|
if (!input.is_open()) {
|
||||||
|
throw std::runtime_error("Failed to open locations file: " + filepath);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::stringstream buffer;
|
||||||
|
buffer << input.rdbuf();
|
||||||
|
const std::string content = buffer.str();
|
||||||
|
|
||||||
|
boost::json::error_code error;
|
||||||
|
boost::json::value root = boost::json::parse(content, error);
|
||||||
|
if (error) {
|
||||||
|
throw std::runtime_error("Failed to parse locations JSON: " +
|
||||||
|
error.message());
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!root.is_array()) {
|
||||||
|
throw std::runtime_error(
|
||||||
|
"Invalid locations JSON: root element must be an array");
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<Location> locations;
|
||||||
|
const auto& items = root.as_array();
|
||||||
|
locations.reserve(items.size());
|
||||||
|
|
||||||
|
for (const auto& item : items) {
|
||||||
|
if (!item.is_object()) {
|
||||||
|
throw std::runtime_error(
|
||||||
|
"Invalid locations JSON: each entry must be an object");
|
||||||
|
}
|
||||||
|
|
||||||
|
const auto& object = item.as_object();
|
||||||
|
locations.push_back(Location{
|
||||||
|
.city = ReadRequiredString(object, "city"),
|
||||||
|
.state_province = ReadRequiredString(object, "state_province"),
|
||||||
|
.iso3166_2 = ReadRequiredString(object, "iso3166_2"),
|
||||||
|
.country = ReadRequiredString(object, "country"),
|
||||||
|
.iso3166_1 = ReadRequiredString(object, "iso3166_1"),
|
||||||
|
.latitude = ReadRequiredNumber(object, "latitude"),
|
||||||
|
.longitude = ReadRequiredNumber(object, "longitude"),
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
spdlog::info("[JsonLoader] Loaded {} locations from {}", locations.size(),
|
||||||
|
filepath);
|
||||||
|
return locations;
|
||||||
|
}
|
||||||
139
pipeline/src/main.cpp
Normal file
139
pipeline/src/main.cpp
Normal file
@@ -0,0 +1,139 @@
|
|||||||
|
#include <spdlog/spdlog.h>
|
||||||
|
|
||||||
|
#include <boost/program_options.hpp>
|
||||||
|
#include <iostream>
|
||||||
|
#include <memory>
|
||||||
|
|
||||||
|
#include "biergarten_data_generator.h"
|
||||||
|
#include "web_client/curl_web_client.h"
|
||||||
|
|
||||||
|
namespace po = boost::program_options;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Parse command-line arguments into ApplicationOptions.
|
||||||
|
*
|
||||||
|
* @param argc Command-line argument count.
|
||||||
|
* @param argv Command-line arguments.
|
||||||
|
* @param options Output ApplicationOptions struct.
|
||||||
|
* @return true if parsing succeeded and should proceed, false otherwise.
|
||||||
|
*/
|
||||||
|
bool ParseArguments(int argc, char** argv, ApplicationOptions& options) {
|
||||||
|
// If no arguments provided, display usage and exit
|
||||||
|
if (argc == 1) {
|
||||||
|
std::cout << "Biergarten Pipeline - Geographic Data Pipeline with "
|
||||||
|
"Brewery Generation\n\n";
|
||||||
|
std::cout << "Usage: biergarten-pipeline [options]\n\n";
|
||||||
|
std::cout << "Options:\n";
|
||||||
|
std::cout << " --mocked Use mocked generator for "
|
||||||
|
"brewery/user data\n";
|
||||||
|
std::cout << " --model, -m PATH Path to LLM model file (gguf) for "
|
||||||
|
"generation\n";
|
||||||
|
std::cout << " --cache-dir, -c DIR Directory for cached JSON (default: "
|
||||||
|
"/tmp)\n";
|
||||||
|
std::cout << " --temperature TEMP LLM sampling temperature 0.0-1.0 "
|
||||||
|
"(default: 0.8)\n";
|
||||||
|
std::cout << " --top-p VALUE Nucleus sampling parameter 0.0-1.0 "
|
||||||
|
"(default: 0.92)\n";
|
||||||
|
std::cout << " --n-ctx SIZE Context window size in tokens "
|
||||||
|
"(default: 4096)\n";
|
||||||
|
std::cout << " --seed SEED Random seed: -1 for random "
|
||||||
|
"(default: -1)\n";
|
||||||
|
std::cout << " --help, -h Show this help message\n\n";
|
||||||
|
std::cout << "Note: --mocked and --model are mutually exclusive. Exactly "
|
||||||
|
"one must be provided.\n";
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
po::options_description desc("Pipeline Options");
|
||||||
|
desc.add_options()("help,h", "Produce help message")(
|
||||||
|
"mocked", po::bool_switch(),
|
||||||
|
"Use mocked generator for brewery/user data")(
|
||||||
|
"model,m", po::value<std::string>()->default_value(""),
|
||||||
|
"Path to LLM model (gguf)")(
|
||||||
|
"cache-dir,c", po::value<std::string>()->default_value("/tmp"),
|
||||||
|
"Directory for cached JSON")(
|
||||||
|
"temperature", po::value<float>()->default_value(0.8f),
|
||||||
|
"Sampling temperature (higher = more random)")(
|
||||||
|
"top-p", po::value<float>()->default_value(0.92f),
|
||||||
|
"Nucleus sampling top-p in (0,1] (higher = more random)")(
|
||||||
|
"n-ctx", po::value<uint32_t>()->default_value(8192),
|
||||||
|
"Context window size in tokens (1-32768)")(
|
||||||
|
"seed", po::value<int>()->default_value(-1),
|
||||||
|
"Sampler seed: -1 for random, otherwise non-negative integer");
|
||||||
|
|
||||||
|
po::variables_map vm;
|
||||||
|
po::store(po::parse_command_line(argc, argv, desc), vm);
|
||||||
|
po::notify(vm);
|
||||||
|
|
||||||
|
if (vm.count("help")) {
|
||||||
|
std::cout << desc << "\n";
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check for mutually exclusive --mocked and --model flags
|
||||||
|
bool use_mocked = vm["mocked"].as<bool>();
|
||||||
|
std::string model_path = vm["model"].as<std::string>();
|
||||||
|
|
||||||
|
if (use_mocked && !model_path.empty()) {
|
||||||
|
spdlog::error("ERROR: --mocked and --model are mutually exclusive");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!use_mocked && model_path.empty()) {
|
||||||
|
spdlog::error("ERROR: Either --mocked or --model must be specified");
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Warn if sampling parameters are provided with --mocked
|
||||||
|
if (use_mocked) {
|
||||||
|
bool hasTemperature = vm["temperature"].defaulted() == false;
|
||||||
|
bool hasTopP = vm["top-p"].defaulted() == false;
|
||||||
|
bool hasSeed = vm["seed"].defaulted() == false;
|
||||||
|
|
||||||
|
if (hasTemperature || hasTopP || hasSeed) {
|
||||||
|
spdlog::warn(
|
||||||
|
"WARNING: Sampling parameters (--temperature, --top-p, --seed) "
|
||||||
|
"are ignored when using --mocked");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
options.use_mocked = use_mocked;
|
||||||
|
options.model_path = model_path;
|
||||||
|
options.cache_dir = vm["cache-dir"].as<std::string>();
|
||||||
|
options.temperature = vm["temperature"].as<float>();
|
||||||
|
options.top_p = vm["top-p"].as<float>();
|
||||||
|
options.n_ctx = vm["n-ctx"].as<uint32_t>();
|
||||||
|
options.seed = vm["seed"].as<int>();
|
||||||
|
// commit is always pinned to c5eb7772
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, char* argv[]) {
|
||||||
|
try {
|
||||||
|
const CurlGlobalState curl_state;
|
||||||
|
|
||||||
|
ApplicationOptions options;
|
||||||
|
if (!ParseArguments(argc, argv, options)) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto webClient = std::make_shared<CURLWebClient>();
|
||||||
|
|
||||||
|
BiergartenDataGenerator generator(options, webClient);
|
||||||
|
return generator.Run();
|
||||||
|
|
||||||
|
} catch (const std::exception& e) {
|
||||||
|
const std::string message = e.what() ? e.what() : "";
|
||||||
|
|
||||||
|
if (message.find("LlamaGenerator: malformed brewery response") !=
|
||||||
|
std::string::npos) {
|
||||||
|
spdlog::warn("WARNING: Non-fatal LLM failure after retries: {}",
|
||||||
|
message);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
spdlog::error("ERROR: Application failed: {}", e.what());
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
141
pipeline/src/web_client/curl_web_client.cpp
Normal file
141
pipeline/src/web_client/curl_web_client.cpp
Normal file
@@ -0,0 +1,141 @@
|
|||||||
|
#include "web_client/curl_web_client.h"
|
||||||
|
|
||||||
|
#include <curl/curl.h>
|
||||||
|
|
||||||
|
#include <cstdio>
|
||||||
|
#include <fstream>
|
||||||
|
#include <memory>
|
||||||
|
#include <sstream>
|
||||||
|
#include <stdexcept>
|
||||||
|
|
||||||
|
// Initializes libcurl's process-wide state with the default feature set.
// Throws std::runtime_error if curl_global_init does not return CURLE_OK.
CurlGlobalState::CurlGlobalState() {
   const CURLcode init_result = curl_global_init(CURL_GLOBAL_DEFAULT);
   if (init_result == CURLE_OK) {
      return;
   }
   throw std::runtime_error(
       "[CURLWebClient] Failed to initialize libcurl globally");
}

// Tears down the global libcurl state set up by the constructor.
CurlGlobalState::~CurlGlobalState() {
   curl_global_cleanup();
}
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
// libcurl write callback that accumulates the response body in memory.
//
// Appends the received chunk (size * nmemb bytes starting at contents) to the
// std::string that userp points to, and reports the whole chunk as consumed.
size_t WriteCallbackString(void* contents, size_t size, size_t nmemb,
                           void* userp) {
   const size_t chunk_bytes = size * nmemb;
   const auto* chunk = static_cast<const char*>(contents);
   static_cast<std::string*>(userp)->append(chunk, chunk_bytes);
   return chunk_bytes;
}
|
||||||
|
|
||||||
|
// libcurl write callback that streams the response body into a file.
//
// Writes the received chunk (size * nmemb bytes starting at contents) to the
// std::ofstream that userp points to.
//
// Returns the number of bytes consumed. When the stream is in a failed state
// after the write (disk full, file never opened, ...), returns 0 instead of
// the chunk size; libcurl treats a short count as a write error and aborts the
// transfer rather than silently producing a truncated file.
size_t WriteCallbackFile(void* contents, size_t size, size_t nmemb,
                         void* userp) {
   const size_t realsize = size * nmemb;
   auto* outFile = static_cast<std::ofstream*>(userp);

   outFile->write(static_cast<const char*>(contents),
                  static_cast<std::streamsize>(realsize));

   if (!*outFile) {
      return 0;
   }
   return realsize;
}
|
||||||
|
|
||||||
|
// RAII wrapper for CURL handle using unique_ptr
|
||||||
|
using CurlHandle = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
|
||||||
|
|
||||||
|
CurlHandle create_handle() {
|
||||||
|
CURL* handle = curl_easy_init();
|
||||||
|
if (!handle) {
|
||||||
|
throw std::runtime_error(
|
||||||
|
"[CURLWebClient] Failed to initialize libcurl handle");
|
||||||
|
}
|
||||||
|
return CurlHandle(handle, &curl_easy_cleanup);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Applies the options shared by every GET-style request to an easy handle.
//
// curl             Easy handle to configure.
// url              Target URL.
// connect_timeout  Value passed to CURLOPT_CONNECTTIMEOUT.
// total_timeout    Value passed to CURLOPT_TIMEOUT.
//
// Besides the URL and timeouts, this sets the pipeline's user agent, enables
// following of up to five redirects, and requests gzip content encoding.
void set_common_get_options(CURL* curl, const std::string& url,
                            long connect_timeout, long total_timeout) {
   constexpr long kFollowRedirects = 1L;
   constexpr long kMaxRedirects = 5L;
   const char* const user_agent = "biergarten-pipeline/0.1.0";
   const char* const accepted_encoding = "gzip";

   // Request identity.
   curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
   curl_easy_setopt(curl, CURLOPT_USERAGENT, user_agent);
   curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, accepted_encoding);

   // Redirect policy.
   curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, kFollowRedirects);
   curl_easy_setopt(curl, CURLOPT_MAXREDIRS, kMaxRedirects);

   // Time limits supplied by the caller.
   curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, connect_timeout);
   curl_easy_setopt(curl, CURLOPT_TIMEOUT, total_timeout);
}
|
||||||
|
} // namespace
|
||||||
|
|
||||||
|
// The client holds no per-instance libcurl state; each request creates its own
// easy handle, so construction and destruction have nothing to do.
CURLWebClient::CURLWebClient() = default;

CURLWebClient::~CURLWebClient() = default;
|
||||||
|
|
||||||
|
void CURLWebClient::DownloadToFile(const std::string& url,
|
||||||
|
const std::string& file_path) {
|
||||||
|
auto curl = create_handle();
|
||||||
|
|
||||||
|
std::ofstream outFile(file_path, std::ios::binary);
|
||||||
|
if (!outFile.is_open()) {
|
||||||
|
throw std::runtime_error(
|
||||||
|
"[CURLWebClient] Cannot open file for writing: " + file_path);
|
||||||
|
}
|
||||||
|
|
||||||
|
set_common_get_options(curl.get(), url, 30L, 300L);
|
||||||
|
curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, WriteCallbackFile);
|
||||||
|
curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA,
|
||||||
|
static_cast<void*>(&outFile));
|
||||||
|
|
||||||
|
CURLcode res = curl_easy_perform(curl.get());
|
||||||
|
outFile.close();
|
||||||
|
|
||||||
|
if (res != CURLE_OK) {
|
||||||
|
std::remove(file_path.c_str());
|
||||||
|
std::string error = std::string("[CURLWebClient] Download failed: ") +
|
||||||
|
curl_easy_strerror(res);
|
||||||
|
throw std::runtime_error(error);
|
||||||
|
}
|
||||||
|
|
||||||
|
long httpCode = 0;
|
||||||
|
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &httpCode);
|
||||||
|
|
||||||
|
if (httpCode != 200) {
|
||||||
|
std::remove(file_path.c_str());
|
||||||
|
std::stringstream ss;
|
||||||
|
ss << "[CURLWebClient] HTTP error " << httpCode << " for URL " << url;
|
||||||
|
throw std::runtime_error(ss.str());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string CURLWebClient::Get(const std::string& url) {
|
||||||
|
auto curl = create_handle();
|
||||||
|
|
||||||
|
std::string response_string;
|
||||||
|
set_common_get_options(curl.get(), url, 10L, 20L);
|
||||||
|
curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, WriteCallbackString);
|
||||||
|
curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &response_string);
|
||||||
|
|
||||||
|
CURLcode res = curl_easy_perform(curl.get());
|
||||||
|
|
||||||
|
if (res != CURLE_OK) {
|
||||||
|
std::string error =
|
||||||
|
std::string("[CURLWebClient] GET failed: ") + curl_easy_strerror(res);
|
||||||
|
throw std::runtime_error(error);
|
||||||
|
}
|
||||||
|
|
||||||
|
long httpCode = 0;
|
||||||
|
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &httpCode);
|
||||||
|
|
||||||
|
if (httpCode != 200) {
|
||||||
|
std::stringstream ss;
|
||||||
|
ss << "[CURLWebClient] HTTP error " << httpCode << " for URL " << url;
|
||||||
|
throw std::runtime_error(ss.str());
|
||||||
|
}
|
||||||
|
|
||||||
|
return response_string;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string CURLWebClient::UrlEncode(const std::string& value) {
|
||||||
|
// A NULL handle is fine for UTF-8 encoding according to libcurl docs.
|
||||||
|
char* output = curl_easy_escape(nullptr, value.c_str(), 0);
|
||||||
|
|
||||||
|
if (output) {
|
||||||
|
std::string result(output);
|
||||||
|
curl_free(output);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
throw std::runtime_error("[CURLWebClient] curl_easy_escape failed");
|
||||||
|
}
|
||||||
89
pipeline/src/wikipedia/wikipedia_service.cpp
Normal file
89
pipeline/src/wikipedia/wikipedia_service.cpp
Normal file
@@ -0,0 +1,89 @@
|
|||||||
|
#include "wikipedia/wikipedia_service.h"
|
||||||
|
|
||||||
|
#include <spdlog/spdlog.h>
|
||||||
|
|
||||||
|
#include <boost/json.hpp>
|
||||||
|
|
||||||
|
WikipediaService::WikipediaService(std::shared_ptr<WebClient> client)
|
||||||
|
: client_(std::move(client)) {}
|
||||||
|
|
||||||
|
std::string WikipediaService::FetchExtract(std::string_view query) {
|
||||||
|
const std::string encoded = client_->UrlEncode(std::string(query));
|
||||||
|
const std::string url =
|
||||||
|
"https://en.wikipedia.org/w/api.php?action=query&titles=" + encoded +
|
||||||
|
"&prop=extracts&explaintext=1&format=json";
|
||||||
|
|
||||||
|
const std::string body = client_->Get(url);
|
||||||
|
|
||||||
|
boost::system::error_code ec;
|
||||||
|
boost::json::value doc = boost::json::parse(body, ec);
|
||||||
|
|
||||||
|
if (!ec && doc.is_object()) {
|
||||||
|
try {
|
||||||
|
auto& pages = doc.at("query").at("pages").get_object();
|
||||||
|
if (!pages.empty()) {
|
||||||
|
auto& page = pages.begin()->value().get_object();
|
||||||
|
if (page.contains("extract") && page.at("extract").is_string()) {
|
||||||
|
std::string extract(page.at("extract").as_string().c_str());
|
||||||
|
spdlog::debug("WikipediaService fetched {} chars for '{}'",
|
||||||
|
extract.size(), query);
|
||||||
|
return extract;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (const std::exception& e) {
|
||||||
|
spdlog::warn(
|
||||||
|
"WikipediaService: failed to parse response structure for '{}': "
|
||||||
|
"{}",
|
||||||
|
query, e.what());
|
||||||
|
return {};
|
||||||
|
}
|
||||||
|
} else if (ec) {
|
||||||
|
spdlog::warn("WikipediaService: JSON parse error for '{}': {}", query,
|
||||||
|
ec.message());
|
||||||
|
}
|
||||||
|
|
||||||
|
return {};
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string WikipediaService::GetSummary(std::string_view city,
|
||||||
|
std::string_view country) {
|
||||||
|
const std::string key = std::string(city) + "|" + std::string(country);
|
||||||
|
const auto cacheIt = cache_.find(key);
|
||||||
|
if (cacheIt != cache_.end()) {
|
||||||
|
return cacheIt->second;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string result;
|
||||||
|
|
||||||
|
if (!client_) {
|
||||||
|
cache_.emplace(key, result);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string regionQuery(city);
|
||||||
|
if (!country.empty()) {
|
||||||
|
regionQuery += ", ";
|
||||||
|
regionQuery += country;
|
||||||
|
}
|
||||||
|
|
||||||
|
const std::string beerQuery = "beer in " + std::string(country);
|
||||||
|
|
||||||
|
try {
|
||||||
|
const std::string regionExtract = FetchExtract(regionQuery);
|
||||||
|
const std::string beerExtract = FetchExtract(beerQuery);
|
||||||
|
|
||||||
|
if (!regionExtract.empty()) {
|
||||||
|
result += regionExtract;
|
||||||
|
}
|
||||||
|
if (!beerExtract.empty()) {
|
||||||
|
if (!result.empty()) result += "\n\n";
|
||||||
|
result += beerExtract;
|
||||||
|
}
|
||||||
|
} catch (const std::runtime_error& e) {
|
||||||
|
spdlog::debug("WikipediaService lookup failed for '{}': {}", regionQuery,
|
||||||
|
e.what());
|
||||||
|
}
|
||||||
|
|
||||||
|
cache_.emplace(key, result);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user