Begin work on biergarten data generator pipeline

2026-04-05 18:09:04 +00:00 · 2026-04-01 19:33:50 -04:00
47 changed files with 1264 additions and 4077 deletions
--- a/pipeline/.clang-format
+++ b/pipeline/.clang-format
@@ -1,5 +0,0 @@
---
-BasedOnStyle: Google
-ColumnLimit: 80
-IndentWidth: 3
-...
--- a/pipeline/.clang-tidy
+++ b/pipeline/.clang-tidy
@@ -1,17 +0,0 @@
---
-Checks: >
-  -*,
-  bugprone-*,
-  clang-analyzer-*,
-  cppcoreguidelines-*,
-  google-*,
-  modernize-*,
-  performance-*,
-  readability-*,
-  -cppcoreguidelines-avoid-magic-numbers,
-  -cppcoreguidelines-owning-memory,
-  -readability-magic-numbers,
-  -google-readability-todo
-HeaderFilterRegex: "^(src|includes)/.*"
-FormatStyle: file
-...
--- a/pipeline/CMakeLists.txt
+++ b/pipeline/CMakeLists.txt
@@ -1,170 +1,113 @@
 cmake_minimum_required(VERSION 3.20)
 project(biergarten-pipeline VERSION 0.1.0 LANGUAGES CXX)

-# Allows older dependencies to configure on newer CMake.
-set(CMAKE_POLICY_VERSION_MINIMUM 3.5)
+# CMP0167 (prefer the upstream BoostConfig.cmake over the FindBoost module) was
+# introduced in CMake 3.30. This project allows CMake 3.20+, and setting an
+# unknown policy is a hard configure error, so only set it when it exists.
+if(POLICY CMP0167)
+    cmake_policy(SET CMP0167 NEW)
+endif()

-# Policies
-cmake_policy(SET CMP0167 NEW) # FindBoost improvements
-
-# Global Settings
 set(CMAKE_CXX_STANDARD 23)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_EXTENSIONS OFF)
-set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

-option(ENABLE_CLANG_TIDY "Enable clang-tidy static analysis for project targets" ON)
-option(ENABLE_CLANG_FORMAT_TARGETS "Enable clang-format helper targets" ON)
-
-if(ENABLE_CLANG_TIDY)
-    find_program(CLANG_TIDY_EXE NAMES clang-tidy)
-    if(CLANG_TIDY_EXE)
-        set(BIERGARTEN_CLANG_TIDY_COMMAND
-            "${CLANG_TIDY_EXE};--config-file=${CMAKE_CURRENT_SOURCE_DIR}/.clang-tidy")
-        message(STATUS "clang-tidy enabled: ${CLANG_TIDY_EXE}")
-    else()
-        message(STATUS "clang-tidy not found; static analysis is disabled")
-    endif()
-endif()
-
-# -----------------------------------------------------------------------------
-# Compiler Options & Warnings (Interface Library)
-# -----------------------------------------------------------------------------
-add_library(project_options INTERFACE)
-target_compile_options(project_options INTERFACE
-    $<$<CXX_COMPILER_ID:GNU,Clang>:
-        -Wall -Wextra -Wpedantic -Wshadow -Wconversion -Wsign-conversion -Wunused
-    >
-    $<$<CXX_COMPILER_ID:MSVC>:
-        /W4 /WX /permissive-
-    >
-)
-
-# -----------------------------------------------------------------------------
-# Dependencies
-# -----------------------------------------------------------------------------
 find_package(CURL REQUIRED)
+find_package(Boost REQUIRED COMPONENTS unit_test_framework)
 find_package(SQLite3 REQUIRED)
-find_package(Boost 1.75 REQUIRED COMPONENTS program_options json)

 include(FetchContent)

-# spdlog (Logging)
 FetchContent_Declare(
-    spdlog
-    GIT_REPOSITORY https://github.com/gabime/spdlog.git
-    GIT_TAG        v1.11.0
+    nlohmann_json
+    GIT_REPOSITORY https://github.com/nlohmann/json.git
+    GIT_TAG        v3.11.3
 )
-FetchContent_MakeAvailable(spdlog)
+FetchContent_MakeAvailable(nlohmann_json)

-# llama.cpp (LLM Inference)
-set(LLAMA_BUILD_TESTS OFF CACHE BOOL "" FORCE)
-set(LLAMA_BUILD_EXAMPLES OFF CACHE BOOL "" FORCE)
-set(LLAMA_BUILD_SERVER OFF CACHE BOOL "" FORCE)
-FetchContent_Declare(
-    llama_cpp
-    GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git
-    GIT_TAG        b8611
-)
-FetchContent_MakeAvailable(llama_cpp)
+# TODO: Integrate real llama.cpp when generator is ready to use actual models
+# For now, using mocked brewery generation in generator.cpp

-if(TARGET llama)
-    target_compile_options(llama PRIVATE
-        $<$<CXX_COMPILER_ID:AppleClang>:-include algorithm>
-    )
-endif()
+# SQLite3 is already located by the find_package(SQLite3 REQUIRED) call in the
+# dependency list above, so the lookup is not repeated here.

-# -----------------------------------------------------------------------------
-# Main Executable
-# -----------------------------------------------------------------------------
-set(PIPELINE_SOURCES
-    src/biergarten_data_generator.cpp
-    src/web_client/curl_web_client.cpp
-    src/data_generation/data_downloader.cpp
-    src/database/database.cpp
-    src/json_handling/json_loader.cpp
-    src/data_generation/llama/destructor.cpp
-    src/data_generation/llama/set_sampling_options.cpp
-    src/data_generation/llama/load.cpp
-    src/data_generation/llama/infer.cpp
-    src/data_generation/llama/generate_brewery.cpp
-    src/data_generation/llama/generate_user.cpp
-    src/data_generation/llama/helpers.cpp
-    src/data_generation/llama/load_brewery_prompt.cpp
-    src/data_generation/mock/data.cpp
-    src/data_generation/mock/deterministic_hash.cpp
-    src/data_generation/mock/load.cpp
-    src/data_generation/mock/generate_brewery.cpp
-    src/data_generation/mock/generate_user.cpp
-    src/json_handling/stream_parser.cpp
-    src/wikipedia/wikipedia_service.cpp
-    src/main.cpp
+file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS
+    src/*.cpp
 )

-add_executable(biergarten-pipeline ${PIPELINE_SOURCES})
-
-if(BIERGARTEN_CLANG_TIDY_COMMAND)
-    set_target_properties(biergarten-pipeline PROPERTIES
-        CXX_CLANG_TIDY "${BIERGARTEN_CLANG_TIDY_COMMAND}"
-    )
-endif()
+add_executable(biergarten-pipeline ${SOURCES})

 target_include_directories(biergarten-pipeline
    PRIVATE
        ${CMAKE_CURRENT_SOURCE_DIR}/includes
-        ${llama_cpp_SOURCE_DIR}/include
 )

 target_link_libraries(biergarten-pipeline
    PRIVATE
-        project_options
        CURL::libcurl
+        nlohmann_json::nlohmann_json
+        Boost::unit_test_framework
        SQLite::SQLite3
-        spdlog::spdlog
-        llama
-        Boost::program_options
-        Boost::json
 )

-if(ENABLE_CLANG_FORMAT_TARGETS)
-    find_program(CLANG_FORMAT_EXE NAMES clang-format)
-    if(CLANG_FORMAT_EXE)
-        file(GLOB_RECURSE FORMAT_SOURCES CONFIGURE_DEPENDS
-            ${CMAKE_CURRENT_SOURCE_DIR}/src/**/*.cpp
-            ${CMAKE_CURRENT_SOURCE_DIR}/src/**/*.cc
-            ${CMAKE_CURRENT_SOURCE_DIR}/includes/**/*.h
-            ${CMAKE_CURRENT_SOURCE_DIR}/includes/**/*.hpp
-        )
+# Strict warning set, applied to the pipeline target only (PRIVATE).
+# - GNU-style flags cover GCC, Clang and AppleClang; CMake reports AppleClang
+#   as its own compiler ID, so it has to be listed explicitly.
+# - MSVC uses /W4 and treats warnings as errors (/WX).
+# Each generator expression is quoted so its ';'-separated flag list is passed
+# as a single argument.
+target_compile_options(biergarten-pipeline PRIVATE
+    "$<$<CXX_COMPILER_ID:GNU,Clang,AppleClang>:-Wall;-Wextra;-Wpedantic;-Wshadow;-Wconversion;-Wsign-conversion>"
+    "$<$<CXX_COMPILER_ID:MSVC>:/W4;/WX>"
+)

-        add_custom_target(format
-            COMMAND ${CLANG_FORMAT_EXE} -style=file -i ${FORMAT_SOURCES}
-            COMMENT "Formatting source files with clang-format (Google style)"
-            VERBATIM
-        )
-
-        add_custom_target(format-check
-            COMMAND ${CLANG_FORMAT_EXE} -style=file --dry-run --Werror ${FORMAT_SOURCES}
-            COMMENT "Checking source formatting with clang-format (Google style)"
-            VERBATIM
-        )
-    else()
-        message(STATUS "clang-format not found; format targets are disabled")
-    endif()
-endif()
-
-# -----------------------------------------------------------------------------
-# Post-Build Steps & Utilities
-# -----------------------------------------------------------------------------
 add_custom_command(TARGET biergarten-pipeline POST_BUILD
-    COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_SOURCE_DIR}/output
-    COMMENT "Ensuring output directory exists"
+    COMMAND ${CMAKE_COMMAND} -E make_directory
+        ${CMAKE_CURRENT_SOURCE_DIR}/output
+    COMMENT "Creating output/ directory for seed SQL files"
 )

 find_program(VALGRIND valgrind)
 if(VALGRIND)
    add_custom_target(memcheck
-        COMMAND ${VALGRIND} --leak-check=full --error-exitcode=1 $<TARGET_FILE:biergarten-pipeline> --help
+        COMMAND ${VALGRIND}
+            --leak-check=full
+            --error-exitcode=1
+            $<TARGET_FILE:biergarten-pipeline> --help
        DEPENDS biergarten-pipeline
-        COMMENT "Running Valgrind memory check"
+        COMMENT "Running Valgrind memcheck"
    )
 endif()
+
+# Unit tests (Boost.Test). include(CTest) defines the BUILD_TESTING option and
+# enables add_test().
+include(CTest)
+
+if(BUILD_TESTING)
+   find_package(Boost REQUIRED COMPONENTS unit_test_framework)
+
+   file(GLOB_RECURSE TEST_SOURCES CONFIGURE_DEPENDS
+      tests/*.cpp
+      tests/*.cc
+      tests/*.cxx
+   )
+
+   # Only create the test target when at least one test source exists.
+   if(TEST_SOURCES)
+      add_executable(biergarten-pipeline-tests ${TEST_SOURCES})
+
+      # Project headers live in includes/, the same directory the main target
+      # uses.
+      target_include_directories(biergarten-pipeline-tests
+         PRIVATE
+            ${CMAKE_CURRENT_SOURCE_DIR}/includes
+      )
+
+      # `llama` is not linked: this file does not fetch llama.cpp, so no such
+      # target exists and the bare name would reach the linker as -lllama.
+      target_link_libraries(biergarten-pipeline-tests
+         PRIVATE
+            Boost::unit_test_framework
+            CURL::libcurl
+            nlohmann_json::nlohmann_json
+      )
+
+      add_test(
+         NAME biergarten-pipeline-tests
+         COMMAND biergarten-pipeline-tests
+      )
+   endif()
+endif()
--- a/pipeline/README.md
+++ b/pipeline/README.md
@@ -1,406 +1 @@
-# Biergarten Pipeline

-A high-performance C++23 data pipeline for fetching, parsing, and storing geographic data (countries, states, cities) with brewery metadata generation capabilities. The system supports both mock and LLM-based (llama.cpp) generation modes.
-
-## Overview
-
-The pipeline orchestrates **four key stages**:
-
-1. **Download** - Fetches `countries+states+cities.json` from a pinned GitHub commit with optional local filesystem caching
-2. **Parse** - Streams JSON using Boost.JSON's `basic_parser` to extract country/state/city records without loading the entire file into memory
-3. **Store** - Inserts records into a file-based SQLite database with all operations performed sequentially in a single thread
-4. **Generate** - Produces brewery metadata or user profiles (mock implementation; supports future LLM integration via llama.cpp)
-
-## System Architecture
-
-### Data Sources and Formats
-
- **Hierarchical Structure**: Countries array → states per country → cities per state
- **Data Fields**:
-  - `id` (integer)
-  - `name` (string)
-  - `iso2` / `iso3` (ISO country/state codes)
-  - `latitude` / `longitude` (geographic coordinates)
- **Source**: [dr5hn/countries-states-cities-database](https://github.com/dr5hn/countries-states-cities-database) on GitHub
- **Output**: Structured SQLite file-based database (`biergarten-pipeline.db`) + structured logging via spdlog
-
-### Concurrency Model
-
-The pipeline currently operates **single-threaded** with sequential stage execution:
-
-1. **Download Phase**: Main thread blocks while downloading the source JSON file (if not in cache)
-2. **Parse & Store Phase**: Main thread performs streaming JSON parse with immediate SQLite inserts
-
-**Thread Safety**: While single-threaded, the `SqliteDatabase` component is **mutex-protected** using `std::mutex` (`dbMutex`) for all database operations. This design enables safe future parallelization without code modifications.
-
-## Core Components
-
-| Component                     | Purpose                                                                                         | Thread Safety                                | Dependencies                                  |
-| ----------------------------- | ----------------------------------------------------------------------------------------------- | -------------------------------------------- | --------------------------------------------- |
-| **BiergartenDataGenerator**   | Orchestrates pipeline execution; manages lifecycle of downloader, parser, and generator         | Single-threaded coordinator                  | ApplicationOptions, WebClient, SqliteDatabase |
-| **DataDownloader**            | HTTP fetch with curl; optional filesystem cache; ETag support and retries                       | Blocking I/O; safe for startup               | IWebClient, filesystem                        |
-| **StreamingJsonParser**       | Extends `boost::json::basic_parser`; emits country/state/city via callbacks; tracks parse depth | Single-threaded parse; callbacks thread-safe | Boost.JSON                                    |
-| **JsonLoader**                | Wraps parser; dispatches callbacks for country/state/city; manages WorkQueue lifecycle          | Produces to WorkQueue; safe callbacks        | StreamingJsonParser, SqliteDatabase           |
-| **SqliteDatabase**            | Manages schema initialization; insert/query methods for geographic data                         | Mutex-guarded all operations                 | SQLite3                                       |
-| **IDataGenerator** (Abstract) | Interface for brewery/user metadata generation                                                  | Stateless virtual methods                    | N/A                                           |
-| **LlamaGenerator**            | LLM-based generation via llama.cpp; configurable sampling (temperature, top-p, seed)            | Manages llama_model* and llama_context*      | llama.cpp, BreweryResult, UserResult          |
-| **MockGenerator**             | Deterministic mock generation using seeded randomization                                        | Stateless; thread-safe                       | N/A                                           |
-| **CURLWebClient**             | HTTP client adapter; URL encoding; file downloads                                               | cURL library bindings                        | libcurl                                       |
-| **WikipediaService**          | (Planned) Wikipedia data lookups for enrichment                                                 | N/A                                          | IWebClient                                    |
-
-## Database Schema
-
-SQLite file-based database with **three core tables** and **indexes for fast lookups**:
-
-### Countries
-
-```sql
-CREATE TABLE countries (
-  id INTEGER PRIMARY KEY,
-  name TEXT NOT NULL,
-  iso2 TEXT,
-  iso3 TEXT
-);
-CREATE INDEX idx_countries_iso2 ON countries(iso2);
-```
-
-### States
-
-```sql
-CREATE TABLE states (
-  id INTEGER PRIMARY KEY,
-  country_id INTEGER NOT NULL,
-  name TEXT NOT NULL,
-  iso2 TEXT,
-  FOREIGN KEY (country_id) REFERENCES countries(id)
-);
-CREATE INDEX idx_states_country ON states(country_id);
-```
-
-### Cities
-
-```sql
-CREATE TABLE cities (
-  id INTEGER PRIMARY KEY,
-  state_id INTEGER NOT NULL,
-  country_id INTEGER NOT NULL,
-  name TEXT NOT NULL,
-  latitude REAL,
-  longitude REAL,
-  FOREIGN KEY (state_id) REFERENCES states(id),
-  FOREIGN KEY (country_id) REFERENCES countries(id)
-);
-CREATE INDEX idx_cities_state ON cities(state_id);
-CREATE INDEX idx_cities_country ON cities(country_id);
-```
-
-## Architecture Diagram
-
-```plantuml
-@startuml biergarten-pipeline
-!theme plain
-skinparam monochrome true
-skinparam classBackgroundColor #FFFFFF
-skinparam classBorderColor #000000
-
-package "Application Layer" {
-  class BiergartenDataGenerator {
-    - options: ApplicationOptions
-    - webClient: IWebClient
-    - database: SqliteDatabase
-    - generator: IDataGenerator
-    --
-    + Run() : int
-  }
-}
-
-package "Data Acquisition" {
-  class DataDownloader {
-    - webClient: IWebClient
-    --
-    + Download(url: string, filePath: string)
-    + DownloadWithCache(url: string, cachePath: string)
-  }
-
-  interface IWebClient {
-    + DownloadToFile(url: string, filePath: string)
-    + Get(url: string) : string
-    + UrlEncode(value: string) : string
-  }
-
-  class CURLWebClient {
-    - globalState: CurlGlobalState
-    --
-    + DownloadToFile(url: string, filePath: string)
-    + Get(url: string) : string
-    + UrlEncode(value: string) : string
-  }
-}
-
-package "JSON Processing" {
-  class StreamingJsonParser {
-    - depth: int
-    --
-    + on_object_begin()
-    + on_object_end()
-    + on_array_begin()
-    + on_array_end()
-    + on_key(str: string)
-    + on_string(str: string)
-    + on_number(value: int)
-  }
-
-  class JsonLoader {
-    --
-    + LoadWorldCities(jsonPath: string, db: SqliteDatabase)
-  }
-}
-
-package "Data Storage" {
-  class SqliteDatabase {
-    - db: sqlite3*
-    - dbMutex: std::mutex
-    --
-    + Initialize(dbPath: string)
-    + InsertCountry(id: int, name: string, iso2: string, iso3: string)
-    + InsertState(id: int, countryId: int, name: string, iso2: string)
-    + InsertCity(id: int, stateId: int, countryId: int, name: string, lat: double, lon: double)
-    + QueryCountries(limit: int) : vector<Country>
-    + QueryStates(limit: int) : vector<State>
-    + QueryCities() : vector<City>
-    + BeginTransaction()
-    + CommitTransaction()
-    # InitializeSchema()
-  }
-
-  struct Country {
-    id: int
-    name: string
-    iso2: string
-    iso3: string
-  }
-
-  struct State {
-    id: int
-    name: string
-    iso2: string
-    countryId: int
-  }
-
-  struct City {
-    id: int
-    name: string
-    countryId: int
-  }
-}
-
-package "Data Generation" {
-  interface IDataGenerator {
-    + load(modelPath: string)
-    + generateBrewery(cityName: string, countryName: string, regionContext: string) : BreweryResult
-    + generateUser(locale: string) : UserResult
-  }
-
-  class LlamaGenerator {
-    - model: llama_model*
-    - context: llama_context*
-    - sampling_temperature: float
-    - sampling_top_p: float
-    - sampling_seed: uint32_t
-    --
-    + load(modelPath: string)
-    + generateBrewery(...) : BreweryResult
-    + generateUser(locale: string) : UserResult
-    + setSamplingOptions(temperature: float, topP: float, seed: int)
-    # infer(prompt: string) : string
-  }
-
-  class MockGenerator {
-    --
-    + load(modelPath: string)
-    + generateBrewery(...) : BreweryResult
-    + generateUser(locale: string) : UserResult
-  }
-
-  struct BreweryResult {
-    name: string
-    description: string
-  }
-
-  struct UserResult {
-    username: string
-    bio: string
-  }
-}
-
-package "Enrichment (Planned)" {
-  class WikipediaService {
-    - webClient: IWebClient
-    --
-    + SearchCity(cityName: string, countryName: string) : string
-  }
-}
-
-' Relationships
-BiergartenDataGenerator --> DataDownloader
-BiergartenDataGenerator --> JsonLoader
-BiergartenDataGenerator --> SqliteDatabase
-BiergartenDataGenerator --> IDataGenerator
-
-DataDownloader --> IWebClient
-CURLWebClient ..|> IWebClient
-
-JsonLoader --> StreamingJsonParser
-JsonLoader --> SqliteDatabase
-
-LlamaGenerator ..|> IDataGenerator
-MockGenerator ..|> IDataGenerator
-
-SqliteDatabase --> Country
-SqliteDatabase --> State
-SqliteDatabase --> City
-
-LlamaGenerator --> BreweryResult
-LlamaGenerator --> UserResult
-MockGenerator --> BreweryResult
-MockGenerator --> UserResult
-
-WikipediaService --> IWebClient
-
-@enduml
-```
-
-## Configuration and Extensibility
-
-### Command-Line Arguments
-
-Boost.Program_options provides named CLI arguments. Running without arguments displays usage instructions.
-
-```bash
-./biergarten-pipeline [options]
-```
-
-**Requirement**: Exactly one of `--mocked` or `--model` must be specified.
-
-| Argument        | Short | Type   | Purpose                                                         |
-| --------------- | ----- | ------ | --------------------------------------------------------------- |
-| `--mocked`      | -     | flag   | Use mocked generator for brewery/user data                      |
-| `--model`       | `-m`  | string | Path to LLM model file (gguf); mutually exclusive with --mocked |
-| `--cache-dir`   | `-c`  | path   | Directory for cached JSON (default: `/tmp`)                     |
-| `--temperature` | -     | float  | LLM sampling temperature 0.0-1.0 (default: `0.8`)               |
-| `--top-p`       | -     | float  | Nucleus sampling parameter 0.0-1.0 (default: `0.92`)            |
-| `--seed`        | -     | int    | Random seed: -1 for random (default: `-1`)                      |
-| `--help`        | `-h`  | flag   | Show help message                                               |
-
-**Note**: The data source is always pinned to commit `c5eb7772` (stable 2026-03-28) and cannot be changed.
-
-**Note**: When `--mocked` is used, any sampling parameters (`--temperature`, `--top-p`, `--seed`) are ignored with a warning.
-
-### Usage Examples
-
-```bash
-# Mocked generator (deterministic, no LLM required)
-./biergarten-pipeline --mocked
-
-# With LLM model
-./biergarten-pipeline --model ./models/llama.gguf --cache-dir /var/cache
-
-# Mocked with extra parameters provided (will be ignored with warning)
-./biergarten-pipeline --mocked --temperature 0.5 --top-p 0.8 --seed 42
-
-# Show help
-./biergarten-pipeline --help
-```
-
-## Building and Running
-
-### Prerequisites
-
- **C++23 compiler** (g++, clang, MSVC)
- **CMake** 3.20+
- **curl** (for HTTP downloads)
- **sqlite3** (database backend)
- **Boost** 1.75+ (requires Boost.JSON and Boost.Program_options)
- **spdlog** v1.11.0 (fetched via CMake FetchContent)
- **llama.cpp** (fetched via CMake FetchContent for LLM inference)
-
-### Build
-
-```bash
-mkdir -p build
-cd build
-cmake ..
-cmake --build . --target biergarten-pipeline -- -j
-```
-
-### Run
-
-```bash
-./build/biergarten-pipeline
-```
-
-**Output**:
-
- Console logs with structured spdlog output
- Cached JSON file: `/tmp/countries+states+cities.json`
- SQLite database: `biergarten-pipeline.db` (in output directory)
-
-## Code Quality and Static Analysis
-
-### Formatting
-
-This project uses **clang-format** with the **Google C++ style guide**:
-
-```bash
-# Apply formatting to all source files
-cmake --build build --target format
-
-# Check formatting without modifications
-cmake --build build --target format-check
-```
-
-### Static Analysis
-
-This project uses **clang-tidy** with configurations for Google, modernize, performance, and bug-prone rules (`.clang-tidy`):
-
-Static analysis runs automatically during compilation if `clang-tidy` is available.
-
-## Code Implementation Summary
-
-### Key Achievements
-
-✅ **Full pipeline implementation** - Download → Parse → Store → Generate
-✅ **Streaming JSON parser** - Memory-efficient processing via Boost.JSON callbacks
-✅ **Thread-safe SQLite wrapper** - Mutex-protected database for future parallelization
-✅ **Flexible data generation** - Abstract IDataGenerator interface supporting both mock and LLM modes
-✅ **Comprehensive CLI** - Boost.Program_options with sensible defaults
-✅ **Production-grade logging** - spdlog integration for structured output
-✅ **Build quality** - CMake with clang-format/clang-tidy integration
-
-### Architecture Patterns
-
- **Interface-based design**: `IWebClient`, `IDataGenerator` abstract base classes enable substitution and testing
- **Dependency injection**: Components receive dependencies via constructors (BiergartenDataGenerator)
- **RAII principle**: SQLite connections and resources managed via destructors
- **Callback-driven parsing**: Boost.JSON parser emits events to processing callbacks
- **Transaction-scoped inserts**: BeginTransaction/CommitTransaction for batch performance
-
-### External Dependencies
-
-| Dependency | Version | Purpose                            | Type    |
-| ---------- | ------- | ---------------------------------- | ------- |
-| Boost      | 1.75+   | JSON parsing, CLI argument parsing | Library |
-| SQLite3    | -       | Persistent data storage            | System  |
-| libcurl    | -       | HTTP downloads                     | System  |
-| spdlog     | v1.11.0 | Structured logging                 | Fetched |
-| llama.cpp  | b8611   | LLM inference engine               | Fetched |
-
-to validate formatting without modifying files.
-
-clang-tidy runs automatically on the biergarten-pipeline target when available. You can disable it at configure time:
-
-cmake -DENABLE_CLANG_TIDY=OFF ..
-
-You can also disable format helper targets:
-
-cmake -DENABLE_CLANG_FORMAT_TARGETS=OFF ..
--- a/pipeline/includes/biergarten_data_generator.h
+++ b/pipeline/includes/biergarten_data_generator.h
@@ -1,157 +0,0 @@
-#ifndef BIERGARTEN_PIPELINE_BIERGARTEN_DATA_GENERATOR_H_
-#define BIERGARTEN_PIPELINE_BIERGARTEN_DATA_GENERATOR_H_
-
-#include <memory>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "data_generation/data_generator.h"
-#include "database/database.h"
-#include "web_client/web_client.h"
-#include "wikipedia/wikipedia_service.h"
-
-/**
- * @brief Program options for the Biergarten pipeline application.
- */
-struct ApplicationOptions {
-   /// @brief Path to the LLM model file (gguf format); mutually exclusive with
-   /// use_mocked.
-   std::string model_path;
-
-   /// @brief Use mocked generator instead of LLM; mutually exclusive with
-   /// model_path.
-   bool use_mocked = false;
-
-   /// @brief Directory for cached JSON and database files.
-   std::string cache_dir;
-
-   /// @brief LLM sampling temperature (0.0 to 1.0, higher = more random).
-   float temperature = 0.8f;
-
-   /// @brief LLM nucleus sampling top-p parameter (0.0 to 1.0, higher = more
-   /// random).
-   float top_p = 0.92f;
-
-   /// @brief Context window size (tokens) for LLM inference. Higher values
-   /// support longer prompts but use more memory.
-   uint32_t n_ctx = 2048;
-
-   /// @brief Random seed for sampling (-1 for random, otherwise non-negative).
-   int seed = -1;
-
-   /// @brief Git commit hash for database consistency (always pinned to
-   /// c5eb7772).
-   std::string commit = "c5eb7772";
-};
-
-/**
- * @brief Main data generator class for the Biergarten pipeline.
- *
- * This class encapsulates the core logic for generating brewery data.
- * It handles database initialization, data loading/downloading, and brewery
- * generation.
- */
-class BiergartenDataGenerator {
-  public:
-   /**
-    * @brief Construct a BiergartenDataGenerator with injected dependencies.
-    *
-    * @param options Application configuration options.
-    * @param web_client HTTP client for downloading data.
-    * @param database SQLite database instance.
-    */
-   BiergartenDataGenerator(const ApplicationOptions& options,
-                           std::shared_ptr<WebClient> web_client,
-                           SqliteDatabase& database);
-
-   /**
-    * @brief Run the data generation pipeline.
-    *
-    * Performs the following steps:
-    * 1. Initialize database
-    * 2. Download geographic data if needed
-    * 3. Initialize the generator (LLM or Mock)
-    * 4. Generate brewery data for sample cities
-    *
-    * @return 0 on success, 1 on failure.
-    */
-   int Run();
-
-  private:
-   /// @brief Immutable application options.
-   const ApplicationOptions options_;
-
-   /// @brief Shared HTTP client dependency.
-   std::shared_ptr<WebClient> webClient_;
-
-   /// @brief Database dependency.
-   SqliteDatabase& database_;
-
-   /**
-    * @brief Enriched city data with Wikipedia context.
-    */
-   struct EnrichedCity {
-      int city_id;
-      std::string city_name;
-      std::string country_name;
-      std::string region_context;
-   };
-
-   /**
-    * @brief Initialize the data generator based on options.
-    *
-    * Creates either a MockGenerator (if no model path) or LlamaGenerator.
-    *
-    * @return A unique_ptr to the initialized generator.
-    */
-   std::unique_ptr<DataGenerator> InitializeGenerator();
-
-   /**
-    * @brief Download and load geographic data if not cached.
-    */
-   void LoadGeographicData();
-
-   /**
-    * @brief Query cities from database and build country name map.
-    *
-    * @return Vector of (City, country_name) pairs capped at 30 entries.
-    */
-   std::vector<std::pair<City, std::string>> QueryCitiesWithCountries();
-
-   /**
-    * @brief Enrich cities with Wikipedia summaries.
-    *
-    * @param cities Vector of (City, country_name) pairs.
-    * @return Vector of enriched city data with context.
-    */
-   std::vector<EnrichedCity> EnrichWithWikipedia(
-       const std::vector<std::pair<City, std::string>>& cities);
-
-   /**
-    * @brief Generate breweries for enriched cities.
-    *
-    * @param generator The data generator instance.
-    * @param cities Vector of enriched city data.
-    */
-   void GenerateBreweries(DataGenerator& generator,
-                          const std::vector<EnrichedCity>& cities);
-
-   /**
-    * @brief Log the generated brewery results.
-    */
-   void LogResults() const;
-
-   /**
-    * @brief Helper struct to store generated brewery data.
-    */
-   struct GeneratedBrewery {
-      int city_id;
-      std::string city_name;
-      BreweryResult brewery;
-   };
-
-   /// @brief Stores generated brewery data.
-   std::vector<GeneratedBrewery> generatedBreweries_;
-};
-#endif  // BIERGARTEN_PIPELINE_BIERGARTEN_DATA_GENERATOR_H_
--- a/pipeline/includes/data_downloader.h
+++ b/pipeline/includes/data_downloader.h
@@ -0,0 +1,111 @@
+/**
+ * @file data_downloader.h
+ * @brief Download geographic data from GitHub repositories using libcurl.
+ *
+ * Provides functionality to fetch JSON data from GitHub using libcurl, with
+ * support for commit-based versioning to ensure reproducible builds. Downloads
+ * are cached to avoid repeated network requests.
+ *
+ * Example usage:
+ * @code
+ *   DataDownloader downloader;
+ *   std::string jsonPath = downloader.DownloadCountriesDatabase(
+ *       "/tmp/countries-data.json",  // local cache path
+ *       "c5eb7772"                    // optional commit hash or HEAD
+ *   );
+ *   // Now use jsonPath with JsonLoader::LoadWorldCities(jsonPath, db)
+ * @endcode
+ */
+
+#ifndef DATA_DOWNLOADER_H
+#define DATA_DOWNLOADER_H
+
+#include <stdexcept>
+#include <string>
+
+/**
+ * @class DataDownloader
+ * @brief Manages downloading and caching of geographic data from GitHub.
+ *
+ * This class encapsulates libcurl networking operations for reproducible
+ * data fetching. All methods are synchronous and block the calling thread
+ * until they return.
+ *
+ * @note Requires libcurl to be available at runtime.
+ * @note GitHub raw content CDN is used for efficient downloads.
+ */
+class DataDownloader {
+public:
+  /**
+   * @brief Default constructor.
+   *
+   * Initializes the downloader without any specific state. The downloader
+   * is ready to use immediately.
+   */
+  DataDownloader();
+
+  /**
+   * @brief Destructor.
+   *
+   * Cleans up any resources. No explicit cleanup needed beyond destruction.
+   */
+  ~DataDownloader();
+
+  /**
+   * @brief Download the countries+states+cities JSON database from GitHub.
+   *
+   * Downloads the geographic data from the
+   * dr5hn/countries-states-cities-database repository. If the file already
+   * exists at cachePath, it is used directly without downloading again.
+   *
+   * The download URL format is:
+   * @verbatim
+   * https://raw.githubusercontent.com/dr5hn/countries-states-cities-database/
+   * {commit}/json/countries+states+cities.json
+   * @endverbatim
+   *
+   * @param cachePath   Local filesystem path where the JSON file should be
+   * stored. If the file already exists, download is skipped.
+   * @param commit      Git commit hash or branch name (default: "c5eb7772").
+   *                    Examples: "HEAD", "main", "c5eb7772", or the full
+   *                    40-character SHA-1 of a commit.
+   *
+   * @return            The file path where JSON was saved (same as cachePath).
+   *
+   * @throws std::runtime_error if:
+   *         - Network download fails
+   *         - File cannot be written to cachePath
+   *         - Commit hash is invalid (404 on GitHub)
+   *
+   * Example with default commit (stable v2026-03-28):
+   * @code
+   *   std::string path =
+   * downloader.DownloadCountriesDatabase("/tmp/data.json");
+   * @endcode
+   *
+   * Example with custom commit:
+   * @code
+   *   std::string path = downloader.DownloadCountriesDatabase(
+   *       "/tmp/data.json",
+   *       "main"  // Download latest from main branch
+   *   );
+   * @endcode
+   */
+  std::string DownloadCountriesDatabase(
+      const std::string &cachePath,
+      const std::string &commit = "c5eb7772" // Stable commit: 2026-03-28 export
+  );
+
+private:
+  /**
+   * @brief Check if a file already exists at the given path.
+   *
+   * Used internally to implement cache-hit logic. No download occurs if
+   * the file already exists.
+   *
+   * @param filePath  Path to check.
+   * @return          True if file exists and is readable, false otherwise.
+   */
+  bool FileExists(const std::string &filePath) const;
+};
+
+#endif // DATA_DOWNLOADER_H
--- a/pipeline/includes/data_generation/data_downloader.h
+++ b/pipeline/includes/data_generation/data_downloader.h
@@ -1,31 +0,0 @@
-#ifndef BIERGARTEN_PIPELINE_DATA_GENERATION_DATA_DOWNLOADER_H_
-#define BIERGARTEN_PIPELINE_DATA_GENERATION_DATA_DOWNLOADER_H_
-
-#include <memory>
-#include <stdexcept>
-#include <string>
-
-#include "web_client/web_client.h"
-
-/// @brief Downloads and caches source geography JSON payloads.
-class DataDownloader {
-  public:
-   /// @brief Initializes global curl state used by this downloader.
-   explicit DataDownloader(std::shared_ptr<WebClient> web_client);
-
-   /// @brief Cleans up global curl state.
-   ~DataDownloader();
-
-   /// @brief Returns a local JSON path, downloading it when cache is missing.
-   std::string DownloadCountriesDatabase(
-       const std::string& cache_path,
-       const std::string& commit =
-           "c5eb7772"  // Stable commit: 2026-03-28 export
-   );
-
-  private:
-   static bool FileExists(const std::string& file_path);
-   std::shared_ptr<WebClient> web_client_;
-};
-
-#endif  // BIERGARTEN_PIPELINE_DATA_GENERATION_DATA_DOWNLOADER_H_
--- a/pipeline/includes/data_generation/data_generator.h
+++ b/pipeline/includes/data_generation/data_generator.h
@@ -1,29 +0,0 @@
-#ifndef BIERGARTEN_PIPELINE_DATA_GENERATION_DATA_GENERATOR_H_
-#define BIERGARTEN_PIPELINE_DATA_GENERATION_DATA_GENERATOR_H_
-
-#include <string>
-
-struct BreweryResult {
-   std::string name;
-   std::string description;
-};
-
-struct UserResult {
-   std::string username;
-   std::string bio;
-};
-
-class DataGenerator {
-  public:
-   virtual ~DataGenerator() = default;
-
-   virtual void Load(const std::string& model_path) = 0;
-
-   virtual BreweryResult GenerateBrewery(const std::string& city_name,
-                                         const std::string& country_name,
-                                         const std::string& region_context) = 0;
-
-   virtual UserResult GenerateUser(const std::string& locale) = 0;
-};
-
-#endif  // BIERGARTEN_PIPELINE_DATA_GENERATION_DATA_GENERATOR_H_
--- a/pipeline/includes/data_generation/llama_generator.h
+++ b/pipeline/includes/data_generation/llama_generator.h
@@ -1,51 +0,0 @@
-#ifndef BIERGARTEN_PIPELINE_DATA_GENERATION_LLAMA_GENERATOR_H_
-#define BIERGARTEN_PIPELINE_DATA_GENERATION_LLAMA_GENERATOR_H_
-
-#include <cstdint>
-#include <string>
-
-#include "data_generation/data_generator.h"
-
-struct llama_model;
-struct llama_context;
-
-class LlamaGenerator final : public DataGenerator {
-  public:
-   LlamaGenerator() = default;
-   ~LlamaGenerator() override;
-
-   void SetSamplingOptions(float temperature, float top_p, int seed = -1);
-
-   void SetContextSize(uint32_t n_ctx);
-
-   void Load(const std::string& model_path) override;
-   BreweryResult GenerateBrewery(const std::string& city_name,
-                                 const std::string& country_name,
-                                 const std::string& region_context) override;
-   UserResult GenerateUser(const std::string& locale) override;
-
-  private:
-   std::string Infer(const std::string& prompt, int max_tokens = 10000);
-   // Overload that allows passing a system message separately so chat-capable
-   // models receive a proper system role instead of having the system text
-   // concatenated into the user prompt (helps avoid revealing internal
-   // reasoning or instructions in model output).
-   std::string Infer(const std::string& system_prompt,
-                     const std::string& prompt, int max_tokens = 10000);
-
-   std::string InferFormatted(const std::string& formatted_prompt,
-                              int max_tokens = 10000);
-
-   std::string LoadBrewerySystemPrompt(const std::string& prompt_file_path);
-   std::string GetFallbackBreweryPrompt();
-
-   llama_model* model_ = nullptr;
-   llama_context* context_ = nullptr;
-   float sampling_temperature_ = 0.8f;
-   float sampling_top_p_ = 0.92f;
-   uint32_t sampling_seed_ = 0xFFFFFFFFu;
-   uint32_t n_ctx_ = 8192;
-   std::string brewery_system_prompt_;
-};
-
-#endif  // BIERGARTEN_PIPELINE_DATA_GENERATION_LLAMA_GENERATOR_H_
--- a/pipeline/includes/data_generation/llama_generator_helpers.h
+++ b/pipeline/includes/data_generation/llama_generator_helpers.h
@@ -1,32 +0,0 @@
-#ifndef BIERGARTEN_PIPELINE_DATA_GENERATION_LLAMA_GENERATOR_HELPERS_H_
-#define BIERGARTEN_PIPELINE_DATA_GENERATION_LLAMA_GENERATOR_HELPERS_H_
-
-#include <string>
-#include <utility>
-
-struct llama_model;
-struct llama_vocab;
-typedef int llama_token;
-
-// Helper functions for LlamaGenerator methods
-std::string PrepareRegionContextPublic(std::string_view region_context,
-                                       std::size_t max_chars = 700);
-
-std::pair<std::string, std::string> ParseTwoLineResponsePublic(
-    const std::string& raw, const std::string& error_message);
-
-std::string ToChatPromptPublic(const llama_model* model,
-                               const std::string& user_prompt);
-
-std::string ToChatPromptPublic(const llama_model* model,
-                               const std::string& system_prompt,
-                               const std::string& user_prompt);
-
-void AppendTokenPiecePublic(const llama_vocab* vocab, llama_token token,
-                            std::string& output);
-
-std::string ValidateBreweryJsonPublic(const std::string& raw,
-                                      std::string& name_out,
-                                      std::string& description_out);
-
-#endif  // BIERGARTEN_PIPELINE_DATA_GENERATION_LLAMA_GENERATOR_HELPERS_H_
--- a/pipeline/includes/data_generation/mock_generator.h
+++ b/pipeline/includes/data_generation/mock_generator.h
@@ -1,28 +0,0 @@
-#ifndef BIERGARTEN_PIPELINE_DATA_GENERATION_MOCK_GENERATOR_H_
-#define BIERGARTEN_PIPELINE_DATA_GENERATION_MOCK_GENERATOR_H_
-
-#include <string>
-#include <vector>
-
-#include "data_generation/data_generator.h"
-
-class MockGenerator final : public DataGenerator {
-  public:
-   void Load(const std::string& model_path) override;
-   BreweryResult GenerateBrewery(const std::string& city_name,
-                                 const std::string& country_name,
-                                 const std::string& region_context) override;
-   UserResult GenerateUser(const std::string& locale) override;
-
-  private:
-   static std::size_t DeterministicHash(const std::string& a,
-                                        const std::string& b);
-
-   static const std::vector<std::string> kBreweryAdjectives;
-   static const std::vector<std::string> kBreweryNouns;
-   static const std::vector<std::string> kBreweryDescriptions;
-   static const std::vector<std::string> kUsernames;
-   static const std::vector<std::string> kBios;
-};
-
-#endif  // BIERGARTEN_PIPELINE_DATA_GENERATION_MOCK_GENERATOR_H_
--- a/pipeline/includes/database.h
+++ b/pipeline/includes/database.h
@@ -0,0 +1,102 @@
+#pragma once
+
+#include <mutex>
+#include <sqlite3.h>
+#include <string>
+#include <vector>
+
+/// @struct Country
+/// @brief Represents a country with geographic identifiers
+struct Country {
+  int id; ///< Unique country identifier
+  std::string name; ///< Country name
+  std::string iso2; ///< 2-letter ISO code (e.g., "US", "CA")
+  std::string iso3; ///< 3-letter ISO code (e.g., "USA", "CAN")
+};
+
+/// @struct State
+/// @brief Represents a state or province with geographic identifiers
+struct State {
+  int id; ///< Unique state identifier
+  std::string name; ///< State/province name
+  std::string iso2; ///< 2-letter state code (e.g., "CA", "ON")
+  int countryId; ///< Identifier of the parent country
+};
+
+/**
+ * @class SqliteDatabase
+ * @brief Thread-safe in-memory SQLite database wrapper for geographic data
+ *
+ * Manages a local in-memory SQLite database with countries, states, and cities.
+ * All write operations are serialized via mutex to enable safe concurrent
+ * access from multiple threads. Uses INSERT OR IGNORE for idempotent
+ * operations.
+ *
+ * Schema Relationships:
+ *   countries (id, name, iso2, iso3)
+ *      ↓ (one-to-many)
+ *   states (id, country_id, name, iso2)
+ *      ↓ (one-to-many)
+ *   cities (id, state_id, country_id, name, latitude, longitude)
+ */
+class SqliteDatabase {
+private:
+  sqlite3 *db = nullptr; ///< SQLite database connection handle
+  std::mutex dbMutex; ///< Protects all database operations from race conditions
+
+  /// @brief Creates the schema with three related tables and foreign keys
+  void InitializeSchema();
+
+public:
+  /// @brief Destructor: safely closes the database connection
+  ~SqliteDatabase();
+
+  /// @brief Opens an in-memory SQLite database and initializes the schema
+  void Initialize();
+
+  /// @brief Inserts a country record
+  /// @param id Unique country identifier
+  /// @param name Country name
+  /// @param iso2 2-letter ISO country code
+  /// @param iso3 3-letter ISO country code
+  /// @note Thread-safe: uses mutex lock. Idempotent: INSERT OR IGNORE prevents
+  /// duplicates
+  void InsertCountry(int id, const std::string &name, const std::string &iso2,
+                     const std::string &iso3);
+
+  /// @brief Inserts a state/province record
+  /// @param id Unique state identifier
+  /// @param countryId Foreign key reference to parent country
+  /// @param name State/province name
+  /// @param iso2 2-letter state code (e.g., "CA", "ON")
+  /// @note Thread-safe and idempotent via mutex and INSERT OR IGNORE
+  void InsertState(int id, int countryId, const std::string &name,
+                   const std::string &iso2);
+
+  /// @brief Inserts a city record with geographic coordinates
+  /// @param id Unique city identifier
+  /// @param stateId Foreign key reference to parent state
+  /// @param countryId Foreign key reference to parent country
+  /// @param name City name
+  /// @param latitude Geographic latitude coordinate (WGS84)
+  /// @param longitude Geographic longitude coordinate (WGS84)
+  /// @note Thread-safe and idempotent. Called by multithreaded JSON loader.
+  void InsertCity(int id, int stateId, int countryId, const std::string &name,
+                  double latitude, double longitude);
+
+  /// @brief Queries all cities from the database
+  /// @return Vector of (city_id, city_name) pairs sorted alphabetically
+  std::vector<std::pair<int, std::string>> QueryCities();
+
+  /// @brief Queries all countries from the database with ISO codes
+  /// @param limit Maximum number of records to return (0 = all)
+  /// @return Vector of Country structs (includes id, name, iso2, iso3) sorted
+  /// alphabetically
+  std::vector<Country> QueryCountries(int limit = 0);
+
+  /// @brief Queries all states from the database with ISO codes
+  /// @param limit Maximum number of records to return (0 = all)
+  /// @return Vector of State structs (includes id, name, iso2, countryId)
+  /// sorted alphabetically
+  std::vector<State> QueryStates(int limit = 0);
+};
--- a/pipeline/includes/database/database.h
+++ b/pipeline/includes/database/database.h
@@ -1,87 +0,0 @@
-#ifndef BIERGARTEN_PIPELINE_DATABASE_DATABASE_H_
-#define BIERGARTEN_PIPELINE_DATABASE_DATABASE_H_
-
-#include <sqlite3.h>
-
-#include <mutex>
-#include <string>
-#include <vector>
-
-struct Country {
-   /// @brief Country identifier from the source dataset.
-   int id;
-   /// @brief Country display name.
-   std::string name;
-   /// @brief ISO 3166-1 alpha-2 code.
-   std::string iso2;
-   /// @brief ISO 3166-1 alpha-3 code.
-   std::string iso3;
-};
-
-struct State {
-   /// @brief State or province identifier from the source dataset.
-   int id;
-   /// @brief State or province display name.
-   std::string name;
-   /// @brief State or province short code.
-   std::string iso2;
-   /// @brief Parent country identifier.
-   int country_id;
-};
-
-struct City {
-   /// @brief City identifier from the source dataset.
-   int id;
-   /// @brief City display name.
-   std::string name;
-   /// @brief Parent country identifier.
-   int country_id;
-};
-
-/// @brief Thread-safe SQLite wrapper for pipeline writes and readbacks.
-class SqliteDatabase {
-  private:
-   sqlite3* db_ = nullptr;
-   std::mutex db_mutex_;
-
-   void InitializeSchema();
-
-  public:
-   /// @brief Closes the SQLite connection if initialized.
-   ~SqliteDatabase();
-
-   /// @brief Opens the SQLite database at db_path and creates schema objects.
-   void Initialize(const std::string& db_path = ":memory:");
-
-   /// @brief Starts a database transaction for batched writes.
-   void BeginTransaction();
-
-   /// @brief Commits the active database transaction.
-   void CommitTransaction();
-
-   /// @brief Rolls back the active database transaction.
-   void RollbackTransaction();
-
-   /// @brief Inserts a country row.
-   void InsertCountry(int id, const std::string& name, const std::string& iso2,
-                      const std::string& iso3);
-
-   /// @brief Inserts a state row linked to a country.
-   void InsertState(int id, int country_id, const std::string& name,
-                    const std::string& iso2);
-
-   /// @brief Inserts a city row linked to state and country.
-   void InsertCity(int id, int state_id, int country_id,
-                   const std::string& name, double latitude, double longitude);
-
-   /// @brief Returns city records including parent country id.
-   std::vector<City> QueryCities();
-
-   /// @brief Returns countries with optional row limit.
-   std::vector<Country> QueryCountries(int limit = 0);
-
-   /// @brief Returns states with optional row limit.
-   std::vector<State> QueryStates(int limit = 0);
-};
-
-#endif  // BIERGARTEN_PIPELINE_DATABASE_DATABASE_H_
--- a/pipeline/includes/generator.h
+++ b/pipeline/includes/generator.h
@@ -0,0 +1,59 @@
+#pragma once
+
+#include <string>
+#include <vector>
+
+/**
+ * @class LlamaBreweryGenerator
+ * @brief Generates brewery names and descriptions for cities
+ *
+ * Currently provides a deterministic mock implementation that generates
+ * brewery names and descriptions based on city name hashing.
+ *
+ * Design note: intended as a drop-in placeholder for a real llama.cpp
+ * implementation later. The LoadModel() and GenerateBrewery() interface
+ * will remain the same once actual LM inference is integrated.
+ *
+ * Mock Implementation: Uses std::hash to map city names to brewery
+ * templates. std::hash is implementation-defined; output may vary by toolchain.
+ */
+class LlamaBreweryGenerator {
+private:
+  /// Adjectives for brewery names (e.g., "Craft", "Heritage", etc.)
+  const std::vector<std::string> breweryAdjectives = {
+      "Craft",   "Heritage", "Local",  "Artisan",
+      "Pioneer", "Golden",   "Modern", "Classic"};
+
+  /// Nouns for brewery names (e.g., "Brewing Co.", "Brewery", etc.)
+  const std::vector<std::string> breweryNouns = {
+      "Brewing Co.", "Brewery", "Bier Haus",  "Taproom",
+      "Works",       "House",   "Fermentery", "Ale Co."};
+
+  /// Pre-written brewery descriptions (currently hand-crafted)
+  const std::vector<std::string> descriptions = {
+      "Handcrafted pale ales and seasonal IPAs with local ingredients.",
+      "Traditional lagers and experimental sours in small batches.",
+      "Award-winning stouts and wildly hoppy blonde ales.",
+      "Craft brewery specializing in Belgian-style triples and dark porters.",
+      "Modern brewery blending tradition with bold experimental flavors."};
+
+public:
+  /// @struct Brewery
+  /// @brief Output structure for generated brewery data
+  struct Brewery {
+    std::string name; ///< Generated brewery name (e.g., "Craft Brewing Co.")
+    std::string description; ///< Short description of brewery style/offerings
+  };
+
+  /// @brief Loads a language model (currently mocked)
+  /// @param modelPath Path to GGUF model file (not used in mock)
+  /// @note In real implementation, loads llama.cpp model into memory
+  void LoadModel(const std::string &modelPath);
+
+  /// @brief Generates a brewery name and description for a city
+  /// @param cityName City name to generate brewery for
+  /// @param seed Integer seed (used for deterministic output in mock)
+  /// @return Brewery struct with name and description
+  /// @note Deterministic: same cityName+seed always produces same brewery
+  Brewery GenerateBrewery(const std::string &cityName, int seed);
+};
--- a/pipeline/includes/json_handling/json_loader.h
+++ b/pipeline/includes/json_handling/json_loader.h
@@ -1,17 +0,0 @@
-#ifndef BIERGARTEN_PIPELINE_JSON_HANDLING_JSON_LOADER_H_
-#define BIERGARTEN_PIPELINE_JSON_HANDLING_JSON_LOADER_H_
-
-#include <string>
-
-#include "database/database.h"
-#include "json_handling/stream_parser.h"
-
-/// @brief Loads world-city JSON data into SQLite through streaming parsing.
-class JsonLoader {
-  public:
-   /// @brief Parses a JSON file and writes country/state/city rows into db.
-   static void LoadWorldCities(const std::string& json_path,
-                               SqliteDatabase& db);
-};
-
-#endif  // BIERGARTEN_PIPELINE_JSON_HANDLING_JSON_LOADER_H_
--- a/pipeline/includes/json_handling/stream_parser.h
+++ b/pipeline/includes/json_handling/stream_parser.h
@@ -1,52 +0,0 @@
-#ifndef BIERGARTEN_PIPELINE_JSON_HANDLING_STREAM_PARSER_H_
-#define BIERGARTEN_PIPELINE_JSON_HANDLING_STREAM_PARSER_H_
-
-#include <functional>
-#include <string>
-
-#include "database/database.h"
-
-// Forward declaration to avoid circular dependency
-class SqliteDatabase;
-
-/// @brief In-memory representation of one parsed city entry.
-struct CityRecord {
-   int id;
-   int state_id;
-   int country_id;
-   std::string name;
-   double latitude;
-   double longitude;
-};
-
-/// @brief Streaming SAX parser that emits city records during traversal.
-class StreamingJsonParser {
-  public:
-   /// @brief Parses file_path and invokes callbacks for city rows and progress.
-   static void Parse(const std::string& file_path, SqliteDatabase& db,
-                     std::function<void(const CityRecord&)> on_city,
-                     std::function<void(size_t, size_t)> on_progress = nullptr);
-
-  private:
-   /// @brief Mutable SAX handler state while traversing nested JSON arrays.
-   struct ParseState {
-      int current_country_id = 0;
-      int current_state_id = 0;
-
-      CityRecord current_city = {};
-      bool building_city = false;
-      std::string current_key;
-
-      int array_depth = 0;
-      int object_depth = 0;
-      bool in_countries_array = false;
-      bool in_states_array = false;
-      bool in_cities_array = false;
-
-      std::function<void(const CityRecord&)> on_city;
-      std::function<void(size_t, size_t)> on_progress;
-      size_t bytes_processed = 0;
-   };
-};
-
-#endif  // BIERGARTEN_PIPELINE_JSON_HANDLING_STREAM_PARSER_H_
--- a/pipeline/includes/json_loader.h
+++ b/pipeline/includes/json_loader.h
@@ -0,0 +1,85 @@
+#pragma once
+
+#include "database.h"
+#include <nlohmann/json.hpp>
+#include <string>
+
+using json = nlohmann::json;
+
+/**
+ * @class JsonLoader
+ * @brief Loads world geographic data from JSON file into SQLite database
+ *
+ * Handles parsing and population of world cities, states, and countries from
+ * a structured JSON source file. The loader uses parallel threads to chunk
+ * the city records and maximize database insertion throughput.
+ *
+ * Input Format (JSON Structure):
+ * @code
+ * {
+ *   "countries": [
+ *     {"id": 1, "name": "Canada", "iso2": "CA", "iso3": "CAN"},
+ *     ...
+ *   ],
+ *   "states": [
+ *     {"id": 1, "country_id": 1, "name": "Ontario", "iso2": "ON"},
+ *     ...
+ *   ],
+ *   "cities": [
+ *     {"id": 1, "state_id": 1, "country_id": 1, "name": "Toronto",
+ *      "latitude": 43.6532, "longitude": -79.3832},
+ *     ...
+ *   ]
+ * }
+ * @endcode
+ *
+ * Performance Characteristics:
+ * - Reads entire JSON file into memory (nlohmann/json parser)
+ * - Iterates through countries: typically 200+ records
+ * - Iterates through states: typically 3000+ records
+ * - Iterates through cities: typically 50,000+ records (MAJOR DATASET)
+ * - Uses multithreading to chunk city insertion across threads
+ * - Thread pool size defaults to number of CPU cores
+ *
+ * Multithreading Strategy:
+ * - Divides cities into N chunks (N = CPU core count)
+ * - Each thread processes one chunk sequentially
+ * - Database has mutex protection for thread-safe concurrent access
+ * - Allows safe parallel writing to same SQLite database
+ *
+ * Example Usage:
+ * @code
+ *   SqliteDatabase db;
+ *   db.Initialize();
+ *   JsonLoader::LoadWorldCities("../data/world_city_data.json", db);
+ *   // Database now contains all countries, states, and cities
+ * @endcode
+ */
+class JsonLoader {
+public:
+  /// @brief Loads world geographic data from JSON and populates database
+  ///
+  /// Process:
+  /// 1. Reads and parses entire JSON file
+  /// 2. Inserts all countries into database (typically 200-250 records)
+  /// 3. Inserts all states/provinces (typically 3000+ records)
+  /// 4. Spawns worker threads to insert cities (typically 50,000+ records)
+  /// 5. Waits for all threads to complete
+  /// 6. Prints statistics about loaded data
+  ///
+  /// @param jsonPath Filesystem path to world_city_data.json
+  /// @param db Reference to initialized SqliteDatabase to populate
+  ///
+  /// @throws std::runtime_error if JSON file cannot be read or parsed
+  /// @throws std::runtime_error if database insertion fails
+  ///
+  /// Output Examples:
+  /// @code
+  ///   Loading JSON: ../data/world_city_data.json
+  ///   Loaded countries: 250
+  ///   Loaded states: 3500
+  ///   Loaded cities: 52000
+  ///   ✓ World city data loaded successfully
+  /// @endcode
+  static void LoadWorldCities(const std::string &jsonPath, SqliteDatabase &db);
+};
--- a/pipeline/includes/web_client/curl_web_client.h
+++ b/pipeline/includes/web_client/curl_web_client.h
@@ -1,30 +0,0 @@
-#ifndef BIERGARTEN_PIPELINE_WEB_CLIENT_CURL_WEB_CLIENT_H_
-#define BIERGARTEN_PIPELINE_WEB_CLIENT_CURL_WEB_CLIENT_H_
-
-#include <memory>
-
-#include "web_client/web_client.h"
-
-// RAII for curl_global_init/cleanup.
-// An instance of this class should be created in main() before any curl
-// operations and exist for the lifetime of the application.
-class CurlGlobalState {
-  public:
-   CurlGlobalState();
-   ~CurlGlobalState();
-   CurlGlobalState(const CurlGlobalState&) = delete;
-   CurlGlobalState& operator=(const CurlGlobalState&) = delete;
-};
-
-class CURLWebClient : public WebClient {
-  public:
-   CURLWebClient();
-   ~CURLWebClient() override;
-
-   void DownloadToFile(const std::string& url,
-                       const std::string& file_path) override;
-   std::string Get(const std::string& url) override;
-   std::string UrlEncode(const std::string& value) override;
-};
-
-#endif  // BIERGARTEN_PIPELINE_WEB_CLIENT_CURL_WEB_CLIENT_H_
--- a/pipeline/includes/web_client/web_client.h
+++ b/pipeline/includes/web_client/web_client.h
@@ -1,22 +0,0 @@
-#ifndef BIERGARTEN_PIPELINE_WEB_CLIENT_WEB_CLIENT_H_
-#define BIERGARTEN_PIPELINE_WEB_CLIENT_WEB_CLIENT_H_
-
-#include <string>
-
-class WebClient {
-  public:
-   virtual ~WebClient() = default;
-
-   // Downloads content from a URL to a file. Throws on error.
-   virtual void DownloadToFile(const std::string& url,
-                               const std::string& file_path) = 0;
-
-   // Performs a GET request and returns the response body as a string. Throws
-   // on error.
-   virtual std::string Get(const std::string& url) = 0;
-
-   // URL-encodes a string.
-   virtual std::string UrlEncode(const std::string& value) = 0;
-};
-
-#endif  // BIERGARTEN_PIPELINE_WEB_CLIENT_WEB_CLIENT_H_
--- a/pipeline/includes/wikipedia/wikipedia_service.h
+++ b/pipeline/includes/wikipedia/wikipedia_service.h
@@ -1,27 +0,0 @@
-#ifndef BIERGARTEN_PIPELINE_WIKIPEDIA_WIKIPEDIA_SERVICE_H_
-#define BIERGARTEN_PIPELINE_WIKIPEDIA_WIKIPEDIA_SERVICE_H_
-
-#include <memory>
-#include <string>
-#include <string_view>
-#include <unordered_map>
-
-#include "web_client/web_client.h"
-
-/// @brief Provides cached Wikipedia summary lookups for city and country pairs.
-class WikipediaService {
-  public:
-   /// @brief Creates a new Wikipedia service with the provided web client.
-   explicit WikipediaService(std::shared_ptr<WebClient> client);
-
-   /// @brief Returns the Wikipedia summary extract for city and country.
-   [[nodiscard]] std::string GetSummary(std::string_view city,
-                                        std::string_view country);
-
-  private:
-   std::string FetchExtract(std::string_view query);
-   std::shared_ptr<WebClient> client_;
-   std::unordered_map<std::string, std::string> cache_;
-};
-
-#endif  // BIERGARTEN_PIPELINE_WIKIPEDIA_WIKIPEDIA_SERVICE_H_
--- a/pipeline/prompts/brewery_system_prompt.txt
+++ b/pipeline/prompts/brewery_system_prompt.txt
@@ -1,425 +0,0 @@
-================================================================================
-BREWERY DATA GENERATION - COMPREHENSIVE SYSTEM PROMPT
-================================================================================
-
-ROLE AND OBJECTIVE
-You are an experienced brewmaster and owner of a local craft brewery. Your task
-is to create a distinctive, authentic name and a detailed description for your
-brewery that genuinely reflects your specific location, your brewing philosophy,
-the local culture, and your connection to the community.
-
-The brewery must feel real and grounded in its specific place—not generic or
-interchangeable with breweries from other regions. Every detail should build
-authenticity and distinctiveness.
-
-================================================================================
-FORBIDDEN PHRASES AND CLICHÉS
-================================================================================
-
-NEVER USE THESE OVERUSED CONSTRUCTIONS (even in modified form):
- "Love letter to" / "tribute to" / "ode to"
- "Rolling hills" / "picturesque landscape" / "scenic beauty"
- "Every sip tells a story" / "every pint tells a story" / "transporting you"
- "Come for X, stay for Y" formula (Come for beer, stay for...)
- "Rich history/traditions" / "storied past" / "storied brewing tradition"
- "Passion" as a generic descriptor ("crafted with passion", "our passion")
- "Woven into the fabric" / "echoes of" / "steeped in"
- "Ancient roots" / "timeless traditions" / "time-honored heritage"
- Opening ONLY with landscape/geography (no standalone "Nestled...", "Where...")
- "Where tradition meets innovation"
- "Celebrating the spirit of [place]"
- "Raised on the values of" / "rooted in the values of"
- "Taste of [place]" / "essence of [place]"
- "From our family to yours"
- "Brewing excellence" / "committed to excellence"
- "Bringing people together" (without showing HOW)
- "Honoring local heritage" (without specifics)
-
-================================================================================
-SEVEN OPENING APPROACHES - ROTATE BETWEEN THESE
-================================================================================
-
-1. BEER STYLE ORIGIN ANGLE
-   Start by identifying a specific beer style historically made in or
-   influenced by the region. Explain why THIS place inspired that style.
-   Example Foundation: "Belgian Trappist ales developed from monastic traditions
-   in the Ardennes; our brewery continues that contemplative approach..."
-
-2. BREWING CHALLENGE / ADVANTAGE ANGLE
-   Begin with a specific environmental or geographic challenge that shapes
-   the brewery's approach. Water hardness, altitude, climate, ingredient scarcity.
-   Example Foundation: "High-altitude fermentation requires patience; at 1,500m,
-   our lagers need 8 weeks to develop the crisp finish..."
-
-3. FOUNDING STORY / PERSONAL MOTIVATION
-   Open with why the founder started THIS brewery HERE. Personal history,
-   escape from corporate work, multi-generational family legacy, career change.
-   Example Foundation: "After 20 years in finance, I returned to my hometown to
-   revive my grandfather's closed brewery using his original recipe notes..."
-
-4. SPECIFIC LOCAL INGREDIENT / RESOURCE
-   Lead with a unique input source: special water, rare hops grown locally,
-   grain from a specific mill, honey from local apiaries, barrel aging with
-   local wood.
-   Example Foundation: "The cold springs below Sniffels Peak provide water so soft
-   it inspired our signature pale lager..."
-
-5. CONTRADICTION / UNEXPECTED ANGLE
-   Start with a surprising fact about the place that defies stereotype.
-   Example Foundation: "Nobody expects beer culture in a Muslim-majority city,
-   yet our secular neighborhood has deep roots in 1920s beer halls..."
-
-6. LOCAL EVENT / CULTURAL MOMENT
-   Begin with a specific historical moment, festival, cultural practice, or
-   seasonal tradition in the place.
-   Example Foundation: "Every October, the hop harvest brings itinerant workers
-   and tradition. Our brewery grew from a harvest celebration in 2008..."
-
-7. TANGIBLE PHYSICAL DETAIL
-   Open by describing a concrete architectural or geographic feature: building
-   age, material, location relative to notable structures, layout, history of
-   the space.
-   Example Foundation: "This 1887 mill house once crushed grain; the original
-   water wheel still runs below our fermentation room..."
-
-================================================================================
-SPECIFICITY AND CONCRETENESS REQUIREMENTS
-================================================================================
-
-DO NOT GENERALIZE. Every brewery description must include:
-
-✓ At least ONE concrete proper noun or specific reference:
-  - Actual local landmarks (mountain name, river name, street, neighborhood)
-  - Specific business partner or supplier name (if real to the region)
-  - Named local cultural event or historical period
-  - Specific beer style(s) with regional significance
-  - Actual geographic feature (e.g., "the volcanic ash in our soil")
-
-✓ Mention specific beer styles relevant to the region's culture:
-  - German Bavaria: Dunkelweizen, Märzen, Kellerbier, Helles
-  - Belgian/Flemish: Lambic, Trappist, Strong Dark Ale
-  - British Isles: Brown Ale, Real Ale, Bitter, Cask Ale
-  - Czech: Pilsner, Bohemian Lager
-  - IPA/Hoppy: American regions, UK (origin)
-  - New Zealand/Australia: Hop-forward, experimental
-  - Japanese: Clean lagers, sake influence
-  - Mexican: Lager-centric, sometimes citrus
-
-✓ Name concrete brewing challenges or advantages:
-  Examples: water minerality, altitude, temperature swings, grain varieties,
-  humidity, wild yeasts in the region, traditional equipment preserved in place
-
-✓ Use sensory language SPECIFIC to the place:
-  NOT: "beautiful views"  →  "the copper beech trees turn rust-colored by
-                              September"
-  NOT: "charming"        →  "the original 1924 mosaic tile floor still patterns
-                              the taproom"
-  NOT: "authentic"       →  "the water chiller uses the original 1950s ammonia system"
-
-✓ Avoid describing multiple regions with the same adjectives:
-  Don't say every brewery is "cozy" or "vibrant" or "historic"—be specific
-  about WHAT makes this one different from others in different regions.
-
-================================================================================
-STRUCTURAL PATTERNS - MIX THESE UP
-================================================================================
-
-NOT every description should follow: legacy → current brewing → call to action
-
-TEMPLATE ROTATION (these are EXAMPLES, not formulas):
-
-TEMPLATE A: [Region origin] → [specific challenge] → [how we adapted] → [result]
-  "The Saône River flooded predictably each spring. Medieval brewers learned
-  to schedule production around it. We use the same seasonal rhythm..."
-
-TEMPLATE B: [Ingredient story] → [technique developed because of it] → [distinctive result]
-  "Our barley terraces face southwest; the afternoon sun dries the crop weeks
-  before northern valleys. This inspired our crisp, mineral-forward pale ale..."
-
-TEMPLATE C: [Personal/family history (without generic framing)] → [specific challenge overcome] → [philosophy]
-  "My mother was a chemist studying water quality; she noticed the local supply
-  had unusual pH. Rather than fight it, we formulated our entire range around
-  it. The sulfate content sharpens our bitters..."
-
-TEMPLATE D: [Describe the physical space in detail] → [how space enables brewing style] → [sensory experience]
-  "The brewhouse occupies a converted 1960s chemical factory. The stainless steel
-  vats still bear faded original markings. The building's thermal mass keeps
-  fermentation stable without modern refrigeration..."
-
-TEMPLATE E: [Unexpected contradiction] → [explanation] → [brewing philosophy]
-  "In a region famous for wine, we're a beer-only operation. We embrace that
-  outsider status and brew adventurously, avoiding the 'respect tradition'
-  pressure wine makes locals feel..."
-
-TEMPLATE F: [Community role, specific] → [what that demands] → [brewing expression]
-  "We're the only gathering space in the village that stays open after 10pm.
-  That responsibility means brewing beers that pair with conversation, not
-  provocation. Sessionable, food-friendly, endlessly drinkable..."
-
-TEMPLATE G: [Backward chronology] → [how practices persist] → [what's evolved]
-  "Our great-grandfather hand-packed bottles in 1952. We still own his bench.
-  Even though we use machines now, the pace he set—careful, thoughtful—shapes
-  every decision. Nothing about us is fast..."
-
-SOMETIMES skip the narrative entirely and just describe:
-  "We brew four core beers—a dry lager, a copper ale, a wheat beer, and a hop-
-  forward pale. The range itself tells our story: accessible, varied,
-  unpretentious. No flagship. No hero beer. Balance."
-
-================================================================================
-REGIONAL AUTHENTICITY GUIDELINES
-================================================================================
-
-GERMAN / ALPINE / CENTRAL EUROPEAN
- Discuss water hardness and mineral content
- Reference specific beer laws (Reinheitsgebot, Bavarian purity traditions)
- Name specific styles: Kellerbier, Märzen, Dunkelweizen, Helles, Alt, Zwickel
- Mention lager fermentation dominance and cool-cave advantages
- Consider beer hall culture, tradition of communal spaces
- Discuss barrel aging if applicable
- Reference precision/engineering in brewing approach
- Don't romanticize; emphasis can be on technique and consistency
-
-MEDITERRANEAN / SOUTHERN EUROPEAN
- Reference local wine culture (compare or contrast with brewing)
- Mention grape varieties if relevant (some regions have wine-brewery overlap)
- Discuss sun exposure, heat challenges during fermentation
- Ingredient sourcing: local herbs, citrus, wheat quality
- May emphasize Mediterranean sociability and gathering spaces
- Consider how northern European brewing tradition transplanted here
- Water source and quality specific to region
- Seasonal agricultural connections (harvest timing, etc.)
-
-ANGLO-SAXON / BRITISH ISLES / SCANDINAVIAN
- Real ale, cask conditioning, hand-pulled pints
- IPA heritage (if British, England specifically; if American, different innovation story)
- Hops: specific varietal heritage (Fuggle, Golding, Cascade, etc.)
- Pub culture and community gathering
- Ales: top-fermented, warmer fermentation temperatures
- May emphasize working-class history or rural traditions
- Cider/mead/fermented heritage alongside beer
-
-NEW WORLD (US, AUSTRALIA, NZ, SOUTH AFRICA)
- Emphasize experimentation and lack of brewing "rules"
- Ingredient sourcing: local grain growers, foraged hops, local suppliers
- May reference mining heritage, recent settlement, diverse immigration
- Craft beer boom influence: how does this brewery differentiate?
- Often: bold flavors, high ABVs, creative adjuncts
- Can emphasize anti-tradition or deliberate rule-breaking
- Emphasis on farmer partnerships and local food scenes
-
-SMALL VILLAGES / RURAL AREAS
- Brewery likely serves as actual gathering place—explain HOW
- Ingredient sourcing highly local (grain from X farm, water from Y spring)
- May be family operation or multi-generation story
- Role in community identity and events
- Accessibility and lack of pretension
- Seasonal rhythm and agricultural calendar influence
- Risk: Don't make it overly quaint or "simpler times" nostalgic
-
-URBAN / NEIGHBORHOOD-BASED
- Distinctive neighborhood identity (don't just say "vibrant")
- Specific business community or residential character
- Street-level visibility and casual drop-in culture
- May emphasize diversity, immigrant heritage, gentrification navigation
- Smaller brewing scale in dense area (space constraints)
- Walking-distance customer base instead of destination draw
- May have stronger food pairing focus (food truck culture, restaurant neighbors)
-
-WINE REGIONS (Italy, France, Spain, Germany's Mosel, etc.)
- Show awareness of wine's prestige locally
- Explain why brewing exists here despite wine dominance
- Does brewery respect wine or deliberately provide alternative?
- Ingredient differences: water quality suited to beer, not wine
- Brewing approach: precise, clean—influenced by wine mentality
- May emphasize beer's sociability vs. wine's formality
- Historical context: beer predates or coexists with wine tradition
-
-BEER-HERITAGE HOTSPOTS (Belgium, Germany, UK, Czech Republic)
- Don't ignore the weight of history; acknowledge it explicitly
- Do you innovate within tradition or break from it? Say which.
- Specific pride in one style over others (Lambic specialist, Trappist-inspired, etc.)
- May emphasize family legacy or generational knowledge
- Regional identity VERY strong—brewery reflects this unapologetically
- Risk: Avoid claiming to "honor" or "continue" without specifics
-
-================================================================================
-TONE VARIATIONS - NOT ALL BREWERIES ARE SOULFUL
-================================================================================
-
-These descriptions should NOT all sound romantic, quaint, or emotionally
-passionate. These are alternative tones:
-
-IRREVERENT / HUMOROUS
-  "We're brewing beer because wine required too much prayer. Less spirituality,
-   more hops. Our ales are big, unpolished, and perfect after a day's work."
-
-MATTER-OF-FACT / ENGINEERING-FOCUSED
-  "Brewing is chemistry. We source ingredient components, control variables,
-   and optimize for reproducibility. If that sounds clinical, good—consistency
-   is our craft."
-
-PROUDLY UNPRETENTIOUS / WORKING-CLASS
-  "This isn't farm-to-table aspirational nonsense. It's a neighborhood beer.
-   $4 pints. No reservations. No sipping notes. Tastes good, fills the glass,
-   keeps you coming back."
-
-MINIMALIST / DIRECT
-  "We brew three beers. They're good. Come drink one."
-
-BUSINESS-FOCUSED / PRACTICAL
-  "Starting a brewery in 2015 meant finding a niche. We're the only nano-
-   brewery serving the airport district. Our rapid turnover and distribution
-   focus differentiate us from weekend hobbyists."
-
-CONFRONTATIONAL / REBELLIOUS
-  "Craft beer got boring. Expensive IPAs and flavor-chasing. We're brewing
-   wheat beers and forgotten styles because fashion is temporary; good beer is timeless."
-
-MIX these tones across your descriptions. Some breweries should sound romantic
-and place-proud. Others should sound irreverent or practical.
-
-================================================================================
-NARRATIVE CLICHÉS TO ABSOLUTELY AVOID
-================================================================================
-
-1. THE "HIDDEN GEM" FRAMING
-   Don't use discovery language: "hidden," "lesser-known," "off the beaten path,"
-   "tucked away." Implies marketing speak, not authenticity.
-
-2. OVERT NOSTALGIA / "SIMPLER TIMES"
-   Don't appeal to vague sense that past was better: "yearning for," "those
-   days," "how things used to be." Lazy and off-putting.
-
-3. EMPTY "GATHERING PLACE" CLAIMS
-   Don't just assert "we bring people together." Show HOW: local workers' lunch
-   spot? Trivia night tradition? Live music venue? Political meeting ground?
-
-4. "SPECIAL" WITHOUT EVIDENCE
-   Don't declare location is "special" or "unique." SHOW what makes it distinct
-   through specific details, not assertion.
-
-5. "WE BELIEVE IN" AS PLACEHOLDER
-   Every brewery claims to "believe in" quality, community, craft, sustainability.
-   These are empty. What specific belief drives THIS brewery's choices?
-
-6. "ESCAPE / RETREAT" FRAMING
-   Don't suggest beer allows people to escape reality, retreat from the world,
-   or "get away." Implies you don't trust the place itself to be compelling.
-
-7. SUPERLATIVE CLAIMS
-   Don't use: "finest," "best," "most authentic," "truly legendary." Let details
-   prove these implied claims instead.
-
-8. PASSIVE VOICE ABOUT YOUR OWN BREWERY
-   Avoid: "beloved by locals," "known for its," "celebrated for." Active voice:
-   what does the brewery actively DO?
-
-================================================================================
-LENGTH AND CONTENT REQUIREMENTS
-================================================================================
-
-TARGET LENGTH: 120-180 words
- Long enough to establish place and brewing philosophy
- Short enough to avoid meandering or repetition
- Specific enough that brewery feels real and unreplicable
-
-REQUIRED ELEMENTS (at least ONE each):
-✓ Concrete location reference (proper noun, landmark, geographic feature)
-✓ One specific brewing detail (challenge, advantage, technique, ingredient)
-✓ Sensory language specific to the place (NOT generic adjectives)
-✓ Distinct tone/voice (descriptions should not all share the same quiet reverence)
-
-OPTIONAL ELEMENTS:
- Name 1-2 specific beer styles or beer names
- Personal/family story (if it illuminates why brewery exists here)
- Ingredient sourcing or supply chain detail
- Community role (with evidence, not assertion)
- Regional historical context (brief, specific)
-
-WORD ECONOMY:
- Don't waste words on "we believe in quality" or "committed to excellence"
- Don't use filler adjectives: "authentic," "genuine," "real," "true," "local"
-  (these should be IMPLIED by specific details)
- Every sentence should add information, flavor, or distinctive detail
-
-================================================================================
-SENSORY LANGUAGE GUIDELINES
-================================================================================
-
-AVOID THESE GENERIC SENSORY WORDS (they're lazy placeholders):
- "Beautiful," "picturesque," "gorgeous," "stunning"
- "Warm," "cozy," "inviting" (without context)
- "Vibrant," "lively," "energetic" (without examples)
- "Charming," "quaint," "rustic" (without specifics)
-
-USE INSTEAD: Specific, concrete sensory details
- Colors: "copper beech," "rust-stained brick," "frost-blue shutters"
- Textures: "the grain of wooden barrel hoops," "hand-smoothed stone," "grime-darkened windows"
- Sounds: "the hiss of the hand-pump," "coin-drop in the old register," "church bells on Sunday"
- Smells: "yeast-heavy floor," "wet limestone," "Hallertau hop resin"
- Tastes: (in the beer) "mineral-sharp," "sulfate clarity," "heather honey notes"
-
-EXAMPLE SENSORY COMPARISON:
-AVOID: "Our brewery captures the essence of the region's rustic charm."
-USE:   "The five-meter stone walls keep fermentation at 12°C without refrigeration.
-        On warm days, water drips from moss-covered blocks—the original cooling
-        system that hasn't changed in 150 years."
-
-================================================================================
-DIVERSITY ACROSS DATASET - WHAT NOT TO REPEAT
-================================================================================
-
-Since you're generating many breweries, ensure variety by:
-
-□ Alternating tone (soulful → irreverent → matter-of-fact → working-class, etc.)
-□ Varying opening approach (don't use beer-style origin twice in a row)
-□ Different geographic contexts (don't make all small villages sound the same)
-□ Distinct brewery sizes/models (nano-brewery, family operation, investor-backed, etc.)
-□ Various types of "draw" (neighborhood destination vs. local-only vs. tourist
-  attraction vs. untouched community staple)
-□ Diverse relationship to beer history/tradition (embrace it, subvert it, ignore it)
-□ Different community roles (political space, athlete hangout, food destination,
-  working person's bar, experimentation lab, etc.)
-
-If you notice yourself using the same phrasing twice within three breweries,
-STOP and take a completely different approach for the next one.
-
-================================================================================
-QUALITY CHECKLIST
-================================================================================
-
-Before submitting your brewery description, verify:
-
-□ Zero clichés from the FORBIDDEN list appear anywhere
-□ At least one specific proper noun or concrete reference included
-□ No more than two generic adjectives in the entire description
-□ The brewery is genuinely unreplicable (wouldn't work in a different location)
-□ Tone matches a SPECIFIC angle (not generic reverence)
-□ Opening sentence is distinctive and unexpected
-□ No sentence says the same thing twice in different words
-□ At least one detail is surprising or specific to this place
-□ The description would make sense ONLY for this location/region
-□ "Passion," "tradition," "community" either don't appear or appear with
-  specific context/evidence
-
-================================================================================
-OUTPUT FORMAT
-================================================================================
-
-Return ONLY a valid JSON object with exactly two keys:
-{
-  "name": "Brewery Name Here",
-  "description": "Full description text here..."
-}
-
-Requirements:
- name: 2-5 words, distinctive, memorable
- description: 120-180 words, follows all guidelines above
- Valid JSON (escaped quotes, no line breaks in strings)
- No markdown, no backticks, no code formatting
- No preamble before the JSON
- No trailing text after the JSON
- No explanations or commentary
-
-================================================================================
--- a/pipeline/prompts/brewery_system_prompt_expanded.txt
+++ b/pipeline/prompts/brewery_system_prompt_expanded.txt
@@ -1,169 +0,0 @@
-================================================================================
-BREWERY DATA GENERATION SYSTEM PROMPT
-================================================================================
-
-ROLE AND OBJECTIVE
-You are an experienced brewmaster creating authentic brewery descriptions that
-feel real and grounded in specific places. Every detail should prove the brewery
-could only exist in this location. Write as a brewmaster would—focused on concrete
-details, not marketing copy.
-
-================================================================================
-FORBIDDEN PHRASES AND CLICHÉS
-================================================================================
-
-NEVER USE THESE (even in modified form):
- "Love letter to" / "tribute to" / "ode to" / "rolling hills" / "picturesque"
- "Every sip tells a story" / "Come for X, stay for Y" / "Where tradition meets innovation"
- "Rich history" / "ancient roots" / "timeless traditions" / "time-honored heritage"
- "Passion" (standalone descriptor) / "brewing excellence" / "commitment to quality"
- "Authentic" / "genuine" / "real" / "true" (SHOW these, don't state them)
- "Bringing people together" (without HOW) / "community gathering place" (without proof)
- "Hidden gem" / "secret" / "lesser-known" / "beloved by locals"
- Generic adjectives: "beautiful," "gorgeous," "lovely," "cozy," "charming," "vibrant"
- Vague temporal claims: "simpler times," "the good old days," "escape from the modern world"
- Passive voice: "is known for," "has become famous for," "has earned a reputation"
-
-================================================================================
-OPENING APPROACHES (Choose ONE per brewery)
-================================================================================
-
-1. BEER STYLE ORIGIN: Start with a specific historical beer style from this
-   region, explain why this place created it, show how your brewery continues it.
-   Key: Name specific style → why this region made it → how you continue it
-
-2. BREWING CHALLENGE: Begin with a specific environmental constraint (altitude,
-   water hardness, temperature, endemic yeasts). Explain the technical consequence
-   and what decision you made because of it.
-   Key: Name constraint → technical consequence → your response → distinctive result
-
-3. FOUNDING STORY: Why did the founder return/move HERE? What did they discover?
-   What specific brewing decision followed? Include a concrete artifact (logs, equipment).
-   Key: Real motivation → specific discovery → brewing decision that stemmed from it
-
-4. LOCAL INGREDIENT: What unique resource defines your brewery? Why is it unique?
-   What brewing constraint or opportunity does it create?
-   Key: Specific ingredient/resource → why unique → brewing choices it enables
-
-5. CONTRADICTION: What is the region famous for? Why does your brewery do the
-   opposite? Make the contradiction a strength, not an apology.
-   Key: Regional identity → why you diverge → what you do instead → why it works
-
-6. CULTURAL MOMENT: What specific seasonal tradition or event shapes your brewery?
-   How do you connect to it? What brewing decisions follow?
-   Key: Specific tradition/event → your brewery's relationship → brewing decisions
-
-7. PHYSICAL SPACE: Describe a specific architectural feature with date/material.
-   How does it create technical advantage? What sensory details matter? Why keep
-   constraints instead of modernizing?
-   Key: Specific feature → technical consequence → sensory details → why you keep it
-
-================================================================================
-SPECIFICITY REQUIREMENTS
-================================================================================
-
-Every brewery description MUST include all three categories below (minimums noted per category):
-
-1. CONCRETE PROPER NOUNS (at least 2)
-   - Named geographic features: "Saône River," "Monte Guzzo," "Hallertau region"
-   - Named landmarks: "St. Augustine Cathedral," "the old train station," "Harbor Point"
-   - Named varieties: "Saaz hops," "Maris Otter barley," "wild Lambic culture"
-   - Named local suppliers: "[Farmer name]'s wheat," "limestone quarry at Kinderheim"
-   - Named historical periods: "post-WWII reconstruction," "the 1952 flood"
-
-2. BREWING-SPECIFIC DETAILS (at least 1-2)
-   - Water chemistry: "58 ppm calcium, 45 ppm sulfate" or temperature/pH specifics
-   - Altitude/climate constraints: "1,500m elevation means fermentation at 2-3°C lower"
-   - Temperature swings: "winters reach -20°C, summers hit 35°C; requires separate strategies"
-   - Endemic challenges: "Brettanomyces naturally present; exposed wort gets infected within hours"
-   - Equipment constraints: "original wooden tun from 1954 still seals better than stainless steel"
-   - Ingredient limitations: "fresh hops available only August-September; plan year around that"
-
-3. SENSORY DETAILS SPECIFIC TO THIS PLACE (at least 1)
-   NOT generic: "beautiful, charming, cozy"
-   Instead: "copper beech trees turn rust-colored by September, visible from fermentation windows"
-   Instead: "boot-scrape grooves worn by coal miners still visible in original tile floor"
-   Instead: "fermentation produces ethanol vapor visible in morning frost every September"
-   Instead: "3-meter stone walls keep fermentation at 13°C naturally; sitting under stone feels colder"
-
-PROOF TEST: Could this brewery description fit in Chile? Germany? Japan?
- If YES: add more place-specific details
- If NO: you're on track. Identity should be inseparable from location.
-
-
-================================================================================
-TONE VARIATIONS
-================================================================================
-
-Rotate tones consciously. Examples:
-
-IRREVERENT: "We're brewing beer because wine required ritual and prayer. Less
-spirituality, more hops. Our ales are big, unpolished. Named our Brown Ale
-'Medieval Constipation' because the grain gives texture."
-
-MATTER-OF-FACT: "Brewing is applied chemistry. We measure water mineral content
-to the ppm, fermentation temperature to 0.5°C. Our Märzen has the same gravity,
-ABV, and color every single batch. Precision is our craft."
-
-WORKING-CLASS PROUD: "This isn't farm-to-table aspirational nonsense. It's a
-neighborhood beer. Four dollars a pint. No reservations, no tasting notes.
-Workers need somewhere to go."
-
-MINIMALIST: "We brew three beers. They're good. That's it."
-
-NOSTALGIC-GROUNDED: "My grandfather brewed in his basement. He died in 1995;
-I found his brewing logs in 2015. I copied his exact recipes. Now the
-fermentation smells like his basement."
-
-
-================================================================================
-LENGTH & CONTENT REQUIREMENTS
-================================================================================
-
-TARGET LENGTH: 150-250 words
-
-REQUIRED ELEMENTS:
- At least 2 concrete proper nouns (named locations, suppliers, historical moments)
- At least 1-2 brewing-specific details (water chemistry, altitude, equipment constraints)
- At least 1 sensory detail specific to this place (visible, olfactory, tactile, or temporal)
- Consistent tone throughout (irreverent, matter-of-fact, working-class, nostalgic, etc.)
- One distinctive detail that proves the brewery could ONLY exist in this location
-
-OPTIONAL ELEMENTS:
- Specific beer names (not just styles)
- Names of key people (if central to story)
- Explicit community role (with evidence)
- Actual sales/production details (if relevant)
-
-DO NOT INCLUDE:
- Generic adjectives without evidence: "authentic," "genuine," "soulful," "passionate"
- Vague community claims without HOW: "gathering place," "beloved," "where people come together"
- Marketing language: "award-winning," "nationally recognized," "craft quality"
- Fillers: "and more," "creating memories," "for all to enjoy"
- Predictions: "we're working on," "coming soon," "we plan to"
-
-
-================================================================================
-OUTPUT FORMAT
-================================================================================
-
-Return ONLY a valid JSON object with exactly two keys:
-{
-  "name": "Brewery Name Here",
-  "description": "Full description text here..."
-}
-
-Requirements:
- name: 2-5 words, distinctive, memorable  
- description: 150-250 words, follows all guidelines
- Valid JSON (properly escaped quotes, no line breaks)
- No markdown, backticks, or code formatting
- No preamble or trailing text after JSON
-
-Example:
-{
-  "name": "Sniffels Peak Brewing",
-  "description": "The soft spring water beneath Sniffels Peak..."
-}
-
-================================================================================
--- a/pipeline/src/biergarten_data_generator.cpp
+++ b/pipeline/src/biergarten_data_generator.cpp
@@ -1,158 +0,0 @@
-#include "biergarten_data_generator.h"
-
-#include <spdlog/spdlog.h>
-
-#include <algorithm>
-#include <filesystem>
-#include <unordered_map>
-
-#include "data_generation/data_downloader.h"
-#include "data_generation/llama_generator.h"
-#include "data_generation/mock_generator.h"
-#include "json_handling/json_loader.h"
-#include "wikipedia/wikipedia_service.h"
-
-BiergartenDataGenerator::BiergartenDataGenerator(
-    const ApplicationOptions& options, std::shared_ptr<WebClient> web_client,
-    SqliteDatabase& database)
-    : options_(options), webClient_(web_client), database_(database) {}
-
-std::unique_ptr<DataGenerator> BiergartenDataGenerator::InitializeGenerator() {
-   spdlog::info("Initializing brewery generator...");
-
-   std::unique_ptr<DataGenerator> generator;
-   if (options_.model_path.empty()) {
-      generator = std::make_unique<MockGenerator>();
-      spdlog::info("[Generator] Using MockGenerator (no model path provided)");
-   } else {
-      auto llama_generator = std::make_unique<LlamaGenerator>();
-      llama_generator->SetSamplingOptions(options_.temperature, options_.top_p,
-                                          options_.seed);
-      llama_generator->SetContextSize(options_.n_ctx);
-      spdlog::info(
-          "[Generator] Using LlamaGenerator: {} (temperature={}, top-p={}, "
-          "n_ctx={}, seed={})",
-          options_.model_path, options_.temperature, options_.top_p,
-          options_.n_ctx, options_.seed);
-      generator = std::move(llama_generator);
-   }
-   generator->Load(options_.model_path);
-
-   return generator;
-}
-
-void BiergartenDataGenerator::LoadGeographicData() {
-   std::string json_path = options_.cache_dir + "/countries+states+cities.json";
-   std::string db_path = options_.cache_dir + "/biergarten-pipeline.db";
-
-   bool has_json_cache = std::filesystem::exists(json_path);
-   bool has_db_cache = std::filesystem::exists(db_path);
-
-   spdlog::info("Initializing SQLite database at {}...", db_path);
-   database_.Initialize(db_path);
-
-   if (has_db_cache && has_json_cache) {
-      spdlog::info("[Pipeline] Cache hit: skipping download and parse");
-   } else {
-      spdlog::info("\n[Pipeline] Downloading geographic data from GitHub...");
-      DataDownloader downloader(webClient_);
-      downloader.DownloadCountriesDatabase(json_path, options_.commit);
-
-      JsonLoader::LoadWorldCities(json_path, database_);
-   }
-}
-
-std::vector<std::pair<City, std::string>>
-BiergartenDataGenerator::QueryCitiesWithCountries() {
-   spdlog::info("\n=== GEOGRAPHIC DATA OVERVIEW ===");
-
-   auto cities = database_.QueryCities();
-
-   // Build a quick map of country id -> name for per-city lookups.
-   auto all_countries = database_.QueryCountries(0);
-   std::unordered_map<int, std::string> country_map;
-   for (const auto& c : all_countries) {
-      country_map[c.id] = c.name;
-   }
-
-   spdlog::info("\nTotal records loaded:");
-   spdlog::info("  Countries: {}", database_.QueryCountries(0).size());
-   spdlog::info("  States: {}", database_.QueryStates(0).size());
-   spdlog::info("  Cities: {}", cities.size());
-
-   // Cap at 30 entries.
-   const size_t sample_count = std::min(size_t(30), cities.size());
-   std::vector<std::pair<City, std::string>> result;
-
-   for (size_t i = 0; i < sample_count; i++) {
-      const auto& city = cities[i];
-      std::string country_name;
-      const auto country_it = country_map.find(city.country_id);
-      if (country_it != country_map.end()) {
-         country_name = country_it->second;
-      }
-      result.push_back({city, country_name});
-   }
-
-   return result;
-}
-
-std::vector<BiergartenDataGenerator::EnrichedCity>
-BiergartenDataGenerator::EnrichWithWikipedia(
-    const std::vector<std::pair<City, std::string>>& cities) {
-   WikipediaService wikipedia_service(webClient_);
-   std::vector<EnrichedCity> enriched;
-
-   for (const auto& [city, country_name] : cities) {
-      const std::string region_context =
-          wikipedia_service.GetSummary(city.name, country_name);
-      spdlog::debug("[Pipeline] Region context for {}: {}", city.name,
-                    region_context);
-
-      enriched.push_back({city.id, city.name, country_name, region_context});
-   }
-
-   return enriched;
-}
-
-void BiergartenDataGenerator::GenerateBreweries(
-    DataGenerator& generator, const std::vector<EnrichedCity>& cities) {
-   spdlog::info("\n=== SAMPLE BREWERY GENERATION ===");
-   generatedBreweries_.clear();
-
-   for (const auto& enriched_city : cities) {
-      auto brewery = generator.GenerateBrewery(enriched_city.city_name,
-                                               enriched_city.country_name,
-                                               enriched_city.region_context);
-      generatedBreweries_.push_back(
-          {enriched_city.city_id, enriched_city.city_name, brewery});
-   }
-}
-
-void BiergartenDataGenerator::LogResults() const {
-   spdlog::info("\n=== GENERATED DATA DUMP ===");
-   for (size_t i = 0; i < generatedBreweries_.size(); i++) {
-      const auto& entry = generatedBreweries_[i];
-      spdlog::info("{}. city_id={} city=\"{}\"", i + 1, entry.city_id,
-                   entry.city_name);
-      spdlog::info("   brewery_name=\"{}\"", entry.brewery.name);
-      spdlog::info("   brewery_description=\"{}\"", entry.brewery.description);
-   }
-}
-
-int BiergartenDataGenerator::Run() {
-   try {
-      LoadGeographicData();
-      auto generator = InitializeGenerator();
-      auto cities = QueryCitiesWithCountries();
-      auto enriched = EnrichWithWikipedia(cities);
-      GenerateBreweries(*generator, enriched);
-      LogResults();
-
-      spdlog::info("\nOK: Pipeline completed successfully");
-      return 0;
-   } catch (const std::exception& e) {
-      spdlog::error("ERROR: Pipeline failed: {}", e.what());
-      return 1;
-   }
-}
--- a/pipeline/src/data_downloader.cpp
+++ b/pipeline/src/data_downloader.cpp
@@ -0,0 +1,163 @@
+/**
+ * @file data_downloader.cpp
+ * @brief Implementation of DataDownloader using libcurl for HTTP downloads.
+ *
+ * Provides robust downloading with proper error handling, timeout management,
+ * and local caching to avoid repeated network calls. Uses GitHub's raw content
+ * CDN for reliable high-bandwidth downloads.
+ */
+
+#include "data_downloader.h"
+#include <cstdio>
+#include <curl/curl.h>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <sys/stat.h>
+
+/**
+ * @brief Callback function for libcurl to write downloaded content to file.
+ *
+ * This callback is invoked repeatedly by curl as data arrives over the network.
+ * Each invocation contains a chunk of the response body. The function writes
+ * the content to the output file stream.
+ *
+ * @param contents  Pointer to buffer containing data chunk.
+ * @param size      Element size (libcurl documents this as always 1).
+ * @param nmemb     Number of elements in chunk.
+ * @param userp     Opaque pointer to the std::ofstream set via WRITEDATA.
+ *
+ * @return          Bytes consumed: (size * nmemb) on success, 0 when the
+ *                  stream write failed, which makes curl abort the transfer.
+ *
+ * @note libcurl requires this signature: (char* ptr, size_t size, size_t nmemb,
+ * void* userp)
+ */
+static size_t WriteCallback(void *contents, size_t size, size_t nmemb,
+                            void *userp) {
+  // Calculate total bytes in this chunk
+  const size_t realsize = size * nmemb;
+
+  // Cast userp back to ofstream
+  auto *outFile = static_cast<std::ofstream *>(userp);
+
+  // Write to file; a failed stream (disk full, I/O error) must not be ignored
+  outFile->write(static_cast<const char *>(contents), realsize);
+
+  // Returning less than realsize tells curl to stop with CURLE_WRITE_ERROR
+  return outFile->good() ? realsize : 0;
+}
+
+DataDownloader::DataDownloader() {
+  // Intentionally empty: curl_global_init() is not called here. It is expected
+  // to be called by the user or an external subsystem in a thread-safe manner,
+  // which avoids multiple initialization in multi-downloader scenarios.
+}
+
+DataDownloader::~DataDownloader() {
+  // Nothing to release here; curl_global_cleanup() is managed externally.
+}
+
+bool DataDownloader::FileExists(const std::string &filePath) const {
+  // A zero return from POSIX stat() is treated as "exists"; nothing is opened
+  struct stat info;
+  return stat(filePath.c_str(), &info) == 0;
+}
+
+std::string
+DataDownloader::DownloadCountriesDatabase(const std::string &cachePath,
+                                          const std::string &commit) {
+  // Check if file already cached locally
+  if (FileExists(cachePath)) {
+    std::cout << "[DataDownloader] Cache hit: " << cachePath << std::endl;
+    return cachePath;
+  }
+
+  // Construct download URL. A full commit hash is accepted, but only its
+  // first 7 chars (short hash) are used; shorter values pass through as-is.
+  const std::string shortCommit = commit.substr(0, 7);
+
+  std::string url = "https://raw.githubusercontent.com/dr5hn/"
+                    "countries-states-cities-database/" +
+                    shortCommit + "/json/countries+states+cities.json";
+
+  std::cout << "[DataDownloader] Downloading: " << url << std::endl;
+
+  // Initialize curl handle
+  CURL *curl = curl_easy_init();
+  if (!curl) {
+    throw std::runtime_error("[DataDownloader] Failed to initialize libcurl");
+  }
+
+  // Open output file for writing (binary mode to preserve exact bytes)
+  std::ofstream outFile(cachePath, std::ios::binary);
+  if (!outFile.is_open()) {
+    curl_easy_cleanup(curl);
+    throw std::runtime_error("[DataDownloader] Cannot open file for writing: " +
+                             cachePath);
+  }
+
+  // Configure curl for download
+  curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
+  curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteCallback);
+  curl_easy_setopt(curl, CURLOPT_WRITEDATA, static_cast<void *>(&outFile));
+
+  // Timeouts: 30 s to establish the connection, 300 s for the whole request
+  curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, 30L);
+  curl_easy_setopt(curl, CURLOPT_TIMEOUT, 300L);
+
+  // Follow redirects (CDN may redirect)
+  curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
+  curl_easy_setopt(curl, CURLOPT_MAXREDIRS, 5L);
+
+  // Use gzip compression if server supports it
+  curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, "gzip");
+
+  // Set user agent to identify the application
+  curl_easy_setopt(curl, CURLOPT_USERAGENT, "biergarten-pipeline/0.1.0");
+
+  // Perform the download, then flush and close the output file
+  CURLcode res = curl_easy_perform(curl);
+  outFile.close();
+
+  // Check for curl errors
+  if (res != CURLE_OK) {
+    curl_easy_cleanup(curl);
+
+    // Remove partially downloaded file
+    std::remove(cachePath.c_str());
+
+    throw std::runtime_error(std::string("[DataDownloader] Download failed: ") +
+                             curl_easy_strerror(res));
+  }
+
+  // Check HTTP response code
+  long httpCode = 0;
+  curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &httpCode);
+  curl_easy_cleanup(curl);
+
+  if (httpCode != 200) {
+    // Remove partially downloaded or error file
+    std::remove(cachePath.c_str());
+
+    std::stringstream ss;
+    ss << "[DataDownloader] HTTP error " << httpCode
+       << " (commit: " << shortCommit << ")";
+    throw std::runtime_error(ss.str());
+  }
+
+  // A failed write/close leaves a truncated file that would be cached forever
+  if (outFile.fail()) {
+    std::remove(cachePath.c_str());
+    throw std::runtime_error("[DataDownloader] Write failed: " + cachePath);
+  }
+
+  // Get file size for diagnostics
+  std::ifstream fileCheck(cachePath, std::ios::binary | std::ios::ate);
+  std::streamsize size = fileCheck.tellg();
+  fileCheck.close();
+
+  std::cout << "[DataDownloader] ✓ Download complete: " << cachePath << " ("
+            << (size / (1024.0 * 1024.0)) << " MB)" << std::endl;
+  return cachePath;
+}
--- a/pipeline/src/data_generation/data_downloader.cpp
+++ b/pipeline/src/data_generation/data_downloader.cpp
@@ -1,44 +0,0 @@
-#include "data_generation/data_downloader.h"
-
-#include <spdlog/spdlog.h>
-
-#include <filesystem>
-#include <fstream>
-#include <sstream>
-#include <stdexcept>
-
-#include "web_client/web_client.h"
-
-DataDownloader::DataDownloader(std::shared_ptr<WebClient> web_client)
-    : web_client_(std::move(web_client)) {}
-
-DataDownloader::~DataDownloader() {}
-
-bool DataDownloader::FileExists(const std::string& file_path) {
-   return std::filesystem::exists(file_path);
-}
-
-std::string DataDownloader::DownloadCountriesDatabase(
-    const std::string& cache_path, const std::string& commit) {
-   if (FileExists(cache_path)) {
-      spdlog::info("[DataDownloader] Cache hit: {}", cache_path);
-      return cache_path;
-   }
-
-   std::string url =
-       "https://raw.githubusercontent.com/dr5hn/"
-       "countries-states-cities-database/" +
-       commit + "/json/countries+states+cities.json";
-
-   spdlog::info("[DataDownloader] Downloading: {}", url);
-
-   web_client_->DownloadToFile(url, cache_path);
-
-   std::ifstream file_check(cache_path, std::ios::binary | std::ios::ate);
-   std::streamsize size = file_check.tellg();
-   file_check.close();
-
-   spdlog::info("[DataDownloader] OK: Download complete: {} ({:.2f} MB)",
-                cache_path, (size / (1024.0 * 1024.0)));
-   return cache_path;
-}
--- a/pipeline/src/data_generation/llama/destructor.cpp
+++ b/pipeline/src/data_generation/llama/destructor.cpp
@@ -1,31 +0,0 @@
-/**
- * Destructor Module
- * Ensures proper cleanup of llama.cpp resources (context and model) when the
- * generator is destroyed, preventing memory leaks and resource exhaustion.
- */
-
-#include "data_generation/llama_generator.h"
-#include "llama.h"
-
-LlamaGenerator::~LlamaGenerator() {
-   /**
-    * Free the inference context (contains KV cache and computation state)
-    */
-   if (context_ != nullptr) {
-      llama_free(context_);
-      context_ = nullptr;
-   }
-
-   /**
-    * Free the loaded model (contains weights and vocabulary)
-    */
-   if (model_ != nullptr) {
-      llama_model_free(model_);
-      model_ = nullptr;
-   }
-
-   /**
-    * Clean up the backend (GPU/CPU acceleration resources)
-    */
-   llama_backend_free();
-}
--- a/pipeline/src/data_generation/llama/generate_brewery.cpp
+++ b/pipeline/src/data_generation/llama/generate_brewery.cpp
@@ -1,107 +0,0 @@
-/**
- * Brewery Data Generation Module
- * Uses the LLM to generate realistic brewery names and descriptions for a given
- * location. Implements retry logic with validation and error correction to
- * ensure valid JSON output conforming to the expected schema.
- */
-
-#include <spdlog/spdlog.h>
-
-#include <stdexcept>
-#include <string>
-
-#include "data_generation/llama_generator.h"
-#include "data_generation/llama_generator_helpers.h"
-
-BreweryResult LlamaGenerator::GenerateBrewery(
-    const std::string& city_name, const std::string& country_name,
-    const std::string& region_context) {
-   /**
-    * Preprocess and truncate region context to manageable size
-    */
-   const std::string safe_region_context =
-       PrepareRegionContextPublic(region_context);
-
-   /**
-    * Load brewery system prompt from file
-    * Falls back to minimal inline prompt if file not found
-    * Default path: prompts/brewery_system_prompt_expanded.txt
-    */
-   const std::string system_prompt =
-       LoadBrewerySystemPrompt("prompts/brewery_system_prompt_expanded.txt");
-
-   /**
-    * User prompt: provides geographic context to guide generation towards
-    * culturally appropriate and locally-inspired brewery attributes
-    */
-   std::string prompt =
-       "Write a brewery name and place-specific long description for a craft "
-       "brewery in " +
-       city_name +
-       (country_name.empty() ? std::string("")
-                             : std::string(", ") + country_name) +
-       (safe_region_context.empty()
-            ? std::string(".")
-            : std::string(". Regional context: ") + safe_region_context);
-
-   /**
-    * Store location context for retry prompts (without repeating full context)
-    */
-   const std::string retry_location =
-       "Location: " + city_name +
-       (country_name.empty() ? std::string("")
-                             : std::string(", ") + country_name);
-
-   /**
-    * RETRY LOOP with validation and error correction
-    * Attempts to generate valid brewery data up to 3 times, with feedback-based
-    * refinement
-    */
-   const int max_attempts = 3;
-   std::string raw;
-   std::string last_error;
-
-   // Limit output length to keep it concise and focused
-   constexpr int max_tokens = 1052;
-   for (int attempt = 0; attempt < max_attempts; ++attempt) {
-      // Generate brewery data from LLM
-      raw = Infer(system_prompt, prompt, max_tokens);
-      spdlog::debug("LlamaGenerator: raw output (attempt {}): {}", attempt + 1,
-                    raw);
-
-      // Validate output: parse JSON and check required fields
-
-      std::string name;
-      std::string description;
-      const std::string validation_error =
-          ValidateBreweryJsonPublic(raw, name, description);
-      if (validation_error.empty()) {
-         // Success: return parsed brewery data
-         return {std::move(name), std::move(description)};
-      }
-
-      // Validation failed: log error and prepare corrective feedback
-
-      last_error = validation_error;
-      spdlog::warn("LlamaGenerator: malformed brewery JSON (attempt {}): {}",
-                   attempt + 1, validation_error);
-
-      // Update prompt with error details to guide LLM toward correct output.
-      // For retries, use a compact prompt format to avoid exceeding token
-      // limits.
-      prompt =
-          "Your previous response was invalid. Error: " + validation_error +
-          "\nReturn ONLY valid JSON with this exact schema: "
-          "{\"name\": \"string\", \"description\": \"string\"}."
-          "\nDo not include markdown, comments, or extra keys."
-          "\n\n" +
-          retry_location;
-   }
-
-   // All retry attempts exhausted: log failure and throw exception
-   spdlog::error(
-       "LlamaGenerator: malformed brewery response after {} attempts: "
-       "{}",
-       max_attempts, last_error.empty() ? raw : last_error);
-   throw std::runtime_error("LlamaGenerator: malformed brewery response");
-}
--- a/pipeline/src/data_generation/llama/generate_user.cpp
+++ b/pipeline/src/data_generation/llama/generate_user.cpp
@@ -1,102 +0,0 @@
-/**
- * User Profile Generation Module
- * Uses the LLM to generate realistic user profiles (username and bio) for craft
- * beer enthusiasts. Implements retry logic to handle parsing failures and
- * ensures output adheres to strict format constraints (two lines, specific
- * character limits).
- */
-
-#include <spdlog/spdlog.h>
-
-#include <algorithm>
-#include <stdexcept>
-#include <string>
-
-#include "data_generation/llama_generator.h"
-#include "data_generation/llama_generator_helpers.h"
-
-UserResult LlamaGenerator::GenerateUser(const std::string& locale) {
-   /**
-    * System prompt: specifies exact output format to minimize parsing errors
-    * Constraints: 2-line output, username format, bio length bounds
-    */
-   const std::string system_prompt =
-       "You generate plausible social media profiles for craft beer "
-       "enthusiasts. "
-       "Respond with exactly two lines: "
-       "the first line is a username (lowercase, no spaces, 8-20 characters), "
-       "the second line is a one-sentence bio (20-40 words). "
-       "The profile should feel consistent with the locale. "
-       "No preamble, no labels.";
-
-   /**
-    * User prompt: locale parameter guides cultural appropriateness of generated
-    * profiles
-    */
-   std::string prompt =
-       "Generate a craft beer enthusiast profile. Locale: " + locale;
-
-   /**
-    * RETRY LOOP with format validation
-    * Attempts up to 3 times to generate valid user profile with correct format
-    */
-   const int max_attempts = 3;
-   std::string raw;
-   for (int attempt = 0; attempt < max_attempts; ++attempt) {
-      /**
-       * Generate user profile (max 128 tokens - should fit 2 lines easily)
-       */
-      raw = Infer(system_prompt, prompt, 128);
-      spdlog::debug("LlamaGenerator (user): raw output (attempt {}): {}",
-                    attempt + 1, raw);
-
-      try {
-         /**
-          * Parse two-line response: first line = username, second line = bio
-          */
-         auto [username, bio] = ParseTwoLineResponsePublic(
-             raw, "LlamaGenerator: malformed user response");
-
-         /**
-          * Remove any whitespace from username (usernames shouldn't have
-          * spaces)
-          */
-         username.erase(
-             std::remove_if(username.begin(), username.end(),
-                            [](unsigned char ch) { return std::isspace(ch); }),
-             username.end());
-
-         /**
-          * Validate both fields are non-empty after processing
-          */
-         if (username.empty() || bio.empty()) {
-            throw std::runtime_error("LlamaGenerator: malformed user response");
-         }
-
-         /**
-          * Truncate bio if exceeds reasonable length for bio field
-          */
-         if (bio.size() > 200) bio = bio.substr(0, 200);
-
-         /**
-          * Success: return parsed user profile
-          */
-         return {username, bio};
-      } catch (const std::exception& e) {
-         /**
-          * Parsing failed: log and continue to next attempt
-          */
-         spdlog::warn(
-             "LlamaGenerator: malformed user response (attempt {}): {}",
-             attempt + 1, e.what());
-      }
-   }
-
-   /**
-    * All retry attempts exhausted: log failure and throw exception
-    */
-   spdlog::error(
-       "LlamaGenerator: malformed user response after {} attempts: {}",
-       max_attempts, raw);
-   throw std::runtime_error("LlamaGenerator: malformed user response");
-}
--- a/pipeline/src/data_generation/llama/helpers.cpp
+++ b/pipeline/src/data_generation/llama/helpers.cpp
@@ -1,441 +0,0 @@
-/**
- * Helper Functions Module
- * Provides utility functions for text processing, parsing, and chat template
- * formatting. Functions handle whitespace normalization, response parsing, and
- * conversion of prompts to proper chat format using the model's built-in
- * template.
- */
-
-#include <algorithm>
-#include <array>
-#include <boost/json.hpp>
-#include <cctype>
-#include <sstream>
-#include <stdexcept>
-#include <string>
-#include <vector>
-
-#include "data_generation/llama_generator.h"
-#include "llama.h"
-
-namespace {
-
-/**
- * String trimming: removes leading and trailing whitespace
- */
-std::string Trim(std::string value) {
-   auto not_space = [](unsigned char ch) { return !std::isspace(ch); };
-
-   value.erase(value.begin(),
-               std::find_if(value.begin(), value.end(), not_space));
-   value.erase(std::find_if(value.rbegin(), value.rend(), not_space).base(),
-               value.end());
-
-   return value;
-}
-
-/**
- * Normalize whitespace: collapses multiple spaces/tabs/newlines into single
- * spaces
- */
-std::string CondenseWhitespace(std::string text) {
-   std::string out;
-   out.reserve(text.size());
-
-   bool in_whitespace = false;
-   for (unsigned char ch : text) {
-      if (std::isspace(ch)) {
-         if (!in_whitespace) {
-            out.push_back(' ');
-            in_whitespace = true;
-         }
-         continue;
-      }
-
-      in_whitespace = false;
-      out.push_back(static_cast<char>(ch));
-   }
-
-   return Trim(std::move(out));
-}
-
-/**
- * Truncate region context to fit within max length while preserving word
- * boundaries
- */
-std::string PrepareRegionContext(std::string_view region_context,
-                                 std::size_t max_chars) {
-   std::string normalized = CondenseWhitespace(std::string(region_context));
-   if (normalized.size() <= max_chars) {
-      return normalized;
-   }
-
-   normalized.resize(max_chars);
-   const std::size_t last_space = normalized.find_last_of(' ');
-   if (last_space != std::string::npos && last_space > max_chars / 2) {
-      normalized.resize(last_space);
-   }
-
-   normalized += "...";
-   return normalized;
-}
-
-/**
- * Remove common bullet points, numbers, and field labels added by LLM in output
- */
-std::string StripCommonPrefix(std::string line) {
-   line = Trim(std::move(line));
-
-   if (!line.empty() && (line[0] == '-' || line[0] == '*')) {
-      line = Trim(line.substr(1));
-   } else {
-      std::size_t i = 0;
-      while (i < line.size() &&
-             std::isdigit(static_cast<unsigned char>(line[i]))) {
-         ++i;
-      }
-      if (i > 0 && i < line.size() && (line[i] == '.' || line[i] == ')')) {
-         line = Trim(line.substr(i + 1));
-      }
-   }
-
-   auto strip_label = [&line](const std::string& label) {
-      if (line.size() >= label.size()) {
-         bool matches = true;
-         for (std::size_t i = 0; i < label.size(); ++i) {
-            if (std::tolower(static_cast<unsigned char>(line[i])) !=
-                std::tolower(static_cast<unsigned char>(label[i]))) {
-               matches = false;
-               break;
-            }
-         }
-         if (matches) {
-            line = Trim(line.substr(label.size()));
-         }
-      }
-   };
-
-   strip_label("name:");
-   strip_label("brewery name:");
-   strip_label("description:");
-   strip_label("username:");
-   strip_label("bio:");
-
-   return Trim(std::move(line));
-}
-
-/**
- * Parse two-line response from LLM: normalize line endings, strip formatting,
- * filter spurious output, and combine remaining lines if needed
- */
-std::pair<std::string, std::string> ParseTwoLineResponse(
-    const std::string& raw, const std::string& error_message) {
-   std::string normalized = raw;
-   std::replace(normalized.begin(), normalized.end(), '\r', '\n');
-
-   std::vector<std::string> lines;
-   std::stringstream stream(normalized);
-   std::string line;
-   while (std::getline(stream, line)) {
-      line = StripCommonPrefix(std::move(line));
-      if (!line.empty()) lines.push_back(std::move(line));
-   }
-
-   std::vector<std::string> filtered;
-   for (auto& l : lines) {
-      std::string low = l;
-      std::transform(low.begin(), low.end(), low.begin(), [](unsigned char c) {
-         return static_cast<char>(std::tolower(c));
-      });
-      // Filter known thinking tags like <think>...</think>, but be conservative
-      // to avoid removing legitimate output. Only filter specific known
-      // patterns.
-      if (!l.empty() && l.front() == '<' && low.back() == '>') {
-         // Only filter if it's a known thinking tag: <think>, <reasoning>, etc.
-         if (low.find("think") != std::string::npos ||
-             low.find("reasoning") != std::string::npos ||
-             low.find("reflect") != std::string::npos) {
-            continue;
-         }
-      }
-      if (low.rfind("okay,", 0) == 0 || low.rfind("hmm", 0) == 0) continue;
-      filtered.push_back(std::move(l));
-   }
-
-   if (filtered.size() < 2) throw std::runtime_error(error_message);
-
-   std::string first = Trim(filtered.front());
-   std::string second;
-   for (size_t i = 1; i < filtered.size(); ++i) {
-      if (!second.empty()) second += ' ';
-      second += filtered[i];
-   }
-   second = Trim(std::move(second));
-
-   if (first.empty() || second.empty()) throw std::runtime_error(error_message);
-   return {first, second};
-}
-
-/**
- * Apply model's chat template to user-only prompt, formatting it for the model
- */
-std::string ToChatPrompt(const llama_model* model,
-                         const std::string& user_prompt) {
-   const char* tmpl = llama_model_chat_template(model, nullptr);
-   if (tmpl == nullptr) {
-      return user_prompt;
-   }
-
-   const llama_chat_message message{"user", user_prompt.c_str()};
-
-   std::vector<char> buffer(
-       std::max<std::size_t>(1024, user_prompt.size() * 4));
-   int32_t required =
-       llama_chat_apply_template(tmpl, &message, 1, true, buffer.data(),
-                                 static_cast<int32_t>(buffer.size()));
-
-   if (required < 0) {
-      throw std::runtime_error("LlamaGenerator: failed to apply chat template");
-   }
-
-   if (required >= static_cast<int32_t>(buffer.size())) {
-      buffer.resize(static_cast<std::size_t>(required) + 1);
-      required =
-          llama_chat_apply_template(tmpl, &message, 1, true, buffer.data(),
-                                    static_cast<int32_t>(buffer.size()));
-      if (required < 0) {
-         throw std::runtime_error(
-             "LlamaGenerator: failed to apply chat template");
-      }
-   }
-
-   return std::string(buffer.data(), static_cast<std::size_t>(required));
-}
-
-/**
- * Apply model's chat template to system+user prompt pair, formatting for the
- * model
- */
-std::string ToChatPrompt(const llama_model* model,
-                         const std::string& system_prompt,
-                         const std::string& user_prompt) {
-   const char* tmpl = llama_model_chat_template(model, nullptr);
-   if (tmpl == nullptr) {
-      return system_prompt + "\n\n" + user_prompt;
-   }
-
-   const llama_chat_message messages[2] = {{"system", system_prompt.c_str()},
-                                           {"user", user_prompt.c_str()}};
-
-   std::vector<char> buffer(std::max<std::size_t>(
-       1024, (system_prompt.size() + user_prompt.size()) * 4));
-   int32_t required =
-       llama_chat_apply_template(tmpl, messages, 2, true, buffer.data(),
-                                 static_cast<int32_t>(buffer.size()));
-
-   if (required < 0) {
-      throw std::runtime_error("LlamaGenerator: failed to apply chat template");
-   }
-
-   if (required >= static_cast<int32_t>(buffer.size())) {
-      buffer.resize(static_cast<std::size_t>(required) + 1);
-      required =
-          llama_chat_apply_template(tmpl, messages, 2, true, buffer.data(),
-                                    static_cast<int32_t>(buffer.size()));
-      if (required < 0) {
-         throw std::runtime_error(
-             "LlamaGenerator: failed to apply chat template");
-      }
-   }
-
-   return std::string(buffer.data(), static_cast<std::size_t>(required));
-}
-
-void AppendTokenPiece(const llama_vocab* vocab, llama_token token,
-                      std::string& output) {
-   std::array<char, 256> buffer{};
-   int32_t bytes =
-       llama_token_to_piece(vocab, token, buffer.data(),
-                            static_cast<int32_t>(buffer.size()), 0, true);
-
-   if (bytes < 0) {
-      std::vector<char> dynamic_buffer(static_cast<std::size_t>(-bytes));
-      bytes = llama_token_to_piece(vocab, token, dynamic_buffer.data(),
-                                   static_cast<int32_t>(dynamic_buffer.size()),
-                                   0, true);
-      if (bytes < 0) {
-         throw std::runtime_error(
-             "LlamaGenerator: failed to decode sampled token piece");
-      }
-
-      output.append(dynamic_buffer.data(), static_cast<std::size_t>(bytes));
-      return;
-   }
-
-   output.append(buffer.data(), static_cast<std::size_t>(bytes));
-}
-
-bool ExtractFirstJsonObject(const std::string& text, std::string& json_out) {
-   std::size_t start = std::string::npos;
-   int depth = 0;
-   bool in_string = false;
-   bool escaped = false;
-
-   for (std::size_t i = 0; i < text.size(); ++i) {
-      const char ch = text[i];
-
-      if (in_string) {
-         if (escaped) {
-            escaped = false;
-         } else if (ch == '\\') {
-            escaped = true;
-         } else if (ch == '"') {
-            in_string = false;
-         }
-         continue;
-      }
-
-      if (ch == '"') {
-         in_string = true;
-         continue;
-      }
-
-      if (ch == '{') {
-         if (depth == 0) {
-            start = i;
-         }
-         ++depth;
-         continue;
-      }
-
-      if (ch == '}') {
-         if (depth == 0) {
-            continue;
-         }
-         --depth;
-         if (depth == 0 && start != std::string::npos) {
-            json_out = text.substr(start, i - start + 1);
-            return true;
-         }
-      }
-   }
-
-   return false;
-}
-
-std::string ValidateBreweryJson(const std::string& raw, std::string& name_out,
-                                std::string& description_out) {
-   auto validate_object = [&](const boost::json::value& jv,
-                              std::string& error_out) -> bool {
-      if (!jv.is_object()) {
-         error_out = "JSON root must be an object";
-         return false;
-      }
-
-      const auto& obj = jv.get_object();
-      if (!obj.contains("name") || !obj.at("name").is_string()) {
-         error_out = "JSON field 'name' is missing or not a string";
-         return false;
-      }
-
-      if (!obj.contains("description") || !obj.at("description").is_string()) {
-         error_out = "JSON field 'description' is missing or not a string";
-         return false;
-      }
-
-      name_out = Trim(std::string(obj.at("name").as_string().c_str()));
-      description_out =
-          Trim(std::string(obj.at("description").as_string().c_str()));
-
-      if (name_out.empty()) {
-         error_out = "JSON field 'name' must not be empty";
-         return false;
-      }
-
-      if (description_out.empty()) {
-         error_out = "JSON field 'description' must not be empty";
-         return false;
-      }
-
-      std::string name_lower = name_out;
-      std::string description_lower = description_out;
-      std::transform(
-          name_lower.begin(), name_lower.end(), name_lower.begin(),
-          [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
-      std::transform(description_lower.begin(), description_lower.end(),
-                     description_lower.begin(), [](unsigned char c) {
-                        return static_cast<char>(std::tolower(c));
-                     });
-
-      if (name_lower == "string" || description_lower == "string") {
-         error_out = "JSON appears to be a schema placeholder, not content";
-         return false;
-      }
-
-      error_out.clear();
-      return true;
-   };
-
-   boost::system::error_code ec;
-   boost::json::value jv = boost::json::parse(raw, ec);
-   std::string validation_error;
-   if (ec) {
-      std::string extracted;
-      if (!ExtractFirstJsonObject(raw, extracted)) {
-         return "JSON parse error: " + ec.message();
-      }
-
-      ec.clear();
-      jv = boost::json::parse(extracted, ec);
-      if (ec) {
-         return "JSON parse error: " + ec.message();
-      }
-
-      if (!validate_object(jv, validation_error)) {
-         return validation_error;
-      }
-
-      return {};
-   }
-
-   if (!validate_object(jv, validation_error)) {
-      return validation_error;
-   }
-
-   return {};
-}
-
-}  // namespace
-
-// Forward declarations for helper functions exposed to other translation units
-std::string PrepareRegionContextPublic(std::string_view region_context,
-                                       std::size_t max_chars) {
-   return PrepareRegionContext(region_context, max_chars);
-}
-
-std::pair<std::string, std::string> ParseTwoLineResponsePublic(
-    const std::string& raw, const std::string& error_message) {
-   return ParseTwoLineResponse(raw, error_message);
-}
-
-std::string ToChatPromptPublic(const llama_model* model,
-                               const std::string& user_prompt) {
-   return ToChatPrompt(model, user_prompt);
-}
-
-std::string ToChatPromptPublic(const llama_model* model,
-                               const std::string& system_prompt,
-                               const std::string& user_prompt) {
-   return ToChatPrompt(model, system_prompt, user_prompt);
-}
-
-void AppendTokenPiecePublic(const llama_vocab* vocab, llama_token token,
-                            std::string& output) {
-   AppendTokenPiece(vocab, token, output);
-}
-
-std::string ValidateBreweryJsonPublic(const std::string& raw,
-                                      std::string& name_out,
-                                      std::string& description_out) {
-   return ValidateBreweryJson(raw, name_out, description_out);
-}
--- a/pipeline/src/data_generation/llama/infer.cpp
+++ b/pipeline/src/data_generation/llama/infer.cpp
@@ -1,196 +0,0 @@
-/**
- * Text Generation / Inference Module
- * Core module that performs LLM inference: converts text prompts into tokens,
- * runs the neural network forward pass, samples the next token, and converts
- * output tokens back to text. Supports both simple and system+user prompts.
- */
-
-#include <spdlog/spdlog.h>
-
-#include <algorithm>
-#include <memory>
-#include <stdexcept>
-#include <string>
-#include <vector>
-
-#include "data_generation/llama_generator.h"
-#include "data_generation/llama_generator_helpers.h"
-#include "llama.h"
-
-std::string LlamaGenerator::Infer(const std::string& prompt, int max_tokens) {
-   return InferFormatted(ToChatPromptPublic(model_, prompt), max_tokens);
-}
-
-std::string LlamaGenerator::Infer(const std::string& system_prompt,
-                                  const std::string& prompt, int max_tokens) {
-   return InferFormatted(ToChatPromptPublic(model_, system_prompt, prompt),
-                         max_tokens);
-}
-
-std::string LlamaGenerator::InferFormatted(const std::string& formatted_prompt,
-                                           int max_tokens) {
-   /**
-    * Validate that model and context are loaded
-    */
-   if (model_ == nullptr || context_ == nullptr)
-      throw std::runtime_error("LlamaGenerator: model not loaded");
-
-   /**
-    * Get vocabulary for tokenization and token-to-text conversion
-    */
-   const llama_vocab* vocab = llama_model_get_vocab(model_);
-   if (vocab == nullptr)
-      throw std::runtime_error("LlamaGenerator: vocab unavailable");
-
-   /**
-    * Clear KV cache to ensure clean inference state (no residual context)
-    */
-   llama_memory_clear(llama_get_memory(context_), true);
-
-   /**
-    * TOKENIZATION PHASE
-    * Convert text prompt into token IDs (integers) that the model understands
-    */
-   std::vector<llama_token> prompt_tokens(formatted_prompt.size() + 8);
-   int32_t token_count = llama_tokenize(
-       vocab, formatted_prompt.c_str(),
-       static_cast<int32_t>(formatted_prompt.size()), prompt_tokens.data(),
-       static_cast<int32_t>(prompt_tokens.size()), true, true);
-
-   /**
-    * If buffer too small, negative return indicates required size
-    */
-   if (token_count < 0) {
-      prompt_tokens.resize(static_cast<std::size_t>(-token_count));
-      token_count = llama_tokenize(
-          vocab, formatted_prompt.c_str(),
-          static_cast<int32_t>(formatted_prompt.size()), prompt_tokens.data(),
-          static_cast<int32_t>(prompt_tokens.size()), true, true);
-   }
-
-   if (token_count < 0)
-      throw std::runtime_error("LlamaGenerator: prompt tokenization failed");
-
-   /**
-    * CONTEXT SIZE VALIDATION
-    * Validate and compute effective token budgets based on context window
-    * constraints
-    */
-   const int32_t n_ctx = static_cast<int32_t>(llama_n_ctx(context_));
-   const int32_t n_batch = static_cast<int32_t>(llama_n_batch(context_));
-   if (n_ctx <= 1 || n_batch <= 0)
-      throw std::runtime_error("LlamaGenerator: invalid context or batch size");
-
-   /**
-    * Clamp generation limit to available context window, reserve space for
-    * output
-    */
-   const int32_t effective_max_tokens =
-       std::max(1, std::min(max_tokens, n_ctx - 1));
-   /**
-    * Prompt can use remaining context after reserving space for generation
-    */
-   int32_t prompt_budget = std::min(n_batch, n_ctx - effective_max_tokens);
-   prompt_budget = std::max<int32_t>(1, prompt_budget);
-
-   /**
-    * Truncate prompt if necessary to fit within constraints
-    */
-   prompt_tokens.resize(static_cast<std::size_t>(token_count));
-   if (token_count > prompt_budget) {
-      spdlog::warn(
-          "LlamaGenerator: prompt too long ({} tokens), truncating to {} "
-          "tokens to fit n_batch/n_ctx limits",
-          token_count, prompt_budget);
-      prompt_tokens.resize(static_cast<std::size_t>(prompt_budget));
-      token_count = prompt_budget;
-   }
-
-   /**
-    * PROMPT PROCESSING PHASE
-    * Create a batch containing all prompt tokens and feed through the model
-    * This computes internal representations and fills the KV cache
-    */
-   const llama_batch prompt_batch = llama_batch_get_one(
-       prompt_tokens.data(), static_cast<int32_t>(prompt_tokens.size()));
-   if (llama_decode(context_, prompt_batch) != 0)
-      throw std::runtime_error("LlamaGenerator: prompt decode failed");
-
-   /**
-    * SAMPLER CONFIGURATION PHASE
-    * Set up the probabilistic token selection pipeline (sampler chain)
-    * Samplers are applied in sequence: temperature -> top-p -> distribution
-    */
-   llama_sampler_chain_params sampler_params =
-       llama_sampler_chain_default_params();
-   using SamplerPtr =
-       std::unique_ptr<llama_sampler, decltype(&llama_sampler_free)>;
-   SamplerPtr sampler(llama_sampler_chain_init(sampler_params),
-                      &llama_sampler_free);
-   if (!sampler)
-      throw std::runtime_error("LlamaGenerator: failed to initialize sampler");
-
-   /**
-    * Temperature: scales logits before softmax (controls randomness)
-    */
-   llama_sampler_chain_add(sampler.get(),
-                           llama_sampler_init_temp(sampling_temperature_));
-   /**
-    * Top-P: nucleus sampling - filters to most likely tokens summing to top_p
-    * probability
-    */
-   llama_sampler_chain_add(sampler.get(),
-                           llama_sampler_init_top_p(sampling_top_p_, 1));
-   /**
-    * Distribution sampler: selects actual token using configured seed for
-    * reproducibility
-    */
-   llama_sampler_chain_add(sampler.get(),
-                           llama_sampler_init_dist(sampling_seed_));
-
-   /**
-    * TOKEN GENERATION LOOP
-    * Iteratively generate tokens one at a time until max_tokens or
-    * end-of-sequence
-    */
-   std::vector<llama_token> generated_tokens;
-   generated_tokens.reserve(static_cast<std::size_t>(effective_max_tokens));
-
-   for (int i = 0; i < effective_max_tokens; ++i) {
-      /**
-       * Sample next token using configured sampler chain and model logits
-       * Index -1 means use the last output position from previous batch
-       */
-      const llama_token next =
-          llama_sampler_sample(sampler.get(), context_, -1);
-      /**
-       * Stop if model predicts end-of-generation token (EOS/EOT)
-       */
-      if (llama_vocab_is_eog(vocab, next)) break;
-      generated_tokens.push_back(next);
-      /**
-       * Feed the sampled token back into model for next iteration
-       * (autoregressive)
-       */
-      llama_token token = next;
-      const llama_batch one_token_batch = llama_batch_get_one(&token, 1);
-      if (llama_decode(context_, one_token_batch) != 0)
-         throw std::runtime_error(
-             "LlamaGenerator: decode failed during generation");
-   }
-
-   /**
-    * DETOKENIZATION PHASE
-    * Convert generated token IDs back to text using vocabulary
-    */
-   std::string output;
-   for (const llama_token token : generated_tokens)
-      AppendTokenPiecePublic(vocab, token, output);
-
-   /**
-    * Advance seed for next generation to improve output diversity
-    */
-   sampling_seed_ = (sampling_seed_ == 0xFFFFFFFFu) ? 0 : sampling_seed_ + 1;
-
-   return output;
-}
--- a/pipeline/src/data_generation/llama/load.cpp
+++ b/pipeline/src/data_generation/llama/load.cpp
@@ -1,56 +0,0 @@
-/**
- * Model Loading Module
- * This module handles loading a pre-trained LLM model from disk and
- * initializing the llama.cpp context for inference. It performs one-time setup
- * required before any inference operations can be performed.
- */
-
-#include <spdlog/spdlog.h>
-
-#include <stdexcept>
-#include <string>
-
-#include "data_generation/llama_generator.h"
-#include "llama.h"
-
-void LlamaGenerator::Load(const std::string& model_path) {
-   /**
-    * Validate input and clean up any previously loaded model/context
-    */
-   if (model_path.empty())
-      throw std::runtime_error("LlamaGenerator: model path must not be empty");
-
-   if (context_ != nullptr) {
-      llama_free(context_);
-      context_ = nullptr;
-   }
-   if (model_ != nullptr) {
-      llama_model_free(model_);
-      model_ = nullptr;
-   }
-
-   /**
-    * Initialize the llama backend (one-time setup for GPU/CPU acceleration)
-    */
-   llama_backend_init();
-
-   llama_model_params model_params = llama_model_default_params();
-   model_ = llama_model_load_from_file(model_path.c_str(), model_params);
-   if (model_ == nullptr) {
-      throw std::runtime_error(
-          "LlamaGenerator: failed to load model from path: " + model_path);
-   }
-
-   llama_context_params context_params = llama_context_default_params();
-   context_params.n_ctx = n_ctx_;
-   context_params.n_batch = n_ctx_;  // Set batch size equal to context window
-
-   context_ = llama_init_from_model(model_, context_params);
-   if (context_ == nullptr) {
-      llama_model_free(model_);
-      model_ = nullptr;
-      throw std::runtime_error("LlamaGenerator: failed to create context");
-   }
-
-   spdlog::info("[LlamaGenerator] Loaded model: {}", model_path);
-}
--- a/pipeline/src/data_generation/llama/load_brewery_prompt.cpp
+++ b/pipeline/src/data_generation/llama/load_brewery_prompt.cpp
@@ -1,74 +0,0 @@
-#include <fstream>
-#include <filesystem>
-#include <spdlog/spdlog.h>
-
-#include "data_generation/llama_generator.h"
-
-namespace fs = std::filesystem;
-
-std::string LlamaGenerator::LoadBrewerySystemPrompt(
-    const std::string& prompt_file_path) {
-   // Return cached version if already loaded
-   if (!brewery_system_prompt_.empty()) {
-      return brewery_system_prompt_;
-   }
-
-   // Try multiple path locations
-   std::vector<std::string> paths_to_try = {
-       prompt_file_path,                          // As provided
-       "../" + prompt_file_path,                  // One level up
-       "../../" + prompt_file_path,               // Two levels up
-   };
-
-   for (const auto& path : paths_to_try) {
-      std::ifstream prompt_file(path);
-      if (prompt_file.is_open()) {
-         std::string prompt((std::istreambuf_iterator<char>(prompt_file)),
-                            std::istreambuf_iterator<char>());
-         prompt_file.close();
-
-         if (!prompt.empty()) {
-            spdlog::info(
-                "LlamaGenerator: Loaded brewery system prompt from '{}' ({} chars)",
-                path, prompt.length());
-            brewery_system_prompt_ = prompt;
-            return brewery_system_prompt_;
-         }
-      }
-   }
-
-   spdlog::warn(
-       "LlamaGenerator: Could not open brewery system prompt file at any of the "
-       "expected locations. Using fallback inline prompt.");
-   return GetFallbackBreweryPrompt();
-}
-
-// Fallback: minimal inline prompt if file fails to load
-std::string LlamaGenerator::GetFallbackBreweryPrompt() {
-   return "You are an experienced brewmaster and owner of a local craft brewery. "
-          "Create a distinctive, authentic name and detailed description that "
-          "genuinely reflects your specific location, brewing philosophy, local "
-          "culture, and community connection. The brewery must feel real and "
-          "grounded—not generic or interchangeable.\n\n"
-          "AVOID REPETITIVE PHRASES - Never use:\n"
-          "Love letter to, tribute to, rolling hills, picturesque, every sip "
-          "tells a story, Come for X stay for Y, rich history, passion, woven "
-          "into, ancient roots, timeless, where tradition meets innovation\n\n"
-          "OPENING APPROACHES - Choose ONE:\n"
-          "1. Start with specific beer style and its regional origins\n"
-          "2. Begin with specific brewing challenge (water, altitude, climate)\n"
-          "3. Open with founding story or personal motivation\n"
-          "4. Lead with specific local ingredient or resource\n"
-          "5. Start with unexpected angle or contradiction\n"
-          "6. Open with local event, tradition, or cultural moment\n"
-          "7. Begin with tangible architectural or geographic detail\n\n"
-          "BE SPECIFIC - Include:\n"
-          "- At least ONE concrete proper noun (landmark, river, neighborhood)\n"
-          "- Specific beer styles relevant to the REGION'S culture\n"
-          "- Concrete brewing challenges or advantages\n"
-          "- Sensory details SPECIFIC to place—not generic adjectives\n\n"
-          "LENGTH: 150-250 words. TONE: Can be soulful, irreverent, "
-          "matter-of-fact, unpretentious, or minimalist.\n\n"
-          "Output ONLY a raw JSON object with keys name and description. "
-          "No markdown, backticks, preamble, or trailing text.";
-}
--- a/pipeline/src/data_generation/llama/set_sampling_options.cpp
+++ b/pipeline/src/data_generation/llama/set_sampling_options.cpp
@@ -1,65 +0,0 @@
-/**
- * Sampling Configuration Module
- * Configures the hyperparameters that control probabilistic token selection
- * during text generation. These settings affect the randomness, diversity, and
- * quality of generated output.
- */
-
-#include <stdexcept>
-
-#include "data_generation/llama_generator.h"
-#include "llama.h"
-
-void LlamaGenerator::SetSamplingOptions(float temperature, float top_p,
-                                        int seed) {
-   /**
-    * Validate temperature: controls randomness in output distribution
-    * 0.0 = deterministic (always pick highest probability token)
-    * Higher values = more random/diverse output
-    */
-   if (temperature < 0.0f) {
-      throw std::runtime_error(
-          "LlamaGenerator: sampling temperature must be >= 0");
-   }
-
-   /**
-    * Validate top-p (nucleus sampling): only sample from top cumulative
-    * probability e.g., top-p=0.9 means sample from tokens that make up 90% of
-    * probability mass
-    */
-   if (!(top_p > 0.0f && top_p <= 1.0f)) {
-      throw std::runtime_error(
-          "LlamaGenerator: sampling top-p must be in (0, 1]");
-   }
-
-   /**
-    * Validate seed: for reproducible results (-1 uses random seed)
-    */
-   if (seed < -1) {
-      throw std::runtime_error(
-          "LlamaGenerator: seed must be >= 0, or -1 for random");
-   }
-
-   /**
-    * Store sampling parameters for use during token generation
-    */
-   sampling_temperature_ = temperature;
-   sampling_top_p_ = top_p;
-   sampling_seed_ = (seed < 0) ? static_cast<uint32_t>(LLAMA_DEFAULT_SEED)
-                               : static_cast<uint32_t>(seed);
-}
-
-void LlamaGenerator::SetContextSize(uint32_t n_ctx) {
-   /**
-    * Validate context size: must be positive and reasonable for the model
-    */
-   if (n_ctx == 0 || n_ctx > 32768) {
-      throw std::runtime_error(
-          "LlamaGenerator: context size must be in range [1, 32768]");
-   }
-
-   /**
-    * Store context size for use during model loading
-    */
-   n_ctx_ = n_ctx;
-}
--- a/pipeline/src/data_generation/mock/data.cpp
+++ b/pipeline/src/data_generation/mock/data.cpp
@@ -1,65 +0,0 @@
-#include <string>
-#include <vector>
-
-#include "data_generation/mock_generator.h"
-
-const std::vector<std::string> MockGenerator::kBreweryAdjectives = {
-    "Craft",      "Heritage", "Local",  "Artisan",  "Pioneer",    "Golden",
-    "Modern",     "Classic",  "Summit", "Northern", "Riverstone", "Barrel",
-    "Hinterland", "Harbor",   "Wild",   "Granite",  "Copper",     "Maple"};
-
-const std::vector<std::string> MockGenerator::kBreweryNouns = {
-    "Brewing Co.", "Brewery",    "Bier Haus", "Taproom",      "Works",
-    "House",       "Fermentery", "Ale Co.",   "Cellars",      "Collective",
-    "Project",     "Foundry",    "Malthouse", "Public House", "Co-op",
-    "Lab",         "Beer Hall",  "Guild"};
-
-const std::vector<std::string> MockGenerator::kBreweryDescriptions = {
-    "Handcrafted pale ales and seasonal IPAs with local ingredients.",
-    "Traditional lagers and experimental sours in small batches.",
-    "Award-winning stouts and wildly hoppy blonde ales.",
-    "Craft brewery specializing in Belgian-style triples and dark porters.",
-    "Modern brewery blending tradition with bold experimental flavors.",
-    "Neighborhood-focused taproom pouring crisp pilsners and citrusy pale "
-    "ales.",
-    "Small-batch brewery known for barrel-aged releases and smoky lagers.",
-    "Independent brewhouse pairing farmhouse ales with rotating food pop-ups.",
-    "Community brewpub making balanced bitters, saisons, and hazy IPAs.",
-    "Experimental nanobrewery exploring local yeast and regional grains.",
-    "Family-run brewery producing smooth amber ales and robust porters.",
-    "Urban brewery crafting clean lagers and bright, fruit-forward sours.",
-    "Riverfront brewhouse featuring oak-matured ales and seasonal blends.",
-    "Modern taproom focused on sessionable lagers and classic pub styles.",
-    "Brewery rooted in tradition with a lineup of malty reds and crisp lagers.",
-    "Creative brewery offering rotating collaborations and limited draft-only "
-    "pours.",
-    "Locally inspired brewery serving approachable ales with bold hop "
-    "character.",
-    "Destination taproom known for balanced IPAs and cocoa-rich stouts."};
-
-const std::vector<std::string> MockGenerator::kUsernames = {
-    "hopseeker",     "malttrail",   "yeastwhisper",  "lagerlane",
-    "barrelbound",   "foamfinder",  "taphunter",     "graingeist",
-    "brewscout",     "aleatlas",    "caskcompass",   "hopsandmaps",
-    "mashpilot",     "pintnomad",   "fermentfriend", "stoutsignal",
-    "sessionwander", "kettlekeeper"};
-
-const std::vector<std::string> MockGenerator::kBios = {
-    "Always chasing balanced IPAs and crisp lagers across local taprooms.",
-    "Weekend brewery explorer with a soft spot for dark, roasty stouts.",
-    "Documenting tiny brewpubs, fresh pours, and unforgettable beer gardens.",
-    "Fan of farmhouse ales, food pairings, and long tasting flights.",
-    "Collecting favorite pilsners one city at a time.",
-    "Hops-first drinker who still saves room for classic malt-forward styles.",
-    "Finding hidden tap lists and sharing the best seasonal releases.",
-    "Brewery road-tripper focused on local ingredients and clean fermentation.",
-    "Always comparing house lagers and ranking patio pint vibes.",
-    "Curious about yeast strains, barrel programs, and cellar experiments.",
-    "Believes every neighborhood deserves a great community taproom.",
-    "Looking for session beers that taste great from first sip to last.",
-    "Belgian ale enthusiast who never skips a new saison.",
-    "Hazy IPA critic with deep respect for a perfectly clear pilsner.",
-    "Visits breweries for the stories, stays for the flagship pours.",
-    "Craft beer fan mapping tasting notes and favorite brew routes.",
-    "Always ready to trade recommendations for underrated local breweries.",
-    "Keeping a running list of must-try collab releases and tap takeovers."};
--- a/pipeline/src/data_generation/mock/deterministic_hash.cpp
+++ b/pipeline/src/data_generation/mock/deterministic_hash.cpp
@@ -1,12 +0,0 @@
-#include <string>
-
-#include "data_generation/mock_generator.h"
-
-std::size_t MockGenerator::DeterministicHash(const std::string& a,
-                                             const std::string& b) {
-   std::size_t seed = std::hash<std::string>{}(a);
-   const std::size_t mixed = std::hash<std::string>{}(b);
-   seed ^= mixed + 0x9e3779b97f4a7c15ULL + (seed << 6) + (seed >> 2);
-   seed = (seed << 13) | (seed >> ((sizeof(std::size_t) * 8) - 13));
-   return seed;
-}
--- a/pipeline/src/data_generation/mock/generate_brewery.cpp
+++ b/pipeline/src/data_generation/mock/generate_brewery.cpp
@@ -1,21 +0,0 @@
-#include <functional>
-#include <string>
-
-#include "data_generation/mock_generator.h"
-
-BreweryResult MockGenerator::GenerateBrewery(
-    const std::string& city_name, const std::string& country_name,
-    const std::string& region_context) {
-   const std::string location_key =
-       country_name.empty() ? city_name : city_name + "," + country_name;
-   const std::size_t hash =
-       region_context.empty() ? std::hash<std::string>{}(location_key)
-                              : DeterministicHash(location_key, region_context);
-
-   BreweryResult result;
-   result.name = kBreweryAdjectives[hash % kBreweryAdjectives.size()] + " " +
-                 kBreweryNouns[(hash / 7) % kBreweryNouns.size()];
-   result.description =
-       kBreweryDescriptions[(hash / 13) % kBreweryDescriptions.size()];
-   return result;
-}
--- a/pipeline/src/data_generation/mock/generate_user.cpp
+++ b/pipeline/src/data_generation/mock/generate_user.cpp
@@ -1,13 +0,0 @@
-#include <functional>
-#include <string>
-
-#include "data_generation/mock_generator.h"
-
-UserResult MockGenerator::GenerateUser(const std::string& locale) {
-   const std::size_t hash = std::hash<std::string>{}(locale);
-
-   UserResult result;
-   result.username = kUsernames[hash % kUsernames.size()];
-   result.bio = kBios[(hash / 11) % kBios.size()];
-   return result;
-}
--- a/pipeline/src/data_generation/mock/load.cpp
+++ b/pipeline/src/data_generation/mock/load.cpp
@@ -1,9 +0,0 @@
-#include <spdlog/spdlog.h>
-
-#include <string>
-
-#include "data_generation/mock_generator.h"
-
-void MockGenerator::Load(const std::string& /*modelPath*/) {
-   spdlog::info("[MockGenerator] No model needed");
-}
--- a/pipeline/src/database.cpp
+++ b/pipeline/src/database.cpp
@@ -0,0 +1,229 @@
+#include "database.h"
+#include <iostream>
+#include <stdexcept>
+
+// Creates the countries, states and cities tables when they are missing.
+void SqliteDatabase::InitializeSchema() {
+  std::lock_guard<std::mutex> guard(dbMutex);
+  const char *ddl = R"(
+    CREATE TABLE IF NOT EXISTS countries (
+      id INTEGER PRIMARY KEY,
+      name TEXT NOT NULL,
+      iso2 TEXT,
+      iso3 TEXT
+    );
+
+    CREATE TABLE IF NOT EXISTS states (
+      id INTEGER PRIMARY KEY,
+      country_id INTEGER NOT NULL,
+      name TEXT NOT NULL,
+      iso2 TEXT,
+      FOREIGN KEY(country_id) REFERENCES countries(id)
+    );
+
+    CREATE TABLE IF NOT EXISTS cities (
+      id INTEGER PRIMARY KEY,
+      state_id INTEGER NOT NULL,
+      country_id INTEGER NOT NULL,
+      name TEXT NOT NULL,
+      latitude REAL,
+      longitude REAL,
+      FOREIGN KEY(state_id) REFERENCES states(id),
+      FOREIGN KEY(country_id) REFERENCES countries(id)
+    );
+  )";
+  char *rawError = nullptr;
+  if (sqlite3_exec(db, ddl, nullptr, nullptr, &rawError) == SQLITE_OK) {
+    return;
+  }
+  // Copy the message into a std::string before handing it to sqlite3_free.
+  const std::string detail = rawError ? std::string(rawError) : "Unknown error";
+  sqlite3_free(rawError);
+  throw std::runtime_error("Failed to create schema: " + detail);
+}
+
+// Closes the SQLite connection handle held by this object.
+SqliteDatabase::~SqliteDatabase() {
+  // sqlite3_close treats a null handle as a harmless no-op, so no guard needed.
+  sqlite3_close(db);
+}
+
+// Opens an in-memory database, reports success on stdout, then builds the schema.
+void SqliteDatabase::Initialize() {
+  if (sqlite3_open(":memory:", &db) != SQLITE_OK) {
+    throw std::runtime_error("Failed to create in-memory SQLite database");
+  }
+  std::cout << "✓ In-memory SQLite database created\n";
+  InitializeSchema();
+}
+
+// Inserts one country row via INSERT OR IGNORE (conflicting rows are skipped).
+// Throws std::runtime_error if the statement cannot be prepared or executed.
+void SqliteDatabase::InsertCountry(int id, const std::string &name,
+                                   const std::string &iso2,
+                                   const std::string &iso3) {
+  std::lock_guard<std::mutex> lock(dbMutex);
+
+  const char *query = R"(
+    INSERT OR IGNORE INTO countries (id, name, iso2, iso3)
+    VALUES (?, ?, ?, ?)
+  )";
+
+  sqlite3_stmt *stmt = nullptr;
+  if (sqlite3_prepare_v2(db, query, -1, &stmt, nullptr) != SQLITE_OK)
+    throw std::runtime_error("Failed to prepare country insert");
+
+  sqlite3_bind_int(stmt, 1, id);
+  sqlite3_bind_text(stmt, 2, name.c_str(), -1, SQLITE_STATIC);
+  sqlite3_bind_text(stmt, 3, iso2.c_str(), -1, SQLITE_STATIC);
+  sqlite3_bind_text(stmt, 4, iso3.c_str(), -1, SQLITE_STATIC);
+  // Finalize before inspecting the result so a failed step cannot leak stmt.
+  const int stepRc = sqlite3_step(stmt);
+  sqlite3_finalize(stmt);
+  if (stepRc != SQLITE_DONE) throw std::runtime_error("Failed to insert country");
+}
+
+// Inserts one state row via INSERT OR IGNORE (conflicting rows are skipped).
+// Throws std::runtime_error if the statement cannot be prepared or executed.
+void SqliteDatabase::InsertState(int id, int countryId, const std::string &name,
+                                 const std::string &iso2) {
+  std::lock_guard<std::mutex> lock(dbMutex);
+
+  const char *query = R"(
+    INSERT OR IGNORE INTO states (id, country_id, name, iso2)
+    VALUES (?, ?, ?, ?)
+  )";
+
+  sqlite3_stmt *stmt = nullptr;
+  if (sqlite3_prepare_v2(db, query, -1, &stmt, nullptr) != SQLITE_OK)
+    throw std::runtime_error("Failed to prepare state insert");
+
+  sqlite3_bind_int(stmt, 1, id);
+  sqlite3_bind_int(stmt, 2, countryId);
+  sqlite3_bind_text(stmt, 3, name.c_str(), -1, SQLITE_STATIC);
+  sqlite3_bind_text(stmt, 4, iso2.c_str(), -1, SQLITE_STATIC);
+  // Finalize before inspecting the result so a failed step cannot leak stmt.
+  const int stepRc = sqlite3_step(stmt);
+  sqlite3_finalize(stmt);
+  if (stepRc != SQLITE_DONE) throw std::runtime_error("Failed to insert state");
+}
+
+// Inserts one city row via INSERT OR IGNORE (conflicting rows are skipped).
+// Throws std::runtime_error if the statement cannot be prepared or executed.
+void SqliteDatabase::InsertCity(int id, int stateId, int countryId,
+                                const std::string &name, double latitude,
+                                double longitude) {
+  std::lock_guard<std::mutex> lock(dbMutex);
+
+  const char *query = R"(
+    INSERT OR IGNORE INTO cities (id, state_id, country_id, name, latitude, longitude)
+    VALUES (?, ?, ?, ?, ?, ?)
+  )";
+
+  sqlite3_stmt *stmt = nullptr;
+  if (sqlite3_prepare_v2(db, query, -1, &stmt, nullptr) != SQLITE_OK)
+    throw std::runtime_error("Failed to prepare city insert");
+
+  sqlite3_bind_int(stmt, 1, id);
+  sqlite3_bind_int(stmt, 2, stateId);
+  sqlite3_bind_int(stmt, 3, countryId);
+  sqlite3_bind_text(stmt, 4, name.c_str(), -1, SQLITE_STATIC);
+  sqlite3_bind_double(stmt, 5, latitude);
+  sqlite3_bind_double(stmt, 6, longitude);
+  // Finalize before inspecting the result so a failed step cannot leak stmt.
+  const int stepRc = sqlite3_step(stmt);
+  sqlite3_finalize(stmt);
+  if (stepRc != SQLITE_DONE) throw std::runtime_error("Failed to insert city");
+}
+
+// Returns every city as an (id, name) pair, ordered by name.
+// Throws std::runtime_error if the query cannot be prepared or fails mid-scan.
+std::vector<std::pair<int, std::string>> SqliteDatabase::QueryCities() {
+  std::lock_guard<std::mutex> lock(dbMutex);
+
+  const char *query = "SELECT id, name FROM cities ORDER BY name";
+  sqlite3_stmt *stmt = nullptr;
+  if (sqlite3_prepare_v2(db, query, -1, &stmt, nullptr) != SQLITE_OK)
+    throw std::runtime_error("Failed to prepare query");
+
+  std::vector<std::pair<int, std::string>> cities;
+  int rc = SQLITE_ROW;
+  while ((rc = sqlite3_step(stmt)) == SQLITE_ROW) {
+    const int id = sqlite3_column_int(stmt, 0);
+    const char *name =
+        reinterpret_cast<const char *>(sqlite3_column_text(stmt, 1));
+    cities.emplace_back(id, name ? std::string(name) : std::string());
+  }
+  sqlite3_finalize(stmt);
+  // Any terminal code other than SQLITE_DONE means the scan stopped on an error.
+  if (rc != SQLITE_DONE) throw std::runtime_error("Failed to read cities");
+  return cities;
+}
+
+// Returns countries ordered by name; a positive limit caps the row count,
+// while zero or a negative limit returns every row.
+// Throws std::runtime_error if the query cannot be prepared or fails mid-scan.
+std::vector<Country> SqliteDatabase::QueryCountries(int limit) {
+  std::lock_guard<std::mutex> lock(dbMutex);
+
+  std::string query =
+      "SELECT id, name, iso2, iso3 FROM countries ORDER BY name";
+  if (limit > 0) {
+    query += " LIMIT " + std::to_string(limit);
+  }
+
+  sqlite3_stmt *stmt = nullptr;
+  if (sqlite3_prepare_v2(db, query.c_str(), -1, &stmt, nullptr) != SQLITE_OK)
+    throw std::runtime_error("Failed to prepare countries query");
+
+  std::vector<Country> countries;
+  // Copies a text column of the current row into a std::string ("" for NULL).
+  auto text = [stmt](int col) {
+    const unsigned char *raw = sqlite3_column_text(stmt, col);
+    return raw ? std::string(reinterpret_cast<const char *>(raw)) : std::string();
+  };
+
+  int rc = SQLITE_ROW;
+  while ((rc = sqlite3_step(stmt)) == SQLITE_ROW) {
+    const int id = sqlite3_column_int(stmt, 0);
+    countries.push_back({id, text(1), text(2), text(3)});
+  }
+  sqlite3_finalize(stmt);
+  // Any terminal code other than SQLITE_DONE means the scan stopped on an error.
+  if (rc != SQLITE_DONE)
+    throw std::runtime_error("Failed to read countries");
+  return countries;
+}
+
+// Returns states ordered by name; a positive limit caps the row count.
+// Throws std::runtime_error if the query cannot be prepared or fails mid-scan.
+std::vector<State> SqliteDatabase::QueryStates(int limit) {
+  std::lock_guard<std::mutex> lock(dbMutex);
+
+  std::string query =
+      "SELECT id, name, iso2, country_id FROM states ORDER BY name";
+  if (limit > 0) {
+    query += " LIMIT " + std::to_string(limit);
+  }
+
+  sqlite3_stmt *stmt = nullptr;
+  if (sqlite3_prepare_v2(db, query.c_str(), -1, &stmt, nullptr) != SQLITE_OK)
+    throw std::runtime_error("Failed to prepare states query");
+
+  std::vector<State> states;
+  // Copies a text column of the current row into a std::string ("" for NULL).
+  auto text = [stmt](int col) {
+    const unsigned char *raw = sqlite3_column_text(stmt, col);
+    return raw ? std::string(reinterpret_cast<const char *>(raw)) : std::string();
+  };
+  int rc = SQLITE_ROW;
+  while ((rc = sqlite3_step(stmt)) == SQLITE_ROW) {
+    const int id = sqlite3_column_int(stmt, 0);
+    const int countryId = sqlite3_column_int(stmt, 3);
+    states.push_back({id, text(1), text(2), countryId});
+  }
+  sqlite3_finalize(stmt);
+  // Any terminal code other than SQLITE_DONE means the scan stopped on an error.
+  if (rc != SQLITE_DONE) throw std::runtime_error("Failed to read states");
+  return states;
+}
--- a/pipeline/src/database/database.cpp
+++ b/pipeline/src/database/database.cpp
@@ -1,264 +0,0 @@
-#include "database/database.h"
-
-#include <spdlog/spdlog.h>
-
-#include <stdexcept>
-
-void SqliteDatabase::InitializeSchema() {
-   std::lock_guard<std::mutex> lock(db_mutex_);
-
-   const char* schema = R"(
-    CREATE TABLE IF NOT EXISTS countries (
-      id INTEGER PRIMARY KEY,
-      name TEXT NOT NULL,
-      iso2 TEXT,
-      iso3 TEXT
-    );
-
-    CREATE TABLE IF NOT EXISTS states (
-      id INTEGER PRIMARY KEY,
-      country_id INTEGER NOT NULL,
-      name TEXT NOT NULL,
-      iso2 TEXT,
-      FOREIGN KEY(country_id) REFERENCES countries(id)
-    );
-
-    CREATE TABLE IF NOT EXISTS cities (
-      id INTEGER PRIMARY KEY,
-      state_id INTEGER NOT NULL,
-      country_id INTEGER NOT NULL,
-      name TEXT NOT NULL,
-      latitude REAL,
-      longitude REAL,
-      FOREIGN KEY(state_id) REFERENCES states(id),
-      FOREIGN KEY(country_id) REFERENCES countries(id)
-    );
-  )";
-
-   char* errMsg = nullptr;
-   int rc = sqlite3_exec(db_, schema, nullptr, nullptr, &errMsg);
-   if (rc != SQLITE_OK) {
-      std::string error = errMsg ? std::string(errMsg) : "Unknown error";
-      sqlite3_free(errMsg);
-      throw std::runtime_error("Failed to create schema: " + error);
-   }
-}
-
-SqliteDatabase::~SqliteDatabase() {
-   if (db_) {
-      sqlite3_close(db_);
-   }
-}
-
-void SqliteDatabase::Initialize(const std::string& db_path) {
-   int rc = sqlite3_open(db_path.c_str(), &db_);
-   if (rc) {
-      throw std::runtime_error("Failed to open SQLite database: " + db_path);
-   }
-   spdlog::info("OK: SQLite database opened: {}", db_path);
-   InitializeSchema();
-}
-
-void SqliteDatabase::BeginTransaction() {
-   std::lock_guard<std::mutex> lock(db_mutex_);
-   char* err = nullptr;
-   if (sqlite3_exec(db_, "BEGIN TRANSACTION", nullptr, nullptr, &err) !=
-       SQLITE_OK) {
-      std::string msg = err ? err : "unknown";
-      sqlite3_free(err);
-      throw std::runtime_error("BeginTransaction failed: " + msg);
-   }
-}
-
-void SqliteDatabase::CommitTransaction() {
-   std::lock_guard<std::mutex> lock(db_mutex_);
-   char* err = nullptr;
-   if (sqlite3_exec(db_, "COMMIT", nullptr, nullptr, &err) != SQLITE_OK) {
-      std::string msg = err ? err : "unknown";
-      sqlite3_free(err);
-      throw std::runtime_error("CommitTransaction failed: " + msg);
-   }
-}
-
-void SqliteDatabase::RollbackTransaction() {
-   std::lock_guard<std::mutex> lock(db_mutex_);
-   char* err = nullptr;
-   if (sqlite3_exec(db_, "ROLLBACK", nullptr, nullptr, &err) != SQLITE_OK) {
-      std::string msg = err ? err : "unknown";
-      sqlite3_free(err);
-      throw std::runtime_error("RollbackTransaction failed: " + msg);
-   }
-}
-
-void SqliteDatabase::InsertCountry(int id, const std::string& name,
-                                   const std::string& iso2,
-                                   const std::string& iso3) {
-   std::lock_guard<std::mutex> lock(db_mutex_);
-
-   const char* query = R"(
-    INSERT OR IGNORE INTO countries (id, name, iso2, iso3)
-    VALUES (?, ?, ?, ?)
-  )";
-
-   sqlite3_stmt* stmt;
-   int rc = sqlite3_prepare_v2(db_, query, -1, &stmt, nullptr);
-   if (rc != SQLITE_OK)
-      throw std::runtime_error("Failed to prepare country insert");
-
-   sqlite3_bind_int(stmt, 1, id);
-   sqlite3_bind_text(stmt, 2, name.c_str(), -1, SQLITE_TRANSIENT);
-   sqlite3_bind_text(stmt, 3, iso2.c_str(), -1, SQLITE_TRANSIENT);
-   sqlite3_bind_text(stmt, 4, iso3.c_str(), -1, SQLITE_TRANSIENT);
-
-   if (sqlite3_step(stmt) != SQLITE_DONE) {
-      throw std::runtime_error("Failed to insert country");
-   }
-   sqlite3_finalize(stmt);
-}
-
-void SqliteDatabase::InsertState(int id, int country_id,
-                                 const std::string& name,
-                                 const std::string& iso2) {
-   std::lock_guard<std::mutex> lock(db_mutex_);
-
-   const char* query = R"(
-    INSERT OR IGNORE INTO states (id, country_id, name, iso2)
-    VALUES (?, ?, ?, ?)
-  )";
-
-   sqlite3_stmt* stmt;
-   int rc = sqlite3_prepare_v2(db_, query, -1, &stmt, nullptr);
-   if (rc != SQLITE_OK)
-      throw std::runtime_error("Failed to prepare state insert");
-
-   sqlite3_bind_int(stmt, 1, id);
-   sqlite3_bind_int(stmt, 2, country_id);
-   sqlite3_bind_text(stmt, 3, name.c_str(), -1, SQLITE_TRANSIENT);
-   sqlite3_bind_text(stmt, 4, iso2.c_str(), -1, SQLITE_TRANSIENT);
-
-   if (sqlite3_step(stmt) != SQLITE_DONE) {
-      throw std::runtime_error("Failed to insert state");
-   }
-   sqlite3_finalize(stmt);
-}
-
-void SqliteDatabase::InsertCity(int id, int state_id, int country_id,
-                                const std::string& name, double latitude,
-                                double longitude) {
-   std::lock_guard<std::mutex> lock(db_mutex_);
-
-   const char* query = R"(
-    INSERT OR IGNORE INTO cities (id, state_id, country_id, name, latitude, longitude)
-    VALUES (?, ?, ?, ?, ?, ?)
-  )";
-
-   sqlite3_stmt* stmt;
-   int rc = sqlite3_prepare_v2(db_, query, -1, &stmt, nullptr);
-   if (rc != SQLITE_OK)
-      throw std::runtime_error("Failed to prepare city insert");
-
-   sqlite3_bind_int(stmt, 1, id);
-   sqlite3_bind_int(stmt, 2, state_id);
-   sqlite3_bind_int(stmt, 3, country_id);
-   sqlite3_bind_text(stmt, 4, name.c_str(), -1, SQLITE_TRANSIENT);
-   sqlite3_bind_double(stmt, 5, latitude);
-   sqlite3_bind_double(stmt, 6, longitude);
-
-   if (sqlite3_step(stmt) != SQLITE_DONE) {
-      throw std::runtime_error("Failed to insert city");
-   }
-   sqlite3_finalize(stmt);
-}
-
-std::vector<City> SqliteDatabase::QueryCities() {
-   std::lock_guard<std::mutex> lock(db_mutex_);
-   std::vector<City> cities;
-   sqlite3_stmt* stmt = nullptr;
-
-   const char* query =
-       "SELECT id, name, country_id FROM cities ORDER BY RANDOM()";
-   int rc = sqlite3_prepare_v2(db_, query, -1, &stmt, nullptr);
-
-   if (rc != SQLITE_OK) {
-      throw std::runtime_error("Failed to prepare query");
-   }
-
-   while (sqlite3_step(stmt) == SQLITE_ROW) {
-      int id = sqlite3_column_int(stmt, 0);
-      const char* name =
-          reinterpret_cast<const char*>(sqlite3_column_text(stmt, 1));
-      int country_id = sqlite3_column_int(stmt, 2);
-      cities.push_back({id, name ? std::string(name) : "", country_id});
-   }
-
-   sqlite3_finalize(stmt);
-   return cities;
-}
-
-std::vector<Country> SqliteDatabase::QueryCountries(int limit) {
-   std::lock_guard<std::mutex> lock(db_mutex_);
-
-   std::vector<Country> countries;
-   sqlite3_stmt* stmt = nullptr;
-
-   std::string query =
-       "SELECT id, name, iso2, iso3 FROM countries ORDER BY name";
-   if (limit > 0) {
-      query += " LIMIT " + std::to_string(limit);
-   }
-
-   int rc = sqlite3_prepare_v2(db_, query.c_str(), -1, &stmt, nullptr);
-
-   if (rc != SQLITE_OK) {
-      throw std::runtime_error("Failed to prepare countries query");
-   }
-
-   while (sqlite3_step(stmt) == SQLITE_ROW) {
-      int id = sqlite3_column_int(stmt, 0);
-      const char* name =
-          reinterpret_cast<const char*>(sqlite3_column_text(stmt, 1));
-      const char* iso2 =
-          reinterpret_cast<const char*>(sqlite3_column_text(stmt, 2));
-      const char* iso3 =
-          reinterpret_cast<const char*>(sqlite3_column_text(stmt, 3));
-      countries.push_back({id, name ? std::string(name) : "",
-                           iso2 ? std::string(iso2) : "",
-                           iso3 ? std::string(iso3) : ""});
-   }
-
-   sqlite3_finalize(stmt);
-   return countries;
-}
-
-std::vector<State> SqliteDatabase::QueryStates(int limit) {
-   std::lock_guard<std::mutex> lock(db_mutex_);
-
-   std::vector<State> states;
-   sqlite3_stmt* stmt = nullptr;
-
-   std::string query =
-       "SELECT id, name, iso2, country_id FROM states ORDER BY name";
-   if (limit > 0) {
-      query += " LIMIT " + std::to_string(limit);
-   }
-
-   int rc = sqlite3_prepare_v2(db_, query.c_str(), -1, &stmt, nullptr);
-
-   if (rc != SQLITE_OK) {
-      throw std::runtime_error("Failed to prepare states query");
-   }
-
-   while (sqlite3_step(stmt) == SQLITE_ROW) {
-      int id = sqlite3_column_int(stmt, 0);
-      const char* name =
-          reinterpret_cast<const char*>(sqlite3_column_text(stmt, 1));
-      const char* iso2 =
-          reinterpret_cast<const char*>(sqlite3_column_text(stmt, 2));
-      int country_id = sqlite3_column_int(stmt, 3);
-      states.push_back({id, name ? std::string(name) : "",
-                        iso2 ? std::string(iso2) : "", country_id});
-   }
-
-   sqlite3_finalize(stmt);
-   return states;
-}
--- a/pipeline/src/generator.cpp
+++ b/pipeline/src/generator.cpp
@@ -0,0 +1,81 @@
+#include "generator.h"
+#include <functional>
+#include <iostream>
+
+/**
+ * @brief Mock stand-in for loading the language model used by the generator
+ *
+ * What it does today (Mock):
+ * - Prints two informational lines to stdout, including the given path
+ * - Never opens or reads the file at modelPath
+ * - Keeps the call signature in place for a later real implementation
+ *
+ * Planned implementation:
+ * - Load a GGUF-format LLM model file using llama.cpp
+ * - Initialize a CPU/GPU inference context
+ * - Keep the model weights cached for repeated brewery generation
+ *
+ * @param modelPath Path to GGUF model file (e.g., "models/llama-7b.gguf")
+ *
+ * Output for the example path above:
+ * @code
+ *   [Mock] Initialized llama model: models/llama-7b.gguf
+ *     ✓ Model ready
+ * @endcode
+ */
+void LlamaBreweryGenerator::LoadModel(const std::string &modelPath) {
+  std::cout << "  [Mock] Initialized llama model: " << modelPath << "\n"
+            << "    ✓ Model ready\n";
+}
+
+/**
+ * @brief Picks a mock brewery name and description by hashing city + seed
+ *
+ * Algorithm:
+ * 1. Appends the decimal seed to the city name to form the hash input
+ * 2. Hashes that string with std::hash<std::string>
+ * 3. Maps the hash onto the template containers with modulo arithmetic:
+ *    - name: breweryAdjectives[h % size] + " " + breweryNouns[(h / 7) % size]
+ *    - description: descriptions[(h / 13) % size]
+ * 4. Returns a Brewery struct holding the selected name and description
+ *
+ * Determinism:
+ * - The same cityName + seed gives the same result within one program run
+ * - std::hash values are not required to match across runs or builds, so
+ *   do not persist or compare results between different executions
+ * - Different seeds allow a single city to map to more than one brewery
+ *
+ * Example:
+ * @code
+ *   auto gen = LlamaBreweryGenerator();
+ *   auto brewery = gen.GenerateBrewery("Toronto", 1);
+ *   // Repeating the call in the same run yields the same brewery
+ *   assert(gen.GenerateBrewery("Toronto", 1).name == brewery.name);
+ * @endcode
+ *
+ * @param cityName The city to generate a brewery for
+ * @param seed Integer mixed into the hash input to vary the result
+ * @return Brewery struct containing:
+ *   - name: "<adjective> <noun>" chosen from the template containers
+ *   - description: one entry of descriptions
+ *
+ * @note Future: Replace hashing with actual LLM inference
+ *       Interface will remain identical for smooth migration
+ */
+LlamaBreweryGenerator::Brewery
+LlamaBreweryGenerator::GenerateBrewery(const std::string &cityName, int seed) {
+  // Mock generation: the seed is appended to the city name before hashing so
+  // the same city can be asked for more than one brewery
+  const std::string hashInput = cityName + std::to_string(seed);
+  const size_t digest = std::hash<std::string>{}(hashInput);
+
+  // Dividing by 7 and 13 first gives the noun and description their own index
+  const size_t adjectiveIdx = digest % breweryAdjectives.size();
+  const size_t nounIdx = (digest / 7) % breweryNouns.size();
+  const size_t descriptionIdx = (digest / 13) % descriptions.size();
+
+  Brewery picked;
+  picked.name = breweryAdjectives[adjectiveIdx] + " " + breweryNouns[nounIdx];
+  picked.description = descriptions[descriptionIdx];
+  return picked;
+}
--- a/pipeline/src/json_handling/json_loader.cpp
+++ b/pipeline/src/json_handling/json_loader.cpp
@@ -1,67 +0,0 @@
-#include "json_handling/json_loader.h"
-
-#include <spdlog/spdlog.h>
-
-#include <chrono>
-
-#include "json_handling/stream_parser.h"
-
-void JsonLoader::LoadWorldCities(const std::string& json_path,
-                                 SqliteDatabase& db) {
-   constexpr size_t kBatchSize = 10000;
-
-   auto startTime = std::chrono::high_resolution_clock::now();
-   spdlog::info("\nLoading {} (streaming Boost.JSON SAX)...", json_path);
-
-   db.BeginTransaction();
-   bool transactionOpen = true;
-
-   size_t citiesProcessed = 0;
-   try {
-      StreamingJsonParser::Parse(
-          json_path, db,
-          [&](const CityRecord& record) {
-             db.InsertCity(record.id, record.state_id, record.country_id,
-                           record.name, record.latitude, record.longitude);
-             ++citiesProcessed;
-
-             if (citiesProcessed % kBatchSize == 0) {
-                db.CommitTransaction();
-                db.BeginTransaction();
-             }
-          },
-          [&](size_t current, size_t /*total*/) {
-             if (current % kBatchSize == 0 && current > 0) {
-                spdlog::info("  [Progress] Parsed {} cities...", current);
-             }
-          });
-
-      spdlog::info("  OK: Parsed all cities from JSON");
-
-      if (transactionOpen) {
-         db.CommitTransaction();
-         transactionOpen = false;
-      }
-   } catch (...) {
-      if (transactionOpen) {
-         db.RollbackTransaction();
-         transactionOpen = false;
-      }
-      throw;
-   }
-
-   auto endTime = std::chrono::high_resolution_clock::now();
-   auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(
-       endTime - startTime);
-
-   spdlog::info("\n=== World City Data Loading Summary ===\n");
-   spdlog::info("Cities inserted: {}", citiesProcessed);
-   spdlog::info("Elapsed time:    {} ms", duration.count());
-   long long throughput =
-       (citiesProcessed > 0 && duration.count() > 0)
-           ? (1000LL * static_cast<long long>(citiesProcessed)) /
-                 static_cast<long long>(duration.count())
-           : 0LL;
-   spdlog::info("Throughput:      {} cities/sec", throughput);
-   spdlog::info("=======================================\n");
-}
--- a/pipeline/src/json_handling/stream_parser.cpp
+++ b/pipeline/src/json_handling/stream_parser.cpp
@@ -1,289 +0,0 @@
-#include "json_handling/stream_parser.h"
-
-#include <spdlog/spdlog.h>
-
-#include <boost/json.hpp>
-#include <boost/json/basic_parser_impl.hpp>
-#include <cstdio>
-#include <stdexcept>
-
-#include "database/database.h"
-
-class CityRecordHandler {
-   friend class boost::json::basic_parser<CityRecordHandler>;
-
-  public:
-   static constexpr std::size_t max_array_size = static_cast<std::size_t>(-1);
-   static constexpr std::size_t max_object_size = static_cast<std::size_t>(-1);
-   static constexpr std::size_t max_string_size = static_cast<std::size_t>(-1);
-   static constexpr std::size_t max_key_size = static_cast<std::size_t>(-1);
-
-   struct ParseContext {
-      SqliteDatabase* db = nullptr;
-      std::function<void(const CityRecord&)> on_city;
-      std::function<void(size_t, size_t)> on_progress;
-      size_t cities_emitted = 0;
-      size_t total_file_size = 0;
-      int countries_inserted = 0;
-      int states_inserted = 0;
-   };
-
-   explicit CityRecordHandler(ParseContext& ctx) : context(ctx) {}
-
-  private:
-   ParseContext& context;
-
-   int depth = 0;
-   bool in_countries_array = false;
-   bool in_country_object = false;
-   bool in_states_array = false;
-   bool in_state_object = false;
-   bool in_cities_array = false;
-   bool building_city = false;
-
-   int current_country_id = 0;
-   int current_state_id = 0;
-   CityRecord current_city = {};
-   std::string current_key;
-   std::string current_key_val;
-   std::string current_string_val;
-
-   std::string country_info[3];
-   std::string state_info[2];
-
-   // Boost.JSON SAX Hooks
-   bool on_document_begin(boost::system::error_code&) { return true; }
-   bool on_document_end(boost::system::error_code&) { return true; }
-
-   bool on_array_begin(boost::system::error_code&) {
-      depth++;
-      if (depth == 1) {
-         in_countries_array = true;
-      } else if (depth == 3 && current_key == "states") {
-         in_states_array = true;
-      } else if (depth == 5 && current_key == "cities") {
-         in_cities_array = true;
-      }
-      return true;
-   }
-
-   bool on_array_end(std::size_t, boost::system::error_code&) {
-      if (depth == 1) {
-         in_countries_array = false;
-      } else if (depth == 3) {
-         in_states_array = false;
-      } else if (depth == 5) {
-         in_cities_array = false;
-      }
-      depth--;
-      return true;
-   }
-
-   bool on_object_begin(boost::system::error_code&) {
-      depth++;
-      if (depth == 2 && in_countries_array) {
-         in_country_object = true;
-         current_country_id = 0;
-         country_info[0].clear();
-         country_info[1].clear();
-         country_info[2].clear();
-      } else if (depth == 4 && in_states_array) {
-         in_state_object = true;
-         current_state_id = 0;
-         state_info[0].clear();
-         state_info[1].clear();
-      } else if (depth == 6 && in_cities_array) {
-         building_city = true;
-         current_city = {};
-      }
-      return true;
-   }
-
-   bool on_object_end(std::size_t, boost::system::error_code&) {
-      if (depth == 6 && building_city) {
-         if (current_city.id > 0 && current_state_id > 0 &&
-             current_country_id > 0) {
-            current_city.state_id = current_state_id;
-            current_city.country_id = current_country_id;
-
-            try {
-               context.on_city(current_city);
-               context.cities_emitted++;
-
-               if (context.on_progress && context.cities_emitted % 10000 == 0) {
-                  context.on_progress(context.cities_emitted,
-                                      context.total_file_size);
-               }
-            } catch (const std::exception& e) {
-               spdlog::warn("Record parsing failed: {}", e.what());
-            }
-         }
-         building_city = false;
-      } else if (depth == 4 && in_state_object) {
-         if (current_state_id > 0 && current_country_id > 0) {
-            try {
-               context.db->InsertState(current_state_id, current_country_id,
-                                       state_info[0], state_info[1]);
-               context.states_inserted++;
-            } catch (const std::exception& e) {
-               spdlog::warn("Record parsing failed: {}", e.what());
-            }
-         }
-         in_state_object = false;
-      } else if (depth == 2 && in_country_object) {
-         if (current_country_id > 0) {
-            try {
-               context.db->InsertCountry(current_country_id, country_info[0],
-                                         country_info[1], country_info[2]);
-               context.countries_inserted++;
-            } catch (const std::exception& e) {
-               spdlog::warn("Record parsing failed: {}", e.what());
-            }
-         }
-         in_country_object = false;
-      }
-
-      depth--;
-      return true;
-   }
-
-   bool on_key_part(boost::json::string_view s, std::size_t,
-                    boost::system::error_code&) {
-      current_key_val.append(s.data(), s.size());
-      return true;
-   }
-
-   bool on_key(boost::json::string_view s, std::size_t,
-               boost::system::error_code&) {
-      current_key_val.append(s.data(), s.size());
-      current_key = current_key_val;
-      current_key_val.clear();
-      return true;
-   }
-
-   bool on_string_part(boost::json::string_view s, std::size_t,
-                       boost::system::error_code&) {
-      current_string_val.append(s.data(), s.size());
-      return true;
-   }
-
-   bool on_string(boost::json::string_view s, std::size_t,
-                  boost::system::error_code&) {
-      current_string_val.append(s.data(), s.size());
-
-      if (building_city && current_key == "name") {
-         current_city.name = current_string_val;
-      } else if (in_state_object && current_key == "name") {
-         state_info[0] = current_string_val;
-      } else if (in_state_object && current_key == "iso2") {
-         state_info[1] = current_string_val;
-      } else if (in_country_object && current_key == "name") {
-         country_info[0] = current_string_val;
-      } else if (in_country_object && current_key == "iso2") {
-         country_info[1] = current_string_val;
-      } else if (in_country_object && current_key == "iso3") {
-         country_info[2] = current_string_val;
-      }
-
-      current_string_val.clear();
-      return true;
-   }
-
-   bool on_number_part(boost::json::string_view, boost::system::error_code&) {
-      return true;
-   }
-
-   bool on_int64(int64_t i, boost::json::string_view,
-                 boost::system::error_code&) {
-      if (building_city && current_key == "id") {
-         current_city.id = static_cast<int>(i);
-      } else if (in_state_object && current_key == "id") {
-         current_state_id = static_cast<int>(i);
-      } else if (in_country_object && current_key == "id") {
-         current_country_id = static_cast<int>(i);
-      }
-      return true;
-   }
-
-   bool on_uint64(uint64_t u, boost::json::string_view,
-                  boost::system::error_code& ec) {
-      return on_int64(static_cast<int64_t>(u), "", ec);
-   }
-
-   bool on_double(double d, boost::json::string_view,
-                  boost::system::error_code&) {
-      if (building_city) {
-         if (current_key == "latitude") {
-            current_city.latitude = d;
-         } else if (current_key == "longitude") {
-            current_city.longitude = d;
-         }
-      }
-      return true;
-   }
-
-   bool on_bool(bool, boost::system::error_code&) { return true; }
-   bool on_null(boost::system::error_code&) { return true; }
-   bool on_comment_part(boost::json::string_view, boost::system::error_code&) {
-      return true;
-   }
-   bool on_comment(boost::json::string_view, boost::system::error_code&) {
-      return true;
-   }
-};
-
-void StreamingJsonParser::Parse(
-    const std::string& file_path, SqliteDatabase& db,
-    std::function<void(const CityRecord&)> on_city,
-    std::function<void(size_t, size_t)> on_progress) {
-   spdlog::info("  Streaming parse of {} (Boost.JSON)...", file_path);
-
-   FILE* file = std::fopen(file_path.c_str(), "rb");
-   if (!file) {
-      throw std::runtime_error("Failed to open JSON file: " + file_path);
-   }
-
-   size_t total_size = 0;
-   if (std::fseek(file, 0, SEEK_END) == 0) {
-      long file_size = std::ftell(file);
-      if (file_size > 0) {
-         total_size = static_cast<size_t>(file_size);
-      }
-      std::rewind(file);
-   }
-
-   CityRecordHandler::ParseContext ctx{&db, on_city, on_progress, 0, total_size,
-                                       0,   0};
-   boost::json::basic_parser<CityRecordHandler> parser(
-       boost::json::parse_options{}, ctx);
-
-   char buf[65536];
-   size_t bytes_read;
-   boost::system::error_code ec;
-
-   while ((bytes_read = std::fread(buf, 1, sizeof(buf), file)) > 0) {
-      char const* p = buf;
-      std::size_t remain = bytes_read;
-
-      while (remain > 0) {
-         std::size_t consumed = parser.write_some(true, p, remain, ec);
-         if (ec) {
-            std::fclose(file);
-            throw std::runtime_error("JSON parse error: " + ec.message());
-         }
-         p += consumed;
-         remain -= consumed;
-      }
-   }
-
-   parser.write_some(false, nullptr, 0, ec);  // Signal EOF
-   std::fclose(file);
-
-   if (ec) {
-      throw std::runtime_error("JSON parse error at EOF: " + ec.message());
-   }
-
-   spdlog::info("    OK: Parsed {} countries, {} states, {} cities",
-                ctx.countries_inserted, ctx.states_inserted,
-                ctx.cities_emitted);
-}
--- a/pipeline/src/json_loader.cpp
+++ b/pipeline/src/json_loader.cpp
@@ -0,0 +1,222 @@
+#include "json_loader.h"
+#include <fstream>
+#include <iostream>
+#include <mutex>
+#include <thread>
+#include <vector>
+
+/**
+ * @brief Loads world geographic data from JSON file into SQLite database
+ *
+ * INPUT JSON STRUCTURE (as read by this function):
+ * - Root:    array of country objects
+ * - Country: { id, name, iso2?, iso3?, states?: [state, ...] }
+ * - State:   { id, name, iso2?, cities?: [city, ...] }
+ * - City:    { id, name, latitude?, longitude? } with the coordinates
+ *            stored as strings; a missing coordinate is inserted as 0.0
+ *
+ * THREADING DETAILS:
+ * - The file is parsed once, on the calling thread
+ * - Countries are split into contiguous chunks, one worker thread per chunk
+ *   (std::thread::hardware_concurrency() workers, or 1 if that reports 0)
+ * - Each country starts one thread per state (nested parallelism) and joins
+ *   them before the next country is handled
+ * - Cities of a state are inserted sequentially on that state's thread
+ * - The counters and the recorded error are guarded by a local mutex
+ * - NOTE(review): db.Insert*() is called from several threads at once, so
+ *   this presumes SqliteDatabase serializes its own writes - confirm
+ *
+ * ERROR HANDLING:
+ * - Unopenable file, unparsable JSON, or a root that is not a non-empty
+ *   array: throws std::runtime_error
+ * - A city that cannot be parsed or inserted is skipped; the number of
+ *   skipped cities is printed to stderr
+ * - Any other worker failure (bad country/state record, failed insert) is
+ *   recorded and, once all threads are joined, the first recorded one is
+ *   rethrown as std::runtime_error
+ *
+ * @param jsonPath Path to JSON file (typically: ../data/world_city_data.json)
+ * @param db Reference to initialized SqliteDatabase to populate
+ */
+void JsonLoader::LoadWorldCities(const std::string &jsonPath,
+                                 SqliteDatabase &db) {
+  std::cout << "\nLoading " << jsonPath << " (45 MB)...\n";
+
+  std::ifstream jsonFile(jsonPath);
+  if (!jsonFile.is_open()) {
+    throw std::runtime_error("Failed to open JSON file: " + jsonPath);
+  }
+
+  // Parse entire JSON into memory (nlohmann/json library)
+  json data;
+  try {
+    jsonFile >> data;
+  } catch (const std::exception &e) {
+    throw std::runtime_error("JSON parse error: " + std::string(e.what()));
+  }
+  jsonFile.close();
+
+  // DEBUG: describe the root when it is not the expected array
+  if (!data.is_array()) {
+    std::cerr << "[DEBUG] JSON root is not an array. Type: " << data.type_name()
+              << std::endl;
+    if (data.is_object()) {
+      std::cerr << "[DEBUG] JSON root is object with keys: ";
+      for (auto &[key, val] : data.items()) {
+        std::cerr << key << " ";
+      }
+      std::cerr << std::endl;
+    }
+  }
+
+  std::cout << "✓ Loaded " << data.size()
+            << " records (expecting countries array)\n";
+
+  // The root must be a non-empty array: the workers index it by position
+  if (!data.is_array() || data.size() == 0) {
+    throw std::runtime_error("JSON file appears to be empty or malformed. "
+                             "Check download succeeded.");
+  }
+  const json &countries = data; // read-only view shared by the workers
+
+  std::cout << "Processing countries with multithreading...\n";
+
+  // hardware_concurrency() may return 0; never divide by a zero thread count
+  unsigned int numThreads = std::thread::hardware_concurrency();
+  if (numThreads == 0) {
+    numThreads = 1;
+  }
+  std::cout << "  Using " << numThreads << " threads\n\n";
+
+  // State shared by all workers (protected by statsMutex)
+  int processedCities = 0;
+  int skippedCities = 0;
+  bool failed = false;
+  std::string firstError;
+  std::mutex statsMutex;
+
+  // An exception escaping a std::thread calls std::terminate, so workers
+  // record the first failure here and it is rethrown after the joins
+  auto recordFailure = [&](const std::string &what) {
+    std::lock_guard<std::mutex> lock(statsMutex);
+    if (!failed) {
+      failed = true;
+      firstError = what;
+    }
+  };
+
+  auto joinAll = [](std::vector<std::thread> &threads) {
+    for (auto &worker : threads) {
+      if (worker.joinable())
+        worker.join();
+    }
+  };
+
+  // Inserts one state, then its cities sequentially; returns the number of
+  // cities inserted and adds the skipped ones to `skipped`
+  auto loadState = [&db](const json &state, int countryId, int &skipped) {
+    const int stateId = state.at("id").get<int>();
+    const std::string stateName = state.at("name").get<std::string>();
+    const std::string stateIso2 = state.value("iso2", "");
+    db.InsertState(stateId, countryId, stateName, stateIso2);
+
+    int inserted = 0;
+    if (!state.contains("cities") || !state["cities"].is_array()) {
+      return inserted;
+    }
+    for (const auto &city : state["cities"]) {
+      try {
+        const int cityId = city.at("id").get<int>();
+        const std::string cityName = city.at("name").get<std::string>();
+        // Coordinates are stored as strings in the JSON; absent ones stay 0.0
+        double lat = 0.0;
+        double lng = 0.0;
+        if (city.contains("latitude")) {
+          lat = std::stod(city["latitude"].get<std::string>());
+        }
+        if (city.contains("longitude")) {
+          lng = std::stod(city["longitude"].get<std::string>());
+        }
+        db.InsertCity(cityId, stateId, countryId, cityName, lat, lng);
+        ++inserted;
+      } catch (const std::exception &) {
+        ++skipped; // best effort: an unusable city record is skipped
+      }
+    }
+    return inserted;
+  };
+
+  // Inserts one country, then loads its states on one thread per state
+  auto loadCountry = [&](const json &country) {
+    const int countryId = country.at("id").get<int>();
+    const std::string countryName = country.at("name").get<std::string>();
+    const std::string iso2 = country.value("iso2", "");
+    const std::string iso3 = country.value("iso3", "");
+    db.InsertCountry(countryId, countryName, iso2, iso3);
+
+    if (!country.contains("states") || !country["states"].is_array()) {
+      return;
+    }
+    const auto &states = country["states"];
+
+    std::vector<std::thread> stateThreads;
+    try {
+      for (size_t s = 0; s < states.size(); ++s) {
+        stateThreads.emplace_back([&, s, countryId]() {
+          try {
+            int skipped = 0;
+            const int inserted = loadState(states[s], countryId, skipped);
+            std::lock_guard<std::mutex> lock(statsMutex);
+            processedCities += inserted;
+            skippedCities += skipped;
+          } catch (const std::exception &e) {
+            recordFailure(e.what());
+          } catch (...) {
+            recordFailure("unknown error");
+          }
+        });
+      }
+    } catch (...) {
+      joinAll(stateThreads); // never unwind past a joinable thread
+      throw;
+    }
+    // Do not move on to the next country until its states are done
+    joinAll(stateThreads);
+  };
+
+  // Split the countries into contiguous chunks, one chunk per worker
+  const size_t countriesPerThread =
+      (countries.size() + numThreads - 1) / numThreads;
+  std::vector<std::thread> countryThreads;
+  try {
+    for (size_t t = 0; t < numThreads; ++t) {
+      countryThreads.emplace_back([&, t]() {
+        const size_t start = t * countriesPerThread;
+        const size_t end =
+            std::min((t + 1) * countriesPerThread, countries.size());
+        try {
+          for (size_t i = start; i < end; ++i) {
+            loadCountry(countries[i]);
+          }
+        } catch (const std::exception &e) {
+          recordFailure(e.what());
+        } catch (...) {
+          recordFailure("unknown error");
+        }
+      });
+    }
+  } catch (...) {
+    joinAll(countryThreads);
+    throw;
+  }
+  joinAll(countryThreads);
+
+  if (failed) {
+    throw std::runtime_error("Failed to load world cities: " + firstError);
+  }
+  if (skippedCities > 0) {
+    std::cerr << "⚠ Skipped " << skippedCities
+              << " city records that could not be parsed or inserted\n";
+  }
+  std::cout << "✓ Loaded " << processedCities << " cities into database\n\n";
+}
--- a/pipeline/src/main.cpp
+++ b/pipeline/src/main.cpp
@@ -1,134 +1,154 @@
-#include <spdlog/spdlog.h>
-
-#include <boost/program_options.hpp>
-#include <iostream>
-#include <memory>
-
-#include "biergarten_data_generator.h"
-#include "database/database.h"
-#include "web_client/curl_web_client.h"
-
-namespace po = boost::program_options;
-
 /**
- * @brief Parse command-line arguments into ApplicationOptions.
+ * @file main.cpp
+ * @brief Entry point for the brewery data pipeline
 *
- * @param argc Command-line argument count.
- * @param argv Command-line arguments.
- * @param options Output ApplicationOptions struct.
- * @return true if parsing succeeded and should proceed, false otherwise.
+ * Pipeline Overview:
+ * This is the main data processing pipeline that:
+ * 1. Initializes an in-memory SQLite database
+ * 2. Loads world city data from a JSON file (50k+ cities)
+ * 3. Initializes the brewery generation system (currently mocked)
+ * 4. Demonstrates brewery generation for sample cities
+ *
+ * Architecture:
+ * ┌─────────────┐
+ * │  JSON File  │ (countries+states+cities.json - 50k+ cities)
+ * └──────┬──────┘
+ *        │
+ *        ▼
+ * ┌─────────────────────┐
+ * │  JsonLoader::Load   │ Parse and validate JSON
+ * └──────┬──────────────┘
+ *        │
+ *        ▼
+ * ┌─────────────────────┐
+ * │  SQLite Database    │ Store cities in-memory
+ * └──────┬──────────────┘
+ *        │
+ *        ▼
+ * ┌─────────────────────┐
+ * │  BreweryGenerator   │ Mock generation (hash-based)
+ * │  .GenerateBrewery() │ Future: LLM-based generation
+ * └─────────────────────┘
+ *
+ * Command Line Arguments:
+ * - argv[1]: Path to GGUF model file (default: ./model.gguf)
+ * - argv[2]: Path to cache directory for JSON downloads (default: /tmp)
+ * - argv[3]: Git commit hash for reproducible data version (default: c5eb7772)
+ *
+ * The pipeline automatically downloads the geographic data from GitHub on first
+ * run and caches it locally to avoid repeated network calls.
+ *
+ * Example Usage - Auto-download (stable 2026-03-28 build):
+ * @code
+ *   ./brewery-pipeline ./llama-7b.gguf
+ * @endcode
+ *
+ * Example Usage - Custom data version (branch name instead of a commit hash):
+ * @code
+ *   ./brewery-pipeline ./llama-7b.gguf /tmp main
+ * @endcode
+ *
+ * Exit Codes:
+ * - 0: Pipeline completed successfully
+ * - 1: Pipeline failed (exception caught)
 */
-bool ParseArguments(int argc, char** argv, ApplicationOptions& options) {
-   // If no arguments provided, display usage and exit
-   if (argc == 1) {
-      std::cout << "Biergarten Pipeline - Geographic Data Pipeline with "
-                   "Brewery Generation\n\n";
-      std::cout << "Usage: biergarten-pipeline [options]\n\n";
-      std::cout << "Options:\n";
-      std::cout << "  --mocked             Use mocked generator for "
-                   "brewery/user data\n";
-      std::cout << "  --model, -m PATH     Path to LLM model file (gguf) for "
-                   "generation\n";
-      std::cout << "  --cache-dir, -c DIR  Directory for cached JSON (default: "
-                   "/tmp)\n";
-      std::cout << "  --temperature TEMP   LLM sampling temperature 0.0-1.0 "
-                   "(default: 0.8)\n";
-      std::cout << "  --top-p VALUE        Nucleus sampling parameter 0.0-1.0 "
-                   "(default: 0.92)\n";
-      std::cout << "  --n-ctx SIZE         Context window size in tokens "
-                   "(default: 4096)\n";
-      std::cout << "  --seed SEED          Random seed: -1 for random "
-                   "(default: -1)\n";
-      std::cout << "  --help, -h           Show this help message\n\n";
-      std::cout << "Note: --mocked and --model are mutually exclusive. Exactly "
-                   "one must be provided.\n";
-      std::cout << "Data source is always pinned to commit c5eb7772 (stable "
-                   "2026-03-28).\n";
-      return false;
-   }

-   po::options_description desc("Pipeline Options");
-   desc.add_options()("help,h", "Produce help message")(
-       "mocked", po::bool_switch(),
-       "Use mocked generator for brewery/user data")(
-       "model,m", po::value<std::string>()->default_value(""),
-       "Path to LLM model (gguf)")(
-       "cache-dir,c", po::value<std::string>()->default_value("/tmp"),
-       "Directory for cached JSON")(
-       "temperature", po::value<float>()->default_value(0.8f),
-       "Sampling temperature (higher = more random)")(
-       "top-p", po::value<float>()->default_value(0.92f),
-       "Nucleus sampling top-p in (0,1] (higher = more random)")(
-       "n-ctx", po::value<uint32_t>()->default_value(8192),
-       "Context window size in tokens (1-32768)")(
-       "seed", po::value<int>()->default_value(-1),
-       "Sampler seed: -1 for random, otherwise non-negative integer");
+#include "data_downloader.h"
+#include "database.h"
+#include "generator.h"
+#include "json_loader.h"
+#include <curl/curl.h>
+#include <iostream>

-   po::variables_map vm;
-   po::store(po::parse_command_line(argc, argv, desc), vm);
-   po::notify(vm);
-
-   if (vm.count("help")) {
-      std::cout << desc << "\n";
-      return false;
-   }
-
-   // Check for mutually exclusive --mocked and --model flags
-   bool use_mocked = vm["mocked"].as<bool>();
-   std::string model_path = vm["model"].as<std::string>();
-
-   if (use_mocked && !model_path.empty()) {
-      spdlog::error("ERROR: --mocked and --model are mutually exclusive");
-      return false;
-   }
-
-   if (!use_mocked && model_path.empty()) {
-      spdlog::error("ERROR: Either --mocked or --model must be specified");
-      return false;
-   }
-
-   // Warn if sampling parameters are provided with --mocked
-   if (use_mocked) {
-      bool hasTemperature = vm["temperature"].defaulted() == false;
-      bool hasTopP = vm["top-p"].defaulted() == false;
-      bool hasSeed = vm["seed"].defaulted() == false;
-
-      if (hasTemperature || hasTopP || hasSeed) {
-         spdlog::warn(
-             "WARNING: Sampling parameters (--temperature, --top-p, --seed) "
-             "are ignored when using --mocked");
-      }
-   }
-
-   options.use_mocked = use_mocked;
-   options.model_path = model_path;
-   options.cache_dir = vm["cache-dir"].as<std::string>();
-   options.temperature = vm["temperature"].as<float>();
-   options.top_p = vm["top-p"].as<float>();
-   options.n_ctx = vm["n-ctx"].as<uint32_t>();
-   options.seed = vm["seed"].as<int>();
-   // commit is always pinned to c5eb7772
-
-   return true;
-}
-
-int main(int argc, char* argv[]) {
+int main(int argc, char *argv[]) {
  try {
-      const CurlGlobalState curl_state;
+    // Initialize libcurl process-wide. NOTE(review): return code is unchecked
+    curl_global_init(CURL_GLOBAL_DEFAULT);

-      ApplicationOptions options;
-      if (!ParseArguments(argc, argv, options)) {
-         return 0;
+    // Positional arguments; each falls back to a default when omitted
+    std::string modelPath = argc > 1 ? argv[1] : "./model.gguf";
+    std::string cacheDir = argc > 2 ? argv[2] : "/tmp";
+    std::string commit =
+        argc > 3 ? argv[3] : "c5eb7772"; // Default: stable 2026-03-28
+
+    // Construct cache path for downloaded JSON
+    std::string jsonPath = cacheDir + "/countries+states+cities.json";
+
+    // Step 0: Download geographic data from GitHub (cached locally)
+    // On first run, downloads 45MB JSON. On subsequent runs, uses cached file.
+    // Commit hash allows pinning to specific data versions for reproducibility.
+    std::cout << "\n[Pipeline] Downloading geographic data from GitHub...\n";
+    DataDownloader downloader;
+    downloader.DownloadCountriesDatabase(jsonPath, commit);
+
+    SqliteDatabase db;
+
+    // Step 1: Initialize empty in-memory database
+    std::cout << "Initializing in-memory SQLite database...\n";
+    db.Initialize();
+
+    // Step 2: Load world city data from JSON file
+    // This populates the database with ~50k city records
+    // Each city is inserted with: id, state id, country id, name, lat, lng
+    JsonLoader::LoadWorldCities(jsonPath, db);
+
+    // Step 3: Initialize brewery generator
+    // Current: Mock implementation using deterministic hashing
+    // Future: LLM-based generation with llama.cpp
+    std::cout << "Initializing brewery generator...\n";
+    LlamaBreweryGenerator generator;
+    generator.LoadModel(modelPath);
+
+    // Step 4: Query geographic data (limit 0 presumably means "all" — confirm)
+    std::cout << "\n=== GEOGRAPHIC DATA OVERVIEW ===\n";
+
+    auto countries = db.QueryCountries(50);
+    auto states = db.QueryStates(50);
+    auto cities = db.QueryCities();
+
+    std::cout << "\nTotal records loaded:";
+    std::cout << "\n  Countries: " << db.QueryCountries(0).size();
+    std::cout << "\n  States: " << db.QueryStates(0).size();
+    std::cout << "\n  Cities: " << cities.size() << "\n";
+
+    // Display the countries returned by QueryCountries(50)
+    std::cout << "\n--- 50 COUNTRIES ---\n";
+    for (size_t i = 0; i < countries.size(); i++) {
+      std::cout << (i + 1) << ". " << countries[i].iso2 << " ("
+                << countries[i].iso3 << ") " << countries[i].name << "\n";
    }

-      auto webClient = std::make_shared<CURLWebClient>();
-      SqliteDatabase database;
+    // Display the states returned by QueryStates(50)
+    std::cout << "\n--- 50 STATES ---\n";
+    for (size_t i = 0; i < states.size(); i++) {
+      std::cout << (i + 1) << ". " << states[i].iso2 << ": " << states[i].name
+                << "\n";
+    }

-      BiergartenDataGenerator generator(options, webClient, database);
-      return generator.Run();
+    // Display at most the first 50 cities
+    std::cout << "\n--- 50 CITIES ---\n";
+    for (size_t i = 0; i < std::min(size_t(50), cities.size()); i++) {
+      std::cout << (i + 1) << ". " << cities[i].second << "\n";
+    }

-   } catch (const std::exception& e) {
-      spdlog::error("ERROR: Application failed: {}", e.what());
+    // Step 5: Sample generation for up to 5 cities (passes index i; cityId unused)
+    std::cout << "\n=== SAMPLE BREWERY GENERATION ===\n\n";
+    for (size_t i = 0; i < std::min(size_t(5), cities.size()); i++) {
+      const auto &[cityId, cityName] = cities[i];
+      auto brewery = generator.GenerateBrewery(cityName, i);
+      std::cout << "  " << cityName << ": " << brewery.name << "\n";
+      std::cout << "    → " << brewery.description << "\n";
+    }
+
+    std::cout << "\n✓ Pipeline completed successfully\n";
+
+    // Cleanup. NOTE(review): runs before downloader/db/generator destruct
+    curl_global_cleanup();
+    return 0;
+
+  } catch (const std::exception &e) {
+    std::cerr << "✗ Pipeline failed: " << e.what() << "\n";
+    curl_global_cleanup();
    return 1;
  }
 }
--- a/pipeline/src/web_client/curl_web_client.cpp
+++ b/pipeline/src/web_client/curl_web_client.cpp
@@ -1,141 +0,0 @@
-#include "web_client/curl_web_client.h"
-
-#include <curl/curl.h>
-
-#include <cstdio>
-#include <fstream>
-#include <memory>
-#include <sstream>
-#include <stdexcept>
-
-CurlGlobalState::CurlGlobalState() {
-   if (curl_global_init(CURL_GLOBAL_DEFAULT) != CURLE_OK) {
-      throw std::runtime_error(
-          "[CURLWebClient] Failed to initialize libcurl globally");
-   }
-}
-
-CurlGlobalState::~CurlGlobalState() { curl_global_cleanup(); }
-
-namespace {
-// curl write callback that appends response data into a std::string
-size_t WriteCallbackString(void* contents, size_t size, size_t nmemb,
-                           void* userp) {
-   size_t realsize = size * nmemb;
-   auto* s = static_cast<std::string*>(userp);
-   s->append(static_cast<char*>(contents), realsize);
-   return realsize;
-}
-
-// curl write callback that writes to a file stream
-size_t WriteCallbackFile(void* contents, size_t size, size_t nmemb,
-                         void* userp) {
-   size_t realsize = size * nmemb;
-   auto* outFile = static_cast<std::ofstream*>(userp);
-   outFile->write(static_cast<char*>(contents), realsize);
-   return realsize;
-}
-
-// RAII wrapper for CURL handle using unique_ptr
-using CurlHandle = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
-
-CurlHandle create_handle() {
-   CURL* handle = curl_easy_init();
-   if (!handle) {
-      throw std::runtime_error(
-          "[CURLWebClient] Failed to initialize libcurl handle");
-   }
-   return CurlHandle(handle, &curl_easy_cleanup);
-}
-
-void set_common_get_options(CURL* curl, const std::string& url,
-                            long connect_timeout, long total_timeout) {
-   curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
-   curl_easy_setopt(curl, CURLOPT_USERAGENT, "biergarten-pipeline/0.1.0");
-   curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
-   curl_easy_setopt(curl, CURLOPT_MAXREDIRS, 5L);
-   curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, connect_timeout);
-   curl_easy_setopt(curl, CURLOPT_TIMEOUT, total_timeout);
-   curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, "gzip");
-}
-}  // namespace
-
-CURLWebClient::CURLWebClient() {}
-
-CURLWebClient::~CURLWebClient() {}
-
-void CURLWebClient::DownloadToFile(const std::string& url,
-                                   const std::string& file_path) {
-   auto curl = create_handle();
-
-   std::ofstream outFile(file_path, std::ios::binary);
-   if (!outFile.is_open()) {
-      throw std::runtime_error(
-          "[CURLWebClient] Cannot open file for writing: " + file_path);
-   }
-
-   set_common_get_options(curl.get(), url, 30L, 300L);
-   curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, WriteCallbackFile);
-   curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA,
-                    static_cast<void*>(&outFile));
-
-   CURLcode res = curl_easy_perform(curl.get());
-   outFile.close();
-
-   if (res != CURLE_OK) {
-      std::remove(file_path.c_str());
-      std::string error = std::string("[CURLWebClient] Download failed: ") +
-                          curl_easy_strerror(res);
-      throw std::runtime_error(error);
-   }
-
-   long httpCode = 0;
-   curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &httpCode);
-
-   if (httpCode != 200) {
-      std::remove(file_path.c_str());
-      std::stringstream ss;
-      ss << "[CURLWebClient] HTTP error " << httpCode << " for URL " << url;
-      throw std::runtime_error(ss.str());
-   }
-}
-
-std::string CURLWebClient::Get(const std::string& url) {
-   auto curl = create_handle();
-
-   std::string response_string;
-   set_common_get_options(curl.get(), url, 10L, 20L);
-   curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, WriteCallbackString);
-   curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &response_string);
-
-   CURLcode res = curl_easy_perform(curl.get());
-
-   if (res != CURLE_OK) {
-      std::string error =
-          std::string("[CURLWebClient] GET failed: ") + curl_easy_strerror(res);
-      throw std::runtime_error(error);
-   }
-
-   long httpCode = 0;
-   curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &httpCode);
-
-   if (httpCode != 200) {
-      std::stringstream ss;
-      ss << "[CURLWebClient] HTTP error " << httpCode << " for URL " << url;
-      throw std::runtime_error(ss.str());
-   }
-
-   return response_string;
-}
-
-std::string CURLWebClient::UrlEncode(const std::string& value) {
-   // A NULL handle is fine for UTF-8 encoding according to libcurl docs.
-   char* output = curl_easy_escape(nullptr, value.c_str(), 0);
-
-   if (output) {
-      std::string result(output);
-      curl_free(output);
-      return result;
-   }
-   throw std::runtime_error("[CURLWebClient] curl_easy_escape failed");
-}
--- a/pipeline/src/wikipedia/wikipedia_service.cpp
+++ b/pipeline/src/wikipedia/wikipedia_service.cpp
@@ -1,89 +0,0 @@
-#include "wikipedia/wikipedia_service.h"
-
-#include <spdlog/spdlog.h>
-
-#include <boost/json.hpp>
-
-WikipediaService::WikipediaService(std::shared_ptr<WebClient> client)
-    : client_(std::move(client)) {}
-
-std::string WikipediaService::FetchExtract(std::string_view query) {
-   const std::string encoded = client_->UrlEncode(std::string(query));
-   const std::string url =
-       "https://en.wikipedia.org/w/api.php?action=query&titles=" + encoded +
-       "&prop=extracts&explaintext=1&format=json";
-
-   const std::string body = client_->Get(url);
-
-   boost::system::error_code ec;
-   boost::json::value doc = boost::json::parse(body, ec);
-
-   if (!ec && doc.is_object()) {
-      try {
-         auto& pages = doc.at("query").at("pages").get_object();
-         if (!pages.empty()) {
-            auto& page = pages.begin()->value().get_object();
-            if (page.contains("extract") && page.at("extract").is_string()) {
-               std::string extract(page.at("extract").as_string().c_str());
-               spdlog::debug("WikipediaService fetched {} chars for '{}'",
-                             extract.size(), query);
-               return extract;
-            }
-         }
-      } catch (const std::exception& e) {
-         spdlog::warn(
-             "WikipediaService: failed to parse response structure for '{}': "
-             "{}",
-             query, e.what());
-         return {};
-      }
-   } else if (ec) {
-      spdlog::warn("WikipediaService: JSON parse error for '{}': {}", query,
-                   ec.message());
-   }
-
-   return {};
-}
-
-std::string WikipediaService::GetSummary(std::string_view city,
-                                         std::string_view country) {
-   const std::string key = std::string(city) + "|" + std::string(country);
-   const auto cacheIt = cache_.find(key);
-   if (cacheIt != cache_.end()) {
-      return cacheIt->second;
-   }
-
-   std::string result;
-
-   if (!client_) {
-      cache_.emplace(key, result);
-      return result;
-   }
-
-   std::string regionQuery(city);
-   if (!country.empty()) {
-      regionQuery += ", ";
-      regionQuery += country;
-   }
-
-   const std::string beerQuery = "beer in " + std::string(country);
-
-   try {
-      const std::string regionExtract = FetchExtract(regionQuery);
-      const std::string beerExtract = FetchExtract(beerQuery);
-
-      if (!regionExtract.empty()) {
-         result += regionExtract;
-      }
-      if (!beerExtract.empty()) {
-         if (!result.empty()) result += "\n\n";
-         result += beerExtract;
-      }
-   } catch (const std::runtime_error& e) {
-      spdlog::debug("WikipediaService lookup failed for '{}': {}", regionQuery,
-                    e.what());
-   }
-
-   cache_.emplace(key, result);
-   return result;
-}