14 Commits

Author SHA1 Message Date
Aaron Po
077f6ab4ae edit prompt 2026-04-02 22:56:18 -04:00
Aaron Po
534403734a Refactor BiergartenDataGenerator and LlamaGenerator 2026-04-02 22:46:00 -04:00
Aaron Po
3af053f0eb format codebase 2026-04-02 21:46:46 -04:00
Aaron Po
ba165d8aa7 Separate llama generator class src file into method files 2026-04-02 21:37:46 -04:00
Aaron Po
eb9a2767b4 Refactor web client interface and related components 2026-04-02 18:55:58 -04:00
Aaron Po
29ea47fdb6 update cli arg handling 2026-04-02 18:41:25 -04:00
Aaron Po
52e2333304 Reorganize directory structure 2026-04-02 18:27:01 -04:00
Aaron Po
a1f0ca5b20 Refactor DataDownloader and CURLWebClient: update constructor and modify FileExists method signature 2026-04-02 18:06:40 -04:00
Aaron Po
2ea8aa52b4 update readme and add clangformat and clang tidy 2026-04-02 17:12:22 -04:00
Aaron Po
98083ab40c Pipeline: add CURL/WebClient & Wikipedia service
Introduce a pluggable web client interface and concrete CURL implementation: adds IWebClient, CURLWebClient, and CurlGlobalState (headers + curl_web_client.cpp). DataDownloader now accepts an IWebClient and delegates downloads. Add WikipediaService for cached Wikipedia summary lookups. Refactor SqliteDatabase to return full City records and update consumers accordingly. Improve JsonLoader to use batched transactions during streaming parses. Enhance LlamaGenerator with sampling options, increased token limits, JSON extraction/validation, and other parsing helpers. Modernize CMake: set policy/version, add project_options, simplify FetchContent usage (spdlog), require Boost components (program_options/json), list pipeline sources explicitly, and tweak post-build/memcheck targets. Update README to match implementation changes and new CLI/config conventions.
2026-04-02 16:29:16 -04:00
Aaron Po
ac136f7179 Enhance brewery generation: add country name parameter and improve prompt handling 2026-04-02 01:04:41 -04:00
Aaron Po
280c9c61bd Implement Llama-based brewery and user data generation; remove mock generator and related files 2026-04-01 23:29:16 -04:00
Aaron Po
248a51b35f cleanup 2026-04-01 21:35:02 -04:00
Aaron Po
35aa7bc0df Begin work on biergarten data generator pipeline 2026-04-01 21:18:45 -04:00
60 changed files with 3197 additions and 954 deletions

5
pipeline/.clang-format Normal file
View File

@@ -0,0 +1,5 @@
---
BasedOnStyle: Google
ColumnLimit: 80
IndentWidth: 3
...

17
pipeline/.clang-tidy Normal file
View File

@@ -0,0 +1,17 @@
---
Checks: >
-*,
bugprone-*,
clang-analyzer-*,
cppcoreguidelines-*,
google-*,
modernize-*,
performance-*,
readability-*,
-cppcoreguidelines-avoid-magic-numbers,
-cppcoreguidelines-owning-memory,
-readability-magic-numbers,
-google-readability-todo
HeaderFilterRegex: "^(src|includes)/.*"
FormatStyle: file
...

3
pipeline/.gitignore vendored Normal file
View File

@@ -0,0 +1,3 @@
dist
build
data

169
pipeline/CMakeLists.txt Normal file
View File

@@ -0,0 +1,169 @@
cmake_minimum_required(VERSION 3.20)
project(biergarten-pipeline VERSION 0.1.0 LANGUAGES CXX)
# Allows older dependencies to configure on newer CMake.
set(CMAKE_POLICY_VERSION_MINIMUM 3.5)
# Policies
cmake_policy(SET CMP0167 NEW) # FindBoost improvements
# Global Settings
set(CMAKE_CXX_STANDARD 23)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
option(ENABLE_CLANG_TIDY "Enable clang-tidy static analysis for project targets" ON)
option(ENABLE_CLANG_FORMAT_TARGETS "Enable clang-format helper targets" ON)
if(ENABLE_CLANG_TIDY)
find_program(CLANG_TIDY_EXE NAMES clang-tidy)
if(CLANG_TIDY_EXE)
set(BIERGARTEN_CLANG_TIDY_COMMAND
"${CLANG_TIDY_EXE};--config-file=${CMAKE_CURRENT_SOURCE_DIR}/.clang-tidy")
message(STATUS "clang-tidy enabled: ${CLANG_TIDY_EXE}")
else()
message(STATUS "clang-tidy not found; static analysis is disabled")
endif()
endif()
# -----------------------------------------------------------------------------
# Compiler Options & Warnings (Interface Library)
# -----------------------------------------------------------------------------
add_library(project_options INTERFACE)
target_compile_options(project_options INTERFACE
$<$<CXX_COMPILER_ID:GNU,Clang>:
-Wall -Wextra -Wpedantic -Wshadow -Wconversion -Wsign-conversion -Wunused
>
$<$<CXX_COMPILER_ID:MSVC>:
/W4 /WX /permissive-
>
)
# -----------------------------------------------------------------------------
# Dependencies
# -----------------------------------------------------------------------------
find_package(CURL REQUIRED)
find_package(SQLite3 REQUIRED)
find_package(Boost 1.75 REQUIRED COMPONENTS program_options json)
include(FetchContent)
# spdlog (Logging)
FetchContent_Declare(
spdlog
GIT_REPOSITORY https://github.com/gabime/spdlog.git
GIT_TAG v1.11.0
)
FetchContent_MakeAvailable(spdlog)
# llama.cpp (LLM Inference)
set(LLAMA_BUILD_TESTS OFF CACHE BOOL "" FORCE)
set(LLAMA_BUILD_EXAMPLES OFF CACHE BOOL "" FORCE)
set(LLAMA_BUILD_SERVER OFF CACHE BOOL "" FORCE)
FetchContent_Declare(
llama_cpp
GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git
GIT_TAG b8611
)
FetchContent_MakeAvailable(llama_cpp)
if(TARGET llama)
target_compile_options(llama PRIVATE
$<$<CXX_COMPILER_ID:AppleClang>:-include algorithm>
)
endif()
# -----------------------------------------------------------------------------
# Main Executable
# -----------------------------------------------------------------------------
set(PIPELINE_SOURCES
src/biergarten_data_generator.cpp
src/web_client/curl_web_client.cpp
src/data_generation/data_downloader.cpp
src/database/database.cpp
src/json_handling/json_loader.cpp
src/data_generation/llama/destructor.cpp
src/data_generation/llama/set_sampling_options.cpp
src/data_generation/llama/load.cpp
src/data_generation/llama/infer.cpp
src/data_generation/llama/generate_brewery.cpp
src/data_generation/llama/generate_user.cpp
src/data_generation/llama/helpers.cpp
src/data_generation/mock/data.cpp
src/data_generation/mock/deterministic_hash.cpp
src/data_generation/mock/load.cpp
src/data_generation/mock/generate_brewery.cpp
src/data_generation/mock/generate_user.cpp
src/json_handling/stream_parser.cpp
src/wikipedia/wikipedia_service.cpp
src/main.cpp
)
add_executable(biergarten-pipeline ${PIPELINE_SOURCES})
if(BIERGARTEN_CLANG_TIDY_COMMAND)
set_target_properties(biergarten-pipeline PROPERTIES
CXX_CLANG_TIDY "${BIERGARTEN_CLANG_TIDY_COMMAND}"
)
endif()
target_include_directories(biergarten-pipeline
PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/includes
${llama_cpp_SOURCE_DIR}/include
)
target_link_libraries(biergarten-pipeline
PRIVATE
project_options
CURL::libcurl
SQLite::SQLite3
spdlog::spdlog
llama
Boost::program_options
Boost::json
)
if(ENABLE_CLANG_FORMAT_TARGETS)
find_program(CLANG_FORMAT_EXE NAMES clang-format)
if(CLANG_FORMAT_EXE)
file(GLOB_RECURSE FORMAT_SOURCES CONFIGURE_DEPENDS
${CMAKE_CURRENT_SOURCE_DIR}/src/**/*.cpp
${CMAKE_CURRENT_SOURCE_DIR}/src/**/*.cc
${CMAKE_CURRENT_SOURCE_DIR}/includes/**/*.h
${CMAKE_CURRENT_SOURCE_DIR}/includes/**/*.hpp
)
add_custom_target(format
COMMAND ${CLANG_FORMAT_EXE} -style=file -i ${FORMAT_SOURCES}
COMMENT "Formatting source files with clang-format (Google style)"
VERBATIM
)
add_custom_target(format-check
COMMAND ${CLANG_FORMAT_EXE} -style=file --dry-run --Werror ${FORMAT_SOURCES}
COMMENT "Checking source formatting with clang-format (Google style)"
VERBATIM
)
else()
message(STATUS "clang-format not found; format targets are disabled")
endif()
endif()
# -----------------------------------------------------------------------------
# Post-Build Steps & Utilities
# -----------------------------------------------------------------------------
add_custom_command(TARGET biergarten-pipeline POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_SOURCE_DIR}/output
COMMENT "Ensuring output directory exists"
)
find_program(VALGRIND valgrind)
if(VALGRIND)
add_custom_target(memcheck
COMMAND ${VALGRIND} --leak-check=full --error-exitcode=1 $<TARGET_FILE:biergarten-pipeline> --help
DEPENDS biergarten-pipeline
COMMENT "Running Valgrind memory check"
)
endif()

406
pipeline/README.md Normal file
View File

@@ -0,0 +1,406 @@
# Biergarten Pipeline
A high-performance C++23 data pipeline for fetching, parsing, and storing geographic data (countries, states, cities) with brewery metadata generation capabilities. The system supports both mock and LLM-based (llama.cpp) generation modes.
## Overview
The pipeline orchestrates **four key stages**:
1. **Download** - Fetches `countries+states+cities.json` from a pinned GitHub commit with optional local filesystem caching
2. **Parse** - Streams JSON using Boost.JSON's `basic_parser` to extract country/state/city records without loading the entire file into memory
3. **Store** - Inserts records into a file-based SQLite database with all operations performed sequentially in a single thread
4. **Generate** - Produces brewery metadata or user profiles (mock implementation; supports future LLM integration via llama.cpp)
## System Architecture
### Data Sources and Formats
- **Hierarchical Structure**: Countries array → states per country → cities per state
- **Data Fields**:
- `id` (integer)
- `name` (string)
- `iso2` / `iso3` (ISO country/state codes)
- `latitude` / `longitude` (geographic coordinates)
- **Source**: [dr5hn/countries-states-cities-database](https://github.com/dr5hn/countries-states-cities-database) on GitHub
- **Output**: Structured SQLite file-based database (`biergarten-pipeline.db`) + structured logging via spdlog
### Concurrency Model
The pipeline currently operates **single-threaded** with sequential stage execution:
1. **Download Phase**: Main thread blocks while downloading the source JSON file (if not in cache)
2. **Parse & Store Phase**: Main thread performs streaming JSON parse with immediate SQLite inserts
**Thread Safety**: While single-threaded, the `SqliteDatabase` component is **mutex-protected** using `std::mutex` (`dbMutex`) for all database operations. This design enables safe future parallelization without code modifications.
## Core Components
| Component | Purpose | Thread Safety | Dependencies |
| ----------------------------- | ----------------------------------------------------------------------------------------------- | -------------------------------------------- | --------------------------------------------- |
| **BiergartenDataGenerator** | Orchestrates pipeline execution; manages lifecycle of downloader, parser, and generator | Single-threaded coordinator | ApplicationOptions, WebClient, SqliteDatabase |
| **DataDownloader** | HTTP fetch with curl; optional filesystem cache; ETag support and retries | Blocking I/O; safe for startup | IWebClient, filesystem |
| **StreamingJsonParser** | Extends `boost::json::basic_parser`; emits country/state/city via callbacks; tracks parse depth | Single-threaded parse; callbacks thread-safe | Boost.JSON |
| **JsonLoader** | Wraps parser; dispatches callbacks for country/state/city; manages WorkQueue lifecycle | Produces to WorkQueue; safe callbacks | StreamingJsonParser, SqliteDatabase |
| **SqliteDatabase** | Manages schema initialization; insert/query methods for geographic data | Mutex-guarded all operations | SQLite3 |
| **IDataGenerator** (Abstract) | Interface for brewery/user metadata generation | Stateless virtual methods | N/A |
| **LlamaGenerator** | LLM-based generation via llama.cpp; configurable sampling (temperature, top-p, seed) | Manages llama_model* and llama_context* | llama.cpp, BreweryResult, UserResult |
| **MockGenerator** | Deterministic mock generation using seeded randomization | Stateless; thread-safe | N/A |
| **CURLWebClient** | HTTP client adapter; URL encoding; file downloads | cURL library bindings | libcurl |
| **WikipediaService** | (Planned) Wikipedia data lookups for enrichment | N/A | IWebClient |
## Database Schema
SQLite file-based database with **three core tables** and **indexes for fast lookups**:
### Countries
```sql
CREATE TABLE countries (
id INTEGER PRIMARY KEY,
name TEXT NOT NULL,
iso2 TEXT,
iso3 TEXT
);
CREATE INDEX idx_countries_iso2 ON countries(iso2);
```
### States
```sql
CREATE TABLE states (
id INTEGER PRIMARY KEY,
country_id INTEGER NOT NULL,
name TEXT NOT NULL,
iso2 TEXT,
FOREIGN KEY (country_id) REFERENCES countries(id)
);
CREATE INDEX idx_states_country ON states(country_id);
```
### Cities
```sql
CREATE TABLE cities (
id INTEGER PRIMARY KEY,
state_id INTEGER NOT NULL,
country_id INTEGER NOT NULL,
name TEXT NOT NULL,
latitude REAL,
longitude REAL,
FOREIGN KEY (state_id) REFERENCES states(id),
FOREIGN KEY (country_id) REFERENCES countries(id)
);
CREATE INDEX idx_cities_state ON cities(state_id);
CREATE INDEX idx_cities_country ON cities(country_id);
```
## Architecture Diagram
```plantuml
@startuml biergarten-pipeline
!theme plain
skinparam monochrome true
skinparam classBackgroundColor #FFFFFF
skinparam classBorderColor #000000
package "Application Layer" {
class BiergartenDataGenerator {
- options: ApplicationOptions
- webClient: IWebClient
- database: SqliteDatabase
- generator: IDataGenerator
--
+ Run() : int
}
}
package "Data Acquisition" {
class DataDownloader {
- webClient: IWebClient
--
+ Download(url: string, filePath: string)
+ DownloadWithCache(url: string, cachePath: string)
}
interface IWebClient {
+ DownloadToFile(url: string, filePath: string)
+ Get(url: string) : string
+ UrlEncode(value: string) : string
}
class CURLWebClient {
- globalState: CurlGlobalState
--
+ DownloadToFile(url: string, filePath: string)
+ Get(url: string) : string
+ UrlEncode(value: string) : string
}
}
package "JSON Processing" {
class StreamingJsonParser {
- depth: int
--
+ on_object_begin()
+ on_object_end()
+ on_array_begin()
+ on_array_end()
+ on_key(str: string)
+ on_string(str: string)
+ on_number(value: int)
}
class JsonLoader {
--
+ LoadWorldCities(jsonPath: string, db: SqliteDatabase)
}
}
package "Data Storage" {
class SqliteDatabase {
- db: sqlite3*
- dbMutex: std::mutex
--
+ Initialize(dbPath: string)
+ InsertCountry(id: int, name: string, iso2: string, iso3: string)
+ InsertState(id: int, countryId: int, name: string, iso2: string)
+ InsertCity(id: int, stateId: int, countryId: int, name: string, lat: double, lon: double)
+ QueryCountries(limit: int) : vector<Country>
+ QueryStates(limit: int) : vector<State>
+ QueryCities() : vector<City>
+ BeginTransaction()
+ CommitTransaction()
# InitializeSchema()
}
struct Country {
id: int
name: string
iso2: string
iso3: string
}
struct State {
id: int
name: string
iso2: string
countryId: int
}
struct City {
id: int
name: string
countryId: int
}
}
package "Data Generation" {
interface IDataGenerator {
+ load(modelPath: string)
+ generateBrewery(cityName: string, countryName: string, regionContext: string) : BreweryResult
+ generateUser(locale: string) : UserResult
}
class LlamaGenerator {
- model: llama_model*
- context: llama_context*
- sampling_temperature: float
- sampling_top_p: float
- sampling_seed: uint32_t
--
+ load(modelPath: string)
+ generateBrewery(...) : BreweryResult
+ generateUser(locale: string) : UserResult
+ setSamplingOptions(temperature: float, topP: float, seed: int)
# infer(prompt: string) : string
}
class MockGenerator {
--
+ load(modelPath: string)
+ generateBrewery(...) : BreweryResult
+ generateUser(locale: string) : UserResult
}
struct BreweryResult {
name: string
description: string
}
struct UserResult {
username: string
bio: string
}
}
package "Enrichment (Planned)" {
class WikipediaService {
- webClient: IWebClient
--
+ SearchCity(cityName: string, countryName: string) : string
}
}
' Relationships
BiergartenDataGenerator --> DataDownloader
BiergartenDataGenerator --> JsonLoader
BiergartenDataGenerator --> SqliteDatabase
BiergartenDataGenerator --> IDataGenerator
DataDownloader --> IWebClient
CURLWebClient ..|> IWebClient
JsonLoader --> StreamingJsonParser
JsonLoader --> SqliteDatabase
LlamaGenerator ..|> IDataGenerator
MockGenerator ..|> IDataGenerator
SqliteDatabase --> Country
SqliteDatabase --> State
SqliteDatabase --> City
LlamaGenerator --> BreweryResult
LlamaGenerator --> UserResult
MockGenerator --> BreweryResult
MockGenerator --> UserResult
WikipediaService --> IWebClient
@enduml
```
## Configuration and Extensibility
### Command-Line Arguments
Boost.Program_options provides named CLI arguments. Running without arguments displays usage instructions.
```bash
./biergarten-pipeline [options]
```
**Requirement**: Exactly one of `--mocked` or `--model` must be specified.
| Argument | Short | Type | Purpose |
| --------------- | ----- | ------ | --------------------------------------------------------------- |
| `--mocked` | - | flag | Use mocked generator for brewery/user data |
| `--model` | `-m` | string | Path to LLM model file (gguf); mutually exclusive with --mocked |
| `--cache-dir` | `-c` | path | Directory for cached JSON (default: `/tmp`) |
| `--temperature` | - | float | LLM sampling temperature 0.0-1.0 (default: `0.8`) |
| `--top-p` | - | float | Nucleus sampling parameter 0.0-1.0 (default: `0.92`) |
| `--seed` | - | int | Random seed: -1 for random (default: `-1`) |
| `--help` | `-h` | flag | Show help message |
**Note**: The data source is always pinned to commit `c5eb7772` (stable 2026-03-28) and cannot be changed.
**Note**: When `--mocked` is used, any sampling parameters (`--temperature`, `--top-p`, `--seed`) are ignored with a warning.
### Usage Examples
```bash
# Mocked generator (deterministic, no LLM required)
./biergarten-pipeline --mocked
# With LLM model
./biergarten-pipeline --model ./models/llama.gguf --cache-dir /var/cache
# Mocked with extra parameters provided (will be ignored with warning)
./biergarten-pipeline --mocked --temperature 0.5 --top-p 0.8 --seed 42
# Show help
./biergarten-pipeline --help
```
## Building and Running
### Prerequisites
- **C++23 compiler** (g++, clang, MSVC)
- **CMake** 3.20+
- **curl** (for HTTP downloads)
- **sqlite3** (database backend)
- **Boost** 1.75+ (requires Boost.JSON and Boost.Program_options)
- **spdlog** v1.11.0 (fetched via CMake FetchContent)
- **llama.cpp** (fetched via CMake FetchContent for LLM inference)
### Build
```bash
mkdir -p build
cd build
cmake ..
cmake --build . --target biergarten-pipeline -- -j
```
### Run
```bash
./build/biergarten-pipeline
```
**Output**:
- Console logs with structured spdlog output
- Cached JSON file: `/tmp/countries+states+cities.json`
- SQLite database: `biergarten-pipeline.db` (in output directory)
## Code Quality and Static Analysis
### Formatting
This project uses **clang-format** with the **Google C++ style guide**:
```bash
# Apply formatting to all source files
cmake --build build --target format
# Check formatting without modifications
cmake --build build --target format-check
```
### Static Analysis
This project uses **clang-tidy** with configurations for Google, modernize, performance, and bug-prone rules (`.clang-tidy`):
Static analysis runs automatically during compilation if `clang-tidy` is available.
## Code Implementation Summary
### Key Achievements
**Full pipeline implementation** - Download → Parse → Store → Generate
**Streaming JSON parser** - Memory-efficient processing via Boost.JSON callbacks
**Thread-safe SQLite wrapper** - Mutex-protected database for future parallelization
**Flexible data generation** - Abstract IDataGenerator interface supporting both mock and LLM modes
**Comprehensive CLI** - Boost.Program_options with sensible defaults
**Production-grade logging** - spdlog integration for structured output
**Build quality** - CMake with clang-format/clang-tidy integration
### Architecture Patterns
- **Interface-based design**: `IWebClient`, `IDataGenerator` abstract base classes enable substitution and testing
- **Dependency injection**: Components receive dependencies via constructors (BiergartenDataGenerator)
- **RAII principle**: SQLite connections and resources managed via destructors
- **Callback-driven parsing**: Boost.JSON parser emits events to processing callbacks
- **Transaction-scoped inserts**: BeginTransaction/CommitTransaction for batch performance
### External Dependencies
| Dependency | Version | Purpose | Type |
| ---------- | ------- | ---------------------------------- | ------- |
| Boost | 1.75+ | JSON parsing, CLI argument parsing | Library |
| SQLite3 | - | Persistent data storage | System |
| libcurl | - | HTTP downloads | System |
| spdlog | v1.11.0 | Structured logging | Fetched |
| llama.cpp | b8611 | LLM inference engine | Fetched |
to validate formatting without modifying files.
clang-tidy runs automatically on the biergarten-pipeline target when available. You can disable it at configure time:
cmake -DENABLE_CLANG_TIDY=OFF ..
You can also disable format helper targets:
cmake -DENABLE_CLANG_FORMAT_TARGETS=OFF ..

View File

@@ -0,0 +1,153 @@
#ifndef BIERGARTEN_PIPELINE_BIERGARTEN_DATA_GENERATOR_H_
#define BIERGARTEN_PIPELINE_BIERGARTEN_DATA_GENERATOR_H_
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "data_generation/data_generator.h"
#include "database/database.h"
#include "web_client/web_client.h"
#include "wikipedia/wikipedia_service.h"
/**
* @brief Program options for the Biergarten pipeline application.
*/
struct ApplicationOptions {
/// @brief Path to the LLM model file (gguf format); mutually exclusive with
/// use_mocked.
std::string model_path;
/// @brief Use mocked generator instead of LLM; mutually exclusive with
/// model_path.
bool use_mocked = false;
/// @brief Directory for cached JSON and database files.
std::string cache_dir;
/// @brief LLM sampling temperature (0.0 to 1.0, higher = more random).
float temperature = 0.8f;
/// @brief LLM nucleus sampling top-p parameter (0.0 to 1.0, higher = more
/// random).
float top_p = 0.92f;
/// @brief Random seed for sampling (-1 for random, otherwise non-negative).
int seed = -1;
/// @brief Git commit hash for database consistency (always pinned to
/// c5eb7772).
std::string commit = "c5eb7772";
};
/**
* @brief Main data generator class for the Biergarten pipeline.
*
* This class encapsulates the core logic for generating brewery data.
* It handles database initialization, data loading/downloading, and brewery
* generation.
*/
class BiergartenDataGenerator {
public:
/**
* @brief Construct a BiergartenDataGenerator with injected dependencies.
*
* @param options Application configuration options.
* @param web_client HTTP client for downloading data.
* @param database SQLite database instance.
*/
BiergartenDataGenerator(const ApplicationOptions& options,
std::shared_ptr<WebClient> web_client,
SqliteDatabase& database);
/**
* @brief Run the data generation pipeline.
*
* Performs the following steps:
* 1. Initialize database
* 2. Download geographic data if needed
* 3. Initialize the generator (LLM or Mock)
* 4. Generate brewery data for sample cities
*
* @return 0 on success, 1 on failure.
*/
int Run();
private:
/// @brief Immutable application options.
const ApplicationOptions options_;
/// @brief Shared HTTP client dependency.
std::shared_ptr<WebClient> webClient_;
/// @brief Database dependency.
SqliteDatabase& database_;
/**
* @brief Enriched city data with Wikipedia context.
*/
struct EnrichedCity {
int city_id;
std::string city_name;
std::string country_name;
std::string region_context;
};
/**
* @brief Initialize the data generator based on options.
*
* Creates either a MockGenerator (if no model path) or LlamaGenerator.
*
* @return A unique_ptr to the initialized generator.
*/
std::unique_ptr<DataGenerator> InitializeGenerator();
/**
* @brief Download and load geographic data if not cached.
*/
void LoadGeographicData();
/**
* @brief Query cities from database and build country name map.
*
* @return Vector of (City, country_name) pairs capped at 30 entries.
*/
std::vector<std::pair<City, std::string>> QueryCitiesWithCountries();
/**
* @brief Enrich cities with Wikipedia summaries.
*
* @param cities Vector of (City, country_name) pairs.
* @return Vector of enriched city data with context.
*/
std::vector<EnrichedCity> EnrichWithWikipedia(
const std::vector<std::pair<City, std::string>>& cities);
/**
* @brief Generate breweries for enriched cities.
*
* @param generator The data generator instance.
* @param cities Vector of enriched city data.
*/
void GenerateBreweries(DataGenerator& generator,
const std::vector<EnrichedCity>& cities);
/**
* @brief Log the generated brewery results.
*/
void LogResults() const;
/**
* @brief Helper struct to store generated brewery data.
*/
struct GeneratedBrewery {
int city_id;
std::string city_name;
BreweryResult brewery;
};
/// @brief Stores generated brewery data.
std::vector<GeneratedBrewery> generatedBreweries_;
};
#endif // BIERGARTEN_PIPELINE_BIERGARTEN_DATA_GENERATOR_H_

View File

@@ -0,0 +1,31 @@
#ifndef BIERGARTEN_PIPELINE_DATA_GENERATION_DATA_DOWNLOADER_H_
#define BIERGARTEN_PIPELINE_DATA_GENERATION_DATA_DOWNLOADER_H_
#include <memory>
#include <stdexcept>
#include <string>
#include "web_client/web_client.h"
/// @brief Downloads and caches source geography JSON payloads.
class DataDownloader {
public:
/// @brief Initializes global curl state used by this downloader.
explicit DataDownloader(std::shared_ptr<WebClient> web_client);
/// @brief Cleans up global curl state.
~DataDownloader();
/// @brief Returns a local JSON path, downloading it when cache is missing.
std::string DownloadCountriesDatabase(
const std::string& cache_path,
const std::string& commit =
"c5eb7772" // Stable commit: 2026-03-28 export
);
private:
static bool FileExists(const std::string& file_path);
std::shared_ptr<WebClient> web_client_;
};
#endif // BIERGARTEN_PIPELINE_DATA_GENERATION_DATA_DOWNLOADER_H_

View File

@@ -0,0 +1,29 @@
#ifndef BIERGARTEN_PIPELINE_DATA_GENERATION_DATA_GENERATOR_H_
#define BIERGARTEN_PIPELINE_DATA_GENERATION_DATA_GENERATOR_H_
#include <string>
struct BreweryResult {
std::string name;
std::string description;
};
struct UserResult {
std::string username;
std::string bio;
};
class DataGenerator {
public:
virtual ~DataGenerator() = default;
virtual void Load(const std::string& model_path) = 0;
virtual BreweryResult GenerateBrewery(const std::string& city_name,
const std::string& country_name,
const std::string& region_context) = 0;
virtual UserResult GenerateUser(const std::string& locale) = 0;
};
#endif // BIERGARTEN_PIPELINE_DATA_GENERATION_DATA_GENERATOR_H_

View File

@@ -0,0 +1,44 @@
#ifndef BIERGARTEN_PIPELINE_DATA_GENERATION_LLAMA_GENERATOR_H_
#define BIERGARTEN_PIPELINE_DATA_GENERATION_LLAMA_GENERATOR_H_
#include <cstdint>
#include <string>
#include "data_generation/data_generator.h"
struct llama_model;
struct llama_context;
class LlamaGenerator final : public DataGenerator {
public:
LlamaGenerator() = default;
~LlamaGenerator() override;
void SetSamplingOptions(float temperature, float top_p, int seed = -1);
void Load(const std::string& model_path) override;
BreweryResult GenerateBrewery(const std::string& city_name,
const std::string& country_name,
const std::string& region_context) override;
UserResult GenerateUser(const std::string& locale) override;
private:
std::string Infer(const std::string& prompt, int max_tokens = 10000);
// Overload that allows passing a system message separately so chat-capable
// models receive a proper system role instead of having the system text
// concatenated into the user prompt (helps avoid revealing internal
// reasoning or instructions in model output).
std::string Infer(const std::string& system_prompt,
const std::string& prompt, int max_tokens = 10000);
std::string InferFormatted(const std::string& formatted_prompt,
int max_tokens = 10000);
llama_model* model_ = nullptr;
llama_context* context_ = nullptr;
float sampling_temperature_ = 0.8f;
float sampling_top_p_ = 0.92f;
uint32_t sampling_seed_ = 0xFFFFFFFFu;
};
#endif // BIERGARTEN_PIPELINE_DATA_GENERATION_LLAMA_GENERATOR_H_

View File

@@ -0,0 +1,32 @@
#ifndef BIERGARTEN_PIPELINE_DATA_GENERATION_LLAMA_GENERATOR_HELPERS_H_
#define BIERGARTEN_PIPELINE_DATA_GENERATION_LLAMA_GENERATOR_HELPERS_H_
#include <string>
#include <utility>
struct llama_model;
struct llama_vocab;
typedef int llama_token;
// Helper functions for LlamaGenerator methods
std::string PrepareRegionContextPublic(std::string_view region_context,
std::size_t max_chars = 700);
std::pair<std::string, std::string> ParseTwoLineResponsePublic(
const std::string& raw, const std::string& error_message);
std::string ToChatPromptPublic(const llama_model* model,
const std::string& user_prompt);
std::string ToChatPromptPublic(const llama_model* model,
const std::string& system_prompt,
const std::string& user_prompt);
void AppendTokenPiecePublic(const llama_vocab* vocab, llama_token token,
std::string& output);
std::string ValidateBreweryJsonPublic(const std::string& raw,
std::string& name_out,
std::string& description_out);
#endif // BIERGARTEN_PIPELINE_DATA_GENERATION_LLAMA_GENERATOR_HELPERS_H_

View File

@@ -0,0 +1,28 @@
#ifndef BIERGARTEN_PIPELINE_DATA_GENERATION_MOCK_GENERATOR_H_
#define BIERGARTEN_PIPELINE_DATA_GENERATION_MOCK_GENERATOR_H_
#include <string>
#include <vector>
#include "data_generation/data_generator.h"
class MockGenerator final : public DataGenerator {
public:
void Load(const std::string& model_path) override;
BreweryResult GenerateBrewery(const std::string& city_name,
const std::string& country_name,
const std::string& region_context) override;
UserResult GenerateUser(const std::string& locale) override;
private:
static std::size_t DeterministicHash(const std::string& a,
const std::string& b);
static const std::vector<std::string> kBreweryAdjectives;
static const std::vector<std::string> kBreweryNouns;
static const std::vector<std::string> kBreweryDescriptions;
static const std::vector<std::string> kUsernames;
static const std::vector<std::string> kBios;
};
#endif // BIERGARTEN_PIPELINE_DATA_GENERATION_MOCK_GENERATOR_H_

View File

@@ -0,0 +1,84 @@
#ifndef BIERGARTEN_PIPELINE_DATABASE_DATABASE_H_
#define BIERGARTEN_PIPELINE_DATABASE_DATABASE_H_
#include <sqlite3.h>
#include <mutex>
#include <string>
#include <vector>
struct Country {
/// @brief Country identifier from the source dataset.
int id;
/// @brief Country display name.
std::string name;
/// @brief ISO 3166-1 alpha-2 code.
std::string iso2;
/// @brief ISO 3166-1 alpha-3 code.
std::string iso3;
};
struct State {
/// @brief State or province identifier from the source dataset.
int id;
/// @brief State or province display name.
std::string name;
/// @brief State or province short code.
std::string iso2;
/// @brief Parent country identifier.
int country_id;
};
struct City {
/// @brief City identifier from the source dataset.
int id;
/// @brief City display name.
std::string name;
/// @brief Parent country identifier.
int country_id;
};
/// @brief Thread-safe SQLite wrapper for pipeline writes and readbacks.
class SqliteDatabase {
private:
sqlite3* db_ = nullptr;
std::mutex db_mutex_;
void InitializeSchema();
public:
/// @brief Closes the SQLite connection if initialized.
~SqliteDatabase();
/// @brief Opens the SQLite database at db_path and creates schema objects.
void Initialize(const std::string& db_path = ":memory:");
/// @brief Starts a database transaction for batched writes.
void BeginTransaction();
/// @brief Commits the active database transaction.
void CommitTransaction();
/// @brief Inserts a country row.
void InsertCountry(int id, const std::string& name, const std::string& iso2,
const std::string& iso3);
/// @brief Inserts a state row linked to a country.
void InsertState(int id, int country_id, const std::string& name,
const std::string& iso2);
/// @brief Inserts a city row linked to state and country.
void InsertCity(int id, int state_id, int country_id,
const std::string& name, double latitude, double longitude);
/// @brief Returns city records including parent country id.
std::vector<City> QueryCities();
/// @brief Returns countries with optional row limit.
std::vector<Country> QueryCountries(int limit = 0);
/// @brief Returns states with optional row limit.
std::vector<State> QueryStates(int limit = 0);
};
#endif // BIERGARTEN_PIPELINE_DATABASE_DATABASE_H_

View File

@@ -0,0 +1,17 @@
#ifndef BIERGARTEN_PIPELINE_JSON_HANDLING_JSON_LOADER_H_
#define BIERGARTEN_PIPELINE_JSON_HANDLING_JSON_LOADER_H_
#include <string>
#include "database/database.h"
#include "json_handling/stream_parser.h"
/// @brief Loads world-city JSON data into SQLite through streaming parsing.
class JsonLoader {
public:
/// @brief Parses a JSON file and writes country/state/city rows into db.
static void LoadWorldCities(const std::string& json_path,
SqliteDatabase& db);
};
#endif // BIERGARTEN_PIPELINE_JSON_HANDLING_JSON_LOADER_H_

View File

@@ -0,0 +1,52 @@
#ifndef BIERGARTEN_PIPELINE_JSON_HANDLING_STREAM_PARSER_H_
#define BIERGARTEN_PIPELINE_JSON_HANDLING_STREAM_PARSER_H_
#include <functional>
#include <string>
#include "database/database.h"
// Forward declaration to avoid circular dependency
class SqliteDatabase;
/// @brief In-memory representation of one parsed city entry.
struct CityRecord {
int id;
int state_id;
int country_id;
std::string name;
double latitude;
double longitude;
};
/// @brief Streaming SAX parser that emits city records during traversal.
class StreamingJsonParser {
public:
/// @brief Parses file_path and invokes callbacks for city rows and progress.
static void Parse(const std::string& file_path, SqliteDatabase& db,
std::function<void(const CityRecord&)> on_city,
std::function<void(size_t, size_t)> on_progress = nullptr);
private:
/// @brief Mutable SAX handler state while traversing nested JSON arrays.
struct ParseState {
int current_country_id = 0;
int current_state_id = 0;
CityRecord current_city = {};
bool building_city = false;
std::string current_key;
int array_depth = 0;
int object_depth = 0;
bool in_countries_array = false;
bool in_states_array = false;
bool in_cities_array = false;
std::function<void(const CityRecord&)> on_city;
std::function<void(size_t, size_t)> on_progress;
size_t bytes_processed = 0;
};
};
#endif // BIERGARTEN_PIPELINE_JSON_HANDLING_STREAM_PARSER_H_

View File

@@ -0,0 +1,30 @@
#ifndef BIERGARTEN_PIPELINE_WEB_CLIENT_CURL_WEB_CLIENT_H_
#define BIERGARTEN_PIPELINE_WEB_CLIENT_CURL_WEB_CLIENT_H_
#include <memory>
#include "web_client/web_client.h"
// RAII for curl_global_init/cleanup.
// An instance of this class should be created in main() before any curl
// operations and exist for the lifetime of the application.
class CurlGlobalState {
public:
CurlGlobalState();
~CurlGlobalState();
CurlGlobalState(const CurlGlobalState&) = delete;
CurlGlobalState& operator=(const CurlGlobalState&) = delete;
};
class CURLWebClient : public WebClient {
public:
CURLWebClient();
~CURLWebClient() override;
void DownloadToFile(const std::string& url,
const std::string& file_path) override;
std::string Get(const std::string& url) override;
std::string UrlEncode(const std::string& value) override;
};
#endif // BIERGARTEN_PIPELINE_WEB_CLIENT_CURL_WEB_CLIENT_H_

View File

@@ -0,0 +1,22 @@
#ifndef BIERGARTEN_PIPELINE_WEB_CLIENT_WEB_CLIENT_H_
#define BIERGARTEN_PIPELINE_WEB_CLIENT_WEB_CLIENT_H_
#include <string>
class WebClient {
public:
virtual ~WebClient() = default;
// Downloads content from a URL to a file. Throws on error.
virtual void DownloadToFile(const std::string& url,
const std::string& file_path) = 0;
// Performs a GET request and returns the response body as a string. Throws
// on error.
virtual std::string Get(const std::string& url) = 0;
// URL-encodes a string.
virtual std::string UrlEncode(const std::string& value) = 0;
};
#endif // BIERGARTEN_PIPELINE_WEB_CLIENT_WEB_CLIENT_H_

View File

@@ -0,0 +1,27 @@
#ifndef BIERGARTEN_PIPELINE_WIKIPEDIA_WIKIPEDIA_SERVICE_H_
#define BIERGARTEN_PIPELINE_WIKIPEDIA_WIKIPEDIA_SERVICE_H_
#include <memory>
#include <string>
#include <string_view>
#include <unordered_map>
#include "web_client/web_client.h"
/// @brief Provides cached Wikipedia summary lookups for city and country pairs.
class WikipediaService {
public:
/// @brief Creates a new Wikipedia service with the provided web client.
explicit WikipediaService(std::shared_ptr<WebClient> client);
/// @brief Returns the Wikipedia summary extract for city and country.
[[nodiscard]] std::string GetSummary(std::string_view city,
std::string_view country);
private:
std::string FetchExtract(std::string_view query);
std::shared_ptr<WebClient> client_;
std::unordered_map<std::string, std::string> cache_;
};
#endif // BIERGARTEN_PIPELINE_WIKIPEDIA_WIKIPEDIA_SERVICE_H_

View File

@@ -0,0 +1,157 @@
#include "biergarten_data_generator.h"
#include <spdlog/spdlog.h>
#include <algorithm>
#include <filesystem>
#include <unordered_map>
#include "data_generation/data_downloader.h"
#include "data_generation/llama_generator.h"
#include "data_generation/mock_generator.h"
#include "json_handling/json_loader.h"
#include "wikipedia/wikipedia_service.h"
BiergartenDataGenerator::BiergartenDataGenerator(
const ApplicationOptions& options, std::shared_ptr<WebClient> web_client,
SqliteDatabase& database)
: options_(options), webClient_(web_client), database_(database) {}
std::unique_ptr<DataGenerator> BiergartenDataGenerator::InitializeGenerator() {
spdlog::info("Initializing brewery generator...");
std::unique_ptr<DataGenerator> generator;
if (options_.model_path.empty()) {
generator = std::make_unique<MockGenerator>();
spdlog::info("[Generator] Using MockGenerator (no model path provided)");
} else {
auto llama_generator = std::make_unique<LlamaGenerator>();
llama_generator->SetSamplingOptions(options_.temperature, options_.top_p,
options_.seed);
spdlog::info(
"[Generator] Using LlamaGenerator: {} (temperature={}, top-p={}, "
"seed={})",
options_.model_path, options_.temperature, options_.top_p,
options_.seed);
generator = std::move(llama_generator);
}
generator->Load(options_.model_path);
return generator;
}
void BiergartenDataGenerator::LoadGeographicData() {
std::string json_path = options_.cache_dir + "/countries+states+cities.json";
std::string db_path = options_.cache_dir + "/biergarten-pipeline.db";
bool has_json_cache = std::filesystem::exists(json_path);
bool has_db_cache = std::filesystem::exists(db_path);
spdlog::info("Initializing SQLite database at {}...", db_path);
database_.Initialize(db_path);
if (has_db_cache && has_json_cache) {
spdlog::info("[Pipeline] Cache hit: skipping download and parse");
} else {
spdlog::info("\n[Pipeline] Downloading geographic data from GitHub...");
DataDownloader downloader(webClient_);
downloader.DownloadCountriesDatabase(json_path, options_.commit);
JsonLoader::LoadWorldCities(json_path, database_);
}
}
std::vector<std::pair<City, std::string>>
BiergartenDataGenerator::QueryCitiesWithCountries() {
spdlog::info("\n=== GEOGRAPHIC DATA OVERVIEW ===");
auto cities = database_.QueryCities();
// Build a quick map of country id -> name for per-city lookups.
auto all_countries = database_.QueryCountries(0);
std::unordered_map<int, std::string> country_map;
for (const auto& c : all_countries) {
country_map[c.id] = c.name;
}
spdlog::info("\nTotal records loaded:");
spdlog::info(" Countries: {}", database_.QueryCountries(0).size());
spdlog::info(" States: {}", database_.QueryStates(0).size());
spdlog::info(" Cities: {}", cities.size());
// Cap at 30 entries.
const size_t sample_count = std::min(size_t(30), cities.size());
std::vector<std::pair<City, std::string>> result;
for (size_t i = 0; i < sample_count; i++) {
const auto& city = cities[i];
std::string country_name;
const auto country_it = country_map.find(city.country_id);
if (country_it != country_map.end()) {
country_name = country_it->second;
}
result.push_back({city, country_name});
}
return result;
}
std::vector<BiergartenDataGenerator::EnrichedCity>
BiergartenDataGenerator::EnrichWithWikipedia(
const std::vector<std::pair<City, std::string>>& cities) {
WikipediaService wikipedia_service(webClient_);
std::vector<EnrichedCity> enriched;
for (const auto& [city, country_name] : cities) {
const std::string region_context =
wikipedia_service.GetSummary(city.name, country_name);
spdlog::debug("[Pipeline] Region context for {}: {}", city.name,
region_context);
enriched.push_back({city.id, city.name, country_name, region_context});
}
return enriched;
}
void BiergartenDataGenerator::GenerateBreweries(
DataGenerator& generator, const std::vector<EnrichedCity>& cities) {
spdlog::info("\n=== SAMPLE BREWERY GENERATION ===");
generatedBreweries_.clear();
for (const auto& enriched_city : cities) {
auto brewery = generator.GenerateBrewery(enriched_city.city_name,
enriched_city.country_name,
enriched_city.region_context);
generatedBreweries_.push_back(
{enriched_city.city_id, enriched_city.city_name, brewery});
}
}
void BiergartenDataGenerator::LogResults() const {
spdlog::info("\n=== GENERATED DATA DUMP ===");
for (size_t i = 0; i < generatedBreweries_.size(); i++) {
const auto& entry = generatedBreweries_[i];
spdlog::info("{}. city_id={} city=\"{}\"", i + 1, entry.city_id,
entry.city_name);
spdlog::info(" brewery_name=\"{}\"", entry.brewery.name);
spdlog::info(" brewery_description=\"{}\"", entry.brewery.description);
}
}
int BiergartenDataGenerator::Run() {
try {
LoadGeographicData();
auto generator = InitializeGenerator();
auto cities = QueryCitiesWithCountries();
auto enriched = EnrichWithWikipedia(cities);
GenerateBreweries(*generator, enriched);
LogResults();
spdlog::info("\nOK: Pipeline completed successfully");
return 0;
} catch (const std::exception& e) {
spdlog::error("ERROR: Pipeline failed: {}", e.what());
return 1;
}
}

View File

@@ -0,0 +1,49 @@
#include "data_generation/data_downloader.h"
#include <spdlog/spdlog.h>
#include <filesystem>
#include <fstream>
#include <sstream>
#include <stdexcept>
#include "web_client/web_client.h"
DataDownloader::DataDownloader(std::shared_ptr<WebClient> web_client)
: web_client_(std::move(web_client)) {}
DataDownloader::~DataDownloader() {}
bool DataDownloader::FileExists(const std::string& file_path) {
return std::filesystem::exists(file_path);
}
std::string DataDownloader::DownloadCountriesDatabase(
const std::string& cache_path, const std::string& commit) {
if (FileExists(cache_path)) {
spdlog::info("[DataDownloader] Cache hit: {}", cache_path);
return cache_path;
}
std::string short_commit = commit;
if (commit.length() > 7) {
short_commit = commit.substr(0, 7);
}
std::string url =
"https://raw.githubusercontent.com/dr5hn/"
"countries-states-cities-database/" +
short_commit + "/json/countries+states+cities.json";
spdlog::info("[DataDownloader] Downloading: {}", url);
web_client_->DownloadToFile(url, cache_path);
std::ifstream file_check(cache_path, std::ios::binary | std::ios::ate);
std::streamsize size = file_check.tellg();
file_check.close();
spdlog::info("[DataDownloader] OK: Download complete: {} ({:.2f} MB)",
cache_path, (size / (1024.0 * 1024.0)));
return cache_path;
}

View File

@@ -0,0 +1,16 @@
#include "data_generation/llama_generator.h"
#include "llama.h"
LlamaGenerator::~LlamaGenerator() {
if (context_ != nullptr) {
llama_free(context_);
context_ = nullptr;
}
if (model_ != nullptr) {
llama_model_free(model_);
model_ = nullptr;
}
llama_backend_free();
}

View File

@@ -0,0 +1,74 @@
#include <spdlog/spdlog.h>
#include <stdexcept>
#include <string>
#include "data_generation/llama_generator.h"
#include "data_generation/llama_generator_helpers.h"
BreweryResult LlamaGenerator::GenerateBrewery(
const std::string& city_name, const std::string& country_name,
const std::string& region_context) {
const std::string safe_region_context =
PrepareRegionContextPublic(region_context);
const std::string system_prompt =
"You are the brewmaster and owner of a local craft brewery. "
"Write a name and a short, soulful description for your brewery that "
"reflects your pride in the local community and your craft. "
"The tone should be authentic and welcoming, like a note on a "
"chalkboard "
"menu. Output ONLY a single JSON object with keys \"name\" and "
"\"description\". "
"Do not include markdown formatting or backticks.";
std::string prompt =
"Write a brewery name and place-specific long description for a craft "
"brewery in " +
city_name +
(country_name.empty() ? std::string("")
: std::string(", ") + country_name) +
(safe_region_context.empty()
? std::string(".")
: std::string(". Regional context: ") + safe_region_context);
const int max_attempts = 3;
std::string raw;
std::string last_error;
for (int attempt = 0; attempt < max_attempts; ++attempt) {
raw = Infer(system_prompt, prompt, 384);
spdlog::debug("LlamaGenerator: raw output (attempt {}): {}", attempt + 1,
raw);
std::string name;
std::string description;
const std::string validation_error =
ValidateBreweryJsonPublic(raw, name, description);
if (validation_error.empty()) {
return {std::move(name), std::move(description)};
}
last_error = validation_error;
spdlog::warn("LlamaGenerator: malformed brewery JSON (attempt {}): {}",
attempt + 1, validation_error);
prompt =
"Your previous response was invalid. Error: " + validation_error +
"\nReturn ONLY valid JSON with this exact schema: "
"{\"name\": \"string\", \"description\": \"string\"}."
"\nDo not include markdown, comments, or extra keys."
"\n\nLocation: " +
city_name +
(country_name.empty() ? std::string("")
: std::string(", ") + country_name) +
(safe_region_context.empty()
? std::string("")
: std::string("\nRegional context: ") + safe_region_context);
}
spdlog::error(
"LlamaGenerator: malformed brewery response after {} attempts: "
"{}",
max_attempts, last_error.empty() ? raw : last_error);
throw std::runtime_error("LlamaGenerator: malformed brewery response");
}

View File

@@ -0,0 +1,57 @@
#include <spdlog/spdlog.h>
#include <algorithm>
#include <stdexcept>
#include <string>
#include "data_generation/llama_generator.h"
#include "data_generation/llama_generator_helpers.h"
UserResult LlamaGenerator::GenerateUser(const std::string& locale) {
const std::string system_prompt =
"You generate plausible social media profiles for craft beer "
"enthusiasts. "
"Respond with exactly two lines: "
"the first line is a username (lowercase, no spaces, 8-20 characters), "
"the second line is a one-sentence bio (20-40 words). "
"The profile should feel consistent with the locale. "
"No preamble, no labels.";
std::string prompt =
"Generate a craft beer enthusiast profile. Locale: " + locale;
const int max_attempts = 3;
std::string raw;
for (int attempt = 0; attempt < max_attempts; ++attempt) {
raw = Infer(system_prompt, prompt, 128);
spdlog::debug("LlamaGenerator (user): raw output (attempt {}): {}",
attempt + 1, raw);
try {
auto [username, bio] = ParseTwoLineResponsePublic(
raw, "LlamaGenerator: malformed user response");
username.erase(
std::remove_if(username.begin(), username.end(),
[](unsigned char ch) { return std::isspace(ch); }),
username.end());
if (username.empty() || bio.empty()) {
throw std::runtime_error("LlamaGenerator: malformed user response");
}
if (bio.size() > 200) bio = bio.substr(0, 200);
return {username, bio};
} catch (const std::exception& e) {
spdlog::warn(
"LlamaGenerator: malformed user response (attempt {}): {}",
attempt + 1, e.what());
}
}
spdlog::error(
"LlamaGenerator: malformed user response after {} attempts: {}",
max_attempts, raw);
throw std::runtime_error("LlamaGenerator: malformed user response");
}

View File

@@ -0,0 +1,398 @@
#include <algorithm>
#include <array>
#include <boost/json.hpp>
#include <cctype>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>
#include "data_generation/llama_generator.h"
#include "llama.h"
namespace {
std::string Trim(std::string value) {
auto not_space = [](unsigned char ch) { return !std::isspace(ch); };
value.erase(value.begin(),
std::find_if(value.begin(), value.end(), not_space));
value.erase(std::find_if(value.rbegin(), value.rend(), not_space).base(),
value.end());
return value;
}
std::string CondenseWhitespace(std::string text) {
std::string out;
out.reserve(text.size());
bool in_whitespace = false;
for (unsigned char ch : text) {
if (std::isspace(ch)) {
if (!in_whitespace) {
out.push_back(' ');
in_whitespace = true;
}
continue;
}
in_whitespace = false;
out.push_back(static_cast<char>(ch));
}
return Trim(std::move(out));
}
std::string PrepareRegionContext(std::string_view region_context,
std::size_t max_chars) {
std::string normalized = CondenseWhitespace(std::string(region_context));
if (normalized.size() <= max_chars) {
return normalized;
}
normalized.resize(max_chars);
const std::size_t last_space = normalized.find_last_of(' ');
if (last_space != std::string::npos && last_space > max_chars / 2) {
normalized.resize(last_space);
}
normalized += "...";
return normalized;
}
std::string StripCommonPrefix(std::string line) {
line = Trim(std::move(line));
if (!line.empty() && (line[0] == '-' || line[0] == '*')) {
line = Trim(line.substr(1));
} else {
std::size_t i = 0;
while (i < line.size() &&
std::isdigit(static_cast<unsigned char>(line[i]))) {
++i;
}
if (i > 0 && i < line.size() && (line[i] == '.' || line[i] == ')')) {
line = Trim(line.substr(i + 1));
}
}
auto strip_label = [&line](const std::string& label) {
if (line.size() >= label.size()) {
bool matches = true;
for (std::size_t i = 0; i < label.size(); ++i) {
if (std::tolower(static_cast<unsigned char>(line[i])) !=
std::tolower(static_cast<unsigned char>(label[i]))) {
matches = false;
break;
}
}
if (matches) {
line = Trim(line.substr(label.size()));
}
}
};
strip_label("name:");
strip_label("brewery name:");
strip_label("description:");
strip_label("username:");
strip_label("bio:");
return Trim(std::move(line));
}
std::pair<std::string, std::string> ParseTwoLineResponse(
const std::string& raw, const std::string& error_message) {
std::string normalized = raw;
std::replace(normalized.begin(), normalized.end(), '\r', '\n');
std::vector<std::string> lines;
std::stringstream stream(normalized);
std::string line;
while (std::getline(stream, line)) {
line = StripCommonPrefix(std::move(line));
if (!line.empty()) lines.push_back(std::move(line));
}
std::vector<std::string> filtered;
for (auto& l : lines) {
std::string low = l;
std::transform(low.begin(), low.end(), low.begin(), [](unsigned char c) {
return static_cast<char>(std::tolower(c));
});
if (!l.empty() && l.front() == '<' && low.back() == '>') continue;
if (low.rfind("okay,", 0) == 0 || low.rfind("hmm", 0) == 0) continue;
filtered.push_back(std::move(l));
}
if (filtered.size() < 2) throw std::runtime_error(error_message);
std::string first = Trim(filtered.front());
std::string second;
for (size_t i = 1; i < filtered.size(); ++i) {
if (!second.empty()) second += ' ';
second += filtered[i];
}
second = Trim(std::move(second));
if (first.empty() || second.empty()) throw std::runtime_error(error_message);
return {first, second};
}
std::string ToChatPrompt(const llama_model* model,
const std::string& user_prompt) {
const char* tmpl = llama_model_chat_template(model, nullptr);
if (tmpl == nullptr) {
return user_prompt;
}
const llama_chat_message message{"user", user_prompt.c_str()};
std::vector<char> buffer(
std::max<std::size_t>(1024, user_prompt.size() * 4));
int32_t required =
llama_chat_apply_template(tmpl, &message, 1, true, buffer.data(),
static_cast<int32_t>(buffer.size()));
if (required < 0) {
throw std::runtime_error("LlamaGenerator: failed to apply chat template");
}
if (required >= static_cast<int32_t>(buffer.size())) {
buffer.resize(static_cast<std::size_t>(required) + 1);
required =
llama_chat_apply_template(tmpl, &message, 1, true, buffer.data(),
static_cast<int32_t>(buffer.size()));
if (required < 0) {
throw std::runtime_error(
"LlamaGenerator: failed to apply chat template");
}
}
return std::string(buffer.data(), static_cast<std::size_t>(required));
}
std::string ToChatPrompt(const llama_model* model,
const std::string& system_prompt,
const std::string& user_prompt) {
const char* tmpl = llama_model_chat_template(model, nullptr);
if (tmpl == nullptr) {
return system_prompt + "\n\n" + user_prompt;
}
const llama_chat_message messages[2] = {{"system", system_prompt.c_str()},
{"user", user_prompt.c_str()}};
std::vector<char> buffer(std::max<std::size_t>(
1024, (system_prompt.size() + user_prompt.size()) * 4));
int32_t required =
llama_chat_apply_template(tmpl, messages, 2, true, buffer.data(),
static_cast<int32_t>(buffer.size()));
if (required < 0) {
throw std::runtime_error("LlamaGenerator: failed to apply chat template");
}
if (required >= static_cast<int32_t>(buffer.size())) {
buffer.resize(static_cast<std::size_t>(required) + 1);
required =
llama_chat_apply_template(tmpl, messages, 2, true, buffer.data(),
static_cast<int32_t>(buffer.size()));
if (required < 0) {
throw std::runtime_error(
"LlamaGenerator: failed to apply chat template");
}
}
return std::string(buffer.data(), static_cast<std::size_t>(required));
}
void AppendTokenPiece(const llama_vocab* vocab, llama_token token,
std::string& output) {
std::array<char, 256> buffer{};
int32_t bytes =
llama_token_to_piece(vocab, token, buffer.data(),
static_cast<int32_t>(buffer.size()), 0, true);
if (bytes < 0) {
std::vector<char> dynamic_buffer(static_cast<std::size_t>(-bytes));
bytes = llama_token_to_piece(vocab, token, dynamic_buffer.data(),
static_cast<int32_t>(dynamic_buffer.size()),
0, true);
if (bytes < 0) {
throw std::runtime_error(
"LlamaGenerator: failed to decode sampled token piece");
}
output.append(dynamic_buffer.data(), static_cast<std::size_t>(bytes));
return;
}
output.append(buffer.data(), static_cast<std::size_t>(bytes));
}
bool ExtractFirstJsonObject(const std::string& text, std::string& json_out) {
std::size_t start = std::string::npos;
int depth = 0;
bool in_string = false;
bool escaped = false;
for (std::size_t i = 0; i < text.size(); ++i) {
const char ch = text[i];
if (in_string) {
if (escaped) {
escaped = false;
} else if (ch == '\\') {
escaped = true;
} else if (ch == '"') {
in_string = false;
}
continue;
}
if (ch == '"') {
in_string = true;
continue;
}
if (ch == '{') {
if (depth == 0) {
start = i;
}
++depth;
continue;
}
if (ch == '}') {
if (depth == 0) {
continue;
}
--depth;
if (depth == 0 && start != std::string::npos) {
json_out = text.substr(start, i - start + 1);
return true;
}
}
}
return false;
}
std::string ValidateBreweryJson(const std::string& raw, std::string& name_out,
std::string& description_out) {
auto validate_object = [&](const boost::json::value& jv,
std::string& error_out) -> bool {
if (!jv.is_object()) {
error_out = "JSON root must be an object";
return false;
}
const auto& obj = jv.get_object();
if (!obj.contains("name") || !obj.at("name").is_string()) {
error_out = "JSON field 'name' is missing or not a string";
return false;
}
if (!obj.contains("description") || !obj.at("description").is_string()) {
error_out = "JSON field 'description' is missing or not a string";
return false;
}
name_out = Trim(std::string(obj.at("name").as_string().c_str()));
description_out =
Trim(std::string(obj.at("description").as_string().c_str()));
if (name_out.empty()) {
error_out = "JSON field 'name' must not be empty";
return false;
}
if (description_out.empty()) {
error_out = "JSON field 'description' must not be empty";
return false;
}
std::string name_lower = name_out;
std::string description_lower = description_out;
std::transform(
name_lower.begin(), name_lower.end(), name_lower.begin(),
[](unsigned char c) { return static_cast<char>(std::tolower(c)); });
std::transform(description_lower.begin(), description_lower.end(),
description_lower.begin(), [](unsigned char c) {
return static_cast<char>(std::tolower(c));
});
if (name_lower == "string" || description_lower == "string") {
error_out = "JSON appears to be a schema placeholder, not content";
return false;
}
error_out.clear();
return true;
};
boost::system::error_code ec;
boost::json::value jv = boost::json::parse(raw, ec);
std::string validation_error;
if (ec) {
std::string extracted;
if (!ExtractFirstJsonObject(raw, extracted)) {
return "JSON parse error: " + ec.message();
}
ec.clear();
jv = boost::json::parse(extracted, ec);
if (ec) {
return "JSON parse error: " + ec.message();
}
if (!validate_object(jv, validation_error)) {
return validation_error;
}
return {};
}
if (!validate_object(jv, validation_error)) {
return validation_error;
}
return {};
}
} // namespace
// Forward declarations for helper functions exposed to other translation units
std::string PrepareRegionContextPublic(std::string_view region_context,
std::size_t max_chars) {
return PrepareRegionContext(region_context, max_chars);
}
std::pair<std::string, std::string> ParseTwoLineResponsePublic(
const std::string& raw, const std::string& error_message) {
return ParseTwoLineResponse(raw, error_message);
}
std::string ToChatPromptPublic(const llama_model* model,
const std::string& user_prompt) {
return ToChatPrompt(model, user_prompt);
}
std::string ToChatPromptPublic(const llama_model* model,
const std::string& system_prompt,
const std::string& user_prompt) {
return ToChatPrompt(model, system_prompt, user_prompt);
}
void AppendTokenPiecePublic(const llama_vocab* vocab, llama_token token,
std::string& output) {
AppendTokenPiece(vocab, token, output);
}
std::string ValidateBreweryJsonPublic(const std::string& raw,
std::string& name_out,
std::string& description_out) {
return ValidateBreweryJson(raw, name_out, description_out);
}

View File

@@ -0,0 +1,111 @@
#include <spdlog/spdlog.h>
#include <algorithm>
#include <memory>
#include <stdexcept>
#include <string>
#include <vector>
#include "data_generation/llama_generator.h"
#include "data_generation/llama_generator_helpers.h"
#include "llama.h"
std::string LlamaGenerator::Infer(const std::string& prompt, int max_tokens) {
return InferFormatted(ToChatPromptPublic(model_, prompt), max_tokens);
}
std::string LlamaGenerator::Infer(const std::string& system_prompt,
const std::string& prompt, int max_tokens) {
return InferFormatted(ToChatPromptPublic(model_, system_prompt, prompt),
max_tokens);
}
std::string LlamaGenerator::InferFormatted(const std::string& formatted_prompt,
int max_tokens) {
if (model_ == nullptr || context_ == nullptr)
throw std::runtime_error("LlamaGenerator: model not loaded");
const llama_vocab* vocab = llama_model_get_vocab(model_);
if (vocab == nullptr)
throw std::runtime_error("LlamaGenerator: vocab unavailable");
llama_memory_clear(llama_get_memory(context_), true);
std::vector<llama_token> prompt_tokens(formatted_prompt.size() + 8);
int32_t token_count = llama_tokenize(
vocab, formatted_prompt.c_str(),
static_cast<int32_t>(formatted_prompt.size()), prompt_tokens.data(),
static_cast<int32_t>(prompt_tokens.size()), true, true);
if (token_count < 0) {
prompt_tokens.resize(static_cast<std::size_t>(-token_count));
token_count = llama_tokenize(
vocab, formatted_prompt.c_str(),
static_cast<int32_t>(formatted_prompt.size()), prompt_tokens.data(),
static_cast<int32_t>(prompt_tokens.size()), true, true);
}
if (token_count < 0)
throw std::runtime_error("LlamaGenerator: prompt tokenization failed");
const int32_t n_ctx = static_cast<int32_t>(llama_n_ctx(context_));
const int32_t n_batch = static_cast<int32_t>(llama_n_batch(context_));
if (n_ctx <= 1 || n_batch <= 0)
throw std::runtime_error("LlamaGenerator: invalid context or batch size");
const int32_t effective_max_tokens =
std::max(1, std::min(max_tokens, n_ctx - 1));
int32_t prompt_budget = std::min(n_batch, n_ctx - effective_max_tokens);
prompt_budget = std::max<int32_t>(1, prompt_budget);
prompt_tokens.resize(static_cast<std::size_t>(token_count));
if (token_count > prompt_budget) {
spdlog::warn(
"LlamaGenerator: prompt too long ({} tokens), truncating to {} "
"tokens to fit n_batch/n_ctx limits",
token_count, prompt_budget);
prompt_tokens.resize(static_cast<std::size_t>(prompt_budget));
token_count = prompt_budget;
}
const llama_batch prompt_batch = llama_batch_get_one(
prompt_tokens.data(), static_cast<int32_t>(prompt_tokens.size()));
if (llama_decode(context_, prompt_batch) != 0)
throw std::runtime_error("LlamaGenerator: prompt decode failed");
llama_sampler_chain_params sampler_params =
llama_sampler_chain_default_params();
using SamplerPtr =
std::unique_ptr<llama_sampler, decltype(&llama_sampler_free)>;
SamplerPtr sampler(llama_sampler_chain_init(sampler_params),
&llama_sampler_free);
if (!sampler)
throw std::runtime_error("LlamaGenerator: failed to initialize sampler");
llama_sampler_chain_add(sampler.get(),
llama_sampler_init_temp(sampling_temperature_));
llama_sampler_chain_add(sampler.get(),
llama_sampler_init_top_p(sampling_top_p_, 1));
llama_sampler_chain_add(sampler.get(),
llama_sampler_init_dist(sampling_seed_));
std::vector<llama_token> generated_tokens;
generated_tokens.reserve(static_cast<std::size_t>(effective_max_tokens));
for (int i = 0; i < effective_max_tokens; ++i) {
const llama_token next =
llama_sampler_sample(sampler.get(), context_, -1);
if (llama_vocab_is_eog(vocab, next)) break;
generated_tokens.push_back(next);
llama_token token = next;
const llama_batch one_token_batch = llama_batch_get_one(&token, 1);
if (llama_decode(context_, one_token_batch) != 0)
throw std::runtime_error(
"LlamaGenerator: decode failed during generation");
}
std::string output;
for (const llama_token token : generated_tokens)
AppendTokenPiecePublic(vocab, token, output);
return output;
}

View File

@@ -0,0 +1,42 @@
#include <spdlog/spdlog.h>
#include <stdexcept>
#include <string>
#include "data_generation/llama_generator.h"
#include "llama.h"
void LlamaGenerator::Load(const std::string& model_path) {
if (model_path.empty())
throw std::runtime_error("LlamaGenerator: model path must not be empty");
if (context_ != nullptr) {
llama_free(context_);
context_ = nullptr;
}
if (model_ != nullptr) {
llama_model_free(model_);
model_ = nullptr;
}
llama_backend_init();
llama_model_params model_params = llama_model_default_params();
model_ = llama_model_load_from_file(model_path.c_str(), model_params);
if (model_ == nullptr) {
throw std::runtime_error(
"LlamaGenerator: failed to load model from path: " + model_path);
}
llama_context_params context_params = llama_context_default_params();
context_params.n_ctx = 2048;
context_ = llama_init_from_model(model_, context_params);
if (context_ == nullptr) {
llama_model_free(model_);
model_ = nullptr;
throw std::runtime_error("LlamaGenerator: failed to create context");
}
spdlog::info("[LlamaGenerator] Loaded model: {}", model_path);
}

View File

@@ -0,0 +1,25 @@
#include <stdexcept>
#include "data_generation/llama_generator.h"
#include "llama.h"
void LlamaGenerator::SetSamplingOptions(float temperature, float top_p,
int seed) {
if (temperature < 0.0f) {
throw std::runtime_error(
"LlamaGenerator: sampling temperature must be >= 0");
}
if (!(top_p > 0.0f && top_p <= 1.0f)) {
throw std::runtime_error(
"LlamaGenerator: sampling top-p must be in (0, 1]");
}
if (seed < -1) {
throw std::runtime_error(
"LlamaGenerator: seed must be >= 0, or -1 for random");
}
sampling_temperature_ = temperature;
sampling_top_p_ = top_p;
sampling_seed_ = (seed < 0) ? static_cast<uint32_t>(LLAMA_DEFAULT_SEED)
: static_cast<uint32_t>(seed);
}

View File

@@ -0,0 +1,65 @@
#include <string>
#include <vector>
#include "data_generation/mock_generator.h"
const std::vector<std::string> MockGenerator::kBreweryAdjectives = {
"Craft", "Heritage", "Local", "Artisan", "Pioneer", "Golden",
"Modern", "Classic", "Summit", "Northern", "Riverstone", "Barrel",
"Hinterland", "Harbor", "Wild", "Granite", "Copper", "Maple"};
const std::vector<std::string> MockGenerator::kBreweryNouns = {
"Brewing Co.", "Brewery", "Bier Haus", "Taproom", "Works",
"House", "Fermentery", "Ale Co.", "Cellars", "Collective",
"Project", "Foundry", "Malthouse", "Public House", "Co-op",
"Lab", "Beer Hall", "Guild"};
const std::vector<std::string> MockGenerator::kBreweryDescriptions = {
"Handcrafted pale ales and seasonal IPAs with local ingredients.",
"Traditional lagers and experimental sours in small batches.",
"Award-winning stouts and wildly hoppy blonde ales.",
"Craft brewery specializing in Belgian-style triples and dark porters.",
"Modern brewery blending tradition with bold experimental flavors.",
"Neighborhood-focused taproom pouring crisp pilsners and citrusy pale "
"ales.",
"Small-batch brewery known for barrel-aged releases and smoky lagers.",
"Independent brewhouse pairing farmhouse ales with rotating food pop-ups.",
"Community brewpub making balanced bitters, saisons, and hazy IPAs.",
"Experimental nanobrewery exploring local yeast and regional grains.",
"Family-run brewery producing smooth amber ales and robust porters.",
"Urban brewery crafting clean lagers and bright, fruit-forward sours.",
"Riverfront brewhouse featuring oak-matured ales and seasonal blends.",
"Modern taproom focused on sessionable lagers and classic pub styles.",
"Brewery rooted in tradition with a lineup of malty reds and crisp lagers.",
"Creative brewery offering rotating collaborations and limited draft-only "
"pours.",
"Locally inspired brewery serving approachable ales with bold hop "
"character.",
"Destination taproom known for balanced IPAs and cocoa-rich stouts."};
const std::vector<std::string> MockGenerator::kUsernames = {
"hopseeker", "malttrail", "yeastwhisper", "lagerlane",
"barrelbound", "foamfinder", "taphunter", "graingeist",
"brewscout", "aleatlas", "caskcompass", "hopsandmaps",
"mashpilot", "pintnomad", "fermentfriend", "stoutsignal",
"sessionwander", "kettlekeeper"};
const std::vector<std::string> MockGenerator::kBios = {
"Always chasing balanced IPAs and crisp lagers across local taprooms.",
"Weekend brewery explorer with a soft spot for dark, roasty stouts.",
"Documenting tiny brewpubs, fresh pours, and unforgettable beer gardens.",
"Fan of farmhouse ales, food pairings, and long tasting flights.",
"Collecting favorite pilsners one city at a time.",
"Hops-first drinker who still saves room for classic malt-forward styles.",
"Finding hidden tap lists and sharing the best seasonal releases.",
"Brewery road-tripper focused on local ingredients and clean fermentation.",
"Always comparing house lagers and ranking patio pint vibes.",
"Curious about yeast strains, barrel programs, and cellar experiments.",
"Believes every neighborhood deserves a great community taproom.",
"Looking for session beers that taste great from first sip to last.",
"Belgian ale enthusiast who never skips a new saison.",
"Hazy IPA critic with deep respect for a perfectly clear pilsner.",
"Visits breweries for the stories, stays for the flagship pours.",
"Craft beer fan mapping tasting notes and favorite brew routes.",
"Always ready to trade recommendations for underrated local breweries.",
"Keeping a running list of must-try collab releases and tap takeovers."};

View File

@@ -0,0 +1,12 @@
#include <string>
#include "data_generation/mock_generator.h"
std::size_t MockGenerator::DeterministicHash(const std::string& a,
const std::string& b) {
std::size_t seed = std::hash<std::string>{}(a);
const std::size_t mixed = std::hash<std::string>{}(b);
seed ^= mixed + 0x9e3779b97f4a7c15ULL + (seed << 6) + (seed >> 2);
seed = (seed << 13) | (seed >> ((sizeof(std::size_t) * 8) - 13));
return seed;
}

View File

@@ -0,0 +1,21 @@
#include <functional>
#include <string>
#include "data_generation/mock_generator.h"
BreweryResult MockGenerator::GenerateBrewery(
const std::string& city_name, const std::string& country_name,
const std::string& region_context) {
const std::string location_key =
country_name.empty() ? city_name : city_name + "," + country_name;
const std::size_t hash =
region_context.empty() ? std::hash<std::string>{}(location_key)
: DeterministicHash(location_key, region_context);
BreweryResult result;
result.name = kBreweryAdjectives[hash % kBreweryAdjectives.size()] + " " +
kBreweryNouns[(hash / 7) % kBreweryNouns.size()];
result.description =
kBreweryDescriptions[(hash / 13) % kBreweryDescriptions.size()];
return result;
}

View File

@@ -0,0 +1,13 @@
#include <functional>
#include <string>
#include "data_generation/mock_generator.h"
UserResult MockGenerator::GenerateUser(const std::string& locale) {
const std::size_t hash = std::hash<std::string>{}(locale);
UserResult result;
result.username = kUsernames[hash % kUsernames.size()];
result.bio = kBios[(hash / 11) % kBios.size()];
return result;
}

View File

@@ -0,0 +1,9 @@
#include <spdlog/spdlog.h>
#include <string>
#include "data_generation/mock_generator.h"
void MockGenerator::Load(const std::string& /*modelPath*/) {
spdlog::info("[MockGenerator] No model needed");
}

View File

@@ -0,0 +1,253 @@
#include "database/database.h"
#include <spdlog/spdlog.h>
#include <stdexcept>
void SqliteDatabase::InitializeSchema() {
std::lock_guard<std::mutex> lock(db_mutex_);
const char* schema = R"(
CREATE TABLE IF NOT EXISTS countries (
id INTEGER PRIMARY KEY,
name TEXT NOT NULL,
iso2 TEXT,
iso3 TEXT
);
CREATE TABLE IF NOT EXISTS states (
id INTEGER PRIMARY KEY,
country_id INTEGER NOT NULL,
name TEXT NOT NULL,
iso2 TEXT,
FOREIGN KEY(country_id) REFERENCES countries(id)
);
CREATE TABLE IF NOT EXISTS cities (
id INTEGER PRIMARY KEY,
state_id INTEGER NOT NULL,
country_id INTEGER NOT NULL,
name TEXT NOT NULL,
latitude REAL,
longitude REAL,
FOREIGN KEY(state_id) REFERENCES states(id),
FOREIGN KEY(country_id) REFERENCES countries(id)
);
)";
char* errMsg = nullptr;
int rc = sqlite3_exec(db_, schema, nullptr, nullptr, &errMsg);
if (rc != SQLITE_OK) {
std::string error = errMsg ? std::string(errMsg) : "Unknown error";
sqlite3_free(errMsg);
throw std::runtime_error("Failed to create schema: " + error);
}
}
SqliteDatabase::~SqliteDatabase() {
if (db_) {
sqlite3_close(db_);
}
}
void SqliteDatabase::Initialize(const std::string& db_path) {
int rc = sqlite3_open(db_path.c_str(), &db_);
if (rc) {
throw std::runtime_error("Failed to open SQLite database: " + db_path);
}
spdlog::info("OK: SQLite database opened: {}", db_path);
InitializeSchema();
}
void SqliteDatabase::BeginTransaction() {
std::lock_guard<std::mutex> lock(db_mutex_);
char* err = nullptr;
if (sqlite3_exec(db_, "BEGIN TRANSACTION", nullptr, nullptr, &err) !=
SQLITE_OK) {
std::string msg = err ? err : "unknown";
sqlite3_free(err);
throw std::runtime_error("BeginTransaction failed: " + msg);
}
}
void SqliteDatabase::CommitTransaction() {
std::lock_guard<std::mutex> lock(db_mutex_);
char* err = nullptr;
if (sqlite3_exec(db_, "COMMIT", nullptr, nullptr, &err) != SQLITE_OK) {
std::string msg = err ? err : "unknown";
sqlite3_free(err);
throw std::runtime_error("CommitTransaction failed: " + msg);
}
}
void SqliteDatabase::InsertCountry(int id, const std::string& name,
const std::string& iso2,
const std::string& iso3) {
std::lock_guard<std::mutex> lock(db_mutex_);
const char* query = R"(
INSERT OR IGNORE INTO countries (id, name, iso2, iso3)
VALUES (?, ?, ?, ?)
)";
sqlite3_stmt* stmt;
int rc = sqlite3_prepare_v2(db_, query, -1, &stmt, nullptr);
if (rc != SQLITE_OK)
throw std::runtime_error("Failed to prepare country insert");
sqlite3_bind_int(stmt, 1, id);
sqlite3_bind_text(stmt, 2, name.c_str(), -1, SQLITE_STATIC);
sqlite3_bind_text(stmt, 3, iso2.c_str(), -1, SQLITE_STATIC);
sqlite3_bind_text(stmt, 4, iso3.c_str(), -1, SQLITE_STATIC);
if (sqlite3_step(stmt) != SQLITE_DONE) {
throw std::runtime_error("Failed to insert country");
}
sqlite3_finalize(stmt);
}
void SqliteDatabase::InsertState(int id, int country_id,
const std::string& name,
const std::string& iso2) {
std::lock_guard<std::mutex> lock(db_mutex_);
const char* query = R"(
INSERT OR IGNORE INTO states (id, country_id, name, iso2)
VALUES (?, ?, ?, ?)
)";
sqlite3_stmt* stmt;
int rc = sqlite3_prepare_v2(db_, query, -1, &stmt, nullptr);
if (rc != SQLITE_OK)
throw std::runtime_error("Failed to prepare state insert");
sqlite3_bind_int(stmt, 1, id);
sqlite3_bind_int(stmt, 2, country_id);
sqlite3_bind_text(stmt, 3, name.c_str(), -1, SQLITE_STATIC);
sqlite3_bind_text(stmt, 4, iso2.c_str(), -1, SQLITE_STATIC);
if (sqlite3_step(stmt) != SQLITE_DONE) {
throw std::runtime_error("Failed to insert state");
}
sqlite3_finalize(stmt);
}
void SqliteDatabase::InsertCity(int id, int state_id, int country_id,
const std::string& name, double latitude,
double longitude) {
std::lock_guard<std::mutex> lock(db_mutex_);
const char* query = R"(
INSERT OR IGNORE INTO cities (id, state_id, country_id, name, latitude, longitude)
VALUES (?, ?, ?, ?, ?, ?)
)";
sqlite3_stmt* stmt;
int rc = sqlite3_prepare_v2(db_, query, -1, &stmt, nullptr);
if (rc != SQLITE_OK)
throw std::runtime_error("Failed to prepare city insert");
sqlite3_bind_int(stmt, 1, id);
sqlite3_bind_int(stmt, 2, state_id);
sqlite3_bind_int(stmt, 3, country_id);
sqlite3_bind_text(stmt, 4, name.c_str(), -1, SQLITE_STATIC);
sqlite3_bind_double(stmt, 5, latitude);
sqlite3_bind_double(stmt, 6, longitude);
if (sqlite3_step(stmt) != SQLITE_DONE) {
throw std::runtime_error("Failed to insert city");
}
sqlite3_finalize(stmt);
}
std::vector<City> SqliteDatabase::QueryCities() {
std::lock_guard<std::mutex> lock(db_mutex_);
std::vector<City> cities;
sqlite3_stmt* stmt = nullptr;
const char* query = "SELECT id, name, country_id FROM cities ORDER BY name";
int rc = sqlite3_prepare_v2(db_, query, -1, &stmt, nullptr);
if (rc != SQLITE_OK) {
throw std::runtime_error("Failed to prepare query");
}
while (sqlite3_step(stmt) == SQLITE_ROW) {
int id = sqlite3_column_int(stmt, 0);
const char* name =
reinterpret_cast<const char*>(sqlite3_column_text(stmt, 1));
int country_id = sqlite3_column_int(stmt, 2);
cities.push_back({id, name ? std::string(name) : "", country_id});
}
sqlite3_finalize(stmt);
return cities;
}
std::vector<Country> SqliteDatabase::QueryCountries(int limit) {
std::lock_guard<std::mutex> lock(db_mutex_);
std::vector<Country> countries;
sqlite3_stmt* stmt = nullptr;
std::string query =
"SELECT id, name, iso2, iso3 FROM countries ORDER BY name";
if (limit > 0) {
query += " LIMIT " + std::to_string(limit);
}
int rc = sqlite3_prepare_v2(db_, query.c_str(), -1, &stmt, nullptr);
if (rc != SQLITE_OK) {
throw std::runtime_error("Failed to prepare countries query");
}
while (sqlite3_step(stmt) == SQLITE_ROW) {
int id = sqlite3_column_int(stmt, 0);
const char* name =
reinterpret_cast<const char*>(sqlite3_column_text(stmt, 1));
const char* iso2 =
reinterpret_cast<const char*>(sqlite3_column_text(stmt, 2));
const char* iso3 =
reinterpret_cast<const char*>(sqlite3_column_text(stmt, 3));
countries.push_back({id, name ? std::string(name) : "",
iso2 ? std::string(iso2) : "",
iso3 ? std::string(iso3) : ""});
}
sqlite3_finalize(stmt);
return countries;
}
std::vector<State> SqliteDatabase::QueryStates(int limit) {
std::lock_guard<std::mutex> lock(db_mutex_);
std::vector<State> states;
sqlite3_stmt* stmt = nullptr;
std::string query =
"SELECT id, name, iso2, country_id FROM states ORDER BY name";
if (limit > 0) {
query += " LIMIT " + std::to_string(limit);
}
int rc = sqlite3_prepare_v2(db_, query.c_str(), -1, &stmt, nullptr);
if (rc != SQLITE_OK) {
throw std::runtime_error("Failed to prepare states query");
}
while (sqlite3_step(stmt) == SQLITE_ROW) {
int id = sqlite3_column_int(stmt, 0);
const char* name =
reinterpret_cast<const char*>(sqlite3_column_text(stmt, 1));
const char* iso2 =
reinterpret_cast<const char*>(sqlite3_column_text(stmt, 2));
int country_id = sqlite3_column_int(stmt, 3);
states.push_back({id, name ? std::string(name) : "",
iso2 ? std::string(iso2) : "", country_id});
}
sqlite3_finalize(stmt);
return states;
}

View File

@@ -0,0 +1,66 @@
#include "json_handling/json_loader.h"
#include <spdlog/spdlog.h>
#include <chrono>
#include "json_handling/stream_parser.h"
void JsonLoader::LoadWorldCities(const std::string& json_path,
SqliteDatabase& db) {
constexpr size_t kBatchSize = 10000;
auto startTime = std::chrono::high_resolution_clock::now();
spdlog::info("\nLoading {} (streaming RapidJSON SAX)...", json_path);
db.BeginTransaction();
bool transactionOpen = true;
size_t citiesProcessed = 0;
try {
StreamingJsonParser::Parse(
json_path, db,
[&](const CityRecord& record) {
db.InsertCity(record.id, record.state_id, record.country_id,
record.name, record.latitude, record.longitude);
++citiesProcessed;
if (citiesProcessed % kBatchSize == 0) {
db.CommitTransaction();
db.BeginTransaction();
}
},
[&](size_t current, size_t /*total*/) {
if (current % kBatchSize == 0 && current > 0) {
spdlog::info(" [Progress] Parsed {} cities...", current);
}
});
spdlog::info(" OK: Parsed all cities from JSON");
if (transactionOpen) {
db.CommitTransaction();
transactionOpen = false;
}
} catch (...) {
if (transactionOpen) {
db.CommitTransaction();
}
throw;
}
auto endTime = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(
endTime - startTime);
spdlog::info("\n=== World City Data Loading Summary ===\n");
spdlog::info("Cities inserted: {}", citiesProcessed);
spdlog::info("Elapsed time: {} ms", duration.count());
long long throughput =
(citiesProcessed > 0 && duration.count() > 0)
? (1000LL * static_cast<long long>(citiesProcessed)) /
static_cast<long long>(duration.count())
: 0LL;
spdlog::info("Throughput: {} cities/sec", throughput);
spdlog::info("=======================================\n");
}

View File

@@ -0,0 +1,289 @@
#include "json_handling/stream_parser.h"
#include <spdlog/spdlog.h>
#include <boost/json.hpp>
#include <boost/json/basic_parser_impl.hpp>
#include <cstdio>
#include <stdexcept>
#include "database/database.h"
class CityRecordHandler {
friend class boost::json::basic_parser<CityRecordHandler>;
public:
static constexpr std::size_t max_array_size = static_cast<std::size_t>(-1);
static constexpr std::size_t max_object_size = static_cast<std::size_t>(-1);
static constexpr std::size_t max_string_size = static_cast<std::size_t>(-1);
static constexpr std::size_t max_key_size = static_cast<std::size_t>(-1);
struct ParseContext {
SqliteDatabase* db = nullptr;
std::function<void(const CityRecord&)> on_city;
std::function<void(size_t, size_t)> on_progress;
size_t cities_emitted = 0;
size_t total_file_size = 0;
int countries_inserted = 0;
int states_inserted = 0;
};
explicit CityRecordHandler(ParseContext& ctx) : context(ctx) {}
private:
ParseContext& context;
int depth = 0;
bool in_countries_array = false;
bool in_country_object = false;
bool in_states_array = false;
bool in_state_object = false;
bool in_cities_array = false;
bool building_city = false;
int current_country_id = 0;
int current_state_id = 0;
CityRecord current_city = {};
std::string current_key;
std::string current_key_val;
std::string current_string_val;
std::string country_info[3];
std::string state_info[2];
// Boost.JSON SAX Hooks
bool on_document_begin(boost::system::error_code&) { return true; }
bool on_document_end(boost::system::error_code&) { return true; }
bool on_array_begin(boost::system::error_code&) {
depth++;
if (depth == 1) {
in_countries_array = true;
} else if (depth == 3 && current_key == "states") {
in_states_array = true;
} else if (depth == 5 && current_key == "cities") {
in_cities_array = true;
}
return true;
}
bool on_array_end(std::size_t, boost::system::error_code&) {
if (depth == 1) {
in_countries_array = false;
} else if (depth == 3) {
in_states_array = false;
} else if (depth == 5) {
in_cities_array = false;
}
depth--;
return true;
}
bool on_object_begin(boost::system::error_code&) {
depth++;
if (depth == 2 && in_countries_array) {
in_country_object = true;
current_country_id = 0;
country_info[0].clear();
country_info[1].clear();
country_info[2].clear();
} else if (depth == 4 && in_states_array) {
in_state_object = true;
current_state_id = 0;
state_info[0].clear();
state_info[1].clear();
} else if (depth == 6 && in_cities_array) {
building_city = true;
current_city = {};
}
return true;
}
bool on_object_end(std::size_t, boost::system::error_code&) {
if (depth == 6 && building_city) {
if (current_city.id > 0 && current_state_id > 0 &&
current_country_id > 0) {
current_city.state_id = current_state_id;
current_city.country_id = current_country_id;
try {
context.on_city(current_city);
context.cities_emitted++;
if (context.on_progress && context.cities_emitted % 10000 == 0) {
context.on_progress(context.cities_emitted,
context.total_file_size);
}
} catch (const std::exception& e) {
spdlog::warn("Record parsing failed: {}", e.what());
}
}
building_city = false;
} else if (depth == 4 && in_state_object) {
if (current_state_id > 0 && current_country_id > 0) {
try {
context.db->InsertState(current_state_id, current_country_id,
state_info[0], state_info[1]);
context.states_inserted++;
} catch (const std::exception& e) {
spdlog::warn("Record parsing failed: {}", e.what());
}
}
in_state_object = false;
} else if (depth == 2 && in_country_object) {
if (current_country_id > 0) {
try {
context.db->InsertCountry(current_country_id, country_info[0],
country_info[1], country_info[2]);
context.countries_inserted++;
} catch (const std::exception& e) {
spdlog::warn("Record parsing failed: {}", e.what());
}
}
in_country_object = false;
}
depth--;
return true;
}
bool on_key_part(boost::json::string_view s, std::size_t,
boost::system::error_code&) {
current_key_val.append(s.data(), s.size());
return true;
}
bool on_key(boost::json::string_view s, std::size_t,
boost::system::error_code&) {
current_key_val.append(s.data(), s.size());
current_key = current_key_val;
current_key_val.clear();
return true;
}
bool on_string_part(boost::json::string_view s, std::size_t,
boost::system::error_code&) {
current_string_val.append(s.data(), s.size());
return true;
}
bool on_string(boost::json::string_view s, std::size_t,
boost::system::error_code&) {
current_string_val.append(s.data(), s.size());
if (building_city && current_key == "name") {
current_city.name = current_string_val;
} else if (in_state_object && current_key == "name") {
state_info[0] = current_string_val;
} else if (in_state_object && current_key == "iso2") {
state_info[1] = current_string_val;
} else if (in_country_object && current_key == "name") {
country_info[0] = current_string_val;
} else if (in_country_object && current_key == "iso2") {
country_info[1] = current_string_val;
} else if (in_country_object && current_key == "iso3") {
country_info[2] = current_string_val;
}
current_string_val.clear();
return true;
}
bool on_number_part(boost::json::string_view, boost::system::error_code&) {
return true;
}
bool on_int64(int64_t i, boost::json::string_view,
boost::system::error_code&) {
if (building_city && current_key == "id") {
current_city.id = static_cast<int>(i);
} else if (in_state_object && current_key == "id") {
current_state_id = static_cast<int>(i);
} else if (in_country_object && current_key == "id") {
current_country_id = static_cast<int>(i);
}
return true;
}
bool on_uint64(uint64_t u, boost::json::string_view,
boost::system::error_code& ec) {
return on_int64(static_cast<int64_t>(u), "", ec);
}
bool on_double(double d, boost::json::string_view,
boost::system::error_code&) {
if (building_city) {
if (current_key == "latitude") {
current_city.latitude = d;
} else if (current_key == "longitude") {
current_city.longitude = d;
}
}
return true;
}
bool on_bool(bool, boost::system::error_code&) { return true; }
bool on_null(boost::system::error_code&) { return true; }
bool on_comment_part(boost::json::string_view, boost::system::error_code&) {
return true;
}
bool on_comment(boost::json::string_view, boost::system::error_code&) {
return true;
}
};
void StreamingJsonParser::Parse(
const std::string& file_path, SqliteDatabase& db,
std::function<void(const CityRecord&)> on_city,
std::function<void(size_t, size_t)> on_progress) {
spdlog::info(" Streaming parse of {} (Boost.JSON)...", file_path);
FILE* file = std::fopen(file_path.c_str(), "rb");
if (!file) {
throw std::runtime_error("Failed to open JSON file: " + file_path);
}
size_t total_size = 0;
if (std::fseek(file, 0, SEEK_END) == 0) {
long file_size = std::ftell(file);
if (file_size > 0) {
total_size = static_cast<size_t>(file_size);
}
std::rewind(file);
}
CityRecordHandler::ParseContext ctx{&db, on_city, on_progress, 0, total_size,
0, 0};
boost::json::basic_parser<CityRecordHandler> parser(
boost::json::parse_options{}, ctx);
char buf[65536];
size_t bytes_read;
boost::system::error_code ec;
while ((bytes_read = std::fread(buf, 1, sizeof(buf), file)) > 0) {
char const* p = buf;
std::size_t remain = bytes_read;
while (remain > 0) {
std::size_t consumed = parser.write_some(true, p, remain, ec);
if (ec) {
std::fclose(file);
throw std::runtime_error("JSON parse error: " + ec.message());
}
p += consumed;
remain -= consumed;
}
}
parser.write_some(false, nullptr, 0, ec); // Signal EOF
std::fclose(file);
if (ec) {
throw std::runtime_error("JSON parse error at EOF: " + ec.message());
}
spdlog::info(" OK: Parsed {} countries, {} states, {} cities",
ctx.countries_inserted, ctx.states_inserted,
ctx.cities_emitted);
}

118
pipeline/src/main.cpp Normal file
View File

@@ -0,0 +1,118 @@
#include <iostream>
#include <memory>
#include <boost/program_options.hpp>
#include <spdlog/spdlog.h>
#include "biergarten_data_generator.h"
#include "web_client/curl_web_client.h"
#include "database/database.h"
namespace po = boost::program_options;
/**
* @brief Parse command-line arguments into ApplicationOptions.
*
* @param argc Command-line argument count.
* @param argv Command-line arguments.
* @param options Output ApplicationOptions struct.
* @return true if parsing succeeded and should proceed, false otherwise.
*/
bool ParseArguments(int argc, char **argv, ApplicationOptions &options) {
// If no arguments provided, display usage and exit
if (argc == 1) {
std::cout << "Biergarten Pipeline - Geographic Data Pipeline with Brewery Generation\n\n";
std::cout << "Usage: biergarten-pipeline [options]\n\n";
std::cout << "Options:\n";
std::cout << " --mocked Use mocked generator for brewery/user data\n";
std::cout << " --model, -m PATH Path to LLM model file (gguf) for generation\n";
std::cout << " --cache-dir, -c DIR Directory for cached JSON (default: /tmp)\n";
std::cout << " --temperature TEMP LLM sampling temperature 0.0-1.0 (default: 0.8)\n";
std::cout << " --top-p VALUE Nucleus sampling parameter 0.0-1.0 (default: 0.92)\n";
std::cout << " --seed SEED Random seed: -1 for random (default: -1)\n";
std::cout << " --help, -h Show this help message\n\n";
std::cout << "Note: --mocked and --model are mutually exclusive. Exactly one must be provided.\n";
std::cout << "Data source is always pinned to commit c5eb7772 (stable 2026-03-28).\n";
return false;
}
po::options_description desc("Pipeline Options");
desc.add_options()("help,h", "Produce help message")(
"mocked", po::bool_switch(),
"Use mocked generator for brewery/user data")(
"model,m", po::value<std::string>()->default_value(""),
"Path to LLM model (gguf)")(
"cache-dir,c", po::value<std::string>()->default_value("/tmp"),
"Directory for cached JSON")(
"temperature", po::value<float>()->default_value(0.8f),
"Sampling temperature (higher = more random)")(
"top-p", po::value<float>()->default_value(0.92f),
"Nucleus sampling top-p in (0,1] (higher = more random)")(
"seed", po::value<int>()->default_value(-1),
"Sampler seed: -1 for random, otherwise non-negative integer");
po::variables_map vm;
po::store(po::parse_command_line(argc, argv, desc), vm);
po::notify(vm);
if (vm.count("help")) {
std::cout << desc << "\n";
return false;
}
// Check for mutually exclusive --mocked and --model flags
bool use_mocked = vm["mocked"].as<bool>();
std::string model_path = vm["model"].as<std::string>();
if (use_mocked && !model_path.empty()) {
spdlog::error("ERROR: --mocked and --model are mutually exclusive");
return false;
}
if (!use_mocked && model_path.empty()) {
spdlog::error("ERROR: Either --mocked or --model must be specified");
return false;
}
// Warn if sampling parameters are provided with --mocked
if (use_mocked) {
bool hasTemperature = vm["temperature"].defaulted() == false;
bool hasTopP = vm["top-p"].defaulted() == false;
bool hasSeed = vm["seed"].defaulted() == false;
if (hasTemperature || hasTopP || hasSeed) {
spdlog::warn("WARNING: Sampling parameters (--temperature, --top-p, --seed) are ignored when using --mocked");
}
}
options.use_mocked = use_mocked;
options.model_path = model_path;
options.cache_dir = vm["cache-dir"].as<std::string>();
options.temperature = vm["temperature"].as<float>();
options.top_p = vm["top-p"].as<float>();
options.seed = vm["seed"].as<int>();
// commit is always pinned to c5eb7772
return true;
}
int main(int argc, char *argv[]) {
try {
const CurlGlobalState curl_state;
ApplicationOptions options;
if (!ParseArguments(argc, argv, options)) {
return 0;
}
auto webClient = std::make_shared<CURLWebClient>();
SqliteDatabase database;
BiergartenDataGenerator generator(options, webClient, database);
return generator.Run();
} catch (const std::exception &e) {
spdlog::error("ERROR: Application failed: {}", e.what());
return 1;
}
}

View File

@@ -0,0 +1,141 @@
#include "web_client/curl_web_client.h"
#include <curl/curl.h>
#include <cstdio>
#include <fstream>
#include <memory>
#include <sstream>
#include <stdexcept>
CurlGlobalState::CurlGlobalState() {
if (curl_global_init(CURL_GLOBAL_DEFAULT) != CURLE_OK) {
throw std::runtime_error(
"[CURLWebClient] Failed to initialize libcurl globally");
}
}
CurlGlobalState::~CurlGlobalState() { curl_global_cleanup(); }
namespace {
// curl write callback that appends response data into a std::string
size_t WriteCallbackString(void* contents, size_t size, size_t nmemb,
void* userp) {
size_t realsize = size * nmemb;
auto* s = static_cast<std::string*>(userp);
s->append(static_cast<char*>(contents), realsize);
return realsize;
}
// curl write callback that writes to a file stream
size_t WriteCallbackFile(void* contents, size_t size, size_t nmemb,
void* userp) {
size_t realsize = size * nmemb;
auto* outFile = static_cast<std::ofstream*>(userp);
outFile->write(static_cast<char*>(contents), realsize);
return realsize;
}
// RAII wrapper for CURL handle using unique_ptr
using CurlHandle = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
CurlHandle create_handle() {
CURL* handle = curl_easy_init();
if (!handle) {
throw std::runtime_error(
"[CURLWebClient] Failed to initialize libcurl handle");
}
return CurlHandle(handle, &curl_easy_cleanup);
}
void set_common_get_options(CURL* curl, const std::string& url,
long connect_timeout, long total_timeout) {
curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
curl_easy_setopt(curl, CURLOPT_USERAGENT, "biergarten-pipeline/0.1.0");
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
curl_easy_setopt(curl, CURLOPT_MAXREDIRS, 5L);
curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, connect_timeout);
curl_easy_setopt(curl, CURLOPT_TIMEOUT, total_timeout);
curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, "gzip");
}
} // namespace
CURLWebClient::CURLWebClient() {}
CURLWebClient::~CURLWebClient() {}
void CURLWebClient::DownloadToFile(const std::string& url,
const std::string& file_path) {
auto curl = create_handle();
std::ofstream outFile(file_path, std::ios::binary);
if (!outFile.is_open()) {
throw std::runtime_error(
"[CURLWebClient] Cannot open file for writing: " + file_path);
}
set_common_get_options(curl.get(), url, 30L, 300L);
curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, WriteCallbackFile);
curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA,
static_cast<void*>(&outFile));
CURLcode res = curl_easy_perform(curl.get());
outFile.close();
if (res != CURLE_OK) {
std::remove(file_path.c_str());
std::string error = std::string("[CURLWebClient] Download failed: ") +
curl_easy_strerror(res);
throw std::runtime_error(error);
}
long httpCode = 0;
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &httpCode);
if (httpCode != 200) {
std::remove(file_path.c_str());
std::stringstream ss;
ss << "[CURLWebClient] HTTP error " << httpCode << " for URL " << url;
throw std::runtime_error(ss.str());
}
}
std::string CURLWebClient::Get(const std::string& url) {
auto curl = create_handle();
std::string response_string;
set_common_get_options(curl.get(), url, 10L, 20L);
curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, WriteCallbackString);
curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &response_string);
CURLcode res = curl_easy_perform(curl.get());
if (res != CURLE_OK) {
std::string error =
std::string("[CURLWebClient] GET failed: ") + curl_easy_strerror(res);
throw std::runtime_error(error);
}
long httpCode = 0;
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &httpCode);
if (httpCode != 200) {
std::stringstream ss;
ss << "[CURLWebClient] HTTP error " << httpCode << " for URL " << url;
throw std::runtime_error(ss.str());
}
return response_string;
}
std::string CURLWebClient::UrlEncode(const std::string& value) {
// A NULL handle is fine for UTF-8 encoding according to libcurl docs.
char* output = curl_easy_escape(nullptr, value.c_str(), 0);
if (output) {
std::string result(output);
curl_free(output);
return result;
}
throw std::runtime_error("[CURLWebClient] curl_easy_escape failed");
}

View File

@@ -0,0 +1,78 @@
#include "wikipedia/wikipedia_service.h"
#include <spdlog/spdlog.h>
#include <boost/json.hpp>
WikipediaService::WikipediaService(std::shared_ptr<WebClient> client)
: client_(std::move(client)) {}
std::string WikipediaService::FetchExtract(std::string_view query) {
const std::string encoded = client_->UrlEncode(std::string(query));
const std::string url =
"https://en.wikipedia.org/w/api.php?action=query&titles=" + encoded +
"&prop=extracts&explaintext=true&format=json";
const std::string body = client_->Get(url);
boost::system::error_code ec;
boost::json::value doc = boost::json::parse(body, ec);
if (!ec && doc.is_object()) {
auto& pages = doc.at("query").at("pages").get_object();
if (!pages.empty()) {
auto& page = pages.begin()->value().get_object();
if (page.contains("extract") && page.at("extract").is_string()) {
std::string extract(page.at("extract").as_string().c_str());
spdlog::debug("WikipediaService fetched {} chars for '{}'",
extract.size(), query);
return extract;
}
}
}
return {};
}
std::string WikipediaService::GetSummary(std::string_view city,
std::string_view country) {
const std::string key = std::string(city) + "|" + std::string(country);
const auto cacheIt = cache_.find(key);
if (cacheIt != cache_.end()) {
return cacheIt->second;
}
std::string result;
if (!client_) {
cache_.emplace(key, result);
return result;
}
std::string regionQuery(city);
if (!country.empty()) {
regionQuery += ", ";
regionQuery += country;
}
const std::string beerQuery = "beer in " + std::string(city);
try {
const std::string regionExtract = FetchExtract(regionQuery);
const std::string beerExtract = FetchExtract(beerQuery);
if (!regionExtract.empty()) {
result += regionExtract;
}
if (!beerExtract.empty()) {
if (!result.empty()) result += "\n\n";
result += beerExtract;
}
} catch (const std::runtime_error& e) {
spdlog::debug("WikipediaService lookup failed for '{}': {}", regionQuery,
e.what());
}
cache_.emplace(key, result);
return result;
}

View File

@@ -31,7 +31,6 @@
<ProjectReference Include="..\..\Infrastructure\Infrastructure.Repository\Infrastructure.Repository.csproj" />
<ProjectReference Include="..\..\Infrastructure\Infrastructure.Jwt\Infrastructure.Jwt.csproj" />
<ProjectReference Include="..\..\Service\Service.Auth\Service.Auth.csproj" />
<ProjectReference Include="..\..\Service\Service.Breweries\Service.Breweries.csproj" />
<ProjectReference Include="..\..\Service\Service.UserManagement\Service.UserManagement.csproj" />
</ItemGroup>

View File

@@ -1,50 +0,0 @@
using FluentValidation;
namespace API.Core.Contracts.Breweries;
public class BreweryCreateDtoValidator : AbstractValidator<BreweryCreateDto>
{
public BreweryCreateDtoValidator()
{
RuleFor(x => x.PostedById)
.NotEmpty()
.WithMessage("PostedById is required.");
RuleFor(x => x.BreweryName)
.NotEmpty()
.WithMessage("Brewery name is required.")
.MaximumLength(256)
.WithMessage("Brewery name cannot exceed 256 characters.");
RuleFor(x => x.Description)
.NotEmpty()
.WithMessage("Description is required.")
.MaximumLength(512)
.WithMessage("Description cannot exceed 512 characters.");
RuleFor(x => x.Location)
.NotNull()
.WithMessage("Location is required.");
RuleFor(x => x.Location.CityId)
.NotEmpty()
.When(x => x.Location is not null)
.WithMessage("CityId is required.");
RuleFor(x => x.Location.AddressLine1)
.NotEmpty()
.When(x => x.Location is not null)
.WithMessage("Address line 1 is required.")
.MaximumLength(256)
.When(x => x.Location is not null)
.WithMessage("Address line 1 cannot exceed 256 characters.");
RuleFor(x => x.Location.PostalCode)
.NotEmpty()
.When(x => x.Location is not null)
.WithMessage("Postal code is required.")
.MaximumLength(20)
.When(x => x.Location is not null)
.WithMessage("Postal code cannot exceed 20 characters.");
}
}

View File

@@ -1,41 +0,0 @@
namespace API.Core.Contracts.Breweries;
public class BreweryLocationCreateDto
{
public Guid CityId { get; set; }
public string AddressLine1 { get; set; } = string.Empty;
public string? AddressLine2 { get; set; }
public string PostalCode { get; set; } = string.Empty;
public byte[]? Coordinates { get; set; }
}
public class BreweryLocationDto
{
public Guid BreweryPostLocationId { get; set; }
public Guid BreweryPostId { get; set; }
public Guid CityId { get; set; }
public string AddressLine1 { get; set; } = string.Empty;
public string? AddressLine2 { get; set; }
public string PostalCode { get; set; } = string.Empty;
public byte[]? Coordinates { get; set; }
}
public class BreweryCreateDto
{
public Guid PostedById { get; set; }
public string BreweryName { get; set; } = string.Empty;
public string Description { get; set; } = string.Empty;
public BreweryLocationCreateDto Location { get; set; } = null!;
}
public class BreweryDto
{
public Guid BreweryPostId { get; set; }
public Guid PostedById { get; set; }
public string BreweryName { get; set; } = string.Empty;
public string Description { get; set; } = string.Empty;
public DateTime CreatedAt { get; set; }
public DateTime? UpdatedAt { get; set; }
public byte[]? Timer { get; set; }
public BreweryLocationDto? Location { get; set; }
}

View File

@@ -86,13 +86,6 @@ namespace API.Core.Controllers
);
}
[HttpPost("confirm/resend")]
public async Task<ActionResult> ResendConfirmation([FromQuery] Guid userId)
{
await confirmationService.ResendConfirmationEmailAsync(userId);
return Ok(new ResponseBody { Message = "confirmation email has been resent" });
}
[AllowAnonymous]
[HttpPost("refresh")]
public async Task<ActionResult> Refresh(

View File

@@ -1,129 +0,0 @@
using API.Core.Contracts.Breweries;
using API.Core.Contracts.Common;
using Microsoft.AspNetCore.Authorization;
using Microsoft.AspNetCore.Mvc;
using Service.Breweries;
namespace API.Core.Controllers;
[ApiController]
[Route("api/[controller]")]
[Authorize(AuthenticationSchemes = "JWT")]
public class BreweryController(IBreweryService breweryService) : ControllerBase
{
[AllowAnonymous]
[HttpGet("{id:guid}")]
public async Task<ActionResult<ResponseBody<BreweryDto>>> GetById(Guid id)
{
var brewery = await breweryService.GetByIdAsync(id);
if (brewery is null)
return NotFound(new ResponseBody { Message = $"Brewery with ID {id} not found." });
return Ok(new ResponseBody<BreweryDto>
{
Message = "Brewery retrieved successfully.",
Payload = MapToDto(brewery),
});
}
[AllowAnonymous]
[HttpGet]
public async Task<ActionResult<ResponseBody<IEnumerable<BreweryDto>>>> GetAll(
[FromQuery] int? limit,
[FromQuery] int? offset)
{
var breweries = await breweryService.GetAllAsync(limit, offset);
return Ok(new ResponseBody<IEnumerable<BreweryDto>>
{
Message = "Breweries retrieved successfully.",
Payload = breweries.Select(MapToDto),
});
}
[HttpPost]
public async Task<ActionResult<ResponseBody<BreweryDto>>> Create([FromBody] BreweryCreateDto dto)
{
var request = new BreweryCreateRequest(
dto.PostedById,
dto.BreweryName,
dto.Description,
new BreweryLocationCreateRequest(
dto.Location.CityId,
dto.Location.AddressLine1,
dto.Location.AddressLine2,
dto.Location.PostalCode,
dto.Location.Coordinates
)
);
var result = await breweryService.CreateAsync(request);
if (!result.Success)
return BadRequest(new ResponseBody { Message = result.Message });
return Created($"/api/brewery/{result.Brewery.BreweryPostId}", new ResponseBody<BreweryDto>
{
Message = "Brewery created successfully.",
Payload = MapToDto(result.Brewery),
});
}
[HttpPut("{id:guid}")]
public async Task<ActionResult<ResponseBody<BreweryDto>>> Update(Guid id, [FromBody] BreweryDto dto)
{
if (dto.BreweryPostId != id)
return BadRequest(new ResponseBody { Message = "Route ID does not match payload ID." });
var request = new BreweryUpdateRequest(
dto.BreweryPostId,
dto.PostedById,
dto.BreweryName,
dto.Description,
dto.Location is null ? null : new BreweryLocationUpdateRequest(
dto.Location.BreweryPostLocationId,
dto.Location.CityId,
dto.Location.AddressLine1,
dto.Location.AddressLine2,
dto.Location.PostalCode,
dto.Location.Coordinates
)
);
var result = await breweryService.UpdateAsync(request);
if (!result.Success)
return BadRequest(new ResponseBody { Message = result.Message });
return Ok(new ResponseBody<BreweryDto>
{
Message = "Brewery updated successfully.",
Payload = MapToDto(result.Brewery),
});
}
[HttpDelete("{id:guid}")]
public async Task<ActionResult<ResponseBody>> Delete(Guid id)
{
await breweryService.DeleteAsync(id);
return Ok(new ResponseBody { Message = "Brewery deleted successfully." });
}
private static BreweryDto MapToDto(Domain.Entities.BreweryPost b) => new()
{
BreweryPostId = b.BreweryPostId,
PostedById = b.PostedById,
BreweryName = b.BreweryName,
Description = b.Description,
CreatedAt = b.CreatedAt,
UpdatedAt = b.UpdatedAt,
Timer = b.Timer,
Location = b.Location is null ? null : new BreweryLocationDto
{
BreweryPostLocationId = b.Location.BreweryPostLocationId,
BreweryPostId = b.Location.BreweryPostId,
CityId = b.Location.CityId,
AddressLine1 = b.Location.AddressLine1,
AddressLine2 = b.Location.AddressLine2,
PostalCode = b.Location.PostalCode,
Coordinates = b.Location.Coordinates,
},
};
}

View File

@@ -1,15 +1,20 @@
using API.Core;
using API.Core.Authentication;
using API.Core.Contracts.Common;
using Domain.Exceptions;
using FluentValidation;
using FluentValidation.AspNetCore;
using Infrastructure.Email;
using Infrastructure.Email.Templates;
using Infrastructure.Email.Templates.Rendering;
using Infrastructure.Jwt;
using Infrastructure.PasswordHashing;
using Infrastructure.Repository.Auth;
using Infrastructure.Repository.Sql;
using Infrastructure.Repository.UserAccount;
using Infrastructure.Repository.Breweries;
using Microsoft.AspNetCore.Authentication;
using Microsoft.AspNetCore.Mvc;
using Microsoft.AspNetCore.Mvc.Filters;
using Service.Auth;
using Service.Emails;
using Service.UserManagement.User;
@@ -50,7 +55,6 @@ builder.Services.AddSingleton<
builder.Services.AddScoped<IUserAccountRepository, UserAccountRepository>();
builder.Services.AddScoped<IAuthRepository, AuthRepository>();
builder.Services.AddScoped<IBreweryRepository, BreweryRepository>();
builder.Services.AddScoped<IUserService, UserService>();
builder.Services.AddScoped<ILoginService, LoginService>();

View File

@@ -26,7 +26,6 @@
<Project Path="Service/Service.Auth.Tests/Service.Auth.Tests.csproj" />
<Project Path="Service/Service.Emails/Service.Emails.csproj" />
<Project Path="Service/Service.UserManagement/Service.UserManagement.csproj" />
<Project Path="Service/Service.Auth/Service.Auth.csproj" />
<Project Path="Service/Service.Breweries/Service.Breweries.csproj" />
<Project Path="Service\Service.Auth\Service.Auth.csproj" />
</Folder>
</Solution>

View File

@@ -37,7 +37,7 @@ CREATE TABLE dbo.UserAccount
UpdatedAt DATETIME,
DateOfBirth DATE NOT NULL,
DateOfBirth DATETIME NOT NULL,
Timer ROWVERSION,
@@ -49,6 +49,7 @@ CREATE TABLE dbo.UserAccount
CONSTRAINT AK_Email
UNIQUE (Email)
);
----------------------------------------------------------------------------
@@ -108,7 +109,7 @@ CREATE TABLE UserAvatar -- delete avatar photo when user account is deleted
CONSTRAINT AK_UserAvatar_UserAccountID
UNIQUE (UserAccountID)
);
)
CREATE NONCLUSTERED INDEX IX_UserAvatar_UserAccount
ON UserAvatar(UserAccountID);
@@ -124,7 +125,8 @@ CREATE TABLE UserVerification -- delete verification data when user account is d
UserAccountID UNIQUEIDENTIFIER NOT NULL,
VerificationDateTime DATETIME NOT NULL
CONSTRAINT DF_VerificationDateTime DEFAULT GETDATE(),
CONSTRAINT DF_VerificationDateTime
DEFAULT GETDATE(),
Timer ROWVERSION,
@@ -153,13 +155,13 @@ CREATE TABLE UserCredential -- delete credentials when user account is deleted
UserAccountID UNIQUEIDENTIFIER NOT NULL,
CreatedAt DATETIME NOT NULL
CONSTRAINT DF_UserCredential_CreatedAt DEFAULT GETDATE(),
CreatedAt DATETIME
CONSTRAINT DF_UserCredential_CreatedAt DEFAULT GETDATE() NOT NULL,
Expiry DATETIME NOT NULL
CONSTRAINT DF_UserCredential_Expiry DEFAULT DATEADD(DAY, 90, GETDATE()),
Expiry DATETIME
CONSTRAINT DF_UserCredential_Expiry DEFAULT DATEADD(DAY, 90, GETDATE()) NOT NULL,
Hash NVARCHAR(256) NOT NULL,
Hash NVARCHAR(MAX) NOT NULL,
-- uses argon2
IsRevoked BIT NOT NULL
@@ -175,16 +177,12 @@ CREATE TABLE UserCredential -- delete credentials when user account is deleted
CONSTRAINT FK_UserCredential_UserAccount
FOREIGN KEY (UserAccountID)
REFERENCES UserAccount(UserAccountID)
ON DELETE CASCADE
ON DELETE CASCADE,
);
CREATE NONCLUSTERED INDEX IX_UserCredential_UserAccount
ON UserCredential(UserAccountID);
CREATE NONCLUSTERED INDEX IX_UserCredential_Account_Active
ON UserCredential(UserAccountID, IsRevoked, Expiry)
INCLUDE (Hash);
----------------------------------------------------------------------------
----------------------------------------------------------------------------
@@ -197,8 +195,8 @@ CREATE TABLE UserFollow
FollowingID UNIQUEIDENTIFIER NOT NULL,
CreatedAt DATETIME NOT NULL
CONSTRAINT DF_UserFollow_CreatedAt DEFAULT GETDATE(),
CreatedAt DATETIME
CONSTRAINT DF_UserFollow_CreatedAt DEFAULT GETDATE() NOT NULL,
Timer ROWVERSION,
@@ -207,13 +205,11 @@ CREATE TABLE UserFollow
CONSTRAINT FK_UserFollow_UserAccount
FOREIGN KEY (UserAccountID)
REFERENCES UserAccount(UserAccountID)
ON DELETE NO ACTION,
REFERENCES UserAccount(UserAccountID),
CONSTRAINT FK_UserFollow_UserAccountFollowing
FOREIGN KEY (FollowingID)
REFERENCES UserAccount(UserAccountID)
ON DELETE NO ACTION,
REFERENCES UserAccount(UserAccountID),
CONSTRAINT CK_CannotFollowOwnAccount
CHECK (UserAccountID != FollowingID)
@@ -225,6 +221,7 @@ CREATE NONCLUSTERED INDEX IX_UserFollow_UserAccount_FollowingID
CREATE NONCLUSTERED INDEX IX_UserFollow_FollowingID_UserAccount
ON UserFollow(FollowingID, UserAccountID);
----------------------------------------------------------------------------
----------------------------------------------------------------------------
@@ -302,6 +299,7 @@ CREATE TABLE City
CREATE NONCLUSTERED INDEX IX_City_StateProvince
ON City(StateProvinceID);
----------------------------------------------------------------------------
----------------------------------------------------------------------------
@@ -310,8 +308,6 @@ CREATE TABLE BreweryPost -- A user cannot be deleted if they have a post
BreweryPostID UNIQUEIDENTIFIER
CONSTRAINT DF_BreweryPostID DEFAULT NEWID(),
BreweryName NVARCHAR(256) NOT NULL,
PostedByID UNIQUEIDENTIFIER NOT NULL,
Description NVARCHAR(512) NOT NULL,
@@ -329,15 +325,15 @@ CREATE TABLE BreweryPost -- A user cannot be deleted if they have a post
CONSTRAINT FK_BreweryPost_UserAccount
FOREIGN KEY (PostedByID)
REFERENCES UserAccount(UserAccountID)
ON DELETE NO ACTION
);
ON DELETE NO ACTION,
)
CREATE NONCLUSTERED INDEX IX_BreweryPost_PostedByID
ON BreweryPost(PostedByID);
----------------------------------------------------------------------------
----------------------------------------------------------------------------
CREATE TABLE BreweryPostLocation
(
BreweryPostLocationID UNIQUEIDENTIFIER
@@ -353,7 +349,7 @@ CREATE TABLE BreweryPostLocation
CityID UNIQUEIDENTIFIER NOT NULL,
Coordinates GEOGRAPHY NULL,
Coordinates GEOGRAPHY NOT NULL,
Timer ROWVERSION,
@@ -366,11 +362,7 @@ CREATE TABLE BreweryPostLocation
CONSTRAINT FK_BreweryPostLocation_BreweryPost
FOREIGN KEY (BreweryPostID)
REFERENCES BreweryPost(BreweryPostID)
ON DELETE CASCADE,
CONSTRAINT FK_BreweryPostLocation_City
FOREIGN KEY (CityID)
REFERENCES City(CityID)
ON DELETE CASCADE
);
CREATE NONCLUSTERED INDEX IX_BreweryPostLocation_BreweryPost
@@ -379,18 +371,6 @@ CREATE NONCLUSTERED INDEX IX_BreweryPostLocation_BreweryPost
CREATE NONCLUSTERED INDEX IX_BreweryPostLocation_City
ON BreweryPostLocation(CityID);
-- To assess when the time comes:
-- This would allow for efficient spatial queries to find breweries within a certain distance of a location, but it adds overhead to insert/update operations.
-- CREATE SPATIAL INDEX SIDX_BreweryPostLocation_Coordinates
-- ON BreweryPostLocation(Coordinates)
-- USING GEOGRAPHY_GRID
-- WITH (
-- GRIDS = (LEVEL_1 = MEDIUM, LEVEL_2 = MEDIUM, LEVEL_3 = MEDIUM, LEVEL_4 = MEDIUM),
-- CELLS_PER_OBJECT = 16
-- );
----------------------------------------------------------------------------
----------------------------------------------------------------------------
@@ -423,14 +403,13 @@ CREATE TABLE BreweryPostPhoto -- All photos linked to a post are deleted if the
);
CREATE NONCLUSTERED INDEX IX_BreweryPostPhoto_Photo_BreweryPost
ON BreweryPostPhoto(PhotoID, BreweryPostID);
ON BreweryPostPhoto(PhotoID, BreweryPostID);
CREATE NONCLUSTERED INDEX IX_BreweryPostPhoto_BreweryPost_Photo
ON BreweryPostPhoto(BreweryPostID, PhotoID);
ON BreweryPostPhoto(BreweryPostID, PhotoID);
----------------------------------------------------------------------------
----------------------------------------------------------------------------
CREATE TABLE BeerStyle
(
BeerStyleID UNIQUEIDENTIFIER
@@ -465,7 +444,7 @@ CREATE TABLE BeerPost
-- Alcohol By Volume (typically 0-67%)
IBU INT NOT NULL,
-- International Bitterness Units (typically 0-120)
-- International Bitterness Units (typically 0-100)
PostedByID UNIQUEIDENTIFIER NOT NULL,
@@ -485,8 +464,7 @@ CREATE TABLE BeerPost
CONSTRAINT FK_BeerPost_PostedBy
FOREIGN KEY (PostedByID)
REFERENCES UserAccount(UserAccountID)
ON DELETE NO ACTION,
REFERENCES UserAccount(UserAccountID),
CONSTRAINT FK_BeerPost_BeerStyle
FOREIGN KEY (BeerStyleID)
@@ -544,10 +522,10 @@ CREATE TABLE BeerPostPhoto -- All photos linked to a beer post are deleted if th
);
CREATE NONCLUSTERED INDEX IX_BeerPostPhoto_Photo_BeerPost
ON BeerPostPhoto(PhotoID, BeerPostID);
ON BeerPostPhoto(PhotoID, BeerPostID);
CREATE NONCLUSTERED INDEX IX_BeerPostPhoto_BeerPost_Photo
ON BeerPostPhoto(BeerPostID, PhotoID);
ON BeerPostPhoto(BeerPostID, PhotoID);
----------------------------------------------------------------------------
----------------------------------------------------------------------------
@@ -561,35 +539,17 @@ CREATE TABLE BeerPostComment
BeerPostID UNIQUEIDENTIFIER NOT NULL,
CommentedByID UNIQUEIDENTIFIER NOT NULL,
Rating INT NOT NULL,
CreatedAt DATETIME NOT NULL
CONSTRAINT DF_BeerPostComment_CreatedAt DEFAULT GETDATE(),
UpdatedAt DATETIME NULL,
Timer ROWVERSION,
CONSTRAINT PK_BeerPostComment
PRIMARY KEY (BeerPostCommentID),
CONSTRAINT FK_BeerPostComment_BeerPost
FOREIGN KEY (BeerPostID)
REFERENCES BeerPost(BeerPostID),
CONSTRAINT FK_BeerPostComment_UserAccount
FOREIGN KEY (CommentedByID)
REFERENCES UserAccount(UserAccountID)
ON DELETE NO ACTION,
CONSTRAINT CHK_BeerPostComment_Rating
CHECK (Rating BETWEEN 1 AND 5)
);
FOREIGN KEY (BeerPostID) REFERENCES BeerPost(BeerPostID)
)
CREATE NONCLUSTERED INDEX IX_BeerPostComment_BeerPost
ON BeerPostComment(BeerPostID);
ON BeerPostComment(BeerPostID)
CREATE NONCLUSTERED INDEX IX_BeerPostComment_CommentedBy
ON BeerPostComment(CommentedByID);

View File

@@ -1,50 +0,0 @@
CREATE OR ALTER PROCEDURE dbo.USP_CreateBrewery(
@BreweryName NVARCHAR(256),
@Description NVARCHAR(512),
@PostedByID UNIQUEIDENTIFIER,
@CityID UNIQUEIDENTIFIER,
@AddressLine1 NVARCHAR(256),
@AddressLine2 NVARCHAR(256) = NULL,
@PostalCode NVARCHAR(20),
@Coordinates GEOGRAPHY = NULL
)
AS
BEGIN
SET NOCOUNT ON;
SET XACT_ABORT ON;
IF @BreweryName IS NULL
THROW 50001, 'Brewery name cannot be null.', 1;
IF @Description IS NULL
THROW 50002, 'Brewery description cannot be null.', 1;
IF NOT EXISTS (SELECT 1
FROM dbo.UserAccount
WHERE UserAccountID = @PostedByID)
THROW 50404, 'User not found.', 1;
IF NOT EXISTS (SELECT 1
FROM dbo.City
WHERE CityID = @CityID)
THROW 50404, 'City not found.', 1;
DECLARE @NewBreweryID UNIQUEIDENTIFIER = NEWID();
DECLARE @NewBrewerLocationID UNIQUEIDENTIFIER = NEWID();
BEGIN TRANSACTION;
INSERT INTO dbo.BreweryPost
(BreweryPostID, BreweryName, Description, PostedByID)
VALUES (@NewBreweryID, @BreweryName, @Description, @PostedByID);
INSERT INTO dbo.BreweryPostLocation
(BreweryPostLocationID, BreweryPostID, CityID, AddressLine1, AddressLine2, PostalCode, Coordinates)
VALUES (@NewBrewerLocationID, @NewBreweryID, @CityID, @AddressLine1, @AddressLine2, @PostalCode, @Coordinates);
COMMIT TRANSACTION;
SELECT @NewBreweryID AS BreweryPostID,
@NewBrewerLocationID AS BreweryPostLocationID;
END

View File

@@ -1,9 +0,0 @@
CREATE OR ALTER PROCEDURE dbo.USP_GetBreweryById @BreweryPostID UNIQUEIDENTIFIER
AS
BEGIN
SELECT *
FROM BreweryPost bp
INNER JOIN BreweryPostLocation bpl
ON bp.BreweryPostID = bpl.BreweryPostID
WHERE bp.BreweryPostID = @BreweryPostID;
END

View File

@@ -1,13 +0,0 @@
namespace Domain.Entities;
public class BreweryPost
{
public Guid BreweryPostId { get; set; }
public Guid PostedById { get; set; }
public string BreweryName { get; set; } = string.Empty;
public string Description { get; set; } = string.Empty;
public DateTime CreatedAt { get; set; }
public DateTime? UpdatedAt { get; set; }
public byte[]? Timer { get; set; }
public BreweryPostLocation? Location { get; set; }
}

View File

@@ -1,13 +0,0 @@
namespace Domain.Entities;
public class BreweryPostLocation
{
public Guid BreweryPostLocationId { get; set; }
public Guid BreweryPostId { get; set; }
public string AddressLine1 { get; set; } = string.Empty;
public string? AddressLine2 { get; set; }
public string PostalCode { get; set; } = string.Empty;
public Guid CityId { get; set; }
public byte[]? Coordinates { get; set; }
public byte[]? Timer { get; set; }
}

View File

@@ -7,6 +7,6 @@
</PropertyGroup>
<ItemGroup>
<PackageReference Include="MailKit" Version="4.15.1" />
<PackageReference Include="MailKit" Version="4.9.0" />
</ItemGroup>
</Project>

View File

@@ -17,34 +17,10 @@ public class AuthRepositoryTest
var conn = new MockDbConnection();
conn.Mocks.When(cmd => cmd.CommandText == "USP_RegisterUser")
.ReturnsScalar(expectedUserId);
// Mock the subsequent read for the newly created user by id
conn.Mocks.When(cmd => cmd.CommandText == "usp_GetUserAccountById")
.ReturnsTable(
MockTable
.WithColumns(
("UserAccountId", typeof(Guid)),
("Username", typeof(string)),
("FirstName", typeof(string)),
("LastName", typeof(string)),
("Email", typeof(string)),
("CreatedAt", typeof(DateTime)),
("UpdatedAt", typeof(DateTime?)),
("DateOfBirth", typeof(DateTime)),
("Timer", typeof(byte[]))
)
.AddRow(
expectedUserId,
"testuser",
"Test",
"User",
"test@example.com",
DateTime.UtcNow,
null,
new DateTime(1990, 1, 1),
null
)
.WithColumns(("UserAccountId", typeof(Guid)))
.AddRow(expectedUserId)
);
var repo = CreateRepo(conn);

View File

@@ -1,108 +0,0 @@
using Apps72.Dev.Data.DbMocker;
using FluentAssertions;
using Infrastructure.Repository.Breweries;
using Infrastructure.Repository.Tests.Database;
using Domain.Entities;
namespace Infrastructure.Repository.Tests.Breweries;
public class BreweryRepositoryTest
{
private static BreweryRepository CreateRepo(MockDbConnection conn) =>
new(new TestConnectionFactory(conn));
[Fact]
public async Task GetByIdAsync_ReturnsBrewery_WhenExists()
{
var breweryId = Guid.NewGuid();
var conn = new MockDbConnection();
// Repository calls the stored procedure
const string getByIdSql = "USP_GetBreweryById";
var locationId = Guid.NewGuid();
conn.Mocks.When(cmd => cmd.CommandText == getByIdSql)
.ReturnsTable(
MockTable
.WithColumns(
("BreweryPostId", typeof(Guid)),
("PostedById", typeof(Guid)),
("BreweryName", typeof(string)),
("Description", typeof(string)),
("CreatedAt", typeof(DateTime)),
("UpdatedAt", typeof(DateTime?)),
("Timer", typeof(byte[])),
("BreweryPostLocationId", typeof(Guid)),
("CityId", typeof(Guid)),
("AddressLine1", typeof(string)),
("AddressLine2", typeof(string)),
("PostalCode", typeof(string)),
("Coordinates", typeof(byte[]))
)
.AddRow(
breweryId,
Guid.NewGuid(),
"Test Brewery",
"A test brewery description",
DateTime.UtcNow,
null,
null,
locationId,
Guid.NewGuid(),
"123 Main St",
null,
"12345",
null
)
);
var repo = CreateRepo(conn);
var result = await repo.GetByIdAsync(breweryId);
result.Should().NotBeNull();
result!.BreweryPostId.Should().Be(breweryId);
result.Location.Should().NotBeNull();
result.Location!.BreweryPostLocationId.Should().Be(locationId);
}
[Fact]
public async Task GetByIdAsync_ReturnsNull_WhenNotExists()
{
var conn = new MockDbConnection();
conn.Mocks.When(cmd => cmd.CommandText == "USP_GetBreweryById")
.ReturnsTable(MockTable.Empty());
var repo = CreateRepo(conn);
var result = await repo.GetByIdAsync(Guid.NewGuid());
result.Should().BeNull();
}
[Fact]
public async Task CreateAsync_ExecutesSuccessfully()
{
var conn = new MockDbConnection();
conn.Mocks.When(cmd => cmd.CommandText == "USP_CreateBrewery")
.ReturnsScalar(1);
var repo = CreateRepo(conn);
var brewery = new BreweryPost
{
BreweryPostId = Guid.NewGuid(),
PostedById = Guid.NewGuid(),
BreweryName = "Test Brewery",
Description = "A test brewery description",
CreatedAt = DateTime.UtcNow,
Location = new BreweryPostLocation
{
BreweryPostLocationId = Guid.NewGuid(),
CityId = Guid.NewGuid(),
AddressLine1 = "123 Main St",
PostalCode = "12345",
Coordinates = [0x00, 0x01]
}
};
// Should not throw
var act = async () => await repo.CreateAsync(brewery);
await act.Should().NotThrowAsync();
}
}

View File

@@ -33,39 +33,18 @@ public class AuthRepository(ISqlConnectionFactory connectionFactory)
AddParameter(command, "@Hash", passwordHash);
var result = await command.ExecuteScalarAsync();
var userAccountId = result != null ? (Guid)result : Guid.Empty;
Guid userAccountId = Guid.Empty;
if (result != null && result != DBNull.Value)
return new Domain.Entities.UserAccount
{
if (result is Guid g)
{
userAccountId = g;
}
else if (result is string s && Guid.TryParse(s, out var parsed))
{
userAccountId = parsed;
}
else if (result is byte[] bytes && bytes.Length == 16)
{
userAccountId = new Guid(bytes);
}
else
{
// Fallback: try to convert and parse string representation
try
{
var str = result.ToString();
if (!string.IsNullOrEmpty(str) && Guid.TryParse(str, out var p))
userAccountId = p;
}
catch
{
userAccountId = Guid.Empty;
}
}
}
return await GetUserByIdAsync(userAccountId) ?? throw new Exception("Failed to retrieve newly registered user.");
UserAccountId = userAccountId,
Username = username,
FirstName = firstName,
LastName = lastName,
Email = email,
DateOfBirth = dateOfBirth,
CreatedAt = DateTime.UtcNow,
};
}
public async Task<Domain.Entities.UserAccount?> GetUserByEmailAsync(

View File

@@ -1,147 +0,0 @@
using System.Data.Common;
using Domain.Entities;
using Infrastructure.Repository.Sql;
namespace Infrastructure.Repository.Breweries;
public class BreweryRepository(ISqlConnectionFactory connectionFactory)
: Repository<BreweryPost>(connectionFactory), IBreweryRepository
{
private readonly ISqlConnectionFactory _connectionFactory = connectionFactory;
public async Task<BreweryPost?> GetByIdAsync(Guid id)
{
await using var connection = await CreateConnection();
await using var command = connection.CreateCommand();
command.CommandType = System.Data.CommandType.StoredProcedure;
command.CommandText = "USP_GetBreweryById";
AddParameter(command, "@BreweryPostID", id);
await using var reader = await command.ExecuteReaderAsync();
if (await reader.ReadAsync())
{
return MapToEntity(reader);
}
return null;
}
public Task<IEnumerable<BreweryPost>> GetAllAsync(int? limit, int? offset)
{
throw new NotImplementedException();
}
public Task UpdateAsync(BreweryPost brewery)
{
throw new NotImplementedException();
}
public Task DeleteAsync(Guid id)
{
throw new NotImplementedException();
}
public async Task CreateAsync(BreweryPost brewery)
{
await using var connection = await CreateConnection();
await using var command = connection.CreateCommand();
command.CommandText = "USP_CreateBrewery";
command.CommandType = System.Data.CommandType.StoredProcedure;
if (brewery.Location is null)
{
throw new ArgumentException("Location must be provided when creating a brewery.");
}
AddParameter(command, "@BreweryName", brewery.BreweryName);
AddParameter(command, "@Description", brewery.Description);
AddParameter(command, "@PostedByID", brewery.PostedById);
AddParameter(command, "@CityID", brewery.Location?.CityId);
AddParameter(command, "@AddressLine1", brewery.Location?.AddressLine1);
AddParameter(command, "@AddressLine2", brewery.Location?.AddressLine2);
AddParameter(command, "@PostalCode", brewery.Location?.PostalCode);
AddParameter(command, "@Coordinates", brewery.Location?.Coordinates);
await command.ExecuteNonQueryAsync();
}
protected override BreweryPost MapToEntity(DbDataReader reader)
{
var brewery = new BreweryPost();
var ordBreweryPostId = reader.GetOrdinal("BreweryPostId");
var ordPostedById = reader.GetOrdinal("PostedById");
var ordBreweryName = reader.GetOrdinal("BreweryName");
var ordDescription = reader.GetOrdinal("Description");
var ordCreatedAt = reader.GetOrdinal("CreatedAt");
var ordUpdatedAt = reader.GetOrdinal("UpdatedAt");
var ordTimer = reader.GetOrdinal("Timer");
brewery.BreweryPostId = reader.GetGuid(ordBreweryPostId);
brewery.PostedById = reader.GetGuid(ordPostedById);
brewery.BreweryName = reader.GetString(ordBreweryName);
brewery.Description = reader.GetString(ordDescription);
brewery.CreatedAt = reader.GetDateTime(ordCreatedAt);
brewery.UpdatedAt = reader.IsDBNull(ordUpdatedAt) ? null : reader.GetDateTime(ordUpdatedAt);
// Read timer (varbinary/rowversion) robustly
if (reader.IsDBNull(ordTimer))
{
brewery.Timer = null;
}
else
{
try
{
brewery.Timer = reader.GetFieldValue<byte[]>(ordTimer);
}
catch
{
var length = reader.GetBytes(ordTimer, 0, null, 0, 0);
var buffer = new byte[length];
reader.GetBytes(ordTimer, 0, buffer, 0, (int)length);
brewery.Timer = buffer;
}
}
// Map BreweryPostLocation if columns are present
try
{
var ordLocationId = reader.GetOrdinal("BreweryPostLocationId");
if (!reader.IsDBNull(ordLocationId))
{
var location = new BreweryPostLocation
{
BreweryPostLocationId = reader.GetGuid(ordLocationId),
BreweryPostId = reader.GetGuid(reader.GetOrdinal("BreweryPostId")),
CityId = reader.GetGuid(reader.GetOrdinal("CityId")),
AddressLine1 = reader.GetString(reader.GetOrdinal("AddressLine1")),
AddressLine2 = reader.IsDBNull(reader.GetOrdinal("AddressLine2")) ? null : reader.GetString(reader.GetOrdinal("AddressLine2")),
PostalCode = reader.GetString(reader.GetOrdinal("PostalCode")),
Coordinates = reader.IsDBNull(reader.GetOrdinal("Coordinates")) ? null : reader.GetFieldValue<byte[]>(reader.GetOrdinal("Coordinates"))
};
brewery.Location = location;
}
}
catch (IndexOutOfRangeException)
{
// Location columns not present, skip mapping location
}
return brewery;
}
private static void AddParameter(
DbCommand command,
string name,
object? value
)
{
var p = command.CreateParameter();
p.ParameterName = name;
p.Value = value ?? DBNull.Value;
command.Parameters.Add(p);
}
}

View File

@@ -1,12 +0,0 @@
using Domain.Entities;
namespace Infrastructure.Repository.Breweries;
public interface IBreweryRepository
{
Task<BreweryPost?> GetByIdAsync(Guid id);
Task<IEnumerable<BreweryPost>> GetAllAsync(int? limit, int? offset);
Task UpdateAsync(BreweryPost brewery);
Task DeleteAsync(Guid id);
Task CreateAsync(BreweryPost brewery);
}

View File

@@ -1,69 +0,0 @@
using FluentAssertions;
using Xunit;
using Service.Breweries;
using API.Core.Contracts.Breweries;
using Domain.Entities;
namespace Service.Breweries.Tests;
public class BreweryServiceTests
{
private class FakeRepo : IBreweryRepository
{
public BreweryPost? Created;
public Task<BreweryPost?> GetByIdAsync(Guid id) => Task.FromResult<BreweryPost?>(null);
public Task<IEnumerable<BreweryPost>> GetAllAsync(int? limit, int? offset) => Task.FromResult<IEnumerable<BreweryPost>>(Array.Empty<BreweryPost>());
public Task UpdateAsync(BreweryPost brewery) { Created = brewery; return Task.CompletedTask; }
public Task DeleteAsync(Guid id) => Task.CompletedTask;
public Task CreateAsync(BreweryPost brewery) { Created = brewery; return Task.CompletedTask; }
}
[Fact]
public async Task CreateAsync_ReturnsFailure_WhenLocationMissing()
{
var repo = new FakeRepo();
var svc = new BreweryService(repo);
var dto = new BreweryCreateDto
{
PostedById = Guid.NewGuid(),
BreweryName = "X",
Description = "Y",
Location = null!
};
var result = await svc.CreateAsync(dto);
result.Success.Should().BeFalse();
result.Message.Should().Contain("Location");
}
[Fact]
public async Task CreateAsync_ReturnsSuccess_AndPersistsEntity()
{
var repo = new FakeRepo();
var svc = new BreweryService(repo);
var loc = new BreweryLocationCreateDto
{
CityId = Guid.NewGuid(),
AddressLine1 = "123 Main",
PostalCode = "12345"
};
var dto = new BreweryCreateDto
{
PostedById = Guid.NewGuid(),
BreweryName = "MyBrew",
Description = "Desc",
Location = loc
};
var result = await svc.CreateAsync(dto);
result.Success.Should().BeTrue();
repo.Created.Should().NotBeNull();
repo.Created!.BreweryName.Should().Be("MyBrew");
result.Brewery.BreweryName.Should().Be("MyBrew");
}
}

View File

@@ -1,28 +0,0 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFramework>net10.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
<IsPackable>false</IsPackable>
<RootNamespace>Service.Breweries.Tests</RootNamespace>
</PropertyGroup>
<ItemGroup>
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="18.0.1" />
<PackageReference Include="xunit" Version="2.9.2" />
<PackageReference Include="xunit.runner.visualstudio" Version="2.8.2" />
<PackageReference Include="FluentAssertions" Version="6.9.0" />
</ItemGroup>
<ItemGroup>
<Using Include="Xunit" />
</ItemGroup>
<ItemGroup>
<ProjectReference Include="..\Service.Breweries\Service.Breweries.csproj" />
<ProjectReference Include="..\Service.Auth\Service.Auth.csproj" />
<ProjectReference
Include="..\..\Infrastructure\Infrastructure.Repository\Infrastructure.Repository.csproj" />
<ProjectReference Include="..\..\API\API.Core\API.Core.csproj" />
</ItemGroup>
</Project>

View File

@@ -1,65 +0,0 @@
using Domain.Entities;
using Infrastructure.Repository.Breweries;
namespace Service.Breweries;
public class BreweryService(IBreweryRepository repository) : IBreweryService
{
public Task<BreweryPost?> GetByIdAsync(Guid id) =>
repository.GetByIdAsync(id);
public Task<IEnumerable<BreweryPost>> GetAllAsync(int? limit = null, int? offset = null) =>
repository.GetAllAsync(limit, offset);
public async Task<BreweryServiceReturn> CreateAsync(BreweryCreateRequest request)
{
var entity = new BreweryPost
{
BreweryPostId = Guid.NewGuid(),
PostedById = request.PostedById,
BreweryName = request.BreweryName,
Description = request.Description,
CreatedAt = DateTime.UtcNow,
Location = new BreweryPostLocation
{
BreweryPostLocationId = Guid.NewGuid(),
CityId = request.Location.CityId,
AddressLine1 = request.Location.AddressLine1,
AddressLine2 = request.Location.AddressLine2,
PostalCode = request.Location.PostalCode,
Coordinates = request.Location.Coordinates,
},
};
await repository.CreateAsync(entity);
return new BreweryServiceReturn(entity);
}
public async Task<BreweryServiceReturn> UpdateAsync(BreweryUpdateRequest request)
{
var entity = new BreweryPost
{
BreweryPostId = request.BreweryPostId,
PostedById = request.PostedById,
BreweryName = request.BreweryName,
Description = request.Description,
UpdatedAt = DateTime.UtcNow,
Location = request.Location is null ? null : new BreweryPostLocation
{
BreweryPostLocationId = request.Location.BreweryPostLocationId,
BreweryPostId = request.BreweryPostId,
CityId = request.Location.CityId,
AddressLine1 = request.Location.AddressLine1,
AddressLine2 = request.Location.AddressLine2,
PostalCode = request.Location.PostalCode,
Coordinates = request.Location.Coordinates,
},
};
await repository.UpdateAsync(entity);
return new BreweryServiceReturn(entity);
}
public Task DeleteAsync(Guid id) =>
repository.DeleteAsync(id);
}

View File

@@ -1,64 +0,0 @@
using Domain.Entities;
namespace Service.Breweries;
public record BreweryCreateRequest(
Guid PostedById,
string BreweryName,
string Description,
BreweryLocationCreateRequest Location
);
public record BreweryLocationCreateRequest(
Guid CityId,
string AddressLine1,
string? AddressLine2,
string PostalCode,
byte[]? Coordinates
);
public record BreweryUpdateRequest(
Guid BreweryPostId,
Guid PostedById,
string BreweryName,
string Description,
BreweryLocationUpdateRequest? Location
);
public record BreweryLocationUpdateRequest(
Guid BreweryPostLocationId,
Guid CityId,
string AddressLine1,
string? AddressLine2,
string PostalCode,
byte[]? Coordinates
);
public record BreweryServiceReturn
{
public bool Success { get; init; }
public BreweryPost Brewery { get; init; }
public string Message { get; init; } = string.Empty;
public BreweryServiceReturn(BreweryPost brewery)
{
Success = true;
Brewery = brewery;
}
public BreweryServiceReturn(string message)
{
Success = false;
Brewery = default!;
Message = message;
}
}
public interface IBreweryService
{
Task<BreweryPost?> GetByIdAsync(Guid id);
Task<IEnumerable<BreweryPost>> GetAllAsync(int? limit = null, int? offset = null);
Task<BreweryServiceReturn> CreateAsync(BreweryCreateRequest request);
Task<BreweryServiceReturn> UpdateAsync(BreweryUpdateRequest request);
Task DeleteAsync(Guid id);
}

View File

@@ -1,12 +0,0 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFramework>net10.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
</PropertyGroup>
<ItemGroup>
<ProjectReference Include="..\..\Domain\Domain.Entities\Domain.Entities.csproj" />
<ProjectReference Include="..\..\Infrastructure\Infrastructure.Repository\Infrastructure.Repository.csproj" />
</ItemGroup>
</Project>