Update documentation

This commit is contained in:
Aaron Po
2026-04-08 22:24:23 -04:00
parent 7807f0bc2a
commit b31be494d7
28 changed files with 487 additions and 93 deletions

View File

@@ -6,7 +6,7 @@ A C++23 tool for processing geographic data and generating brewery metadata. It
The pipeline runs in four stages: The pipeline runs in four stages:
- **Query**: Loads and samples from a local `locations.json` manifest. - **Query**: Loads and samples from a local `locations.json` file.
- **Enrich**: Fetches regional and cultural context from Wikipedia in parallel using `std::async`. - **Enrich**: Fetches regional and cultural context from Wikipedia in parallel using `std::async`.
- **Generate**: Creates authentic brewery names and descriptions using a local GGUF model or a deterministic mock. - **Generate**: Creates authentic brewery names and descriptions using a local GGUF model or a deterministic mock.
- **Log**: Outputs results and metadata summaries via spdlog. - **Log**: Outputs results and metadata summaries via spdlog.
@@ -26,7 +26,9 @@ The pipeline runs in four stages:
## Hardware & GPU Config ## Hardware & GPU Config
### Test Machine ### Test Machines
#### x86-64 Linux, NVIDIA RTX 2000
- **Host**: ThinkPad P1 Gen 7 (Fedora 43) - **Host**: ThinkPad P1 Gen 7 (Fedora 43)
- **CPU**: Intel Core Ultra 7 155H - **CPU**: Intel Core Ultra 7 155H
@@ -35,6 +37,15 @@ The pipeline runs in four stages:
- **Model**: Qwen3-8B-Q6-K - **Model**: Qwen3-8B-Q6-K
- **Inference**: llama.cpp with CUDA 12.x support - **Inference**: llama.cpp with CUDA 12.x support
#### ARM macOS, M1 Pro
- **Host**: MacBook Pro 14" (2021)
- **CPU**: Apple M1 Pro (8-core)
- **GPU**: Apple M1 Pro (14-core) [Integrated]
- **Memory**: 16GB
- **Model**: Qwen3-8B-Q6-K
- **Inference**: llama.cpp with Metal support
### GPU Build Flags ### GPU Build Flags
```bash ```bash
@@ -42,6 +53,11 @@ cmake -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=89 ..
cmake --build . --config Release cmake --build . --config Release
``` ```
```zsh
# macOS (Apple Silicon): no extra flags; llama.cpp enables its Metal backend by default on macOS.
cmake ..
cmake --build .
```
## Core Components ## Core Components
| Component | Function | | Component | Function |

View File

@@ -1,6 +1,11 @@
#ifndef BIERGARTEN_PIPELINE_BIERGARTEN_DATA_GENERATOR_H_ #ifndef BIERGARTEN_PIPELINE_BIERGARTEN_DATA_GENERATOR_H_
#define BIERGARTEN_PIPELINE_BIERGARTEN_DATA_GENERATOR_H_ #define BIERGARTEN_PIPELINE_BIERGARTEN_DATA_GENERATOR_H_
/**
* @file biergarten_data_generator.h
* @brief Core orchestration class for pipeline data generation.
*/
#include <memory> #include <memory>
#include <string> #include <string>
#include <vector> #include <vector>

View File

@@ -1,28 +1,68 @@
#ifndef BIERGARTEN_PIPELINE_DATA_GENERATION_DATA_GENERATOR_H_ #ifndef BIERGARTEN_PIPELINE_DATA_GENERATION_DATA_GENERATOR_H_
#define BIERGARTEN_PIPELINE_DATA_GENERATION_DATA_GENERATOR_H_ #define BIERGARTEN_PIPELINE_DATA_GENERATION_DATA_GENERATOR_H_
/**
* @file data_generation/data_generator.h
* @brief Shared generator interfaces and result models.
*/
#include <string> #include <string>
/**
* @brief Generated brewery payload.
*/
struct BreweryResult { struct BreweryResult {
/// @brief Brewery display name.
std::string name; std::string name;
/// @brief Brewery description text.
std::string description; std::string description;
}; };
/**
* @brief Generated user profile payload.
*/
struct UserResult { struct UserResult {
/// @brief Username handle.
std::string username; std::string username;
/// @brief Short user biography.
std::string bio; std::string bio;
}; };
/**
* @brief Interface for data generator implementations.
*/
class DataGenerator { class DataGenerator {
public: public:
/// @brief Virtual destructor for polymorphic cleanup.
virtual ~DataGenerator() = default; virtual ~DataGenerator() = default;
/**
* @brief Loads and initializes generator resources.
*
* @param model_path Path to model assets. Implementations may ignore this.
*/
virtual void Load(const std::string& model_path) = 0; virtual void Load(const std::string& model_path) = 0;
/**
* @brief Generates brewery data for a location.
*
* @param city_name City name.
* @param country_name Country name.
* @param region_context Additional regional context text.
* @return Brewery generation result.
*/
virtual BreweryResult GenerateBrewery(const std::string& city_name, virtual BreweryResult GenerateBrewery(const std::string& city_name,
const std::string& country_name, const std::string& country_name,
const std::string& region_context) = 0; const std::string& region_context) = 0;
/**
* @brief Generates a user profile for a locale.
*
* @param locale Locale hint used by generator.
* @return User generation result.
*/
virtual UserResult GenerateUser(const std::string& locale) = 0; virtual UserResult GenerateUser(const std::string& locale) = 0;
}; };

View File

@@ -1,6 +1,11 @@
#ifndef BIERGARTEN_PIPELINE_DATA_GENERATION_LLAMA_GENERATOR_H_ #ifndef BIERGARTEN_PIPELINE_DATA_GENERATION_LLAMA_GENERATOR_H_
#define BIERGARTEN_PIPELINE_DATA_GENERATION_LLAMA_GENERATOR_H_ #define BIERGARTEN_PIPELINE_DATA_GENERATION_LLAMA_GENERATOR_H_
/**
* @file data_generation/llama_generator.h
* @brief Llama.cpp-backed implementation of DataGenerator.
*/
#include <cstdint> #include <cstdint>
#include <string> #include <string>
@@ -9,34 +14,107 @@
struct llama_model; struct llama_model;
struct llama_context; struct llama_context;
/**
* @brief Data generator implementation backed by llama.cpp.
*/
class LlamaGenerator final : public DataGenerator { class LlamaGenerator final : public DataGenerator {
public: public:
/// @brief Constructs a generator with default sampling and context settings.
LlamaGenerator() = default; LlamaGenerator() = default;
/// @brief Releases model/context resources.
~LlamaGenerator() override; ~LlamaGenerator() override;
/**
* @brief Configures sampling parameters for generation.
*
* @param temperature Sampling temperature.
* @param top_p Nucleus sampling threshold.
* @param seed Seed for sampling; use -1 for random seed.
*/
void SetSamplingOptions(float temperature, float top_p, int seed = -1); void SetSamplingOptions(float temperature, float top_p, int seed = -1);
/**
* @brief Sets context window size used during model load.
*
* @param n_ctx Context size in tokens.
*/
void SetContextSize(uint32_t n_ctx); void SetContextSize(uint32_t n_ctx);
/**
* @brief Loads model and prepares inference context.
*
* @param model_path Filesystem path to GGUF model.
*/
void Load(const std::string& model_path) override; void Load(const std::string& model_path) override;
/**
* @brief Generates brewery data for a specific location.
*
* @param city_name City name.
* @param country_name Country name.
* @param region_context Additional regional context.
* @return Generated brewery result.
*/
BreweryResult GenerateBrewery(const std::string& city_name, BreweryResult GenerateBrewery(const std::string& city_name,
const std::string& country_name, const std::string& country_name,
const std::string& region_context) override; const std::string& region_context) override;
/**
* @brief Generates a user profile for the provided locale.
*
* @param locale Locale hint.
* @return Generated user profile.
*/
UserResult GenerateUser(const std::string& locale) override; UserResult GenerateUser(const std::string& locale) override;
private: private:
/**
* @brief Infers text from a user prompt.
*
* @param prompt User prompt.
* @param max_tokens Maximum tokens to generate.
* @return Generated text.
*/
std::string Infer(const std::string& prompt, int max_tokens = 10000); std::string Infer(const std::string& prompt, int max_tokens = 10000);
// Overload that allows passing a system message separately so chat-capable
// models receive a proper system role instead of having the system text /**
// concatenated into the user prompt (helps avoid revealing internal * @brief Infers text from separate system and user prompts.
// reasoning or instructions in model output). *
* This helps chat-capable models preserve system-role behavior instead of
* concatenating system text into user input.
*
* @param system_prompt System role prompt.
* @param prompt User prompt.
* @param max_tokens Maximum tokens to generate.
* @return Generated text.
*/
std::string Infer(const std::string& system_prompt, std::string Infer(const std::string& system_prompt,
const std::string& prompt, int max_tokens = 10000); const std::string& prompt, int max_tokens = 10000);
/**
* @brief Runs inference on an already-formatted prompt.
*
* @param formatted_prompt Prompt preformatted for model chat template.
* @param max_tokens Maximum tokens to generate.
* @return Generated text.
*/
std::string InferFormatted(const std::string& formatted_prompt, std::string InferFormatted(const std::string& formatted_prompt,
int max_tokens = 10000); int max_tokens = 10000);
/**
* @brief Loads the brewery system prompt from disk.
*
* @param prompt_file_path Prompt file path to try first.
* @return Loaded prompt text or fallback prompt.
*/
std::string LoadBrewerySystemPrompt(const std::string& prompt_file_path); std::string LoadBrewerySystemPrompt(const std::string& prompt_file_path);
/**
* @brief Returns a built-in fallback system prompt.
*
* @return Fallback prompt text.
*/
std::string GetFallbackBreweryPrompt(); std::string GetFallbackBreweryPrompt();
llama_model* model_ = nullptr; llama_model* model_ = nullptr;

View File

@@ -1,6 +1,11 @@
#ifndef BIERGARTEN_PIPELINE_DATA_GENERATION_LLAMA_GENERATOR_HELPERS_H_ #ifndef BIERGARTEN_PIPELINE_DATA_GENERATION_LLAMA_GENERATOR_HELPERS_H_
#define BIERGARTEN_PIPELINE_DATA_GENERATION_LLAMA_GENERATOR_HELPERS_H_ #define BIERGARTEN_PIPELINE_DATA_GENERATION_LLAMA_GENERATOR_HELPERS_H_
/**
* @file data_generation/llama_generator_helpers.h
* @brief Shared helper APIs used by LlamaGenerator translation units.
*/
#include <string> #include <string>
#include <utility> #include <utility>
@@ -8,23 +13,66 @@ struct llama_model;
struct llama_vocab; struct llama_vocab;
typedef int llama_token; typedef int llama_token;
// Helper functions for LlamaGenerator methods /**
* @brief Normalizes and truncates regional context.
*
* @param region_context Input regional context text.
* @param max_chars Maximum output length.
* @return Processed region context.
*/
std::string PrepareRegionContextPublic(std::string_view region_context, std::string PrepareRegionContextPublic(std::string_view region_context,
std::size_t max_chars = 700); std::size_t max_chars = 700);
/**
* @brief Parses a response expected to contain two logical lines.
*
* @param raw Raw model output.
* @param error_message Error message thrown on parse failure.
* @return Pair containing first and second parsed fields.
*/
std::pair<std::string, std::string> ParseTwoLineResponsePublic( std::pair<std::string, std::string> ParseTwoLineResponsePublic(
const std::string& raw, const std::string& error_message); const std::string& raw, const std::string& error_message);
/**
* @brief Applies model chat template to a user-only prompt.
*
* @param model Loaded llama model.
* @param user_prompt User prompt text.
* @return Model-formatted prompt.
*/
std::string ToChatPromptPublic(const llama_model* model, std::string ToChatPromptPublic(const llama_model* model,
const std::string& user_prompt); const std::string& user_prompt);
/**
* @brief Applies model chat template to system and user prompts.
*
* @param model Loaded llama model.
* @param system_prompt System prompt text.
* @param user_prompt User prompt text.
* @return Model-formatted prompt.
*/
std::string ToChatPromptPublic(const llama_model* model, std::string ToChatPromptPublic(const llama_model* model,
const std::string& system_prompt, const std::string& system_prompt,
const std::string& user_prompt); const std::string& user_prompt);
/**
* @brief Decodes a sampled token and appends it to output text.
*
* @param vocab Model vocabulary.
* @param token Sampled token id.
* @param output Output text buffer.
*/
void AppendTokenPiecePublic(const llama_vocab* vocab, llama_token token, void AppendTokenPiecePublic(const llama_vocab* vocab, llama_token token,
std::string& output); std::string& output);
/**
* @brief Validates and parses brewery JSON output.
*
* @param raw Raw model output.
* @param name_out Parsed brewery name.
* @param description_out Parsed brewery description.
* @return Empty string on success, or validation error message.
*/
std::string ValidateBreweryJsonPublic(const std::string& raw, std::string ValidateBreweryJsonPublic(const std::string& raw,
std::string& name_out, std::string& name_out,
std::string& description_out); std::string& description_out);

View File

@@ -1,20 +1,56 @@
#ifndef BIERGARTEN_PIPELINE_DATA_GENERATION_MOCK_GENERATOR_H_ #ifndef BIERGARTEN_PIPELINE_DATA_GENERATION_MOCK_GENERATOR_H_
#define BIERGARTEN_PIPELINE_DATA_GENERATION_MOCK_GENERATOR_H_ #define BIERGARTEN_PIPELINE_DATA_GENERATION_MOCK_GENERATOR_H_
/**
* @file data_generation/mock_generator.h
* @brief Deterministic mock implementation of DataGenerator.
*/
#include <string> #include <string>
#include <vector> #include <vector>
#include "data_generation/data_generator.h" #include "data_generation/data_generator.h"
/**
* @brief Mock generator used for deterministic, model-free outputs.
*/
class MockGenerator final : public DataGenerator { class MockGenerator final : public DataGenerator {
public: public:
/**
* @brief Initializes the mock generator.
*
* @param model_path Unused for mock generation.
*/
void Load(const std::string& model_path) override; void Load(const std::string& model_path) override;
/**
* @brief Generates deterministic brewery data for a location.
*
* @param city_name City name.
* @param country_name Country name.
* @param region_context Unused for mock generation.
* @return Generated brewery result.
*/
BreweryResult GenerateBrewery(const std::string& city_name, BreweryResult GenerateBrewery(const std::string& city_name,
const std::string& country_name, const std::string& country_name,
const std::string& region_context) override; const std::string& region_context) override;
/**
* @brief Generates deterministic user data for a locale.
*
* @param locale Locale hint.
* @return Generated user result.
*/
UserResult GenerateUser(const std::string& locale) override; UserResult GenerateUser(const std::string& locale) override;
private: private:
/**
* @brief Combines two strings into a stable hash value.
*
* @param a First key.
* @param b Second key.
* @return Deterministic hash value.
*/
static std::size_t DeterministicHash(const std::string& a, static std::size_t DeterministicHash(const std::string& a,
const std::string& b); const std::string& b);

View File

@@ -1,15 +1,36 @@
#ifndef BIERGARTEN_PIPELINE_MODELS_LOCATION_H_ #ifndef BIERGARTEN_PIPELINE_MODELS_LOCATION_H_
#define BIERGARTEN_PIPELINE_MODELS_LOCATION_H_ #define BIERGARTEN_PIPELINE_MODELS_LOCATION_H_
/**
* @file data_model/location.h
* @brief Location data model used throughout generation pipeline.
*/
#include <string> #include <string>
/**
* @brief Canonical location record for city-level generation.
*/
struct Location { struct Location {
/// @brief City name.
std::string city; std::string city;
/// @brief State or province name.
std::string state_province; std::string state_province;
/// @brief ISO 3166-2 subdivision code.
std::string iso3166_2; std::string iso3166_2;
/// @brief Country name.
std::string country; std::string country;
/// @brief ISO 3166-1 country code.
std::string iso3166_1; std::string iso3166_1;
/// @brief Latitude in decimal degrees.
double latitude; double latitude;
/// @brief Longitude in decimal degrees.
double longitude; double longitude;
}; };

View File

@@ -1,6 +1,11 @@
#ifndef BIERGARTEN_PIPELINE_JSON_HANDLING_JSON_LOADER_H_ #ifndef BIERGARTEN_PIPELINE_JSON_HANDLING_JSON_LOADER_H_
#define BIERGARTEN_PIPELINE_JSON_HANDLING_JSON_LOADER_H_ #define BIERGARTEN_PIPELINE_JSON_HANDLING_JSON_LOADER_H_
/**
* @file json_handling/json_loader.h
* @brief Loader API for curated location data.
*/
#include <string> #include <string>
#include <vector> #include <vector>

View File

@@ -1,29 +1,70 @@
#ifndef BIERGARTEN_PIPELINE_WEB_CLIENT_CURL_WEB_CLIENT_H_ #ifndef BIERGARTEN_PIPELINE_WEB_CLIENT_CURL_WEB_CLIENT_H_
#define BIERGARTEN_PIPELINE_WEB_CLIENT_CURL_WEB_CLIENT_H_ #define BIERGARTEN_PIPELINE_WEB_CLIENT_CURL_WEB_CLIENT_H_
/**
* @file web_client/curl_web_client.h
* @brief libcurl-based WebClient implementation.
*/
#include <memory> #include <memory>
#include "web_client/web_client.h" #include "web_client/web_client.h"
// RAII for curl_global_init/cleanup. /**
// An instance of this class should be created in main() before any curl * @brief RAII wrapper for curl_global_init and curl_global_cleanup.
// operations and exist for the lifetime of the application. *
* Create one instance in application startup before using libcurl and keep it
* alive for application lifetime.
*/
class CurlGlobalState { class CurlGlobalState {
public: public:
/// @brief Initializes global libcurl state.
CurlGlobalState(); CurlGlobalState();
/// @brief Cleans up global libcurl state.
~CurlGlobalState(); ~CurlGlobalState();
/// @brief Deleted copy constructor; global libcurl state must not be copied.
CurlGlobalState(const CurlGlobalState&) = delete; CurlGlobalState(const CurlGlobalState&) = delete;
/// @brief Deleted copy assignment; global libcurl state must not be copied.
CurlGlobalState& operator=(const CurlGlobalState&) = delete; CurlGlobalState& operator=(const CurlGlobalState&) = delete;
}; };
/**
* @brief WebClient implementation backed by libcurl.
*/
class CURLWebClient : public WebClient { class CURLWebClient : public WebClient {
public: public:
/// @brief Constructs a CURL web client.
CURLWebClient(); CURLWebClient();
/// @brief Destroys the CURL web client.
~CURLWebClient() override; ~CURLWebClient() override;
/**
* @brief Downloads URL contents to a file.
*
* @param url Source URL.
* @param file_path Destination file path.
*/
void DownloadToFile(const std::string& url, void DownloadToFile(const std::string& url,
const std::string& file_path) override; const std::string& file_path) override;
/**
* @brief Executes an HTTP GET request.
*
* @param url Request URL.
* @return Response body.
*/
std::string Get(const std::string& url) override; std::string Get(const std::string& url) override;
/**
* @brief URL-encodes a string value.
*
* @param value Raw value.
* @return URL-encoded string.
*/
std::string UrlEncode(const std::string& value) override; std::string UrlEncode(const std::string& value) override;
}; };

View File

@@ -1,21 +1,44 @@
#ifndef BIERGARTEN_PIPELINE_WEB_CLIENT_WEB_CLIENT_H_ #ifndef BIERGARTEN_PIPELINE_WEB_CLIENT_WEB_CLIENT_H_
#define BIERGARTEN_PIPELINE_WEB_CLIENT_WEB_CLIENT_H_ #define BIERGARTEN_PIPELINE_WEB_CLIENT_WEB_CLIENT_H_
/**
* @file web_client/web_client.h
* @brief Abstract interface for HTTP and URL utilities.
*/
#include <string> #include <string>
/**
* @brief Abstract web client interface.
*/
class WebClient { class WebClient {
public: public:
/// @brief Virtual destructor for polymorphic cleanup.
virtual ~WebClient() = default; virtual ~WebClient() = default;
// Downloads content from a URL to a file. Throws on error. /**
* @brief Downloads content from a URL into a file.
*
* @param url Source URL.
* @param file_path Destination file path.
*/
virtual void DownloadToFile(const std::string& url, virtual void DownloadToFile(const std::string& url,
const std::string& file_path) = 0; const std::string& file_path) = 0;
// Performs a GET request and returns the response body as a string. Throws /**
// on error. * @brief Executes an HTTP GET request.
*
* @param url Request URL.
* @return Response body as a string.
*
* @note Throws on error.
*/
virtual std::string Get(const std::string& url) = 0; virtual std::string Get(const std::string& url) = 0;
// URL-encodes a string. /**
* @brief URL-encodes a string value.
*
* @param value Raw string value.
* @return Encoded value safe for URL usage.
*/
virtual std::string UrlEncode(const std::string& value) = 0; virtual std::string UrlEncode(const std::string& value) = 0;
}; };

View File

@@ -1,6 +1,11 @@
#ifndef BIERGARTEN_PIPELINE_WIKIPEDIA_WIKIPEDIA_SERVICE_H_ #ifndef BIERGARTEN_PIPELINE_WIKIPEDIA_WIKIPEDIA_SERVICE_H_
#define BIERGARTEN_PIPELINE_WIKIPEDIA_WIKIPEDIA_SERVICE_H_ #define BIERGARTEN_PIPELINE_WIKIPEDIA_WIKIPEDIA_SERVICE_H_
/**
* @file wikipedia/wikipedia_service.h
* @brief Wikipedia summary retrieval service with in-memory caching.
*/
#include <memory> #include <memory>
#include <string> #include <string>
#include <string_view> #include <string_view>

View File

@@ -1,3 +1,9 @@
/**
* @file biergarten_data_generator.cpp
* @brief Orchestrates end-to-end pipeline execution for city sampling,
* Wikipedia enrichment, generator initialization, and brewery result output.
*/
#include "biergarten_data_generator.h" #include "biergarten_data_generator.h"
#include <spdlog/spdlog.h> #include <spdlog/spdlog.h>
@@ -80,16 +86,13 @@ auto BiergartenDataGenerator::EnrichWithWikipedia(
pending.reserve(cities.size()); pending.reserve(cities.size());
for (const auto& city : cities) { for (const auto& city : cities) {
pending.push_back(std::async(std::launch::async, pending.push_back(
[web_client = webClient_, city]() { std::async(std::launch::async, [web_client = webClient_, city]() {
WikipediaService wikipedia_service( WikipediaService wikipedia_service(web_client);
web_client);
const std::string region_context = const std::string region_context =
wikipedia_service.GetSummary( wikipedia_service.GetSummary(city.city, city.country);
city.city, city.country); spdlog::debug("[Pipeline] Region context for {}: {}", city.city,
spdlog::debug( region_context);
"[Pipeline] Region context for {}: {}",
city.city, region_context);
return EnrichedCity{city, region_context}; return EnrichedCity{city, region_context};
})); }));
} }
@@ -110,21 +113,23 @@ void BiergartenDataGenerator::GenerateBreweries(
for (const auto& enriched_city : cities) { for (const auto& enriched_city : cities) {
try { try {
auto brewery = generator.GenerateBrewery(enriched_city.location.city, auto brewery = generator.GenerateBrewery(
enriched_city.location.country, enriched_city.location.city, enriched_city.location.country,
enriched_city.region_context); enriched_city.region_context);
generatedBreweries_.push_back({enriched_city.location, brewery}); generatedBreweries_.push_back({enriched_city.location, brewery});
} catch (const std::exception& e) { } catch (const std::exception& e) {
++skipped_count; ++skipped_count;
spdlog::warn( spdlog::warn(
"[Pipeline] Skipping city '{}' ({}): brewery generation failed: {}", "[Pipeline] Skipping city '{}' ({}): brewery generation failed: "
"{}",
enriched_city.location.city, enriched_city.location.country, enriched_city.location.city, enriched_city.location.country,
e.what()); e.what());
} }
} }
if (skipped_count > 0) { if (skipped_count > 0) {
spdlog::warn("[Pipeline] Skipped {} city/cities due to generation " spdlog::warn(
"[Pipeline] Skipped {} city/cities due to generation "
"errors", "errors",
skipped_count); skipped_count);
} }
@@ -134,7 +139,8 @@ void BiergartenDataGenerator::LogResults() const {
spdlog::info("\n=== GENERATED DATA DUMP ==="); spdlog::info("\n=== GENERATED DATA DUMP ===");
size_t index = 1; size_t index = 1;
for (const auto& entry : generatedBreweries_) { for (const auto& entry : generatedBreweries_) {
spdlog::info("{}. city=\"{}\" country=\"{}\" state=\"{}\" " spdlog::info(
"{}. city=\"{}\" country=\"{}\" state=\"{}\" "
"iso3166_2={} lat={} lon={}", "iso3166_2={} lat={} lon={}",
index, entry.location.city, entry.location.country, index, entry.location.city, entry.location.country,
entry.location.state_province, entry.location.iso3166_2, entry.location.state_province, entry.location.iso3166_2,

View File

@@ -1,7 +1,7 @@
/** /**
* Destructor Module * @file data_generation/llama/destructor.cpp
* Ensures proper cleanup of llama.cpp resources (context and model) when the * @brief Releases llama model/context resources and backend state during
* generator is destroyed, preventing memory leaks and resource exhaustion. * LlamaGenerator teardown to avoid leaks across runs.
*/ */
#include "data_generation/llama_generator.h" #include "data_generation/llama_generator.h"

View File

@@ -1,8 +1,7 @@
/** /**
* Brewery Data Generation Module * @file data_generation/llama/generate_brewery.cpp
* Uses the LLM to generate realistic brewery names and descriptions for a given * @brief Builds brewery prompts with regional context, performs retry-based
* location. Implements retry logic with validation and error correction to * inference, and validates structured JSON output for brewery records.
* ensure valid JSON output conforming to the expected schema.
*/ */
#include <spdlog/spdlog.h> #include <spdlog/spdlog.h>

View File

@@ -1,9 +1,7 @@
/** /**
* User Profile Generation Module * @file data_generation/llama/generate_user.cpp
* Uses the LLM to generate realistic user profiles (username and bio) for craft * @brief Generates locale-aware user profiles with strict two-line formatting,
* beer enthusiasts. Implements retry logic to handle parsing failures and * retry handling, and output sanitization for downstream parsing.
* ensures output adheres to strict format constraints (two lines, specific
* character limits).
*/ */
#include <spdlog/spdlog.h> #include <spdlog/spdlog.h>

View File

@@ -1,9 +1,7 @@
/** /**
* Helper Functions Module * @file data_generation/llama/helpers.cpp
* Provides utility functions for text processing, parsing, and chat template * @brief Provides prompt formatting, whitespace normalization, response
* formatting. Functions handle whitespace normalization, response parsing, and * parsing, token decoding, and JSON validation helpers for Llama modules.
* conversion of prompts to proper chat format using the model's built-in
* template.
*/ */
#include <algorithm> #include <algorithm>

View File

@@ -1,8 +1,7 @@
/** /**
* Model Loading Module * @file data_generation/llama/load.cpp
* This module handles loading a pre-trained LLM model from disk and * @brief Initializes llama backend, loads model weights, creates inference
* initializing the llama.cpp context for inference. It performs one-time setup * context, and resets prior resources during model reload.
* required before any inference operations can be performed.
*/ */
#include <spdlog/spdlog.h> #include <spdlog/spdlog.h>

View File

@@ -1,11 +1,24 @@
#include <fstream> /**
#include <filesystem> * @file data_generation/llama/load_brewery_prompt.cpp
* @brief Resolves brewery system prompt content from cache or filesystem
* search paths and provides a robust inline fallback prompt when absent.
*/
#include <spdlog/spdlog.h> #include <spdlog/spdlog.h>
#include <filesystem>
#include <fstream>
#include "data_generation/llama_generator.h" #include "data_generation/llama_generator.h"
namespace fs = std::filesystem; namespace fs = std::filesystem;
/**
* @brief Loads brewery system prompt from disk or cache.
*
* @param prompt_file_path Preferred prompt file location.
* @return Prompt text loaded from disk or fallback content.
*/
std::string LlamaGenerator::LoadBrewerySystemPrompt( std::string LlamaGenerator::LoadBrewerySystemPrompt(
const std::string& prompt_file_path) { const std::string& prompt_file_path) {
// Return cached version if already loaded // Return cached version if already loaded
@@ -29,7 +42,8 @@ std::string LlamaGenerator::LoadBrewerySystemPrompt(
if (!prompt.empty()) { if (!prompt.empty()) {
spdlog::info( spdlog::info(
"LlamaGenerator: Loaded brewery system prompt from '{}' ({} chars)", "LlamaGenerator: Loaded brewery system prompt from '{}' ({} "
"chars)",
path, prompt.length()); path, prompt.length());
brewery_system_prompt_ = prompt; brewery_system_prompt_ = prompt;
return brewery_system_prompt_; return brewery_system_prompt_;
@@ -38,16 +52,23 @@ std::string LlamaGenerator::LoadBrewerySystemPrompt(
} }
spdlog::warn( spdlog::warn(
"LlamaGenerator: Could not open brewery system prompt file at any of the " "LlamaGenerator: Could not open brewery system prompt file at any of "
"the "
"expected locations. Using fallback inline prompt."); "expected locations. Using fallback inline prompt.");
return GetFallbackBreweryPrompt(); return GetFallbackBreweryPrompt();
} }
// Fallback: minimal inline prompt if file fails to load /**
* @brief Provides an inline fallback brewery system prompt.
*
* @return Default fallback prompt text.
*/
std::string LlamaGenerator::GetFallbackBreweryPrompt() { std::string LlamaGenerator::GetFallbackBreweryPrompt() {
return "You are an experienced brewmaster and owner of a local craft brewery. " return "You are an experienced brewmaster and owner of a local craft "
"brewery. "
"Create a distinctive, authentic name and detailed description that " "Create a distinctive, authentic name and detailed description that "
"genuinely reflects your specific location, brewing philosophy, local " "genuinely reflects your specific location, brewing philosophy, "
"local "
"culture, and community connection. The brewery must feel real and " "culture, and community connection. The brewery must feel real and "
"grounded—not generic or interchangeable.\n\n" "grounded—not generic or interchangeable.\n\n"
"AVOID REPETITIVE PHRASES - Never use:\n" "AVOID REPETITIVE PHRASES - Never use:\n"
@@ -56,14 +77,16 @@ std::string LlamaGenerator::GetFallbackBreweryPrompt() {
"into, ancient roots, timeless, where tradition meets innovation\n\n" "into, ancient roots, timeless, where tradition meets innovation\n\n"
"OPENING APPROACHES - Choose ONE:\n" "OPENING APPROACHES - Choose ONE:\n"
"1. Start with specific beer style and its regional origins\n" "1. Start with specific beer style and its regional origins\n"
"2. Begin with specific brewing challenge (water, altitude, climate)\n" "2. Begin with specific brewing challenge (water, altitude, "
"climate)\n"
"3. Open with founding story or personal motivation\n" "3. Open with founding story or personal motivation\n"
"4. Lead with specific local ingredient or resource\n" "4. Lead with specific local ingredient or resource\n"
"5. Start with unexpected angle or contradiction\n" "5. Start with unexpected angle or contradiction\n"
"6. Open with local event, tradition, or cultural moment\n" "6. Open with local event, tradition, or cultural moment\n"
"7. Begin with tangible architectural or geographic detail\n\n" "7. Begin with tangible architectural or geographic detail\n\n"
"BE SPECIFIC - Include:\n" "BE SPECIFIC - Include:\n"
"- At least ONE concrete proper noun (landmark, river, neighborhood)\n" "- At least ONE concrete proper noun (landmark, river, "
"neighborhood)\n"
"- Specific beer styles relevant to the REGION'S culture\n" "- Specific beer styles relevant to the REGION'S culture\n"
"- Concrete brewing challenges or advantages\n" "- Concrete brewing challenges or advantages\n"
"- Sensory details SPECIFIC to place—not generic adjectives\n\n" "- Sensory details SPECIFIC to place—not generic adjectives\n\n"

View File

@@ -1,8 +1,7 @@
/** /**
* Sampling Configuration Module * @file data_generation/llama/set_sampling_options.cpp
* Configures the hyperparameters that control probabilistic token selection * @brief Validates and stores sampling temperature, top-p, seed, and context
* during text generation. These settings affect the randomness, diversity, and * size configuration used by subsequent LlamaGenerator inference calls.
* quality of generated output.
*/ */
#include <stdexcept> #include <stdexcept>

View File

@@ -1,3 +1,9 @@
/**
* @file data_generation/mock/data.cpp
* @brief Defines static lookup tables used by MockGenerator for deterministic
* brewery names, descriptions, usernames, and bios.
*/
#include <string> #include <string>
#include <vector> #include <vector>

View File

@@ -1,12 +1,18 @@
/**
* @file data_generation/mock/deterministic_hash.cpp
* @brief Implements a stable hash combiner used by MockGenerator to derive
* repeatable pseudo-random indices from location input.
*/
#include <boost/container_hash/hash.hpp>
#include <string> #include <string>
#include "data_generation/mock_generator.h" #include "data_generation/mock_generator.h"
std::size_t MockGenerator::DeterministicHash(const std::string& a, std::size_t MockGenerator::DeterministicHash(const std::string& a,
const std::string& b) { const std::string& b) {
std::size_t seed = std::hash<std::string>{}(a); std::size_t seed = 0;
const std::size_t mixed = std::hash<std::string>{}(b); boost::hash_combine(seed, a);
seed ^= mixed + 0x9e3779b97f4a7c15ULL + (seed << 6) + (seed >> 2); boost::hash_combine(seed, b);
seed = (seed << 13) | (seed >> ((sizeof(std::size_t) * 8) - 13));
return seed; return seed;
} }

View File

@@ -1,3 +1,9 @@
/**
* @file data_generation/mock/generate_brewery.cpp
* @brief Builds deterministic brewery names and descriptions by hashing city
* and country into fixed mock phrase catalogs.
*/
#include <string> #include <string>
#include "data_generation/mock_generator.h" #include "data_generation/mock_generator.h"
@@ -10,7 +16,8 @@ auto MockGenerator::GenerateBrewery(const std::string& city_name,
const std::string& adjective = const std::string& adjective =
kBreweryAdjectives.at(hash % kBreweryAdjectives.size()); kBreweryAdjectives.at(hash % kBreweryAdjectives.size());
const std::string& noun = kBreweryNouns.at((hash / 7) % kBreweryNouns.size()); const std::string& noun =
kBreweryNouns.at((hash / 7) % kBreweryNouns.size());
const std::string& base_description = const std::string& base_description =
kBreweryDescriptions.at((hash / 13) % kBreweryDescriptions.size()); kBreweryDescriptions.at((hash / 13) % kBreweryDescriptions.size());

View File

@@ -1,3 +1,9 @@
/**
* @file data_generation/mock/generate_user.cpp
* @brief Generates deterministic mock user profiles by hashing locale values
* into predefined username and bio collections.
*/
#include <functional> #include <functional>
#include <string> #include <string>

View File

@@ -1,3 +1,9 @@
/**
* @file data_generation/mock/load.cpp
* @brief Provides MockGenerator initialization behavior, which is a no-op load
* path that logs readiness without model resources.
*/
#include <spdlog/spdlog.h> #include <spdlog/spdlog.h>
#include <string> #include <string>

View File

@@ -1,21 +1,26 @@
/**
* @file json_handling/json_loader.cpp
* @brief Parses curated location JSON input into strongly typed Location
* records with strict field validation and descriptive error reporting.
*/
#include "json_handling/json_loader.h" #include "json_handling/json_loader.h"
#include <spdlog/spdlog.h> #include <spdlog/spdlog.h>
#include <boost/json.hpp> #include <boost/json.hpp>
#include <fstream> #include <fstream>
#include <sstream> #include <sstream>
#include <stdexcept> #include <stdexcept>
namespace { namespace {
auto ReadRequiredString(const boost::json::object& object, auto ReadRequiredString(const boost::json::object& object, const char* key)
const char* key) -> std::string { -> std::string {
const boost::json::value* value = object.if_contains(key); const boost::json::value* value = object.if_contains(key);
if (value == nullptr || !value->is_string()) { if (value == nullptr || !value->is_string()) {
throw std::runtime_error(std::string("Missing or invalid string field: ") + throw std::runtime_error(
key); std::string("Missing or invalid string field: ") + key);
} }
return std::string(value->as_string().c_str()); return std::string(value->as_string().c_str());
} }
@@ -24,8 +29,8 @@ auto ReadRequiredNumber(const boost::json::object& object, const char* key)
-> double { -> double {
const boost::json::value* value = object.if_contains(key); const boost::json::value* value = object.if_contains(key);
if (value == nullptr || !value->is_number()) { if (value == nullptr || !value->is_number()) {
throw std::runtime_error(std::string("Missing or invalid numeric field: ") + throw std::runtime_error(
key); std::string("Missing or invalid numeric field: ") + key);
} }
return value->to_number<double>(); return value->to_number<double>();
} }

View File

@@ -1,3 +1,9 @@
/**
* @file main.cpp
* @brief Parses command-line options, validates runtime mode selection,
* initializes shared infrastructure, and executes the pipeline entry flow.
*/
#include <spdlog/spdlog.h> #include <spdlog/spdlog.h>
#include <boost/program_options.hpp> #include <boost/program_options.hpp>

View File

@@ -1,3 +1,9 @@
/**
* @file web_client/curl_web_client.cpp
* @brief Implements libcurl-backed HTTP utilities, including GET requests,
* file downloads, URL encoding, and RAII global curl lifecycle handling.
*/
#include "web_client/curl_web_client.h" #include "web_client/curl_web_client.h"
#include <curl/curl.h> #include <curl/curl.h>

View File

@@ -1,3 +1,9 @@
/**
* @file wikipedia/wikipedia_service.cpp
* @brief Implements Wikipedia extract retrieval and caching for city/country
* queries, including response parsing and resilient error handling.
*/
#include "wikipedia/wikipedia_service.h" #include "wikipedia/wikipedia_service.h"
#include <spdlog/spdlog.h> #include <spdlog/spdlog.h>