Update documentation

This commit is contained in:
Aaron Po
2026-04-08 22:24:23 -04:00
parent 7807f0bc2a
commit b31be494d7
28 changed files with 487 additions and 93 deletions

View File

@@ -1,3 +1,9 @@
/**
* @file biergarten_data_generator.cpp
* @brief Orchestrates end-to-end pipeline execution for city sampling,
* Wikipedia enrichment, generator initialization, and brewery result output.
*/
#include "biergarten_data_generator.h"
#include <spdlog/spdlog.h>
@@ -14,11 +20,11 @@
#include "wikipedia/wikipedia_service.h"
BiergartenDataGenerator::BiergartenDataGenerator(
const ApplicationOptions& options, std::shared_ptr<WebClient> web_client)
: options_(options), webClient_(std::move(web_client)) {}
const ApplicationOptions& options, std::shared_ptr<WebClient> web_client)
: options_(options), webClient_(std::move(web_client)) {}
auto BiergartenDataGenerator::InitializeGenerator()
-> std::unique_ptr<DataGenerator> {
-> std::unique_ptr<DataGenerator> {
spdlog::info("Initializing brewery generator...");
std::unique_ptr<DataGenerator> generator;
@@ -43,7 +49,7 @@ auto BiergartenDataGenerator::InitializeGenerator()
}
auto BiergartenDataGenerator::QueryCitiesWithCountries()
-> std::vector<Location> {
-> std::vector<Location> {
spdlog::info("\n=== GEOGRAPHIC DATA OVERVIEW ===");
std::filesystem::path locations_path = "locations.json";
@@ -72,7 +78,7 @@ auto BiergartenDataGenerator::QueryCitiesWithCountries()
}
auto BiergartenDataGenerator::EnrichWithWikipedia(
const std::vector<Location>& cities) -> std::vector<EnrichedCity> {
const std::vector<Location>& cities) -> std::vector<EnrichedCity> {
std::vector<EnrichedCity> enriched;
enriched.reserve(cities.size());
@@ -80,18 +86,15 @@ auto BiergartenDataGenerator::EnrichWithWikipedia(
pending.reserve(cities.size());
for (const auto& city : cities) {
pending.push_back(std::async(std::launch::async,
[web_client = webClient_, city]() {
WikipediaService wikipedia_service(
web_client);
const std::string region_context =
wikipedia_service.GetSummary(
city.city, city.country);
spdlog::debug(
"[Pipeline] Region context for {}: {}",
city.city, region_context);
return EnrichedCity{city, region_context};
}));
pending.push_back(
std::async(std::launch::async, [web_client = webClient_, city]() {
WikipediaService wikipedia_service(web_client);
const std::string region_context =
wikipedia_service.GetSummary(city.city, city.country);
spdlog::debug("[Pipeline] Region context for {}: {}", city.city,
region_context);
return EnrichedCity{city, region_context};
}));
}
for (auto& task : pending) {
@@ -110,23 +113,25 @@ void BiergartenDataGenerator::GenerateBreweries(
for (const auto& enriched_city : cities) {
try {
auto brewery = generator.GenerateBrewery(enriched_city.location.city,
enriched_city.location.country,
enriched_city.region_context);
auto brewery = generator.GenerateBrewery(
enriched_city.location.city, enriched_city.location.country,
enriched_city.region_context);
generatedBreweries_.push_back({enriched_city.location, brewery});
} catch (const std::exception& e) {
++skipped_count;
spdlog::warn(
"[Pipeline] Skipping city '{}' ({}): brewery generation failed: {}",
"[Pipeline] Skipping city '{}' ({}): brewery generation failed: "
"{}",
enriched_city.location.city, enriched_city.location.country,
e.what());
}
}
if (skipped_count > 0) {
spdlog::warn("[Pipeline] Skipped {} city/cities due to generation "
"errors",
skipped_count);
spdlog::warn(
"[Pipeline] Skipped {} city/cities due to generation "
"errors",
skipped_count);
}
}
@@ -134,11 +139,12 @@ void BiergartenDataGenerator::LogResults() const {
spdlog::info("\n=== GENERATED DATA DUMP ===");
size_t index = 1;
for (const auto& entry : generatedBreweries_) {
spdlog::info("{}. city=\"{}\" country=\"{}\" state=\"{}\" "
"iso3166_2={} lat={} lon={}",
index, entry.location.city, entry.location.country,
entry.location.state_province, entry.location.iso3166_2,
entry.location.latitude, entry.location.longitude);
spdlog::info(
"{}. city=\"{}\" country=\"{}\" state=\"{}\" "
"iso3166_2={} lat={} lon={}",
index, entry.location.city, entry.location.country,
entry.location.state_province, entry.location.iso3166_2,
entry.location.latitude, entry.location.longitude);
spdlog::info(" brewery_name=\"{}\"", entry.brewery.name);
spdlog::info(" brewery_description=\"{}\"", entry.brewery.description);
++index;

View File

@@ -1,7 +1,7 @@
/**
* Destructor Module
* Ensures proper cleanup of llama.cpp resources (context and model) when the
* generator is destroyed, preventing memory leaks and resource exhaustion.
* @file data_generation/llama/destructor.cpp
* @brief Releases llama model/context resources and backend state during
* LlamaGenerator teardown to avoid leaks across runs.
*/
#include "data_generation/llama_generator.h"

View File

@@ -1,8 +1,7 @@
/**
* Brewery Data Generation Module
* Uses the LLM to generate realistic brewery names and descriptions for a given
* location. Implements retry logic with validation and error correction to
* ensure valid JSON output conforming to the expected schema.
* @file data_generation/llama/generate_brewery.cpp
* @brief Builds brewery prompts with regional context, performs retry-based
* inference, and validates structured JSON output for brewery records.
*/
#include <spdlog/spdlog.h>

View File

@@ -1,9 +1,7 @@
/**
* User Profile Generation Module
* Uses the LLM to generate realistic user profiles (username and bio) for craft
* beer enthusiasts. Implements retry logic to handle parsing failures and
* ensures output adheres to strict format constraints (two lines, specific
* character limits).
* @file data_generation/llama/generate_user.cpp
* @brief Generates locale-aware user profiles with strict two-line formatting,
* retry handling, and output sanitization for downstream parsing.
*/
#include <spdlog/spdlog.h>

View File

@@ -1,9 +1,7 @@
/**
* Helper Functions Module
* Provides utility functions for text processing, parsing, and chat template
* formatting. Functions handle whitespace normalization, response parsing, and
* conversion of prompts to proper chat format using the model's built-in
* template.
* @file data_generation/llama/helpers.cpp
* @brief Provides prompt formatting, whitespace normalization, response
* parsing, token decoding, and JSON validation helpers for Llama modules.
*/
#include <algorithm>

View File

@@ -1,8 +1,7 @@
/**
* Model Loading Module
* This module handles loading a pre-trained LLM model from disk and
* initializing the llama.cpp context for inference. It performs one-time setup
* required before any inference operations can be performed.
* @file data_generation/llama/load.cpp
* @brief Initializes llama backend, loads model weights, creates inference
* context, and resets prior resources during model reload.
*/
#include <spdlog/spdlog.h>

View File

@@ -1,11 +1,24 @@
#include <fstream>
#include <filesystem>
/**
* @file data_generation/llama/load_brewery_prompt.cpp
* @brief Resolves brewery system prompt content from cache or filesystem
* search paths and provides an inline fallback prompt when no file loads.
*/
#include <spdlog/spdlog.h>
#include <filesystem>
#include <fstream>
#include "data_generation/llama_generator.h"
namespace fs = std::filesystem;
/**
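* @brief Loads the brewery system prompt, returning the cached copy when one
* exists and otherwise searching the filesystem for the prompt file.
*
* @param prompt_file_path Preferred prompt file location; the same path is
* also tried one and two directory levels up.
* @return Cached or freshly loaded prompt text, or the inline fallback prompt
* when no prompt could be loaded from any of those locations.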
*/
std::string LlamaGenerator::LoadBrewerySystemPrompt(
const std::string& prompt_file_path) {
// Return cached version if already loaded
@@ -15,9 +28,9 @@ std::string LlamaGenerator::LoadBrewerySystemPrompt(
// Try multiple path locations
std::vector<std::string> paths_to_try = {
prompt_file_path, // As provided
"../" + prompt_file_path, // One level up
"../../" + prompt_file_path, // Two levels up
prompt_file_path, // As provided
"../" + prompt_file_path, // One level up
"../../" + prompt_file_path, // Two levels up
};
for (const auto& path : paths_to_try) {
@@ -29,7 +42,8 @@ std::string LlamaGenerator::LoadBrewerySystemPrompt(
if (!prompt.empty()) {
spdlog::info(
"LlamaGenerator: Loaded brewery system prompt from '{}' ({} chars)",
"LlamaGenerator: Loaded brewery system prompt from '{}' ({} "
"chars)",
path, prompt.length());
brewery_system_prompt_ = prompt;
return brewery_system_prompt_;
@@ -38,16 +52,23 @@ std::string LlamaGenerator::LoadBrewerySystemPrompt(
}
spdlog::warn(
"LlamaGenerator: Could not open brewery system prompt file at any of the "
"LlamaGenerator: Could not open brewery system prompt file at any of "
"the "
"expected locations. Using fallback inline prompt.");
return GetFallbackBreweryPrompt();
}
// Fallback: minimal inline prompt if file fails to load
/**
* @brief Provides an inline fallback brewery system prompt.
*
* @return Default fallback prompt text.
*/
std::string LlamaGenerator::GetFallbackBreweryPrompt() {
return "You are an experienced brewmaster and owner of a local craft brewery. "
return "You are an experienced brewmaster and owner of a local craft "
"brewery. "
"Create a distinctive, authentic name and detailed description that "
"genuinely reflects your specific location, brewing philosophy, local "
"genuinely reflects your specific location, brewing philosophy, "
"local "
"culture, and community connection. The brewery must feel real and "
"grounded—not generic or interchangeable.\n\n"
"AVOID REPETITIVE PHRASES - Never use:\n"
@@ -56,14 +77,16 @@ std::string LlamaGenerator::GetFallbackBreweryPrompt() {
"into, ancient roots, timeless, where tradition meets innovation\n\n"
"OPENING APPROACHES - Choose ONE:\n"
"1. Start with specific beer style and its regional origins\n"
"2. Begin with specific brewing challenge (water, altitude, climate)\n"
"2. Begin with specific brewing challenge (water, altitude, "
"climate)\n"
"3. Open with founding story or personal motivation\n"
"4. Lead with specific local ingredient or resource\n"
"5. Start with unexpected angle or contradiction\n"
"6. Open with local event, tradition, or cultural moment\n"
"7. Begin with tangible architectural or geographic detail\n\n"
"BE SPECIFIC - Include:\n"
"- At least ONE concrete proper noun (landmark, river, neighborhood)\n"
"- At least ONE concrete proper noun (landmark, river, "
"neighborhood)\n"
"- Specific beer styles relevant to the REGION'S culture\n"
"- Concrete brewing challenges or advantages\n"
"- Sensory details SPECIFIC to place—not generic adjectives\n\n"

View File

@@ -1,8 +1,7 @@
/**
* Sampling Configuration Module
* Configures the hyperparameters that control probabilistic token selection
* during text generation. These settings affect the randomness, diversity, and
* quality of generated output.
* @file data_generation/llama/set_sampling_options.cpp
* @brief Validates and stores sampling temperature, top-p, seed, and context
* size configuration used by subsequent LlamaGenerator inference calls.
*/
#include <stdexcept>

View File

@@ -1,3 +1,9 @@
/**
* @file data_generation/mock/data.cpp
* @brief Defines static lookup tables used by MockGenerator for deterministic
* brewery names, descriptions, usernames, and bios.
*/
#include <string>
#include <vector>

View File

@@ -1,12 +1,18 @@
/**
* @file data_generation/mock/deterministic_hash.cpp
* @brief Implements a stable hash combiner used by MockGenerator to derive
* repeatable pseudo-random indices from location input.
*/
#include <boost/container_hash/hash.hpp>
#include <string>
#include "data_generation/mock_generator.h"
std::size_t MockGenerator::DeterministicHash(const std::string& a,
const std::string& b) {
std::size_t seed = std::hash<std::string>{}(a);
const std::size_t mixed = std::hash<std::string>{}(b);
seed ^= mixed + 0x9e3779b97f4a7c15ULL + (seed << 6) + (seed >> 2);
seed = (seed << 13) | (seed >> ((sizeof(std::size_t) * 8) - 13));
std::size_t seed = 0;
boost::hash_combine(seed, a);
boost::hash_combine(seed, b);
return seed;
}

View File

@@ -1,3 +1,9 @@
/**
* @file data_generation/mock/generate_brewery.cpp
* @brief Builds deterministic brewery names and descriptions by hashing city
* and country into fixed mock phrase catalogs.
*/
#include <string>
#include "data_generation/mock_generator.h"
@@ -10,7 +16,8 @@ auto MockGenerator::GenerateBrewery(const std::string& city_name,
const std::string& adjective =
kBreweryAdjectives.at(hash % kBreweryAdjectives.size());
const std::string& noun = kBreweryNouns.at((hash / 7) % kBreweryNouns.size());
const std::string& noun =
kBreweryNouns.at((hash / 7) % kBreweryNouns.size());
const std::string& base_description =
kBreweryDescriptions.at((hash / 13) % kBreweryDescriptions.size());

View File

@@ -1,3 +1,9 @@
/**
* @file data_generation/mock/generate_user.cpp
* @brief Generates deterministic mock user profiles by hashing locale values
* into predefined username and bio collections.
*/
#include <functional>
#include <string>

View File

@@ -1,3 +1,9 @@
/**
* @file data_generation/mock/load.cpp
* @brief Provides MockGenerator initialization behavior: a no-op load path
* that logs readiness and requires no model resources.
*/
#include <spdlog/spdlog.h>
#include <string>

View File

@@ -1,21 +1,26 @@
/**
* @file json_handling/json_loader.cpp
* @brief Parses curated location JSON input into strongly typed Location
* records with strict field validation and descriptive error reporting.
*/
#include "json_handling/json_loader.h"
#include <spdlog/spdlog.h>
#include <boost/json.hpp>
#include <fstream>
#include <sstream>
#include <stdexcept>
namespace {
auto ReadRequiredString(const boost::json::object& object,
const char* key) -> std::string {
auto ReadRequiredString(const boost::json::object& object, const char* key)
-> std::string {
const boost::json::value* value = object.if_contains(key);
if (value == nullptr || !value->is_string()) {
throw std::runtime_error(std::string("Missing or invalid string field: ") +
key);
throw std::runtime_error(
std::string("Missing or invalid string field: ") + key);
}
return std::string(value->as_string().c_str());
}
@@ -24,8 +29,8 @@ auto ReadRequiredNumber(const boost::json::object& object, const char* key)
-> double {
const boost::json::value* value = object.if_contains(key);
if (value == nullptr || !value->is_number()) {
throw std::runtime_error(std::string("Missing or invalid numeric field: ") +
key);
throw std::runtime_error(
std::string("Missing or invalid numeric field: ") + key);
}
return value->to_number<double>();
}
@@ -33,7 +38,7 @@ auto ReadRequiredNumber(const boost::json::object& object, const char* key)
} // namespace
auto JsonLoader::LoadLocations(const std::string& filepath)
-> std::vector<Location> {
-> std::vector<Location> {
std::ifstream input(filepath);
if (!input.is_open()) {
throw std::runtime_error("Failed to open locations file: " + filepath);

View File

@@ -1,3 +1,9 @@
/**
* @file main.cpp
* @brief Parses command-line options, validates runtime mode selection,
* initializes shared infrastructure, and executes the pipeline entry flow.
*/
#include <spdlog/spdlog.h>
#include <boost/program_options.hpp>

View File

@@ -1,3 +1,9 @@
/**
* @file web_client/curl_web_client.cpp
* @brief Implements libcurl-backed HTTP utilities, including GET requests,
* file downloads, URL encoding, and RAII global curl lifecycle handling.
*/
#include "web_client/curl_web_client.h"
#include <curl/curl.h>

View File

@@ -1,3 +1,9 @@
/**
* @file wikipedia/wikipedia_service.cpp
* @brief Implements Wikipedia extract retrieval and caching for city/country
* queries, including response parsing and resilient error handling.
*/
#include "wikipedia/wikipedia_service.h"
#include <spdlog/spdlog.h>