mirror of
https://github.com/aaronpo97/the-biergarten-app.git
synced 2026-06-01 01:54:00 +00:00
Update documentation
This commit is contained in:
@@ -6,7 +6,7 @@ A C++23 tool for processing geographic data and generating brewery metadata. It
|
|||||||
|
|
||||||
The pipeline runs in four stages:
|
The pipeline runs in four stages:
|
||||||
|
|
||||||
- **Query**: Loads and samples from a local `locations.json` manifest.
|
- **Query**: Loads and samples from a local `locations.json` file.
|
||||||
- **Enrich**: Fetches regional and cultural context from Wikipedia in parallel using `std::async`.
|
- **Enrich**: Fetches regional and cultural context from Wikipedia in parallel using `std::async`.
|
||||||
- **Generate**: Creates authentic brewery names and descriptions using a local GGUF model or a deterministic mock.
|
- **Generate**: Creates authentic brewery names and descriptions using a local GGUF model or a deterministic mock.
|
||||||
- **Log**: Outputs results and metadata summaries via spdlog.
|
- **Log**: Outputs results and metadata summaries via spdlog.
|
||||||
@@ -26,7 +26,9 @@ The pipeline runs in four stages:
|
|||||||
|
|
||||||
## Hardware & GPU Config
|
## Hardware & GPU Config
|
||||||
|
|
||||||
### Test Machine
|
### Test Machines
|
||||||
|
|
||||||
|
#### x86-64 Linux, NVIDIA RTX 2000
|
||||||
|
|
||||||
- **Host**: ThinkPad P1 Gen 7 (Fedora 43)
|
- **Host**: ThinkPad P1 Gen 7 (Fedora 43)
|
||||||
- **CPU**: Intel Core Ultra 7 155H
|
- **CPU**: Intel Core Ultra 7 155H
|
||||||
@@ -35,6 +37,15 @@ The pipeline runs in four stages:
|
|||||||
- **Model**: Qwen3-8B-Q6-K
|
- **Model**: Qwen3-8B-Q6-K
|
||||||
- **Inference**: llama.cpp with CUDA 12.x support
|
- **Inference**: llama.cpp with CUDA 12.x support
|
||||||
|
|
||||||
|
#### ARM macOS, M1 Pro
|
||||||
|
|
||||||
|
- **Host**: MacBook Pro 14" (2021)
|
||||||
|
- **CPU**: Apple M1 Pro (8-core)
|
||||||
|
- **GPU**: Apple M1 Pro (14-core) [Integrated]
|
||||||
|
- **Memory**: 16GB
|
||||||
|
- **Model**: Qwen3-8B-Q6-K
|
||||||
|
- **Inference**: llama.cpp with Metal support
|
||||||
|
|
||||||
### GPU Build Flags
|
### GPU Build Flags
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
@@ -42,6 +53,11 @@ cmake -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=89 ..
|
|||||||
cmake --build . --config Release
|
cmake --build . --config Release
|
||||||
```
|
```
|
||||||
|
|
||||||
|
```zsh
|
||||||
|
cmake ..
|
||||||
|
cmake --build .
|
||||||
|
```
|
||||||
|
|
||||||
## Core Components
|
## Core Components
|
||||||
|
|
||||||
| Component | Function |
|
| Component | Function |
|
||||||
|
|||||||
@@ -1,6 +1,11 @@
|
|||||||
#ifndef BIERGARTEN_PIPELINE_BIERGARTEN_DATA_GENERATOR_H_
|
#ifndef BIERGARTEN_PIPELINE_BIERGARTEN_DATA_GENERATOR_H_
|
||||||
#define BIERGARTEN_PIPELINE_BIERGARTEN_DATA_GENERATOR_H_
|
#define BIERGARTEN_PIPELINE_BIERGARTEN_DATA_GENERATOR_H_
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @file biergarten_data_generator.h
|
||||||
|
* @brief Core orchestration class for pipeline data generation.
|
||||||
|
*/
|
||||||
|
|
||||||
#include <memory>
|
#include <memory>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|||||||
@@ -1,28 +1,68 @@
|
|||||||
#ifndef BIERGARTEN_PIPELINE_DATA_GENERATION_DATA_GENERATOR_H_
|
#ifndef BIERGARTEN_PIPELINE_DATA_GENERATION_DATA_GENERATOR_H_
|
||||||
#define BIERGARTEN_PIPELINE_DATA_GENERATION_DATA_GENERATOR_H_
|
#define BIERGARTEN_PIPELINE_DATA_GENERATION_DATA_GENERATOR_H_
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @file data_generation/data_generator.h
|
||||||
|
* @brief Shared generator interfaces and result models.
|
||||||
|
*/
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Generated brewery payload.
|
||||||
|
*/
|
||||||
struct BreweryResult {
|
struct BreweryResult {
|
||||||
|
/// @brief Brewery display name.
|
||||||
std::string name;
|
std::string name;
|
||||||
|
|
||||||
|
/// @brief Brewery description text.
|
||||||
std::string description;
|
std::string description;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Generated user profile payload.
|
||||||
|
*/
|
||||||
struct UserResult {
|
struct UserResult {
|
||||||
|
/// @brief Username handle.
|
||||||
std::string username;
|
std::string username;
|
||||||
|
|
||||||
|
/// @brief Short user biography.
|
||||||
std::string bio;
|
std::string bio;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Interface for data generator implementations.
|
||||||
|
*/
|
||||||
class DataGenerator {
|
class DataGenerator {
|
||||||
public:
|
public:
|
||||||
|
/// @brief Virtual destructor for polymorphic cleanup.
|
||||||
virtual ~DataGenerator() = default;
|
virtual ~DataGenerator() = default;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Loads and initializes generator resources.
|
||||||
|
*
|
||||||
|
* @param model_path Path to model assets. Implementations may ignore this.
|
||||||
|
*/
|
||||||
virtual void Load(const std::string& model_path) = 0;
|
virtual void Load(const std::string& model_path) = 0;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Generates brewery data for a location.
|
||||||
|
*
|
||||||
|
* @param city_name City name.
|
||||||
|
* @param country_name Country name.
|
||||||
|
* @param region_context Additional regional context text.
|
||||||
|
* @return Brewery generation result.
|
||||||
|
*/
|
||||||
virtual BreweryResult GenerateBrewery(const std::string& city_name,
|
virtual BreweryResult GenerateBrewery(const std::string& city_name,
|
||||||
const std::string& country_name,
|
const std::string& country_name,
|
||||||
const std::string& region_context) = 0;
|
const std::string& region_context) = 0;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Generates a user profile for a locale.
|
||||||
|
*
|
||||||
|
* @param locale Locale hint used by generator.
|
||||||
|
* @return User generation result.
|
||||||
|
*/
|
||||||
virtual UserResult GenerateUser(const std::string& locale) = 0;
|
virtual UserResult GenerateUser(const std::string& locale) = 0;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,11 @@
|
|||||||
#ifndef BIERGARTEN_PIPELINE_DATA_GENERATION_LLAMA_GENERATOR_H_
|
#ifndef BIERGARTEN_PIPELINE_DATA_GENERATION_LLAMA_GENERATOR_H_
|
||||||
#define BIERGARTEN_PIPELINE_DATA_GENERATION_LLAMA_GENERATOR_H_
|
#define BIERGARTEN_PIPELINE_DATA_GENERATION_LLAMA_GENERATOR_H_
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @file data_generation/llama_generator.h
|
||||||
|
* @brief Llama.cpp-backed implementation of DataGenerator.
|
||||||
|
*/
|
||||||
|
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
@@ -9,34 +14,107 @@
|
|||||||
struct llama_model;
|
struct llama_model;
|
||||||
struct llama_context;
|
struct llama_context;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Data generator implementation backed by llama.cpp.
|
||||||
|
*/
|
||||||
class LlamaGenerator final : public DataGenerator {
|
class LlamaGenerator final : public DataGenerator {
|
||||||
public:
|
public:
|
||||||
|
/// @brief Constructs a generator with default sampling and context settings.
|
||||||
LlamaGenerator() = default;
|
LlamaGenerator() = default;
|
||||||
|
|
||||||
|
/// @brief Releases model/context resources.
|
||||||
~LlamaGenerator() override;
|
~LlamaGenerator() override;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Configures sampling parameters for generation.
|
||||||
|
*
|
||||||
|
* @param temperature Sampling temperature.
|
||||||
|
* @param top_p Nucleus sampling threshold.
|
||||||
|
* @param seed Seed for sampling; use -1 for random seed.
|
||||||
|
*/
|
||||||
void SetSamplingOptions(float temperature, float top_p, int seed = -1);
|
void SetSamplingOptions(float temperature, float top_p, int seed = -1);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Sets context window size used during model load.
|
||||||
|
*
|
||||||
|
* @param n_ctx Context size in tokens.
|
||||||
|
*/
|
||||||
void SetContextSize(uint32_t n_ctx);
|
void SetContextSize(uint32_t n_ctx);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Loads model and prepares inference context.
|
||||||
|
*
|
||||||
|
* @param model_path Filesystem path to GGUF model.
|
||||||
|
*/
|
||||||
void Load(const std::string& model_path) override;
|
void Load(const std::string& model_path) override;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Generates brewery data for a specific location.
|
||||||
|
*
|
||||||
|
* @param city_name City name.
|
||||||
|
* @param country_name Country name.
|
||||||
|
* @param region_context Additional regional context.
|
||||||
|
* @return Generated brewery result.
|
||||||
|
*/
|
||||||
BreweryResult GenerateBrewery(const std::string& city_name,
|
BreweryResult GenerateBrewery(const std::string& city_name,
|
||||||
const std::string& country_name,
|
const std::string& country_name,
|
||||||
const std::string& region_context) override;
|
const std::string& region_context) override;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Generates a user profile for the provided locale.
|
||||||
|
*
|
||||||
|
* @param locale Locale hint.
|
||||||
|
* @return Generated user profile.
|
||||||
|
*/
|
||||||
UserResult GenerateUser(const std::string& locale) override;
|
UserResult GenerateUser(const std::string& locale) override;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
/**
|
||||||
|
* @brief Infers text from a user prompt.
|
||||||
|
*
|
||||||
|
* @param prompt User prompt.
|
||||||
|
* @param max_tokens Maximum tokens to generate.
|
||||||
|
* @return Generated text.
|
||||||
|
*/
|
||||||
std::string Infer(const std::string& prompt, int max_tokens = 10000);
|
std::string Infer(const std::string& prompt, int max_tokens = 10000);
|
||||||
// Overload that allows passing a system message separately so chat-capable
|
|
||||||
// models receive a proper system role instead of having the system text
|
/**
|
||||||
// concatenated into the user prompt (helps avoid revealing internal
|
* @brief Infers text from separate system and user prompts.
|
||||||
// reasoning or instructions in model output).
|
*
|
||||||
|
* This helps chat-capable models preserve system-role behavior instead of
|
||||||
|
* concatenating system text into user input.
|
||||||
|
*
|
||||||
|
* @param system_prompt System role prompt.
|
||||||
|
* @param prompt User prompt.
|
||||||
|
* @param max_tokens Maximum tokens to generate.
|
||||||
|
* @return Generated text.
|
||||||
|
*/
|
||||||
std::string Infer(const std::string& system_prompt,
|
std::string Infer(const std::string& system_prompt,
|
||||||
const std::string& prompt, int max_tokens = 10000);
|
const std::string& prompt, int max_tokens = 10000);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Runs inference on an already-formatted prompt.
|
||||||
|
*
|
||||||
|
* @param formatted_prompt Prompt preformatted for model chat template.
|
||||||
|
* @param max_tokens Maximum tokens to generate.
|
||||||
|
* @return Generated text.
|
||||||
|
*/
|
||||||
std::string InferFormatted(const std::string& formatted_prompt,
|
std::string InferFormatted(const std::string& formatted_prompt,
|
||||||
int max_tokens = 10000);
|
int max_tokens = 10000);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Loads the brewery system prompt from disk.
|
||||||
|
*
|
||||||
|
* @param prompt_file_path Prompt file path to try first.
|
||||||
|
* @return Loaded prompt text or fallback prompt.
|
||||||
|
*/
|
||||||
std::string LoadBrewerySystemPrompt(const std::string& prompt_file_path);
|
std::string LoadBrewerySystemPrompt(const std::string& prompt_file_path);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Returns a built-in fallback system prompt.
|
||||||
|
*
|
||||||
|
* @return Fallback prompt text.
|
||||||
|
*/
|
||||||
std::string GetFallbackBreweryPrompt();
|
std::string GetFallbackBreweryPrompt();
|
||||||
|
|
||||||
llama_model* model_ = nullptr;
|
llama_model* model_ = nullptr;
|
||||||
|
|||||||
@@ -1,6 +1,11 @@
|
|||||||
#ifndef BIERGARTEN_PIPELINE_DATA_GENERATION_LLAMA_GENERATOR_HELPERS_H_
|
#ifndef BIERGARTEN_PIPELINE_DATA_GENERATION_LLAMA_GENERATOR_HELPERS_H_
|
||||||
#define BIERGARTEN_PIPELINE_DATA_GENERATION_LLAMA_GENERATOR_HELPERS_H_
|
#define BIERGARTEN_PIPELINE_DATA_GENERATION_LLAMA_GENERATOR_HELPERS_H_
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @file data_generation/llama_generator_helpers.h
|
||||||
|
* @brief Shared helper APIs used by LlamaGenerator translation units.
|
||||||
|
*/
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <utility>
|
#include <utility>
|
||||||
|
|
||||||
@@ -8,23 +13,66 @@ struct llama_model;
|
|||||||
struct llama_vocab;
|
struct llama_vocab;
|
||||||
typedef int llama_token;
|
typedef int llama_token;
|
||||||
|
|
||||||
// Helper functions for LlamaGenerator methods
|
/**
|
||||||
|
* @brief Normalizes and truncates regional context.
|
||||||
|
*
|
||||||
|
* @param region_context Input regional context text.
|
||||||
|
* @param max_chars Maximum output length.
|
||||||
|
* @return Processed region context.
|
||||||
|
*/
|
||||||
std::string PrepareRegionContextPublic(std::string_view region_context,
|
std::string PrepareRegionContextPublic(std::string_view region_context,
|
||||||
std::size_t max_chars = 700);
|
std::size_t max_chars = 700);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Parses a response expected to contain two logical lines.
|
||||||
|
*
|
||||||
|
* @param raw Raw model output.
|
||||||
|
* @param error_message Error message thrown on parse failure.
|
||||||
|
* @return Pair containing first and second parsed fields.
|
||||||
|
*/
|
||||||
std::pair<std::string, std::string> ParseTwoLineResponsePublic(
|
std::pair<std::string, std::string> ParseTwoLineResponsePublic(
|
||||||
const std::string& raw, const std::string& error_message);
|
const std::string& raw, const std::string& error_message);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Applies model chat template to a user-only prompt.
|
||||||
|
*
|
||||||
|
* @param model Loaded llama model.
|
||||||
|
* @param user_prompt User prompt text.
|
||||||
|
* @return Model-formatted prompt.
|
||||||
|
*/
|
||||||
std::string ToChatPromptPublic(const llama_model* model,
|
std::string ToChatPromptPublic(const llama_model* model,
|
||||||
const std::string& user_prompt);
|
const std::string& user_prompt);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Applies model chat template to system and user prompts.
|
||||||
|
*
|
||||||
|
* @param model Loaded llama model.
|
||||||
|
* @param system_prompt System prompt text.
|
||||||
|
* @param user_prompt User prompt text.
|
||||||
|
* @return Model-formatted prompt.
|
||||||
|
*/
|
||||||
std::string ToChatPromptPublic(const llama_model* model,
|
std::string ToChatPromptPublic(const llama_model* model,
|
||||||
const std::string& system_prompt,
|
const std::string& system_prompt,
|
||||||
const std::string& user_prompt);
|
const std::string& user_prompt);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Decodes a sampled token and appends it to output text.
|
||||||
|
*
|
||||||
|
* @param vocab Model vocabulary.
|
||||||
|
* @param token Sampled token id.
|
||||||
|
* @param output Output text buffer.
|
||||||
|
*/
|
||||||
void AppendTokenPiecePublic(const llama_vocab* vocab, llama_token token,
|
void AppendTokenPiecePublic(const llama_vocab* vocab, llama_token token,
|
||||||
std::string& output);
|
std::string& output);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Validates and parses brewery JSON output.
|
||||||
|
*
|
||||||
|
* @param raw Raw model output.
|
||||||
|
* @param name_out Parsed brewery name.
|
||||||
|
* @param description_out Parsed brewery description.
|
||||||
|
* @return Empty string on success, or validation error message.
|
||||||
|
*/
|
||||||
std::string ValidateBreweryJsonPublic(const std::string& raw,
|
std::string ValidateBreweryJsonPublic(const std::string& raw,
|
||||||
std::string& name_out,
|
std::string& name_out,
|
||||||
std::string& description_out);
|
std::string& description_out);
|
||||||
|
|||||||
@@ -1,20 +1,56 @@
|
|||||||
#ifndef BIERGARTEN_PIPELINE_DATA_GENERATION_MOCK_GENERATOR_H_
|
#ifndef BIERGARTEN_PIPELINE_DATA_GENERATION_MOCK_GENERATOR_H_
|
||||||
#define BIERGARTEN_PIPELINE_DATA_GENERATION_MOCK_GENERATOR_H_
|
#define BIERGARTEN_PIPELINE_DATA_GENERATION_MOCK_GENERATOR_H_
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @file data_generation/mock_generator.h
|
||||||
|
* @brief Deterministic mock implementation of DataGenerator.
|
||||||
|
*/
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
#include "data_generation/data_generator.h"
|
#include "data_generation/data_generator.h"
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Mock generator used for deterministic, model-free outputs.
|
||||||
|
*/
|
||||||
class MockGenerator final : public DataGenerator {
|
class MockGenerator final : public DataGenerator {
|
||||||
public:
|
public:
|
||||||
|
/**
|
||||||
|
* @brief Initializes the mock generator.
|
||||||
|
*
|
||||||
|
* @param model_path Unused for mock generation.
|
||||||
|
*/
|
||||||
void Load(const std::string& model_path) override;
|
void Load(const std::string& model_path) override;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Generates deterministic brewery data for a location.
|
||||||
|
*
|
||||||
|
* @param city_name City name.
|
||||||
|
* @param country_name Country name.
|
||||||
|
* @param region_context Unused for mock generation.
|
||||||
|
* @return Generated brewery result.
|
||||||
|
*/
|
||||||
BreweryResult GenerateBrewery(const std::string& city_name,
|
BreweryResult GenerateBrewery(const std::string& city_name,
|
||||||
const std::string& country_name,
|
const std::string& country_name,
|
||||||
const std::string& region_context) override;
|
const std::string& region_context) override;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Generates deterministic user data for a locale.
|
||||||
|
*
|
||||||
|
* @param locale Locale hint.
|
||||||
|
* @return Generated user result.
|
||||||
|
*/
|
||||||
UserResult GenerateUser(const std::string& locale) override;
|
UserResult GenerateUser(const std::string& locale) override;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
/**
|
||||||
|
* @brief Combines two strings into a stable hash value.
|
||||||
|
*
|
||||||
|
* @param a First key.
|
||||||
|
* @param b Second key.
|
||||||
|
* @return Deterministic hash value.
|
||||||
|
*/
|
||||||
static std::size_t DeterministicHash(const std::string& a,
|
static std::size_t DeterministicHash(const std::string& a,
|
||||||
const std::string& b);
|
const std::string& b);
|
||||||
|
|
||||||
|
|||||||
@@ -1,15 +1,36 @@
|
|||||||
#ifndef BIERGARTEN_PIPELINE_MODELS_LOCATION_H_
|
#ifndef BIERGARTEN_PIPELINE_MODELS_LOCATION_H_
|
||||||
#define BIERGARTEN_PIPELINE_MODELS_LOCATION_H_
|
#define BIERGARTEN_PIPELINE_MODELS_LOCATION_H_
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @file data_model/location.h
|
||||||
|
* @brief Location data model used throughout generation pipeline.
|
||||||
|
*/
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Canonical location record for city-level generation.
|
||||||
|
*/
|
||||||
struct Location {
|
struct Location {
|
||||||
|
/// @brief City name.
|
||||||
std::string city;
|
std::string city;
|
||||||
|
|
||||||
|
/// @brief State or province name.
|
||||||
std::string state_province;
|
std::string state_province;
|
||||||
|
|
||||||
|
/// @brief ISO 3166-2 subdivision code.
|
||||||
std::string iso3166_2;
|
std::string iso3166_2;
|
||||||
|
|
||||||
|
/// @brief Country name.
|
||||||
std::string country;
|
std::string country;
|
||||||
|
|
||||||
|
/// @brief ISO 3166-1 country code.
|
||||||
std::string iso3166_1;
|
std::string iso3166_1;
|
||||||
|
|
||||||
|
/// @brief Latitude in decimal degrees.
|
||||||
double latitude;
|
double latitude;
|
||||||
|
|
||||||
|
/// @brief Longitude in decimal degrees.
|
||||||
double longitude;
|
double longitude;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,11 @@
|
|||||||
#ifndef BIERGARTEN_PIPELINE_JSON_HANDLING_JSON_LOADER_H_
|
#ifndef BIERGARTEN_PIPELINE_JSON_HANDLING_JSON_LOADER_H_
|
||||||
#define BIERGARTEN_PIPELINE_JSON_HANDLING_JSON_LOADER_H_
|
#define BIERGARTEN_PIPELINE_JSON_HANDLING_JSON_LOADER_H_
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @file json_handling/json_loader.h
|
||||||
|
* @brief Loader API for curated location data.
|
||||||
|
*/
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
|
|||||||
@@ -1,29 +1,70 @@
|
|||||||
#ifndef BIERGARTEN_PIPELINE_WEB_CLIENT_CURL_WEB_CLIENT_H_
|
#ifndef BIERGARTEN_PIPELINE_WEB_CLIENT_CURL_WEB_CLIENT_H_
|
||||||
#define BIERGARTEN_PIPELINE_WEB_CLIENT_CURL_WEB_CLIENT_H_
|
#define BIERGARTEN_PIPELINE_WEB_CLIENT_CURL_WEB_CLIENT_H_
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @file web_client/curl_web_client.h
|
||||||
|
* @brief libcurl-based WebClient implementation.
|
||||||
|
*/
|
||||||
|
|
||||||
#include <memory>
|
#include <memory>
|
||||||
|
|
||||||
#include "web_client/web_client.h"
|
#include "web_client/web_client.h"
|
||||||
|
|
||||||
// RAII for curl_global_init/cleanup.
|
/**
|
||||||
// An instance of this class should be created in main() before any curl
|
* @brief RAII wrapper for curl_global_init and curl_global_cleanup.
|
||||||
// operations and exist for the lifetime of the application.
|
*
|
||||||
|
* Create one instance in application startup before using libcurl and keep it
|
||||||
|
* alive for application lifetime.
|
||||||
|
*/
|
||||||
class CurlGlobalState {
|
class CurlGlobalState {
|
||||||
public:
|
public:
|
||||||
|
/// @brief Initializes global libcurl state.
|
||||||
CurlGlobalState();
|
CurlGlobalState();
|
||||||
|
|
||||||
|
/// @brief Cleans up global libcurl state.
|
||||||
~CurlGlobalState();
|
~CurlGlobalState();
|
||||||
|
|
||||||
|
/// @brief Copy construction is disabled.
|
||||||
CurlGlobalState(const CurlGlobalState&) = delete;
|
CurlGlobalState(const CurlGlobalState&) = delete;
|
||||||
|
|
||||||
|
/// @brief Copy assignment is disabled.
|
||||||
CurlGlobalState& operator=(const CurlGlobalState&) = delete;
|
CurlGlobalState& operator=(const CurlGlobalState&) = delete;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief WebClient implementation backed by libcurl.
|
||||||
|
*/
|
||||||
class CURLWebClient : public WebClient {
|
class CURLWebClient : public WebClient {
|
||||||
public:
|
public:
|
||||||
|
/// @brief Constructs a CURL web client.
|
||||||
CURLWebClient();
|
CURLWebClient();
|
||||||
|
|
||||||
|
/// @brief Destroys the CURL web client.
|
||||||
~CURLWebClient() override;
|
~CURLWebClient() override;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Downloads URL contents to a file.
|
||||||
|
*
|
||||||
|
* @param url Source URL.
|
||||||
|
* @param file_path Destination file path.
|
||||||
|
*/
|
||||||
void DownloadToFile(const std::string& url,
|
void DownloadToFile(const std::string& url,
|
||||||
const std::string& file_path) override;
|
const std::string& file_path) override;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Executes an HTTP GET request.
|
||||||
|
*
|
||||||
|
* @param url Request URL.
|
||||||
|
* @return Response body.
|
||||||
|
*/
|
||||||
std::string Get(const std::string& url) override;
|
std::string Get(const std::string& url) override;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief URL-encodes a string value.
|
||||||
|
*
|
||||||
|
* @param value Raw value.
|
||||||
|
* @return URL-encoded string.
|
||||||
|
*/
|
||||||
std::string UrlEncode(const std::string& value) override;
|
std::string UrlEncode(const std::string& value) override;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@@ -1,21 +1,44 @@
|
|||||||
#ifndef BIERGARTEN_PIPELINE_WEB_CLIENT_WEB_CLIENT_H_
|
#ifndef BIERGARTEN_PIPELINE_WEB_CLIENT_WEB_CLIENT_H_
|
||||||
#define BIERGARTEN_PIPELINE_WEB_CLIENT_WEB_CLIENT_H_
|
#define BIERGARTEN_PIPELINE_WEB_CLIENT_WEB_CLIENT_H_
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @file web_client/web_client.h
|
||||||
|
* @brief Abstract interface for HTTP and URL utilities.
|
||||||
|
*/
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Abstract web client interface.
|
||||||
|
*/
|
||||||
class WebClient {
|
class WebClient {
|
||||||
public:
|
public:
|
||||||
|
/// @brief Virtual destructor for polymorphic cleanup.
|
||||||
virtual ~WebClient() = default;
|
virtual ~WebClient() = default;
|
||||||
|
|
||||||
// Downloads content from a URL to a file. Throws on error.
|
/**
|
||||||
|
* @brief Downloads content from a URL into a file. Throws on error.
|
||||||
|
*
|
||||||
|
* @param url Source URL.
|
||||||
|
* @param file_path Destination file path.
|
||||||
|
*/
|
||||||
virtual void DownloadToFile(const std::string& url,
|
virtual void DownloadToFile(const std::string& url,
|
||||||
const std::string& file_path) = 0;
|
const std::string& file_path) = 0;
|
||||||
|
|
||||||
// Performs a GET request and returns the response body as a string. Throws
|
/**
|
||||||
// on error.
|
* @brief Executes an HTTP GET request and returns the response body. Throws on error.
|
||||||
|
*
|
||||||
|
* @param url Request URL.
|
||||||
|
* @return Response body.
|
||||||
|
*/
|
||||||
virtual std::string Get(const std::string& url) = 0;
|
virtual std::string Get(const std::string& url) = 0;
|
||||||
|
|
||||||
// URL-encodes a string.
|
/**
|
||||||
|
* @brief URL-encodes a string value.
|
||||||
|
*
|
||||||
|
* @param value Raw string value.
|
||||||
|
* @return Encoded value safe for URL usage.
|
||||||
|
*/
|
||||||
virtual std::string UrlEncode(const std::string& value) = 0;
|
virtual std::string UrlEncode(const std::string& value) = 0;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,11 @@
|
|||||||
#ifndef BIERGARTEN_PIPELINE_WIKIPEDIA_WIKIPEDIA_SERVICE_H_
|
#ifndef BIERGARTEN_PIPELINE_WIKIPEDIA_WIKIPEDIA_SERVICE_H_
|
||||||
#define BIERGARTEN_PIPELINE_WIKIPEDIA_WIKIPEDIA_SERVICE_H_
|
#define BIERGARTEN_PIPELINE_WIKIPEDIA_WIKIPEDIA_SERVICE_H_
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @file wikipedia/wikipedia_service.h
|
||||||
|
* @brief Wikipedia summary retrieval service with in-memory caching.
|
||||||
|
*/
|
||||||
|
|
||||||
#include <memory>
|
#include <memory>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <string_view>
|
#include <string_view>
|
||||||
|
|||||||
@@ -1,3 +1,9 @@
|
|||||||
|
/**
|
||||||
|
* @file biergarten_data_generator.cpp
|
||||||
|
* @brief Orchestrates end-to-end pipeline execution for city sampling,
|
||||||
|
* Wikipedia enrichment, generator initialization, and brewery result output.
|
||||||
|
*/
|
||||||
|
|
||||||
#include "biergarten_data_generator.h"
|
#include "biergarten_data_generator.h"
|
||||||
|
|
||||||
#include <spdlog/spdlog.h>
|
#include <spdlog/spdlog.h>
|
||||||
@@ -14,11 +20,11 @@
|
|||||||
#include "wikipedia/wikipedia_service.h"
|
#include "wikipedia/wikipedia_service.h"
|
||||||
|
|
||||||
BiergartenDataGenerator::BiergartenDataGenerator(
|
BiergartenDataGenerator::BiergartenDataGenerator(
|
||||||
const ApplicationOptions& options, std::shared_ptr<WebClient> web_client)
|
const ApplicationOptions& options, std::shared_ptr<WebClient> web_client)
|
||||||
: options_(options), webClient_(std::move(web_client)) {}
|
: options_(options), webClient_(std::move(web_client)) {}
|
||||||
|
|
||||||
auto BiergartenDataGenerator::InitializeGenerator()
|
auto BiergartenDataGenerator::InitializeGenerator()
|
||||||
-> std::unique_ptr<DataGenerator> {
|
-> std::unique_ptr<DataGenerator> {
|
||||||
spdlog::info("Initializing brewery generator...");
|
spdlog::info("Initializing brewery generator...");
|
||||||
|
|
||||||
std::unique_ptr<DataGenerator> generator;
|
std::unique_ptr<DataGenerator> generator;
|
||||||
@@ -43,7 +49,7 @@ auto BiergartenDataGenerator::InitializeGenerator()
|
|||||||
}
|
}
|
||||||
|
|
||||||
auto BiergartenDataGenerator::QueryCitiesWithCountries()
|
auto BiergartenDataGenerator::QueryCitiesWithCountries()
|
||||||
-> std::vector<Location> {
|
-> std::vector<Location> {
|
||||||
spdlog::info("\n=== GEOGRAPHIC DATA OVERVIEW ===");
|
spdlog::info("\n=== GEOGRAPHIC DATA OVERVIEW ===");
|
||||||
|
|
||||||
std::filesystem::path locations_path = "locations.json";
|
std::filesystem::path locations_path = "locations.json";
|
||||||
@@ -72,7 +78,7 @@ auto BiergartenDataGenerator::QueryCitiesWithCountries()
|
|||||||
}
|
}
|
||||||
|
|
||||||
auto BiergartenDataGenerator::EnrichWithWikipedia(
|
auto BiergartenDataGenerator::EnrichWithWikipedia(
|
||||||
const std::vector<Location>& cities) -> std::vector<EnrichedCity> {
|
const std::vector<Location>& cities) -> std::vector<EnrichedCity> {
|
||||||
std::vector<EnrichedCity> enriched;
|
std::vector<EnrichedCity> enriched;
|
||||||
enriched.reserve(cities.size());
|
enriched.reserve(cities.size());
|
||||||
|
|
||||||
@@ -80,18 +86,15 @@ auto BiergartenDataGenerator::EnrichWithWikipedia(
|
|||||||
pending.reserve(cities.size());
|
pending.reserve(cities.size());
|
||||||
|
|
||||||
for (const auto& city : cities) {
|
for (const auto& city : cities) {
|
||||||
pending.push_back(std::async(std::launch::async,
|
pending.push_back(
|
||||||
[web_client = webClient_, city]() {
|
std::async(std::launch::async, [web_client = webClient_, city]() {
|
||||||
WikipediaService wikipedia_service(
|
WikipediaService wikipedia_service(web_client);
|
||||||
web_client);
|
const std::string region_context =
|
||||||
const std::string region_context =
|
wikipedia_service.GetSummary(city.city, city.country);
|
||||||
wikipedia_service.GetSummary(
|
spdlog::debug("[Pipeline] Region context for {}: {}", city.city,
|
||||||
city.city, city.country);
|
region_context);
|
||||||
spdlog::debug(
|
return EnrichedCity{city, region_context};
|
||||||
"[Pipeline] Region context for {}: {}",
|
}));
|
||||||
city.city, region_context);
|
|
||||||
return EnrichedCity{city, region_context};
|
|
||||||
}));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for (auto& task : pending) {
|
for (auto& task : pending) {
|
||||||
@@ -110,23 +113,25 @@ void BiergartenDataGenerator::GenerateBreweries(
|
|||||||
|
|
||||||
for (const auto& enriched_city : cities) {
|
for (const auto& enriched_city : cities) {
|
||||||
try {
|
try {
|
||||||
auto brewery = generator.GenerateBrewery(enriched_city.location.city,
|
auto brewery = generator.GenerateBrewery(
|
||||||
enriched_city.location.country,
|
enriched_city.location.city, enriched_city.location.country,
|
||||||
enriched_city.region_context);
|
enriched_city.region_context);
|
||||||
generatedBreweries_.push_back({enriched_city.location, brewery});
|
generatedBreweries_.push_back({enriched_city.location, brewery});
|
||||||
} catch (const std::exception& e) {
|
} catch (const std::exception& e) {
|
||||||
++skipped_count;
|
++skipped_count;
|
||||||
spdlog::warn(
|
spdlog::warn(
|
||||||
"[Pipeline] Skipping city '{}' ({}): brewery generation failed: {}",
|
"[Pipeline] Skipping city '{}' ({}): brewery generation failed: "
|
||||||
|
"{}",
|
||||||
enriched_city.location.city, enriched_city.location.country,
|
enriched_city.location.city, enriched_city.location.country,
|
||||||
e.what());
|
e.what());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (skipped_count > 0) {
|
if (skipped_count > 0) {
|
||||||
spdlog::warn("[Pipeline] Skipped {} city/cities due to generation "
|
spdlog::warn(
|
||||||
"errors",
|
"[Pipeline] Skipped {} city/cities due to generation "
|
||||||
skipped_count);
|
"errors",
|
||||||
|
skipped_count);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -134,11 +139,12 @@ void BiergartenDataGenerator::LogResults() const {
|
|||||||
spdlog::info("\n=== GENERATED DATA DUMP ===");
|
spdlog::info("\n=== GENERATED DATA DUMP ===");
|
||||||
size_t index = 1;
|
size_t index = 1;
|
||||||
for (const auto& entry : generatedBreweries_) {
|
for (const auto& entry : generatedBreweries_) {
|
||||||
spdlog::info("{}. city=\"{}\" country=\"{}\" state=\"{}\" "
|
spdlog::info(
|
||||||
"iso3166_2={} lat={} lon={}",
|
"{}. city=\"{}\" country=\"{}\" state=\"{}\" "
|
||||||
index, entry.location.city, entry.location.country,
|
"iso3166_2={} lat={} lon={}",
|
||||||
entry.location.state_province, entry.location.iso3166_2,
|
index, entry.location.city, entry.location.country,
|
||||||
entry.location.latitude, entry.location.longitude);
|
entry.location.state_province, entry.location.iso3166_2,
|
||||||
|
entry.location.latitude, entry.location.longitude);
|
||||||
spdlog::info(" brewery_name=\"{}\"", entry.brewery.name);
|
spdlog::info(" brewery_name=\"{}\"", entry.brewery.name);
|
||||||
spdlog::info(" brewery_description=\"{}\"", entry.brewery.description);
|
spdlog::info(" brewery_description=\"{}\"", entry.brewery.description);
|
||||||
++index;
|
++index;
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
/**
|
/**
|
||||||
* Destructor Module
|
* @file data_generation/llama/destructor.cpp
|
||||||
* Ensures proper cleanup of llama.cpp resources (context and model) when the
|
* @brief Releases llama model/context resources and backend state during
|
||||||
* generator is destroyed, preventing memory leaks and resource exhaustion.
|
* LlamaGenerator teardown to avoid leaks across runs.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include "data_generation/llama_generator.h"
|
#include "data_generation/llama_generator.h"
|
||||||
|
|||||||
@@ -1,8 +1,7 @@
|
|||||||
/**
|
/**
|
||||||
* Brewery Data Generation Module
|
* @file data_generation/llama/generate_brewery.cpp
|
||||||
* Uses the LLM to generate realistic brewery names and descriptions for a given
|
* @brief Builds brewery prompts with regional context, performs retry-based
|
||||||
* location. Implements retry logic with validation and error correction to
|
* inference, and validates structured JSON output for brewery records.
|
||||||
* ensure valid JSON output conforming to the expected schema.
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <spdlog/spdlog.h>
|
#include <spdlog/spdlog.h>
|
||||||
|
|||||||
@@ -1,9 +1,7 @@
|
|||||||
/**
|
/**
|
||||||
* User Profile Generation Module
|
* @file data_generation/llama/generate_user.cpp
|
||||||
* Uses the LLM to generate realistic user profiles (username and bio) for craft
|
* @brief Generates locale-aware user profiles with strict two-line formatting,
|
||||||
* beer enthusiasts. Implements retry logic to handle parsing failures and
|
* retry handling, and output sanitization for downstream parsing.
|
||||||
* ensures output adheres to strict format constraints (two lines, specific
|
|
||||||
* character limits).
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <spdlog/spdlog.h>
|
#include <spdlog/spdlog.h>
|
||||||
|
|||||||
@@ -1,9 +1,7 @@
|
|||||||
/**
|
/**
|
||||||
* Helper Functions Module
|
* @file data_generation/llama/helpers.cpp
|
||||||
* Provides utility functions for text processing, parsing, and chat template
|
* @brief Provides prompt formatting, whitespace normalization, response
|
||||||
* formatting. Functions handle whitespace normalization, response parsing, and
|
* parsing, token decoding, and JSON validation helpers for Llama modules.
|
||||||
* conversion of prompts to proper chat format using the model's built-in
|
|
||||||
* template.
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
|||||||
@@ -1,8 +1,7 @@
|
|||||||
/**
|
/**
|
||||||
* Model Loading Module
|
* @file data_generation/llama/load.cpp
|
||||||
* This module handles loading a pre-trained LLM model from disk and
|
* @brief Initializes llama backend, loads model weights, creates inference
|
||||||
* initializing the llama.cpp context for inference. It performs one-time setup
|
* context, and resets prior resources during model reload.
|
||||||
* required before any inference operations can be performed.
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <spdlog/spdlog.h>
|
#include <spdlog/spdlog.h>
|
||||||
|
|||||||
@@ -1,11 +1,24 @@
|
|||||||
#include <fstream>
|
/**
|
||||||
#include <filesystem>
|
* @file data_generation/llama/load_brewery_prompt.cpp
|
||||||
|
* @brief Resolves brewery system prompt content from cache or filesystem
|
||||||
|
* search paths and provides a robust inline fallback prompt when absent.
|
||||||
|
*/
|
||||||
|
|
||||||
#include <spdlog/spdlog.h>
|
#include <spdlog/spdlog.h>
|
||||||
|
|
||||||
|
#include <filesystem>
|
||||||
|
#include <fstream>
|
||||||
|
|
||||||
#include "data_generation/llama_generator.h"
|
#include "data_generation/llama_generator.h"
|
||||||
|
|
||||||
namespace fs = std::filesystem;
|
namespace fs = std::filesystem;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Loads brewery system prompt from disk or cache.
|
||||||
|
*
|
||||||
|
* @param prompt_file_path Preferred prompt file location.
|
||||||
|
* @return Prompt text loaded from disk or fallback content.
|
||||||
|
*/
|
||||||
std::string LlamaGenerator::LoadBrewerySystemPrompt(
|
std::string LlamaGenerator::LoadBrewerySystemPrompt(
|
||||||
const std::string& prompt_file_path) {
|
const std::string& prompt_file_path) {
|
||||||
// Return cached version if already loaded
|
// Return cached version if already loaded
|
||||||
@@ -15,9 +28,9 @@ std::string LlamaGenerator::LoadBrewerySystemPrompt(
|
|||||||
|
|
||||||
// Try multiple path locations
|
// Try multiple path locations
|
||||||
std::vector<std::string> paths_to_try = {
|
std::vector<std::string> paths_to_try = {
|
||||||
prompt_file_path, // As provided
|
prompt_file_path, // As provided
|
||||||
"../" + prompt_file_path, // One level up
|
"../" + prompt_file_path, // One level up
|
||||||
"../../" + prompt_file_path, // Two levels up
|
"../../" + prompt_file_path, // Two levels up
|
||||||
};
|
};
|
||||||
|
|
||||||
for (const auto& path : paths_to_try) {
|
for (const auto& path : paths_to_try) {
|
||||||
@@ -29,7 +42,8 @@ std::string LlamaGenerator::LoadBrewerySystemPrompt(
|
|||||||
|
|
||||||
if (!prompt.empty()) {
|
if (!prompt.empty()) {
|
||||||
spdlog::info(
|
spdlog::info(
|
||||||
"LlamaGenerator: Loaded brewery system prompt from '{}' ({} chars)",
|
"LlamaGenerator: Loaded brewery system prompt from '{}' ({} "
|
||||||
|
"chars)",
|
||||||
path, prompt.length());
|
path, prompt.length());
|
||||||
brewery_system_prompt_ = prompt;
|
brewery_system_prompt_ = prompt;
|
||||||
return brewery_system_prompt_;
|
return brewery_system_prompt_;
|
||||||
@@ -38,16 +52,23 @@ std::string LlamaGenerator::LoadBrewerySystemPrompt(
|
|||||||
}
|
}
|
||||||
|
|
||||||
spdlog::warn(
|
spdlog::warn(
|
||||||
"LlamaGenerator: Could not open brewery system prompt file at any of the "
|
"LlamaGenerator: Could not open brewery system prompt file at any of "
|
||||||
|
"the "
|
||||||
"expected locations. Using fallback inline prompt.");
|
"expected locations. Using fallback inline prompt.");
|
||||||
return GetFallbackBreweryPrompt();
|
return GetFallbackBreweryPrompt();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Fallback: minimal inline prompt if file fails to load
|
/**
|
||||||
|
* @brief Provides an inline fallback brewery system prompt.
|
||||||
|
*
|
||||||
|
* @return Default fallback prompt text.
|
||||||
|
*/
|
||||||
std::string LlamaGenerator::GetFallbackBreweryPrompt() {
|
std::string LlamaGenerator::GetFallbackBreweryPrompt() {
|
||||||
return "You are an experienced brewmaster and owner of a local craft brewery. "
|
return "You are an experienced brewmaster and owner of a local craft "
|
||||||
|
"brewery. "
|
||||||
"Create a distinctive, authentic name and detailed description that "
|
"Create a distinctive, authentic name and detailed description that "
|
||||||
"genuinely reflects your specific location, brewing philosophy, local "
|
"genuinely reflects your specific location, brewing philosophy, "
|
||||||
|
"local "
|
||||||
"culture, and community connection. The brewery must feel real and "
|
"culture, and community connection. The brewery must feel real and "
|
||||||
"grounded—not generic or interchangeable.\n\n"
|
"grounded—not generic or interchangeable.\n\n"
|
||||||
"AVOID REPETITIVE PHRASES - Never use:\n"
|
"AVOID REPETITIVE PHRASES - Never use:\n"
|
||||||
@@ -56,14 +77,16 @@ std::string LlamaGenerator::GetFallbackBreweryPrompt() {
|
|||||||
"into, ancient roots, timeless, where tradition meets innovation\n\n"
|
"into, ancient roots, timeless, where tradition meets innovation\n\n"
|
||||||
"OPENING APPROACHES - Choose ONE:\n"
|
"OPENING APPROACHES - Choose ONE:\n"
|
||||||
"1. Start with specific beer style and its regional origins\n"
|
"1. Start with specific beer style and its regional origins\n"
|
||||||
"2. Begin with specific brewing challenge (water, altitude, climate)\n"
|
"2. Begin with specific brewing challenge (water, altitude, "
|
||||||
|
"climate)\n"
|
||||||
"3. Open with founding story or personal motivation\n"
|
"3. Open with founding story or personal motivation\n"
|
||||||
"4. Lead with specific local ingredient or resource\n"
|
"4. Lead with specific local ingredient or resource\n"
|
||||||
"5. Start with unexpected angle or contradiction\n"
|
"5. Start with unexpected angle or contradiction\n"
|
||||||
"6. Open with local event, tradition, or cultural moment\n"
|
"6. Open with local event, tradition, or cultural moment\n"
|
||||||
"7. Begin with tangible architectural or geographic detail\n\n"
|
"7. Begin with tangible architectural or geographic detail\n\n"
|
||||||
"BE SPECIFIC - Include:\n"
|
"BE SPECIFIC - Include:\n"
|
||||||
"- At least ONE concrete proper noun (landmark, river, neighborhood)\n"
|
"- At least ONE concrete proper noun (landmark, river, "
|
||||||
|
"neighborhood)\n"
|
||||||
"- Specific beer styles relevant to the REGION'S culture\n"
|
"- Specific beer styles relevant to the REGION'S culture\n"
|
||||||
"- Concrete brewing challenges or advantages\n"
|
"- Concrete brewing challenges or advantages\n"
|
||||||
"- Sensory details SPECIFIC to place—not generic adjectives\n\n"
|
"- Sensory details SPECIFIC to place—not generic adjectives\n\n"
|
||||||
|
|||||||
@@ -1,8 +1,7 @@
|
|||||||
/**
|
/**
|
||||||
* Sampling Configuration Module
|
* @file data_generation/llama/set_sampling_options.cpp
|
||||||
* Configures the hyperparameters that control probabilistic token selection
|
* @brief Validates and stores sampling temperature, top-p, seed, and context
|
||||||
* during text generation. These settings affect the randomness, diversity, and
|
* size configuration used by subsequent LlamaGenerator inference calls.
|
||||||
* quality of generated output.
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <stdexcept>
|
#include <stdexcept>
|
||||||
|
|||||||
@@ -1,3 +1,9 @@
|
|||||||
|
/**
|
||||||
|
* @file data_generation/mock/data.cpp
|
||||||
|
* @brief Defines static lookup tables used by MockGenerator for deterministic
|
||||||
|
* brewery names, descriptions, usernames, and bios.
|
||||||
|
*/
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
|
|||||||
@@ -1,12 +1,18 @@
|
|||||||
|
/**
|
||||||
|
* @file data_generation/mock/deterministic_hash.cpp
|
||||||
|
* @brief Implements a stable hash combiner used by MockGenerator to derive
|
||||||
|
* repeatable pseudo-random indices from location input.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <boost/container_hash/hash.hpp>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
#include "data_generation/mock_generator.h"
|
#include "data_generation/mock_generator.h"
|
||||||
|
|
||||||
std::size_t MockGenerator::DeterministicHash(const std::string& a,
|
std::size_t MockGenerator::DeterministicHash(const std::string& a,
|
||||||
const std::string& b) {
|
const std::string& b) {
|
||||||
std::size_t seed = std::hash<std::string>{}(a);
|
std::size_t seed = 0;
|
||||||
const std::size_t mixed = std::hash<std::string>{}(b);
|
boost::hash_combine(seed, a);
|
||||||
seed ^= mixed + 0x9e3779b97f4a7c15ULL + (seed << 6) + (seed >> 2);
|
boost::hash_combine(seed, b);
|
||||||
seed = (seed << 13) | (seed >> ((sizeof(std::size_t) * 8) - 13));
|
|
||||||
return seed;
|
return seed;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,3 +1,9 @@
|
|||||||
|
/**
|
||||||
|
* @file data_generation/mock/generate_brewery.cpp
|
||||||
|
* @brief Builds deterministic brewery names and descriptions by hashing city
|
||||||
|
* and country into fixed mock phrase catalogs.
|
||||||
|
*/
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
#include "data_generation/mock_generator.h"
|
#include "data_generation/mock_generator.h"
|
||||||
@@ -10,7 +16,8 @@ auto MockGenerator::GenerateBrewery(const std::string& city_name,
|
|||||||
|
|
||||||
const std::string& adjective =
|
const std::string& adjective =
|
||||||
kBreweryAdjectives.at(hash % kBreweryAdjectives.size());
|
kBreweryAdjectives.at(hash % kBreweryAdjectives.size());
|
||||||
const std::string& noun = kBreweryNouns.at((hash / 7) % kBreweryNouns.size());
|
const std::string& noun =
|
||||||
|
kBreweryNouns.at((hash / 7) % kBreweryNouns.size());
|
||||||
const std::string& base_description =
|
const std::string& base_description =
|
||||||
kBreweryDescriptions.at((hash / 13) % kBreweryDescriptions.size());
|
kBreweryDescriptions.at((hash / 13) % kBreweryDescriptions.size());
|
||||||
|
|
||||||
|
|||||||
@@ -1,3 +1,9 @@
|
|||||||
|
/**
|
||||||
|
* @file data_generation/mock/generate_user.cpp
|
||||||
|
* @brief Generates deterministic mock user profiles by hashing locale values
|
||||||
|
* into predefined username and bio collections.
|
||||||
|
*/
|
||||||
|
|
||||||
#include <functional>
|
#include <functional>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
|
|||||||
@@ -1,3 +1,9 @@
|
|||||||
|
/**
|
||||||
|
* @file data_generation/mock/load.cpp
|
||||||
|
* @brief Provides MockGenerator initialization behavior, which is a no-op load
|
||||||
|
* path that logs readiness without model resources.
|
||||||
|
*/
|
||||||
|
|
||||||
#include <spdlog/spdlog.h>
|
#include <spdlog/spdlog.h>
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|||||||
@@ -1,21 +1,26 @@
|
|||||||
|
/**
|
||||||
|
* @file json_handling/json_loader.cpp
|
||||||
|
* @brief Parses curated location JSON input into strongly typed Location
|
||||||
|
* records with strict field validation and descriptive error reporting.
|
||||||
|
*/
|
||||||
|
|
||||||
#include "json_handling/json_loader.h"
|
#include "json_handling/json_loader.h"
|
||||||
|
|
||||||
#include <spdlog/spdlog.h>
|
#include <spdlog/spdlog.h>
|
||||||
|
|
||||||
#include <boost/json.hpp>
|
#include <boost/json.hpp>
|
||||||
|
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
#include <sstream>
|
#include <sstream>
|
||||||
#include <stdexcept>
|
#include <stdexcept>
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
|
|
||||||
auto ReadRequiredString(const boost::json::object& object,
|
auto ReadRequiredString(const boost::json::object& object, const char* key)
|
||||||
const char* key) -> std::string {
|
-> std::string {
|
||||||
const boost::json::value* value = object.if_contains(key);
|
const boost::json::value* value = object.if_contains(key);
|
||||||
if (value == nullptr || !value->is_string()) {
|
if (value == nullptr || !value->is_string()) {
|
||||||
throw std::runtime_error(std::string("Missing or invalid string field: ") +
|
throw std::runtime_error(
|
||||||
key);
|
std::string("Missing or invalid string field: ") + key);
|
||||||
}
|
}
|
||||||
return std::string(value->as_string().c_str());
|
return std::string(value->as_string().c_str());
|
||||||
}
|
}
|
||||||
@@ -24,8 +29,8 @@ auto ReadRequiredNumber(const boost::json::object& object, const char* key)
|
|||||||
-> double {
|
-> double {
|
||||||
const boost::json::value* value = object.if_contains(key);
|
const boost::json::value* value = object.if_contains(key);
|
||||||
if (value == nullptr || !value->is_number()) {
|
if (value == nullptr || !value->is_number()) {
|
||||||
throw std::runtime_error(std::string("Missing or invalid numeric field: ") +
|
throw std::runtime_error(
|
||||||
key);
|
std::string("Missing or invalid numeric field: ") + key);
|
||||||
}
|
}
|
||||||
return value->to_number<double>();
|
return value->to_number<double>();
|
||||||
}
|
}
|
||||||
@@ -33,7 +38,7 @@ auto ReadRequiredNumber(const boost::json::object& object, const char* key)
|
|||||||
} // namespace
|
} // namespace
|
||||||
|
|
||||||
auto JsonLoader::LoadLocations(const std::string& filepath)
|
auto JsonLoader::LoadLocations(const std::string& filepath)
|
||||||
-> std::vector<Location> {
|
-> std::vector<Location> {
|
||||||
std::ifstream input(filepath);
|
std::ifstream input(filepath);
|
||||||
if (!input.is_open()) {
|
if (!input.is_open()) {
|
||||||
throw std::runtime_error("Failed to open locations file: " + filepath);
|
throw std::runtime_error("Failed to open locations file: " + filepath);
|
||||||
|
|||||||
@@ -1,3 +1,9 @@
|
|||||||
|
/**
|
||||||
|
* @file main.cpp
|
||||||
|
* @brief Parses command-line options, validates runtime mode selection,
|
||||||
|
* initializes shared infrastructure, and executes the pipeline entry flow.
|
||||||
|
*/
|
||||||
|
|
||||||
#include <spdlog/spdlog.h>
|
#include <spdlog/spdlog.h>
|
||||||
|
|
||||||
#include <boost/program_options.hpp>
|
#include <boost/program_options.hpp>
|
||||||
|
|||||||
@@ -1,3 +1,9 @@
|
|||||||
|
/**
|
||||||
|
* @file web_client/curl_web_client.cpp
|
||||||
|
* @brief Implements libcurl-backed HTTP utilities, including GET requests,
|
||||||
|
* file downloads, URL encoding, and RAII global curl lifecycle handling.
|
||||||
|
*/
|
||||||
|
|
||||||
#include "web_client/curl_web_client.h"
|
#include "web_client/curl_web_client.h"
|
||||||
|
|
||||||
#include <curl/curl.h>
|
#include <curl/curl.h>
|
||||||
|
|||||||
@@ -1,3 +1,9 @@
|
|||||||
|
/**
|
||||||
|
* @file wikipedia/wikipedia_service.cpp
|
||||||
|
* @brief Implements Wikipedia extract retrieval and caching for city/country
|
||||||
|
* queries, including response parsing and resilient error handling.
|
||||||
|
*/
|
||||||
|
|
||||||
#include "wikipedia/wikipedia_service.h"
|
#include "wikipedia/wikipedia_service.h"
|
||||||
|
|
||||||
#include <spdlog/spdlog.h>
|
#include <spdlog/spdlog.h>
|
||||||
|
|||||||
Reference in New Issue
Block a user