mirror of
https://github.com/aaronpo97/the-biergarten-app.git
synced 2026-06-01 01:54:00 +00:00
Create biergarten brewery pipeline project (#199)
This commit is contained in:
@@ -0,0 +1,14 @@
|
||||
/**
|
||||
* @file biergarten_data_generator/biergarten_data_generator.cc
|
||||
* @brief BiergartenDataGenerator constructor implementation.
|
||||
*/
|
||||
|
||||
#include "biergarten_data_generator.h"
|
||||
|
||||
#include <utility>
|
||||
|
||||
BiergartenDataGenerator::BiergartenDataGenerator(
|
||||
std::unique_ptr<IEnrichmentService> context_service,
|
||||
std::unique_ptr<DataGenerator> generator)
|
||||
: context_service_(std::move(context_service)),
|
||||
generator_(std::move(generator)) {}
|
||||
39
pipeline/src/biergarten_data_generator/generate_breweries.cc
Normal file
39
pipeline/src/biergarten_data_generator/generate_breweries.cc
Normal file
@@ -0,0 +1,39 @@
|
||||
/**
|
||||
* @file biergarten_data_generator/generate_breweries.cc
|
||||
* @brief BiergartenDataGenerator::GenerateBreweries() implementation.
|
||||
*/
|
||||
|
||||
#include <spdlog/spdlog.h>
|
||||
|
||||
#include "biergarten_data_generator.h"
|
||||
|
||||
void BiergartenDataGenerator::GenerateBreweries(
|
||||
std::span<const EnrichedCity> cities) {
|
||||
spdlog::info("\n=== SAMPLE BREWERY GENERATION ===");
|
||||
|
||||
generated_breweries_.clear();
|
||||
size_t skipped_count = 0;
|
||||
|
||||
for (const auto& [location, region_context] : cities) {
|
||||
try {
|
||||
const BreweryResult brewery =
|
||||
generator_->GenerateBrewery(location, region_context);
|
||||
|
||||
const GeneratedBrewery gen{.location = location, .brewery = brewery};
|
||||
|
||||
generated_breweries_.push_back(gen);
|
||||
} catch (const std::exception& e) {
|
||||
++skipped_count;
|
||||
|
||||
spdlog::warn(
|
||||
"[Pipeline] Skipping city '{}' ({}): brewery generation failed: "
|
||||
"{}",
|
||||
location.city, location.country, e.what());
|
||||
}
|
||||
}
|
||||
|
||||
if (skipped_count > 0) {
|
||||
spdlog::warn("[Pipeline] Skipped {} city/cities due to generation errors",
|
||||
skipped_count);
|
||||
}
|
||||
}
|
||||
27
pipeline/src/biergarten_data_generator/log_results.cc
Normal file
27
pipeline/src/biergarten_data_generator/log_results.cc
Normal file
@@ -0,0 +1,27 @@
|
||||
/**
|
||||
* @file biergarten_data_generator/log_results.cc
|
||||
* @brief BiergartenDataGenerator::LogResults() implementation.
|
||||
*/
|
||||
|
||||
#include <spdlog/spdlog.h>
|
||||
|
||||
#include "biergarten_data_generator.h"
|
||||
|
||||
void BiergartenDataGenerator::LogResults() const {
|
||||
spdlog::info("\n=== GENERATED DATA DUMP ===");
|
||||
size_t index = 1;
|
||||
for (const auto& [location, brewery] : generated_breweries_) {
|
||||
spdlog::info(
|
||||
"{}. city=\"{}\" country=\"{}\" state=\"{}\" "
|
||||
"iso3166_2={} lat={} lon={}",
|
||||
index, location.city, location.country, location.state_province,
|
||||
location.iso3166_2, location.latitude, location.longitude);
|
||||
spdlog::info(" brewery_name_en=\"{}\"", brewery.name_en);
|
||||
spdlog::info(" brewery_description_en=\"{}\"",
|
||||
brewery.description_en);
|
||||
spdlog::info(" brewery_name_local=\"{}\"", brewery.name_local);
|
||||
spdlog::info(" brewery_description_local=\"{}\"",
|
||||
brewery.description_local);
|
||||
++index;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,41 @@
|
||||
/**
|
||||
* @file biergarten_data_generator/query_cities_with_countries.cc
|
||||
* @brief BiergartenDataGenerator::QueryCitiesWithCountries() implementation.
|
||||
*/
|
||||
|
||||
#include <spdlog/spdlog.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <filesystem>
|
||||
#include <iterator>
|
||||
#include <random>
|
||||
|
||||
#include "biergarten_data_generator.h"
|
||||
#include "json_handling/json_loader.h"
|
||||
|
||||
static constexpr size_t kBreweryAmount = 50;
|
||||
|
||||
std::vector<Location> BiergartenDataGenerator::QueryCitiesWithCountries() {
|
||||
spdlog::info("\n=== GEOGRAPHIC DATA OVERVIEW ===");
|
||||
|
||||
const std::filesystem::path locations_path = "locations.json";
|
||||
|
||||
auto all_locations = JsonLoader::LoadLocations(locations_path);
|
||||
spdlog::info(" Locations available: {}", all_locations.size());
|
||||
|
||||
const size_t sample_count = std::min(kBreweryAmount, all_locations.size());
|
||||
|
||||
const auto sample_count_signed =
|
||||
static_cast<std::iter_difference_t<decltype(all_locations.cbegin())>>(
|
||||
sample_count);
|
||||
|
||||
std::vector<Location> sampled_locations;
|
||||
sampled_locations.reserve(sample_count);
|
||||
|
||||
std::random_device random_generator;
|
||||
std::ranges::sample(all_locations, std::back_inserter(sampled_locations),
|
||||
sample_count_signed, random_generator);
|
||||
|
||||
spdlog::info(" Sampled locations: {}", sampled_locations.size());
|
||||
return sampled_locations;
|
||||
}
|
||||
49
pipeline/src/biergarten_data_generator/run.cc
Normal file
49
pipeline/src/biergarten_data_generator/run.cc
Normal file
@@ -0,0 +1,49 @@
|
||||
/**
|
||||
* @file biergarten_data_generator/run.cc
|
||||
* @brief BiergartenDataGenerator::Run() implementation.
|
||||
*/
|
||||
|
||||
#include <utility>
|
||||
|
||||
#include <spdlog/spdlog.h>
|
||||
|
||||
#include "biergarten_data_generator.h"
|
||||
|
||||
bool BiergartenDataGenerator::Run() {
|
||||
try {
|
||||
std::vector<Location> cities = QueryCitiesWithCountries();
|
||||
std::vector<EnrichedCity> enriched;
|
||||
enriched.reserve(cities.size());
|
||||
|
||||
size_t skipped_count = 0;
|
||||
for (auto& city : cities) {
|
||||
try {
|
||||
std::string region_context = context_service_->GetLocationContext(city);
|
||||
spdlog::debug("[Pipeline] Context for '{}' ({}) gathered:\n{}",
|
||||
city.city, city.country, region_context);
|
||||
|
||||
enriched.push_back(
|
||||
EnrichedCity{.location = std::move(city),
|
||||
.region_context = std::move(region_context)});
|
||||
} catch (const std::exception& exception) {
|
||||
++skipped_count;
|
||||
spdlog::warn(
|
||||
"[Pipeline] Skipping city '{}' ({}): context lookup failed: {}",
|
||||
city.city, city.country, exception.what());
|
||||
}
|
||||
}
|
||||
|
||||
if (skipped_count > 0) {
|
||||
spdlog::warn(
|
||||
"[Pipeline] Skipped {} city/cities due to context lookup errors",
|
||||
skipped_count);
|
||||
}
|
||||
|
||||
this->GenerateBreweries(enriched);
|
||||
this->LogResults();
|
||||
return true;
|
||||
} catch (const std::exception& e) {
|
||||
spdlog::error("Pipeline execution failed with error: {}", e.what());
|
||||
return false;
|
||||
}
|
||||
}
|
||||
144
pipeline/src/data_generation/llama/generate_brewery.cc
Normal file
144
pipeline/src/data_generation/llama/generate_brewery.cc
Normal file
@@ -0,0 +1,144 @@
|
||||
/**
|
||||
* @file data_generation/llama/generate_brewery.cc
|
||||
* @brief Builds brewery prompts with regional context, performs retry-based
|
||||
* inference, and validates structured JSON output for brewery records.
|
||||
*/
|
||||
|
||||
#include <spdlog/spdlog.h>
|
||||
|
||||
#include <format>
|
||||
#include <optional>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <vector>
|
||||
|
||||
#include "data_generation/llama_generator.h"
|
||||
#include "data_generation/llama_generator_helpers.h"
|
||||
|
||||
static std::string FormatLocalLanguageCodes(
|
||||
const std::vector<std::string>& codes) {
|
||||
if (codes.empty()) {
|
||||
return "Not provided";
|
||||
}
|
||||
|
||||
std::string formatted;
|
||||
for (const std::string& code : codes) {
|
||||
if (!formatted.empty()) {
|
||||
formatted += ", ";
|
||||
}
|
||||
formatted += code;
|
||||
}
|
||||
|
||||
return formatted;
|
||||
}
|
||||
|
||||
static constexpr std::string_view kBreweryJsonGrammar = R"json_brewery(
|
||||
root ::= thought-block "{" ws "\"name_en\"" ws ":" ws string ws "," ws "\"description_en\"" ws ":" ws string ws "," ws "\"name_local\"" ws ":" ws string ws "," ws "\"description_local\"" ws ":" ws string ws "}" ws
|
||||
thought-block ::= [^{]*
|
||||
ws ::= [ \t\n\r]*
|
||||
string ::= "\"" char+ "\""
|
||||
char ::= [^"\\\x7F\x00-\x1F] | [\\] escape
|
||||
escape ::= ["\\/bfnrt] | "u" hex hex hex hex
|
||||
hex ::= [0-9a-fA-F]
|
||||
)json_brewery";
|
||||
|
||||
static constexpr int kBreweryInitialMaxTokens = 2800;
|
||||
|
||||
BreweryResult LlamaGenerator::GenerateBrewery(
|
||||
const Location& location, const std::string& region_context) {
|
||||
/**
|
||||
* Preprocess and truncate region context to manageable size
|
||||
*/
|
||||
const std::string safe_region_context = PrepareRegionContext(region_context);
|
||||
|
||||
const std::string local_language_codes =
|
||||
FormatLocalLanguageCodes(location.local_languages);
|
||||
|
||||
const std::string country_suffix =
|
||||
location.country.empty() ? std::string{}
|
||||
: std::format(", {}", location.country);
|
||||
/**
|
||||
* Load brewery system prompt from file
|
||||
* Falls back to minimal inline prompt if file not found
|
||||
*/
|
||||
const std::string system_prompt =
|
||||
LoadBrewerySystemPrompt("prompts/system.md");
|
||||
|
||||
std::string user_prompt = std::format(
|
||||
"## CITY:\n{}\n\n## COUNTRY:\n{}\n\n## LOCAL LANGUAGE CODES:\n{}\n\n## "
|
||||
"CONTEXT:\n{}",
|
||||
location.city, location.country, local_language_codes,
|
||||
safe_region_context);
|
||||
|
||||
/**
|
||||
* Store location context for retry prompts (without repeating full context)
|
||||
*/
|
||||
const std::string retry_location =
|
||||
std::format("Location: {}{}\nLocal language codes: {}", location.city,
|
||||
country_suffix, local_language_codes);
|
||||
|
||||
/**
|
||||
* RETRY LOOP with validation and error correction
|
||||
* Attempts to generate valid brewery data up to 3 times, with feedback-based
|
||||
* refinement
|
||||
*/
|
||||
constexpr int max_attempts = 3;
|
||||
std::string raw;
|
||||
std::string last_error;
|
||||
|
||||
// Token budget: too small risks truncating valid JSON mid-string.
|
||||
// Start conservatively but allow adaptive increases on truncation.
|
||||
int max_tokens = kBreweryInitialMaxTokens;
|
||||
|
||||
// Limit output length to keep it concise and focused
|
||||
for (int attempt = 0; attempt < max_attempts; ++attempt) {
|
||||
// Generate brewery data from LLM
|
||||
raw = this->Infer(system_prompt, user_prompt, max_tokens,
|
||||
kBreweryJsonGrammar);
|
||||
spdlog::debug("LlamaGenerator: raw output (attempt {}): {}", attempt + 1,
|
||||
raw);
|
||||
|
||||
// Validate output: parse JSON and check required fields
|
||||
|
||||
BreweryResult brewery;
|
||||
const std::optional<std::string> validation_error =
|
||||
ValidateBreweryJson(raw, brewery);
|
||||
|
||||
if (!validation_error.has_value()) {
|
||||
// Success: return parsed brewery data
|
||||
|
||||
spdlog::info(
|
||||
"LlamaGenerator: successfully generated brewery data on attempt {}",
|
||||
attempt + 1);
|
||||
|
||||
return brewery;
|
||||
}
|
||||
|
||||
// Validation failed: log error and prepare corrective feedback
|
||||
|
||||
last_error = *validation_error;
|
||||
spdlog::warn("LlamaGenerator: malformed brewery JSON (attempt {}): {}",
|
||||
attempt + 1, *validation_error);
|
||||
|
||||
// Update prompt with error details to guide LLM toward correct output.
|
||||
user_prompt = std::format(
|
||||
"Your previous response was invalid. Error: {}\nReturn the thought "
|
||||
"process before the JSON if needed, then return ONLY valid JSON with "
|
||||
"exactly these keys, in this exact order: {{\"name_en\": \"<English "
|
||||
"brewery name>\", \"description_en\": \"<English single-paragraph "
|
||||
"description>\", \"name_local\": \"<local-language brewery name>\", "
|
||||
"\"description_local\": \"<local-language single-paragraph "
|
||||
"description>\"}}.\nDo not include markdown, comments, extra keys, or "
|
||||
"literal placeholder values.\n\nKeep the JSON strings concise enough "
|
||||
"to fit within the token budget.\n\n{}",
|
||||
*validation_error, retry_location);
|
||||
}
|
||||
|
||||
// All retry attempts exhausted: log failure and throw exception
|
||||
spdlog::error(
|
||||
"LlamaGenerator: malformed brewery response after {} attempts: "
|
||||
"{}",
|
||||
max_attempts, last_error.empty() ? raw : last_error);
|
||||
throw std::runtime_error("LlamaGenerator: malformed brewery response");
|
||||
}
|
||||
18
pipeline/src/data_generation/llama/generate_user.cc
Normal file
18
pipeline/src/data_generation/llama/generate_user.cc
Normal file
@@ -0,0 +1,18 @@
|
||||
/**
|
||||
* @file data_generation/llama/generate_user.cc
|
||||
* @brief Generates locale-aware user profiles with strict two-line formatting,
|
||||
* retry handling, and output sanitization for downstream parsing.
|
||||
*/
|
||||
|
||||
#include <spdlog/spdlog.h>
|
||||
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
|
||||
#include "data_generation/llama_generator.h"
|
||||
#include "data_generation/llama_generator_helpers.h"
|
||||
|
||||
UserResult LlamaGenerator::GenerateUser(const std::string& locale) {
|
||||
return {.username = "test_user",
|
||||
.bio = "This is a test user profile from " + locale + "."};
|
||||
}
|
||||
215
pipeline/src/data_generation/llama/helpers.cc
Normal file
215
pipeline/src/data_generation/llama/helpers.cc
Normal file
@@ -0,0 +1,215 @@
|
||||
/**
|
||||
* @file data_generation/llama/helpers.cc
|
||||
* @brief Provides prompt formatting, whitespace normalization, response
|
||||
* parsing, token decoding, and JSON validation helpers for Llama modules.
|
||||
*/
|
||||
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
#include <boost/json.hpp>
|
||||
#include <cctype>
|
||||
#include <optional>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <vector>
|
||||
|
||||
#include "data_generation/llama_generator_helpers.h"
|
||||
#include "llama.h"
|
||||
|
||||
/**
|
||||
* String trimming: removes leading and trailing whitespace
|
||||
*/
|
||||
static std::string Trim(std::string_view value) {
|
||||
constexpr std::string_view whitespace = " \t\n\r\f\v";
|
||||
const size_t first_index = value.find_first_not_of(whitespace);
|
||||
if (first_index == std::string_view::npos) {
|
||||
return {};
|
||||
}
|
||||
|
||||
const size_t last_index = value.find_last_not_of(whitespace);
|
||||
return std::string(value.substr(first_index, last_index - first_index + 1));
|
||||
}
|
||||
|
||||
/**
|
||||
* Normalize whitespace: collapses multiple spaces/tabs/newlines into single
|
||||
* spaces
|
||||
*/
|
||||
static std::string CondenseWhitespace(std::string_view text) {
|
||||
std::string out;
|
||||
out.reserve(text.size());
|
||||
|
||||
bool pending_space = false;
|
||||
for (const char chr : text) {
|
||||
if (std::isspace(static_cast<unsigned char>(chr)) != 0) {
|
||||
if (!out.empty()) {
|
||||
pending_space = true;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
if (pending_space) {
|
||||
out.push_back(' ');
|
||||
pending_space = false;
|
||||
}
|
||||
out.push_back(chr);
|
||||
}
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
/**
|
||||
* Truncate region context to fit within max length while preserving word
|
||||
* boundaries
|
||||
*/
|
||||
std::string PrepareRegionContext(std::string_view region_context,
|
||||
const size_t max_chars) {
|
||||
std::string normalized = CondenseWhitespace(region_context);
|
||||
if (normalized.size() <= max_chars) {
|
||||
return normalized;
|
||||
}
|
||||
|
||||
normalized.resize(max_chars);
|
||||
const size_t last_space = normalized.find_last_of(' ');
|
||||
if (last_space != std::string::npos && last_space > max_chars / 2) {
|
||||
normalized.resize(last_space);
|
||||
}
|
||||
|
||||
normalized += "...";
|
||||
return normalized;
|
||||
}
|
||||
|
||||
void AppendTokenPiece(const llama_vocab* vocab, llama_token token,
|
||||
std::string& output) {
|
||||
constexpr size_t initial_buffer_size = 256;
|
||||
|
||||
std::array<char, initial_buffer_size> buffer{};
|
||||
|
||||
// serialize the sampled token into UTF-8 bytes
|
||||
|
||||
auto buffer_too_small = [](int32_t result) -> bool { return result < 0; };
|
||||
|
||||
int32_t bytes =
|
||||
llama_token_to_piece(vocab, token, buffer.data(), buffer.size(), 0, true);
|
||||
|
||||
if (!buffer_too_small(bytes)) {
|
||||
// Append the decoded bytes from the stack buffer.
|
||||
output.append(buffer.data(), static_cast<size_t>(bytes));
|
||||
return;
|
||||
}
|
||||
|
||||
const int32_t required_size = -bytes;
|
||||
std::vector<char> dynamic_buffer(static_cast<size_t>(required_size));
|
||||
|
||||
// Retry token decoding against the larger heap buffer.
|
||||
bytes = llama_token_to_piece(vocab, token, dynamic_buffer.data(),
|
||||
static_cast<int32_t>(dynamic_buffer.size()), 0,
|
||||
true);
|
||||
|
||||
if (!buffer_too_small(bytes)) {
|
||||
output.append(dynamic_buffer.data(), static_cast<size_t>(bytes));
|
||||
return;
|
||||
}
|
||||
|
||||
throw std::runtime_error(
|
||||
"LlamaGenerator: failed to decode sampled token piece");
|
||||
}
|
||||
|
||||
static bool ReadRequiredTrimmedStringField(const boost::json::object& obj,
|
||||
std::string_view key,
|
||||
std::string& out,
|
||||
std::string* error_out) {
|
||||
const boost::json::value* field = obj.if_contains(key);
|
||||
if (field == nullptr || !field->is_string()) {
|
||||
if (error_out != nullptr) {
|
||||
*error_out = "JSON field '" + std::string(key) +
|
||||
"' is missing or not a string";
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto& string_value = field->as_string();
|
||||
out = Trim(std::string_view(string_value.data(), string_value.size()));
|
||||
if (out.empty()) {
|
||||
if (error_out != nullptr) {
|
||||
*error_out = "JSON field '" + std::string(key) + "' must not be empty";
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool HasSchemaPlaceholder(const std::array<std::string*, 4>& values) {
|
||||
for (const std::string* value : values) {
|
||||
std::string lowered = *value;
|
||||
std::ranges::transform(lowered, lowered.begin(),
|
||||
[](unsigned char character) {
|
||||
return static_cast<char>(std::tolower(character));
|
||||
});
|
||||
|
||||
if (lowered == "string") {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
std::optional<std::string> ValidateBreweryJson(const std::string& raw,
|
||||
BreweryResult& brewery_out) {
|
||||
boost::system::error_code error_code;
|
||||
const std::string_view raw_view(raw);
|
||||
const size_t opening_brace = raw_view.find('{');
|
||||
if (opening_brace == std::string_view::npos) {
|
||||
return "JSON parse error: missing opening brace '{'";
|
||||
}
|
||||
|
||||
const std::string_view json_payload = raw_view.substr(opening_brace);
|
||||
boost::json::value json_value = boost::json::parse(json_payload, error_code);
|
||||
if (error_code) {
|
||||
return "JSON parse error: " + error_code.message();
|
||||
}
|
||||
|
||||
if (!json_value.is_object()) {
|
||||
return "JSON root must be an object";
|
||||
}
|
||||
|
||||
const auto& obj = json_value.get_object();
|
||||
if (obj.size() != 4) {
|
||||
return "JSON object must contain exactly four keys";
|
||||
}
|
||||
|
||||
std::string validation_error;
|
||||
if (!ReadRequiredTrimmedStringField(obj, "name_en", brewery_out.name_en,
|
||||
&validation_error)) {
|
||||
return validation_error;
|
||||
}
|
||||
|
||||
if (!ReadRequiredTrimmedStringField(obj, "description_en",
|
||||
brewery_out.description_en,
|
||||
&validation_error)) {
|
||||
return validation_error;
|
||||
}
|
||||
|
||||
if (!ReadRequiredTrimmedStringField(obj, "name_local",
|
||||
brewery_out.name_local,
|
||||
&validation_error)) {
|
||||
return validation_error;
|
||||
}
|
||||
|
||||
if (!ReadRequiredTrimmedStringField(obj, "description_local",
|
||||
brewery_out.description_local,
|
||||
&validation_error)) {
|
||||
return validation_error;
|
||||
}
|
||||
|
||||
const std::array<std::string*, 4> schema_placeholders = {
|
||||
&brewery_out.name_en, &brewery_out.description_en,
|
||||
&brewery_out.name_local, &brewery_out.description_local};
|
||||
if (HasSchemaPlaceholder(schema_placeholders)) {
|
||||
return "JSON appears to be a schema placeholder, not content";
|
||||
}
|
||||
|
||||
return std::nullopt;
|
||||
}
|
||||
241
pipeline/src/data_generation/llama/infer.cc
Normal file
241
pipeline/src/data_generation/llama/infer.cc
Normal file
@@ -0,0 +1,241 @@
|
||||
/**
|
||||
* Text Generation / Inference Module
|
||||
* Core module that performs LLM inference: converts text prompts into tokens,
|
||||
* runs the neural network forward pass, samples the next token, and converts
|
||||
* output tokens back to text for system+user chat prompts.
|
||||
*/
|
||||
|
||||
#include <spdlog/spdlog.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <memory>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <vector>
|
||||
|
||||
#include "data_generation/llama_generator.h"
|
||||
#include "data_generation/llama_generator_helpers.h"
|
||||
#include "llama.h"
|
||||
|
||||
static constexpr size_t kPromptTokenSlack = 8;
|
||||
|
||||
namespace {
|
||||
|
||||
using SamplerHandle = std::unique_ptr<llama_sampler, decltype(&llama_sampler_free)>;
|
||||
|
||||
struct SamplerConfig {
|
||||
float temperature;
|
||||
uint32_t top_k;
|
||||
float top_p;
|
||||
uint32_t seed;
|
||||
};
|
||||
|
||||
SamplerHandle MakeSamplerChain(const llama_vocab* vocab,
|
||||
const SamplerConfig& config,
|
||||
std::string_view grammar) {
|
||||
const llama_sampler_chain_params sampler_params =
|
||||
llama_sampler_chain_default_params();
|
||||
|
||||
SamplerHandle chain(llama_sampler_chain_init(sampler_params),
|
||||
llama_sampler_free);
|
||||
if (!chain) {
|
||||
throw std::runtime_error("LlamaGenerator: failed to initialize sampler");
|
||||
}
|
||||
|
||||
auto add_sampler = [&](llama_sampler* sampler, const char* error_message) {
|
||||
if (sampler == nullptr) {
|
||||
throw std::runtime_error(error_message);
|
||||
}
|
||||
|
||||
llama_sampler_chain_add(chain.get(), sampler);
|
||||
};
|
||||
|
||||
if (!grammar.empty()) {
|
||||
const std::string grammar_text(grammar);
|
||||
add_sampler(llama_sampler_init_grammar(vocab, grammar_text.c_str(), "root"),
|
||||
"LlamaGenerator: failed to initialize grammar sampler");
|
||||
}
|
||||
|
||||
add_sampler(llama_sampler_init_temp(config.temperature),
|
||||
"LlamaGenerator: failed to initialize temperature sampler");
|
||||
add_sampler(llama_sampler_init_top_k(static_cast<int32_t>(config.top_k)),
|
||||
"LlamaGenerator: failed to initialize top-k sampler");
|
||||
add_sampler(llama_sampler_init_top_p(config.top_p, 1),
|
||||
"LlamaGenerator: failed to initialize top-p sampler");
|
||||
add_sampler(llama_sampler_init_dist(config.seed),
|
||||
"LlamaGenerator: failed to initialize distribution sampler");
|
||||
|
||||
return chain;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
std::string LlamaGenerator::Infer(const std::string& system_prompt,
|
||||
const std::string& prompt,
|
||||
const int max_tokens,
|
||||
std::string_view grammar) {
|
||||
return InferFormatted(prompt_formatter_->Format(system_prompt, prompt),
|
||||
max_tokens, grammar);
|
||||
}
|
||||
|
||||
std::string LlamaGenerator::InferFormatted(const std::string& formatted_prompt,
|
||||
const int max_tokens,
|
||||
std::string_view grammar) {
|
||||
/**
|
||||
* Validate that model and context are loaded
|
||||
*/
|
||||
if (!model_ || !context_) {
|
||||
throw std::runtime_error("LlamaGenerator: model not loaded");
|
||||
}
|
||||
|
||||
/**
|
||||
* Get vocabulary for tokenization and token-to-text conversion
|
||||
*/
|
||||
const llama_vocab* vocab = llama_model_get_vocab(model_.get());
|
||||
if (vocab == nullptr) {
|
||||
throw std::runtime_error("LlamaGenerator: vocab unavailable");
|
||||
}
|
||||
|
||||
const SamplerConfig sampler_config{
|
||||
.temperature = sampling_temperature_,
|
||||
.top_k = sampling_top_k_,
|
||||
.top_p = sampling_top_p_,
|
||||
.seed = static_cast<uint32_t>(rng_()),
|
||||
};
|
||||
auto sampler = MakeSamplerChain(vocab, sampler_config, grammar);
|
||||
|
||||
/**
|
||||
* Clear KV cache to ensure clean inference state (no residual context)
|
||||
*/
|
||||
llama_memory_clear(llama_get_memory(context_.get()), true);
|
||||
|
||||
/**
|
||||
* TOKENIZATION PHASE
|
||||
* Convert text prompt into token IDs (integers) that the model understands
|
||||
*/
|
||||
std::vector<llama_token> prompt_tokens(formatted_prompt.size() +
|
||||
kPromptTokenSlack);
|
||||
|
||||
|
||||
|
||||
|
||||
int32_t token_count = llama_tokenize(
|
||||
vocab,
|
||||
formatted_prompt.c_str(),
|
||||
static_cast<int32_t>(formatted_prompt.size()),
|
||||
prompt_tokens.data(),
|
||||
static_cast<int32_t>(prompt_tokens.size()),
|
||||
true,
|
||||
true);
|
||||
|
||||
/**
|
||||
* If buffer too small, negative return indicates required size
|
||||
*/
|
||||
if (token_count < 0) {
|
||||
prompt_tokens.resize(static_cast<size_t>(-token_count));
|
||||
|
||||
|
||||
token_count = llama_tokenize(
|
||||
vocab, formatted_prompt.c_str(),
|
||||
static_cast<int32_t>(formatted_prompt.size()), prompt_tokens.data(),
|
||||
static_cast<int32_t>(prompt_tokens.size()), true, true);
|
||||
}
|
||||
|
||||
if (token_count < 0) {
|
||||
throw std::runtime_error("LlamaGenerator: prompt tokenization failed");
|
||||
}
|
||||
|
||||
/**
|
||||
* CONTEXT SIZE VALIDATION
|
||||
* Validate and compute effective token budgets based on context window
|
||||
* constraints
|
||||
*/
|
||||
const auto n_ctx = static_cast<int32_t>(llama_n_ctx(context_.get()));
|
||||
const auto n_batch = static_cast<int32_t>(llama_n_batch(context_.get()));
|
||||
if (n_ctx <= 1 || n_batch <= 0) {
|
||||
throw std::runtime_error("LlamaGenerator: invalid context or batch size");
|
||||
}
|
||||
|
||||
/**
|
||||
* Clamp generation limit to available context window, reserve space for
|
||||
* output
|
||||
*/
|
||||
const int32_t effective_max_tokens =
|
||||
std::max(1, std::min(max_tokens, n_ctx - 1));
|
||||
|
||||
/**
|
||||
* Prompt can use remaining context after reserving space for generation
|
||||
*/
|
||||
int32_t prompt_budget = std::min(n_batch, n_ctx - effective_max_tokens);
|
||||
prompt_budget = std::max<int32_t>(1, prompt_budget);
|
||||
|
||||
/**
|
||||
* Truncate prompt if necessary to fit within constraints
|
||||
*/
|
||||
prompt_tokens.resize(static_cast<size_t>(token_count));
|
||||
if (token_count > prompt_budget) {
|
||||
spdlog::warn(
|
||||
"LlamaGenerator: prompt too long ({} tokens), truncating to {} "
|
||||
"tokens to fit n_batch/n_ctx limits",
|
||||
token_count, prompt_budget);
|
||||
prompt_tokens.resize(static_cast<size_t>(prompt_budget));
|
||||
token_count = prompt_budget;
|
||||
}
|
||||
|
||||
/**
|
||||
* PROMPT PROCESSING PHASE
|
||||
* Create a batch containing all prompt tokens and feed through the model
|
||||
* This computes internal representations and fills the KV cache
|
||||
*/
|
||||
const llama_batch prompt_batch = llama_batch_get_one(
|
||||
prompt_tokens.data(), static_cast<int32_t>(prompt_tokens.size()));
|
||||
if (llama_decode(context_.get(), prompt_batch) != 0) {
|
||||
throw std::runtime_error("LlamaGenerator: prompt decode failed");
|
||||
}
|
||||
|
||||
/**
|
||||
* TOKEN GENERATION LOOP
|
||||
* Iteratively generate tokens one at a time until max_tokens or
|
||||
* end-of-sequence
|
||||
*/
|
||||
std::vector<llama_token> generated_tokens;
|
||||
generated_tokens.reserve(static_cast<size_t>(effective_max_tokens));
|
||||
|
||||
for (int i = 0; i < effective_max_tokens; ++i) {
|
||||
/**
|
||||
* Sample next token using configured sampler chain and model logits
|
||||
* Index -1 means use the last output position from previous batch
|
||||
*/
|
||||
const llama_token next =
|
||||
llama_sampler_sample(sampler.get(), context_.get(), -1);
|
||||
/**
|
||||
* Stop if model predicts end-of-generation token (EOS/EOT)
|
||||
*/
|
||||
if (llama_vocab_is_eog(vocab, next)) {
|
||||
break;
|
||||
}
|
||||
generated_tokens.push_back(next);
|
||||
/**
|
||||
* Feed the sampled token back into model for next iteration
|
||||
* (autoregressive)
|
||||
*/
|
||||
llama_token decode_token = next;
|
||||
const llama_batch one_token_batch = llama_batch_get_one(&decode_token, 1);
|
||||
if (llama_decode(context_.get(), one_token_batch) != 0) {
|
||||
throw std::runtime_error(
|
||||
"LlamaGenerator: decode failed during generation");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* DETOKENIZATION PHASE
|
||||
* Convert generated token IDs back to text using vocabulary
|
||||
*/
|
||||
std::string output;
|
||||
for (const llama_token token : generated_tokens) {
|
||||
AppendTokenPiece(vocab, token, output);
|
||||
}
|
||||
|
||||
return output;
|
||||
}
|
||||
86
pipeline/src/data_generation/llama/llama_generator.cc
Normal file
86
pipeline/src/data_generation/llama/llama_generator.cc
Normal file
@@ -0,0 +1,86 @@
|
||||
/**
|
||||
* @file data_generation/llama/llama_generator.cc
|
||||
* @brief LlamaGenerator constructor and destructor implementation.
|
||||
*/
|
||||
|
||||
#include "data_generation/llama_generator.h"
|
||||
|
||||
#include <memory>
|
||||
#include <random>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <filesystem>
|
||||
|
||||
#include "data_model/application_options.h"
|
||||
#include "llama.h"
|
||||
|
||||
static constexpr uint32_t kMaxContextSize = 32768U;
|
||||
|
||||
void LlamaGenerator::ModelDeleter::operator()(
|
||||
llama_model* model) const noexcept {
|
||||
if (model != nullptr) {
|
||||
llama_model_free(model);
|
||||
}
|
||||
}
|
||||
|
||||
void LlamaGenerator::ContextDeleter::operator()(
|
||||
llama_context* context) const noexcept {
|
||||
if (context != nullptr) {
|
||||
llama_free(context);
|
||||
}
|
||||
}
|
||||
|
||||
LlamaGenerator::LlamaGenerator(const ApplicationOptions& options,
|
||||
const std::string& model_path,
|
||||
std::unique_ptr<IPromptFormatter> prompt_formatter)
|
||||
: rng_(std::random_device{}()),
|
||||
prompt_formatter_(std::move(prompt_formatter)) {
|
||||
if (model_path.empty()) {
|
||||
throw std::runtime_error("LlamaGenerator: model path must not be empty");
|
||||
}
|
||||
|
||||
if (!prompt_formatter_) {
|
||||
throw std::runtime_error(
|
||||
"LlamaGenerator: prompt formatter dependency must not be null");
|
||||
}
|
||||
|
||||
if (options.temperature < 0.0F) {
|
||||
throw std::runtime_error(
|
||||
"LlamaGenerator: sampling temperature must be >= 0");
|
||||
}
|
||||
|
||||
if (options.top_p <= 0.0F || options.top_p > 1.0F) {
|
||||
throw std::runtime_error(
|
||||
"LlamaGenerator: sampling top-p must be in (0, 1]");
|
||||
}
|
||||
|
||||
if (options.top_k == 0U) {
|
||||
throw std::runtime_error("LlamaGenerator: sampling top-k must be > 0");
|
||||
}
|
||||
|
||||
if (options.seed < -1) {
|
||||
throw std::runtime_error(
|
||||
"LlamaGenerator: seed must be >= 0, or -1 for random");
|
||||
}
|
||||
|
||||
if (options.n_ctx == 0 || options.n_ctx > kMaxContextSize) {
|
||||
throw std::runtime_error(
|
||||
"LlamaGenerator: context size must be in range [1, 32768]");
|
||||
}
|
||||
|
||||
sampling_temperature_ = options.temperature;
|
||||
sampling_top_p_ = options.top_p;
|
||||
sampling_top_k_ = options.top_k;
|
||||
|
||||
if (options.seed == -1) {
|
||||
std::random_device random_device;
|
||||
rng_.seed(random_device());
|
||||
} else {
|
||||
rng_.seed(static_cast<uint32_t>(options.seed));
|
||||
}
|
||||
n_ctx_ = options.n_ctx;
|
||||
|
||||
this->Load(model_path);
|
||||
}
|
||||
|
||||
LlamaGenerator::~LlamaGenerator() = default;
|
||||
43
pipeline/src/data_generation/llama/load.cc
Normal file
43
pipeline/src/data_generation/llama/load.cc
Normal file
@@ -0,0 +1,43 @@
|
||||
/**
|
||||
* @file data_generation/llama/load.cc
|
||||
* @brief Initializes llama backend, loads model weights, creates inference
|
||||
* context, and resets prior resources during model initialization.
|
||||
*/
|
||||
|
||||
#include <spdlog/spdlog.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
|
||||
#include "data_generation/llama_generator.h"
|
||||
#include "llama.h"
|
||||
|
||||
void LlamaGenerator::Load(const std::string& model_path) {
|
||||
context_.reset();
|
||||
model_.reset();
|
||||
|
||||
const llama_model_params model_params = llama_model_default_params();
|
||||
LlamaGenerator::ModelHandle loaded_model(
|
||||
llama_model_load_from_file(model_path.c_str(), model_params));
|
||||
if (!loaded_model) {
|
||||
throw std::runtime_error(
|
||||
"LlamaGenerator: failed to load model from path: " + model_path);
|
||||
}
|
||||
|
||||
llama_context_params context_params = llama_context_default_params();
|
||||
context_params.n_ctx = n_ctx_;
|
||||
context_params.n_batch = std::min(n_ctx_, static_cast<uint32_t>(5000));
|
||||
|
||||
LlamaGenerator::ContextHandle loaded_context(
|
||||
llama_init_from_model(loaded_model.get(), context_params));
|
||||
if (!loaded_context) {
|
||||
throw std::runtime_error("LlamaGenerator: failed to create context");
|
||||
}
|
||||
|
||||
model_ = std::move(loaded_model);
|
||||
context_ = std::move(loaded_context);
|
||||
|
||||
spdlog::info("[LlamaGenerator] Loaded model: {}", model_path);
|
||||
}
|
||||
56
pipeline/src/data_generation/llama/load_brewery_prompt.cc
Normal file
56
pipeline/src/data_generation/llama/load_brewery_prompt.cc
Normal file
@@ -0,0 +1,56 @@
|
||||
/**
|
||||
* @file data_generation/llama/load_brewery_prompt.cc
|
||||
* @brief Resolves brewery system prompt content from cache or a configured
|
||||
* filesystem path and provides a robust inline fallback prompt when absent.
|
||||
*/
|
||||
|
||||
#include <spdlog/spdlog.h>
|
||||
|
||||
#include <filesystem>
|
||||
#include <fstream>
|
||||
#include <stdexcept>
|
||||
|
||||
#include "data_generation/llama_generator.h"
|
||||
|
||||
/**
|
||||
* @brief Loads brewery system prompt from disk or cache.
|
||||
*
|
||||
* @param prompt_file_path Preferred prompt file location.
|
||||
* @return Prompt text loaded from disk.
|
||||
*/
|
||||
std::string LlamaGenerator::LoadBrewerySystemPrompt(
|
||||
const std::filesystem::path& prompt_file_path) {
|
||||
// Return cached version if already loaded
|
||||
if (!brewery_system_prompt_.empty()) {
|
||||
return brewery_system_prompt_;
|
||||
}
|
||||
|
||||
|
||||
std::ifstream prompt_file(prompt_file_path);
|
||||
if (!prompt_file.is_open()) {
|
||||
spdlog::error(
|
||||
"LlamaGenerator: Failed to open brewery system prompt file '{}'",
|
||||
prompt_file_path.string());
|
||||
throw std::runtime_error(
|
||||
"LlamaGenerator: missing brewery system prompt file: " +
|
||||
prompt_file_path.string());
|
||||
}
|
||||
|
||||
const std::string prompt((std::istreambuf_iterator(prompt_file)),
|
||||
std::istreambuf_iterator<char>());
|
||||
prompt_file.close();
|
||||
|
||||
if (prompt.empty()) {
|
||||
spdlog::error("LlamaGenerator: Brewery system prompt file '{}' is empty",
|
||||
prompt_file_path.string());
|
||||
throw std::runtime_error(
|
||||
"LlamaGenerator: empty brewery system prompt file: " +
|
||||
prompt_file_path.string());
|
||||
}
|
||||
|
||||
spdlog::info(
|
||||
"LlamaGenerator: Loaded brewery system prompt from '{}' ({} chars)",
|
||||
prompt_file_path.string(), prompt.length());
|
||||
brewery_system_prompt_ = prompt;
|
||||
return brewery_system_prompt_;
|
||||
}
|
||||
16
pipeline/src/data_generation/mock/deterministic_hash.cc
Normal file
16
pipeline/src/data_generation/mock/deterministic_hash.cc
Normal file
@@ -0,0 +1,16 @@
|
||||
/**
|
||||
* @file data_generation/mock/deterministic_hash.cc
|
||||
* @brief Implements a stable hash combiner used by MockGenerator to derive
|
||||
* repeatable pseudo-random indices from location input.
|
||||
*/
|
||||
|
||||
#include <boost/container_hash/hash.hpp>
|
||||
|
||||
#include "data_generation/mock_generator.h"
|
||||
|
||||
size_t MockGenerator::DeterministicHash(const Location& location) {
|
||||
size_t seed = 0;
|
||||
boost::hash_combine(seed, location.city);
|
||||
boost::hash_combine(seed, location.country);
|
||||
return seed;
|
||||
}
|
||||
44
pipeline/src/data_generation/mock/generate_brewery.cc
Normal file
44
pipeline/src/data_generation/mock/generate_brewery.cc
Normal file
@@ -0,0 +1,44 @@
|
||||
/**
|
||||
* @file data_generation/mock/generate_brewery.cc
|
||||
* @brief Builds deterministic brewery names and descriptions by hashing city
|
||||
* and country into fixed mock phrase catalogs.
|
||||
*/
|
||||
|
||||
#include <format>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
|
||||
#include "data_generation/mock_generator.h"
|
||||
|
||||
BreweryResult MockGenerator::GenerateBrewery(
|
||||
const Location& location, const std::string& /*region_context*/) {
|
||||
const size_t hash = DeterministicHash(location);
|
||||
|
||||
const std::string_view adjective =
|
||||
kBreweryAdjectives.at(hash % kBreweryAdjectives.size());
|
||||
const std::string_view noun =
|
||||
kBreweryNouns.at(hash / 7 % kBreweryNouns.size());
|
||||
const std::string_view base_description =
|
||||
kBreweryDescriptions.at((hash / 13) % kBreweryDescriptions.size());
|
||||
|
||||
const std::string name =
|
||||
std::format("{} {} {}", location.city, adjective, noun);
|
||||
|
||||
const std::string state_suffix =
|
||||
location.state_province.empty()
|
||||
? std::string{}
|
||||
: std::format(", {}", location.state_province);
|
||||
const std::string country_suffix =
|
||||
location.country.empty() ? std::string{}
|
||||
: std::format(", {}", location.country);
|
||||
const std::string description =
|
||||
std::format("{} Located in {}{}{}.", base_description, location.city,
|
||||
state_suffix, country_suffix);
|
||||
|
||||
return {
|
||||
.name_en = name,
|
||||
.description_en = description,
|
||||
.name_local = name,
|
||||
.description_local = description,
|
||||
};
|
||||
}
|
||||
22
pipeline/src/data_generation/mock/generate_user.cc
Normal file
22
pipeline/src/data_generation/mock/generate_user.cc
Normal file
@@ -0,0 +1,22 @@
|
||||
/**
|
||||
* @file data_generation/mock/generate_user.cc
|
||||
* @brief Generates deterministic mock user profiles by hashing locale values
|
||||
* into predefined username and bio collections.
|
||||
*/
|
||||
|
||||
#include <functional>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
|
||||
#include "data_generation/mock_generator.h"
|
||||
|
||||
UserResult MockGenerator::GenerateUser(const std::string& locale) {
|
||||
const size_t hash = std::hash<std::string>{}(locale);
|
||||
|
||||
UserResult result;
|
||||
const std::string_view username = kUsernames[hash % kUsernames.size()];
|
||||
const std::string_view bio = kBios[hash / 11 % kBios.size()];
|
||||
result.username = username;
|
||||
result.bio = bio;
|
||||
return result;
|
||||
}
|
||||
@@ -0,0 +1,32 @@
|
||||
#include "data_generation/prompt_formatting/gemma4_jinja_prompt_formatter.h"
|
||||
|
||||
#include <format>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
|
||||
static constexpr std::string_view kWhitespace = " \t\n\r\f\v";
|
||||
|
||||
// Strips leading and trailing whitespace to ensure clean prompt injection.
|
||||
static std::string_view Trim(std::string_view value) {
|
||||
const size_t first_index = value.find_first_not_of(kWhitespace);
|
||||
|
||||
const bool is_all_whitespace = (first_index == std::string_view::npos);
|
||||
if (is_all_whitespace) {
|
||||
return "";
|
||||
}
|
||||
|
||||
const size_t last_index = value.find_last_not_of(kWhitespace);
|
||||
return value.substr(first_index, last_index - first_index + 1);
|
||||
}
|
||||
|
||||
std::string Gemma4JinjaPromptFormatter::Format(
|
||||
std::string_view system_prompt, std::string_view user_prompt) const {
|
||||
std::string_view trimmed_system = Trim(system_prompt);
|
||||
std::string_view trimmed_user = Trim(user_prompt);
|
||||
|
||||
return std::format(
|
||||
"<|turn|>system\n<|think|>\n{}\n<|turn|>\n"
|
||||
"<|turn|>user\n{}\n<|turn|>\n"
|
||||
"<|turn|>model\n<|channel>thought\n",
|
||||
trimmed_system, trimmed_user);
|
||||
}
|
||||
111
pipeline/src/json_handling/json_loader.cc
Normal file
111
pipeline/src/json_handling/json_loader.cc
Normal file
@@ -0,0 +1,111 @@
|
||||
/**
|
||||
* @file json_handling/json_loader.cc
|
||||
* @brief Parses curated location JSON input into strongly typed Location
|
||||
* records with strict field validation and descriptive error reporting.
|
||||
*/
|
||||
|
||||
#include "json_handling/json_loader.h"
|
||||
|
||||
#include <fstream>
|
||||
#include <sstream>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
|
||||
#include <boost/json.hpp>
|
||||
#include <spdlog/spdlog.h>
|
||||
|
||||
static std::string ReadRequiredString(const boost::json::object& object,
|
||||
const char* key) {
|
||||
const boost::json::value* value = object.if_contains(key);
|
||||
if (value == nullptr || !value->is_string()) {
|
||||
throw std::runtime_error(std::string("Missing or invalid string field: ") +
|
||||
key);
|
||||
}
|
||||
const std::string_view text = value->as_string();
|
||||
return std::string(text);
|
||||
}
|
||||
|
||||
static double ReadRequiredNumber(const boost::json::object& object,
|
||||
const char* key) {
|
||||
const boost::json::value* value = object.if_contains(key);
|
||||
if (value == nullptr || !value->is_number()) {
|
||||
throw std::runtime_error(std::string("Missing or invalid numeric field: ") +
|
||||
key);
|
||||
}
|
||||
return value->to_number<double>();
|
||||
}
|
||||
|
||||
static std::vector<std::string> ReadRequiredStringArray(
|
||||
const boost::json::object& object, const char* key) {
|
||||
const boost::json::value* value = object.if_contains(key);
|
||||
if (value == nullptr || !value->is_array()) {
|
||||
throw std::runtime_error(std::string("Missing or invalid string array field: ") +
|
||||
key);
|
||||
}
|
||||
|
||||
const auto& array = value->as_array();
|
||||
std::vector<std::string> items;
|
||||
items.reserve(array.size());
|
||||
for (const auto& item : array) {
|
||||
if (!item.is_string()) {
|
||||
throw std::runtime_error(std::string("Missing or invalid string array field: ") +
|
||||
key);
|
||||
}
|
||||
items.emplace_back(item.as_string());
|
||||
}
|
||||
return items;
|
||||
}
|
||||
|
||||
std::vector<Location> JsonLoader::LoadLocations(
|
||||
const std::filesystem::path& filepath) {
|
||||
std::ifstream input(filepath);
|
||||
if (!input.is_open()) {
|
||||
throw std::runtime_error("Failed to open locations file: " +
|
||||
filepath.string());
|
||||
}
|
||||
|
||||
std::stringstream buffer;
|
||||
buffer << input.rdbuf();
|
||||
const std::string content = buffer.str();
|
||||
|
||||
boost::system::error_code error;
|
||||
boost::json::value root = boost::json::parse(content, error);
|
||||
if (error) {
|
||||
throw std::runtime_error("Failed to parse locations JSON: " +
|
||||
error.message());
|
||||
}
|
||||
|
||||
if (!root.is_array()) {
|
||||
throw std::runtime_error(
|
||||
"Invalid locations JSON: root element must be an array");
|
||||
}
|
||||
|
||||
std::vector<Location> locations;
|
||||
const auto& items = root.as_array();
|
||||
locations.reserve(items.size());
|
||||
|
||||
for (const auto& item : items) {
|
||||
if (!item.is_object()) {
|
||||
throw std::runtime_error(
|
||||
"Invalid locations JSON: each entry must be an object");
|
||||
}
|
||||
|
||||
const auto& object = item.as_object();
|
||||
locations.push_back(Location{
|
||||
.city = ReadRequiredString(object, "city"),
|
||||
.state_province = ReadRequiredString(object, "state_province"),
|
||||
.iso3166_2 = ReadRequiredString(object, "iso3166_2"),
|
||||
.country = ReadRequiredString(object, "country"),
|
||||
.iso3166_1 = ReadRequiredString(object, "iso3166_1"),
|
||||
.local_languages =
|
||||
ReadRequiredStringArray(object, "local_languages"),
|
||||
.latitude = ReadRequiredNumber(object, "latitude"),
|
||||
.longitude = ReadRequiredNumber(object, "longitude"),
|
||||
});
|
||||
}
|
||||
|
||||
spdlog::info("[JsonLoader] Loaded {} locations from {}", locations.size(),
|
||||
filepath.string());
|
||||
return locations;
|
||||
}
|
||||
194
pipeline/src/main.cc
Normal file
194
pipeline/src/main.cc
Normal file
@@ -0,0 +1,194 @@
|
||||
/**
|
||||
* @file main.cc
|
||||
* @brief Parses command-line options, validates runtime mode selection,
|
||||
* initializes shared infrastructure, and executes the pipeline entry flow.
|
||||
*/
|
||||
|
||||
#include <spdlog/spdlog.h>
|
||||
|
||||
#include <boost/di.hpp>
|
||||
#include <boost/program_options.hpp>
|
||||
#include <chrono>
|
||||
#include <exception>
|
||||
#include <memory>
|
||||
#include <optional>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
|
||||
#include "biergarten_data_generator.h"
|
||||
#include "data_generation/llama_generator.h"
|
||||
#include "data_generation/mock_generator.h"
|
||||
#include "data_generation/prompt_formatting/gemma4_jinja_prompt_formatter.h"
|
||||
#include "data_model/application_options.h"
|
||||
#include "llama_backend_state.h"
|
||||
#include "services/enrichment_service.h"
|
||||
#include "services/wikipedia_service.h"
|
||||
#include "web_client/curl_web_client.h"
|
||||
|
||||
namespace prog_opts = boost::program_options;
|
||||
namespace di = boost::di;
|
||||
|
||||
/**
|
||||
* @brief Parse command-line arguments into ApplicationOptions.
|
||||
*
|
||||
* @param argc Command-line argument count.
|
||||
* @param argv Command-line arguments.
|
||||
* @return Parsed ApplicationOptions if parsing succeeded, std::nullopt
|
||||
* otherwise.
|
||||
*/
|
||||
std::optional<ApplicationOptions> ParseArguments(const int argc, char** argv) {
|
||||
prog_opts::options_description desc("Pipeline Options");
|
||||
|
||||
auto opt = desc.add_options();
|
||||
|
||||
opt("help,h", "Produce help message");
|
||||
|
||||
opt("mocked", prog_opts::bool_switch(),
|
||||
"Use mocked generator for brewery/user data");
|
||||
|
||||
opt("model,m", prog_opts::value<std::string>()->default_value(""),
|
||||
"Path to LLM model (gguf)");
|
||||
|
||||
opt("temperature", prog_opts::value<float>()->default_value(1.0F),
|
||||
"Sampling temperature (higher = more random)");
|
||||
|
||||
opt("top-p", prog_opts::value<float>()->default_value(0.95F),
|
||||
"Nucleus sampling top-p in (0,1] (higher = more random)");
|
||||
|
||||
opt("top-k", prog_opts::value<uint32_t>()->default_value(64),
|
||||
"Top-k sampling parameter (higher = more candidate tokens)");
|
||||
|
||||
opt("n-ctx", prog_opts::value<uint32_t>()->default_value(8192),
|
||||
"Context window size in tokens (1-32768)");
|
||||
|
||||
opt("seed", prog_opts::value<int>()->default_value(-1),
|
||||
"Sampler seed: -1 for random, otherwise non-negative integer");
|
||||
|
||||
// Handle the "no arguments" or "help" case
|
||||
if (argc == 1) {
|
||||
spdlog::info("Biergarten Pipeline");
|
||||
std::stringstream usage_stream;
|
||||
usage_stream << "\nUsage: biergarten-pipeline [options]\n\n" << desc;
|
||||
spdlog::info(usage_stream.str());
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
try {
|
||||
prog_opts::variables_map variables_map;
|
||||
prog_opts::store(prog_opts::parse_command_line(argc, argv, desc),
|
||||
variables_map);
|
||||
prog_opts::notify(variables_map);
|
||||
|
||||
if (variables_map.contains("help")) {
|
||||
std::stringstream help_stream;
|
||||
help_stream << "\n" << desc;
|
||||
spdlog::info(help_stream.str());
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
const auto use_mocked = variables_map["mocked"].as<bool>();
|
||||
const auto model_path = variables_map["model"].as<std::string>();
|
||||
|
||||
if (use_mocked && !model_path.empty()) {
|
||||
spdlog::error(
|
||||
"Invalid arguments: --mocked and --model are mutually exclusive");
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
if (!use_mocked && model_path.empty()) {
|
||||
spdlog::error(
|
||||
"Invalid arguments: Either --mocked or --model must be specified");
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
const bool has_llm_params = !variables_map["temperature"].defaulted() ||
|
||||
!variables_map["top-p"].defaulted() ||
|
||||
!variables_map["top-k"].defaulted() ||
|
||||
!variables_map["seed"].defaulted();
|
||||
|
||||
if (use_mocked && has_llm_params) {
|
||||
spdlog::warn(
|
||||
"Sampling parameters (--temperature, --top-p, --top-k, --seed) are"
|
||||
" ignored when using --mocked");
|
||||
}
|
||||
|
||||
ApplicationOptions options;
|
||||
options.use_mocked = use_mocked;
|
||||
options.model_path = model_path;
|
||||
options.temperature = variables_map["temperature"].as<float>();
|
||||
options.top_p = variables_map["top-p"].as<float>();
|
||||
options.top_k = variables_map["top-k"].as<uint32_t>();
|
||||
options.n_ctx = variables_map["n-ctx"].as<uint32_t>();
|
||||
options.seed = variables_map["seed"].as<int>();
|
||||
|
||||
return options;
|
||||
} catch (const std::exception& exception) {
|
||||
spdlog::error("Failed to parse command-line arguments: {}",
|
||||
exception.what());
|
||||
return std::nullopt;
|
||||
} catch (...) {
|
||||
spdlog::error("Failed to parse command-line arguments: unknown error");
|
||||
return std::nullopt;
|
||||
}
|
||||
}
|
||||
|
||||
struct Timer {
|
||||
std::chrono::steady_clock::time_point start_time =
|
||||
std::chrono::steady_clock::now();
|
||||
[[nodiscard]] int64_t Elapsed() const {
|
||||
return std::chrono::duration_cast<std::chrono::milliseconds>(
|
||||
std::chrono::steady_clock::now() - start_time)
|
||||
.count();
|
||||
}
|
||||
};
|
||||
|
||||
int main(const int argc, char** argv) {
|
||||
try {
|
||||
Timer timer;
|
||||
const CurlGlobalState curl_state;
|
||||
const LlamaBackendState llama_backend_state;
|
||||
spdlog::set_pattern("[%Y-%m-%d %H:%M:%S.%e] [%^%l%$] %v");
|
||||
|
||||
const auto parsed_options = ParseArguments(argc, argv);
|
||||
if (!parsed_options.has_value()) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
const auto options = *parsed_options;
|
||||
|
||||
const auto injector = di::make_injector(
|
||||
di::bind<WebClient>().to<CURLWebClient>(),
|
||||
di::bind<ApplicationOptions>().to(options),
|
||||
di::bind<IEnrichmentService>().to<WikipediaService>(),
|
||||
di::bind<IPromptFormatter>().to<Gemma4JinjaPromptFormatter>(),
|
||||
di::bind<std::string>().to(options.model_path),
|
||||
di::bind<DataGenerator>().to(
|
||||
[options](const auto& inj) -> std::unique_ptr<DataGenerator> {
|
||||
if (options.use_mocked) {
|
||||
spdlog::info(
|
||||
"[Generator] Using MockGenerator (no model path provided)");
|
||||
return std::make_unique<MockGenerator>();
|
||||
}
|
||||
|
||||
spdlog::info(
|
||||
"[Generator] Using LlamaGenerator: {} (temperature={}, "
|
||||
"top-p={}, top-k={}, n_ctx={}, seed={})",
|
||||
options.model_path, options.temperature, options.top_p,
|
||||
options.top_k, options.n_ctx, options.seed);
|
||||
return inj.template create<std::unique_ptr<LlamaGenerator>>();
|
||||
}));
|
||||
|
||||
auto generator = injector.create<BiergartenDataGenerator>();
|
||||
|
||||
if (!generator.Run()) {
|
||||
spdlog::error("Pipeline execution failed");
|
||||
return 1;
|
||||
}
|
||||
|
||||
spdlog::info("Pipeline executed successfully in {} ms", timer.Elapsed());
|
||||
return 0;
|
||||
} catch (const std::exception& exception) {
|
||||
spdlog::critical("Unhandled fatal error in main: {}", exception.what());
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
61
pipeline/src/services/wikipedia/fetch_extract.cc
Normal file
61
pipeline/src/services/wikipedia/fetch_extract.cc
Normal file
@@ -0,0 +1,61 @@
|
||||
/**
|
||||
* @file wikipedia/fetch_extract.cc
|
||||
* @brief WikipediaService::FetchExtract() implementation.
|
||||
*/
|
||||
|
||||
#include <spdlog/spdlog.h>
|
||||
|
||||
#include <boost/json.hpp>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
|
||||
#include "services/wikipedia_service.h"
|
||||
|
||||
std::string WikipediaService::FetchExtract(std::string_view query) {
|
||||
const std::string cache_key(query);
|
||||
const auto cache_it = this->extract_cache_.find(cache_key);
|
||||
if (cache_it != this->extract_cache_.end()) {
|
||||
return cache_it->second;
|
||||
}
|
||||
|
||||
const std::string encoded = this->client_->UrlEncode(cache_key);
|
||||
const std::string url =
|
||||
"https://en.wikipedia.org/w/api.php?action=query&titles=" + encoded +
|
||||
"&prop=extracts&explaintext=1&format=json";
|
||||
|
||||
const std::string body = this->client_->Get(url);
|
||||
|
||||
boost::system::error_code parse_error;
|
||||
boost::json::value doc = boost::json::parse(body, parse_error);
|
||||
|
||||
if (!parse_error && doc.is_object()) {
|
||||
try {
|
||||
auto& pages = doc.at("query").at("pages").get_object();
|
||||
if (!pages.empty()) {
|
||||
auto& page = pages.begin()->value().get_object();
|
||||
if (page.contains("extract") && page.at("extract").is_string()) {
|
||||
const std::string_view extract_view = page.at("extract").as_string();
|
||||
std::string extract(extract_view);
|
||||
|
||||
spdlog::debug("WikipediaService fetched {} chars for '{}'",
|
||||
extract.size(), query);
|
||||
|
||||
this->extract_cache_.emplace(cache_key, extract);
|
||||
return extract;
|
||||
}
|
||||
}
|
||||
this->extract_cache_.emplace(cache_key, std::string{});
|
||||
} catch (const std::exception& e) {
|
||||
spdlog::warn(
|
||||
"WikipediaService: failed to parse response structure for '{}': "
|
||||
"{}",
|
||||
query, e.what());
|
||||
return {};
|
||||
}
|
||||
} else if (parse_error) {
|
||||
spdlog::warn("WikipediaService: JSON parse error for '{}': {}", query,
|
||||
parse_error.message());
|
||||
}
|
||||
|
||||
return {};
|
||||
}
|
||||
47
pipeline/src/services/wikipedia/get_summary.cc
Normal file
47
pipeline/src/services/wikipedia/get_summary.cc
Normal file
@@ -0,0 +1,47 @@
|
||||
/**
|
||||
* @file wikipedia/get_summary.cc
|
||||
* @brief WikipediaService::GetLocationContext() implementation.
|
||||
*/
|
||||
|
||||
#include <spdlog/spdlog.h>
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "services/wikipedia_service.h"
|
||||
|
||||
std::string WikipediaService::GetLocationContext(const Location& loc) {
|
||||
if (!client_) {
|
||||
return {};
|
||||
}
|
||||
|
||||
std::string result;
|
||||
|
||||
std::string region_query(loc.city);
|
||||
if (!loc.country.empty()) {
|
||||
region_query += ", ";
|
||||
region_query += loc.country;
|
||||
}
|
||||
|
||||
const std::string beer_query = "beer in " + loc.country;
|
||||
const std::string city_beer_query = "beer in " + loc.city;
|
||||
|
||||
auto append_extract = [&result](const std::string& extract) -> void {
|
||||
if (extract.empty()) {
|
||||
return;
|
||||
}
|
||||
if (!result.empty()) {
|
||||
result += "\n\n";
|
||||
}
|
||||
result += extract;
|
||||
};
|
||||
|
||||
try {
|
||||
append_extract(FetchExtract(region_query));
|
||||
append_extract(FetchExtract(beer_query));
|
||||
append_extract(FetchExtract(city_beer_query));
|
||||
} catch (const std::runtime_error& e) {
|
||||
spdlog::debug("WikipediaService lookup failed for '{}': {}", region_query,
|
||||
e.what());
|
||||
}
|
||||
return result;
|
||||
}
|
||||
11
pipeline/src/services/wikipedia/wikipedia_service.cc
Normal file
11
pipeline/src/services/wikipedia/wikipedia_service.cc
Normal file
@@ -0,0 +1,11 @@
|
||||
/**
|
||||
* @file services/wikipedia/wikipedia_service.cc
|
||||
* @brief WikipediaService constructor implementation.
|
||||
*/
|
||||
|
||||
#include "services/wikipedia_service.h"
|
||||
|
||||
#include <utility>
|
||||
|
||||
WikipediaService::WikipediaService(std::unique_ptr<WebClient> client)
|
||||
: client_(std::move(client)) {}
|
||||
19
pipeline/src/web_client/curl_global_state.cc
Normal file
19
pipeline/src/web_client/curl_global_state.cc
Normal file
@@ -0,0 +1,19 @@
|
||||
/**
|
||||
* @file web_client/curl_global_state.cc
|
||||
* @brief CurlGlobalState constructor and destructor implementation.
|
||||
*/
|
||||
|
||||
#include <curl/curl.h>
|
||||
|
||||
#include <stdexcept>
|
||||
|
||||
#include "web_client/curl_web_client.h"
|
||||
|
||||
CurlGlobalState::CurlGlobalState() {
|
||||
if (curl_global_init(CURL_GLOBAL_DEFAULT) != CURLE_OK) {
|
||||
throw std::runtime_error(
|
||||
"[CURLWebClient] Failed to initialize libcurl globally");
|
||||
}
|
||||
}
|
||||
|
||||
CurlGlobalState::~CurlGlobalState() { curl_global_cleanup(); }
|
||||
86
pipeline/src/web_client/curl_web_client_get.cc
Normal file
86
pipeline/src/web_client/curl_web_client_get.cc
Normal file
@@ -0,0 +1,86 @@
|
||||
/**
|
||||
* @file web_client/curl_web_client_get.cc
|
||||
* @brief CURLWebClient::Get() implementation.
|
||||
*/
|
||||
|
||||
#include "web_client/curl_web_client.h"
|
||||
|
||||
#include <cstdint>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
|
||||
#include <curl/curl.h>
|
||||
|
||||
using CurlHandle = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
|
||||
|
||||
static constexpr long kConnectionTimeout = 10;
|
||||
static constexpr long kRequestTimeout = 30;
|
||||
static constexpr int32_t kOkHttpStatus = 200;
|
||||
|
||||
static CurlHandle CreateHandle() {
|
||||
CURL* handle = curl_easy_init();
|
||||
if (handle == nullptr) {
|
||||
throw std::runtime_error(
|
||||
"[CURLWebClient] Failed to initialize libcurl handle");
|
||||
}
|
||||
return {handle, &curl_easy_cleanup};
|
||||
}
|
||||
|
||||
static void SetCommonGetOptions(CURL* curl, const std::string& url) {
|
||||
curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
|
||||
curl_easy_setopt(curl, CURLOPT_USERAGENT, "biergarten-pipeline/0.1.0");
|
||||
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
|
||||
curl_easy_setopt(curl, CURLOPT_MAXREDIRS, 5L);
|
||||
curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, kConnectionTimeout);
|
||||
curl_easy_setopt(curl, CURLOPT_TIMEOUT, kRequestTimeout);
|
||||
curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, "gzip");
|
||||
}
|
||||
|
||||
// curl write callback that appends response data into a std::string
|
||||
static size_t WriteCallbackString(void* contents, const size_t size,
|
||||
const size_t nmemb, void* userp) {
|
||||
const size_t real_size = size * nmemb;
|
||||
auto* str = static_cast<std::string*>(userp);
|
||||
str->append(static_cast<char*>(contents), real_size);
|
||||
return real_size;
|
||||
}
|
||||
|
||||
std::string CURLWebClient::Get(const std::string& url) {
|
||||
const CurlHandle curl = CreateHandle();
|
||||
|
||||
std::string response_string;
|
||||
|
||||
SetCommonGetOptions(curl.get(), url);
|
||||
|
||||
curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, WriteCallbackString);
|
||||
curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &response_string);
|
||||
|
||||
CURLcode curl_result = curl_easy_perform(curl.get());
|
||||
|
||||
if (curl_result != CURLE_OK) {
|
||||
const auto error = std::string("[CURLWebClient] GET failed: ") +
|
||||
curl_easy_strerror(curl_result);
|
||||
throw std::runtime_error(error);
|
||||
}
|
||||
|
||||
long curl_http_code = 0;
|
||||
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &curl_http_code);
|
||||
|
||||
if (curl_http_code < std::numeric_limits<int32_t>::min() ||
|
||||
curl_http_code > std::numeric_limits<int32_t>::max()) {
|
||||
throw std::runtime_error("[CURLWebClient] Invalid HTTP status code: " +
|
||||
std::to_string(curl_http_code));
|
||||
}
|
||||
|
||||
const int32_t http_code = static_cast<int32_t>(curl_http_code);
|
||||
|
||||
if (http_code != kOkHttpStatus) {
|
||||
const std::string error = "[CURLWebClient] HTTP error " +
|
||||
std::to_string(http_code) + " for URL " + url;
|
||||
throw std::runtime_error(error);
|
||||
}
|
||||
|
||||
return response_string;
|
||||
}
|
||||
24
pipeline/src/web_client/curl_web_client_url_encode.cc
Normal file
24
pipeline/src/web_client/curl_web_client_url_encode.cc
Normal file
@@ -0,0 +1,24 @@
|
||||
/**
|
||||
* @file web_client/curl_web_client_url_encode.cc
|
||||
* @brief CURLWebClient::UrlEncode() implementation.
|
||||
*/
|
||||
|
||||
#include <curl/curl.h>
|
||||
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
|
||||
#include "web_client/curl_web_client.h"
|
||||
|
||||
std::string CURLWebClient::UrlEncode(const std::string& value) {
|
||||
// A NULL handle is fine for UTF-8 encoding according to libcurl docs.
|
||||
char* output = curl_easy_escape(nullptr, value.c_str(), 0);
|
||||
|
||||
if (!output) {
|
||||
throw std::runtime_error("[CURLWebClient] curl_easy_escape failed");
|
||||
}
|
||||
|
||||
std::string result(output);
|
||||
curl_free(output);
|
||||
return result;
|
||||
}
|
||||
Reference in New Issue
Block a user