Refactor BiergartenDataGenerator and LlamaGenerator

This commit is contained in:
Aaron Po
2026-04-02 22:46:00 -04:00
parent 3af053f0eb
commit 534403734a
4 changed files with 242 additions and 265 deletions

View File

@@ -3,114 +3,151 @@
#include <memory> #include <memory>
#include <string> #include <string>
#include <vector>
#include <unordered_map> #include <unordered_map>
#include <vector>
#include "data_generation/data_generator.h" #include "data_generation/data_generator.h"
#include "database/database.h" #include "database/database.h"
#include "web_client/web_client.h" #include "web_client/web_client.h"
#include "wikipedia/wikipedia_service.h" #include "wikipedia/wikipedia_service.h"
/**
 * @brief Program options for the Biergarten pipeline application.
 */
struct ApplicationOptions {
  /// @brief Path to the LLM model file (gguf format); mutually exclusive with
  /// use_mocked.
  std::string model_path;

  /// @brief Use mocked generator instead of LLM; mutually exclusive with
  /// model_path.
  bool use_mocked = false;

  /// @brief Directory for cached JSON and database files.
  std::string cache_dir;

  /// @brief LLM sampling temperature (0.0 to 1.0, higher = more random).
  float temperature = 0.8f;

  /// @brief LLM nucleus sampling top-p parameter (0.0 to 1.0, higher = more
  /// random).
  float top_p = 0.92f;

  /// @brief Random seed for sampling (-1 for random, otherwise non-negative).
  int seed = -1;

  /// @brief Git commit hash for database consistency (always pinned to
  /// c5eb7772).
  std::string commit = "c5eb7772";
};
/** /**
* @brief Main data generator class for the Biergarten pipeline. * @brief Main data generator class for the Biergarten pipeline.
* *
* This class encapsulates the core logic for generating brewery data. * This class encapsulates the core logic for generating brewery data.
* It handles database initialization, data loading/downloading, and brewery generation. * It handles database initialization, data loading/downloading, and brewery
* generation.
*/ */
class BiergartenDataGenerator { class BiergartenDataGenerator {
public: public:
/** /**
* @brief Construct a BiergartenDataGenerator with injected dependencies. * @brief Construct a BiergartenDataGenerator with injected dependencies.
* *
* @param options Application configuration options. * @param options Application configuration options.
* @param web_client HTTP client for downloading data. * @param web_client HTTP client for downloading data.
* @param database SQLite database instance. * @param database SQLite database instance.
*/ */
BiergartenDataGenerator(const ApplicationOptions &options, BiergartenDataGenerator(const ApplicationOptions& options,
std::shared_ptr<WebClient> web_client, std::shared_ptr<WebClient> web_client,
SqliteDatabase &database); SqliteDatabase& database);
/** /**
* @brief Run the data generation pipeline. * @brief Run the data generation pipeline.
* *
* Performs the following steps: * Performs the following steps:
* 1. Initialize database * 1. Initialize database
* 2. Download geographic data if needed * 2. Download geographic data if needed
* 3. Initialize the generator (LLM or Mock) * 3. Initialize the generator (LLM or Mock)
* 4. Generate brewery data for sample cities * 4. Generate brewery data for sample cities
* *
* @return 0 on success, 1 on failure. * @return 0 on success, 1 on failure.
*/ */
int Run(); int Run();
private: private:
/// @brief Immutable application options. /// @brief Immutable application options.
const ApplicationOptions options_; const ApplicationOptions options_;
/// @brief Shared HTTP client dependency. /// @brief Shared HTTP client dependency.
std::shared_ptr<WebClient> webClient_; std::shared_ptr<WebClient> webClient_;
/// @brief Database dependency. /// @brief Database dependency.
SqliteDatabase &database_; SqliteDatabase& database_;
/** /**
* @brief Initialize the data generator based on options. * @brief Enriched city data with Wikipedia context.
* */
* Creates either a MockGenerator (if no model path) or LlamaGenerator. struct EnrichedCity {
* int city_id;
* @return A unique_ptr to the initialized generator. std::string city_name;
*/ std::string country_name;
std::unique_ptr<DataGenerator> InitializeGenerator(); std::string region_context;
};
/** /**
* @brief Download and load geographic data if not cached. * @brief Initialize the data generator based on options.
*/ *
void LoadGeographicData(); * Creates either a MockGenerator (if no model path) or LlamaGenerator.
*
* @return A unique_ptr to the initialized generator.
*/
std::unique_ptr<DataGenerator> InitializeGenerator();
/** /**
* @brief Generate sample breweries for demonstration. * @brief Download and load geographic data if not cached.
*/ */
void GenerateSampleBreweries(); void LoadGeographicData();
/** /**
* @brief Helper struct to store generated brewery data. * @brief Query cities from database and build country name map.
*/ *
struct GeneratedBrewery { * @return Vector of (City, country_name) pairs capped at 30 entries.
int city_id; */
std::string city_name; std::vector<std::pair<City, std::string>> QueryCitiesWithCountries();
BreweryResult brewery;
};
/// @brief Stores generated brewery data. /**
std::vector<GeneratedBrewery> generatedBreweries_; * @brief Enrich cities with Wikipedia summaries.
*
* @param cities Vector of (City, country_name) pairs.
* @return Vector of enriched city data with context.
*/
std::vector<EnrichedCity> EnrichWithWikipedia(
const std::vector<std::pair<City, std::string>>& cities);
/**
* @brief Generate breweries for enriched cities.
*
* @param generator The data generator instance.
* @param cities Vector of enriched city data.
*/
void GenerateBreweries(DataGenerator& generator,
const std::vector<EnrichedCity>& cities);
/**
* @brief Log the generated brewery results.
*/
void LogResults() const;
/**
* @brief Helper struct to store generated brewery data.
*/
struct GeneratedBrewery {
int city_id;
std::string city_name;
BreweryResult brewery;
};
/// @brief Stores generated brewery data.
std::vector<GeneratedBrewery> generatedBreweries_;
}; };
#endif // BIERGARTEN_PIPELINE_BIERGARTEN_DATA_GENERATOR_H_

View File

@@ -31,6 +31,9 @@ class LlamaGenerator final : public DataGenerator {
std::string Infer(const std::string& system_prompt, std::string Infer(const std::string& system_prompt,
const std::string& prompt, int max_tokens = 10000); const std::string& prompt, int max_tokens = 10000);
std::string InferFormatted(const std::string& formatted_prompt,
int max_tokens = 10000);
llama_model* model_ = nullptr; llama_model* model_ = nullptr;
llama_context* context_ = nullptr; llama_context* context_ = nullptr;
float sampling_temperature_ = 0.8f; float sampling_temperature_ = 0.8f;

View File

@@ -1,132 +1,157 @@
#include "biergarten_data_generator.h"

#include <spdlog/spdlog.h>

#include <algorithm>
#include <filesystem>
#include <unordered_map>
#include <utility>

#include "data_generation/data_downloader.h"
#include "data_generation/llama_generator.h"
#include "data_generation/mock_generator.h"
#include "json_handling/json_loader.h"
#include "wikipedia/wikipedia_service.h"
// Constructor: stores the injected dependencies. The shared_ptr parameter is
// taken by value and moved into the member, avoiding an extra atomic
// refcount increment/decrement that a copy would incur.
BiergartenDataGenerator::BiergartenDataGenerator(
    const ApplicationOptions& options, std::shared_ptr<WebClient> web_client,
    SqliteDatabase& database)
    : options_(options),
      webClient_(std::move(web_client)),
      database_(database) {}
std::unique_ptr<DataGenerator> BiergartenDataGenerator::InitializeGenerator() { std::unique_ptr<DataGenerator> BiergartenDataGenerator::InitializeGenerator() {
spdlog::info("Initializing brewery generator..."); spdlog::info("Initializing brewery generator...");
std::unique_ptr<DataGenerator> generator; std::unique_ptr<DataGenerator> generator;
if (options_.model_path.empty()) { if (options_.model_path.empty()) {
generator = std::make_unique<MockGenerator>(); generator = std::make_unique<MockGenerator>();
spdlog::info("[Generator] Using MockGenerator (no model path provided)"); spdlog::info("[Generator] Using MockGenerator (no model path provided)");
} else { } else {
auto llama_generator = std::make_unique<LlamaGenerator>(); auto llama_generator = std::make_unique<LlamaGenerator>();
llama_generator->SetSamplingOptions(options_.temperature, options_.top_p, llama_generator->SetSamplingOptions(options_.temperature, options_.top_p,
options_.seed); options_.seed);
spdlog::info( spdlog::info(
"[Generator] Using LlamaGenerator: {} (temperature={}, top-p={}, " "[Generator] Using LlamaGenerator: {} (temperature={}, top-p={}, "
"seed={})", "seed={})",
options_.model_path, options_.temperature, options_.top_p, options_.model_path, options_.temperature, options_.top_p,
options_.seed); options_.seed);
generator = std::move(llama_generator); generator = std::move(llama_generator);
} }
generator->Load(options_.model_path); generator->Load(options_.model_path);
return generator; return generator;
} }
void BiergartenDataGenerator::LoadGeographicData() { void BiergartenDataGenerator::LoadGeographicData() {
std::string json_path = options_.cache_dir + "/countries+states+cities.json"; std::string json_path = options_.cache_dir + "/countries+states+cities.json";
std::string db_path = options_.cache_dir + "/biergarten-pipeline.db"; std::string db_path = options_.cache_dir + "/biergarten-pipeline.db";
bool has_json_cache = std::filesystem::exists(json_path); bool has_json_cache = std::filesystem::exists(json_path);
bool has_db_cache = std::filesystem::exists(db_path); bool has_db_cache = std::filesystem::exists(db_path);
spdlog::info("Initializing SQLite database at {}...", db_path); spdlog::info("Initializing SQLite database at {}...", db_path);
database_.Initialize(db_path); database_.Initialize(db_path);
if (has_db_cache && has_json_cache) { if (has_db_cache && has_json_cache) {
spdlog::info("[Pipeline] Cache hit: skipping download and parse"); spdlog::info("[Pipeline] Cache hit: skipping download and parse");
} else { } else {
spdlog::info("\n[Pipeline] Downloading geographic data from GitHub..."); spdlog::info("\n[Pipeline] Downloading geographic data from GitHub...");
DataDownloader downloader(webClient_); DataDownloader downloader(webClient_);
downloader.DownloadCountriesDatabase(json_path, options_.commit); downloader.DownloadCountriesDatabase(json_path, options_.commit);
JsonLoader::LoadWorldCities(json_path, database_); JsonLoader::LoadWorldCities(json_path, database_);
} }
} }
void BiergartenDataGenerator::GenerateSampleBreweries() { std::vector<std::pair<City, std::string>>
auto generator = InitializeGenerator(); BiergartenDataGenerator::QueryCitiesWithCountries() {
WikipediaService wikipedia_service(webClient_); spdlog::info("\n=== GEOGRAPHIC DATA OVERVIEW ===");
spdlog::info("\n=== GEOGRAPHIC DATA OVERVIEW ==="); auto cities = database_.QueryCities();
auto countries = database_.QueryCountries(50); // Build a quick map of country id -> name for per-city lookups.
auto states = database_.QueryStates(50); auto all_countries = database_.QueryCountries(0);
auto cities = database_.QueryCities(); std::unordered_map<int, std::string> country_map;
for (const auto& c : all_countries) {
country_map[c.id] = c.name;
}
// Build a quick map of country id -> name for per-city lookups. spdlog::info("\nTotal records loaded:");
auto all_countries = database_.QueryCountries(0); spdlog::info(" Countries: {}", database_.QueryCountries(0).size());
std::unordered_map<int, std::string> country_map; spdlog::info(" States: {}", database_.QueryStates(0).size());
for (const auto &c : all_countries) spdlog::info(" Cities: {}", cities.size());
country_map[c.id] = c.name;
spdlog::info("\nTotal records loaded:"); // Cap at 30 entries.
spdlog::info(" Countries: {}", database_.QueryCountries(0).size()); const size_t sample_count = std::min(size_t(30), cities.size());
spdlog::info(" States: {}", database_.QueryStates(0).size()); std::vector<std::pair<City, std::string>> result;
spdlog::info(" Cities: {}", cities.size());
generatedBreweries_.clear(); for (size_t i = 0; i < sample_count; i++) {
const size_t sample_count = std::min(size_t(30), cities.size()); const auto& city = cities[i];
std::string country_name;
const auto country_it = country_map.find(city.country_id);
if (country_it != country_map.end()) {
country_name = country_it->second;
}
result.push_back({city, country_name});
}
spdlog::info("\n=== SAMPLE BREWERY GENERATION ==="); return result;
for (size_t i = 0; i < sample_count; i++) { }
const auto &city = cities[i];
const int city_id = city.id;
const std::string city_name = city.name;
std::string local_country; std::vector<BiergartenDataGenerator::EnrichedCity>
const auto country_it = country_map.find(city.country_id); BiergartenDataGenerator::EnrichWithWikipedia(
if (country_it != country_map.end()) { const std::vector<std::pair<City, std::string>>& cities) {
local_country = country_it->second; WikipediaService wikipedia_service(webClient_);
} std::vector<EnrichedCity> enriched;
const std::string region_context = for (const auto& [city, country_name] : cities) {
wikipedia_service.GetSummary(city_name, local_country); const std::string region_context =
spdlog::debug("[Pipeline] Region context for {}: {}", city_name, wikipedia_service.GetSummary(city.name, country_name);
region_context); spdlog::debug("[Pipeline] Region context for {}: {}", city.name,
region_context);
auto brewery = enriched.push_back({city.id, city.name, country_name, region_context});
generator->GenerateBrewery(city_name, local_country, region_context); }
generatedBreweries_.push_back({city_id, city_name, brewery});
}
spdlog::info("\n=== GENERATED DATA DUMP ==="); return enriched;
for (size_t i = 0; i < generatedBreweries_.size(); i++) { }
const auto &entry = generatedBreweries_[i];
spdlog::info("{}. city_id={} city=\"{}\"", i + 1, entry.city_id, void BiergartenDataGenerator::GenerateBreweries(
entry.city_name); DataGenerator& generator, const std::vector<EnrichedCity>& cities) {
spdlog::info(" brewery_name=\"{}\"", entry.brewery.name); spdlog::info("\n=== SAMPLE BREWERY GENERATION ===");
spdlog::info(" brewery_description=\"{}\"", entry.brewery.description); generatedBreweries_.clear();
}
for (const auto& enriched_city : cities) {
auto brewery = generator.GenerateBrewery(enriched_city.city_name,
enriched_city.country_name,
enriched_city.region_context);
generatedBreweries_.push_back(
{enriched_city.city_id, enriched_city.city_name, brewery});
}
}
// Dumps every generated brewery (id, city, name, description) to the log.
// Read-only: does not mutate generatedBreweries_.
void BiergartenDataGenerator::LogResults() const {
  spdlog::info("\n=== GENERATED DATA DUMP ===");
  for (size_t i = 0; i < generatedBreweries_.size(); i++) {
    const auto& entry = generatedBreweries_[i];
    spdlog::info("{}. city_id={} city=\"{}\"", i + 1, entry.city_id,
                 entry.city_name);
    spdlog::info("  brewery_name=\"{}\"", entry.brewery.name);
    spdlog::info("  brewery_description=\"{}\"", entry.brewery.description);
  }
}
int BiergartenDataGenerator::Run() { int BiergartenDataGenerator::Run() {
try { try {
LoadGeographicData(); LoadGeographicData();
GenerateSampleBreweries(); auto generator = InitializeGenerator();
auto cities = QueryCitiesWithCountries();
auto enriched = EnrichWithWikipedia(cities);
GenerateBreweries(*generator, enriched);
LogResults();
spdlog::info("\nOK: Pipeline completed successfully"); spdlog::info("\nOK: Pipeline completed successfully");
return 0; return 0;
} catch (const std::exception &e) { } catch (const std::exception& e) {
spdlog::error("ERROR: Pipeline failed: {}", e.what()); spdlog::error("ERROR: Pipeline failed: {}", e.what());
return 1; return 1;
} }
} }

View File

@@ -11,100 +11,17 @@
#include "llama.h" #include "llama.h"
std::string LlamaGenerator::Infer(const std::string& prompt, int max_tokens) { std::string LlamaGenerator::Infer(const std::string& prompt, int max_tokens) {
if (model_ == nullptr || context_ == nullptr) return InferFormatted(ToChatPromptPublic(model_, prompt), max_tokens);
throw std::runtime_error("LlamaGenerator: model not loaded");
const llama_vocab* vocab = llama_model_get_vocab(model_);
if (vocab == nullptr)
throw std::runtime_error("LlamaGenerator: vocab unavailable");
llama_memory_clear(llama_get_memory(context_), true);
const std::string formatted_prompt = ToChatPromptPublic(model_, prompt);
std::vector<llama_token> prompt_tokens(formatted_prompt.size() + 8);
int32_t token_count = llama_tokenize(
vocab, formatted_prompt.c_str(),
static_cast<int32_t>(formatted_prompt.size()), prompt_tokens.data(),
static_cast<int32_t>(prompt_tokens.size()), true, true);
if (token_count < 0) {
prompt_tokens.resize(static_cast<std::size_t>(-token_count));
token_count = llama_tokenize(
vocab, formatted_prompt.c_str(),
static_cast<int32_t>(formatted_prompt.size()), prompt_tokens.data(),
static_cast<int32_t>(prompt_tokens.size()), true, true);
}
if (token_count < 0)
throw std::runtime_error("LlamaGenerator: prompt tokenization failed");
const int32_t n_ctx = static_cast<int32_t>(llama_n_ctx(context_));
const int32_t n_batch = static_cast<int32_t>(llama_n_batch(context_));
if (n_ctx <= 1 || n_batch <= 0) {
throw std::runtime_error("LlamaGenerator: invalid context or batch size");
}
const int32_t effective_max_tokens =
std::max(1, std::min(max_tokens, n_ctx - 1));
int32_t prompt_budget = std::min(n_batch, n_ctx - effective_max_tokens);
prompt_budget = std::max<int32_t>(1, prompt_budget);
prompt_tokens.resize(static_cast<std::size_t>(token_count));
if (token_count > prompt_budget) {
spdlog::warn(
"LlamaGenerator: prompt too long ({} tokens), truncating to {} "
"tokens "
"to fit n_batch/n_ctx limits",
token_count, prompt_budget);
prompt_tokens.resize(static_cast<std::size_t>(prompt_budget));
token_count = prompt_budget;
}
const llama_batch prompt_batch = llama_batch_get_one(
prompt_tokens.data(), static_cast<int32_t>(prompt_tokens.size()));
if (llama_decode(context_, prompt_batch) != 0)
throw std::runtime_error("LlamaGenerator: prompt decode failed");
llama_sampler_chain_params sampler_params =
llama_sampler_chain_default_params();
using SamplerPtr =
std::unique_ptr<llama_sampler, decltype(&llama_sampler_free)>;
SamplerPtr sampler(llama_sampler_chain_init(sampler_params),
&llama_sampler_free);
if (!sampler)
throw std::runtime_error("LlamaGenerator: failed to initialize sampler");
llama_sampler_chain_add(sampler.get(),
llama_sampler_init_temp(sampling_temperature_));
llama_sampler_chain_add(sampler.get(),
llama_sampler_init_top_p(sampling_top_p_, 1));
llama_sampler_chain_add(sampler.get(),
llama_sampler_init_dist(sampling_seed_));
std::vector<llama_token> generated_tokens;
generated_tokens.reserve(static_cast<std::size_t>(max_tokens));
for (int i = 0; i < effective_max_tokens; ++i) {
const llama_token next =
llama_sampler_sample(sampler.get(), context_, -1);
if (llama_vocab_is_eog(vocab, next)) break;
generated_tokens.push_back(next);
llama_token token = next;
const llama_batch one_token_batch = llama_batch_get_one(&token, 1);
if (llama_decode(context_, one_token_batch) != 0)
throw std::runtime_error(
"LlamaGenerator: decode failed during generation");
}
std::string output;
for (const llama_token token : generated_tokens)
AppendTokenPiecePublic(vocab, token, output);
return output;
} }
std::string LlamaGenerator::Infer(const std::string& system_prompt, std::string LlamaGenerator::Infer(const std::string& system_prompt,
const std::string& prompt, int max_tokens) { const std::string& prompt, int max_tokens) {
return InferFormatted(ToChatPromptPublic(model_, system_prompt, prompt),
max_tokens);
}
std::string LlamaGenerator::InferFormatted(const std::string& formatted_prompt,
int max_tokens) {
if (model_ == nullptr || context_ == nullptr) if (model_ == nullptr || context_ == nullptr)
throw std::runtime_error("LlamaGenerator: model not loaded"); throw std::runtime_error("LlamaGenerator: model not loaded");
@@ -114,9 +31,6 @@ std::string LlamaGenerator::Infer(const std::string& system_prompt,
llama_memory_clear(llama_get_memory(context_), true); llama_memory_clear(llama_get_memory(context_), true);
const std::string formatted_prompt =
ToChatPromptPublic(model_, system_prompt, prompt);
std::vector<llama_token> prompt_tokens(formatted_prompt.size() + 8); std::vector<llama_token> prompt_tokens(formatted_prompt.size() + 8);
int32_t token_count = llama_tokenize( int32_t token_count = llama_tokenize(
vocab, formatted_prompt.c_str(), vocab, formatted_prompt.c_str(),
@@ -136,9 +50,8 @@ std::string LlamaGenerator::Infer(const std::string& system_prompt,
const int32_t n_ctx = static_cast<int32_t>(llama_n_ctx(context_)); const int32_t n_ctx = static_cast<int32_t>(llama_n_ctx(context_));
const int32_t n_batch = static_cast<int32_t>(llama_n_batch(context_)); const int32_t n_batch = static_cast<int32_t>(llama_n_batch(context_));
if (n_ctx <= 1 || n_batch <= 0) { if (n_ctx <= 1 || n_batch <= 0)
throw std::runtime_error("LlamaGenerator: invalid context or batch size"); throw std::runtime_error("LlamaGenerator: invalid context or batch size");
}
const int32_t effective_max_tokens = const int32_t effective_max_tokens =
std::max(1, std::min(max_tokens, n_ctx - 1)); std::max(1, std::min(max_tokens, n_ctx - 1));
@@ -149,8 +62,7 @@ std::string LlamaGenerator::Infer(const std::string& system_prompt,
if (token_count > prompt_budget) { if (token_count > prompt_budget) {
spdlog::warn( spdlog::warn(
"LlamaGenerator: prompt too long ({} tokens), truncating to {} " "LlamaGenerator: prompt too long ({} tokens), truncating to {} "
"tokens " "tokens to fit n_batch/n_ctx limits",
"to fit n_batch/n_ctx limits",
token_count, prompt_budget); token_count, prompt_budget);
prompt_tokens.resize(static_cast<std::size_t>(prompt_budget)); prompt_tokens.resize(static_cast<std::size_t>(prompt_budget));
token_count = prompt_budget; token_count = prompt_budget;
@@ -178,7 +90,7 @@ std::string LlamaGenerator::Infer(const std::string& system_prompt,
llama_sampler_init_dist(sampling_seed_)); llama_sampler_init_dist(sampling_seed_));
std::vector<llama_token> generated_tokens; std::vector<llama_token> generated_tokens;
generated_tokens.reserve(static_cast<std::size_t>(max_tokens)); generated_tokens.reserve(static_cast<std::size_t>(effective_max_tokens));
for (int i = 0; i < effective_max_tokens; ++i) { for (int i = 0; i < effective_max_tokens; ++i) {
const llama_token next = const llama_token next =