Refactor BiergartenDataGenerator and LlamaGenerator

2026-07-16 17:47:22 +00:00 · 2026-04-02 22:46:00 -04:00
parent 3af053f0eb
commit 534403734a
4 changed files with 242 additions and 265 deletions
--- a/pipeline/includes/biergarten_data_generator.h
+++ b/pipeline/includes/biergarten_data_generator.h
@@ -3,23 +3,24 @@
 #include <memory>
 #include <string>
 #include <vector>
 #include <unordered_map>
 #include <vector>
 #include "data_generation/data_generator.h"
 #include "database/database.h"
 #include "web_client/web_client.h"
 #include "wikipedia/wikipedia_service.h"
 /**
 * @brief Program options for the Biergarten pipeline application.
 */
 struct ApplicationOptions {
-  /// @brief Path to the LLM model file (gguf format); mutually exclusive with use_mocked.
+   /// @brief Path to the LLM model file (gguf format); mutually exclusive with
   /// use_mocked.
   std::string model_path;
-  /// @brief Use mocked generator instead of LLM; mutually exclusive with model_path.
+   /// @brief Use mocked generator instead of LLM; mutually exclusive with
   /// model_path.
   bool use_mocked = false;
   /// @brief Directory for cached JSON and database files.
@@ -28,27 +29,27 @@ struct ApplicationOptions {
   /// @brief LLM sampling temperature (0.0 to 1.0, higher = more random).
   float temperature = 0.8f;
-  /// @brief LLM nucleus sampling top-p parameter (0.0 to 1.0, higher = more random).
+   /// @brief LLM nucleus sampling top-p parameter (0.0 to 1.0, higher = more
   /// random).
   float top_p = 0.92f;
   /// @brief Random seed for sampling (-1 for random, otherwise non-negative).
   int seed = -1;
-  /// @brief Git commit hash for database consistency (always pinned to c5eb7772).
+   /// @brief Git commit hash for database consistency (always pinned to
   /// c5eb7772).
   std::string commit = "c5eb7772";
 };
 #endif  // BIERGARTEN_PIPELINE_BIERGARTEN_DATA_GENERATOR_H_
 /**
 * @brief Main data generator class for the Biergarten pipeline.
 *
 * This class encapsulates the core logic for generating brewery data.
- * It handles database initialization, data loading/downloading, and brewery generation.
+ * It handles database initialization, data loading/downloading, and brewery
 * generation.
 */
 class BiergartenDataGenerator {
-public:
+  public:
   /**
    * @brief Construct a BiergartenDataGenerator with injected dependencies.
    *
@@ -56,9 +57,9 @@ public:
    * @param web_client HTTP client for downloading data.
    * @param database SQLite database instance.
    */
-  BiergartenDataGenerator(const ApplicationOptions &options,
+   BiergartenDataGenerator(const ApplicationOptions& options,
                           std::shared_ptr<WebClient> web_client,
-                          SqliteDatabase &database);
+                           SqliteDatabase& database);
   /**
    * @brief Run the data generation pipeline.
@@ -73,7 +74,7 @@ public:
    */
   int Run();
-private:
+  private:
   /// @brief Immutable application options.
   const ApplicationOptions options_;
@@ -81,7 +82,17 @@ private:
   std::shared_ptr<WebClient> webClient_;
   /// @brief Database dependency.
-  SqliteDatabase &database_;
+   SqliteDatabase& database_;
   /**
    * @brief Enriched city data with Wikipedia context.
    *
    * Plain aggregate: one record per city, pairing the city's identity with
    * the country name and the Wikipedia summary fetched for it. It can still
    * be brace-initialized in member order (id, city, country, context).
    */
   struct EnrichedCity {
      /// @brief Identifier of the city. Zero-initialized so a
      /// default-constructed record never carries an indeterminate id.
      int city_id = 0;
      /// @brief Name of the city.
      std::string city_name;
      /// @brief Name of the country the city belongs to (may be empty when
      /// no matching country was found).
      std::string country_name;
      /// @brief Wikipedia summary text used as regional context.
      std::string region_context;
   };
   /**
    * @brief Initialize the data generator based on options.
@@ -98,9 +109,34 @@ private:
   void LoadGeographicData();
   /**
-   * @brief Generate sample breweries for demonstration.
+    * @brief Query cities from database and build country name map.
    *
    * @return Vector of (City, country_name) pairs capped at 30 entries.
    */
-  void GenerateSampleBreweries();
+   std::vector<std::pair<City, std::string>> QueryCitiesWithCountries();
   /**
    * @brief Enrich cities with Wikipedia summaries.
    *
    * @param cities Vector of (City, country_name) pairs.
    * @return Vector of enriched city data with context.
    */
   std::vector<EnrichedCity> EnrichWithWikipedia(
       const std::vector<std::pair<City, std::string>>& cities);
   /**
    * @brief Generate breweries for enriched cities.
    *
    * @param generator The data generator instance.
    * @param cities Vector of enriched city data.
    */
   void GenerateBreweries(DataGenerator& generator,
                          const std::vector<EnrichedCity>& cities);
   /**
    * @brief Log the generated brewery results.
    */
   void LogResults() const;
   /**
    * @brief Helper struct to store generated brewery data.
@@ -114,3 +150,4 @@ private:
   /// @brief Stores generated brewery data.
   std::vector<GeneratedBrewery> generatedBreweries_;
 };
 #endif  // BIERGARTEN_PIPELINE_BIERGARTEN_DATA_GENERATOR_H_
--- a/pipeline/includes/data_generation/llama_generator.h
+++ b/pipeline/includes/data_generation/llama_generator.h
@@ -31,6 +31,9 @@ class LlamaGenerator final : public DataGenerator {
   std::string Infer(const std::string& system_prompt,
                     const std::string& prompt, int max_tokens = 10000);
   std::string InferFormatted(const std::string& formatted_prompt,
                              int max_tokens = 10000);
   llama_model* model_ = nullptr;
   llama_context* context_ = nullptr;
   float sampling_temperature_ = 0.8f;
--- a/pipeline/src/biergarten_data_generator.cpp
+++ b/pipeline/src/biergarten_data_generator.cpp
@@ -1,21 +1,20 @@
 #include "biergarten_data_generator.h"
 #include <spdlog/spdlog.h>
 #include <algorithm>
 #include <filesystem>
 #include <unordered_map>
 #include <spdlog/spdlog.h>
 #include "data_generation/data_downloader.h"
 #include "json_handling/json_loader.h"
 #include "data_generation/llama_generator.h"
 #include "data_generation/mock_generator.h"
 #include "json_handling/json_loader.h"
 #include "wikipedia/wikipedia_service.h"
 BiergartenDataGenerator::BiergartenDataGenerator(
-    const ApplicationOptions &options,
+    const ApplicationOptions& options, std::shared_ptr<WebClient> web_client,
-    std::shared_ptr<WebClient> web_client,
+    SqliteDatabase& database)
    SqliteDatabase &database)
    : options_(options), webClient_(web_client), database_(database) {}
 std::unique_ptr<DataGenerator> BiergartenDataGenerator::InitializeGenerator() {
@@ -62,55 +61,77 @@ void BiergartenDataGenerator::LoadGeographicData() {
   }
 }
-void BiergartenDataGenerator::GenerateSampleBreweries() {
+std::vector<std::pair<City, std::string>>
-  auto generator = InitializeGenerator();
+BiergartenDataGenerator::QueryCitiesWithCountries() {
  WikipediaService wikipedia_service(webClient_);
   spdlog::info("\n=== GEOGRAPHIC DATA OVERVIEW ===");
  auto countries = database_.QueryCountries(50);
  auto states = database_.QueryStates(50);
   auto cities = database_.QueryCities();
   // Build a quick map of country id -> name for per-city lookups.
   auto all_countries = database_.QueryCountries(0);
   std::unordered_map<int, std::string> country_map;
-  for (const auto &c : all_countries)
+   for (const auto& c : all_countries) {
      country_map[c.id] = c.name;
   }
   spdlog::info("\nTotal records loaded:");
   spdlog::info("  Countries: {}", database_.QueryCountries(0).size());
   spdlog::info("  States: {}", database_.QueryStates(0).size());
   spdlog::info("  Cities: {}", cities.size());
-  generatedBreweries_.clear();
+   // Cap at 30 entries.
   const size_t sample_count = std::min(size_t(30), cities.size());
   std::vector<std::pair<City, std::string>> result;
  spdlog::info("\n=== SAMPLE BREWERY GENERATION ===");
   for (size_t i = 0; i < sample_count; i++) {
-    const auto &city = cities[i];
+      const auto& city = cities[i];
-    const int city_id = city.id;
+      std::string country_name;
    const std::string city_name = city.name;
    std::string local_country;
      const auto country_it = country_map.find(city.country_id);
      if (country_it != country_map.end()) {
-      local_country = country_it->second;
+         country_name = country_it->second;
      }
      result.push_back({city, country_name});
   }
   return result;
 }
 std::vector<BiergartenDataGenerator::EnrichedCity>
 BiergartenDataGenerator::EnrichWithWikipedia(
    const std::vector<std::pair<City, std::string>>& cities) {
   WikipediaService wikipedia_service(webClient_);
   std::vector<EnrichedCity> enriched;
   for (const auto& [city, country_name] : cities) {
      const std::string region_context =
-        wikipedia_service.GetSummary(city_name, local_country);
+          wikipedia_service.GetSummary(city.name, country_name);
-    spdlog::debug("[Pipeline] Region context for {}: {}", city_name,
+      spdlog::debug("[Pipeline] Region context for {}: {}", city.name,
                    region_context);
-    auto brewery =
+      enriched.push_back({city.id, city.name, country_name, region_context});
        generator->GenerateBrewery(city_name, local_country, region_context);
    generatedBreweries_.push_back({city_id, city_name, brewery});
   }
   return enriched;
 }
 // Produces one brewery per enriched city with the supplied generator and
 // stores each result, together with the city's id and name, in
 // generatedBreweries_. Results from any earlier run are discarded first.
 void BiergartenDataGenerator::GenerateBreweries(
    DataGenerator& generator, const std::vector<EnrichedCity>& cities) {
   spdlog::info("\n=== SAMPLE BREWERY GENERATION ===");

   // Start from an empty result set so repeated calls do not accumulate.
   generatedBreweries_.clear();

   for (const EnrichedCity& entry : cities) {
      const std::string& name = entry.city_name;
      auto generated = generator.GenerateBrewery(name, entry.country_name,
                                                 entry.region_context);
      generatedBreweries_.push_back({entry.city_id, name, generated});
   }
 }
 void BiergartenDataGenerator::LogResults() const {
   spdlog::info("\n=== GENERATED DATA DUMP ===");
   for (size_t i = 0; i < generatedBreweries_.size(); i++) {
-    const auto &entry = generatedBreweries_[i];
+      const auto& entry = generatedBreweries_[i];
      spdlog::info("{}. city_id={} city=\"{}\"", i + 1, entry.city_id,
                   entry.city_name);
      spdlog::info("   brewery_name=\"{}\"", entry.brewery.name);
@@ -121,11 +142,15 @@ void BiergartenDataGenerator::GenerateSampleBreweries() {
 int BiergartenDataGenerator::Run() {
   try {
      LoadGeographicData();
-    GenerateSampleBreweries();
+      auto generator = InitializeGenerator();
      auto cities = QueryCitiesWithCountries();
      auto enriched = EnrichWithWikipedia(cities);
      GenerateBreweries(*generator, enriched);
      LogResults();
      spdlog::info("\nOK: Pipeline completed successfully");
      return 0;
-  } catch (const std::exception &e) {
+   } catch (const std::exception& e) {
      spdlog::error("ERROR: Pipeline failed: {}", e.what());
      return 1;
   }
--- a/pipeline/src/data_generation/llama/infer.cpp
+++ b/pipeline/src/data_generation/llama/infer.cpp
@@ -11,100 +11,17 @@
 #include "llama.h"
 std::string LlamaGenerator::Infer(const std::string& prompt, int max_tokens) {
-   if (model_ == nullptr || context_ == nullptr)
+   return InferFormatted(ToChatPromptPublic(model_, prompt), max_tokens);
      throw std::runtime_error("LlamaGenerator: model not loaded");
   const llama_vocab* vocab = llama_model_get_vocab(model_);
   if (vocab == nullptr)
      throw std::runtime_error("LlamaGenerator: vocab unavailable");
   llama_memory_clear(llama_get_memory(context_), true);
   const std::string formatted_prompt = ToChatPromptPublic(model_, prompt);
   std::vector<llama_token> prompt_tokens(formatted_prompt.size() + 8);
   int32_t token_count = llama_tokenize(
       vocab, formatted_prompt.c_str(),
       static_cast<int32_t>(formatted_prompt.size()), prompt_tokens.data(),
       static_cast<int32_t>(prompt_tokens.size()), true, true);
   if (token_count < 0) {
      prompt_tokens.resize(static_cast<std::size_t>(-token_count));
      token_count = llama_tokenize(
          vocab, formatted_prompt.c_str(),
          static_cast<int32_t>(formatted_prompt.size()), prompt_tokens.data(),
          static_cast<int32_t>(prompt_tokens.size()), true, true);
   }
   if (token_count < 0)
      throw std::runtime_error("LlamaGenerator: prompt tokenization failed");
   const int32_t n_ctx = static_cast<int32_t>(llama_n_ctx(context_));
   const int32_t n_batch = static_cast<int32_t>(llama_n_batch(context_));
   if (n_ctx <= 1 || n_batch <= 0) {
      throw std::runtime_error("LlamaGenerator: invalid context or batch size");
   }
   const int32_t effective_max_tokens =
       std::max(1, std::min(max_tokens, n_ctx - 1));
   int32_t prompt_budget = std::min(n_batch, n_ctx - effective_max_tokens);
   prompt_budget = std::max<int32_t>(1, prompt_budget);
   prompt_tokens.resize(static_cast<std::size_t>(token_count));
   if (token_count > prompt_budget) {
      spdlog::warn(
          "LlamaGenerator: prompt too long ({} tokens), truncating to {} "
          "tokens "
          "to fit n_batch/n_ctx limits",
          token_count, prompt_budget);
      prompt_tokens.resize(static_cast<std::size_t>(prompt_budget));
      token_count = prompt_budget;
   }
   const llama_batch prompt_batch = llama_batch_get_one(
       prompt_tokens.data(), static_cast<int32_t>(prompt_tokens.size()));
   if (llama_decode(context_, prompt_batch) != 0)
      throw std::runtime_error("LlamaGenerator: prompt decode failed");
   llama_sampler_chain_params sampler_params =
       llama_sampler_chain_default_params();
   using SamplerPtr =
       std::unique_ptr<llama_sampler, decltype(&llama_sampler_free)>;
   SamplerPtr sampler(llama_sampler_chain_init(sampler_params),
                      &llama_sampler_free);
   if (!sampler)
      throw std::runtime_error("LlamaGenerator: failed to initialize sampler");
   llama_sampler_chain_add(sampler.get(),
                           llama_sampler_init_temp(sampling_temperature_));
   llama_sampler_chain_add(sampler.get(),
                           llama_sampler_init_top_p(sampling_top_p_, 1));
   llama_sampler_chain_add(sampler.get(),
                           llama_sampler_init_dist(sampling_seed_));
   std::vector<llama_token> generated_tokens;
   generated_tokens.reserve(static_cast<std::size_t>(max_tokens));
   for (int i = 0; i < effective_max_tokens; ++i) {
      const llama_token next =
          llama_sampler_sample(sampler.get(), context_, -1);
      if (llama_vocab_is_eog(vocab, next)) break;
      generated_tokens.push_back(next);
      llama_token token = next;
      const llama_batch one_token_batch = llama_batch_get_one(&token, 1);
      if (llama_decode(context_, one_token_batch) != 0)
         throw std::runtime_error(
             "LlamaGenerator: decode failed during generation");
   }
   std::string output;
   for (const llama_token token : generated_tokens)
      AppendTokenPiecePublic(vocab, token, output);
   return output;
 }
 // System-prompt overload: builds the chat-formatted prompt from the system
 // and user prompts, then delegates generation to InferFormatted().
 std::string LlamaGenerator::Infer(const std::string& system_prompt,
                                   const std::string& prompt, int max_tokens) {
   const std::string chat_prompt =
       ToChatPromptPublic(model_, system_prompt, prompt);
   return InferFormatted(chat_prompt, max_tokens);
 }
 std::string LlamaGenerator::InferFormatted(const std::string& formatted_prompt,
                                           int max_tokens) {
   if (model_ == nullptr || context_ == nullptr)
      throw std::runtime_error("LlamaGenerator: model not loaded");
@@ -114,9 +31,6 @@ std::string LlamaGenerator::Infer(const std::string& system_prompt,
   llama_memory_clear(llama_get_memory(context_), true);
   const std::string formatted_prompt =
       ToChatPromptPublic(model_, system_prompt, prompt);
   std::vector<llama_token> prompt_tokens(formatted_prompt.size() + 8);
   int32_t token_count = llama_tokenize(
       vocab, formatted_prompt.c_str(),
@@ -136,9 +50,8 @@ std::string LlamaGenerator::Infer(const std::string& system_prompt,
   const int32_t n_ctx = static_cast<int32_t>(llama_n_ctx(context_));
   const int32_t n_batch = static_cast<int32_t>(llama_n_batch(context_));
-   if (n_ctx <= 1 || n_batch <= 0) {
+   if (n_ctx <= 1 || n_batch <= 0)
      throw std::runtime_error("LlamaGenerator: invalid context or batch size");
   }
   const int32_t effective_max_tokens =
       std::max(1, std::min(max_tokens, n_ctx - 1));
@@ -149,8 +62,7 @@ std::string LlamaGenerator::Infer(const std::string& system_prompt,
   if (token_count > prompt_budget) {
      spdlog::warn(
          "LlamaGenerator: prompt too long ({} tokens), truncating to {} "
-          "tokens "
+          "tokens to fit n_batch/n_ctx limits",
          "to fit n_batch/n_ctx limits",
          token_count, prompt_budget);
      prompt_tokens.resize(static_cast<std::size_t>(prompt_budget));
      token_count = prompt_budget;
@@ -178,7 +90,7 @@ std::string LlamaGenerator::Infer(const std::string& system_prompt,
                           llama_sampler_init_dist(sampling_seed_));
   std::vector<llama_token> generated_tokens;
-   generated_tokens.reserve(static_cast<std::size_t>(max_tokens));
+   generated_tokens.reserve(static_cast<std::size_t>(effective_max_tokens));
   for (int i = 0; i < effective_max_tokens; ++i) {
      const llama_token next =