Implement Llama-based brewery and user data generation; remove the legacy mock generator and related files

This commit is contained in:
Aaron Po
2026-04-01 23:29:16 -04:00
parent 248a51b35f
commit 280c9c61bd
11 changed files with 445 additions and 128 deletions

View File

@@ -39,6 +39,24 @@ if(NOT spdlog_POPULATED)
add_subdirectory(${spdlog_SOURCE_DIR} ${spdlog_BINARY_DIR} EXCLUDE_FROM_ALL) add_subdirectory(${spdlog_SOURCE_DIR} ${spdlog_BINARY_DIR} EXCLUDE_FROM_ALL)
endif() endif()
# llama.cpp (on-device inference)
set(LLAMA_BUILD_TESTS OFF CACHE BOOL "" FORCE)
set(LLAMA_BUILD_EXAMPLES OFF CACHE BOOL "" FORCE)
set(LLAMA_BUILD_SERVER OFF CACHE BOOL "" FORCE)
FetchContent_Declare(
llama_cpp
GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git
GIT_TAG b8611
)
FetchContent_MakeAvailable(llama_cpp)
if(TARGET llama)
target_compile_options(llama PRIVATE
$<$<CXX_COMPILER_ID:AppleClang>:-include algorithm>
)
endif()
file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS
src/*.cpp src/*.cpp
) )
@@ -49,6 +67,7 @@ target_include_directories(biergarten-pipeline
PRIVATE PRIVATE
${CMAKE_CURRENT_SOURCE_DIR}/includes ${CMAKE_CURRENT_SOURCE_DIR}/includes
${rapidjson_SOURCE_DIR}/include ${rapidjson_SOURCE_DIR}/include
${llama_cpp_SOURCE_DIR}/include
) )
target_link_libraries(biergarten-pipeline target_link_libraries(biergarten-pipeline
@@ -56,7 +75,8 @@ target_link_libraries(biergarten-pipeline
CURL::libcurl CURL::libcurl
Boost::unit_test_framework Boost::unit_test_framework
SQLite::SQLite3 SQLite::SQLite3
spdlog::spdlog spdlog::spdlog
llama
) )
target_compile_options(biergarten-pipeline PRIVATE target_compile_options(biergarten-pipeline PRIVATE
@@ -116,7 +136,6 @@ if(BUILD_TESTING)
Boost::unit_test_framework Boost::unit_test_framework
CURL::libcurl CURL::libcurl
nlohmann_json::nlohmann_json nlohmann_json::nlohmann_json
llama
) )
add_test( add_test(

View File

@@ -0,0 +1,25 @@
#pragma once
#include <string>
/// @brief Generated brewery payload: a name plus a one-line description.
struct BreweryResult {
std::string name;
std::string description;
};
/// @brief Generated user payload: a username plus a one-line bio.
struct UserResult {
std::string username;
std::string bio;
};
/// @brief Abstract text-generation backend for brewery/user synthesis.
/// Implementations: LlamaGenerator (on-device model) and MockGenerator
/// (deterministic canned data).
class IDataGenerator {
public:
virtual ~IDataGenerator() = default;
/// @brief Prepares the generator. @param modelPath backend-specific model
/// location; implementations may ignore it (e.g. the mock backend).
virtual void load(const std::string &modelPath) = 0;
/// @brief Produces a brewery name/description for a city.
/// @param regionContext extra prompt context; may be empty.
virtual BreweryResult generateBrewery(const std::string &cityName,
const std::string &regionContext) = 0;
/// @brief Produces a username/bio pair for the given locale string.
virtual UserResult generateUser(const std::string &locale) = 0;
};

View File

@@ -1,36 +0,0 @@
#pragma once
#include <string>
#include <vector>
/// @brief Deterministic mock brewery text generator used in pipeline output.
class LlamaBreweryGenerator {
private:
// Name fragments combined as "<adjective> <noun>" by GenerateBrewery.
const std::vector<std::string> breweryAdjectives = {
"Craft", "Heritage", "Local", "Artisan",
"Pioneer", "Golden", "Modern", "Classic"};
const std::vector<std::string> breweryNouns = {
"Brewing Co.", "Brewery", "Bier Haus", "Taproom",
"Works", "House", "Fermentery", "Ale Co."};
// Canned one-line descriptions selected by hash in GenerateBrewery.
const std::vector<std::string> descriptions = {
"Handcrafted pale ales and seasonal IPAs with local ingredients.",
"Traditional lagers and experimental sours in small batches.",
"Award-winning stouts and wildly hoppy blonde ales.",
"Craft brewery specializing in Belgian-style triples and dark porters.",
"Modern brewery blending tradition with bold experimental flavors."};
public:
/// @brief Generated brewery payload for one city.
struct Brewery {
std::string name;
std::string description;
};
/// @brief Loads model resources (mock implementation in this project).
void LoadModel(const std::string &modelPath);
/// @brief Generates deterministic brewery text for a city and seed.
/// Same (cityName, seed) pair always yields the same Brewery.
Brewery GenerateBrewery(const std::string &cityName, int seed);
};

View File

@@ -2,7 +2,6 @@
#include "database.h" #include "database.h"
#include "stream_parser.h" #include "stream_parser.h"
#include "work_queue.h"
#include <string> #include <string>
/// @brief Loads world-city JSON data into SQLite through streaming parsing. /// @brief Loads world-city JSON data into SQLite through streaming parsing.

View File

@@ -0,0 +1,24 @@
#pragma once
#include "data_generator.h"
#include <memory>
#include <string>
struct llama_model;
struct llama_context;
/// @brief IDataGenerator backed by a local llama.cpp model (GGUF file).
/// Owns the llama model and context; not thread-safe (single context).
class LlamaGenerator final : public IDataGenerator {
public:
~LlamaGenerator() override;
/// @brief Loads the GGUF model at modelPath; throws std::runtime_error on
/// failure (empty path, load error, context creation error).
void load(const std::string &modelPath) override;
BreweryResult generateBrewery(const std::string &cityName,
const std::string &regionContext) override;
UserResult generateUser(const std::string &locale) override;
private:
/// @brief Runs one greedy completion for prompt; throws if not loaded.
std::string infer(const std::string &prompt, int maxTokens = 256);
// Raw llama.cpp handles; owned, freed in load() (on reload) and dtor.
llama_model *model_ = nullptr;
llama_context *context_ = nullptr;
};

View File

@@ -0,0 +1,23 @@
#pragma once
#include "data_generator.h"
#include <string>
#include <vector>
/// @brief Deterministic, model-free IDataGenerator: picks canned strings
/// by hashing the inputs, so identical inputs always yield identical output.
class MockGenerator final : public IDataGenerator {
public:
/// @brief No-op; the mock needs no model file.
void load(const std::string &modelPath) override;
BreweryResult generateBrewery(const std::string &cityName,
const std::string &regionContext) override;
UserResult generateUser(const std::string &locale) override;
private:
/// @brief Order-sensitive combined hash of two strings (hash_combine-style
/// mix plus a rotate); see mock_generator.cpp.
static std::size_t deterministicHash(const std::string &a,
const std::string &b);
// Canned string pools defined in mock_generator.cpp.
static const std::vector<std::string> kBreweryAdjectives;
static const std::vector<std::string> kBreweryNouns;
static const std::vector<std::string> kBreweryDescriptions;
static const std::vector<std::string> kUsernames;
static const std::vector<std::string> kBios;
};

View File

@@ -1,63 +0,0 @@
#pragma once
#include <condition_variable>
#include <mutex>
#include <optional>
#include <queue>
/// @brief Bounded thread-safe queue with blocking push/pop and shutdown.
/// After shutdown_queue(): push() fails immediately; pop() drains remaining
/// items and then returns std::nullopt.
template <typename T> class WorkQueue {
private:
  std::queue<T> queue;
  // mutable: size() is logically const but must lock the mutex.
  // (Without `mutable`, locking in a const member function does not compile.)
  mutable std::mutex mutex;
  std::condition_variable cv_not_empty;
  std::condition_variable cv_not_full;
  size_t max_size;
  bool shutdown = false;

public:
  /// @brief Creates a queue with fixed capacity.
  explicit WorkQueue(size_t capacity) : max_size(capacity) {}

  /// @brief Pushes an item, blocking while full unless shutdown is signaled.
  /// @return true if queued; false if the queue was shut down.
  bool push(T item) {
    std::unique_lock<std::mutex> lock(mutex);
    cv_not_full.wait(lock,
                     [this] { return queue.size() < max_size || shutdown; });
    if (shutdown)
      return false;
    queue.push(std::move(item));
    cv_not_empty.notify_one();
    return true;
  }

  /// @brief Pops an item, blocking while empty unless shutdown is signaled.
  /// @return the next item, or std::nullopt once shut down and drained.
  std::optional<T> pop() {
    std::unique_lock<std::mutex> lock(mutex);
    cv_not_empty.wait(lock, [this] { return !queue.empty() || shutdown; });
    if (queue.empty())
      return std::nullopt;
    T item = std::move(queue.front());
    queue.pop();
    cv_not_full.notify_one();
    return item;
  }

  /// @brief Signals queue shutdown and wakes all waiting producers/consumers.
  void shutdown_queue() {
    {
      // Set the flag under the lock; notify after release so woken threads
      // don't immediately block re-acquiring the mutex.
      std::lock_guard<std::mutex> lock(mutex);
      shutdown = true;
    }
    cv_not_empty.notify_all();
    cv_not_full.notify_all();
  }

  /// @brief Returns current queue size.
  size_t size() const {
    std::lock_guard<std::mutex> lock(mutex);
    return queue.size();
  }
};

View File

@@ -1,21 +0,0 @@
#include "generator.h"
#include <functional>
#include <spdlog/spdlog.h>
/// @brief Mock "model load": only logs the path; no resources are acquired.
void LlamaBreweryGenerator::LoadModel(const std::string &modelPath) {
spdlog::info(" [Mock] Initialized llama model: {}", modelPath);
spdlog::info(" OK: Model ready");
}
/// @brief Builds a deterministic brewery from (cityName, seed): one string
/// hash indexes the adjective, noun, and description pools.
LlamaBreweryGenerator::Brewery
LlamaBreweryGenerator::GenerateBrewery(const std::string &cityName, int seed) {
  // Deterministic mock generation for stable test output.
  const std::size_t h =
      std::hash<std::string>{}(cityName + std::to_string(seed));
  Brewery brewery;
  brewery.name = breweryAdjectives[h % breweryAdjectives.size()];
  brewery.name += " ";
  brewery.name += breweryNouns[(h / 7) % breweryNouns.size()];
  brewery.description = descriptions[(h / 13) % descriptions.size()];
  return brewery;
}

View File

@@ -0,0 +1,236 @@
#include "llama_generator.h"
#include "llama.h"
#include <algorithm>
#include <array>
#include <cctype>
#include <memory>
#include <stdexcept>
#include <string>
#include <vector>
#include <spdlog/spdlog.h>
namespace {
// Returns `value` with leading and trailing ASCII whitespace removed.
std::string trim(std::string value) {
  // Same character class std::isspace matches in the "C" locale.
  constexpr const char *kWhitespace = " \t\n\v\f\r";
  const auto first = value.find_first_not_of(kWhitespace);
  if (first == std::string::npos) {
    return {}; // all-whitespace (or empty) input
  }
  const auto last = value.find_last_not_of(kWhitespace);
  return value.substr(first, last - first + 1);
}
// Decodes one sampled token into its UTF-8 text piece and appends it to
// `output`. @throws std::runtime_error if llama.cpp cannot decode the token.
void appendTokenPiece(const llama_vocab *vocab, llama_token token,
std::string &output) {
// Fast path: most pieces fit a fixed stack buffer.
std::array<char, 256> buffer{};
int32_t bytes =
llama_token_to_piece(vocab, token, buffer.data(),
static_cast<int32_t>(buffer.size()), 0, true);
if (bytes < 0) {
// llama_token_to_piece signals "buffer too small" by returning the
// negated required size; retry once with a buffer of exactly that size.
std::vector<char> dynamicBuffer(static_cast<std::size_t>(-bytes));
bytes = llama_token_to_piece(vocab, token, dynamicBuffer.data(),
static_cast<int32_t>(dynamicBuffer.size()), 0,
true);
if (bytes < 0) {
// Still negative after a correctly-sized retry: genuine decode failure.
throw std::runtime_error(
"LlamaGenerator: failed to decode sampled token piece");
}
output.append(dynamicBuffer.data(), static_cast<std::size_t>(bytes));
return;
}
output.append(buffer.data(), static_cast<std::size_t>(bytes));
}
// Splits a model response at its first newline into a trimmed (first, second)
// pair. @throws std::runtime_error(errorMessage) when there is no newline or
// either side is blank after trimming. Note: if the response has more than
// two lines, everything after the first newline becomes `second`.
std::pair<std::string, std::string>
parseTwoLineResponse(const std::string &raw, const std::string &errorMessage) {
  const auto split = raw.find('\n');
  if (split == std::string::npos) {
    throw std::runtime_error(errorMessage);
  }
  auto result = std::make_pair(trim(raw.substr(0, split)),
                               trim(raw.substr(split + 1)));
  if (result.first.empty() || result.second.empty()) {
    throw std::runtime_error(errorMessage);
  }
  return result;
}
} // namespace
// Releases the context before the model (the context references the model),
// then tears down the llama backend.
// NOTE(review): llama_backend_free() runs even if load() was never called —
// presumably harmless with llama.cpp's backend teardown, and init/free are
// not refcounted, so confirm behavior when multiple LlamaGenerator
// instances exist concurrently.
LlamaGenerator::~LlamaGenerator() {
if (context_ != nullptr) {
llama_free(context_);
context_ = nullptr;
}
if (model_ != nullptr) {
llama_model_free(model_);
model_ = nullptr;
}
llama_backend_free();
}
/// @brief Loads (or reloads) the GGUF model at modelPath and creates an
/// inference context. @throws std::runtime_error on empty path, model load
/// failure, or context creation failure.
void LlamaGenerator::load(const std::string &modelPath) {
if (modelPath.empty()) {
throw std::runtime_error("LlamaGenerator: model path must not be empty");
}
// Release any previously loaded context/model so load() is re-entrant.
if (context_ != nullptr) {
llama_free(context_);
context_ = nullptr;
}
if (model_ != nullptr) {
llama_model_free(model_);
model_ = nullptr;
}
// NOTE(review): called on every load(); paired with a single
// llama_backend_free() in the destructor — confirm double-init is benign.
llama_backend_init();
llama_model_params modelParams = llama_model_default_params();
model_ = llama_load_model_from_file(modelPath.c_str(), modelParams);
if (model_ == nullptr) {
throw std::runtime_error(
"LlamaGenerator: failed to load model from path: " + modelPath);
}
llama_context_params contextParams = llama_context_default_params();
// Fixed context window; prompts + generations share this budget.
contextParams.n_ctx = 2048;
context_ = llama_init_from_model(model_, contextParams);
if (context_ == nullptr) {
// Don't keep a model without a usable context.
llama_model_free(model_);
model_ = nullptr;
throw std::runtime_error("LlamaGenerator: failed to create context");
}
spdlog::info("[LlamaGenerator] Loaded model: {}", modelPath);
}
/// @brief Runs one greedy completion: tokenizes the prompt, decodes it, then
/// samples up to maxTokens tokens (stopping at end-of-generation) and returns
/// the concatenated text. @throws std::runtime_error on any llama.cpp failure.
/// NOTE(review): the KV cache is not cleared between infer() calls, so
/// repeated calls presumably accumulate context until n_ctx (2048) is
/// exhausted — confirm whether a per-call cache reset is needed.
std::string LlamaGenerator::infer(const std::string &prompt, int maxTokens) {
if (model_ == nullptr || context_ == nullptr) {
throw std::runtime_error("LlamaGenerator: model not loaded");
}
const llama_vocab *vocab = llama_model_get_vocab(model_);
if (vocab == nullptr) {
throw std::runtime_error("LlamaGenerator: vocab unavailable");
}
// Generous first guess: token count never exceeds byte count (+BOS slack).
std::vector<llama_token> promptTokens(prompt.size() + 8);
int32_t tokenCount =
llama_tokenize(vocab, prompt.c_str(), static_cast<int32_t>(prompt.size()),
promptTokens.data(),
static_cast<int32_t>(promptTokens.size()), true, true);
if (tokenCount < 0) {
// llama_tokenize returns the negated required size when the buffer is
// too small; resize to exactly that and retry once.
promptTokens.resize(static_cast<std::size_t>(-tokenCount));
tokenCount =
llama_tokenize(vocab, prompt.c_str(),
static_cast<int32_t>(prompt.size()), promptTokens.data(),
static_cast<int32_t>(promptTokens.size()), true, true);
}
if (tokenCount < 0) {
throw std::runtime_error("LlamaGenerator: prompt tokenization failed");
}
promptTokens.resize(static_cast<std::size_t>(tokenCount));
// Decode the whole prompt in a single batch to populate the KV cache.
const llama_batch promptBatch = llama_batch_get_one(
promptTokens.data(), static_cast<int32_t>(promptTokens.size()));
if (llama_decode(context_, promptBatch) != 0) {
throw std::runtime_error("LlamaGenerator: prompt decode failed");
}
llama_sampler_chain_params samplerParams =
llama_sampler_chain_default_params();
// RAII for the sampler chain: freed via llama_sampler_free on every exit.
using SamplerPtr =
std::unique_ptr<llama_sampler, decltype(&llama_sampler_free)>;
SamplerPtr sampler(llama_sampler_chain_init(samplerParams),
&llama_sampler_free);
if (!sampler) {
throw std::runtime_error("LlamaGenerator: failed to initialize sampler");
}
// Greedy (argmax) sampling: deterministic output for a given prompt/model.
llama_sampler_chain_add(sampler.get(), llama_sampler_init_greedy());
std::vector<llama_token> generatedTokens;
generatedTokens.reserve(static_cast<std::size_t>(maxTokens));
for (int i = 0; i < maxTokens; ++i) {
// idx -1 samples from the logits of the last decoded position.
const llama_token next = llama_sampler_sample(sampler.get(), context_, -1);
if (llama_vocab_is_eog(vocab, next)) {
break;
}
generatedTokens.push_back(next);
// Feed the sampled token back so the next sample sees it.
llama_token token = next;
const llama_batch oneTokenBatch = llama_batch_get_one(&token, 1);
if (llama_decode(context_, oneTokenBatch) != 0) {
throw std::runtime_error(
"LlamaGenerator: decode failed during generation");
}
}
// Detokenize only after generation completes.
std::string output;
for (const llama_token token : generatedTokens) {
appendTokenPiece(vocab, token, output);
}
return output;
}
/// @brief Prompts the model for a two-line brewery (name, description) for
/// cityName; regionContext is appended verbatim as extra prompt context.
/// @throws std::runtime_error if the model response is not two usable lines.
BreweryResult
LlamaGenerator::generateBrewery(const std::string &cityName,
                                const std::string &regionContext) {
  const std::string prompt =
      "Generate a craft brewery name and one-sentence description for a "
      "brewery located in " +
      cityName + ". " + regionContext +
      " Respond with exactly two lines: first line is the name, second "
      "line is the description.";
  auto [name, description] = parseTwoLineResponse(
      infer(prompt, 128), "LlamaGenerator: malformed brewery response");
  return BreweryResult{name, description};
}
/// @brief Prompts the model for a two-line (username, bio) pair for the given
/// locale. Whitespace is stripped from the username after parsing; throws
/// std::runtime_error if either field ends up empty or malformed.
UserResult LlamaGenerator::generateUser(const std::string &locale) {
  const std::string prompt =
      "Generate a plausible craft beer enthusiast username and a one-sentence "
      "bio. Locale: " +
      locale +
      ". Respond with exactly two lines: first line is the username (no "
      "spaces), second line is the bio.";
  auto [username, bio] = parseTwoLineResponse(
      infer(prompt, 128), "LlamaGenerator: malformed user response");
  // Enforce the "no spaces" instruction: drop any whitespace the model
  // produced inside the username.
  const auto isWhitespace = [](unsigned char ch) {
    return std::isspace(ch) != 0;
  };
  username.erase(std::remove_if(username.begin(), username.end(), isWhitespace),
                 username.end());
  if (username.empty() || bio.empty()) {
    throw std::runtime_error("LlamaGenerator: malformed user response");
  }
  return {username, bio};
}

View File

@@ -1,9 +1,12 @@
#include "data_downloader.h" #include "data_downloader.h"
#include "data_generator.h"
#include "database.h" #include "database.h"
#include "generator.h"
#include "json_loader.h" #include "json_loader.h"
#include "llama_generator.h"
#include "mock_generator.h"
#include <curl/curl.h> #include <curl/curl.h>
#include <filesystem> #include <filesystem>
#include <memory>
#include <spdlog/spdlog.h> #include <spdlog/spdlog.h>
static bool FileExists(const std::string &filePath) { static bool FileExists(const std::string &filePath) {
@@ -14,7 +17,7 @@ int main(int argc, char *argv[]) {
try { try {
curl_global_init(CURL_GLOBAL_DEFAULT); curl_global_init(CURL_GLOBAL_DEFAULT);
std::string modelPath = argc > 1 ? argv[1] : "./model.gguf"; std::string modelPath = argc > 1 ? argv[1] : "";
std::string cacheDir = argc > 2 ? argv[2] : "/tmp"; std::string cacheDir = argc > 2 ? argv[2] : "/tmp";
std::string commit = std::string commit =
argc > 3 ? argv[3] : "c5eb7772"; // Default: stable 2026-03-28 argc > 3 ? argv[3] : "c5eb7772"; // Default: stable 2026-03-28
@@ -41,8 +44,15 @@ int main(int argc, char *argv[]) {
} }
spdlog::info("Initializing brewery generator..."); spdlog::info("Initializing brewery generator...");
LlamaBreweryGenerator generator; std::unique_ptr<IDataGenerator> generator;
generator.LoadModel(modelPath); if (modelPath.empty()) {
generator = std::make_unique<MockGenerator>();
spdlog::info("[Generator] Using MockGenerator (no model path provided)");
} else {
generator = std::make_unique<LlamaGenerator>();
spdlog::info("[Generator] Using LlamaGenerator: {}", modelPath);
}
generator->load(modelPath);
spdlog::info("\n=== GEOGRAPHIC DATA OVERVIEW ==="); spdlog::info("\n=== GEOGRAPHIC DATA OVERVIEW ===");
@@ -74,7 +84,7 @@ int main(int argc, char *argv[]) {
spdlog::info("\n=== SAMPLE BREWERY GENERATION ===\n"); spdlog::info("\n=== SAMPLE BREWERY GENERATION ===\n");
for (size_t i = 0; i < std::min(size_t(5), cities.size()); i++) { for (size_t i = 0; i < std::min(size_t(5), cities.size()); i++) {
const auto &[cityId, cityName] = cities[i]; const auto &[cityId, cityName] = cities[i];
auto brewery = generator.GenerateBrewery(cityName, i); auto brewery = generator->generateBrewery(cityName, "");
spdlog::info(" {}: {}", cityName, brewery.name); spdlog::info(" {}: {}", cityName, brewery.name);
spdlog::info(" -> {}", brewery.description); spdlog::info(" -> {}", brewery.description);
} }

View File

@@ -0,0 +1,101 @@
#include "mock_generator.h"
#include <functional>
#include <spdlog/spdlog.h>
// Name fragments combined as "<adjective> <noun>" by generateBrewery().
const std::vector<std::string> MockGenerator::kBreweryAdjectives = {
"Craft", "Heritage", "Local", "Artisan", "Pioneer", "Golden",
"Modern", "Classic", "Summit", "Northern", "Riverstone", "Barrel",
"Hinterland", "Harbor", "Wild", "Granite", "Copper", "Maple"};
const std::vector<std::string> MockGenerator::kBreweryNouns = {
"Brewing Co.", "Brewery", "Bier Haus", "Taproom", "Works",
"House", "Fermentery", "Ale Co.", "Cellars", "Collective",
"Project", "Foundry", "Malthouse", "Public House", "Co-op",
"Lab", "Beer Hall", "Guild"};
// Canned one-line descriptions; selected by hash in generateBrewery().
const std::vector<std::string> MockGenerator::kBreweryDescriptions = {
"Handcrafted pale ales and seasonal IPAs with local ingredients.",
"Traditional lagers and experimental sours in small batches.",
"Award-winning stouts and wildly hoppy blonde ales.",
"Craft brewery specializing in Belgian-style triples and dark porters.",
"Modern brewery blending tradition with bold experimental flavors.",
"Neighborhood-focused taproom pouring crisp pilsners and citrusy pale "
"ales.",
"Small-batch brewery known for barrel-aged releases and smoky lagers.",
"Independent brewhouse pairing farmhouse ales with rotating food pop-ups.",
"Community brewpub making balanced bitters, saisons, and hazy IPAs.",
"Experimental nanobrewery exploring local yeast and regional grains.",
"Family-run brewery producing smooth amber ales and robust porters.",
"Urban brewery crafting clean lagers and bright, fruit-forward sours.",
"Riverfront brewhouse featuring oak-matured ales and seasonal blends.",
"Modern taproom focused on sessionable lagers and classic pub styles.",
"Brewery rooted in tradition with a lineup of malty reds and crisp lagers.",
"Creative brewery offering rotating collaborations and limited draft-only "
"pours.",
"Locally inspired brewery serving approachable ales with bold hop "
"character.",
"Destination taproom known for balanced IPAs and cocoa-rich stouts."};
// Username pool for generateUser(); entries contain no whitespace.
const std::vector<std::string> MockGenerator::kUsernames = {
"hopseeker", "malttrail", "yeastwhisper", "lagerlane",
"barrelbound", "foamfinder", "taphunter", "graingeist",
"brewscout", "aleatlas", "caskcompass", "hopsandmaps",
"mashpilot", "pintnomad", "fermentfriend", "stoutsignal",
"sessionwander", "kettlekeeper"};
// Bio pool for generateUser(); selected by hash independently of username.
const std::vector<std::string> MockGenerator::kBios = {
"Always chasing balanced IPAs and crisp lagers across local taprooms.",
"Weekend brewery explorer with a soft spot for dark, roasty stouts.",
"Documenting tiny brewpubs, fresh pours, and unforgettable beer gardens.",
"Fan of farmhouse ales, food pairings, and long tasting flights.",
"Collecting favorite pilsners one city at a time.",
"Hops-first drinker who still saves room for classic malt-forward styles.",
"Finding hidden tap lists and sharing the best seasonal releases.",
"Brewery road-tripper focused on local ingredients and clean fermentation.",
"Always comparing house lagers and ranking patio pint vibes.",
"Curious about yeast strains, barrel programs, and cellar experiments.",
"Believes every neighborhood deserves a great community taproom.",
"Looking for session beers that taste great from first sip to last.",
"Belgian ale enthusiast who never skips a new saison.",
"Hazy IPA critic with deep respect for a perfectly clear pilsner.",
"Visits breweries for the stories, stays for the flagship pours.",
"Craft beer fan mapping tasting notes and favorite brew routes.",
"Always ready to trade recommendations for underrated local breweries.",
"Keeping a running list of must-try collab releases and tap takeovers."};
/// @brief No-op load: the mock backend has no model file; the path is
/// intentionally ignored.
void MockGenerator::load(const std::string & /*modelPath*/) {
spdlog::info("[MockGenerator] No model needed");
}
/// @brief Order-sensitive combined hash of two strings: a
/// boost::hash_combine-style mix of the two std::hash values, followed by a
/// 13-bit left rotate to spread nearby inputs apart.
std::size_t MockGenerator::deterministicHash(const std::string &a,
                                             const std::string &b) {
  constexpr unsigned kRotate = 13;
  constexpr unsigned kBits = sizeof(std::size_t) * 8;
  std::size_t combined = std::hash<std::string>{}(a);
  combined ^= std::hash<std::string>{}(b) + 0x9e3779b97f4a7c15ULL +
              (combined << 6) + (combined >> 2);
  return (combined << kRotate) | (combined >> (kBits - kRotate));
}
/// @brief Deterministically picks a brewery name ("<adjective> <noun>") and
/// description from the canned pools. With an empty regionContext the seed is
/// the plain city hash (matches the legacy single-input behavior); otherwise
/// both strings are mixed via deterministicHash().
BreweryResult MockGenerator::generateBrewery(const std::string &cityName,
const std::string &regionContext) {
  const std::size_t seed = regionContext.empty()
                               ? std::hash<std::string>{}(cityName)
                               : deterministicHash(cityName, regionContext);
  return BreweryResult{
      kBreweryAdjectives[seed % kBreweryAdjectives.size()] + " " +
          kBreweryNouns[(seed / 7) % kBreweryNouns.size()],
      kBreweryDescriptions[(seed / 13) % kBreweryDescriptions.size()]};
}
/// @brief Deterministically picks a username and bio from the canned pools
/// based solely on the locale hash; identical locales yield identical users.
UserResult MockGenerator::generateUser(const std::string &locale) {
  const std::size_t seed = std::hash<std::string>{}(locale);
  return UserResult{kUsernames[seed % kUsernames.size()],
                    kBios[(seed / 11) % kBios.size()]};
}