Implement Llama-based brewery and user data generation; remove mock generator and related files

2026-07-17 01:47:22 +00:00 · 2026-04-01 23:29:16 -04:00
parent 248a51b35f
commit 280c9c61bd
11 changed files with 445 additions and 128 deletions
--- a/pipeline/src/generator.cpp
+++ b/pipeline/src/generator.cpp
@@ -1,21 +0,0 @@
-#include "generator.h"
-#include <functional>
-#include <spdlog/spdlog.h>
-
-void LlamaBreweryGenerator::LoadModel(const std::string &modelPath) {
-  spdlog::info("  [Mock] Initialized llama model: {}", modelPath);
-  spdlog::info("    OK: Model ready");
-}
-
-LlamaBreweryGenerator::Brewery
-LlamaBreweryGenerator::GenerateBrewery(const std::string &cityName, int seed) {
-  // Deterministic mock generation for stable test output.
-  size_t nameHash = std::hash<std::string>{}(cityName + std::to_string(seed));
-
-  Brewery result;
-  result.name = breweryAdjectives[nameHash % breweryAdjectives.size()] + " " +
-                breweryNouns[(nameHash / 7) % breweryNouns.size()];
-  result.description = descriptions[(nameHash / 13) % descriptions.size()];
-
-  return result;
-}
--- a/pipeline/src/llama_generator.cpp
+++ b/pipeline/src/llama_generator.cpp
@@ -0,0 +1,236 @@
+#include "llama_generator.h"
+
+#include "llama.h"
+
+#include <algorithm>
+#include <array>
+#include <cctype>
+#include <memory>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+#include <spdlog/spdlog.h>
+
+namespace {
+
+// Returns `value` with leading and trailing std::isspace characters removed.
+std::string trim(std::string value) {
+  const auto isBlank = [](unsigned char c) { return std::isspace(c) != 0; };
+
+  const auto tail = std::find_if_not(value.rbegin(), value.rend(), isBlank);
+  value.erase(tail.base(), value.end());
+  const auto head = std::find_if_not(value.begin(), value.end(), isBlank);
+  value.erase(value.begin(), head);
+  return value;
+}
+
+// Appends the text piece of `token` to `output`; retries with a bigger
+// buffer if 256 bytes are too few and throws std::runtime_error on failure.
+void appendTokenPiece(const llama_vocab *vocab, llama_token token,
+                      std::string &output) {
+  std::array<char, 256> stackPiece{};
+  const int32_t written = llama_token_to_piece(
+      vocab, token, stackPiece.data(),
+      static_cast<int32_t>(stackPiece.size()), 0, true);
+  if (written >= 0) {
+    output.append(stackPiece.data(), static_cast<std::size_t>(written));
+    return;
+  }
+  // A negative result is used as the negated buffer size for the retry.
+  std::vector<char> heapPiece(static_cast<std::size_t>(-written));
+  const int32_t retried = llama_token_to_piece(
+      vocab, token, heapPiece.data(), static_cast<int32_t>(heapPiece.size()),
+      0, true);
+  if (retried < 0) {
+    throw std::runtime_error(
+        "LlamaGenerator: failed to decode sampled token piece");
+  }
+  output.append(heapPiece.data(), static_cast<std::size_t>(retried));
+}
+
+// Splits a model reply into a trimmed first line and the trimmed remainder.
+// The reply is trimmed before splitting so that a leading blank line, which
+// would otherwise produce an empty first part, no longer rejects a usable
+// reply. Throws std::runtime_error(errorMessage) when no line break remains.
+std::pair<std::string, std::string>
+parseTwoLineResponse(const std::string &raw, const std::string &errorMessage) {
+  const std::string body = trim(raw);
+  const auto newlinePos = body.find('\n');
+  if (newlinePos == std::string::npos) {
+    throw std::runtime_error(errorMessage);
+  }
+
+  // `body` starts and ends with non-space characters, so both parts below are
+  // guaranteed non-empty and need no further emptiness check.
+  return {trim(body.substr(0, newlinePos)), trim(body.substr(newlinePos + 1))};
+}
+
+} // namespace
+
+// Frees the context, then the model it was created from, and finally shuts
+// down the llama backend. Null handles are skipped.
+LlamaGenerator::~LlamaGenerator() {
+  if (context_) {
+    llama_free(context_);
+  }
+  context_ = nullptr;
+  if (model_) {
+    llama_model_free(model_);
+  }
+  model_ = nullptr;
+  llama_backend_free();
+}
+
+// Loads the model at `modelPath` and creates a context with n_ctx = 2048,
+// first releasing any model/context left from an earlier call. Throws
+// std::runtime_error for an empty path or a failed model/context creation.
+void LlamaGenerator::load(const std::string &modelPath) {
+  if (modelPath.empty()) {
+    throw std::runtime_error("LlamaGenerator: model path must not be empty");
+  }
+  if (context_ != nullptr) {
+    llama_free(context_);
+    context_ = nullptr;
+  }
+  if (model_ != nullptr) {
+    llama_model_free(model_);
+    model_ = nullptr;
+  }
+
+  llama_backend_init();
+  // llama_load_model_from_file is deprecated; its replacement matches the
+  // llama_model_free / llama_init_from_model naming already used here.
+  model_ = llama_model_load_from_file(modelPath.c_str(),
+                                      llama_model_default_params());
+  if (model_ == nullptr) {
+    throw std::runtime_error(
+        "LlamaGenerator: failed to load model from path: " + modelPath);
+  }
+  llama_context_params contextParams = llama_context_default_params();
+  contextParams.n_ctx = 2048;
+  context_ = llama_init_from_model(model_, contextParams);
+  if (context_ == nullptr) {
+    llama_model_free(model_);
+    model_ = nullptr;
+    throw std::runtime_error("LlamaGenerator: failed to create context");
+  }
+  spdlog::info("[LlamaGenerator] Loaded model: {}", modelPath);
+}
+
+// Greedily generates up to `maxTokens` tokens continuing `prompt` and returns
+// them as text; stops early at an end-of-generation token. Returns "" when
+// maxTokens <= 0. Throws std::runtime_error if no model is loaded, the
+// prompt cannot be tokenized, or llama_decode reports a failure.
+std::string LlamaGenerator::infer(const std::string &prompt, int maxTokens) {
+  if (model_ == nullptr || context_ == nullptr) {
+    throw std::runtime_error("LlamaGenerator: model not loaded");
+  }
+  if (maxTokens <= 0) {
+    return {}; // nothing to generate
+  }
+
+  const llama_vocab *vocab = llama_model_get_vocab(model_);
+  if (vocab == nullptr) {
+    throw std::runtime_error("LlamaGenerator: vocab unavailable");
+  }
+
+  // Every call is an independent request. Without this reset the prompts and
+  // replies of earlier calls stay in the context, influence later replies and
+  // eventually exhaust n_ctx.
+  llama_memory_clear(llama_get_memory(context_), true);
+
+  std::vector<llama_token> promptTokens(prompt.size() + 8);
+  int32_t tokenCount =
+      llama_tokenize(vocab, prompt.c_str(), static_cast<int32_t>(prompt.size()),
+                     promptTokens.data(),
+                     static_cast<int32_t>(promptTokens.size()), true, true);
+  if (tokenCount < 0) {
+    // Buffer too small: the negated result is used as the required size.
+    promptTokens.resize(static_cast<std::size_t>(-tokenCount));
+    tokenCount =
+        llama_tokenize(vocab, prompt.c_str(),
+                       static_cast<int32_t>(prompt.size()), promptTokens.data(),
+                       static_cast<int32_t>(promptTokens.size()), true, true);
+  }
+
+  if (tokenCount < 0) {
+    throw std::runtime_error("LlamaGenerator: prompt tokenization failed");
+  }
+  promptTokens.resize(static_cast<std::size_t>(tokenCount));
+
+  const llama_batch promptBatch = llama_batch_get_one(
+      promptTokens.data(), static_cast<int32_t>(promptTokens.size()));
+  if (llama_decode(context_, promptBatch) != 0) {
+    throw std::runtime_error("LlamaGenerator: prompt decode failed");
+  }
+
+  using SamplerPtr =
+      std::unique_ptr<llama_sampler, decltype(&llama_sampler_free)>;
+  SamplerPtr sampler(
+      llama_sampler_chain_init(llama_sampler_chain_default_params()),
+      &llama_sampler_free);
+  if (!sampler) {
+    throw std::runtime_error("LlamaGenerator: failed to initialize sampler");
+  }
+  llama_sampler_chain_add(sampler.get(), llama_sampler_init_greedy());
+
+  std::string output;
+  for (int i = 0; i < maxTokens; ++i) {
+    // Index -1 samples from the output of the most recent decode call.
+    llama_token next = llama_sampler_sample(sampler.get(), context_, -1);
+    if (llama_vocab_is_eog(vocab, next)) {
+      break;
+    }
+    appendTokenPiece(vocab, next, output);
+
+    // Feed the sampled token back so the following sample continues from it.
+    if (llama_decode(context_, llama_batch_get_one(&next, 1)) != 0) {
+      throw std::runtime_error(
+          "LlamaGenerator: decode failed during generation");
+    }
+  }
+
+  return output;
+}
+
+// Asks the model for a brewery name and one-sentence description for
+// `cityName`, with `regionContext` inserted verbatim into the prompt.
+// Propagates std::runtime_error from infer() or from a malformed reply.
+BreweryResult
+LlamaGenerator::generateBrewery(const std::string &cityName,
+                                const std::string &regionContext) {
+  std::string request = "Generate a craft brewery name and one-sentence "
+                        "description for a brewery located in ";
+  request.append(cityName).append(". ").append(regionContext);
+  request += " Respond with exactly two lines: first line is the name, second "
+             "line is the description.";
+
+  auto [name, description] = parseTwoLineResponse(
+      infer(request, 128), "LlamaGenerator: malformed brewery response");
+  return {name, description};
+}
+
+// Asks the model for a username and one-sentence bio for `locale`.
+// Whitespace is stripped from the username. Throws std::runtime_error if the
+// reply is malformed or either field ends up empty.
+UserResult LlamaGenerator::generateUser(const std::string &locale) {
+  std::string request = "Generate a plausible craft beer enthusiast username "
+                        "and a one-sentence bio. Locale: ";
+  request += locale;
+  request += ". Respond with exactly two lines: first line is the username "
+             "(no spaces), second line is the bio.";
+
+  auto [username, bio] = parseTwoLineResponse(
+      infer(request, 128), "LlamaGenerator: malformed user response");
+
+  // The model may ignore the "no spaces" instruction, so enforce it here.
+  const auto isBlank = [](unsigned char ch) { return std::isspace(ch); };
+  const auto firstRemoved =
+      std::remove_if(username.begin(), username.end(), isBlank);
+  username.erase(firstRemoved, username.end());
+  if (username.empty() || bio.empty()) {
+    throw std::runtime_error("LlamaGenerator: malformed user response");
+  }
+  return {username, bio};
+}
--- a/pipeline/src/main.cpp
+++ b/pipeline/src/main.cpp
@@ -1,9 +1,12 @@
 #include "data_downloader.h"
+#include "data_generator.h"
 #include "database.h"
-#include "generator.h"
 #include "json_loader.h"
+#include "llama_generator.h"
+#include "mock_generator.h"
 #include <curl/curl.h>
 #include <filesystem>
+#include <memory>
 #include <spdlog/spdlog.h>

 static bool FileExists(const std::string &filePath) {
@@ -14,7 +17,7 @@ int main(int argc, char *argv[]) {
  try {
    curl_global_init(CURL_GLOBAL_DEFAULT);

-    std::string modelPath = argc > 1 ? argv[1] : "./model.gguf";
+    std::string modelPath = argc > 1 ? argv[1] : "";
    std::string cacheDir = argc > 2 ? argv[2] : "/tmp";
    std::string commit =
        argc > 3 ? argv[3] : "c5eb7772"; // Default: stable 2026-03-28
@@ -41,8 +44,15 @@ int main(int argc, char *argv[]) {
    }

    spdlog::info("Initializing brewery generator...");
-    LlamaBreweryGenerator generator;
-    generator.LoadModel(modelPath);
+    std::unique_ptr<IDataGenerator> generator;
+    if (modelPath.empty()) {
+      generator = std::make_unique<MockGenerator>();
+      spdlog::info("[Generator] Using MockGenerator (no model path provided)");
+    } else {
+      generator = std::make_unique<LlamaGenerator>();
+      spdlog::info("[Generator] Using LlamaGenerator: {}", modelPath);
+    }
+    generator->load(modelPath);

    spdlog::info("\n=== GEOGRAPHIC DATA OVERVIEW ===");

@@ -74,7 +84,7 @@ int main(int argc, char *argv[]) {
    spdlog::info("\n=== SAMPLE BREWERY GENERATION ===\n");
    for (size_t i = 0; i < std::min(size_t(5), cities.size()); i++) {
      const auto &[cityId, cityName] = cities[i];
-      auto brewery = generator.GenerateBrewery(cityName, i);
+      auto brewery = generator->generateBrewery(cityName, "");
      spdlog::info("  {}: {}", cityName, brewery.name);
      spdlog::info("    -> {}", brewery.description);
    }
--- a/pipeline/src/mock_generator.cpp
+++ b/pipeline/src/mock_generator.cpp
@@ -0,0 +1,101 @@
+#include "mock_generator.h"
+
+#include <functional>
+#include <spdlog/spdlog.h>
+
+const std::vector<std::string> MockGenerator::kBreweryAdjectives = {
+    "Craft",      "Heritage", "Local",  "Artisan",  "Pioneer",    "Golden",
+    "Modern",     "Classic",  "Summit", "Northern", "Riverstone", "Barrel",
+    "Hinterland", "Harbor",   "Wild",   "Granite",  "Copper",     "Maple"};
+
+const std::vector<std::string> MockGenerator::kBreweryNouns = {
+    "Brewing Co.", "Brewery",    "Bier Haus", "Taproom",      "Works",
+    "House",       "Fermentery", "Ale Co.",   "Cellars",      "Collective",
+    "Project",     "Foundry",    "Malthouse", "Public House", "Co-op",
+    "Lab",         "Beer Hall",  "Guild"};
+
+const std::vector<std::string> MockGenerator::kBreweryDescriptions = {
+    "Handcrafted pale ales and seasonal IPAs with local ingredients.",
+    "Traditional lagers and experimental sours in small batches.",
+    "Award-winning stouts and wildly hoppy blonde ales.",
+    "Craft brewery specializing in Belgian-style triples and dark porters.",
+    "Modern brewery blending tradition with bold experimental flavors.",
+    "Neighborhood-focused taproom pouring crisp pilsners and citrusy pale "
+    "ales.",
+    "Small-batch brewery known for barrel-aged releases and smoky lagers.",
+    "Independent brewhouse pairing farmhouse ales with rotating food pop-ups.",
+    "Community brewpub making balanced bitters, saisons, and hazy IPAs.",
+    "Experimental nanobrewery exploring local yeast and regional grains.",
+    "Family-run brewery producing smooth amber ales and robust porters.",
+    "Urban brewery crafting clean lagers and bright, fruit-forward sours.",
+    "Riverfront brewhouse featuring oak-matured ales and seasonal blends.",
+    "Modern taproom focused on sessionable lagers and classic pub styles.",
+    "Brewery rooted in tradition with a lineup of malty reds and crisp lagers.",
+    "Creative brewery offering rotating collaborations and limited draft-only "
+    "pours.",
+    "Locally inspired brewery serving approachable ales with bold hop "
+    "character.",
+    "Destination taproom known for balanced IPAs and cocoa-rich stouts."};
+
+const std::vector<std::string> MockGenerator::kUsernames = {
+    "hopseeker",     "malttrail",   "yeastwhisper",  "lagerlane",
+    "barrelbound",   "foamfinder",  "taphunter",     "graingeist",
+    "brewscout",     "aleatlas",    "caskcompass",   "hopsandmaps",
+    "mashpilot",     "pintnomad",   "fermentfriend", "stoutsignal",
+    "sessionwander", "kettlekeeper"};
+
+const std::vector<std::string> MockGenerator::kBios = {
+    "Always chasing balanced IPAs and crisp lagers across local taprooms.",
+    "Weekend brewery explorer with a soft spot for dark, roasty stouts.",
+    "Documenting tiny brewpubs, fresh pours, and unforgettable beer gardens.",
+    "Fan of farmhouse ales, food pairings, and long tasting flights.",
+    "Collecting favorite pilsners one city at a time.",
+    "Hops-first drinker who still saves room for classic malt-forward styles.",
+    "Finding hidden tap lists and sharing the best seasonal releases.",
+    "Brewery road-tripper focused on local ingredients and clean fermentation.",
+    "Always comparing house lagers and ranking patio pint vibes.",
+    "Curious about yeast strains, barrel programs, and cellar experiments.",
+    "Believes every neighborhood deserves a great community taproom.",
+    "Looking for session beers that taste great from first sip to last.",
+    "Belgian ale enthusiast who never skips a new saison.",
+    "Hazy IPA critic with deep respect for a perfectly clear pilsner.",
+    "Visits breweries for the stories, stays for the flagship pours.",
+    "Craft beer fan mapping tasting notes and favorite brew routes.",
+    "Always ready to trade recommendations for underrated local breweries.",
+    "Keeping a running list of must-try collab releases and tap takeovers."};
+
+void MockGenerator::load(const std::string & /*modelPath: ignored*/) {
+  spdlog::info("[MockGenerator] No model needed"); // only logs; loads nothing
+}
+
+std::size_t MockGenerator::deterministicHash(const std::string &a,
+                                             const std::string &b) {
+  // Fold hash(b) into hash(a), then rotate the mixed value left by 13 bits.
+  const std::hash<std::string> hasher{};
+  std::size_t h = hasher(a);
+  h ^= hasher(b) + 0x9e3779b97f4a7c15ULL + (h << 6) + (h >> 2);
+  return (h << 13) | (h >> (sizeof(h) * 8 - 13));
+}
+
+// Picks "<adjective> <noun>" and a description from the static tables using
+// a hash of `cityName`, mixed with `regionContext` when that is non-empty.
+BreweryResult MockGenerator::generateBrewery(const std::string &cityName,
+                                             const std::string &regionContext) {
+  std::size_t key = std::hash<std::string>{}(cityName);
+  if (!regionContext.empty()) {
+    key = deterministicHash(cityName, regionContext);
+  }
+  const auto &adjective = kBreweryAdjectives[key % kBreweryAdjectives.size()];
+  const auto &noun = kBreweryNouns[(key / 7) % kBreweryNouns.size()];
+  return {adjective + " " + noun,
+          kBreweryDescriptions[(key / 13) % kBreweryDescriptions.size()]};
+}
+
+// Picks a username and bio from the static tables using a hash of `locale`.
+// The same locale maps to the same pair within one program run.
+UserResult MockGenerator::generateUser(const std::string &locale) {
+  const std::size_t key = std::hash<std::string>{}(locale);
+  const auto &handle = kUsernames[key % kUsernames.size()];
+  const auto &about = kBios[(key / 11) % kBios.size()];
+  return {handle, about};
+}