Create biergarten brewery pipeline project (#199)

2026-06-01 01:54:00 +00:00 · 2026-04-18 19:19:14 -04:00
parent fd3c172e35
commit 898cc8971b
59 changed files with 5638 additions and 0 deletions
--- a/pipeline/src/data_generation/llama/generate_brewery.cc
+++ b/pipeline/src/data_generation/llama/generate_brewery.cc
@@ -0,0 +1,144 @@
+/**
+ * @file data_generation/llama/generate_brewery.cc
+ * @brief Builds brewery prompts with regional context, performs retry-based
+ * inference, and validates structured JSON output for brewery records.
+ */
+
+#include <spdlog/spdlog.h>
+
+#include <format>
+#include <optional>
+#include <stdexcept>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "data_generation/llama_generator.h"
+#include "data_generation/llama_generator_helpers.h"
+
+/**
+ * Joins language codes into a single ", "-separated string for prompt text.
+ * Returns "Not provided" when the list is empty.
+ */
+static std::string FormatLocalLanguageCodes(
+    const std::vector<std::string>& codes) {
+  std::string joined;
+  for (const auto& language_code : codes) {
+    // A separator is written only once the result already holds some text.
+    joined.append(joined.empty() ? "" : ", ");
+    joined.append(language_code);
+  }
+
+  if (codes.empty()) {
+    return "Not provided";
+  }
+  return joined;
+}
+
+/**
+ * Grammar text passed to Infer() to constrain brewery generation output.
+ * The root rule permits a leading free-text "thought block" that contains no
+ * '{' characters, followed by exactly one JSON object whose four string
+ * members appear in this order: name_en, description_en, name_local,
+ * description_local. Each string must hold at least one character (char+).
+ */
+static constexpr std::string_view kBreweryJsonGrammar = R"json_brewery(
+root ::= thought-block "{" ws "\"name_en\"" ws ":" ws string ws "," ws "\"description_en\"" ws ":" ws string ws "," ws "\"name_local\"" ws ":" ws string ws "," ws "\"description_local\"" ws ":" ws string ws "}" ws
+thought-block ::= [^{]*
+ws ::= [ \t\n\r]*
+string ::= "\"" char+ "\""
+char ::= [^"\\\x7F\x00-\x1F] | [\\] escape
+escape ::= ["\\/bfnrt] | "u" hex hex hex hex
+hex ::= [0-9a-fA-F]
+)json_brewery";
+
+/** Token budget handed to Infer() when generating a brewery record. */
+static constexpr int kBreweryInitialMaxTokens = 2800;
+
+/**
+ * @brief Generates a brewery record for a location using grammar-constrained
+ * inference, retrying with corrective feedback when validation fails.
+ *
+ * @param location Supplies the city, country and local language codes that
+ * are interpolated into the prompts.
+ * @param region_context Free-form regional background text; it is normalized
+ * by PrepareRegionContext() before being placed in the first prompt.
+ * @return The first BreweryResult accepted by ValidateBreweryJson().
+ * @throws std::runtime_error when every attempt produces invalid output.
+ * Errors raised by LoadBrewerySystemPrompt() or Infer() are not caught here.
+ */
+BreweryResult LlamaGenerator::GenerateBrewery(
+    const Location& location, const std::string& region_context) {
+  constexpr int kMaxAttempts = 3;
+
+  const std::string safe_region_context = PrepareRegionContext(region_context);
+
+  const std::string local_language_codes =
+      FormatLocalLanguageCodes(location.local_languages);
+
+  // Only the retry prompt needs the ", <country>" suffix form.
+  const std::string country_suffix =
+      location.country.empty() ? std::string{}
+                               : std::format(", {}", location.country);
+
+  // The system prompt comes from disk (or the generator's cache). There is no
+  // inline fallback: a missing or empty file surfaces as an exception.
+  const std::string system_prompt =
+      LoadBrewerySystemPrompt("prompts/system.md");
+
+  std::string user_prompt = std::format(
+      "## CITY:\n{}\n\n## COUNTRY:\n{}\n\n## LOCAL LANGUAGE CODES:\n{}\n\n## "
+      "CONTEXT:\n{}",
+      location.city, location.country, local_language_codes,
+      safe_region_context);
+
+  // Compact location summary appended to retry prompts so the full region
+  // context is not repeated after a failed attempt.
+  const std::string retry_location =
+      std::format("Location: {}{}\nLocal language codes: {}", location.city,
+                  country_suffix, local_language_codes);
+
+  // Kept outside the loop so the final failure log can report the last error.
+  std::string last_error;
+
+  for (int attempt = 0; attempt < kMaxAttempts; ++attempt) {
+    // Every attempt runs with the same token budget; nothing enlarges it.
+    const std::string raw = this->Infer(
+        system_prompt, user_prompt, kBreweryInitialMaxTokens,
+        kBreweryJsonGrammar);
+    spdlog::debug("LlamaGenerator: raw output (attempt {}): {}", attempt + 1,
+                  raw);
+
+    BreweryResult brewery;
+    const std::optional<std::string> validation_error =
+        ValidateBreweryJson(raw, brewery);
+
+    if (!validation_error.has_value()) {
+      spdlog::info(
+          "LlamaGenerator: successfully generated brewery data on attempt {}",
+          attempt + 1);
+
+      return brewery;
+    }
+
+    last_error = *validation_error;
+    spdlog::warn("LlamaGenerator: malformed brewery JSON (attempt {}): {}",
+                 attempt + 1, *validation_error);
+
+    // Replace the prompt with corrective feedback that quotes the validation
+    // error and restates the required JSON shape.
+    user_prompt = std::format(
+        "Your previous response was invalid. Error: {}\nReturn the thought "
+        "process before the JSON if needed, then return ONLY valid JSON with "
+        "exactly these keys, in this exact order: {{\"name_en\": \"<English "
+        "brewery name>\", \"description_en\": \"<English single-paragraph "
+        "description>\", \"name_local\": \"<local-language brewery name>\", "
+        "\"description_local\": \"<local-language single-paragraph "
+        "description>\"}}.\nDo not include markdown, comments, extra keys, or "
+        "literal placeholder values.\n\nKeep the JSON strings concise enough "
+        "to fit within the token budget.\n\n{}",
+        *validation_error, retry_location);
+  }
+
+  // Reaching this point means every attempt failed validation, so last_error
+  // always holds the most recent validation message.
+  spdlog::error(
+      "LlamaGenerator: malformed brewery response after {} attempts: "
+      "{}",
+      kMaxAttempts, last_error);
+  throw std::runtime_error("LlamaGenerator: malformed brewery response");
+}
--- a/pipeline/src/data_generation/llama/generate_user.cc
+++ b/pipeline/src/data_generation/llama/generate_user.cc
@@ -0,0 +1,18 @@
+/**
+ * @file data_generation/llama/generate_user.cc
+ * @brief Generates user profiles for a locale. The current implementation is a
+ * placeholder that returns a fixed test profile; the strict two-line
+ * formatting, retry handling, and output sanitization are not implemented in
+ * this file yet.
+ */
+
+#include <spdlog/spdlog.h>
+
+#include <stdexcept>
+#include <string>
+
+#include "data_generation/llama_generator.h"
+#include "data_generation/llama_generator_helpers.h"
+
+/**
+ * Placeholder implementation: performs no inference and returns a fixed
+ * username together with a bio sentence that embeds `locale`.
+ */
+UserResult LlamaGenerator::GenerateUser(const std::string& locale) {
+  UserResult placeholder{};
+  placeholder.username = "test_user";
+  placeholder.bio = "This is a test user profile from " + locale + ".";
+  return placeholder;
+}
--- a/pipeline/src/data_generation/llama/helpers.cc
+++ b/pipeline/src/data_generation/llama/helpers.cc
@@ -0,0 +1,215 @@
+/**
+ * @file data_generation/llama/helpers.cc
+ * @brief Provides prompt formatting, whitespace normalization, response
+ * parsing, token decoding, and JSON validation helpers for Llama modules.
+ */
+
+#include <algorithm>
+#include <array>
+#include <boost/json.hpp>
+#include <cctype>
+#include <optional>
+#include <stdexcept>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "data_generation/llama_generator_helpers.h"
+#include "llama.h"
+
+/**
+ * Returns a copy of `value` without leading or trailing whitespace (space,
+ * tab, newline, carriage return, form feed, vertical tab). An empty or
+ * all-whitespace input yields an empty string.
+ */
+static std::string Trim(std::string_view value) {
+  constexpr std::string_view kBlank = " \t\n\r\f\v";
+
+  const size_t begin = value.find_first_not_of(kBlank);
+  if (begin == std::string_view::npos) {
+    return std::string();
+  }
+  value.remove_prefix(begin);
+
+  // At least one non-blank character remains, so this search cannot fail.
+  const size_t end = value.find_last_not_of(kBlank);
+  value.remove_suffix(value.size() - (end + 1));
+  return std::string(value);
+}
+
+/**
+ * Collapses every run of whitespace (as classified by std::isspace) into one
+ * space and drops whitespace at the start and end of the text.
+ */
+static std::string CondenseWhitespace(std::string_view text) {
+  std::string condensed;
+  condensed.reserve(text.size());
+
+  bool gap_seen = false;
+  for (const char current : text) {
+    const bool is_blank =
+        std::isspace(static_cast<unsigned char>(current)) != 0;
+    if (is_blank) {
+      gap_seen = true;
+      continue;
+    }
+
+    // Emit the separator lazily so leading and trailing gaps never appear.
+    if (gap_seen && !condensed.empty()) {
+      condensed += ' ';
+    }
+    gap_seen = false;
+    condensed += current;
+  }
+
+  return condensed;
+}
+
+/**
+ * Normalizes region context and truncates it to about `max_chars` bytes.
+ *
+ * Whitespace runs are collapsed first. Text that already fits is returned
+ * unchanged. Otherwise the text is cut at `max_chars` bytes, the cut is moved
+ * back so it never lands inside a multi-byte UTF-8 sequence, moved back again
+ * to the last space when that space lies past the midpoint of the budget, and
+ * "..." is appended. The ellipsis is not counted against `max_chars`.
+ *
+ * @param region_context Raw context text (presumably UTF-8 — confirm with
+ * the caller).
+ * @param max_chars Maximum number of bytes kept before the ellipsis.
+ * @return The normalized, possibly truncated context.
+ */
+std::string PrepareRegionContext(std::string_view region_context,
+                                 const size_t max_chars) {
+  std::string normalized = CondenseWhitespace(region_context);
+  if (normalized.size() <= max_chars) {
+    return normalized;
+  }
+
+  // Bytes shaped 10xxxxxx continue a UTF-8 sequence. When the first dropped
+  // byte is one of them, the cut would split a code point, so step back to
+  // its lead byte and drop the whole character. A valid sequence has at most
+  // three continuation bytes, which bounds the walk for malformed input.
+  constexpr int kMaxContinuationBytes = 3;
+  size_t cut = max_chars;
+  for (int step = 0;
+       step < kMaxContinuationBytes && cut > 0 &&
+       (static_cast<unsigned char>(normalized[cut]) & 0xC0U) == 0x80U;
+       ++step) {
+    --cut;
+  }
+  normalized.resize(cut);
+
+  // Prefer ending on a word boundary unless that would discard more than half
+  // of the allowed length.
+  const size_t last_space = normalized.find_last_of(' ');
+  if (last_space != std::string::npos && last_space > max_chars / 2) {
+    normalized.resize(last_space);
+  }
+
+  normalized += "...";
+  return normalized;
+}
+
+/**
+ * Decodes one token into its text piece and appends the bytes to `output`.
+ *
+ * A fixed stack buffer is tried first. A negative result from
+ * llama_token_to_piece is treated as the negated buffer size required, and the
+ * decode is repeated once with a heap buffer of that size.
+ *
+ * @throws std::runtime_error if the second decode also reports a negative
+ * result.
+ */
+void AppendTokenPiece(const llama_vocab* vocab, llama_token token,
+                      std::string& output) {
+  constexpr size_t kStackCapacity = 256;
+  std::array<char, kStackCapacity> stack_piece{};
+
+  const int32_t stack_length = llama_token_to_piece(
+      vocab, token, stack_piece.data(), stack_piece.size(), 0, true);
+  if (stack_length >= 0) {
+    output.append(stack_piece.data(), static_cast<size_t>(stack_length));
+    return;
+  }
+
+  // The stack buffer was too small; size the heap buffer from the result.
+  const int32_t needed = -stack_length;
+  std::vector<char> heap_piece(static_cast<size_t>(needed));
+
+  const int32_t heap_length = llama_token_to_piece(
+      vocab, token, heap_piece.data(),
+      static_cast<int32_t>(heap_piece.size()), 0, true);
+  if (heap_length < 0) {
+    throw std::runtime_error(
+        "LlamaGenerator: failed to decode sampled token piece");
+  }
+
+  output.append(heap_piece.data(), static_cast<size_t>(heap_length));
+}
+
+/**
+ * Reads member `key` of `obj` as a string, trims it, and stores it in `out`.
+ *
+ * @param error_out Optional; receives a description of the failure when set.
+ * @return true when the member exists, is a string, and is non-empty after
+ * trimming; false otherwise. `out` is overwritten once the member is known to
+ * be a string, even when the trimmed value turns out to be empty.
+ */
+static bool ReadRequiredTrimmedStringField(const boost::json::object& obj,
+                                           std::string_view key,
+                                           std::string& out,
+                                           std::string* error_out) {
+  // Records the failure reason (when a sink was supplied) and yields false.
+  const auto fail = [&](const char* reason) {
+    if (error_out != nullptr) {
+      *error_out = "JSON field '" + std::string(key) + reason;
+    }
+    return false;
+  };
+
+  const boost::json::value* const member = obj.if_contains(key);
+  const bool is_string_member = member != nullptr && member->is_string();
+  if (!is_string_member) {
+    return fail("' is missing or not a string");
+  }
+
+  const auto& text = member->as_string();
+  out = Trim(std::string_view(text.data(), text.size()));
+  if (out.empty()) {
+    return fail("' must not be empty");
+  }
+
+  return true;
+}
+
+/**
+ * Reports whether any of the four values is the word "string", compared
+ * case-insensitively via std::tolower. Such a value suggests the model echoed
+ * a schema type name instead of producing content.
+ */
+static bool HasSchemaPlaceholder(const std::array<std::string*, 4>& values) {
+  constexpr std::string_view kPlaceholder = "string";
+  const auto to_lower = [](unsigned char character) {
+    return static_cast<char>(std::tolower(character));
+  };
+
+  return std::ranges::any_of(values, [&](const std::string* candidate) {
+    // Lengths must match and every lower-cased character must agree.
+    return std::ranges::equal(*candidate, kPlaceholder, {}, to_lower);
+  });
+}
+
+/**
+ * Parses model output as a brewery JSON object and fills `brewery_out`.
+ *
+ * Everything before the first '{' is ignored. The remainder must parse as a
+ * JSON object with exactly four members; name_en, description_en, name_local
+ * and description_local must each be a non-empty string after trimming, and
+ * none may be the literal placeholder word "string".
+ *
+ * @param raw Raw text produced by the model.
+ * @param brewery_out Receives the trimmed field values. Fields read before a
+ * failure is detected remain overwritten.
+ * @return std::nullopt on success, otherwise a description of the first
+ * problem found.
+ */
+std::optional<std::string> ValidateBreweryJson(const std::string& raw,
+                                               BreweryResult& brewery_out) {
+  const std::string_view raw_text(raw);
+  const size_t json_start = raw_text.find('{');
+  if (json_start == std::string_view::npos) {
+    return "JSON parse error: missing opening brace '{'";
+  }
+
+  boost::system::error_code parse_error;
+  const std::string_view payload = raw_text.substr(json_start);
+  const boost::json::value document = boost::json::parse(payload, parse_error);
+  if (parse_error) {
+    return "JSON parse error: " + parse_error.message();
+  }
+
+  if (!document.is_object()) {
+    return "JSON root must be an object";
+  }
+
+  const boost::json::object& members = document.get_object();
+  if (members.size() != 4) {
+    return "JSON object must contain exactly four keys";
+  }
+
+  // Pairs each required key with the output field that receives its value;
+  // the order here is the order in which fields are read and reported.
+  struct FieldTarget {
+    std::string_view key;
+    std::string* destination;
+  };
+  const std::array<FieldTarget, 4> targets = {{
+      {"name_en", &brewery_out.name_en},
+      {"description_en", &brewery_out.description_en},
+      {"name_local", &brewery_out.name_local},
+      {"description_local", &brewery_out.description_local},
+  }};
+
+  std::string field_error;
+  for (const FieldTarget& target : targets) {
+    const bool field_ok = ReadRequiredTrimmedStringField(
+        members, target.key, *target.destination, &field_error);
+    if (!field_ok) {
+      return field_error;
+    }
+  }
+
+  const std::array<std::string*, 4> generated_values = {
+      targets[0].destination, targets[1].destination, targets[2].destination,
+      targets[3].destination};
+  if (HasSchemaPlaceholder(generated_values)) {
+    return "JSON appears to be a schema placeholder, not content";
+  }
+
+  return std::nullopt;
+}
--- a/pipeline/src/data_generation/llama/infer.cc
+++ b/pipeline/src/data_generation/llama/infer.cc
@@ -0,0 +1,241 @@
+/**
+ * Text Generation / Inference Module
+ * Core module that performs LLM inference: converts text prompts into tokens,
+ * runs the neural network forward pass, samples the next token, and converts
+ * output tokens back to text for system+user chat prompts.
+ */
+
+#include <spdlog/spdlog.h>
+
+#include <algorithm>
+#include <memory>
+#include <stdexcept>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "data_generation/llama_generator.h"
+#include "data_generation/llama_generator_helpers.h"
+#include "llama.h"
+
+/**
+ * Extra token slots added on top of the prompt's byte length when sizing the
+ * first tokenization buffer.
+ */
+static constexpr size_t kPromptTokenSlack = 8;
+
+namespace {
+
+/** Owning sampler handle that releases through llama_sampler_free. */
+using SamplerHandle =
+    std::unique_ptr<llama_sampler, decltype(&llama_sampler_free)>;
+
+/** Sampling parameters consumed by MakeSamplerChain(). */
+struct SamplerConfig {
+  float temperature;
+  uint32_t top_k;
+  float top_p;
+  uint32_t seed;
+};
+
+/**
+ * Adds `sampler` to `chain`, or throws std::runtime_error carrying
+ * `error_message` when `sampler` is null.
+ */
+void AddSamplerOrThrow(llama_sampler* chain, llama_sampler* sampler,
+                       const char* error_message) {
+  if (sampler == nullptr) {
+    throw std::runtime_error(error_message);
+  }
+
+  llama_sampler_chain_add(chain, sampler);
+}
+
+/**
+ * Builds the sampler chain used for generation.
+ *
+ * Samplers are added in this order: grammar (only when `grammar` is
+ * non-empty), temperature, top-k, top-p, then the seeded distribution
+ * sampler.
+ *
+ * @throws std::runtime_error if the chain or any sampler fails to initialize.
+ */
+SamplerHandle MakeSamplerChain(const llama_vocab* vocab,
+                               const SamplerConfig& config,
+                               std::string_view grammar) {
+  const llama_sampler_chain_params chain_params =
+      llama_sampler_chain_default_params();
+
+  SamplerHandle chain(llama_sampler_chain_init(chain_params),
+                      llama_sampler_free);
+  if (chain == nullptr) {
+    throw std::runtime_error("LlamaGenerator: failed to initialize sampler");
+  }
+  llama_sampler* const raw_chain = chain.get();
+
+  if (!grammar.empty()) {
+    // The C API takes a C string; copy the view to obtain one.
+    const std::string grammar_text(grammar);
+    AddSamplerOrThrow(
+        raw_chain,
+        llama_sampler_init_grammar(vocab, grammar_text.c_str(), "root"),
+        "LlamaGenerator: failed to initialize grammar sampler");
+  }
+
+  AddSamplerOrThrow(raw_chain, llama_sampler_init_temp(config.temperature),
+                    "LlamaGenerator: failed to initialize temperature sampler");
+  AddSamplerOrThrow(
+      raw_chain, llama_sampler_init_top_k(static_cast<int32_t>(config.top_k)),
+      "LlamaGenerator: failed to initialize top-k sampler");
+  AddSamplerOrThrow(raw_chain, llama_sampler_init_top_p(config.top_p, 1),
+                    "LlamaGenerator: failed to initialize top-p sampler");
+  AddSamplerOrThrow(
+      raw_chain, llama_sampler_init_dist(config.seed),
+      "LlamaGenerator: failed to initialize distribution sampler");
+
+  return chain;
+}
+
+}  // namespace
+
+/**
+ * Formats the system and user prompts with the injected prompt formatter and
+ * runs inference on the result via InferFormatted().
+ */
+std::string LlamaGenerator::Infer(const std::string& system_prompt,
+                                  const std::string& prompt,
+                                  const int max_tokens,
+                                  std::string_view grammar) {
+  const std::string& formatted_prompt =
+      prompt_formatter_->Format(system_prompt, prompt);
+  return InferFormatted(formatted_prompt, max_tokens, grammar);
+}
+
+/**
+ * @brief Runs one generation pass over an already formatted prompt.
+ *
+ * Steps: build a sampler chain seeded from rng_, clear the context memory,
+ * tokenize the prompt, truncate it to the computed prompt budget, decode it,
+ * then sample and decode one token at a time until an end-of-generation token
+ * appears or the effective token limit is reached.
+ *
+ * @param formatted_prompt Prompt text handed to llama_tokenize as-is.
+ * @param max_tokens Requested generation limit; clamped to [1, n_ctx - 1].
+ * @param grammar Optional grammar text; an empty view adds no grammar sampler.
+ * @return Text of the generated tokens; the end-of-generation token is not
+ * included.
+ * @throws std::runtime_error if the model or context is missing, the vocab is
+ * unavailable, tokenization fails, n_ctx/n_batch are invalid, or a decode call
+ * fails. Errors from MakeSamplerChain() and AppendTokenPiece() propagate.
+ */
+std::string LlamaGenerator::InferFormatted(const std::string& formatted_prompt,
+                                           const int max_tokens,
+                                           std::string_view grammar) {
+  /**
+   * Validate that model and context are loaded
+   */
+  if (!model_ || !context_) {
+    throw std::runtime_error("LlamaGenerator: model not loaded");
+  }
+
+  /**
+   * Get vocabulary for tokenization and token-to-text conversion
+   */
+  const llama_vocab* vocab = llama_model_get_vocab(model_.get());
+  if (vocab == nullptr) {
+    throw std::runtime_error("LlamaGenerator: vocab unavailable");
+  }
+
+  // Each call draws a fresh sampler seed from the generator's RNG.
+  const SamplerConfig sampler_config{
+      .temperature = sampling_temperature_,
+      .top_k = sampling_top_k_,
+      .top_p = sampling_top_p_,
+      .seed = static_cast<uint32_t>(rng_()),
+  };
+  auto sampler = MakeSamplerChain(vocab, sampler_config, grammar);
+
+  /**
+   * Clear KV cache to ensure clean inference state (no residual context)
+   */
+  llama_memory_clear(llama_get_memory(context_.get()), true);
+
+  /**
+   * TOKENIZATION PHASE
+   * Convert text prompt into token IDs (integers) that the model understands
+   */
+  // Initial buffer: one slot per prompt byte plus a small slack.
+  std::vector<llama_token> prompt_tokens(formatted_prompt.size() +
+                                         kPromptTokenSlack);
+
+
+
+
+  int32_t token_count = llama_tokenize(
+      vocab,
+      formatted_prompt.c_str(),
+      static_cast<int32_t>(formatted_prompt.size()),
+      prompt_tokens.data(),
+      static_cast<int32_t>(prompt_tokens.size()),
+      true,
+      true);
+
+  /**
+   * If buffer too small, negative return indicates required size
+   */
+  if (token_count < 0) {
+    prompt_tokens.resize(static_cast<size_t>(-token_count));
+
+
+    token_count = llama_tokenize(
+        vocab, formatted_prompt.c_str(),
+        static_cast<int32_t>(formatted_prompt.size()), prompt_tokens.data(),
+        static_cast<int32_t>(prompt_tokens.size()), true, true);
+  }
+
+  // A negative count after the retry is treated as a hard failure.
+  if (token_count < 0) {
+    throw std::runtime_error("LlamaGenerator: prompt tokenization failed");
+  }
+
+  /**
+   * CONTEXT SIZE VALIDATION
+   * Validate and compute effective token budgets based on context window
+   * constraints
+   */
+  const auto n_ctx = static_cast<int32_t>(llama_n_ctx(context_.get()));
+  const auto n_batch = static_cast<int32_t>(llama_n_batch(context_.get()));
+  if (n_ctx <= 1 || n_batch <= 0) {
+    throw std::runtime_error("LlamaGenerator: invalid context or batch size");
+  }
+
+  /**
+   * Clamp generation limit to available context window, reserve space for
+   * output
+   */
+  const int32_t effective_max_tokens =
+      std::max(1, std::min(max_tokens, n_ctx - 1));
+
+  /**
+   * Prompt can use remaining context after reserving space for generation
+   */
+  // The budget is also capped by n_batch because the whole prompt is decoded
+  // as a single batch below.
+  int32_t prompt_budget = std::min(n_batch, n_ctx - effective_max_tokens);
+  prompt_budget = std::max<int32_t>(1, prompt_budget);
+
+  /**
+   * Truncate prompt if necessary to fit within constraints
+   */
+  // NOTE(review): resize keeps the leading tokens, so an over-long prompt
+  // loses its tail rather than its beginning.
+  prompt_tokens.resize(static_cast<size_t>(token_count));
+  if (token_count > prompt_budget) {
+    spdlog::warn(
+        "LlamaGenerator: prompt too long ({} tokens), truncating to {} "
+        "tokens to fit n_batch/n_ctx limits",
+        token_count, prompt_budget);
+    prompt_tokens.resize(static_cast<size_t>(prompt_budget));
+    token_count = prompt_budget;
+  }
+
+  /**
+   * PROMPT PROCESSING PHASE
+   * Create a batch containing all prompt tokens and feed through the model
+   * This computes internal representations and fills the KV cache
+   */
+  const llama_batch prompt_batch = llama_batch_get_one(
+      prompt_tokens.data(), static_cast<int32_t>(prompt_tokens.size()));
+  if (llama_decode(context_.get(), prompt_batch) != 0) {
+    throw std::runtime_error("LlamaGenerator: prompt decode failed");
+  }
+
+  /**
+   * TOKEN GENERATION LOOP
+   * Iteratively generate tokens one at a time until max_tokens or
+   * end-of-sequence
+   */
+  std::vector<llama_token> generated_tokens;
+  generated_tokens.reserve(static_cast<size_t>(effective_max_tokens));
+
+  for (int i = 0; i < effective_max_tokens; ++i) {
+    /**
+     * Sample next token using configured sampler chain and model logits
+     * Index -1 means use the last output position from previous batch
+     */
+    const llama_token next =
+        llama_sampler_sample(sampler.get(), context_.get(), -1);
+    /**
+     * Stop if model predicts end-of-generation token (EOS/EOT)
+     */
+    if (llama_vocab_is_eog(vocab, next)) {
+      break;
+    }
+    generated_tokens.push_back(next);
+    /**
+     * Feed the sampled token back into model for next iteration
+     * (autoregressive)
+     */
+    // llama_batch_get_one takes a non-const pointer, hence the mutable copy.
+    llama_token decode_token = next;
+    const llama_batch one_token_batch = llama_batch_get_one(&decode_token, 1);
+    if (llama_decode(context_.get(), one_token_batch) != 0) {
+      throw std::runtime_error(
+          "LlamaGenerator: decode failed during generation");
+    }
+  }
+
+  /**
+   * DETOKENIZATION PHASE
+   * Convert generated token IDs back to text using vocabulary
+   */
+  std::string output;
+  for (const llama_token token : generated_tokens) {
+    AppendTokenPiece(vocab, token, output);
+  }
+
+  return output;
+}
--- a/pipeline/src/data_generation/llama/llama_generator.cc
+++ b/pipeline/src/data_generation/llama/llama_generator.cc
@@ -0,0 +1,86 @@
+/**
+ * @file data_generation/llama/llama_generator.cc
+ * @brief LlamaGenerator constructor and destructor implementation.
+ */
+
+#include "data_generation/llama_generator.h"
+
+#include <memory>
+#include <random>
+#include <stdexcept>
+#include <string>
+#include <filesystem>
+
+#include "data_model/application_options.h"
+#include "llama.h"
+
+// Largest context size (options.n_ctx) that the constructor accepts.
+static constexpr uint32_t kMaxContextSize = 32768U;
+
+/** Releases a model through llama_model_free; a null pointer is ignored. */
+void LlamaGenerator::ModelDeleter::operator()(
+    llama_model* model) const noexcept {
+  if (model == nullptr) {
+    return;
+  }
+
+  llama_model_free(model);
+}
+
+/** Releases a context through llama_free; a null pointer is ignored. */
+void LlamaGenerator::ContextDeleter::operator()(
+    llama_context* context) const noexcept {
+  if (context == nullptr) {
+    return;
+  }
+
+  llama_free(context);
+}
+
+/**
+ * @brief Validates options, stores sampling settings, seeds the RNG, and loads
+ * the model.
+ *
+ * @param options Supplies temperature, top_p, top_k, seed and n_ctx.
+ * @param model_path Path passed to Load(); must not be empty.
+ * @param prompt_formatter Formatter used by Infer(); must not be null.
+ * @throws std::runtime_error if any argument is out of range (including NaN
+ * for temperature or top-p). Errors raised by Load() propagate unchanged.
+ */
+LlamaGenerator::LlamaGenerator(const ApplicationOptions& options,
+                               const std::string& model_path,
+                               std::unique_ptr<IPromptFormatter> prompt_formatter)
+    : rng_(std::random_device{}()),
+      prompt_formatter_(std::move(prompt_formatter)) {
+  if (model_path.empty()) {
+    throw std::runtime_error("LlamaGenerator: model path must not be empty");
+  }
+
+  if (!prompt_formatter_) {
+    throw std::runtime_error(
+        "LlamaGenerator: prompt formatter dependency must not be null");
+  }
+
+  // The range checks are written as negated "is valid" tests so that NaN,
+  // for which every ordered comparison is false, is rejected as well.
+  if (!(options.temperature >= 0.0F)) {
+    throw std::runtime_error(
+        "LlamaGenerator: sampling temperature must be >= 0");
+  }
+
+  if (!(options.top_p > 0.0F && options.top_p <= 1.0F)) {
+    throw std::runtime_error(
+        "LlamaGenerator: sampling top-p must be in (0, 1]");
+  }
+
+  if (options.top_k == 0U) {
+    throw std::runtime_error("LlamaGenerator: sampling top-k must be > 0");
+  }
+
+  if (options.seed < -1) {
+    throw std::runtime_error(
+        "LlamaGenerator: seed must be >= 0, or -1 for random");
+  }
+
+  if (options.n_ctx == 0 || options.n_ctx > kMaxContextSize) {
+    throw std::runtime_error(
+        "LlamaGenerator: context size must be in range [1, 32768]");
+  }
+
+  sampling_temperature_ = options.temperature;
+  sampling_top_p_ = options.top_p;
+  sampling_top_k_ = options.top_k;
+
+  // A seed of -1 requests a non-deterministic seed; any other accepted value
+  // is used directly.
+  if (options.seed == -1) {
+    std::random_device random_device;
+    rng_.seed(random_device());
+  } else {
+    rng_.seed(static_cast<uint32_t>(options.seed));
+  }
+  n_ctx_ = options.n_ctx;
+
+  this->Load(model_path);
+}
+
+// Defaulted: no manual cleanup is written here. NOTE(review): model_ and
+// context_ presumably release themselves through ModelDeleter and
+// ContextDeleter — confirm against the handle typedefs in the header.
+LlamaGenerator::~LlamaGenerator() = default;
--- a/pipeline/src/data_generation/llama/load.cc
+++ b/pipeline/src/data_generation/llama/load.cc
@@ -0,0 +1,43 @@
+/**
+ * @file data_generation/llama/load.cc
+ * @brief Loads model weights and creates the inference context, releasing any
+ * previously loaded context and model first. Backend initialization is not
+ * performed in this file.
+ */
+
+#include <spdlog/spdlog.h>
+
+#include <algorithm>
+#include <stdexcept>
+#include <string>
+#include <utility>
+
+#include "data_generation/llama_generator.h"
+#include "llama.h"
+
+/**
+ * @brief Loads a model from `model_path` and creates an inference context.
+ *
+ * Any previously held context and model are released first, so a failed load
+ * leaves the generator without a model. The new handles are stored only after
+ * both the model and the context were created successfully.
+ *
+ * @throws std::runtime_error if the model cannot be loaded or the context
+ * cannot be created.
+ */
+void LlamaGenerator::Load(const std::string& model_path) {
+  // Upper bound applied to the context's batch size.
+  constexpr uint32_t kMaxBatchSize = 5000;
+
+  // The context is released before the model it was created from.
+  context_.reset();
+  model_.reset();
+
+  LlamaGenerator::ModelHandle new_model(llama_model_load_from_file(
+      model_path.c_str(), llama_model_default_params()));
+  if (!new_model) {
+    throw std::runtime_error(
+        "LlamaGenerator: failed to load model from path: " + model_path);
+  }
+
+  llama_context_params context_settings = llama_context_default_params();
+  context_settings.n_ctx = n_ctx_;
+  context_settings.n_batch = std::min(n_ctx_, kMaxBatchSize);
+
+  LlamaGenerator::ContextHandle new_context(
+      llama_init_from_model(new_model.get(), context_settings));
+  if (!new_context) {
+    throw std::runtime_error("LlamaGenerator: failed to create context");
+  }
+
+  model_ = std::move(new_model);
+  context_ = std::move(new_context);
+
+  spdlog::info("[LlamaGenerator] Loaded model: {}", model_path);
+}
--- a/pipeline/src/data_generation/llama/load_brewery_prompt.cc
+++ b/pipeline/src/data_generation/llama/load_brewery_prompt.cc
@@ -0,0 +1,56 @@
+/**
+ * @file data_generation/llama/load_brewery_prompt.cc
+ * @brief Resolves brewery system prompt content from the in-memory cache or a
+ * configured filesystem path, and throws when the file is missing or empty.
+ */
+
+#include <spdlog/spdlog.h>
+
+#include <filesystem>
+#include <fstream>
+#include <stdexcept>
+
+#include "data_generation/llama_generator.h"
+
+/**
+ * @brief Returns the brewery system prompt, reading it from disk on first use.
+ *
+ * Once a non-empty prompt has been loaded it is kept in
+ * brewery_system_prompt_ and returned by later calls without looking at
+ * `prompt_file_path` again.
+ *
+ * @param prompt_file_path File read when nothing has been cached yet.
+ * @return The cached or freshly loaded prompt text.
+ * @throws std::runtime_error if the file cannot be opened or is empty.
+ */
+std::string LlamaGenerator::LoadBrewerySystemPrompt(
+    const std::filesystem::path& prompt_file_path) {
+  if (brewery_system_prompt_.empty()) {
+    std::ifstream prompt_stream(prompt_file_path);
+    if (!prompt_stream.is_open()) {
+      spdlog::error(
+          "LlamaGenerator: Failed to open brewery system prompt file '{}'",
+          prompt_file_path.string());
+      throw std::runtime_error(
+          "LlamaGenerator: missing brewery system prompt file: " +
+          prompt_file_path.string());
+    }
+
+    // Read the whole file in one pass.
+    const std::string contents((std::istreambuf_iterator<char>(prompt_stream)),
+                               std::istreambuf_iterator<char>());
+    prompt_stream.close();
+
+    if (contents.empty()) {
+      spdlog::error("LlamaGenerator: Brewery system prompt file '{}' is empty",
+                    prompt_file_path.string());
+      throw std::runtime_error(
+          "LlamaGenerator: empty brewery system prompt file: " +
+          prompt_file_path.string());
+    }
+
+    spdlog::info(
+        "LlamaGenerator: Loaded brewery system prompt from '{}' ({} chars)",
+        prompt_file_path.string(), contents.length());
+    brewery_system_prompt_ = contents;
+  }
+
+  return brewery_system_prompt_;
+}