Refactor Llama generator, helpers, and build assets

Make Gemma 4 the default model, enable thinking mode, and apply style updates.
2026-06-01 01:54:00 +00:00 · 2026-04-10 00:03:45 -04:00
parent 7ca651a886
commit 56ec728ba7
61 changed files with 1430 additions and 1905 deletions
--- a/pipeline/includes/data_generation/llama_generator.h
+++ b/pipeline/includes/data_generation/llama_generator.h
@@ -1,5 +1,5 @@
-#ifndef BIERGARTEN_PIPELINE_DATA_GENERATION_LLAMA_GENERATOR_H_
-#define BIERGARTEN_PIPELINE_DATA_GENERATION_LLAMA_GENERATOR_H_
+#ifndef BIERGARTEN_PIPELINE_INCLUDES_DATA_GENERATION_LLAMA_GENERATOR_H_
+#define BIERGARTEN_PIPELINE_INCLUDES_DATA_GENERATION_LLAMA_GENERATOR_H_

 /**
 * @file data_generation/llama_generator.h
@@ -9,115 +9,123 @@
 #include <cstdint>
 #include <random>
 #include <string>
+#include <string_view>

 #include "data_generation/data_generator.h"
-
-struct ApplicationOptions;
+#include "data_model/application_options.h"

 struct llama_model;
 struct llama_context;
+struct llama_sampler;

 /**
 * @brief Data generator implementation backed by llama.cpp.
 */
 class LlamaGenerator final : public DataGenerator {
-  public:
-   /**
-    * @brief Constructs a generator using parsed application options and loads
-    * the configured model immediately.
-    *
-    * @param options Parsed application options.
-    * @param model_path Filesystem path to GGUF model assets.
-    */
-   LlamaGenerator(const ApplicationOptions& options,
-                  const std::string& model_path);
+ public:
+  /**
+   * @brief Constructs a generator using parsed application options and loads
+   * the configured model immediately.
+   *
+   * @param options Parsed application options.
+   * @param model_path Filesystem path to GGUF model assets.
+   */
+  LlamaGenerator(const ApplicationOptions& options,
+                 const std::string& model_path);

-   /// @brief Releases model/context resources.
-   ~LlamaGenerator() override;
+  /// @brief Releases model/context resources.
+  ~LlamaGenerator() override;

-   /**
-    * @brief Generates brewery data for a specific location.
-    *
-    * @param city_name City name.
-    * @param country_name Country name.
-    * @param region_context Additional regional context.
-    * @return Generated brewery result.
-    */
-   BreweryResult GenerateBrewery(const std::string& city_name,
-                                 const std::string& country_name,
-                                 const std::string& region_context) override;
+  LlamaGenerator(const LlamaGenerator&) = delete;
+  LlamaGenerator& operator=(const LlamaGenerator&) = delete;
+  LlamaGenerator(LlamaGenerator&&) = delete;
+  LlamaGenerator& operator=(LlamaGenerator&&) = delete;

-   /**
-    * @brief Generates a user profile for the provided locale.
-    *
-    * @param locale Locale hint.
-    * @return Generated user profile.
-    */
-   UserResult GenerateUser(const std::string& locale) override;
+  /**
+   * @brief Generates brewery data for a specific location.
+   *
+   * @param location Location to generate brewery data for.
+   * @param region_context Additional regional context.
+   * @return Generated brewery result.
+   */
+  BreweryResult GenerateBrewery(const Location& location,
+                                const std::string& region_context) override;

-  private:
-   /**
-    * @brief Loads model and prepares inference context.
-    *
-    * @param model_path Filesystem path to GGUF model.
-    */
-   void Load(const std::string& model_path);
+  /**
+   * @brief Generates a user profile for the provided locale.
+   *
+   * @param locale Locale hint.
+   * @return Generated user profile.
+   */
+  UserResult GenerateUser(const std::string& locale) override;

-   /**
-    * @brief Infers text from a user prompt.
-    *
-    * @param prompt User prompt.
-    * @param max_tokens Maximum tokens to generate.
-    * @return Generated text.
-    */
-   std::string Infer(const std::string& prompt, int max_tokens = 10000);
+ private:
+  static constexpr int kDefaultMaxTokens = 10000;
+  static constexpr float kDefaultSamplingTopP = 0.95F;
+  static constexpr uint32_t kDefaultSamplingTopK = 64;
+  static constexpr uint32_t kDefaultContextSize = 8192;

-   /**
-    * @brief Infers text from separate system and user prompts.
-    *
-    * This helps chat-capable models preserve system-role behavior instead of
-    * concatenating system text into user input.
-    *
-    * @param system_prompt System role prompt.
-    * @param prompt User prompt.
-    * @param max_tokens Maximum tokens to generate.
-    * @return Generated text.
-    */
-   std::string Infer(const std::string& system_prompt,
-                     const std::string& prompt, int max_tokens = 10000);
+  struct SamplerState {
+    SamplerState() = default;
+    ~SamplerState();

-   /**
-    * @brief Runs inference on an already-formatted prompt.
-    *
-    * @param formatted_prompt Prompt preformatted for model chat template.
-    * @param max_tokens Maximum tokens to generate.
-    * @return Generated text.
-    */
-   std::string InferFormatted(const std::string& formatted_prompt,
-                              int max_tokens = 10000);
+    SamplerState(const SamplerState&) = delete;
+    SamplerState& operator=(const SamplerState&) = delete;
+    SamplerState(SamplerState&&) = delete;
+    SamplerState& operator=(SamplerState&&) = delete;

-   /**
-    * @brief Loads the brewery system prompt from disk.
-    *
-    * @param prompt_file_path Prompt file path to try first.
-    * @return Loaded prompt text or fallback prompt.
-    */
-   std::string LoadBrewerySystemPrompt(const std::string& prompt_file_path);
+    llama_sampler* chain = nullptr;
+  };

-   /**
-    * @brief Returns a built-in fallback system prompt.
-    *
-    * @return Fallback prompt text.
-    */
-   std::string GetFallbackBreweryPrompt();
+  /**
+   * @brief Loads model and prepares inference context.
+   *
+   * @param model_path Filesystem path to GGUF model.
+   */
+  void Load(const std::string& model_path);

-   llama_model* model_ = nullptr;
-   llama_context* context_ = nullptr;
-   float sampling_temperature_ = 0.8f;
-   float sampling_top_p_ = 0.92f;
-   std::mt19937 rng_;
-   uint32_t n_ctx_ = 8192;
-   std::string brewery_system_prompt_;
+  /**
+   * @brief Infers text from separate system and user prompts.
+   *
+   * This helps chat-capable models preserve system-role behavior instead of
+   * concatenating system text into user input.
+   *
+   * @param system_prompt System role prompt.
+   * @param prompt User prompt.
+   * @param max_tokens Maximum tokens to generate.
+   * @return Generated text.
+   */
+  std::string Infer(const std::string& system_prompt, const std::string& prompt,
+                    int max_tokens = kDefaultMaxTokens);
+
+  /**
+   * @brief Runs inference on an already-formatted prompt.
+   *
+   * @param formatted_prompt Prompt preformatted for model chat template.
+   * @param max_tokens Maximum tokens to generate.
+   * @return Generated text.
+   */
+  std::string InferFormatted(const std::string& formatted_prompt,
+                             int max_tokens = kDefaultMaxTokens);
+
+  /**
+   * @brief Loads the brewery system prompt from disk.
+   *
+   * @param prompt_file_path Prompt file path to try first.
+   * @return Loaded prompt text.
+   */
+  std::string LoadBrewerySystemPrompt(const std::string& prompt_file_path);
+
+  llama_model* model_ = nullptr;
+  llama_context* context_ = nullptr;
+  /// @brief Persistent sampler chain reused across inference calls.
+  std::unique_ptr<SamplerState> sampler_;
+  float sampling_temperature_ = 1.0F;
+  float sampling_top_p_ = kDefaultSamplingTopP;
+  uint32_t sampling_top_k_ = kDefaultSamplingTopK;
+  std::mt19937 rng_;
+  uint32_t n_ctx_ = kDefaultContextSize;
+  std::string brewery_system_prompt_;
 };

-#endif  // BIERGARTEN_PIPELINE_DATA_GENERATION_LLAMA_GENERATOR_H_
+#endif  // BIERGARTEN_PIPELINE_INCLUDES_DATA_GENERATION_LLAMA_GENERATOR_H_