Update documentation

2026-07-16 17:47:22 +00:00 · 2026-04-08 22:24:23 -04:00
parent 7807f0bc2a
commit b31be494d7
28 changed files with 487 additions and 93 deletions
--- a/pipeline/includes/data_generation/data_generator.h
+++ b/pipeline/includes/data_generation/data_generator.h
@@ -1,28 +1,68 @@
 #ifndef BIERGARTEN_PIPELINE_DATA_GENERATION_DATA_GENERATOR_H_
 #define BIERGARTEN_PIPELINE_DATA_GENERATION_DATA_GENERATOR_H_

+/**
+ * @file data_generation/data_generator.h
+ * @brief Shared generator interfaces and result models.
+ */
+
 #include <string>

+/**
+ * @brief Name/description pair returned by DataGenerator::GenerateBrewery.
+ */
 struct BreweryResult {
+   /// @brief Brewery display name.
   std::string name;
+
+   /// @brief Brewery description text.
   std::string description;
 };

+/**
+ * @brief Username/bio pair returned by DataGenerator::GenerateUser.
+ */
 struct UserResult {
+   /// @brief Username handle.
   std::string username;
+
+   /// @brief Short user biography.
   std::string bio;
 };

+/**
+ * @brief Abstract interface that concrete data generators implement.
+ */
 class DataGenerator {
  public:
+   /// @brief Virtual destructor; allows deletion through a base pointer.
   virtual ~DataGenerator() = default;

+   /**
+    * @brief Loads and initializes generator resources.
+    *
+    * @param model_path Path to model assets. Implementations may ignore this.
+    */
   virtual void Load(const std::string& model_path) = 0;

+   /**
+    * @brief Generates brewery data for a location.
+    *
+    * @param city_name City name.
+    * @param country_name Country name.
+    * @param region_context Additional regional context text.
+    * @return Brewery generation result (name and description).
+    */
   virtual BreweryResult GenerateBrewery(const std::string& city_name,
                                         const std::string& country_name,
                                         const std::string& region_context) = 0;

+   /**
+    * @brief Generates a user profile for a locale.
+    *
+    * @param locale Locale hint used by generator.
+    * @return User generation result (username and bio).
+    */
   virtual UserResult GenerateUser(const std::string& locale) = 0;
 };

--- a/pipeline/includes/data_generation/llama_generator.h
+++ b/pipeline/includes/data_generation/llama_generator.h
@@ -1,6 +1,11 @@
 #ifndef BIERGARTEN_PIPELINE_DATA_GENERATION_LLAMA_GENERATOR_H_
 #define BIERGARTEN_PIPELINE_DATA_GENERATION_LLAMA_GENERATOR_H_

+/**
+ * @file data_generation/llama_generator.h
+ * @brief Llama.cpp-backed implementation of DataGenerator.
+ */
+
 #include <cstdint>
 #include <string>

@@ -9,34 +14,107 @@
 struct llama_model;
 struct llama_context;

+/**
+ * @brief Data generator implementation backed by llama.cpp.
+ */
 class LlamaGenerator final : public DataGenerator {
  public:
+   /// @brief Constructs a generator with default sampling and context settings.
   LlamaGenerator() = default;
+
+   /// @brief Releases model/context resources.
   ~LlamaGenerator() override;

+   /**
+    * @brief Configures sampling parameters for generation.
+    *
+    * @param temperature Sampling temperature.
+    * @param top_p Nucleus sampling threshold.
+    * @param seed Seed for sampling; -1 (the default) requests a random seed.
+    */
   void SetSamplingOptions(float temperature, float top_p, int seed = -1);

+   /**
+    * @brief Sets context window size used during model load.
+    *
+    * @param n_ctx Context size in tokens.
+    */
   void SetContextSize(uint32_t n_ctx);

+   /**
+    * @brief Loads model and prepares inference context.
+    *
+    * @param model_path Filesystem path to GGUF model.
+    */
   void Load(const std::string& model_path) override;
+
+   /**
+    * @brief Generates brewery data for a specific location.
+    *
+    * @param city_name City name.
+    * @param country_name Country name.
+    * @param region_context Additional regional context.
+    * @return Generated brewery result.
+    */
   BreweryResult GenerateBrewery(const std::string& city_name,
                                 const std::string& country_name,
                                 const std::string& region_context) override;
+
+   /**
+    * @brief Generates a user profile for the provided locale.
+    *
+    * @param locale Locale hint.
+    * @return Generated user profile.
+    */
   UserResult GenerateUser(const std::string& locale) override;

  private:
+   /**
+    * @brief Infers text from a user prompt.
+    *
+    * @param prompt User prompt.
+    * @param max_tokens Maximum tokens to generate.
+    * @return Generated text.
+    */
   std::string Infer(const std::string& prompt, int max_tokens = 10000);
-   // Overload that allows passing a system message separately so chat-capable
-   // models receive a proper system role instead of having the system text
-   // concatenated into the user prompt (helps avoid revealing internal
-   // reasoning or instructions in model output).
+
+   /**
+    * @brief Infers text from separate system and user prompts.
+    *
+    * Keeping the system text in its own role, rather than concatenating it into
+    * the user prompt, helps avoid leaking instructions into model output.
+    *
+    * @param system_prompt System role prompt.
+    * @param prompt User prompt.
+    * @param max_tokens Maximum tokens to generate.
+    * @return Generated text.
+    */
   std::string Infer(const std::string& system_prompt,
                     const std::string& prompt, int max_tokens = 10000);

+   /**
+    * @brief Runs inference on an already-formatted prompt.
+    *
+    * @param formatted_prompt Prompt preformatted for model chat template.
+    * @param max_tokens Maximum tokens to generate.
+    * @return Generated text.
+    */
   std::string InferFormatted(const std::string& formatted_prompt,
                              int max_tokens = 10000);

+   /**
+    * @brief Loads the brewery system prompt from disk.
+    *
+    * @param prompt_file_path Prompt file path to try first.
+    * @return Loaded prompt text or fallback prompt.
+    */
   std::string LoadBrewerySystemPrompt(const std::string& prompt_file_path);
+
+   /**
+    * @brief Returns a built-in fallback system prompt.
+    *
+    * @return Fallback prompt text.
+    */
   std::string GetFallbackBreweryPrompt();

   llama_model* model_ = nullptr;
--- a/pipeline/includes/data_generation/llama_generator_helpers.h
+++ b/pipeline/includes/data_generation/llama_generator_helpers.h
@@ -1,6 +1,11 @@
 #ifndef BIERGARTEN_PIPELINE_DATA_GENERATION_LLAMA_GENERATOR_HELPERS_H_
 #define BIERGARTEN_PIPELINE_DATA_GENERATION_LLAMA_GENERATOR_HELPERS_H_

+/**
+ * @file data_generation/llama_generator_helpers.h
+ * @brief Shared helper APIs used by LlamaGenerator translation units.
+ */
+
 #include <string>
 #include <utility>

@@ -8,23 +13,66 @@ struct llama_model;
 struct llama_vocab;
 typedef int llama_token;

-// Helper functions for LlamaGenerator methods
+/**
+ * @brief Normalizes and truncates regional context.
+ * @note Review: relies on <string_view>/<cstddef> arriving transitively; confirm.
+ * @param region_context Input regional context text.
+ * @param max_chars Maximum output length.
+ * @return Processed region context.
+ */
 std::string PrepareRegionContextPublic(std::string_view region_context,
                                       std::size_t max_chars = 700);

+/**
+ * @brief Parses a response expected to contain two logical lines.
+ *
+ * @param raw Raw model output.
+ * @param error_message Error message thrown on parse failure.
+ * @return Pair containing first and second parsed fields.
+ */
 std::pair<std::string, std::string> ParseTwoLineResponsePublic(
    const std::string& raw, const std::string& error_message);

+/**
+ * @brief Applies model chat template to a user-only prompt.
+ *
+ * @param model Loaded llama model.
+ * @param user_prompt User prompt text.
+ * @return Model-formatted prompt.
+ */
 std::string ToChatPromptPublic(const llama_model* model,
                               const std::string& user_prompt);

+/**
+ * @brief Applies model chat template to system and user prompts.
+ *
+ * @param model Loaded llama model.
+ * @param system_prompt System prompt text.
+ * @param user_prompt User prompt text.
+ * @return Model-formatted prompt.
+ */
 std::string ToChatPromptPublic(const llama_model* model,
                               const std::string& system_prompt,
                               const std::string& user_prompt);

+/**
+ * @brief Decodes a sampled token and appends it to output text.
+ *
+ * @param vocab Model vocabulary.
+ * @param token Sampled token id.
+ * @param[in,out] output Text buffer the decoded piece is appended to.
+ */
 void AppendTokenPiecePublic(const llama_vocab* vocab, llama_token token,
                            std::string& output);

+/**
+ * @brief Validates and parses brewery JSON output.
+ *
+ * @param raw Raw model output.
+ * @param[out] name_out Receives the parsed brewery name.
+ * @param[out] description_out Receives the parsed brewery description.
+ * @return Empty string on success, or validation error message.
+ */
 std::string ValidateBreweryJsonPublic(const std::string& raw,
                                       std::string& name_out,
                                       std::string& description_out);
--- a/pipeline/includes/data_generation/mock_generator.h
+++ b/pipeline/includes/data_generation/mock_generator.h
@@ -1,20 +1,56 @@
 #ifndef BIERGARTEN_PIPELINE_DATA_GENERATION_MOCK_GENERATOR_H_
 #define BIERGARTEN_PIPELINE_DATA_GENERATION_MOCK_GENERATOR_H_

+/**
+ * @file data_generation/mock_generator.h
+ * @brief Deterministic mock implementation of DataGenerator.
+ */
+
 #include <string>
 #include <vector>

 #include "data_generation/data_generator.h"

+/**
+ * @brief Mock generator used for deterministic, model-free outputs.
+ */
 class MockGenerator final : public DataGenerator {
  public:
+   /**
+    * @brief Initializes the mock generator.
+    *
+    * @param model_path Unused for mock generation.
+    */
   void Load(const std::string& model_path) override;
+
+   /**
+    * @brief Generates deterministic brewery data for a location.
+    *
+    * @param city_name City name.
+    * @param country_name Country name.
+    * @param region_context Unused for mock generation.
+    * @return Generated brewery result.
+    */
   BreweryResult GenerateBrewery(const std::string& city_name,
                                 const std::string& country_name,
                                 const std::string& region_context) override;
+
+   /**
+    * @brief Generates deterministic user data for a locale.
+    *
+    * @param locale Locale hint.
+    * @return Generated user result.
+    */
   UserResult GenerateUser(const std::string& locale) override;

  private:
+   /**
+    * @brief Combines two strings into a stable hash value.
+    *
+    * @param a First key.
+    * @param b Second key.
+    * @return Deterministic hash value.
+    */
   static std::size_t DeterministicHash(const std::string& a,
                                        const std::string& b);