Update documentation

2026-07-16 09:37:23 +00:00 · 2026-04-08 22:24:23 -04:00
parent 7807f0bc2a
commit b31be494d7
28 changed files with 487 additions and 93 deletions
--- a/pipeline/README.md
+++ b/pipeline/README.md
@@ -6,7 +6,7 @@ A C++23 tool for processing geographic data and generating brewery metadata. It

 The pipeline runs in four stages:

- **Query**: Loads and samples from a local `locations.json` manifest.
+- **Query**: Loads and samples from a local `locations.json` file.
 - **Enrich**: Fetches regional and cultural context from Wikipedia in parallel using `std::async`.
 - **Generate**: Creates authentic brewery names and descriptions using a local GGUF model or a deterministic mock.
 - **Log**: Outputs results and metadata summaries via spdlog.
@@ -26,7 +26,9 @@ The pipeline runs in four stages:

 ## Hardware & GPU Config

-### Test Machine
+### Test Machines
+
+#### x86-64 Linux, NVIDIA RTX 2000

 - **Host**: ThinkPad P1 Gen 7 (Fedora 43)
 - **CPU**: Intel Core Ultra 7 155H
@@ -35,6 +37,15 @@ The pipeline runs in four stages:
 - **Model**: Qwen3-8B-Q6-K
 - **Inference**: llama.cpp with CUDA 12.x support

+#### ARM macOS, M1 Pro
+
+- **Host**: MacBook Pro 14" (2021)
+- **CPU**: Apple M1 Pro (8-core)
+- **GPU**: Apple M1 Pro (14-core) [Integrated]
+- **Memory**: 16GB
+- **Model**: Qwen3-8B-Q6-K
+- **Inference**: llama.cpp with Metal support
+
 ### GPU Build Flags

 ```bash
@@ -42,6 +53,11 @@ cmake -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=89 ..
 cmake --build . --config Release
 ```

+```zsh
+cmake ..
+cmake --build .
+```
+
 ## Core Components

 | Component               | Function                                                          |
--- a/pipeline/includes/biergarten_data_generator.h
+++ b/pipeline/includes/biergarten_data_generator.h
@@ -1,6 +1,11 @@
 #ifndef BIERGARTEN_PIPELINE_BIERGARTEN_DATA_GENERATOR_H_
 #define BIERGARTEN_PIPELINE_BIERGARTEN_DATA_GENERATOR_H_

+/**
+ * @file biergarten_data_generator.h
+ * @brief Core orchestration class for pipeline data generation.
+ */
+
 #include <memory>
 #include <string>
 #include <vector>
--- a/pipeline/includes/data_generation/data_generator.h
+++ b/pipeline/includes/data_generation/data_generator.h
@@ -1,28 +1,68 @@
 #ifndef BIERGARTEN_PIPELINE_DATA_GENERATION_DATA_GENERATOR_H_
 #define BIERGARTEN_PIPELINE_DATA_GENERATION_DATA_GENERATOR_H_

+/**
+ * @file data_generation/data_generator.h
+ * @brief Shared generator interfaces and result models.
+ */
+
 #include <string>

+/**
+ * @brief Generated brewery payload.
+ */
 struct BreweryResult {
+   /// @brief Brewery display name.
   std::string name;
+
+   /// @brief Brewery description text.
   std::string description;
 };

+/**
+ * @brief Generated user profile payload.
+ */
 struct UserResult {
+   /// @brief Username handle.
   std::string username;
+
+   /// @brief Short user biography.
   std::string bio;
 };

+/**
+ * @brief Interface for data generator implementations.
+ */
 class DataGenerator {
  public:
+   /// @brief Virtual destructor for polymorphic cleanup.
   virtual ~DataGenerator() = default;

+   /**
+    * @brief Loads and initializes generator resources.
+    *
+    * @param model_path Path to model assets. Implementations may ignore this.
+    */
   virtual void Load(const std::string& model_path) = 0;

+   /**
+    * @brief Generates brewery data for a location.
+    *
+    * @param city_name City name.
+    * @param country_name Country name.
+    * @param region_context Additional regional context text.
+    * @return Brewery generation result.
+    */
   virtual BreweryResult GenerateBrewery(const std::string& city_name,
                                         const std::string& country_name,
                                         const std::string& region_context) = 0;

+   /**
+    * @brief Generates a user profile for a locale.
+    *
+    * @param locale Locale hint used by generator.
+    * @return User generation result.
+    */
   virtual UserResult GenerateUser(const std::string& locale) = 0;
 };

--- a/pipeline/includes/data_generation/llama_generator.h
+++ b/pipeline/includes/data_generation/llama_generator.h
@@ -1,6 +1,11 @@
 #ifndef BIERGARTEN_PIPELINE_DATA_GENERATION_LLAMA_GENERATOR_H_
 #define BIERGARTEN_PIPELINE_DATA_GENERATION_LLAMA_GENERATOR_H_

+/**
+ * @file data_generation/llama_generator.h
+ * @brief Llama.cpp-backed implementation of DataGenerator.
+ */
+
 #include <cstdint>
 #include <string>

@@ -9,34 +14,107 @@
 struct llama_model;
 struct llama_context;

+/**
+ * @brief Data generator implementation backed by llama.cpp.
+ */
 class LlamaGenerator final : public DataGenerator {
  public:
+   /// @brief Constructs a generator with default sampling and context settings.
   LlamaGenerator() = default;
+
+   /// @brief Releases model/context resources.
   ~LlamaGenerator() override;

+   /**
+    * @brief Configures sampling parameters for generation.
+    *
+    * @param temperature Sampling temperature.
+    * @param top_p Nucleus sampling threshold.
+    * @param seed Seed for sampling; use -1 for random seed.
+    */
   void SetSamplingOptions(float temperature, float top_p, int seed = -1);

+   /**
+    * @brief Sets context window size used during model load.
+    *
+    * @param n_ctx Context size in tokens.
+    */
   void SetContextSize(uint32_t n_ctx);

+   /**
+    * @brief Loads model and prepares inference context.
+    *
+    * @param model_path Filesystem path to GGUF model.
+    */
   void Load(const std::string& model_path) override;
+
+   /**
+    * @brief Generates brewery data for a specific location.
+    *
+    * @param city_name City name.
+    * @param country_name Country name.
+    * @param region_context Additional regional context.
+    * @return Generated brewery result.
+    */
   BreweryResult GenerateBrewery(const std::string& city_name,
                                 const std::string& country_name,
                                 const std::string& region_context) override;
+
+   /**
+    * @brief Generates a user profile for the provided locale.
+    *
+    * @param locale Locale hint.
+    * @return Generated user profile.
+    */
   UserResult GenerateUser(const std::string& locale) override;

  private:
+   /**
+    * @brief Infers text from a user prompt.
+    *
+    * @param prompt User prompt.
+    * @param max_tokens Maximum tokens to generate.
+    * @return Generated text.
+    */
   std::string Infer(const std::string& prompt, int max_tokens = 10000);
-   // Overload that allows passing a system message separately so chat-capable
-   // models receive a proper system role instead of having the system text
-   // concatenated into the user prompt (helps avoid revealing internal
-   // reasoning or instructions in model output).
+
+   /**
+    * @brief Infers text from separate system and user prompts.
+    *
+    * A separate system role (not concatenated into the user prompt) helps
+    * chat-capable models avoid leaking instructions or reasoning in output.
+    *
+    * @param system_prompt System role prompt.
+    * @param prompt User prompt.
+    * @param max_tokens Maximum tokens to generate.
+    * @return Generated text.
+    */
   std::string Infer(const std::string& system_prompt,
                     const std::string& prompt, int max_tokens = 10000);

+   /**
+    * @brief Runs inference on an already-formatted prompt.
+    *
+    * @param formatted_prompt Prompt preformatted for model chat template.
+    * @param max_tokens Maximum tokens to generate.
+    * @return Generated text.
+    */
   std::string InferFormatted(const std::string& formatted_prompt,
                              int max_tokens = 10000);

+   /**
+    * @brief Loads the brewery system prompt from disk.
+    *
+    * @param prompt_file_path Prompt file path to try first.
+    * @return Loaded prompt text or fallback prompt.
+    */
   std::string LoadBrewerySystemPrompt(const std::string& prompt_file_path);
+
+   /**
+    * @brief Returns a built-in fallback system prompt.
+    *
+    * @return Fallback prompt text.
+    */
   std::string GetFallbackBreweryPrompt();

   llama_model* model_ = nullptr;
--- a/pipeline/includes/data_generation/llama_generator_helpers.h
+++ b/pipeline/includes/data_generation/llama_generator_helpers.h
@@ -1,6 +1,11 @@
 #ifndef BIERGARTEN_PIPELINE_DATA_GENERATION_LLAMA_GENERATOR_HELPERS_H_
 #define BIERGARTEN_PIPELINE_DATA_GENERATION_LLAMA_GENERATOR_HELPERS_H_

+/**
+ * @file data_generation/llama_generator_helpers.h
+ * @brief Shared helper APIs used by LlamaGenerator translation units.
+ */
+
 #include <string>
 #include <utility>

@@ -8,23 +13,66 @@ struct llama_model;
 struct llama_vocab;
 typedef int llama_token;

-// Helper functions for LlamaGenerator methods
+/**
+ * @brief Normalizes and truncates regional context.
+ *
+ * @param region_context Input regional context text.
+ * @param max_chars Maximum output length.
+ * @return Processed region context.
+ */
 std::string PrepareRegionContextPublic(std::string_view region_context,
                                       std::size_t max_chars = 700);

+/**
+ * @brief Parses a response expected to contain two logical lines.
+ *
+ * @param raw Raw model output.
+ * @param error_message Error message thrown on parse failure.
+ * @return Pair containing first and second parsed fields.
+ */
 std::pair<std::string, std::string> ParseTwoLineResponsePublic(
    const std::string& raw, const std::string& error_message);

+/**
+ * @brief Applies model chat template to a user-only prompt.
+ *
+ * @param model Loaded llama model.
+ * @param user_prompt User prompt text.
+ * @return Model-formatted prompt.
+ */
 std::string ToChatPromptPublic(const llama_model* model,
                               const std::string& user_prompt);

+/**
+ * @brief Applies model chat template to system and user prompts.
+ *
+ * @param model Loaded llama model.
+ * @param system_prompt System prompt text.
+ * @param user_prompt User prompt text.
+ * @return Model-formatted prompt.
+ */
 std::string ToChatPromptPublic(const llama_model* model,
                               const std::string& system_prompt,
                               const std::string& user_prompt);

+/**
+ * @brief Decodes a sampled token and appends it to output text.
+ *
+ * @param vocab Model vocabulary.
+ * @param token Sampled token id.
+ * @param output Output text buffer.
+ */
 void AppendTokenPiecePublic(const llama_vocab* vocab, llama_token token,
                            std::string& output);

+/**
+ * @brief Validates and parses brewery JSON output.
+ *
+ * @param raw Raw model output.
+ * @param name_out Parsed brewery name.
+ * @param description_out Parsed brewery description.
+ * @return Empty string on success, or validation error message.
+ */
 std::string ValidateBreweryJsonPublic(const std::string& raw,
                                      std::string& name_out,
                                      std::string& description_out);
--- a/pipeline/includes/data_generation/mock_generator.h
+++ b/pipeline/includes/data_generation/mock_generator.h
@@ -1,20 +1,56 @@
 #ifndef BIERGARTEN_PIPELINE_DATA_GENERATION_MOCK_GENERATOR_H_
 #define BIERGARTEN_PIPELINE_DATA_GENERATION_MOCK_GENERATOR_H_

+/**
+ * @file data_generation/mock_generator.h
+ * @brief Deterministic mock implementation of DataGenerator.
+ */
+
 #include <string>
 #include <vector>

 #include "data_generation/data_generator.h"

+/**
+ * @brief Mock generator used for deterministic, model-free outputs.
+ */
 class MockGenerator final : public DataGenerator {
  public:
+   /**
+    * @brief Initializes the mock generator.
+    *
+    * @param model_path Unused for mock generation.
+    */
   void Load(const std::string& model_path) override;
+
+   /**
+    * @brief Generates deterministic brewery data for a location.
+    *
+    * @param city_name City name.
+    * @param country_name Country name.
+    * @param region_context Unused for mock generation.
+    * @return Generated brewery result.
+    */
   BreweryResult GenerateBrewery(const std::string& city_name,
                                 const std::string& country_name,
                                 const std::string& region_context) override;
+
+   /**
+    * @brief Generates deterministic user data for a locale.
+    *
+    * @param locale Locale hint.
+    * @return Generated user result.
+    */
   UserResult GenerateUser(const std::string& locale) override;

  private:
+   /**
+    * @brief Combines two strings into a stable hash value.
+    *
+    * @param a First key.
+    * @param b Second key.
+    * @return Deterministic hash value.
+    */
   static std::size_t DeterministicHash(const std::string& a,
                                        const std::string& b);

--- a/pipeline/includes/data_model/location.h
+++ b/pipeline/includes/data_model/location.h
@@ -1,15 +1,36 @@
 #ifndef BIERGARTEN_PIPELINE_MODELS_LOCATION_H_
 #define BIERGARTEN_PIPELINE_MODELS_LOCATION_H_

+/**
+ * @file data_model/location.h
+ * @brief Location data model used throughout generation pipeline.
+ */
+
 #include <string>

+/**
+ * @brief Canonical location record for city-level generation.
+ */
 struct Location {
+   /// @brief City name.
   std::string city;
+
+   /// @brief State or province name.
   std::string state_province;
+
+   /// @brief ISO 3166-2 subdivision code.
   std::string iso3166_2;
+
+   /// @brief Country name.
   std::string country;
+
+   /// @brief ISO 3166-1 country code.
   std::string iso3166_1;
+
+   /// @brief Latitude in decimal degrees.
   double latitude;
+
+   /// @brief Longitude in decimal degrees.
   double longitude;
 };

--- a/pipeline/includes/json_handling/json_loader.h
+++ b/pipeline/includes/json_handling/json_loader.h
@@ -1,6 +1,11 @@
 #ifndef BIERGARTEN_PIPELINE_JSON_HANDLING_JSON_LOADER_H_
 #define BIERGARTEN_PIPELINE_JSON_HANDLING_JSON_LOADER_H_

+/**
+ * @file json_handling/json_loader.h
+ * @brief Loader API for curated location data.
+ */
+
 #include <string>
 #include <vector>

--- a/pipeline/includes/web_client/curl_web_client.h
+++ b/pipeline/includes/web_client/curl_web_client.h
@@ -1,29 +1,70 @@
 #ifndef BIERGARTEN_PIPELINE_WEB_CLIENT_CURL_WEB_CLIENT_H_
 #define BIERGARTEN_PIPELINE_WEB_CLIENT_CURL_WEB_CLIENT_H_

+/**
+ * @file web_client/curl_web_client.h
+ * @brief libcurl-based WebClient implementation.
+ */
+
 #include <memory>

 #include "web_client/web_client.h"

-// RAII for curl_global_init/cleanup.
-// An instance of this class should be created in main() before any curl
-// operations and exist for the lifetime of the application.
+/**
+ * @brief RAII wrapper for curl_global_init and curl_global_cleanup.
+ *
+ * Create one instance at application startup, before any libcurl use, and keep
+ * it alive for the lifetime of the application.
+ */
 class CurlGlobalState {
  public:
+   /// @brief Initializes global libcurl state.
   CurlGlobalState();
+
+   /// @brief Cleans up global libcurl state.
   ~CurlGlobalState();
+
+   /// @brief Deleted copy constructor; instances cannot be copied.
   CurlGlobalState(const CurlGlobalState&) = delete;
+
+   /// @brief Deleted copy assignment; instances cannot be copy-assigned.
   CurlGlobalState& operator=(const CurlGlobalState&) = delete;
 };

+/**
+ * @brief WebClient implementation backed by libcurl.
+ */
 class CURLWebClient : public WebClient {
  public:
+   /// @brief Constructs a CURL web client.
   CURLWebClient();
+
+   /// @brief Destroys the CURL web client.
   ~CURLWebClient() override;

+   /**
+    * @brief Downloads URL contents to a file.
+    *
+    * @param url Source URL.
+    * @param file_path Destination file path.
+    */
   void DownloadToFile(const std::string& url,
                       const std::string& file_path) override;
+
+   /**
+    * @brief Executes an HTTP GET request.
+    *
+    * @param url Request URL.
+    * @return Response body.
+    */
   std::string Get(const std::string& url) override;
+
+   /**
+    * @brief URL-encodes a string value.
+    *
+    * @param value Raw value.
+    * @return URL-encoded string.
+    */
   std::string UrlEncode(const std::string& value) override;
 };

--- a/pipeline/includes/web_client/web_client.h
+++ b/pipeline/includes/web_client/web_client.h
@@ -1,21 +1,44 @@
 #ifndef BIERGARTEN_PIPELINE_WEB_CLIENT_WEB_CLIENT_H_
 #define BIERGARTEN_PIPELINE_WEB_CLIENT_WEB_CLIENT_H_

+/**
+ * @file web_client/web_client.h
+ * @brief Abstract interface for HTTP and URL utilities.
+ */
+
 #include <string>

+/**
+ * @brief Abstract web client interface.
+ */
 class WebClient {
  public:
+   /// @brief Virtual destructor for polymorphic cleanup.
   virtual ~WebClient() = default;

-   // Downloads content from a URL to a file. Throws on error.
+   /**
+    * @brief Downloads content from a URL into a file. Throws on error.
+    *
+    * @param url Source URL.
+    * @param file_path Destination file path.
+    */
   virtual void DownloadToFile(const std::string& url,
                               const std::string& file_path) = 0;

-   // Performs a GET request and returns the response body as a string. Throws
-   // on error.
+   /**
+    * @brief Executes an HTTP GET request. Throws on error.
+    *
+    * @param url Request URL.
+    * @return Response body.
+    */
   virtual std::string Get(const std::string& url) = 0;

-   // URL-encodes a string.
+   /**
+    * @brief URL-encodes a string value.
+    *
+    * @param value Raw string value.
+    * @return Encoded value safe for URL usage.
+    */
   virtual std::string UrlEncode(const std::string& value) = 0;
 };

--- a/pipeline/includes/wikipedia/wikipedia_service.h
+++ b/pipeline/includes/wikipedia/wikipedia_service.h
@@ -1,6 +1,11 @@
 #ifndef BIERGARTEN_PIPELINE_WIKIPEDIA_WIKIPEDIA_SERVICE_H_
 #define BIERGARTEN_PIPELINE_WIKIPEDIA_WIKIPEDIA_SERVICE_H_

+/**
+ * @file wikipedia/wikipedia_service.h
+ * @brief Wikipedia summary retrieval service with in-memory caching.
+ */
+
 #include <memory>
 #include <string>
 #include <string_view>
--- a/pipeline/src/biergarten_data_generator.cpp
+++ b/pipeline/src/biergarten_data_generator.cpp
@@ -1,3 +1,9 @@
+/**
+ * @file biergarten_data_generator.cpp
+ * @brief Orchestrates end-to-end pipeline execution for city sampling,
+ * Wikipedia enrichment, generator initialization, and brewery result output.
+ */
+
 #include "biergarten_data_generator.h"

 #include <spdlog/spdlog.h>
@@ -14,11 +20,11 @@
 #include "wikipedia/wikipedia_service.h"

 BiergartenDataGenerator::BiergartenDataGenerator(
-   const ApplicationOptions& options, std::shared_ptr<WebClient> web_client)
-   : options_(options), webClient_(std::move(web_client)) {}
+    const ApplicationOptions& options, std::shared_ptr<WebClient> web_client)
+    : options_(options), webClient_(std::move(web_client)) {}

 auto BiergartenDataGenerator::InitializeGenerator()
-   -> std::unique_ptr<DataGenerator> {
+    -> std::unique_ptr<DataGenerator> {
   spdlog::info("Initializing brewery generator...");

   std::unique_ptr<DataGenerator> generator;
@@ -43,7 +49,7 @@ auto BiergartenDataGenerator::InitializeGenerator()
 }

 auto BiergartenDataGenerator::QueryCitiesWithCountries()
-   -> std::vector<Location> {
+    -> std::vector<Location> {
   spdlog::info("\n=== GEOGRAPHIC DATA OVERVIEW ===");

   std::filesystem::path locations_path = "locations.json";
@@ -72,7 +78,7 @@ auto BiergartenDataGenerator::QueryCitiesWithCountries()
 }

 auto BiergartenDataGenerator::EnrichWithWikipedia(
-   const std::vector<Location>& cities) -> std::vector<EnrichedCity> {
+    const std::vector<Location>& cities) -> std::vector<EnrichedCity> {
   std::vector<EnrichedCity> enriched;
   enriched.reserve(cities.size());

@@ -80,18 +86,15 @@ auto BiergartenDataGenerator::EnrichWithWikipedia(
   pending.reserve(cities.size());

   for (const auto& city : cities) {
-      pending.push_back(std::async(std::launch::async,
-                                   [web_client = webClient_, city]() {
-                                      WikipediaService wikipedia_service(
-                                          web_client);
-                                      const std::string region_context =
-                                          wikipedia_service.GetSummary(
-                                              city.city, city.country);
-                                      spdlog::debug(
-                                          "[Pipeline] Region context for {}: {}",
-                                          city.city, region_context);
-                                      return EnrichedCity{city, region_context};
-                                   }));
+      pending.push_back(
+          std::async(std::launch::async, [web_client = webClient_, city]() {
+             WikipediaService wikipedia_service(web_client);
+             const std::string region_context =
+                 wikipedia_service.GetSummary(city.city, city.country);
+             spdlog::debug("[Pipeline] Region context for {}: {}", city.city,
+                           region_context);
+             return EnrichedCity{city, region_context};
+          }));
   }

   for (auto& task : pending) {
@@ -110,23 +113,25 @@ void BiergartenDataGenerator::GenerateBreweries(

   for (const auto& enriched_city : cities) {
      try {
-         auto brewery = generator.GenerateBrewery(enriched_city.location.city,
-                                                  enriched_city.location.country,
-                                                  enriched_city.region_context);
+         auto brewery = generator.GenerateBrewery(
+             enriched_city.location.city, enriched_city.location.country,
+             enriched_city.region_context);
         generatedBreweries_.push_back({enriched_city.location, brewery});
      } catch (const std::exception& e) {
         ++skipped_count;
         spdlog::warn(
-             "[Pipeline] Skipping city '{}' ({}): brewery generation failed: {}",
+             "[Pipeline] Skipping city '{}' ({}): brewery generation failed: "
+             "{}",
             enriched_city.location.city, enriched_city.location.country,
             e.what());
      }
   }

   if (skipped_count > 0) {
-      spdlog::warn("[Pipeline] Skipped {} city/cities due to generation "
-                   "errors",
-                   skipped_count);
+      spdlog::warn(
+          "[Pipeline] Skipped {} city/cities due to generation "
+          "errors",
+          skipped_count);
   }
 }

@@ -134,11 +139,12 @@ void BiergartenDataGenerator::LogResults() const {
   spdlog::info("\n=== GENERATED DATA DUMP ===");
   size_t index = 1;
   for (const auto& entry : generatedBreweries_) {
-      spdlog::info("{}. city=\"{}\" country=\"{}\" state=\"{}\" "
-                   "iso3166_2={} lat={} lon={}",
-                   index, entry.location.city, entry.location.country,
-                   entry.location.state_province, entry.location.iso3166_2,
-                   entry.location.latitude, entry.location.longitude);
+      spdlog::info(
+          "{}. city=\"{}\" country=\"{}\" state=\"{}\" "
+          "iso3166_2={} lat={} lon={}",
+          index, entry.location.city, entry.location.country,
+          entry.location.state_province, entry.location.iso3166_2,
+          entry.location.latitude, entry.location.longitude);
      spdlog::info("   brewery_name=\"{}\"", entry.brewery.name);
      spdlog::info("   brewery_description=\"{}\"", entry.brewery.description);
      ++index;
--- a/pipeline/src/data_generation/llama/destructor.cpp
+++ b/pipeline/src/data_generation/llama/destructor.cpp
@@ -1,7 +1,7 @@
 /**
- * Destructor Module
- * Ensures proper cleanup of llama.cpp resources (context and model) when the
- * generator is destroyed, preventing memory leaks and resource exhaustion.
+ * @file data_generation/llama/destructor.cpp
+ * @brief Releases llama model/context resources and backend state during
+ * LlamaGenerator teardown to avoid leaks across runs.
 */

 #include "data_generation/llama_generator.h"
--- a/pipeline/src/data_generation/llama/generate_brewery.cpp
+++ b/pipeline/src/data_generation/llama/generate_brewery.cpp
@@ -1,8 +1,7 @@
 /**
- * Brewery Data Generation Module
- * Uses the LLM to generate realistic brewery names and descriptions for a given
- * location. Implements retry logic with validation and error correction to
- * ensure valid JSON output conforming to the expected schema.
+ * @file data_generation/llama/generate_brewery.cpp
+ * @brief Builds brewery prompts with regional context, performs retry-based
+ * inference, and validates structured JSON output for brewery records.
 */

 #include <spdlog/spdlog.h>
--- a/pipeline/src/data_generation/llama/generate_user.cpp
+++ b/pipeline/src/data_generation/llama/generate_user.cpp
@@ -1,9 +1,7 @@
 /**
- * User Profile Generation Module
- * Uses the LLM to generate realistic user profiles (username and bio) for craft
- * beer enthusiasts. Implements retry logic to handle parsing failures and
- * ensures output adheres to strict format constraints (two lines, specific
- * character limits).
+ * @file data_generation/llama/generate_user.cpp
+ * @brief Generates locale-aware user profiles with strict two-line formatting,
+ * retry handling, and output sanitization for downstream parsing.
 */

 #include <spdlog/spdlog.h>
--- a/pipeline/src/data_generation/llama/helpers.cpp
+++ b/pipeline/src/data_generation/llama/helpers.cpp
@@ -1,9 +1,7 @@
 /**
- * Helper Functions Module
- * Provides utility functions for text processing, parsing, and chat template
- * formatting. Functions handle whitespace normalization, response parsing, and
- * conversion of prompts to proper chat format using the model's built-in
- * template.
+ * @file data_generation/llama/helpers.cpp
+ * @brief Provides prompt formatting, whitespace normalization, response
+ * parsing, token decoding, and JSON validation helpers for Llama modules.
 */

 #include <algorithm>
--- a/pipeline/src/data_generation/llama/load.cpp
+++ b/pipeline/src/data_generation/llama/load.cpp
@@ -1,8 +1,7 @@
 /**
- * Model Loading Module
- * This module handles loading a pre-trained LLM model from disk and
- * initializing the llama.cpp context for inference. It performs one-time setup
- * required before any inference operations can be performed.
+ * @file data_generation/llama/load.cpp
+ * @brief Initializes llama backend, loads model weights, creates inference
+ * context, and resets prior resources during model reload.
 */

 #include <spdlog/spdlog.h>
--- a/pipeline/src/data_generation/llama/load_brewery_prompt.cpp
+++ b/pipeline/src/data_generation/llama/load_brewery_prompt.cpp
@@ -1,11 +1,24 @@
-#include <fstream>
-#include <filesystem>
+/**
+ * @file data_generation/llama/load_brewery_prompt.cpp
+ * @brief Resolves brewery system prompt content from cache or filesystem
+ * search paths and provides a robust inline fallback prompt when absent.
+ */
+
 #include <spdlog/spdlog.h>

+#include <filesystem>
+#include <fstream>
+
 #include "data_generation/llama_generator.h"

 namespace fs = std::filesystem;

+/**
+ * @brief Loads brewery system prompt from disk or cache.
+ *
+ * @param prompt_file_path Preferred prompt file location.
+ * @return Prompt text loaded from disk or fallback content.
+ */
 std::string LlamaGenerator::LoadBrewerySystemPrompt(
    const std::string& prompt_file_path) {
   // Return cached version if already loaded
@@ -15,9 +28,9 @@ std::string LlamaGenerator::LoadBrewerySystemPrompt(

   // Try multiple path locations
   std::vector<std::string> paths_to_try = {
-       prompt_file_path,                          // As provided
-       "../" + prompt_file_path,                  // One level up
-       "../../" + prompt_file_path,               // Two levels up
+       prompt_file_path,             // As provided
+       "../" + prompt_file_path,     // One level up
+       "../../" + prompt_file_path,  // Two levels up
   };

   for (const auto& path : paths_to_try) {
@@ -29,7 +42,8 @@ std::string LlamaGenerator::LoadBrewerySystemPrompt(

         if (!prompt.empty()) {
            spdlog::info(
-                "LlamaGenerator: Loaded brewery system prompt from '{}' ({} chars)",
+                "LlamaGenerator: Loaded brewery system prompt from '{}' ({} "
+                "chars)",
                path, prompt.length());
            brewery_system_prompt_ = prompt;
            return brewery_system_prompt_;
@@ -38,16 +52,23 @@ std::string LlamaGenerator::LoadBrewerySystemPrompt(
   }

   spdlog::warn(
-       "LlamaGenerator: Could not open brewery system prompt file at any of the "
+       "LlamaGenerator: Could not open brewery system prompt file at any of "
+       "the "
       "expected locations. Using fallback inline prompt.");
   return GetFallbackBreweryPrompt();
 }

-// Fallback: minimal inline prompt if file fails to load
+/**
+ * @brief Provides an inline fallback brewery system prompt.
+ *
+ * @return Default fallback prompt text.
+ */
 std::string LlamaGenerator::GetFallbackBreweryPrompt() {
-   return "You are an experienced brewmaster and owner of a local craft brewery. "
+   return "You are an experienced brewmaster and owner of a local craft "
+          "brewery. "
          "Create a distinctive, authentic name and detailed description that "
-          "genuinely reflects your specific location, brewing philosophy, local "
+          "genuinely reflects your specific location, brewing philosophy, "
+          "local "
          "culture, and community connection. The brewery must feel real and "
          "grounded—not generic or interchangeable.\n\n"
          "AVOID REPETITIVE PHRASES - Never use:\n"
@@ -56,14 +77,16 @@ std::string LlamaGenerator::GetFallbackBreweryPrompt() {
          "into, ancient roots, timeless, where tradition meets innovation\n\n"
          "OPENING APPROACHES - Choose ONE:\n"
          "1. Start with specific beer style and its regional origins\n"
-          "2. Begin with specific brewing challenge (water, altitude, climate)\n"
+          "2. Begin with specific brewing challenge (water, altitude, "
+          "climate)\n"
          "3. Open with founding story or personal motivation\n"
          "4. Lead with specific local ingredient or resource\n"
          "5. Start with unexpected angle or contradiction\n"
          "6. Open with local event, tradition, or cultural moment\n"
          "7. Begin with tangible architectural or geographic detail\n\n"
          "BE SPECIFIC - Include:\n"
-          "- At least ONE concrete proper noun (landmark, river, neighborhood)\n"
+          "- At least ONE concrete proper noun (landmark, river, "
+          "neighborhood)\n"
          "- Specific beer styles relevant to the REGION'S culture\n"
          "- Concrete brewing challenges or advantages\n"
          "- Sensory details SPECIFIC to place—not generic adjectives\n\n"
--- a/pipeline/src/data_generation/llama/set_sampling_options.cpp
+++ b/pipeline/src/data_generation/llama/set_sampling_options.cpp
@@ -1,8 +1,7 @@
 /**
- * Sampling Configuration Module
- * Configures the hyperparameters that control probabilistic token selection
- * during text generation. These settings affect the randomness, diversity, and
- * quality of generated output.
+ * @file data_generation/llama/set_sampling_options.cpp
+ * @brief Validates and stores sampling temperature, top-p, seed, and context
+ * size configuration used by subsequent LlamaGenerator inference calls.
 */

 #include <stdexcept>
--- a/pipeline/src/data_generation/mock/data.cpp
+++ b/pipeline/src/data_generation/mock/data.cpp
@@ -1,3 +1,9 @@
+/**
+ * @file data_generation/mock/data.cpp
+ * @brief Defines static lookup tables used by MockGenerator for deterministic
+ * brewery names, descriptions, usernames, and bios.
+ */
+
 #include <string>
 #include <vector>

--- a/pipeline/src/data_generation/mock/deterministic_hash.cpp
+++ b/pipeline/src/data_generation/mock/deterministic_hash.cpp
@@ -1,12 +1,18 @@
+/**
+ * @file data_generation/mock/deterministic_hash.cpp
+ * @brief Implements a stable hash combiner used by MockGenerator to derive
+ * repeatable pseudo-random indices from location input.
+ */
+
+#include <boost/container_hash/hash.hpp>
 #include <string>

 #include "data_generation/mock_generator.h"

 std::size_t MockGenerator::DeterministicHash(const std::string& a,
                                             const std::string& b) {
-   std::size_t seed = std::hash<std::string>{}(a);
-   const std::size_t mixed = std::hash<std::string>{}(b);
-   seed ^= mixed + 0x9e3779b97f4a7c15ULL + (seed << 6) + (seed >> 2);
-   seed = (seed << 13) | (seed >> ((sizeof(std::size_t) * 8) - 13));
+   std::size_t seed = 0;
+   boost::hash_combine(seed, a);
+   boost::hash_combine(seed, b);
   return seed;
 }
--- a/pipeline/src/data_generation/mock/generate_brewery.cpp
+++ b/pipeline/src/data_generation/mock/generate_brewery.cpp
@@ -1,3 +1,9 @@
+/**
+ * @file data_generation/mock/generate_brewery.cpp
+ * @brief Builds deterministic brewery names and descriptions by hashing city
+ * and country into fixed mock phrase catalogs.
+ */
+
 #include <string>

 #include "data_generation/mock_generator.h"
@@ -10,7 +16,8 @@ auto MockGenerator::GenerateBrewery(const std::string& city_name,

   const std::string& adjective =
       kBreweryAdjectives.at(hash % kBreweryAdjectives.size());
-   const std::string& noun = kBreweryNouns.at((hash / 7) % kBreweryNouns.size());
+   const std::string& noun =
+       kBreweryNouns.at((hash / 7) % kBreweryNouns.size());
   const std::string& base_description =
       kBreweryDescriptions.at((hash / 13) % kBreweryDescriptions.size());

--- a/pipeline/src/data_generation/mock/generate_user.cpp
+++ b/pipeline/src/data_generation/mock/generate_user.cpp
@@ -1,3 +1,9 @@
+/**
+ * @file data_generation/mock/generate_user.cpp
+ * @brief Generates deterministic mock user profiles by hashing locale values
+ * into predefined username and bio collections.
+ */
+
 #include <functional>
 #include <string>

--- a/pipeline/src/data_generation/mock/load.cpp
+++ b/pipeline/src/data_generation/mock/load.cpp
@@ -1,3 +1,9 @@
+/**
+ * @file data_generation/mock/load.cpp
+ * @brief Implements MockGenerator initialization: a no-op load path that logs
+ * readiness without loading any model resources.
+ */
+
 #include <spdlog/spdlog.h>

 #include <string>
--- a/pipeline/src/json_handling/json_loader.cpp
+++ b/pipeline/src/json_handling/json_loader.cpp
@@ -1,21 +1,26 @@
+/**
+ * @file json_handling/json_loader.cpp
+ * @brief Parses curated location JSON input into strongly typed Location
+ * records with strict field validation and descriptive error reporting.
+ */
+
 #include "json_handling/json_loader.h"

 #include <spdlog/spdlog.h>

 #include <boost/json.hpp>
-
 #include <fstream>
 #include <sstream>
 #include <stdexcept>

 namespace {

-auto ReadRequiredString(const boost::json::object& object,
-                        const char* key) -> std::string {
+auto ReadRequiredString(const boost::json::object& object, const char* key)
+    -> std::string {
   const boost::json::value* value = object.if_contains(key);
   if (value == nullptr || !value->is_string()) {
-      throw std::runtime_error(std::string("Missing or invalid string field: ") +
-                               key);
+      throw std::runtime_error(
+          std::string("Missing or invalid string field: ") + key);
   }
   return std::string(value->as_string().c_str());
 }
@@ -24,8 +29,8 @@ auto ReadRequiredNumber(const boost::json::object& object, const char* key)
    -> double {
   const boost::json::value* value = object.if_contains(key);
   if (value == nullptr || !value->is_number()) {
-      throw std::runtime_error(std::string("Missing or invalid numeric field: ") +
-                               key);
+      throw std::runtime_error(
+          std::string("Missing or invalid numeric field: ") + key);
   }
   return value->to_number<double>();
 }
@@ -33,7 +38,7 @@ auto ReadRequiredNumber(const boost::json::object& object, const char* key)
 }  // namespace

 auto JsonLoader::LoadLocations(const std::string& filepath)
-   -> std::vector<Location> {
+    -> std::vector<Location> {
   std::ifstream input(filepath);
   if (!input.is_open()) {
      throw std::runtime_error("Failed to open locations file: " + filepath);
--- a/pipeline/src/main.cpp
+++ b/pipeline/src/main.cpp
@@ -1,3 +1,9 @@
+/**
+ * @file main.cpp
+ * @brief Parses command-line options, validates runtime mode selection,
+ * initializes shared infrastructure, and executes the pipeline entry flow.
+ */
+
 #include <spdlog/spdlog.h>

 #include <boost/program_options.hpp>
--- a/pipeline/src/web_client/curl_web_client.cpp
+++ b/pipeline/src/web_client/curl_web_client.cpp
@@ -1,3 +1,9 @@
+/**
+ * @file web_client/curl_web_client.cpp
+ * @brief Implements libcurl-backed HTTP utilities, including GET requests,
+ * file downloads, URL encoding, and RAII global curl lifecycle handling.
+ */
+
 #include "web_client/curl_web_client.h"

 #include <curl/curl.h>
--- a/pipeline/src/wikipedia/wikipedia_service.cpp
+++ b/pipeline/src/wikipedia/wikipedia_service.cpp
@@ -1,3 +1,9 @@
+/**
+ * @file wikipedia/wikipedia_service.cpp
+ * @brief Implements Wikipedia extract retrieval and caching for city/country
+ * queries, including response parsing and resilient error handling.
+ */
+
 #include "wikipedia/wikipedia_service.h"

 #include <spdlog/spdlog.h>