From 91e18888fef52262755cca979ad8c19899f09abd Mon Sep 17 00:00:00 2001
From: Aaron Po <apo2@uwo.ca>
Date: Fri, 1 May 2026 17:38:16 -0400
Subject: [PATCH] readability updates: remove magic numbers, update comments

---
 .../includes/data_generation/mock_generator.h |  7 +++
 .../includes/services/sqlite_export_service.h |  1 -
 .../services/sqlite_statement_helpers.h       |  1 -
 .../{system.md => BREWERY_GENERATION.md}      |  0
 .../data_generation/llama/generate_brewery.cc |  2 +
 .../data_generation/llama/generate_user.cc    |  7 +++
 .../src/data_generation/llama/helpers.cc      |  8 ++-
 .../src/data_generation/llama/infer.cc        |  5 +-
 .../src/data_generation/llama/load.cc         |  6 ++-
 .../data_generation/mock/generate_brewery.cc  |  6 +--
 .../src/data_generation/mock/generate_user.cc |  2 +-
 tooling/pipeline/src/main.cc                  | 51 ++++++++++---------
 .../helpers/sqlite_statement_helpers.cpp      | 10 ----
 .../src/services/sqlite/process_record.cc     |  2 +
 .../src/web_client/curl_web_client_get.cc     |  3 +-
 15 files changed, 68 insertions(+), 43 deletions(-)
 rename tooling/pipeline/prompts/{system.md => BREWERY_GENERATION.md} (100%)

diff --git a/tooling/pipeline/includes/data_generation/mock_generator.h b/tooling/pipeline/includes/data_generation/mock_generator.h
index d91c1e3..5924164 100644
--- a/tooling/pipeline/includes/data_generation/mock_generator.h
+++ b/tooling/pipeline/includes/data_generation/mock_generator.h
@@ -44,6 +44,13 @@ class MockGenerator final : public DataGenerator {
    */
   static size_t DeterministicHash(const Location& location);
 
+  // Hash stride constants for deterministic distribution across fixed-size
+  // arrays. These coprime strides spread hash values uniformly without
+  // clustering, ensuring diverse output across different hash inputs.
+  static constexpr size_t kNounHashStride = 7;
+  static constexpr size_t kDescriptionHashStride = 13;
+  static constexpr size_t kBioHashStride = 11;
+
   static constexpr std::array<std::string_view, 18> kBreweryAdjectives = {
       "Craft",      "Heritage", "Local",  "Artisan",  "Pioneer",    "Golden",
       "Modern",     "Classic",  "Summit", "Northern", "Riverstone", "Barrel",
diff --git a/tooling/pipeline/includes/services/sqlite_export_service.h b/tooling/pipeline/includes/services/sqlite_export_service.h
index fdae8ff..ebae1ec 100644
--- a/tooling/pipeline/includes/services/sqlite_export_service.h
+++ b/tooling/pipeline/includes/services/sqlite_export_service.h
@@ -42,7 +42,6 @@ class SqliteExportService final : public IExportService {
   void InitializeSchema() const;
   void PrepareStatements();
   void RollbackAndCloseNoThrow() noexcept;
-  void FinalizeStatements() noexcept;
 
   [[nodiscard]] std::filesystem::path BuildDatabasePath() const;
   [[nodiscard]] static std::string BuildLocationKey(const Location& location);
diff --git a/tooling/pipeline/includes/services/sqlite_statement_helpers.h b/tooling/pipeline/includes/services/sqlite_statement_helpers.h
index 5f3315b..74f54b1 100644
--- a/tooling/pipeline/includes/services/sqlite_statement_helpers.h
+++ b/tooling/pipeline/includes/services/sqlite_statement_helpers.h
@@ -107,7 +107,6 @@ void StepStatement(const SqliteDatabaseHandle& db_handle,
 
 sqlite3_int64 LastInsertRowId(const SqliteDatabaseHandle& db_handle);
 
-std::string SerializeLocalLanguages(const std::vector<std::string>& local_languages);
 std::string SerializeVector(const std::vector<std::string>& str_vec);
 
 }  // namespace sqlite_export_service_internal
diff --git a/tooling/pipeline/prompts/system.md b/tooling/pipeline/prompts/BREWERY_GENERATION.md
similarity index 100%
rename from tooling/pipeline/prompts/system.md
rename to tooling/pipeline/prompts/BREWERY_GENERATION.md
diff --git a/tooling/pipeline/src/data_generation/llama/generate_brewery.cc b/tooling/pipeline/src/data_generation/llama/generate_brewery.cc
index f29c637..011a64f 100644
--- a/tooling/pipeline/src/data_generation/llama/generate_brewery.cc
+++ b/tooling/pipeline/src/data_generation/llama/generate_brewery.cc
@@ -33,6 +33,8 @@ static std::string FormatLocalLanguageCodes(
   return formatted;
 }
 
+// GBNF grammar for structured brewery JSON output.
+// @TODO move to a separate gbnf file if it grows in complexity or is shared across modules.
 static constexpr std::string_view kBreweryJsonGrammar = R"json_brewery(
 root ::= thought-block "{" ws "\"name_en\"" ws ":" ws string ws "," ws "\"description_en\"" ws ":" ws string ws "," ws "\"name_local\"" ws ":" ws string ws "," ws "\"description_local\"" ws ":" ws string ws "}" ws
 thought-block ::= [^{]*
diff --git a/tooling/pipeline/src/data_generation/llama/generate_user.cc b/tooling/pipeline/src/data_generation/llama/generate_user.cc
index eaebc09..7ed6426 100644
--- a/tooling/pipeline/src/data_generation/llama/generate_user.cc
+++ b/tooling/pipeline/src/data_generation/llama/generate_user.cc
@@ -12,6 +12,13 @@
 #include "data_generation/llama_generator.h"
 #include "data_generation/llama_generator_helpers.h"
 
+// TODO: Implement locale-aware user profile generation.
+// Current implementation returns a hardcoded test value and ignores the
+// locale parameter. Future implementation should:
+// 1. Load a USER_GENERATION.md prompt template with locale context
+// 2. Perform LLM inference with locale-specific username/bio generation
+// 3. Parse and validate JSON output with retry handling (similar to brewery)
+// 4. Return locale-aware username and biography
 UserResult LlamaGenerator::GenerateUser(const std::string& locale) {
   return {.username = "test_user",
           .bio = "This is a test user profile from " + locale + "."};
diff --git a/tooling/pipeline/src/data_generation/llama/helpers.cc b/tooling/pipeline/src/data_generation/llama/helpers.cc
index 8556b8d..1433523 100644
--- a/tooling/pipeline/src/data_generation/llama/helpers.cc
+++ b/tooling/pipeline/src/data_generation/llama/helpers.cc
@@ -58,6 +58,11 @@ static std::string CondenseWhitespace(std::string_view text) {
   return out;
 }
 
+// Guard against truncating in the first half of the string.
+// This preserves the critical opening content and avoids cutting critical
+// context words early in the region description.
+static constexpr size_t kTruncationGuardDivisor = 2;
+
 /**
  * Truncate region context to fit within max length while preserving word
  * boundaries
@@ -71,7 +76,8 @@ std::string PrepareRegionContext(std::string_view region_context,
 
   normalized.resize(max_chars);
   const size_t last_space = normalized.find_last_of(' ');
-  if (last_space != std::string::npos && last_space > max_chars / 2) {
+  if (last_space != std::string::npos &&
+      last_space > max_chars / kTruncationGuardDivisor) {
     normalized.resize(last_space);
   }
 
diff --git a/tooling/pipeline/src/data_generation/llama/infer.cc b/tooling/pipeline/src/data_generation/llama/infer.cc
index 2a2c116..dc06d0b 100644
--- a/tooling/pipeline/src/data_generation/llama/infer.cc
+++ b/tooling/pipeline/src/data_generation/llama/infer.cc
@@ -19,6 +19,9 @@
 #include "llama.h"
 
 static constexpr size_t kPromptTokenSlack = 8;
+// Minimum tokens to keep when using top-p sampling. Ensures at least one
+// candidate token remains available even with very restrictive top-p values.
+static constexpr size_t kTopPMinKeep = 1;
 
 namespace {
 
@@ -62,7 +65,7 @@ SamplerHandle MakeSamplerChain(const llama_vocab* vocab,
               "LlamaGenerator: failed to initialize temperature sampler");
   add_sampler(llama_sampler_init_top_k(static_cast<int32_t>(config.top_k)),
               "LlamaGenerator: failed to initialize top-k sampler");
-  add_sampler(llama_sampler_init_top_p(config.top_p, 1),
+  add_sampler(llama_sampler_init_top_p(config.top_p, kTopPMinKeep),
               "LlamaGenerator: failed to initialize top-p sampler");
   add_sampler(llama_sampler_init_dist(config.seed),
               "LlamaGenerator: failed to initialize distribution sampler");
diff --git a/tooling/pipeline/src/data_generation/llama/load.cc b/tooling/pipeline/src/data_generation/llama/load.cc
index 98feb5a..8ce3142 100644
--- a/tooling/pipeline/src/data_generation/llama/load.cc
+++ b/tooling/pipeline/src/data_generation/llama/load.cc
@@ -14,6 +14,10 @@
 #include "data_generation/llama_generator.h"
 #include "llama.h"
 
+// Maximum batch size for decode operations. Capping the batch prevents
+// excessive memory allocation while maintaining inference performance.
+static constexpr uint32_t kMaxBatchSize = 5000U;
+
 void LlamaGenerator::Load(const std::string& model_path) {
   context_.reset();
   model_.reset();
@@ -28,7 +32,7 @@ void LlamaGenerator::Load(const std::string& model_path) {
 
   llama_context_params context_params = llama_context_default_params();
   context_params.n_ctx = n_ctx_;
-  context_params.n_batch = std::min(n_ctx_, static_cast<uint32_t>(5000));
+  context_params.n_batch = std::min(n_ctx_, kMaxBatchSize);
 
   LlamaGenerator::ContextHandle loaded_context(
       llama_init_from_model(loaded_model.get(), context_params));
diff --git a/tooling/pipeline/src/data_generation/mock/generate_brewery.cc b/tooling/pipeline/src/data_generation/mock/generate_brewery.cc
index c2495e0..20a4ff6 100644
--- a/tooling/pipeline/src/data_generation/mock/generate_brewery.cc
+++ b/tooling/pipeline/src/data_generation/mock/generate_brewery.cc
@@ -17,9 +17,9 @@ BreweryResult MockGenerator::GenerateBrewery(
   const std::string_view adjective =
       kBreweryAdjectives.at(hash % kBreweryAdjectives.size());
   const std::string_view noun =
-      kBreweryNouns.at(hash / 7 % kBreweryNouns.size());
-  const std::string_view base_description =
-      kBreweryDescriptions.at((hash / 13) % kBreweryDescriptions.size());
+      kBreweryNouns.at(hash / kNounHashStride % kBreweryNouns.size());
+  const std::string_view base_description = kBreweryDescriptions.at(
+      (hash / kDescriptionHashStride) % kBreweryDescriptions.size());
 
   const std::string name =
       std::format("{} {} {}", location.city, adjective, noun);
diff --git a/tooling/pipeline/src/data_generation/mock/generate_user.cc b/tooling/pipeline/src/data_generation/mock/generate_user.cc
index 51c26d2..38257fb 100644
--- a/tooling/pipeline/src/data_generation/mock/generate_user.cc
+++ b/tooling/pipeline/src/data_generation/mock/generate_user.cc
@@ -15,7 +15,7 @@ UserResult MockGenerator::GenerateUser(const std::string& locale) {
 
   UserResult result;
   const std::string_view username = kUsernames[hash % kUsernames.size()];
-  const std::string_view bio = kBios[hash / 11 % kBios.size()];
+  const std::string_view bio = kBios[hash / kBioHashStride % kBios.size()];
   result.username = username;
   result.bio = bio;
   return result;
diff --git a/tooling/pipeline/src/main.cc b/tooling/pipeline/src/main.cc
index 241c84d..6d18d05 100644
--- a/tooling/pipeline/src/main.cc
+++ b/tooling/pipeline/src/main.cc
@@ -53,16 +53,21 @@ std::optional<ApplicationOptions> ParseArguments(const int argc, char** argv) {
   opt("model,m", prog_opts::value<std::string>()->default_value(""),
       "Path to LLM model (gguf)");
 
-  // Sampling Options
-  opt("temperature", prog_opts::value<float>()->default_value(1.0F),
+  // Sampling Options - defaults driven from SamplingOptions struct
+  const SamplingOptions kSamplingDefaults{};
+  opt("temperature",
+      prog_opts::value<float>()->default_value(kSamplingDefaults.temperature),
       "Sampling temperature (higher = more random)");
-  opt("top-p", prog_opts::value<float>()->default_value(0.95F),
+  opt("top-p",
+      prog_opts::value<float>()->default_value(kSamplingDefaults.top_p),
       "Nucleus sampling top-p in (0,1] (higher = more random)");
-  opt("top-k", prog_opts::value<uint32_t>()->default_value(64),
+  opt("top-k",
+      prog_opts::value<uint32_t>()->default_value(kSamplingDefaults.top_k),
       "Top-k sampling parameter (higher = more candidate tokens)");
-  opt("n-ctx", prog_opts::value<uint32_t>()->default_value(8192),
+  opt("n-ctx",
+      prog_opts::value<uint32_t>()->default_value(kSamplingDefaults.n_ctx),
       "Context window size in tokens");
-  opt("seed", prog_opts::value<int>()->default_value(-1),
+  opt("seed", prog_opts::value<int>()->default_value(kSamplingDefaults.seed),
       "Sampler seed: -1 for random, otherwise non-negative integer");
 
   // Pipeline Options
@@ -84,11 +89,11 @@ std::optional<ApplicationOptions> ParseArguments(const int argc, char** argv) {
   }
 
   try {
-    prog_opts::variables_map vm;
-    prog_opts::store(prog_opts::parse_command_line(argc, argv, desc), vm);
-    prog_opts::notify(vm);
+    prog_opts::variables_map var_map;
+    prog_opts::store(prog_opts::parse_command_line(argc, argv, desc), var_map);
+    prog_opts::notify(var_map);
 
-    if (vm.contains("help")) {
+    if (var_map.contains("help")) {
       std::stringstream help_stream;
       help_stream << "\n" << desc;
       spdlog::info(help_stream.str());
@@ -97,12 +102,12 @@ std::optional<ApplicationOptions> ParseArguments(const int argc, char** argv) {
 
     ApplicationOptions options;
 
-    options.pipeline.output_path = vm["output"].as<std::string>();
-    options.pipeline.log_path = vm["log-path"].as<std::string>();
-    options.pipeline.prompt_dir = vm["prompt-dir"].as<std::string>();
+    options.pipeline.output_path = var_map["output"].as<std::string>();
+    options.pipeline.log_path = var_map["log-path"].as<std::string>();
+    options.pipeline.prompt_dir = var_map["prompt-dir"].as<std::string>();
 
-    const bool use_mocked = vm["mocked"].as<bool>();
-    const std::string model_path = vm["model"].as<std::string>();
+    const bool use_mocked = var_map["mocked"].as<bool>();
+    const std::string model_path = var_map["model"].as<std::string>();
 
     if (use_mocked && !model_path.empty()) {
       spdlog::error(
@@ -127,9 +132,9 @@ std::optional<ApplicationOptions> ParseArguments(const int argc, char** argv) {
     options.generator.model_path = model_path;
 
     const bool user_provided_sampling =
-        !vm["temperature"].defaulted() || !vm["top-p"].defaulted() ||
-        !vm["top-k"].defaulted() || !vm["n-ctx"].defaulted() ||
-        !vm["seed"].defaulted();
+        !var_map["temperature"].defaulted() || !var_map["top-p"].defaulted() ||
+        !var_map["top-k"].defaulted() || !var_map["n-ctx"].defaulted() ||
+        !var_map["seed"].defaulted();
 
     if (use_mocked) {
       if (user_provided_sampling) {
@@ -137,11 +142,11 @@ std::optional<ApplicationOptions> ParseArguments(const int argc, char** argv) {
       }
     } else if (user_provided_sampling) {
       SamplingOptions sampling;
-      sampling.temperature = vm["temperature"].as<float>();
-      sampling.top_p = vm["top-p"].as<float>();
-      sampling.top_k = vm["top-k"].as<uint32_t>();
-      sampling.n_ctx = vm["n-ctx"].as<uint32_t>();
-      sampling.seed = vm["seed"].as<int>();
+      sampling.temperature = var_map["temperature"].as<float>();
+      sampling.top_p = var_map["top-p"].as<float>();
+      sampling.top_k = var_map["top-k"].as<uint32_t>();
+      sampling.n_ctx = var_map["n-ctx"].as<uint32_t>();
+      sampling.seed = var_map["seed"].as<int>();
 
       options.generator.sampling = sampling;
     }
diff --git a/tooling/pipeline/src/services/sqlite/helpers/sqlite_statement_helpers.cpp b/tooling/pipeline/src/services/sqlite/helpers/sqlite_statement_helpers.cpp
index fd09056..12620f8 100644
--- a/tooling/pipeline/src/services/sqlite/helpers/sqlite_statement_helpers.cpp
+++ b/tooling/pipeline/src/services/sqlite/helpers/sqlite_statement_helpers.cpp
@@ -86,16 +86,6 @@ sqlite3_int64 LastInsertRowId(const SqliteDatabaseHandle& db_handle) {
   return sqlite3_last_insert_rowid(db_handle.get());
 }
 
-std::string SerializeLocalLanguages(
-    const std::vector<std::string>& local_languages) {
-  boost::json::array array;
-  array.reserve(local_languages.size());
-  for (const auto& language : local_languages) {
-    array.emplace_back(language);
-  }
-  return boost::json::serialize(array);
-}
-
 std::string SerializeVector(const std::vector<std::string>& str_vec) {
   boost::json::array array(str_vec.size());
   for (const auto& s : str_vec) {
diff --git a/tooling/pipeline/src/services/sqlite/process_record.cc b/tooling/pipeline/src/services/sqlite/process_record.cc
index 786aa19..be3af22 100644
--- a/tooling/pipeline/src/services/sqlite/process_record.cc
+++ b/tooling/pipeline/src/services/sqlite/process_record.cc
@@ -3,6 +3,8 @@
  * @brief SqliteExportService::ProcessRecord() implementation.
  */
 
+#include <iomanip>
+#include <sstream>
 #include <stdexcept>
 #include <string>
 
diff --git a/tooling/pipeline/src/web_client/curl_web_client_get.cc b/tooling/pipeline/src/web_client/curl_web_client_get.cc
index 2e178f7..0a8473e 100644
--- a/tooling/pipeline/src/web_client/curl_web_client_get.cc
+++ b/tooling/pipeline/src/web_client/curl_web_client_get.cc
@@ -17,6 +17,7 @@ using CurlHandle = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
 
 static constexpr long kConnectionTimeout = 10;
 static constexpr long kRequestTimeout = 30;
+static constexpr long kMaxRedirects = 5;
 static constexpr int32_t kOkHttpStatus = 200;
 
 static CurlHandle CreateHandle() {
@@ -32,7 +33,7 @@ static void SetCommonGetOptions(CURL* curl, const std::string& url) {
   curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
   curl_easy_setopt(curl, CURLOPT_USERAGENT, "biergarten-pipeline/0.1.0");
   curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
-  curl_easy_setopt(curl, CURLOPT_MAXREDIRS, 5L);
+  curl_easy_setopt(curl, CURLOPT_MAXREDIRS, kMaxRedirects);
   curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, kConnectionTimeout);
   curl_easy_setopt(curl, CURLOPT_TIMEOUT, kRequestTimeout);
   curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, "gzip");