From 91e18888fef52262755cca979ad8c19899f09abd Mon Sep 17 00:00:00 2001 From: Aaron Po Date: Fri, 1 May 2026 17:38:16 -0400 Subject: [PATCH] readability updates: remove magic numbers, update comments --- .../includes/data_generation/mock_generator.h | 7 +++ .../includes/services/sqlite_export_service.h | 1 - .../services/sqlite_statement_helpers.h | 1 - .../{system.md => BREWERY_GENERATION.md} | 0 .../data_generation/llama/generate_brewery.cc | 2 + .../data_generation/llama/generate_user.cc | 7 +++ .../src/data_generation/llama/helpers.cc | 8 ++- .../src/data_generation/llama/infer.cc | 5 +- .../src/data_generation/llama/load.cc | 6 ++- .../data_generation/mock/generate_brewery.cc | 6 +-- .../src/data_generation/mock/generate_user.cc | 2 +- tooling/pipeline/src/main.cc | 51 ++++++++++--------- .../helpers/sqlite_statement_helpers.cpp | 10 ---- .../src/services/sqlite/process_record.cc | 2 + .../src/web_client/curl_web_client_get.cc | 3 +- 15 files changed, 68 insertions(+), 43 deletions(-) rename tooling/pipeline/prompts/{system.md => BREWERY_GENERATION.md} (100%) diff --git a/tooling/pipeline/includes/data_generation/mock_generator.h b/tooling/pipeline/includes/data_generation/mock_generator.h index d91c1e3..5924164 100644 --- a/tooling/pipeline/includes/data_generation/mock_generator.h +++ b/tooling/pipeline/includes/data_generation/mock_generator.h @@ -44,6 +44,13 @@ class MockGenerator final : public DataGenerator { */ static size_t DeterministicHash(const Location& location); + // Hash stride constants for deterministic distribution across fixed-size + // arrays. These coprime strides spread hash values uniformly without + // clustering, ensuring diverse output across different hash inputs. + static constexpr size_t kNounHashStride = 7; + static constexpr size_t kDescriptionHashStride = 13; + static constexpr size_t kBioHashStride = 11; + static constexpr std::array kBreweryAdjectives = { "Craft", "Heritage", "Local", "Artisan", "Pioneer", "Golden", "Modern", "Classic", "Summit", "Northern", "Riverstone", "Barrel", diff --git a/tooling/pipeline/includes/services/sqlite_export_service.h b/tooling/pipeline/includes/services/sqlite_export_service.h index fdae8ff..ebae1ec 100644 --- a/tooling/pipeline/includes/services/sqlite_export_service.h +++ b/tooling/pipeline/includes/services/sqlite_export_service.h @@ -42,7 +42,6 @@ class SqliteExportService final : public IExportService { void InitializeSchema() const; void PrepareStatements(); void RollbackAndCloseNoThrow() noexcept; - void FinalizeStatements() noexcept; [[nodiscard]] std::filesystem::path BuildDatabasePath() const; [[nodiscard]] static std::string BuildLocationKey(const Location& location); diff --git a/tooling/pipeline/includes/services/sqlite_statement_helpers.h b/tooling/pipeline/includes/services/sqlite_statement_helpers.h index 5f3315b..74f54b1 100644 --- a/tooling/pipeline/includes/services/sqlite_statement_helpers.h +++ b/tooling/pipeline/includes/services/sqlite_statement_helpers.h @@ -107,7 +107,6 @@ void StepStatement(const SqliteDatabaseHandle& db_handle, sqlite3_int64 LastInsertRowId(const SqliteDatabaseHandle& db_handle); -std::string SerializeLocalLanguages(const std::vector& local_languages); std::string SerializeVector(const std::vector& str_vec); } // namespace sqlite_export_service_internal diff --git a/tooling/pipeline/prompts/system.md b/tooling/pipeline/prompts/BREWERY_GENERATION.md similarity index 100% rename from tooling/pipeline/prompts/system.md rename to tooling/pipeline/prompts/BREWERY_GENERATION.md diff --git a/tooling/pipeline/src/data_generation/llama/generate_brewery.cc b/tooling/pipeline/src/data_generation/llama/generate_brewery.cc index f29c637..011a64f 100644 --- a/tooling/pipeline/src/data_generation/llama/generate_brewery.cc +++ b/tooling/pipeline/src/data_generation/llama/generate_brewery.cc @@ -33,6 +33,8 @@ static std::string FormatLocalLanguageCodes( return formatted; } +// GBNF grammar for structured brewery JSON output. +// @TODO move to a separate gbnf file if it grows in complexity or is shared across modules. static constexpr std::string_view kBreweryJsonGrammar = R"json_brewery( root ::= thought-block "{" ws "\"name_en\"" ws ":" ws string ws "," ws "\"description_en\"" ws ":" ws string ws "," ws "\"name_local\"" ws ":" ws string ws "," ws "\"description_local\"" ws ":" ws string ws "}" ws thought-block ::= [^{]* diff --git a/tooling/pipeline/src/data_generation/llama/generate_user.cc b/tooling/pipeline/src/data_generation/llama/generate_user.cc index eaebc09..7ed6426 100644 --- a/tooling/pipeline/src/data_generation/llama/generate_user.cc +++ b/tooling/pipeline/src/data_generation/llama/generate_user.cc @@ -12,6 +12,13 @@ #include "data_generation/llama_generator.h" #include "data_generation/llama_generator_helpers.h" +// TODO: Implement locale-aware user profile generation. +// Current implementation returns a hardcoded test value and ignores the +// locale parameter. Future implementation should: +// 1. Load a USER_GENERATION.md prompt template with locale context +// 2. Perform LLM inference with locale-specific username/bio generation +// 3. Parse and validate JSON output with retry handling (similar to brewery) +// 4. Return locale-aware username and biography UserResult LlamaGenerator::GenerateUser(const std::string& locale) { return {.username = "test_user", .bio = "This is a test user profile from " + locale + "."}; diff --git a/tooling/pipeline/src/data_generation/llama/helpers.cc b/tooling/pipeline/src/data_generation/llama/helpers.cc index 8556b8d..1433523 100644 --- a/tooling/pipeline/src/data_generation/llama/helpers.cc +++ b/tooling/pipeline/src/data_generation/llama/helpers.cc @@ -58,6 +58,11 @@ static std::string CondenseWhitespace(std::string_view text) { return out; } +// Guard against truncating in the first half of the string. +// This preserves the critical opening content and avoids cutting critical +// context words early in the region description. +static constexpr size_t kTruncationGuardDivisor = 2; + /** * Truncate region context to fit within max length while preserving word * boundaries @@ -71,7 +76,8 @@ std::string PrepareRegionContext(std::string_view region_context, normalized.resize(max_chars); const size_t last_space = normalized.find_last_of(' '); - if (last_space != std::string::npos && last_space > max_chars / 2) { + if (last_space != std::string::npos && + last_space > max_chars / kTruncationGuardDivisor) { normalized.resize(last_space); } diff --git a/tooling/pipeline/src/data_generation/llama/infer.cc b/tooling/pipeline/src/data_generation/llama/infer.cc index 2a2c116..dc06d0b 100644 --- a/tooling/pipeline/src/data_generation/llama/infer.cc +++ b/tooling/pipeline/src/data_generation/llama/infer.cc @@ -19,6 +19,9 @@ #include "llama.h" static constexpr size_t kPromptTokenSlack = 8; +// Minimum tokens to keep when using top-p sampling. Ensures at least one +// candidate token remains available even with very restrictive top-p values. +static constexpr size_t kTopPMinKeep = 1; namespace { @@ -62,7 +65,7 @@ SamplerHandle MakeSamplerChain(const llama_vocab* vocab, "LlamaGenerator: failed to initialize temperature sampler"); add_sampler(llama_sampler_init_top_k(static_cast(config.top_k)), "LlamaGenerator: failed to initialize top-k sampler"); - add_sampler(llama_sampler_init_top_p(config.top_p, 1), + add_sampler(llama_sampler_init_top_p(config.top_p, kTopPMinKeep), "LlamaGenerator: failed to initialize top-p sampler"); add_sampler(llama_sampler_init_dist(config.seed), "LlamaGenerator: failed to initialize distribution sampler"); diff --git a/tooling/pipeline/src/data_generation/llama/load.cc b/tooling/pipeline/src/data_generation/llama/load.cc index 98feb5a..8ce3142 100644 --- a/tooling/pipeline/src/data_generation/llama/load.cc +++ b/tooling/pipeline/src/data_generation/llama/load.cc @@ -14,6 +14,10 @@ #include "data_generation/llama_generator.h" #include "llama.h" +// Maximum batch size for decode operations. Capping the batch prevents +// excessive memory allocation while maintaining inference performance. +static constexpr uint32_t kMaxBatchSize = 5000U; + void LlamaGenerator::Load(const std::string& model_path) { context_.reset(); model_.reset(); @@ -28,7 +32,7 @@ void LlamaGenerator::Load(const std::string& model_path) { llama_context_params context_params = llama_context_default_params(); context_params.n_ctx = n_ctx_; - context_params.n_batch = std::min(n_ctx_, static_cast(5000)); + context_params.n_batch = std::min(n_ctx_, kMaxBatchSize); LlamaGenerator::ContextHandle loaded_context( llama_init_from_model(loaded_model.get(), context_params)); diff --git a/tooling/pipeline/src/data_generation/mock/generate_brewery.cc b/tooling/pipeline/src/data_generation/mock/generate_brewery.cc index c2495e0..20a4ff6 100644 --- a/tooling/pipeline/src/data_generation/mock/generate_brewery.cc +++ b/tooling/pipeline/src/data_generation/mock/generate_brewery.cc @@ -17,9 +17,9 @@ BreweryResult MockGenerator::GenerateBrewery( const std::string_view adjective = kBreweryAdjectives.at(hash % kBreweryAdjectives.size()); const std::string_view noun = - kBreweryNouns.at(hash / 7 % kBreweryNouns.size()); - const std::string_view base_description = - kBreweryDescriptions.at((hash / 13) % kBreweryDescriptions.size()); + kBreweryNouns.at(hash / kNounHashStride % kBreweryNouns.size()); + const std::string_view base_description = kBreweryDescriptions.at( + (hash / kDescriptionHashStride) % kBreweryDescriptions.size()); const std::string name = std::format("{} {} {}", location.city, adjective, noun); diff --git a/tooling/pipeline/src/data_generation/mock/generate_user.cc b/tooling/pipeline/src/data_generation/mock/generate_user.cc index 51c26d2..38257fb 100644 --- a/tooling/pipeline/src/data_generation/mock/generate_user.cc +++ b/tooling/pipeline/src/data_generation/mock/generate_user.cc @@ -15,7 +15,7 @@ UserResult MockGenerator::GenerateUser(const std::string& locale) { UserResult result; const std::string_view username = kUsernames[hash % kUsernames.size()]; - const std::string_view bio = kBios[hash / 11 % kBios.size()]; + const std::string_view bio = kBios[hash / kBioHashStride % kBios.size()]; result.username = username; result.bio = bio; return result; diff --git a/tooling/pipeline/src/main.cc b/tooling/pipeline/src/main.cc index 241c84d..6d18d05 100644 --- a/tooling/pipeline/src/main.cc +++ b/tooling/pipeline/src/main.cc @@ -53,16 +53,21 @@ std::optional ParseArguments(const int argc, char** argv) { opt("model,m", prog_opts::value()->default_value(""), "Path to LLM model (gguf)"); - // Sampling Options - opt("temperature", prog_opts::value()->default_value(1.0F), + // Sampling Options - defaults driven from SamplingOptions struct + const SamplingOptions kSamplingDefaults{}; + opt("temperature", + prog_opts::value()->default_value(kSamplingDefaults.temperature), "Sampling temperature (higher = more random)"); - opt("top-p", prog_opts::value()->default_value(0.95F), + opt("top-p", + prog_opts::value()->default_value(kSamplingDefaults.top_p), "Nucleus sampling top-p in (0,1] (higher = more random)"); - opt("top-k", prog_opts::value()->default_value(64), + opt("top-k", + prog_opts::value()->default_value(kSamplingDefaults.top_k), "Top-k sampling parameter (higher = more candidate tokens)"); - opt("n-ctx", prog_opts::value()->default_value(8192), + opt("n-ctx", + prog_opts::value()->default_value(kSamplingDefaults.n_ctx), "Context window size in tokens"); - opt("seed", prog_opts::value()->default_value(-1), + opt("seed", prog_opts::value()->default_value(kSamplingDefaults.seed), "Sampler seed: -1 for random, otherwise non-negative integer"); // Pipeline Options @@ -84,11 +89,11 @@ std::optional ParseArguments(const int argc, char** argv) { } try { - prog_opts::variables_map vm; - prog_opts::store(prog_opts::parse_command_line(argc, argv, desc), vm); - prog_opts::notify(vm); + prog_opts::variables_map var_map; + prog_opts::store(prog_opts::parse_command_line(argc, argv, desc), var_map); + prog_opts::notify(var_map); - if (vm.contains("help")) { + if (var_map.contains("help")) { std::stringstream help_stream; help_stream << "\n" << desc; spdlog::info(help_stream.str()); @@ -97,12 +102,12 @@ std::optional ParseArguments(const int argc, char** argv) { ApplicationOptions options; - options.pipeline.output_path = vm["output"].as(); - options.pipeline.log_path = vm["log-path"].as(); - options.pipeline.prompt_dir = vm["prompt-dir"].as(); + options.pipeline.output_path = var_map["output"].as(); + options.pipeline.log_path = var_map["log-path"].as(); + options.pipeline.prompt_dir = var_map["prompt-dir"].as(); - const bool use_mocked = vm["mocked"].as(); - const std::string model_path = vm["model"].as(); + const bool use_mocked = var_map["mocked"].as(); + const std::string model_path = var_map["model"].as(); if (use_mocked && !model_path.empty()) { spdlog::error( @@ -127,9 +132,9 @@ std::optional ParseArguments(const int argc, char** argv) { options.generator.model_path = model_path; const bool user_provided_sampling = - !vm["temperature"].defaulted() || !vm["top-p"].defaulted() || - !vm["top-k"].defaulted() || !vm["n-ctx"].defaulted() || - !vm["seed"].defaulted(); + !var_map["temperature"].defaulted() || !var_map["top-p"].defaulted() || + !var_map["top-k"].defaulted() || !var_map["n-ctx"].defaulted() || + !var_map["seed"].defaulted(); if (use_mocked) { if (user_provided_sampling) { @@ -137,11 +142,11 @@ std::optional ParseArguments(const int argc, char** argv) { } } else if (user_provided_sampling) { SamplingOptions sampling; - sampling.temperature = vm["temperature"].as(); - sampling.top_p = vm["top-p"].as(); - sampling.top_k = vm["top-k"].as(); - sampling.n_ctx = vm["n-ctx"].as(); - sampling.seed = vm["seed"].as(); + sampling.temperature = var_map["temperature"].as(); + sampling.top_p = var_map["top-p"].as(); + sampling.top_k = var_map["top-k"].as(); + sampling.n_ctx = var_map["n-ctx"].as(); + sampling.seed = var_map["seed"].as(); options.generator.sampling = sampling; } diff --git a/tooling/pipeline/src/services/sqlite/helpers/sqlite_statement_helpers.cpp b/tooling/pipeline/src/services/sqlite/helpers/sqlite_statement_helpers.cpp index fd09056..12620f8 100644 --- a/tooling/pipeline/src/services/sqlite/helpers/sqlite_statement_helpers.cpp +++ b/tooling/pipeline/src/services/sqlite/helpers/sqlite_statement_helpers.cpp @@ -86,16 +86,6 @@ sqlite3_int64 LastInsertRowId(const SqliteDatabaseHandle& db_handle) { return sqlite3_last_insert_rowid(db_handle.get()); } -std::string SerializeLocalLanguages( - const std::vector& local_languages) { - boost::json::array array; - array.reserve(local_languages.size()); - for (const auto& language : local_languages) { - array.emplace_back(language); - } - return boost::json::serialize(array); -} - std::string SerializeVector(const std::vector& str_vec) { boost::json::array array(str_vec.size()); for (const auto& s : str_vec) { diff --git a/tooling/pipeline/src/services/sqlite/process_record.cc b/tooling/pipeline/src/services/sqlite/process_record.cc index 786aa19..be3af22 100644 --- a/tooling/pipeline/src/services/sqlite/process_record.cc +++ b/tooling/pipeline/src/services/sqlite/process_record.cc @@ -3,6 +3,8 @@ * @brief SqliteExportService::ProcessRecord() implementation. */ +#include +#include #include #include diff --git a/tooling/pipeline/src/web_client/curl_web_client_get.cc b/tooling/pipeline/src/web_client/curl_web_client_get.cc index 2e178f7..0a8473e 100644 --- a/tooling/pipeline/src/web_client/curl_web_client_get.cc +++ b/tooling/pipeline/src/web_client/curl_web_client_get.cc @@ -17,6 +17,7 @@ using CurlHandle = std::unique_ptr; static constexpr long kConnectionTimeout = 10; static constexpr long kRequestTimeout = 30; +static constexpr long kMaxRedirects = 5; static constexpr int32_t kOkHttpStatus = 200; static CurlHandle CreateHandle() { @@ -32,7 +33,7 @@ static void SetCommonGetOptions(CURL* curl, const std::string& url) { curl_easy_setopt(curl, CURLOPT_URL, url.c_str()); curl_easy_setopt(curl, CURLOPT_USERAGENT, "biergarten-pipeline/0.1.0"); curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); - curl_easy_setopt(curl, CURLOPT_MAXREDIRS, 5L); + curl_easy_setopt(curl, CURLOPT_MAXREDIRS, kMaxRedirects); curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, kConnectionTimeout); curl_easy_setopt(curl, CURLOPT_TIMEOUT, kRequestTimeout); curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, "gzip");