readability updates: remove magic numbers, update comments

This commit is contained in:
Aaron Po
2026-05-01 17:38:16 -04:00
parent 9051f55114
commit 91e18888fe
15 changed files with 68 additions and 43 deletions

View File

@@ -33,6 +33,8 @@ static std::string FormatLocalLanguageCodes(
return formatted;
}
// GBNF grammar for structured brewery JSON output.
// @TODO move to a separate gbnf file if it grows in complexity or is shared across modules.
static constexpr std::string_view kBreweryJsonGrammar = R"json_brewery(
root ::= thought-block "{" ws "\"name_en\"" ws ":" ws string ws "," ws "\"description_en\"" ws ":" ws string ws "," ws "\"name_local\"" ws ":" ws string ws "," ws "\"description_local\"" ws ":" ws string ws "}" ws
thought-block ::= [^{]*

View File

@@ -12,6 +12,13 @@
#include "data_generation/llama_generator.h"
#include "data_generation/llama_generator_helpers.h"
// TODO: Implement locale-aware user profile generation.
// Current implementation returns a hardcoded test username and a placeholder
// bio that only embeds the raw locale string; no locale-aware generation is
// performed. Future implementation should:
// 1. Load a USER_GENERATION.md prompt template with locale context
// 2. Perform LLM inference with locale-specific username/bio generation
// 3. Parse and validate JSON output with retry handling (similar to brewery)
// 4. Return locale-aware username and biography
UserResult LlamaGenerator::GenerateUser(const std::string& locale) {
return {.username = "test_user",
.bio = "This is a test user profile from " + locale + "."};

View File

@@ -58,6 +58,11 @@ static std::string CondenseWhitespace(std::string_view text) {
return out;
}
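// Guard against truncating in the first half of the string: after the hard
// cut at max_chars, the text is only trimmed back to its last space when that
// space lies past max_chars / kTruncationGuardDivisor. This preserves the
// opening content of the region description and keeps the word-boundary
// cleanup from discarding context words that appear early in it.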
static constexpr size_t kTruncationGuardDivisor = 2;
/**
* Truncate region context to fit within max length while preserving word
* boundaries
@@ -71,7 +76,8 @@ std::string PrepareRegionContext(std::string_view region_context,
normalized.resize(max_chars);
const size_t last_space = normalized.find_last_of(' ');
if (last_space != std::string::npos && last_space > max_chars / 2) {
if (last_space != std::string::npos &&
last_space > max_chars / kTruncationGuardDivisor) {
normalized.resize(last_space);
}

View File

@@ -19,6 +19,9 @@
#include "llama.h"
static constexpr size_t kPromptTokenSlack = 8;
// Minimum tokens to keep when using top-p sampling. Ensures at least one
// candidate token remains available even with very restrictive top-p values.
static constexpr size_t kTopPMinKeep = 1;
namespace {
@@ -62,7 +65,7 @@ SamplerHandle MakeSamplerChain(const llama_vocab* vocab,
"LlamaGenerator: failed to initialize temperature sampler");
add_sampler(llama_sampler_init_top_k(static_cast<int32_t>(config.top_k)),
"LlamaGenerator: failed to initialize top-k sampler");
add_sampler(llama_sampler_init_top_p(config.top_p, 1),
add_sampler(llama_sampler_init_top_p(config.top_p, kTopPMinKeep),
"LlamaGenerator: failed to initialize top-p sampler");
add_sampler(llama_sampler_init_dist(config.seed),
"LlamaGenerator: failed to initialize distribution sampler");

View File

@@ -14,6 +14,10 @@
#include "data_generation/llama_generator.h"
#include "llama.h"
// Maximum batch size for decode operations. The context's n_batch is set to
// min(n_ctx_, kMaxBatchSize); capping the batch prevents excessive memory
// allocation while maintaining inference performance.
static constexpr uint32_t kMaxBatchSize = 5000U;
void LlamaGenerator::Load(const std::string& model_path) {
context_.reset();
model_.reset();
@@ -28,7 +32,7 @@ void LlamaGenerator::Load(const std::string& model_path) {
llama_context_params context_params = llama_context_default_params();
context_params.n_ctx = n_ctx_;
context_params.n_batch = std::min(n_ctx_, static_cast<uint32_t>(5000));
context_params.n_batch = std::min(n_ctx_, kMaxBatchSize);
LlamaGenerator::ContextHandle loaded_context(
llama_init_from_model(loaded_model.get(), context_params));

View File

@@ -17,9 +17,9 @@ BreweryResult MockGenerator::GenerateBrewery(
const std::string_view adjective =
kBreweryAdjectives.at(hash % kBreweryAdjectives.size());
const std::string_view noun =
kBreweryNouns.at(hash / 7 % kBreweryNouns.size());
const std::string_view base_description =
kBreweryDescriptions.at((hash / 13) % kBreweryDescriptions.size());
kBreweryNouns.at(hash / kNounHashStride % kBreweryNouns.size());
const std::string_view base_description = kBreweryDescriptions.at(
(hash / kDescriptionHashStride) % kBreweryDescriptions.size());
const std::string name =
std::format("{} {} {}", location.city, adjective, noun);

View File

@@ -15,7 +15,7 @@ UserResult MockGenerator::GenerateUser(const std::string& locale) {
UserResult result;
const std::string_view username = kUsernames[hash % kUsernames.size()];
const std::string_view bio = kBios[hash / 11 % kBios.size()];
const std::string_view bio = kBios[hash / kBioHashStride % kBios.size()];
result.username = username;
result.bio = bio;
return result;

View File

@@ -53,16 +53,21 @@ std::optional<ApplicationOptions> ParseArguments(const int argc, char** argv) {
opt("model,m", prog_opts::value<std::string>()->default_value(""),
"Path to LLM model (gguf)");
// Sampling Options
opt("temperature", prog_opts::value<float>()->default_value(1.0F),
// Sampling Options - defaults driven from SamplingOptions struct
const SamplingOptions kSamplingDefaults{};
opt("temperature",
prog_opts::value<float>()->default_value(kSamplingDefaults.temperature),
"Sampling temperature (higher = more random)");
opt("top-p", prog_opts::value<float>()->default_value(0.95F),
opt("top-p",
prog_opts::value<float>()->default_value(kSamplingDefaults.top_p),
"Nucleus sampling top-p in (0,1] (higher = more random)");
opt("top-k", prog_opts::value<uint32_t>()->default_value(64),
opt("top-k",
prog_opts::value<uint32_t>()->default_value(kSamplingDefaults.top_k),
"Top-k sampling parameter (higher = more candidate tokens)");
opt("n-ctx", prog_opts::value<uint32_t>()->default_value(8192),
opt("n-ctx",
prog_opts::value<uint32_t>()->default_value(kSamplingDefaults.n_ctx),
"Context window size in tokens");
opt("seed", prog_opts::value<int>()->default_value(-1),
opt("seed", prog_opts::value<int>()->default_value(kSamplingDefaults.seed),
"Sampler seed: -1 for random, otherwise non-negative integer");
// Pipeline Options
@@ -84,11 +89,11 @@ std::optional<ApplicationOptions> ParseArguments(const int argc, char** argv) {
}
try {
prog_opts::variables_map vm;
prog_opts::store(prog_opts::parse_command_line(argc, argv, desc), vm);
prog_opts::notify(vm);
prog_opts::variables_map var_map;
prog_opts::store(prog_opts::parse_command_line(argc, argv, desc), var_map);
prog_opts::notify(var_map);
if (vm.contains("help")) {
if (var_map.contains("help")) {
std::stringstream help_stream;
help_stream << "\n" << desc;
spdlog::info(help_stream.str());
@@ -97,12 +102,12 @@ std::optional<ApplicationOptions> ParseArguments(const int argc, char** argv) {
ApplicationOptions options;
options.pipeline.output_path = vm["output"].as<std::string>();
options.pipeline.log_path = vm["log-path"].as<std::string>();
options.pipeline.prompt_dir = vm["prompt-dir"].as<std::string>();
options.pipeline.output_path = var_map["output"].as<std::string>();
options.pipeline.log_path = var_map["log-path"].as<std::string>();
options.pipeline.prompt_dir = var_map["prompt-dir"].as<std::string>();
const bool use_mocked = vm["mocked"].as<bool>();
const std::string model_path = vm["model"].as<std::string>();
const bool use_mocked = var_map["mocked"].as<bool>();
const std::string model_path = var_map["model"].as<std::string>();
if (use_mocked && !model_path.empty()) {
spdlog::error(
@@ -127,9 +132,9 @@ std::optional<ApplicationOptions> ParseArguments(const int argc, char** argv) {
options.generator.model_path = model_path;
const bool user_provided_sampling =
!vm["temperature"].defaulted() || !vm["top-p"].defaulted() ||
!vm["top-k"].defaulted() || !vm["n-ctx"].defaulted() ||
!vm["seed"].defaulted();
!var_map["temperature"].defaulted() || !var_map["top-p"].defaulted() ||
!var_map["top-k"].defaulted() || !var_map["n-ctx"].defaulted() ||
!var_map["seed"].defaulted();
if (use_mocked) {
if (user_provided_sampling) {
@@ -137,11 +142,11 @@ std::optional<ApplicationOptions> ParseArguments(const int argc, char** argv) {
}
} else if (user_provided_sampling) {
SamplingOptions sampling;
sampling.temperature = vm["temperature"].as<float>();
sampling.top_p = vm["top-p"].as<float>();
sampling.top_k = vm["top-k"].as<uint32_t>();
sampling.n_ctx = vm["n-ctx"].as<uint32_t>();
sampling.seed = vm["seed"].as<int>();
sampling.temperature = var_map["temperature"].as<float>();
sampling.top_p = var_map["top-p"].as<float>();
sampling.top_k = var_map["top-k"].as<uint32_t>();
sampling.n_ctx = var_map["n-ctx"].as<uint32_t>();
sampling.seed = var_map["seed"].as<int>();
options.generator.sampling = sampling;
}

View File

@@ -86,16 +86,6 @@ sqlite3_int64 LastInsertRowId(const SqliteDatabaseHandle& db_handle) {
return sqlite3_last_insert_rowid(db_handle.get());
}
/**
 * @brief Serialize a list of language strings into JSON array text.
 *
 * Appends each entry of local_languages, in iteration order, to a
 * boost::json::array and returns the output of boost::json::serialize for
 * that array.
 *
 * @param local_languages Strings to place in the array.
 * @return The serialized array as a std::string.
 */
std::string SerializeLocalLanguages(
const std::vector<std::string>& local_languages) {
boost::json::array array;
// Reserve capacity for every entry up front, before appending.
array.reserve(local_languages.size());
for (const auto& language : local_languages) {
array.emplace_back(language);
}
return boost::json::serialize(array);
}
std::string SerializeVector(const std::vector<std::string>& str_vec) {
boost::json::array array(str_vec.size());
for (const auto& s : str_vec) {

View File

@@ -3,6 +3,8 @@
* @brief SqliteExportService::ProcessRecord() implementation.
*/
#include <iomanip>
#include <sstream>
#include <stdexcept>
#include <string>

View File

@@ -17,6 +17,7 @@ using CurlHandle = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
static constexpr long kConnectionTimeout = 10;
static constexpr long kRequestTimeout = 30;
static constexpr long kMaxRedirects = 5;
static constexpr int32_t kOkHttpStatus = 200;
static CurlHandle CreateHandle() {
@@ -32,7 +33,7 @@ static void SetCommonGetOptions(CURL* curl, const std::string& url) {
curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
curl_easy_setopt(curl, CURLOPT_USERAGENT, "biergarten-pipeline/0.1.0");
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
curl_easy_setopt(curl, CURLOPT_MAXREDIRS, 5L);
curl_easy_setopt(curl, CURLOPT_MAXREDIRS, kMaxRedirects);
curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, kConnectionTimeout);
curl_easy_setopt(curl, CURLOPT_TIMEOUT, kRequestTimeout);
curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, "gzip");