From bc435a7bca84e1df7a218e379e95dade96053fec Mon Sep 17 00:00:00 2001 From: Aaron Po Date: Sat, 2 May 2026 01:07:00 -0400 Subject: [PATCH] Extract argument parsing, timer out of main --- .../includes/data_model/application_options.h | 5 + tooling/pipeline/includes/services/timer.h | 39 +++++ .../application_options/parse_arguments.cc | 150 ++++++++++++++++++ tooling/pipeline/src/main.cc | 140 +--------------- .../services/sqlite/build_database_path.cc | 23 --- ...lpers.cpp => sqlite_connection_helpers.cc} | 0 ...elpers.cpp => sqlite_statement_helpers.cc} | 0 .../src/services/sqlite/initialize.cc | 14 ++ 8 files changed, 209 insertions(+), 162 deletions(-) create mode 100644 tooling/pipeline/includes/services/timer.h create mode 100644 tooling/pipeline/src/application_options/parse_arguments.cc delete mode 100644 tooling/pipeline/src/services/sqlite/build_database_path.cc rename tooling/pipeline/src/services/sqlite/helpers/{sqlite_connection_helpers.cpp => sqlite_connection_helpers.cc} (100%) rename tooling/pipeline/src/services/sqlite/helpers/{sqlite_statement_helpers.cpp => sqlite_statement_helpers.cc} (100%) diff --git a/tooling/pipeline/includes/data_model/application_options.h b/tooling/pipeline/includes/data_model/application_options.h index 5893ec8..1dae82b 100644 --- a/tooling/pipeline/includes/data_model/application_options.h +++ b/tooling/pipeline/includes/data_model/application_options.h @@ -11,6 +11,10 @@ #include #include +#include + +namespace prog_opts = boost::program_options; + /** * @brief LLM sampling parameters. */ @@ -69,4 +73,5 @@ struct ApplicationOptions { PipelineOptions pipeline; }; +std::optional ParseArguments(const int argc, char** argv); #endif // BIERGARTEN_PIPELINE_INCLUDES_DATA_MODEL_APPLICATION_OPTIONS_H_ diff --git a/tooling/pipeline/includes/services/timer.h b/tooling/pipeline/includes/services/timer.h new file mode 100644 index 0000000..8623f59 --- /dev/null +++ b/tooling/pipeline/includes/services/timer.h @@ -0,0 +1,39 @@ +#ifndef BIERGARTEN_PIPELINE_INCLUDES_SERVICES_TIMER_H_ +#define BIERGARTEN_PIPELINE_INCLUDES_SERVICES_TIMER_H_ + +#include + +/** + * @file services/timer.h + * @brief Simple timer utility for measuring elapsed time. + */ +class Timer { + std::chrono::steady_clock::time_point start_time = + std::chrono::steady_clock::now(); + + public: + Timer(const Timer&) = delete; + Timer& operator=(const Timer&) = delete; + Timer(Timer&&) = delete; + Timer& operator=(Timer&&) = delete; + Timer() = default; + ~Timer() = default; + + [[nodiscard]] int64_t Elapsed() const { + return std::chrono::duration_cast( + std::chrono::steady_clock::now() - start_time) + .count(); + } + + [[nodiscard]] int64_t Reset() { + auto previous_elapsed = Elapsed(); + start_time = std::chrono::steady_clock::now(); + return previous_elapsed; + } + + + + +}; + +#endif // BIERGARTEN_PIPELINE_INCLUDES_SERVICES_TIMER_H_ diff --git a/tooling/pipeline/src/application_options/parse_arguments.cc b/tooling/pipeline/src/application_options/parse_arguments.cc new file mode 100644 index 0000000..c938e4d --- /dev/null +++ b/tooling/pipeline/src/application_options/parse_arguments.cc @@ -0,0 +1,150 @@ +#include +#include +#include +#include + +#include "data_model/application_options.h" + + +std::optional ParseArguments(const int argc, char** argv) { + prog_opts::options_description desc("Pipeline Options"); + auto opt = desc.add_options(); + + opt("help,h", "Produce help message"); + + // Defaults sourced from SamplingOptions{} so the CLI and LlamaGenerator + // share a single source of truth — changing the struct updates both. + auto add_sampling_options = [&]() -> void { + const SamplingOptions sampling_defaults{}; + opt("temperature", + prog_opts::value()->default_value(sampling_defaults.temperature), + "Sampling temperature (higher = more random)"); + opt("top-p", + prog_opts::value()->default_value(sampling_defaults.top_p), + "Nucleus sampling top-p in (0,1] (higher = more random)"); + opt("top-k", + prog_opts::value()->default_value(sampling_defaults.top_k), + "Top-k sampling parameter (higher = more candidate tokens)"); + opt("n-ctx", + prog_opts::value()->default_value(sampling_defaults.n_ctx), + "Context window size in tokens"); + opt("seed", prog_opts::value()->default_value(sampling_defaults.seed), + "Sampler seed: -1 for random, otherwise non-negative integer"); + }; + + // --mocked and --model are mutually exclusive; validation is enforced below + // rather than at registration to produce a clear diagnostic message. + auto add_generator_options = [&]() -> void { + opt("mocked", prog_opts::bool_switch(), + "Use mocked generator for brewery/user data"); + opt("model,m", prog_opts::value()->default_value(""), + "Path to LLM model (gguf)"); + }; + + auto add_pipeline_options = [&]() -> void { + opt("output,o", prog_opts::value()->default_value("output"), + "Directory for generated artifacts"); + opt("log-path", + prog_opts::value()->default_value("pipeline.log"), + "Path for application logs"); + opt("prompt-dir", prog_opts::value()->default_value(""), + "Directory containing named prompt files (e.g. BREWERY_GENERATION.md)." + " Required when not using --mocked."); + }; + + add_sampling_options(); + add_generator_options(); + add_pipeline_options(); + + // No flags provided — treat as a help request rather than an error. + if (argc == 1) { + spdlog::info("Biergarten Pipeline"); + std::stringstream usage_stream; + usage_stream << "\nUsage: biergarten-pipeline [options]\n\n" << desc; + spdlog::info(usage_stream.str()); + return std::nullopt; + } + + try { + prog_opts::variables_map var_map; + prog_opts::store(prog_opts::parse_command_line(argc, argv, desc), var_map); + prog_opts::notify(var_map); + + if (var_map.contains("help")) { + std::stringstream help_stream; + help_stream << "\n" << desc; + spdlog::info(help_stream.str()); + return std::nullopt; + } + + ApplicationOptions options; + + options.pipeline.output_path = var_map["output"].as(); + options.pipeline.log_path = var_map["log-path"].as(); + options.pipeline.prompt_dir = var_map["prompt-dir"].as(); + + const bool use_mocked = var_map["mocked"].as(); + const std::string model_path = var_map["model"].as(); + + // Enforce mutual exclusivity before any further configuration is applied. + if (use_mocked && !model_path.empty()) { + spdlog::error( + "Invalid arguments: --mocked and --model are mutually exclusive"); + return std::nullopt; + } + + if (!use_mocked && model_path.empty()) { + spdlog::error( + "Invalid arguments: either --mocked or --model must be specified"); + return std::nullopt; + } + + // Prompt directory is only meaningful for live inference — the mock + // generator has no use for it and should not require it to be present. + if (!use_mocked && options.pipeline.prompt_dir.empty()) { + spdlog::error( + "Invalid arguments: --prompt-dir is required when not using " + "--mocked"); + return std::nullopt; + } + + options.generator.use_mocked = use_mocked; + options.generator.model_path = model_path; + + // Only populate sampling config when the user explicitly overrides at + // least one value. Leaving it as std::nullopt lets LlamaGenerator fall + // back to its own SamplingOptions{} defaults, keeping the two paths + // consistent without redundant copies. + const bool user_provided_sampling = + !var_map["temperature"].defaulted() || !var_map["top-p"].defaulted() || + !var_map["top-k"].defaulted() || !var_map["n-ctx"].defaulted() || + !var_map["seed"].defaulted(); + + if (user_provided_sampling) { + // Warn but do not fail — the run is still valid, the flags are just + // silently irrelevant when no model is loaded. + if (use_mocked) { + spdlog::warn("Sampling parameters are ignored when using --mocked"); + } else { + SamplingOptions sampling; + sampling.temperature = var_map["temperature"].as(); + sampling.top_p = var_map["top-p"].as(); + sampling.top_k = var_map["top-k"].as(); + sampling.n_ctx = var_map["n-ctx"].as(); + sampling.seed = var_map["seed"].as(); + + options.generator.sampling = sampling; + } + } + + return options; + + } catch (const std::exception& exception) { + spdlog::error("Failed to parse command-line arguments: {}", + exception.what()); + return std::nullopt; + } catch (...) { + spdlog::error("Failed to parse command-line arguments: unknown error"); + return std::nullopt; + } +} diff --git a/tooling/pipeline/src/main.cc b/tooling/pipeline/src/main.cc index 2c5cec9..f340939 100644 --- a/tooling/pipeline/src/main.cc +++ b/tooling/pipeline/src/main.cc @@ -28,149 +28,11 @@ #include "services/sqlite_export_service.h" #include "services/wikipedia_service.h" #include "web_client/curl_web_client.h" +#include "services/timer.h" -namespace prog_opts = boost::program_options; namespace di = boost::di; -/** - * @brief Parse command-line arguments into ApplicationOptions. - * - * @param argc Command-line argument count. - * @param argv Command-line arguments. - * @return Parsed ApplicationOptions if parsing succeeded, std::nullopt - * otherwise. - */ -std::optional ParseArguments(const int argc, char** argv) { - prog_opts::options_description desc("Pipeline Options"); - auto opt = desc.add_options(); - - opt("help,h", "Produce help message"); - - // Generator Options - opt("mocked", prog_opts::bool_switch(), - "Use mocked generator for brewery/user data"); - opt("model,m", prog_opts::value()->default_value(""), - "Path to LLM model (gguf)"); - - // Sampling Options - defaults driven from SamplingOptions struct - const SamplingOptions kSamplingDefaults{}; - opt("temperature", - prog_opts::value()->default_value(kSamplingDefaults.temperature), - "Sampling temperature (higher = more random)"); - opt("top-p", - prog_opts::value()->default_value(kSamplingDefaults.top_p), - "Nucleus sampling top-p in (0,1] (higher = more random)"); - opt("top-k", - prog_opts::value()->default_value(kSamplingDefaults.top_k), - "Top-k sampling parameter (higher = more candidate tokens)"); - opt("n-ctx", - prog_opts::value()->default_value(kSamplingDefaults.n_ctx), - "Context window size in tokens"); - opt("seed", prog_opts::value()->default_value(kSamplingDefaults.seed), - "Sampler seed: -1 for random, otherwise non-negative integer"); - - // Pipeline Options - opt("output,o", prog_opts::value()->default_value("output"), - "Directory for generated artifacts"); - opt("log-path", - prog_opts::value()->default_value("pipeline.log"), - "Path for application logs"); - opt("prompt-dir", prog_opts::value()->default_value(""), - "Directory containing named prompt files (e.g. BREWERY_GENERATION.md)." - " Required when not using --mocked."); - - if (argc == 1) { - spdlog::info("Biergarten Pipeline"); - std::stringstream usage_stream; - usage_stream << "\nUsage: biergarten-pipeline [options]\n\n" << desc; - spdlog::info(usage_stream.str()); - return std::nullopt; - } - - try { - prog_opts::variables_map var_map; - prog_opts::store(prog_opts::parse_command_line(argc, argv, desc), var_map); - prog_opts::notify(var_map); - - if (var_map.contains("help")) { - std::stringstream help_stream; - help_stream << "\n" << desc; - spdlog::info(help_stream.str()); - return std::nullopt; - } - - ApplicationOptions options; - - options.pipeline.output_path = var_map["output"].as(); - options.pipeline.log_path = var_map["log-path"].as(); - options.pipeline.prompt_dir = var_map["prompt-dir"].as(); - - const bool use_mocked = var_map["mocked"].as(); - const std::string model_path = var_map["model"].as(); - - if (use_mocked && !model_path.empty()) { - spdlog::error( - "Invalid arguments: --mocked and --model are mutually exclusive"); - return std::nullopt; - } - - if (!use_mocked && model_path.empty()) { - spdlog::error( - "Invalid arguments: Either --mocked or --model must be specified"); - return std::nullopt; - } - - if (!use_mocked && options.pipeline.prompt_dir.empty()) { - spdlog::error( - "Invalid arguments: --prompt-dir is required when not using " - "--mocked"); - return std::nullopt; - } - - options.generator.use_mocked = use_mocked; - options.generator.model_path = model_path; - - const bool user_provided_sampling = - !var_map["temperature"].defaulted() || !var_map["top-p"].defaulted() || - !var_map["top-k"].defaulted() || !var_map["n-ctx"].defaulted() || - !var_map["seed"].defaulted(); - - if (use_mocked) { - if (user_provided_sampling) { - spdlog::warn("Sampling parameters are ignored when using --mocked"); - } - } else if (user_provided_sampling) { - SamplingOptions sampling; - sampling.temperature = var_map["temperature"].as(); - sampling.top_p = var_map["top-p"].as(); - sampling.top_k = var_map["top-k"].as(); - sampling.n_ctx = var_map["n-ctx"].as(); - sampling.seed = var_map["seed"].as(); - - options.generator.sampling = sampling; - } - - return options; - } catch (const std::exception& exception) { - spdlog::error("Failed to parse command-line arguments: {}", - exception.what()); - return std::nullopt; - } catch (...) { - spdlog::error("Failed to parse command-line arguments: unknown error"); - return std::nullopt; - } -} - -struct Timer { - std::chrono::steady_clock::time_point start_time = - std::chrono::steady_clock::now(); - [[nodiscard]] int64_t Elapsed() const { - return std::chrono::duration_cast( - std::chrono::steady_clock::now() - start_time) - .count(); - } -}; int main(const int argc, char** argv) { try { diff --git a/tooling/pipeline/src/services/sqlite/build_database_path.cc b/tooling/pipeline/src/services/sqlite/build_database_path.cc deleted file mode 100644 index 8786fe4..0000000 --- a/tooling/pipeline/src/services/sqlite/build_database_path.cc +++ /dev/null @@ -1,23 +0,0 @@ -/** - * @file services/sqlite/build_database_path.cc - * @brief SqliteExportService::BuildDatabasePath() implementation. - */ - -#include -#include - -#include "services/sqlite_export_service.h" - -std::filesystem::path SqliteExportService::BuildDatabasePath() const { - std::filesystem::path base_filename("biergarten_seed_" + run_timestamp_utc_ + - ".sqlite"); - std::filesystem::path candidate = output_path_ / base_filename; - - for (int suffix = 1; std::filesystem::exists(candidate); ++suffix) { - candidate = output_path_ / - std::filesystem::path("biergarten_seed_" + run_timestamp_utc_ + - "-" + std::to_string(suffix) + ".sqlite"); - } - - return candidate; -} diff --git a/tooling/pipeline/src/services/sqlite/helpers/sqlite_connection_helpers.cpp b/tooling/pipeline/src/services/sqlite/helpers/sqlite_connection_helpers.cc similarity index 100% rename from tooling/pipeline/src/services/sqlite/helpers/sqlite_connection_helpers.cpp rename to tooling/pipeline/src/services/sqlite/helpers/sqlite_connection_helpers.cc diff --git a/tooling/pipeline/src/services/sqlite/helpers/sqlite_statement_helpers.cpp b/tooling/pipeline/src/services/sqlite/helpers/sqlite_statement_helpers.cc similarity index 100% rename from tooling/pipeline/src/services/sqlite/helpers/sqlite_statement_helpers.cpp rename to tooling/pipeline/src/services/sqlite/helpers/sqlite_statement_helpers.cc diff --git a/tooling/pipeline/src/services/sqlite/initialize.cc b/tooling/pipeline/src/services/sqlite/initialize.cc index e86b21d..bbe6515 100644 --- a/tooling/pipeline/src/services/sqlite/initialize.cc +++ b/tooling/pipeline/src/services/sqlite/initialize.cc @@ -11,6 +11,20 @@ #include "services/sqlite_export_service.h" #include "services/sqlite_export_service_helpers.h" +std::filesystem::path SqliteExportService::BuildDatabasePath() const { + std::filesystem::path base_filename("biergarten_seed_" + run_timestamp_utc_ + + ".sqlite"); + std::filesystem::path candidate = output_path_ / base_filename; + + for (int suffix = 1; std::filesystem::exists(candidate); ++suffix) { + candidate = output_path_ / + std::filesystem::path("biergarten_seed_" + run_timestamp_utc_ + + "-" + std::to_string(suffix) + ".sqlite"); + } + + return candidate; +} + void SqliteExportService::InitializeSchema() const { sqlite_export_service_internal::ExecSql( db_handle_, sqlite_export_service_internal::kCreateLocationsTableSql,