refactor(pipeline): restructure config, add PromptDirectory, consolidate SQLite layer (#217)

* Refactor ApplicationOptions to separate config concerns
* Add prompt-dir application option
* Readability updates: remove magic numbers, update comments
* Codebase formatting
* Update docs
* Extract argument parsing, timer out of […]
2026-06-01 01:54:00 +00:00 · 2026-05-02 18:27:14 -04:00
parent 641a479b6a
commit b1dc8e0b5d
35 changed files with 561 additions and 310 deletions
--- a/tooling/pipeline/src/application_options/parse_arguments.cc
+++ b/tooling/pipeline/src/application_options/parse_arguments.cc
@@ -0,0 +1,150 @@
+#include <spdlog/spdlog.h>
+
+#include <optional>
+#include <sstream>
+#include <string>
+
+#include "data_model/application_options.h"
+
+// Translates the command line into an ApplicationOptions value.
+//
+// argc and argv are handed unchanged to prog_opts::parse_command_line.
+//
+// Yields std::nullopt whenever no usable configuration results: usage or
+// help text was logged instead, the generator flags failed validation, or
+// parsing threw. Each of those cases is logged through spdlog first.
+std::optional<ApplicationOptions> ParseArguments(const int argc, char** argv) {
+  prog_opts::options_description description("Pipeline Options");
+  auto add = description.add_options();
+
+  add("help,h", "Produce help message");
+
+  // Sampling flags: every default is read from a default-constructed
+  // SamplingOptions instead of being repeated here as a literal.
+  const SamplingOptions defaults{};
+  add("temperature",
+      prog_opts::value<float>()->default_value(defaults.temperature),
+      "Sampling temperature (higher = more random)");
+  add("top-p", prog_opts::value<float>()->default_value(defaults.top_p),
+      "Nucleus sampling top-p in (0,1] (higher = more random)");
+  add("top-k", prog_opts::value<uint32_t>()->default_value(defaults.top_k),
+      "Top-k sampling parameter (higher = more candidate tokens)");
+  add("n-ctx", prog_opts::value<uint32_t>()->default_value(defaults.n_ctx),
+      "Context window size in tokens");
+  add("seed", prog_opts::value<int>()->default_value(defaults.seed),
+      "Sampler seed: -1 for random, otherwise non-negative integer");
+
+  // Generator flags: --mocked and --model exclude each other, which is
+  // checked after parsing so the failure gets its own log message.
+  add("mocked", prog_opts::bool_switch(),
+      "Use mocked generator for brewery/user data");
+  add("model,m", prog_opts::value<std::string>()->default_value(""),
+      "Path to LLM model (gguf)");
+
+  // Pipeline flags: output location, log file and prompt directory.
+  add("output,o", prog_opts::value<std::string>()->default_value("output"),
+      "Directory for generated artifacts");
+  add("log-path",
+      prog_opts::value<std::string>()->default_value("pipeline.log"),
+      "Path for application logs");
+  add("prompt-dir", prog_opts::value<std::string>()->default_value(""),
+      "Directory containing named prompt files (e.g. BREWERY_GENERATION.md)."
+      " Required when not using --mocked.");
+
+  // A bare invocation is answered with the usage text rather than being
+  // reported as an error.
+  if (argc == 1) {
+    spdlog::info("Biergarten Pipeline");
+    std::ostringstream usage;
+    usage << "\nUsage: biergarten-pipeline [options]\n\n" << description;
+    spdlog::info(usage.str());
+    return std::nullopt;
+  }
+
+  // Anything thrown while parsing or reading values is logged and turned into
+  // std::nullopt by the handlers at the bottom.
+  try {
+    prog_opts::variables_map parsed;
+    prog_opts::store(prog_opts::parse_command_line(argc, argv, description),
+                     parsed);
+    prog_opts::notify(parsed);
+
+    if (parsed.count("help") > 0) {
+      std::ostringstream help;
+      help << "\n" << description;
+      spdlog::info(help.str());
+      return std::nullopt;
+    }
+
+    ApplicationOptions result;
+    auto& pipeline = result.pipeline;
+    auto& generator = result.generator;
+
+    pipeline.output_path = parsed["output"].as<std::string>();
+    pipeline.log_path = parsed["log-path"].as<std::string>();
+    pipeline.prompt_dir = parsed["prompt-dir"].as<std::string>();
+
+    const bool mocked = parsed["mocked"].as<bool>();
+    const std::string model = parsed["model"].as<std::string>();
+
+    // Exactly one generator must be selected. The prompt directory is only
+    // demanded on the model path; a mocked run is accepted without one.
+    if (mocked) {
+      if (!model.empty()) {
+        spdlog::error(
+            "Invalid arguments: --mocked and --model are mutually exclusive");
+        return std::nullopt;
+      }
+    } else {
+      if (model.empty()) {
+        spdlog::error(
+            "Invalid arguments: either --mocked or --model must be specified");
+        return std::nullopt;
+      }
+      if (pipeline.prompt_dir.empty()) {
+        spdlog::error(
+            "Invalid arguments: --prompt-dir is required when not using "
+            "--mocked");
+        return std::nullopt;
+      }
+    }
+
+    generator.use_mocked = mocked;
+    generator.model_path = model;
+
+    // The sampling block is written only when at least one sampling flag was
+    // passed explicitly; otherwise generator.sampling is left untouched.
+    const bool sampling_untouched =
+        parsed["temperature"].defaulted() && parsed["top-p"].defaulted() &&
+        parsed["top-k"].defaulted() && parsed["n-ctx"].defaulted() &&
+        parsed["seed"].defaulted();
+    if (sampling_untouched) {
+      return result;
+    }
+
+    // Explicit sampling flags do not fail a mocked run; they are reported as
+    // ignored and then dropped.
+    if (mocked) {
+      spdlog::warn("Sampling parameters are ignored when using --mocked");
+      return result;
+    }
+
+    // Model run with explicit sampling flags: copy all five values, including
+    // the ones still at their defaults.
+    SamplingOptions sampling;
+    sampling.temperature = parsed["temperature"].as<float>();
+    sampling.top_p = parsed["top-p"].as<float>();
+    sampling.top_k = parsed["top-k"].as<uint32_t>();
+    sampling.n_ctx = parsed["n-ctx"].as<uint32_t>();
+    sampling.seed = parsed["seed"].as<int>();
+    generator.sampling = sampling;
+
+    return result;
+  } catch (const std::exception& ex) {
+    spdlog::error("Failed to parse command-line arguments: {}", ex.what());
+    return std::nullopt;
+  } catch (...) {
+    spdlog::error("Failed to parse command-line arguments: unknown error");
+    return std::nullopt;
+  }
+}