diff --git a/tooling/pipeline/CMakeLists.txt b/tooling/pipeline/CMakeLists.txt index 24628fd..2985138 100644 --- a/tooling/pipeline/CMakeLists.txt +++ b/tooling/pipeline/CMakeLists.txt @@ -133,7 +133,7 @@ set(SOURCES src/data_generation/llama/helpers.cc src/data_generation/llama/infer.cc src/data_generation/llama/load.cc - src/data_generation/llama/load_brewery_prompt.cc + src/services/prompt_directory.cc src/data_generation/prompt_formatting/gemma4_jinja_prompt_formatter.cc src/data_generation/mock/deterministic_hash.cc src/data_generation/mock/generate_brewery.cc diff --git a/tooling/pipeline/includes/data_generation/llama_generator.h b/tooling/pipeline/includes/data_generation/llama_generator.h index 1e648a3..7479337 100644 --- a/tooling/pipeline/includes/data_generation/llama_generator.h +++ b/tooling/pipeline/includes/data_generation/llama_generator.h @@ -17,6 +17,7 @@ #include "data_generation/data_generator.h" #include "data_generation/prompt_formatting/prompt_formatter.h" #include "data_model/application_options.h" +#include "services/prompt_directory.h" struct llama_model; struct llama_context; @@ -33,10 +34,12 @@ class LlamaGenerator final : public DataGenerator { * @param options Parsed application options. * @param model_path Filesystem path to GGUF model assets. * @param prompt_formatter Formatter that produces model-specific prompts. + * @param prompt_directory Directory service for loading named prompt files. */ LlamaGenerator(const ApplicationOptions& options, const std::string& model_path, - std::unique_ptr prompt_formatter); + std::unique_ptr prompt_formatter, + std::unique_ptr prompt_directory); ~LlamaGenerator() override; @@ -119,15 +122,6 @@ class LlamaGenerator final : public DataGenerator { int max_tokens = kDefaultMaxTokens, std::string_view grammar = {}); - /** - * @brief Loads the brewery system prompt from disk. - * - * @param prompt_file_path Prompt file path to try first. - * @return Loaded prompt text. 
- */ - std::string LoadBrewerySystemPrompt( - const std::filesystem::path& prompt_file_path); - ModelHandle model_; ContextHandle context_; float sampling_temperature_ = 1.0F; @@ -135,8 +129,8 @@ class LlamaGenerator final : public DataGenerator { uint32_t sampling_top_k_ = kDefaultSamplingTopK; std::mt19937 rng_; uint32_t n_ctx_ = kDefaultContextSize; - std::string brewery_system_prompt_; std::unique_ptr prompt_formatter_; + std::unique_ptr prompt_directory_; }; #endif // BIERGARTEN_PIPELINE_INCLUDES_DATA_GENERATION_LLAMA_GENERATOR_H_ diff --git a/tooling/pipeline/includes/data_model/application_options.h b/tooling/pipeline/includes/data_model/application_options.h index 08b7d8c..5893ec8 100644 --- a/tooling/pipeline/includes/data_model/application_options.h +++ b/tooling/pipeline/includes/data_model/application_options.h @@ -53,6 +53,10 @@ struct PipelineOptions { /// @brief Directory for generated artifacts. std::filesystem::path output_path; + /// @brief Directory that contains named prompt files (e.g. + /// BREWERY_GENERATION.md). + std::filesystem::path prompt_dir; + /// @brief Path for application logs. std::filesystem::path log_path; }; diff --git a/tooling/pipeline/includes/services/prompt_directory.h b/tooling/pipeline/includes/services/prompt_directory.h new file mode 100644 index 0000000..53410ad --- /dev/null +++ b/tooling/pipeline/includes/services/prompt_directory.h @@ -0,0 +1,76 @@ +#ifndef BIERGARTEN_PIPELINE_INCLUDES_SERVICES_PROMPT_DIRECTORY_H_ +#define BIERGARTEN_PIPELINE_INCLUDES_SERVICES_PROMPT_DIRECTORY_H_ + +/** + * @file services/prompt_directory.h + * @brief Interface and filesystem-backed implementation for named prompt + * loading. + * + * Prompt files are resolved by key: a key of "BREWERY_GENERATION" maps to the + * file {prompt_dir}/BREWERY_GENERATION.md. The interface is kept intentionally + * narrow so test doubles can be injected without touching the filesystem. 
+ */ + +#include +#include +#include +#include +#include + +/** + * @brief Interface for loading named prompt files. + */ +class IPromptDirectory { + public: + IPromptDirectory() = default; + IPromptDirectory(const IPromptDirectory&) = delete; + IPromptDirectory& operator=(const IPromptDirectory&) = delete; + IPromptDirectory(IPromptDirectory&&) = delete; + IPromptDirectory& operator=(IPromptDirectory&&) = delete; + virtual ~IPromptDirectory() = default; + + /** + * @brief Loads the prompt associated with @p key. + * + * @param key Logical prompt key, e.g. "BREWERY_GENERATION". + * @return Prompt text. + * @throws std::runtime_error if the prompt file cannot be found or read. + */ + [[nodiscard]] virtual std::string Load(std::string_view key) = 0; +}; + +/** + * @brief Filesystem-backed IPromptDirectory implementation. + * + * Each call to Load() checks an in-process cache first, then reads + * {prompt_dir}/{key}.md from disk. The directory must exist and be readable + * at construction time; individual file absence is reported lazily at Load(). + */ +class PromptDirectory final : public IPromptDirectory { + public: + /** + * @brief Constructs a PromptDirectory rooted at @p prompt_dir. + * + * @param prompt_dir Absolute or relative path to the prompt directory. + * @throws std::runtime_error if @p prompt_dir does not exist, is not a + * directory, or cannot be read. + */ + explicit PromptDirectory(const std::filesystem::path& prompt_dir); + + /** + * @brief Loads the prompt for @p key, caching the result. + * + * Maps @p key → {prompt_dir}/{key}.md. + * + * @param key Logical prompt key. + * @return Prompt text. + * @throws std::runtime_error if the file cannot be opened or is empty. 
+ */ + [[nodiscard]] std::string Load(std::string_view key) override; + + private: + std::filesystem::path prompt_dir_; + std::unordered_map cache_; +}; + +#endif // BIERGARTEN_PIPELINE_INCLUDES_SERVICES_PROMPT_DIRECTORY_H_ diff --git a/tooling/pipeline/output/biergarten_seed_2026-05-01T15-12-28.300376Z.sqlite b/tooling/pipeline/output/biergarten_seed_2026-05-01T15-12-28.300376Z.sqlite new file mode 100644 index 0000000..8dbf1c1 Binary files /dev/null and b/tooling/pipeline/output/biergarten_seed_2026-05-01T15-12-28.300376Z.sqlite differ diff --git a/tooling/pipeline/src/data_generation/llama/generate_brewery.cc b/tooling/pipeline/src/data_generation/llama/generate_brewery.cc index 7e92664..f29c637 100644 --- a/tooling/pipeline/src/data_generation/llama/generate_brewery.cc +++ b/tooling/pipeline/src/data_generation/llama/generate_brewery.cc @@ -59,11 +59,12 @@ BreweryResult LlamaGenerator::GenerateBrewery( location.country.empty() ? std::string{} : std::format(", {}", location.country); /** - * Load brewery system prompt from file - * Falls back to minimal inline prompt if file not found + * Load brewery system prompt via the injected prompt directory. + * The key "BREWERY_GENERATION" resolves to BREWERY_GENERATION.md inside + * the configured --prompt-dir. Throws on missing or empty file. 
*/ const std::string system_prompt = - LoadBrewerySystemPrompt("prompts/system.md"); + prompt_directory_->Load("BREWERY_GENERATION"); std::string user_prompt = std::format( "## CITY:\n{}\n\n## COUNTRY:\n{}\n\n## LOCAL LANGUAGE CODES:\n{}\n\n## " diff --git a/tooling/pipeline/src/data_generation/llama/llama_generator.cc b/tooling/pipeline/src/data_generation/llama/llama_generator.cc index 5f28b1b..2a12dc8 100644 --- a/tooling/pipeline/src/data_generation/llama/llama_generator.cc +++ b/tooling/pipeline/src/data_generation/llama/llama_generator.cc @@ -32,9 +32,11 @@ void LlamaGenerator::ContextDeleter::operator()( LlamaGenerator::LlamaGenerator( const ApplicationOptions& options, const std::string& model_path, - std::unique_ptr prompt_formatter) + std::unique_ptr prompt_formatter, + std::unique_ptr prompt_directory) : rng_(std::random_device{}()), - prompt_formatter_(std::move(prompt_formatter)) { + prompt_formatter_(std::move(prompt_formatter)), + prompt_directory_(std::move(prompt_directory)) { if (model_path.empty()) { throw std::runtime_error("LlamaGenerator: model path must not be empty"); } @@ -44,6 +46,11 @@ LlamaGenerator::LlamaGenerator( "LlamaGenerator: prompt formatter dependency must not be null"); } + if (!prompt_directory_) { + throw std::runtime_error( + "LlamaGenerator: prompt directory dependency must not be null"); + } + const auto sampling = options.generator.sampling.value_or(SamplingOptions{}); if (sampling.temperature < 0.0F) { diff --git a/tooling/pipeline/src/data_generation/llama/load_brewery_prompt.cc b/tooling/pipeline/src/data_generation/llama/load_brewery_prompt.cc deleted file mode 100644 index f59d590..0000000 --- a/tooling/pipeline/src/data_generation/llama/load_brewery_prompt.cc +++ /dev/null @@ -1,55 +0,0 @@ -/** - * @file data_generation/llama/load_brewery_prompt.cc - * @brief Resolves brewery system prompt content from cache or a configured - * filesystem path and provides a robust inline fallback prompt when absent. 
- */ - -#include - -#include -#include -#include - -#include "data_generation/llama_generator.h" - -/** - * @brief Loads brewery system prompt from disk or cache. - * - * @param prompt_file_path Preferred prompt file location. - * @return Prompt text loaded from disk. - */ -std::string LlamaGenerator::LoadBrewerySystemPrompt( - const std::filesystem::path& prompt_file_path) { - // Return cached version if already loaded - if (!brewery_system_prompt_.empty()) { - return brewery_system_prompt_; - } - - std::ifstream prompt_file(prompt_file_path); - if (!prompt_file.is_open()) { - spdlog::error( - "LlamaGenerator: Failed to open brewery system prompt file '{}'", - prompt_file_path.string()); - throw std::runtime_error( - "LlamaGenerator: missing brewery system prompt file: " + - prompt_file_path.string()); - } - - const std::string prompt((std::istreambuf_iterator(prompt_file)), - std::istreambuf_iterator()); - prompt_file.close(); - - if (prompt.empty()) { - spdlog::error("LlamaGenerator: Brewery system prompt file '{}' is empty", - prompt_file_path.string()); - throw std::runtime_error( - "LlamaGenerator: empty brewery system prompt file: " + - prompt_file_path.string()); - } - - spdlog::info( - "LlamaGenerator: Loaded brewery system prompt from '{}' ({} chars)", - prompt_file_path.string(), prompt.length()); - brewery_system_prompt_ = prompt; - return brewery_system_prompt_; -} diff --git a/tooling/pipeline/src/main.cc b/tooling/pipeline/src/main.cc index 6206f4f..241c84d 100644 --- a/tooling/pipeline/src/main.cc +++ b/tooling/pipeline/src/main.cc @@ -24,6 +24,7 @@ #include "llama_backend_state.h" #include "services/enrichment_service.h" #include "services/export_service.h" +#include "services/prompt_directory.h" #include "services/sqlite_export_service.h" #include "services/wikipedia_service.h" #include "web_client/curl_web_client.h" @@ -70,6 +71,9 @@ std::optional ParseArguments(const int argc, char** argv) { opt("log-path", 
prog_opts::value()->default_value("pipeline.log"), "Path for application logs"); + opt("prompt-dir", prog_opts::value()->default_value(""), + "Directory containing named prompt files (e.g. BREWERY_GENERATION.md)." + " Required when not using --mocked."); if (argc == 1) { spdlog::info("Biergarten Pipeline"); @@ -95,6 +99,7 @@ std::optional ParseArguments(const int argc, char** argv) { options.pipeline.output_path = vm["output"].as(); options.pipeline.log_path = vm["log-path"].as(); + options.pipeline.prompt_dir = vm["prompt-dir"].as(); const bool use_mocked = vm["mocked"].as(); const std::string model_path = vm["model"].as(); @@ -111,6 +116,13 @@ std::optional ParseArguments(const int argc, char** argv) { return std::nullopt; } + if (!use_mocked && options.pipeline.prompt_dir.empty()) { + spdlog::error( + "Invalid arguments: --prompt-dir is required when not using " + "--mocked"); + return std::nullopt; + } + options.generator.use_mocked = use_mocked; options.generator.model_path = model_path; @@ -172,6 +184,19 @@ int main(const int argc, char** argv) { const auto sampling = options.generator.sampling.value_or(SamplingOptions{}); + // Scenario 4: Validate the prompt directory up-front, before any DI + // wiring, so the error surfaces immediately with a clear message. 
+ std::unique_ptr prompt_directory; + if (!options.generator.use_mocked) { + try { + prompt_directory = + std::make_unique(options.pipeline.prompt_dir); + } catch (const std::exception& dir_error) { + spdlog::error("[Startup] Invalid --prompt-dir: {}", dir_error.what()); + return 1; + } + } + const auto injector = di::make_injector( di::bind().to(), di::bind().to(options), @@ -180,8 +205,8 @@ int main(const int argc, char** argv) { di::bind().to(), di::bind().to(model_path), di::bind().to( - [options, model_path, - sampling](const auto& inj) -> std::unique_ptr { + [options, model_path, sampling, &prompt_directory]( + const auto& inj) -> std::unique_ptr { if (options.generator.use_mocked) { spdlog::info( "[Generator] Using MockGenerator (no model path provided)"); @@ -193,7 +218,13 @@ int main(const int argc, char** argv) { "top-p={}, top-k={}, n_ctx={}, seed={})", model_path, sampling.temperature, sampling.top_p, sampling.top_k, sampling.n_ctx, sampling.seed); - return inj.template create>(); + // Transfer ownership of the pre-validated PromptDirectory into the + // LlamaGenerator. It is captured by reference so it can be moved from; + // a repeat call would pass null, which the LlamaGenerator ctor rejects. + return std::make_unique( + options, model_path, + inj.template create>(), + std::move(prompt_directory)); })); auto generator = diff --git a/tooling/pipeline/src/services/prompt_directory.cc b/tooling/pipeline/src/services/prompt_directory.cc new file mode 100644 index 0000000..ab908b2 --- /dev/null +++ b/tooling/pipeline/src/services/prompt_directory.cc @@ -0,0 +1,85 @@ +/** + * @file services/prompt_directory.cc + * @brief PromptDirectory implementation: validates the directory at + * construction and loads named prompt files on demand with in-process caching. 
+ */ + +#include "services/prompt_directory.h" + +#include + +#include +#include +#include +#include +#include + +// --------------------------------------------------------------------------- +// PromptDirectory +// --------------------------------------------------------------------------- + +PromptDirectory::PromptDirectory(const std::filesystem::path& prompt_dir) + : prompt_dir_(prompt_dir) { + std::error_code ec; + + // Scenario 4: directory must exist (a failed status query is treated alike). + if (!std::filesystem::exists(prompt_dir_, ec) || ec) { + throw std::runtime_error( + "PromptDirectory: prompt directory does not exist: " + + prompt_dir_.string()); + } + + // Scenario 4: path must be a directory, not a file (query errors also fail). + if (!std::filesystem::is_directory(prompt_dir_, ec) || ec) { + throw std::runtime_error( + "PromptDirectory: prompt directory path is not a directory: " + + prompt_dir_.string()); + } + + // Scenario 4: directory must be readable (probe with directory_iterator). + std::filesystem::directory_iterator probe(prompt_dir_, ec); + if (ec) { + throw std::runtime_error( + "PromptDirectory: prompt directory is not readable: " + + prompt_dir_.string() + " (" + ec.message() + ")"); + } + + spdlog::info("[PromptDirectory] Resolved prompt directory: {}", + prompt_dir_.string()); +} + +std::string PromptDirectory::Load(std::string_view key) { + const std::string key_str(key); + + // Return cached content if already loaded during this run. + const auto cache_it = cache_.find(key_str); + if (cache_it != cache_.end()) { + return cache_it->second; + } + + // Scenario 3: resolve {prompt_dir}/{key}.md and require it to be openable. 
+ const std::filesystem::path file_path = + prompt_dir_ / std::filesystem::path(key_str + ".md"); + + std::ifstream file(file_path); + if (!file.is_open()) { + throw std::runtime_error( + "PromptDirectory: prompt file not found for key '" + key_str + + "': " + file_path.string()); + } + + std::string content((std::istreambuf_iterator(file)), + std::istreambuf_iterator()); + file.close(); + + if (content.empty()) { + throw std::runtime_error("PromptDirectory: prompt file for key '" + + key_str + "' is empty: " + file_path.string()); + } + + spdlog::info("[PromptDirectory] Loaded prompt '{}' from '{}' ({} chars)", + key_str, file_path.string(), content.size()); + + cache_.emplace(key_str, content); + return content; +}