diff --git a/docs/pipeline/diagrams/planned/class.puml b/docs/pipeline/diagrams/planned/class.puml index 3b775cf..fd950c9 100644 --- a/docs/pipeline/diagrams/planned/class.puml +++ b/docs/pipeline/diagrams/planned/class.puml @@ -141,37 +141,38 @@ package "Domain: Models" { LocationContext *-- Completeness } - -package "Domain: Application Configuration"{ +@startuml +package "Domain: Application Configuration" { class SamplingOptions { - + temperature : float = 1.0F - + top_p : float = 0.95F - + top_k : uint32_t = 64 - + n_ctx : uint32_t = 8192 - + seed : int = -1 + + temperature: float = 1.0F + + top_p: float = 0.95F + + top_k: uint32_t = 64 + + n_ctx: uint32_t = 8192 + + seed: int = -1 } class GeneratorOptions { - + model_path : std::filesystem::path - + use_mocked : bool = false - + sampling : SamplingOptions + + model_path: std::filesystem::path + + use_mocked: bool = false + + sampling: std::optional<SamplingOptions> } class PipelineOptions { - + output_path : std::filesystem::path - + log_path : std::filesystem::path + + output_path: std::filesystem::path + + log_path: std::filesystem::path } class ApplicationOptions { - + generator : GeneratorOptions - + pipeline : PipelineOptions + + generator: GeneratorOptions + + pipeline: PipelineOptions } ' --- Domain Model Relationships --- ApplicationOptions *-- GeneratorOptions ApplicationOptions *-- PipelineOptions - GeneratorOptions *-- SamplingOptions + GeneratorOptions o-- SamplingOptions } +@enduml package "Domain: Policy" { diff --git a/tooling/pipeline/.gitignore b/tooling/pipeline/.gitignore index c7078bb..c3075e5 100644 --- a/tooling/pipeline/.gitignore +++ b/tooling/pipeline/.gitignore @@ -6,3 +6,4 @@ data models *.gguf BiergartenPipeline.png +output \ No newline at end of file diff --git a/tooling/pipeline/CMakeLists.txt b/tooling/pipeline/CMakeLists.txt index 651b985..2985138 100644 --- a/tooling/pipeline/CMakeLists.txt +++ b/tooling/pipeline/CMakeLists.txt @@ -85,14 +85,14 @@ endif() FetchContent_Declare( llama-cpp 
GIT_REPOSITORY https://github.com/ggml-org/llama.cpp.git - GIT_TAG b8742 + GIT_TAG b8742 ) FetchContent_MakeAvailable(llama-cpp) FetchContent_Declare( boost-di GIT_REPOSITORY https://github.com/boost-ext/di.git - GIT_TAG v1.3.0 + GIT_TAG v1.3.0 ) FetchContent_MakeAvailable(boost-di) if(TARGET Boost.DI AND NOT TARGET boost::di) @@ -102,7 +102,7 @@ endif() FetchContent_Declare( spdlog GIT_REPOSITORY https://github.com/gabime/spdlog.git - GIT_TAG v1.15.3 + GIT_TAG v1.15.3 ) FetchContent_MakeAvailable(spdlog) @@ -121,8 +121,8 @@ set(SOURCES src/services/wikipedia/fetch_extract.cc src/services/sqlite/sqlite_export_service.cc src/services/sqlite/build_database_path.cc - src/services/sqlite/process_record.cc - src/services/sqlite/initialize.cc + src/services/sqlite/process_record.cc + src/services/sqlite/initialize.cc src/services/sqlite/finalize.cc src/web_client/curl_global_state.cc src/web_client/curl_web_client_get.cc @@ -133,14 +133,14 @@ set(SOURCES src/data_generation/llama/helpers.cc src/data_generation/llama/infer.cc src/data_generation/llama/load.cc - src/data_generation/llama/load_brewery_prompt.cc + src/services/prompt_directory.cc src/data_generation/prompt_formatting/gemma4_jinja_prompt_formatter.cc src/data_generation/mock/deterministic_hash.cc src/data_generation/mock/generate_brewery.cc src/data_generation/mock/generate_user.cc src/json_handling/json_loader.cc - src/services/sqlite/helpers/sqlite_connection_helpers.cpp - src/services/sqlite/helpers/sqlite_statement_helpers.cpp + src/services/sqlite/helpers/sqlite_connection_helpers.cpp + src/services/sqlite/helpers/sqlite_statement_helpers.cpp ) # ============================================================================= @@ -173,6 +173,6 @@ configure_file( add_custom_command(TARGET ${PROJECT_NAME} POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_directory - ${CMAKE_SOURCE_DIR}/prompts - ${CMAKE_BINARY_DIR}/prompts + ${CMAKE_SOURCE_DIR}/prompts + ${CMAKE_BINARY_DIR}/prompts ) diff --git 
a/tooling/pipeline/includes/data_generation/llama_generator.h b/tooling/pipeline/includes/data_generation/llama_generator.h index 1e648a3..7479337 100644 --- a/tooling/pipeline/includes/data_generation/llama_generator.h +++ b/tooling/pipeline/includes/data_generation/llama_generator.h @@ -17,6 +17,7 @@ #include "data_generation/data_generator.h" #include "data_generation/prompt_formatting/prompt_formatter.h" #include "data_model/application_options.h" +#include "services/prompt_directory.h" struct llama_model; struct llama_context; @@ -33,10 +34,12 @@ class LlamaGenerator final : public DataGenerator { * @param options Parsed application options. * @param model_path Filesystem path to GGUF model assets. * @param prompt_formatter Formatter that produces model-specific prompts. + * @param prompt_directory Directory service for loading named prompt files. */ LlamaGenerator(const ApplicationOptions& options, const std::string& model_path, - std::unique_ptr prompt_formatter); + std::unique_ptr prompt_formatter, + std::unique_ptr prompt_directory); ~LlamaGenerator() override; @@ -119,15 +122,6 @@ class LlamaGenerator final : public DataGenerator { int max_tokens = kDefaultMaxTokens, std::string_view grammar = {}); - /** - * @brief Loads the brewery system prompt from disk. - * - * @param prompt_file_path Prompt file path to try first. - * @return Loaded prompt text. 
- */ - std::string LoadBrewerySystemPrompt( - const std::filesystem::path& prompt_file_path); - ModelHandle model_; ContextHandle context_; float sampling_temperature_ = 1.0F; @@ -135,8 +129,8 @@ class LlamaGenerator final : public DataGenerator { uint32_t sampling_top_k_ = kDefaultSamplingTopK; std::mt19937 rng_; uint32_t n_ctx_ = kDefaultContextSize; - std::string brewery_system_prompt_; std::unique_ptr prompt_formatter_; + std::unique_ptr prompt_directory_; }; #endif // BIERGARTEN_PIPELINE_INCLUDES_DATA_GENERATION_LLAMA_GENERATOR_H_ diff --git a/tooling/pipeline/includes/data_generation/mock_generator.h b/tooling/pipeline/includes/data_generation/mock_generator.h index d91c1e3..5924164 100644 --- a/tooling/pipeline/includes/data_generation/mock_generator.h +++ b/tooling/pipeline/includes/data_generation/mock_generator.h @@ -44,6 +44,13 @@ class MockGenerator final : public DataGenerator { */ static size_t DeterministicHash(const Location& location); + // Hash stride constants for deterministic distribution across fixed-size + // arrays. These coprime strides spread hash values uniformly without + // clustering, ensuring diverse output across different hash inputs. + static constexpr size_t kNounHashStride = 7; + static constexpr size_t kDescriptionHashStride = 13; + static constexpr size_t kBioHashStride = 11; + static constexpr std::array kBreweryAdjectives = { "Craft", "Heritage", "Local", "Artisan", "Pioneer", "Golden", "Modern", "Classic", "Summit", "Northern", "Riverstone", "Barrel", diff --git a/tooling/pipeline/includes/data_model/application_options.h b/tooling/pipeline/includes/data_model/application_options.h index 1d36bd7..7cbab47 100644 --- a/tooling/pipeline/includes/data_model/application_options.h +++ b/tooling/pipeline/includes/data_model/application_options.h @@ -6,37 +6,71 @@ * @brief Program options for the Biergarten pipeline application. 
*/ +#include #include +#include +#include #include +namespace prog_opts = boost::program_options; + /** - * @brief Program options for the Biergarten pipeline application. + * @brief LLM sampling parameters. */ -struct ApplicationOptions { - /// @brief Path to the LLM model file (gguf format); mutually exclusive with - /// use_mocked. - std::string model_path; - - /// @brief Use mocked generator instead of LLM; mutually exclusive with - /// model_path. - bool use_mocked = false; - +struct SamplingOptions { /// @brief LLM sampling temperature (0.0 to 1.0, higher = more random). float temperature = 1.0F; - /// @brief LLM nucleus sampling top-p parameter (0.0 to 1.0, higher = more - /// random). + /// @brief LLM nucleus sampling top-p parameter. float top_p = 0.95F; /// @brief LLM top-k sampling parameter. uint32_t top_k = 64; - /// @brief Context window size (tokens) for LLM inference. Higher values - /// support longer prompts but use more memory. + /// @brief Context window size (tokens). uint32_t n_ctx = 8192; - /// @brief Random seed for sampling (-1 for random, otherwise non-negative). + /// @brief Random seed (-1 for random, otherwise non-negative). int seed = -1; }; +/** + * @brief Configuration for the LLM generator component. + */ +struct GeneratorOptions { + /// @brief Path to the LLM model file (gguf format). + std::filesystem::path model_path; + + /// @brief Use mocked generator instead of actual LLM inference. + bool use_mocked = false; + + /// @brief Specific sampling parameters for this generator. + /// If nullopt, the application should use global defaults. + std::optional sampling; +}; + +/** + * @brief Configuration for the pipeline execution and output. + */ +struct PipelineOptions { + /// @brief Directory for generated artifacts. + std::filesystem::path output_path; + + /// @brief Directory that contains named prompt files (e.g. + /// BREWERY_GENERATION.md). + std::filesystem::path prompt_dir; + + /// @brief Path for application logs. 
+ std::filesystem::path log_path; +}; + +/** + * @brief Root configuration object for the Biergarten pipeline. + */ +struct ApplicationOptions { + GeneratorOptions generator; + PipelineOptions pipeline; +}; + +std::optional ParseArguments(const int argc, char** argv); #endif // BIERGARTEN_PIPELINE_INCLUDES_DATA_MODEL_APPLICATION_OPTIONS_H_ diff --git a/tooling/pipeline/includes/services/export_service.h b/tooling/pipeline/includes/services/export_service.h index 55d0b06..3b5d6c6 100644 --- a/tooling/pipeline/includes/services/export_service.h +++ b/tooling/pipeline/includes/services/export_service.h @@ -6,6 +6,8 @@ * @brief Abstraction for persisting generated brewery data. */ +#include + #include "data_model/generated_brewery.h" /** diff --git a/tooling/pipeline/includes/services/prompt_directory.h b/tooling/pipeline/includes/services/prompt_directory.h new file mode 100644 index 0000000..53410ad --- /dev/null +++ b/tooling/pipeline/includes/services/prompt_directory.h @@ -0,0 +1,76 @@ +#ifndef BIERGARTEN_PIPELINE_INCLUDES_SERVICES_PROMPT_DIRECTORY_H_ +#define BIERGARTEN_PIPELINE_INCLUDES_SERVICES_PROMPT_DIRECTORY_H_ + +/** + * @file services/prompt_directory.h + * @brief Interface and filesystem-backed implementation for named prompt + * loading. + * + * Prompt files are resolved by key: a key of "BREWERY_GENERATION" maps to the + * file <prompt_dir>/BREWERY_GENERATION.md. The interface is kept intentionally + * narrow so test doubles can be injected without touching the filesystem. + */ + +#include +#include +#include +#include +#include + +/** + * @brief Interface for loading named prompt files. 
+ */ +class IPromptDirectory { + public: + IPromptDirectory() = default; + IPromptDirectory(const IPromptDirectory&) = delete; + IPromptDirectory& operator=(const IPromptDirectory&) = delete; + IPromptDirectory(IPromptDirectory&&) = delete; + IPromptDirectory& operator=(IPromptDirectory&&) = delete; + virtual ~IPromptDirectory() = default; + + /** + * @brief Loads the prompt associated with @p key. + * + * @param key Logical prompt key, e.g. "BREWERY_GENERATION". + * @return Prompt text. + * @throws std::runtime_error if the prompt file cannot be found or read. + */ + [[nodiscard]] virtual std::string Load(std::string_view key) = 0; +}; + +/** + * @brief Filesystem-backed IPromptDirectory implementation. + * + * Each call to Load() checks an in-process cache first, then reads + * <prompt_dir>/<key>.md from disk. The directory must exist and be readable + * at construction time; individual file absence is reported lazily at Load(). + */ +class PromptDirectory final : public IPromptDirectory { + public: + /** + * @brief Constructs a PromptDirectory rooted at @p prompt_dir. + * + * @param prompt_dir Absolute or relative path to the prompt directory. + * @throws std::runtime_error if @p prompt_dir does not exist or is not a + * directory. + */ + explicit PromptDirectory(const std::filesystem::path& prompt_dir); + + /** + * @brief Loads the prompt for @p key, caching the result. + * + * Maps @p key → <prompt_dir>/<key>.md. + * + * @param key Logical prompt key. + * @return Prompt text. + * @throws std::runtime_error if the file does not exist or is empty. 
+ */ + [[nodiscard]] std::string Load(std::string_view key) override; + + private: + std::filesystem::path prompt_dir_; + std::unordered_map cache_; +}; + +#endif // BIERGARTEN_PIPELINE_INCLUDES_SERVICES_PROMPT_DIRECTORY_H_ diff --git a/tooling/pipeline/includes/services/sqlite_connection_helpers.h b/tooling/pipeline/includes/services/sqlite_connection_helpers.h index 7509c0e..b8dd370 100644 --- a/tooling/pipeline/includes/services/sqlite_connection_helpers.h +++ b/tooling/pipeline/includes/services/sqlite_connection_helpers.h @@ -7,6 +7,7 @@ */ #include + #include #include #include @@ -20,12 +21,10 @@ void ThrowSqliteError(sqlite3* db_handle, std::string_view action); SqliteDatabaseHandle OpenDatabase(const std::filesystem::path& path); void ExecSql(const SqliteDatabaseHandle& db_handle, std::string_view sql, - const char* action); + const char* action); void RollbackTransactionNoThrow(const SqliteDatabaseHandle& db_handle) noexcept; } // namespace sqlite_export_service_internal #endif // BIERGARTEN_PIPELINE_INCLUDES_SERVICES_SQLITE_CONNECTION_HELPERS_H_ - - diff --git a/tooling/pipeline/includes/services/sqlite_export_service.h b/tooling/pipeline/includes/services/sqlite_export_service.h index 0fa998f..ebae1ec 100644 --- a/tooling/pipeline/includes/services/sqlite_export_service.h +++ b/tooling/pipeline/includes/services/sqlite_export_service.h @@ -11,6 +11,7 @@ #include #include +#include "data_model/application_options.h" #include "services/date_time_provider.h" #include "services/export_service.h" #include "services/sqlite_export_service_helpers.h" @@ -20,7 +21,7 @@ */ class SqliteExportService final : public IExportService { public: - SqliteExportService(); + explicit SqliteExportService(const ApplicationOptions& options); ~SqliteExportService() override; SqliteExportService(const SqliteExportService&) = delete; @@ -41,12 +42,12 @@ class SqliteExportService final : public IExportService { void InitializeSchema() const; void PrepareStatements(); void 
RollbackAndCloseNoThrow() noexcept; - void FinalizeStatements() noexcept; [[nodiscard]] std::filesystem::path BuildDatabasePath() const; [[nodiscard]] static std::string BuildLocationKey(const Location& location); std::unique_ptr date_time_provider_; + std::filesystem::path output_path_; std::string run_timestamp_utc_; std::filesystem::path database_path_; SqliteDatabaseHandle db_handle_; diff --git a/tooling/pipeline/includes/services/sqlite_export_service_helpers.h b/tooling/pipeline/includes/services/sqlite_export_service_helpers.h index 9b63aa5..e18766a 100644 --- a/tooling/pipeline/includes/services/sqlite_export_service_helpers.h +++ b/tooling/pipeline/includes/services/sqlite_export_service_helpers.h @@ -3,8 +3,8 @@ /* Umbrella header for backward compatibility. */ -#include "services/sqlite_handle_types.h" #include "services/sqlite_connection_helpers.h" +#include "services/sqlite_handle_types.h" #include "services/sqlite_statement_helpers.h" #endif // BIERGARTEN_PIPELINE_INCLUDES_SERVICES_SQLITE_EXPORT_SERVICE_HELPERS_H_ diff --git a/tooling/pipeline/includes/services/sqlite_handle_types.h b/tooling/pipeline/includes/services/sqlite_handle_types.h index 6994c4a..92da0ef 100644 --- a/tooling/pipeline/includes/services/sqlite_handle_types.h +++ b/tooling/pipeline/includes/services/sqlite_handle_types.h @@ -6,6 +6,7 @@ */ #include + #include #include @@ -33,4 +34,3 @@ struct BindParam { } // namespace sqlite_export_service_internal #endif // BIERGARTEN_PIPELINE_INCLUDES_SERVICES_SQLITE_HANDLE_TYPES_H_ - diff --git a/tooling/pipeline/includes/services/sqlite_statement_helpers.h b/tooling/pipeline/includes/services/sqlite_statement_helpers.h index 5f3315b..35cf829 100644 --- a/tooling/pipeline/includes/services/sqlite_statement_helpers.h +++ b/tooling/pipeline/includes/services/sqlite_statement_helpers.h @@ -3,10 +3,12 @@ /** * @file services/sqlite_statement_helpers.h - * @brief Declarations for statement-level SQLite helper functions and constants. 
+ * @brief Declarations for statement-level SQLite helper functions and + * constants. */ #include + #include #include #include @@ -107,10 +109,8 @@ void StepStatement(const SqliteDatabaseHandle& db_handle, sqlite3_int64 LastInsertRowId(const SqliteDatabaseHandle& db_handle); -std::string SerializeLocalLanguages(const std::vector& local_languages); std::string SerializeVector(const std::vector& str_vec); } // namespace sqlite_export_service_internal #endif // BIERGARTEN_PIPELINE_INCLUDES_SERVICES_SQLITE_STATEMENT_HELPERS_H_ - diff --git a/tooling/pipeline/includes/services/timer.h b/tooling/pipeline/includes/services/timer.h new file mode 100644 index 0000000..5542721 --- /dev/null +++ b/tooling/pipeline/includes/services/timer.h @@ -0,0 +1,35 @@ +#ifndef BIERGARTEN_PIPELINE_INCLUDES_SERVICES_TIMER_H_ +#define BIERGARTEN_PIPELINE_INCLUDES_SERVICES_TIMER_H_ + +#include + +/** + * @file services/timer.h + * @brief Simple timer utility for measuring elapsed time. + */ +class Timer { + std::chrono::steady_clock::time_point start_time = + std::chrono::steady_clock::now(); + + public: + Timer(const Timer&) = delete; + Timer& operator=(const Timer&) = delete; + Timer(Timer&&) = delete; + Timer& operator=(Timer&&) = delete; + Timer() = default; + ~Timer() = default; + + [[nodiscard]] int64_t Elapsed() const { + return std::chrono::duration_cast( + std::chrono::steady_clock::now() - start_time) + .count(); + } + + [[nodiscard]] int64_t Reset() { + auto previous_elapsed = Elapsed(); + start_time = std::chrono::steady_clock::now(); + return previous_elapsed; + } +}; + +#endif // BIERGARTEN_PIPELINE_INCLUDES_SERVICES_TIMER_H_ diff --git a/tooling/pipeline/prompts/system.md b/tooling/pipeline/prompts/BREWERY_GENERATION.md similarity index 100% rename from tooling/pipeline/prompts/system.md rename to tooling/pipeline/prompts/BREWERY_GENERATION.md diff --git a/tooling/pipeline/src/application_options/parse_arguments.cc b/tooling/pipeline/src/application_options/parse_arguments.cc 
new file mode 100644 index 0000000..019cb32 --- /dev/null +++ b/tooling/pipeline/src/application_options/parse_arguments.cc @@ -0,0 +1,150 @@ +#include + +#include +#include +#include + +#include "data_model/application_options.h" + +std::optional ParseArguments(const int argc, char** argv) { + prog_opts::options_description desc("Pipeline Options"); + auto opt = desc.add_options(); + + opt("help,h", "Produce help message"); + + // Defaults sourced from SamplingOptions{} so the CLI and LlamaGenerator + // share a single source of truth — changing the struct updates both. + auto add_sampling_options = [&]() -> void { + const SamplingOptions sampling_defaults{}; + opt("temperature", + prog_opts::value()->default_value(sampling_defaults.temperature), + "Sampling temperature (higher = more random)"); + opt("top-p", + prog_opts::value()->default_value(sampling_defaults.top_p), + "Nucleus sampling top-p in (0,1] (higher = more random)"); + opt("top-k", + prog_opts::value()->default_value(sampling_defaults.top_k), + "Top-k sampling parameter (higher = more candidate tokens)"); + opt("n-ctx", + prog_opts::value()->default_value(sampling_defaults.n_ctx), + "Context window size in tokens"); + opt("seed", prog_opts::value()->default_value(sampling_defaults.seed), + "Sampler seed: -1 for random, otherwise non-negative integer"); + }; + + // --mocked and --model are mutually exclusive; validation is enforced below + // rather than at registration to produce a clear diagnostic message. 
+ auto add_generator_options = [&]() -> void { + opt("mocked", prog_opts::bool_switch(), + "Use mocked generator for brewery/user data"); + opt("model,m", prog_opts::value()->default_value(""), + "Path to LLM model (gguf)"); + }; + + auto add_pipeline_options = [&]() -> void { + opt("output,o", prog_opts::value()->default_value("output"), + "Directory for generated artifacts"); + opt("log-path", + prog_opts::value()->default_value("pipeline.log"), + "Path for application logs"); + opt("prompt-dir", prog_opts::value()->default_value(""), + "Directory containing named prompt files (e.g. BREWERY_GENERATION.md)." + " Required when not using --mocked."); + }; + + add_sampling_options(); + add_generator_options(); + add_pipeline_options(); + + // No flags provided — treat as a help request rather than an error. + if (argc == 1) { + spdlog::info("Biergarten Pipeline"); + std::stringstream usage_stream; + usage_stream << "\nUsage: biergarten-pipeline [options]\n\n" << desc; + spdlog::info(usage_stream.str()); + return std::nullopt; + } + + try { + prog_opts::variables_map var_map; + prog_opts::store(prog_opts::parse_command_line(argc, argv, desc), var_map); + prog_opts::notify(var_map); + + if (var_map.contains("help")) { + std::stringstream help_stream; + help_stream << "\n" << desc; + spdlog::info(help_stream.str()); + return std::nullopt; + } + + ApplicationOptions options; + + options.pipeline.output_path = var_map["output"].as(); + options.pipeline.log_path = var_map["log-path"].as(); + options.pipeline.prompt_dir = var_map["prompt-dir"].as(); + + const bool use_mocked = var_map["mocked"].as(); + const std::string model_path = var_map["model"].as(); + + // Enforce mutual exclusivity before any further configuration is applied. 
+ if (use_mocked && !model_path.empty()) { + spdlog::error( + "Invalid arguments: --mocked and --model are mutually exclusive"); + return std::nullopt; + } + + if (!use_mocked && model_path.empty()) { + spdlog::error( + "Invalid arguments: either --mocked or --model must be specified"); + return std::nullopt; + } + + // Prompt directory is only meaningful for live inference — the mock + // generator has no use for it and should not require it to be present. + if (!use_mocked && options.pipeline.prompt_dir.empty()) { + spdlog::error( + "Invalid arguments: --prompt-dir is required when not using " + "--mocked"); + return std::nullopt; + } + + options.generator.use_mocked = use_mocked; + options.generator.model_path = model_path; + + // Only populate sampling config when the user explicitly overrides at + // least one value. Leaving it as std::nullopt lets LlamaGenerator fall + // back to its own SamplingOptions{} defaults, keeping the two paths + // consistent without redundant copies. + const bool user_provided_sampling = + !var_map["temperature"].defaulted() || !var_map["top-p"].defaulted() || + !var_map["top-k"].defaulted() || !var_map["n-ctx"].defaulted() || + !var_map["seed"].defaulted(); + + if (user_provided_sampling) { + // Warn but do not fail — the run is still valid, the flags are just + // silently irrelevant when no model is loaded. + if (use_mocked) { + spdlog::warn("Sampling parameters are ignored when using --mocked"); + } else { + SamplingOptions sampling; + sampling.temperature = var_map["temperature"].as(); + sampling.top_p = var_map["top-p"].as(); + sampling.top_k = var_map["top-k"].as(); + sampling.n_ctx = var_map["n-ctx"].as(); + sampling.seed = var_map["seed"].as(); + + options.generator.sampling = sampling; + } + } + + return options; + + } catch (const std::exception& exception) { + spdlog::error("Failed to parse command-line arguments: {}", + exception.what()); + return std::nullopt; + } catch (...) 
{ + spdlog::error("Failed to parse command-line arguments: unknown error"); + return std::nullopt; + } +} diff --git a/tooling/pipeline/src/data_generation/llama/generate_brewery.cc b/tooling/pipeline/src/data_generation/llama/generate_brewery.cc index 7e92664..569036b 100644 --- a/tooling/pipeline/src/data_generation/llama/generate_brewery.cc +++ b/tooling/pipeline/src/data_generation/llama/generate_brewery.cc @@ -33,6 +33,9 @@ static std::string FormatLocalLanguageCodes( return formatted; } +// GBNF grammar for structured brewery JSON output. +// @TODO move to a separate gbnf file if it grows in complexity or is shared +// across modules. static constexpr std::string_view kBreweryJsonGrammar = R"json_brewery( root ::= thought-block "{" ws "\"name_en\"" ws ":" ws string ws "," ws "\"description_en\"" ws ":" ws string ws "," ws "\"name_local\"" ws ":" ws string ws "," ws "\"description_local\"" ws ":" ws string ws "}" ws thought-block ::= [^{]* @@ -59,11 +62,12 @@ BreweryResult LlamaGenerator::GenerateBrewery( location.country.empty() ? std::string{} : std::format(", {}", location.country); /** - * Load brewery system prompt from file - * Falls back to minimal inline prompt if file not found + * Load brewery system prompt via the injected prompt directory. + * The key "BREWERY_GENERATION" resolves to BREWERY_GENERATION.md inside + * the configured --prompt-dir. Throws on missing or empty file. 
*/ const std::string system_prompt = - LoadBrewerySystemPrompt("prompts/system.md"); + prompt_directory_->Load("BREWERY_GENERATION"); std::string user_prompt = std::format( "## CITY:\n{}\n\n## COUNTRY:\n{}\n\n## LOCAL LANGUAGE CODES:\n{}\n\n## " diff --git a/tooling/pipeline/src/data_generation/llama/generate_user.cc b/tooling/pipeline/src/data_generation/llama/generate_user.cc index eaebc09..7ed6426 100644 --- a/tooling/pipeline/src/data_generation/llama/generate_user.cc +++ b/tooling/pipeline/src/data_generation/llama/generate_user.cc @@ -12,6 +12,13 @@ #include "data_generation/llama_generator.h" #include "data_generation/llama_generator_helpers.h" +// TODO: Implement locale-aware user profile generation. +// Current implementation returns a hardcoded test value and ignores the +// locale parameter. Future implementation should: +// 1. Load a USER_GENERATION.md prompt template with locale context +// 2. Perform LLM inference with locale-specific username/bio generation +// 3. Parse and validate JSON output with retry handling (similar to brewery) +// 4. Return locale-aware username and biography UserResult LlamaGenerator::GenerateUser(const std::string& locale) { return {.username = "test_user", .bio = "This is a test user profile from " + locale + "."}; diff --git a/tooling/pipeline/src/data_generation/llama/helpers.cc b/tooling/pipeline/src/data_generation/llama/helpers.cc index 8556b8d..1433523 100644 --- a/tooling/pipeline/src/data_generation/llama/helpers.cc +++ b/tooling/pipeline/src/data_generation/llama/helpers.cc @@ -58,6 +58,11 @@ static std::string CondenseWhitespace(std::string_view text) { return out; } +// Guard against truncating in the first half of the string. +// This preserves the critical opening content and avoids cutting critical +// context words early in the region description. 
+static constexpr size_t kTruncationGuardDivisor = 2; + /** * Truncate region context to fit within max length while preserving word * boundaries @@ -71,7 +76,8 @@ std::string PrepareRegionContext(std::string_view region_context, normalized.resize(max_chars); const size_t last_space = normalized.find_last_of(' '); - if (last_space != std::string::npos && last_space > max_chars / 2) { + if (last_space != std::string::npos && + last_space > max_chars / kTruncationGuardDivisor) { normalized.resize(last_space); } diff --git a/tooling/pipeline/src/data_generation/llama/infer.cc b/tooling/pipeline/src/data_generation/llama/infer.cc index 2a2c116..dc06d0b 100644 --- a/tooling/pipeline/src/data_generation/llama/infer.cc +++ b/tooling/pipeline/src/data_generation/llama/infer.cc @@ -19,6 +19,9 @@ #include "llama.h" static constexpr size_t kPromptTokenSlack = 8; +// Minimum tokens to keep when using top-p sampling. Ensures at least one +// candidate token remains available even with very restrictive top-p values. 
+static constexpr size_t kTopPMinKeep = 1; namespace { @@ -62,7 +65,7 @@ SamplerHandle MakeSamplerChain(const llama_vocab* vocab, "LlamaGenerator: failed to initialize temperature sampler"); add_sampler(llama_sampler_init_top_k(static_cast(config.top_k)), "LlamaGenerator: failed to initialize top-k sampler"); - add_sampler(llama_sampler_init_top_p(config.top_p, 1), + add_sampler(llama_sampler_init_top_p(config.top_p, kTopPMinKeep), "LlamaGenerator: failed to initialize top-p sampler"); add_sampler(llama_sampler_init_dist(config.seed), "LlamaGenerator: failed to initialize distribution sampler"); diff --git a/tooling/pipeline/src/data_generation/llama/llama_generator.cc b/tooling/pipeline/src/data_generation/llama/llama_generator.cc index a854f48..2a12dc8 100644 --- a/tooling/pipeline/src/data_generation/llama/llama_generator.cc +++ b/tooling/pipeline/src/data_generation/llama/llama_generator.cc @@ -32,9 +32,11 @@ void LlamaGenerator::ContextDeleter::operator()( LlamaGenerator::LlamaGenerator( const ApplicationOptions& options, const std::string& model_path, - std::unique_ptr prompt_formatter) + std::unique_ptr prompt_formatter, + std::unique_ptr prompt_directory) : rng_(std::random_device{}()), - prompt_formatter_(std::move(prompt_formatter)) { + prompt_formatter_(std::move(prompt_formatter)), + prompt_directory_(std::move(prompt_directory)) { if (model_path.empty()) { throw std::runtime_error("LlamaGenerator: model path must not be empty"); } @@ -44,41 +46,49 @@ LlamaGenerator::LlamaGenerator( "LlamaGenerator: prompt formatter dependency must not be null"); } - if (options.temperature < 0.0F) { + if (!prompt_directory_) { + throw std::runtime_error( + "LlamaGenerator: prompt directory dependency must not be null"); + } + + const auto sampling = options.generator.sampling.value_or(SamplingOptions{}); + + if (sampling.temperature < 0.0F) { throw std::runtime_error( "LlamaGenerator: sampling temperature must be >= 0"); } - if (options.top_p <= 0.0F || options.top_p > 
1.0F) { + if (sampling.top_p <= 0.0F || sampling.top_p > 1.0F) { throw std::runtime_error( "LlamaGenerator: sampling top-p must be in (0, 1]"); } - if (options.top_k == 0U) { + if (sampling.top_k == 0U) { throw std::runtime_error("LlamaGenerator: sampling top-k must be > 0"); } - if (options.seed < -1) { + if (sampling.seed < -1) { throw std::runtime_error( "LlamaGenerator: seed must be >= 0, or -1 for random"); } - if (options.n_ctx == 0 || options.n_ctx > kMaxContextSize) { + if (sampling.n_ctx == 0 || sampling.n_ctx > kMaxContextSize) { throw std::runtime_error( "LlamaGenerator: context size must be in range [1, 32768]"); } - sampling_temperature_ = options.temperature; - sampling_top_p_ = options.top_p; - sampling_top_k_ = options.top_k; + sampling_temperature_ = sampling.temperature; + sampling_top_p_ = sampling.top_p; + sampling_top_k_ = sampling.top_k; - if (options.seed == -1) { + if (sampling.seed == -1) { std::random_device random_device; rng_.seed(random_device()); } else { - rng_.seed(static_cast(options.seed)); + rng_.seed(static_cast(sampling.seed)); } - n_ctx_ = options.n_ctx; + + n_ctx_ = sampling.n_ctx; this->Load(model_path); } diff --git a/tooling/pipeline/src/data_generation/llama/load.cc b/tooling/pipeline/src/data_generation/llama/load.cc index 98feb5a..8ce3142 100644 --- a/tooling/pipeline/src/data_generation/llama/load.cc +++ b/tooling/pipeline/src/data_generation/llama/load.cc @@ -14,6 +14,10 @@ #include "data_generation/llama_generator.h" #include "llama.h" +// Maximum batch size for decode operations. Capping the batch prevents +// excessive memory allocation while maintaining inference performance. 
+static constexpr uint32_t kMaxBatchSize = 5000U; + void LlamaGenerator::Load(const std::string& model_path) { context_.reset(); model_.reset(); @@ -28,7 +32,7 @@ void LlamaGenerator::Load(const std::string& model_path) { llama_context_params context_params = llama_context_default_params(); context_params.n_ctx = n_ctx_; - context_params.n_batch = std::min(n_ctx_, static_cast(5000)); + context_params.n_batch = std::min(n_ctx_, kMaxBatchSize); LlamaGenerator::ContextHandle loaded_context( llama_init_from_model(loaded_model.get(), context_params)); diff --git a/tooling/pipeline/src/data_generation/llama/load_brewery_prompt.cc b/tooling/pipeline/src/data_generation/llama/load_brewery_prompt.cc deleted file mode 100644 index f59d590..0000000 --- a/tooling/pipeline/src/data_generation/llama/load_brewery_prompt.cc +++ /dev/null @@ -1,55 +0,0 @@ -/** - * @file data_generation/llama/load_brewery_prompt.cc - * @brief Resolves brewery system prompt content from cache or a configured - * filesystem path and provides a robust inline fallback prompt when absent. - */ - -#include - -#include -#include -#include - -#include "data_generation/llama_generator.h" - -/** - * @brief Loads brewery system prompt from disk or cache. - * - * @param prompt_file_path Preferred prompt file location. - * @return Prompt text loaded from disk. 
- */ -std::string LlamaGenerator::LoadBrewerySystemPrompt( - const std::filesystem::path& prompt_file_path) { - // Return cached version if already loaded - if (!brewery_system_prompt_.empty()) { - return brewery_system_prompt_; - } - - std::ifstream prompt_file(prompt_file_path); - if (!prompt_file.is_open()) { - spdlog::error( - "LlamaGenerator: Failed to open brewery system prompt file '{}'", - prompt_file_path.string()); - throw std::runtime_error( - "LlamaGenerator: missing brewery system prompt file: " + - prompt_file_path.string()); - } - - const std::string prompt((std::istreambuf_iterator(prompt_file)), - std::istreambuf_iterator()); - prompt_file.close(); - - if (prompt.empty()) { - spdlog::error("LlamaGenerator: Brewery system prompt file '{}' is empty", - prompt_file_path.string()); - throw std::runtime_error( - "LlamaGenerator: empty brewery system prompt file: " + - prompt_file_path.string()); - } - - spdlog::info( - "LlamaGenerator: Loaded brewery system prompt from '{}' ({} chars)", - prompt_file_path.string(), prompt.length()); - brewery_system_prompt_ = prompt; - return brewery_system_prompt_; -} diff --git a/tooling/pipeline/src/data_generation/mock/generate_brewery.cc b/tooling/pipeline/src/data_generation/mock/generate_brewery.cc index c2495e0..20a4ff6 100644 --- a/tooling/pipeline/src/data_generation/mock/generate_brewery.cc +++ b/tooling/pipeline/src/data_generation/mock/generate_brewery.cc @@ -17,9 +17,9 @@ BreweryResult MockGenerator::GenerateBrewery( const std::string_view adjective = kBreweryAdjectives.at(hash % kBreweryAdjectives.size()); const std::string_view noun = - kBreweryNouns.at(hash / 7 % kBreweryNouns.size()); - const std::string_view base_description = - kBreweryDescriptions.at((hash / 13) % kBreweryDescriptions.size()); + kBreweryNouns.at(hash / kNounHashStride % kBreweryNouns.size()); + const std::string_view base_description = kBreweryDescriptions.at( + (hash / kDescriptionHashStride) % kBreweryDescriptions.size()); const 
std::string name = std::format("{} {} {}", location.city, adjective, noun); diff --git a/tooling/pipeline/src/data_generation/mock/generate_user.cc b/tooling/pipeline/src/data_generation/mock/generate_user.cc index 51c26d2..38257fb 100644 --- a/tooling/pipeline/src/data_generation/mock/generate_user.cc +++ b/tooling/pipeline/src/data_generation/mock/generate_user.cc @@ -15,7 +15,7 @@ UserResult MockGenerator::GenerateUser(const std::string& locale) { UserResult result; const std::string_view username = kUsernames[hash % kUsernames.size()]; - const std::string_view bio = kBios[hash / 11 % kBios.size()]; + const std::string_view bio = kBios[hash / kBioHashStride % kBios.size()]; result.username = username; result.bio = bio; return result; diff --git a/tooling/pipeline/src/main.cc b/tooling/pipeline/src/main.cc index 2ce3779..e457d54 100644 --- a/tooling/pipeline/src/main.cc +++ b/tooling/pipeline/src/main.cc @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -23,127 +24,14 @@ #include "llama_backend_state.h" #include "services/enrichment_service.h" #include "services/export_service.h" +#include "services/prompt_directory.h" #include "services/sqlite_export_service.h" +#include "services/timer.h" #include "services/wikipedia_service.h" #include "web_client/curl_web_client.h" -namespace prog_opts = boost::program_options; namespace di = boost::di; -/** - * @brief Parse command-line arguments into ApplicationOptions. - * - * @param argc Command-line argument count. - * @param argv Command-line arguments. - * @return Parsed ApplicationOptions if parsing succeeded, std::nullopt - * otherwise. 
- */ -std::optional ParseArguments(const int argc, char** argv) { - prog_opts::options_description desc("Pipeline Options"); - - auto opt = desc.add_options(); - - opt("help,h", "Produce help message"); - - opt("mocked", prog_opts::bool_switch(), - "Use mocked generator for brewery/user data"); - - opt("model,m", prog_opts::value()->default_value(""), - "Path to LLM model (gguf)"); - - opt("temperature", prog_opts::value()->default_value(1.0F), - "Sampling temperature (higher = more random)"); - - opt("top-p", prog_opts::value()->default_value(0.95F), - "Nucleus sampling top-p in (0,1] (higher = more random)"); - - opt("top-k", prog_opts::value()->default_value(64), - "Top-k sampling parameter (higher = more candidate tokens)"); - - opt("n-ctx", prog_opts::value()->default_value(8192), - "Context window size in tokens (1-32768)"); - - opt("seed", prog_opts::value()->default_value(-1), - "Sampler seed: -1 for random, otherwise non-negative integer"); - - // Handle the "no arguments" or "help" case - if (argc == 1) { - spdlog::info("Biergarten Pipeline"); - std::stringstream usage_stream; - usage_stream << "\nUsage: biergarten-pipeline [options]\n\n" << desc; - spdlog::info(usage_stream.str()); - return std::nullopt; - } - - try { - prog_opts::variables_map variables_map; - prog_opts::store(prog_opts::parse_command_line(argc, argv, desc), - variables_map); - prog_opts::notify(variables_map); - - if (variables_map.contains("help")) { - std::stringstream help_stream; - help_stream << "\n" << desc; - spdlog::info(help_stream.str()); - return std::nullopt; - } - - const auto use_mocked = variables_map["mocked"].as(); - const auto model_path = variables_map["model"].as(); - - if (use_mocked && !model_path.empty()) { - spdlog::error( - "Invalid arguments: --mocked and --model are mutually exclusive"); - return std::nullopt; - } - - if (!use_mocked && model_path.empty()) { - spdlog::error( - "Invalid arguments: Either --mocked or --model must be specified"); - return 
std::nullopt; - } - - const bool has_llm_params = !variables_map["temperature"].defaulted() || - !variables_map["top-p"].defaulted() || - !variables_map["top-k"].defaulted() || - !variables_map["seed"].defaulted(); - - if (use_mocked && has_llm_params) { - spdlog::warn( - "Sampling parameters (--temperature, --top-p, --top-k, --seed) are" - " ignored when using --mocked"); - } - - ApplicationOptions options; - options.use_mocked = use_mocked; - options.model_path = model_path; - options.temperature = variables_map["temperature"].as(); - options.top_p = variables_map["top-p"].as(); - options.top_k = variables_map["top-k"].as(); - options.n_ctx = variables_map["n-ctx"].as(); - options.seed = variables_map["seed"].as(); - - return options; - } catch (const std::exception& exception) { - spdlog::error("Failed to parse command-line arguments: {}", - exception.what()); - return std::nullopt; - } catch (...) { - spdlog::error("Failed to parse command-line arguments: unknown error"); - return std::nullopt; - } -} - -struct Timer { - std::chrono::steady_clock::time_point start_time = - std::chrono::steady_clock::now(); - [[nodiscard]] int64_t Elapsed() const { - return std::chrono::duration_cast( - std::chrono::steady_clock::now() - start_time) - .count(); - } -}; - int main(const int argc, char** argv) { try { Timer timer; @@ -157,6 +45,20 @@ int main(const int argc, char** argv) { } const auto options = *parsed_options; + const std::string model_path = options.generator.model_path.string(); + const auto sampling = + options.generator.sampling.value_or(SamplingOptions{}); + + std::unique_ptr prompt_directory; + if (!options.generator.use_mocked) { + try { + prompt_directory = + std::make_unique(options.pipeline.prompt_dir); + } catch (const std::exception& dir_error) { + spdlog::error("[Startup] Invalid --prompt-dir: {}", dir_error.what()); + return 1; + } + } const auto injector = di::make_injector( di::bind().to(), @@ -164,10 +66,11 @@ int main(const int argc, char** 
argv) { di::bind().to(), di::bind().to(), di::bind().to(), - di::bind().to(options.model_path), + di::bind().to(model_path), di::bind().to( - [options](const auto& inj) -> std::unique_ptr { - if (options.use_mocked) { + [options, model_path, sampling, &prompt_directory]( + const auto& inj) -> std::unique_ptr { + if (options.generator.use_mocked) { spdlog::info( "[Generator] Using MockGenerator (no model path provided)"); return std::make_unique(); @@ -176,9 +79,12 @@ int main(const int argc, char** argv) { spdlog::info( "[Generator] Using LlamaGenerator: {} (temperature={}, " "top-p={}, top-k={}, n_ctx={}, seed={})", - options.model_path, options.temperature, options.top_p, - options.top_k, options.n_ctx, options.seed); - return inj.template create>(); + model_path, sampling.temperature, sampling.top_p, + sampling.top_k, sampling.n_ctx, sampling.seed); + return std::make_unique( + options, model_path, + inj.template create>(), + std::move(prompt_directory)); })); auto generator = diff --git a/tooling/pipeline/src/services/prompt_directory.cc b/tooling/pipeline/src/services/prompt_directory.cc new file mode 100644 index 0000000..ab908b2 --- /dev/null +++ b/tooling/pipeline/src/services/prompt_directory.cc @@ -0,0 +1,85 @@ +/** + * @file services/prompt_directory.cc + * @brief PromptDirectory implementation: validates the directory at + * construction and loads named prompt files on demand with in-process caching. + */ + +#include "services/prompt_directory.h" + +#include + +#include +#include +#include +#include +#include + +// --------------------------------------------------------------------------- +// PromptDirectory +// --------------------------------------------------------------------------- + +PromptDirectory::PromptDirectory(const std::filesystem::path& prompt_dir) + : prompt_dir_(prompt_dir) { + std::error_code ec; + + // Scenario 4: directory must exist. 
+ if (!std::filesystem::exists(prompt_dir_, ec) || ec) { + throw std::runtime_error( + "PromptDirectory: prompt directory does not exist: " + + prompt_dir_.string()); + } + + // Scenario 4: path must be a directory, not a file. + if (!std::filesystem::is_directory(prompt_dir_, ec) || ec) { + throw std::runtime_error( + "PromptDirectory: prompt directory path is not a directory: " + + prompt_dir_.string()); + } + + // Scenario 4: directory must be readable (probe with directory_iterator). + std::filesystem::directory_iterator probe(prompt_dir_, ec); + if (ec) { + throw std::runtime_error( + "PromptDirectory: prompt directory is not readable: " + + prompt_dir_.string() + " (" + ec.message() + ")"); + } + + spdlog::info("[PromptDirectory] Resolved prompt directory: {}", + prompt_dir_.string()); +} + +std::string PromptDirectory::Load(std::string_view key) { + const std::string key_str(key); + + // Return cached content if already loaded during this run. + const auto cache_it = cache_.find(key_str); + if (cache_it != cache_.end()) { + return cache_it->second; + } + + // Scenario 3: resolve /.md and require it to exist. 
+ const std::filesystem::path file_path = + prompt_dir_ / std::filesystem::path(key_str + ".md"); + + std::ifstream file(file_path); + if (!file.is_open()) { + throw std::runtime_error( + "PromptDirectory: prompt file not found for key '" + key_str + + "': " + file_path.string()); + } + + std::string content((std::istreambuf_iterator(file)), + std::istreambuf_iterator()); + file.close(); + + if (content.empty()) { + throw std::runtime_error("PromptDirectory: prompt file for key '" + + key_str + "' is empty: " + file_path.string()); + } + + spdlog::info("[PromptDirectory] Loaded prompt '{}' from '{}' ({} chars)", + key_str, file_path.string(), content.size()); + + cache_.emplace(key_str, content); + return content; +} diff --git a/tooling/pipeline/src/services/sqlite/build_database_path.cc b/tooling/pipeline/src/services/sqlite/build_database_path.cc deleted file mode 100644 index 3a96cdf..0000000 --- a/tooling/pipeline/src/services/sqlite/build_database_path.cc +++ /dev/null @@ -1,24 +0,0 @@ -/** - * @file services/sqlite/build_database_path.cc - * @brief SqliteExportService::BuildDatabasePath() implementation. 
- */ - -#include -#include - -#include "services/sqlite_export_service.h" - -std::filesystem::path SqliteExportService::BuildDatabasePath() const { - std::filesystem::path base_filename("biergarten_seed_" + run_timestamp_utc_ + - ".sqlite"); - std::filesystem::path candidate = - std::filesystem::current_path() / base_filename; - - for (int suffix = 1; std::filesystem::exists(candidate); ++suffix) { - candidate = std::filesystem::current_path() / - std::filesystem::path("biergarten_seed_" + run_timestamp_utc_ + - "-" + std::to_string(suffix) + ".sqlite"); - } - - return candidate; -} diff --git a/tooling/pipeline/src/services/sqlite/finalize.cc b/tooling/pipeline/src/services/sqlite/finalize.cc index ffb9d54..b2ef035 100644 --- a/tooling/pipeline/src/services/sqlite/finalize.cc +++ b/tooling/pipeline/src/services/sqlite/finalize.cc @@ -8,7 +8,6 @@ #include "services/sqlite_export_service.h" #include "services/sqlite_export_service_helpers.h" - void SqliteExportService::Finalize() { if (db_handle_ == nullptr) { return; diff --git a/tooling/pipeline/src/services/sqlite/helpers/sqlite_connection_helpers.cpp b/tooling/pipeline/src/services/sqlite/helpers/sqlite_connection_helpers.cc similarity index 91% rename from tooling/pipeline/src/services/sqlite/helpers/sqlite_connection_helpers.cpp rename to tooling/pipeline/src/services/sqlite/helpers/sqlite_connection_helpers.cc index 1c302e6..b5fec26 100644 --- a/tooling/pipeline/src/services/sqlite/helpers/sqlite_connection_helpers.cpp +++ b/tooling/pipeline/src/services/sqlite/helpers/sqlite_connection_helpers.cc @@ -10,7 +10,8 @@ void SqliteDatabaseDeleter::operator()(sqlite3* handle) const noexcept { } } -void SqliteStatementDeleter::operator()(sqlite3_stmt* statement) const noexcept { +void SqliteStatementDeleter::operator()( + sqlite3_stmt* statement) const noexcept { if (statement != nullptr) { sqlite3_finalize(statement); } @@ -23,7 +24,6 @@ void ThrowSqliteError(sqlite3* db_handle, std::string_view action) { } 
SqliteDatabaseHandle OpenDatabase(const std::filesystem::path& path) { - sqlite3* raw_handle = nullptr; const int result = sqlite3_open(path.string().c_str(), &raw_handle); @@ -54,7 +54,8 @@ void ExecSql(const SqliteDatabaseHandle& db_handle, std::string_view sql, } } -void RollbackTransactionNoThrow(const SqliteDatabaseHandle& db_handle) noexcept { +void RollbackTransactionNoThrow( + const SqliteDatabaseHandle& db_handle) noexcept { if (!db_handle) { return; } @@ -63,4 +64,3 @@ void RollbackTransactionNoThrow(const SqliteDatabaseHandle& db_handle) noexcept } } // namespace sqlite_export_service_internal - diff --git a/tooling/pipeline/src/services/sqlite/helpers/sqlite_statement_helpers.cpp b/tooling/pipeline/src/services/sqlite/helpers/sqlite_statement_helpers.cc similarity index 91% rename from tooling/pipeline/src/services/sqlite/helpers/sqlite_statement_helpers.cpp rename to tooling/pipeline/src/services/sqlite/helpers/sqlite_statement_helpers.cc index fd09056..9da278f 100644 --- a/tooling/pipeline/src/services/sqlite/helpers/sqlite_statement_helpers.cpp +++ b/tooling/pipeline/src/services/sqlite/helpers/sqlite_statement_helpers.cc @@ -1,11 +1,12 @@ #include "services/sqlite_statement_helpers.h" -#include "services/sqlite_connection_helpers.h" -#include -#include -#include -#include #include +#include +#include +#include +#include + +#include "services/sqlite_connection_helpers.h" namespace sqlite_export_service_internal { @@ -86,16 +87,6 @@ sqlite3_int64 LastInsertRowId(const SqliteDatabaseHandle& db_handle) { return sqlite3_last_insert_rowid(db_handle.get()); } -std::string SerializeLocalLanguages( - const std::vector& local_languages) { - boost::json::array array; - array.reserve(local_languages.size()); - for (const auto& language : local_languages) { - array.emplace_back(language); - } - return boost::json::serialize(array); -} - std::string SerializeVector(const std::vector& str_vec) { boost::json::array array(str_vec.size()); for (const auto& s : 
str_vec) { @@ -105,4 +96,3 @@ std::string SerializeVector(const std::vector& str_vec) { } } // namespace sqlite_export_service_internal - diff --git a/tooling/pipeline/src/services/sqlite/initialize.cc b/tooling/pipeline/src/services/sqlite/initialize.cc index f3f7560..bbe6515 100644 --- a/tooling/pipeline/src/services/sqlite/initialize.cc +++ b/tooling/pipeline/src/services/sqlite/initialize.cc @@ -11,6 +11,19 @@ #include "services/sqlite_export_service.h" #include "services/sqlite_export_service_helpers.h" +std::filesystem::path SqliteExportService::BuildDatabasePath() const { + std::filesystem::path base_filename("biergarten_seed_" + run_timestamp_utc_ + + ".sqlite"); + std::filesystem::path candidate = output_path_ / base_filename; + + for (int suffix = 1; std::filesystem::exists(candidate); ++suffix) { + candidate = output_path_ / + std::filesystem::path("biergarten_seed_" + run_timestamp_utc_ + + "-" + std::to_string(suffix) + ".sqlite"); + } + + return candidate; +} void SqliteExportService::InitializeSchema() const { sqlite_export_service_internal::ExecSql( @@ -46,7 +59,6 @@ void SqliteExportService::RollbackAndCloseNoThrow() noexcept { location_cache_.clear(); } - void SqliteExportService::Initialize() { if (db_handle_ != nullptr) { throw std::runtime_error("SQLite export service is already initialized"); diff --git a/tooling/pipeline/src/services/sqlite/process_record.cc b/tooling/pipeline/src/services/sqlite/process_record.cc index 786aa19..be3af22 100644 --- a/tooling/pipeline/src/services/sqlite/process_record.cc +++ b/tooling/pipeline/src/services/sqlite/process_record.cc @@ -3,6 +3,8 @@ * @brief SqliteExportService::ProcessRecord() implementation. 
*/ +#include +#include #include #include diff --git a/tooling/pipeline/src/services/sqlite/sqlite_export_service.cc b/tooling/pipeline/src/services/sqlite/sqlite_export_service.cc index 377c917..4bf66a3 100644 --- a/tooling/pipeline/src/services/sqlite/sqlite_export_service.cc +++ b/tooling/pipeline/src/services/sqlite/sqlite_export_service.cc @@ -7,11 +7,12 @@ #include -SqliteExportService::SqliteExportService() - : date_time_provider_(std::make_unique()) {} +SqliteExportService::SqliteExportService(const ApplicationOptions& options) + : date_time_provider_(std::make_unique()), + output_path_(options.pipeline.output_path) {} SqliteExportService::~SqliteExportService() { if (db_handle_ != nullptr) { RollbackAndCloseNoThrow(); } -} \ No newline at end of file +} diff --git a/tooling/pipeline/src/web_client/curl_web_client_get.cc b/tooling/pipeline/src/web_client/curl_web_client_get.cc index 2e178f7..0a8473e 100644 --- a/tooling/pipeline/src/web_client/curl_web_client_get.cc +++ b/tooling/pipeline/src/web_client/curl_web_client_get.cc @@ -17,6 +17,7 @@ using CurlHandle = std::unique_ptr; static constexpr long kConnectionTimeout = 10; static constexpr long kRequestTimeout = 30; +static constexpr long kMaxRedirects = 5; static constexpr int32_t kOkHttpStatus = 200; static CurlHandle CreateHandle() { @@ -32,7 +33,7 @@ static void SetCommonGetOptions(CURL* curl, const std::string& url) { curl_easy_setopt(curl, CURLOPT_URL, url.c_str()); curl_easy_setopt(curl, CURLOPT_USERAGENT, "biergarten-pipeline/0.1.0"); curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); - curl_easy_setopt(curl, CURLOPT_MAXREDIRS, 5L); + curl_easy_setopt(curl, CURLOPT_MAXREDIRS, kMaxRedirects); curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, kConnectionTimeout); curl_easy_setopt(curl, CURLOPT_TIMEOUT, kRequestTimeout); curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, "gzip");