mirror of
https://github.com/aaronpo97/the-biergarten-app.git
synced 2026-05-31 17:53:59 +00:00
refactor(pipeline): restructure config, add PromptDirectory, consolidate SQLite layer (#217)
* Refactor ApplicationOptions to separate config concerns * add prompt dir app option * readability updates: remove magic numbers, update comments * codebase formatting * Update docs * Extract argument parsing, timer out of
This commit is contained in:
@@ -141,37 +141,38 @@ package "Domain: Models" {
|
||||
|
||||
LocationContext *-- Completeness
|
||||
}
|
||||
|
||||
package "Domain: Application Configuration"{
|
||||
@startuml
|
||||
package "Domain: Application Configuration" {
|
||||
class SamplingOptions {
|
||||
+ temperature : float = 1.0F
|
||||
+ top_p : float = 0.95F
|
||||
+ top_k : uint32_t = 64
|
||||
+ n_ctx : uint32_t = 8192
|
||||
+ seed : int = -1
|
||||
+ temperature: float = 1.0F
|
||||
+ top_p: float = 0.95F
|
||||
+ top_k: uint32_t = 64
|
||||
+ n_ctx: uint32_t = 8192
|
||||
+ seed: int = -1
|
||||
}
|
||||
|
||||
class GeneratorOptions {
|
||||
+ model_path : std::filesystem::path
|
||||
+ use_mocked : bool = false
|
||||
+ sampling : SamplingOptions
|
||||
+ model_path: std::filesystem::path
|
||||
+ use_mocked: bool = false
|
||||
+ sampling: std::optional<SamplingOptions>
|
||||
}
|
||||
|
||||
class PipelineOptions {
|
||||
+ output_path : std::filesystem::path
|
||||
+ log_path : std::filesystem::path
|
||||
+ output_path: std::filesystem::path
|
||||
+ log_path: std::filesystem::path
|
||||
}
|
||||
|
||||
class ApplicationOptions {
|
||||
+ generator : GeneratorOptions
|
||||
+ pipeline : PipelineOptions
|
||||
+ generator: GeneratorOptions
|
||||
+ pipeline: PipelineOptions
|
||||
}
|
||||
|
||||
' --- Domain Model Relationships ---
|
||||
ApplicationOptions *-- GeneratorOptions
|
||||
ApplicationOptions *-- PipelineOptions
|
||||
GeneratorOptions *-- SamplingOptions
|
||||
GeneratorOptions o-- SamplingOptions
|
||||
}
|
||||
@enduml
|
||||
|
||||
package "Domain: Policy" {
|
||||
|
||||
|
||||
1
tooling/pipeline/.gitignore
vendored
1
tooling/pipeline/.gitignore
vendored
@@ -6,3 +6,4 @@ data
|
||||
models
|
||||
*.gguf
|
||||
BiergartenPipeline.png
|
||||
output
|
||||
@@ -85,14 +85,14 @@ endif()
|
||||
FetchContent_Declare(
|
||||
llama-cpp
|
||||
GIT_REPOSITORY https://github.com/ggml-org/llama.cpp.git
|
||||
GIT_TAG b8742
|
||||
GIT_TAG b8742
|
||||
)
|
||||
FetchContent_MakeAvailable(llama-cpp)
|
||||
|
||||
FetchContent_Declare(
|
||||
boost-di
|
||||
GIT_REPOSITORY https://github.com/boost-ext/di.git
|
||||
GIT_TAG v1.3.0
|
||||
GIT_TAG v1.3.0
|
||||
)
|
||||
FetchContent_MakeAvailable(boost-di)
|
||||
if(TARGET Boost.DI AND NOT TARGET boost::di)
|
||||
@@ -102,7 +102,7 @@ endif()
|
||||
FetchContent_Declare(
|
||||
spdlog
|
||||
GIT_REPOSITORY https://github.com/gabime/spdlog.git
|
||||
GIT_TAG v1.15.3
|
||||
GIT_TAG v1.15.3
|
||||
)
|
||||
FetchContent_MakeAvailable(spdlog)
|
||||
|
||||
@@ -121,8 +121,8 @@ set(SOURCES
|
||||
src/services/wikipedia/fetch_extract.cc
|
||||
src/services/sqlite/sqlite_export_service.cc
|
||||
src/services/sqlite/build_database_path.cc
|
||||
src/services/sqlite/process_record.cc
|
||||
src/services/sqlite/initialize.cc
|
||||
src/services/sqlite/process_record.cc
|
||||
src/services/sqlite/initialize.cc
|
||||
src/services/sqlite/finalize.cc
|
||||
src/web_client/curl_global_state.cc
|
||||
src/web_client/curl_web_client_get.cc
|
||||
@@ -133,14 +133,14 @@ set(SOURCES
|
||||
src/data_generation/llama/helpers.cc
|
||||
src/data_generation/llama/infer.cc
|
||||
src/data_generation/llama/load.cc
|
||||
src/data_generation/llama/load_brewery_prompt.cc
|
||||
src/services/prompt_directory.cc
|
||||
src/data_generation/prompt_formatting/gemma4_jinja_prompt_formatter.cc
|
||||
src/data_generation/mock/deterministic_hash.cc
|
||||
src/data_generation/mock/generate_brewery.cc
|
||||
src/data_generation/mock/generate_user.cc
|
||||
src/json_handling/json_loader.cc
|
||||
src/services/sqlite/helpers/sqlite_connection_helpers.cpp
|
||||
src/services/sqlite/helpers/sqlite_statement_helpers.cpp
|
||||
src/services/sqlite/helpers/sqlite_connection_helpers.cpp
|
||||
src/services/sqlite/helpers/sqlite_statement_helpers.cpp
|
||||
)
|
||||
|
||||
# =============================================================================
|
||||
@@ -173,6 +173,6 @@ configure_file(
|
||||
|
||||
add_custom_command(TARGET ${PROJECT_NAME} POST_BUILD
|
||||
COMMAND ${CMAKE_COMMAND} -E copy_directory
|
||||
${CMAKE_SOURCE_DIR}/prompts
|
||||
${CMAKE_BINARY_DIR}/prompts
|
||||
${CMAKE_SOURCE_DIR}/prompts
|
||||
${CMAKE_BINARY_DIR}/prompts
|
||||
)
|
||||
|
||||
@@ -17,6 +17,7 @@
|
||||
#include "data_generation/data_generator.h"
|
||||
#include "data_generation/prompt_formatting/prompt_formatter.h"
|
||||
#include "data_model/application_options.h"
|
||||
#include "services/prompt_directory.h"
|
||||
|
||||
struct llama_model;
|
||||
struct llama_context;
|
||||
@@ -33,10 +34,12 @@ class LlamaGenerator final : public DataGenerator {
|
||||
* @param options Parsed application options.
|
||||
* @param model_path Filesystem path to GGUF model assets.
|
||||
* @param prompt_formatter Formatter that produces model-specific prompts.
|
||||
* @param prompt_directory Directory service for loading named prompt files.
|
||||
*/
|
||||
LlamaGenerator(const ApplicationOptions& options,
|
||||
const std::string& model_path,
|
||||
std::unique_ptr<IPromptFormatter> prompt_formatter);
|
||||
std::unique_ptr<IPromptFormatter> prompt_formatter,
|
||||
std::unique_ptr<IPromptDirectory> prompt_directory);
|
||||
|
||||
~LlamaGenerator() override;
|
||||
|
||||
@@ -119,15 +122,6 @@ class LlamaGenerator final : public DataGenerator {
|
||||
int max_tokens = kDefaultMaxTokens,
|
||||
std::string_view grammar = {});
|
||||
|
||||
/**
|
||||
* @brief Loads the brewery system prompt from disk.
|
||||
*
|
||||
* @param prompt_file_path Prompt file path to try first.
|
||||
* @return Loaded prompt text.
|
||||
*/
|
||||
std::string LoadBrewerySystemPrompt(
|
||||
const std::filesystem::path& prompt_file_path);
|
||||
|
||||
ModelHandle model_;
|
||||
ContextHandle context_;
|
||||
float sampling_temperature_ = 1.0F;
|
||||
@@ -135,8 +129,8 @@ class LlamaGenerator final : public DataGenerator {
|
||||
uint32_t sampling_top_k_ = kDefaultSamplingTopK;
|
||||
std::mt19937 rng_;
|
||||
uint32_t n_ctx_ = kDefaultContextSize;
|
||||
std::string brewery_system_prompt_;
|
||||
std::unique_ptr<IPromptFormatter> prompt_formatter_;
|
||||
std::unique_ptr<IPromptDirectory> prompt_directory_;
|
||||
};
|
||||
|
||||
#endif // BIERGARTEN_PIPELINE_INCLUDES_DATA_GENERATION_LLAMA_GENERATOR_H_
|
||||
|
||||
@@ -44,6 +44,13 @@ class MockGenerator final : public DataGenerator {
|
||||
*/
|
||||
static size_t DeterministicHash(const Location& location);
|
||||
|
||||
// Hash stride constants for deterministic distribution across fixed-size
|
||||
// arrays. These coprime strides spread hash values uniformly without
|
||||
// clustering, ensuring diverse output across different hash inputs.
|
||||
static constexpr size_t kNounHashStride = 7;
|
||||
static constexpr size_t kDescriptionHashStride = 13;
|
||||
static constexpr size_t kBioHashStride = 11;
|
||||
|
||||
static constexpr std::array<std::string_view, 18> kBreweryAdjectives = {
|
||||
"Craft", "Heritage", "Local", "Artisan", "Pioneer", "Golden",
|
||||
"Modern", "Classic", "Summit", "Northern", "Riverstone", "Barrel",
|
||||
|
||||
@@ -6,37 +6,71 @@
|
||||
* @brief Program options for the Biergarten pipeline application.
|
||||
*/
|
||||
|
||||
#include <boost/program_options.hpp>
|
||||
#include <cstdint>
|
||||
#include <filesystem>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
|
||||
namespace prog_opts = boost::program_options;
|
||||
|
||||
/**
|
||||
* @brief Program options for the Biergarten pipeline application.
|
||||
* @brief LLM sampling parameters.
|
||||
*/
|
||||
struct ApplicationOptions {
|
||||
/// @brief Path to the LLM model file (gguf format); mutually exclusive with
|
||||
/// use_mocked.
|
||||
std::string model_path;
|
||||
|
||||
/// @brief Use mocked generator instead of LLM; mutually exclusive with
|
||||
/// model_path.
|
||||
bool use_mocked = false;
|
||||
|
||||
struct SamplingOptions {
|
||||
/// @brief LLM sampling temperature (must be >= 0.0, higher = more random).
|
||||
float temperature = 1.0F;
|
||||
|
||||
/// @brief LLM nucleus sampling top-p parameter (0.0 to 1.0, higher = more
|
||||
/// random).
|
||||
/// @brief LLM nucleus sampling top-p parameter.
|
||||
float top_p = 0.95F;
|
||||
|
||||
/// @brief LLM top-k sampling parameter.
|
||||
uint32_t top_k = 64;
|
||||
|
||||
/// @brief Context window size (tokens) for LLM inference. Higher values
|
||||
/// support longer prompts but use more memory.
|
||||
/// @brief Context window size (tokens).
|
||||
uint32_t n_ctx = 8192;
|
||||
|
||||
/// @brief Random seed for sampling (-1 for random, otherwise non-negative).
|
||||
/// @brief Random seed (-1 for random, otherwise non-negative).
|
||||
int seed = -1;
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Configuration for the LLM generator component.
|
||||
*/
|
||||
struct GeneratorOptions {
|
||||
/// @brief Path to the LLM model file (gguf format).
|
||||
std::filesystem::path model_path;
|
||||
|
||||
/// @brief Use mocked generator instead of actual LLM inference.
|
||||
bool use_mocked = false;
|
||||
|
||||
/// @brief Specific sampling parameters for this generator.
|
||||
/// If nullopt, the application should use global defaults.
|
||||
std::optional<SamplingOptions> sampling;
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Configuration for the pipeline execution and output.
|
||||
*/
|
||||
struct PipelineOptions {
|
||||
/// @brief Directory for generated artifacts.
|
||||
std::filesystem::path output_path;
|
||||
|
||||
/// @brief Directory that contains named prompt files (e.g.
|
||||
/// BREWERY_GENERATION.md).
|
||||
std::filesystem::path prompt_dir;
|
||||
|
||||
/// @brief Path for application logs.
|
||||
std::filesystem::path log_path;
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Root configuration object for the Biergarten pipeline.
|
||||
*/
|
||||
struct ApplicationOptions {
|
||||
GeneratorOptions generator;
|
||||
PipelineOptions pipeline;
|
||||
};
|
||||
|
||||
std::optional<ApplicationOptions> ParseArguments(const int argc, char** argv);
|
||||
#endif // BIERGARTEN_PIPELINE_INCLUDES_DATA_MODEL_APPLICATION_OPTIONS_H_
|
||||
|
||||
@@ -6,6 +6,8 @@
|
||||
* @brief Abstraction for persisting generated brewery data.
|
||||
*/
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
#include "data_model/generated_brewery.h"
|
||||
|
||||
/**
|
||||
|
||||
76
tooling/pipeline/includes/services/prompt_directory.h
Normal file
76
tooling/pipeline/includes/services/prompt_directory.h
Normal file
@@ -0,0 +1,76 @@
|
||||
#ifndef BIERGARTEN_PIPELINE_INCLUDES_SERVICES_PROMPT_DIRECTORY_H_
|
||||
#define BIERGARTEN_PIPELINE_INCLUDES_SERVICES_PROMPT_DIRECTORY_H_
|
||||
|
||||
/**
|
||||
* @file services/prompt_directory.h
|
||||
* @brief Interface and filesystem-backed implementation for named prompt
|
||||
* loading.
|
||||
*
|
||||
* Prompt files are resolved by key: a key of "BREWERY_GENERATION" maps to the
|
||||
* file <prompt_dir>/BREWERY_GENERATION.md. The interface is kept intentionally
|
||||
* narrow so test doubles can be injected without touching the filesystem.
|
||||
*/
|
||||
|
||||
#include <filesystem>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <unordered_map>
|
||||
|
||||
/**
|
||||
* @brief Interface for loading named prompt files.
|
||||
*/
|
||||
class IPromptDirectory {
|
||||
public:
|
||||
// Copying and moving are disabled. LlamaGenerator receives and stores an
// implementation through std::unique_ptr<IPromptDirectory>, so instances are
// handed around by pointer rather than by value.
IPromptDirectory() = default;
|
||||
IPromptDirectory(const IPromptDirectory&) = delete;
|
||||
IPromptDirectory& operator=(const IPromptDirectory&) = delete;
|
||||
IPromptDirectory(IPromptDirectory&&) = delete;
|
||||
IPromptDirectory& operator=(IPromptDirectory&&) = delete;
|
||||
// Virtual so that implementations can be destroyed through a pointer to this
// interface.
virtual ~IPromptDirectory() = default;
|
||||
|
||||
/**
|
||||
* @brief Loads the prompt associated with @p key.
|
||||
*
|
||||
* @param key Logical prompt key, e.g. "BREWERY_GENERATION".
|
||||
* @return Prompt text.
|
||||
* @throws std::runtime_error if the prompt file cannot be found or read.
|
||||
* @note Declared non-const: the filesystem-backed PromptDirectory documents
* that it caches loaded prompts, which requires mutating the object.
*/
|
||||
[[nodiscard]] virtual std::string Load(std::string_view key) = 0;
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Filesystem-backed IPromptDirectory implementation.
|
||||
*
|
||||
* Each call to Load() checks an in-process cache first, then reads
|
||||
* <prompt_dir>/<key>.md from disk. The directory must exist and be readable
|
||||
* at construction time; individual file absence is reported lazily at Load().
|
||||
*/
|
||||
class PromptDirectory final : public IPromptDirectory {
|
||||
public:
|
||||
/**
|
||||
* @brief Constructs a PromptDirectory rooted at @p prompt_dir.
|
||||
*
|
||||
* @param prompt_dir Absolute or relative path to the prompt directory.
|
||||
* @throws std::runtime_error if @p prompt_dir does not exist or is not a
|
||||
* directory.
|
||||
*/
|
||||
explicit PromptDirectory(const std::filesystem::path& prompt_dir);
|
||||
|
||||
/**
|
||||
* @brief Loads the prompt for @p key, caching the result.
|
||||
*
|
||||
* Maps @p key → <prompt_dir>/<key>.md.
|
||||
*
|
||||
* @param key Logical prompt key.
|
||||
* @return Prompt text.
|
||||
* @throws std::runtime_error if the file does not exist or is empty.
|
||||
*/
|
||||
[[nodiscard]] std::string Load(std::string_view key) override;
|
||||
|
||||
private:
|
||||
// Directory that prompt keys are resolved against. Presumably set from the
// constructor argument; the definition is not part of this header.
std::filesystem::path prompt_dir_;
|
||||
// Cache consulted by Load() before reading from disk (see the class comment).
// NOTE(review): presumably keyed by prompt key with the prompt text as value;
// confirm against the implementation file.
std::unordered_map<std::string, std::string> cache_;
|
||||
};
|
||||
|
||||
#endif // BIERGARTEN_PIPELINE_INCLUDES_SERVICES_PROMPT_DIRECTORY_H_
|
||||
@@ -7,6 +7,7 @@
|
||||
*/
|
||||
|
||||
#include <sqlite3.h>
|
||||
|
||||
#include <filesystem>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
@@ -20,12 +21,10 @@ void ThrowSqliteError(sqlite3* db_handle, std::string_view action);
|
||||
SqliteDatabaseHandle OpenDatabase(const std::filesystem::path& path);
|
||||
|
||||
void ExecSql(const SqliteDatabaseHandle& db_handle, std::string_view sql,
|
||||
const char* action);
|
||||
const char* action);
|
||||
|
||||
void RollbackTransactionNoThrow(const SqliteDatabaseHandle& db_handle) noexcept;
|
||||
|
||||
} // namespace sqlite_export_service_internal
|
||||
|
||||
#endif // BIERGARTEN_PIPELINE_INCLUDES_SERVICES_SQLITE_CONNECTION_HELPERS_H_
|
||||
|
||||
|
||||
|
||||
@@ -11,6 +11,7 @@
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
|
||||
#include "data_model/application_options.h"
|
||||
#include "services/date_time_provider.h"
|
||||
#include "services/export_service.h"
|
||||
#include "services/sqlite_export_service_helpers.h"
|
||||
@@ -20,7 +21,7 @@
|
||||
*/
|
||||
class SqliteExportService final : public IExportService {
|
||||
public:
|
||||
SqliteExportService();
|
||||
explicit SqliteExportService(const ApplicationOptions& options);
|
||||
~SqliteExportService() override;
|
||||
|
||||
SqliteExportService(const SqliteExportService&) = delete;
|
||||
@@ -41,12 +42,12 @@ class SqliteExportService final : public IExportService {
|
||||
void InitializeSchema() const;
|
||||
void PrepareStatements();
|
||||
void RollbackAndCloseNoThrow() noexcept;
|
||||
void FinalizeStatements() noexcept;
|
||||
|
||||
[[nodiscard]] std::filesystem::path BuildDatabasePath() const;
|
||||
[[nodiscard]] static std::string BuildLocationKey(const Location& location);
|
||||
|
||||
std::unique_ptr<IDateTimeProvider> date_time_provider_;
|
||||
std::filesystem::path output_path_;
|
||||
std::string run_timestamp_utc_;
|
||||
std::filesystem::path database_path_;
|
||||
SqliteDatabaseHandle db_handle_;
|
||||
|
||||
@@ -3,8 +3,8 @@
|
||||
|
||||
/* Umbrella header for backward compatibility. */
|
||||
|
||||
#include "services/sqlite_handle_types.h"
|
||||
#include "services/sqlite_connection_helpers.h"
|
||||
#include "services/sqlite_handle_types.h"
|
||||
#include "services/sqlite_statement_helpers.h"
|
||||
|
||||
#endif // BIERGARTEN_PIPELINE_INCLUDES_SERVICES_SQLITE_EXPORT_SERVICE_HELPERS_H_
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
*/
|
||||
|
||||
#include <sqlite3.h>
|
||||
|
||||
#include <memory>
|
||||
#include <string_view>
|
||||
|
||||
@@ -33,4 +34,3 @@ struct BindParam {
|
||||
} // namespace sqlite_export_service_internal
|
||||
|
||||
#endif // BIERGARTEN_PIPELINE_INCLUDES_SERVICES_SQLITE_HANDLE_TYPES_H_
|
||||
|
||||
|
||||
@@ -3,10 +3,12 @@
|
||||
|
||||
/**
|
||||
* @file services/sqlite_statement_helpers.h
|
||||
* @brief Declarations for statement-level SQLite helper functions and constants.
|
||||
* @brief Declarations for statement-level SQLite helper functions and
|
||||
* constants.
|
||||
*/
|
||||
|
||||
#include <sqlite3.h>
|
||||
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <vector>
|
||||
@@ -107,10 +109,8 @@ void StepStatement(const SqliteDatabaseHandle& db_handle,
|
||||
|
||||
sqlite3_int64 LastInsertRowId(const SqliteDatabaseHandle& db_handle);
|
||||
|
||||
std::string SerializeLocalLanguages(const std::vector<std::string>& local_languages);
|
||||
std::string SerializeVector(const std::vector<std::string>& str_vec);
|
||||
|
||||
} // namespace sqlite_export_service_internal
|
||||
|
||||
#endif // BIERGARTEN_PIPELINE_INCLUDES_SERVICES_SQLITE_STATEMENT_HELPERS_H_
|
||||
|
||||
|
||||
35
tooling/pipeline/includes/services/timer.h
Normal file
35
tooling/pipeline/includes/services/timer.h
Normal file
@@ -0,0 +1,35 @@
|
||||
#ifndef BIERGARTEN_PIPELINE_INCLUDES_SERVICES_TIMER_H_
|
||||
#define BIERGARTEN_PIPELINE_INCLUDES_SERVICES_TIMER_H_
|
||||
|
||||
#include <chrono>
#include <cstdint>
|
||||
|
||||
/**
|
||||
* @file services/timer.h
|
||||
* @brief Simple timer utility for measuring elapsed time.
|
||||
*/
|
||||
/**
 * @brief Stopwatch that measures elapsed time on std::chrono::steady_clock.
 *
 * Measurement starts when the object is constructed. The timer can be neither
 * copied nor moved.
 */
class Timer {
 public:
  Timer() = default;
  ~Timer() = default;

  Timer(const Timer&) = delete;
  Timer& operator=(const Timer&) = delete;
  Timer(Timer&&) = delete;
  Timer& operator=(Timer&&) = delete;

  /**
   * @brief Reports the time since construction or the most recent Reset().
   *
   * @return Elapsed time in whole milliseconds.
   */
  [[nodiscard]] std::int64_t Elapsed() const {
    return ToMilliseconds(std::chrono::steady_clock::now() - start_time_);
  }

  /**
   * @brief Restarts the timer and reports the interval that just ended.
   *
   * The clock is sampled exactly once, so the returned interval ends at the
   * same instant the next interval begins; no time is lost between
   * consecutive measurements.
   *
   * @return Milliseconds elapsed before the restart.
   */
  [[nodiscard]] std::int64_t Reset() {
    const std::chrono::steady_clock::time_point now =
        std::chrono::steady_clock::now();
    const std::int64_t previous_elapsed = ToMilliseconds(now - start_time_);
    start_time_ = now;
    return previous_elapsed;
  }

 private:
  /// @brief Converts a steady_clock duration to a whole-millisecond count.
  static std::int64_t ToMilliseconds(
      const std::chrono::steady_clock::duration interval) {
    return std::chrono::duration_cast<std::chrono::milliseconds>(interval)
        .count();
  }

  /// Start of the interval currently being measured.
  std::chrono::steady_clock::time_point start_time_ =
      std::chrono::steady_clock::now();
};
|
||||
|
||||
#endif // BIERGARTEN_PIPELINE_INCLUDES_SERVICES_TIMER_H_
|
||||
150
tooling/pipeline/src/application_options/parse_arguments.cc
Normal file
150
tooling/pipeline/src/application_options/parse_arguments.cc
Normal file
@@ -0,0 +1,150 @@
|
||||
#include <spdlog/spdlog.h>
|
||||
|
||||
#include <optional>
|
||||
#include <sstream>
|
||||
#include <string>
|
||||
|
||||
#include "data_model/application_options.h"
|
||||
|
||||
std::optional<ApplicationOptions> ParseArguments(const int argc, char** argv) {
|
||||
prog_opts::options_description desc("Pipeline Options");
|
||||
auto opt = desc.add_options();
|
||||
|
||||
opt("help,h", "Produce help message");
|
||||
|
||||
// Defaults sourced from SamplingOptions{} so the CLI and LlamaGenerator
|
||||
// share a single source of truth — changing the struct updates both.
|
||||
auto add_sampling_options = [&]() -> void {
|
||||
const SamplingOptions sampling_defaults{};
|
||||
opt("temperature",
|
||||
prog_opts::value<float>()->default_value(sampling_defaults.temperature),
|
||||
"Sampling temperature (higher = more random)");
|
||||
opt("top-p",
|
||||
prog_opts::value<float>()->default_value(sampling_defaults.top_p),
|
||||
"Nucleus sampling top-p in (0,1] (higher = more random)");
|
||||
opt("top-k",
|
||||
prog_opts::value<uint32_t>()->default_value(sampling_defaults.top_k),
|
||||
"Top-k sampling parameter (higher = more candidate tokens)");
|
||||
opt("n-ctx",
|
||||
prog_opts::value<uint32_t>()->default_value(sampling_defaults.n_ctx),
|
||||
"Context window size in tokens");
|
||||
opt("seed", prog_opts::value<int>()->default_value(sampling_defaults.seed),
|
||||
"Sampler seed: -1 for random, otherwise non-negative integer");
|
||||
};
|
||||
|
||||
// --mocked and --model are mutually exclusive; validation is enforced below
|
||||
// rather than at registration to produce a clear diagnostic message.
|
||||
auto add_generator_options = [&]() -> void {
|
||||
opt("mocked", prog_opts::bool_switch(),
|
||||
"Use mocked generator for brewery/user data");
|
||||
opt("model,m", prog_opts::value<std::string>()->default_value(""),
|
||||
"Path to LLM model (gguf)");
|
||||
};
|
||||
|
||||
auto add_pipeline_options = [&]() -> void {
|
||||
opt("output,o", prog_opts::value<std::string>()->default_value("output"),
|
||||
"Directory for generated artifacts");
|
||||
opt("log-path",
|
||||
prog_opts::value<std::string>()->default_value("pipeline.log"),
|
||||
"Path for application logs");
|
||||
opt("prompt-dir", prog_opts::value<std::string>()->default_value(""),
|
||||
"Directory containing named prompt files (e.g. BREWERY_GENERATION.md)."
|
||||
" Required when not using --mocked.");
|
||||
};
|
||||
|
||||
add_sampling_options();
|
||||
add_generator_options();
|
||||
add_pipeline_options();
|
||||
|
||||
// No flags provided — treat as a help request rather than an error.
|
||||
if (argc == 1) {
|
||||
spdlog::info("Biergarten Pipeline");
|
||||
std::stringstream usage_stream;
|
||||
usage_stream << "\nUsage: biergarten-pipeline [options]\n\n" << desc;
|
||||
spdlog::info(usage_stream.str());
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
try {
|
||||
prog_opts::variables_map var_map;
|
||||
prog_opts::store(prog_opts::parse_command_line(argc, argv, desc), var_map);
|
||||
prog_opts::notify(var_map);
|
||||
|
||||
if (var_map.contains("help")) {
|
||||
std::stringstream help_stream;
|
||||
help_stream << "\n" << desc;
|
||||
spdlog::info(help_stream.str());
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
ApplicationOptions options;
|
||||
|
||||
options.pipeline.output_path = var_map["output"].as<std::string>();
|
||||
options.pipeline.log_path = var_map["log-path"].as<std::string>();
|
||||
options.pipeline.prompt_dir = var_map["prompt-dir"].as<std::string>();
|
||||
|
||||
const bool use_mocked = var_map["mocked"].as<bool>();
|
||||
const std::string model_path = var_map["model"].as<std::string>();
|
||||
|
||||
// Enforce mutual exclusivity before any further configuration is applied.
|
||||
if (use_mocked && !model_path.empty()) {
|
||||
spdlog::error(
|
||||
"Invalid arguments: --mocked and --model are mutually exclusive");
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
if (!use_mocked && model_path.empty()) {
|
||||
spdlog::error(
|
||||
"Invalid arguments: either --mocked or --model must be specified");
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
// Prompt directory is only meaningful for live inference — the mock
|
||||
// generator has no use for it and should not require it to be present.
|
||||
if (!use_mocked && options.pipeline.prompt_dir.empty()) {
|
||||
spdlog::error(
|
||||
"Invalid arguments: --prompt-dir is required when not using "
|
||||
"--mocked");
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
options.generator.use_mocked = use_mocked;
|
||||
options.generator.model_path = model_path;
|
||||
|
||||
// Only populate sampling config when the user explicitly overrides at
|
||||
// least one value. Leaving it as std::nullopt lets LlamaGenerator fall
|
||||
// back to its own SamplingOptions{} defaults, keeping the two paths
|
||||
// consistent without redundant copies.
|
||||
const bool user_provided_sampling =
|
||||
!var_map["temperature"].defaulted() || !var_map["top-p"].defaulted() ||
|
||||
!var_map["top-k"].defaulted() || !var_map["n-ctx"].defaulted() ||
|
||||
!var_map["seed"].defaulted();
|
||||
|
||||
if (user_provided_sampling) {
|
||||
// Warn but do not fail — the run is still valid, the flags are just
|
||||
// silently irrelevant when no model is loaded.
|
||||
if (use_mocked) {
|
||||
spdlog::warn("Sampling parameters are ignored when using --mocked");
|
||||
} else {
|
||||
SamplingOptions sampling;
|
||||
sampling.temperature = var_map["temperature"].as<float>();
|
||||
sampling.top_p = var_map["top-p"].as<float>();
|
||||
sampling.top_k = var_map["top-k"].as<uint32_t>();
|
||||
sampling.n_ctx = var_map["n-ctx"].as<uint32_t>();
|
||||
sampling.seed = var_map["seed"].as<int>();
|
||||
|
||||
options.generator.sampling = sampling;
|
||||
}
|
||||
}
|
||||
|
||||
return options;
|
||||
|
||||
} catch (const std::exception& exception) {
|
||||
spdlog::error("Failed to parse command-line arguments: {}",
|
||||
exception.what());
|
||||
return std::nullopt;
|
||||
} catch (...) {
|
||||
spdlog::error("Failed to parse command-line arguments: unknown error");
|
||||
return std::nullopt;
|
||||
}
|
||||
}
|
||||
@@ -33,6 +33,9 @@ static std::string FormatLocalLanguageCodes(
|
||||
return formatted;
|
||||
}
|
||||
|
||||
// GBNF grammar for structured brewery JSON output.
|
||||
// @TODO move to a separate gbnf file if it grows in complexity or is shared
|
||||
// across modules.
|
||||
static constexpr std::string_view kBreweryJsonGrammar = R"json_brewery(
|
||||
root ::= thought-block "{" ws "\"name_en\"" ws ":" ws string ws "," ws "\"description_en\"" ws ":" ws string ws "," ws "\"name_local\"" ws ":" ws string ws "," ws "\"description_local\"" ws ":" ws string ws "}" ws
|
||||
thought-block ::= [^{]*
|
||||
@@ -59,11 +62,12 @@ BreweryResult LlamaGenerator::GenerateBrewery(
|
||||
location.country.empty() ? std::string{}
|
||||
: std::format(", {}", location.country);
|
||||
/**
|
||||
* Load brewery system prompt from file
|
||||
* Falls back to minimal inline prompt if file not found
|
||||
* Load brewery system prompt via the injected prompt directory.
|
||||
* The key "BREWERY_GENERATION" resolves to BREWERY_GENERATION.md inside
|
||||
* the configured --prompt-dir. Throws on missing or empty file.
|
||||
*/
|
||||
const std::string system_prompt =
|
||||
LoadBrewerySystemPrompt("prompts/system.md");
|
||||
prompt_directory_->Load("BREWERY_GENERATION");
|
||||
|
||||
std::string user_prompt = std::format(
|
||||
"## CITY:\n{}\n\n## COUNTRY:\n{}\n\n## LOCAL LANGUAGE CODES:\n{}\n\n## "
|
||||
|
||||
@@ -12,6 +12,13 @@
|
||||
#include "data_generation/llama_generator.h"
|
||||
#include "data_generation/llama_generator_helpers.h"
|
||||
|
||||
// TODO: Implement locale-aware user profile generation.
|
||||
// Current implementation returns a hardcoded test value and ignores the
|
||||
// locale parameter. Future implementation should:
|
||||
// 1. Load a USER_GENERATION.md prompt template with locale context
|
||||
// 2. Perform LLM inference with locale-specific username/bio generation
|
||||
// 3. Parse and validate JSON output with retry handling (similar to brewery)
|
||||
// 4. Return locale-aware username and biography
|
||||
UserResult LlamaGenerator::GenerateUser(const std::string& locale) {
|
||||
return {.username = "test_user",
|
||||
.bio = "This is a test user profile from " + locale + "."};
|
||||
|
||||
@@ -58,6 +58,11 @@ static std::string CondenseWhitespace(std::string_view text) {
|
||||
return out;
|
||||
}
|
||||
|
||||
// Guard against truncating in the first half of the string.
|
||||
// This preserves the critical opening content and avoids cutting critical
|
||||
// context words early in the region description.
|
||||
static constexpr size_t kTruncationGuardDivisor = 2;
|
||||
|
||||
/**
|
||||
* Truncate region context to fit within max length while preserving word
|
||||
* boundaries
|
||||
@@ -71,7 +76,8 @@ std::string PrepareRegionContext(std::string_view region_context,
|
||||
|
||||
normalized.resize(max_chars);
|
||||
const size_t last_space = normalized.find_last_of(' ');
|
||||
if (last_space != std::string::npos && last_space > max_chars / 2) {
|
||||
if (last_space != std::string::npos &&
|
||||
last_space > max_chars / kTruncationGuardDivisor) {
|
||||
normalized.resize(last_space);
|
||||
}
|
||||
|
||||
|
||||
@@ -19,6 +19,9 @@
|
||||
#include "llama.h"
|
||||
|
||||
static constexpr size_t kPromptTokenSlack = 8;
|
||||
// Minimum tokens to keep when using top-p sampling. Ensures at least one
|
||||
// candidate token remains available even with very restrictive top-p values.
|
||||
static constexpr size_t kTopPMinKeep = 1;
|
||||
|
||||
namespace {
|
||||
|
||||
@@ -62,7 +65,7 @@ SamplerHandle MakeSamplerChain(const llama_vocab* vocab,
|
||||
"LlamaGenerator: failed to initialize temperature sampler");
|
||||
add_sampler(llama_sampler_init_top_k(static_cast<int32_t>(config.top_k)),
|
||||
"LlamaGenerator: failed to initialize top-k sampler");
|
||||
add_sampler(llama_sampler_init_top_p(config.top_p, 1),
|
||||
add_sampler(llama_sampler_init_top_p(config.top_p, kTopPMinKeep),
|
||||
"LlamaGenerator: failed to initialize top-p sampler");
|
||||
add_sampler(llama_sampler_init_dist(config.seed),
|
||||
"LlamaGenerator: failed to initialize distribution sampler");
|
||||
|
||||
@@ -32,9 +32,11 @@ void LlamaGenerator::ContextDeleter::operator()(
|
||||
|
||||
LlamaGenerator::LlamaGenerator(
|
||||
const ApplicationOptions& options, const std::string& model_path,
|
||||
std::unique_ptr<IPromptFormatter> prompt_formatter)
|
||||
std::unique_ptr<IPromptFormatter> prompt_formatter,
|
||||
std::unique_ptr<IPromptDirectory> prompt_directory)
|
||||
: rng_(std::random_device{}()),
|
||||
prompt_formatter_(std::move(prompt_formatter)) {
|
||||
prompt_formatter_(std::move(prompt_formatter)),
|
||||
prompt_directory_(std::move(prompt_directory)) {
|
||||
if (model_path.empty()) {
|
||||
throw std::runtime_error("LlamaGenerator: model path must not be empty");
|
||||
}
|
||||
@@ -44,41 +46,49 @@ LlamaGenerator::LlamaGenerator(
|
||||
"LlamaGenerator: prompt formatter dependency must not be null");
|
||||
}
|
||||
|
||||
if (options.temperature < 0.0F) {
|
||||
if (!prompt_directory_) {
|
||||
throw std::runtime_error(
|
||||
"LlamaGenerator: prompt directory dependency must not be null");
|
||||
}
|
||||
|
||||
const auto sampling = options.generator.sampling.value_or(SamplingOptions{});
|
||||
|
||||
if (sampling.temperature < 0.0F) {
|
||||
throw std::runtime_error(
|
||||
"LlamaGenerator: sampling temperature must be >= 0");
|
||||
}
|
||||
|
||||
if (options.top_p <= 0.0F || options.top_p > 1.0F) {
|
||||
if (sampling.top_p <= 0.0F || sampling.top_p > 1.0F) {
|
||||
throw std::runtime_error(
|
||||
"LlamaGenerator: sampling top-p must be in (0, 1]");
|
||||
}
|
||||
|
||||
if (options.top_k == 0U) {
|
||||
if (sampling.top_k == 0U) {
|
||||
throw std::runtime_error("LlamaGenerator: sampling top-k must be > 0");
|
||||
}
|
||||
|
||||
if (options.seed < -1) {
|
||||
if (sampling.seed < -1) {
|
||||
throw std::runtime_error(
|
||||
"LlamaGenerator: seed must be >= 0, or -1 for random");
|
||||
}
|
||||
|
||||
if (options.n_ctx == 0 || options.n_ctx > kMaxContextSize) {
|
||||
if (sampling.n_ctx == 0 || sampling.n_ctx > kMaxContextSize) {
|
||||
throw std::runtime_error(
|
||||
"LlamaGenerator: context size must be in range [1, 32768]");
|
||||
}
|
||||
|
||||
sampling_temperature_ = options.temperature;
|
||||
sampling_top_p_ = options.top_p;
|
||||
sampling_top_k_ = options.top_k;
|
||||
sampling_temperature_ = sampling.temperature;
|
||||
sampling_top_p_ = sampling.top_p;
|
||||
sampling_top_k_ = sampling.top_k;
|
||||
|
||||
if (options.seed == -1) {
|
||||
if (sampling.seed == -1) {
|
||||
std::random_device random_device;
|
||||
rng_.seed(random_device());
|
||||
} else {
|
||||
rng_.seed(static_cast<uint32_t>(options.seed));
|
||||
rng_.seed(static_cast<uint32_t>(sampling.seed));
|
||||
}
|
||||
n_ctx_ = options.n_ctx;
|
||||
|
||||
n_ctx_ = sampling.n_ctx;
|
||||
|
||||
this->Load(model_path);
|
||||
}
|
||||
|
||||
@@ -14,6 +14,10 @@
|
||||
#include "data_generation/llama_generator.h"
|
||||
#include "llama.h"
|
||||
|
||||
// Maximum batch size for decode operations. Capping the batch prevents
|
||||
// excessive memory allocation while maintaining inference performance.
|
||||
static constexpr uint32_t kMaxBatchSize = 5000U;
|
||||
|
||||
void LlamaGenerator::Load(const std::string& model_path) {
|
||||
context_.reset();
|
||||
model_.reset();
|
||||
@@ -28,7 +32,7 @@ void LlamaGenerator::Load(const std::string& model_path) {
|
||||
|
||||
llama_context_params context_params = llama_context_default_params();
|
||||
context_params.n_ctx = n_ctx_;
|
||||
context_params.n_batch = std::min(n_ctx_, static_cast<uint32_t>(5000));
|
||||
context_params.n_batch = std::min(n_ctx_, kMaxBatchSize);
|
||||
|
||||
LlamaGenerator::ContextHandle loaded_context(
|
||||
llama_init_from_model(loaded_model.get(), context_params));
|
||||
|
||||
@@ -1,55 +0,0 @@
|
||||
/**
|
||||
* @file data_generation/llama/load_brewery_prompt.cc
|
||||
* @brief Resolves brewery system prompt content from cache or a configured
|
||||
* filesystem path; throws std::runtime_error when the file is missing or empty.
|
||||
*/
|
||||
|
||||
#include <spdlog/spdlog.h>
|
||||
|
||||
#include <filesystem>
|
||||
#include <fstream>
|
||||
#include <stdexcept>
|
||||
|
||||
#include "data_generation/llama_generator.h"
|
||||
|
||||
/**
|
||||
* @brief Loads brewery system prompt from disk or cache.
|
||||
*
|
||||
* @param prompt_file_path Preferred prompt file location.
|
||||
* @return Prompt text loaded from disk.
|
||||
*/
|
||||
std::string LlamaGenerator::LoadBrewerySystemPrompt(
|
||||
const std::filesystem::path& prompt_file_path) {
|
||||
// Return cached version if already loaded
|
||||
if (!brewery_system_prompt_.empty()) {
|
||||
return brewery_system_prompt_;
|
||||
}
|
||||
|
||||
std::ifstream prompt_file(prompt_file_path);
|
||||
if (!prompt_file.is_open()) {
|
||||
spdlog::error(
|
||||
"LlamaGenerator: Failed to open brewery system prompt file '{}'",
|
||||
prompt_file_path.string());
|
||||
throw std::runtime_error(
|
||||
"LlamaGenerator: missing brewery system prompt file: " +
|
||||
prompt_file_path.string());
|
||||
}
|
||||
|
||||
const std::string prompt((std::istreambuf_iterator(prompt_file)),
|
||||
std::istreambuf_iterator<char>());
|
||||
prompt_file.close();
|
||||
|
||||
if (prompt.empty()) {
|
||||
spdlog::error("LlamaGenerator: Brewery system prompt file '{}' is empty",
|
||||
prompt_file_path.string());
|
||||
throw std::runtime_error(
|
||||
"LlamaGenerator: empty brewery system prompt file: " +
|
||||
prompt_file_path.string());
|
||||
}
|
||||
|
||||
spdlog::info(
|
||||
"LlamaGenerator: Loaded brewery system prompt from '{}' ({} chars)",
|
||||
prompt_file_path.string(), prompt.length());
|
||||
brewery_system_prompt_ = prompt;
|
||||
return brewery_system_prompt_;
|
||||
}
|
||||
@@ -17,9 +17,9 @@ BreweryResult MockGenerator::GenerateBrewery(
|
||||
const std::string_view adjective =
|
||||
kBreweryAdjectives.at(hash % kBreweryAdjectives.size());
|
||||
const std::string_view noun =
|
||||
kBreweryNouns.at(hash / 7 % kBreweryNouns.size());
|
||||
const std::string_view base_description =
|
||||
kBreweryDescriptions.at((hash / 13) % kBreweryDescriptions.size());
|
||||
kBreweryNouns.at(hash / kNounHashStride % kBreweryNouns.size());
|
||||
const std::string_view base_description = kBreweryDescriptions.at(
|
||||
(hash / kDescriptionHashStride) % kBreweryDescriptions.size());
|
||||
|
||||
const std::string name =
|
||||
std::format("{} {} {}", location.city, adjective, noun);
|
||||
|
||||
@@ -15,7 +15,7 @@ UserResult MockGenerator::GenerateUser(const std::string& locale) {
|
||||
|
||||
UserResult result;
|
||||
const std::string_view username = kUsernames[hash % kUsernames.size()];
|
||||
const std::string_view bio = kBios[hash / 11 % kBios.size()];
|
||||
const std::string_view bio = kBios[hash / kBioHashStride % kBios.size()];
|
||||
result.username = username;
|
||||
result.bio = bio;
|
||||
return result;
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
#include <boost/di.hpp>
|
||||
#include <boost/program_options.hpp>
|
||||
#include <chrono>
|
||||
#include <cstdint>
|
||||
#include <exception>
|
||||
#include <memory>
|
||||
#include <optional>
|
||||
@@ -23,127 +24,14 @@
|
||||
#include "llama_backend_state.h"
|
||||
#include "services/enrichment_service.h"
|
||||
#include "services/export_service.h"
|
||||
#include "services/prompt_directory.h"
|
||||
#include "services/sqlite_export_service.h"
|
||||
#include "services/timer.h"
|
||||
#include "services/wikipedia_service.h"
|
||||
#include "web_client/curl_web_client.h"
|
||||
|
||||
namespace prog_opts = boost::program_options;
|
||||
namespace di = boost::di;
|
||||
|
||||
/**
|
||||
* @brief Parse command-line arguments into ApplicationOptions.
|
||||
*
|
||||
* @param argc Command-line argument count.
|
||||
* @param argv Command-line arguments.
|
||||
* @return Parsed ApplicationOptions if parsing succeeded, std::nullopt
|
||||
* otherwise.
|
||||
*/
|
||||
std::optional<ApplicationOptions> ParseArguments(const int argc, char** argv) {
|
||||
prog_opts::options_description desc("Pipeline Options");
|
||||
|
||||
auto opt = desc.add_options();
|
||||
|
||||
opt("help,h", "Produce help message");
|
||||
|
||||
opt("mocked", prog_opts::bool_switch(),
|
||||
"Use mocked generator for brewery/user data");
|
||||
|
||||
opt("model,m", prog_opts::value<std::string>()->default_value(""),
|
||||
"Path to LLM model (gguf)");
|
||||
|
||||
opt("temperature", prog_opts::value<float>()->default_value(1.0F),
|
||||
"Sampling temperature (higher = more random)");
|
||||
|
||||
opt("top-p", prog_opts::value<float>()->default_value(0.95F),
|
||||
"Nucleus sampling top-p in (0,1] (higher = more random)");
|
||||
|
||||
opt("top-k", prog_opts::value<uint32_t>()->default_value(64),
|
||||
"Top-k sampling parameter (higher = more candidate tokens)");
|
||||
|
||||
opt("n-ctx", prog_opts::value<uint32_t>()->default_value(8192),
|
||||
"Context window size in tokens (1-32768)");
|
||||
|
||||
opt("seed", prog_opts::value<int>()->default_value(-1),
|
||||
"Sampler seed: -1 for random, otherwise non-negative integer");
|
||||
|
||||
// Handle the "no arguments" or "help" case
|
||||
if (argc == 1) {
|
||||
spdlog::info("Biergarten Pipeline");
|
||||
std::stringstream usage_stream;
|
||||
usage_stream << "\nUsage: biergarten-pipeline [options]\n\n" << desc;
|
||||
spdlog::info(usage_stream.str());
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
try {
|
||||
prog_opts::variables_map variables_map;
|
||||
prog_opts::store(prog_opts::parse_command_line(argc, argv, desc),
|
||||
variables_map);
|
||||
prog_opts::notify(variables_map);
|
||||
|
||||
if (variables_map.contains("help")) {
|
||||
std::stringstream help_stream;
|
||||
help_stream << "\n" << desc;
|
||||
spdlog::info(help_stream.str());
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
const auto use_mocked = variables_map["mocked"].as<bool>();
|
||||
const auto model_path = variables_map["model"].as<std::string>();
|
||||
|
||||
if (use_mocked && !model_path.empty()) {
|
||||
spdlog::error(
|
||||
"Invalid arguments: --mocked and --model are mutually exclusive");
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
if (!use_mocked && model_path.empty()) {
|
||||
spdlog::error(
|
||||
"Invalid arguments: Either --mocked or --model must be specified");
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
const bool has_llm_params = !variables_map["temperature"].defaulted() ||
|
||||
!variables_map["top-p"].defaulted() ||
|
||||
!variables_map["top-k"].defaulted() ||
|
||||
!variables_map["seed"].defaulted();
|
||||
|
||||
if (use_mocked && has_llm_params) {
|
||||
spdlog::warn(
|
||||
"Sampling parameters (--temperature, --top-p, --top-k, --seed) are"
|
||||
" ignored when using --mocked");
|
||||
}
|
||||
|
||||
ApplicationOptions options;
|
||||
options.use_mocked = use_mocked;
|
||||
options.model_path = model_path;
|
||||
options.temperature = variables_map["temperature"].as<float>();
|
||||
options.top_p = variables_map["top-p"].as<float>();
|
||||
options.top_k = variables_map["top-k"].as<uint32_t>();
|
||||
options.n_ctx = variables_map["n-ctx"].as<uint32_t>();
|
||||
options.seed = variables_map["seed"].as<int>();
|
||||
|
||||
return options;
|
||||
} catch (const std::exception& exception) {
|
||||
spdlog::error("Failed to parse command-line arguments: {}",
|
||||
exception.what());
|
||||
return std::nullopt;
|
||||
} catch (...) {
|
||||
spdlog::error("Failed to parse command-line arguments: unknown error");
|
||||
return std::nullopt;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * @brief Simple stopwatch based on std::chrono::steady_clock.
 */
struct Timer {
  // Reference instant; defaults to the moment the Timer is constructed.
  std::chrono::steady_clock::time_point start_time =
      std::chrono::steady_clock::now();

  /**
   * @brief Time elapsed since start_time.
   *
   * @return Whole milliseconds between start_time and the current
   * steady_clock reading.
   */
  [[nodiscard]] int64_t Elapsed() const {
    const auto delta = std::chrono::steady_clock::now() - start_time;
    const auto as_millis =
        std::chrono::duration_cast<std::chrono::milliseconds>(delta);
    return as_millis.count();
  }
};
|
||||
|
||||
int main(const int argc, char** argv) {
|
||||
try {
|
||||
Timer timer;
|
||||
@@ -157,6 +45,20 @@ int main(const int argc, char** argv) {
|
||||
}
|
||||
|
||||
const auto options = *parsed_options;
|
||||
const std::string model_path = options.generator.model_path.string();
|
||||
const auto sampling =
|
||||
options.generator.sampling.value_or(SamplingOptions{});
|
||||
|
||||
std::unique_ptr<IPromptDirectory> prompt_directory;
|
||||
if (!options.generator.use_mocked) {
|
||||
try {
|
||||
prompt_directory =
|
||||
std::make_unique<PromptDirectory>(options.pipeline.prompt_dir);
|
||||
} catch (const std::exception& dir_error) {
|
||||
spdlog::error("[Startup] Invalid --prompt-dir: {}", dir_error.what());
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
const auto injector = di::make_injector(
|
||||
di::bind<WebClient>().to<CURLWebClient>(),
|
||||
@@ -164,10 +66,11 @@ int main(const int argc, char** argv) {
|
||||
di::bind<IEnrichmentService>().to<WikipediaService>(),
|
||||
di::bind<IExportService>().to<SqliteExportService>(),
|
||||
di::bind<IPromptFormatter>().to<Gemma4JinjaPromptFormatter>(),
|
||||
di::bind<std::string>().to(options.model_path),
|
||||
di::bind<std::string>().to(model_path),
|
||||
di::bind<DataGenerator>().to(
|
||||
[options](const auto& inj) -> std::unique_ptr<DataGenerator> {
|
||||
if (options.use_mocked) {
|
||||
[options, model_path, sampling, &prompt_directory](
|
||||
const auto& inj) -> std::unique_ptr<DataGenerator> {
|
||||
if (options.generator.use_mocked) {
|
||||
spdlog::info(
|
||||
"[Generator] Using MockGenerator (no model path provided)");
|
||||
return std::make_unique<MockGenerator>();
|
||||
@@ -176,9 +79,12 @@ int main(const int argc, char** argv) {
|
||||
spdlog::info(
|
||||
"[Generator] Using LlamaGenerator: {} (temperature={}, "
|
||||
"top-p={}, top-k={}, n_ctx={}, seed={})",
|
||||
options.model_path, options.temperature, options.top_p,
|
||||
options.top_k, options.n_ctx, options.seed);
|
||||
return inj.template create<std::unique_ptr<LlamaGenerator>>();
|
||||
model_path, sampling.temperature, sampling.top_p,
|
||||
sampling.top_k, sampling.n_ctx, sampling.seed);
|
||||
return std::make_unique<LlamaGenerator>(
|
||||
options, model_path,
|
||||
inj.template create<std::unique_ptr<IPromptFormatter>>(),
|
||||
std::move(prompt_directory));
|
||||
}));
|
||||
|
||||
auto generator =
|
||||
|
||||
85
tooling/pipeline/src/services/prompt_directory.cc
Normal file
85
tooling/pipeline/src/services/prompt_directory.cc
Normal file
@@ -0,0 +1,85 @@
|
||||
/**
|
||||
* @file services/prompt_directory.cc
|
||||
* @brief PromptDirectory implementation: validates the directory at
|
||||
* construction and loads named prompt files on demand with in-process caching.
|
||||
*/
|
||||
|
||||
#include "services/prompt_directory.h"
|
||||
|
||||
#include <spdlog/spdlog.h>
|
||||
|
||||
#include <filesystem>
|
||||
#include <fstream>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// PromptDirectory
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * @brief Stores the prompt directory path after validating it.
 *
 * The path must exist, must be a directory, and must be openable with a
 * directory_iterator. Each check uses the non-throwing std::error_code
 * overloads and is converted into a std::runtime_error on failure.
 *
 * @param prompt_dir Directory that holds the prompt files.
 * @throws std::runtime_error If any of the three checks fails.
 */
PromptDirectory::PromptDirectory(const std::filesystem::path& prompt_dir)
    : prompt_dir_(prompt_dir) {
  namespace fs = std::filesystem;
  std::error_code status;

  // Check 1: the path has to exist.
  const bool present = fs::exists(prompt_dir_, status);
  if (status || !present) {
    throw std::runtime_error(
        "PromptDirectory: prompt directory does not exist: " +
        prompt_dir_.string());
  }

  // Check 2: the path has to name a directory rather than a file.
  const bool is_dir = fs::is_directory(prompt_dir_, status);
  if (status || !is_dir) {
    throw std::runtime_error(
        "PromptDirectory: prompt directory path is not a directory: " +
        prompt_dir_.string());
  }

  // Check 3: opening an iterator on the directory probes readability; the
  // iterator itself is not used afterwards.
  const fs::directory_iterator readability_probe(prompt_dir_, status);
  if (status) {
    throw std::runtime_error(
        "PromptDirectory: prompt directory is not readable: " +
        prompt_dir_.string() + " (" + status.message() + ")");
  }

  spdlog::info("[PromptDirectory] Resolved prompt directory: {}",
               prompt_dir_.string());
}
|
||||
|
||||
std::string PromptDirectory::Load(std::string_view key) {
|
||||
const std::string key_str(key);
|
||||
|
||||
// Return cached content if already loaded during this run.
|
||||
const auto cache_it = cache_.find(key_str);
|
||||
if (cache_it != cache_.end()) {
|
||||
return cache_it->second;
|
||||
}
|
||||
|
||||
// Scenario 3: resolve <prompt_dir>/<key>.md and require it to exist.
|
||||
const std::filesystem::path file_path =
|
||||
prompt_dir_ / std::filesystem::path(key_str + ".md");
|
||||
|
||||
std::ifstream file(file_path);
|
||||
if (!file.is_open()) {
|
||||
throw std::runtime_error(
|
||||
"PromptDirectory: prompt file not found for key '" + key_str +
|
||||
"': " + file_path.string());
|
||||
}
|
||||
|
||||
std::string content((std::istreambuf_iterator<char>(file)),
|
||||
std::istreambuf_iterator<char>());
|
||||
file.close();
|
||||
|
||||
if (content.empty()) {
|
||||
throw std::runtime_error("PromptDirectory: prompt file for key '" +
|
||||
key_str + "' is empty: " + file_path.string());
|
||||
}
|
||||
|
||||
spdlog::info("[PromptDirectory] Loaded prompt '{}' from '{}' ({} chars)",
|
||||
key_str, file_path.string(), content.size());
|
||||
|
||||
cache_.emplace(key_str, content);
|
||||
return content;
|
||||
}
|
||||
@@ -1,24 +0,0 @@
|
||||
/**
|
||||
* @file services/sqlite/build_database_path.cc
|
||||
* @brief SqliteExportService::BuildDatabasePath() implementation.
|
||||
*/
|
||||
|
||||
#include <filesystem>
|
||||
#include <string>
|
||||
|
||||
#include "services/sqlite_export_service.h"
|
||||
|
||||
std::filesystem::path SqliteExportService::BuildDatabasePath() const {
|
||||
std::filesystem::path base_filename("biergarten_seed_" + run_timestamp_utc_ +
|
||||
".sqlite");
|
||||
std::filesystem::path candidate =
|
||||
std::filesystem::current_path() / base_filename;
|
||||
|
||||
for (int suffix = 1; std::filesystem::exists(candidate); ++suffix) {
|
||||
candidate = std::filesystem::current_path() /
|
||||
std::filesystem::path("biergarten_seed_" + run_timestamp_utc_ +
|
||||
"-" + std::to_string(suffix) + ".sqlite");
|
||||
}
|
||||
|
||||
return candidate;
|
||||
}
|
||||
@@ -8,7 +8,6 @@
|
||||
#include "services/sqlite_export_service.h"
|
||||
#include "services/sqlite_export_service_helpers.h"
|
||||
|
||||
|
||||
void SqliteExportService::Finalize() {
|
||||
if (db_handle_ == nullptr) {
|
||||
return;
|
||||
|
||||
@@ -10,7 +10,8 @@ void SqliteDatabaseDeleter::operator()(sqlite3* handle) const noexcept {
|
||||
}
|
||||
}
|
||||
|
||||
void SqliteStatementDeleter::operator()(sqlite3_stmt* statement) const noexcept {
|
||||
void SqliteStatementDeleter::operator()(
|
||||
sqlite3_stmt* statement) const noexcept {
|
||||
if (statement != nullptr) {
|
||||
sqlite3_finalize(statement);
|
||||
}
|
||||
@@ -23,7 +24,6 @@ void ThrowSqliteError(sqlite3* db_handle, std::string_view action) {
|
||||
}
|
||||
|
||||
SqliteDatabaseHandle OpenDatabase(const std::filesystem::path& path) {
|
||||
|
||||
sqlite3* raw_handle = nullptr;
|
||||
const int result = sqlite3_open(path.string().c_str(), &raw_handle);
|
||||
|
||||
@@ -54,7 +54,8 @@ void ExecSql(const SqliteDatabaseHandle& db_handle, std::string_view sql,
|
||||
}
|
||||
}
|
||||
|
||||
void RollbackTransactionNoThrow(const SqliteDatabaseHandle& db_handle) noexcept {
|
||||
void RollbackTransactionNoThrow(
|
||||
const SqliteDatabaseHandle& db_handle) noexcept {
|
||||
if (!db_handle) {
|
||||
return;
|
||||
}
|
||||
@@ -63,4 +64,3 @@ void RollbackTransactionNoThrow(const SqliteDatabaseHandle& db_handle) noexcept
|
||||
}
|
||||
|
||||
} // namespace sqlite_export_service_internal
|
||||
|
||||
@@ -1,11 +1,12 @@
|
||||
#include "services/sqlite_statement_helpers.h"
|
||||
#include "services/sqlite_connection_helpers.h"
|
||||
|
||||
#include <cstring>
|
||||
#include <memory>
|
||||
#include <limits>
|
||||
#include <stdexcept>
|
||||
#include <boost/json.hpp>
|
||||
#include <cstring>
|
||||
#include <limits>
|
||||
#include <memory>
|
||||
#include <stdexcept>
|
||||
|
||||
#include "services/sqlite_connection_helpers.h"
|
||||
|
||||
namespace sqlite_export_service_internal {
|
||||
|
||||
@@ -86,16 +87,6 @@ sqlite3_int64 LastInsertRowId(const SqliteDatabaseHandle& db_handle) {
|
||||
return sqlite3_last_insert_rowid(db_handle.get());
|
||||
}
|
||||
|
||||
std::string SerializeLocalLanguages(
|
||||
const std::vector<std::string>& local_languages) {
|
||||
boost::json::array array;
|
||||
array.reserve(local_languages.size());
|
||||
for (const auto& language : local_languages) {
|
||||
array.emplace_back(language);
|
||||
}
|
||||
return boost::json::serialize(array);
|
||||
}
|
||||
|
||||
std::string SerializeVector(const std::vector<std::string>& str_vec) {
|
||||
boost::json::array array(str_vec.size());
|
||||
for (const auto& s : str_vec) {
|
||||
@@ -105,4 +96,3 @@ std::string SerializeVector(const std::vector<std::string>& str_vec) {
|
||||
}
|
||||
|
||||
} // namespace sqlite_export_service_internal
|
||||
|
||||
@@ -11,6 +11,19 @@
|
||||
#include "services/sqlite_export_service.h"
|
||||
#include "services/sqlite_export_service_helpers.h"
|
||||
|
||||
std::filesystem::path SqliteExportService::BuildDatabasePath() const {
|
||||
std::filesystem::path base_filename("biergarten_seed_" + run_timestamp_utc_ +
|
||||
".sqlite");
|
||||
std::filesystem::path candidate = output_path_ / base_filename;
|
||||
|
||||
for (int suffix = 1; std::filesystem::exists(candidate); ++suffix) {
|
||||
candidate = output_path_ /
|
||||
std::filesystem::path("biergarten_seed_" + run_timestamp_utc_ +
|
||||
"-" + std::to_string(suffix) + ".sqlite");
|
||||
}
|
||||
|
||||
return candidate;
|
||||
}
|
||||
|
||||
void SqliteExportService::InitializeSchema() const {
|
||||
sqlite_export_service_internal::ExecSql(
|
||||
@@ -46,7 +59,6 @@ void SqliteExportService::RollbackAndCloseNoThrow() noexcept {
|
||||
location_cache_.clear();
|
||||
}
|
||||
|
||||
|
||||
void SqliteExportService::Initialize() {
|
||||
if (db_handle_ != nullptr) {
|
||||
throw std::runtime_error("SQLite export service is already initialized");
|
||||
|
||||
@@ -3,6 +3,8 @@
|
||||
* @brief SqliteExportService::ProcessRecord() implementation.
|
||||
*/
|
||||
|
||||
#include <iomanip>
|
||||
#include <sstream>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
|
||||
|
||||
@@ -7,11 +7,12 @@
|
||||
|
||||
#include <memory>
|
||||
|
||||
SqliteExportService::SqliteExportService()
|
||||
: date_time_provider_(std::make_unique<SystemDateTimeProvider>()) {}
|
||||
SqliteExportService::SqliteExportService(const ApplicationOptions& options)
|
||||
: date_time_provider_(std::make_unique<SystemDateTimeProvider>()),
|
||||
output_path_(options.pipeline.output_path) {}
|
||||
|
||||
SqliteExportService::~SqliteExportService() {
|
||||
if (db_handle_ != nullptr) {
|
||||
RollbackAndCloseNoThrow();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -17,6 +17,7 @@ using CurlHandle = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
|
||||
|
||||
static constexpr long kConnectionTimeout = 10;
|
||||
static constexpr long kRequestTimeout = 30;
|
||||
static constexpr long kMaxRedirects = 5;
|
||||
static constexpr int32_t kOkHttpStatus = 200;
|
||||
|
||||
static CurlHandle CreateHandle() {
|
||||
@@ -32,7 +33,7 @@ static void SetCommonGetOptions(CURL* curl, const std::string& url) {
|
||||
curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
|
||||
curl_easy_setopt(curl, CURLOPT_USERAGENT, "biergarten-pipeline/0.1.0");
|
||||
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
|
||||
curl_easy_setopt(curl, CURLOPT_MAXREDIRS, 5L);
|
||||
curl_easy_setopt(curl, CURLOPT_MAXREDIRS, kMaxRedirects);
|
||||
curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, kConnectionTimeout);
|
||||
curl_easy_setopt(curl, CURLOPT_TIMEOUT, kRequestTimeout);
|
||||
curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, "gzip");
|
||||
|
||||
Reference in New Issue
Block a user