mirror of
https://github.com/aaronpo97/the-biergarten-app.git
synced 2026-06-01 01:54:00 +00:00
refactor(pipeline): restructure config, add PromptDirectory, consolidate SQLite layer (#217)
* Refactor ApplicationOptions to separate config concerns * add prompt dir app option * readability updates: remove magic numbers, update comments * codebase formatting * Update docs * Extract argument parsing, timer out of
This commit is contained in:
@@ -17,6 +17,7 @@
|
||||
#include "data_generation/data_generator.h"
|
||||
#include "data_generation/prompt_formatting/prompt_formatter.h"
|
||||
#include "data_model/application_options.h"
|
||||
#include "services/prompt_directory.h"
|
||||
|
||||
struct llama_model;
|
||||
struct llama_context;
|
||||
@@ -33,10 +34,12 @@ class LlamaGenerator final : public DataGenerator {
|
||||
* @param options Parsed application options.
|
||||
* @param model_path Filesystem path to GGUF model assets.
|
||||
* @param prompt_formatter Formatter that produces model-specific prompts.
|
||||
* @param prompt_directory Directory service for loading named prompt files.
|
||||
*/
|
||||
LlamaGenerator(const ApplicationOptions& options,
|
||||
const std::string& model_path,
|
||||
std::unique_ptr<IPromptFormatter> prompt_formatter);
|
||||
std::unique_ptr<IPromptFormatter> prompt_formatter,
|
||||
std::unique_ptr<IPromptDirectory> prompt_directory);
|
||||
|
||||
~LlamaGenerator() override;
|
||||
|
||||
@@ -119,15 +122,6 @@ class LlamaGenerator final : public DataGenerator {
|
||||
int max_tokens = kDefaultMaxTokens,
|
||||
std::string_view grammar = {});
|
||||
|
||||
/**
|
||||
* @brief Loads the brewery system prompt from disk.
|
||||
*
|
||||
* @param prompt_file_path Prompt file path to try first.
|
||||
* @return Loaded prompt text.
|
||||
*/
|
||||
std::string LoadBrewerySystemPrompt(
|
||||
const std::filesystem::path& prompt_file_path);
|
||||
|
||||
ModelHandle model_;
|
||||
ContextHandle context_;
|
||||
float sampling_temperature_ = 1.0F;
|
||||
@@ -135,8 +129,8 @@ class LlamaGenerator final : public DataGenerator {
|
||||
uint32_t sampling_top_k_ = kDefaultSamplingTopK;
|
||||
std::mt19937 rng_;
|
||||
uint32_t n_ctx_ = kDefaultContextSize;
|
||||
std::string brewery_system_prompt_;
|
||||
std::unique_ptr<IPromptFormatter> prompt_formatter_;
|
||||
std::unique_ptr<IPromptDirectory> prompt_directory_;
|
||||
};
|
||||
|
||||
#endif // BIERGARTEN_PIPELINE_INCLUDES_DATA_GENERATION_LLAMA_GENERATOR_H_
|
||||
|
||||
@@ -44,6 +44,13 @@ class MockGenerator final : public DataGenerator {
|
||||
*/
|
||||
static size_t DeterministicHash(const Location& location);
|
||||
|
||||
// Hash stride constants for deterministic distribution across fixed-size
|
||||
// arrays. These coprime strides spread hash values uniformly without
|
||||
// clustering, ensuring diverse output across different hash inputs.
|
||||
static constexpr size_t kNounHashStride = 7;
|
||||
static constexpr size_t kDescriptionHashStride = 13;
|
||||
static constexpr size_t kBioHashStride = 11;
|
||||
|
||||
static constexpr std::array<std::string_view, 18> kBreweryAdjectives = {
|
||||
"Craft", "Heritage", "Local", "Artisan", "Pioneer", "Golden",
|
||||
"Modern", "Classic", "Summit", "Northern", "Riverstone", "Barrel",
|
||||
|
||||
@@ -6,37 +6,71 @@
|
||||
* @brief Program options for the Biergarten pipeline application.
|
||||
*/
|
||||
|
||||
#include <boost/program_options.hpp>
|
||||
#include <cstdint>
|
||||
#include <filesystem>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
|
||||
namespace prog_opts = boost::program_options;
|
||||
|
||||
/**
|
||||
* @brief Program options for the Biergarten pipeline application.
|
||||
* @brief LLM sampling parameters.
|
||||
*/
|
||||
struct ApplicationOptions {
|
||||
/// @brief Path to the LLM model file (gguf format); mutually exclusive with
|
||||
/// use_mocked.
|
||||
std::string model_path;
|
||||
|
||||
/// @brief Use mocked generator instead of LLM; mutually exclusive with
|
||||
/// model_path.
|
||||
bool use_mocked = false;
|
||||
|
||||
struct SamplingOptions {
|
||||
/// @brief LLM sampling temperature (0.0 to 1.0, higher = more random).
|
||||
float temperature = 1.0F;
|
||||
|
||||
/// @brief LLM nucleus sampling top-p parameter (0.0 to 1.0, higher = more
|
||||
/// random).
|
||||
/// @brief LLM nucleus sampling top-p parameter.
|
||||
float top_p = 0.95F;
|
||||
|
||||
/// @brief LLM top-k sampling parameter.
|
||||
uint32_t top_k = 64;
|
||||
|
||||
/// @brief Context window size (tokens) for LLM inference. Higher values
|
||||
/// support longer prompts but use more memory.
|
||||
/// @brief Context window size (tokens).
|
||||
uint32_t n_ctx = 8192;
|
||||
|
||||
/// @brief Random seed for sampling (-1 for random, otherwise non-negative).
|
||||
/// @brief Random seed (-1 for random, otherwise non-negative).
|
||||
int seed = -1;
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Configuration for the LLM generator component.
|
||||
*/
|
||||
struct GeneratorOptions {
|
||||
/// @brief Path to the LLM model file (gguf format).
|
||||
std::filesystem::path model_path;
|
||||
|
||||
/// @brief Use mocked generator instead of actual LLM inference.
|
||||
bool use_mocked = false;
|
||||
|
||||
/// @brief Specific sampling parameters for this generator.
|
||||
/// If nullopt, the application should use global defaults.
|
||||
std::optional<SamplingOptions> sampling;
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Configuration for the pipeline execution and output.
|
||||
*/
|
||||
struct PipelineOptions {
|
||||
/// @brief Directory for generated artifacts.
|
||||
std::filesystem::path output_path;
|
||||
|
||||
/// @brief Directory that contains named prompt files (e.g.
|
||||
/// BREWERY_GENERATION.md).
|
||||
std::filesystem::path prompt_dir;
|
||||
|
||||
/// @brief Path for application logs.
|
||||
std::filesystem::path log_path;
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief Root configuration object for the Biergarten pipeline.
|
||||
*/
|
||||
struct ApplicationOptions {
|
||||
GeneratorOptions generator;
|
||||
PipelineOptions pipeline;
|
||||
};
|
||||
|
||||
std::optional<ApplicationOptions> ParseArguments(const int argc, char** argv);
|
||||
#endif // BIERGARTEN_PIPELINE_INCLUDES_DATA_MODEL_APPLICATION_OPTIONS_H_
|
||||
|
||||
@@ -6,6 +6,8 @@
|
||||
* @brief Abstraction for persisting generated brewery data.
|
||||
*/
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
#include "data_model/generated_brewery.h"
|
||||
|
||||
/**
|
||||
|
||||
76
tooling/pipeline/includes/services/prompt_directory.h
Normal file
76
tooling/pipeline/includes/services/prompt_directory.h
Normal file
@@ -0,0 +1,76 @@
|
||||
#ifndef BIERGARTEN_PIPELINE_INCLUDES_SERVICES_PROMPT_DIRECTORY_H_
|
||||
#define BIERGARTEN_PIPELINE_INCLUDES_SERVICES_PROMPT_DIRECTORY_H_
|
||||
|
||||
/**
|
||||
* @file services/prompt_directory.h
|
||||
* @brief Interface and filesystem-backed implementation for named prompt
|
||||
* loading.
|
||||
*
|
||||
* Prompt files are resolved by key: a key of "BREWERY_GENERATION" maps to the
|
||||
* file <prompt_dir>/BREWERY_GENERATION.md. The interface is kept intentionally
|
||||
* narrow so test doubles can be injected without touching the filesystem.
|
||||
*/
|
||||
|
||||
#include <filesystem>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <unordered_map>
|
||||
|
||||
/**
 * @brief Interface for loading named prompt files.
 *
 * Prompts are addressed by a logical key (e.g. "BREWERY_GENERATION") rather
 * than by filesystem path, so test doubles can be injected without touching
 * the filesystem.
 *
 * Copy and move operations are deleted: instances cannot be copied or moved,
 * so owners hold them by pointer or reference (for example through a
 * std::unique_ptr<IPromptDirectory>).
 */
|
||||
class IPromptDirectory {
|
||||
public:
|
||||
IPromptDirectory() = default;
|
||||
IPromptDirectory(const IPromptDirectory&) = delete;
|
||||
IPromptDirectory& operator=(const IPromptDirectory&) = delete;
|
||||
IPromptDirectory(IPromptDirectory&&) = delete;
|
||||
IPromptDirectory& operator=(IPromptDirectory&&) = delete;
|
||||
virtual ~IPromptDirectory() = default;
|
||||
|
||||
/**
 * @brief Loads the prompt associated with @p key.
 *
 * Declared non-const, which leaves implementations free to update internal
 * state (such as a cache) while loading.
 *
 * @param key Logical prompt key, e.g. "BREWERY_GENERATION".
 * @return Prompt text, returned by value.
 * @throws std::runtime_error if the prompt file cannot be found or read.
 */
|
||||
[[nodiscard]] virtual std::string Load(std::string_view key) = 0;
|
||||
};
|
||||
|
||||
/**
 * @brief Filesystem-backed IPromptDirectory implementation.
 *
 * Each call to Load() checks an in-process cache first, then reads
 * <prompt_dir>/<key>.md from disk. The directory must exist and be readable
 * at construction time; individual file absence is reported lazily at Load().
 *
 * NOTE(review): the constructor documentation only promises a check that the
 * path exists and is a directory; whether readability is also verified at
 * construction cannot be confirmed from this header.
 */
|
||||
class PromptDirectory final : public IPromptDirectory {
|
||||
public:
|
||||
/**
 * @brief Constructs a PromptDirectory rooted at @p prompt_dir.
 *
 * @param prompt_dir Absolute or relative path to the prompt directory.
 * @throws std::runtime_error if @p prompt_dir does not exist or is not a
 * directory.
 */
|
||||
explicit PromptDirectory(const std::filesystem::path& prompt_dir);
|
||||
|
||||
/**
 * @brief Loads the prompt for @p key, caching the result.
 *
 * Maps @p key to <prompt_dir>/<key>.md.
 *
 * @param key Logical prompt key.
 * @return Prompt text.
 * @throws std::runtime_error if the file does not exist or is empty.
 *
 * NOTE(review): the IPromptDirectory contract words the failure as "cannot
 * be found or read", while this override says "does not exist or is empty";
 * confirm against the implementation which conditions actually throw.
 */
|
||||
[[nodiscard]] std::string Load(std::string_view key) override;
|
||||
|
||||
private:
|
||||
/// Directory that prompt files are resolved against (presumably the path
/// passed to the constructor -- TODO confirm in the .cpp).
std::filesystem::path prompt_dir_;
|
||||
/// In-process cache consulted by Load() before reading from disk
/// (presumably prompt key -> prompt text -- TODO confirm in the .cpp).
std::unordered_map<std::string, std::string> cache_;
|
||||
};
|
||||
|
||||
#endif // BIERGARTEN_PIPELINE_INCLUDES_SERVICES_PROMPT_DIRECTORY_H_
|
||||
@@ -7,6 +7,7 @@
|
||||
*/
|
||||
|
||||
#include <sqlite3.h>
|
||||
|
||||
#include <filesystem>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
@@ -20,12 +21,10 @@ void ThrowSqliteError(sqlite3* db_handle, std::string_view action);
|
||||
SqliteDatabaseHandle OpenDatabase(const std::filesystem::path& path);
|
||||
|
||||
void ExecSql(const SqliteDatabaseHandle& db_handle, std::string_view sql,
|
||||
const char* action);
|
||||
const char* action);
|
||||
|
||||
void RollbackTransactionNoThrow(const SqliteDatabaseHandle& db_handle) noexcept;
|
||||
|
||||
} // namespace sqlite_export_service_internal
|
||||
|
||||
#endif // BIERGARTEN_PIPELINE_INCLUDES_SERVICES_SQLITE_CONNECTION_HELPERS_H_
|
||||
|
||||
|
||||
|
||||
@@ -11,6 +11,7 @@
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
|
||||
#include "data_model/application_options.h"
|
||||
#include "services/date_time_provider.h"
|
||||
#include "services/export_service.h"
|
||||
#include "services/sqlite_export_service_helpers.h"
|
||||
@@ -20,7 +21,7 @@
|
||||
*/
|
||||
class SqliteExportService final : public IExportService {
|
||||
public:
|
||||
SqliteExportService();
|
||||
explicit SqliteExportService(const ApplicationOptions& options);
|
||||
~SqliteExportService() override;
|
||||
|
||||
SqliteExportService(const SqliteExportService&) = delete;
|
||||
@@ -41,12 +42,12 @@ class SqliteExportService final : public IExportService {
|
||||
void InitializeSchema() const;
|
||||
void PrepareStatements();
|
||||
void RollbackAndCloseNoThrow() noexcept;
|
||||
void FinalizeStatements() noexcept;
|
||||
|
||||
[[nodiscard]] std::filesystem::path BuildDatabasePath() const;
|
||||
[[nodiscard]] static std::string BuildLocationKey(const Location& location);
|
||||
|
||||
std::unique_ptr<IDateTimeProvider> date_time_provider_;
|
||||
std::filesystem::path output_path_;
|
||||
std::string run_timestamp_utc_;
|
||||
std::filesystem::path database_path_;
|
||||
SqliteDatabaseHandle db_handle_;
|
||||
|
||||
@@ -3,8 +3,8 @@
|
||||
|
||||
/* Umbrella header for backward compatibility. */
|
||||
|
||||
#include "services/sqlite_handle_types.h"
|
||||
#include "services/sqlite_connection_helpers.h"
|
||||
#include "services/sqlite_handle_types.h"
|
||||
#include "services/sqlite_statement_helpers.h"
|
||||
|
||||
#endif // BIERGARTEN_PIPELINE_INCLUDES_SERVICES_SQLITE_EXPORT_SERVICE_HELPERS_H_
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
*/
|
||||
|
||||
#include <sqlite3.h>
|
||||
|
||||
#include <memory>
|
||||
#include <string_view>
|
||||
|
||||
@@ -33,4 +34,3 @@ struct BindParam {
|
||||
} // namespace sqlite_export_service_internal
|
||||
|
||||
#endif // BIERGARTEN_PIPELINE_INCLUDES_SERVICES_SQLITE_HANDLE_TYPES_H_
|
||||
|
||||
|
||||
@@ -3,10 +3,12 @@
|
||||
|
||||
/**
|
||||
* @file services/sqlite_statement_helpers.h
|
||||
* @brief Declarations for statement-level SQLite helper functions and constants.
|
||||
* @brief Declarations for statement-level SQLite helper functions and
|
||||
* constants.
|
||||
*/
|
||||
|
||||
#include <sqlite3.h>
|
||||
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <vector>
|
||||
@@ -107,10 +109,8 @@ void StepStatement(const SqliteDatabaseHandle& db_handle,
|
||||
|
||||
sqlite3_int64 LastInsertRowId(const SqliteDatabaseHandle& db_handle);
|
||||
|
||||
std::string SerializeLocalLanguages(const std::vector<std::string>& local_languages);
|
||||
std::string SerializeVector(const std::vector<std::string>& str_vec);
|
||||
|
||||
} // namespace sqlite_export_service_internal
|
||||
|
||||
#endif // BIERGARTEN_PIPELINE_INCLUDES_SERVICES_SQLITE_STATEMENT_HELPERS_H_
|
||||
|
||||
|
||||
35
tooling/pipeline/includes/services/timer.h
Normal file
35
tooling/pipeline/includes/services/timer.h
Normal file
@@ -0,0 +1,35 @@
|
||||
#ifndef BIERGARTEN_PIPELINE_INCLUDES_SERVICES_TIMER_H_
|
||||
#define BIERGARTEN_PIPELINE_INCLUDES_SERVICES_TIMER_H_
|
||||
|
||||
#include <chrono>
#include <cstdint>
|
||||
|
||||
/**
|
||||
* @file services/timer.h
|
||||
* @brief Simple timer utility for measuring elapsed time.
|
||||
*/
|
||||
/**
 * @brief Stopwatch that reports elapsed time in whole milliseconds.
 *
 * Measurement starts when the object is constructed and uses
 * std::chrono::steady_clock, so readings never go backwards even if the
 * system wall clock is adjusted. Instances are neither copyable nor movable.
 */
class Timer {
 public:
  Timer() = default;
  ~Timer() = default;

  Timer(const Timer&) = delete;
  Timer& operator=(const Timer&) = delete;
  Timer(Timer&&) = delete;
  Timer& operator=(Timer&&) = delete;

  /**
   * @brief Returns the time elapsed since construction or the last Reset().
   *
   * @return Elapsed time in milliseconds, truncated toward zero.
   */
  [[nodiscard]] int64_t Elapsed() const {
    return ToMilliseconds(Clock::now() - start_time_);
  }

  /**
   * @brief Restarts the timer and reports the interval that just ended.
   *
   * The clock is sampled exactly once: the same instant closes the previous
   * interval and opens the next one, so no time is lost between the two.
   *
   * @return Milliseconds elapsed before the reset, truncated toward zero.
   */
  [[nodiscard]] int64_t Reset() {
    const Clock::time_point now = Clock::now();
    const int64_t previous_elapsed = ToMilliseconds(now - start_time_);
    start_time_ = now;
    return previous_elapsed;
  }

 private:
  using Clock = std::chrono::steady_clock;

  /// Converts a clock duration to a whole-millisecond count.
  static int64_t ToMilliseconds(const Clock::duration elapsed) {
    return std::chrono::duration_cast<std::chrono::milliseconds>(elapsed)
        .count();
  }

  /// Start of the interval currently being measured.
  Clock::time_point start_time_ = Clock::now();
};
|
||||
|
||||
#endif // BIERGARTEN_PIPELINE_INCLUDES_SERVICES_TIMER_H_
|
||||
Reference in New Issue
Block a user