mirror of
https://github.com/aaronpo97/the-biergarten-app.git
synced 2026-06-01 01:54:00 +00:00
Move pipeline directory
This commit is contained in:
41
tooling/pipeline/includes/data_generation/data_generator.h
Normal file
41
tooling/pipeline/includes/data_generation/data_generator.h
Normal file
@@ -0,0 +1,41 @@
|
||||
#ifndef BIERGARTEN_PIPELINE_INCLUDES_DATA_GENERATION_DATA_GENERATOR_H_
|
||||
#define BIERGARTEN_PIPELINE_INCLUDES_DATA_GENERATION_DATA_GENERATOR_H_
|
||||
|
||||
/**
|
||||
* @file data_generation/data_generator.h
|
||||
* @brief Shared generator interfaces and result models.
|
||||
*/
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "data_model/brewery_result.h"
|
||||
#include "data_model/location.h"
|
||||
#include "data_model/user_result.h"
|
||||
|
||||
/**
|
||||
* @brief Interface for data generator implementations.
|
||||
*/
|
||||
class DataGenerator {
|
||||
public:
|
||||
virtual ~DataGenerator() = default;
|
||||
|
||||
/**
|
||||
* @brief Generates brewery data for a location.
|
||||
*
|
||||
* @param location Location data
|
||||
* @param region_context Additional regional context text.
|
||||
* @return Brewery generation result.
|
||||
*/
|
||||
virtual BreweryResult GenerateBrewery(const Location& location,
|
||||
const std::string& region_context) = 0;
|
||||
|
||||
/**
|
||||
* @brief Generates a user profile for a locale.
|
||||
*
|
||||
* @param locale Locale hint used by generator.
|
||||
* @return User generation result.
|
||||
*/
|
||||
virtual UserResult GenerateUser(const std::string& locale) = 0;
|
||||
};
|
||||
|
||||
#endif // BIERGARTEN_PIPELINE_INCLUDES_DATA_GENERATION_DATA_GENERATOR_H_
|
||||
142
tooling/pipeline/includes/data_generation/llama_generator.h
Normal file
142
tooling/pipeline/includes/data_generation/llama_generator.h
Normal file
@@ -0,0 +1,142 @@
|
||||
#ifndef BIERGARTEN_PIPELINE_INCLUDES_DATA_GENERATION_LLAMA_GENERATOR_H_
|
||||
#define BIERGARTEN_PIPELINE_INCLUDES_DATA_GENERATION_LLAMA_GENERATOR_H_
|
||||
|
||||
#include <filesystem>
|
||||
|
||||
/**
|
||||
* @file data_generation/llama_generator.h
|
||||
* @brief llama.cpp-backed implementation of DataGenerator.
|
||||
*/
|
||||
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
#include <random>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
|
||||
#include "data_generation/data_generator.h"
|
||||
#include "data_generation/prompt_formatting/prompt_formatter.h"
|
||||
#include "data_model/application_options.h"
|
||||
|
||||
struct llama_model;
|
||||
struct llama_context;
|
||||
|
||||
/**
|
||||
* @brief Data generator implementation backed by llama.cpp.
|
||||
*/
|
||||
class LlamaGenerator final : public DataGenerator {
|
||||
public:
|
||||
/**
|
||||
* @brief Constructs a generator using parsed application options and loads
|
||||
* the configured model immediately.
|
||||
*
|
||||
* @param options Parsed application options.
|
||||
* @param model_path Filesystem path to GGUF model assets.
|
||||
* @param prompt_formatter Formatter that produces model-specific prompts.
|
||||
*/
|
||||
LlamaGenerator(const ApplicationOptions& options,
|
||||
const std::string& model_path,
|
||||
std::unique_ptr<IPromptFormatter> prompt_formatter);
|
||||
|
||||
~LlamaGenerator() override;
|
||||
|
||||
// disable copy constructor
|
||||
LlamaGenerator(const LlamaGenerator&) = delete;
|
||||
|
||||
// disable copy assignment operator
|
||||
LlamaGenerator& operator=(const LlamaGenerator&) = delete;
|
||||
// disable move constructor
|
||||
LlamaGenerator(LlamaGenerator&&) = delete;
|
||||
// disable move assignment operator
|
||||
LlamaGenerator& operator=(LlamaGenerator&&) = delete;
|
||||
|
||||
/**
|
||||
* @brief Generates brewery data for a specific location.
|
||||
*
|
||||
* @param location Location object.
|
||||
* @param region_context Additional regional context.
|
||||
* @return Generated brewery result.
|
||||
*/
|
||||
BreweryResult GenerateBrewery(const Location& location,
|
||||
const std::string& region_context) override;
|
||||
|
||||
/**
|
||||
* @brief Generates a user profile for the provided locale.
|
||||
*
|
||||
* @param locale Locale hint.
|
||||
* @return Generated user profile.
|
||||
*/
|
||||
UserResult GenerateUser(const std::string& locale) override;
|
||||
|
||||
private:
|
||||
static constexpr int32_t kDefaultMaxTokens = 10000;
|
||||
static constexpr float kDefaultSamplingTopP = 0.95F;
|
||||
static constexpr uint32_t kDefaultSamplingTopK = 64;
|
||||
static constexpr uint32_t kDefaultContextSize = 8192;
|
||||
|
||||
struct ModelDeleter {
|
||||
void operator()(llama_model* model) const noexcept;
|
||||
};
|
||||
struct ContextDeleter {
|
||||
void operator()(llama_context* context) const noexcept;
|
||||
};
|
||||
|
||||
using ModelHandle = std::unique_ptr<llama_model, ModelDeleter>;
|
||||
using ContextHandle = std::unique_ptr<llama_context, ContextDeleter>;
|
||||
|
||||
/**
|
||||
* @brief Loads model and prepares inference context.
|
||||
*
|
||||
* @param model_path Filesystem path to GGUF model.
|
||||
*/
|
||||
void Load(const std::string& model_path);
|
||||
|
||||
/**
|
||||
* @brief Infers text from separate system and user prompts.
|
||||
*
|
||||
* This helps chat-capable models preserve system-role behavior instead of
|
||||
* concatenating system text into user input.
|
||||
*
|
||||
* @param system_prompt System role prompt.
|
||||
* @param prompt User prompt.
|
||||
* @param max_tokens Maximum tokens to generate.
|
||||
* @param grammar Optional GBNF grammar constraining generated output.
|
||||
* @return Generated text.
|
||||
*/
|
||||
std::string Infer(const std::string& system_prompt, const std::string& prompt,
|
||||
int max_tokens = kDefaultMaxTokens,
|
||||
std::string_view grammar = {});
|
||||
|
||||
/**
|
||||
* @brief Runs inference on an already-formatted prompt.
|
||||
*
|
||||
* @param formatted_prompt Prompt preformatted for model chat template.
|
||||
* @param max_tokens Maximum tokens to generate.
|
||||
* @param grammar Optional GBNF grammar constraining generated output.
|
||||
* @return Generated text.
|
||||
*/
|
||||
std::string InferFormatted(const std::string& formatted_prompt,
|
||||
int max_tokens = kDefaultMaxTokens,
|
||||
std::string_view grammar = {});
|
||||
|
||||
/**
|
||||
* @brief Loads the brewery system prompt from disk.
|
||||
*
|
||||
* @param prompt_file_path Prompt file path to try first.
|
||||
* @return Loaded prompt text.
|
||||
*/
|
||||
std::string LoadBrewerySystemPrompt(
|
||||
const std::filesystem::path& prompt_file_path);
|
||||
|
||||
ModelHandle model_;
|
||||
ContextHandle context_;
|
||||
float sampling_temperature_ = 1.0F;
|
||||
float sampling_top_p_ = kDefaultSamplingTopP;
|
||||
uint32_t sampling_top_k_ = kDefaultSamplingTopK;
|
||||
std::mt19937 rng_;
|
||||
uint32_t n_ctx_ = kDefaultContextSize;
|
||||
std::string brewery_system_prompt_;
|
||||
std::unique_ptr<IPromptFormatter> prompt_formatter_;
|
||||
};
|
||||
|
||||
#endif // BIERGARTEN_PIPELINE_INCLUDES_DATA_GENERATION_LLAMA_GENERATOR_H_
|
||||
@@ -0,0 +1,50 @@
|
||||
#ifndef BIERGARTEN_PIPELINE_INCLUDES_DATA_GENERATION_LLAMA_GENERATOR_HELPERS_H_
|
||||
#define BIERGARTEN_PIPELINE_INCLUDES_DATA_GENERATION_LLAMA_GENERATOR_HELPERS_H_
|
||||
|
||||
/**
|
||||
* @file data_generation/llama_generator_helpers.h
|
||||
* @brief Shared helper APIs used by LlamaGenerator translation units.
|
||||
*/
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
|
||||
#include "data_model/brewery_result.h"
|
||||
|
||||
struct llama_vocab;
|
||||
using llama_token = int32_t;
|
||||
|
||||
/**
|
||||
* @brief Normalizes and truncates regional context.
|
||||
*
|
||||
* @param region_context Input regional context text.
|
||||
* @param max_chars Maximum output length.
|
||||
* @return Processed region context.
|
||||
*/
|
||||
std::string PrepareRegionContext(std::string_view region_context,
|
||||
size_t max_chars = 2000);
|
||||
|
||||
/**
|
||||
* @brief Decodes a sampled token and appends it to output text.
|
||||
*
|
||||
* @param vocab Model vocabulary.
|
||||
* @param token Sampled token id.
|
||||
* @param output Output text buffer.
|
||||
*/
|
||||
void AppendTokenPiece(const llama_vocab* vocab, llama_token token,
|
||||
std::string& output);
|
||||
|
||||
/**
|
||||
* @brief Validates and parses brewery JSON output.
|
||||
*
|
||||
* @param raw Raw model output.
|
||||
* @param brewery_out Parsed brewery payload.
|
||||
* @return Validation error message if invalid, or std::nullopt on success.
|
||||
*/
|
||||
std::optional<std::string> ValidateBreweryJson(const std::string& raw,
|
||||
BreweryResult& brewery_out);
|
||||
|
||||
#endif // BIERGARTEN_PIPELINE_INCLUDES_DATA_GENERATION_LLAMA_GENERATOR_HELPERS_H_
|
||||
123
tooling/pipeline/includes/data_generation/mock_generator.h
Normal file
123
tooling/pipeline/includes/data_generation/mock_generator.h
Normal file
@@ -0,0 +1,123 @@
|
||||
#ifndef BIERGARTEN_PIPELINE_INCLUDES_DATA_GENERATION_MOCK_GENERATOR_H_
|
||||
#define BIERGARTEN_PIPELINE_INCLUDES_DATA_GENERATION_MOCK_GENERATOR_H_
|
||||
|
||||
/**
|
||||
* @file data_generation/mock_generator.h
|
||||
* @brief Deterministic mock implementation of DataGenerator.
|
||||
*/
|
||||
|
||||
#include <array>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
|
||||
#include "data_generation/data_generator.h"
|
||||
|
||||
/**
|
||||
* @brief Mock generator used for deterministic, model-free outputs.
|
||||
*/
|
||||
class MockGenerator final : public DataGenerator {
|
||||
public:
|
||||
/**
|
||||
* @brief Generates deterministic brewery data for a location.
|
||||
*
|
||||
* @param location City and country names.
|
||||
* @param region_context Unused for mock generation.
|
||||
* @return Generated brewery result.
|
||||
*/
|
||||
BreweryResult GenerateBrewery(const Location& location,
|
||||
const std::string& region_context) override;
|
||||
|
||||
/**
|
||||
* @brief Generates deterministic user data for a locale.
|
||||
*
|
||||
* @param locale Locale hint.
|
||||
* @return Generated user result.
|
||||
*/
|
||||
UserResult GenerateUser(const std::string& locale) override;
|
||||
|
||||
private:
|
||||
/**
|
||||
* @brief Combines two strings into a stable hash value.
|
||||
*
|
||||
* @param location City and country names.
|
||||
* @return Deterministic hash value.
|
||||
*/
|
||||
static size_t DeterministicHash(const Location& location);
|
||||
|
||||
static constexpr std::array<std::string_view, 18> kBreweryAdjectives = {
|
||||
"Craft", "Heritage", "Local", "Artisan", "Pioneer", "Golden",
|
||||
"Modern", "Classic", "Summit", "Northern", "Riverstone", "Barrel",
|
||||
"Hinterland", "Harbor", "Wild", "Granite", "Copper", "Maple"};
|
||||
|
||||
static constexpr std::array<std::string_view, 18> kBreweryNouns = {
|
||||
"Brewing Co.", "Brewery", "Bier Haus", "Taproom", "Works",
|
||||
"House", "Fermentery", "Ale Co.", "Cellars", "Collective",
|
||||
"Project", "Foundry", "Malthouse", "Public House", "Co-op",
|
||||
"Lab", "Beer Hall", "Guild"};
|
||||
|
||||
static constexpr std::array<std::string_view, 18> kBreweryDescriptions = {
|
||||
"Handcrafted pale ales and seasonal IPAs with local ingredients.",
|
||||
"Traditional lagers and experimental sours in small batches.",
|
||||
"Award-winning stouts and wildly hoppy blonde ales.",
|
||||
"Craft brewery specializing in Belgian-style triples and dark "
|
||||
"porters.",
|
||||
"Modern brewery blending tradition with bold experimental flavors.",
|
||||
"Neighborhood-focused taproom pouring crisp pilsners and citrusy "
|
||||
"pale "
|
||||
"ales.",
|
||||
"Small-batch brewery known for barrel-aged releases and smoky "
|
||||
"lagers.",
|
||||
"Independent brewhouse pairing farmhouse ales with rotating food "
|
||||
"pop-ups.",
|
||||
"Community brewpub making balanced bitters, saisons, and hazy IPAs.",
|
||||
"Experimental nanobrewery exploring local yeast and regional "
|
||||
"grains.",
|
||||
"Family-run brewery producing smooth amber ales and robust porters.",
|
||||
"Urban brewery crafting clean lagers and bright, fruit-forward "
|
||||
"sours.",
|
||||
"Riverfront brewhouse featuring oak-matured ales and seasonal "
|
||||
"blends.",
|
||||
"Modern taproom focused on sessionable lagers and classic pub "
|
||||
"styles.",
|
||||
"Brewery rooted in tradition with a lineup of malty reds and crisp "
|
||||
"lagers.",
|
||||
"Creative brewery offering rotating collaborations and limited "
|
||||
"draft-only "
|
||||
"pours.",
|
||||
"Locally inspired brewery serving approachable ales with bold hop "
|
||||
"character.",
|
||||
"Destination taproom known for balanced IPAs and cocoa-rich "
|
||||
"stouts."};
|
||||
|
||||
static constexpr std::array<std::string_view, 18> kUsernames = {
|
||||
"hopseeker", "malttrail", "yeastwhisper", "lagerlane",
|
||||
"barrelbound", "foamfinder", "taphunter", "graingeist",
|
||||
"brewscout", "aleatlas", "caskcompass", "hopsandmaps",
|
||||
"mashpilot", "pintnomad", "fermentfriend", "stoutsignal",
|
||||
"sessionwander", "kettlekeeper"};
|
||||
|
||||
static constexpr std::array<std::string_view, 18> kBios = {
|
||||
"Always chasing balanced IPAs and crisp lagers across local taprooms.",
|
||||
"Weekend brewery explorer with a soft spot for dark, roasty stouts.",
|
||||
"Documenting tiny brewpubs, fresh pours, and unforgettable beer "
|
||||
"gardens.",
|
||||
"Fan of farmhouse ales, food pairings, and long tasting flights.",
|
||||
"Collecting favorite pilsners one city at a time.",
|
||||
"Hops-first drinker who still saves room for classic malt-forward "
|
||||
"styles.",
|
||||
"Finding hidden tap lists and sharing the best seasonal releases.",
|
||||
"Brewery road-tripper focused on local ingredients and clean "
|
||||
"fermentation.",
|
||||
"Always comparing house lagers and ranking patio pint vibes.",
|
||||
"Curious about yeast strains, barrel programs, and cellar experiments.",
|
||||
"Believes every neighborhood deserves a great community taproom.",
|
||||
"Looking for session beers that taste great from first sip to last.",
|
||||
"Belgian ale enthusiast who never skips a new saison.",
|
||||
"Hazy IPA critic with deep respect for a perfectly clear pilsner.",
|
||||
"Visits breweries for the stories, stays for the flagship pours.",
|
||||
"Craft beer fan mapping tasting notes and favorite brew routes.",
|
||||
"Always ready to trade recommendations for underrated local breweries.",
|
||||
"Keeping a running list of must-try collab releases and tap takeovers."};
|
||||
};
|
||||
|
||||
#endif // BIERGARTEN_PIPELINE_INCLUDES_DATA_GENERATION_MOCK_GENERATOR_H_
|
||||
@@ -0,0 +1,15 @@
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
|
||||
#include "data_generation/prompt_formatting/prompt_formatter.h"
|
||||
|
||||
class Gemma4JinjaPromptFormatter final : public IPromptFormatter {
|
||||
public:
|
||||
Gemma4JinjaPromptFormatter() = default;
|
||||
~Gemma4JinjaPromptFormatter() override = default;
|
||||
|
||||
[[nodiscard]] std::string Format(std::string_view system_prompt,
|
||||
std::string_view user_prompt) const override;
|
||||
};
|
||||
@@ -0,0 +1,17 @@
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
|
||||
class IPromptFormatter {
|
||||
public:
|
||||
IPromptFormatter() = default;
|
||||
IPromptFormatter(const IPromptFormatter&) = delete;
|
||||
IPromptFormatter& operator=(const IPromptFormatter&) = delete;
|
||||
IPromptFormatter(IPromptFormatter&&) = delete;
|
||||
IPromptFormatter& operator=(IPromptFormatter&&) = delete;
|
||||
virtual ~IPromptFormatter() = default;
|
||||
|
||||
[[nodiscard]] virtual std::string Format(
|
||||
std::string_view system_prompt, std::string_view user_prompt) const = 0;
|
||||
};
|
||||
Reference in New Issue
Block a user