Create biergarten brewery pipeline project (#199)

This commit is contained in:
Aaron Po
2026-04-18 19:19:14 -04:00
committed by GitHub
parent fd3c172e35
commit 898cc8971b
59 changed files with 5638 additions and 0 deletions

View File

@@ -0,0 +1,41 @@
#ifndef BIERGARTEN_PIPELINE_INCLUDES_DATA_GENERATION_DATA_GENERATOR_H_
#define BIERGARTEN_PIPELINE_INCLUDES_DATA_GENERATION_DATA_GENERATOR_H_
/**
* @file data_generation/data_generator.h
* @brief Shared generator interfaces and result models.
*/
#include <string>
#include "data_model/brewery_result.h"
#include "data_model/location.h"
#include "data_model/user_result.h"
/**
* @brief Interface for data generator implementations.
*/
class DataGenerator {
public:
virtual ~DataGenerator() = default;
/**
* @brief Generates brewery data for a location.
*
* @param location Location data
* @param region_context Additional regional context text.
* @return Brewery generation result.
*/
virtual BreweryResult GenerateBrewery(const Location& location,
const std::string& region_context) = 0;
/**
* @brief Generates a user profile for a locale.
*
* @param locale Locale hint used by generator.
* @return User generation result.
*/
virtual UserResult GenerateUser(const std::string& locale) = 0;
};
#endif // BIERGARTEN_PIPELINE_INCLUDES_DATA_GENERATION_DATA_GENERATOR_H_

View File

@@ -0,0 +1,141 @@
#ifndef BIERGARTEN_PIPELINE_INCLUDES_DATA_GENERATION_LLAMA_GENERATOR_H_
#define BIERGARTEN_PIPELINE_INCLUDES_DATA_GENERATION_LLAMA_GENERATOR_H_
#include <filesystem>
/**
* @file data_generation/llama_generator.h
* @brief llama.cpp-backed implementation of DataGenerator.
*/
#include <cstdint>
#include <memory>
#include <random>
#include <string>
#include <string_view>
#include "data_generation/data_generator.h"
#include "data_generation/prompt_formatting/prompt_formatter.h"
#include "data_model/application_options.h"
struct llama_model;
struct llama_context;
/**
* @brief Data generator implementation backed by llama.cpp.
*/
class LlamaGenerator final : public DataGenerator {
public:
/**
* @brief Constructs a generator using parsed application options and loads
* the configured model immediately.
*
* @param options Parsed application options.
* @param model_path Filesystem path to GGUF model assets.
* @param prompt_formatter Formatter that produces model-specific prompts.
*/
LlamaGenerator(const ApplicationOptions& options,
const std::string& model_path,
std::unique_ptr<IPromptFormatter> prompt_formatter);
~LlamaGenerator() override;
// disable copy constructor
LlamaGenerator(const LlamaGenerator&) = delete;
// disable copy assignment operator
LlamaGenerator& operator=(const LlamaGenerator&) = delete;
// disable move constructor
LlamaGenerator(LlamaGenerator&&) = delete;
// disable move assignment operator
LlamaGenerator& operator=(LlamaGenerator&&) = delete;
/**
* @brief Generates brewery data for a specific location.
*
* @param location Location object.
* @param region_context Additional regional context.
* @return Generated brewery result.
*/
BreweryResult GenerateBrewery(const Location& location,
const std::string& region_context) override;
/**
* @brief Generates a user profile for the provided locale.
*
* @param locale Locale hint.
* @return Generated user profile.
*/
UserResult GenerateUser(const std::string& locale) override;
private:
static constexpr int32_t kDefaultMaxTokens = 10000;
static constexpr float kDefaultSamplingTopP = 0.95F;
static constexpr uint32_t kDefaultSamplingTopK = 64;
static constexpr uint32_t kDefaultContextSize = 8192;
struct ModelDeleter {
void operator()(llama_model* model) const noexcept;
};
struct ContextDeleter {
void operator()(llama_context* context) const noexcept;
};
using ModelHandle = std::unique_ptr<llama_model, ModelDeleter>;
using ContextHandle = std::unique_ptr<llama_context, ContextDeleter>;
/**
* @brief Loads model and prepares inference context.
*
* @param model_path Filesystem path to GGUF model.
*/
void Load(const std::string& model_path);
/**
* @brief Infers text from separate system and user prompts.
*
* This helps chat-capable models preserve system-role behavior instead of
* concatenating system text into user input.
*
* @param system_prompt System role prompt.
* @param prompt User prompt.
* @param max_tokens Maximum tokens to generate.
* @param grammar Optional GBNF grammar constraining generated output.
* @return Generated text.
*/
std::string Infer(const std::string& system_prompt, const std::string& prompt,
int max_tokens = kDefaultMaxTokens,
std::string_view grammar = {});
/**
* @brief Runs inference on an already-formatted prompt.
*
* @param formatted_prompt Prompt preformatted for model chat template.
* @param max_tokens Maximum tokens to generate.
* @param grammar Optional GBNF grammar constraining generated output.
* @return Generated text.
*/
std::string InferFormatted(const std::string& formatted_prompt,
int max_tokens = kDefaultMaxTokens,
std::string_view grammar = {});
/**
* @brief Loads the brewery system prompt from disk.
*
* @param prompt_file_path Prompt file path to try first.
* @return Loaded prompt text.
*/
std::string LoadBrewerySystemPrompt(const std::filesystem::path& prompt_file_path);
ModelHandle model_;
ContextHandle context_;
float sampling_temperature_ = 1.0F;
float sampling_top_p_ = kDefaultSamplingTopP;
uint32_t sampling_top_k_ = kDefaultSamplingTopK;
std::mt19937 rng_;
uint32_t n_ctx_ = kDefaultContextSize;
std::string brewery_system_prompt_;
std::unique_ptr<IPromptFormatter> prompt_formatter_;
};
#endif // BIERGARTEN_PIPELINE_INCLUDES_DATA_GENERATION_LLAMA_GENERATOR_H_

View File

@@ -0,0 +1,50 @@
#ifndef BIERGARTEN_PIPELINE_INCLUDES_DATA_GENERATION_LLAMA_GENERATOR_HELPERS_H_
#define BIERGARTEN_PIPELINE_INCLUDES_DATA_GENERATION_LLAMA_GENERATOR_HELPERS_H_
/**
* @file data_generation/llama_generator_helpers.h
* @brief Shared helper APIs used by LlamaGenerator translation units.
*/
#include <cstddef>
#include <cstdint>
#include <optional>
#include <string>
#include <string_view>
#include "data_model/brewery_result.h"
struct llama_vocab;
using llama_token = int32_t;
/**
* @brief Normalizes and truncates regional context.
*
* @param region_context Input regional context text.
* @param max_chars Maximum output length.
* @return Processed region context.
*/
std::string PrepareRegionContext(std::string_view region_context,
size_t max_chars = 2000);
/**
* @brief Decodes a sampled token and appends it to output text.
*
* @param vocab Model vocabulary.
* @param token Sampled token id.
* @param output Output text buffer.
*/
void AppendTokenPiece(const llama_vocab* vocab, llama_token token,
std::string& output);
/**
* @brief Validates and parses brewery JSON output.
*
* @param raw Raw model output.
* @param brewery_out Parsed brewery payload.
* @return Validation error message if invalid, or std::nullopt on success.
*/
std::optional<std::string> ValidateBreweryJson(const std::string& raw,
BreweryResult& brewery_out);
#endif // BIERGARTEN_PIPELINE_INCLUDES_DATA_GENERATION_LLAMA_GENERATOR_HELPERS_H_

View File

@@ -0,0 +1,123 @@
#ifndef BIERGARTEN_PIPELINE_INCLUDES_DATA_GENERATION_MOCK_GENERATOR_H_
#define BIERGARTEN_PIPELINE_INCLUDES_DATA_GENERATION_MOCK_GENERATOR_H_
/**
* @file data_generation/mock_generator.h
* @brief Deterministic mock implementation of DataGenerator.
*/
#include <array>
#include <string>
#include <string_view>
#include "data_generation/data_generator.h"
/**
* @brief Mock generator used for deterministic, model-free outputs.
*/
class MockGenerator final : public DataGenerator {
public:
/**
* @brief Generates deterministic brewery data for a location.
*
* @param location City and country names.
* @param region_context Unused for mock generation.
* @return Generated brewery result.
*/
BreweryResult GenerateBrewery(const Location& location,
const std::string& region_context) override;
/**
* @brief Generates deterministic user data for a locale.
*
* @param locale Locale hint.
* @return Generated user result.
*/
UserResult GenerateUser(const std::string& locale) override;
private:
/**
* @brief Combines two strings into a stable hash value.
*
* @param location City and country names.
* @return Deterministic hash value.
*/
static size_t DeterministicHash(const Location& location);
static constexpr std::array<std::string_view, 18> kBreweryAdjectives = {
"Craft", "Heritage", "Local", "Artisan", "Pioneer", "Golden",
"Modern", "Classic", "Summit", "Northern", "Riverstone", "Barrel",
"Hinterland", "Harbor", "Wild", "Granite", "Copper", "Maple"};
static constexpr std::array<std::string_view, 18> kBreweryNouns = {
"Brewing Co.", "Brewery", "Bier Haus", "Taproom", "Works",
"House", "Fermentery", "Ale Co.", "Cellars", "Collective",
"Project", "Foundry", "Malthouse", "Public House", "Co-op",
"Lab", "Beer Hall", "Guild"};
static constexpr std::array<std::string_view, 18> kBreweryDescriptions = {
"Handcrafted pale ales and seasonal IPAs with local ingredients.",
"Traditional lagers and experimental sours in small batches.",
"Award-winning stouts and wildly hoppy blonde ales.",
"Craft brewery specializing in Belgian-style triples and dark "
"porters.",
"Modern brewery blending tradition with bold experimental flavors.",
"Neighborhood-focused taproom pouring crisp pilsners and citrusy "
"pale "
"ales.",
"Small-batch brewery known for barrel-aged releases and smoky "
"lagers.",
"Independent brewhouse pairing farmhouse ales with rotating food "
"pop-ups.",
"Community brewpub making balanced bitters, saisons, and hazy IPAs.",
"Experimental nanobrewery exploring local yeast and regional "
"grains.",
"Family-run brewery producing smooth amber ales and robust porters.",
"Urban brewery crafting clean lagers and bright, fruit-forward "
"sours.",
"Riverfront brewhouse featuring oak-matured ales and seasonal "
"blends.",
"Modern taproom focused on sessionable lagers and classic pub "
"styles.",
"Brewery rooted in tradition with a lineup of malty reds and crisp "
"lagers.",
"Creative brewery offering rotating collaborations and limited "
"draft-only "
"pours.",
"Locally inspired brewery serving approachable ales with bold hop "
"character.",
"Destination taproom known for balanced IPAs and cocoa-rich "
"stouts."};
static constexpr std::array<std::string_view, 18> kUsernames = {
"hopseeker", "malttrail", "yeastwhisper", "lagerlane",
"barrelbound", "foamfinder", "taphunter", "graingeist",
"brewscout", "aleatlas", "caskcompass", "hopsandmaps",
"mashpilot", "pintnomad", "fermentfriend", "stoutsignal",
"sessionwander", "kettlekeeper"};
static constexpr std::array<std::string_view, 18> kBios = {
"Always chasing balanced IPAs and crisp lagers across local taprooms.",
"Weekend brewery explorer with a soft spot for dark, roasty stouts.",
"Documenting tiny brewpubs, fresh pours, and unforgettable beer "
"gardens.",
"Fan of farmhouse ales, food pairings, and long tasting flights.",
"Collecting favorite pilsners one city at a time.",
"Hops-first drinker who still saves room for classic malt-forward "
"styles.",
"Finding hidden tap lists and sharing the best seasonal releases.",
"Brewery road-tripper focused on local ingredients and clean "
"fermentation.",
"Always comparing house lagers and ranking patio pint vibes.",
"Curious about yeast strains, barrel programs, and cellar experiments.",
"Believes every neighborhood deserves a great community taproom.",
"Looking for session beers that taste great from first sip to last.",
"Belgian ale enthusiast who never skips a new saison.",
"Hazy IPA critic with deep respect for a perfectly clear pilsner.",
"Visits breweries for the stories, stays for the flagship pours.",
"Craft beer fan mapping tasting notes and favorite brew routes.",
"Always ready to trade recommendations for underrated local breweries.",
"Keeping a running list of must-try collab releases and tap takeovers."};
};
#endif // BIERGARTEN_PIPELINE_INCLUDES_DATA_GENERATION_MOCK_GENERATOR_H_

View File

@@ -0,0 +1,15 @@
#pragma once
#include <string>
#include <string_view>
#include "data_generation/prompt_formatting/prompt_formatter.h"
class Gemma4JinjaPromptFormatter final : public IPromptFormatter {
public:
Gemma4JinjaPromptFormatter() = default;
~Gemma4JinjaPromptFormatter() override = default;
[[nodiscard]] std::string Format(std::string_view system_prompt,
std::string_view user_prompt) const override;
};

View File

@@ -0,0 +1,18 @@
#pragma once
#include <string>
#include <string_view>
class IPromptFormatter {
public:
IPromptFormatter() = default;
IPromptFormatter(const IPromptFormatter&) = delete;
IPromptFormatter& operator=(const IPromptFormatter&) = delete;
IPromptFormatter(IPromptFormatter&&) = delete;
IPromptFormatter& operator=(IPromptFormatter&&) = delete;
virtual ~IPromptFormatter() = default;
[[nodiscard]] virtual std::string Format(
std::string_view system_prompt,
std::string_view user_prompt) const = 0;
};