mirror of
https://github.com/aaronpo97/the-biergarten-app.git
synced 2026-06-01 01:54:00 +00:00
Implement Llama-based brewery and user data generation; remove mock generator and related files
This commit is contained in:
@@ -1,21 +0,0 @@
|
||||
#include "generator.h"
|
||||
#include <functional>
|
||||
#include <spdlog/spdlog.h>
|
||||
|
||||
// Mock stand-in for model loading. Nothing is read from `modelPath`; the path
// is only echoed to the log so runs can be traced.
void LlamaBreweryGenerator::LoadModel(const std::string &modelPath) {
  // First line reports which path was requested.
  spdlog::info(" [Mock] Initialized llama model: {}", modelPath);

  // Second line reports readiness unconditionally; this mock has no failure
  // path.
  spdlog::info(" OK: Model ready");
}
|
||||
|
||||
// Produces a mock brewery for `cityName`.
//
// The city name concatenated with the decimal `seed` is hashed once, and that
// single hash selects an adjective, a noun and a description from the
// generator's word tables, so the same (cityName, seed) pair yields the same
// brewery within a run. The divisors 7 and 13 make the three table indices
// differ from one another for a given hash.
LlamaBreweryGenerator::Brewery
LlamaBreweryGenerator::GenerateBrewery(const std::string &cityName, int seed) {
  const std::string hashInput = cityName + std::to_string(seed);
  const size_t nameHash = std::hash<std::string>{}(hashInput);

  const auto &adjective =
      breweryAdjectives[nameHash % breweryAdjectives.size()];
  const auto &noun = breweryNouns[(nameHash / 7) % breweryNouns.size()];
  const auto &blurb = descriptions[(nameHash / 13) % descriptions.size()];

  Brewery result;
  result.name = adjective + " " + noun;
  result.description = blurb;
  return result;
}
|
||||
236
pipeline/src/llama_generator.cpp
Normal file
236
pipeline/src/llama_generator.cpp
Normal file
@@ -0,0 +1,236 @@
|
||||
#include "llama_generator.h"
|
||||
|
||||
#include "llama.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
#include <cctype>
|
||||
#include <memory>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include <spdlog/spdlog.h>
|
||||
|
||||
namespace {
|
||||
|
||||
// Returns `value` with leading and trailing whitespace removed. Whitespace is
// whatever std::isspace reports; interior whitespace is left untouched.
std::string trim(std::string value) {
  // Take the character as unsigned char: passing a negative char to
  // std::isspace is undefined behaviour.
  const auto isBlank = [](unsigned char ch) { return std::isspace(ch) != 0; };

  // Peel whitespace off the tail.
  while (!value.empty() && isBlank(value.back())) {
    value.pop_back();
  }

  // Count the leading run of whitespace and drop it in one erase.
  std::size_t leading = 0;
  while (leading < value.size() && isBlank(value[leading])) {
    ++leading;
  }
  value.erase(0, leading);

  return value;
}
|
||||
|
||||
void appendTokenPiece(const llama_vocab *vocab, llama_token token,
|
||||
std::string &output) {
|
||||
std::array<char, 256> buffer{};
|
||||
int32_t bytes =
|
||||
llama_token_to_piece(vocab, token, buffer.data(),
|
||||
static_cast<int32_t>(buffer.size()), 0, true);
|
||||
|
||||
if (bytes < 0) {
|
||||
std::vector<char> dynamicBuffer(static_cast<std::size_t>(-bytes));
|
||||
bytes = llama_token_to_piece(vocab, token, dynamicBuffer.data(),
|
||||
static_cast<int32_t>(dynamicBuffer.size()), 0,
|
||||
true);
|
||||
if (bytes < 0) {
|
||||
throw std::runtime_error(
|
||||
"LlamaGenerator: failed to decode sampled token piece");
|
||||
}
|
||||
|
||||
output.append(dynamicBuffer.data(), static_cast<std::size_t>(bytes));
|
||||
return;
|
||||
}
|
||||
|
||||
output.append(buffer.data(), static_cast<std::size_t>(bytes));
|
||||
}
|
||||
|
||||
std::pair<std::string, std::string>
|
||||
parseTwoLineResponse(const std::string &raw, const std::string &errorMessage) {
|
||||
const auto newlinePos = raw.find('\n');
|
||||
if (newlinePos == std::string::npos) {
|
||||
throw std::runtime_error(errorMessage);
|
||||
}
|
||||
|
||||
std::string first = trim(raw.substr(0, newlinePos));
|
||||
std::string second = trim(raw.substr(newlinePos + 1));
|
||||
|
||||
if (first.empty() || second.empty()) {
|
||||
throw std::runtime_error(errorMessage);
|
||||
}
|
||||
|
||||
return {first, second};
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// Releases the llama resources held by this generator: the context first, then
// the model it was created from, and finally the llama backend.
LlamaGenerator::~LlamaGenerator() {
  if (context_) {
    llama_free(context_);
  }
  context_ = nullptr;

  if (model_) {
    llama_model_free(model_);
  }
  model_ = nullptr;

  // Called unconditionally, whether or not load() ever ran.
  llama_backend_free();
}
|
||||
|
||||
void LlamaGenerator::load(const std::string &modelPath) {
|
||||
if (modelPath.empty()) {
|
||||
throw std::runtime_error("LlamaGenerator: model path must not be empty");
|
||||
}
|
||||
|
||||
if (context_ != nullptr) {
|
||||
llama_free(context_);
|
||||
context_ = nullptr;
|
||||
}
|
||||
if (model_ != nullptr) {
|
||||
llama_model_free(model_);
|
||||
model_ = nullptr;
|
||||
}
|
||||
|
||||
llama_backend_init();
|
||||
|
||||
llama_model_params modelParams = llama_model_default_params();
|
||||
model_ = llama_load_model_from_file(modelPath.c_str(), modelParams);
|
||||
if (model_ == nullptr) {
|
||||
throw std::runtime_error(
|
||||
"LlamaGenerator: failed to load model from path: " + modelPath);
|
||||
}
|
||||
|
||||
llama_context_params contextParams = llama_context_default_params();
|
||||
contextParams.n_ctx = 2048;
|
||||
|
||||
context_ = llama_init_from_model(model_, contextParams);
|
||||
if (context_ == nullptr) {
|
||||
llama_model_free(model_);
|
||||
model_ = nullptr;
|
||||
throw std::runtime_error("LlamaGenerator: failed to create context");
|
||||
}
|
||||
|
||||
spdlog::info("[LlamaGenerator] Loaded model: {}", modelPath);
|
||||
}
|
||||
|
||||
std::string LlamaGenerator::infer(const std::string &prompt, int maxTokens) {
|
||||
if (model_ == nullptr || context_ == nullptr) {
|
||||
throw std::runtime_error("LlamaGenerator: model not loaded");
|
||||
}
|
||||
|
||||
const llama_vocab *vocab = llama_model_get_vocab(model_);
|
||||
if (vocab == nullptr) {
|
||||
throw std::runtime_error("LlamaGenerator: vocab unavailable");
|
||||
}
|
||||
|
||||
std::vector<llama_token> promptTokens(prompt.size() + 8);
|
||||
int32_t tokenCount =
|
||||
llama_tokenize(vocab, prompt.c_str(), static_cast<int32_t>(prompt.size()),
|
||||
promptTokens.data(),
|
||||
static_cast<int32_t>(promptTokens.size()), true, true);
|
||||
|
||||
if (tokenCount < 0) {
|
||||
promptTokens.resize(static_cast<std::size_t>(-tokenCount));
|
||||
tokenCount =
|
||||
llama_tokenize(vocab, prompt.c_str(),
|
||||
static_cast<int32_t>(prompt.size()), promptTokens.data(),
|
||||
static_cast<int32_t>(promptTokens.size()), true, true);
|
||||
}
|
||||
|
||||
if (tokenCount < 0) {
|
||||
throw std::runtime_error("LlamaGenerator: prompt tokenization failed");
|
||||
}
|
||||
|
||||
promptTokens.resize(static_cast<std::size_t>(tokenCount));
|
||||
|
||||
const llama_batch promptBatch = llama_batch_get_one(
|
||||
promptTokens.data(), static_cast<int32_t>(promptTokens.size()));
|
||||
if (llama_decode(context_, promptBatch) != 0) {
|
||||
throw std::runtime_error("LlamaGenerator: prompt decode failed");
|
||||
}
|
||||
|
||||
llama_sampler_chain_params samplerParams =
|
||||
llama_sampler_chain_default_params();
|
||||
using SamplerPtr =
|
||||
std::unique_ptr<llama_sampler, decltype(&llama_sampler_free)>;
|
||||
SamplerPtr sampler(llama_sampler_chain_init(samplerParams),
|
||||
&llama_sampler_free);
|
||||
|
||||
if (!sampler) {
|
||||
throw std::runtime_error("LlamaGenerator: failed to initialize sampler");
|
||||
}
|
||||
|
||||
llama_sampler_chain_add(sampler.get(), llama_sampler_init_greedy());
|
||||
|
||||
std::vector<llama_token> generatedTokens;
|
||||
generatedTokens.reserve(static_cast<std::size_t>(maxTokens));
|
||||
|
||||
for (int i = 0; i < maxTokens; ++i) {
|
||||
const llama_token next = llama_sampler_sample(sampler.get(), context_, -1);
|
||||
if (llama_vocab_is_eog(vocab, next)) {
|
||||
break;
|
||||
}
|
||||
|
||||
generatedTokens.push_back(next);
|
||||
|
||||
llama_token token = next;
|
||||
const llama_batch oneTokenBatch = llama_batch_get_one(&token, 1);
|
||||
if (llama_decode(context_, oneTokenBatch) != 0) {
|
||||
throw std::runtime_error(
|
||||
"LlamaGenerator: decode failed during generation");
|
||||
}
|
||||
}
|
||||
|
||||
std::string output;
|
||||
for (const llama_token token : generatedTokens) {
|
||||
appendTokenPiece(vocab, token, output);
|
||||
}
|
||||
|
||||
return output;
|
||||
}
|
||||
|
||||
// Asks the model for a brewery name and a one-sentence description for a
// brewery in `cityName`.
//
// @param cityName       City the brewery is located in; inserted verbatim into
//                       the prompt.
// @param regionContext  Optional extra context sentence(s) inserted after the
//                       city; skipped entirely when empty.
// @return {name, description} taken from the first line and the remainder of
//         the model's reply.
// @throws std::runtime_error if inference fails or the reply is not in the
//         expected two-line shape.
BreweryResult
LlamaGenerator::generateBrewery(const std::string &cityName,
                                const std::string &regionContext) {
  std::string prompt =
      "Generate a craft brewery name and one-sentence description for a "
      "brewery located in " +
      cityName + ".";

  // Only add the context when there is some, so an empty context does not
  // leave a stray double space in the prompt.
  if (!regionContext.empty()) {
    prompt += " ";
    prompt += regionContext;
  }

  prompt += " Respond with exactly two lines: first line is the name, second "
            "line is the description.";

  const std::string raw = infer(prompt, 128);
  auto [name, description] =
      parseTwoLineResponse(raw, "LlamaGenerator: malformed brewery response");

  return {name, description};
}
|
||||
|
||||
// Asks the model for a username and a one-sentence bio for the given locale.
//
// @param locale  Locale string inserted verbatim into the prompt.
// @return {username, bio}; all whitespace is stripped from the username.
// @throws std::runtime_error if inference fails, the reply is not in the
//         expected two-line shape, or either field ends up empty.
UserResult LlamaGenerator::generateUser(const std::string &locale) {
  std::string prompt =
      "Generate a plausible craft beer enthusiast username and a one-sentence "
      "bio. Locale: ";
  prompt += locale;
  prompt += ". Respond with exactly two lines: first line is the username (no "
            "spaces), second line is the bio.";

  const std::string reply = infer(prompt, 128);
  auto parsed =
      parseTwoLineResponse(reply, "LlamaGenerator: malformed user response");
  std::string &username = parsed.first;
  std::string &bio = parsed.second;

  // The prompt asks for no spaces, but the model may not comply; remove every
  // whitespace character from the username.
  const auto isBlank = [](unsigned char ch) { return std::isspace(ch); };
  const auto firstRemoved =
      std::remove_if(username.begin(), username.end(), isBlank);
  username.erase(firstRemoved, username.end());

  if (username.empty() || bio.empty()) {
    throw std::runtime_error("LlamaGenerator: malformed user response");
  }

  return {username, bio};
}
|
||||
@@ -1,9 +1,12 @@
|
||||
#include "data_downloader.h"
|
||||
#include "data_generator.h"
|
||||
#include "database.h"
|
||||
#include "generator.h"
|
||||
#include "json_loader.h"
|
||||
#include "llama_generator.h"
|
||||
#include "mock_generator.h"
|
||||
#include <curl/curl.h>
|
||||
#include <filesystem>
|
||||
#include <memory>
|
||||
#include <spdlog/spdlog.h>
|
||||
|
||||
static bool FileExists(const std::string &filePath) {
|
||||
@@ -14,7 +17,7 @@ int main(int argc, char *argv[]) {
|
||||
try {
|
||||
curl_global_init(CURL_GLOBAL_DEFAULT);
|
||||
|
||||
std::string modelPath = argc > 1 ? argv[1] : "./model.gguf";
|
||||
std::string modelPath = argc > 1 ? argv[1] : "";
|
||||
std::string cacheDir = argc > 2 ? argv[2] : "/tmp";
|
||||
std::string commit =
|
||||
argc > 3 ? argv[3] : "c5eb7772"; // Default: stable 2026-03-28
|
||||
@@ -41,8 +44,15 @@ int main(int argc, char *argv[]) {
|
||||
}
|
||||
|
||||
spdlog::info("Initializing brewery generator...");
|
||||
LlamaBreweryGenerator generator;
|
||||
generator.LoadModel(modelPath);
|
||||
std::unique_ptr<IDataGenerator> generator;
|
||||
if (modelPath.empty()) {
|
||||
generator = std::make_unique<MockGenerator>();
|
||||
spdlog::info("[Generator] Using MockGenerator (no model path provided)");
|
||||
} else {
|
||||
generator = std::make_unique<LlamaGenerator>();
|
||||
spdlog::info("[Generator] Using LlamaGenerator: {}", modelPath);
|
||||
}
|
||||
generator->load(modelPath);
|
||||
|
||||
spdlog::info("\n=== GEOGRAPHIC DATA OVERVIEW ===");
|
||||
|
||||
@@ -74,7 +84,7 @@ int main(int argc, char *argv[]) {
|
||||
spdlog::info("\n=== SAMPLE BREWERY GENERATION ===\n");
|
||||
for (size_t i = 0; i < std::min(size_t(5), cities.size()); i++) {
|
||||
const auto &[cityId, cityName] = cities[i];
|
||||
auto brewery = generator.GenerateBrewery(cityName, i);
|
||||
auto brewery = generator->generateBrewery(cityName, "");
|
||||
spdlog::info(" {}: {}", cityName, brewery.name);
|
||||
spdlog::info(" -> {}", brewery.description);
|
||||
}
|
||||
|
||||
101
pipeline/src/mock_generator.cpp
Normal file
101
pipeline/src/mock_generator.cpp
Normal file
@@ -0,0 +1,101 @@
|
||||
#include "mock_generator.h"
|
||||
|
||||
#include <functional>
|
||||
#include <spdlog/spdlog.h>
|
||||
|
||||
// First word of a mock brewery name. Entries are selected by index from a
// hash, so reordering or resizing this table changes the generated names.
const std::vector<std::string> MockGenerator::kBreweryAdjectives = {
    "Craft",      "Heritage", "Local",
    "Artisan",    "Pioneer",  "Golden",
    "Modern",     "Classic",  "Summit",
    "Northern",   "Riverstone", "Barrel",
    "Hinterland", "Harbor",   "Wild",
    "Granite",    "Copper",   "Maple",
};
|
||||
|
||||
// Second part of a mock brewery name, appended after an adjective and a
// space. Selected by index from a hash, so order and size matter.
const std::vector<std::string> MockGenerator::kBreweryNouns = {
    "Brewing Co.", "Brewery",      "Bier Haus",
    "Taproom",     "Works",        "House",
    "Fermentery",  "Ale Co.",      "Cellars",
    "Collective",  "Project",      "Foundry",
    "Malthouse",   "Public House", "Co-op",
    "Lab",         "Beer Hall",    "Guild",
};
|
||||
|
||||
// One-sentence descriptions for mock breweries. Selected by index from a
// hash, so order and size matter.
const std::vector<std::string> MockGenerator::kBreweryDescriptions = {
    "Handcrafted pale ales and seasonal IPAs with local ingredients.",
    "Traditional lagers and experimental sours in small batches.",
    "Award-winning stouts and wildly hoppy blonde ales.",
    "Craft brewery specializing in Belgian-style triples and dark porters.",
    "Modern brewery blending tradition with bold experimental flavors.",
    "Neighborhood-focused taproom pouring crisp pilsners and citrusy pale "
    "ales.",
    "Small-batch brewery known for barrel-aged releases and smoky lagers.",
    "Independent brewhouse pairing farmhouse ales with rotating food pop-ups.",
    "Community brewpub making balanced bitters, saisons, and hazy IPAs.",
    "Experimental nanobrewery exploring local yeast and regional grains.",
    "Family-run brewery producing smooth amber ales and robust porters.",
    "Urban brewery crafting clean lagers and bright, fruit-forward sours.",
    "Riverfront brewhouse featuring oak-matured ales and seasonal blends.",
    "Modern taproom focused on sessionable lagers and classic pub styles.",
    "Brewery rooted in tradition with a lineup of malty reds and crisp lagers.",
    "Creative brewery offering rotating collaborations and limited draft-only "
    "pours.",
    "Locally inspired brewery serving approachable ales with bold hop "
    "character.",
    "Destination taproom known for balanced IPAs and cocoa-rich stouts.",
};
|
||||
|
||||
// Usernames handed out by the mock user generator. Selected by index from a
// hash of the locale, so order and size matter.
const std::vector<std::string> MockGenerator::kUsernames = {
    "hopseeker",     "malttrail",    "yeastwhisper",
    "lagerlane",     "barrelbound",  "foamfinder",
    "taphunter",     "graingeist",   "brewscout",
    "aleatlas",      "caskcompass",  "hopsandmaps",
    "mashpilot",     "pintnomad",    "fermentfriend",
    "stoutsignal",   "sessionwander", "kettlekeeper",
};
|
||||
|
||||
// One-sentence bios handed out by the mock user generator. Selected by index
// from a hash of the locale, so order and size matter.
const std::vector<std::string> MockGenerator::kBios = {
    "Always chasing balanced IPAs and crisp lagers across local taprooms.",
    "Weekend brewery explorer with a soft spot for dark, roasty stouts.",
    "Documenting tiny brewpubs, fresh pours, and unforgettable beer gardens.",
    "Fan of farmhouse ales, food pairings, and long tasting flights.",
    "Collecting favorite pilsners one city at a time.",
    "Hops-first drinker who still saves room for classic malt-forward styles.",
    "Finding hidden tap lists and sharing the best seasonal releases.",
    "Brewery road-tripper focused on local ingredients and clean fermentation.",
    "Always comparing house lagers and ranking patio pint vibes.",
    "Curious about yeast strains, barrel programs, and cellar experiments.",
    "Believes every neighborhood deserves a great community taproom.",
    "Looking for session beers that taste great from first sip to last.",
    "Belgian ale enthusiast who never skips a new saison.",
    "Hazy IPA critic with deep respect for a perfectly clear pilsner.",
    "Visits breweries for the stories, stays for the flagship pours.",
    "Craft beer fan mapping tasting notes and favorite brew routes.",
    "Always ready to trade recommendations for underrated local breweries.",
    "Keeping a running list of must-try collab releases and tap takeovers.",
};
|
||||
|
||||
// Mock counterpart of model loading: the path argument is ignored and the call
// only logs that no model is required.
void MockGenerator::load(const std::string & /*modelPath*/) {
  // Nothing to load; the mock serves data from its built-in tables.
  spdlog::info("[MockGenerator] No model needed");
}
|
||||
|
||||
// Combines the std::hash values of `a` and `b` into one value.
//
// The two hashes are mixed by XOR-ing the first with the second plus a large
// odd constant and two shifted copies of the first; the result is then rotated
// left by 13 bits. Argument order matters: (a, b) and (b, a) are mixed
// differently.
std::size_t MockGenerator::deterministicHash(const std::string &a,
                                             const std::string &b) {
  const std::hash<std::string> hasher{};
  const std::size_t hashA = hasher(a);
  const std::size_t hashB = hasher(b);

  // Mixing step; the sum is truncated back to size_t on assignment.
  const std::size_t combined =
      hashA ^ (hashB + 0x9e3779b97f4a7c15ULL + (hashA << 6) + (hashA >> 2));

  // Rotate left by 13 within the width of size_t.
  constexpr std::size_t kRotate = 13;
  constexpr std::size_t kBits = sizeof(std::size_t) * 8;
  return (combined << kRotate) | (combined >> (kBits - kRotate));
}
|
||||
|
||||
// Builds a mock brewery for `cityName` from the static word tables.
//
// @param cityName       City name; always part of the hash input.
// @param regionContext  Optional context; when non-empty it is mixed into the
//                       hash via deterministicHash, otherwise the city name is
//                       hashed alone.
// @return A BreweryResult whose name is "<adjective> <noun>" and whose
//         description comes from kBreweryDescriptions. The same inputs give
//         the same result within a run.
BreweryResult MockGenerator::generateBrewery(const std::string &cityName,
                                             const std::string &regionContext) {
  std::size_t hash = 0;
  if (regionContext.empty()) {
    hash = std::hash<std::string>{}(cityName);
  } else {
    hash = deterministicHash(cityName, regionContext);
  }

  // The divisors 7 and 13 make the three table indices differ from one another
  // for a given hash.
  const std::string &adjective =
      kBreweryAdjectives[hash % kBreweryAdjectives.size()];
  const std::string &noun = kBreweryNouns[(hash / 7) % kBreweryNouns.size()];

  BreweryResult result;
  result.name = adjective + " " + noun;
  result.description =
      kBreweryDescriptions[(hash / 13) % kBreweryDescriptions.size()];
  return result;
}
|
||||
|
||||
// Builds a mock user for `locale` from the static tables.
//
// The locale string is hashed once; the hash selects the username directly and
// the bio after division by 11, so the two indices differ for a given hash.
// The same locale gives the same user within a run.
UserResult MockGenerator::generateUser(const std::string &locale) {
  const std::hash<std::string> hasher{};
  const std::size_t localeHash = hasher(locale);

  const std::size_t usernameIndex = localeHash % kUsernames.size();
  const std::size_t bioIndex = (localeHash / 11) % kBios.size();

  UserResult result;
  result.username = kUsernames[usernameIndex];
  result.bio = kBios[bioIndex];
  return result;
}
|
||||
Reference in New Issue
Block a user