Implement Llama-based brewery and user data generation; remove mock generator and related files

This commit is contained in:
Aaron Po
2026-04-01 23:29:16 -04:00
parent 248a51b35f
commit 280c9c61bd
11 changed files with 445 additions and 128 deletions

View File

@@ -1,21 +0,0 @@
#include "generator.h"
#include <functional>
#include <spdlog/spdlog.h>
// Mock stand-in for model loading. Nothing is read from `modelPath`; the call
// only logs the path it was handed, followed by a ready message.
void LlamaBreweryGenerator::LoadModel(const std::string &modelPath) {
  // First line echoes the requested path so runs are traceable in the log.
  spdlog::info(" [Mock] Initialized llama model: {}", modelPath);

  // Second line mirrors the success message a real loader would print.
  spdlog::info(" OK: Model ready");
}
// Builds a mock brewery for `cityName`.
//
// The city name concatenated with the decimal `seed` is hashed once with
// std::hash, and the single hash value selects an adjective, a noun and a
// description from the class's lookup tables. The same (cityName, seed) pair
// therefore yields the same brewery within one build of the program.
LlamaBreweryGenerator::Brewery
LlamaBreweryGenerator::GenerateBrewery(const std::string &cityName, int seed) {
  const std::string hashKey = cityName + std::to_string(seed);
  const std::size_t digest = std::hash<std::string>{}(hashKey);

  // Different divisors (1, 7, 13) give each table its own index from the one
  // hash value.
  const auto &adjective = breweryAdjectives[digest % breweryAdjectives.size()];
  const auto &noun = breweryNouns[(digest / 7) % breweryNouns.size()];

  Brewery brewery;
  brewery.name = adjective + " " + noun;
  brewery.description = descriptions[(digest / 13) % descriptions.size()];
  return brewery;
}

View File

@@ -0,0 +1,236 @@
#include "llama_generator.h"
#include "llama.h"
#include <algorithm>
#include <array>
#include <cctype>
#include <memory>
#include <stdexcept>
#include <string>
#include <vector>
#include <spdlog/spdlog.h>
namespace {
// Returns `value` with leading and trailing whitespace removed, where
// whitespace is whatever std::isspace reports for the current C locale.
// Interior whitespace is preserved. An all-whitespace input yields "".
std::string trim(std::string value) {
  // Go through unsigned char: passing a negative char to std::isspace is
  // undefined behaviour.
  const auto isBlank = [](unsigned char ch) { return std::isspace(ch) != 0; };

  std::size_t first = 0;
  while (first < value.size() && isBlank(value[first])) {
    ++first;
  }

  std::size_t last = value.size();
  while (last > first && isBlank(value[last - 1])) {
    --last;
  }

  return value.substr(first, last - first);
}
// Converts `token` to its text piece via llama_token_to_piece and appends the
// bytes to `output`.
//
// A 256-byte stack buffer is tried first. If the call reports a negative
// length, the magnitude of that value is used as the size of a heap buffer
// and the conversion is retried once.
//
// Throws std::runtime_error if the retry also reports a negative length.
void appendTokenPiece(const llama_vocab *vocab, llama_token token,
                      std::string &output) {
  std::array<char, 256> stackBuffer{};
  const int32_t written = llama_token_to_piece(
      vocab, token, stackBuffer.data(),
      static_cast<int32_t>(stackBuffer.size()), 0, true);

  // Common case: the piece fit in the stack buffer.
  if (written >= 0) {
    output.append(stackBuffer.data(), static_cast<std::size_t>(written));
    return;
  }

  // Stack buffer too small: retry with a buffer of the reported size.
  std::vector<char> heapBuffer(static_cast<std::size_t>(-written));
  const int32_t retried = llama_token_to_piece(
      vocab, token, heapBuffer.data(),
      static_cast<int32_t>(heapBuffer.size()), 0, true);
  if (retried < 0) {
    throw std::runtime_error(
        "LlamaGenerator: failed to decode sampled token piece");
  }

  output.append(heapBuffer.data(), static_cast<std::size_t>(retried));
}
// Splits a model reply into its first line and the remainder.
//
// The whole reply is trimmed before the split, so leading blank lines or
// other whitespace do not count as an empty first line. Completions often
// begin with a newline, and splitting the untrimmed text rejected such
// otherwise valid replies. The split point is the first '\n' of the trimmed
// reply, and both halves are trimmed again (which also drops any '\r' left by
// CRLF line endings). Everything after the first line is returned as the
// second element, including any further newlines it may contain.
//
// @param raw           Raw text produced by the model.
// @param errorMessage  Message for the exception thrown on malformed input.
// @return {first line, remainder}, both non-empty and trimmed.
// @throws std::runtime_error if the trimmed reply has no newline, or if either
//         half is empty after trimming.
std::pair<std::string, std::string>
parseTwoLineResponse(const std::string &raw, const std::string &errorMessage) {
  const std::string body = trim(raw);

  const auto newlinePos = body.find('\n');
  if (newlinePos == std::string::npos) {
    throw std::runtime_error(errorMessage);
  }

  std::string first = trim(body.substr(0, newlinePos));
  std::string second = trim(body.substr(newlinePos + 1));
  // Defensive: after trimming `body` neither half should be empty, but keep
  // the contract explicit rather than relying on that reasoning.
  if (first.empty() || second.empty()) {
    throw std::runtime_error(errorMessage);
  }
  return {first, second};
}
} // namespace
// Releases the llama resources held by this generator: the context first,
// then the model, and finally the llama backend. Both pointers are left null.
LlamaGenerator::~LlamaGenerator() {
  // The context is freed before the model it was created from.
  if (context_ != nullptr) {
    llama_free(context_);
  }
  context_ = nullptr;

  if (model_ != nullptr) {
    llama_model_free(model_);
  }
  model_ = nullptr;

  // Called unconditionally, whether or not load() ever ran.
  llama_backend_free();
}
void LlamaGenerator::load(const std::string &modelPath) {
if (modelPath.empty()) {
throw std::runtime_error("LlamaGenerator: model path must not be empty");
}
if (context_ != nullptr) {
llama_free(context_);
context_ = nullptr;
}
if (model_ != nullptr) {
llama_model_free(model_);
model_ = nullptr;
}
llama_backend_init();
llama_model_params modelParams = llama_model_default_params();
model_ = llama_load_model_from_file(modelPath.c_str(), modelParams);
if (model_ == nullptr) {
throw std::runtime_error(
"LlamaGenerator: failed to load model from path: " + modelPath);
}
llama_context_params contextParams = llama_context_default_params();
contextParams.n_ctx = 2048;
context_ = llama_init_from_model(model_, contextParams);
if (context_ == nullptr) {
llama_model_free(model_);
model_ = nullptr;
throw std::runtime_error("LlamaGenerator: failed to create context");
}
spdlog::info("[LlamaGenerator] Loaded model: {}", modelPath);
}
std::string LlamaGenerator::infer(const std::string &prompt, int maxTokens) {
if (model_ == nullptr || context_ == nullptr) {
throw std::runtime_error("LlamaGenerator: model not loaded");
}
const llama_vocab *vocab = llama_model_get_vocab(model_);
if (vocab == nullptr) {
throw std::runtime_error("LlamaGenerator: vocab unavailable");
}
std::vector<llama_token> promptTokens(prompt.size() + 8);
int32_t tokenCount =
llama_tokenize(vocab, prompt.c_str(), static_cast<int32_t>(prompt.size()),
promptTokens.data(),
static_cast<int32_t>(promptTokens.size()), true, true);
if (tokenCount < 0) {
promptTokens.resize(static_cast<std::size_t>(-tokenCount));
tokenCount =
llama_tokenize(vocab, prompt.c_str(),
static_cast<int32_t>(prompt.size()), promptTokens.data(),
static_cast<int32_t>(promptTokens.size()), true, true);
}
if (tokenCount < 0) {
throw std::runtime_error("LlamaGenerator: prompt tokenization failed");
}
promptTokens.resize(static_cast<std::size_t>(tokenCount));
const llama_batch promptBatch = llama_batch_get_one(
promptTokens.data(), static_cast<int32_t>(promptTokens.size()));
if (llama_decode(context_, promptBatch) != 0) {
throw std::runtime_error("LlamaGenerator: prompt decode failed");
}
llama_sampler_chain_params samplerParams =
llama_sampler_chain_default_params();
using SamplerPtr =
std::unique_ptr<llama_sampler, decltype(&llama_sampler_free)>;
SamplerPtr sampler(llama_sampler_chain_init(samplerParams),
&llama_sampler_free);
if (!sampler) {
throw std::runtime_error("LlamaGenerator: failed to initialize sampler");
}
llama_sampler_chain_add(sampler.get(), llama_sampler_init_greedy());
std::vector<llama_token> generatedTokens;
generatedTokens.reserve(static_cast<std::size_t>(maxTokens));
for (int i = 0; i < maxTokens; ++i) {
const llama_token next = llama_sampler_sample(sampler.get(), context_, -1);
if (llama_vocab_is_eog(vocab, next)) {
break;
}
generatedTokens.push_back(next);
llama_token token = next;
const llama_batch oneTokenBatch = llama_batch_get_one(&token, 1);
if (llama_decode(context_, oneTokenBatch) != 0) {
throw std::runtime_error(
"LlamaGenerator: decode failed during generation");
}
}
std::string output;
for (const llama_token token : generatedTokens) {
appendTokenPiece(vocab, token, output);
}
return output;
}
// Asks the model for a brewery name and one-sentence description for
// `cityName`, inserting `regionContext` verbatim into the prompt.
//
// The reply is requested with a budget of 128 tokens and parsed as
// "name line, then description".
//
// @throws std::runtime_error from infer(), or with the message
//         "LlamaGenerator: malformed brewery response" if the reply cannot be
//         split into two non-empty parts.
BreweryResult
LlamaGenerator::generateBrewery(const std::string &cityName,
                                const std::string &regionContext) {
  std::string request =
      "Generate a craft brewery name and one-sentence description for a "
      "brewery located in ";
  request += cityName;
  request += ". ";
  request += regionContext;
  request += " Respond with exactly two lines: first line is the name, second "
             "line is the description.";

  const std::string reply = infer(request, 128);
  const auto parsed =
      parseTwoLineResponse(reply, "LlamaGenerator: malformed brewery response");
  return {parsed.first, parsed.second};
}
// Asks the model for a username and one-sentence bio for `locale`.
//
// The reply is requested with a budget of 128 tokens and parsed as
// "username line, then bio". Every whitespace character is then stripped from
// the username, since the prompt asks for a name without spaces.
//
// @throws std::runtime_error from infer(), or with the message
//         "LlamaGenerator: malformed user response" if the reply cannot be
//         split into two non-empty parts or the username ends up empty.
UserResult LlamaGenerator::generateUser(const std::string &locale) {
  std::string request =
      "Generate a plausible craft beer enthusiast username and a one-sentence "
      "bio. Locale: ";
  request += locale;
  request += ". Respond with exactly two lines: first line is the username (no "
             "spaces), second line is the bio.";

  const std::string reply = infer(request, 128);
  const auto parsed =
      parseTwoLineResponse(reply, "LlamaGenerator: malformed user response");

  // Rebuild the username without whitespace. Iterating as unsigned char keeps
  // the std::isspace call well-defined for bytes >= 0x80.
  std::string compactName;
  compactName.reserve(parsed.first.size());
  for (const unsigned char ch : parsed.first) {
    if (!std::isspace(ch)) {
      compactName.push_back(static_cast<char>(ch));
    }
  }

  if (compactName.empty() || parsed.second.empty()) {
    throw std::runtime_error("LlamaGenerator: malformed user response");
  }
  return {compactName, parsed.second};
}

View File

@@ -1,9 +1,12 @@
#include "data_downloader.h"
#include "data_generator.h"
#include "database.h"
#include "generator.h"
#include "json_loader.h"
#include "llama_generator.h"
#include "mock_generator.h"
#include <curl/curl.h>
#include <filesystem>
#include <memory>
#include <spdlog/spdlog.h>
static bool FileExists(const std::string &filePath) {
@@ -14,7 +17,7 @@ int main(int argc, char *argv[]) {
try {
curl_global_init(CURL_GLOBAL_DEFAULT);
std::string modelPath = argc > 1 ? argv[1] : "./model.gguf";
std::string modelPath = argc > 1 ? argv[1] : "";
std::string cacheDir = argc > 2 ? argv[2] : "/tmp";
std::string commit =
argc > 3 ? argv[3] : "c5eb7772"; // Default: stable 2026-03-28
@@ -41,8 +44,15 @@ int main(int argc, char *argv[]) {
}
spdlog::info("Initializing brewery generator...");
LlamaBreweryGenerator generator;
generator.LoadModel(modelPath);
std::unique_ptr<IDataGenerator> generator;
if (modelPath.empty()) {
generator = std::make_unique<MockGenerator>();
spdlog::info("[Generator] Using MockGenerator (no model path provided)");
} else {
generator = std::make_unique<LlamaGenerator>();
spdlog::info("[Generator] Using LlamaGenerator: {}", modelPath);
}
generator->load(modelPath);
spdlog::info("\n=== GEOGRAPHIC DATA OVERVIEW ===");
@@ -74,7 +84,7 @@ int main(int argc, char *argv[]) {
spdlog::info("\n=== SAMPLE BREWERY GENERATION ===\n");
for (size_t i = 0; i < std::min(size_t(5), cities.size()); i++) {
const auto &[cityId, cityName] = cities[i];
auto brewery = generator.GenerateBrewery(cityName, i);
auto brewery = generator->generateBrewery(cityName, "");
spdlog::info(" {}: {}", cityName, brewery.name);
spdlog::info(" -> {}", brewery.description);
}

View File

@@ -0,0 +1,101 @@
#include "mock_generator.h"
#include <functional>
#include <spdlog/spdlog.h>
// Leading words for mock brewery names. MockGenerator::generateBrewery picks
// one entry with `hash % size()`, so reordering, adding or removing entries
// changes which name a given input produces. The list must stay non-empty
// because its size is used as a modulus.
const std::vector<std::string> MockGenerator::kBreweryAdjectives = {
"Craft", "Heritage", "Local", "Artisan", "Pioneer", "Golden",
"Modern", "Classic", "Summit", "Northern", "Riverstone", "Barrel",
"Hinterland", "Harbor", "Wild", "Granite", "Copper", "Maple"};
// Trailing words for mock brewery names, appended after an adjective and a
// single space. MockGenerator::generateBrewery picks one entry with
// `(hash / 7) % size()`; entry order is therefore part of the generated
// output, and the list must stay non-empty.
const std::vector<std::string> MockGenerator::kBreweryNouns = {
"Brewing Co.", "Brewery", "Bier Haus", "Taproom", "Works",
"House", "Fermentery", "Ale Co.", "Cellars", "Collective",
"Project", "Foundry", "Malthouse", "Public House", "Co-op",
"Lab", "Beer Hall", "Guild"};
// One-sentence descriptions for mock breweries. MockGenerator::generateBrewery
// picks one entry with `(hash / 13) % size()`; entry order is therefore part
// of the generated output, and the list must stay non-empty.
// Adjacent string literals below are single entries split only to keep lines
// short.
const std::vector<std::string> MockGenerator::kBreweryDescriptions = {
"Handcrafted pale ales and seasonal IPAs with local ingredients.",
"Traditional lagers and experimental sours in small batches.",
"Award-winning stouts and wildly hoppy blonde ales.",
"Craft brewery specializing in Belgian-style triples and dark porters.",
"Modern brewery blending tradition with bold experimental flavors.",
"Neighborhood-focused taproom pouring crisp pilsners and citrusy pale "
"ales.",
"Small-batch brewery known for barrel-aged releases and smoky lagers.",
"Independent brewhouse pairing farmhouse ales with rotating food pop-ups.",
"Community brewpub making balanced bitters, saisons, and hazy IPAs.",
"Experimental nanobrewery exploring local yeast and regional grains.",
"Family-run brewery producing smooth amber ales and robust porters.",
"Urban brewery crafting clean lagers and bright, fruit-forward sours.",
"Riverfront brewhouse featuring oak-matured ales and seasonal blends.",
"Modern taproom focused on sessionable lagers and classic pub styles.",
"Brewery rooted in tradition with a lineup of malty reds and crisp lagers.",
"Creative brewery offering rotating collaborations and limited draft-only "
"pours.",
"Locally inspired brewery serving approachable ales with bold hop "
"character.",
"Destination taproom known for balanced IPAs and cocoa-rich stouts."};
// Usernames handed out by MockGenerator::generateUser, which picks one entry
// with `hash(locale) % size()`. Entry order is therefore part of the
// generated output, and the list must stay non-empty.
const std::vector<std::string> MockGenerator::kUsernames = {
"hopseeker", "malttrail", "yeastwhisper", "lagerlane",
"barrelbound", "foamfinder", "taphunter", "graingeist",
"brewscout", "aleatlas", "caskcompass", "hopsandmaps",
"mashpilot", "pintnomad", "fermentfriend", "stoutsignal",
"sessionwander", "kettlekeeper"};
// One-sentence user bios handed out by MockGenerator::generateUser, which
// picks one entry with `(hash(locale) / 11) % size()`. Entry order is
// therefore part of the generated output, and the list must stay non-empty.
const std::vector<std::string> MockGenerator::kBios = {
"Always chasing balanced IPAs and crisp lagers across local taprooms.",
"Weekend brewery explorer with a soft spot for dark, roasty stouts.",
"Documenting tiny brewpubs, fresh pours, and unforgettable beer gardens.",
"Fan of farmhouse ales, food pairings, and long tasting flights.",
"Collecting favorite pilsners one city at a time.",
"Hops-first drinker who still saves room for classic malt-forward styles.",
"Finding hidden tap lists and sharing the best seasonal releases.",
"Brewery road-tripper focused on local ingredients and clean fermentation.",
"Always comparing house lagers and ranking patio pint vibes.",
"Curious about yeast strains, barrel programs, and cellar experiments.",
"Believes every neighborhood deserves a great community taproom.",
"Looking for session beers that taste great from first sip to last.",
"Belgian ale enthusiast who never skips a new saison.",
"Hazy IPA critic with deep respect for a perfectly clear pilsner.",
"Visits breweries for the stories, stays for the flagship pours.",
"Craft beer fan mapping tasting notes and favorite brew routes.",
"Always ready to trade recommendations for underrated local breweries.",
"Keeping a running list of must-try collab releases and tap takeovers."};
// Counterpart of LlamaGenerator::load for the mock generator. The path
// argument is accepted but ignored; the call only logs that no model is
// required.
void MockGenerator::load(const std::string &) {
  spdlog::info("[MockGenerator] No model needed");
}
// Combines the std::hash values of `a` and `b` into a single value.
//
// The hash of `b`, a fixed odd constant and two shifted copies of the hash of
// `a` are summed and XOR-ed into the hash of `a`; the result is then rotated
// left by 13 bits. The argument order matters: swapping `a` and `b` generally
// gives a different value.
std::size_t MockGenerator::deterministicHash(const std::string &a,
                                             const std::string &b) {
  constexpr std::size_t kWordBits = sizeof(std::size_t) * 8;
  constexpr std::size_t kRotateBits = 13;

  const std::hash<std::string> hasher{};
  const std::size_t hashA = hasher(a);
  const std::size_t hashB = hasher(b);

  // Mixing step; the sum is narrowed back to size_t on assignment.
  const std::size_t combined =
      hashA ^ (hashB + 0x9e3779b97f4a7c15ULL + (hashA << 6) + (hashA >> 2));

  // Rotate left by kRotateBits.
  return (combined << kRotateBits) | (combined >> (kWordBits - kRotateBits));
}
// Produces a mock brewery for `cityName` by indexing the static word and
// description tables with a hash of the inputs.
//
// With an empty `regionContext` the hash is std::hash of the city name alone;
// otherwise city and context are combined through deterministicHash. Equal
// inputs give equal results within one build of the program.
BreweryResult MockGenerator::generateBrewery(const std::string &cityName,
                                             const std::string &regionContext) {
  std::size_t digest = 0;
  if (regionContext.empty()) {
    digest = std::hash<std::string>{}(cityName);
  } else {
    digest = deterministicHash(cityName, regionContext);
  }

  // Different divisors (1, 7, 13) give each table its own index from the one
  // hash value.
  const std::string &adjective =
      kBreweryAdjectives[digest % kBreweryAdjectives.size()];
  const std::string &noun = kBreweryNouns[(digest / 7) % kBreweryNouns.size()];

  BreweryResult brewery;
  brewery.name = adjective + " " + noun;
  brewery.description =
      kBreweryDescriptions[(digest / 13) % kBreweryDescriptions.size()];
  return brewery;
}
// Produces a mock user for `locale` by indexing the static username and bio
// tables with std::hash of the locale string. Equal locales give equal
// results within one build of the program.
UserResult MockGenerator::generateUser(const std::string &locale) {
  const std::size_t digest = std::hash<std::string>{}(locale);

  // The bio index divides by 11 first so it is not tied to the username index.
  const std::size_t nameIndex = digest % kUsernames.size();
  const std::size_t bioIndex = (digest / 11) % kBios.size();

  UserResult profile;
  profile.username = kUsernames[nameIndex];
  profile.bio = kBios[bioIndex];
  return profile;
}