Reorganize directory structure

This commit is contained in:
Aaron Po
2026-04-02 18:27:01 -04:00
parent a1f0ca5b20
commit 52e2333304
23 changed files with 330 additions and 171 deletions

View File

@@ -0,0 +1,132 @@
#include "biergarten_data_generator.h"
#include <algorithm>
#include <filesystem>
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <spdlog/spdlog.h>
#include "data_generation/data_downloader.h"
#include "json_handling/json_loader.h"
#include "data_generation/llama_generator.h"
#include "data_generation/mock_generator.h"
#include "wikipedia/wikipedia_service.h"
/**
 * @brief Construct the pipeline driver from its collaborators.
 *
 * @param options   Parsed application options; used to initialize options_.
 * @param webClient Shared web client; stored in webClient_ and later handed
 *                  to the data downloader and the Wikipedia service.
 * @param database  Database the pipeline initializes and queries; used to
 *                  initialize database_.
 */
BiergartenDataGenerator::BiergartenDataGenerator(
    const ApplicationOptions &options,
    std::shared_ptr<IWebClient> webClient,
    SqliteDatabase &database)
    // webClient is a by-value sink parameter: move it into the member instead
    // of copying, which avoids an extra atomic refcount increment/decrement.
    : options_(options),
      webClient_(std::move(webClient)),
      database_(database) {}
std::unique_ptr<IDataGenerator> BiergartenDataGenerator::InitializeGenerator() {
spdlog::info("Initializing brewery generator...");
std::unique_ptr<IDataGenerator> generator;
if (options_.modelPath.empty()) {
generator = std::make_unique<MockGenerator>();
spdlog::info("[Generator] Using MockGenerator (no model path provided)");
} else {
auto llamaGenerator = std::make_unique<LlamaGenerator>();
llamaGenerator->setSamplingOptions(options_.temperature, options_.topP,
options_.seed);
spdlog::info(
"[Generator] Using LlamaGenerator: {} (temperature={}, top-p={}, "
"seed={})",
options_.modelPath, options_.temperature, options_.topP,
options_.seed);
generator = std::move(llamaGenerator);
}
generator->load(options_.modelPath);
return generator;
}
void BiergartenDataGenerator::LoadGeographicData() {
std::string jsonPath = options_.cacheDir + "/countries+states+cities.json";
std::string dbPath = options_.cacheDir + "/biergarten-pipeline.db";
bool hasJsonCache = std::filesystem::exists(jsonPath);
bool hasDbCache = std::filesystem::exists(dbPath);
spdlog::info("Initializing SQLite database at {}...", dbPath);
database_.Initialize(dbPath);
if (hasDbCache && hasJsonCache) {
spdlog::info("[Pipeline] Cache hit: skipping download and parse");
} else {
spdlog::info("\n[Pipeline] Downloading geographic data from GitHub...");
DataDownloader downloader(webClient_);
downloader.DownloadCountriesDatabase(jsonPath, options_.commit);
JsonLoader::LoadWorldCities(jsonPath, database_);
}
}
/**
 * @brief Generate a brewery for each of the first (at most 30) cities and log
 *        the results.
 *
 * Steps:
 *  1. Create the generator via InitializeGenerator() and a WikipediaService
 *     backed by webClient_.
 *  2. Query cities, countries and states, and log the record counts.
 *  3. For each sampled city, resolve its country name, fetch a Wikipedia
 *     summary as region context, and ask the generator for a brewery.
 *  4. Replace the contents of generatedBreweries_ with the new results and
 *     log every entry.
 */
void BiergartenDataGenerator::GenerateSampleBreweries() {
  auto generator = InitializeGenerator();
  WikipediaService wikipediaService(webClient_);

  spdlog::info("\n=== GEOGRAPHIC DATA OVERVIEW ===");

  const auto cities = database_.QueryCities();

  // Build a quick map of country id -> name for per-city lookups. The same
  // query result also provides the country count logged below, so the
  // countries table is read only once.
  const auto allCountries = database_.QueryCountries(0);
  std::unordered_map<int, std::string> countryMap;
  countryMap.reserve(allCountries.size());
  for (const auto &c : allCountries)
    countryMap[c.id] = c.name;

  spdlog::info("\nTotal records loaded:");
  spdlog::info(" Countries: {}", allCountries.size());
  spdlog::info(" States: {}", database_.QueryStates(0).size());
  spdlog::info(" Cities: {}", cities.size());

  generatedBreweries_.clear();
  const size_t sampleCount = std::min(size_t(30), cities.size());
  generatedBreweries_.reserve(sampleCount);

  spdlog::info("\n=== SAMPLE BREWERY GENERATION ===");
  for (size_t i = 0; i < sampleCount; ++i) {
    const auto &city = cities[i];
    const int cityId = city.id;
    const std::string &cityName = city.name;

    // Cities whose country id is missing from the map keep an empty name.
    std::string localCountry;
    if (const auto countryIt = countryMap.find(city.countryId);
        countryIt != countryMap.end()) {
      localCountry = countryIt->second;
    }

    const std::string regionContext =
        wikipediaService.GetSummary(cityName, localCountry);
    spdlog::debug("[Pipeline] Region context for {}: {}", cityName,
                  regionContext);

    auto brewery =
        generator->generateBrewery(cityName, localCountry, regionContext);
    // The local result is not used again, so move it into the stored entry.
    generatedBreweries_.push_back({cityId, cityName, std::move(brewery)});
  }

  spdlog::info("\n=== GENERATED DATA DUMP ===");
  for (size_t i = 0; i < generatedBreweries_.size(); ++i) {
    const auto &entry = generatedBreweries_[i];
    spdlog::info("{}. city_id={} city=\"{}\"", i + 1, entry.cityId,
                 entry.cityName);
    spdlog::info(" brewery_name=\"{}\"", entry.brewery.name);
    spdlog::info(" brewery_description=\"{}\"", entry.brewery.description);
  }
}
int BiergartenDataGenerator::Run() {
try {
LoadGeographicData();
GenerateSampleBreweries();
spdlog::info("\nOK: Pipeline completed successfully");
return 0;
} catch (const std::exception &e) {
spdlog::error("ERROR: Pipeline failed: {}", e.what());
return 1;
}
}

View File

@@ -1,5 +1,5 @@
#include "data_downloader.h"
#include "web_client.h"
#include "data_generation/data_downloader.h"
#include "web_client/web_client.h"
#include <filesystem>
#include <fstream>
#include <spdlog/spdlog.h>

View File

@@ -11,7 +11,7 @@
#include <boost/json.hpp>
#include <spdlog/spdlog.h>
#include "llama_generator.h"
#include "data_generation/llama_generator.h"
namespace {

View File

@@ -1,4 +1,4 @@
#include "mock_generator.h"
#include "data_generation/mock_generator.h"
#include <functional>
#include <spdlog/spdlog.h>

View File

@@ -1,4 +1,4 @@
#include "database.h"
#include "database/database.h"
#include <spdlog/spdlog.h>
#include <stdexcept>

View File

@@ -2,8 +2,8 @@
#include <spdlog/spdlog.h>
#include "json_loader.h"
#include "stream_parser.h"
#include "json_handling/json_loader.h"
#include "json_handling/stream_parser.h"
void JsonLoader::LoadWorldCities(const std::string &jsonPath,
SqliteDatabase &db) {

View File

@@ -5,8 +5,8 @@
#include <boost/json/basic_parser_impl.hpp>
#include <spdlog/spdlog.h>
#include "database.h"
#include "stream_parser.h"
#include "database/database.h"
#include "json_handling/stream_parser.h"
class CityRecordHandler {
friend class boost::json::basic_parser<CityRecordHandler>;

View File

@@ -1,163 +1,76 @@
#include <algorithm>
#include <filesystem>
#include <iostream>
#include <memory>
#include <unordered_map>
#include <vector>
#include <boost/program_options.hpp>
#include <spdlog/spdlog.h>
#include "curl_web_client.h"
#include "data_downloader.h"
#include "data_generator.h"
#include "database.h"
#include "json_loader.h"
#include "llama_generator.h"
#include "mock_generator.h"
#include "wikipedia_service.h"
#include "application_options.h"
#include "biergarten_data_generator.h"
#include "web_client/curl_web_client.h"
#include "database/database.h"
namespace po = boost::program_options;
/**
* @brief Parse command-line arguments into ApplicationOptions.
*
* @param argc Command-line argument count.
* @param argv Command-line arguments.
* @param options Output ApplicationOptions struct.
* @return true if parsing succeeded and help was not requested, false otherwise.
*/
bool ParseArguments(int argc, char **argv, ApplicationOptions &options) {
po::options_description desc("Pipeline Options");
desc.add_options()("help,h", "Produce help message")(
"model,m", po::value<std::string>()->default_value(""),
"Path to LLM model (gguf)")(
"cache-dir,c", po::value<std::string>()->default_value("/tmp"),
"Directory for cached JSON")(
"temperature", po::value<float>()->default_value(0.8f),
"Sampling temperature (higher = more random)")(
"top-p", po::value<float>()->default_value(0.92f),
"Nucleus sampling top-p in (0,1] (higher = more random)")(
"seed", po::value<int>()->default_value(-1),
"Sampler seed: -1 for random, otherwise non-negative integer")(
"commit", po::value<std::string>()->default_value("c5eb7772"),
"Git commit hash for DB consistency");
po::variables_map vm;
po::store(po::parse_command_line(argc, argv, desc), vm);
po::notify(vm);
if (vm.count("help")) {
std::cout << desc << "\n";
return false;
}
options.modelPath = vm["model"].as<std::string>();
options.cacheDir = vm["cache-dir"].as<std::string>();
options.temperature = vm["temperature"].as<float>();
options.topP = vm["top-p"].as<float>();
options.seed = vm["seed"].as<int>();
options.commit = vm["commit"].as<std::string>();
return true;
}
int main(int argc, char *argv[]) {
try {
const CurlGlobalState curl_state;
po::options_description desc("Pipeline Options");
desc.add_options()("help,h", "Produce help message")(
"model,m", po::value<std::string>()->default_value(""),
"Path to LLM model (gguf)")(
"cache-dir,c", po::value<std::string>()->default_value("/tmp"),
"Directory for cached JSON")(
"temperature", po::value<float>()->default_value(0.8f),
"Sampling temperature (higher = more random)")(
"top-p", po::value<float>()->default_value(0.92f),
"Nucleus sampling top-p in (0,1] (higher = more random)")(
"seed", po::value<int>()->default_value(-1),
"Sampler seed: -1 for random, otherwise non-negative integer")(
"commit", po::value<std::string>()->default_value("c5eb7772"),
"Git commit hash for DB consistency");
po::variables_map vm;
po::store(po::parse_command_line(argc, argv, desc), vm);
po::notify(vm);
if (vm.count("help")) {
std::cout << desc << "\n";
ApplicationOptions options;
if (!ParseArguments(argc, argv, options)) {
return 0;
}
std::string modelPath = vm["model"].as<std::string>();
std::string cacheDir = vm["cache-dir"].as<std::string>();
float temperature = vm["temperature"].as<float>();
float topP = vm["top-p"].as<float>();
int seed = vm["seed"].as<int>();
std::string commit = vm["commit"].as<std::string>();
std::string jsonPath = cacheDir + "/countries+states+cities.json";
std::string dbPath = cacheDir + "/biergarten-pipeline.db";
bool hasJsonCache = std::filesystem::exists(jsonPath);
bool hasDbCache = std::filesystem::exists(dbPath);
auto webClient = std::make_shared<CURLWebClient>();
SqliteDatabase database;
SqliteDatabase db;
spdlog::info("Initializing SQLite database at {}...", dbPath);
db.Initialize(dbPath);
if (hasDbCache && hasJsonCache) {
spdlog::info("[Pipeline] Cache hit: skipping download and parse");
} else {
spdlog::info("\n[Pipeline] Downloading geographic data from GitHub...");
DataDownloader downloader(webClient);
downloader.DownloadCountriesDatabase(jsonPath, commit);
JsonLoader::LoadWorldCities(jsonPath, db);
}
spdlog::info("Initializing brewery generator...");
std::unique_ptr<IDataGenerator> generator;
if (modelPath.empty()) {
generator = std::make_unique<MockGenerator>();
spdlog::info("[Generator] Using MockGenerator (no model path provided)");
} else {
auto llamaGenerator = std::make_unique<LlamaGenerator>();
llamaGenerator->setSamplingOptions(temperature, topP, seed);
spdlog::info(
"[Generator] Using LlamaGenerator: {} (temperature={}, top-p={}, "
"seed={})",
modelPath, temperature, topP, seed);
generator = std::move(llamaGenerator);
}
generator->load(modelPath);
WikipediaService wikipediaService(webClient);
spdlog::info("\n=== GEOGRAPHIC DATA OVERVIEW ===");
auto countries = db.QueryCountries(50);
auto states = db.QueryStates(50);
auto cities = db.QueryCities();
// Build a quick map of country id -> name for per-city lookups.
auto allCountries = db.QueryCountries(0);
std::unordered_map<int, std::string> countryMap;
for (const auto &c : allCountries)
countryMap[c.id] = c.name;
spdlog::info("\nTotal records loaded:");
spdlog::info(" Countries: {}", db.QueryCountries(0).size());
spdlog::info(" States: {}", db.QueryStates(0).size());
spdlog::info(" Cities: {}", cities.size());
struct GeneratedBrewery {
int cityId;
std::string cityName;
BreweryResult brewery;
};
std::vector<GeneratedBrewery> generatedBreweries;
const size_t sampleCount = std::min(size_t(30), cities.size());
spdlog::info("\n=== SAMPLE BREWERY GENERATION ===");
for (size_t i = 0; i < sampleCount; i++) {
const auto &city = cities[i];
const int cityId = city.id;
const std::string cityName = city.name;
std::string localCountry;
const auto countryIt = countryMap.find(city.countryId);
if (countryIt != countryMap.end()) {
localCountry = countryIt->second;
}
const std::string regionContext =
wikipediaService.GetSummary(cityName, localCountry);
spdlog::debug("[Pipeline] Region context for {}: {}", cityName,
regionContext);
auto brewery =
generator->generateBrewery(cityName, localCountry, regionContext);
generatedBreweries.push_back({cityId, cityName, brewery});
}
spdlog::info("\n=== GENERATED DATA DUMP ===");
for (size_t i = 0; i < generatedBreweries.size(); i++) {
const auto &entry = generatedBreweries[i];
spdlog::info("{}. city_id={} city=\"{}\"", i + 1, entry.cityId,
entry.cityName);
spdlog::info(" brewery_name=\"{}\"", entry.brewery.name);
spdlog::info(" brewery_description=\"{}\"", entry.brewery.description);
}
spdlog::info("\nOK: Pipeline completed successfully");
return 0;
BiergartenDataGenerator generator(options, webClient, database);
return generator.Run();
} catch (const std::exception &e) {
spdlog::error("ERROR: Pipeline failed: {}", e.what());
spdlog::error("ERROR: Application failed: {}", e.what());
return 1;
}
}

View File

@@ -1,4 +1,4 @@
#include "curl_web_client.h"
#include "web_client/curl_web_client.h"
#include <cstdio>
#include <curl/curl.h>
#include <fstream>

View File

@@ -1,4 +1,4 @@
#include "wikipedia_service.h"
#include "wikipedia/wikipedia_service.h"
#include <boost/json.hpp>
#include <spdlog/spdlog.h>