Reorganize directory structure

This commit is contained in:
Aaron Po
2026-04-02 18:27:01 -04:00
parent a1f0ca5b20
commit 52e2333304
23 changed files with 330 additions and 171 deletions

View File

@@ -78,14 +78,15 @@ endif()
# Main Executable # Main Executable
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
set(PIPELINE_SOURCES set(PIPELINE_SOURCES
src/curl_web_client.cpp src/biergarten_data_generator.cpp
src/data_downloader.cpp src/web_client/curl_web_client.cpp
src/database.cpp src/data_generation/data_downloader.cpp
src/json_loader.cpp src/database/database.cpp
src/llama_generator.cpp src/json_handling/json_loader.cpp
src/mock_generator.cpp src/data_generation/llama_generator.cpp
src/stream_parser.cpp src/data_generation/mock_generator.cpp
src/wikipedia_service.cpp src/json_handling/stream_parser.cpp
src/wikipedia/wikipedia_service.cpp
src/main.cpp src/main.cpp
) )
@@ -118,10 +119,10 @@ if(ENABLE_CLANG_FORMAT_TARGETS)
find_program(CLANG_FORMAT_EXE NAMES clang-format) find_program(CLANG_FORMAT_EXE NAMES clang-format)
if(CLANG_FORMAT_EXE) if(CLANG_FORMAT_EXE)
file(GLOB_RECURSE FORMAT_SOURCES CONFIGURE_DEPENDS file(GLOB_RECURSE FORMAT_SOURCES CONFIGURE_DEPENDS
${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/**/*.cpp
${CMAKE_CURRENT_SOURCE_DIR}/src/*.cc ${CMAKE_CURRENT_SOURCE_DIR}/src/**/*.cc
${CMAKE_CURRENT_SOURCE_DIR}/includes/*.h ${CMAKE_CURRENT_SOURCE_DIR}/includes/**/*.h
${CMAKE_CURRENT_SOURCE_DIR}/includes/*.hpp ${CMAKE_CURRENT_SOURCE_DIR}/includes/**/*.hpp
) )
add_custom_target(format add_custom_target(format

View File

@@ -0,0 +1,2 @@
#pragma once

View File

@@ -0,0 +1,111 @@
#pragma once
#include <memory>
#include <string>
#include <vector>
#include <unordered_map>
#include "application_options.h"
#include "data_generation/data_generator.h"
#include "database/database.h"
#include "web_client/web_client.h"
#include "wikipedia/wikipedia_service.h"
/**
 * @brief Runtime configuration consumed by the Biergarten pipeline.
 *
 * Every numeric field and the commit hash carry an in-class default, so a
 * default-constructed instance is immediately usable; the two path fields
 * start out empty.
 */
struct ApplicationOptions {
  /// @brief Location of the LLM model file (gguf format).
  std::string modelPath;

  /// @brief Directory holding the cached JSON and database files.
  std::string cacheDir;

  /// @brief LLM sampling temperature (0.0 to 1.0, higher = more random).
  float temperature{0.8f};

  /// @brief LLM nucleus sampling top-p (0.0 to 1.0, higher = more random).
  float topP{0.92f};

  /// @brief Sampling seed: -1 requests a random seed, otherwise non-negative.
  int seed{-1};

  /// @brief Git commit hash used to keep the downloaded database consistent.
  std::string commit{"c5eb7772"};
};
/**
 * @brief Orchestrates the Biergarten brewery-data pipeline.
 *
 * Owns the end-to-end flow: preparing the database, making sure geographic
 * data is available, selecting a generator implementation and producing
 * sample breweries.
 */
class BiergartenDataGenerator {
public:
  /**
   * @brief Build a pipeline from externally supplied dependencies.
   *
   * @param options   Application configuration; a private copy is kept.
   * @param webClient HTTP client used for downloads (shared ownership).
   * @param database  SQLite database instance. It is held by reference, so
   *                  the caller must keep it alive for the lifetime of this
   *                  object.
   */
  BiergartenDataGenerator(const ApplicationOptions &options,
                          std::shared_ptr<IWebClient> webClient,
                          SqliteDatabase &database);

  /**
   * @brief Execute the whole pipeline.
   *
   * Steps, in order:
   *  1. Initialize the database
   *  2. Download geographic data if needed
   *  3. Initialize the generator (LLM or Mock)
   *  4. Generate brewery data for sample cities
   *
   * @return 0 on success, 1 on failure.
   */
  int Run();

private:
  /// @brief One generated brewery together with the city it belongs to.
  struct GeneratedBrewery {
    int cityId;
    std::string cityName;
    BreweryResult brewery;
  };

  /// @brief Download and load geographic data unless it is already cached.
  void LoadGeographicData();

  /**
   * @brief Create the generator selected by the options.
   *
   * A MockGenerator is used when no model path is configured, otherwise a
   * LlamaGenerator.
   *
   * @return Owning pointer to the initialized generator.
   */
  std::unique_ptr<IDataGenerator> InitializeGenerator();

  /// @brief Produce sample breweries for demonstration purposes.
  void GenerateSampleBreweries();

  // Data members: declaration order is kept as-is because it determines the
  // order in which the constructor initializes them.

  /// @brief Immutable copy of the application options.
  const ApplicationOptions options_;

  /// @brief Shared HTTP client dependency.
  std::shared_ptr<IWebClient> webClient_;

  /// @brief Non-owning reference to the database dependency.
  SqliteDatabase &database_;

  /// @brief Breweries produced by the most recent generation pass.
  std::vector<GeneratedBrewery> generatedBreweries_;
};

View File

@@ -5,7 +5,7 @@
#include <stdexcept> #include <stdexcept>
#include <string> #include <string>
#include "web_client.h" #include "web_client/web_client.h"
/// @brief Downloads and caches source geography JSON payloads. /// @brief Downloads and caches source geography JSON payloads.
class DataDownloader { class DataDownloader {

View File

@@ -3,7 +3,7 @@
#include <cstdint> #include <cstdint>
#include <string> #include <string>
#include "data_generator.h" #include "data_generation/data_generator.h"
struct llama_model; struct llama_model;
struct llama_context; struct llama_context;

View File

@@ -1,6 +1,6 @@
#pragma once #pragma once
#include "data_generator.h" #include "data_generation/data_generator.h"
#include <string> #include <string>
#include <vector> #include <vector>

View File

@@ -1,7 +1,7 @@
#pragma once #pragma once
#include "database.h" #include "database/database.h"
#include "stream_parser.h" #include "json_handling/stream_parser.h"
#include <string> #include <string>
/// @brief Loads world-city JSON data into SQLite through streaming parsing. /// @brief Loads world-city JSON data into SQLite through streaming parsing.

View File

@@ -1,6 +1,6 @@
#pragma once #pragma once
#include "database.h" #include "database/database.h"
#include <functional> #include <functional>
#include <string> #include <string>

View File

@@ -1,6 +1,6 @@
#pragma once #pragma once
#include "web_client.h" #include "web_client/web_client.h"
#include <memory> #include <memory>
// RAII for curl_global_init/cleanup. // RAII for curl_global_init/cleanup.

View File

@@ -5,7 +5,7 @@
#include <string_view> #include <string_view>
#include <unordered_map> #include <unordered_map>
#include "web_client.h" #include "web_client/web_client.h"
/// @brief Provides cached Wikipedia summary lookups for city and country pairs. /// @brief Provides cached Wikipedia summary lookups for city and country pairs.
class WikipediaService { class WikipediaService {

View File

@@ -0,0 +1,132 @@
#include "biergarten_data_generator.h"
#include <algorithm>
#include <filesystem>
#include <unordered_map>
#include <spdlog/spdlog.h>
#include "data_generation/data_downloader.h"
#include "json_handling/json_loader.h"
#include "data_generation/llama_generator.h"
#include "data_generation/mock_generator.h"
#include "wikipedia/wikipedia_service.h"
/**
 * @brief Store the injected dependencies; the body does nothing else.
 *
 * @param options   Configuration, copied into the object.
 * @param webClient HTTP client. It is taken by value as a sink parameter and
 *                  moved into the member, so no extra reference-count
 *                  increment/decrement is paid for the hand-over.
 * @param database  Database instance, stored by reference (must outlive this
 *                  object).
 */
BiergartenDataGenerator::BiergartenDataGenerator(
    const ApplicationOptions &options, std::shared_ptr<IWebClient> webClient,
    SqliteDatabase &database)
    : options_(options), webClient_(std::move(webClient)),
      database_(database) {}
std::unique_ptr<IDataGenerator> BiergartenDataGenerator::InitializeGenerator() {
spdlog::info("Initializing brewery generator...");
std::unique_ptr<IDataGenerator> generator;
if (options_.modelPath.empty()) {
generator = std::make_unique<MockGenerator>();
spdlog::info("[Generator] Using MockGenerator (no model path provided)");
} else {
auto llamaGenerator = std::make_unique<LlamaGenerator>();
llamaGenerator->setSamplingOptions(options_.temperature, options_.topP,
options_.seed);
spdlog::info(
"[Generator] Using LlamaGenerator: {} (temperature={}, top-p={}, "
"seed={})",
options_.modelPath, options_.temperature, options_.topP,
options_.seed);
generator = std::move(llamaGenerator);
}
generator->load(options_.modelPath);
return generator;
}
/**
 * @brief Initialize the database and populate it unless both caches exist.
 *
 * The JSON dump and the SQLite file both live in the configured cache
 * directory. When either one is missing, the JSON is downloaded for the
 * configured commit and loaded into the database.
 */
void BiergartenDataGenerator::LoadGeographicData() {
  const std::string &cacheRoot = options_.cacheDir;
  std::string jsonPath = cacheRoot + "/countries+states+cities.json";
  std::string dbPath = cacheRoot + "/biergarten-pipeline.db";

  // Both probes run before Initialize() is called.
  // NOTE(review): presumably because Initialize() may create the database
  // file, which would make the check always succeed - confirm.
  const bool jsonCached = std::filesystem::exists(jsonPath);
  const bool dbCached = std::filesystem::exists(dbPath);

  spdlog::info("Initializing SQLite database at {}...", dbPath);
  database_.Initialize(dbPath);

  if (dbCached && jsonCached) {
    spdlog::info("[Pipeline] Cache hit: skipping download and parse");
    return;
  }

  spdlog::info("\n[Pipeline] Downloading geographic data from GitHub...");
  DataDownloader downloader(webClient_);
  downloader.DownloadCountriesDatabase(jsonPath, options_.commit);
  JsonLoader::LoadWorldCities(jsonPath, database_);
}
/**
 * @brief Generate breweries for up to 30 cities and log the results.
 *
 * Flow:
 *  1. Create the generator and the Wikipedia summary service.
 *  2. Query cities, countries and states, and log the record totals.
 *  3. For each sampled city, look up its country name, fetch a region
 *     summary and ask the generator for a brewery.
 *  4. Dump every generated brewery to the log.
 *
 * Any previous content of generatedBreweries_ is discarded first.
 */
void BiergartenDataGenerator::GenerateSampleBreweries() {
  auto generator = InitializeGenerator();
  WikipediaService wikipediaService(webClient_);

  spdlog::info("\n=== GEOGRAPHIC DATA OVERVIEW ===");

  const auto cities = database_.QueryCities();

  // Queried once with limit 0 (the value this code uses for the complete
  // list) and reused for both the id -> name map and the total below.
  const auto allCountries = database_.QueryCountries(0);

  // Country id -> name, for per-city lookups.
  std::unordered_map<int, std::string> countryMap;
  countryMap.reserve(allCountries.size());
  for (const auto &country : allCountries)
    countryMap[country.id] = country.name;

  spdlog::info("\nTotal records loaded:");
  spdlog::info(" Countries: {}", allCountries.size());
  spdlog::info(" States: {}", database_.QueryStates(0).size());
  spdlog::info(" Cities: {}", cities.size());

  const size_t sampleCount = std::min(size_t(30), cities.size());

  generatedBreweries_.clear();
  generatedBreweries_.reserve(sampleCount);

  spdlog::info("\n=== SAMPLE BREWERY GENERATION ===");
  for (size_t i = 0; i < sampleCount; ++i) {
    const auto &city = cities[i];
    // Kept as an explicit int so the brace-initialization below never has to
    // narrow, whatever integer type city.id has.
    const int cityId = city.id;
    const std::string &cityName = city.name;

    // Stays empty when the city's country id is unknown.
    std::string localCountry;
    if (const auto countryIt = countryMap.find(city.countryId);
        countryIt != countryMap.end()) {
      localCountry = countryIt->second;
    }

    const std::string regionContext =
        wikipediaService.GetSummary(cityName, localCountry);
    spdlog::debug("[Pipeline] Region context for {}: {}", cityName,
                  regionContext);

    auto brewery =
        generator->generateBrewery(cityName, localCountry, regionContext);
    generatedBreweries_.push_back({cityId, cityName, std::move(brewery)});
  }

  spdlog::info("\n=== GENERATED DATA DUMP ===");
  size_t ordinal = 0;
  for (const auto &entry : generatedBreweries_) {
    spdlog::info("{}. city_id={} city=\"{}\"", ++ordinal, entry.cityId,
                 entry.cityName);
    spdlog::info(" brewery_name=\"{}\"", entry.brewery.name);
    spdlog::info(" brewery_description=\"{}\"", entry.brewery.description);
  }
}
int BiergartenDataGenerator::Run() {
try {
LoadGeographicData();
GenerateSampleBreweries();
spdlog::info("\nOK: Pipeline completed successfully");
return 0;
} catch (const std::exception &e) {
spdlog::error("ERROR: Pipeline failed: {}", e.what());
return 1;
}
}

View File

@@ -1,5 +1,5 @@
#include "data_downloader.h" #include "data_generation/data_downloader.h"
#include "web_client.h" #include "web_client/web_client.h"
#include <filesystem> #include <filesystem>
#include <fstream> #include <fstream>
#include <spdlog/spdlog.h> #include <spdlog/spdlog.h>

View File

@@ -11,7 +11,7 @@
#include <boost/json.hpp> #include <boost/json.hpp>
#include <spdlog/spdlog.h> #include <spdlog/spdlog.h>
#include "llama_generator.h" #include "data_generation/llama_generator.h"
namespace { namespace {

View File

@@ -1,4 +1,4 @@
#include "mock_generator.h" #include "data_generation/mock_generator.h"
#include <functional> #include <functional>
#include <spdlog/spdlog.h> #include <spdlog/spdlog.h>

View File

@@ -1,4 +1,4 @@
#include "database.h" #include "database/database.h"
#include <spdlog/spdlog.h> #include <spdlog/spdlog.h>
#include <stdexcept> #include <stdexcept>

View File

@@ -2,8 +2,8 @@
#include <spdlog/spdlog.h> #include <spdlog/spdlog.h>
#include "json_loader.h" #include "json_handling/json_loader.h"
#include "stream_parser.h" #include "json_handling/stream_parser.h"
void JsonLoader::LoadWorldCities(const std::string &jsonPath, void JsonLoader::LoadWorldCities(const std::string &jsonPath,
SqliteDatabase &db) { SqliteDatabase &db) {

View File

@@ -5,8 +5,8 @@
#include <boost/json/basic_parser_impl.hpp> #include <boost/json/basic_parser_impl.hpp>
#include <spdlog/spdlog.h> #include <spdlog/spdlog.h>
#include "database.h" #include "database/database.h"
#include "stream_parser.h" #include "json_handling/stream_parser.h"
class CityRecordHandler { class CityRecordHandler {
friend class boost::json::basic_parser<CityRecordHandler>; friend class boost::json::basic_parser<CityRecordHandler>;

View File

@@ -1,163 +1,76 @@
#include <algorithm>
#include <filesystem>
#include <iostream> #include <iostream>
#include <memory> #include <memory>
#include <unordered_map>
#include <vector>
#include <boost/program_options.hpp> #include <boost/program_options.hpp>
#include <spdlog/spdlog.h> #include <spdlog/spdlog.h>
#include "curl_web_client.h" #include "application_options.h"
#include "data_downloader.h" #include "biergarten_data_generator.h"
#include "data_generator.h" #include "web_client/curl_web_client.h"
#include "database.h" #include "database/database.h"
#include "json_loader.h"
#include "llama_generator.h"
#include "mock_generator.h"
#include "wikipedia_service.h"
namespace po = boost::program_options; namespace po = boost::program_options;
/**
* @brief Parse command-line arguments into ApplicationOptions.
*
* @param argc Command-line argument count.
* @param argv Command-line arguments.
* @param options Output ApplicationOptions struct.
* @return true if parsing succeeded and help was not requested, false otherwise.
*/
bool ParseArguments(int argc, char **argv, ApplicationOptions &options) {
po::options_description desc("Pipeline Options");
desc.add_options()("help,h", "Produce help message")(
"model,m", po::value<std::string>()->default_value(""),
"Path to LLM model (gguf)")(
"cache-dir,c", po::value<std::string>()->default_value("/tmp"),
"Directory for cached JSON")(
"temperature", po::value<float>()->default_value(0.8f),
"Sampling temperature (higher = more random)")(
"top-p", po::value<float>()->default_value(0.92f),
"Nucleus sampling top-p in (0,1] (higher = more random)")(
"seed", po::value<int>()->default_value(-1),
"Sampler seed: -1 for random, otherwise non-negative integer")(
"commit", po::value<std::string>()->default_value("c5eb7772"),
"Git commit hash for DB consistency");
po::variables_map vm;
po::store(po::parse_command_line(argc, argv, desc), vm);
po::notify(vm);
if (vm.count("help")) {
std::cout << desc << "\n";
return false;
}
options.modelPath = vm["model"].as<std::string>();
options.cacheDir = vm["cache-dir"].as<std::string>();
options.temperature = vm["temperature"].as<float>();
options.topP = vm["top-p"].as<float>();
options.seed = vm["seed"].as<int>();
options.commit = vm["commit"].as<std::string>();
return true;
}
int main(int argc, char *argv[]) { int main(int argc, char *argv[]) {
try { try {
const CurlGlobalState curl_state; const CurlGlobalState curl_state;
po::options_description desc("Pipeline Options"); ApplicationOptions options;
desc.add_options()("help,h", "Produce help message")( if (!ParseArguments(argc, argv, options)) {
"model,m", po::value<std::string>()->default_value(""),
"Path to LLM model (gguf)")(
"cache-dir,c", po::value<std::string>()->default_value("/tmp"),
"Directory for cached JSON")(
"temperature", po::value<float>()->default_value(0.8f),
"Sampling temperature (higher = more random)")(
"top-p", po::value<float>()->default_value(0.92f),
"Nucleus sampling top-p in (0,1] (higher = more random)")(
"seed", po::value<int>()->default_value(-1),
"Sampler seed: -1 for random, otherwise non-negative integer")(
"commit", po::value<std::string>()->default_value("c5eb7772"),
"Git commit hash for DB consistency");
po::variables_map vm;
po::store(po::parse_command_line(argc, argv, desc), vm);
po::notify(vm);
if (vm.count("help")) {
std::cout << desc << "\n";
return 0; return 0;
} }
std::string modelPath = vm["model"].as<std::string>();
std::string cacheDir = vm["cache-dir"].as<std::string>();
float temperature = vm["temperature"].as<float>();
float topP = vm["top-p"].as<float>();
int seed = vm["seed"].as<int>();
std::string commit = vm["commit"].as<std::string>();
std::string jsonPath = cacheDir + "/countries+states+cities.json";
std::string dbPath = cacheDir + "/biergarten-pipeline.db";
bool hasJsonCache = std::filesystem::exists(jsonPath);
bool hasDbCache = std::filesystem::exists(dbPath);
auto webClient = std::make_shared<CURLWebClient>(); auto webClient = std::make_shared<CURLWebClient>();
SqliteDatabase database;
SqliteDatabase db; BiergartenDataGenerator generator(options, webClient, database);
return generator.Run();
spdlog::info("Initializing SQLite database at {}...", dbPath);
db.Initialize(dbPath);
if (hasDbCache && hasJsonCache) {
spdlog::info("[Pipeline] Cache hit: skipping download and parse");
} else {
spdlog::info("\n[Pipeline] Downloading geographic data from GitHub...");
DataDownloader downloader(webClient);
downloader.DownloadCountriesDatabase(jsonPath, commit);
JsonLoader::LoadWorldCities(jsonPath, db);
}
spdlog::info("Initializing brewery generator...");
std::unique_ptr<IDataGenerator> generator;
if (modelPath.empty()) {
generator = std::make_unique<MockGenerator>();
spdlog::info("[Generator] Using MockGenerator (no model path provided)");
} else {
auto llamaGenerator = std::make_unique<LlamaGenerator>();
llamaGenerator->setSamplingOptions(temperature, topP, seed);
spdlog::info(
"[Generator] Using LlamaGenerator: {} (temperature={}, top-p={}, "
"seed={})",
modelPath, temperature, topP, seed);
generator = std::move(llamaGenerator);
}
generator->load(modelPath);
WikipediaService wikipediaService(webClient);
spdlog::info("\n=== GEOGRAPHIC DATA OVERVIEW ===");
auto countries = db.QueryCountries(50);
auto states = db.QueryStates(50);
auto cities = db.QueryCities();
// Build a quick map of country id -> name for per-city lookups.
auto allCountries = db.QueryCountries(0);
std::unordered_map<int, std::string> countryMap;
for (const auto &c : allCountries)
countryMap[c.id] = c.name;
spdlog::info("\nTotal records loaded:");
spdlog::info(" Countries: {}", db.QueryCountries(0).size());
spdlog::info(" States: {}", db.QueryStates(0).size());
spdlog::info(" Cities: {}", cities.size());
struct GeneratedBrewery {
int cityId;
std::string cityName;
BreweryResult brewery;
};
std::vector<GeneratedBrewery> generatedBreweries;
const size_t sampleCount = std::min(size_t(30), cities.size());
spdlog::info("\n=== SAMPLE BREWERY GENERATION ===");
for (size_t i = 0; i < sampleCount; i++) {
const auto &city = cities[i];
const int cityId = city.id;
const std::string cityName = city.name;
std::string localCountry;
const auto countryIt = countryMap.find(city.countryId);
if (countryIt != countryMap.end()) {
localCountry = countryIt->second;
}
const std::string regionContext =
wikipediaService.GetSummary(cityName, localCountry);
spdlog::debug("[Pipeline] Region context for {}: {}", cityName,
regionContext);
auto brewery =
generator->generateBrewery(cityName, localCountry, regionContext);
generatedBreweries.push_back({cityId, cityName, brewery});
}
spdlog::info("\n=== GENERATED DATA DUMP ===");
for (size_t i = 0; i < generatedBreweries.size(); i++) {
const auto &entry = generatedBreweries[i];
spdlog::info("{}. city_id={} city=\"{}\"", i + 1, entry.cityId,
entry.cityName);
spdlog::info(" brewery_name=\"{}\"", entry.brewery.name);
spdlog::info(" brewery_description=\"{}\"", entry.brewery.description);
}
spdlog::info("\nOK: Pipeline completed successfully");
return 0;
} catch (const std::exception &e) { } catch (const std::exception &e) {
spdlog::error("ERROR: Pipeline failed: {}", e.what()); spdlog::error("ERROR: Application failed: {}", e.what());
return 1; return 1;
} }
} }

View File

@@ -1,4 +1,4 @@
#include "curl_web_client.h" #include "web_client/curl_web_client.h"
#include <cstdio> #include <cstdio>
#include <curl/curl.h> #include <curl/curl.h>
#include <fstream> #include <fstream>

View File

@@ -1,4 +1,4 @@
#include "wikipedia_service.h" #include "wikipedia/wikipedia_service.h"
#include <boost/json.hpp> #include <boost/json.hpp>
#include <spdlog/spdlog.h> #include <spdlog/spdlog.h>