mirror of
https://github.com/aaronpo97/the-biergarten-app.git
synced 2026-04-05 18:09:04 +00:00
Reorganize directory structure
This commit is contained in:
@@ -78,14 +78,15 @@ endif()
|
|||||||
# Main Executable
|
# Main Executable
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
set(PIPELINE_SOURCES
|
set(PIPELINE_SOURCES
|
||||||
src/curl_web_client.cpp
|
src/biergarten_data_generator.cpp
|
||||||
src/data_downloader.cpp
|
src/web_client/curl_web_client.cpp
|
||||||
src/database.cpp
|
src/data_generation/data_downloader.cpp
|
||||||
src/json_loader.cpp
|
src/database/database.cpp
|
||||||
src/llama_generator.cpp
|
src/json_handling/json_loader.cpp
|
||||||
src/mock_generator.cpp
|
src/data_generation/llama_generator.cpp
|
||||||
src/stream_parser.cpp
|
src/data_generation/mock_generator.cpp
|
||||||
src/wikipedia_service.cpp
|
src/json_handling/stream_parser.cpp
|
||||||
|
src/wikipedia/wikipedia_service.cpp
|
||||||
src/main.cpp
|
src/main.cpp
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -118,10 +119,10 @@ if(ENABLE_CLANG_FORMAT_TARGETS)
|
|||||||
find_program(CLANG_FORMAT_EXE NAMES clang-format)
|
find_program(CLANG_FORMAT_EXE NAMES clang-format)
|
||||||
if(CLANG_FORMAT_EXE)
|
if(CLANG_FORMAT_EXE)
|
||||||
file(GLOB_RECURSE FORMAT_SOURCES CONFIGURE_DEPENDS
|
file(GLOB_RECURSE FORMAT_SOURCES CONFIGURE_DEPENDS
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/src/*.cpp
|
${CMAKE_CURRENT_SOURCE_DIR}/src/**/*.cpp
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/src/*.cc
|
${CMAKE_CURRENT_SOURCE_DIR}/src/**/*.cc
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/includes/*.h
|
${CMAKE_CURRENT_SOURCE_DIR}/includes/**/*.h
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/includes/*.hpp
|
${CMAKE_CURRENT_SOURCE_DIR}/includes/**/*.hpp
|
||||||
)
|
)
|
||||||
|
|
||||||
add_custom_target(format
|
add_custom_target(format
|
||||||
|
|||||||
2
pipeline/includes/application_options.h
Normal file
2
pipeline/includes/application_options.h
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
111
pipeline/includes/biergarten_data_generator.h
Normal file
111
pipeline/includes/biergarten_data_generator.h
Normal file
@@ -0,0 +1,111 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <memory>
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
#include <unordered_map>
|
||||||
|
|
||||||
|
#include "application_options.h"
|
||||||
|
#include "data_generation/data_generator.h"
|
||||||
|
#include "database/database.h"
|
||||||
|
#include "web_client/web_client.h"
|
||||||
|
#include "wikipedia/wikipedia_service.h"
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * @brief Runtime configuration for the Biergarten pipeline application.
 *
 * Plain aggregate; every field carries a default so a value-initialized
 * instance is fully defined. The string fields other than `commit` default
 * to empty.
 */
struct ApplicationOptions {
  /// @brief Filesystem path of the LLM model file (gguf format).
  std::string modelPath{};

  /// @brief Directory that holds the cached JSON and database files.
  std::string cacheDir{};

  /// @brief LLM sampling temperature (higher = more random).
  float temperature{0.8f};

  /// @brief LLM nucleus-sampling top-p value (higher = more random).
  float topP{0.92f};

  /// @brief Sampler seed: -1 requests a random seed, otherwise non-negative.
  int seed{-1};

  /// @brief Git commit hash used for database consistency.
  std::string commit{"c5eb7772"};
};
|
||||||
|
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Main data generator class for the Biergarten pipeline.
|
||||||
|
*
|
||||||
|
* This class encapsulates the core logic for generating brewery data.
|
||||||
|
* It handles database initialization, data loading/downloading, and brewery generation.
|
||||||
|
*/
|
||||||
|
class BiergartenDataGenerator {
|
||||||
|
public:
|
||||||
|
/**
|
||||||
|
* @brief Construct a BiergartenDataGenerator with injected dependencies.
|
||||||
|
*
|
||||||
|
* @param options Application configuration options.
|
||||||
|
* @param webClient HTTP client for downloading data.
|
||||||
|
* @param database SQLite database instance.
|
||||||
|
*/
|
||||||
|
BiergartenDataGenerator(const ApplicationOptions &options,
|
||||||
|
std::shared_ptr<IWebClient> webClient,
|
||||||
|
SqliteDatabase &database);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Run the data generation pipeline.
|
||||||
|
*
|
||||||
|
* Performs the following steps:
|
||||||
|
* 1. Initialize database
|
||||||
|
* 2. Download geographic data if needed
|
||||||
|
* 3. Initialize the generator (LLM or Mock)
|
||||||
|
* 4. Generate brewery data for sample cities
|
||||||
|
*
|
||||||
|
* @return 0 on success, 1 on failure.
|
||||||
|
*/
|
||||||
|
int Run();
|
||||||
|
|
||||||
|
private:
|
||||||
|
/// @brief Immutable application options.
|
||||||
|
const ApplicationOptions options_;
|
||||||
|
|
||||||
|
/// @brief Shared HTTP client dependency.
|
||||||
|
std::shared_ptr<IWebClient> webClient_;
|
||||||
|
|
||||||
|
/// @brief Database dependency.
|
||||||
|
SqliteDatabase &database_;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Initialize the data generator based on options.
|
||||||
|
*
|
||||||
|
* Creates either a MockGenerator (if no model path) or LlamaGenerator.
|
||||||
|
*
|
||||||
|
* @return A unique_ptr to the initialized generator.
|
||||||
|
*/
|
||||||
|
std::unique_ptr<IDataGenerator> InitializeGenerator();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Download and load geographic data if not cached.
|
||||||
|
*/
|
||||||
|
void LoadGeographicData();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Generate sample breweries for demonstration.
|
||||||
|
*/
|
||||||
|
void GenerateSampleBreweries();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Helper struct to store generated brewery data.
|
||||||
|
*/
|
||||||
|
struct GeneratedBrewery {
|
||||||
|
int cityId;
|
||||||
|
std::string cityName;
|
||||||
|
BreweryResult brewery;
|
||||||
|
};
|
||||||
|
|
||||||
|
/// @brief Stores generated brewery data.
|
||||||
|
std::vector<GeneratedBrewery> generatedBreweries_;
|
||||||
|
};
|
||||||
@@ -5,7 +5,7 @@
|
|||||||
#include <stdexcept>
|
#include <stdexcept>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
#include "web_client.h"
|
#include "web_client/web_client.h"
|
||||||
|
|
||||||
/// @brief Downloads and caches source geography JSON payloads.
|
/// @brief Downloads and caches source geography JSON payloads.
|
||||||
class DataDownloader {
|
class DataDownloader {
|
||||||
@@ -3,7 +3,7 @@
|
|||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
#include "data_generator.h"
|
#include "data_generation/data_generator.h"
|
||||||
|
|
||||||
struct llama_model;
|
struct llama_model;
|
||||||
struct llama_context;
|
struct llama_context;
|
||||||
@@ -1,6 +1,6 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include "data_generator.h"
|
#include "data_generation/data_generator.h"
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
@@ -1,7 +1,7 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include "database.h"
|
#include "database/database.h"
|
||||||
#include "stream_parser.h"
|
#include "json_handling/stream_parser.h"
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
/// @brief Loads world-city JSON data into SQLite through streaming parsing.
|
/// @brief Loads world-city JSON data into SQLite through streaming parsing.
|
||||||
@@ -1,6 +1,6 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include "database.h"
|
#include "database/database.h"
|
||||||
#include <functional>
|
#include <functional>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
@@ -1,6 +1,6 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include "web_client.h"
|
#include "web_client/web_client.h"
|
||||||
#include <memory>
|
#include <memory>
|
||||||
|
|
||||||
// RAII for curl_global_init/cleanup.
|
// RAII for curl_global_init/cleanup.
|
||||||
@@ -5,7 +5,7 @@
|
|||||||
#include <string_view>
|
#include <string_view>
|
||||||
#include <unordered_map>
|
#include <unordered_map>
|
||||||
|
|
||||||
#include "web_client.h"
|
#include "web_client/web_client.h"
|
||||||
|
|
||||||
/// @brief Provides cached Wikipedia summary lookups for city and country pairs.
|
/// @brief Provides cached Wikipedia summary lookups for city and country pairs.
|
||||||
class WikipediaService {
|
class WikipediaService {
|
||||||
132
pipeline/src/biergarten_data_generator.cpp
Normal file
132
pipeline/src/biergarten_data_generator.cpp
Normal file
@@ -0,0 +1,132 @@
|
|||||||
|
#include "biergarten_data_generator.h"
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
|
#include <filesystem>
|
||||||
|
#include <unordered_map>
|
||||||
|
|
||||||
|
#include <spdlog/spdlog.h>
|
||||||
|
|
||||||
|
#include "data_generation/data_downloader.h"
|
||||||
|
#include "json_handling/json_loader.h"
|
||||||
|
#include "data_generation/llama_generator.h"
|
||||||
|
#include "data_generation/mock_generator.h"
|
||||||
|
#include "wikipedia/wikipedia_service.h"
|
||||||
|
|
||||||
|
/**
 * @brief Construct the generator from its injected dependencies.
 *
 * @param options   Application configuration; copied into options_.
 * @param webClient Shared HTTP client. Taken by value and moved into the
 *                  member so no additional reference-count update is paid.
 * @param database  Database instance; only a reference is stored, so it
 *                  must outlive this object.
 */
BiergartenDataGenerator::BiergartenDataGenerator(
    const ApplicationOptions &options, std::shared_ptr<IWebClient> webClient,
    SqliteDatabase &database)
    : options_(options), webClient_(std::move(webClient)),
      database_(database) {}
|
||||||
|
|
||||||
|
std::unique_ptr<IDataGenerator> BiergartenDataGenerator::InitializeGenerator() {
|
||||||
|
spdlog::info("Initializing brewery generator...");
|
||||||
|
|
||||||
|
std::unique_ptr<IDataGenerator> generator;
|
||||||
|
if (options_.modelPath.empty()) {
|
||||||
|
generator = std::make_unique<MockGenerator>();
|
||||||
|
spdlog::info("[Generator] Using MockGenerator (no model path provided)");
|
||||||
|
} else {
|
||||||
|
auto llamaGenerator = std::make_unique<LlamaGenerator>();
|
||||||
|
llamaGenerator->setSamplingOptions(options_.temperature, options_.topP,
|
||||||
|
options_.seed);
|
||||||
|
spdlog::info(
|
||||||
|
"[Generator] Using LlamaGenerator: {} (temperature={}, top-p={}, "
|
||||||
|
"seed={})",
|
||||||
|
options_.modelPath, options_.temperature, options_.topP,
|
||||||
|
options_.seed);
|
||||||
|
generator = std::move(llamaGenerator);
|
||||||
|
}
|
||||||
|
generator->load(options_.modelPath);
|
||||||
|
|
||||||
|
return generator;
|
||||||
|
}
|
||||||
|
|
||||||
|
void BiergartenDataGenerator::LoadGeographicData() {
|
||||||
|
std::string jsonPath = options_.cacheDir + "/countries+states+cities.json";
|
||||||
|
std::string dbPath = options_.cacheDir + "/biergarten-pipeline.db";
|
||||||
|
|
||||||
|
bool hasJsonCache = std::filesystem::exists(jsonPath);
|
||||||
|
bool hasDbCache = std::filesystem::exists(dbPath);
|
||||||
|
|
||||||
|
spdlog::info("Initializing SQLite database at {}...", dbPath);
|
||||||
|
database_.Initialize(dbPath);
|
||||||
|
|
||||||
|
if (hasDbCache && hasJsonCache) {
|
||||||
|
spdlog::info("[Pipeline] Cache hit: skipping download and parse");
|
||||||
|
} else {
|
||||||
|
spdlog::info("\n[Pipeline] Downloading geographic data from GitHub...");
|
||||||
|
DataDownloader downloader(webClient_);
|
||||||
|
downloader.DownloadCountriesDatabase(jsonPath, options_.commit);
|
||||||
|
|
||||||
|
JsonLoader::LoadWorldCities(jsonPath, database_);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void BiergartenDataGenerator::GenerateSampleBreweries() {
|
||||||
|
auto generator = InitializeGenerator();
|
||||||
|
WikipediaService wikipediaService(webClient_);
|
||||||
|
|
||||||
|
spdlog::info("\n=== GEOGRAPHIC DATA OVERVIEW ===");
|
||||||
|
|
||||||
|
auto countries = database_.QueryCountries(50);
|
||||||
|
auto states = database_.QueryStates(50);
|
||||||
|
auto cities = database_.QueryCities();
|
||||||
|
|
||||||
|
// Build a quick map of country id -> name for per-city lookups.
|
||||||
|
auto allCountries = database_.QueryCountries(0);
|
||||||
|
std::unordered_map<int, std::string> countryMap;
|
||||||
|
for (const auto &c : allCountries)
|
||||||
|
countryMap[c.id] = c.name;
|
||||||
|
|
||||||
|
spdlog::info("\nTotal records loaded:");
|
||||||
|
spdlog::info(" Countries: {}", database_.QueryCountries(0).size());
|
||||||
|
spdlog::info(" States: {}", database_.QueryStates(0).size());
|
||||||
|
spdlog::info(" Cities: {}", cities.size());
|
||||||
|
|
||||||
|
generatedBreweries_.clear();
|
||||||
|
const size_t sampleCount = std::min(size_t(30), cities.size());
|
||||||
|
|
||||||
|
spdlog::info("\n=== SAMPLE BREWERY GENERATION ===");
|
||||||
|
for (size_t i = 0; i < sampleCount; i++) {
|
||||||
|
const auto &city = cities[i];
|
||||||
|
const int cityId = city.id;
|
||||||
|
const std::string cityName = city.name;
|
||||||
|
|
||||||
|
std::string localCountry;
|
||||||
|
const auto countryIt = countryMap.find(city.countryId);
|
||||||
|
if (countryIt != countryMap.end()) {
|
||||||
|
localCountry = countryIt->second;
|
||||||
|
}
|
||||||
|
|
||||||
|
const std::string regionContext =
|
||||||
|
wikipediaService.GetSummary(cityName, localCountry);
|
||||||
|
spdlog::debug("[Pipeline] Region context for {}: {}", cityName,
|
||||||
|
regionContext);
|
||||||
|
|
||||||
|
auto brewery =
|
||||||
|
generator->generateBrewery(cityName, localCountry, regionContext);
|
||||||
|
generatedBreweries_.push_back({cityId, cityName, brewery});
|
||||||
|
}
|
||||||
|
|
||||||
|
spdlog::info("\n=== GENERATED DATA DUMP ===");
|
||||||
|
for (size_t i = 0; i < generatedBreweries_.size(); i++) {
|
||||||
|
const auto &entry = generatedBreweries_[i];
|
||||||
|
spdlog::info("{}. city_id={} city=\"{}\"", i + 1, entry.cityId,
|
||||||
|
entry.cityName);
|
||||||
|
spdlog::info(" brewery_name=\"{}\"", entry.brewery.name);
|
||||||
|
spdlog::info(" brewery_description=\"{}\"", entry.brewery.description);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
int BiergartenDataGenerator::Run() {
|
||||||
|
try {
|
||||||
|
LoadGeographicData();
|
||||||
|
GenerateSampleBreweries();
|
||||||
|
|
||||||
|
spdlog::info("\nOK: Pipeline completed successfully");
|
||||||
|
return 0;
|
||||||
|
} catch (const std::exception &e) {
|
||||||
|
spdlog::error("ERROR: Pipeline failed: {}", e.what());
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -1,5 +1,5 @@
|
|||||||
#include "data_downloader.h"
|
#include "data_generation/data_downloader.h"
|
||||||
#include "web_client.h"
|
#include "web_client/web_client.h"
|
||||||
#include <filesystem>
|
#include <filesystem>
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
#include <spdlog/spdlog.h>
|
#include <spdlog/spdlog.h>
|
||||||
@@ -11,7 +11,7 @@
|
|||||||
#include <boost/json.hpp>
|
#include <boost/json.hpp>
|
||||||
#include <spdlog/spdlog.h>
|
#include <spdlog/spdlog.h>
|
||||||
|
|
||||||
#include "llama_generator.h"
|
#include "data_generation/llama_generator.h"
|
||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
|
|
||||||
@@ -1,4 +1,4 @@
|
|||||||
#include "mock_generator.h"
|
#include "data_generation/mock_generator.h"
|
||||||
|
|
||||||
#include <functional>
|
#include <functional>
|
||||||
#include <spdlog/spdlog.h>
|
#include <spdlog/spdlog.h>
|
||||||
@@ -1,4 +1,4 @@
|
|||||||
#include "database.h"
|
#include "database/database.h"
|
||||||
#include <spdlog/spdlog.h>
|
#include <spdlog/spdlog.h>
|
||||||
#include <stdexcept>
|
#include <stdexcept>
|
||||||
|
|
||||||
@@ -2,8 +2,8 @@
|
|||||||
|
|
||||||
#include <spdlog/spdlog.h>
|
#include <spdlog/spdlog.h>
|
||||||
|
|
||||||
#include "json_loader.h"
|
#include "json_handling/json_loader.h"
|
||||||
#include "stream_parser.h"
|
#include "json_handling/stream_parser.h"
|
||||||
|
|
||||||
void JsonLoader::LoadWorldCities(const std::string &jsonPath,
|
void JsonLoader::LoadWorldCities(const std::string &jsonPath,
|
||||||
SqliteDatabase &db) {
|
SqliteDatabase &db) {
|
||||||
@@ -5,8 +5,8 @@
|
|||||||
#include <boost/json/basic_parser_impl.hpp>
|
#include <boost/json/basic_parser_impl.hpp>
|
||||||
#include <spdlog/spdlog.h>
|
#include <spdlog/spdlog.h>
|
||||||
|
|
||||||
#include "database.h"
|
#include "database/database.h"
|
||||||
#include "stream_parser.h"
|
#include "json_handling/stream_parser.h"
|
||||||
|
|
||||||
class CityRecordHandler {
|
class CityRecordHandler {
|
||||||
friend class boost::json::basic_parser<CityRecordHandler>;
|
friend class boost::json::basic_parser<CityRecordHandler>;
|
||||||
@@ -1,163 +1,76 @@
|
|||||||
#include <algorithm>
|
|
||||||
#include <filesystem>
|
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
#include <unordered_map>
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
#include <boost/program_options.hpp>
|
#include <boost/program_options.hpp>
|
||||||
#include <spdlog/spdlog.h>
|
#include <spdlog/spdlog.h>
|
||||||
|
|
||||||
#include "curl_web_client.h"
|
#include "application_options.h"
|
||||||
#include "data_downloader.h"
|
#include "biergarten_data_generator.h"
|
||||||
#include "data_generator.h"
|
#include "web_client/curl_web_client.h"
|
||||||
#include "database.h"
|
#include "database/database.h"
|
||||||
#include "json_loader.h"
|
|
||||||
#include "llama_generator.h"
|
|
||||||
#include "mock_generator.h"
|
|
||||||
#include "wikipedia_service.h"
|
|
||||||
|
|
||||||
namespace po = boost::program_options;
|
namespace po = boost::program_options;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Parse command-line arguments into ApplicationOptions.
|
||||||
|
*
|
||||||
|
* @param argc Command-line argument count.
|
||||||
|
* @param argv Command-line arguments.
|
||||||
|
* @param options Output ApplicationOptions struct.
|
||||||
|
* @return true if parsing succeeded and help was not requested, false otherwise.
|
||||||
|
*/
|
||||||
|
bool ParseArguments(int argc, char **argv, ApplicationOptions &options) {
|
||||||
|
po::options_description desc("Pipeline Options");
|
||||||
|
desc.add_options()("help,h", "Produce help message")(
|
||||||
|
"model,m", po::value<std::string>()->default_value(""),
|
||||||
|
"Path to LLM model (gguf)")(
|
||||||
|
"cache-dir,c", po::value<std::string>()->default_value("/tmp"),
|
||||||
|
"Directory for cached JSON")(
|
||||||
|
"temperature", po::value<float>()->default_value(0.8f),
|
||||||
|
"Sampling temperature (higher = more random)")(
|
||||||
|
"top-p", po::value<float>()->default_value(0.92f),
|
||||||
|
"Nucleus sampling top-p in (0,1] (higher = more random)")(
|
||||||
|
"seed", po::value<int>()->default_value(-1),
|
||||||
|
"Sampler seed: -1 for random, otherwise non-negative integer")(
|
||||||
|
"commit", po::value<std::string>()->default_value("c5eb7772"),
|
||||||
|
"Git commit hash for DB consistency");
|
||||||
|
|
||||||
|
po::variables_map vm;
|
||||||
|
po::store(po::parse_command_line(argc, argv, desc), vm);
|
||||||
|
po::notify(vm);
|
||||||
|
|
||||||
|
if (vm.count("help")) {
|
||||||
|
std::cout << desc << "\n";
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
options.modelPath = vm["model"].as<std::string>();
|
||||||
|
options.cacheDir = vm["cache-dir"].as<std::string>();
|
||||||
|
options.temperature = vm["temperature"].as<float>();
|
||||||
|
options.topP = vm["top-p"].as<float>();
|
||||||
|
options.seed = vm["seed"].as<int>();
|
||||||
|
options.commit = vm["commit"].as<std::string>();
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
int main(int argc, char *argv[]) {
|
int main(int argc, char *argv[]) {
|
||||||
try {
|
try {
|
||||||
const CurlGlobalState curl_state;
|
const CurlGlobalState curl_state;
|
||||||
|
|
||||||
po::options_description desc("Pipeline Options");
|
ApplicationOptions options;
|
||||||
desc.add_options()("help,h", "Produce help message")(
|
if (!ParseArguments(argc, argv, options)) {
|
||||||
"model,m", po::value<std::string>()->default_value(""),
|
|
||||||
"Path to LLM model (gguf)")(
|
|
||||||
"cache-dir,c", po::value<std::string>()->default_value("/tmp"),
|
|
||||||
"Directory for cached JSON")(
|
|
||||||
"temperature", po::value<float>()->default_value(0.8f),
|
|
||||||
"Sampling temperature (higher = more random)")(
|
|
||||||
"top-p", po::value<float>()->default_value(0.92f),
|
|
||||||
"Nucleus sampling top-p in (0,1] (higher = more random)")(
|
|
||||||
"seed", po::value<int>()->default_value(-1),
|
|
||||||
"Sampler seed: -1 for random, otherwise non-negative integer")(
|
|
||||||
"commit", po::value<std::string>()->default_value("c5eb7772"),
|
|
||||||
"Git commit hash for DB consistency");
|
|
||||||
|
|
||||||
po::variables_map vm;
|
|
||||||
po::store(po::parse_command_line(argc, argv, desc), vm);
|
|
||||||
po::notify(vm);
|
|
||||||
|
|
||||||
if (vm.count("help")) {
|
|
||||||
std::cout << desc << "\n";
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string modelPath = vm["model"].as<std::string>();
|
|
||||||
std::string cacheDir = vm["cache-dir"].as<std::string>();
|
|
||||||
float temperature = vm["temperature"].as<float>();
|
|
||||||
float topP = vm["top-p"].as<float>();
|
|
||||||
int seed = vm["seed"].as<int>();
|
|
||||||
std::string commit = vm["commit"].as<std::string>();
|
|
||||||
|
|
||||||
std::string jsonPath = cacheDir + "/countries+states+cities.json";
|
|
||||||
std::string dbPath = cacheDir + "/biergarten-pipeline.db";
|
|
||||||
|
|
||||||
bool hasJsonCache = std::filesystem::exists(jsonPath);
|
|
||||||
bool hasDbCache = std::filesystem::exists(dbPath);
|
|
||||||
|
|
||||||
auto webClient = std::make_shared<CURLWebClient>();
|
auto webClient = std::make_shared<CURLWebClient>();
|
||||||
|
SqliteDatabase database;
|
||||||
|
|
||||||
SqliteDatabase db;
|
BiergartenDataGenerator generator(options, webClient, database);
|
||||||
|
return generator.Run();
|
||||||
spdlog::info("Initializing SQLite database at {}...", dbPath);
|
|
||||||
db.Initialize(dbPath);
|
|
||||||
|
|
||||||
if (hasDbCache && hasJsonCache) {
|
|
||||||
spdlog::info("[Pipeline] Cache hit: skipping download and parse");
|
|
||||||
} else {
|
|
||||||
spdlog::info("\n[Pipeline] Downloading geographic data from GitHub...");
|
|
||||||
DataDownloader downloader(webClient);
|
|
||||||
downloader.DownloadCountriesDatabase(jsonPath, commit);
|
|
||||||
|
|
||||||
JsonLoader::LoadWorldCities(jsonPath, db);
|
|
||||||
}
|
|
||||||
|
|
||||||
spdlog::info("Initializing brewery generator...");
|
|
||||||
std::unique_ptr<IDataGenerator> generator;
|
|
||||||
if (modelPath.empty()) {
|
|
||||||
generator = std::make_unique<MockGenerator>();
|
|
||||||
spdlog::info("[Generator] Using MockGenerator (no model path provided)");
|
|
||||||
} else {
|
|
||||||
auto llamaGenerator = std::make_unique<LlamaGenerator>();
|
|
||||||
llamaGenerator->setSamplingOptions(temperature, topP, seed);
|
|
||||||
spdlog::info(
|
|
||||||
"[Generator] Using LlamaGenerator: {} (temperature={}, top-p={}, "
|
|
||||||
"seed={})",
|
|
||||||
modelPath, temperature, topP, seed);
|
|
||||||
generator = std::move(llamaGenerator);
|
|
||||||
}
|
|
||||||
generator->load(modelPath);
|
|
||||||
|
|
||||||
WikipediaService wikipediaService(webClient);
|
|
||||||
|
|
||||||
spdlog::info("\n=== GEOGRAPHIC DATA OVERVIEW ===");
|
|
||||||
|
|
||||||
auto countries = db.QueryCountries(50);
|
|
||||||
auto states = db.QueryStates(50);
|
|
||||||
auto cities = db.QueryCities();
|
|
||||||
|
|
||||||
// Build a quick map of country id -> name for per-city lookups.
|
|
||||||
auto allCountries = db.QueryCountries(0);
|
|
||||||
std::unordered_map<int, std::string> countryMap;
|
|
||||||
for (const auto &c : allCountries)
|
|
||||||
countryMap[c.id] = c.name;
|
|
||||||
|
|
||||||
spdlog::info("\nTotal records loaded:");
|
|
||||||
spdlog::info(" Countries: {}", db.QueryCountries(0).size());
|
|
||||||
spdlog::info(" States: {}", db.QueryStates(0).size());
|
|
||||||
spdlog::info(" Cities: {}", cities.size());
|
|
||||||
|
|
||||||
struct GeneratedBrewery {
|
|
||||||
int cityId;
|
|
||||||
std::string cityName;
|
|
||||||
BreweryResult brewery;
|
|
||||||
};
|
|
||||||
|
|
||||||
std::vector<GeneratedBrewery> generatedBreweries;
|
|
||||||
const size_t sampleCount = std::min(size_t(30), cities.size());
|
|
||||||
|
|
||||||
spdlog::info("\n=== SAMPLE BREWERY GENERATION ===");
|
|
||||||
for (size_t i = 0; i < sampleCount; i++) {
|
|
||||||
const auto &city = cities[i];
|
|
||||||
const int cityId = city.id;
|
|
||||||
const std::string cityName = city.name;
|
|
||||||
|
|
||||||
std::string localCountry;
|
|
||||||
const auto countryIt = countryMap.find(city.countryId);
|
|
||||||
if (countryIt != countryMap.end()) {
|
|
||||||
localCountry = countryIt->second;
|
|
||||||
}
|
|
||||||
|
|
||||||
const std::string regionContext =
|
|
||||||
wikipediaService.GetSummary(cityName, localCountry);
|
|
||||||
spdlog::debug("[Pipeline] Region context for {}: {}", cityName,
|
|
||||||
regionContext);
|
|
||||||
|
|
||||||
auto brewery =
|
|
||||||
generator->generateBrewery(cityName, localCountry, regionContext);
|
|
||||||
generatedBreweries.push_back({cityId, cityName, brewery});
|
|
||||||
}
|
|
||||||
|
|
||||||
spdlog::info("\n=== GENERATED DATA DUMP ===");
|
|
||||||
for (size_t i = 0; i < generatedBreweries.size(); i++) {
|
|
||||||
const auto &entry = generatedBreweries[i];
|
|
||||||
spdlog::info("{}. city_id={} city=\"{}\"", i + 1, entry.cityId,
|
|
||||||
entry.cityName);
|
|
||||||
spdlog::info(" brewery_name=\"{}\"", entry.brewery.name);
|
|
||||||
spdlog::info(" brewery_description=\"{}\"", entry.brewery.description);
|
|
||||||
}
|
|
||||||
|
|
||||||
spdlog::info("\nOK: Pipeline completed successfully");
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
} catch (const std::exception &e) {
|
} catch (const std::exception &e) {
|
||||||
spdlog::error("ERROR: Pipeline failed: {}", e.what());
|
spdlog::error("ERROR: Application failed: {}", e.what());
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
#include "curl_web_client.h"
|
#include "web_client/curl_web_client.h"
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
#include <curl/curl.h>
|
#include <curl/curl.h>
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
@@ -1,4 +1,4 @@
|
|||||||
#include "wikipedia_service.h"
|
#include "wikipedia/wikipedia_service.h"
|
||||||
#include <boost/json.hpp>
|
#include <boost/json.hpp>
|
||||||
#include <spdlog/spdlog.h>
|
#include <spdlog/spdlog.h>
|
||||||
|
|
||||||
Reference in New Issue
Block a user