Begin work on biergarten data generator pipeline

This commit is contained in:
Aaron Po
2026-04-01 19:33:50 -04:00
parent 581863d69b
commit 23e2199b6b
12 changed files with 1323 additions and 0 deletions

View File

@@ -0,0 +1,111 @@
/**
* @file data_downloader.h
* @brief Download geographic data from GitHub repositories using libcurl.
*
* Provides functionality to fetch JSON data from GitHub using libcurl, with
* support for commit-based versioning to ensure reproducible builds. Downloads
* are cached to avoid repeated network requests.
*
* Example usage:
* @code
* DataDownloader downloader;
* std::string jsonPath = downloader.DownloadCountriesDatabase(
* "/tmp/countries-data.json", // local cache path
* "c5eb7772" // optional commit hash or HEAD
* );
* // Now use jsonPath with JsonLoader::LoadWorldCities(jsonPath, db)
* @endcode
*/
#ifndef DATA_DOWNLOADER_H
#define DATA_DOWNLOADER_H
#include <stdexcept>
#include <string>
/**
 * @class DataDownloader
 * @brief Fetches the geographic JSON dataset from GitHub and caches it locally.
 *
 * Hides the libcurl transfer behind a single call so that callers only ever
 * handle a local file path. Pinning the request to a commit keeps the
 * downloaded data reproducible between runs.
 *
 * Calls are synchronous: DownloadCountriesDatabase() either hands back the
 * saved path or throws, so the caller waits for the transfer to complete.
 *
 * @note libcurl must be available at runtime.
 * @note Files are served from GitHub's raw-content host
 *       (raw.githubusercontent.com).
 */
class DataDownloader {
 public:
  /**
   * @brief Constructs a downloader that is ready to use immediately.
   *
   * The class declares no data members, so there is no configuration to
   * supply before the first download.
   */
  DataDownloader();

  /**
   * @brief Destroys the downloader.
   *
   * Callers have no explicit cleanup step to perform beforehand.
   */
  ~DataDownloader();

  /**
   * @brief Obtain the countries+states+cities JSON file, downloading it from
   *        the dr5hn/countries-states-cities-database repository if needed.
   *
   * When a file is already present at @p cachePath the network is not
   * touched and that file is reused. Otherwise the data is requested from:
   * @verbatim
   * https://raw.githubusercontent.com/dr5hn/countries-states-cities-database/
   * {commit}/json/countries+states+cities.json
   * @endverbatim
   *
   * NOTE(review): nothing in this interface ties an existing cache file to
   * @p commit, so a file left over from a different commit is presumably
   * reused as-is — confirm against the implementation.
   *
   * @param cachePath Local filesystem path of the cached JSON file. The
   *                  download is skipped when this file already exists.
   * @param commit    Git commit hash or branch name (default: "c5eb7772").
   *                  Examples: "HEAD", "main", "c5eb7772", or a full
   *                  40-character SHA-1.
   *
   * @return Path of the JSON file on disk (identical to @p cachePath).
   *
   * @throws std::runtime_error if:
   *   - the network download fails
   *   - the file cannot be written to @p cachePath
   *   - the commit is unknown to GitHub (HTTP 404)
   *
   * Using the pinned default commit (2026-03-28 export):
   * @code
   * std::string path =
   *     downloader.DownloadCountriesDatabase("/tmp/data.json");
   * @endcode
   *
   * Tracking a branch instead:
   * @code
   * std::string path = downloader.DownloadCountriesDatabase(
   *     "/tmp/data.json",
   *     "main"  // latest data on the main branch
   * );
   * @endcode
   */
  std::string DownloadCountriesDatabase(
      const std::string &cachePath,
      const std::string &commit = "c5eb7772"  // pinned: 2026-03-28 export
  );

 private:
  /**
   * @brief Report whether a file is already present at @p filePath.
   *
   * Backs the cache-hit check: an existing file means no download occurs.
   *
   * @param filePath Path to probe.
   * @return True if the file exists and is readable, false otherwise.
   */
  bool FileExists(const std::string &filePath) const;
};
#endif // DATA_DOWNLOADER_H

View File

@@ -0,0 +1,102 @@
#pragma once
#include <mutex>
#include <sqlite3.h>
#include <string>
#include <vector>
/// @struct Country
/// @brief Plain record describing one country and its ISO codes.
///
/// The numeric id carries a default member initializer so that a
/// default-constructed Country never exposes an indeterminate integer.
/// Aggregate initialization (`Country{1, "Canada", "CA", "CAN"}`) keeps
/// working under C++14 and later.
struct Country {
  int id = 0;        ///< Unique country identifier (0 until assigned)
  std::string name;  ///< Country name
  std::string iso2;  ///< 2-letter ISO code (e.g., "US", "CA")
  std::string iso3;  ///< 3-letter ISO code (e.g., "USA", "CAN")
};
/// @struct State
/// @brief Plain record describing one state or province.
///
/// Both integer fields carry default member initializers so that a
/// default-constructed State never exposes indeterminate values.
/// Aggregate initialization (`State{1, "Ontario", "ON", 1}`) keeps working
/// under C++14 and later.
struct State {
  int id = 0;         ///< Unique state identifier (0 until assigned)
  std::string name;   ///< State/province name
  std::string iso2;   ///< 2-letter state code (e.g., "CA", "ON")
  int countryId = 0;  ///< Identifier of the parent country (0 until assigned)
};
/**
 * @class SqliteDatabase
 * @brief Mutex-guarded wrapper around an in-memory SQLite database holding
 *        countries, states and cities.
 *
 * Write operations are serialized through a mutex so that several threads
 * can insert concurrently. Inserts use INSERT OR IGNORE, which makes
 * repeating the same insert harmless.
 *
 * The object owns the raw SQLite connection handle, so it is neither
 * copyable nor movable: a second owner would close the same connection
 * twice. This was already implied by the std::mutex member; it is now
 * spelled out explicitly so the guarantee does not depend on that member.
 *
 * Schema Relationships:
 *   countries (id, name, iso2, iso3)
 *     ↓ (one-to-many)
 *   states (id, country_id, name, iso2)
 *     ↓ (one-to-many)
 *   cities (id, state_id, country_id, name, latitude, longitude)
 */
class SqliteDatabase {
 private:
  sqlite3 *db = nullptr;  ///< SQLite connection handle; starts out null
  std::mutex dbMutex;     ///< Guards database operations against data races

  /// @brief Creates the schema with three related tables and foreign keys
  void InitializeSchema();

 public:
  /// @brief Creates an unopened wrapper; call Initialize() before use.
  /// @note Declared explicitly because the deleted copy/move operations
  ///       below would otherwise suppress the implicit default constructor.
  SqliteDatabase() = default;

  // The connection handle has exactly one owner: no copies, no moves.
  SqliteDatabase(const SqliteDatabase &) = delete;
  SqliteDatabase &operator=(const SqliteDatabase &) = delete;
  SqliteDatabase(SqliteDatabase &&) = delete;
  SqliteDatabase &operator=(SqliteDatabase &&) = delete;

  /// @brief Destructor: safely closes the database connection
  ~SqliteDatabase();

  /// @brief Opens an in-memory SQLite database and initializes the schema
  void Initialize();

  /// @brief Inserts a country record
  /// @param id Unique country identifier
  /// @param name Country name
  /// @param iso2 2-letter ISO country code
  /// @param iso3 3-letter ISO country code
  /// @note Thread-safe: uses mutex lock. Idempotent: INSERT OR IGNORE
  ///       prevents duplicates
  void InsertCountry(int id, const std::string &name, const std::string &iso2,
                     const std::string &iso3);

  /// @brief Inserts a state/province record
  /// @param id Unique state identifier
  /// @param countryId Foreign key reference to parent country
  /// @param name State/province name
  /// @param iso2 2-letter state code (e.g., "CA", "ON")
  /// @note Thread-safe and idempotent via mutex and INSERT OR IGNORE
  void InsertState(int id, int countryId, const std::string &name,
                   const std::string &iso2);

  /// @brief Inserts a city record with geographic coordinates
  /// @param id Unique city identifier
  /// @param stateId Foreign key reference to parent state
  /// @param countryId Foreign key reference to parent country
  /// @param name City name
  /// @param latitude Geographic latitude coordinate (WGS84)
  /// @param longitude Geographic longitude coordinate (WGS84)
  /// @note Thread-safe and idempotent. Called by multithreaded JSON loader.
  void InsertCity(int id, int stateId, int countryId, const std::string &name,
                  double latitude, double longitude);

  /// @brief Queries all cities from the database
  /// @return Vector of (city_id, city_name) pairs sorted alphabetically
  std::vector<std::pair<int, std::string>> QueryCities();

  /// @brief Queries all countries from the database with ISO codes
  /// @param limit Maximum number of records to return (0 = all)
  /// @return Vector of Country structs (includes id, name, iso2, iso3) sorted
  ///         alphabetically
  std::vector<Country> QueryCountries(int limit = 0);

  /// @brief Queries all states from the database with ISO codes
  /// @param limit Maximum number of records to return (0 = all)
  /// @return Vector of State structs (includes id, name, iso2, countryId)
  ///         sorted alphabetically
  std::vector<State> QueryStates(int limit = 0);
};

View File

@@ -0,0 +1,59 @@
#pragma once
#include <string>
#include <vector>
/**
 * @class LlamaBreweryGenerator
 * @brief Produces a brewery name and description for a given city.
 *
 * For now this is a mock: output is assembled from the fixed word lists
 * below rather than from a language model. The LoadModel() /
 * GenerateBrewery() interface is intended to stay the same once real
 * llama.cpp inference replaces the mock.
 *
 * According to the original design notes, the mock selects templates by
 * hashing the city name with std::hash so that results are repeatable in
 * tests. NOTE(review): std::hash is only required to be consistent within a
 * single program execution, so repeatability across platforms or standard
 * libraries should be confirmed.
 *
 * @note The word lists are const non-static members, so every instance
 *       carries its own copy and the class is not copy-assignable.
 */
class LlamaBreweryGenerator {
 private:
  /// First half of a generated brewery name.
  const std::vector<std::string> breweryAdjectives = {
      "Craft",
      "Heritage",
      "Local",
      "Artisan",
      "Pioneer",
      "Golden",
      "Modern",
      "Classic"};

  /// Second half of a generated brewery name.
  const std::vector<std::string> breweryNouns = {
      "Brewing Co.",
      "Brewery",
      "Bier Haus",
      "Taproom",
      "Works",
      "House",
      "Fermentery",
      "Ale Co."};

  /// Hand-written description templates.
  const std::vector<std::string> descriptions = {
      "Handcrafted pale ales and seasonal IPAs with local ingredients.",
      "Traditional lagers and experimental sours in small batches.",
      "Award-winning stouts and wildly hoppy blonde ales.",
      "Craft brewery specializing in Belgian-style triples and dark porters.",
      "Modern brewery blending tradition with bold experimental flavors."};

 public:
  /// @struct Brewery
  /// @brief Result record returned by GenerateBrewery()
  struct Brewery {
    std::string name;         ///< Brewery name (e.g., "Craft Brewing Co.")
    std::string description;  ///< Short blurb about the brewery's offerings
  };

  /// @brief Loads a language model (currently mocked)
  /// @param modelPath Path to a GGUF model file (not used by the mock)
  /// @note The real implementation is meant to load a llama.cpp model into
  ///       memory
  void LoadModel(const std::string &modelPath);

  /// @brief Builds a brewery name and description for one city
  /// @param cityName City to generate a brewery for
  /// @param seed Integer seed (drives the deterministic mock output)
  /// @return Brewery holding the generated name and description
  /// @note Documented as deterministic: the same cityName + seed yields the
  ///       same brewery
  Brewery GenerateBrewery(const std::string &cityName, int seed);
};

View File

@@ -0,0 +1,85 @@
#pragma once
#include "database.h"
#include <nlohmann/json.hpp>
#include <string>
using json = nlohmann::json;
/**
 * @class JsonLoader
 * @brief Populates a SqliteDatabase with countries, states and cities read
 *        from a JSON file.
 *
 * The loader parses the source file and inserts every record into the
 * database. City records form by far the largest set, so they are split
 * into chunks and inserted by parallel worker threads.
 *
 * Expected input layout:
 * @code
 * {
 *   "countries": [
 *     {"id": 1, "name": "Canada", "iso2": "CA", "iso3": "CAN"},
 *     ...
 *   ],
 *   "states": [
 *     {"id": 1, "country_id": 1, "name": "Ontario", "iso2": "ON"},
 *     ...
 *   ],
 *   "cities": [
 *     {"id": 1, "state_id": 1, "country_id": 1, "name": "Toronto",
 *      "latitude": 43.6532, "longitude": -79.3832},
 *     ...
 *   ]
 * }
 * @endcode
 *
 * NOTE(review): the dr5hn countries+states+cities.json export appears to be
 * a nested array (country -> states -> cities) rather than three flat
 * arrays — confirm the loader accepts the file that is actually downloaded.
 *
 * Performance notes:
 *   - The whole JSON file is read into memory (nlohmann/json parser)
 *   - Countries: typically 200+ records
 *   - States: typically 3000+ records
 *   - Cities: typically 50,000+ records (the dominant dataset)
 *   - The worker count defaults to the number of CPU cores
 *
 * Threading model:
 *   - Cities are divided into N chunks (N = CPU core count)
 *   - Each worker walks its own chunk sequentially
 *   - The database's mutex makes the concurrent inserts safe
 *
 * NOTE(review): SqliteDatabase is documented as serializing writes behind a
 * single mutex, so the workers presumably contend on every insert — measure
 * before relying on a parallel speed-up.
 *
 * Typical call sequence:
 * @code
 * SqliteDatabase db;
 * db.Initialize();
 * JsonLoader::LoadWorldCities("../data/world_city_data.json", db);
 * // db now holds all countries, states, and cities
 * @endcode
 */
class JsonLoader {
 public:
  /// @brief Reads the world geographic JSON file and fills the database
  ///
  /// Steps:
  ///   1. Read and parse the entire JSON file
  ///   2. Insert every country (typically 200-250 records)
  ///   3. Insert every state/province (typically 3000+ records)
  ///   4. Start worker threads that insert the cities (typically 50,000+)
  ///   5. Wait for all workers to finish
  ///   6. Print statistics about the loaded data
  ///
  /// @param jsonPath Filesystem path to world_city_data.json
  /// @param db Initialized SqliteDatabase that receives the records
  ///
  /// @throws std::runtime_error if the JSON file cannot be read or parsed
  /// @throws std::runtime_error if database insertion fails
  ///
  /// Sample console output:
  /// @code
  /// Loading JSON: ../data/world_city_data.json
  /// Loaded countries: 250
  /// Loaded states: 3500
  /// Loaded cities: 52000
  /// ✓ World city data loaded successfully
  /// @endcode
  static void LoadWorldCities(const std::string &jsonPath, SqliteDatabase &db);
};