mirror of
https://github.com/aaronpo97/the-biergarten-app.git
synced 2026-04-05 18:09:04 +00:00
Begin work on biergarten data generator pipeline
This commit is contained in:
111
pipeline/includes/data_downloader.h
Normal file
111
pipeline/includes/data_downloader.h
Normal file
@@ -0,0 +1,111 @@
|
||||
/**
|
||||
* @file data_downloader.h
|
||||
* @brief Download geographic data from GitHub repositories using libcurl.
|
||||
*
|
||||
* Provides functionality to fetch JSON data from GitHub using libcurl, with
|
||||
* support for commit-based versioning to ensure reproducible builds. Downloads
|
||||
* are cached to avoid repeated network requests.
|
||||
*
|
||||
* Example usage:
|
||||
* @code
|
||||
* DataDownloader downloader;
|
||||
* std::string jsonPath = downloader.DownloadCountriesDatabase(
|
||||
* "/tmp/countries-data.json", // local cache path
|
||||
* "c5eb7772" // optional commit hash or HEAD
|
||||
* );
|
||||
* // Now use jsonPath with JsonLoader::LoadWorldCities(jsonPath, db)
|
||||
* @endcode
|
||||
*/
|
||||
|
||||
#ifndef DATA_DOWNLOADER_H
|
||||
#define DATA_DOWNLOADER_H
|
||||
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
|
||||
/**
 * @class DataDownloader
 * @brief Downloads geographic data from GitHub and caches it on local disk.
 *
 * Encapsulates the libcurl networking used to fetch a pinned revision of the
 * data set so that pipeline runs are reproducible. All methods are
 * synchronous (blocking): a call returns only after its work has completed
 * or an exception has been thrown.
 *
 * @note Requires libcurl to be available at runtime.
 * @note Files are fetched through the GitHub raw content CDN.
 */
class DataDownloader {
public:
  /// @brief Default constructor.
  ///
  /// The downloader carries no configuration and is usable immediately
  /// after construction.
  DataDownloader();

  /// @brief Destructor.
  ///
  /// Callers do not need to perform any explicit cleanup step before the
  /// object is destroyed.
  ~DataDownloader();

  /// @brief Download the countries+states+cities JSON database from GitHub.
  ///
  /// The data comes from the dr5hn/countries-states-cities-database
  /// repository. When a file is already present at @p cachePath it is reused
  /// and no network request is made.
  ///
  /// NOTE(review): as documented, the cache check looks only at
  /// @p cachePath, so passing a different @p commit presumably does not
  /// refresh an existing cache file — confirm against the implementation.
  ///
  /// Download URL format:
  /// @verbatim
  /// https://raw.githubusercontent.com/dr5hn/countries-states-cities-database/
  /// {commit}/json/countries+states+cities.json
  /// @endverbatim
  ///
  /// @param cachePath Local filesystem path where the JSON file is stored.
  ///                  If the file already exists, the download is skipped.
  /// @param commit    Git commit hash or branch name (default: "c5eb7772").
  ///                  Examples: "HEAD", "main", "c5eb7772", or a full
  ///                  40-character commit SHA.
  ///
  /// @return The file path where the JSON was saved (same as @p cachePath).
  ///
  /// @throws std::runtime_error if:
  ///   - the network download fails
  ///   - the file cannot be written to @p cachePath
  ///   - the commit is invalid (404 on GitHub)
  ///
  /// Using the default commit (stable v2026-03-28):
  /// @code
  /// std::string path =
  ///     downloader.DownloadCountriesDatabase("/tmp/data.json");
  /// @endcode
  ///
  /// Using a custom revision:
  /// @code
  /// std::string path = downloader.DownloadCountriesDatabase(
  ///     "/tmp/data.json",
  ///     "main" // Download latest from main branch
  /// );
  /// @endcode
  std::string DownloadCountriesDatabase(
      const std::string &cachePath,
      const std::string &commit = "c5eb7772" // Stable commit: 2026-03-28 export
  );

private:
  /// @brief Check if a file already exists at the given path.
  ///
  /// Backs the cache-hit logic: when the file is already there, no download
  /// takes place.
  ///
  /// @param filePath Path to check.
  /// @return True if the file exists and is readable, false otherwise.
  bool FileExists(const std::string &filePath) const;
};
|
||||
|
||||
#endif // DATA_DOWNLOADER_H
|
||||
102
pipeline/includes/database.h
Normal file
102
pipeline/includes/database.h
Normal file
@@ -0,0 +1,102 @@
|
||||
#pragma once
|
||||
|
||||
#include <sqlite3.h>

#include <mutex>
#include <string>
#include <utility>
#include <vector>
|
||||
|
||||
/// @struct Country
/// @brief Represents a country with geographic identifiers
///
/// Plain aggregate. The default member initializer gives a
/// default-constructed Country a well-defined id (0) instead of an
/// indeterminate value; aggregate initialization such as
/// `Country{1, "Canada", "CA", "CAN"}` keeps working (C++14 and later).
struct Country {
  int id = 0;       ///< Numeric country identifier (0 until assigned)
  std::string name; ///< Country name
  std::string iso2; ///< 2-letter ISO code (e.g., "US", "CA")
  std::string iso3; ///< 3-letter ISO code (e.g., "USA", "CAN")
};
|
||||
|
||||
/// @struct State
/// @brief Represents a state or province with geographic identifiers
///
/// Plain aggregate. Both integer fields carry default member initializers so
/// a default-constructed State never exposes indeterminate values; aggregate
/// initialization such as `State{1, "Ontario", "ON", 1}` keeps working
/// (C++14 and later). Field order is unchanged.
struct State {
  int id = 0;        ///< Numeric state identifier (0 until assigned)
  std::string name;  ///< State/province name
  std::string iso2;  ///< 2-letter state code (e.g., "CA", "ON")
  int countryId = 0; ///< Identifier of the parent country (0 until assigned)
};
|
||||
|
||||
/**
|
||||
* @class SqliteDatabase
|
||||
* @brief Thread-safe in-memory SQLite database wrapper for geographic data
|
||||
*
|
||||
* Manages a local in-memory SQLite database with countries, states, and cities.
|
||||
* All write operations are serialized via mutex to enable safe concurrent
|
||||
* access from multiple threads. Uses INSERT OR IGNORE for idempotent
|
||||
* operations.
|
||||
*
|
||||
* Schema Relationships:
|
||||
* countries (id, name, iso2, iso3)
|
||||
* ↓ (one-to-many)
|
||||
* states (id, country_id, name, iso2)
|
||||
* ↓ (one-to-many)
|
||||
* cities (id, state_id, country_id, name, latitude, longitude)
|
||||
*/
|
||||
class SqliteDatabase {
|
||||
private:
|
||||
sqlite3 *db = nullptr; ///< SQLite database connection handle
|
||||
std::mutex dbMutex; ///< Protects all database operations from race conditions
|
||||
|
||||
/// @brief Creates the schema with three related tables and foreign keys
|
||||
void InitializeSchema();
|
||||
|
||||
public:
|
||||
/// @brief Destructor: safely closes the database connection
|
||||
~SqliteDatabase();
|
||||
|
||||
/// @brief Opens an in-memory SQLite database and initializes the schema
|
||||
void Initialize();
|
||||
|
||||
/// @brief Inserts a country record
|
||||
/// @param id Unique country identifier
|
||||
/// @param name Country name
|
||||
/// @param iso2 2-letter ISO country code
|
||||
/// @param iso3 3-letter ISO country code
|
||||
/// @note Thread-safe: uses mutex lock. Idempotent: INSERT OR IGNORE prevents
|
||||
/// duplicates
|
||||
void InsertCountry(int id, const std::string &name, const std::string &iso2,
|
||||
const std::string &iso3);
|
||||
|
||||
/// @brief Inserts a state/province record
|
||||
/// @param id Unique state identifier
|
||||
/// @param countryId Foreign key reference to parent country
|
||||
/// @param name State/province name
|
||||
/// @param iso2 2-letter state code (e.g., "CA", "ON")
|
||||
/// @note Thread-safe and idempotent via mutex and INSERT OR IGNORE
|
||||
void InsertState(int id, int countryId, const std::string &name,
|
||||
const std::string &iso2);
|
||||
|
||||
/// @brief Inserts a city record with geographic coordinates
|
||||
/// @param id Unique city identifier
|
||||
/// @param stateId Foreign key reference to parent state
|
||||
/// @param countryId Foreign key reference to parent country
|
||||
/// @param name City name
|
||||
/// @param latitude Geographic latitude coordinate (WGS84)
|
||||
/// @param longitude Geographic longitude coordinate (WGS84)
|
||||
/// @note Thread-safe and idempotent. Called by multithreaded JSON loader.
|
||||
void InsertCity(int id, int stateId, int countryId, const std::string &name,
|
||||
double latitude, double longitude);
|
||||
|
||||
/// @brief Queries all cities from the database
|
||||
/// @return Vector of (city_id, city_name) pairs sorted alphabetically
|
||||
std::vector<std::pair<int, std::string>> QueryCities();
|
||||
|
||||
/// @brief Queries all countries from the database with ISO codes
|
||||
/// @param limit Maximum number of records to return (0 = all)
|
||||
/// @return Vector of Country structs (includes id, name, iso2, iso3) sorted
|
||||
/// alphabetically
|
||||
std::vector<Country> QueryCountries(int limit = 0);
|
||||
|
||||
/// @brief Queries all states from the database with ISO codes
|
||||
/// @param limit Maximum number of records to return (0 = all)
|
||||
/// @return Vector of State structs (includes id, name, iso2, countryId)
|
||||
/// sorted alphabetically
|
||||
std::vector<State> QueryStates(int limit = 0);
|
||||
};
|
||||
59
pipeline/includes/generator.h
Normal file
59
pipeline/includes/generator.h
Normal file
@@ -0,0 +1,59 @@
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
/**
 * @class LlamaBreweryGenerator
 * @brief Produces brewery names and descriptions for cities
 *
 * At present this is a deterministic mock: names and descriptions are picked
 * from fixed word tables based on hashing the city name (std::hash), which
 * keeps results reproducible for testing.
 *
 * The LoadModel() / GenerateBrewery() interface is intended to stay the same
 * once real llama.cpp inference replaces the mock.
 *
 * NOTE(review): the word tables are const per-instance members, so every
 * generator object carries its own copy and the class is not assignable.
 */
class LlamaBreweryGenerator {
public:
  /// @struct Brewery
  /// @brief Result record returned by GenerateBrewery()
  struct Brewery {
    std::string name;        ///< Generated brewery name (e.g., "Craft Brewing Co.")
    std::string description; ///< Short description of brewery style/offerings
  };

  /// @brief Loads a language model (currently mocked)
  /// @param modelPath Path to GGUF model file (not used in mock)
  /// @note The real implementation is expected to load a llama.cpp model
  ///       into memory
  void LoadModel(const std::string &modelPath);

  /// @brief Generates a brewery name and description for a city
  /// @param cityName City to generate a brewery for
  /// @param seed Integer seed (used for deterministic output in mock)
  /// @return Brewery struct holding the name and description
  /// @note Deterministic: the same cityName+seed always yields the same brewery
  Brewery GenerateBrewery(const std::string &cityName, int seed);

private:
  /// Leading words for brewery names (e.g., "Craft", "Heritage", etc.)
  const std::vector<std::string> breweryAdjectives = {
      "Craft",   "Heritage", "Local",  "Artisan",
      "Pioneer", "Golden",   "Modern", "Classic"};

  /// Trailing words for brewery names (e.g., "Brewing Co.", "Brewery", etc.)
  const std::vector<std::string> breweryNouns = {
      "Brewing Co.", "Brewery", "Bier Haus",  "Taproom",
      "Works",       "House",   "Fermentery", "Ale Co."};

  /// Hand-written brewery descriptions used by the mock
  const std::vector<std::string> descriptions = {
      "Handcrafted pale ales and seasonal IPAs with local ingredients.",
      "Traditional lagers and experimental sours in small batches.",
      "Award-winning stouts and wildly hoppy blonde ales.",
      "Craft brewery specializing in Belgian-style triples and dark porters.",
      "Modern brewery blending tradition with bold experimental flavors."};
};
|
||||
85
pipeline/includes/json_loader.h
Normal file
85
pipeline/includes/json_loader.h
Normal file
@@ -0,0 +1,85 @@
|
||||
#pragma once
|
||||
|
||||
#include "database.h"
|
||||
#include <nlohmann/json.hpp>
|
||||
#include <string>
|
||||
|
||||
using json = nlohmann::json;
|
||||
|
||||
/**
|
||||
* @class JsonLoader
|
||||
* @brief Loads world geographic data from JSON file into SQLite database
|
||||
*
|
||||
* Handles parsing and population of world cities, states, and countries from
|
||||
* a structured JSON source file. The loader uses parallel threads to chunk
|
||||
* the city records and maximize database insertion throughput.
|
||||
*
|
||||
* Input Format (JSON Structure):
|
||||
* @code
|
||||
* {
|
||||
* "countries": [
|
||||
* {"id": 1, "name": "Canada", "iso2": "CA", "iso3": "CAN"},
|
||||
* ...
|
||||
* ],
|
||||
* "states": [
|
||||
* {"id": 1, "country_id": 1, "name": "Ontario", "iso2": "ON"},
|
||||
* ...
|
||||
* ],
|
||||
* "cities": [
|
||||
* {"id": 1, "state_id": 1, "country_id": 1, "name": "Toronto",
|
||||
* "latitude": 43.6532, "longitude": -79.3832},
|
||||
* ...
|
||||
* ]
|
||||
* }
|
||||
* @endcode
|
||||
*
|
||||
* Performance Characteristics:
|
||||
* - Reads entire JSON file into memory (nlohmann/json parser)
|
||||
* - Iterates through countries: typically 200+ records
|
||||
* - Iterates through states: typically 3000+ records
|
||||
* - Iterates through cities: typically 50,000+ records (MAJOR DATASET)
|
||||
* - Uses multithreading to chunk city insertion across threads
|
||||
* - Thread pool size defaults to number of CPU cores
|
||||
*
|
||||
* Multithreading Strategy:
|
||||
* - Divides cities into N chunks (N = CPU core count)
|
||||
* - Each thread processes one chunk sequentially
|
||||
* - Database has mutex protection for thread-safe concurrent access
|
||||
* - Allows safe parallel writing to same SQLite database
|
||||
*
|
||||
* Example Usage:
|
||||
* @code
|
||||
* SqliteDatabase db;
|
||||
* db.Initialize();
|
||||
* JsonLoader::LoadWorldCities("../data/world_city_data.json", db);
|
||||
* // Database now contains all countries, states, and cities
|
||||
* @endcode
|
||||
*/
|
||||
class JsonLoader {
|
||||
public:
|
||||
/// @brief Loads world geographic data from JSON and populates database
|
||||
///
|
||||
/// Process:
|
||||
/// 1. Reads and parses entire JSON file
|
||||
/// 2. Inserts all countries into database (typically 200-250 records)
|
||||
/// 3. Inserts all states/provinces (typically 3000+ records)
|
||||
/// 4. Spawns worker threads to insert cities (typically 50,000+ records)
|
||||
/// 5. Waits for all threads to complete
|
||||
/// 6. Prints statistics about loaded data
|
||||
///
|
||||
/// @param jsonPath Filesystem path to world_city_data.json
|
||||
/// @param db Reference to initialized SqliteDatabase to populate
|
||||
///
|
||||
/// @throws std::runtime_error if JSON file cannot be read or parsed
|
||||
/// @throws std::runtime_error if database insertion fails
|
||||
///
|
||||
/// Output Examples:
|
||||
/// @code
|
||||
/// Loading JSON: ../data/world_city_data.json
|
||||
/// Loaded countries: 250
|
||||
/// Loaded states: 3500
|
||||
/// Loaded cities: 52000
|
||||
/// ✓ World city data loaded successfully
|
||||
/// @endcode
|
||||
static void LoadWorldCities(const std::string &jsonPath, SqliteDatabase &db);
|
||||
};
|
||||
Reference in New Issue
Block a user