Begin work on biergarten data generator pipeline

This commit is contained in:
Aaron Po
2026-04-01 19:33:50 -04:00
parent 581863d69b
commit 23e2199b6b
12 changed files with 1323 additions and 0 deletions

View File

@@ -0,0 +1,85 @@
#pragma once
#include "database.h"
#include <nlohmann/json.hpp>
#include <string>
using json = nlohmann::json;
/**
 * @class JsonLoader
 * @brief Loads world geographic data from a JSON file into the SQLite database.
 *
 * Stateless utility: the only member is the static LoadWorldCities() entry
 * point, so the class is used without creating an instance.
 *
 * Handles parsing and population of world countries, states, and cities from
 * a structured JSON source file. Countries and states are inserted first;
 * the (much larger) city list is split into chunks that are inserted by
 * parallel worker threads to maximize database insertion throughput.
 *
 * NOTE(review): only the declaration lives in this header. The threading and
 * performance details below describe the intended design of the definition,
 * which is not visible here -- confirm them against the implementation file.
 *
 * Input Format (JSON Structure):
 * @code
 * {
 *   "countries": [
 *     {"id": 1, "name": "Canada", "iso2": "CA", "iso3": "CAN"},
 *     ...
 *   ],
 *   "states": [
 *     {"id": 1, "country_id": 1, "name": "Ontario", "iso2": "ON"},
 *     ...
 *   ],
 *   "cities": [
 *     {"id": 1, "state_id": 1, "country_id": 1, "name": "Toronto",
 *      "latitude": 43.6532, "longitude": -79.3832},
 *     ...
 *   ]
 * }
 * @endcode
 *
 * Performance Characteristics:
 * - Reads the entire JSON file into memory (nlohmann/json parser)
 * - Iterates through countries: typically 200-250 records
 * - Iterates through states: typically 3000+ records
 * - Iterates through cities: typically 50,000+ records (the dominant dataset)
 * - Uses multithreading to spread city insertion across worker threads
 * - Number of worker threads defaults to the number of CPU cores
 *
 * Multithreading Strategy:
 * - Divides cities into N chunks (N = CPU core count)
 * - Each thread processes exactly one chunk, sequentially
 * - Relies on the database object serializing access with a mutex so that
 *   several threads can write to the same SQLite database safely
 *   (NOTE(review): the mutex lives in SqliteDatabase, declared in
 *   database.h -- confirm it covers every insert path used by the loader)
 *
 * Example Usage:
 * @code
 * SqliteDatabase db;
 * db.Initialize();
 * JsonLoader::LoadWorldCities("../data/world_city_data.json", db);
 * // Database now contains all countries, states, and cities
 * @endcode
 */
class JsonLoader {
public:
/// @brief Loads world geographic data from JSON and populates the database.
///
/// Despite its name, this call loads countries and states as well as
/// cities; cities are simply the largest part of the dataset.
///
/// Process:
/// 1. Reads and parses the entire JSON file
/// 2. Inserts all countries into the database (typically 200-250 records)
/// 3. Inserts all states/provinces (typically 3000+ records)
/// 4. Spawns worker threads to insert cities (typically 50,000+ records)
/// 5. Waits for all threads to complete
/// 6. Prints statistics about the loaded data
///
/// @param jsonPath Filesystem path to world_city_data.json
/// @param db Reference to an already-initialized SqliteDatabase to populate
///           (see the class-level usage example: Initialize() is called
///           before loading)
///
/// @throws std::runtime_error if the JSON file cannot be read or parsed
/// @throws std::runtime_error if database insertion fails
///
/// NOTE(review): nlohmann/json reports parse failures through its own
/// exception types, which derive from std::exception rather than
/// std::runtime_error -- confirm the implementation converts them, or
/// callers catching std::runtime_error will miss parse errors.
///
/// Output Examples:
/// @code
/// Loading JSON: ../data/world_city_data.json
/// Loaded countries: 250
/// Loaded states: 3500
/// Loaded cities: 52000
/// ✓ World city data loaded successfully
/// @endcode
static void LoadWorldCities(const std::string &jsonPath, SqliteDatabase &db);
};