mirror of
https://github.com/aaronpo97/the-biergarten-app.git
synced 2026-06-01 01:54:00 +00:00
Begin work on biergarten data generator pipeline
This commit is contained in:
103
pipeline/src/data_downloader.cpp
Normal file
103
pipeline/src/data_downloader.cpp
Normal file
@@ -0,0 +1,103 @@
|
||||
#include "data_downloader.h"
|
||||
#include <cstdio>
|
||||
#include <curl/curl.h>
|
||||
#include <fstream>
|
||||
#include <spdlog/spdlog.h>
|
||||
#include <sstream>
|
||||
#include <sys/stat.h>
|
||||
|
||||
static size_t WriteCallback(void *contents, size_t size, size_t nmemb,
|
||||
void *userp) {
|
||||
size_t realsize = size * nmemb;
|
||||
std::ofstream *outFile = static_cast<std::ofstream *>(userp);
|
||||
outFile->write(static_cast<char *>(contents), realsize);
|
||||
return realsize;
|
||||
}
|
||||
|
||||
DataDownloader::DataDownloader() {}
|
||||
|
||||
DataDownloader::~DataDownloader() {}
|
||||
|
||||
bool DataDownloader::FileExists(const std::string &filePath) const {
|
||||
struct stat buffer;
|
||||
return (stat(filePath.c_str(), &buffer) == 0);
|
||||
}
|
||||
|
||||
std::string
|
||||
DataDownloader::DownloadCountriesDatabase(const std::string &cachePath,
|
||||
const std::string &commit) {
|
||||
if (FileExists(cachePath)) {
|
||||
spdlog::info("[DataDownloader] Cache hit: {}", cachePath);
|
||||
return cachePath;
|
||||
}
|
||||
|
||||
std::string shortCommit = commit;
|
||||
if (commit.length() > 7) {
|
||||
shortCommit = commit.substr(0, 7);
|
||||
}
|
||||
|
||||
std::string url = "https://raw.githubusercontent.com/dr5hn/"
|
||||
"countries-states-cities-database/" +
|
||||
shortCommit + "/json/countries+states+cities.json";
|
||||
|
||||
spdlog::info("[DataDownloader] Downloading: {}", url);
|
||||
|
||||
CURL *curl = curl_easy_init();
|
||||
if (!curl) {
|
||||
throw std::runtime_error("[DataDownloader] Failed to initialize libcurl");
|
||||
}
|
||||
|
||||
std::ofstream outFile(cachePath, std::ios::binary);
|
||||
if (!outFile.is_open()) {
|
||||
curl_easy_cleanup(curl);
|
||||
throw std::runtime_error("[DataDownloader] Cannot open file for writing: " +
|
||||
cachePath);
|
||||
}
|
||||
|
||||
curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
|
||||
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteCallback);
|
||||
curl_easy_setopt(curl, CURLOPT_WRITEDATA, static_cast<void *>(&outFile));
|
||||
|
||||
curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, 30L);
|
||||
curl_easy_setopt(curl, CURLOPT_TIMEOUT, 300L);
|
||||
|
||||
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
|
||||
curl_easy_setopt(curl, CURLOPT_MAXREDIRS, 5L);
|
||||
|
||||
curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, "gzip");
|
||||
|
||||
curl_easy_setopt(curl, CURLOPT_USERAGENT, "biergarten-pipeline/0.1.0");
|
||||
|
||||
CURLcode res = curl_easy_perform(curl);
|
||||
outFile.close();
|
||||
|
||||
if (res != CURLE_OK) {
|
||||
curl_easy_cleanup(curl);
|
||||
std::remove(cachePath.c_str());
|
||||
|
||||
std::string error = std::string("[DataDownloader] Download failed: ") +
|
||||
curl_easy_strerror(res);
|
||||
throw std::runtime_error(error);
|
||||
}
|
||||
|
||||
long httpCode = 0;
|
||||
curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &httpCode);
|
||||
curl_easy_cleanup(curl);
|
||||
|
||||
if (httpCode != 200) {
|
||||
std::remove(cachePath.c_str());
|
||||
|
||||
std::stringstream ss;
|
||||
ss << "[DataDownloader] HTTP error " << httpCode
|
||||
<< " (commit: " << shortCommit << ")";
|
||||
throw std::runtime_error(ss.str());
|
||||
}
|
||||
|
||||
std::ifstream fileCheck(cachePath, std::ios::binary | std::ios::ate);
|
||||
std::streamsize size = fileCheck.tellg();
|
||||
fileCheck.close();
|
||||
|
||||
spdlog::info("[DataDownloader] OK: Download complete: {} ({:.2f} MB)",
|
||||
cachePath, (size / (1024.0 * 1024.0)));
|
||||
return cachePath;
|
||||
}
|
||||
229
pipeline/src/database.cpp
Normal file
229
pipeline/src/database.cpp
Normal file
@@ -0,0 +1,229 @@
|
||||
#include "database.h"
|
||||
#include <spdlog/spdlog.h>
|
||||
#include <stdexcept>
|
||||
|
||||
void SqliteDatabase::InitializeSchema() {
|
||||
std::lock_guard<std::mutex> lock(dbMutex);
|
||||
|
||||
const char *schema = R"(
|
||||
CREATE TABLE IF NOT EXISTS countries (
|
||||
id INTEGER PRIMARY KEY,
|
||||
name TEXT NOT NULL,
|
||||
iso2 TEXT,
|
||||
iso3 TEXT
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS states (
|
||||
id INTEGER PRIMARY KEY,
|
||||
country_id INTEGER NOT NULL,
|
||||
name TEXT NOT NULL,
|
||||
iso2 TEXT,
|
||||
FOREIGN KEY(country_id) REFERENCES countries(id)
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS cities (
|
||||
id INTEGER PRIMARY KEY,
|
||||
state_id INTEGER NOT NULL,
|
||||
country_id INTEGER NOT NULL,
|
||||
name TEXT NOT NULL,
|
||||
latitude REAL,
|
||||
longitude REAL,
|
||||
FOREIGN KEY(state_id) REFERENCES states(id),
|
||||
FOREIGN KEY(country_id) REFERENCES countries(id)
|
||||
);
|
||||
)";
|
||||
|
||||
char *errMsg = nullptr;
|
||||
int rc = sqlite3_exec(db, schema, nullptr, nullptr, &errMsg);
|
||||
if (rc != SQLITE_OK) {
|
||||
std::string error = errMsg ? std::string(errMsg) : "Unknown error";
|
||||
sqlite3_free(errMsg);
|
||||
throw std::runtime_error("Failed to create schema: " + error);
|
||||
}
|
||||
}
|
||||
|
||||
SqliteDatabase::~SqliteDatabase() {
|
||||
if (db) {
|
||||
sqlite3_close(db);
|
||||
}
|
||||
}
|
||||
|
||||
void SqliteDatabase::Initialize() {
|
||||
int rc = sqlite3_open(":memory:", &db);
|
||||
if (rc) {
|
||||
throw std::runtime_error("Failed to create in-memory SQLite database");
|
||||
}
|
||||
spdlog::info("OK: In-memory SQLite database created");
|
||||
InitializeSchema();
|
||||
}
|
||||
|
||||
void SqliteDatabase::InsertCountry(int id, const std::string &name,
|
||||
const std::string &iso2,
|
||||
const std::string &iso3) {
|
||||
std::lock_guard<std::mutex> lock(dbMutex);
|
||||
|
||||
const char *query = R"(
|
||||
INSERT OR IGNORE INTO countries (id, name, iso2, iso3)
|
||||
VALUES (?, ?, ?, ?)
|
||||
)";
|
||||
|
||||
sqlite3_stmt *stmt;
|
||||
int rc = sqlite3_prepare_v2(db, query, -1, &stmt, nullptr);
|
||||
if (rc != SQLITE_OK)
|
||||
throw std::runtime_error("Failed to prepare country insert");
|
||||
|
||||
sqlite3_bind_int(stmt, 1, id);
|
||||
sqlite3_bind_text(stmt, 2, name.c_str(), -1, SQLITE_STATIC);
|
||||
sqlite3_bind_text(stmt, 3, iso2.c_str(), -1, SQLITE_STATIC);
|
||||
sqlite3_bind_text(stmt, 4, iso3.c_str(), -1, SQLITE_STATIC);
|
||||
|
||||
if (sqlite3_step(stmt) != SQLITE_DONE) {
|
||||
throw std::runtime_error("Failed to insert country");
|
||||
}
|
||||
sqlite3_finalize(stmt);
|
||||
}
|
||||
|
||||
void SqliteDatabase::InsertState(int id, int countryId, const std::string &name,
|
||||
const std::string &iso2) {
|
||||
std::lock_guard<std::mutex> lock(dbMutex);
|
||||
|
||||
const char *query = R"(
|
||||
INSERT OR IGNORE INTO states (id, country_id, name, iso2)
|
||||
VALUES (?, ?, ?, ?)
|
||||
)";
|
||||
|
||||
sqlite3_stmt *stmt;
|
||||
int rc = sqlite3_prepare_v2(db, query, -1, &stmt, nullptr);
|
||||
if (rc != SQLITE_OK)
|
||||
throw std::runtime_error("Failed to prepare state insert");
|
||||
|
||||
sqlite3_bind_int(stmt, 1, id);
|
||||
sqlite3_bind_int(stmt, 2, countryId);
|
||||
sqlite3_bind_text(stmt, 3, name.c_str(), -1, SQLITE_STATIC);
|
||||
sqlite3_bind_text(stmt, 4, iso2.c_str(), -1, SQLITE_STATIC);
|
||||
|
||||
if (sqlite3_step(stmt) != SQLITE_DONE) {
|
||||
throw std::runtime_error("Failed to insert state");
|
||||
}
|
||||
sqlite3_finalize(stmt);
|
||||
}
|
||||
|
||||
void SqliteDatabase::InsertCity(int id, int stateId, int countryId,
|
||||
const std::string &name, double latitude,
|
||||
double longitude) {
|
||||
std::lock_guard<std::mutex> lock(dbMutex);
|
||||
|
||||
const char *query = R"(
|
||||
INSERT OR IGNORE INTO cities (id, state_id, country_id, name, latitude, longitude)
|
||||
VALUES (?, ?, ?, ?, ?, ?)
|
||||
)";
|
||||
|
||||
sqlite3_stmt *stmt;
|
||||
int rc = sqlite3_prepare_v2(db, query, -1, &stmt, nullptr);
|
||||
if (rc != SQLITE_OK)
|
||||
throw std::runtime_error("Failed to prepare city insert");
|
||||
|
||||
sqlite3_bind_int(stmt, 1, id);
|
||||
sqlite3_bind_int(stmt, 2, stateId);
|
||||
sqlite3_bind_int(stmt, 3, countryId);
|
||||
sqlite3_bind_text(stmt, 4, name.c_str(), -1, SQLITE_STATIC);
|
||||
sqlite3_bind_double(stmt, 5, latitude);
|
||||
sqlite3_bind_double(stmt, 6, longitude);
|
||||
|
||||
if (sqlite3_step(stmt) != SQLITE_DONE) {
|
||||
throw std::runtime_error("Failed to insert city");
|
||||
}
|
||||
sqlite3_finalize(stmt);
|
||||
}
|
||||
|
||||
std::vector<std::pair<int, std::string>> SqliteDatabase::QueryCities() {
|
||||
std::lock_guard<std::mutex> lock(dbMutex);
|
||||
|
||||
std::vector<std::pair<int, std::string>> cities;
|
||||
sqlite3_stmt *stmt = nullptr;
|
||||
|
||||
const char *query = "SELECT id, name FROM cities ORDER BY name";
|
||||
int rc = sqlite3_prepare_v2(db, query, -1, &stmt, nullptr);
|
||||
|
||||
if (rc != SQLITE_OK) {
|
||||
throw std::runtime_error("Failed to prepare query");
|
||||
}
|
||||
|
||||
while (sqlite3_step(stmt) == SQLITE_ROW) {
|
||||
int id = sqlite3_column_int(stmt, 0);
|
||||
const char *name =
|
||||
reinterpret_cast<const char *>(sqlite3_column_text(stmt, 1));
|
||||
cities.push_back({id, name ? std::string(name) : ""});
|
||||
}
|
||||
|
||||
sqlite3_finalize(stmt);
|
||||
return cities;
|
||||
}
|
||||
|
||||
std::vector<Country> SqliteDatabase::QueryCountries(int limit) {
|
||||
std::lock_guard<std::mutex> lock(dbMutex);
|
||||
|
||||
std::vector<Country> countries;
|
||||
sqlite3_stmt *stmt = nullptr;
|
||||
|
||||
std::string query =
|
||||
"SELECT id, name, iso2, iso3 FROM countries ORDER BY name";
|
||||
if (limit > 0) {
|
||||
query += " LIMIT " + std::to_string(limit);
|
||||
}
|
||||
|
||||
int rc = sqlite3_prepare_v2(db, query.c_str(), -1, &stmt, nullptr);
|
||||
|
||||
if (rc != SQLITE_OK) {
|
||||
throw std::runtime_error("Failed to prepare countries query");
|
||||
}
|
||||
|
||||
while (sqlite3_step(stmt) == SQLITE_ROW) {
|
||||
int id = sqlite3_column_int(stmt, 0);
|
||||
const char *name =
|
||||
reinterpret_cast<const char *>(sqlite3_column_text(stmt, 1));
|
||||
const char *iso2 =
|
||||
reinterpret_cast<const char *>(sqlite3_column_text(stmt, 2));
|
||||
const char *iso3 =
|
||||
reinterpret_cast<const char *>(sqlite3_column_text(stmt, 3));
|
||||
countries.push_back({id, name ? std::string(name) : "",
|
||||
iso2 ? std::string(iso2) : "",
|
||||
iso3 ? std::string(iso3) : ""});
|
||||
}
|
||||
|
||||
sqlite3_finalize(stmt);
|
||||
return countries;
|
||||
}
|
||||
|
||||
std::vector<State> SqliteDatabase::QueryStates(int limit) {
|
||||
std::lock_guard<std::mutex> lock(dbMutex);
|
||||
|
||||
std::vector<State> states;
|
||||
sqlite3_stmt *stmt = nullptr;
|
||||
|
||||
std::string query =
|
||||
"SELECT id, name, iso2, country_id FROM states ORDER BY name";
|
||||
if (limit > 0) {
|
||||
query += " LIMIT " + std::to_string(limit);
|
||||
}
|
||||
|
||||
int rc = sqlite3_prepare_v2(db, query.c_str(), -1, &stmt, nullptr);
|
||||
|
||||
if (rc != SQLITE_OK) {
|
||||
throw std::runtime_error("Failed to prepare states query");
|
||||
}
|
||||
|
||||
while (sqlite3_step(stmt) == SQLITE_ROW) {
|
||||
int id = sqlite3_column_int(stmt, 0);
|
||||
const char *name =
|
||||
reinterpret_cast<const char *>(sqlite3_column_text(stmt, 1));
|
||||
const char *iso2 =
|
||||
reinterpret_cast<const char *>(sqlite3_column_text(stmt, 2));
|
||||
int countryId = sqlite3_column_int(stmt, 3);
|
||||
states.push_back({id, name ? std::string(name) : "",
|
||||
iso2 ? std::string(iso2) : "", countryId});
|
||||
}
|
||||
|
||||
sqlite3_finalize(stmt);
|
||||
return states;
|
||||
}
|
||||
21
pipeline/src/generator.cpp
Normal file
21
pipeline/src/generator.cpp
Normal file
@@ -0,0 +1,21 @@
|
||||
#include "generator.h"
|
||||
#include <functional>
|
||||
#include <spdlog/spdlog.h>
|
||||
|
||||
void LlamaBreweryGenerator::LoadModel(const std::string &modelPath) {
|
||||
spdlog::info(" [Mock] Initialized llama model: {}", modelPath);
|
||||
spdlog::info(" OK: Model ready");
|
||||
}
|
||||
|
||||
LlamaBreweryGenerator::Brewery
|
||||
LlamaBreweryGenerator::GenerateBrewery(const std::string &cityName, int seed) {
|
||||
// Deterministic mock generation for stable test output.
|
||||
size_t nameHash = std::hash<std::string>{}(cityName + std::to_string(seed));
|
||||
|
||||
Brewery result;
|
||||
result.name = breweryAdjectives[nameHash % breweryAdjectives.size()] + " " +
|
||||
breweryNouns[(nameHash / 7) % breweryNouns.size()];
|
||||
result.description = descriptions[(nameHash / 13) % descriptions.size()];
|
||||
|
||||
return result;
|
||||
}
|
||||
80
pipeline/src/json_loader.cpp
Normal file
80
pipeline/src/json_loader.cpp
Normal file
@@ -0,0 +1,80 @@
|
||||
#include "json_loader.h"
|
||||
#include "stream_parser.h"
|
||||
#include "work_queue.h"
|
||||
#include <atomic>
|
||||
#include <chrono>
|
||||
#include <spdlog/spdlog.h>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
|
||||
void JsonLoader::LoadWorldCities(const std::string &jsonPath,
|
||||
SqliteDatabase &db) {
|
||||
auto startTime = std::chrono::high_resolution_clock::now();
|
||||
spdlog::info("\nLoading {} (streaming RapidJSON SAX + producer-consumer)...",
|
||||
jsonPath);
|
||||
|
||||
const unsigned int QUEUE_CAPACITY = 1000;
|
||||
WorkQueue<CityRecord> queue(QUEUE_CAPACITY);
|
||||
|
||||
spdlog::info("Creating worker thread pool...");
|
||||
|
||||
unsigned int numWorkers = std::thread::hardware_concurrency();
|
||||
if (numWorkers == 0)
|
||||
numWorkers = 4; // Fallback if unavailable
|
||||
spdlog::info(" Spawning {} worker threads", numWorkers);
|
||||
|
||||
std::vector<std::thread> workers;
|
||||
std::atomic<unsigned long> citiesProcessed{0};
|
||||
|
||||
for (unsigned int i = 0; i < numWorkers; ++i) {
|
||||
workers.push_back(std::thread([&]() {
|
||||
unsigned long localCount = 0;
|
||||
while (auto record = queue.pop()) {
|
||||
db.InsertCity(record->id, record->state_id, record->country_id,
|
||||
record->name, record->latitude, record->longitude);
|
||||
localCount++;
|
||||
}
|
||||
citiesProcessed += localCount;
|
||||
}));
|
||||
}
|
||||
|
||||
spdlog::info("Streaming cities into worker queue...");
|
||||
|
||||
unsigned long totalCities = 0;
|
||||
StreamingJsonParser::Parse(
|
||||
jsonPath, db, [&](const CityRecord &record) { queue.push(record); },
|
||||
[&](size_t current, size_t total) {
|
||||
if (current % 10000 == 0 && current > 0) {
|
||||
spdlog::info(" [Progress] Parsed {} cities...", current);
|
||||
}
|
||||
totalCities = current;
|
||||
});
|
||||
|
||||
spdlog::info(" OK: Parsed all cities from JSON");
|
||||
|
||||
queue.shutdown_queue();
|
||||
|
||||
spdlog::info("Waiting for worker threads to complete...");
|
||||
for (auto &worker : workers) {
|
||||
if (worker.joinable()) {
|
||||
worker.join();
|
||||
}
|
||||
}
|
||||
|
||||
auto endTime = std::chrono::high_resolution_clock::now();
|
||||
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(
|
||||
endTime - startTime);
|
||||
|
||||
spdlog::info("\n=== World City Data Loading Summary ===\n");
|
||||
spdlog::info("Cities inserted: {}", citiesProcessed);
|
||||
spdlog::info("Elapsed time: {} ms", duration.count());
|
||||
long long throughput =
|
||||
(citiesProcessed > 0 && duration.count() > 0)
|
||||
? (1000LL * static_cast<long long>(citiesProcessed)) /
|
||||
static_cast<long long>(duration.count())
|
||||
: 0LL;
|
||||
spdlog::info("Throughput: {} cities/sec", throughput);
|
||||
spdlog::info("Worker pool: {} threads", numWorkers);
|
||||
spdlog::info("Queue capacity: {}", QUEUE_CAPACITY);
|
||||
spdlog::info("=======================================\n");
|
||||
}
|
||||
79
pipeline/src/main.cpp
Normal file
79
pipeline/src/main.cpp
Normal file
@@ -0,0 +1,79 @@
|
||||
#include "data_downloader.h"
|
||||
#include "database.h"
|
||||
#include "generator.h"
|
||||
#include "json_loader.h"
|
||||
#include <curl/curl.h>
|
||||
#include <spdlog/spdlog.h>
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
try {
|
||||
curl_global_init(CURL_GLOBAL_DEFAULT);
|
||||
|
||||
std::string modelPath = argc > 1 ? argv[1] : "./model.gguf";
|
||||
std::string cacheDir = argc > 2 ? argv[2] : "/tmp";
|
||||
std::string commit =
|
||||
argc > 3 ? argv[3] : "c5eb7772"; // Default: stable 2026-03-28
|
||||
|
||||
std::string jsonPath = cacheDir + "/countries+states+cities.json";
|
||||
|
||||
spdlog::info("\n[Pipeline] Downloading geographic data from GitHub...");
|
||||
DataDownloader downloader;
|
||||
downloader.DownloadCountriesDatabase(jsonPath, commit);
|
||||
|
||||
SqliteDatabase db;
|
||||
|
||||
spdlog::info("Initializing in-memory SQLite database...");
|
||||
db.Initialize();
|
||||
|
||||
JsonLoader::LoadWorldCities(jsonPath, db);
|
||||
|
||||
spdlog::info("Initializing brewery generator...");
|
||||
LlamaBreweryGenerator generator;
|
||||
generator.LoadModel(modelPath);
|
||||
|
||||
spdlog::info("\n=== GEOGRAPHIC DATA OVERVIEW ===");
|
||||
|
||||
auto countries = db.QueryCountries(50);
|
||||
auto states = db.QueryStates(50);
|
||||
auto cities = db.QueryCities();
|
||||
|
||||
spdlog::info("\nTotal records loaded:");
|
||||
spdlog::info(" Countries: {}", db.QueryCountries(0).size());
|
||||
spdlog::info(" States: {}", db.QueryStates(0).size());
|
||||
spdlog::info(" Cities: {}", cities.size());
|
||||
|
||||
spdlog::info("\n--- 50 COUNTRIES ---");
|
||||
for (size_t i = 0; i < countries.size(); i++) {
|
||||
spdlog::info("{}. {} ({}) {}", (i + 1), countries[i].iso2,
|
||||
countries[i].iso3, countries[i].name);
|
||||
}
|
||||
|
||||
spdlog::info("\n--- 50 STATES ---");
|
||||
for (size_t i = 0; i < states.size(); i++) {
|
||||
spdlog::info("{}. {}: {}", (i + 1), states[i].iso2, states[i].name);
|
||||
}
|
||||
|
||||
spdlog::info("\n--- 50 CITIES ---");
|
||||
for (size_t i = 0; i < std::min(size_t(50), cities.size()); i++) {
|
||||
spdlog::info("{}. {}", (i + 1), cities[i].second);
|
||||
}
|
||||
|
||||
spdlog::info("\n=== SAMPLE BREWERY GENERATION ===\n");
|
||||
for (size_t i = 0; i < std::min(size_t(5), cities.size()); i++) {
|
||||
const auto &[cityId, cityName] = cities[i];
|
||||
auto brewery = generator.GenerateBrewery(cityName, i);
|
||||
spdlog::info(" {}: {}", cityName, brewery.name);
|
||||
spdlog::info(" -> {}", brewery.description);
|
||||
}
|
||||
|
||||
spdlog::info("\nOK: Pipeline completed successfully");
|
||||
|
||||
curl_global_cleanup();
|
||||
return 0;
|
||||
|
||||
} catch (const std::exception &e) {
|
||||
spdlog::error("ERROR: Pipeline failed: {}", e.what());
|
||||
curl_global_cleanup();
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
227
pipeline/src/stream_parser.cpp
Normal file
227
pipeline/src/stream_parser.cpp
Normal file
@@ -0,0 +1,227 @@
|
||||
#include "stream_parser.h"
|
||||
#include "database.h"
|
||||
#include <fstream>
|
||||
#include <rapidjson/reader.h>
|
||||
#include <rapidjson/stringbuffer.h>
|
||||
#include <spdlog/spdlog.h>
|
||||
#include <sstream>
|
||||
|
||||
using namespace rapidjson;
|
||||
|
||||
class CityRecordHandler : public BaseReaderHandler<UTF8<>, CityRecordHandler> {
|
||||
public:
|
||||
struct ParseContext {
|
||||
SqliteDatabase *db = nullptr;
|
||||
std::function<void(const CityRecord &)> on_city;
|
||||
std::function<void(size_t, size_t)> on_progress;
|
||||
size_t cities_emitted = 0;
|
||||
size_t total_file_size = 0;
|
||||
int countries_inserted = 0;
|
||||
int states_inserted = 0;
|
||||
};
|
||||
|
||||
CityRecordHandler(ParseContext &ctx) : context(ctx) {}
|
||||
|
||||
bool StartArray() {
|
||||
depth++;
|
||||
|
||||
if (depth == 1) {
|
||||
in_countries_array = true;
|
||||
} else if (depth == 3 && current_key == "states") {
|
||||
in_states_array = true;
|
||||
} else if (depth == 5 && current_key == "cities") {
|
||||
in_cities_array = true;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool EndArray(SizeType /*elementCount*/) {
|
||||
if (depth == 1) {
|
||||
in_countries_array = false;
|
||||
} else if (depth == 3) {
|
||||
in_states_array = false;
|
||||
} else if (depth == 5) {
|
||||
in_cities_array = false;
|
||||
}
|
||||
depth--;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool StartObject() {
|
||||
depth++;
|
||||
|
||||
if (depth == 2 && in_countries_array) {
|
||||
in_country_object = true;
|
||||
current_country_id = 0;
|
||||
country_info[0].clear();
|
||||
country_info[1].clear();
|
||||
country_info[2].clear();
|
||||
} else if (depth == 4 && in_states_array) {
|
||||
in_state_object = true;
|
||||
current_state_id = 0;
|
||||
state_info[0].clear();
|
||||
state_info[1].clear();
|
||||
} else if (depth == 6 && in_cities_array) {
|
||||
building_city = true;
|
||||
current_city = {};
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool EndObject(SizeType /*memberCount*/) {
|
||||
if (depth == 6 && building_city) {
|
||||
if (current_city.id > 0 && current_state_id > 0 &&
|
||||
current_country_id > 0) {
|
||||
current_city.state_id = current_state_id;
|
||||
current_city.country_id = current_country_id;
|
||||
|
||||
try {
|
||||
context.on_city(current_city);
|
||||
context.cities_emitted++;
|
||||
|
||||
if (context.on_progress && context.cities_emitted % 10000 == 0) {
|
||||
context.on_progress(context.cities_emitted,
|
||||
context.total_file_size);
|
||||
}
|
||||
} catch (const std::exception &e) {
|
||||
spdlog::warn(" WARN: Failed to emit city: {}", e.what());
|
||||
}
|
||||
}
|
||||
building_city = false;
|
||||
} else if (depth == 4 && in_state_object) {
|
||||
if (current_state_id > 0 && current_country_id > 0) {
|
||||
try {
|
||||
context.db->InsertState(current_state_id, current_country_id,
|
||||
state_info[0], state_info[1]);
|
||||
context.states_inserted++;
|
||||
} catch (const std::exception &e) {
|
||||
spdlog::warn(" WARN: Failed to insert state: {}", e.what());
|
||||
}
|
||||
}
|
||||
in_state_object = false;
|
||||
} else if (depth == 2 && in_country_object) {
|
||||
if (current_country_id > 0) {
|
||||
try {
|
||||
context.db->InsertCountry(current_country_id, country_info[0],
|
||||
country_info[1], country_info[2]);
|
||||
context.countries_inserted++;
|
||||
} catch (const std::exception &e) {
|
||||
spdlog::warn(" WARN: Failed to insert country: {}", e.what());
|
||||
}
|
||||
}
|
||||
in_country_object = false;
|
||||
}
|
||||
|
||||
depth--;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool Key(const char *str, SizeType len, bool /*copy*/) {
|
||||
current_key.assign(str, len);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool String(const char *str, SizeType len, bool /*copy*/) {
|
||||
if (building_city && current_key == "name") {
|
||||
current_city.name.assign(str, len);
|
||||
} else if (in_state_object && current_key == "name") {
|
||||
state_info[0].assign(str, len);
|
||||
} else if (in_state_object && current_key == "iso2") {
|
||||
state_info[1].assign(str, len);
|
||||
} else if (in_country_object && current_key == "name") {
|
||||
country_info[0].assign(str, len);
|
||||
} else if (in_country_object && current_key == "iso2") {
|
||||
country_info[1].assign(str, len);
|
||||
} else if (in_country_object && current_key == "iso3") {
|
||||
country_info[2].assign(str, len);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool Int(int i) {
|
||||
if (building_city && current_key == "id") {
|
||||
current_city.id = i;
|
||||
} else if (in_state_object && current_key == "id") {
|
||||
current_state_id = i;
|
||||
} else if (in_country_object && current_key == "id") {
|
||||
current_country_id = i;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool Uint(unsigned i) { return Int(static_cast<int>(i)); }
|
||||
|
||||
bool Int64(int64_t i) { return Int(static_cast<int>(i)); }
|
||||
|
||||
bool Uint64(uint64_t i) { return Int(static_cast<int>(i)); }
|
||||
|
||||
bool Double(double d) {
|
||||
if (building_city) {
|
||||
if (current_key == "latitude") {
|
||||
current_city.latitude = d;
|
||||
} else if (current_key == "longitude") {
|
||||
current_city.longitude = d;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool Bool(bool /*b*/) { return true; }
|
||||
bool Null() { return true; }
|
||||
|
||||
private:
|
||||
ParseContext &context;
|
||||
|
||||
int depth = 0;
|
||||
bool in_countries_array = false;
|
||||
bool in_country_object = false;
|
||||
bool in_states_array = false;
|
||||
bool in_state_object = false;
|
||||
bool in_cities_array = false;
|
||||
bool building_city = false;
|
||||
|
||||
int current_country_id = 0;
|
||||
int current_state_id = 0;
|
||||
CityRecord current_city = {};
|
||||
std::string current_key;
|
||||
|
||||
std::string country_info[3];
|
||||
std::string state_info[2];
|
||||
};
|
||||
|
||||
void StreamingJsonParser::Parse(
|
||||
const std::string &filePath, SqliteDatabase &db,
|
||||
std::function<void(const CityRecord &)> onCity,
|
||||
std::function<void(size_t, size_t)> onProgress) {
|
||||
|
||||
spdlog::info(" Streaming parse of {}...", filePath);
|
||||
|
||||
std::ifstream file(filePath, std::ios::binary);
|
||||
if (!file.is_open()) {
|
||||
throw std::runtime_error("Failed to open JSON file: " + filePath);
|
||||
}
|
||||
|
||||
std::stringstream buffer;
|
||||
buffer << file.rdbuf();
|
||||
file.close();
|
||||
std::string json_str = buffer.str();
|
||||
size_t total_size = json_str.length();
|
||||
|
||||
CityRecordHandler::ParseContext ctx{&db, onCity, onProgress, 0,
|
||||
total_size, 0, 0};
|
||||
CityRecordHandler handler(ctx);
|
||||
|
||||
Reader reader;
|
||||
StringStream ss(json_str.c_str());
|
||||
|
||||
if (!reader.Parse(ss, handler)) {
|
||||
ParseErrorCode errCode = reader.GetParseErrorCode();
|
||||
size_t errOffset = reader.GetErrorOffset();
|
||||
throw std::runtime_error(std::string("JSON parse error at offset ") +
|
||||
std::to_string(errOffset) +
|
||||
" (code: " + std::to_string(errCode) + ")");
|
||||
}
|
||||
|
||||
spdlog::info(" OK: Parsed {} countries, {} states, {} cities",
|
||||
ctx.countries_inserted, ctx.states_inserted, ctx.cities_emitted);
|
||||
}
|
||||
Reference in New Issue
Block a user