mirror of
https://github.com/aaronpo97/the-biergarten-app.git
synced 2026-06-01 01:54:00 +00:00
replace SQLite geo pipeline with curated in-memory locations
This commit is contained in:
@@ -4,20 +4,21 @@
|
||||
|
||||
#include <algorithm>
|
||||
#include <filesystem>
|
||||
#include <unordered_map>
|
||||
#include <future>
|
||||
#include <iterator>
|
||||
#include <random>
|
||||
|
||||
#include "data_generation/data_downloader.h"
|
||||
#include "data_generation/llama_generator.h"
|
||||
#include "data_generation/mock_generator.h"
|
||||
#include "json_handling/json_loader.h"
|
||||
#include "wikipedia/wikipedia_service.h"
|
||||
|
||||
BiergartenDataGenerator::BiergartenDataGenerator(
|
||||
const ApplicationOptions& options, std::shared_ptr<WebClient> web_client,
|
||||
SqliteDatabase& database)
|
||||
: options_(options), webClient_(web_client), database_(database) {}
|
||||
const ApplicationOptions& options, std::shared_ptr<WebClient> web_client)
|
||||
: options_(options), webClient_(std::move(web_client)) {}
|
||||
|
||||
std::unique_ptr<DataGenerator> BiergartenDataGenerator::InitializeGenerator() {
|
||||
auto BiergartenDataGenerator::InitializeGenerator()
|
||||
-> std::unique_ptr<DataGenerator> {
|
||||
spdlog::info("Initializing brewery generator...");
|
||||
|
||||
std::unique_ptr<DataGenerator> generator;
|
||||
@@ -41,75 +42,60 @@ std::unique_ptr<DataGenerator> BiergartenDataGenerator::InitializeGenerator() {
|
||||
return generator;
|
||||
}
|
||||
|
||||
void BiergartenDataGenerator::LoadGeographicData() {
|
||||
std::string json_path = options_.cache_dir + "/countries+states+cities.json";
|
||||
std::string db_path = options_.cache_dir + "/biergarten-pipeline.db";
|
||||
|
||||
bool has_json_cache = std::filesystem::exists(json_path);
|
||||
bool has_db_cache = std::filesystem::exists(db_path);
|
||||
|
||||
spdlog::info("Initializing SQLite database at {}...", db_path);
|
||||
database_.Initialize(db_path);
|
||||
|
||||
if (has_db_cache && has_json_cache) {
|
||||
spdlog::info("[Pipeline] Cache hit: skipping download and parse");
|
||||
} else {
|
||||
spdlog::info("\n[Pipeline] Downloading geographic data from GitHub...");
|
||||
DataDownloader downloader(webClient_);
|
||||
downloader.DownloadCountriesDatabase(json_path, options_.commit);
|
||||
|
||||
JsonLoader::LoadWorldCities(json_path, database_);
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::pair<City, std::string>>
|
||||
BiergartenDataGenerator::QueryCitiesWithCountries() {
|
||||
auto BiergartenDataGenerator::QueryCitiesWithCountries()
|
||||
-> std::vector<Location> {
|
||||
spdlog::info("\n=== GEOGRAPHIC DATA OVERVIEW ===");
|
||||
|
||||
auto cities = database_.QueryCities();
|
||||
|
||||
// Build a quick map of country id -> name for per-city lookups.
|
||||
auto all_countries = database_.QueryCountries(0);
|
||||
std::unordered_map<int, std::string> country_map;
|
||||
for (const auto& c : all_countries) {
|
||||
country_map[c.id] = c.name;
|
||||
}
|
||||
|
||||
spdlog::info("\nTotal records loaded:");
|
||||
spdlog::info(" Countries: {}", database_.QueryCountries(0).size());
|
||||
spdlog::info(" States: {}", database_.QueryStates(0).size());
|
||||
spdlog::info(" Cities: {}", cities.size());
|
||||
|
||||
// Cap at 30 entries.
|
||||
const size_t sample_count = std::min(size_t(30), cities.size());
|
||||
std::vector<std::pair<City, std::string>> result;
|
||||
|
||||
for (size_t i = 0; i < sample_count; i++) {
|
||||
const auto& city = cities[i];
|
||||
std::string country_name;
|
||||
const auto country_it = country_map.find(city.country_id);
|
||||
if (country_it != country_map.end()) {
|
||||
country_name = country_it->second;
|
||||
std::filesystem::path locations_path = "locations.json";
|
||||
if (!std::filesystem::exists(locations_path)) {
|
||||
const std::filesystem::path cache_path =
|
||||
std::filesystem::path(options_.cache_dir) / "locations.json";
|
||||
if (std::filesystem::exists(cache_path)) {
|
||||
locations_path = cache_path;
|
||||
}
|
||||
result.push_back({city, country_name});
|
||||
}
|
||||
|
||||
return result;
|
||||
auto all_locations = JsonLoader::LoadLocations(locations_path.string());
|
||||
spdlog::info(" Locations available: {}", all_locations.size());
|
||||
|
||||
const size_t sample_count = std::min<size_t>(30, all_locations.size());
|
||||
std::vector<Location> sampled_locations;
|
||||
sampled_locations.reserve(sample_count);
|
||||
|
||||
std::random_device random_generator;
|
||||
std::sample(all_locations.begin(), all_locations.end(),
|
||||
std::back_inserter(sampled_locations), sample_count,
|
||||
random_generator);
|
||||
|
||||
spdlog::info(" Sampled locations: {}", sampled_locations.size());
|
||||
return sampled_locations;
|
||||
}
|
||||
|
||||
std::vector<BiergartenDataGenerator::EnrichedCity>
|
||||
BiergartenDataGenerator::EnrichWithWikipedia(
|
||||
const std::vector<std::pair<City, std::string>>& cities) {
|
||||
WikipediaService wikipedia_service(webClient_);
|
||||
auto BiergartenDataGenerator::EnrichWithWikipedia(
|
||||
const std::vector<Location>& cities) -> std::vector<EnrichedCity> {
|
||||
std::vector<EnrichedCity> enriched;
|
||||
enriched.reserve(cities.size());
|
||||
|
||||
for (const auto& [city, country_name] : cities) {
|
||||
const std::string region_context =
|
||||
wikipedia_service.GetSummary(city.name, country_name);
|
||||
spdlog::debug("[Pipeline] Region context for {}: {}", city.name,
|
||||
region_context);
|
||||
std::vector<std::future<EnrichedCity>> pending;
|
||||
pending.reserve(cities.size());
|
||||
|
||||
enriched.push_back({city.id, city.name, country_name, region_context});
|
||||
for (const auto& city : cities) {
|
||||
pending.push_back(std::async(std::launch::async,
|
||||
[web_client = webClient_, city]() {
|
||||
WikipediaService wikipedia_service(
|
||||
web_client);
|
||||
const std::string region_context =
|
||||
wikipedia_service.GetSummary(
|
||||
city.city, city.country);
|
||||
spdlog::debug(
|
||||
"[Pipeline] Region context for {}: {}",
|
||||
city.city, region_context);
|
||||
return EnrichedCity{city, region_context};
|
||||
}));
|
||||
}
|
||||
|
||||
for (auto& task : pending) {
|
||||
enriched.push_back(task.get());
|
||||
}
|
||||
|
||||
return enriched;
|
||||
@@ -121,28 +107,30 @@ void BiergartenDataGenerator::GenerateBreweries(
|
||||
generatedBreweries_.clear();
|
||||
|
||||
for (const auto& enriched_city : cities) {
|
||||
auto brewery = generator.GenerateBrewery(enriched_city.city_name,
|
||||
enriched_city.country_name,
|
||||
auto brewery = generator.GenerateBrewery(enriched_city.location.city,
|
||||
enriched_city.location.country,
|
||||
enriched_city.region_context);
|
||||
generatedBreweries_.push_back(
|
||||
{enriched_city.city_id, enriched_city.city_name, brewery});
|
||||
generatedBreweries_.push_back({enriched_city.location, brewery});
|
||||
}
|
||||
}
|
||||
|
||||
void BiergartenDataGenerator::LogResults() const {
|
||||
spdlog::info("\n=== GENERATED DATA DUMP ===");
|
||||
for (size_t i = 0; i < generatedBreweries_.size(); i++) {
|
||||
const auto& entry = generatedBreweries_[i];
|
||||
spdlog::info("{}. city_id={} city=\"{}\"", i + 1, entry.city_id,
|
||||
entry.city_name);
|
||||
size_t index = 1;
|
||||
for (const auto& entry : generatedBreweries_) {
|
||||
spdlog::info("{}. city=\"{}\" country=\"{}\" state=\"{}\" "
|
||||
"iso3166_2={} lat={} lon={}",
|
||||
index, entry.location.city, entry.location.country,
|
||||
entry.location.state_province, entry.location.iso3166_2,
|
||||
entry.location.latitude, entry.location.longitude);
|
||||
spdlog::info(" brewery_name=\"{}\"", entry.brewery.name);
|
||||
spdlog::info(" brewery_description=\"{}\"", entry.brewery.description);
|
||||
++index;
|
||||
}
|
||||
}
|
||||
|
||||
int BiergartenDataGenerator::Run() {
|
||||
auto BiergartenDataGenerator::Run() -> int {
|
||||
try {
|
||||
LoadGeographicData();
|
||||
auto generator = InitializeGenerator();
|
||||
auto cities = QueryCitiesWithCountries();
|
||||
auto enriched = EnrichWithWikipedia(cities);
|
||||
|
||||
@@ -1,44 +0,0 @@
|
||||
#include "data_generation/data_downloader.h"
|
||||
|
||||
#include <spdlog/spdlog.h>
|
||||
|
||||
#include <filesystem>
|
||||
#include <fstream>
|
||||
#include <sstream>
|
||||
#include <stdexcept>
|
||||
|
||||
#include "web_client/web_client.h"
|
||||
|
||||
// Takes shared ownership of the WebClient that DownloadCountriesDatabase
// later uses to fetch files; the pointer is moved into web_client_.
DataDownloader::DataDownloader(std::shared_ptr<WebClient> web_client)
|
||||
: web_client_(std::move(web_client)) {}
|
||||
|
||||
// No explicit cleanup is performed here; members release themselves.
DataDownloader::~DataDownloader() = default;
|
||||
|
||||
// Reports whether something exists at `file_path` on the filesystem.
bool DataDownloader::FileExists(const std::string& file_path) {
  const std::filesystem::path candidate{file_path};
  return std::filesystem::exists(candidate);
}
|
||||
|
||||
std::string DataDownloader::DownloadCountriesDatabase(
|
||||
const std::string& cache_path, const std::string& commit) {
|
||||
if (FileExists(cache_path)) {
|
||||
spdlog::info("[DataDownloader] Cache hit: {}", cache_path);
|
||||
return cache_path;
|
||||
}
|
||||
|
||||
std::string url =
|
||||
"https://raw.githubusercontent.com/dr5hn/"
|
||||
"countries-states-cities-database/" +
|
||||
commit + "/json/countries+states+cities.json";
|
||||
|
||||
spdlog::info("[DataDownloader] Downloading: {}", url);
|
||||
|
||||
web_client_->DownloadToFile(url, cache_path);
|
||||
|
||||
std::ifstream file_check(cache_path, std::ios::binary | std::ios::ate);
|
||||
std::streamsize size = file_check.tellg();
|
||||
file_check.close();
|
||||
|
||||
spdlog::info("[DataDownloader] OK: Download complete: {} ({:.2f} MB)",
|
||||
cache_path, (size / (1024.0 * 1024.0)));
|
||||
return cache_path;
|
||||
}
|
||||
@@ -1,264 +0,0 @@
|
||||
#include "database/database.h"
|
||||
|
||||
#include <spdlog/spdlog.h>
|
||||
|
||||
#include <stdexcept>
|
||||
|
||||
void SqliteDatabase::InitializeSchema() {
|
||||
std::lock_guard<std::mutex> lock(db_mutex_);
|
||||
|
||||
const char* schema = R"(
|
||||
CREATE TABLE IF NOT EXISTS countries (
|
||||
id INTEGER PRIMARY KEY,
|
||||
name TEXT NOT NULL,
|
||||
iso2 TEXT,
|
||||
iso3 TEXT
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS states (
|
||||
id INTEGER PRIMARY KEY,
|
||||
country_id INTEGER NOT NULL,
|
||||
name TEXT NOT NULL,
|
||||
iso2 TEXT,
|
||||
FOREIGN KEY(country_id) REFERENCES countries(id)
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS cities (
|
||||
id INTEGER PRIMARY KEY,
|
||||
state_id INTEGER NOT NULL,
|
||||
country_id INTEGER NOT NULL,
|
||||
name TEXT NOT NULL,
|
||||
latitude REAL,
|
||||
longitude REAL,
|
||||
FOREIGN KEY(state_id) REFERENCES states(id),
|
||||
FOREIGN KEY(country_id) REFERENCES countries(id)
|
||||
);
|
||||
)";
|
||||
|
||||
char* errMsg = nullptr;
|
||||
int rc = sqlite3_exec(db_, schema, nullptr, nullptr, &errMsg);
|
||||
if (rc != SQLITE_OK) {
|
||||
std::string error = errMsg ? std::string(errMsg) : "Unknown error";
|
||||
sqlite3_free(errMsg);
|
||||
throw std::runtime_error("Failed to create schema: " + error);
|
||||
}
|
||||
}
|
||||
|
||||
// Closes the SQLite connection if one was ever opened.
SqliteDatabase::~SqliteDatabase() {
  if (!db_) {
    return;
  }
  sqlite3_close(db_);
}
|
||||
|
||||
void SqliteDatabase::Initialize(const std::string& db_path) {
|
||||
int rc = sqlite3_open(db_path.c_str(), &db_);
|
||||
if (rc) {
|
||||
throw std::runtime_error("Failed to open SQLite database: " + db_path);
|
||||
}
|
||||
spdlog::info("OK: SQLite database opened: {}", db_path);
|
||||
InitializeSchema();
|
||||
}
|
||||
|
||||
void SqliteDatabase::BeginTransaction() {
|
||||
std::lock_guard<std::mutex> lock(db_mutex_);
|
||||
char* err = nullptr;
|
||||
if (sqlite3_exec(db_, "BEGIN TRANSACTION", nullptr, nullptr, &err) !=
|
||||
SQLITE_OK) {
|
||||
std::string msg = err ? err : "unknown";
|
||||
sqlite3_free(err);
|
||||
throw std::runtime_error("BeginTransaction failed: " + msg);
|
||||
}
|
||||
}
|
||||
|
||||
void SqliteDatabase::CommitTransaction() {
|
||||
std::lock_guard<std::mutex> lock(db_mutex_);
|
||||
char* err = nullptr;
|
||||
if (sqlite3_exec(db_, "COMMIT", nullptr, nullptr, &err) != SQLITE_OK) {
|
||||
std::string msg = err ? err : "unknown";
|
||||
sqlite3_free(err);
|
||||
throw std::runtime_error("CommitTransaction failed: " + msg);
|
||||
}
|
||||
}
|
||||
|
||||
void SqliteDatabase::RollbackTransaction() {
|
||||
std::lock_guard<std::mutex> lock(db_mutex_);
|
||||
char* err = nullptr;
|
||||
if (sqlite3_exec(db_, "ROLLBACK", nullptr, nullptr, &err) != SQLITE_OK) {
|
||||
std::string msg = err ? err : "unknown";
|
||||
sqlite3_free(err);
|
||||
throw std::runtime_error("RollbackTransaction failed: " + msg);
|
||||
}
|
||||
}
|
||||
|
||||
void SqliteDatabase::InsertCountry(int id, const std::string& name,
|
||||
const std::string& iso2,
|
||||
const std::string& iso3) {
|
||||
std::lock_guard<std::mutex> lock(db_mutex_);
|
||||
|
||||
const char* query = R"(
|
||||
INSERT OR IGNORE INTO countries (id, name, iso2, iso3)
|
||||
VALUES (?, ?, ?, ?)
|
||||
)";
|
||||
|
||||
sqlite3_stmt* stmt;
|
||||
int rc = sqlite3_prepare_v2(db_, query, -1, &stmt, nullptr);
|
||||
if (rc != SQLITE_OK)
|
||||
throw std::runtime_error("Failed to prepare country insert");
|
||||
|
||||
sqlite3_bind_int(stmt, 1, id);
|
||||
sqlite3_bind_text(stmt, 2, name.c_str(), -1, SQLITE_TRANSIENT);
|
||||
sqlite3_bind_text(stmt, 3, iso2.c_str(), -1, SQLITE_TRANSIENT);
|
||||
sqlite3_bind_text(stmt, 4, iso3.c_str(), -1, SQLITE_TRANSIENT);
|
||||
|
||||
if (sqlite3_step(stmt) != SQLITE_DONE) {
|
||||
throw std::runtime_error("Failed to insert country");
|
||||
}
|
||||
sqlite3_finalize(stmt);
|
||||
}
|
||||
|
||||
void SqliteDatabase::InsertState(int id, int country_id,
|
||||
const std::string& name,
|
||||
const std::string& iso2) {
|
||||
std::lock_guard<std::mutex> lock(db_mutex_);
|
||||
|
||||
const char* query = R"(
|
||||
INSERT OR IGNORE INTO states (id, country_id, name, iso2)
|
||||
VALUES (?, ?, ?, ?)
|
||||
)";
|
||||
|
||||
sqlite3_stmt* stmt;
|
||||
int rc = sqlite3_prepare_v2(db_, query, -1, &stmt, nullptr);
|
||||
if (rc != SQLITE_OK)
|
||||
throw std::runtime_error("Failed to prepare state insert");
|
||||
|
||||
sqlite3_bind_int(stmt, 1, id);
|
||||
sqlite3_bind_int(stmt, 2, country_id);
|
||||
sqlite3_bind_text(stmt, 3, name.c_str(), -1, SQLITE_TRANSIENT);
|
||||
sqlite3_bind_text(stmt, 4, iso2.c_str(), -1, SQLITE_TRANSIENT);
|
||||
|
||||
if (sqlite3_step(stmt) != SQLITE_DONE) {
|
||||
throw std::runtime_error("Failed to insert state");
|
||||
}
|
||||
sqlite3_finalize(stmt);
|
||||
}
|
||||
|
||||
void SqliteDatabase::InsertCity(int id, int state_id, int country_id,
|
||||
const std::string& name, double latitude,
|
||||
double longitude) {
|
||||
std::lock_guard<std::mutex> lock(db_mutex_);
|
||||
|
||||
const char* query = R"(
|
||||
INSERT OR IGNORE INTO cities (id, state_id, country_id, name, latitude, longitude)
|
||||
VALUES (?, ?, ?, ?, ?, ?)
|
||||
)";
|
||||
|
||||
sqlite3_stmt* stmt;
|
||||
int rc = sqlite3_prepare_v2(db_, query, -1, &stmt, nullptr);
|
||||
if (rc != SQLITE_OK)
|
||||
throw std::runtime_error("Failed to prepare city insert");
|
||||
|
||||
sqlite3_bind_int(stmt, 1, id);
|
||||
sqlite3_bind_int(stmt, 2, state_id);
|
||||
sqlite3_bind_int(stmt, 3, country_id);
|
||||
sqlite3_bind_text(stmt, 4, name.c_str(), -1, SQLITE_TRANSIENT);
|
||||
sqlite3_bind_double(stmt, 5, latitude);
|
||||
sqlite3_bind_double(stmt, 6, longitude);
|
||||
|
||||
if (sqlite3_step(stmt) != SQLITE_DONE) {
|
||||
throw std::runtime_error("Failed to insert city");
|
||||
}
|
||||
sqlite3_finalize(stmt);
|
||||
}
|
||||
|
||||
std::vector<City> SqliteDatabase::QueryCities() {
|
||||
std::lock_guard<std::mutex> lock(db_mutex_);
|
||||
std::vector<City> cities;
|
||||
sqlite3_stmt* stmt = nullptr;
|
||||
|
||||
const char* query =
|
||||
"SELECT id, name, country_id FROM cities ORDER BY RANDOM()";
|
||||
int rc = sqlite3_prepare_v2(db_, query, -1, &stmt, nullptr);
|
||||
|
||||
if (rc != SQLITE_OK) {
|
||||
throw std::runtime_error("Failed to prepare query");
|
||||
}
|
||||
|
||||
while (sqlite3_step(stmt) == SQLITE_ROW) {
|
||||
int id = sqlite3_column_int(stmt, 0);
|
||||
const char* name =
|
||||
reinterpret_cast<const char*>(sqlite3_column_text(stmt, 1));
|
||||
int country_id = sqlite3_column_int(stmt, 2);
|
||||
cities.push_back({id, name ? std::string(name) : "", country_id});
|
||||
}
|
||||
|
||||
sqlite3_finalize(stmt);
|
||||
return cities;
|
||||
}
|
||||
|
||||
std::vector<Country> SqliteDatabase::QueryCountries(int limit) {
|
||||
std::lock_guard<std::mutex> lock(db_mutex_);
|
||||
|
||||
std::vector<Country> countries;
|
||||
sqlite3_stmt* stmt = nullptr;
|
||||
|
||||
std::string query =
|
||||
"SELECT id, name, iso2, iso3 FROM countries ORDER BY name";
|
||||
if (limit > 0) {
|
||||
query += " LIMIT " + std::to_string(limit);
|
||||
}
|
||||
|
||||
int rc = sqlite3_prepare_v2(db_, query.c_str(), -1, &stmt, nullptr);
|
||||
|
||||
if (rc != SQLITE_OK) {
|
||||
throw std::runtime_error("Failed to prepare countries query");
|
||||
}
|
||||
|
||||
while (sqlite3_step(stmt) == SQLITE_ROW) {
|
||||
int id = sqlite3_column_int(stmt, 0);
|
||||
const char* name =
|
||||
reinterpret_cast<const char*>(sqlite3_column_text(stmt, 1));
|
||||
const char* iso2 =
|
||||
reinterpret_cast<const char*>(sqlite3_column_text(stmt, 2));
|
||||
const char* iso3 =
|
||||
reinterpret_cast<const char*>(sqlite3_column_text(stmt, 3));
|
||||
countries.push_back({id, name ? std::string(name) : "",
|
||||
iso2 ? std::string(iso2) : "",
|
||||
iso3 ? std::string(iso3) : ""});
|
||||
}
|
||||
|
||||
sqlite3_finalize(stmt);
|
||||
return countries;
|
||||
}
|
||||
|
||||
std::vector<State> SqliteDatabase::QueryStates(int limit) {
|
||||
std::lock_guard<std::mutex> lock(db_mutex_);
|
||||
|
||||
std::vector<State> states;
|
||||
sqlite3_stmt* stmt = nullptr;
|
||||
|
||||
std::string query =
|
||||
"SELECT id, name, iso2, country_id FROM states ORDER BY name";
|
||||
if (limit > 0) {
|
||||
query += " LIMIT " + std::to_string(limit);
|
||||
}
|
||||
|
||||
int rc = sqlite3_prepare_v2(db_, query.c_str(), -1, &stmt, nullptr);
|
||||
|
||||
if (rc != SQLITE_OK) {
|
||||
throw std::runtime_error("Failed to prepare states query");
|
||||
}
|
||||
|
||||
while (sqlite3_step(stmt) == SQLITE_ROW) {
|
||||
int id = sqlite3_column_int(stmt, 0);
|
||||
const char* name =
|
||||
reinterpret_cast<const char*>(sqlite3_column_text(stmt, 1));
|
||||
const char* iso2 =
|
||||
reinterpret_cast<const char*>(sqlite3_column_text(stmt, 2));
|
||||
int country_id = sqlite3_column_int(stmt, 3);
|
||||
states.push_back({id, name ? std::string(name) : "",
|
||||
iso2 ? std::string(iso2) : "", country_id});
|
||||
}
|
||||
|
||||
sqlite3_finalize(stmt);
|
||||
return states;
|
||||
}
|
||||
@@ -2,66 +2,82 @@
|
||||
|
||||
#include <spdlog/spdlog.h>
|
||||
|
||||
#include <chrono>
|
||||
#include <boost/json.hpp>
|
||||
|
||||
#include "json_handling/stream_parser.h"
|
||||
#include <fstream>
|
||||
#include <sstream>
|
||||
#include <stdexcept>
|
||||
|
||||
void JsonLoader::LoadWorldCities(const std::string& json_path,
|
||||
SqliteDatabase& db) {
|
||||
constexpr size_t kBatchSize = 10000;
|
||||
namespace {
|
||||
|
||||
auto startTime = std::chrono::high_resolution_clock::now();
|
||||
spdlog::info("\nLoading {} (streaming Boost.JSON SAX)...", json_path);
|
||||
// Fetches the string stored under `key` in `object`.
//
// Throws std::runtime_error naming the key when the field is absent or is not
// a JSON string.
auto ReadRequiredString(const boost::json::object& object,
                        const char* key) -> std::string {
  const boost::json::value* value = object.if_contains(key);
  if (value == nullptr || !value->is_string()) {
    throw std::runtime_error(std::string("Missing or invalid string field: ") +
                             key);
  }
  // Copy by explicit length: going through c_str() would stop at an embedded
  // NUL and needs an extra scan to find the end of the text.
  const boost::json::string& text = value->as_string();
  return std::string(text.data(), text.size());
}
|
||||
|
||||
db.BeginTransaction();
|
||||
bool transactionOpen = true;
|
||||
// Fetches the numeric field stored under `key` in `object` as a double.
//
// Throws std::runtime_error naming the key when the field is absent or is not
// a JSON number.
auto ReadRequiredNumber(const boost::json::object& object, const char* key)
    -> double {
  const boost::json::value* const field = object.if_contains(key);
  const bool usable = field != nullptr && field->is_number();
  if (!usable) {
    throw std::runtime_error(std::string("Missing or invalid numeric field: ") +
                             key);
  }
  return field->to_number<double>();
}
|
||||
|
||||
size_t citiesProcessed = 0;
|
||||
try {
|
||||
StreamingJsonParser::Parse(
|
||||
json_path, db,
|
||||
[&](const CityRecord& record) {
|
||||
db.InsertCity(record.id, record.state_id, record.country_id,
|
||||
record.name, record.latitude, record.longitude);
|
||||
++citiesProcessed;
|
||||
} // namespace
|
||||
|
||||
if (citiesProcessed % kBatchSize == 0) {
|
||||
db.CommitTransaction();
|
||||
db.BeginTransaction();
|
||||
}
|
||||
},
|
||||
[&](size_t current, size_t /*total*/) {
|
||||
if (current % kBatchSize == 0 && current > 0) {
|
||||
spdlog::info(" [Progress] Parsed {} cities...", current);
|
||||
}
|
||||
});
|
||||
|
||||
spdlog::info(" OK: Parsed all cities from JSON");
|
||||
|
||||
if (transactionOpen) {
|
||||
db.CommitTransaction();
|
||||
transactionOpen = false;
|
||||
}
|
||||
} catch (...) {
|
||||
if (transactionOpen) {
|
||||
db.RollbackTransaction();
|
||||
transactionOpen = false;
|
||||
}
|
||||
throw;
|
||||
auto JsonLoader::LoadLocations(const std::string& filepath)
|
||||
-> std::vector<Location> {
|
||||
std::ifstream input(filepath);
|
||||
if (!input.is_open()) {
|
||||
throw std::runtime_error("Failed to open locations file: " + filepath);
|
||||
}
|
||||
|
||||
auto endTime = std::chrono::high_resolution_clock::now();
|
||||
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(
|
||||
endTime - startTime);
|
||||
std::stringstream buffer;
|
||||
buffer << input.rdbuf();
|
||||
const std::string content = buffer.str();
|
||||
|
||||
spdlog::info("\n=== World City Data Loading Summary ===\n");
|
||||
spdlog::info("Cities inserted: {}", citiesProcessed);
|
||||
spdlog::info("Elapsed time: {} ms", duration.count());
|
||||
long long throughput =
|
||||
(citiesProcessed > 0 && duration.count() > 0)
|
||||
? (1000LL * static_cast<long long>(citiesProcessed)) /
|
||||
static_cast<long long>(duration.count())
|
||||
: 0LL;
|
||||
spdlog::info("Throughput: {} cities/sec", throughput);
|
||||
spdlog::info("=======================================\n");
|
||||
boost::json::error_code error;
|
||||
boost::json::value root = boost::json::parse(content, error);
|
||||
if (error) {
|
||||
throw std::runtime_error("Failed to parse locations JSON: " +
|
||||
error.message());
|
||||
}
|
||||
|
||||
if (!root.is_array()) {
|
||||
throw std::runtime_error(
|
||||
"Invalid locations JSON: root element must be an array");
|
||||
}
|
||||
|
||||
std::vector<Location> locations;
|
||||
const auto& items = root.as_array();
|
||||
locations.reserve(items.size());
|
||||
|
||||
for (const auto& item : items) {
|
||||
if (!item.is_object()) {
|
||||
throw std::runtime_error(
|
||||
"Invalid locations JSON: each entry must be an object");
|
||||
}
|
||||
|
||||
const auto& object = item.as_object();
|
||||
locations.push_back(Location{
|
||||
.city = ReadRequiredString(object, "city"),
|
||||
.state_province = ReadRequiredString(object, "state_province"),
|
||||
.iso3166_2 = ReadRequiredString(object, "iso3166_2"),
|
||||
.country = ReadRequiredString(object, "country"),
|
||||
.iso3166_1 = ReadRequiredString(object, "iso3166_1"),
|
||||
.latitude = ReadRequiredNumber(object, "latitude"),
|
||||
.longitude = ReadRequiredNumber(object, "longitude"),
|
||||
});
|
||||
}
|
||||
|
||||
spdlog::info("[JsonLoader] Loaded {} locations from {}", locations.size(),
|
||||
filepath);
|
||||
return locations;
|
||||
}
|
||||
|
||||
@@ -1,289 +0,0 @@
|
||||
#include "json_handling/stream_parser.h"
|
||||
|
||||
#include <spdlog/spdlog.h>
|
||||
|
||||
#include <boost/json.hpp>
|
||||
#include <boost/json/basic_parser_impl.hpp>
|
||||
#include <cstdio>
|
||||
#include <stdexcept>
|
||||
|
||||
#include "database/database.h"
|
||||
|
||||
class CityRecordHandler {
|
||||
friend class boost::json::basic_parser<CityRecordHandler>;
|
||||
|
||||
public:
|
||||
static constexpr std::size_t max_array_size = static_cast<std::size_t>(-1);
|
||||
static constexpr std::size_t max_object_size = static_cast<std::size_t>(-1);
|
||||
static constexpr std::size_t max_string_size = static_cast<std::size_t>(-1);
|
||||
static constexpr std::size_t max_key_size = static_cast<std::size_t>(-1);
|
||||
|
||||
struct ParseContext {
|
||||
SqliteDatabase* db = nullptr;
|
||||
std::function<void(const CityRecord&)> on_city;
|
||||
std::function<void(size_t, size_t)> on_progress;
|
||||
size_t cities_emitted = 0;
|
||||
size_t total_file_size = 0;
|
||||
int countries_inserted = 0;
|
||||
int states_inserted = 0;
|
||||
};
|
||||
|
||||
explicit CityRecordHandler(ParseContext& ctx) : context(ctx) {}
|
||||
|
||||
private:
|
||||
ParseContext& context;
|
||||
|
||||
int depth = 0;
|
||||
bool in_countries_array = false;
|
||||
bool in_country_object = false;
|
||||
bool in_states_array = false;
|
||||
bool in_state_object = false;
|
||||
bool in_cities_array = false;
|
||||
bool building_city = false;
|
||||
|
||||
int current_country_id = 0;
|
||||
int current_state_id = 0;
|
||||
CityRecord current_city = {};
|
||||
std::string current_key;
|
||||
std::string current_key_val;
|
||||
std::string current_string_val;
|
||||
|
||||
std::string country_info[3];
|
||||
std::string state_info[2];
|
||||
|
||||
// Boost.JSON SAX Hooks
|
||||
bool on_document_begin(boost::system::error_code&) { return true; }
|
||||
bool on_document_end(boost::system::error_code&) { return true; }
|
||||
|
||||
bool on_array_begin(boost::system::error_code&) {
|
||||
depth++;
|
||||
if (depth == 1) {
|
||||
in_countries_array = true;
|
||||
} else if (depth == 3 && current_key == "states") {
|
||||
in_states_array = true;
|
||||
} else if (depth == 5 && current_key == "cities") {
|
||||
in_cities_array = true;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool on_array_end(std::size_t, boost::system::error_code&) {
|
||||
if (depth == 1) {
|
||||
in_countries_array = false;
|
||||
} else if (depth == 3) {
|
||||
in_states_array = false;
|
||||
} else if (depth == 5) {
|
||||
in_cities_array = false;
|
||||
}
|
||||
depth--;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool on_object_begin(boost::system::error_code&) {
|
||||
depth++;
|
||||
if (depth == 2 && in_countries_array) {
|
||||
in_country_object = true;
|
||||
current_country_id = 0;
|
||||
country_info[0].clear();
|
||||
country_info[1].clear();
|
||||
country_info[2].clear();
|
||||
} else if (depth == 4 && in_states_array) {
|
||||
in_state_object = true;
|
||||
current_state_id = 0;
|
||||
state_info[0].clear();
|
||||
state_info[1].clear();
|
||||
} else if (depth == 6 && in_cities_array) {
|
||||
building_city = true;
|
||||
current_city = {};
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool on_object_end(std::size_t, boost::system::error_code&) {
|
||||
if (depth == 6 && building_city) {
|
||||
if (current_city.id > 0 && current_state_id > 0 &&
|
||||
current_country_id > 0) {
|
||||
current_city.state_id = current_state_id;
|
||||
current_city.country_id = current_country_id;
|
||||
|
||||
try {
|
||||
context.on_city(current_city);
|
||||
context.cities_emitted++;
|
||||
|
||||
if (context.on_progress && context.cities_emitted % 10000 == 0) {
|
||||
context.on_progress(context.cities_emitted,
|
||||
context.total_file_size);
|
||||
}
|
||||
} catch (const std::exception& e) {
|
||||
spdlog::warn("Record parsing failed: {}", e.what());
|
||||
}
|
||||
}
|
||||
building_city = false;
|
||||
} else if (depth == 4 && in_state_object) {
|
||||
if (current_state_id > 0 && current_country_id > 0) {
|
||||
try {
|
||||
context.db->InsertState(current_state_id, current_country_id,
|
||||
state_info[0], state_info[1]);
|
||||
context.states_inserted++;
|
||||
} catch (const std::exception& e) {
|
||||
spdlog::warn("Record parsing failed: {}", e.what());
|
||||
}
|
||||
}
|
||||
in_state_object = false;
|
||||
} else if (depth == 2 && in_country_object) {
|
||||
if (current_country_id > 0) {
|
||||
try {
|
||||
context.db->InsertCountry(current_country_id, country_info[0],
|
||||
country_info[1], country_info[2]);
|
||||
context.countries_inserted++;
|
||||
} catch (const std::exception& e) {
|
||||
spdlog::warn("Record parsing failed: {}", e.what());
|
||||
}
|
||||
}
|
||||
in_country_object = false;
|
||||
}
|
||||
|
||||
depth--;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool on_key_part(boost::json::string_view s, std::size_t,
|
||||
boost::system::error_code&) {
|
||||
current_key_val.append(s.data(), s.size());
|
||||
return true;
|
||||
}
|
||||
|
||||
bool on_key(boost::json::string_view s, std::size_t,
|
||||
boost::system::error_code&) {
|
||||
current_key_val.append(s.data(), s.size());
|
||||
current_key = current_key_val;
|
||||
current_key_val.clear();
|
||||
return true;
|
||||
}
|
||||
|
||||
bool on_string_part(boost::json::string_view s, std::size_t,
|
||||
boost::system::error_code&) {
|
||||
current_string_val.append(s.data(), s.size());
|
||||
return true;
|
||||
}
|
||||
|
||||
bool on_string(boost::json::string_view s, std::size_t,
|
||||
boost::system::error_code&) {
|
||||
current_string_val.append(s.data(), s.size());
|
||||
|
||||
if (building_city && current_key == "name") {
|
||||
current_city.name = current_string_val;
|
||||
} else if (in_state_object && current_key == "name") {
|
||||
state_info[0] = current_string_val;
|
||||
} else if (in_state_object && current_key == "iso2") {
|
||||
state_info[1] = current_string_val;
|
||||
} else if (in_country_object && current_key == "name") {
|
||||
country_info[0] = current_string_val;
|
||||
} else if (in_country_object && current_key == "iso2") {
|
||||
country_info[1] = current_string_val;
|
||||
} else if (in_country_object && current_key == "iso3") {
|
||||
country_info[2] = current_string_val;
|
||||
}
|
||||
|
||||
current_string_val.clear();
|
||||
return true;
|
||||
}
|
||||
|
||||
bool on_number_part(boost::json::string_view, boost::system::error_code&) {
|
||||
return true;
|
||||
}
|
||||
|
||||
bool on_int64(int64_t i, boost::json::string_view,
|
||||
boost::system::error_code&) {
|
||||
if (building_city && current_key == "id") {
|
||||
current_city.id = static_cast<int>(i);
|
||||
} else if (in_state_object && current_key == "id") {
|
||||
current_state_id = static_cast<int>(i);
|
||||
} else if (in_country_object && current_key == "id") {
|
||||
current_country_id = static_cast<int>(i);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool on_uint64(uint64_t u, boost::json::string_view,
|
||||
boost::system::error_code& ec) {
|
||||
return on_int64(static_cast<int64_t>(u), "", ec);
|
||||
}
|
||||
|
||||
bool on_double(double d, boost::json::string_view,
|
||||
boost::system::error_code&) {
|
||||
if (building_city) {
|
||||
if (current_key == "latitude") {
|
||||
current_city.latitude = d;
|
||||
} else if (current_key == "longitude") {
|
||||
current_city.longitude = d;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool on_bool(bool, boost::system::error_code&) { return true; }
|
||||
bool on_null(boost::system::error_code&) { return true; }
|
||||
bool on_comment_part(boost::json::string_view, boost::system::error_code&) {
|
||||
return true;
|
||||
}
|
||||
bool on_comment(boost::json::string_view, boost::system::error_code&) {
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
void StreamingJsonParser::Parse(
|
||||
const std::string& file_path, SqliteDatabase& db,
|
||||
std::function<void(const CityRecord&)> on_city,
|
||||
std::function<void(size_t, size_t)> on_progress) {
|
||||
spdlog::info(" Streaming parse of {} (Boost.JSON)...", file_path);
|
||||
|
||||
FILE* file = std::fopen(file_path.c_str(), "rb");
|
||||
if (!file) {
|
||||
throw std::runtime_error("Failed to open JSON file: " + file_path);
|
||||
}
|
||||
|
||||
size_t total_size = 0;
|
||||
if (std::fseek(file, 0, SEEK_END) == 0) {
|
||||
long file_size = std::ftell(file);
|
||||
if (file_size > 0) {
|
||||
total_size = static_cast<size_t>(file_size);
|
||||
}
|
||||
std::rewind(file);
|
||||
}
|
||||
|
||||
CityRecordHandler::ParseContext ctx{&db, on_city, on_progress, 0, total_size,
|
||||
0, 0};
|
||||
boost::json::basic_parser<CityRecordHandler> parser(
|
||||
boost::json::parse_options{}, ctx);
|
||||
|
||||
char buf[65536];
|
||||
size_t bytes_read;
|
||||
boost::system::error_code ec;
|
||||
|
||||
while ((bytes_read = std::fread(buf, 1, sizeof(buf), file)) > 0) {
|
||||
char const* p = buf;
|
||||
std::size_t remain = bytes_read;
|
||||
|
||||
while (remain > 0) {
|
||||
std::size_t consumed = parser.write_some(true, p, remain, ec);
|
||||
if (ec) {
|
||||
std::fclose(file);
|
||||
throw std::runtime_error("JSON parse error: " + ec.message());
|
||||
}
|
||||
p += consumed;
|
||||
remain -= consumed;
|
||||
}
|
||||
}
|
||||
|
||||
parser.write_some(false, nullptr, 0, ec); // Signal EOF
|
||||
std::fclose(file);
|
||||
|
||||
if (ec) {
|
||||
throw std::runtime_error("JSON parse error at EOF: " + ec.message());
|
||||
}
|
||||
|
||||
spdlog::info(" OK: Parsed {} countries, {} states, {} cities",
|
||||
ctx.countries_inserted, ctx.states_inserted,
|
||||
ctx.cities_emitted);
|
||||
}
|
||||
@@ -5,7 +5,6 @@
|
||||
#include <memory>
|
||||
|
||||
#include "biergarten_data_generator.h"
|
||||
#include "database/database.h"
|
||||
#include "web_client/curl_web_client.h"
|
||||
|
||||
namespace po = boost::program_options;
|
||||
@@ -122,9 +121,8 @@ int main(int argc, char* argv[]) {
|
||||
}
|
||||
|
||||
auto webClient = std::make_shared<CURLWebClient>();
|
||||
SqliteDatabase database;
|
||||
|
||||
BiergartenDataGenerator generator(options, webClient, database);
|
||||
BiergartenDataGenerator generator(options, webClient);
|
||||
return generator.Run();
|
||||
|
||||
} catch (const std::exception& e) {
|
||||
|
||||
Reference in New Issue
Block a user