replace SQLite geo pipeline with curated in-memory locations

This commit is contained in:
Aaron Po
2026-04-07 02:28:15 -04:00
parent 60ee2ecf74
commit b8e96a6d45
14 changed files with 1135 additions and 1079 deletions

View File

@@ -4,20 +4,21 @@
#include <algorithm>
#include <filesystem>
#include <unordered_map>
#include <future>
#include <iterator>
#include <random>
#include "data_generation/data_downloader.h"
#include "data_generation/llama_generator.h"
#include "data_generation/mock_generator.h"
#include "json_handling/json_loader.h"
#include "wikipedia/wikipedia_service.h"
BiergartenDataGenerator::BiergartenDataGenerator(
const ApplicationOptions& options, std::shared_ptr<WebClient> web_client,
SqliteDatabase& database)
: options_(options), webClient_(web_client), database_(database) {}
const ApplicationOptions& options, std::shared_ptr<WebClient> web_client)
: options_(options), webClient_(std::move(web_client)) {}
std::unique_ptr<DataGenerator> BiergartenDataGenerator::InitializeGenerator() {
auto BiergartenDataGenerator::InitializeGenerator()
-> std::unique_ptr<DataGenerator> {
spdlog::info("Initializing brewery generator...");
std::unique_ptr<DataGenerator> generator;
@@ -41,75 +42,60 @@ std::unique_ptr<DataGenerator> BiergartenDataGenerator::InitializeGenerator() {
return generator;
}
void BiergartenDataGenerator::LoadGeographicData() {
std::string json_path = options_.cache_dir + "/countries+states+cities.json";
std::string db_path = options_.cache_dir + "/biergarten-pipeline.db";
bool has_json_cache = std::filesystem::exists(json_path);
bool has_db_cache = std::filesystem::exists(db_path);
spdlog::info("Initializing SQLite database at {}...", db_path);
database_.Initialize(db_path);
if (has_db_cache && has_json_cache) {
spdlog::info("[Pipeline] Cache hit: skipping download and parse");
} else {
spdlog::info("\n[Pipeline] Downloading geographic data from GitHub...");
DataDownloader downloader(webClient_);
downloader.DownloadCountriesDatabase(json_path, options_.commit);
JsonLoader::LoadWorldCities(json_path, database_);
}
}
std::vector<std::pair<City, std::string>>
BiergartenDataGenerator::QueryCitiesWithCountries() {
auto BiergartenDataGenerator::QueryCitiesWithCountries()
-> std::vector<Location> {
spdlog::info("\n=== GEOGRAPHIC DATA OVERVIEW ===");
auto cities = database_.QueryCities();
// Build a quick map of country id -> name for per-city lookups.
auto all_countries = database_.QueryCountries(0);
std::unordered_map<int, std::string> country_map;
for (const auto& c : all_countries) {
country_map[c.id] = c.name;
}
spdlog::info("\nTotal records loaded:");
spdlog::info(" Countries: {}", database_.QueryCountries(0).size());
spdlog::info(" States: {}", database_.QueryStates(0).size());
spdlog::info(" Cities: {}", cities.size());
// Cap at 30 entries.
const size_t sample_count = std::min(size_t(30), cities.size());
std::vector<std::pair<City, std::string>> result;
for (size_t i = 0; i < sample_count; i++) {
const auto& city = cities[i];
std::string country_name;
const auto country_it = country_map.find(city.country_id);
if (country_it != country_map.end()) {
country_name = country_it->second;
std::filesystem::path locations_path = "locations.json";
if (!std::filesystem::exists(locations_path)) {
const std::filesystem::path cache_path =
std::filesystem::path(options_.cache_dir) / "locations.json";
if (std::filesystem::exists(cache_path)) {
locations_path = cache_path;
}
result.push_back({city, country_name});
}
return result;
auto all_locations = JsonLoader::LoadLocations(locations_path.string());
spdlog::info(" Locations available: {}", all_locations.size());
const size_t sample_count = std::min<size_t>(30, all_locations.size());
std::vector<Location> sampled_locations;
sampled_locations.reserve(sample_count);
std::random_device random_generator;
std::sample(all_locations.begin(), all_locations.end(),
std::back_inserter(sampled_locations), sample_count,
random_generator);
spdlog::info(" Sampled locations: {}", sampled_locations.size());
return sampled_locations;
}
std::vector<BiergartenDataGenerator::EnrichedCity>
BiergartenDataGenerator::EnrichWithWikipedia(
const std::vector<std::pair<City, std::string>>& cities) {
WikipediaService wikipedia_service(webClient_);
auto BiergartenDataGenerator::EnrichWithWikipedia(
const std::vector<Location>& cities) -> std::vector<EnrichedCity> {
std::vector<EnrichedCity> enriched;
enriched.reserve(cities.size());
for (const auto& [city, country_name] : cities) {
const std::string region_context =
wikipedia_service.GetSummary(city.name, country_name);
spdlog::debug("[Pipeline] Region context for {}: {}", city.name,
region_context);
std::vector<std::future<EnrichedCity>> pending;
pending.reserve(cities.size());
enriched.push_back({city.id, city.name, country_name, region_context});
for (const auto& city : cities) {
pending.push_back(std::async(std::launch::async,
[web_client = webClient_, city]() {
WikipediaService wikipedia_service(
web_client);
const std::string region_context =
wikipedia_service.GetSummary(
city.city, city.country);
spdlog::debug(
"[Pipeline] Region context for {}: {}",
city.city, region_context);
return EnrichedCity{city, region_context};
}));
}
for (auto& task : pending) {
enriched.push_back(task.get());
}
return enriched;
@@ -121,28 +107,30 @@ void BiergartenDataGenerator::GenerateBreweries(
generatedBreweries_.clear();
for (const auto& enriched_city : cities) {
auto brewery = generator.GenerateBrewery(enriched_city.city_name,
enriched_city.country_name,
auto brewery = generator.GenerateBrewery(enriched_city.location.city,
enriched_city.location.country,
enriched_city.region_context);
generatedBreweries_.push_back(
{enriched_city.city_id, enriched_city.city_name, brewery});
generatedBreweries_.push_back({enriched_city.location, brewery});
}
}
void BiergartenDataGenerator::LogResults() const {
spdlog::info("\n=== GENERATED DATA DUMP ===");
for (size_t i = 0; i < generatedBreweries_.size(); i++) {
const auto& entry = generatedBreweries_[i];
spdlog::info("{}. city_id={} city=\"{}\"", i + 1, entry.city_id,
entry.city_name);
size_t index = 1;
for (const auto& entry : generatedBreweries_) {
spdlog::info("{}. city=\"{}\" country=\"{}\" state=\"{}\" "
"iso3166_2={} lat={} lon={}",
index, entry.location.city, entry.location.country,
entry.location.state_province, entry.location.iso3166_2,
entry.location.latitude, entry.location.longitude);
spdlog::info(" brewery_name=\"{}\"", entry.brewery.name);
spdlog::info(" brewery_description=\"{}\"", entry.brewery.description);
++index;
}
}
int BiergartenDataGenerator::Run() {
auto BiergartenDataGenerator::Run() -> int {
try {
LoadGeographicData();
auto generator = InitializeGenerator();
auto cities = QueryCitiesWithCountries();
auto enriched = EnrichWithWikipedia(cities);

View File

@@ -1,44 +0,0 @@
#include "data_generation/data_downloader.h"
#include <spdlog/spdlog.h>
#include <filesystem>
#include <fstream>
#include <sstream>
#include <stdexcept>
#include "web_client/web_client.h"
DataDownloader::DataDownloader(std::shared_ptr<WebClient> web_client)
: web_client_(std::move(web_client)) {}
DataDownloader::~DataDownloader() {}
bool DataDownloader::FileExists(const std::string& file_path) {
return std::filesystem::exists(file_path);
}
std::string DataDownloader::DownloadCountriesDatabase(
const std::string& cache_path, const std::string& commit) {
if (FileExists(cache_path)) {
spdlog::info("[DataDownloader] Cache hit: {}", cache_path);
return cache_path;
}
std::string url =
"https://raw.githubusercontent.com/dr5hn/"
"countries-states-cities-database/" +
commit + "/json/countries+states+cities.json";
spdlog::info("[DataDownloader] Downloading: {}", url);
web_client_->DownloadToFile(url, cache_path);
std::ifstream file_check(cache_path, std::ios::binary | std::ios::ate);
std::streamsize size = file_check.tellg();
file_check.close();
spdlog::info("[DataDownloader] OK: Download complete: {} ({:.2f} MB)",
cache_path, (size / (1024.0 * 1024.0)));
return cache_path;
}

View File

@@ -1,264 +0,0 @@
#include "database/database.h"
#include <spdlog/spdlog.h>
#include <stdexcept>
void SqliteDatabase::InitializeSchema() {
std::lock_guard<std::mutex> lock(db_mutex_);
const char* schema = R"(
CREATE TABLE IF NOT EXISTS countries (
id INTEGER PRIMARY KEY,
name TEXT NOT NULL,
iso2 TEXT,
iso3 TEXT
);
CREATE TABLE IF NOT EXISTS states (
id INTEGER PRIMARY KEY,
country_id INTEGER NOT NULL,
name TEXT NOT NULL,
iso2 TEXT,
FOREIGN KEY(country_id) REFERENCES countries(id)
);
CREATE TABLE IF NOT EXISTS cities (
id INTEGER PRIMARY KEY,
state_id INTEGER NOT NULL,
country_id INTEGER NOT NULL,
name TEXT NOT NULL,
latitude REAL,
longitude REAL,
FOREIGN KEY(state_id) REFERENCES states(id),
FOREIGN KEY(country_id) REFERENCES countries(id)
);
)";
char* errMsg = nullptr;
int rc = sqlite3_exec(db_, schema, nullptr, nullptr, &errMsg);
if (rc != SQLITE_OK) {
std::string error = errMsg ? std::string(errMsg) : "Unknown error";
sqlite3_free(errMsg);
throw std::runtime_error("Failed to create schema: " + error);
}
}
SqliteDatabase::~SqliteDatabase() {
if (db_) {
sqlite3_close(db_);
}
}
void SqliteDatabase::Initialize(const std::string& db_path) {
int rc = sqlite3_open(db_path.c_str(), &db_);
if (rc) {
throw std::runtime_error("Failed to open SQLite database: " + db_path);
}
spdlog::info("OK: SQLite database opened: {}", db_path);
InitializeSchema();
}
void SqliteDatabase::BeginTransaction() {
std::lock_guard<std::mutex> lock(db_mutex_);
char* err = nullptr;
if (sqlite3_exec(db_, "BEGIN TRANSACTION", nullptr, nullptr, &err) !=
SQLITE_OK) {
std::string msg = err ? err : "unknown";
sqlite3_free(err);
throw std::runtime_error("BeginTransaction failed: " + msg);
}
}
void SqliteDatabase::CommitTransaction() {
std::lock_guard<std::mutex> lock(db_mutex_);
char* err = nullptr;
if (sqlite3_exec(db_, "COMMIT", nullptr, nullptr, &err) != SQLITE_OK) {
std::string msg = err ? err : "unknown";
sqlite3_free(err);
throw std::runtime_error("CommitTransaction failed: " + msg);
}
}
void SqliteDatabase::RollbackTransaction() {
std::lock_guard<std::mutex> lock(db_mutex_);
char* err = nullptr;
if (sqlite3_exec(db_, "ROLLBACK", nullptr, nullptr, &err) != SQLITE_OK) {
std::string msg = err ? err : "unknown";
sqlite3_free(err);
throw std::runtime_error("RollbackTransaction failed: " + msg);
}
}
void SqliteDatabase::InsertCountry(int id, const std::string& name,
const std::string& iso2,
const std::string& iso3) {
std::lock_guard<std::mutex> lock(db_mutex_);
const char* query = R"(
INSERT OR IGNORE INTO countries (id, name, iso2, iso3)
VALUES (?, ?, ?, ?)
)";
sqlite3_stmt* stmt;
int rc = sqlite3_prepare_v2(db_, query, -1, &stmt, nullptr);
if (rc != SQLITE_OK)
throw std::runtime_error("Failed to prepare country insert");
sqlite3_bind_int(stmt, 1, id);
sqlite3_bind_text(stmt, 2, name.c_str(), -1, SQLITE_TRANSIENT);
sqlite3_bind_text(stmt, 3, iso2.c_str(), -1, SQLITE_TRANSIENT);
sqlite3_bind_text(stmt, 4, iso3.c_str(), -1, SQLITE_TRANSIENT);
if (sqlite3_step(stmt) != SQLITE_DONE) {
throw std::runtime_error("Failed to insert country");
}
sqlite3_finalize(stmt);
}
void SqliteDatabase::InsertState(int id, int country_id,
const std::string& name,
const std::string& iso2) {
std::lock_guard<std::mutex> lock(db_mutex_);
const char* query = R"(
INSERT OR IGNORE INTO states (id, country_id, name, iso2)
VALUES (?, ?, ?, ?)
)";
sqlite3_stmt* stmt;
int rc = sqlite3_prepare_v2(db_, query, -1, &stmt, nullptr);
if (rc != SQLITE_OK)
throw std::runtime_error("Failed to prepare state insert");
sqlite3_bind_int(stmt, 1, id);
sqlite3_bind_int(stmt, 2, country_id);
sqlite3_bind_text(stmt, 3, name.c_str(), -1, SQLITE_TRANSIENT);
sqlite3_bind_text(stmt, 4, iso2.c_str(), -1, SQLITE_TRANSIENT);
if (sqlite3_step(stmt) != SQLITE_DONE) {
throw std::runtime_error("Failed to insert state");
}
sqlite3_finalize(stmt);
}
void SqliteDatabase::InsertCity(int id, int state_id, int country_id,
const std::string& name, double latitude,
double longitude) {
std::lock_guard<std::mutex> lock(db_mutex_);
const char* query = R"(
INSERT OR IGNORE INTO cities (id, state_id, country_id, name, latitude, longitude)
VALUES (?, ?, ?, ?, ?, ?)
)";
sqlite3_stmt* stmt;
int rc = sqlite3_prepare_v2(db_, query, -1, &stmt, nullptr);
if (rc != SQLITE_OK)
throw std::runtime_error("Failed to prepare city insert");
sqlite3_bind_int(stmt, 1, id);
sqlite3_bind_int(stmt, 2, state_id);
sqlite3_bind_int(stmt, 3, country_id);
sqlite3_bind_text(stmt, 4, name.c_str(), -1, SQLITE_TRANSIENT);
sqlite3_bind_double(stmt, 5, latitude);
sqlite3_bind_double(stmt, 6, longitude);
if (sqlite3_step(stmt) != SQLITE_DONE) {
throw std::runtime_error("Failed to insert city");
}
sqlite3_finalize(stmt);
}
std::vector<City> SqliteDatabase::QueryCities() {
std::lock_guard<std::mutex> lock(db_mutex_);
std::vector<City> cities;
sqlite3_stmt* stmt = nullptr;
const char* query =
"SELECT id, name, country_id FROM cities ORDER BY RANDOM()";
int rc = sqlite3_prepare_v2(db_, query, -1, &stmt, nullptr);
if (rc != SQLITE_OK) {
throw std::runtime_error("Failed to prepare query");
}
while (sqlite3_step(stmt) == SQLITE_ROW) {
int id = sqlite3_column_int(stmt, 0);
const char* name =
reinterpret_cast<const char*>(sqlite3_column_text(stmt, 1));
int country_id = sqlite3_column_int(stmt, 2);
cities.push_back({id, name ? std::string(name) : "", country_id});
}
sqlite3_finalize(stmt);
return cities;
}
std::vector<Country> SqliteDatabase::QueryCountries(int limit) {
std::lock_guard<std::mutex> lock(db_mutex_);
std::vector<Country> countries;
sqlite3_stmt* stmt = nullptr;
std::string query =
"SELECT id, name, iso2, iso3 FROM countries ORDER BY name";
if (limit > 0) {
query += " LIMIT " + std::to_string(limit);
}
int rc = sqlite3_prepare_v2(db_, query.c_str(), -1, &stmt, nullptr);
if (rc != SQLITE_OK) {
throw std::runtime_error("Failed to prepare countries query");
}
while (sqlite3_step(stmt) == SQLITE_ROW) {
int id = sqlite3_column_int(stmt, 0);
const char* name =
reinterpret_cast<const char*>(sqlite3_column_text(stmt, 1));
const char* iso2 =
reinterpret_cast<const char*>(sqlite3_column_text(stmt, 2));
const char* iso3 =
reinterpret_cast<const char*>(sqlite3_column_text(stmt, 3));
countries.push_back({id, name ? std::string(name) : "",
iso2 ? std::string(iso2) : "",
iso3 ? std::string(iso3) : ""});
}
sqlite3_finalize(stmt);
return countries;
}
std::vector<State> SqliteDatabase::QueryStates(int limit) {
std::lock_guard<std::mutex> lock(db_mutex_);
std::vector<State> states;
sqlite3_stmt* stmt = nullptr;
std::string query =
"SELECT id, name, iso2, country_id FROM states ORDER BY name";
if (limit > 0) {
query += " LIMIT " + std::to_string(limit);
}
int rc = sqlite3_prepare_v2(db_, query.c_str(), -1, &stmt, nullptr);
if (rc != SQLITE_OK) {
throw std::runtime_error("Failed to prepare states query");
}
while (sqlite3_step(stmt) == SQLITE_ROW) {
int id = sqlite3_column_int(stmt, 0);
const char* name =
reinterpret_cast<const char*>(sqlite3_column_text(stmt, 1));
const char* iso2 =
reinterpret_cast<const char*>(sqlite3_column_text(stmt, 2));
int country_id = sqlite3_column_int(stmt, 3);
states.push_back({id, name ? std::string(name) : "",
iso2 ? std::string(iso2) : "", country_id});
}
sqlite3_finalize(stmt);
return states;
}

View File

@@ -2,66 +2,82 @@
#include <spdlog/spdlog.h>
#include <chrono>
#include <boost/json.hpp>
#include "json_handling/stream_parser.h"
#include <fstream>
#include <sstream>
#include <stdexcept>
void JsonLoader::LoadWorldCities(const std::string& json_path,
SqliteDatabase& db) {
constexpr size_t kBatchSize = 10000;
namespace {
auto startTime = std::chrono::high_resolution_clock::now();
spdlog::info("\nLoading {} (streaming Boost.JSON SAX)...", json_path);
auto ReadRequiredString(const boost::json::object& object,
const char* key) -> std::string {
const boost::json::value* value = object.if_contains(key);
if (value == nullptr || !value->is_string()) {
throw std::runtime_error(std::string("Missing or invalid string field: ") +
key);
}
return std::string(value->as_string().c_str());
}
db.BeginTransaction();
bool transactionOpen = true;
auto ReadRequiredNumber(const boost::json::object& object, const char* key)
-> double {
const boost::json::value* value = object.if_contains(key);
if (value == nullptr || !value->is_number()) {
throw std::runtime_error(std::string("Missing or invalid numeric field: ") +
key);
}
return value->to_number<double>();
}
size_t citiesProcessed = 0;
try {
StreamingJsonParser::Parse(
json_path, db,
[&](const CityRecord& record) {
db.InsertCity(record.id, record.state_id, record.country_id,
record.name, record.latitude, record.longitude);
++citiesProcessed;
} // namespace
if (citiesProcessed % kBatchSize == 0) {
db.CommitTransaction();
db.BeginTransaction();
}
},
[&](size_t current, size_t /*total*/) {
if (current % kBatchSize == 0 && current > 0) {
spdlog::info(" [Progress] Parsed {} cities...", current);
}
});
spdlog::info(" OK: Parsed all cities from JSON");
if (transactionOpen) {
db.CommitTransaction();
transactionOpen = false;
}
} catch (...) {
if (transactionOpen) {
db.RollbackTransaction();
transactionOpen = false;
}
throw;
auto JsonLoader::LoadLocations(const std::string& filepath)
-> std::vector<Location> {
std::ifstream input(filepath);
if (!input.is_open()) {
throw std::runtime_error("Failed to open locations file: " + filepath);
}
auto endTime = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(
endTime - startTime);
std::stringstream buffer;
buffer << input.rdbuf();
const std::string content = buffer.str();
spdlog::info("\n=== World City Data Loading Summary ===\n");
spdlog::info("Cities inserted: {}", citiesProcessed);
spdlog::info("Elapsed time: {} ms", duration.count());
long long throughput =
(citiesProcessed > 0 && duration.count() > 0)
? (1000LL * static_cast<long long>(citiesProcessed)) /
static_cast<long long>(duration.count())
: 0LL;
spdlog::info("Throughput: {} cities/sec", throughput);
spdlog::info("=======================================\n");
boost::json::error_code error;
boost::json::value root = boost::json::parse(content, error);
if (error) {
throw std::runtime_error("Failed to parse locations JSON: " +
error.message());
}
if (!root.is_array()) {
throw std::runtime_error(
"Invalid locations JSON: root element must be an array");
}
std::vector<Location> locations;
const auto& items = root.as_array();
locations.reserve(items.size());
for (const auto& item : items) {
if (!item.is_object()) {
throw std::runtime_error(
"Invalid locations JSON: each entry must be an object");
}
const auto& object = item.as_object();
locations.push_back(Location{
.city = ReadRequiredString(object, "city"),
.state_province = ReadRequiredString(object, "state_province"),
.iso3166_2 = ReadRequiredString(object, "iso3166_2"),
.country = ReadRequiredString(object, "country"),
.iso3166_1 = ReadRequiredString(object, "iso3166_1"),
.latitude = ReadRequiredNumber(object, "latitude"),
.longitude = ReadRequiredNumber(object, "longitude"),
});
}
spdlog::info("[JsonLoader] Loaded {} locations from {}", locations.size(),
filepath);
return locations;
}

View File

@@ -1,289 +0,0 @@
#include "json_handling/stream_parser.h"
#include <spdlog/spdlog.h>
#include <boost/json.hpp>
#include <boost/json/basic_parser_impl.hpp>
#include <cstdio>
#include <stdexcept>
#include "database/database.h"
class CityRecordHandler {
friend class boost::json::basic_parser<CityRecordHandler>;
public:
static constexpr std::size_t max_array_size = static_cast<std::size_t>(-1);
static constexpr std::size_t max_object_size = static_cast<std::size_t>(-1);
static constexpr std::size_t max_string_size = static_cast<std::size_t>(-1);
static constexpr std::size_t max_key_size = static_cast<std::size_t>(-1);
struct ParseContext {
SqliteDatabase* db = nullptr;
std::function<void(const CityRecord&)> on_city;
std::function<void(size_t, size_t)> on_progress;
size_t cities_emitted = 0;
size_t total_file_size = 0;
int countries_inserted = 0;
int states_inserted = 0;
};
explicit CityRecordHandler(ParseContext& ctx) : context(ctx) {}
private:
ParseContext& context;
int depth = 0;
bool in_countries_array = false;
bool in_country_object = false;
bool in_states_array = false;
bool in_state_object = false;
bool in_cities_array = false;
bool building_city = false;
int current_country_id = 0;
int current_state_id = 0;
CityRecord current_city = {};
std::string current_key;
std::string current_key_val;
std::string current_string_val;
std::string country_info[3];
std::string state_info[2];
// Boost.JSON SAX Hooks
bool on_document_begin(boost::system::error_code&) { return true; }
bool on_document_end(boost::system::error_code&) { return true; }
bool on_array_begin(boost::system::error_code&) {
depth++;
if (depth == 1) {
in_countries_array = true;
} else if (depth == 3 && current_key == "states") {
in_states_array = true;
} else if (depth == 5 && current_key == "cities") {
in_cities_array = true;
}
return true;
}
bool on_array_end(std::size_t, boost::system::error_code&) {
if (depth == 1) {
in_countries_array = false;
} else if (depth == 3) {
in_states_array = false;
} else if (depth == 5) {
in_cities_array = false;
}
depth--;
return true;
}
bool on_object_begin(boost::system::error_code&) {
depth++;
if (depth == 2 && in_countries_array) {
in_country_object = true;
current_country_id = 0;
country_info[0].clear();
country_info[1].clear();
country_info[2].clear();
} else if (depth == 4 && in_states_array) {
in_state_object = true;
current_state_id = 0;
state_info[0].clear();
state_info[1].clear();
} else if (depth == 6 && in_cities_array) {
building_city = true;
current_city = {};
}
return true;
}
bool on_object_end(std::size_t, boost::system::error_code&) {
if (depth == 6 && building_city) {
if (current_city.id > 0 && current_state_id > 0 &&
current_country_id > 0) {
current_city.state_id = current_state_id;
current_city.country_id = current_country_id;
try {
context.on_city(current_city);
context.cities_emitted++;
if (context.on_progress && context.cities_emitted % 10000 == 0) {
context.on_progress(context.cities_emitted,
context.total_file_size);
}
} catch (const std::exception& e) {
spdlog::warn("Record parsing failed: {}", e.what());
}
}
building_city = false;
} else if (depth == 4 && in_state_object) {
if (current_state_id > 0 && current_country_id > 0) {
try {
context.db->InsertState(current_state_id, current_country_id,
state_info[0], state_info[1]);
context.states_inserted++;
} catch (const std::exception& e) {
spdlog::warn("Record parsing failed: {}", e.what());
}
}
in_state_object = false;
} else if (depth == 2 && in_country_object) {
if (current_country_id > 0) {
try {
context.db->InsertCountry(current_country_id, country_info[0],
country_info[1], country_info[2]);
context.countries_inserted++;
} catch (const std::exception& e) {
spdlog::warn("Record parsing failed: {}", e.what());
}
}
in_country_object = false;
}
depth--;
return true;
}
bool on_key_part(boost::json::string_view s, std::size_t,
boost::system::error_code&) {
current_key_val.append(s.data(), s.size());
return true;
}
bool on_key(boost::json::string_view s, std::size_t,
boost::system::error_code&) {
current_key_val.append(s.data(), s.size());
current_key = current_key_val;
current_key_val.clear();
return true;
}
bool on_string_part(boost::json::string_view s, std::size_t,
boost::system::error_code&) {
current_string_val.append(s.data(), s.size());
return true;
}
bool on_string(boost::json::string_view s, std::size_t,
boost::system::error_code&) {
current_string_val.append(s.data(), s.size());
if (building_city && current_key == "name") {
current_city.name = current_string_val;
} else if (in_state_object && current_key == "name") {
state_info[0] = current_string_val;
} else if (in_state_object && current_key == "iso2") {
state_info[1] = current_string_val;
} else if (in_country_object && current_key == "name") {
country_info[0] = current_string_val;
} else if (in_country_object && current_key == "iso2") {
country_info[1] = current_string_val;
} else if (in_country_object && current_key == "iso3") {
country_info[2] = current_string_val;
}
current_string_val.clear();
return true;
}
bool on_number_part(boost::json::string_view, boost::system::error_code&) {
return true;
}
bool on_int64(int64_t i, boost::json::string_view,
boost::system::error_code&) {
if (building_city && current_key == "id") {
current_city.id = static_cast<int>(i);
} else if (in_state_object && current_key == "id") {
current_state_id = static_cast<int>(i);
} else if (in_country_object && current_key == "id") {
current_country_id = static_cast<int>(i);
}
return true;
}
bool on_uint64(uint64_t u, boost::json::string_view,
boost::system::error_code& ec) {
return on_int64(static_cast<int64_t>(u), "", ec);
}
bool on_double(double d, boost::json::string_view,
boost::system::error_code&) {
if (building_city) {
if (current_key == "latitude") {
current_city.latitude = d;
} else if (current_key == "longitude") {
current_city.longitude = d;
}
}
return true;
}
bool on_bool(bool, boost::system::error_code&) { return true; }
bool on_null(boost::system::error_code&) { return true; }
bool on_comment_part(boost::json::string_view, boost::system::error_code&) {
return true;
}
bool on_comment(boost::json::string_view, boost::system::error_code&) {
return true;
}
};
void StreamingJsonParser::Parse(
const std::string& file_path, SqliteDatabase& db,
std::function<void(const CityRecord&)> on_city,
std::function<void(size_t, size_t)> on_progress) {
spdlog::info(" Streaming parse of {} (Boost.JSON)...", file_path);
FILE* file = std::fopen(file_path.c_str(), "rb");
if (!file) {
throw std::runtime_error("Failed to open JSON file: " + file_path);
}
size_t total_size = 0;
if (std::fseek(file, 0, SEEK_END) == 0) {
long file_size = std::ftell(file);
if (file_size > 0) {
total_size = static_cast<size_t>(file_size);
}
std::rewind(file);
}
CityRecordHandler::ParseContext ctx{&db, on_city, on_progress, 0, total_size,
0, 0};
boost::json::basic_parser<CityRecordHandler> parser(
boost::json::parse_options{}, ctx);
char buf[65536];
size_t bytes_read;
boost::system::error_code ec;
while ((bytes_read = std::fread(buf, 1, sizeof(buf), file)) > 0) {
char const* p = buf;
std::size_t remain = bytes_read;
while (remain > 0) {
std::size_t consumed = parser.write_some(true, p, remain, ec);
if (ec) {
std::fclose(file);
throw std::runtime_error("JSON parse error: " + ec.message());
}
p += consumed;
remain -= consumed;
}
}
parser.write_some(false, nullptr, 0, ec); // Signal EOF
std::fclose(file);
if (ec) {
throw std::runtime_error("JSON parse error at EOF: " + ec.message());
}
spdlog::info(" OK: Parsed {} countries, {} states, {} cities",
ctx.countries_inserted, ctx.states_inserted,
ctx.cities_emitted);
}

View File

@@ -5,7 +5,6 @@
#include <memory>
#include "biergarten_data_generator.h"
#include "database/database.h"
#include "web_client/curl_web_client.h"
namespace po = boost::program_options;
@@ -122,9 +121,8 @@ int main(int argc, char* argv[]) {
}
auto webClient = std::make_shared<CURLWebClient>();
SqliteDatabase database;
BiergartenDataGenerator generator(options, webClient, database);
BiergartenDataGenerator generator(options, webClient);
return generator.Run();
} catch (const std::exception& e) {