mirror of
https://github.com/aaronpo97/the-biergarten-app.git
synced 2026-04-05 18:09:04 +00:00
cleanup
This commit is contained in:
@@ -39,8 +39,14 @@ public:
|
|||||||
/// @brief Closes the SQLite connection if initialized.
|
/// @brief Closes the SQLite connection if initialized.
|
||||||
~SqliteDatabase();
|
~SqliteDatabase();
|
||||||
|
|
||||||
/// @brief Opens the in-memory database and creates schema objects.
|
/// @brief Opens the SQLite database at dbPath and creates schema objects.
|
||||||
void Initialize();
|
void Initialize(const std::string &dbPath = ":memory:");
|
||||||
|
|
||||||
|
/// @brief Starts a database transaction for batched writes.
|
||||||
|
void BeginTransaction();
|
||||||
|
|
||||||
|
/// @brief Commits the active database transaction.
|
||||||
|
void CommitTransaction();
|
||||||
|
|
||||||
/// @brief Inserts a country row.
|
/// @brief Inserts a country row.
|
||||||
void InsertCountry(int id, const std::string &name, const std::string &iso2,
|
void InsertCountry(int id, const std::string &name, const std::string &iso2,
|
||||||
|
|||||||
@@ -1,10 +1,10 @@
|
|||||||
#include "data_downloader.h"
|
#include "data_downloader.h"
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
#include <curl/curl.h>
|
#include <curl/curl.h>
|
||||||
|
#include <filesystem>
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
#include <spdlog/spdlog.h>
|
#include <spdlog/spdlog.h>
|
||||||
#include <sstream>
|
#include <sstream>
|
||||||
#include <sys/stat.h>
|
|
||||||
|
|
||||||
static size_t WriteCallback(void *contents, size_t size, size_t nmemb,
|
static size_t WriteCallback(void *contents, size_t size, size_t nmemb,
|
||||||
void *userp) {
|
void *userp) {
|
||||||
@@ -19,8 +19,7 @@ DataDownloader::DataDownloader() {}
|
|||||||
DataDownloader::~DataDownloader() {}
|
DataDownloader::~DataDownloader() {}
|
||||||
|
|
||||||
bool DataDownloader::FileExists(const std::string &filePath) const {
|
bool DataDownloader::FileExists(const std::string &filePath) const {
|
||||||
struct stat buffer;
|
return std::filesystem::exists(filePath);
|
||||||
return (stat(filePath.c_str(), &buffer) == 0);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string
|
std::string
|
||||||
|
|||||||
@@ -48,15 +48,36 @@ SqliteDatabase::~SqliteDatabase() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void SqliteDatabase::Initialize() {
|
void SqliteDatabase::Initialize(const std::string &dbPath) {
|
||||||
int rc = sqlite3_open(":memory:", &db);
|
int rc = sqlite3_open(dbPath.c_str(), &db);
|
||||||
if (rc) {
|
if (rc) {
|
||||||
throw std::runtime_error("Failed to create in-memory SQLite database");
|
throw std::runtime_error("Failed to open SQLite database: " + dbPath);
|
||||||
}
|
}
|
||||||
spdlog::info("OK: In-memory SQLite database created");
|
spdlog::info("OK: SQLite database opened: {}", dbPath);
|
||||||
InitializeSchema();
|
InitializeSchema();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void SqliteDatabase::BeginTransaction() {
|
||||||
|
std::lock_guard<std::mutex> lock(dbMutex);
|
||||||
|
char *err = nullptr;
|
||||||
|
if (sqlite3_exec(db, "BEGIN TRANSACTION", nullptr, nullptr, &err) !=
|
||||||
|
SQLITE_OK) {
|
||||||
|
std::string msg = err ? err : "unknown";
|
||||||
|
sqlite3_free(err);
|
||||||
|
throw std::runtime_error("BeginTransaction failed: " + msg);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void SqliteDatabase::CommitTransaction() {
|
||||||
|
std::lock_guard<std::mutex> lock(dbMutex);
|
||||||
|
char *err = nullptr;
|
||||||
|
if (sqlite3_exec(db, "COMMIT", nullptr, nullptr, &err) != SQLITE_OK) {
|
||||||
|
std::string msg = err ? err : "unknown";
|
||||||
|
sqlite3_free(err);
|
||||||
|
throw std::runtime_error("CommitTransaction failed: " + msg);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void SqliteDatabase::InsertCountry(int id, const std::string &name,
|
void SqliteDatabase::InsertCountry(int id, const std::string &name,
|
||||||
const std::string &iso2,
|
const std::string &iso2,
|
||||||
const std::string &iso3) {
|
const std::string &iso3) {
|
||||||
|
|||||||
@@ -1,65 +1,32 @@
|
|||||||
#include "json_loader.h"
|
#include "json_loader.h"
|
||||||
#include "stream_parser.h"
|
#include "stream_parser.h"
|
||||||
#include "work_queue.h"
|
|
||||||
#include <atomic>
|
|
||||||
#include <chrono>
|
#include <chrono>
|
||||||
#include <spdlog/spdlog.h>
|
#include <spdlog/spdlog.h>
|
||||||
#include <thread>
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
void JsonLoader::LoadWorldCities(const std::string &jsonPath,
|
void JsonLoader::LoadWorldCities(const std::string &jsonPath,
|
||||||
SqliteDatabase &db) {
|
SqliteDatabase &db) {
|
||||||
auto startTime = std::chrono::high_resolution_clock::now();
|
auto startTime = std::chrono::high_resolution_clock::now();
|
||||||
spdlog::info("\nLoading {} (streaming RapidJSON SAX + producer-consumer)...",
|
spdlog::info("\nLoading {} (streaming RapidJSON SAX)...", jsonPath);
|
||||||
jsonPath);
|
|
||||||
|
|
||||||
const unsigned int QUEUE_CAPACITY = 1000;
|
db.BeginTransaction();
|
||||||
WorkQueue<CityRecord> queue(QUEUE_CAPACITY);
|
|
||||||
|
|
||||||
spdlog::info("Creating worker thread pool...");
|
size_t citiesProcessed = 0;
|
||||||
|
|
||||||
unsigned int numWorkers = std::thread::hardware_concurrency();
|
|
||||||
if (numWorkers == 0)
|
|
||||||
numWorkers = 4; // Fallback if unavailable
|
|
||||||
spdlog::info(" Spawning {} worker threads", numWorkers);
|
|
||||||
|
|
||||||
std::vector<std::thread> workers;
|
|
||||||
std::atomic<unsigned long> citiesProcessed{0};
|
|
||||||
|
|
||||||
for (unsigned int i = 0; i < numWorkers; ++i) {
|
|
||||||
workers.push_back(std::thread([&]() {
|
|
||||||
unsigned long localCount = 0;
|
|
||||||
while (auto record = queue.pop()) {
|
|
||||||
db.InsertCity(record->id, record->state_id, record->country_id,
|
|
||||||
record->name, record->latitude, record->longitude);
|
|
||||||
localCount++;
|
|
||||||
}
|
|
||||||
citiesProcessed += localCount;
|
|
||||||
}));
|
|
||||||
}
|
|
||||||
|
|
||||||
spdlog::info("Streaming cities into worker queue...");
|
|
||||||
|
|
||||||
unsigned long totalCities = 0;
|
|
||||||
StreamingJsonParser::Parse(
|
StreamingJsonParser::Parse(
|
||||||
jsonPath, db, [&](const CityRecord &record) { queue.push(record); },
|
jsonPath, db,
|
||||||
|
[&](const CityRecord &record) {
|
||||||
|
db.InsertCity(record.id, record.state_id, record.country_id,
|
||||||
|
record.name, record.latitude, record.longitude);
|
||||||
|
citiesProcessed++;
|
||||||
|
},
|
||||||
[&](size_t current, size_t total) {
|
[&](size_t current, size_t total) {
|
||||||
if (current % 10000 == 0 && current > 0) {
|
if (current % 10000 == 0 && current > 0) {
|
||||||
spdlog::info(" [Progress] Parsed {} cities...", current);
|
spdlog::info(" [Progress] Parsed {} cities...", current);
|
||||||
}
|
}
|
||||||
totalCities = current;
|
|
||||||
});
|
});
|
||||||
|
|
||||||
spdlog::info(" OK: Parsed all cities from JSON");
|
spdlog::info(" OK: Parsed all cities from JSON");
|
||||||
|
|
||||||
queue.shutdown_queue();
|
db.CommitTransaction();
|
||||||
|
|
||||||
spdlog::info("Waiting for worker threads to complete...");
|
|
||||||
for (auto &worker : workers) {
|
|
||||||
if (worker.joinable()) {
|
|
||||||
worker.join();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
auto endTime = std::chrono::high_resolution_clock::now();
|
auto endTime = std::chrono::high_resolution_clock::now();
|
||||||
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(
|
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(
|
||||||
@@ -74,7 +41,5 @@ void JsonLoader::LoadWorldCities(const std::string &jsonPath,
|
|||||||
static_cast<long long>(duration.count())
|
static_cast<long long>(duration.count())
|
||||||
: 0LL;
|
: 0LL;
|
||||||
spdlog::info("Throughput: {} cities/sec", throughput);
|
spdlog::info("Throughput: {} cities/sec", throughput);
|
||||||
spdlog::info("Worker pool: {} threads", numWorkers);
|
|
||||||
spdlog::info("Queue capacity: {}", QUEUE_CAPACITY);
|
|
||||||
spdlog::info("=======================================\n");
|
spdlog::info("=======================================\n");
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -3,8 +3,13 @@
|
|||||||
#include "generator.h"
|
#include "generator.h"
|
||||||
#include "json_loader.h"
|
#include "json_loader.h"
|
||||||
#include <curl/curl.h>
|
#include <curl/curl.h>
|
||||||
|
#include <filesystem>
|
||||||
#include <spdlog/spdlog.h>
|
#include <spdlog/spdlog.h>
|
||||||
|
|
||||||
|
static bool FileExists(const std::string &filePath) {
|
||||||
|
return std::filesystem::exists(filePath);
|
||||||
|
}
|
||||||
|
|
||||||
int main(int argc, char *argv[]) {
|
int main(int argc, char *argv[]) {
|
||||||
try {
|
try {
|
||||||
curl_global_init(CURL_GLOBAL_DEFAULT);
|
curl_global_init(CURL_GLOBAL_DEFAULT);
|
||||||
@@ -15,17 +20,25 @@ int main(int argc, char *argv[]) {
|
|||||||
argc > 3 ? argv[3] : "c5eb7772"; // Default: stable 2026-03-28
|
argc > 3 ? argv[3] : "c5eb7772"; // Default: stable 2026-03-28
|
||||||
|
|
||||||
std::string jsonPath = cacheDir + "/countries+states+cities.json";
|
std::string jsonPath = cacheDir + "/countries+states+cities.json";
|
||||||
|
std::string dbPath = cacheDir + "/biergarten-pipeline.db";
|
||||||
|
|
||||||
spdlog::info("\n[Pipeline] Downloading geographic data from GitHub...");
|
bool hasJsonCache = FileExists(jsonPath);
|
||||||
DataDownloader downloader;
|
bool hasDbCache = FileExists(dbPath);
|
||||||
downloader.DownloadCountriesDatabase(jsonPath, commit);
|
|
||||||
|
|
||||||
SqliteDatabase db;
|
SqliteDatabase db;
|
||||||
|
|
||||||
spdlog::info("Initializing in-memory SQLite database...");
|
spdlog::info("Initializing SQLite database at {}...", dbPath);
|
||||||
db.Initialize();
|
db.Initialize(dbPath);
|
||||||
|
|
||||||
JsonLoader::LoadWorldCities(jsonPath, db);
|
if (hasDbCache && hasJsonCache) {
|
||||||
|
spdlog::info("[Pipeline] Cache hit: skipping download and parse");
|
||||||
|
} else {
|
||||||
|
spdlog::info("\n[Pipeline] Downloading geographic data from GitHub...");
|
||||||
|
DataDownloader downloader;
|
||||||
|
downloader.DownloadCountriesDatabase(jsonPath, commit);
|
||||||
|
|
||||||
|
JsonLoader::LoadWorldCities(jsonPath, db);
|
||||||
|
}
|
||||||
|
|
||||||
spdlog::info("Initializing brewery generator...");
|
spdlog::info("Initializing brewery generator...");
|
||||||
LlamaBreweryGenerator generator;
|
LlamaBreweryGenerator generator;
|
||||||
|
|||||||
@@ -1,10 +1,10 @@
|
|||||||
#include "stream_parser.h"
|
#include "stream_parser.h"
|
||||||
#include "database.h"
|
#include "database.h"
|
||||||
#include <fstream>
|
#include <cstdio>
|
||||||
|
#include <rapidjson/filereadstream.h>
|
||||||
#include <rapidjson/reader.h>
|
#include <rapidjson/reader.h>
|
||||||
#include <rapidjson/stringbuffer.h>
|
#include <rapidjson/stringbuffer.h>
|
||||||
#include <spdlog/spdlog.h>
|
#include <spdlog/spdlog.h>
|
||||||
#include <sstream>
|
|
||||||
|
|
||||||
using namespace rapidjson;
|
using namespace rapidjson;
|
||||||
|
|
||||||
@@ -196,32 +196,39 @@ void StreamingJsonParser::Parse(
|
|||||||
|
|
||||||
spdlog::info(" Streaming parse of {}...", filePath);
|
spdlog::info(" Streaming parse of {}...", filePath);
|
||||||
|
|
||||||
std::ifstream file(filePath, std::ios::binary);
|
FILE *file = std::fopen(filePath.c_str(), "rb");
|
||||||
if (!file.is_open()) {
|
if (!file) {
|
||||||
throw std::runtime_error("Failed to open JSON file: " + filePath);
|
throw std::runtime_error("Failed to open JSON file: " + filePath);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::stringstream buffer;
|
size_t total_size = 0;
|
||||||
buffer << file.rdbuf();
|
if (std::fseek(file, 0, SEEK_END) == 0) {
|
||||||
file.close();
|
long file_size = std::ftell(file);
|
||||||
std::string json_str = buffer.str();
|
if (file_size > 0) {
|
||||||
size_t total_size = json_str.length();
|
total_size = static_cast<size_t>(file_size);
|
||||||
|
}
|
||||||
|
std::rewind(file);
|
||||||
|
}
|
||||||
|
|
||||||
CityRecordHandler::ParseContext ctx{&db, onCity, onProgress, 0,
|
CityRecordHandler::ParseContext ctx{&db, onCity, onProgress, 0,
|
||||||
total_size, 0, 0};
|
total_size, 0, 0};
|
||||||
CityRecordHandler handler(ctx);
|
CityRecordHandler handler(ctx);
|
||||||
|
|
||||||
Reader reader;
|
Reader reader;
|
||||||
StringStream ss(json_str.c_str());
|
char buf[65536];
|
||||||
|
FileReadStream frs(file, buf, sizeof(buf));
|
||||||
|
|
||||||
if (!reader.Parse(ss, handler)) {
|
if (!reader.Parse(frs, handler)) {
|
||||||
ParseErrorCode errCode = reader.GetParseErrorCode();
|
ParseErrorCode errCode = reader.GetParseErrorCode();
|
||||||
size_t errOffset = reader.GetErrorOffset();
|
size_t errOffset = reader.GetErrorOffset();
|
||||||
|
std::fclose(file);
|
||||||
throw std::runtime_error(std::string("JSON parse error at offset ") +
|
throw std::runtime_error(std::string("JSON parse error at offset ") +
|
||||||
std::to_string(errOffset) +
|
std::to_string(errOffset) +
|
||||||
" (code: " + std::to_string(errCode) + ")");
|
" (code: " + std::to_string(errCode) + ")");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::fclose(file);
|
||||||
|
|
||||||
spdlog::info(" OK: Parsed {} countries, {} states, {} cities",
|
spdlog::info(" OK: Parsed {} countries, {} states, {} cities",
|
||||||
ctx.countries_inserted, ctx.states_inserted, ctx.cities_emitted);
|
ctx.countries_inserted, ctx.states_inserted, ctx.cities_emitted);
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user