mirror of
https://github.com/aaronpo97/the-biergarten-app.git
synced 2026-06-01 01:54:00 +00:00
replace SQLite geo pipeline with curated in-memory locations
This commit is contained in:
@@ -3,11 +3,10 @@
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <unordered_map>
|
||||
#include <vector>
|
||||
|
||||
#include "data_generation/data_generator.h"
|
||||
#include "database/database.h"
|
||||
#include "models/location.h"
|
||||
#include "web_client/web_client.h"
|
||||
#include "wikipedia/wikipedia_service.h"
|
||||
|
||||
@@ -49,8 +48,7 @@ struct ApplicationOptions {
|
||||
* @brief Main data generator class for the Biergarten pipeline.
|
||||
*
|
||||
* This class encapsulates the core logic for generating brewery data.
|
||||
* It handles database initialization, data loading/downloading, and brewery
|
||||
* generation.
|
||||
* It handles location loading, city enrichment, and brewery generation.
|
||||
*/
|
||||
class BiergartenDataGenerator {
|
||||
public:
|
||||
@@ -59,20 +57,17 @@ class BiergartenDataGenerator {
|
||||
*
|
||||
* @param options Application configuration options.
|
||||
* @param web_client HTTP client for downloading data.
|
||||
* @param database SQLite database instance.
|
||||
*/
|
||||
BiergartenDataGenerator(const ApplicationOptions& options,
|
||||
std::shared_ptr<WebClient> web_client,
|
||||
SqliteDatabase& database);
|
||||
std::shared_ptr<WebClient> web_client);
|
||||
|
||||
/**
|
||||
* @brief Run the data generation pipeline.
|
||||
*
|
||||
* Performs the following steps:
|
||||
* 1. Initialize database
|
||||
* 2. Download geographic data if needed
|
||||
* 3. Initialize the generator (LLM or Mock)
|
||||
* 4. Generate brewery data for sample cities
|
||||
* 1. Load curated locations from JSON
|
||||
* 2. Initialize the generator (LLM or Mock)
|
||||
* 3. Generate brewery data for sampled cities
|
||||
*
|
||||
* @return 0 on success, 1 on failure.
|
||||
*/
|
||||
@@ -85,16 +80,11 @@ class BiergartenDataGenerator {
|
||||
/// @brief Shared HTTP client dependency.
|
||||
std::shared_ptr<WebClient> webClient_;
|
||||
|
||||
/// @brief Database dependency.
|
||||
SqliteDatabase& database_;
|
||||
|
||||
/**
|
||||
* @brief Enriched city data with Wikipedia context.
|
||||
*/
|
||||
struct EnrichedCity {
|
||||
int city_id;
|
||||
std::string city_name;
|
||||
std::string country_name;
|
||||
Location location;
|
||||
std::string region_context;
|
||||
};
|
||||
|
||||
@@ -108,25 +98,20 @@ class BiergartenDataGenerator {
|
||||
std::unique_ptr<DataGenerator> InitializeGenerator();
|
||||
|
||||
/**
|
||||
* @brief Download and load geographic data if not cached.
|
||||
*/
|
||||
void LoadGeographicData();
|
||||
|
||||
/**
|
||||
* @brief Query cities from database and build country name map.
|
||||
* @brief Load locations from JSON and sample cities.
|
||||
*
|
||||
* @return Vector of (City, country_name) pairs capped at 30 entries.
|
||||
* @return Vector of sampled locations capped at 30 entries.
|
||||
*/
|
||||
std::vector<std::pair<City, std::string>> QueryCitiesWithCountries();
|
||||
std::vector<Location> QueryCitiesWithCountries();
|
||||
|
||||
/**
|
||||
* @brief Enrich cities with Wikipedia summaries.
|
||||
*
|
||||
* @param cities Vector of (City, country_name) pairs.
|
||||
* @param cities Vector of sampled locations.
|
||||
* @return Vector of enriched city data with context.
|
||||
*/
|
||||
std::vector<EnrichedCity> EnrichWithWikipedia(
|
||||
const std::vector<std::pair<City, std::string>>& cities);
|
||||
const std::vector<Location>& cities);
|
||||
|
||||
/**
|
||||
* @brief Generate breweries for enriched cities.
|
||||
@@ -146,8 +131,7 @@ class BiergartenDataGenerator {
|
||||
* @brief Helper struct to store generated brewery data.
|
||||
*/
|
||||
struct GeneratedBrewery {
|
||||
int city_id;
|
||||
std::string city_name;
|
||||
Location location;
|
||||
BreweryResult brewery;
|
||||
};
|
||||
|
||||
|
||||
@@ -1,31 +0,0 @@
|
||||
#ifndef BIERGARTEN_PIPELINE_DATA_GENERATION_DATA_DOWNLOADER_H_
|
||||
#define BIERGARTEN_PIPELINE_DATA_GENERATION_DATA_DOWNLOADER_H_
|
||||
|
||||
#include <memory>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
|
||||
#include "web_client/web_client.h"
|
||||
|
||||
/// @brief Downloads and caches source geography JSON payloads.
|
||||
class DataDownloader {
|
||||
public:
|
||||
/// @brief Initializes global curl state used by this downloader.
|
||||
explicit DataDownloader(std::shared_ptr<WebClient> web_client);
|
||||
|
||||
/// @brief Cleans up global curl state.
|
||||
~DataDownloader();
|
||||
|
||||
/// @brief Returns a local JSON path, downloading it when cache is missing.
|
||||
std::string DownloadCountriesDatabase(
|
||||
const std::string& cache_path,
|
||||
const std::string& commit =
|
||||
"c5eb7772" // Stable commit: 2026-03-28 export
|
||||
);
|
||||
|
||||
private:
|
||||
static bool FileExists(const std::string& file_path);
|
||||
std::shared_ptr<WebClient> web_client_;
|
||||
};
|
||||
|
||||
#endif // BIERGARTEN_PIPELINE_DATA_GENERATION_DATA_DOWNLOADER_H_
|
||||
@@ -1,87 +0,0 @@
|
||||
#ifndef BIERGARTEN_PIPELINE_DATABASE_DATABASE_H_
|
||||
#define BIERGARTEN_PIPELINE_DATABASE_DATABASE_H_
|
||||
|
||||
#include <sqlite3.h>
|
||||
|
||||
#include <mutex>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
/// @brief Country record from the source dataset.
struct Country {
  /// @brief Country identifier from the source dataset.
  /// Defaults to 0 so a default-constructed Country never carries an
  /// indeterminate id.
  int id = 0;
  /// @brief Country display name.
  std::string name;
  /// @brief ISO 3166-1 alpha-2 code.
  std::string iso2;
  /// @brief ISO 3166-1 alpha-3 code.
  std::string iso3;
};
|
||||
|
||||
/// @brief State or province record from the source dataset.
struct State {
  /// @brief State or province identifier from the source dataset.
  /// Defaults to 0 so a default-constructed State never carries an
  /// indeterminate id.
  int id = 0;
  /// @brief State or province display name.
  std::string name;
  /// @brief State or province short code.
  std::string iso2;
  /// @brief Parent country identifier (0 until assigned).
  int country_id = 0;
};
|
||||
|
||||
/// @brief City record from the source dataset.
struct City {
  /// @brief City identifier from the source dataset.
  /// Defaults to 0 so a default-constructed City never carries an
  /// indeterminate id.
  int id = 0;
  /// @brief City display name.
  std::string name;
  /// @brief Parent country identifier (0 until assigned).
  int country_id = 0;
};
|
||||
|
||||
/// @brief Thread-safe SQLite wrapper for pipeline writes and readbacks.
|
||||
class SqliteDatabase {
|
||||
private:
|
||||
sqlite3* db_ = nullptr;
|
||||
std::mutex db_mutex_;
|
||||
|
||||
void InitializeSchema();
|
||||
|
||||
public:
|
||||
/// @brief Closes the SQLite connection if initialized.
|
||||
~SqliteDatabase();
|
||||
|
||||
/// @brief Opens the SQLite database at db_path and creates schema objects.
|
||||
void Initialize(const std::string& db_path = ":memory:");
|
||||
|
||||
/// @brief Starts a database transaction for batched writes.
|
||||
void BeginTransaction();
|
||||
|
||||
/// @brief Commits the active database transaction.
|
||||
void CommitTransaction();
|
||||
|
||||
/// @brief Rolls back the active database transaction.
|
||||
void RollbackTransaction();
|
||||
|
||||
/// @brief Inserts a country row.
|
||||
void InsertCountry(int id, const std::string& name, const std::string& iso2,
|
||||
const std::string& iso3);
|
||||
|
||||
/// @brief Inserts a state row linked to a country.
|
||||
void InsertState(int id, int country_id, const std::string& name,
|
||||
const std::string& iso2);
|
||||
|
||||
/// @brief Inserts a city row linked to state and country.
|
||||
void InsertCity(int id, int state_id, int country_id,
|
||||
const std::string& name, double latitude, double longitude);
|
||||
|
||||
/// @brief Returns city records including parent country id.
|
||||
std::vector<City> QueryCities();
|
||||
|
||||
/// @brief Returns countries with optional row limit.
|
||||
std::vector<Country> QueryCountries(int limit = 0);
|
||||
|
||||
/// @brief Returns states with optional row limit.
|
||||
std::vector<State> QueryStates(int limit = 0);
|
||||
};
|
||||
|
||||
#endif // BIERGARTEN_PIPELINE_DATABASE_DATABASE_H_
|
||||
@@ -2,16 +2,15 @@
|
||||
#define BIERGARTEN_PIPELINE_JSON_HANDLING_JSON_LOADER_H_
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "database/database.h"
|
||||
#include "json_handling/stream_parser.h"
|
||||
#include "models/location.h"
|
||||
|
||||
/// @brief Loads world-city JSON data into SQLite through streaming parsing.
|
||||
/// @brief Loads curated world locations from a JSON file into memory.
|
||||
class JsonLoader {
|
||||
public:
|
||||
/// @brief Parses a JSON file and writes country/state/city rows into db.
|
||||
static void LoadWorldCities(const std::string& json_path,
|
||||
SqliteDatabase& db);
|
||||
/// @brief Parses a JSON array file and returns all location records.
|
||||
static std::vector<Location> LoadLocations(const std::string& filepath);
|
||||
};
|
||||
|
||||
#endif // BIERGARTEN_PIPELINE_JSON_HANDLING_JSON_LOADER_H_
|
||||
|
||||
@@ -1,52 +0,0 @@
|
||||
#ifndef BIERGARTEN_PIPELINE_JSON_HANDLING_STREAM_PARSER_H_
|
||||
#define BIERGARTEN_PIPELINE_JSON_HANDLING_STREAM_PARSER_H_
|
||||
|
||||
#include <functional>
|
||||
#include <string>
|
||||
|
||||
#include "database/database.h"
|
||||
|
||||
// Forward declaration to avoid circular dependency
|
||||
class SqliteDatabase;
|
||||
|
||||
/// @brief In-memory representation of one parsed city entry.
///
/// Every scalar member has a default so a default-constructed record never
/// exposes indeterminate values.
struct CityRecord {
  /// @brief City identifier.
  int id = 0;
  /// @brief Identifier of the state the city belongs to.
  int state_id = 0;
  /// @brief Identifier of the country the city belongs to.
  int country_id = 0;
  /// @brief City name.
  std::string name;
  /// @brief City latitude.
  double latitude = 0.0;
  /// @brief City longitude.
  double longitude = 0.0;
};
|
||||
|
||||
/// @brief Streaming SAX parser that emits city records during traversal.
|
||||
class StreamingJsonParser {
|
||||
public:
|
||||
/// @brief Parses file_path and invokes callbacks for city rows and progress.
|
||||
static void Parse(const std::string& file_path, SqliteDatabase& db,
|
||||
std::function<void(const CityRecord&)> on_city,
|
||||
std::function<void(size_t, size_t)> on_progress = nullptr);
|
||||
|
||||
private:
|
||||
/// @brief Mutable SAX handler state while traversing nested JSON arrays.
|
||||
struct ParseState {
|
||||
int current_country_id = 0;
|
||||
int current_state_id = 0;
|
||||
|
||||
CityRecord current_city = {};
|
||||
bool building_city = false;
|
||||
std::string current_key;
|
||||
|
||||
int array_depth = 0;
|
||||
int object_depth = 0;
|
||||
bool in_countries_array = false;
|
||||
bool in_states_array = false;
|
||||
bool in_cities_array = false;
|
||||
|
||||
std::function<void(const CityRecord&)> on_city;
|
||||
std::function<void(size_t, size_t)> on_progress;
|
||||
size_t bytes_processed = 0;
|
||||
};
|
||||
};
|
||||
|
||||
#endif // BIERGARTEN_PIPELINE_JSON_HANDLING_STREAM_PARSER_H_
|
||||
Reference in New Issue
Block a user