replace SQLite geo pipeline with curated in-memory locations

This commit is contained in:
Aaron Po
2026-04-07 02:28:15 -04:00
parent 60ee2ecf74
commit b8e96a6d45
14 changed files with 1135 additions and 1079 deletions

View File

@@ -3,11 +3,10 @@
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "data_generation/data_generator.h"
#include "database/database.h"
#include "models/location.h"
#include "web_client/web_client.h"
#include "wikipedia/wikipedia_service.h"
@@ -49,8 +48,7 @@ struct ApplicationOptions {
* @brief Main data generator class for the Biergarten pipeline.
*
* This class encapsulates the core logic for generating brewery data.
* It handles database initialization, data loading/downloading, and brewery
* generation.
* It handles location loading, city enrichment, and brewery generation.
*/
class BiergartenDataGenerator {
public:
@@ -59,20 +57,17 @@ class BiergartenDataGenerator {
*
* @param options Application configuration options.
* @param web_client HTTP client for downloading data.
* @param database SQLite database instance.
*/
BiergartenDataGenerator(const ApplicationOptions& options,
std::shared_ptr<WebClient> web_client,
SqliteDatabase& database);
std::shared_ptr<WebClient> web_client);
/**
* @brief Run the data generation pipeline.
*
* Performs the following steps:
* 1. Initialize database
* 2. Download geographic data if needed
* 3. Initialize the generator (LLM or Mock)
* 4. Generate brewery data for sample cities
* 1. Load curated locations from JSON
* 2. Initialize the generator (LLM or Mock)
* 3. Generate brewery data for sampled cities
*
* @return 0 on success, 1 on failure.
*/
@@ -85,16 +80,11 @@ class BiergartenDataGenerator {
/// @brief Shared HTTP client dependency.
std::shared_ptr<WebClient> webClient_;
/// @brief Database dependency.
SqliteDatabase& database_;
/**
* @brief Enriched city data with Wikipedia context.
*/
struct EnrichedCity {
int city_id;
std::string city_name;
std::string country_name;
Location location;
std::string region_context;
};
@@ -108,25 +98,20 @@ class BiergartenDataGenerator {
std::unique_ptr<DataGenerator> InitializeGenerator();
/**
* @brief Download and load geographic data if not cached.
*/
void LoadGeographicData();
/**
* @brief Query cities from database and build country name map.
* @brief Load locations from JSON and sample cities.
*
* @return Vector of (City, country_name) pairs capped at 30 entries.
* @return Vector of sampled locations capped at 30 entries.
*/
std::vector<std::pair<City, std::string>> QueryCitiesWithCountries();
std::vector<Location> QueryCitiesWithCountries();
/**
* @brief Enrich cities with Wikipedia summaries.
*
* @param cities Vector of (City, country_name) pairs.
* @param cities Vector of sampled locations.
* @return Vector of enriched city data with context.
*/
std::vector<EnrichedCity> EnrichWithWikipedia(
const std::vector<std::pair<City, std::string>>& cities);
const std::vector<Location>& cities);
/**
* @brief Generate breweries for enriched cities.
@@ -146,8 +131,7 @@ class BiergartenDataGenerator {
* @brief Helper struct to store generated brewery data.
*/
struct GeneratedBrewery {
int city_id;
std::string city_name;
Location location;
BreweryResult brewery;
};

View File

@@ -1,31 +0,0 @@
#ifndef BIERGARTEN_PIPELINE_DATA_GENERATION_DATA_DOWNLOADER_H_
#define BIERGARTEN_PIPELINE_DATA_GENERATION_DATA_DOWNLOADER_H_
#include <memory>
#include <stdexcept>
#include <string>
#include "web_client/web_client.h"
/// @brief Fetches the source geography JSON payload and keeps a local cache.
class DataDownloader {
 public:
  /// @brief Sets up the global curl state this downloader relies on.
  /// @param web_client Shared HTTP client used for the download.
  explicit DataDownloader(std::shared_ptr<WebClient> web_client);

  /// @brief Tears down the global curl state.
  ~DataDownloader();

  /// @brief Resolves a local JSON path, downloading the file when the cache
  ///        is missing.
  /// @param cache_path Cache location that is checked before downloading.
  /// @param commit Source revision to fetch; the default "c5eb7772" is the
  ///               stable commit of the 2026-03-28 export.
  /// @return Path to the local JSON file.
  std::string DownloadCountriesDatabase(
      const std::string& cache_path,
      const std::string& commit = "c5eb7772");

 private:
  std::shared_ptr<WebClient> web_client_;

  static bool FileExists(const std::string& file_path);
};
#endif // BIERGARTEN_PIPELINE_DATA_GENERATION_DATA_DOWNLOADER_H_

View File

@@ -1,87 +0,0 @@
#ifndef BIERGARTEN_PIPELINE_DATABASE_DATABASE_H_
#define BIERGARTEN_PIPELINE_DATABASE_DATABASE_H_
#include <sqlite3.h>
#include <mutex>
#include <string>
#include <vector>
/// @brief Country row taken from the source geography dataset.
///
/// The numeric identifier carries a default member initializer so that a
/// default-initialized Country never exposes an indeterminate value.
struct Country {
  /// @brief Country identifier from the source dataset.
  int id = 0;
  /// @brief Country display name.
  std::string name;
  /// @brief ISO 3166-1 alpha-2 code.
  std::string iso2;
  /// @brief ISO 3166-1 alpha-3 code.
  std::string iso3;
};
/// @brief State or province row taken from the source geography dataset.
///
/// Both identifiers carry default member initializers so that a
/// default-initialized State never exposes an indeterminate value.
struct State {
  /// @brief State or province identifier from the source dataset.
  int id = 0;
  /// @brief State or province display name.
  std::string name;
  /// @brief State or province short code.
  std::string iso2;
  /// @brief Parent country identifier.
  int country_id = 0;
};
/// @brief City row taken from the source geography dataset.
///
/// Both identifiers carry default member initializers so that a
/// default-initialized City never exposes an indeterminate value.
struct City {
  /// @brief City identifier from the source dataset.
  int id = 0;
  /// @brief City display name.
  std::string name;
  /// @brief Parent country identifier.
  int country_id = 0;
};
/// @brief SQLite wrapper, described as thread-safe, that the pipeline uses
///        for writes and readbacks.
class SqliteDatabase {
 public:
  /// @brief Closes the SQLite connection when one has been initialized.
  ~SqliteDatabase();

  /// @brief Opens the SQLite database and creates its schema objects.
  /// @param db_path Database location; defaults to ":memory:".
  void Initialize(const std::string& db_path = ":memory:");

  /// @brief Begins a database transaction so writes can be batched.
  void BeginTransaction();

  /// @brief Commits the currently active transaction.
  void CommitTransaction();

  /// @brief Rolls the currently active transaction back.
  void RollbackTransaction();

  /// @brief Inserts one country row.
  void InsertCountry(int id, const std::string& name, const std::string& iso2,
                     const std::string& iso3);

  /// @brief Inserts one state row linked to its country.
  void InsertState(int id, int country_id, const std::string& name,
                   const std::string& iso2);

  /// @brief Inserts one city row linked to its state and country.
  void InsertCity(int id, int state_id, int country_id,
                  const std::string& name, double latitude, double longitude);

  /// @brief Returns city records, each including its parent country id.
  std::vector<City> QueryCities();

  /// @brief Returns countries, optionally limited in row count.
  /// @param limit Optional row limit; defaults to 0.
  std::vector<Country> QueryCountries(int limit = 0);

  /// @brief Returns states, optionally limited in row count.
  /// @param limit Optional row limit; defaults to 0.
  std::vector<State> QueryStates(int limit = 0);

 private:
  void InitializeSchema();

  sqlite3* db_{nullptr};
  std::mutex db_mutex_;
};
#endif // BIERGARTEN_PIPELINE_DATABASE_DATABASE_H_

View File

@@ -2,16 +2,15 @@
#define BIERGARTEN_PIPELINE_JSON_HANDLING_JSON_LOADER_H_
#include <string>
#include <vector>
#include "database/database.h"
#include "json_handling/stream_parser.h"
#include "models/location.h"
/// @brief Loads world-city JSON data into SQLite through streaming parsing.
/// @brief Loads curated world locations from a JSON file into memory.
class JsonLoader {
public:
/// @brief Parses a JSON file and writes country/state/city rows into db.
static void LoadWorldCities(const std::string& json_path,
SqliteDatabase& db);
/// @brief Parses a JSON array file and returns all location records.
static std::vector<Location> LoadLocations(const std::string& filepath);
};
#endif // BIERGARTEN_PIPELINE_JSON_HANDLING_JSON_LOADER_H_

View File

@@ -1,52 +0,0 @@
#ifndef BIERGARTEN_PIPELINE_JSON_HANDLING_STREAM_PARSER_H_
#define BIERGARTEN_PIPELINE_JSON_HANDLING_STREAM_PARSER_H_
#include <functional>
#include <string>
#include "database/database.h"
// Forward declaration to avoid circular dependency.
// NOTE(review): "database/database.h" is already included directly above, so
// this forward declaration looks redundant — confirm before removing either.
class SqliteDatabase;
/// @brief In-memory representation of one parsed city entry.
///
/// Every arithmetic member carries a default member initializer so that a
/// default-initialized CityRecord never exposes an indeterminate value.
/// Member order is unchanged, so positional aggregate initialization still
/// works.
struct CityRecord {
  /// @brief City identifier.
  int id = 0;
  /// @brief Identifier of the state the city is linked to.
  int state_id = 0;
  /// @brief Identifier of the country the city is linked to.
  int country_id = 0;
  /// @brief City name.
  std::string name;
  /// @brief Latitude of the city.
  double latitude = 0.0;
  /// @brief Longitude of the city.
  double longitude = 0.0;
};
/// @brief SAX-style streaming parser that hands out city records while it
///        traverses the input.
class StreamingJsonParser {
 public:
  /// @brief Parses the file at file_path, invoking callbacks for city rows
  ///        and for progress.
  /// @param file_path Path of the JSON file to parse.
  /// @param db Database handle passed alongside the callbacks.
  /// @param on_city Callback invoked with each city record.
  /// @param on_progress Optional progress callback; defaults to nullptr.
  ///        NOTE(review): the meaning of its two size_t arguments is not
  ///        visible in this header — confirm against the implementation.
  static void Parse(const std::string& file_path, SqliteDatabase& db,
                    std::function<void(const CityRecord&)> on_city,
                    std::function<void(size_t, size_t)> on_progress = nullptr);

 private:
  /// @brief Mutable SAX handler state kept while traversing nested JSON
  ///        arrays.
  struct ParseState {
    // Identifiers of the enclosing country/state, judging by the names.
    int current_country_id{0};
    int current_state_id{0};

    // City record under construction and the flag tracking that.
    CityRecord current_city{};
    bool building_city{false};

    // Most recent key and current nesting depths.
    std::string current_key;
    int array_depth{0};
    int object_depth{0};

    // Which of the nested arrays the traversal is currently inside.
    bool in_countries_array{false};
    bool in_states_array{false};
    bool in_cities_array{false};

    // Callback slots with the same signatures as Parse's parameters.
    std::function<void(const CityRecord&)> on_city;
    std::function<void(size_t, size_t)> on_progress;

    // Running byte counter.
    size_t bytes_processed{0};
  };
};
#endif // BIERGARTEN_PIPELINE_JSON_HANDLING_STREAM_PARSER_H_