replace SQLite geo pipeline with curated in-memory locations

This commit is contained in:
Aaron Po
2026-04-07 02:28:15 -04:00
parent 60ee2ecf74
commit b8e96a6d45
14 changed files with 1135 additions and 1079 deletions

2
pipeline/.gitignore vendored
View File

@@ -1,3 +1,5 @@
dist dist
build build
data data
models
*.gguf

View File

@@ -1,170 +1,104 @@
cmake_minimum_required(VERSION 3.20) cmake_minimum_required(VERSION 3.24)
project(biergarten-pipeline VERSION 0.1.0 LANGUAGES CXX) project(biergarten-pipeline)
# =============================================================================
# Allows older dependencies to configure on newer CMake. # 1. GPU Detection
set(CMAKE_POLICY_VERSION_MINIMUM 3.5) # =============================================================================
# GGML_CUDA / GGML_METAL are set here so that the llama.cpp FetchContent below
# Policies # inherits them as cache variables before its CMakeLists.txt is processed.
cmake_policy(SET CMP0167 NEW) # FindBoost improvements if(APPLE)
message(STATUS "[biergarten] Apple Silicon detected — enabling Metal acceleration.")
# Global Settings set(GGML_METAL ON CACHE BOOL "Enable Metal for Apple Silicon" FORCE)
elseif(UNIX AND NOT APPLE)
find_package(CUDAToolkit QUIET)
if(CUDAToolkit_FOUND)
message(STATUS "[biergarten] NVIDIA GPU detected — enabling CUDA acceleration.")
set(GGML_CUDA ON CACHE BOOL "Enable CUDA for NVIDIA GPUs" FORCE)
# 'native' resolves to the exact SM version of the present GPU at configure time
# (e.g. sm_89 for RTX 2000 Ada). Change to a concrete arch list for cross-compilation.
set(CMAKE_CUDA_ARCHITECTURES native)
else()
message(STATUS "[biergarten] No NVIDIA GPU found — falling back to CPU.")
endif()
endif()
# =============================================================================
# 2. Project-wide Settings
# =============================================================================
set(CMAKE_CXX_STANDARD 23) set(CMAKE_CXX_STANDARD 23)
set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
# =============================================================================
option(ENABLE_CLANG_TIDY "Enable clang-tidy static analysis for project targets" ON) # 3. Dependencies
option(ENABLE_CLANG_FORMAT_TARGETS "Enable clang-format helper targets" ON) # =============================================================================
if(ENABLE_CLANG_TIDY)
find_program(CLANG_TIDY_EXE NAMES clang-tidy)
if(CLANG_TIDY_EXE)
set(BIERGARTEN_CLANG_TIDY_COMMAND
"${CLANG_TIDY_EXE};--config-file=${CMAKE_CURRENT_SOURCE_DIR}/.clang-tidy")
message(STATUS "clang-tidy enabled: ${CLANG_TIDY_EXE}")
else()
message(STATUS "clang-tidy not found; static analysis is disabled")
endif()
endif()
# -----------------------------------------------------------------------------
# Compiler Options & Warnings (Interface Library)
# -----------------------------------------------------------------------------
add_library(project_options INTERFACE)
target_compile_options(project_options INTERFACE
$<$<CXX_COMPILER_ID:GNU,Clang>:
-Wall -Wextra -Wpedantic -Wshadow -Wconversion -Wsign-conversion -Wunused
>
$<$<CXX_COMPILER_ID:MSVC>:
/W4 /WX /permissive-
>
)
# -----------------------------------------------------------------------------
# Dependencies
# -----------------------------------------------------------------------------
find_package(CURL REQUIRED)
find_package(SQLite3 REQUIRED)
find_package(Boost 1.75 REQUIRED COMPONENTS program_options json)
include(FetchContent) include(FetchContent)
# --- libcurl ------------------------------------------------------------------
# spdlog (Logging) # Prefer the system package; the build will fail at link time if absent and
# no system curl is found, so emit a fatal error early rather than a silent gap.
find_package(CURL QUIET)
if(NOT CURL_FOUND)
message(FATAL_ERROR
"[biergarten] libcurl not found. Install it via your package manager "
"(e.g. 'sudo dnf install libcurl-devel') or set CURL_ROOT.")
endif()
# --- llama.cpp ----------------------------------------------------------------
# Pinned to a specific commit for reproducible builds.
# To update: pick a new commit SHA from https://github.com/ggml-org/llama.cpp
FetchContent_Declare(
llama-cpp
GIT_REPOSITORY https://github.com/ggml-org/llama.cpp.git
GIT_TAG b8611
)
FetchContent_MakeAvailable(llama-cpp)
# --- Boost (JSON + program_options) ------------------------------------------
FetchContent_Declare(
boost
URL https://github.com/boostorg/boost/releases/download/boost-1.85.0/boost-1.85.0-cmake.tar.gz
)
FetchContent_MakeAvailable(boost)
# --- spdlog -------------------------------------------------------------------
FetchContent_Declare( FetchContent_Declare(
spdlog spdlog
GIT_REPOSITORY https://github.com/gabime/spdlog.git GIT_REPOSITORY https://github.com/gabime/spdlog.git
GIT_TAG v1.11.0 GIT_TAG v1.15.3
) )
FetchContent_MakeAvailable(spdlog) FetchContent_MakeAvailable(spdlog)
# =============================================================================
# llama.cpp (LLM Inference) # 4. Sources
set(LLAMA_BUILD_TESTS OFF CACHE BOOL "" FORCE) # =============================================================================
set(LLAMA_BUILD_EXAMPLES OFF CACHE BOOL "" FORCE) set(SOURCES
set(LLAMA_BUILD_SERVER OFF CACHE BOOL "" FORCE) src/main.cpp
FetchContent_Declare(
llama_cpp
GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git
GIT_TAG b8611
)
FetchContent_MakeAvailable(llama_cpp)
if(TARGET llama)
target_compile_options(llama PRIVATE
$<$<CXX_COMPILER_ID:AppleClang>:-include algorithm>
)
endif()
# -----------------------------------------------------------------------------
# Main Executable
# -----------------------------------------------------------------------------
set(PIPELINE_SOURCES
src/biergarten_data_generator.cpp src/biergarten_data_generator.cpp
src/web_client/curl_web_client.cpp
src/data_generation/data_downloader.cpp
src/database/database.cpp
src/json_handling/json_loader.cpp
src/data_generation/llama/destructor.cpp src/data_generation/llama/destructor.cpp
src/data_generation/llama/set_sampling_options.cpp
src/data_generation/llama/load.cpp
src/data_generation/llama/infer.cpp
src/data_generation/llama/generate_brewery.cpp src/data_generation/llama/generate_brewery.cpp
src/data_generation/llama/generate_user.cpp src/data_generation/llama/generate_user.cpp
src/data_generation/llama/helpers.cpp src/data_generation/llama/helpers.cpp
src/data_generation/llama/infer.cpp
src/data_generation/llama/load.cpp
src/data_generation/llama/load_brewery_prompt.cpp src/data_generation/llama/load_brewery_prompt.cpp
src/data_generation/llama/set_sampling_options.cpp
src/data_generation/mock/data.cpp src/data_generation/mock/data.cpp
src/data_generation/mock/deterministic_hash.cpp src/data_generation/mock/deterministic_hash.cpp
src/data_generation/mock/load.cpp
src/data_generation/mock/generate_brewery.cpp src/data_generation/mock/generate_brewery.cpp
src/data_generation/mock/generate_user.cpp src/data_generation/mock/generate_user.cpp
src/json_handling/stream_parser.cpp src/data_generation/mock/load.cpp
src/json_handling/json_loader.cpp
src/web_client/curl_web_client.cpp
src/wikipedia/wikipedia_service.cpp src/wikipedia/wikipedia_service.cpp
src/main.cpp
) )
# =============================================================================
add_executable(biergarten-pipeline ${PIPELINE_SOURCES}) # 5. Target
# =============================================================================
if(BIERGARTEN_CLANG_TIDY_COMMAND) add_executable(${PROJECT_NAME}
set_target_properties(biergarten-pipeline PROPERTIES ${SOURCES}
CXX_CLANG_TIDY "${BIERGARTEN_CLANG_TIDY_COMMAND}"
) )
endif() target_include_directories(${PROJECT_NAME} PRIVATE
includes
target_include_directories(biergarten-pipeline ${llama-cpp_SOURCE_DIR}/include
PRIVATE ${llama-cpp_SOURCE_DIR}/common
${CMAKE_CURRENT_SOURCE_DIR}/includes
${llama_cpp_SOURCE_DIR}/include
) )
target_link_libraries(${PROJECT_NAME} PRIVATE
target_link_libraries(biergarten-pipeline
PRIVATE
project_options
CURL::libcurl
SQLite::SQLite3
spdlog::spdlog
llama llama
Boost::program_options boost_json
Boost::json boost_program_options
spdlog::spdlog
CURL::libcurl
) )
if(ENABLE_CLANG_FORMAT_TARGETS)
find_program(CLANG_FORMAT_EXE NAMES clang-format)
if(CLANG_FORMAT_EXE)
file(GLOB_RECURSE FORMAT_SOURCES CONFIGURE_DEPENDS
${CMAKE_CURRENT_SOURCE_DIR}/src/**/*.cpp
${CMAKE_CURRENT_SOURCE_DIR}/src/**/*.cc
${CMAKE_CURRENT_SOURCE_DIR}/includes/**/*.h
${CMAKE_CURRENT_SOURCE_DIR}/includes/**/*.hpp
)
add_custom_target(format
COMMAND ${CLANG_FORMAT_EXE} -style=file -i ${FORMAT_SOURCES}
COMMENT "Formatting source files with clang-format (Google style)"
VERBATIM
)
add_custom_target(format-check
COMMAND ${CLANG_FORMAT_EXE} -style=file --dry-run --Werror ${FORMAT_SOURCES}
COMMENT "Checking source formatting with clang-format (Google style)"
VERBATIM
)
else()
message(STATUS "clang-format not found; format targets are disabled")
endif()
endif()
# -----------------------------------------------------------------------------
# Post-Build Steps & Utilities
# -----------------------------------------------------------------------------
add_custom_command(TARGET biergarten-pipeline POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_SOURCE_DIR}/output
COMMENT "Ensuring output directory exists"
)
find_program(VALGRIND valgrind)
if(VALGRIND)
add_custom_target(memcheck
COMMAND ${VALGRIND} --leak-check=full --error-exitcode=1 $<TARGET_FILE:biergarten-pipeline> --help
DEPENDS biergarten-pipeline
COMMENT "Running Valgrind memory check"
)
endif()

View File

@@ -3,11 +3,10 @@
#include <memory> #include <memory>
#include <string> #include <string>
#include <unordered_map>
#include <vector> #include <vector>
#include "data_generation/data_generator.h" #include "data_generation/data_generator.h"
#include "database/database.h" #include "models/location.h"
#include "web_client/web_client.h" #include "web_client/web_client.h"
#include "wikipedia/wikipedia_service.h" #include "wikipedia/wikipedia_service.h"
@@ -49,8 +48,7 @@ struct ApplicationOptions {
* @brief Main data generator class for the Biergarten pipeline. * @brief Main data generator class for the Biergarten pipeline.
* *
* This class encapsulates the core logic for generating brewery data. * This class encapsulates the core logic for generating brewery data.
* It handles database initialization, data loading/downloading, and brewery * It handles location loading, city enrichment, and brewery generation.
* generation.
*/ */
class BiergartenDataGenerator { class BiergartenDataGenerator {
public: public:
@@ -59,20 +57,17 @@ class BiergartenDataGenerator {
* *
* @param options Application configuration options. * @param options Application configuration options.
* @param web_client HTTP client for downloading data. * @param web_client HTTP client for downloading data.
* @param database SQLite database instance.
*/ */
BiergartenDataGenerator(const ApplicationOptions& options, BiergartenDataGenerator(const ApplicationOptions& options,
std::shared_ptr<WebClient> web_client, std::shared_ptr<WebClient> web_client);
SqliteDatabase& database);
/** /**
* @brief Run the data generation pipeline. * @brief Run the data generation pipeline.
* *
* Performs the following steps: * Performs the following steps:
* 1. Initialize database * 1. Load curated locations from JSON
* 2. Download geographic data if needed * 2. Initialize the generator (LLM or Mock)
* 3. Initialize the generator (LLM or Mock) * 3. Generate brewery data for sampled cities
* 4. Generate brewery data for sample cities
* *
* @return 0 on success, 1 on failure. * @return 0 on success, 1 on failure.
*/ */
@@ -85,16 +80,11 @@ class BiergartenDataGenerator {
/// @brief Shared HTTP client dependency. /// @brief Shared HTTP client dependency.
std::shared_ptr<WebClient> webClient_; std::shared_ptr<WebClient> webClient_;
/// @brief Database dependency.
SqliteDatabase& database_;
/** /**
* @brief Enriched city data with Wikipedia context. * @brief Enriched city data with Wikipedia context.
*/ */
struct EnrichedCity { struct EnrichedCity {
int city_id; Location location;
std::string city_name;
std::string country_name;
std::string region_context; std::string region_context;
}; };
@@ -108,25 +98,20 @@ class BiergartenDataGenerator {
std::unique_ptr<DataGenerator> InitializeGenerator(); std::unique_ptr<DataGenerator> InitializeGenerator();
/** /**
* @brief Download and load geographic data if not cached. * @brief Load locations from JSON and sample cities.
*/
void LoadGeographicData();
/**
* @brief Query cities from database and build country name map.
* *
* @return Vector of (City, country_name) pairs capped at 30 entries. * @return Vector of sampled locations capped at 30 entries.
*/ */
std::vector<std::pair<City, std::string>> QueryCitiesWithCountries(); std::vector<Location> QueryCitiesWithCountries();
/** /**
* @brief Enrich cities with Wikipedia summaries. * @brief Enrich cities with Wikipedia summaries.
* *
* @param cities Vector of (City, country_name) pairs. * @param cities Vector of sampled locations.
* @return Vector of enriched city data with context. * @return Vector of enriched city data with context.
*/ */
std::vector<EnrichedCity> EnrichWithWikipedia( std::vector<EnrichedCity> EnrichWithWikipedia(
const std::vector<std::pair<City, std::string>>& cities); const std::vector<Location>& cities);
/** /**
* @brief Generate breweries for enriched cities. * @brief Generate breweries for enriched cities.
@@ -146,8 +131,7 @@ class BiergartenDataGenerator {
* @brief Helper struct to store generated brewery data. * @brief Helper struct to store generated brewery data.
*/ */
struct GeneratedBrewery { struct GeneratedBrewery {
int city_id; Location location;
std::string city_name;
BreweryResult brewery; BreweryResult brewery;
}; };

View File

@@ -1,31 +0,0 @@
#ifndef BIERGARTEN_PIPELINE_DATA_GENERATION_DATA_DOWNLOADER_H_
#define BIERGARTEN_PIPELINE_DATA_GENERATION_DATA_DOWNLOADER_H_
#include <memory>
#include <stdexcept>
#include <string>
#include "web_client/web_client.h"
/// @brief Downloads and caches source geography JSON payloads.
///
/// NOTE(review): the constructor/destructor comments below describe per-instance
/// setup and cleanup of global curl state, yet nothing in this declaration
/// disables copying — confirm that copies cannot unbalance that setup/cleanup.
class DataDownloader {
public:
/// @brief Initializes global curl state used by this downloader.
/// @param web_client Shared HTTP client (presumably retained in web_client_;
///        the definition is not visible in this header).
explicit DataDownloader(std::shared_ptr<WebClient> web_client);
/// @brief Cleans up global curl state.
~DataDownloader();
/// @brief Returns a local JSON path, downloading it when cache is missing.
/// @param cache_path Location of the cached JSON file.
/// @param commit Revision of the upstream dataset to fetch; defaults to the
///        pinned value "c5eb7772".
/// @return Path to the local JSON file.
std::string DownloadCountriesDatabase(
const std::string& cache_path,
const std::string& commit =
"c5eb7772" // Stable commit: 2026-03-28 export
);
private:
/// @brief Reports whether file_path exists (presumably on the local
///        filesystem — confirm against the definition).
static bool FileExists(const std::string& file_path);
/// @brief HTTP client shared with the rest of the pipeline.
std::shared_ptr<WebClient> web_client_;
};
#endif // BIERGARTEN_PIPELINE_DATA_GENERATION_DATA_DOWNLOADER_H_

View File

@@ -1,87 +0,0 @@
#ifndef BIERGARTEN_PIPELINE_DATABASE_DATABASE_H_
#define BIERGARTEN_PIPELINE_DATABASE_DATABASE_H_
#include <sqlite3.h>
#include <mutex>
#include <string>
#include <vector>
/// @brief One country record from the source dataset.
///
/// Remains an aggregate, so brace initialization (`Country{1, "X", "XX", "XXX"}`)
/// keeps working; the default member initializer only guarantees that a
/// default-constructed record never carries an indeterminate id.
struct Country {
  /// @brief Country identifier from the source dataset.
  int id = 0;
  /// @brief Country display name.
  std::string name;
  /// @brief ISO 3166-1 alpha-2 code.
  std::string iso2;
  /// @brief ISO 3166-1 alpha-3 code.
  std::string iso3;
};
/// @brief One state or province record from the source dataset.
///
/// Remains an aggregate; the default member initializers guarantee that a
/// default-constructed record never carries indeterminate identifiers.
struct State {
  /// @brief State or province identifier from the source dataset.
  int id = 0;
  /// @brief State or province display name.
  std::string name;
  /// @brief State or province short code.
  std::string iso2;
  /// @brief Parent country identifier.
  int country_id = 0;
};
/// @brief One city record from the source dataset.
///
/// Remains an aggregate; the default member initializers guarantee that a
/// default-constructed record never carries indeterminate identifiers.
struct City {
  /// @brief City identifier from the source dataset.
  int id = 0;
  /// @brief City display name.
  std::string name;
  /// @brief Parent country identifier.
  int country_id = 0;
};
/// @brief Thread-safe SQLite wrapper for pipeline writes and readbacks.
///
/// Owns a raw sqlite3 handle and a mutex. The std::mutex member makes the
/// class non-copyable and non-movable.
class SqliteDatabase {
private:
/// @brief Raw SQLite connection handle; starts out null.
sqlite3* db_ = nullptr;
/// @brief Presumably serializes access to db_ — confirm in the definition.
std::mutex db_mutex_;
/// @brief Schema setup helper (presumably invoked by Initialize(), whose
///        comment says it creates schema objects — confirm).
void InitializeSchema();
public:
/// @brief Closes the SQLite connection if initialized.
~SqliteDatabase();
/// @brief Opens the SQLite database at db_path and creates schema objects.
/// @param db_path Database location; defaults to ":memory:".
void Initialize(const std::string& db_path = ":memory:");
/// @brief Starts a database transaction for batched writes.
void BeginTransaction();
/// @brief Commits the active database transaction.
void CommitTransaction();
/// @brief Rolls back the active database transaction.
void RollbackTransaction();
/// @brief Inserts a country row.
/// @param id Country identifier.
/// @param name Country display name.
/// @param iso2 Two-letter country code.
/// @param iso3 Three-letter country code.
void InsertCountry(int id, const std::string& name, const std::string& iso2,
const std::string& iso3);
/// @brief Inserts a state row linked to a country.
/// @param id State identifier.
/// @param country_id Identifier of the parent country.
/// @param name State display name.
/// @param iso2 State short code.
void InsertState(int id, int country_id, const std::string& name,
const std::string& iso2);
/// @brief Inserts a city row linked to state and country.
/// @param id City identifier.
/// @param state_id Identifier of the parent state.
/// @param country_id Identifier of the parent country.
/// @param name City display name.
/// @param latitude City latitude (presumably decimal degrees — confirm).
/// @param longitude City longitude (presumably decimal degrees — confirm).
void InsertCity(int id, int state_id, int country_id,
const std::string& name, double latitude, double longitude);
/// @brief Returns city records including parent country id.
std::vector<City> QueryCities();
/// @brief Returns countries with optional row limit.
/// @param limit Row cap; defaults to 0 (presumably meaning "no limit" —
///        confirm against the query implementation).
std::vector<Country> QueryCountries(int limit = 0);
/// @brief Returns states with optional row limit.
/// @param limit Row cap; defaults to 0 (presumably meaning "no limit" —
///        confirm against the query implementation).
std::vector<State> QueryStates(int limit = 0);
};
#endif // BIERGARTEN_PIPELINE_DATABASE_DATABASE_H_

View File

@@ -2,16 +2,15 @@
#define BIERGARTEN_PIPELINE_JSON_HANDLING_JSON_LOADER_H_ #define BIERGARTEN_PIPELINE_JSON_HANDLING_JSON_LOADER_H_
#include <string> #include <string>
#include <vector>
#include "database/database.h" #include "models/location.h"
#include "json_handling/stream_parser.h"
/// @brief Loads world-city JSON data into SQLite through streaming parsing. /// @brief Loads curated world locations from a JSON file into memory.
class JsonLoader { class JsonLoader {
public: public:
/// @brief Parses a JSON file and writes country/state/city rows into db. /// @brief Parses a JSON array file and returns all location records.
static void LoadWorldCities(const std::string& json_path, static std::vector<Location> LoadLocations(const std::string& filepath);
SqliteDatabase& db);
}; };
#endif // BIERGARTEN_PIPELINE_JSON_HANDLING_JSON_LOADER_H_ #endif // BIERGARTEN_PIPELINE_JSON_HANDLING_JSON_LOADER_H_

View File

@@ -1,52 +0,0 @@
#ifndef BIERGARTEN_PIPELINE_JSON_HANDLING_STREAM_PARSER_H_
#define BIERGARTEN_PIPELINE_JSON_HANDLING_STREAM_PARSER_H_
#include <functional>
#include <string>
#include "database/database.h"
// Forward declaration to avoid circular dependency
class SqliteDatabase;
/// @brief In-memory representation of one parsed city entry.
///
/// Remains an aggregate, so positional brace initialization keeps working.
/// Every numeric field has a default member initializer so that a
/// default-constructed record (`CityRecord r;`) never exposes indeterminate
/// values to the code that reads it.
struct CityRecord {
  /// @brief City identifier.
  int id = 0;
  /// @brief Identifier of the parent state.
  int state_id = 0;
  /// @brief Identifier of the parent country.
  int country_id = 0;
  /// @brief City display name.
  std::string name;
  /// @brief City latitude.
  double latitude = 0.0;
  /// @brief City longitude.
  double longitude = 0.0;
};
/// @brief Streaming SAX parser that emits city records during traversal.
class StreamingJsonParser {
public:
/// @brief Parses file_path and invokes callbacks for city rows and progress.
/// @param file_path Path of the JSON file to parse.
/// @param db Database handle; how it is used is not visible from this
///        declaration — see the definition.
/// @param on_city Callback invoked with each city record.
/// @param on_progress Optional progress callback taking two size_t values
///        (presumably bytes processed and total bytes — confirm); defaults
///        to nullptr.
static void Parse(const std::string& file_path, SqliteDatabase& db,
std::function<void(const CityRecord&)> on_city,
std::function<void(size_t, size_t)> on_progress = nullptr);
private:
/// @brief Mutable SAX handler state while traversing nested JSON arrays.
struct ParseState {
// Identifiers of the enclosing country/state (presumably updated as the
// parser descends — confirm in the handler implementation).
int current_country_id = 0;
int current_state_id = 0;
// City record under construction; value-initialized.
CityRecord current_city = {};
bool building_city = false;
// Most recently seen object key (presumably — confirm in the handler).
std::string current_key;
// Nesting counters for arrays and objects.
int array_depth = 0;
int object_depth = 0;
// Flags naming which of the countries/states/cities arrays is being read.
bool in_countries_array = false;
bool in_states_array = false;
bool in_cities_array = false;
// Callbacks with the same signatures as the Parse() parameters.
std::function<void(const CityRecord&)> on_city;
std::function<void(size_t, size_t)> on_progress;
// Running byte count; starts at zero.
size_t bytes_processed = 0;
};
};
#endif // BIERGARTEN_PIPELINE_JSON_HANDLING_STREAM_PARSER_H_

902
pipeline/locations.json Normal file
View File

@@ -0,0 +1,902 @@
[
{
"city": "Cape Town",
"state_province": "Western Cape",
"iso3166_2": "ZA-WC",
"country": "South Africa",
"iso3166_1": "ZA",
"latitude": -33.9249,
"longitude": 18.4241
},
{
"city": "Johannesburg",
"state_province": "Gauteng",
"iso3166_2": "ZA-GP",
"country": "South Africa",
"iso3166_1": "ZA",
"latitude": -26.2041,
"longitude": 28.0473
},
{
"city": "Durban",
"state_province": "KwaZulu-Natal",
"iso3166_2": "ZA-KZN",
"country": "South Africa",
"iso3166_1": "ZA",
"latitude": -29.8587,
"longitude": 31.0218
},
{
"city": "Franschhoek",
"state_province": "Western Cape",
"iso3166_2": "ZA-WC",
"country": "South Africa",
"iso3166_1": "ZA",
"latitude": -33.9146,
"longitude": 19.1198
},
{
"city": "Nairobi",
"state_province": "Nairobi",
"iso3166_2": "KE-30",
"country": "Kenya",
"iso3166_1": "KE",
"latitude": -1.2921,
"longitude": 36.8219
},
{
"city": "Buenos Aires",
"state_province": "Buenos Aires City",
"iso3166_2": "AR-C",
"country": "Argentina",
"iso3166_1": "AR",
"latitude": -34.6037,
"longitude": -58.3816
},
{
"city": "Bariloche",
"state_province": "Río Negro",
"iso3166_2": "AR-R",
"country": "Argentina",
"iso3166_1": "AR",
"latitude": -41.1335,
"longitude": -71.3103
},
{
"city": "Bogotá",
"state_province": "Bogotá D.C.",
"iso3166_2": "CO-DC",
"country": "Colombia",
"iso3166_1": "CO",
"latitude": 4.711,
"longitude": -74.0721
},
{
"city": "Medellín",
"state_province": "Antioquia",
"iso3166_2": "CO-ANT",
"country": "Colombia",
"iso3166_1": "CO",
"latitude": 6.2442,
"longitude": -75.5812
},
{
"city": "São Paulo",
"state_province": "São Paulo",
"iso3166_2": "BR-SP",
"country": "Brazil",
"iso3166_1": "BR",
"latitude": -23.5505,
"longitude": -46.6333
},
{
"city": "Curitiba",
"state_province": "Paraná",
"iso3166_2": "BR-PR",
"country": "Brazil",
"iso3166_1": "BR",
"latitude": -25.4284,
"longitude": -49.2733
},
{
"city": "Rio de Janeiro",
"state_province": "Rio de Janeiro",
"iso3166_2": "BR-RJ",
"country": "Brazil",
"iso3166_1": "BR",
"latitude": -22.9068,
"longitude": -43.1729
},
{
"city": "Santiago",
"state_province": "Santiago Metropolitan",
"iso3166_2": "CL-RM",
"country": "Chile",
"iso3166_1": "CL",
"latitude": -33.4489,
"longitude": -70.6693
},
{
"city": "Valdivia",
"state_province": "Los Ríos",
"iso3166_2": "CL-LR",
"country": "Chile",
"iso3166_1": "CL",
"latitude": -39.8142,
"longitude": -73.2459
},
{
"city": "Lima",
"state_province": "Lima",
"iso3166_2": "PE-LMA",
"country": "Peru",
"iso3166_1": "PE",
"latitude": -12.0464,
"longitude": -77.0428
},
{
"city": "Tokyo",
"state_province": "Tokyo",
"iso3166_2": "JP-13",
"country": "Japan",
"iso3166_1": "JP",
"latitude": 35.6762,
"longitude": 139.6503
},
{
"city": "Osaka",
"state_province": "Osaka",
"iso3166_2": "JP-27",
"country": "Japan",
"iso3166_1": "JP",
"latitude": 34.6937,
"longitude": 135.5023
},
{
"city": "Kyoto",
"state_province": "Kyoto",
"iso3166_2": "JP-26",
"country": "Japan",
"iso3166_1": "JP",
"latitude": 35.0116,
"longitude": 135.7681
},
{
"city": "Sapporo",
"state_province": "Hokkaido",
"iso3166_2": "JP-01",
"country": "Japan",
"iso3166_1": "JP",
"latitude": 43.0618,
"longitude": 141.3545
},
{
"city": "Seoul",
"state_province": "Seoul",
"iso3166_2": "KR-11",
"country": "South Korea",
"iso3166_1": "KR",
"latitude": 37.5665,
"longitude": 126.978
},
{
"city": "Busan",
"state_province": "Busan",
"iso3166_2": "KR-26",
"country": "South Korea",
"iso3166_1": "KR",
"latitude": 35.1796,
"longitude": 129.0756
},
{
"city": "Ho Chi Minh City",
"state_province": "Ho Chi Minh",
"iso3166_2": "VN-SG",
"country": "Vietnam",
"iso3166_1": "VN",
"latitude": 10.8231,
"longitude": 106.6297
},
{
"city": "Hanoi",
"state_province": "Hanoi",
"iso3166_2": "VN-HN",
"country": "Vietnam",
"iso3166_1": "VN",
"latitude": 21.0285,
"longitude": 105.8542
},
{
"city": "Da Nang",
"state_province": "Da Nang",
"iso3166_2": "VN-DN",
"country": "Vietnam",
"iso3166_1": "VN",
"latitude": 16.0544,
"longitude": 108.2022
},
{
"city": "Bangkok",
"state_province": "Bangkok",
"iso3166_2": "TH-10",
"country": "Thailand",
"iso3166_1": "TH",
"latitude": 13.7563,
"longitude": 100.5018
},
{
"city": "Taipei",
"state_province": "Taipei",
"iso3166_2": "TW-TPE",
"country": "Taiwan",
"iso3166_1": "TW",
"latitude": 25.033,
"longitude": 121.5654
},
{
"city": "Beijing",
"state_province": "Beijing",
"iso3166_2": "CN-BJ",
"country": "China",
"iso3166_1": "CN",
"latitude": 39.9042,
"longitude": 116.4074
},
{
"city": "Shanghai",
"state_province": "Shanghai",
"iso3166_2": "CN-SH",
"country": "China",
"iso3166_1": "CN",
"latitude": 31.2304,
"longitude": 121.4737
},
{
"city": "Bengaluru",
"state_province": "Karnataka",
"iso3166_2": "IN-KA",
"country": "India",
"iso3166_1": "IN",
"latitude": 12.9716,
"longitude": 77.5946
},
{
"city": "Singapore",
"state_province": "Central Singapore",
"iso3166_2": "SG-01",
"country": "Singapore",
"iso3166_1": "SG",
"latitude": 1.3521,
"longitude": 103.8198
},
{
"city": "Melbourne",
"state_province": "Victoria",
"iso3166_2": "AU-VIC",
"country": "Australia",
"iso3166_1": "AU",
"latitude": -37.8136,
"longitude": 144.9631
},
{
"city": "Sydney",
"state_province": "New South Wales",
"iso3166_2": "AU-NSW",
"country": "Australia",
"iso3166_1": "AU",
"latitude": -33.8688,
"longitude": 151.2093
},
{
"city": "Brisbane",
"state_province": "Queensland",
"iso3166_2": "AU-QLD",
"country": "Australia",
"iso3166_1": "AU",
"latitude": -27.4705,
"longitude": 153.026
},
{
"city": "Adelaide",
"state_province": "South Australia",
"iso3166_2": "AU-SA",
"country": "Australia",
"iso3166_1": "AU",
"latitude": -34.9285,
"longitude": 138.6007
},
{
"city": "Perth",
"state_province": "Western Australia",
"iso3166_2": "AU-WA",
"country": "Australia",
"iso3166_1": "AU",
"latitude": -31.9505,
"longitude": 115.8605
},
{
"city": "Hobart",
"state_province": "Tasmania",
"iso3166_2": "AU-TAS",
"country": "Australia",
"iso3166_1": "AU",
"latitude": -42.8821,
"longitude": 147.3272
},
{
"city": "Wellington",
"state_province": "Wellington",
"iso3166_2": "NZ-WGN",
"country": "New Zealand",
"iso3166_1": "NZ",
"latitude": -41.2865,
"longitude": 174.7762
},
{
"city": "Auckland",
"state_province": "Auckland",
"iso3166_2": "NZ-AUK",
"country": "New Zealand",
"iso3166_1": "NZ",
"latitude": -36.8485,
"longitude": 174.7633
},
{
"city": "Christchurch",
"state_province": "Canterbury",
"iso3166_2": "NZ-CAN",
"country": "New Zealand",
"iso3166_1": "NZ",
"latitude": -43.532,
"longitude": 172.6306
},
{
"city": "Nelson",
"state_province": "Nelson",
"iso3166_2": "NZ-NSN",
"country": "New Zealand",
"iso3166_1": "NZ",
"latitude": -41.2706,
"longitude": 173.284
},
{
"city": "Munich",
"state_province": "Bavaria",
"iso3166_2": "DE-BY",
"country": "Germany",
"iso3166_1": "DE",
"latitude": 48.1351,
"longitude": 11.582
},
{
"city": "Berlin",
"state_province": "Berlin",
"iso3166_2": "DE-BE",
"country": "Germany",
"iso3166_1": "DE",
"latitude": 52.52,
"longitude": 13.405
},
{
"city": "Cologne",
"state_province": "North Rhine-Westphalia",
"iso3166_2": "DE-NW",
"country": "Germany",
"iso3166_1": "DE",
"latitude": 50.9375,
"longitude": 6.9603
},
{
"city": "Bamberg",
"state_province": "Bavaria",
"iso3166_2": "DE-BY",
"country": "Germany",
"iso3166_1": "DE",
"latitude": 49.8916,
"longitude": 10.8916
},
{
"city": "Brussels",
"state_province": "Brussels-Capital",
"iso3166_2": "BE-BRU",
"country": "Belgium",
"iso3166_1": "BE",
"latitude": 50.8503,
"longitude": 4.3517
},
{
"city": "Antwerp",
"state_province": "Flanders",
"iso3166_2": "BE-VLG",
"country": "Belgium",
"iso3166_1": "BE",
"latitude": 51.2194,
"longitude": 4.4025
},
{
"city": "Bruges",
"state_province": "Flanders",
"iso3166_2": "BE-VLG",
"country": "Belgium",
"iso3166_1": "BE",
"latitude": 51.2093,
"longitude": 3.2247
},
{
"city": "London",
"state_province": "England",
"iso3166_2": "GB-ENG",
"country": "United Kingdom",
"iso3166_1": "GB",
"latitude": 51.5074,
"longitude": -0.1278
},
{
"city": "Bristol",
"state_province": "England",
"iso3166_2": "GB-ENG",
"country": "United Kingdom",
"iso3166_1": "GB",
"latitude": 51.4545,
"longitude": -2.5879
},
{
"city": "Edinburgh",
"state_province": "Scotland",
"iso3166_2": "GB-SCT",
"country": "United Kingdom",
"iso3166_1": "GB",
"latitude": 55.9533,
"longitude": -3.1883
},
{
"city": "Glasgow",
"state_province": "Scotland",
"iso3166_2": "GB-SCT",
"country": "United Kingdom",
"iso3166_1": "GB",
"latitude": 55.8642,
"longitude": -4.2518
},
{
"city": "Prague",
"state_province": "Prague",
"iso3166_2": "CZ-10",
"country": "Czechia",
"iso3166_1": "CZ",
"latitude": 50.0755,
"longitude": 14.4378
},
{
"city": "Pilsen",
"state_province": "Plzeň",
"iso3166_2": "CZ-32",
"country": "Czechia",
"iso3166_1": "CZ",
"latitude": 49.7384,
"longitude": 13.3736
},
{
"city": "Amsterdam",
"state_province": "North Holland",
"iso3166_2": "NL-NH",
"country": "Netherlands",
"iso3166_1": "NL",
"latitude": 52.3676,
"longitude": 4.9041
},
{
"city": "Copenhagen",
"state_province": "Capital Region",
"iso3166_2": "DK-84",
"country": "Denmark",
"iso3166_1": "DK",
"latitude": 55.6761,
"longitude": 12.5683
},
{
"city": "Warsaw",
"state_province": "Masovian",
"iso3166_2": "PL-14",
"country": "Poland",
"iso3166_1": "PL",
"latitude": 52.2297,
"longitude": 21.0122
},
{
"city": "Krakow",
"state_province": "Lesser Poland",
"iso3166_2": "PL-12",
"country": "Poland",
"iso3166_1": "PL",
"latitude": 50.0647,
"longitude": 19.945
},
{
"city": "Rome",
"state_province": "Lazio",
"iso3166_2": "IT-62",
"country": "Italy",
"iso3166_1": "IT",
"latitude": 41.9028,
"longitude": 12.4964
},
{
"city": "Milan",
"state_province": "Lombardy",
"iso3166_2": "IT-25",
"country": "Italy",
"iso3166_1": "IT",
"latitude": 45.4642,
"longitude": 9.19
},
{
"city": "Barcelona",
"state_province": "Catalonia",
"iso3166_2": "ES-CT",
"country": "Spain",
"iso3166_1": "ES",
"latitude": 41.3851,
"longitude": 2.1734
},
{
"city": "Madrid",
"state_province": "Madrid",
"iso3166_2": "ES-MD",
"country": "Spain",
"iso3166_1": "ES",
"latitude": 40.4168,
"longitude": -3.7038
},
{
"city": "Paris",
"state_province": "Île-de-France",
"iso3166_2": "FR-IDF",
"country": "France",
"iso3166_1": "FR",
"latitude": 48.8566,
"longitude": 2.3522
},
{
"city": "Lyon",
"state_province": "Auvergne-Rhône-Alpes",
"iso3166_2": "FR-ARA",
"country": "France",
"iso3166_1": "FR",
"latitude": 45.764,
"longitude": 4.8357
},
{
"city": "Stockholm",
"state_province": "Stockholm",
"iso3166_2": "SE-AB",
"country": "Sweden",
"iso3166_1": "SE",
"latitude": 59.3293,
"longitude": 18.0686
},
{
"city": "Gothenburg",
"state_province": "Västra Götaland",
"iso3166_2": "SE-O",
"country": "Sweden",
"iso3166_1": "SE",
"latitude": 57.7089,
"longitude": 11.9746
},
{
"city": "Oslo",
"state_province": "Oslo",
"iso3166_2": "NO-03",
"country": "Norway",
"iso3166_1": "NO",
"latitude": 59.9139,
"longitude": 10.7522
},
{
"city": "Dublin",
"state_province": "Leinster",
"iso3166_2": "IE-L",
"country": "Ireland",
"iso3166_1": "IE",
"latitude": 53.3498,
"longitude": -6.2603
},
{
"city": "Vienna",
"state_province": "Vienna",
"iso3166_2": "AT-9",
"country": "Austria",
"iso3166_1": "AT",
"latitude": 48.2082,
"longitude": 16.3738
},
{
"city": "Zurich",
"state_province": "Zurich",
"iso3166_2": "CH-ZH",
"country": "Switzerland",
"iso3166_1": "CH",
"latitude": 47.3769,
"longitude": 8.5417
},
{
"city": "Tallinn",
"state_province": "Harju",
"iso3166_2": "EE-37",
"country": "Estonia",
"iso3166_1": "EE",
"latitude": 59.437,
"longitude": 24.7536
},
{
"city": "Denver",
"state_province": "Colorado",
"iso3166_2": "US-CO",
"country": "United States",
"iso3166_1": "US",
"latitude": 39.7392,
"longitude": -104.9903
},
{
"city": "Portland",
"state_province": "Oregon",
"iso3166_2": "US-OR",
"country": "United States",
"iso3166_1": "US",
"latitude": 45.5152,
"longitude": -122.6784
},
{
"city": "San Diego",
"state_province": "California",
"iso3166_2": "US-CA",
"country": "United States",
"iso3166_1": "US",
"latitude": 32.7157,
"longitude": -117.1611
},
{
"city": "Asheville",
"state_province": "North Carolina",
"iso3166_2": "US-NC",
"country": "United States",
"iso3166_1": "US",
"latitude": 35.5951,
"longitude": -82.5515
},
{
"city": "Grand Rapids",
"state_province": "Michigan",
"iso3166_2": "US-MI",
"country": "United States",
"iso3166_1": "US",
"latitude": 42.9634,
"longitude": -85.6681
},
{
"city": "Chicago",
"state_province": "Illinois",
"iso3166_2": "US-IL",
"country": "United States",
"iso3166_1": "US",
"latitude": 41.8781,
"longitude": -87.6298
},
{
"city": "Seattle",
"state_province": "Washington",
"iso3166_2": "US-WA",
"country": "United States",
"iso3166_1": "US",
"latitude": 47.6062,
"longitude": -122.3321
},
{
"city": "Austin",
"state_province": "Texas",
"iso3166_2": "US-TX",
"country": "United States",
"iso3166_1": "US",
"latitude": 30.2672,
"longitude": -97.7431
},
{
"city": "Boston",
"state_province": "Massachusetts",
"iso3166_2": "US-MA",
"country": "United States",
"iso3166_1": "US",
"latitude": 42.3601,
"longitude": -71.0589
},
{
"city": "Philadelphia",
"state_province": "Pennsylvania",
"iso3166_2": "US-PA",
"country": "United States",
"iso3166_1": "US",
"latitude": 39.9526,
"longitude": -75.1652
},
{
"city": "Brooklyn",
"state_province": "New York",
"iso3166_2": "US-NY",
"country": "United States",
"iso3166_1": "US",
"latitude": 40.6782,
"longitude": -73.9442
},
{
"city": "Milwaukee",
"state_province": "Wisconsin",
"iso3166_2": "US-WI",
"country": "United States",
"iso3166_1": "US",
"latitude": 43.0389,
"longitude": -87.9065
},
{
"city": "Richmond",
"state_province": "Virginia",
"iso3166_2": "US-VA",
"country": "United States",
"iso3166_1": "US",
"latitude": 37.5407,
"longitude": -77.436
},
{
"city": "Cincinnati",
"state_province": "Ohio",
"iso3166_2": "US-OH",
"country": "United States",
"iso3166_1": "US",
"latitude": 39.1031,
"longitude": -84.512
},
{
"city": "St. Louis",
"state_province": "Missouri",
"iso3166_2": "US-MO",
"country": "United States",
"iso3166_1": "US",
"latitude": 38.627,
"longitude": -90.1994
},
{
"city": "Tampa",
"state_province": "Florida",
"iso3166_2": "US-FL",
"country": "United States",
"iso3166_1": "US",
"latitude": 27.9506,
"longitude": -82.4572
},
{
"city": "Minneapolis",
"state_province": "Minnesota",
"iso3166_2": "US-MN",
"country": "United States",
"iso3166_1": "US",
"latitude": 44.9778,
"longitude": -93.265
},
{
"city": "Burlington",
"state_province": "Vermont",
"iso3166_2": "US-VT",
"country": "United States",
"iso3166_1": "US",
"latitude": 44.4759,
"longitude": -73.2121
},
{
"city": "Portland",
"state_province": "Maine",
"iso3166_2": "US-ME",
"country": "United States",
"iso3166_1": "US",
"latitude": 43.6591,
"longitude": -70.2568
},
{
"city": "Atlanta",
"state_province": "Georgia",
"iso3166_2": "US-GA",
"country": "United States",
"iso3166_1": "US",
"latitude": 33.749,
"longitude": -84.388
},
{
"city": "Toronto",
"state_province": "Ontario",
"iso3166_2": "CA-ON",
"country": "Canada",
"iso3166_1": "CA",
"latitude": 43.651,
"longitude": -79.347
},
{
"city": "Vancouver",
"state_province": "British Columbia",
"iso3166_2": "CA-BC",
"country": "Canada",
"iso3166_1": "CA",
"latitude": 49.2827,
"longitude": -123.1207
},
{
"city": "Montreal",
"state_province": "Quebec",
"iso3166_2": "CA-QC",
"country": "Canada",
"iso3166_1": "CA",
"latitude": 45.5017,
"longitude": -73.5673
},
{
"city": "Calgary",
"state_province": "Alberta",
"iso3166_2": "CA-AB",
"country": "Canada",
"iso3166_1": "CA",
"latitude": 51.0447,
"longitude": -114.0719
},
{
"city": "Halifax",
"state_province": "Nova Scotia",
"iso3166_2": "CA-NS",
"country": "Canada",
"iso3166_1": "CA",
"latitude": 44.6488,
"longitude": -63.5752
},
{
"city": "Mexico City",
"state_province": "Mexico City",
"iso3166_2": "MX-CMX",
"country": "Mexico",
"iso3166_1": "MX",
"latitude": 19.4326,
"longitude": -99.1332
},
{
"city": "Tijuana",
"state_province": "Baja California",
"iso3166_2": "MX-BCN",
"country": "Mexico",
"iso3166_1": "MX",
"latitude": 32.5149,
"longitude": -117.0382
},
{
"city": "Monterrey",
"state_province": "Nuevo León",
"iso3166_2": "MX-NLE",
"country": "Mexico",
"iso3166_1": "MX",
"latitude": 25.6866,
"longitude": -100.3161
},
{
"city": "Guadalajara",
"state_province": "Jalisco",
"iso3166_2": "MX-JAL",
"country": "Mexico",
"iso3166_1": "MX",
"latitude": 20.6597,
"longitude": -103.3496
},
{
"city": "Ensenada",
"state_province": "Baja California",
"iso3166_2": "MX-BCN",
"country": "Mexico",
"iso3166_1": "MX",
"latitude": 31.8667,
"longitude": -116.5964
}
]

View File

@@ -4,20 +4,21 @@
#include <algorithm> #include <algorithm>
#include <filesystem> #include <filesystem>
#include <unordered_map> #include <future>
#include <iterator>
#include <random>
#include "data_generation/data_downloader.h"
#include "data_generation/llama_generator.h" #include "data_generation/llama_generator.h"
#include "data_generation/mock_generator.h" #include "data_generation/mock_generator.h"
#include "json_handling/json_loader.h" #include "json_handling/json_loader.h"
#include "wikipedia/wikipedia_service.h" #include "wikipedia/wikipedia_service.h"
BiergartenDataGenerator::BiergartenDataGenerator( BiergartenDataGenerator::BiergartenDataGenerator(
const ApplicationOptions& options, std::shared_ptr<WebClient> web_client, const ApplicationOptions& options, std::shared_ptr<WebClient> web_client)
SqliteDatabase& database) : options_(options), webClient_(std::move(web_client)) {}
: options_(options), webClient_(web_client), database_(database) {}
std::unique_ptr<DataGenerator> BiergartenDataGenerator::InitializeGenerator() { auto BiergartenDataGenerator::InitializeGenerator()
-> std::unique_ptr<DataGenerator> {
spdlog::info("Initializing brewery generator..."); spdlog::info("Initializing brewery generator...");
std::unique_ptr<DataGenerator> generator; std::unique_ptr<DataGenerator> generator;
@@ -41,75 +42,60 @@ std::unique_ptr<DataGenerator> BiergartenDataGenerator::InitializeGenerator() {
return generator; return generator;
} }
void BiergartenDataGenerator::LoadGeographicData() { auto BiergartenDataGenerator::QueryCitiesWithCountries()
std::string json_path = options_.cache_dir + "/countries+states+cities.json"; -> std::vector<Location> {
std::string db_path = options_.cache_dir + "/biergarten-pipeline.db";
bool has_json_cache = std::filesystem::exists(json_path);
bool has_db_cache = std::filesystem::exists(db_path);
spdlog::info("Initializing SQLite database at {}...", db_path);
database_.Initialize(db_path);
if (has_db_cache && has_json_cache) {
spdlog::info("[Pipeline] Cache hit: skipping download and parse");
} else {
spdlog::info("\n[Pipeline] Downloading geographic data from GitHub...");
DataDownloader downloader(webClient_);
downloader.DownloadCountriesDatabase(json_path, options_.commit);
JsonLoader::LoadWorldCities(json_path, database_);
}
}
std::vector<std::pair<City, std::string>>
BiergartenDataGenerator::QueryCitiesWithCountries() {
spdlog::info("\n=== GEOGRAPHIC DATA OVERVIEW ==="); spdlog::info("\n=== GEOGRAPHIC DATA OVERVIEW ===");
auto cities = database_.QueryCities(); std::filesystem::path locations_path = "locations.json";
if (!std::filesystem::exists(locations_path)) {
// Build a quick map of country id -> name for per-city lookups. const std::filesystem::path cache_path =
auto all_countries = database_.QueryCountries(0); std::filesystem::path(options_.cache_dir) / "locations.json";
std::unordered_map<int, std::string> country_map; if (std::filesystem::exists(cache_path)) {
for (const auto& c : all_countries) { locations_path = cache_path;
country_map[c.id] = c.name; }
} }
spdlog::info("\nTotal records loaded:"); auto all_locations = JsonLoader::LoadLocations(locations_path.string());
spdlog::info(" Countries: {}", database_.QueryCountries(0).size()); spdlog::info(" Locations available: {}", all_locations.size());
spdlog::info(" States: {}", database_.QueryStates(0).size());
spdlog::info(" Cities: {}", cities.size());
// Cap at 30 entries. const size_t sample_count = std::min<size_t>(30, all_locations.size());
const size_t sample_count = std::min(size_t(30), cities.size()); std::vector<Location> sampled_locations;
std::vector<std::pair<City, std::string>> result; sampled_locations.reserve(sample_count);
for (size_t i = 0; i < sample_count; i++) { std::random_device random_generator;
const auto& city = cities[i]; std::sample(all_locations.begin(), all_locations.end(),
std::string country_name; std::back_inserter(sampled_locations), sample_count,
const auto country_it = country_map.find(city.country_id); random_generator);
if (country_it != country_map.end()) {
country_name = country_it->second; spdlog::info(" Sampled locations: {}", sampled_locations.size());
} return sampled_locations;
result.push_back({city, country_name});
} }
return result; auto BiergartenDataGenerator::EnrichWithWikipedia(
} const std::vector<Location>& cities) -> std::vector<EnrichedCity> {
std::vector<BiergartenDataGenerator::EnrichedCity>
BiergartenDataGenerator::EnrichWithWikipedia(
const std::vector<std::pair<City, std::string>>& cities) {
WikipediaService wikipedia_service(webClient_);
std::vector<EnrichedCity> enriched; std::vector<EnrichedCity> enriched;
enriched.reserve(cities.size());
for (const auto& [city, country_name] : cities) { std::vector<std::future<EnrichedCity>> pending;
pending.reserve(cities.size());
for (const auto& city : cities) {
pending.push_back(std::async(std::launch::async,
[web_client = webClient_, city]() {
WikipediaService wikipedia_service(
web_client);
const std::string region_context = const std::string region_context =
wikipedia_service.GetSummary(city.name, country_name); wikipedia_service.GetSummary(
spdlog::debug("[Pipeline] Region context for {}: {}", city.name, city.city, city.country);
region_context); spdlog::debug(
"[Pipeline] Region context for {}: {}",
city.city, region_context);
return EnrichedCity{city, region_context};
}));
}
enriched.push_back({city.id, city.name, country_name, region_context}); for (auto& task : pending) {
enriched.push_back(task.get());
} }
return enriched; return enriched;
@@ -121,28 +107,30 @@ void BiergartenDataGenerator::GenerateBreweries(
generatedBreweries_.clear(); generatedBreweries_.clear();
for (const auto& enriched_city : cities) { for (const auto& enriched_city : cities) {
auto brewery = generator.GenerateBrewery(enriched_city.city_name, auto brewery = generator.GenerateBrewery(enriched_city.location.city,
enriched_city.country_name, enriched_city.location.country,
enriched_city.region_context); enriched_city.region_context);
generatedBreweries_.push_back( generatedBreweries_.push_back({enriched_city.location, brewery});
{enriched_city.city_id, enriched_city.city_name, brewery});
} }
} }
void BiergartenDataGenerator::LogResults() const { void BiergartenDataGenerator::LogResults() const {
spdlog::info("\n=== GENERATED DATA DUMP ==="); spdlog::info("\n=== GENERATED DATA DUMP ===");
for (size_t i = 0; i < generatedBreweries_.size(); i++) { size_t index = 1;
const auto& entry = generatedBreweries_[i]; for (const auto& entry : generatedBreweries_) {
spdlog::info("{}. city_id={} city=\"{}\"", i + 1, entry.city_id, spdlog::info("{}. city=\"{}\" country=\"{}\" state=\"{}\" "
entry.city_name); "iso3166_2={} lat={} lon={}",
index, entry.location.city, entry.location.country,
entry.location.state_province, entry.location.iso3166_2,
entry.location.latitude, entry.location.longitude);
spdlog::info(" brewery_name=\"{}\"", entry.brewery.name); spdlog::info(" brewery_name=\"{}\"", entry.brewery.name);
spdlog::info(" brewery_description=\"{}\"", entry.brewery.description); spdlog::info(" brewery_description=\"{}\"", entry.brewery.description);
++index;
} }
} }
int BiergartenDataGenerator::Run() { auto BiergartenDataGenerator::Run() -> int {
try { try {
LoadGeographicData();
auto generator = InitializeGenerator(); auto generator = InitializeGenerator();
auto cities = QueryCitiesWithCountries(); auto cities = QueryCitiesWithCountries();
auto enriched = EnrichWithWikipedia(cities); auto enriched = EnrichWithWikipedia(cities);

View File

@@ -1,44 +0,0 @@
#include "data_generation/data_downloader.h"
#include <spdlog/spdlog.h>
#include <filesystem>
#include <fstream>
#include <sstream>
#include <stdexcept>
#include "web_client/web_client.h"
DataDownloader::DataDownloader(std::shared_ptr<WebClient> web_client)
: web_client_(std::move(web_client)) {}
// Nothing to release explicitly; members clean up after themselves.
DataDownloader::~DataDownloader() = default;
// Reports whether `file_path` names an existing filesystem entry.
bool DataDownloader::FileExists(const std::string& file_path) {
  const std::filesystem::path candidate{file_path};
  return std::filesystem::exists(candidate);
}
// Makes sure the countries/states/cities JSON dump exists at `cache_path`.
//
// When the file is already present it is reused untouched. Otherwise it is
// fetched from the dr5hn/countries-states-cities-database repository on
// raw.githubusercontent.com at revision `commit` and written to `cache_path`.
//
// Returns `cache_path` in both cases.
// Throws std::runtime_error when the download returns without leaving a
// readable file behind. Nothing here catches exceptions, so anything raised by
// the web client propagates to the caller.
std::string DataDownloader::DownloadCountriesDatabase(
    const std::string& cache_path, const std::string& commit) {
  if (FileExists(cache_path)) {
    spdlog::info("[DataDownloader] Cache hit: {}", cache_path);
    return cache_path;
  }

  const std::string url =
      "https://raw.githubusercontent.com/dr5hn/"
      "countries-states-cities-database/" +
      commit + "/json/countries+states+cities.json";

  spdlog::info("[DataDownloader] Downloading: {}", url);
  web_client_->DownloadToFile(url, cache_path);

  // Open positioned at the end so tellg() yields the file size. A stream that
  // failed to open would make tellg() return -1, which used to be logged as a
  // successful (negative-sized) download.
  std::ifstream file_check(cache_path, std::ios::binary | std::ios::ate);
  if (!file_check.is_open()) {
    throw std::runtime_error(
        "[DataDownloader] Download produced no readable file: " + cache_path);
  }

  const std::streamsize size = file_check.tellg();
  file_check.close();
  if (size < 0) {
    throw std::runtime_error(
        "[DataDownloader] Unable to determine size of downloaded file: " +
        cache_path);
  }

  spdlog::info("[DataDownloader] OK: Download complete: {} ({:.2f} MB)",
               cache_path, (size / (1024.0 * 1024.0)));
  return cache_path;
}

View File

@@ -1,264 +0,0 @@
#include "database/database.h"
#include <spdlog/spdlog.h>
#include <stdexcept>
// Creates the `countries`, `states` and `cities` tables when they are missing.
// The whole DDL script is executed with a single sqlite3_exec call while
// db_mutex_ is held.
//
// Throws std::runtime_error, carrying SQLite's message when one is available,
// if the script cannot be executed.
void SqliteDatabase::InitializeSchema() {
  const std::lock_guard<std::mutex> guard(db_mutex_);

  const char* const ddl_script = R"(
CREATE TABLE IF NOT EXISTS countries (
id INTEGER PRIMARY KEY,
name TEXT NOT NULL,
iso2 TEXT,
iso3 TEXT
);
CREATE TABLE IF NOT EXISTS states (
id INTEGER PRIMARY KEY,
country_id INTEGER NOT NULL,
name TEXT NOT NULL,
iso2 TEXT,
FOREIGN KEY(country_id) REFERENCES countries(id)
);
CREATE TABLE IF NOT EXISTS cities (
id INTEGER PRIMARY KEY,
state_id INTEGER NOT NULL,
country_id INTEGER NOT NULL,
name TEXT NOT NULL,
latitude REAL,
longitude REAL,
FOREIGN KEY(state_id) REFERENCES states(id),
FOREIGN KEY(country_id) REFERENCES countries(id)
);
)";

  char* sqlite_message = nullptr;
  const int status =
      sqlite3_exec(db_, ddl_script, nullptr, nullptr, &sqlite_message);
  if (status == SQLITE_OK) {
    return;
  }

  // Copy the text out before handing the buffer back to SQLite.
  std::string reason = "Unknown error";
  if (sqlite_message != nullptr) {
    reason = sqlite_message;
  }
  sqlite3_free(sqlite_message);
  throw std::runtime_error("Failed to create schema: " + reason);
}
// Closes the connection if one was ever opened.
SqliteDatabase::~SqliteDatabase() {
  if (!db_) {
    return;
  }
  sqlite3_close(db_);
}
void SqliteDatabase::Initialize(const std::string& db_path) {
int rc = sqlite3_open(db_path.c_str(), &db_);
if (rc) {
throw std::runtime_error("Failed to open SQLite database: " + db_path);
}
spdlog::info("OK: SQLite database opened: {}", db_path);
InitializeSchema();
}
// Issues BEGIN TRANSACTION on the connection while holding db_mutex_.
// Throws std::runtime_error with SQLite's message if the statement fails.
void SqliteDatabase::BeginTransaction() {
  const std::lock_guard<std::mutex> guard(db_mutex_);

  char* sqlite_message = nullptr;
  const int status = sqlite3_exec(db_, "BEGIN TRANSACTION", nullptr, nullptr,
                                  &sqlite_message);
  if (status == SQLITE_OK) {
    return;
  }

  // Copy the text before the buffer is returned to SQLite.
  std::string detail = "unknown";
  if (sqlite_message) {
    detail = sqlite_message;
  }
  sqlite3_free(sqlite_message);
  throw std::runtime_error("BeginTransaction failed: " + detail);
}
// Issues COMMIT on the connection while holding db_mutex_.
// Throws std::runtime_error with SQLite's message if the statement fails.
void SqliteDatabase::CommitTransaction() {
  const std::lock_guard<std::mutex> guard(db_mutex_);

  char* sqlite_message = nullptr;
  const int status =
      sqlite3_exec(db_, "COMMIT", nullptr, nullptr, &sqlite_message);
  if (status == SQLITE_OK) {
    return;
  }

  // Copy the text before the buffer is returned to SQLite.
  std::string detail = "unknown";
  if (sqlite_message) {
    detail = sqlite_message;
  }
  sqlite3_free(sqlite_message);
  throw std::runtime_error("CommitTransaction failed: " + detail);
}
// Issues ROLLBACK on the connection while holding db_mutex_.
// Throws std::runtime_error with SQLite's message if the statement fails.
void SqliteDatabase::RollbackTransaction() {
  const std::lock_guard<std::mutex> guard(db_mutex_);

  char* sqlite_message = nullptr;
  const int status =
      sqlite3_exec(db_, "ROLLBACK", nullptr, nullptr, &sqlite_message);
  if (status == SQLITE_OK) {
    return;
  }

  // Copy the text before the buffer is returned to SQLite.
  std::string detail = "unknown";
  if (sqlite_message) {
    detail = sqlite_message;
  }
  sqlite3_free(sqlite_message);
  throw std::runtime_error("RollbackTransaction failed: " + detail);
}
// Inserts one row into `countries`. Because the statement is
// INSERT OR IGNORE, a row whose id already exists is left untouched.
//
// Parameters are bound positionally: id, name, iso2, iso3.
// Throws std::runtime_error when the statement cannot be prepared or executed.
void SqliteDatabase::InsertCountry(int id, const std::string& name,
                                   const std::string& iso2,
                                   const std::string& iso3) {
  std::lock_guard<std::mutex> lock(db_mutex_);

  const char* query = R"(
INSERT OR IGNORE INTO countries (id, name, iso2, iso3)
VALUES (?, ?, ?, ?)
)";

  sqlite3_stmt* stmt = nullptr;
  if (sqlite3_prepare_v2(db_, query, -1, &stmt, nullptr) != SQLITE_OK) {
    throw std::runtime_error("Failed to prepare country insert");
  }

  // SQLITE_TRANSIENT makes SQLite copy the text, so the std::string arguments
  // need not outlive the bind call.
  sqlite3_bind_int(stmt, 1, id);
  sqlite3_bind_text(stmt, 2, name.c_str(), -1, SQLITE_TRANSIENT);
  sqlite3_bind_text(stmt, 3, iso2.c_str(), -1, SQLITE_TRANSIENT);
  sqlite3_bind_text(stmt, 4, iso3.c_str(), -1, SQLITE_TRANSIENT);

  // Finalize before inspecting the result so the statement is released on the
  // failure path as well (it used to leak when the step failed).
  const int step_rc = sqlite3_step(stmt);
  sqlite3_finalize(stmt);
  if (step_rc != SQLITE_DONE) {
    throw std::runtime_error("Failed to insert country");
  }
}
// Inserts one row into `states`. Because the statement is INSERT OR IGNORE, a
// row whose id already exists is left untouched.
//
// Parameters are bound positionally: id, country_id, name, iso2.
// Throws std::runtime_error when the statement cannot be prepared or executed.
void SqliteDatabase::InsertState(int id, int country_id,
                                 const std::string& name,
                                 const std::string& iso2) {
  std::lock_guard<std::mutex> lock(db_mutex_);

  const char* query = R"(
INSERT OR IGNORE INTO states (id, country_id, name, iso2)
VALUES (?, ?, ?, ?)
)";

  sqlite3_stmt* stmt = nullptr;
  if (sqlite3_prepare_v2(db_, query, -1, &stmt, nullptr) != SQLITE_OK) {
    throw std::runtime_error("Failed to prepare state insert");
  }

  // SQLITE_TRANSIENT makes SQLite copy the text, so the std::string arguments
  // need not outlive the bind call.
  sqlite3_bind_int(stmt, 1, id);
  sqlite3_bind_int(stmt, 2, country_id);
  sqlite3_bind_text(stmt, 3, name.c_str(), -1, SQLITE_TRANSIENT);
  sqlite3_bind_text(stmt, 4, iso2.c_str(), -1, SQLITE_TRANSIENT);

  // Finalize before inspecting the result so the statement is released on the
  // failure path as well (it used to leak when the step failed).
  const int step_rc = sqlite3_step(stmt);
  sqlite3_finalize(stmt);
  if (step_rc != SQLITE_DONE) {
    throw std::runtime_error("Failed to insert state");
  }
}
// Inserts one row into `cities`. Because the statement is INSERT OR IGNORE, a
// row whose id already exists is left untouched.
//
// Parameters are bound positionally: id, state_id, country_id, name,
// latitude, longitude.
// Throws std::runtime_error when the statement cannot be prepared or executed.
void SqliteDatabase::InsertCity(int id, int state_id, int country_id,
                                const std::string& name, double latitude,
                                double longitude) {
  std::lock_guard<std::mutex> lock(db_mutex_);

  const char* query = R"(
INSERT OR IGNORE INTO cities (id, state_id, country_id, name, latitude, longitude)
VALUES (?, ?, ?, ?, ?, ?)
)";

  sqlite3_stmt* stmt = nullptr;
  if (sqlite3_prepare_v2(db_, query, -1, &stmt, nullptr) != SQLITE_OK) {
    throw std::runtime_error("Failed to prepare city insert");
  }

  // SQLITE_TRANSIENT makes SQLite copy the text, so `name` need not outlive
  // the bind call.
  sqlite3_bind_int(stmt, 1, id);
  sqlite3_bind_int(stmt, 2, state_id);
  sqlite3_bind_int(stmt, 3, country_id);
  sqlite3_bind_text(stmt, 4, name.c_str(), -1, SQLITE_TRANSIENT);
  sqlite3_bind_double(stmt, 5, latitude);
  sqlite3_bind_double(stmt, 6, longitude);

  // Finalize before inspecting the result so the statement is released on the
  // failure path as well (it used to leak when the step failed).
  const int step_rc = sqlite3_step(stmt);
  sqlite3_finalize(stmt);
  if (step_rc != SQLITE_DONE) {
    throw std::runtime_error("Failed to insert city");
  }
}
std::vector<City> SqliteDatabase::QueryCities() {
std::lock_guard<std::mutex> lock(db_mutex_);
std::vector<City> cities;
sqlite3_stmt* stmt = nullptr;
const char* query =
"SELECT id, name, country_id FROM cities ORDER BY RANDOM()";
int rc = sqlite3_prepare_v2(db_, query, -1, &stmt, nullptr);
if (rc != SQLITE_OK) {
throw std::runtime_error("Failed to prepare query");
}
while (sqlite3_step(stmt) == SQLITE_ROW) {
int id = sqlite3_column_int(stmt, 0);
const char* name =
reinterpret_cast<const char*>(sqlite3_column_text(stmt, 1));
int country_id = sqlite3_column_int(stmt, 2);
cities.push_back({id, name ? std::string(name) : "", country_id});
}
sqlite3_finalize(stmt);
return cities;
}
std::vector<Country> SqliteDatabase::QueryCountries(int limit) {
std::lock_guard<std::mutex> lock(db_mutex_);
std::vector<Country> countries;
sqlite3_stmt* stmt = nullptr;
std::string query =
"SELECT id, name, iso2, iso3 FROM countries ORDER BY name";
if (limit > 0) {
query += " LIMIT " + std::to_string(limit);
}
int rc = sqlite3_prepare_v2(db_, query.c_str(), -1, &stmt, nullptr);
if (rc != SQLITE_OK) {
throw std::runtime_error("Failed to prepare countries query");
}
while (sqlite3_step(stmt) == SQLITE_ROW) {
int id = sqlite3_column_int(stmt, 0);
const char* name =
reinterpret_cast<const char*>(sqlite3_column_text(stmt, 1));
const char* iso2 =
reinterpret_cast<const char*>(sqlite3_column_text(stmt, 2));
const char* iso3 =
reinterpret_cast<const char*>(sqlite3_column_text(stmt, 3));
countries.push_back({id, name ? std::string(name) : "",
iso2 ? std::string(iso2) : "",
iso3 ? std::string(iso3) : ""});
}
sqlite3_finalize(stmt);
return countries;
}
std::vector<State> SqliteDatabase::QueryStates(int limit) {
std::lock_guard<std::mutex> lock(db_mutex_);
std::vector<State> states;
sqlite3_stmt* stmt = nullptr;
std::string query =
"SELECT id, name, iso2, country_id FROM states ORDER BY name";
if (limit > 0) {
query += " LIMIT " + std::to_string(limit);
}
int rc = sqlite3_prepare_v2(db_, query.c_str(), -1, &stmt, nullptr);
if (rc != SQLITE_OK) {
throw std::runtime_error("Failed to prepare states query");
}
while (sqlite3_step(stmt) == SQLITE_ROW) {
int id = sqlite3_column_int(stmt, 0);
const char* name =
reinterpret_cast<const char*>(sqlite3_column_text(stmt, 1));
const char* iso2 =
reinterpret_cast<const char*>(sqlite3_column_text(stmt, 2));
int country_id = sqlite3_column_int(stmt, 3);
states.push_back({id, name ? std::string(name) : "",
iso2 ? std::string(iso2) : "", country_id});
}
sqlite3_finalize(stmt);
return states;
}

View File

@@ -2,66 +2,82 @@
#include <spdlog/spdlog.h> #include <spdlog/spdlog.h>
#include <chrono> #include <boost/json.hpp>
#include "json_handling/stream_parser.h" #include <fstream>
#include <sstream>
#include <stdexcept>
void JsonLoader::LoadWorldCities(const std::string& json_path, namespace {
SqliteDatabase& db) {
constexpr size_t kBatchSize = 10000;
auto startTime = std::chrono::high_resolution_clock::now(); auto ReadRequiredString(const boost::json::object& object,
spdlog::info("\nLoading {} (streaming Boost.JSON SAX)...", json_path); const char* key) -> std::string {
const boost::json::value* value = object.if_contains(key);
db.BeginTransaction(); if (value == nullptr || !value->is_string()) {
bool transactionOpen = true; throw std::runtime_error(std::string("Missing or invalid string field: ") +
key);
size_t citiesProcessed = 0;
try {
StreamingJsonParser::Parse(
json_path, db,
[&](const CityRecord& record) {
db.InsertCity(record.id, record.state_id, record.country_id,
record.name, record.latitude, record.longitude);
++citiesProcessed;
if (citiesProcessed % kBatchSize == 0) {
db.CommitTransaction();
db.BeginTransaction();
} }
}, return std::string(value->as_string().c_str());
[&](size_t current, size_t /*total*/) {
if (current % kBatchSize == 0 && current > 0) {
spdlog::info(" [Progress] Parsed {} cities...", current);
} }
auto ReadRequiredNumber(const boost::json::object& object, const char* key)
-> double {
const boost::json::value* value = object.if_contains(key);
if (value == nullptr || !value->is_number()) {
throw std::runtime_error(std::string("Missing or invalid numeric field: ") +
key);
}
return value->to_number<double>();
}
} // namespace
auto JsonLoader::LoadLocations(const std::string& filepath)
-> std::vector<Location> {
std::ifstream input(filepath);
if (!input.is_open()) {
throw std::runtime_error("Failed to open locations file: " + filepath);
}
std::stringstream buffer;
buffer << input.rdbuf();
const std::string content = buffer.str();
boost::json::error_code error;
boost::json::value root = boost::json::parse(content, error);
if (error) {
throw std::runtime_error("Failed to parse locations JSON: " +
error.message());
}
if (!root.is_array()) {
throw std::runtime_error(
"Invalid locations JSON: root element must be an array");
}
std::vector<Location> locations;
const auto& items = root.as_array();
locations.reserve(items.size());
for (const auto& item : items) {
if (!item.is_object()) {
throw std::runtime_error(
"Invalid locations JSON: each entry must be an object");
}
const auto& object = item.as_object();
locations.push_back(Location{
.city = ReadRequiredString(object, "city"),
.state_province = ReadRequiredString(object, "state_province"),
.iso3166_2 = ReadRequiredString(object, "iso3166_2"),
.country = ReadRequiredString(object, "country"),
.iso3166_1 = ReadRequiredString(object, "iso3166_1"),
.latitude = ReadRequiredNumber(object, "latitude"),
.longitude = ReadRequiredNumber(object, "longitude"),
}); });
spdlog::info(" OK: Parsed all cities from JSON");
if (transactionOpen) {
db.CommitTransaction();
transactionOpen = false;
}
} catch (...) {
if (transactionOpen) {
db.RollbackTransaction();
transactionOpen = false;
}
throw;
} }
auto endTime = std::chrono::high_resolution_clock::now(); spdlog::info("[JsonLoader] Loaded {} locations from {}", locations.size(),
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>( filepath);
endTime - startTime); return locations;
spdlog::info("\n=== World City Data Loading Summary ===\n");
spdlog::info("Cities inserted: {}", citiesProcessed);
spdlog::info("Elapsed time: {} ms", duration.count());
long long throughput =
(citiesProcessed > 0 && duration.count() > 0)
? (1000LL * static_cast<long long>(citiesProcessed)) /
static_cast<long long>(duration.count())
: 0LL;
spdlog::info("Throughput: {} cities/sec", throughput);
spdlog::info("=======================================\n");
} }

View File

@@ -1,289 +0,0 @@
#include "json_handling/stream_parser.h"
#include <spdlog/spdlog.h>
#include <boost/json.hpp>
#include <boost/json/basic_parser_impl.hpp>
#include <cstdio>
#include <stdexcept>
#include "database/database.h"
// SAX handler plugged into boost::json::basic_parser. It walks a document
// shaped as
//
//   [ country{ ..., "states": [ state{ ..., "cities": [ city{...} ] } ] } ]
//
// and tracks its position purely by nesting depth:
//   depth 1 = root array of countries      depth 2 = country object
//   depth 3 = "states" array               depth 4 = state object
//   depth 5 = "cities" array               depth 6 = city object
//
// Countries and states are written to the database when their object closes;
// each completed city is handed to the `on_city` callback instead.
class CityRecordHandler {
  friend class boost::json::basic_parser<CityRecordHandler>;

 public:
  // Limits consulted by basic_parser; each is set to the largest size_t value.
  static constexpr std::size_t max_array_size = static_cast<std::size_t>(-1);
  static constexpr std::size_t max_object_size = static_cast<std::size_t>(-1);
  static constexpr std::size_t max_string_size = static_cast<std::size_t>(-1);
  static constexpr std::size_t max_key_size = static_cast<std::size_t>(-1);

  // Shared state between the caller of the parser and this handler.
  struct ParseContext {
    SqliteDatabase* db = nullptr;  // Receives InsertCountry / InsertState.
    std::function<void(const CityRecord&)> on_city;   // Called per city.
    std::function<void(size_t, size_t)> on_progress;  // Optional; see below.
    size_t cities_emitted = 0;   // Cities for which on_city returned normally.
    size_t total_file_size = 0;  // Passed through to on_progress unchanged.
    int countries_inserted = 0;
    int states_inserted = 0;
  };

  explicit CityRecordHandler(ParseContext& ctx) : context(ctx) {}

 private:
  ParseContext& context;

  int depth = 0;  // Current array/object nesting level (0 = outside the root).
  bool in_countries_array = false;
  bool in_country_object = false;
  bool in_states_array = false;
  bool in_state_object = false;
  bool in_cities_array = false;
  bool building_city = false;

  int current_country_id = 0;
  int current_state_id = 0;
  CityRecord current_city = {};
  std::string current_key;         // Most recently completed object key.
  std::string current_key_val;     // Accumulates key fragments.
  std::string current_string_val;  // Accumulates string-value fragments.

  std::string country_info[3];  // [0] name, [1] iso2, [2] iso3
  std::string state_info[2];    // [0] name, [1] iso2

  // Boost.JSON SAX Hooks

  bool on_document_begin(boost::system::error_code&) { return true; }
  bool on_document_end(boost::system::error_code&) { return true; }

  // Entering an array: the depth plus the key that introduced it tell us
  // which collection this is.
  bool on_array_begin(boost::system::error_code&) {
    depth++;
    if (depth == 1) {
      in_countries_array = true;
    } else if (depth == 3 && current_key == "states") {
      in_states_array = true;
    } else if (depth == 5 && current_key == "cities") {
      in_cities_array = true;
    }
    return true;
  }

  bool on_array_end(std::size_t, boost::system::error_code&) {
    if (depth == 1) {
      in_countries_array = false;
    } else if (depth == 3) {
      in_states_array = false;
    } else if (depth == 5) {
      in_cities_array = false;
    }
    depth--;
    return true;
  }

  // Entering an object: reset the scratch state for the record kind that
  // lives at this depth.
  bool on_object_begin(boost::system::error_code&) {
    depth++;
    if (depth == 2 && in_countries_array) {
      in_country_object = true;
      current_country_id = 0;
      country_info[0].clear();
      country_info[1].clear();
      country_info[2].clear();
    } else if (depth == 4 && in_states_array) {
      in_state_object = true;
      current_state_id = 0;
      state_info[0].clear();
      state_info[1].clear();
    } else if (depth == 6 && in_cities_array) {
      building_city = true;
      current_city = {};
    }
    return true;
  }

  // Leaving an object: flush the record that just completed. Records without
  // a positive id (or without positive parent ids) are skipped. Exceptions
  // from the callback / database are logged and do not abort the parse.
  bool on_object_end(std::size_t, boost::system::error_code&) {
    if (depth == 6 && building_city) {
      if (current_city.id > 0 && current_state_id > 0 &&
          current_country_id > 0) {
        current_city.state_id = current_state_id;
        current_city.country_id = current_country_id;

        try {
          context.on_city(current_city);
          context.cities_emitted++;

          if (context.on_progress && context.cities_emitted % 10000 == 0) {
            context.on_progress(context.cities_emitted,
                                context.total_file_size);
          }
        } catch (const std::exception& e) {
          spdlog::warn("Record parsing failed: {}", e.what());
        }
      }
      building_city = false;
    } else if (depth == 4 && in_state_object) {
      if (current_state_id > 0 && current_country_id > 0) {
        try {
          context.db->InsertState(current_state_id, current_country_id,
                                  state_info[0], state_info[1]);
          context.states_inserted++;
        } catch (const std::exception& e) {
          spdlog::warn("Record parsing failed: {}", e.what());
        }
      }
      in_state_object = false;
    } else if (depth == 2 && in_country_object) {
      if (current_country_id > 0) {
        try {
          context.db->InsertCountry(current_country_id, country_info[0],
                                    country_info[1], country_info[2]);
          context.countries_inserted++;
        } catch (const std::exception& e) {
          spdlog::warn("Record parsing failed: {}", e.what());
        }
      }
      in_country_object = false;
    }

    depth--;
    return true;
  }

  // Keys may arrive in several fragments; collect them until on_key fires.
  bool on_key_part(boost::json::string_view s, std::size_t,
                   boost::system::error_code&) {
    current_key_val.append(s.data(), s.size());
    return true;
  }

  bool on_key(boost::json::string_view s, std::size_t,
              boost::system::error_code&) {
    current_key_val.append(s.data(), s.size());
    current_key = current_key_val;
    current_key_val.clear();
    return true;
  }

  // String values may arrive in several fragments as well.
  bool on_string_part(boost::json::string_view s, std::size_t,
                      boost::system::error_code&) {
    current_string_val.append(s.data(), s.size());
    return true;
  }

  // A complete string value: route it to the innermost record being built.
  // The city check comes first because the enclosing state/country flags are
  // still set while a city is open.
  bool on_string(boost::json::string_view s, std::size_t,
                 boost::system::error_code&) {
    current_string_val.append(s.data(), s.size());

    if (building_city && current_key == "name") {
      current_city.name = current_string_val;
    } else if (in_state_object && current_key == "name") {
      state_info[0] = current_string_val;
    } else if (in_state_object && current_key == "iso2") {
      state_info[1] = current_string_val;
    } else if (in_country_object && current_key == "name") {
      country_info[0] = current_string_val;
    } else if (in_country_object && current_key == "iso2") {
      country_info[1] = current_string_val;
    } else if (in_country_object && current_key == "iso3") {
      country_info[2] = current_string_val;
    }

    current_string_val.clear();
    return true;
  }

  bool on_number_part(boost::json::string_view, boost::system::error_code&) {
    return true;
  }

  // Integer values carry record ids. They can also carry coordinates: a JSON
  // number written without fraction or exponent (e.g. "latitude": 45) is
  // delivered here rather than to on_double, so it must be handled here or
  // the city would silently keep a coordinate of 0.
  bool on_int64(int64_t i, boost::json::string_view,
                boost::system::error_code&) {
    if (building_city && current_key == "id") {
      current_city.id = static_cast<int>(i);
    } else if (building_city && current_key == "latitude") {
      current_city.latitude = static_cast<double>(i);
    } else if (building_city && current_key == "longitude") {
      current_city.longitude = static_cast<double>(i);
    } else if (in_state_object && current_key == "id") {
      current_state_id = static_cast<int>(i);
    } else if (in_country_object && current_key == "id") {
      current_country_id = static_cast<int>(i);
    }
    return true;
  }

  // Unsigned values are funnelled through the signed handler.
  bool on_uint64(uint64_t u, boost::json::string_view,
                 boost::system::error_code& ec) {
    return on_int64(static_cast<int64_t>(u), "", ec);
  }

  // Floating-point values are only meaningful as city coordinates.
  bool on_double(double d, boost::json::string_view,
                 boost::system::error_code&) {
    if (building_city) {
      if (current_key == "latitude") {
        current_city.latitude = d;
      } else if (current_key == "longitude") {
        current_city.longitude = d;
      }
    }
    return true;
  }

  bool on_bool(bool, boost::system::error_code&) { return true; }
  bool on_null(boost::system::error_code&) { return true; }
  bool on_comment_part(boost::json::string_view, boost::system::error_code&) {
    return true;
  }
  bool on_comment(boost::json::string_view, boost::system::error_code&) {
    return true;
  }
};
// Streams the JSON file at `file_path` through CityRecordHandler in 64 KiB
// chunks, so the document is never held in memory as a whole.
//
//   db          - receives country and state rows as the handler meets them.
//   on_city     - invoked once for every complete city record.
//   on_progress - optional; forwarded to the handler together with the file
//                 size measured up front (0 when it cannot be determined).
//
// Throws std::runtime_error when the file cannot be opened or read, or when
// the parser reports an error. The file is closed on every exit path.
void StreamingJsonParser::Parse(
    const std::string& file_path, SqliteDatabase& db,
    std::function<void(const CityRecord&)> on_city,
    std::function<void(size_t, size_t)> on_progress) {
  spdlog::info("  Streaming parse of {} (Boost.JSON)...", file_path);

  FILE* file = std::fopen(file_path.c_str(), "rb");
  if (!file) {
    throw std::runtime_error("Failed to open JSON file: " + file_path);
  }

  // Scope guard: closes the file however this function is left, including
  // when an exception escapes from the parser.
  struct FileCloser {
    FILE* handle;
    ~FileCloser() { std::fclose(handle); }
  } close_on_exit{file};

  size_t total_size = 0;
  if (std::fseek(file, 0, SEEK_END) == 0) {
    long file_size = std::ftell(file);
    if (file_size > 0) {
      total_size = static_cast<size_t>(file_size);
    }
    std::rewind(file);
  }

  CityRecordHandler::ParseContext ctx{&db, on_city, on_progress, 0, total_size,
                                      0,   0};
  boost::json::basic_parser<CityRecordHandler> parser(
      boost::json::parse_options{}, ctx);

  char buf[65536];
  size_t bytes_read;
  boost::system::error_code ec;

  while ((bytes_read = std::fread(buf, 1, sizeof(buf), file)) > 0) {
    char const* p = buf;
    std::size_t remain = bytes_read;

    // write_some may consume only part of the buffer; keep feeding the rest.
    while (remain > 0) {
      std::size_t consumed = parser.write_some(true, p, remain, ec);
      if (ec) {
        throw std::runtime_error("JSON parse error: " + ec.message());
      }
      p += consumed;
      remain -= consumed;
    }
  }

  // fread returns 0 for both EOF and failure; tell them apart so an I/O error
  // is not misreported as malformed JSON.
  if (std::ferror(file)) {
    throw std::runtime_error("Failed to read JSON file: " + file_path);
  }

  parser.write_some(false, nullptr, 0, ec);  // Signal EOF

  if (ec) {
    throw std::runtime_error("JSON parse error at EOF: " + ec.message());
  }

  spdlog::info("  OK: Parsed {} countries, {} states, {} cities",
               ctx.countries_inserted, ctx.states_inserted,
               ctx.cities_emitted);
}

View File

@@ -5,7 +5,6 @@
#include <memory> #include <memory>
#include "biergarten_data_generator.h" #include "biergarten_data_generator.h"
#include "database/database.h"
#include "web_client/curl_web_client.h" #include "web_client/curl_web_client.h"
namespace po = boost::program_options; namespace po = boost::program_options;
@@ -122,9 +121,8 @@ int main(int argc, char* argv[]) {
} }
auto webClient = std::make_shared<CURLWebClient>(); auto webClient = std::make_shared<CURLWebClient>();
SqliteDatabase database;
BiergartenDataGenerator generator(options, webClient, database); BiergartenDataGenerator generator(options, webClient);
return generator.Run(); return generator.Run();
} catch (const std::exception& e) { } catch (const std::exception& e) {