mirror of
https://github.com/aaronpo97/the-biergarten-app.git
synced 2026-06-01 01:54:00 +00:00
replace SQLite geo pipeline with curated in-memory locations
This commit is contained in:
2
pipeline/.gitignore
vendored
2
pipeline/.gitignore
vendored
@@ -1,3 +1,5 @@
|
|||||||
dist
|
dist
|
||||||
build
|
build
|
||||||
data
|
data
|
||||||
|
models
|
||||||
|
*.gguf
|
||||||
|
|||||||
@@ -1,170 +1,104 @@
|
|||||||
cmake_minimum_required(VERSION 3.20)
|
cmake_minimum_required(VERSION 3.24)
|
||||||
project(biergarten-pipeline VERSION 0.1.0 LANGUAGES CXX)
|
project(biergarten-pipeline)
|
||||||
|
# =============================================================================
|
||||||
# Allows older dependencies to configure on newer CMake.
|
# 1. GPU Detection
|
||||||
set(CMAKE_POLICY_VERSION_MINIMUM 3.5)
|
# =============================================================================
|
||||||
|
# GGML_CUDA / GGML_METAL are set here so that the llama.cpp FetchContent below
|
||||||
# Policies
|
# inherits them as cache variables before its CMakeLists.txt is processed.
|
||||||
cmake_policy(SET CMP0167 NEW) # FindBoost improvements
|
if(APPLE)
|
||||||
|
message(STATUS "[biergarten] Apple Silicon detected — enabling Metal acceleration.")
|
||||||
# Global Settings
|
set(GGML_METAL ON CACHE BOOL "Enable Metal for Apple Silicon" FORCE)
|
||||||
|
elseif(UNIX AND NOT APPLE)
|
||||||
|
find_package(CUDAToolkit QUIET)
|
||||||
|
if(CUDAToolkit_FOUND)
|
||||||
|
message(STATUS "[biergarten] NVIDIA GPU detected — enabling CUDA acceleration.")
|
||||||
|
set(GGML_CUDA ON CACHE BOOL "Enable CUDA for NVIDIA GPUs" FORCE)
|
||||||
|
# 'native' resolves to the exact SM version of the present GPU at configure time
|
||||||
|
# (e.g. sm_89 for RTX 2000 Ada). Change to a concrete arch list for cross-compilation.
|
||||||
|
set(CMAKE_CUDA_ARCHITECTURES native)
|
||||||
|
else()
|
||||||
|
message(STATUS "[biergarten] No NVIDIA GPU found — falling back to CPU.")
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
|
# =============================================================================
|
||||||
|
# 2. Project-wide Settings
|
||||||
|
# =============================================================================
|
||||||
set(CMAKE_CXX_STANDARD 23)
|
set(CMAKE_CXX_STANDARD 23)
|
||||||
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
||||||
set(CMAKE_CXX_EXTENSIONS OFF)
|
|
||||||
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
|
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
|
||||||
|
# =============================================================================
|
||||||
option(ENABLE_CLANG_TIDY "Enable clang-tidy static analysis for project targets" ON)
|
# 3. Dependencies
|
||||||
option(ENABLE_CLANG_FORMAT_TARGETS "Enable clang-format helper targets" ON)
|
# =============================================================================
|
||||||
|
|
||||||
if(ENABLE_CLANG_TIDY)
|
|
||||||
find_program(CLANG_TIDY_EXE NAMES clang-tidy)
|
|
||||||
if(CLANG_TIDY_EXE)
|
|
||||||
set(BIERGARTEN_CLANG_TIDY_COMMAND
|
|
||||||
"${CLANG_TIDY_EXE};--config-file=${CMAKE_CURRENT_SOURCE_DIR}/.clang-tidy")
|
|
||||||
message(STATUS "clang-tidy enabled: ${CLANG_TIDY_EXE}")
|
|
||||||
else()
|
|
||||||
message(STATUS "clang-tidy not found; static analysis is disabled")
|
|
||||||
endif()
|
|
||||||
endif()
|
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
|
||||||
# Compiler Options & Warnings (Interface Library)
|
|
||||||
# -----------------------------------------------------------------------------
|
|
||||||
add_library(project_options INTERFACE)
|
|
||||||
target_compile_options(project_options INTERFACE
|
|
||||||
$<$<CXX_COMPILER_ID:GNU,Clang>:
|
|
||||||
-Wall -Wextra -Wpedantic -Wshadow -Wconversion -Wsign-conversion -Wunused
|
|
||||||
>
|
|
||||||
$<$<CXX_COMPILER_ID:MSVC>:
|
|
||||||
/W4 /WX /permissive-
|
|
||||||
>
|
|
||||||
)
|
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
|
||||||
# Dependencies
|
|
||||||
# -----------------------------------------------------------------------------
|
|
||||||
find_package(CURL REQUIRED)
|
|
||||||
find_package(SQLite3 REQUIRED)
|
|
||||||
find_package(Boost 1.75 REQUIRED COMPONENTS program_options json)
|
|
||||||
|
|
||||||
include(FetchContent)
|
include(FetchContent)
|
||||||
|
# --- libcurl ------------------------------------------------------------------
|
||||||
# spdlog (Logging)
|
# Prefer the system package; the build will fail at link time if absent and
|
||||||
|
# no system curl is found, so emit a fatal error early rather than a silent gap.
|
||||||
|
find_package(CURL QUIET)
|
||||||
|
if(NOT CURL_FOUND)
|
||||||
|
message(FATAL_ERROR
|
||||||
|
"[biergarten] libcurl not found. Install it via your package manager "
|
||||||
|
"(e.g. 'sudo dnf install libcurl-devel') or set CURL_ROOT.")
|
||||||
|
endif()
|
||||||
|
# --- llama.cpp ----------------------------------------------------------------
|
||||||
|
# Pinned to a specific commit for reproducible builds.
|
||||||
|
# To update: pick a new commit SHA from https://github.com/ggml-org/llama.cpp
|
||||||
|
FetchContent_Declare(
|
||||||
|
llama-cpp
|
||||||
|
GIT_REPOSITORY https://github.com/ggml-org/llama.cpp.git
|
||||||
|
GIT_TAG b8611
|
||||||
|
)
|
||||||
|
FetchContent_MakeAvailable(llama-cpp)
|
||||||
|
# --- Boost (JSON + program_options) ------------------------------------------
|
||||||
|
FetchContent_Declare(
|
||||||
|
boost
|
||||||
|
URL https://github.com/boostorg/boost/releases/download/boost-1.85.0/boost-1.85.0-cmake.tar.gz
|
||||||
|
)
|
||||||
|
FetchContent_MakeAvailable(boost)
|
||||||
|
# --- spdlog -------------------------------------------------------------------
|
||||||
FetchContent_Declare(
|
FetchContent_Declare(
|
||||||
spdlog
|
spdlog
|
||||||
GIT_REPOSITORY https://github.com/gabime/spdlog.git
|
GIT_REPOSITORY https://github.com/gabime/spdlog.git
|
||||||
GIT_TAG v1.11.0
|
GIT_TAG v1.15.3
|
||||||
)
|
)
|
||||||
FetchContent_MakeAvailable(spdlog)
|
FetchContent_MakeAvailable(spdlog)
|
||||||
|
# =============================================================================
|
||||||
# llama.cpp (LLM Inference)
|
# 4. Sources
|
||||||
set(LLAMA_BUILD_TESTS OFF CACHE BOOL "" FORCE)
|
# =============================================================================
|
||||||
set(LLAMA_BUILD_EXAMPLES OFF CACHE BOOL "" FORCE)
|
set(SOURCES
|
||||||
set(LLAMA_BUILD_SERVER OFF CACHE BOOL "" FORCE)
|
src/main.cpp
|
||||||
FetchContent_Declare(
|
|
||||||
llama_cpp
|
|
||||||
GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git
|
|
||||||
GIT_TAG b8611
|
|
||||||
)
|
|
||||||
FetchContent_MakeAvailable(llama_cpp)
|
|
||||||
|
|
||||||
if(TARGET llama)
|
|
||||||
target_compile_options(llama PRIVATE
|
|
||||||
$<$<CXX_COMPILER_ID:AppleClang>:-include algorithm>
|
|
||||||
)
|
|
||||||
endif()
|
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
|
||||||
# Main Executable
|
|
||||||
# -----------------------------------------------------------------------------
|
|
||||||
set(PIPELINE_SOURCES
|
|
||||||
src/biergarten_data_generator.cpp
|
src/biergarten_data_generator.cpp
|
||||||
src/web_client/curl_web_client.cpp
|
|
||||||
src/data_generation/data_downloader.cpp
|
|
||||||
src/database/database.cpp
|
|
||||||
src/json_handling/json_loader.cpp
|
|
||||||
src/data_generation/llama/destructor.cpp
|
src/data_generation/llama/destructor.cpp
|
||||||
src/data_generation/llama/set_sampling_options.cpp
|
|
||||||
src/data_generation/llama/load.cpp
|
|
||||||
src/data_generation/llama/infer.cpp
|
|
||||||
src/data_generation/llama/generate_brewery.cpp
|
src/data_generation/llama/generate_brewery.cpp
|
||||||
src/data_generation/llama/generate_user.cpp
|
src/data_generation/llama/generate_user.cpp
|
||||||
src/data_generation/llama/helpers.cpp
|
src/data_generation/llama/helpers.cpp
|
||||||
|
src/data_generation/llama/infer.cpp
|
||||||
|
src/data_generation/llama/load.cpp
|
||||||
src/data_generation/llama/load_brewery_prompt.cpp
|
src/data_generation/llama/load_brewery_prompt.cpp
|
||||||
|
src/data_generation/llama/set_sampling_options.cpp
|
||||||
src/data_generation/mock/data.cpp
|
src/data_generation/mock/data.cpp
|
||||||
src/data_generation/mock/deterministic_hash.cpp
|
src/data_generation/mock/deterministic_hash.cpp
|
||||||
src/data_generation/mock/load.cpp
|
|
||||||
src/data_generation/mock/generate_brewery.cpp
|
src/data_generation/mock/generate_brewery.cpp
|
||||||
src/data_generation/mock/generate_user.cpp
|
src/data_generation/mock/generate_user.cpp
|
||||||
src/json_handling/stream_parser.cpp
|
src/data_generation/mock/load.cpp
|
||||||
|
src/json_handling/json_loader.cpp
|
||||||
|
src/web_client/curl_web_client.cpp
|
||||||
src/wikipedia/wikipedia_service.cpp
|
src/wikipedia/wikipedia_service.cpp
|
||||||
src/main.cpp
|
|
||||||
)
|
)
|
||||||
|
# =============================================================================
|
||||||
add_executable(biergarten-pipeline ${PIPELINE_SOURCES})
|
# 5. Target
|
||||||
|
# =============================================================================
|
||||||
if(BIERGARTEN_CLANG_TIDY_COMMAND)
|
add_executable(${PROJECT_NAME}
|
||||||
set_target_properties(biergarten-pipeline PROPERTIES
|
${SOURCES}
|
||||||
CXX_CLANG_TIDY "${BIERGARTEN_CLANG_TIDY_COMMAND}"
|
|
||||||
)
|
)
|
||||||
endif()
|
target_include_directories(${PROJECT_NAME} PRIVATE
|
||||||
|
includes
|
||||||
target_include_directories(biergarten-pipeline
|
${llama-cpp_SOURCE_DIR}/include
|
||||||
PRIVATE
|
${llama-cpp_SOURCE_DIR}/common
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/includes
|
|
||||||
${llama_cpp_SOURCE_DIR}/include
|
|
||||||
)
|
)
|
||||||
|
target_link_libraries(${PROJECT_NAME} PRIVATE
|
||||||
target_link_libraries(biergarten-pipeline
|
|
||||||
PRIVATE
|
|
||||||
project_options
|
|
||||||
CURL::libcurl
|
|
||||||
SQLite::SQLite3
|
|
||||||
spdlog::spdlog
|
|
||||||
llama
|
llama
|
||||||
Boost::program_options
|
boost_json
|
||||||
Boost::json
|
boost_program_options
|
||||||
|
spdlog::spdlog
|
||||||
|
CURL::libcurl
|
||||||
)
|
)
|
||||||
|
|
||||||
if(ENABLE_CLANG_FORMAT_TARGETS)
|
|
||||||
find_program(CLANG_FORMAT_EXE NAMES clang-format)
|
|
||||||
if(CLANG_FORMAT_EXE)
|
|
||||||
file(GLOB_RECURSE FORMAT_SOURCES CONFIGURE_DEPENDS
|
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/src/**/*.cpp
|
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/src/**/*.cc
|
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/includes/**/*.h
|
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/includes/**/*.hpp
|
|
||||||
)
|
|
||||||
|
|
||||||
add_custom_target(format
|
|
||||||
COMMAND ${CLANG_FORMAT_EXE} -style=file -i ${FORMAT_SOURCES}
|
|
||||||
COMMENT "Formatting source files with clang-format (Google style)"
|
|
||||||
VERBATIM
|
|
||||||
)
|
|
||||||
|
|
||||||
add_custom_target(format-check
|
|
||||||
COMMAND ${CLANG_FORMAT_EXE} -style=file --dry-run --Werror ${FORMAT_SOURCES}
|
|
||||||
COMMENT "Checking source formatting with clang-format (Google style)"
|
|
||||||
VERBATIM
|
|
||||||
)
|
|
||||||
else()
|
|
||||||
message(STATUS "clang-format not found; format targets are disabled")
|
|
||||||
endif()
|
|
||||||
endif()
|
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
|
||||||
# Post-Build Steps & Utilities
|
|
||||||
# -----------------------------------------------------------------------------
|
|
||||||
add_custom_command(TARGET biergarten-pipeline POST_BUILD
|
|
||||||
COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_SOURCE_DIR}/output
|
|
||||||
COMMENT "Ensuring output directory exists"
|
|
||||||
)
|
|
||||||
|
|
||||||
find_program(VALGRIND valgrind)
|
|
||||||
if(VALGRIND)
|
|
||||||
add_custom_target(memcheck
|
|
||||||
COMMAND ${VALGRIND} --leak-check=full --error-exitcode=1 $<TARGET_FILE:biergarten-pipeline> --help
|
|
||||||
DEPENDS biergarten-pipeline
|
|
||||||
COMMENT "Running Valgrind memory check"
|
|
||||||
)
|
|
||||||
endif()
|
|
||||||
|
|||||||
@@ -3,11 +3,10 @@
|
|||||||
|
|
||||||
#include <memory>
|
#include <memory>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <unordered_map>
|
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
#include "data_generation/data_generator.h"
|
#include "data_generation/data_generator.h"
|
||||||
#include "database/database.h"
|
#include "models/location.h"
|
||||||
#include "web_client/web_client.h"
|
#include "web_client/web_client.h"
|
||||||
#include "wikipedia/wikipedia_service.h"
|
#include "wikipedia/wikipedia_service.h"
|
||||||
|
|
||||||
@@ -49,8 +48,7 @@ struct ApplicationOptions {
|
|||||||
* @brief Main data generator class for the Biergarten pipeline.
|
* @brief Main data generator class for the Biergarten pipeline.
|
||||||
*
|
*
|
||||||
* This class encapsulates the core logic for generating brewery data.
|
* This class encapsulates the core logic for generating brewery data.
|
||||||
* It handles database initialization, data loading/downloading, and brewery
|
* It handles location loading, city enrichment, and brewery generation.
|
||||||
* generation.
|
|
||||||
*/
|
*/
|
||||||
class BiergartenDataGenerator {
|
class BiergartenDataGenerator {
|
||||||
public:
|
public:
|
||||||
@@ -59,20 +57,17 @@ class BiergartenDataGenerator {
|
|||||||
*
|
*
|
||||||
* @param options Application configuration options.
|
* @param options Application configuration options.
|
||||||
* @param web_client HTTP client for downloading data.
|
* @param web_client HTTP client for downloading data.
|
||||||
* @param database SQLite database instance.
|
|
||||||
*/
|
*/
|
||||||
BiergartenDataGenerator(const ApplicationOptions& options,
|
BiergartenDataGenerator(const ApplicationOptions& options,
|
||||||
std::shared_ptr<WebClient> web_client,
|
std::shared_ptr<WebClient> web_client);
|
||||||
SqliteDatabase& database);
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Run the data generation pipeline.
|
* @brief Run the data generation pipeline.
|
||||||
*
|
*
|
||||||
* Performs the following steps:
|
* Performs the following steps:
|
||||||
* 1. Initialize database
|
* 1. Load curated locations from JSON
|
||||||
* 2. Download geographic data if needed
|
* 2. Initialize the generator (LLM or Mock)
|
||||||
* 3. Initialize the generator (LLM or Mock)
|
* 3. Generate brewery data for sampled cities
|
||||||
* 4. Generate brewery data for sample cities
|
|
||||||
*
|
*
|
||||||
* @return 0 on success, 1 on failure.
|
* @return 0 on success, 1 on failure.
|
||||||
*/
|
*/
|
||||||
@@ -85,16 +80,11 @@ class BiergartenDataGenerator {
|
|||||||
/// @brief Shared HTTP client dependency.
|
/// @brief Shared HTTP client dependency.
|
||||||
std::shared_ptr<WebClient> webClient_;
|
std::shared_ptr<WebClient> webClient_;
|
||||||
|
|
||||||
/// @brief Database dependency.
|
|
||||||
SqliteDatabase& database_;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Enriched city data with Wikipedia context.
|
* @brief Enriched city data with Wikipedia context.
|
||||||
*/
|
*/
|
||||||
struct EnrichedCity {
|
struct EnrichedCity {
|
||||||
int city_id;
|
Location location;
|
||||||
std::string city_name;
|
|
||||||
std::string country_name;
|
|
||||||
std::string region_context;
|
std::string region_context;
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -108,25 +98,20 @@ class BiergartenDataGenerator {
|
|||||||
std::unique_ptr<DataGenerator> InitializeGenerator();
|
std::unique_ptr<DataGenerator> InitializeGenerator();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Download and load geographic data if not cached.
|
* @brief Load locations from JSON and sample cities.
|
||||||
*/
|
|
||||||
void LoadGeographicData();
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @brief Query cities from database and build country name map.
|
|
||||||
*
|
*
|
||||||
* @return Vector of (City, country_name) pairs capped at 30 entries.
|
* @return Vector of sampled locations capped at 30 entries.
|
||||||
*/
|
*/
|
||||||
std::vector<std::pair<City, std::string>> QueryCitiesWithCountries();
|
std::vector<Location> QueryCitiesWithCountries();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Enrich cities with Wikipedia summaries.
|
* @brief Enrich cities with Wikipedia summaries.
|
||||||
*
|
*
|
||||||
* @param cities Vector of (City, country_name) pairs.
|
* @param cities Vector of sampled locations.
|
||||||
* @return Vector of enriched city data with context.
|
* @return Vector of enriched city data with context.
|
||||||
*/
|
*/
|
||||||
std::vector<EnrichedCity> EnrichWithWikipedia(
|
std::vector<EnrichedCity> EnrichWithWikipedia(
|
||||||
const std::vector<std::pair<City, std::string>>& cities);
|
const std::vector<Location>& cities);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Generate breweries for enriched cities.
|
* @brief Generate breweries for enriched cities.
|
||||||
@@ -146,8 +131,7 @@ class BiergartenDataGenerator {
|
|||||||
* @brief Helper struct to store generated brewery data.
|
* @brief Helper struct to store generated brewery data.
|
||||||
*/
|
*/
|
||||||
struct GeneratedBrewery {
|
struct GeneratedBrewery {
|
||||||
int city_id;
|
Location location;
|
||||||
std::string city_name;
|
|
||||||
BreweryResult brewery;
|
BreweryResult brewery;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@@ -1,31 +0,0 @@
|
|||||||
#ifndef BIERGARTEN_PIPELINE_DATA_GENERATION_DATA_DOWNLOADER_H_
|
|
||||||
#define BIERGARTEN_PIPELINE_DATA_GENERATION_DATA_DOWNLOADER_H_
|
|
||||||
|
|
||||||
#include <memory>
|
|
||||||
#include <stdexcept>
|
|
||||||
#include <string>
|
|
||||||
|
|
||||||
#include "web_client/web_client.h"
|
|
||||||
|
|
||||||
/// @brief Downloads and caches source geography JSON payloads.
|
|
||||||
class DataDownloader {
|
|
||||||
public:
|
|
||||||
/// @brief Initializes global curl state used by this downloader.
|
|
||||||
explicit DataDownloader(std::shared_ptr<WebClient> web_client);
|
|
||||||
|
|
||||||
/// @brief Cleans up global curl state.
|
|
||||||
~DataDownloader();
|
|
||||||
|
|
||||||
/// @brief Returns a local JSON path, downloading it when cache is missing.
|
|
||||||
std::string DownloadCountriesDatabase(
|
|
||||||
const std::string& cache_path,
|
|
||||||
const std::string& commit =
|
|
||||||
"c5eb7772" // Stable commit: 2026-03-28 export
|
|
||||||
);
|
|
||||||
|
|
||||||
private:
|
|
||||||
static bool FileExists(const std::string& file_path);
|
|
||||||
std::shared_ptr<WebClient> web_client_;
|
|
||||||
};
|
|
||||||
|
|
||||||
#endif // BIERGARTEN_PIPELINE_DATA_GENERATION_DATA_DOWNLOADER_H_
|
|
||||||
@@ -1,87 +0,0 @@
|
|||||||
#ifndef BIERGARTEN_PIPELINE_DATABASE_DATABASE_H_
|
|
||||||
#define BIERGARTEN_PIPELINE_DATABASE_DATABASE_H_
|
|
||||||
|
|
||||||
#include <sqlite3.h>
|
|
||||||
|
|
||||||
#include <mutex>
|
|
||||||
#include <string>
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
struct Country {
|
|
||||||
/// @brief Country identifier from the source dataset.
|
|
||||||
int id;
|
|
||||||
/// @brief Country display name.
|
|
||||||
std::string name;
|
|
||||||
/// @brief ISO 3166-1 alpha-2 code.
|
|
||||||
std::string iso2;
|
|
||||||
/// @brief ISO 3166-1 alpha-3 code.
|
|
||||||
std::string iso3;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct State {
|
|
||||||
/// @brief State or province identifier from the source dataset.
|
|
||||||
int id;
|
|
||||||
/// @brief State or province display name.
|
|
||||||
std::string name;
|
|
||||||
/// @brief State or province short code.
|
|
||||||
std::string iso2;
|
|
||||||
/// @brief Parent country identifier.
|
|
||||||
int country_id;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct City {
|
|
||||||
/// @brief City identifier from the source dataset.
|
|
||||||
int id;
|
|
||||||
/// @brief City display name.
|
|
||||||
std::string name;
|
|
||||||
/// @brief Parent country identifier.
|
|
||||||
int country_id;
|
|
||||||
};
|
|
||||||
|
|
||||||
/// @brief Thread-safe SQLite wrapper for pipeline writes and readbacks.
|
|
||||||
class SqliteDatabase {
|
|
||||||
private:
|
|
||||||
sqlite3* db_ = nullptr;
|
|
||||||
std::mutex db_mutex_;
|
|
||||||
|
|
||||||
void InitializeSchema();
|
|
||||||
|
|
||||||
public:
|
|
||||||
/// @brief Closes the SQLite connection if initialized.
|
|
||||||
~SqliteDatabase();
|
|
||||||
|
|
||||||
/// @brief Opens the SQLite database at db_path and creates schema objects.
|
|
||||||
void Initialize(const std::string& db_path = ":memory:");
|
|
||||||
|
|
||||||
/// @brief Starts a database transaction for batched writes.
|
|
||||||
void BeginTransaction();
|
|
||||||
|
|
||||||
/// @brief Commits the active database transaction.
|
|
||||||
void CommitTransaction();
|
|
||||||
|
|
||||||
/// @brief Rolls back the active database transaction.
|
|
||||||
void RollbackTransaction();
|
|
||||||
|
|
||||||
/// @brief Inserts a country row.
|
|
||||||
void InsertCountry(int id, const std::string& name, const std::string& iso2,
|
|
||||||
const std::string& iso3);
|
|
||||||
|
|
||||||
/// @brief Inserts a state row linked to a country.
|
|
||||||
void InsertState(int id, int country_id, const std::string& name,
|
|
||||||
const std::string& iso2);
|
|
||||||
|
|
||||||
/// @brief Inserts a city row linked to state and country.
|
|
||||||
void InsertCity(int id, int state_id, int country_id,
|
|
||||||
const std::string& name, double latitude, double longitude);
|
|
||||||
|
|
||||||
/// @brief Returns city records including parent country id.
|
|
||||||
std::vector<City> QueryCities();
|
|
||||||
|
|
||||||
/// @brief Returns countries with optional row limit.
|
|
||||||
std::vector<Country> QueryCountries(int limit = 0);
|
|
||||||
|
|
||||||
/// @brief Returns states with optional row limit.
|
|
||||||
std::vector<State> QueryStates(int limit = 0);
|
|
||||||
};
|
|
||||||
|
|
||||||
#endif // BIERGARTEN_PIPELINE_DATABASE_DATABASE_H_
|
|
||||||
@@ -2,16 +2,15 @@
|
|||||||
#define BIERGARTEN_PIPELINE_JSON_HANDLING_JSON_LOADER_H_
|
#define BIERGARTEN_PIPELINE_JSON_HANDLING_JSON_LOADER_H_
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
#include "database/database.h"
|
#include "models/location.h"
|
||||||
#include "json_handling/stream_parser.h"
|
|
||||||
|
|
||||||
/// @brief Loads world-city JSON data into SQLite through streaming parsing.
|
/// @brief Loads curated world locations from a JSON file into memory.
|
||||||
class JsonLoader {
|
class JsonLoader {
|
||||||
public:
|
public:
|
||||||
/// @brief Parses a JSON file and writes country/state/city rows into db.
|
/// @brief Parses a JSON array file and returns all location records.
|
||||||
static void LoadWorldCities(const std::string& json_path,
|
static std::vector<Location> LoadLocations(const std::string& filepath);
|
||||||
SqliteDatabase& db);
|
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif // BIERGARTEN_PIPELINE_JSON_HANDLING_JSON_LOADER_H_
|
#endif // BIERGARTEN_PIPELINE_JSON_HANDLING_JSON_LOADER_H_
|
||||||
|
|||||||
@@ -1,52 +0,0 @@
|
|||||||
#ifndef BIERGARTEN_PIPELINE_JSON_HANDLING_STREAM_PARSER_H_
|
|
||||||
#define BIERGARTEN_PIPELINE_JSON_HANDLING_STREAM_PARSER_H_
|
|
||||||
|
|
||||||
#include <functional>
|
|
||||||
#include <string>
|
|
||||||
|
|
||||||
#include "database/database.h"
|
|
||||||
|
|
||||||
// Forward declaration to avoid circular dependency
|
|
||||||
class SqliteDatabase;
|
|
||||||
|
|
||||||
/// @brief In-memory representation of one parsed city entry.
|
|
||||||
struct CityRecord {
|
|
||||||
int id;
|
|
||||||
int state_id;
|
|
||||||
int country_id;
|
|
||||||
std::string name;
|
|
||||||
double latitude;
|
|
||||||
double longitude;
|
|
||||||
};
|
|
||||||
|
|
||||||
/// @brief Streaming SAX parser that emits city records during traversal.
|
|
||||||
class StreamingJsonParser {
|
|
||||||
public:
|
|
||||||
/// @brief Parses file_path and invokes callbacks for city rows and progress.
|
|
||||||
static void Parse(const std::string& file_path, SqliteDatabase& db,
|
|
||||||
std::function<void(const CityRecord&)> on_city,
|
|
||||||
std::function<void(size_t, size_t)> on_progress = nullptr);
|
|
||||||
|
|
||||||
private:
|
|
||||||
/// @brief Mutable SAX handler state while traversing nested JSON arrays.
|
|
||||||
struct ParseState {
|
|
||||||
int current_country_id = 0;
|
|
||||||
int current_state_id = 0;
|
|
||||||
|
|
||||||
CityRecord current_city = {};
|
|
||||||
bool building_city = false;
|
|
||||||
std::string current_key;
|
|
||||||
|
|
||||||
int array_depth = 0;
|
|
||||||
int object_depth = 0;
|
|
||||||
bool in_countries_array = false;
|
|
||||||
bool in_states_array = false;
|
|
||||||
bool in_cities_array = false;
|
|
||||||
|
|
||||||
std::function<void(const CityRecord&)> on_city;
|
|
||||||
std::function<void(size_t, size_t)> on_progress;
|
|
||||||
size_t bytes_processed = 0;
|
|
||||||
};
|
|
||||||
};
|
|
||||||
|
|
||||||
#endif // BIERGARTEN_PIPELINE_JSON_HANDLING_STREAM_PARSER_H_
|
|
||||||
902
pipeline/locations.json
Normal file
902
pipeline/locations.json
Normal file
@@ -0,0 +1,902 @@
|
|||||||
|
[
|
||||||
|
{
|
||||||
|
"city": "Cape Town",
|
||||||
|
"state_province": "Western Cape",
|
||||||
|
"iso3166_2": "ZA-WC",
|
||||||
|
"country": "South Africa",
|
||||||
|
"iso3166_1": "ZA",
|
||||||
|
"latitude": -33.9249,
|
||||||
|
"longitude": 18.4241
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Johannesburg",
|
||||||
|
"state_province": "Gauteng",
|
||||||
|
"iso3166_2": "ZA-GT",
|
||||||
|
"country": "South Africa",
|
||||||
|
"iso3166_1": "ZA",
|
||||||
|
"latitude": -26.2041,
|
||||||
|
"longitude": 28.0473
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Durban",
|
||||||
|
"state_province": "KwaZulu-Natal",
|
||||||
|
"iso3166_2": "ZA-NL",
|
||||||
|
"country": "South Africa",
|
||||||
|
"iso3166_1": "ZA",
|
||||||
|
"latitude": -29.8587,
|
||||||
|
"longitude": 31.0218
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Franschhoek",
|
||||||
|
"state_province": "Western Cape",
|
||||||
|
"iso3166_2": "ZA-WC",
|
||||||
|
"country": "South Africa",
|
||||||
|
"iso3166_1": "ZA",
|
||||||
|
"latitude": -33.9146,
|
||||||
|
"longitude": 19.1198
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Nairobi",
|
||||||
|
"state_province": "Nairobi",
|
||||||
|
"iso3166_2": "KE-30",
|
||||||
|
"country": "Kenya",
|
||||||
|
"iso3166_1": "KE",
|
||||||
|
"latitude": -1.2921,
|
||||||
|
"longitude": 36.8219
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Buenos Aires",
|
||||||
|
"state_province": "Buenos Aires City",
|
||||||
|
"iso3166_2": "AR-C",
|
||||||
|
"country": "Argentina",
|
||||||
|
"iso3166_1": "AR",
|
||||||
|
"latitude": -34.6037,
|
||||||
|
"longitude": -58.3816
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Bariloche",
|
||||||
|
"state_province": "Río Negro",
|
||||||
|
"iso3166_2": "AR-R",
|
||||||
|
"country": "Argentina",
|
||||||
|
"iso3166_1": "AR",
|
||||||
|
"latitude": -41.1335,
|
||||||
|
"longitude": -71.3103
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Bogotá",
|
||||||
|
"state_province": "Bogotá D.C.",
|
||||||
|
"iso3166_2": "CO-DC",
|
||||||
|
"country": "Colombia",
|
||||||
|
"iso3166_1": "CO",
|
||||||
|
"latitude": 4.711,
|
||||||
|
"longitude": -74.0721
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Medellín",
|
||||||
|
"state_province": "Antioquia",
|
||||||
|
"iso3166_2": "CO-ANT",
|
||||||
|
"country": "Colombia",
|
||||||
|
"iso3166_1": "CO",
|
||||||
|
"latitude": 6.2442,
|
||||||
|
"longitude": -75.5812
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "São Paulo",
|
||||||
|
"state_province": "São Paulo",
|
||||||
|
"iso3166_2": "BR-SP",
|
||||||
|
"country": "Brazil",
|
||||||
|
"iso3166_1": "BR",
|
||||||
|
"latitude": -23.5505,
|
||||||
|
"longitude": -46.6333
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Curitiba",
|
||||||
|
"state_province": "Paraná",
|
||||||
|
"iso3166_2": "BR-PR",
|
||||||
|
"country": "Brazil",
|
||||||
|
"iso3166_1": "BR",
|
||||||
|
"latitude": -25.4284,
|
||||||
|
"longitude": -49.2733
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Rio de Janeiro",
|
||||||
|
"state_province": "Rio de Janeiro",
|
||||||
|
"iso3166_2": "BR-RJ",
|
||||||
|
"country": "Brazil",
|
||||||
|
"iso3166_1": "BR",
|
||||||
|
"latitude": -22.9068,
|
||||||
|
"longitude": -43.1729
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Santiago",
|
||||||
|
"state_province": "Santiago Metropolitan",
|
||||||
|
"iso3166_2": "CL-RM",
|
||||||
|
"country": "Chile",
|
||||||
|
"iso3166_1": "CL",
|
||||||
|
"latitude": -33.4489,
|
||||||
|
"longitude": -70.6693
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Valdivia",
|
||||||
|
"state_province": "Los Ríos",
|
||||||
|
"iso3166_2": "CL-LR",
|
||||||
|
"country": "Chile",
|
||||||
|
"iso3166_1": "CL",
|
||||||
|
"latitude": -39.8142,
|
||||||
|
"longitude": -73.2459
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Lima",
|
||||||
|
"state_province": "Lima",
|
||||||
|
"iso3166_2": "PE-LMA",
|
||||||
|
"country": "Peru",
|
||||||
|
"iso3166_1": "PE",
|
||||||
|
"latitude": -12.0464,
|
||||||
|
"longitude": -77.0428
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Tokyo",
|
||||||
|
"state_province": "Tokyo",
|
||||||
|
"iso3166_2": "JP-13",
|
||||||
|
"country": "Japan",
|
||||||
|
"iso3166_1": "JP",
|
||||||
|
"latitude": 35.6762,
|
||||||
|
"longitude": 139.6503
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Osaka",
|
||||||
|
"state_province": "Osaka",
|
||||||
|
"iso3166_2": "JP-27",
|
||||||
|
"country": "Japan",
|
||||||
|
"iso3166_1": "JP",
|
||||||
|
"latitude": 34.6937,
|
||||||
|
"longitude": 135.5023
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Kyoto",
|
||||||
|
"state_province": "Kyoto",
|
||||||
|
"iso3166_2": "JP-26",
|
||||||
|
"country": "Japan",
|
||||||
|
"iso3166_1": "JP",
|
||||||
|
"latitude": 35.0116,
|
||||||
|
"longitude": 135.7681
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Sapporo",
|
||||||
|
"state_province": "Hokkaido",
|
||||||
|
"iso3166_2": "JP-01",
|
||||||
|
"country": "Japan",
|
||||||
|
"iso3166_1": "JP",
|
||||||
|
"latitude": 43.0618,
|
||||||
|
"longitude": 141.3545
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Seoul",
|
||||||
|
"state_province": "Seoul",
|
||||||
|
"iso3166_2": "KR-11",
|
||||||
|
"country": "South Korea",
|
||||||
|
"iso3166_1": "KR",
|
||||||
|
"latitude": 37.5665,
|
||||||
|
"longitude": 126.978
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Busan",
|
||||||
|
"state_province": "Busan",
|
||||||
|
"iso3166_2": "KR-26",
|
||||||
|
"country": "South Korea",
|
||||||
|
"iso3166_1": "KR",
|
||||||
|
"latitude": 35.1796,
|
||||||
|
"longitude": 129.0756
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Ho Chi Minh City",
|
||||||
|
"state_province": "Ho Chi Minh",
|
||||||
|
"iso3166_2": "VN-SG",
|
||||||
|
"country": "Vietnam",
|
||||||
|
"iso3166_1": "VN",
|
||||||
|
"latitude": 10.8231,
|
||||||
|
"longitude": 106.6297
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Hanoi",
|
||||||
|
"state_province": "Hanoi",
|
||||||
|
"iso3166_2": "VN-HN",
|
||||||
|
"country": "Vietnam",
|
||||||
|
"iso3166_1": "VN",
|
||||||
|
"latitude": 21.0285,
|
||||||
|
"longitude": 105.8542
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Da Nang",
|
||||||
|
"state_province": "Da Nang",
|
||||||
|
"iso3166_2": "VN-DN",
|
||||||
|
"country": "Vietnam",
|
||||||
|
"iso3166_1": "VN",
|
||||||
|
"latitude": 16.0544,
|
||||||
|
"longitude": 108.2022
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Bangkok",
|
||||||
|
"state_province": "Bangkok",
|
||||||
|
"iso3166_2": "TH-10",
|
||||||
|
"country": "Thailand",
|
||||||
|
"iso3166_1": "TH",
|
||||||
|
"latitude": 13.7563,
|
||||||
|
"longitude": 100.5018
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Taipei",
|
||||||
|
"state_province": "Taipei",
|
||||||
|
"iso3166_2": "TW-TPE",
|
||||||
|
"country": "Taiwan",
|
||||||
|
"iso3166_1": "TW",
|
||||||
|
"latitude": 25.033,
|
||||||
|
"longitude": 121.5654
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Beijing",
|
||||||
|
"state_province": "Beijing",
|
||||||
|
"iso3166_2": "CN-BJ",
|
||||||
|
"country": "China",
|
||||||
|
"iso3166_1": "CN",
|
||||||
|
"latitude": 39.9042,
|
||||||
|
"longitude": 116.4074
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Shanghai",
|
||||||
|
"state_province": "Shanghai",
|
||||||
|
"iso3166_2": "CN-SH",
|
||||||
|
"country": "China",
|
||||||
|
"iso3166_1": "CN",
|
||||||
|
"latitude": 31.2304,
|
||||||
|
"longitude": 121.4737
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Bengaluru",
|
||||||
|
"state_province": "Karnataka",
|
||||||
|
"iso3166_2": "IN-KA",
|
||||||
|
"country": "India",
|
||||||
|
"iso3166_1": "IN",
|
||||||
|
"latitude": 12.9716,
|
||||||
|
"longitude": 77.5946
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Singapore",
|
||||||
|
"state_province": "Central Singapore",
|
||||||
|
"iso3166_2": "SG-01",
|
||||||
|
"country": "Singapore",
|
||||||
|
"iso3166_1": "SG",
|
||||||
|
"latitude": 1.3521,
|
||||||
|
"longitude": 103.8198
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Melbourne",
|
||||||
|
"state_province": "Victoria",
|
||||||
|
"iso3166_2": "AU-VIC",
|
||||||
|
"country": "Australia",
|
||||||
|
"iso3166_1": "AU",
|
||||||
|
"latitude": -37.8136,
|
||||||
|
"longitude": 144.9631
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Sydney",
|
||||||
|
"state_province": "New South Wales",
|
||||||
|
"iso3166_2": "AU-NSW",
|
||||||
|
"country": "Australia",
|
||||||
|
"iso3166_1": "AU",
|
||||||
|
"latitude": -33.8688,
|
||||||
|
"longitude": 151.2093
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Brisbane",
|
||||||
|
"state_province": "Queensland",
|
||||||
|
"iso3166_2": "AU-QLD",
|
||||||
|
"country": "Australia",
|
||||||
|
"iso3166_1": "AU",
|
||||||
|
"latitude": -27.4705,
|
||||||
|
"longitude": 153.026
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Adelaide",
|
||||||
|
"state_province": "South Australia",
|
||||||
|
"iso3166_2": "AU-SA",
|
||||||
|
"country": "Australia",
|
||||||
|
"iso3166_1": "AU",
|
||||||
|
"latitude": -34.9285,
|
||||||
|
"longitude": 138.6007
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Perth",
|
||||||
|
"state_province": "Western Australia",
|
||||||
|
"iso3166_2": "AU-WA",
|
||||||
|
"country": "Australia",
|
||||||
|
"iso3166_1": "AU",
|
||||||
|
"latitude": -31.9505,
|
||||||
|
"longitude": 115.8605
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Hobart",
|
||||||
|
"state_province": "Tasmania",
|
||||||
|
"iso3166_2": "AU-TAS",
|
||||||
|
"country": "Australia",
|
||||||
|
"iso3166_1": "AU",
|
||||||
|
"latitude": -42.8821,
|
||||||
|
"longitude": 147.3272
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Wellington",
|
||||||
|
"state_province": "Wellington",
|
||||||
|
"iso3166_2": "NZ-WGN",
|
||||||
|
"country": "New Zealand",
|
||||||
|
"iso3166_1": "NZ",
|
||||||
|
"latitude": -41.2865,
|
||||||
|
"longitude": 174.7762
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Auckland",
|
||||||
|
"state_province": "Auckland",
|
||||||
|
"iso3166_2": "NZ-AUK",
|
||||||
|
"country": "New Zealand",
|
||||||
|
"iso3166_1": "NZ",
|
||||||
|
"latitude": -36.8485,
|
||||||
|
"longitude": 174.7633
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Christchurch",
|
||||||
|
"state_province": "Canterbury",
|
||||||
|
"iso3166_2": "NZ-CAN",
|
||||||
|
"country": "New Zealand",
|
||||||
|
"iso3166_1": "NZ",
|
||||||
|
"latitude": -43.532,
|
||||||
|
"longitude": 172.6306
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Nelson",
|
||||||
|
"state_province": "Nelson",
|
||||||
|
"iso3166_2": "NZ-NSN",
|
||||||
|
"country": "New Zealand",
|
||||||
|
"iso3166_1": "NZ",
|
||||||
|
"latitude": -41.2706,
|
||||||
|
"longitude": 173.284
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Munich",
|
||||||
|
"state_province": "Bavaria",
|
||||||
|
"iso3166_2": "DE-BY",
|
||||||
|
"country": "Germany",
|
||||||
|
"iso3166_1": "DE",
|
||||||
|
"latitude": 48.1351,
|
||||||
|
"longitude": 11.582
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Berlin",
|
||||||
|
"state_province": "Berlin",
|
||||||
|
"iso3166_2": "DE-BE",
|
||||||
|
"country": "Germany",
|
||||||
|
"iso3166_1": "DE",
|
||||||
|
"latitude": 52.52,
|
||||||
|
"longitude": 13.405
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Cologne",
|
||||||
|
"state_province": "North Rhine-Westphalia",
|
||||||
|
"iso3166_2": "DE-NW",
|
||||||
|
"country": "Germany",
|
||||||
|
"iso3166_1": "DE",
|
||||||
|
"latitude": 50.9375,
|
||||||
|
"longitude": 6.9603
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Bamberg",
|
||||||
|
"state_province": "Bavaria",
|
||||||
|
"iso3166_2": "DE-BY",
|
||||||
|
"country": "Germany",
|
||||||
|
"iso3166_1": "DE",
|
||||||
|
"latitude": 49.8916,
|
||||||
|
"longitude": 10.8916
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Brussels",
|
||||||
|
"state_province": "Brussels-Capital",
|
||||||
|
"iso3166_2": "BE-BRU",
|
||||||
|
"country": "Belgium",
|
||||||
|
"iso3166_1": "BE",
|
||||||
|
"latitude": 50.8503,
|
||||||
|
"longitude": 4.3517
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Antwerp",
|
||||||
|
"state_province": "Flanders",
|
||||||
|
"iso3166_2": "BE-VLG",
|
||||||
|
"country": "Belgium",
|
||||||
|
"iso3166_1": "BE",
|
||||||
|
"latitude": 51.2194,
|
||||||
|
"longitude": 4.4025
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Bruges",
|
||||||
|
"state_province": "Flanders",
|
||||||
|
"iso3166_2": "BE-VLG",
|
||||||
|
"country": "Belgium",
|
||||||
|
"iso3166_1": "BE",
|
||||||
|
"latitude": 51.2093,
|
||||||
|
"longitude": 3.2247
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "London",
|
||||||
|
"state_province": "England",
|
||||||
|
"iso3166_2": "GB-ENG",
|
||||||
|
"country": "United Kingdom",
|
||||||
|
"iso3166_1": "GB",
|
||||||
|
"latitude": 51.5074,
|
||||||
|
"longitude": -0.1278
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Bristol",
|
||||||
|
"state_province": "England",
|
||||||
|
"iso3166_2": "GB-ENG",
|
||||||
|
"country": "United Kingdom",
|
||||||
|
"iso3166_1": "GB",
|
||||||
|
"latitude": 51.4545,
|
||||||
|
"longitude": -2.5879
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Edinburgh",
|
||||||
|
"state_province": "Scotland",
|
||||||
|
"iso3166_2": "GB-SCT",
|
||||||
|
"country": "United Kingdom",
|
||||||
|
"iso3166_1": "GB",
|
||||||
|
"latitude": 55.9533,
|
||||||
|
"longitude": -3.1883
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Glasgow",
|
||||||
|
"state_province": "Scotland",
|
||||||
|
"iso3166_2": "GB-SCT",
|
||||||
|
"country": "United Kingdom",
|
||||||
|
"iso3166_1": "GB",
|
||||||
|
"latitude": 55.8642,
|
||||||
|
"longitude": -4.2518
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Prague",
|
||||||
|
"state_province": "Prague",
|
||||||
|
"iso3166_2": "CZ-10",
|
||||||
|
"country": "Czechia",
|
||||||
|
"iso3166_1": "CZ",
|
||||||
|
"latitude": 50.0755,
|
||||||
|
"longitude": 14.4378
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Pilsen",
|
||||||
|
"state_province": "Plzeň",
|
||||||
|
"iso3166_2": "CZ-32",
|
||||||
|
"country": "Czechia",
|
||||||
|
"iso3166_1": "CZ",
|
||||||
|
"latitude": 49.7384,
|
||||||
|
"longitude": 13.3736
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Amsterdam",
|
||||||
|
"state_province": "North Holland",
|
||||||
|
"iso3166_2": "NL-NH",
|
||||||
|
"country": "Netherlands",
|
||||||
|
"iso3166_1": "NL",
|
||||||
|
"latitude": 52.3676,
|
||||||
|
"longitude": 4.9041
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Copenhagen",
|
||||||
|
"state_province": "Capital Region",
|
||||||
|
"iso3166_2": "DK-84",
|
||||||
|
"country": "Denmark",
|
||||||
|
"iso3166_1": "DK",
|
||||||
|
"latitude": 55.6761,
|
||||||
|
"longitude": 12.5683
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Warsaw",
|
||||||
|
"state_province": "Masovian",
|
||||||
|
"iso3166_2": "PL-MZ",
|
||||||
|
"country": "Poland",
|
||||||
|
"iso3166_1": "PL",
|
||||||
|
"latitude": 52.2297,
|
||||||
|
"longitude": 21.0122
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Krakow",
|
||||||
|
"state_province": "Lesser Poland",
|
||||||
|
"iso3166_2": "PL-MA",
|
||||||
|
"country": "Poland",
|
||||||
|
"iso3166_1": "PL",
|
||||||
|
"latitude": 50.0647,
|
||||||
|
"longitude": 19.945
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Rome",
|
||||||
|
"state_province": "Lazio",
|
||||||
|
"iso3166_2": "IT-62",
|
||||||
|
"country": "Italy",
|
||||||
|
"iso3166_1": "IT",
|
||||||
|
"latitude": 41.9028,
|
||||||
|
"longitude": 12.4964
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Milan",
|
||||||
|
"state_province": "Lombardy",
|
||||||
|
"iso3166_2": "IT-25",
|
||||||
|
"country": "Italy",
|
||||||
|
"iso3166_1": "IT",
|
||||||
|
"latitude": 45.4642,
|
||||||
|
"longitude": 9.19
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Barcelona",
|
||||||
|
"state_province": "Catalonia",
|
||||||
|
"iso3166_2": "ES-CT",
|
||||||
|
"country": "Spain",
|
||||||
|
"iso3166_1": "ES",
|
||||||
|
"latitude": 41.3851,
|
||||||
|
"longitude": 2.1734
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Madrid",
|
||||||
|
"state_province": "Madrid",
|
||||||
|
"iso3166_2": "ES-MD",
|
||||||
|
"country": "Spain",
|
||||||
|
"iso3166_1": "ES",
|
||||||
|
"latitude": 40.4168,
|
||||||
|
"longitude": -3.7038
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Paris",
|
||||||
|
"state_province": "Île-de-France",
|
||||||
|
"iso3166_2": "FR-IDF",
|
||||||
|
"country": "France",
|
||||||
|
"iso3166_1": "FR",
|
||||||
|
"latitude": 48.8566,
|
||||||
|
"longitude": 2.3522
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Lyon",
|
||||||
|
"state_province": "Auvergne-Rhône-Alpes",
|
||||||
|
"iso3166_2": "FR-ARA",
|
||||||
|
"country": "France",
|
||||||
|
"iso3166_1": "FR",
|
||||||
|
"latitude": 45.764,
|
||||||
|
"longitude": 4.8357
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Stockholm",
|
||||||
|
"state_province": "Stockholm",
|
||||||
|
"iso3166_2": "SE-AB",
|
||||||
|
"country": "Sweden",
|
||||||
|
"iso3166_1": "SE",
|
||||||
|
"latitude": 59.3293,
|
||||||
|
"longitude": 18.0686
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Gothenburg",
|
||||||
|
"state_province": "Västra Götaland",
|
||||||
|
"iso3166_2": "SE-O",
|
||||||
|
"country": "Sweden",
|
||||||
|
"iso3166_1": "SE",
|
||||||
|
"latitude": 57.7089,
|
||||||
|
"longitude": 11.9746
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Oslo",
|
||||||
|
"state_province": "Oslo",
|
||||||
|
"iso3166_2": "NO-03",
|
||||||
|
"country": "Norway",
|
||||||
|
"iso3166_1": "NO",
|
||||||
|
"latitude": 59.9139,
|
||||||
|
"longitude": 10.7522
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Dublin",
|
||||||
|
"state_province": "Leinster",
|
||||||
|
"iso3166_2": "IE-L",
|
||||||
|
"country": "Ireland",
|
||||||
|
"iso3166_1": "IE",
|
||||||
|
"latitude": 53.3498,
|
||||||
|
"longitude": -6.2603
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Vienna",
|
||||||
|
"state_province": "Vienna",
|
||||||
|
"iso3166_2": "AT-9",
|
||||||
|
"country": "Austria",
|
||||||
|
"iso3166_1": "AT",
|
||||||
|
"latitude": 48.2082,
|
||||||
|
"longitude": 16.3738
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Zurich",
|
||||||
|
"state_province": "Zurich",
|
||||||
|
"iso3166_2": "CH-ZH",
|
||||||
|
"country": "Switzerland",
|
||||||
|
"iso3166_1": "CH",
|
||||||
|
"latitude": 47.3769,
|
||||||
|
"longitude": 8.5417
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Tallinn",
|
||||||
|
"state_province": "Harju",
|
||||||
|
"iso3166_2": "EE-37",
|
||||||
|
"country": "Estonia",
|
||||||
|
"iso3166_1": "EE",
|
||||||
|
"latitude": 59.437,
|
||||||
|
"longitude": 24.7536
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Denver",
|
||||||
|
"state_province": "Colorado",
|
||||||
|
"iso3166_2": "US-CO",
|
||||||
|
"country": "United States",
|
||||||
|
"iso3166_1": "US",
|
||||||
|
"latitude": 39.7392,
|
||||||
|
"longitude": -104.9903
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Portland",
|
||||||
|
"state_province": "Oregon",
|
||||||
|
"iso3166_2": "US-OR",
|
||||||
|
"country": "United States",
|
||||||
|
"iso3166_1": "US",
|
||||||
|
"latitude": 45.5152,
|
||||||
|
"longitude": -122.6784
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "San Diego",
|
||||||
|
"state_province": "California",
|
||||||
|
"iso3166_2": "US-CA",
|
||||||
|
"country": "United States",
|
||||||
|
"iso3166_1": "US",
|
||||||
|
"latitude": 32.7157,
|
||||||
|
"longitude": -117.1611
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Asheville",
|
||||||
|
"state_province": "North Carolina",
|
||||||
|
"iso3166_2": "US-NC",
|
||||||
|
"country": "United States",
|
||||||
|
"iso3166_1": "US",
|
||||||
|
"latitude": 35.5951,
|
||||||
|
"longitude": -82.5515
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Grand Rapids",
|
||||||
|
"state_province": "Michigan",
|
||||||
|
"iso3166_2": "US-MI",
|
||||||
|
"country": "United States",
|
||||||
|
"iso3166_1": "US",
|
||||||
|
"latitude": 42.9634,
|
||||||
|
"longitude": -85.6681
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Chicago",
|
||||||
|
"state_province": "Illinois",
|
||||||
|
"iso3166_2": "US-IL",
|
||||||
|
"country": "United States",
|
||||||
|
"iso3166_1": "US",
|
||||||
|
"latitude": 41.8781,
|
||||||
|
"longitude": -87.6298
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Seattle",
|
||||||
|
"state_province": "Washington",
|
||||||
|
"iso3166_2": "US-WA",
|
||||||
|
"country": "United States",
|
||||||
|
"iso3166_1": "US",
|
||||||
|
"latitude": 47.6062,
|
||||||
|
"longitude": -122.3321
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Austin",
|
||||||
|
"state_province": "Texas",
|
||||||
|
"iso3166_2": "US-TX",
|
||||||
|
"country": "United States",
|
||||||
|
"iso3166_1": "US",
|
||||||
|
"latitude": 30.2672,
|
||||||
|
"longitude": -97.7431
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Boston",
|
||||||
|
"state_province": "Massachusetts",
|
||||||
|
"iso3166_2": "US-MA",
|
||||||
|
"country": "United States",
|
||||||
|
"iso3166_1": "US",
|
||||||
|
"latitude": 42.3601,
|
||||||
|
"longitude": -71.0589
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Philadelphia",
|
||||||
|
"state_province": "Pennsylvania",
|
||||||
|
"iso3166_2": "US-PA",
|
||||||
|
"country": "United States",
|
||||||
|
"iso3166_1": "US",
|
||||||
|
"latitude": 39.9526,
|
||||||
|
"longitude": -75.1652
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Brooklyn",
|
||||||
|
"state_province": "New York",
|
||||||
|
"iso3166_2": "US-NY",
|
||||||
|
"country": "United States",
|
||||||
|
"iso3166_1": "US",
|
||||||
|
"latitude": 40.6782,
|
||||||
|
"longitude": -73.9442
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Milwaukee",
|
||||||
|
"state_province": "Wisconsin",
|
||||||
|
"iso3166_2": "US-WI",
|
||||||
|
"country": "United States",
|
||||||
|
"iso3166_1": "US",
|
||||||
|
"latitude": 43.0389,
|
||||||
|
"longitude": -87.9065
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Richmond",
|
||||||
|
"state_province": "Virginia",
|
||||||
|
"iso3166_2": "US-VA",
|
||||||
|
"country": "United States",
|
||||||
|
"iso3166_1": "US",
|
||||||
|
"latitude": 37.5407,
|
||||||
|
"longitude": -77.436
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Cincinnati",
|
||||||
|
"state_province": "Ohio",
|
||||||
|
"iso3166_2": "US-OH",
|
||||||
|
"country": "United States",
|
||||||
|
"iso3166_1": "US",
|
||||||
|
"latitude": 39.1031,
|
||||||
|
"longitude": -84.512
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "St. Louis",
|
||||||
|
"state_province": "Missouri",
|
||||||
|
"iso3166_2": "US-MO",
|
||||||
|
"country": "United States",
|
||||||
|
"iso3166_1": "US",
|
||||||
|
"latitude": 38.627,
|
||||||
|
"longitude": -90.1994
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Tampa",
|
||||||
|
"state_province": "Florida",
|
||||||
|
"iso3166_2": "US-FL",
|
||||||
|
"country": "United States",
|
||||||
|
"iso3166_1": "US",
|
||||||
|
"latitude": 27.9506,
|
||||||
|
"longitude": -82.4572
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Minneapolis",
|
||||||
|
"state_province": "Minnesota",
|
||||||
|
"iso3166_2": "US-MN",
|
||||||
|
"country": "United States",
|
||||||
|
"iso3166_1": "US",
|
||||||
|
"latitude": 44.9778,
|
||||||
|
"longitude": -93.265
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Burlington",
|
||||||
|
"state_province": "Vermont",
|
||||||
|
"iso3166_2": "US-VT",
|
||||||
|
"country": "United States",
|
||||||
|
"iso3166_1": "US",
|
||||||
|
"latitude": 44.4759,
|
||||||
|
"longitude": -73.2121
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Portland",
|
||||||
|
"state_province": "Maine",
|
||||||
|
"iso3166_2": "US-ME",
|
||||||
|
"country": "United States",
|
||||||
|
"iso3166_1": "US",
|
||||||
|
"latitude": 43.6591,
|
||||||
|
"longitude": -70.2568
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Atlanta",
|
||||||
|
"state_province": "Georgia",
|
||||||
|
"iso3166_2": "US-GA",
|
||||||
|
"country": "United States",
|
||||||
|
"iso3166_1": "US",
|
||||||
|
"latitude": 33.749,
|
||||||
|
"longitude": -84.388
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Toronto",
|
||||||
|
"state_province": "Ontario",
|
||||||
|
"iso3166_2": "CA-ON",
|
||||||
|
"country": "Canada",
|
||||||
|
"iso3166_1": "CA",
|
||||||
|
"latitude": 43.651,
|
||||||
|
"longitude": -79.347
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Vancouver",
|
||||||
|
"state_province": "British Columbia",
|
||||||
|
"iso3166_2": "CA-BC",
|
||||||
|
"country": "Canada",
|
||||||
|
"iso3166_1": "CA",
|
||||||
|
"latitude": 49.2827,
|
||||||
|
"longitude": -123.1207
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Montreal",
|
||||||
|
"state_province": "Quebec",
|
||||||
|
"iso3166_2": "CA-QC",
|
||||||
|
"country": "Canada",
|
||||||
|
"iso3166_1": "CA",
|
||||||
|
"latitude": 45.5017,
|
||||||
|
"longitude": -73.5673
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Calgary",
|
||||||
|
"state_province": "Alberta",
|
||||||
|
"iso3166_2": "CA-AB",
|
||||||
|
"country": "Canada",
|
||||||
|
"iso3166_1": "CA",
|
||||||
|
"latitude": 51.0447,
|
||||||
|
"longitude": -114.0719
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Halifax",
|
||||||
|
"state_province": "Nova Scotia",
|
||||||
|
"iso3166_2": "CA-NS",
|
||||||
|
"country": "Canada",
|
||||||
|
"iso3166_1": "CA",
|
||||||
|
"latitude": 44.6488,
|
||||||
|
"longitude": -63.5752
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Mexico City",
|
||||||
|
"state_province": "Mexico City",
|
||||||
|
"iso3166_2": "MX-CMX",
|
||||||
|
"country": "Mexico",
|
||||||
|
"iso3166_1": "MX",
|
||||||
|
"latitude": 19.4326,
|
||||||
|
"longitude": -99.1332
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Tijuana",
|
||||||
|
"state_province": "Baja California",
|
||||||
|
"iso3166_2": "MX-BCN",
|
||||||
|
"country": "Mexico",
|
||||||
|
"iso3166_1": "MX",
|
||||||
|
"latitude": 32.5149,
|
||||||
|
"longitude": -117.0382
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Monterrey",
|
||||||
|
"state_province": "Nuevo León",
|
||||||
|
"iso3166_2": "MX-NLE",
|
||||||
|
"country": "Mexico",
|
||||||
|
"iso3166_1": "MX",
|
||||||
|
"latitude": 25.6866,
|
||||||
|
"longitude": -100.3161
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Guadalajara",
|
||||||
|
"state_province": "Jalisco",
|
||||||
|
"iso3166_2": "MX-JAL",
|
||||||
|
"country": "Mexico",
|
||||||
|
"iso3166_1": "MX",
|
||||||
|
"latitude": 20.6597,
|
||||||
|
"longitude": -103.3496
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"city": "Ensenada",
|
||||||
|
"state_province": "Baja California",
|
||||||
|
"iso3166_2": "MX-BCN",
|
||||||
|
"country": "Mexico",
|
||||||
|
"iso3166_1": "MX",
|
||||||
|
"latitude": 31.8667,
|
||||||
|
"longitude": -116.5964
|
||||||
|
}
|
||||||
|
]
|
||||||
@@ -4,20 +4,21 @@
|
|||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <filesystem>
|
#include <filesystem>
|
||||||
#include <unordered_map>
|
#include <future>
|
||||||
|
#include <iterator>
|
||||||
|
#include <random>
|
||||||
|
|
||||||
#include "data_generation/data_downloader.h"
|
|
||||||
#include "data_generation/llama_generator.h"
|
#include "data_generation/llama_generator.h"
|
||||||
#include "data_generation/mock_generator.h"
|
#include "data_generation/mock_generator.h"
|
||||||
#include "json_handling/json_loader.h"
|
#include "json_handling/json_loader.h"
|
||||||
#include "wikipedia/wikipedia_service.h"
|
#include "wikipedia/wikipedia_service.h"
|
||||||
|
|
||||||
BiergartenDataGenerator::BiergartenDataGenerator(
|
BiergartenDataGenerator::BiergartenDataGenerator(
|
||||||
const ApplicationOptions& options, std::shared_ptr<WebClient> web_client,
|
const ApplicationOptions& options, std::shared_ptr<WebClient> web_client)
|
||||||
SqliteDatabase& database)
|
: options_(options), webClient_(std::move(web_client)) {}
|
||||||
: options_(options), webClient_(web_client), database_(database) {}
|
|
||||||
|
|
||||||
std::unique_ptr<DataGenerator> BiergartenDataGenerator::InitializeGenerator() {
|
auto BiergartenDataGenerator::InitializeGenerator()
|
||||||
|
-> std::unique_ptr<DataGenerator> {
|
||||||
spdlog::info("Initializing brewery generator...");
|
spdlog::info("Initializing brewery generator...");
|
||||||
|
|
||||||
std::unique_ptr<DataGenerator> generator;
|
std::unique_ptr<DataGenerator> generator;
|
||||||
@@ -41,75 +42,60 @@ std::unique_ptr<DataGenerator> BiergartenDataGenerator::InitializeGenerator() {
|
|||||||
return generator;
|
return generator;
|
||||||
}
|
}
|
||||||
|
|
||||||
void BiergartenDataGenerator::LoadGeographicData() {
|
auto BiergartenDataGenerator::QueryCitiesWithCountries()
|
||||||
std::string json_path = options_.cache_dir + "/countries+states+cities.json";
|
-> std::vector<Location> {
|
||||||
std::string db_path = options_.cache_dir + "/biergarten-pipeline.db";
|
|
||||||
|
|
||||||
bool has_json_cache = std::filesystem::exists(json_path);
|
|
||||||
bool has_db_cache = std::filesystem::exists(db_path);
|
|
||||||
|
|
||||||
spdlog::info("Initializing SQLite database at {}...", db_path);
|
|
||||||
database_.Initialize(db_path);
|
|
||||||
|
|
||||||
if (has_db_cache && has_json_cache) {
|
|
||||||
spdlog::info("[Pipeline] Cache hit: skipping download and parse");
|
|
||||||
} else {
|
|
||||||
spdlog::info("\n[Pipeline] Downloading geographic data from GitHub...");
|
|
||||||
DataDownloader downloader(webClient_);
|
|
||||||
downloader.DownloadCountriesDatabase(json_path, options_.commit);
|
|
||||||
|
|
||||||
JsonLoader::LoadWorldCities(json_path, database_);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<std::pair<City, std::string>>
|
|
||||||
BiergartenDataGenerator::QueryCitiesWithCountries() {
|
|
||||||
spdlog::info("\n=== GEOGRAPHIC DATA OVERVIEW ===");
|
spdlog::info("\n=== GEOGRAPHIC DATA OVERVIEW ===");
|
||||||
|
|
||||||
auto cities = database_.QueryCities();
|
std::filesystem::path locations_path = "locations.json";
|
||||||
|
if (!std::filesystem::exists(locations_path)) {
|
||||||
// Build a quick map of country id -> name for per-city lookups.
|
const std::filesystem::path cache_path =
|
||||||
auto all_countries = database_.QueryCountries(0);
|
std::filesystem::path(options_.cache_dir) / "locations.json";
|
||||||
std::unordered_map<int, std::string> country_map;
|
if (std::filesystem::exists(cache_path)) {
|
||||||
for (const auto& c : all_countries) {
|
locations_path = cache_path;
|
||||||
country_map[c.id] = c.name;
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
spdlog::info("\nTotal records loaded:");
|
auto all_locations = JsonLoader::LoadLocations(locations_path.string());
|
||||||
spdlog::info(" Countries: {}", database_.QueryCountries(0).size());
|
spdlog::info(" Locations available: {}", all_locations.size());
|
||||||
spdlog::info(" States: {}", database_.QueryStates(0).size());
|
|
||||||
spdlog::info(" Cities: {}", cities.size());
|
|
||||||
|
|
||||||
// Cap at 30 entries.
|
const size_t sample_count = std::min<size_t>(30, all_locations.size());
|
||||||
const size_t sample_count = std::min(size_t(30), cities.size());
|
std::vector<Location> sampled_locations;
|
||||||
std::vector<std::pair<City, std::string>> result;
|
sampled_locations.reserve(sample_count);
|
||||||
|
|
||||||
for (size_t i = 0; i < sample_count; i++) {
|
std::random_device random_generator;
|
||||||
const auto& city = cities[i];
|
std::sample(all_locations.begin(), all_locations.end(),
|
||||||
std::string country_name;
|
std::back_inserter(sampled_locations), sample_count,
|
||||||
const auto country_it = country_map.find(city.country_id);
|
random_generator);
|
||||||
if (country_it != country_map.end()) {
|
|
||||||
country_name = country_it->second;
|
spdlog::info(" Sampled locations: {}", sampled_locations.size());
|
||||||
}
|
return sampled_locations;
|
||||||
result.push_back({city, country_name});
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return result;
|
auto BiergartenDataGenerator::EnrichWithWikipedia(
|
||||||
}
|
const std::vector<Location>& cities) -> std::vector<EnrichedCity> {
|
||||||
|
|
||||||
std::vector<BiergartenDataGenerator::EnrichedCity>
|
|
||||||
BiergartenDataGenerator::EnrichWithWikipedia(
|
|
||||||
const std::vector<std::pair<City, std::string>>& cities) {
|
|
||||||
WikipediaService wikipedia_service(webClient_);
|
|
||||||
std::vector<EnrichedCity> enriched;
|
std::vector<EnrichedCity> enriched;
|
||||||
|
enriched.reserve(cities.size());
|
||||||
|
|
||||||
for (const auto& [city, country_name] : cities) {
|
std::vector<std::future<EnrichedCity>> pending;
|
||||||
|
pending.reserve(cities.size());
|
||||||
|
|
||||||
|
for (const auto& city : cities) {
|
||||||
|
pending.push_back(std::async(std::launch::async,
|
||||||
|
[web_client = webClient_, city]() {
|
||||||
|
WikipediaService wikipedia_service(
|
||||||
|
web_client);
|
||||||
const std::string region_context =
|
const std::string region_context =
|
||||||
wikipedia_service.GetSummary(city.name, country_name);
|
wikipedia_service.GetSummary(
|
||||||
spdlog::debug("[Pipeline] Region context for {}: {}", city.name,
|
city.city, city.country);
|
||||||
region_context);
|
spdlog::debug(
|
||||||
|
"[Pipeline] Region context for {}: {}",
|
||||||
|
city.city, region_context);
|
||||||
|
return EnrichedCity{city, region_context};
|
||||||
|
}));
|
||||||
|
}
|
||||||
|
|
||||||
enriched.push_back({city.id, city.name, country_name, region_context});
|
for (auto& task : pending) {
|
||||||
|
enriched.push_back(task.get());
|
||||||
}
|
}
|
||||||
|
|
||||||
return enriched;
|
return enriched;
|
||||||
@@ -121,28 +107,30 @@ void BiergartenDataGenerator::GenerateBreweries(
|
|||||||
generatedBreweries_.clear();
|
generatedBreweries_.clear();
|
||||||
|
|
||||||
for (const auto& enriched_city : cities) {
|
for (const auto& enriched_city : cities) {
|
||||||
auto brewery = generator.GenerateBrewery(enriched_city.city_name,
|
auto brewery = generator.GenerateBrewery(enriched_city.location.city,
|
||||||
enriched_city.country_name,
|
enriched_city.location.country,
|
||||||
enriched_city.region_context);
|
enriched_city.region_context);
|
||||||
generatedBreweries_.push_back(
|
generatedBreweries_.push_back({enriched_city.location, brewery});
|
||||||
{enriched_city.city_id, enriched_city.city_name, brewery});
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void BiergartenDataGenerator::LogResults() const {
|
void BiergartenDataGenerator::LogResults() const {
|
||||||
spdlog::info("\n=== GENERATED DATA DUMP ===");
|
spdlog::info("\n=== GENERATED DATA DUMP ===");
|
||||||
for (size_t i = 0; i < generatedBreweries_.size(); i++) {
|
size_t index = 1;
|
||||||
const auto& entry = generatedBreweries_[i];
|
for (const auto& entry : generatedBreweries_) {
|
||||||
spdlog::info("{}. city_id={} city=\"{}\"", i + 1, entry.city_id,
|
spdlog::info("{}. city=\"{}\" country=\"{}\" state=\"{}\" "
|
||||||
entry.city_name);
|
"iso3166_2={} lat={} lon={}",
|
||||||
|
index, entry.location.city, entry.location.country,
|
||||||
|
entry.location.state_province, entry.location.iso3166_2,
|
||||||
|
entry.location.latitude, entry.location.longitude);
|
||||||
spdlog::info(" brewery_name=\"{}\"", entry.brewery.name);
|
spdlog::info(" brewery_name=\"{}\"", entry.brewery.name);
|
||||||
spdlog::info(" brewery_description=\"{}\"", entry.brewery.description);
|
spdlog::info(" brewery_description=\"{}\"", entry.brewery.description);
|
||||||
|
++index;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
int BiergartenDataGenerator::Run() {
|
auto BiergartenDataGenerator::Run() -> int {
|
||||||
try {
|
try {
|
||||||
LoadGeographicData();
|
|
||||||
auto generator = InitializeGenerator();
|
auto generator = InitializeGenerator();
|
||||||
auto cities = QueryCitiesWithCountries();
|
auto cities = QueryCitiesWithCountries();
|
||||||
auto enriched = EnrichWithWikipedia(cities);
|
auto enriched = EnrichWithWikipedia(cities);
|
||||||
|
|||||||
@@ -1,44 +0,0 @@
|
|||||||
#include "data_generation/data_downloader.h"
|
|
||||||
|
|
||||||
#include <spdlog/spdlog.h>
|
|
||||||
|
|
||||||
#include <filesystem>
|
|
||||||
#include <fstream>
|
|
||||||
#include <sstream>
|
|
||||||
#include <stdexcept>
|
|
||||||
|
|
||||||
#include "web_client/web_client.h"
|
|
||||||
|
|
||||||
DataDownloader::DataDownloader(std::shared_ptr<WebClient> web_client)
|
|
||||||
: web_client_(std::move(web_client)) {}
|
|
||||||
|
|
||||||
DataDownloader::~DataDownloader() {}
|
|
||||||
|
|
||||||
bool DataDownloader::FileExists(const std::string& file_path) {
|
|
||||||
return std::filesystem::exists(file_path);
|
|
||||||
}
|
|
||||||
|
|
||||||
std::string DataDownloader::DownloadCountriesDatabase(
|
|
||||||
const std::string& cache_path, const std::string& commit) {
|
|
||||||
if (FileExists(cache_path)) {
|
|
||||||
spdlog::info("[DataDownloader] Cache hit: {}", cache_path);
|
|
||||||
return cache_path;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::string url =
|
|
||||||
"https://raw.githubusercontent.com/dr5hn/"
|
|
||||||
"countries-states-cities-database/" +
|
|
||||||
commit + "/json/countries+states+cities.json";
|
|
||||||
|
|
||||||
spdlog::info("[DataDownloader] Downloading: {}", url);
|
|
||||||
|
|
||||||
web_client_->DownloadToFile(url, cache_path);
|
|
||||||
|
|
||||||
std::ifstream file_check(cache_path, std::ios::binary | std::ios::ate);
|
|
||||||
std::streamsize size = file_check.tellg();
|
|
||||||
file_check.close();
|
|
||||||
|
|
||||||
spdlog::info("[DataDownloader] OK: Download complete: {} ({:.2f} MB)",
|
|
||||||
cache_path, (size / (1024.0 * 1024.0)));
|
|
||||||
return cache_path;
|
|
||||||
}
|
|
||||||
@@ -1,264 +0,0 @@
|
|||||||
#include "database/database.h"
|
|
||||||
|
|
||||||
#include <spdlog/spdlog.h>
|
|
||||||
|
|
||||||
#include <stdexcept>
|
|
||||||
|
|
||||||
void SqliteDatabase::InitializeSchema() {
|
|
||||||
std::lock_guard<std::mutex> lock(db_mutex_);
|
|
||||||
|
|
||||||
const char* schema = R"(
|
|
||||||
CREATE TABLE IF NOT EXISTS countries (
|
|
||||||
id INTEGER PRIMARY KEY,
|
|
||||||
name TEXT NOT NULL,
|
|
||||||
iso2 TEXT,
|
|
||||||
iso3 TEXT
|
|
||||||
);
|
|
||||||
|
|
||||||
CREATE TABLE IF NOT EXISTS states (
|
|
||||||
id INTEGER PRIMARY KEY,
|
|
||||||
country_id INTEGER NOT NULL,
|
|
||||||
name TEXT NOT NULL,
|
|
||||||
iso2 TEXT,
|
|
||||||
FOREIGN KEY(country_id) REFERENCES countries(id)
|
|
||||||
);
|
|
||||||
|
|
||||||
CREATE TABLE IF NOT EXISTS cities (
|
|
||||||
id INTEGER PRIMARY KEY,
|
|
||||||
state_id INTEGER NOT NULL,
|
|
||||||
country_id INTEGER NOT NULL,
|
|
||||||
name TEXT NOT NULL,
|
|
||||||
latitude REAL,
|
|
||||||
longitude REAL,
|
|
||||||
FOREIGN KEY(state_id) REFERENCES states(id),
|
|
||||||
FOREIGN KEY(country_id) REFERENCES countries(id)
|
|
||||||
);
|
|
||||||
)";
|
|
||||||
|
|
||||||
char* errMsg = nullptr;
|
|
||||||
int rc = sqlite3_exec(db_, schema, nullptr, nullptr, &errMsg);
|
|
||||||
if (rc != SQLITE_OK) {
|
|
||||||
std::string error = errMsg ? std::string(errMsg) : "Unknown error";
|
|
||||||
sqlite3_free(errMsg);
|
|
||||||
throw std::runtime_error("Failed to create schema: " + error);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
SqliteDatabase::~SqliteDatabase() {
|
|
||||||
if (db_) {
|
|
||||||
sqlite3_close(db_);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void SqliteDatabase::Initialize(const std::string& db_path) {
|
|
||||||
int rc = sqlite3_open(db_path.c_str(), &db_);
|
|
||||||
if (rc) {
|
|
||||||
throw std::runtime_error("Failed to open SQLite database: " + db_path);
|
|
||||||
}
|
|
||||||
spdlog::info("OK: SQLite database opened: {}", db_path);
|
|
||||||
InitializeSchema();
|
|
||||||
}
|
|
||||||
|
|
||||||
void SqliteDatabase::BeginTransaction() {
|
|
||||||
std::lock_guard<std::mutex> lock(db_mutex_);
|
|
||||||
char* err = nullptr;
|
|
||||||
if (sqlite3_exec(db_, "BEGIN TRANSACTION", nullptr, nullptr, &err) !=
|
|
||||||
SQLITE_OK) {
|
|
||||||
std::string msg = err ? err : "unknown";
|
|
||||||
sqlite3_free(err);
|
|
||||||
throw std::runtime_error("BeginTransaction failed: " + msg);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void SqliteDatabase::CommitTransaction() {
|
|
||||||
std::lock_guard<std::mutex> lock(db_mutex_);
|
|
||||||
char* err = nullptr;
|
|
||||||
if (sqlite3_exec(db_, "COMMIT", nullptr, nullptr, &err) != SQLITE_OK) {
|
|
||||||
std::string msg = err ? err : "unknown";
|
|
||||||
sqlite3_free(err);
|
|
||||||
throw std::runtime_error("CommitTransaction failed: " + msg);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void SqliteDatabase::RollbackTransaction() {
|
|
||||||
std::lock_guard<std::mutex> lock(db_mutex_);
|
|
||||||
char* err = nullptr;
|
|
||||||
if (sqlite3_exec(db_, "ROLLBACK", nullptr, nullptr, &err) != SQLITE_OK) {
|
|
||||||
std::string msg = err ? err : "unknown";
|
|
||||||
sqlite3_free(err);
|
|
||||||
throw std::runtime_error("RollbackTransaction failed: " + msg);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void SqliteDatabase::InsertCountry(int id, const std::string& name,
|
|
||||||
const std::string& iso2,
|
|
||||||
const std::string& iso3) {
|
|
||||||
std::lock_guard<std::mutex> lock(db_mutex_);
|
|
||||||
|
|
||||||
const char* query = R"(
|
|
||||||
INSERT OR IGNORE INTO countries (id, name, iso2, iso3)
|
|
||||||
VALUES (?, ?, ?, ?)
|
|
||||||
)";
|
|
||||||
|
|
||||||
sqlite3_stmt* stmt;
|
|
||||||
int rc = sqlite3_prepare_v2(db_, query, -1, &stmt, nullptr);
|
|
||||||
if (rc != SQLITE_OK)
|
|
||||||
throw std::runtime_error("Failed to prepare country insert");
|
|
||||||
|
|
||||||
sqlite3_bind_int(stmt, 1, id);
|
|
||||||
sqlite3_bind_text(stmt, 2, name.c_str(), -1, SQLITE_TRANSIENT);
|
|
||||||
sqlite3_bind_text(stmt, 3, iso2.c_str(), -1, SQLITE_TRANSIENT);
|
|
||||||
sqlite3_bind_text(stmt, 4, iso3.c_str(), -1, SQLITE_TRANSIENT);
|
|
||||||
|
|
||||||
if (sqlite3_step(stmt) != SQLITE_DONE) {
|
|
||||||
throw std::runtime_error("Failed to insert country");
|
|
||||||
}
|
|
||||||
sqlite3_finalize(stmt);
|
|
||||||
}
|
|
||||||
|
|
||||||
void SqliteDatabase::InsertState(int id, int country_id,
|
|
||||||
const std::string& name,
|
|
||||||
const std::string& iso2) {
|
|
||||||
std::lock_guard<std::mutex> lock(db_mutex_);
|
|
||||||
|
|
||||||
const char* query = R"(
|
|
||||||
INSERT OR IGNORE INTO states (id, country_id, name, iso2)
|
|
||||||
VALUES (?, ?, ?, ?)
|
|
||||||
)";
|
|
||||||
|
|
||||||
sqlite3_stmt* stmt;
|
|
||||||
int rc = sqlite3_prepare_v2(db_, query, -1, &stmt, nullptr);
|
|
||||||
if (rc != SQLITE_OK)
|
|
||||||
throw std::runtime_error("Failed to prepare state insert");
|
|
||||||
|
|
||||||
sqlite3_bind_int(stmt, 1, id);
|
|
||||||
sqlite3_bind_int(stmt, 2, country_id);
|
|
||||||
sqlite3_bind_text(stmt, 3, name.c_str(), -1, SQLITE_TRANSIENT);
|
|
||||||
sqlite3_bind_text(stmt, 4, iso2.c_str(), -1, SQLITE_TRANSIENT);
|
|
||||||
|
|
||||||
if (sqlite3_step(stmt) != SQLITE_DONE) {
|
|
||||||
throw std::runtime_error("Failed to insert state");
|
|
||||||
}
|
|
||||||
sqlite3_finalize(stmt);
|
|
||||||
}
|
|
||||||
|
|
||||||
void SqliteDatabase::InsertCity(int id, int state_id, int country_id,
|
|
||||||
const std::string& name, double latitude,
|
|
||||||
double longitude) {
|
|
||||||
std::lock_guard<std::mutex> lock(db_mutex_);
|
|
||||||
|
|
||||||
const char* query = R"(
|
|
||||||
INSERT OR IGNORE INTO cities (id, state_id, country_id, name, latitude, longitude)
|
|
||||||
VALUES (?, ?, ?, ?, ?, ?)
|
|
||||||
)";
|
|
||||||
|
|
||||||
sqlite3_stmt* stmt;
|
|
||||||
int rc = sqlite3_prepare_v2(db_, query, -1, &stmt, nullptr);
|
|
||||||
if (rc != SQLITE_OK)
|
|
||||||
throw std::runtime_error("Failed to prepare city insert");
|
|
||||||
|
|
||||||
sqlite3_bind_int(stmt, 1, id);
|
|
||||||
sqlite3_bind_int(stmt, 2, state_id);
|
|
||||||
sqlite3_bind_int(stmt, 3, country_id);
|
|
||||||
sqlite3_bind_text(stmt, 4, name.c_str(), -1, SQLITE_TRANSIENT);
|
|
||||||
sqlite3_bind_double(stmt, 5, latitude);
|
|
||||||
sqlite3_bind_double(stmt, 6, longitude);
|
|
||||||
|
|
||||||
if (sqlite3_step(stmt) != SQLITE_DONE) {
|
|
||||||
throw std::runtime_error("Failed to insert city");
|
|
||||||
}
|
|
||||||
sqlite3_finalize(stmt);
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<City> SqliteDatabase::QueryCities() {
|
|
||||||
std::lock_guard<std::mutex> lock(db_mutex_);
|
|
||||||
std::vector<City> cities;
|
|
||||||
sqlite3_stmt* stmt = nullptr;
|
|
||||||
|
|
||||||
const char* query =
|
|
||||||
"SELECT id, name, country_id FROM cities ORDER BY RANDOM()";
|
|
||||||
int rc = sqlite3_prepare_v2(db_, query, -1, &stmt, nullptr);
|
|
||||||
|
|
||||||
if (rc != SQLITE_OK) {
|
|
||||||
throw std::runtime_error("Failed to prepare query");
|
|
||||||
}
|
|
||||||
|
|
||||||
while (sqlite3_step(stmt) == SQLITE_ROW) {
|
|
||||||
int id = sqlite3_column_int(stmt, 0);
|
|
||||||
const char* name =
|
|
||||||
reinterpret_cast<const char*>(sqlite3_column_text(stmt, 1));
|
|
||||||
int country_id = sqlite3_column_int(stmt, 2);
|
|
||||||
cities.push_back({id, name ? std::string(name) : "", country_id});
|
|
||||||
}
|
|
||||||
|
|
||||||
sqlite3_finalize(stmt);
|
|
||||||
return cities;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<Country> SqliteDatabase::QueryCountries(int limit) {
|
|
||||||
std::lock_guard<std::mutex> lock(db_mutex_);
|
|
||||||
|
|
||||||
std::vector<Country> countries;
|
|
||||||
sqlite3_stmt* stmt = nullptr;
|
|
||||||
|
|
||||||
std::string query =
|
|
||||||
"SELECT id, name, iso2, iso3 FROM countries ORDER BY name";
|
|
||||||
if (limit > 0) {
|
|
||||||
query += " LIMIT " + std::to_string(limit);
|
|
||||||
}
|
|
||||||
|
|
||||||
int rc = sqlite3_prepare_v2(db_, query.c_str(), -1, &stmt, nullptr);
|
|
||||||
|
|
||||||
if (rc != SQLITE_OK) {
|
|
||||||
throw std::runtime_error("Failed to prepare countries query");
|
|
||||||
}
|
|
||||||
|
|
||||||
while (sqlite3_step(stmt) == SQLITE_ROW) {
|
|
||||||
int id = sqlite3_column_int(stmt, 0);
|
|
||||||
const char* name =
|
|
||||||
reinterpret_cast<const char*>(sqlite3_column_text(stmt, 1));
|
|
||||||
const char* iso2 =
|
|
||||||
reinterpret_cast<const char*>(sqlite3_column_text(stmt, 2));
|
|
||||||
const char* iso3 =
|
|
||||||
reinterpret_cast<const char*>(sqlite3_column_text(stmt, 3));
|
|
||||||
countries.push_back({id, name ? std::string(name) : "",
|
|
||||||
iso2 ? std::string(iso2) : "",
|
|
||||||
iso3 ? std::string(iso3) : ""});
|
|
||||||
}
|
|
||||||
|
|
||||||
sqlite3_finalize(stmt);
|
|
||||||
return countries;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::vector<State> SqliteDatabase::QueryStates(int limit) {
|
|
||||||
std::lock_guard<std::mutex> lock(db_mutex_);
|
|
||||||
|
|
||||||
std::vector<State> states;
|
|
||||||
sqlite3_stmt* stmt = nullptr;
|
|
||||||
|
|
||||||
std::string query =
|
|
||||||
"SELECT id, name, iso2, country_id FROM states ORDER BY name";
|
|
||||||
if (limit > 0) {
|
|
||||||
query += " LIMIT " + std::to_string(limit);
|
|
||||||
}
|
|
||||||
|
|
||||||
int rc = sqlite3_prepare_v2(db_, query.c_str(), -1, &stmt, nullptr);
|
|
||||||
|
|
||||||
if (rc != SQLITE_OK) {
|
|
||||||
throw std::runtime_error("Failed to prepare states query");
|
|
||||||
}
|
|
||||||
|
|
||||||
while (sqlite3_step(stmt) == SQLITE_ROW) {
|
|
||||||
int id = sqlite3_column_int(stmt, 0);
|
|
||||||
const char* name =
|
|
||||||
reinterpret_cast<const char*>(sqlite3_column_text(stmt, 1));
|
|
||||||
const char* iso2 =
|
|
||||||
reinterpret_cast<const char*>(sqlite3_column_text(stmt, 2));
|
|
||||||
int country_id = sqlite3_column_int(stmt, 3);
|
|
||||||
states.push_back({id, name ? std::string(name) : "",
|
|
||||||
iso2 ? std::string(iso2) : "", country_id});
|
|
||||||
}
|
|
||||||
|
|
||||||
sqlite3_finalize(stmt);
|
|
||||||
return states;
|
|
||||||
}
|
|
||||||
@@ -2,66 +2,82 @@
|
|||||||
|
|
||||||
#include <spdlog/spdlog.h>
|
#include <spdlog/spdlog.h>
|
||||||
|
|
||||||
#include <chrono>
|
#include <boost/json.hpp>
|
||||||
|
|
||||||
#include "json_handling/stream_parser.h"
|
#include <fstream>
|
||||||
|
#include <sstream>
|
||||||
|
#include <stdexcept>
|
||||||
|
|
||||||
void JsonLoader::LoadWorldCities(const std::string& json_path,
|
namespace {
|
||||||
SqliteDatabase& db) {
|
|
||||||
constexpr size_t kBatchSize = 10000;
|
|
||||||
|
|
||||||
auto startTime = std::chrono::high_resolution_clock::now();
|
auto ReadRequiredString(const boost::json::object& object,
|
||||||
spdlog::info("\nLoading {} (streaming Boost.JSON SAX)...", json_path);
|
const char* key) -> std::string {
|
||||||
|
const boost::json::value* value = object.if_contains(key);
|
||||||
db.BeginTransaction();
|
if (value == nullptr || !value->is_string()) {
|
||||||
bool transactionOpen = true;
|
throw std::runtime_error(std::string("Missing or invalid string field: ") +
|
||||||
|
key);
|
||||||
size_t citiesProcessed = 0;
|
|
||||||
try {
|
|
||||||
StreamingJsonParser::Parse(
|
|
||||||
json_path, db,
|
|
||||||
[&](const CityRecord& record) {
|
|
||||||
db.InsertCity(record.id, record.state_id, record.country_id,
|
|
||||||
record.name, record.latitude, record.longitude);
|
|
||||||
++citiesProcessed;
|
|
||||||
|
|
||||||
if (citiesProcessed % kBatchSize == 0) {
|
|
||||||
db.CommitTransaction();
|
|
||||||
db.BeginTransaction();
|
|
||||||
}
|
}
|
||||||
},
|
return std::string(value->as_string().c_str());
|
||||||
[&](size_t current, size_t /*total*/) {
|
|
||||||
if (current % kBatchSize == 0 && current > 0) {
|
|
||||||
spdlog::info(" [Progress] Parsed {} cities...", current);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
auto ReadRequiredNumber(const boost::json::object& object, const char* key)
|
||||||
|
-> double {
|
||||||
|
const boost::json::value* value = object.if_contains(key);
|
||||||
|
if (value == nullptr || !value->is_number()) {
|
||||||
|
throw std::runtime_error(std::string("Missing or invalid numeric field: ") +
|
||||||
|
key);
|
||||||
|
}
|
||||||
|
return value->to_number<double>();
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace
|
||||||
|
|
||||||
|
auto JsonLoader::LoadLocations(const std::string& filepath)
|
||||||
|
-> std::vector<Location> {
|
||||||
|
std::ifstream input(filepath);
|
||||||
|
if (!input.is_open()) {
|
||||||
|
throw std::runtime_error("Failed to open locations file: " + filepath);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::stringstream buffer;
|
||||||
|
buffer << input.rdbuf();
|
||||||
|
const std::string content = buffer.str();
|
||||||
|
|
||||||
|
boost::json::error_code error;
|
||||||
|
boost::json::value root = boost::json::parse(content, error);
|
||||||
|
if (error) {
|
||||||
|
throw std::runtime_error("Failed to parse locations JSON: " +
|
||||||
|
error.message());
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!root.is_array()) {
|
||||||
|
throw std::runtime_error(
|
||||||
|
"Invalid locations JSON: root element must be an array");
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<Location> locations;
|
||||||
|
const auto& items = root.as_array();
|
||||||
|
locations.reserve(items.size());
|
||||||
|
|
||||||
|
for (const auto& item : items) {
|
||||||
|
if (!item.is_object()) {
|
||||||
|
throw std::runtime_error(
|
||||||
|
"Invalid locations JSON: each entry must be an object");
|
||||||
|
}
|
||||||
|
|
||||||
|
const auto& object = item.as_object();
|
||||||
|
locations.push_back(Location{
|
||||||
|
.city = ReadRequiredString(object, "city"),
|
||||||
|
.state_province = ReadRequiredString(object, "state_province"),
|
||||||
|
.iso3166_2 = ReadRequiredString(object, "iso3166_2"),
|
||||||
|
.country = ReadRequiredString(object, "country"),
|
||||||
|
.iso3166_1 = ReadRequiredString(object, "iso3166_1"),
|
||||||
|
.latitude = ReadRequiredNumber(object, "latitude"),
|
||||||
|
.longitude = ReadRequiredNumber(object, "longitude"),
|
||||||
});
|
});
|
||||||
|
|
||||||
spdlog::info(" OK: Parsed all cities from JSON");
|
|
||||||
|
|
||||||
if (transactionOpen) {
|
|
||||||
db.CommitTransaction();
|
|
||||||
transactionOpen = false;
|
|
||||||
}
|
|
||||||
} catch (...) {
|
|
||||||
if (transactionOpen) {
|
|
||||||
db.RollbackTransaction();
|
|
||||||
transactionOpen = false;
|
|
||||||
}
|
|
||||||
throw;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
auto endTime = std::chrono::high_resolution_clock::now();
|
spdlog::info("[JsonLoader] Loaded {} locations from {}", locations.size(),
|
||||||
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(
|
filepath);
|
||||||
endTime - startTime);
|
return locations;
|
||||||
|
|
||||||
spdlog::info("\n=== World City Data Loading Summary ===\n");
|
|
||||||
spdlog::info("Cities inserted: {}", citiesProcessed);
|
|
||||||
spdlog::info("Elapsed time: {} ms", duration.count());
|
|
||||||
long long throughput =
|
|
||||||
(citiesProcessed > 0 && duration.count() > 0)
|
|
||||||
? (1000LL * static_cast<long long>(citiesProcessed)) /
|
|
||||||
static_cast<long long>(duration.count())
|
|
||||||
: 0LL;
|
|
||||||
spdlog::info("Throughput: {} cities/sec", throughput);
|
|
||||||
spdlog::info("=======================================\n");
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,289 +0,0 @@
|
|||||||
#include "json_handling/stream_parser.h"
|
|
||||||
|
|
||||||
#include <spdlog/spdlog.h>
|
|
||||||
|
|
||||||
#include <boost/json.hpp>
|
|
||||||
#include <boost/json/basic_parser_impl.hpp>
|
|
||||||
#include <cstdio>
|
|
||||||
#include <stdexcept>
|
|
||||||
|
|
||||||
#include "database/database.h"
|
|
||||||
|
|
||||||
class CityRecordHandler {
|
|
||||||
friend class boost::json::basic_parser<CityRecordHandler>;
|
|
||||||
|
|
||||||
public:
|
|
||||||
static constexpr std::size_t max_array_size = static_cast<std::size_t>(-1);
|
|
||||||
static constexpr std::size_t max_object_size = static_cast<std::size_t>(-1);
|
|
||||||
static constexpr std::size_t max_string_size = static_cast<std::size_t>(-1);
|
|
||||||
static constexpr std::size_t max_key_size = static_cast<std::size_t>(-1);
|
|
||||||
|
|
||||||
struct ParseContext {
|
|
||||||
SqliteDatabase* db = nullptr;
|
|
||||||
std::function<void(const CityRecord&)> on_city;
|
|
||||||
std::function<void(size_t, size_t)> on_progress;
|
|
||||||
size_t cities_emitted = 0;
|
|
||||||
size_t total_file_size = 0;
|
|
||||||
int countries_inserted = 0;
|
|
||||||
int states_inserted = 0;
|
|
||||||
};
|
|
||||||
|
|
||||||
explicit CityRecordHandler(ParseContext& ctx) : context(ctx) {}
|
|
||||||
|
|
||||||
private:
|
|
||||||
ParseContext& context;
|
|
||||||
|
|
||||||
int depth = 0;
|
|
||||||
bool in_countries_array = false;
|
|
||||||
bool in_country_object = false;
|
|
||||||
bool in_states_array = false;
|
|
||||||
bool in_state_object = false;
|
|
||||||
bool in_cities_array = false;
|
|
||||||
bool building_city = false;
|
|
||||||
|
|
||||||
int current_country_id = 0;
|
|
||||||
int current_state_id = 0;
|
|
||||||
CityRecord current_city = {};
|
|
||||||
std::string current_key;
|
|
||||||
std::string current_key_val;
|
|
||||||
std::string current_string_val;
|
|
||||||
|
|
||||||
std::string country_info[3];
|
|
||||||
std::string state_info[2];
|
|
||||||
|
|
||||||
// Boost.JSON SAX Hooks
|
|
||||||
bool on_document_begin(boost::system::error_code&) { return true; }
|
|
||||||
bool on_document_end(boost::system::error_code&) { return true; }
|
|
||||||
|
|
||||||
bool on_array_begin(boost::system::error_code&) {
|
|
||||||
depth++;
|
|
||||||
if (depth == 1) {
|
|
||||||
in_countries_array = true;
|
|
||||||
} else if (depth == 3 && current_key == "states") {
|
|
||||||
in_states_array = true;
|
|
||||||
} else if (depth == 5 && current_key == "cities") {
|
|
||||||
in_cities_array = true;
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool on_array_end(std::size_t, boost::system::error_code&) {
|
|
||||||
if (depth == 1) {
|
|
||||||
in_countries_array = false;
|
|
||||||
} else if (depth == 3) {
|
|
||||||
in_states_array = false;
|
|
||||||
} else if (depth == 5) {
|
|
||||||
in_cities_array = false;
|
|
||||||
}
|
|
||||||
depth--;
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool on_object_begin(boost::system::error_code&) {
|
|
||||||
depth++;
|
|
||||||
if (depth == 2 && in_countries_array) {
|
|
||||||
in_country_object = true;
|
|
||||||
current_country_id = 0;
|
|
||||||
country_info[0].clear();
|
|
||||||
country_info[1].clear();
|
|
||||||
country_info[2].clear();
|
|
||||||
} else if (depth == 4 && in_states_array) {
|
|
||||||
in_state_object = true;
|
|
||||||
current_state_id = 0;
|
|
||||||
state_info[0].clear();
|
|
||||||
state_info[1].clear();
|
|
||||||
} else if (depth == 6 && in_cities_array) {
|
|
||||||
building_city = true;
|
|
||||||
current_city = {};
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool on_object_end(std::size_t, boost::system::error_code&) {
|
|
||||||
if (depth == 6 && building_city) {
|
|
||||||
if (current_city.id > 0 && current_state_id > 0 &&
|
|
||||||
current_country_id > 0) {
|
|
||||||
current_city.state_id = current_state_id;
|
|
||||||
current_city.country_id = current_country_id;
|
|
||||||
|
|
||||||
try {
|
|
||||||
context.on_city(current_city);
|
|
||||||
context.cities_emitted++;
|
|
||||||
|
|
||||||
if (context.on_progress && context.cities_emitted % 10000 == 0) {
|
|
||||||
context.on_progress(context.cities_emitted,
|
|
||||||
context.total_file_size);
|
|
||||||
}
|
|
||||||
} catch (const std::exception& e) {
|
|
||||||
spdlog::warn("Record parsing failed: {}", e.what());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
building_city = false;
|
|
||||||
} else if (depth == 4 && in_state_object) {
|
|
||||||
if (current_state_id > 0 && current_country_id > 0) {
|
|
||||||
try {
|
|
||||||
context.db->InsertState(current_state_id, current_country_id,
|
|
||||||
state_info[0], state_info[1]);
|
|
||||||
context.states_inserted++;
|
|
||||||
} catch (const std::exception& e) {
|
|
||||||
spdlog::warn("Record parsing failed: {}", e.what());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
in_state_object = false;
|
|
||||||
} else if (depth == 2 && in_country_object) {
|
|
||||||
if (current_country_id > 0) {
|
|
||||||
try {
|
|
||||||
context.db->InsertCountry(current_country_id, country_info[0],
|
|
||||||
country_info[1], country_info[2]);
|
|
||||||
context.countries_inserted++;
|
|
||||||
} catch (const std::exception& e) {
|
|
||||||
spdlog::warn("Record parsing failed: {}", e.what());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
in_country_object = false;
|
|
||||||
}
|
|
||||||
|
|
||||||
depth--;
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool on_key_part(boost::json::string_view s, std::size_t,
|
|
||||||
boost::system::error_code&) {
|
|
||||||
current_key_val.append(s.data(), s.size());
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool on_key(boost::json::string_view s, std::size_t,
|
|
||||||
boost::system::error_code&) {
|
|
||||||
current_key_val.append(s.data(), s.size());
|
|
||||||
current_key = current_key_val;
|
|
||||||
current_key_val.clear();
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool on_string_part(boost::json::string_view s, std::size_t,
|
|
||||||
boost::system::error_code&) {
|
|
||||||
current_string_val.append(s.data(), s.size());
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool on_string(boost::json::string_view s, std::size_t,
|
|
||||||
boost::system::error_code&) {
|
|
||||||
current_string_val.append(s.data(), s.size());
|
|
||||||
|
|
||||||
if (building_city && current_key == "name") {
|
|
||||||
current_city.name = current_string_val;
|
|
||||||
} else if (in_state_object && current_key == "name") {
|
|
||||||
state_info[0] = current_string_val;
|
|
||||||
} else if (in_state_object && current_key == "iso2") {
|
|
||||||
state_info[1] = current_string_val;
|
|
||||||
} else if (in_country_object && current_key == "name") {
|
|
||||||
country_info[0] = current_string_val;
|
|
||||||
} else if (in_country_object && current_key == "iso2") {
|
|
||||||
country_info[1] = current_string_val;
|
|
||||||
} else if (in_country_object && current_key == "iso3") {
|
|
||||||
country_info[2] = current_string_val;
|
|
||||||
}
|
|
||||||
|
|
||||||
current_string_val.clear();
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool on_number_part(boost::json::string_view, boost::system::error_code&) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool on_int64(int64_t i, boost::json::string_view,
|
|
||||||
boost::system::error_code&) {
|
|
||||||
if (building_city && current_key == "id") {
|
|
||||||
current_city.id = static_cast<int>(i);
|
|
||||||
} else if (in_state_object && current_key == "id") {
|
|
||||||
current_state_id = static_cast<int>(i);
|
|
||||||
} else if (in_country_object && current_key == "id") {
|
|
||||||
current_country_id = static_cast<int>(i);
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool on_uint64(uint64_t u, boost::json::string_view,
|
|
||||||
boost::system::error_code& ec) {
|
|
||||||
return on_int64(static_cast<int64_t>(u), "", ec);
|
|
||||||
}
|
|
||||||
|
|
||||||
bool on_double(double d, boost::json::string_view,
|
|
||||||
boost::system::error_code&) {
|
|
||||||
if (building_city) {
|
|
||||||
if (current_key == "latitude") {
|
|
||||||
current_city.latitude = d;
|
|
||||||
} else if (current_key == "longitude") {
|
|
||||||
current_city.longitude = d;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool on_bool(bool, boost::system::error_code&) { return true; }
|
|
||||||
bool on_null(boost::system::error_code&) { return true; }
|
|
||||||
bool on_comment_part(boost::json::string_view, boost::system::error_code&) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
bool on_comment(boost::json::string_view, boost::system::error_code&) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
void StreamingJsonParser::Parse(
|
|
||||||
const std::string& file_path, SqliteDatabase& db,
|
|
||||||
std::function<void(const CityRecord&)> on_city,
|
|
||||||
std::function<void(size_t, size_t)> on_progress) {
|
|
||||||
spdlog::info(" Streaming parse of {} (Boost.JSON)...", file_path);
|
|
||||||
|
|
||||||
FILE* file = std::fopen(file_path.c_str(), "rb");
|
|
||||||
if (!file) {
|
|
||||||
throw std::runtime_error("Failed to open JSON file: " + file_path);
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t total_size = 0;
|
|
||||||
if (std::fseek(file, 0, SEEK_END) == 0) {
|
|
||||||
long file_size = std::ftell(file);
|
|
||||||
if (file_size > 0) {
|
|
||||||
total_size = static_cast<size_t>(file_size);
|
|
||||||
}
|
|
||||||
std::rewind(file);
|
|
||||||
}
|
|
||||||
|
|
||||||
CityRecordHandler::ParseContext ctx{&db, on_city, on_progress, 0, total_size,
|
|
||||||
0, 0};
|
|
||||||
boost::json::basic_parser<CityRecordHandler> parser(
|
|
||||||
boost::json::parse_options{}, ctx);
|
|
||||||
|
|
||||||
char buf[65536];
|
|
||||||
size_t bytes_read;
|
|
||||||
boost::system::error_code ec;
|
|
||||||
|
|
||||||
while ((bytes_read = std::fread(buf, 1, sizeof(buf), file)) > 0) {
|
|
||||||
char const* p = buf;
|
|
||||||
std::size_t remain = bytes_read;
|
|
||||||
|
|
||||||
while (remain > 0) {
|
|
||||||
std::size_t consumed = parser.write_some(true, p, remain, ec);
|
|
||||||
if (ec) {
|
|
||||||
std::fclose(file);
|
|
||||||
throw std::runtime_error("JSON parse error: " + ec.message());
|
|
||||||
}
|
|
||||||
p += consumed;
|
|
||||||
remain -= consumed;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
parser.write_some(false, nullptr, 0, ec); // Signal EOF
|
|
||||||
std::fclose(file);
|
|
||||||
|
|
||||||
if (ec) {
|
|
||||||
throw std::runtime_error("JSON parse error at EOF: " + ec.message());
|
|
||||||
}
|
|
||||||
|
|
||||||
spdlog::info(" OK: Parsed {} countries, {} states, {} cities",
|
|
||||||
ctx.countries_inserted, ctx.states_inserted,
|
|
||||||
ctx.cities_emitted);
|
|
||||||
}
|
|
||||||
@@ -5,7 +5,6 @@
|
|||||||
#include <memory>
|
#include <memory>
|
||||||
|
|
||||||
#include "biergarten_data_generator.h"
|
#include "biergarten_data_generator.h"
|
||||||
#include "database/database.h"
|
|
||||||
#include "web_client/curl_web_client.h"
|
#include "web_client/curl_web_client.h"
|
||||||
|
|
||||||
namespace po = boost::program_options;
|
namespace po = boost::program_options;
|
||||||
@@ -122,9 +121,8 @@ int main(int argc, char* argv[]) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
auto webClient = std::make_shared<CURLWebClient>();
|
auto webClient = std::make_shared<CURLWebClient>();
|
||||||
SqliteDatabase database;
|
|
||||||
|
|
||||||
BiergartenDataGenerator generator(options, webClient, database);
|
BiergartenDataGenerator generator(options, webClient);
|
||||||
return generator.Run();
|
return generator.Run();
|
||||||
|
|
||||||
} catch (const std::exception& e) {
|
} catch (const std::exception& e) {
|
||||||
|
|||||||
Reference in New Issue
Block a user