diff --git a/pipeline/.gitignore b/pipeline/.gitignore index 2c120f6..931567f 100644 --- a/pipeline/.gitignore +++ b/pipeline/.gitignore @@ -1,3 +1,5 @@ dist build data +models +*.gguf diff --git a/pipeline/CMakeLists.txt b/pipeline/CMakeLists.txt index 5c2aeec..1f612ee 100644 --- a/pipeline/CMakeLists.txt +++ b/pipeline/CMakeLists.txt @@ -1,170 +1,104 @@ -cmake_minimum_required(VERSION 3.20) -project(biergarten-pipeline VERSION 0.1.0 LANGUAGES CXX) - -# Allows older dependencies to configure on newer CMake. -set(CMAKE_POLICY_VERSION_MINIMUM 3.5) - -# Policies -cmake_policy(SET CMP0167 NEW) # FindBoost improvements - -# Global Settings -set(CMAKE_CXX_STANDARD 23) -set(CMAKE_CXX_STANDARD_REQUIRED ON) -set(CMAKE_CXX_EXTENSIONS OFF) -set(CMAKE_EXPORT_COMPILE_COMMANDS ON) - -option(ENABLE_CLANG_TIDY "Enable clang-tidy static analysis for project targets" ON) -option(ENABLE_CLANG_FORMAT_TARGETS "Enable clang-format helper targets" ON) - -if(ENABLE_CLANG_TIDY) - find_program(CLANG_TIDY_EXE NAMES clang-tidy) - if(CLANG_TIDY_EXE) - set(BIERGARTEN_CLANG_TIDY_COMMAND - "${CLANG_TIDY_EXE};--config-file=${CMAKE_CURRENT_SOURCE_DIR}/.clang-tidy") - message(STATUS "clang-tidy enabled: ${CLANG_TIDY_EXE}") +cmake_minimum_required(VERSION 3.24) +project(biergarten-pipeline) +# ============================================================================= +# 1. GPU Detection +# ============================================================================= +# GGML_CUDA / GGML_METAL are set here so that the llama.cpp FetchContent below +# inherits them as cache variables before its CMakeLists.txt is processed. 
+if(APPLE) + message(STATUS "[biergarten] Apple Silicon detected — enabling Metal acceleration.") + set(GGML_METAL ON CACHE BOOL "Enable Metal for Apple Silicon" FORCE) +elseif(UNIX AND NOT APPLE) + find_package(CUDAToolkit QUIET) + if(CUDAToolkit_FOUND) + message(STATUS "[biergarten] NVIDIA GPU detected — enabling CUDA acceleration.") + set(GGML_CUDA ON CACHE BOOL "Enable CUDA for NVIDIA GPUs" FORCE) + # 'native' resolves to the exact SM version of the present GPU at configure time + # (e.g. sm_89 for RTX 2000 Ada). Change to a concrete arch list for cross-compilation. + set(CMAKE_CUDA_ARCHITECTURES native) else() - message(STATUS "clang-tidy not found; static analysis is disabled") + message(STATUS "[biergarten] No NVIDIA GPU found — falling back to CPU.") endif() endif() - -# ----------------------------------------------------------------------------- -# Compiler Options & Warnings (Interface Library) -# ----------------------------------------------------------------------------- -add_library(project_options INTERFACE) -target_compile_options(project_options INTERFACE - $<$: - -Wall -Wextra -Wpedantic -Wshadow -Wconversion -Wsign-conversion -Wunused - > - $<$: - /W4 /WX /permissive- - > -) - -# ----------------------------------------------------------------------------- -# Dependencies -# ----------------------------------------------------------------------------- -find_package(CURL REQUIRED) -find_package(SQLite3 REQUIRED) -find_package(Boost 1.75 REQUIRED COMPONENTS program_options json) - +# ============================================================================= +# 2. Project-wide Settings +# ============================================================================= +set(CMAKE_CXX_STANDARD 23) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) +# ============================================================================= +# 3. 
Dependencies +# ============================================================================= include(FetchContent) - -# spdlog (Logging) +# --- libcurl ------------------------------------------------------------------ +# Require the system package. Without it the build would only fail later, at +# link time, so emit a fatal error at configure time instead of a silent gap. +find_package(CURL QUIET) +if(NOT CURL_FOUND) + message(FATAL_ERROR + "[biergarten] libcurl not found. Install it via your package manager " + "(e.g. 'sudo dnf install libcurl-devel') or set CURL_ROOT.") +endif() +# --- llama.cpp ---------------------------------------------------------------- +# Pinned to a specific release tag (see GIT_TAG below) for reproducible builds. +# To update: pick a newer release tag or commit SHA from https://github.com/ggml-org/llama.cpp +FetchContent_Declare( + llama-cpp + GIT_REPOSITORY https://github.com/ggml-org/llama.cpp.git + GIT_TAG b8611 +) +FetchContent_MakeAvailable(llama-cpp) +# --- Boost (JSON + program_options) ------------------------------------------ +FetchContent_Declare( + boost + URL https://github.com/boostorg/boost/releases/download/boost-1.85.0/boost-1.85.0-cmake.tar.gz +) +FetchContent_MakeAvailable(boost) +# --- spdlog ------------------------------------------------------------------- FetchContent_Declare( spdlog GIT_REPOSITORY https://github.com/gabime/spdlog.git - GIT_TAG v1.11.0 + GIT_TAG v1.15.3 ) FetchContent_MakeAvailable(spdlog) - -# llama.cpp (LLM Inference) -set(LLAMA_BUILD_TESTS OFF CACHE BOOL "" FORCE) -set(LLAMA_BUILD_EXAMPLES OFF CACHE BOOL "" FORCE) -set(LLAMA_BUILD_SERVER OFF CACHE BOOL "" FORCE) -FetchContent_Declare( - llama_cpp - GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git - GIT_TAG b8611 -) -FetchContent_MakeAvailable(llama_cpp) - -if(TARGET llama) - target_compile_options(llama PRIVATE - $<$:-include algorithm> - ) -endif() - -# ----------------------------------------------------------------------------- -# Main Executable -# 
----------------------------------------------------------------------------- -set(PIPELINE_SOURCES +# ============================================================================= +# 4. Sources +# ============================================================================= +set(SOURCES + src/main.cpp src/biergarten_data_generator.cpp - src/web_client/curl_web_client.cpp - src/data_generation/data_downloader.cpp - src/database/database.cpp - src/json_handling/json_loader.cpp src/data_generation/llama/destructor.cpp - src/data_generation/llama/set_sampling_options.cpp - src/data_generation/llama/load.cpp - src/data_generation/llama/infer.cpp src/data_generation/llama/generate_brewery.cpp src/data_generation/llama/generate_user.cpp src/data_generation/llama/helpers.cpp + src/data_generation/llama/infer.cpp + src/data_generation/llama/load.cpp src/data_generation/llama/load_brewery_prompt.cpp + src/data_generation/llama/set_sampling_options.cpp src/data_generation/mock/data.cpp src/data_generation/mock/deterministic_hash.cpp - src/data_generation/mock/load.cpp src/data_generation/mock/generate_brewery.cpp src/data_generation/mock/generate_user.cpp - src/json_handling/stream_parser.cpp + src/data_generation/mock/load.cpp + src/json_handling/json_loader.cpp + src/web_client/curl_web_client.cpp src/wikipedia/wikipedia_service.cpp - src/main.cpp ) - -add_executable(biergarten-pipeline ${PIPELINE_SOURCES}) - -if(BIERGARTEN_CLANG_TIDY_COMMAND) - set_target_properties(biergarten-pipeline PROPERTIES - CXX_CLANG_TIDY "${BIERGARTEN_CLANG_TIDY_COMMAND}" - ) -endif() - -target_include_directories(biergarten-pipeline - PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR}/includes - ${llama_cpp_SOURCE_DIR}/include +# ============================================================================= +# 5. 
Target +# ============================================================================= +add_executable(${PROJECT_NAME} + ${SOURCES} ) - -target_link_libraries(biergarten-pipeline - PRIVATE - project_options - CURL::libcurl - SQLite::SQLite3 - spdlog::spdlog - llama - Boost::program_options - Boost::json +target_include_directories(${PROJECT_NAME} PRIVATE + includes + ${llama-cpp_SOURCE_DIR}/include + ${llama-cpp_SOURCE_DIR}/common ) - -if(ENABLE_CLANG_FORMAT_TARGETS) - find_program(CLANG_FORMAT_EXE NAMES clang-format) - if(CLANG_FORMAT_EXE) - file(GLOB_RECURSE FORMAT_SOURCES CONFIGURE_DEPENDS - ${CMAKE_CURRENT_SOURCE_DIR}/src/**/*.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/**/*.cc - ${CMAKE_CURRENT_SOURCE_DIR}/includes/**/*.h - ${CMAKE_CURRENT_SOURCE_DIR}/includes/**/*.hpp - ) - - add_custom_target(format - COMMAND ${CLANG_FORMAT_EXE} -style=file -i ${FORMAT_SOURCES} - COMMENT "Formatting source files with clang-format (Google style)" - VERBATIM - ) - - add_custom_target(format-check - COMMAND ${CLANG_FORMAT_EXE} -style=file --dry-run --Werror ${FORMAT_SOURCES} - COMMENT "Checking source formatting with clang-format (Google style)" - VERBATIM - ) - else() - message(STATUS "clang-format not found; format targets are disabled") - endif() -endif() - -# ----------------------------------------------------------------------------- -# Post-Build Steps & Utilities -# ----------------------------------------------------------------------------- -add_custom_command(TARGET biergarten-pipeline POST_BUILD - COMMAND ${CMAKE_COMMAND} -E make_directory ${CMAKE_CURRENT_SOURCE_DIR}/output - COMMENT "Ensuring output directory exists" +target_link_libraries(${PROJECT_NAME} PRIVATE + llama + boost_json + boost_program_options + spdlog::spdlog + CURL::libcurl ) - -find_program(VALGRIND valgrind) -if(VALGRIND) - add_custom_target(memcheck - COMMAND ${VALGRIND} --leak-check=full --error-exitcode=1 $ --help - DEPENDS biergarten-pipeline - COMMENT "Running Valgrind memory check" - ) -endif() 
diff --git a/pipeline/includes/biergarten_data_generator.h b/pipeline/includes/biergarten_data_generator.h index 1fbe00e..8b8d259 100644 --- a/pipeline/includes/biergarten_data_generator.h +++ b/pipeline/includes/biergarten_data_generator.h @@ -3,11 +3,10 @@ #include #include -#include #include #include "data_generation/data_generator.h" -#include "database/database.h" +#include "models/location.h" #include "web_client/web_client.h" #include "wikipedia/wikipedia_service.h" @@ -49,8 +48,7 @@ struct ApplicationOptions { * @brief Main data generator class for the Biergarten pipeline. * * This class encapsulates the core logic for generating brewery data. - * It handles database initialization, data loading/downloading, and brewery - * generation. + * It handles location loading, city enrichment, and brewery generation. */ class BiergartenDataGenerator { public: @@ -59,20 +57,17 @@ class BiergartenDataGenerator { * * @param options Application configuration options. * @param web_client HTTP client for downloading data. - * @param database SQLite database instance. */ BiergartenDataGenerator(const ApplicationOptions& options, - std::shared_ptr web_client, - SqliteDatabase& database); + std::shared_ptr web_client); /** * @brief Run the data generation pipeline. * * Performs the following steps: - * 1. Initialize database - * 2. Download geographic data if needed - * 3. Initialize the generator (LLM or Mock) - * 4. Generate brewery data for sample cities + * 1. Load curated locations from JSON + * 2. Initialize the generator (LLM or Mock) + * 3. Generate brewery data for sampled cities * * @return 0 on success, 1 on failure. */ @@ -85,16 +80,11 @@ class BiergartenDataGenerator { /// @brief Shared HTTP client dependency. std::shared_ptr webClient_; - /// @brief Database dependency. - SqliteDatabase& database_; - /** * @brief Enriched city data with Wikipedia context. 
*/ struct EnrichedCity { - int city_id; - std::string city_name; - std::string country_name; + Location location; std::string region_context; }; @@ -108,25 +98,20 @@ class BiergartenDataGenerator { std::unique_ptr InitializeGenerator(); /** - * @brief Download and load geographic data if not cached. - */ - void LoadGeographicData(); - - /** - * @brief Query cities from database and build country name map. + * @brief Load locations from JSON and sample cities. * - * @return Vector of (City, country_name) pairs capped at 30 entries. + * @return Vector of sampled locations capped at 30 entries. */ - std::vector> QueryCitiesWithCountries(); + std::vector QueryCitiesWithCountries(); /** * @brief Enrich cities with Wikipedia summaries. * - * @param cities Vector of (City, country_name) pairs. + * @param cities Vector of sampled locations. * @return Vector of enriched city data with context. */ std::vector EnrichWithWikipedia( - const std::vector>& cities); + const std::vector& cities); /** * @brief Generate breweries for enriched cities. @@ -146,8 +131,7 @@ class BiergartenDataGenerator { * @brief Helper struct to store generated brewery data. */ struct GeneratedBrewery { - int city_id; - std::string city_name; + Location location; BreweryResult brewery; }; diff --git a/pipeline/includes/data_generation/data_downloader.h b/pipeline/includes/data_generation/data_downloader.h deleted file mode 100644 index cf2de92..0000000 --- a/pipeline/includes/data_generation/data_downloader.h +++ /dev/null @@ -1,31 +0,0 @@ -#ifndef BIERGARTEN_PIPELINE_DATA_GENERATION_DATA_DOWNLOADER_H_ -#define BIERGARTEN_PIPELINE_DATA_GENERATION_DATA_DOWNLOADER_H_ - -#include -#include -#include - -#include "web_client/web_client.h" - -/// @brief Downloads and caches source geography JSON payloads. -class DataDownloader { - public: - /// @brief Initializes global curl state used by this downloader. - explicit DataDownloader(std::shared_ptr web_client); - - /// @brief Cleans up global curl state. 
- ~DataDownloader(); - - /// @brief Returns a local JSON path, downloading it when cache is missing. - std::string DownloadCountriesDatabase( - const std::string& cache_path, - const std::string& commit = - "c5eb7772" // Stable commit: 2026-03-28 export - ); - - private: - static bool FileExists(const std::string& file_path); - std::shared_ptr web_client_; -}; - -#endif // BIERGARTEN_PIPELINE_DATA_GENERATION_DATA_DOWNLOADER_H_ diff --git a/pipeline/includes/database/database.h b/pipeline/includes/database/database.h deleted file mode 100644 index 01b2f23..0000000 --- a/pipeline/includes/database/database.h +++ /dev/null @@ -1,87 +0,0 @@ -#ifndef BIERGARTEN_PIPELINE_DATABASE_DATABASE_H_ -#define BIERGARTEN_PIPELINE_DATABASE_DATABASE_H_ - -#include - -#include -#include -#include - -struct Country { - /// @brief Country identifier from the source dataset. - int id; - /// @brief Country display name. - std::string name; - /// @brief ISO 3166-1 alpha-2 code. - std::string iso2; - /// @brief ISO 3166-1 alpha-3 code. - std::string iso3; -}; - -struct State { - /// @brief State or province identifier from the source dataset. - int id; - /// @brief State or province display name. - std::string name; - /// @brief State or province short code. - std::string iso2; - /// @brief Parent country identifier. - int country_id; -}; - -struct City { - /// @brief City identifier from the source dataset. - int id; - /// @brief City display name. - std::string name; - /// @brief Parent country identifier. - int country_id; -}; - -/// @brief Thread-safe SQLite wrapper for pipeline writes and readbacks. -class SqliteDatabase { - private: - sqlite3* db_ = nullptr; - std::mutex db_mutex_; - - void InitializeSchema(); - - public: - /// @brief Closes the SQLite connection if initialized. - ~SqliteDatabase(); - - /// @brief Opens the SQLite database at db_path and creates schema objects. 
- void Initialize(const std::string& db_path = ":memory:"); - - /// @brief Starts a database transaction for batched writes. - void BeginTransaction(); - - /// @brief Commits the active database transaction. - void CommitTransaction(); - - /// @brief Rolls back the active database transaction. - void RollbackTransaction(); - - /// @brief Inserts a country row. - void InsertCountry(int id, const std::string& name, const std::string& iso2, - const std::string& iso3); - - /// @brief Inserts a state row linked to a country. - void InsertState(int id, int country_id, const std::string& name, - const std::string& iso2); - - /// @brief Inserts a city row linked to state and country. - void InsertCity(int id, int state_id, int country_id, - const std::string& name, double latitude, double longitude); - - /// @brief Returns city records including parent country id. - std::vector QueryCities(); - - /// @brief Returns countries with optional row limit. - std::vector QueryCountries(int limit = 0); - - /// @brief Returns states with optional row limit. - std::vector QueryStates(int limit = 0); -}; - -#endif // BIERGARTEN_PIPELINE_DATABASE_DATABASE_H_ diff --git a/pipeline/includes/json_handling/json_loader.h b/pipeline/includes/json_handling/json_loader.h index d6fca00..4f2ec7d 100644 --- a/pipeline/includes/json_handling/json_loader.h +++ b/pipeline/includes/json_handling/json_loader.h @@ -2,16 +2,15 @@ #define BIERGARTEN_PIPELINE_JSON_HANDLING_JSON_LOADER_H_ #include +#include -#include "database/database.h" -#include "json_handling/stream_parser.h" +#include "models/location.h" -/// @brief Loads world-city JSON data into SQLite through streaming parsing. +/// @brief Loads curated world locations from a JSON file into memory. class JsonLoader { public: - /// @brief Parses a JSON file and writes country/state/city rows into db. 
- static void LoadWorldCities(const std::string& json_path, - SqliteDatabase& db); + /// @brief Parses a JSON array file and returns all location records. + static std::vector LoadLocations(const std::string& filepath); }; #endif // BIERGARTEN_PIPELINE_JSON_HANDLING_JSON_LOADER_H_ diff --git a/pipeline/includes/json_handling/stream_parser.h b/pipeline/includes/json_handling/stream_parser.h deleted file mode 100644 index f712702..0000000 --- a/pipeline/includes/json_handling/stream_parser.h +++ /dev/null @@ -1,52 +0,0 @@ -#ifndef BIERGARTEN_PIPELINE_JSON_HANDLING_STREAM_PARSER_H_ -#define BIERGARTEN_PIPELINE_JSON_HANDLING_STREAM_PARSER_H_ - -#include -#include - -#include "database/database.h" - -// Forward declaration to avoid circular dependency -class SqliteDatabase; - -/// @brief In-memory representation of one parsed city entry. -struct CityRecord { - int id; - int state_id; - int country_id; - std::string name; - double latitude; - double longitude; -}; - -/// @brief Streaming SAX parser that emits city records during traversal. -class StreamingJsonParser { - public: - /// @brief Parses file_path and invokes callbacks for city rows and progress. - static void Parse(const std::string& file_path, SqliteDatabase& db, - std::function on_city, - std::function on_progress = nullptr); - - private: - /// @brief Mutable SAX handler state while traversing nested JSON arrays. 
- struct ParseState { - int current_country_id = 0; - int current_state_id = 0; - - CityRecord current_city = {}; - bool building_city = false; - std::string current_key; - - int array_depth = 0; - int object_depth = 0; - bool in_countries_array = false; - bool in_states_array = false; - bool in_cities_array = false; - - std::function on_city; - std::function on_progress; - size_t bytes_processed = 0; - }; -}; - -#endif // BIERGARTEN_PIPELINE_JSON_HANDLING_STREAM_PARSER_H_ diff --git a/pipeline/locations.json b/pipeline/locations.json new file mode 100644 index 0000000..81a320d --- /dev/null +++ b/pipeline/locations.json @@ -0,0 +1,902 @@ +[ + { + "city": "Cape Town", + "state_province": "Western Cape", + "iso3166_2": "ZA-WC", + "country": "South Africa", + "iso3166_1": "ZA", + "latitude": -33.9249, + "longitude": 18.4241 + }, + { + "city": "Johannesburg", + "state_province": "Gauteng", + "iso3166_2": "ZA-GT", + "country": "South Africa", + "iso3166_1": "ZA", + "latitude": -26.2041, + "longitude": 28.0473 + }, + { + "city": "Durban", + "state_province": "KwaZulu-Natal", + "iso3166_2": "ZA-NL", + "country": "South Africa", + "iso3166_1": "ZA", + "latitude": -29.8587, + "longitude": 31.0218 + }, + { + "city": "Franschhoek", + "state_province": "Western Cape", + "iso3166_2": "ZA-WC", + "country": "South Africa", + "iso3166_1": "ZA", + "latitude": -33.9146, + "longitude": 19.1198 + }, + { + "city": "Nairobi", + "state_province": "Nairobi", + "iso3166_2": "KE-30", + "country": "Kenya", + "iso3166_1": "KE", + "latitude": -1.2921, + "longitude": 36.8219 + }, + { + "city": "Buenos Aires", + "state_province": "Buenos Aires City", + "iso3166_2": "AR-C", + "country": "Argentina", + "iso3166_1": "AR", + "latitude": -34.6037, + "longitude": -58.3816 + }, + { + "city": "Bariloche", + "state_province": "Río Negro", + "iso3166_2": "AR-R", + "country": "Argentina", + "iso3166_1": "AR", + "latitude": -41.1335, + "longitude": -71.3103 + }, + { + "city": "Bogotá", + "state_province": 
"Bogotá D.C.", + "iso3166_2": "CO-DC", + "country": "Colombia", + "iso3166_1": "CO", + "latitude": 4.711, + "longitude": -74.0721 + }, + { + "city": "Medellín", + "state_province": "Antioquia", + "iso3166_2": "CO-ANT", + "country": "Colombia", + "iso3166_1": "CO", + "latitude": 6.2442, + "longitude": -75.5812 + }, + { + "city": "São Paulo", + "state_province": "São Paulo", + "iso3166_2": "BR-SP", + "country": "Brazil", + "iso3166_1": "BR", + "latitude": -23.5505, + "longitude": -46.6333 + }, + { + "city": "Curitiba", + "state_province": "Paraná", + "iso3166_2": "BR-PR", + "country": "Brazil", + "iso3166_1": "BR", + "latitude": -25.4284, + "longitude": -49.2733 + }, + { + "city": "Rio de Janeiro", + "state_province": "Rio de Janeiro", + "iso3166_2": "BR-RJ", + "country": "Brazil", + "iso3166_1": "BR", + "latitude": -22.9068, + "longitude": -43.1729 + }, + { + "city": "Santiago", + "state_province": "Santiago Metropolitan", + "iso3166_2": "CL-RM", + "country": "Chile", + "iso3166_1": "CL", + "latitude": -33.4489, + "longitude": -70.6693 + }, + { + "city": "Valdivia", + "state_province": "Los Ríos", + "iso3166_2": "CL-LR", + "country": "Chile", + "iso3166_1": "CL", + "latitude": -39.8142, + "longitude": -73.2459 + }, + { + "city": "Lima", + "state_province": "Lima", + "iso3166_2": "PE-LMA", + "country": "Peru", + "iso3166_1": "PE", + "latitude": -12.0464, + "longitude": -77.0428 + }, + { + "city": "Tokyo", + "state_province": "Tokyo", + "iso3166_2": "JP-13", + "country": "Japan", + "iso3166_1": "JP", + "latitude": 35.6762, + "longitude": 139.6503 + }, + { + "city": "Osaka", + "state_province": "Osaka", + "iso3166_2": "JP-27", + "country": "Japan", + "iso3166_1": "JP", + "latitude": 34.6937, + "longitude": 135.5023 + }, + { + "city": "Kyoto", + "state_province": "Kyoto", + "iso3166_2": "JP-26", + "country": "Japan", + "iso3166_1": "JP", + "latitude": 35.0116, + "longitude": 135.7681 + }, + { + "city": "Sapporo", + "state_province": "Hokkaido", + "iso3166_2": "JP-01", + 
"country": "Japan", + "iso3166_1": "JP", + "latitude": 43.0618, + "longitude": 141.3545 + }, + { + "city": "Seoul", + "state_province": "Seoul", + "iso3166_2": "KR-11", + "country": "South Korea", + "iso3166_1": "KR", + "latitude": 37.5665, + "longitude": 126.978 + }, + { + "city": "Busan", + "state_province": "Busan", + "iso3166_2": "KR-26", + "country": "South Korea", + "iso3166_1": "KR", + "latitude": 35.1796, + "longitude": 129.0756 + }, + { + "city": "Ho Chi Minh City", + "state_province": "Ho Chi Minh", + "iso3166_2": "VN-SG", + "country": "Vietnam", + "iso3166_1": "VN", + "latitude": 10.8231, + "longitude": 106.6297 + }, + { + "city": "Hanoi", + "state_province": "Hanoi", + "iso3166_2": "VN-HN", + "country": "Vietnam", + "iso3166_1": "VN", + "latitude": 21.0285, + "longitude": 105.8542 + }, + { + "city": "Da Nang", + "state_province": "Da Nang", + "iso3166_2": "VN-DN", + "country": "Vietnam", + "iso3166_1": "VN", + "latitude": 16.0544, + "longitude": 108.2022 + }, + { + "city": "Bangkok", + "state_province": "Bangkok", + "iso3166_2": "TH-10", + "country": "Thailand", + "iso3166_1": "TH", + "latitude": 13.7563, + "longitude": 100.5018 + }, + { + "city": "Taipei", + "state_province": "Taipei", + "iso3166_2": "TW-TPE", + "country": "Taiwan", + "iso3166_1": "TW", + "latitude": 25.033, + "longitude": 121.5654 + }, + { + "city": "Beijing", + "state_province": "Beijing", + "iso3166_2": "CN-BJ", + "country": "China", + "iso3166_1": "CN", + "latitude": 39.9042, + "longitude": 116.4074 + }, + { + "city": "Shanghai", + "state_province": "Shanghai", + "iso3166_2": "CN-SH", + "country": "China", + "iso3166_1": "CN", + "latitude": 31.2304, + "longitude": 121.4737 + }, + { + "city": "Bengaluru", + "state_province": "Karnataka", + "iso3166_2": "IN-KA", + "country": "India", + "iso3166_1": "IN", + "latitude": 12.9716, + "longitude": 77.5946 + }, + { + "city": "Singapore", + "state_province": "Central Singapore", + "iso3166_2": "SG-01", + "country": "Singapore", + 
"iso3166_1": "SG", + "latitude": 1.3521, + "longitude": 103.8198 + }, + { + "city": "Melbourne", + "state_province": "Victoria", + "iso3166_2": "AU-VIC", + "country": "Australia", + "iso3166_1": "AU", + "latitude": -37.8136, + "longitude": 144.9631 + }, + { + "city": "Sydney", + "state_province": "New South Wales", + "iso3166_2": "AU-NSW", + "country": "Australia", + "iso3166_1": "AU", + "latitude": -33.8688, + "longitude": 151.2093 + }, + { + "city": "Brisbane", + "state_province": "Queensland", + "iso3166_2": "AU-QLD", + "country": "Australia", + "iso3166_1": "AU", + "latitude": -27.4705, + "longitude": 153.026 + }, + { + "city": "Adelaide", + "state_province": "South Australia", + "iso3166_2": "AU-SA", + "country": "Australia", + "iso3166_1": "AU", + "latitude": -34.9285, + "longitude": 138.6007 + }, + { + "city": "Perth", + "state_province": "Western Australia", + "iso3166_2": "AU-WA", + "country": "Australia", + "iso3166_1": "AU", + "latitude": -31.9505, + "longitude": 115.8605 + }, + { + "city": "Hobart", + "state_province": "Tasmania", + "iso3166_2": "AU-TAS", + "country": "Australia", + "iso3166_1": "AU", + "latitude": -42.8821, + "longitude": 147.3272 + }, + { + "city": "Wellington", + "state_province": "Wellington", + "iso3166_2": "NZ-WGN", + "country": "New Zealand", + "iso3166_1": "NZ", + "latitude": -41.2865, + "longitude": 174.7762 + }, + { + "city": "Auckland", + "state_province": "Auckland", + "iso3166_2": "NZ-AUK", + "country": "New Zealand", + "iso3166_1": "NZ", + "latitude": -36.8485, + "longitude": 174.7633 + }, + { + "city": "Christchurch", + "state_province": "Canterbury", + "iso3166_2": "NZ-CAN", + "country": "New Zealand", + "iso3166_1": "NZ", + "latitude": -43.532, + "longitude": 172.6306 + }, + { + "city": "Nelson", + "state_province": "Nelson", + "iso3166_2": "NZ-NSN", + "country": "New Zealand", + "iso3166_1": "NZ", + "latitude": -41.2706, + "longitude": 173.284 + }, + { + "city": "Munich", + "state_province": "Bavaria", + "iso3166_2": 
"DE-BY", + "country": "Germany", + "iso3166_1": "DE", + "latitude": 48.1351, + "longitude": 11.582 + }, + { + "city": "Berlin", + "state_province": "Berlin", + "iso3166_2": "DE-BE", + "country": "Germany", + "iso3166_1": "DE", + "latitude": 52.52, + "longitude": 13.405 + }, + { + "city": "Cologne", + "state_province": "North Rhine-Westphalia", + "iso3166_2": "DE-NW", + "country": "Germany", + "iso3166_1": "DE", + "latitude": 50.9375, + "longitude": 6.9603 + }, + { + "city": "Bamberg", + "state_province": "Bavaria", + "iso3166_2": "DE-BY", + "country": "Germany", + "iso3166_1": "DE", + "latitude": 49.8916, + "longitude": 10.8916 + }, + { + "city": "Brussels", + "state_province": "Brussels-Capital", + "iso3166_2": "BE-BRU", + "country": "Belgium", + "iso3166_1": "BE", + "latitude": 50.8503, + "longitude": 4.3517 + }, + { + "city": "Antwerp", + "state_province": "Flanders", + "iso3166_2": "BE-VLG", + "country": "Belgium", + "iso3166_1": "BE", + "latitude": 51.2194, + "longitude": 4.4025 + }, + { + "city": "Bruges", + "state_province": "Flanders", + "iso3166_2": "BE-VLG", + "country": "Belgium", + "iso3166_1": "BE", + "latitude": 51.2093, + "longitude": 3.2247 + }, + { + "city": "London", + "state_province": "England", + "iso3166_2": "GB-ENG", + "country": "United Kingdom", + "iso3166_1": "GB", + "latitude": 51.5074, + "longitude": -0.1278 + }, + { + "city": "Bristol", + "state_province": "England", + "iso3166_2": "GB-ENG", + "country": "United Kingdom", + "iso3166_1": "GB", + "latitude": 51.4545, + "longitude": -2.5879 + }, + { + "city": "Edinburgh", + "state_province": "Scotland", + "iso3166_2": "GB-SCT", + "country": "United Kingdom", + "iso3166_1": "GB", + "latitude": 55.9533, + "longitude": -3.1883 + }, + { + "city": "Glasgow", + "state_province": "Scotland", + "iso3166_2": "GB-SCT", + "country": "United Kingdom", + "iso3166_1": "GB", + "latitude": 55.8642, + "longitude": -4.2518 + }, + { + "city": "Prague", + "state_province": "Prague", + "iso3166_2": "CZ-10", + 
"country": "Czechia", + "iso3166_1": "CZ", + "latitude": 50.0755, + "longitude": 14.4378 + }, + { + "city": "Pilsen", + "state_province": "Plzeň", + "iso3166_2": "CZ-32", + "country": "Czechia", + "iso3166_1": "CZ", + "latitude": 49.7384, + "longitude": 13.3736 + }, + { + "city": "Amsterdam", + "state_province": "North Holland", + "iso3166_2": "NL-NH", + "country": "Netherlands", + "iso3166_1": "NL", + "latitude": 52.3676, + "longitude": 4.9041 + }, + { + "city": "Copenhagen", + "state_province": "Capital Region", + "iso3166_2": "DK-84", + "country": "Denmark", + "iso3166_1": "DK", + "latitude": 55.6761, + "longitude": 12.5683 + }, + { + "city": "Warsaw", + "state_province": "Masovian", + "iso3166_2": "PL-MZ", + "country": "Poland", + "iso3166_1": "PL", + "latitude": 52.2297, + "longitude": 21.0122 + }, + { + "city": "Krakow", + "state_province": "Lesser Poland", + "iso3166_2": "PL-MA", + "country": "Poland", + "iso3166_1": "PL", + "latitude": 50.0647, + "longitude": 19.945 + }, + { + "city": "Rome", + "state_province": "Lazio", + "iso3166_2": "IT-62", + "country": "Italy", + "iso3166_1": "IT", + "latitude": 41.9028, + "longitude": 12.4964 + }, + { + "city": "Milan", + "state_province": "Lombardy", + "iso3166_2": "IT-25", + "country": "Italy", + "iso3166_1": "IT", + "latitude": 45.4642, + "longitude": 9.19 + }, + { + "city": "Barcelona", + "state_province": "Catalonia", + "iso3166_2": "ES-CT", + "country": "Spain", + "iso3166_1": "ES", + "latitude": 41.3851, + "longitude": 2.1734 + }, + { + "city": "Madrid", + "state_province": "Madrid", + "iso3166_2": "ES-MD", + "country": "Spain", + "iso3166_1": "ES", + "latitude": 40.4168, + "longitude": -3.7038 + }, + { + "city": "Paris", + "state_province": "Île-de-France", + "iso3166_2": "FR-IDF", + "country": "France", + "iso3166_1": "FR", + "latitude": 48.8566, + "longitude": 2.3522 + }, + { + "city": "Lyon", + "state_province": "Auvergne-Rhône-Alpes", + "iso3166_2": "FR-ARA", + "country": "France", + "iso3166_1": "FR", + 
"latitude": 45.764, + "longitude": 4.8357 + }, + { + "city": "Stockholm", + "state_province": "Stockholm", + "iso3166_2": "SE-AB", + "country": "Sweden", + "iso3166_1": "SE", + "latitude": 59.3293, + "longitude": 18.0686 + }, + { + "city": "Gothenburg", + "state_province": "Västra Götaland", + "iso3166_2": "SE-O", + "country": "Sweden", + "iso3166_1": "SE", + "latitude": 57.7089, + "longitude": 11.9746 + }, + { + "city": "Oslo", + "state_province": "Oslo", + "iso3166_2": "NO-03", + "country": "Norway", + "iso3166_1": "NO", + "latitude": 59.9139, + "longitude": 10.7522 + }, + { + "city": "Dublin", + "state_province": "Leinster", + "iso3166_2": "IE-L", + "country": "Ireland", + "iso3166_1": "IE", + "latitude": 53.3498, + "longitude": -6.2603 + }, + { + "city": "Vienna", + "state_province": "Vienna", + "iso3166_2": "AT-9", + "country": "Austria", + "iso3166_1": "AT", + "latitude": 48.2082, + "longitude": 16.3738 + }, + { + "city": "Zurich", + "state_province": "Zurich", + "iso3166_2": "CH-ZH", + "country": "Switzerland", + "iso3166_1": "CH", + "latitude": 47.3769, + "longitude": 8.5417 + }, + { + "city": "Tallinn", + "state_province": "Harju", + "iso3166_2": "EE-37", + "country": "Estonia", + "iso3166_1": "EE", + "latitude": 59.437, + "longitude": 24.7536 + }, + { + "city": "Denver", + "state_province": "Colorado", + "iso3166_2": "US-CO", + "country": "United States", + "iso3166_1": "US", + "latitude": 39.7392, + "longitude": -104.9903 + }, + { + "city": "Portland", + "state_province": "Oregon", + "iso3166_2": "US-OR", + "country": "United States", + "iso3166_1": "US", + "latitude": 45.5152, + "longitude": -122.6784 + }, + { + "city": "San Diego", + "state_province": "California", + "iso3166_2": "US-CA", + "country": "United States", + "iso3166_1": "US", + "latitude": 32.7157, + "longitude": -117.1611 + }, + { + "city": "Asheville", + "state_province": "North Carolina", + "iso3166_2": "US-NC", + "country": "United States", + "iso3166_1": "US", + "latitude": 35.5951, + 
"longitude": -82.5515 + }, + { + "city": "Grand Rapids", + "state_province": "Michigan", + "iso3166_2": "US-MI", + "country": "United States", + "iso3166_1": "US", + "latitude": 42.9634, + "longitude": -85.6681 + }, + { + "city": "Chicago", + "state_province": "Illinois", + "iso3166_2": "US-IL", + "country": "United States", + "iso3166_1": "US", + "latitude": 41.8781, + "longitude": -87.6298 + }, + { + "city": "Seattle", + "state_province": "Washington", + "iso3166_2": "US-WA", + "country": "United States", + "iso3166_1": "US", + "latitude": 47.6062, + "longitude": -122.3321 + }, + { + "city": "Austin", + "state_province": "Texas", + "iso3166_2": "US-TX", + "country": "United States", + "iso3166_1": "US", + "latitude": 30.2672, + "longitude": -97.7431 + }, + { + "city": "Boston", + "state_province": "Massachusetts", + "iso3166_2": "US-MA", + "country": "United States", + "iso3166_1": "US", + "latitude": 42.3601, + "longitude": -71.0589 + }, + { + "city": "Philadelphia", + "state_province": "Pennsylvania", + "iso3166_2": "US-PA", + "country": "United States", + "iso3166_1": "US", + "latitude": 39.9526, + "longitude": -75.1652 + }, + { + "city": "Brooklyn", + "state_province": "New York", + "iso3166_2": "US-NY", + "country": "United States", + "iso3166_1": "US", + "latitude": 40.6782, + "longitude": -73.9442 + }, + { + "city": "Milwaukee", + "state_province": "Wisconsin", + "iso3166_2": "US-WI", + "country": "United States", + "iso3166_1": "US", + "latitude": 43.0389, + "longitude": -87.9065 + }, + { + "city": "Richmond", + "state_province": "Virginia", + "iso3166_2": "US-VA", + "country": "United States", + "iso3166_1": "US", + "latitude": 37.5407, + "longitude": -77.436 + }, + { + "city": "Cincinnati", + "state_province": "Ohio", + "iso3166_2": "US-OH", + "country": "United States", + "iso3166_1": "US", + "latitude": 39.1031, + "longitude": -84.512 + }, + { + "city": "St. 
Louis", + "state_province": "Missouri", + "iso3166_2": "US-MO", + "country": "United States", + "iso3166_1": "US", + "latitude": 38.627, + "longitude": -90.1994 + }, + { + "city": "Tampa", + "state_province": "Florida", + "iso3166_2": "US-FL", + "country": "United States", + "iso3166_1": "US", + "latitude": 27.9506, + "longitude": -82.4572 + }, + { + "city": "Minneapolis", + "state_province": "Minnesota", + "iso3166_2": "US-MN", + "country": "United States", + "iso3166_1": "US", + "latitude": 44.9778, + "longitude": -93.265 + }, + { + "city": "Burlington", + "state_province": "Vermont", + "iso3166_2": "US-VT", + "country": "United States", + "iso3166_1": "US", + "latitude": 44.4759, + "longitude": -73.2121 + }, + { + "city": "Portland", + "state_province": "Maine", + "iso3166_2": "US-ME", + "country": "United States", + "iso3166_1": "US", + "latitude": 43.6591, + "longitude": -70.2568 + }, + { + "city": "Atlanta", + "state_province": "Georgia", + "iso3166_2": "US-GA", + "country": "United States", + "iso3166_1": "US", + "latitude": 33.749, + "longitude": -84.388 + }, + { + "city": "Toronto", + "state_province": "Ontario", + "iso3166_2": "CA-ON", + "country": "Canada", + "iso3166_1": "CA", + "latitude": 43.651, + "longitude": -79.347 + }, + { + "city": "Vancouver", + "state_province": "British Columbia", + "iso3166_2": "CA-BC", + "country": "Canada", + "iso3166_1": "CA", + "latitude": 49.2827, + "longitude": -123.1207 + }, + { + "city": "Montreal", + "state_province": "Quebec", + "iso3166_2": "CA-QC", + "country": "Canada", + "iso3166_1": "CA", + "latitude": 45.5017, + "longitude": -73.5673 + }, + { + "city": "Calgary", + "state_province": "Alberta", + "iso3166_2": "CA-AB", + "country": "Canada", + "iso3166_1": "CA", + "latitude": 51.0447, + "longitude": -114.0719 + }, + { + "city": "Halifax", + "state_province": "Nova Scotia", + "iso3166_2": "CA-NS", + "country": "Canada", + "iso3166_1": "CA", + "latitude": 44.6488, + "longitude": -63.5752 + }, + { + "city": 
"Mexico City", + "state_province": "Mexico City", + "iso3166_2": "MX-CMX", + "country": "Mexico", + "iso3166_1": "MX", + "latitude": 19.4326, + "longitude": -99.1332 + }, + { + "city": "Tijuana", + "state_province": "Baja California", + "iso3166_2": "MX-BCN", + "country": "Mexico", + "iso3166_1": "MX", + "latitude": 32.5149, + "longitude": -117.0382 + }, + { + "city": "Monterrey", + "state_province": "Nuevo León", + "iso3166_2": "MX-NLE", + "country": "Mexico", + "iso3166_1": "MX", + "latitude": 25.6866, + "longitude": -100.3161 + }, + { + "city": "Guadalajara", + "state_province": "Jalisco", + "iso3166_2": "MX-JAL", + "country": "Mexico", + "iso3166_1": "MX", + "latitude": 20.6597, + "longitude": -103.3496 + }, + { + "city": "Ensenada", + "state_province": "Baja California", + "iso3166_2": "MX-BCN", + "country": "Mexico", + "iso3166_1": "MX", + "latitude": 31.8667, + "longitude": -116.5964 + } +] diff --git a/pipeline/src/biergarten_data_generator.cpp b/pipeline/src/biergarten_data_generator.cpp index 6663b5e..ba79be2 100644 --- a/pipeline/src/biergarten_data_generator.cpp +++ b/pipeline/src/biergarten_data_generator.cpp @@ -4,20 +4,21 @@ #include #include -#include +#include +#include +#include -#include "data_generation/data_downloader.h" #include "data_generation/llama_generator.h" #include "data_generation/mock_generator.h" #include "json_handling/json_loader.h" #include "wikipedia/wikipedia_service.h" BiergartenDataGenerator::BiergartenDataGenerator( - const ApplicationOptions& options, std::shared_ptr web_client, - SqliteDatabase& database) - : options_(options), webClient_(web_client), database_(database) {} + const ApplicationOptions& options, std::shared_ptr web_client) + : options_(options), webClient_(std::move(web_client)) {} -std::unique_ptr BiergartenDataGenerator::InitializeGenerator() { +auto BiergartenDataGenerator::InitializeGenerator() + -> std::unique_ptr { spdlog::info("Initializing brewery generator..."); std::unique_ptr generator; @@ -41,75 
+42,60 @@ std::unique_ptr BiergartenDataGenerator::InitializeGenerator() { return generator; } -void BiergartenDataGenerator::LoadGeographicData() { - std::string json_path = options_.cache_dir + "/countries+states+cities.json"; - std::string db_path = options_.cache_dir + "/biergarten-pipeline.db"; - - bool has_json_cache = std::filesystem::exists(json_path); - bool has_db_cache = std::filesystem::exists(db_path); - - spdlog::info("Initializing SQLite database at {}...", db_path); - database_.Initialize(db_path); - - if (has_db_cache && has_json_cache) { - spdlog::info("[Pipeline] Cache hit: skipping download and parse"); - } else { - spdlog::info("\n[Pipeline] Downloading geographic data from GitHub..."); - DataDownloader downloader(webClient_); - downloader.DownloadCountriesDatabase(json_path, options_.commit); - - JsonLoader::LoadWorldCities(json_path, database_); - } -} - -std::vector> -BiergartenDataGenerator::QueryCitiesWithCountries() { +auto BiergartenDataGenerator::QueryCitiesWithCountries() + -> std::vector { spdlog::info("\n=== GEOGRAPHIC DATA OVERVIEW ==="); - auto cities = database_.QueryCities(); - - // Build a quick map of country id -> name for per-city lookups. - auto all_countries = database_.QueryCountries(0); - std::unordered_map country_map; - for (const auto& c : all_countries) { - country_map[c.id] = c.name; - } - - spdlog::info("\nTotal records loaded:"); - spdlog::info(" Countries: {}", database_.QueryCountries(0).size()); - spdlog::info(" States: {}", database_.QueryStates(0).size()); - spdlog::info(" Cities: {}", cities.size()); - - // Cap at 30 entries. 
- const size_t sample_count = std::min(size_t(30), cities.size()); - std::vector> result; - - for (size_t i = 0; i < sample_count; i++) { - const auto& city = cities[i]; - std::string country_name; - const auto country_it = country_map.find(city.country_id); - if (country_it != country_map.end()) { - country_name = country_it->second; + std::filesystem::path locations_path = "locations.json"; + if (!std::filesystem::exists(locations_path)) { + const std::filesystem::path cache_path = + std::filesystem::path(options_.cache_dir) / "locations.json"; + if (std::filesystem::exists(cache_path)) { + locations_path = cache_path; } - result.push_back({city, country_name}); } - return result; + auto all_locations = JsonLoader::LoadLocations(locations_path.string()); + spdlog::info(" Locations available: {}", all_locations.size()); + + const size_t sample_count = std::min(30, all_locations.size()); + std::vector sampled_locations; + sampled_locations.reserve(sample_count); + + std::random_device random_generator; + std::sample(all_locations.begin(), all_locations.end(), + std::back_inserter(sampled_locations), sample_count, + random_generator); + + spdlog::info(" Sampled locations: {}", sampled_locations.size()); + return sampled_locations; } -std::vector -BiergartenDataGenerator::EnrichWithWikipedia( - const std::vector>& cities) { - WikipediaService wikipedia_service(webClient_); +auto BiergartenDataGenerator::EnrichWithWikipedia( + const std::vector& cities) -> std::vector { std::vector enriched; + enriched.reserve(cities.size()); - for (const auto& [city, country_name] : cities) { - const std::string region_context = - wikipedia_service.GetSummary(city.name, country_name); - spdlog::debug("[Pipeline] Region context for {}: {}", city.name, - region_context); + std::vector> pending; + pending.reserve(cities.size()); - enriched.push_back({city.id, city.name, country_name, region_context}); + for (const auto& city : cities) { + pending.push_back(std::async(std::launch::async, 
+ [web_client = webClient_, city]() { + WikipediaService wikipedia_service( + web_client); + const std::string region_context = + wikipedia_service.GetSummary( + city.city, city.country); + spdlog::debug( + "[Pipeline] Region context for {}: {}", + city.city, region_context); + return EnrichedCity{city, region_context}; + })); + } + + for (auto& task : pending) { + enriched.push_back(task.get()); } return enriched; @@ -121,28 +107,30 @@ void BiergartenDataGenerator::GenerateBreweries( generatedBreweries_.clear(); for (const auto& enriched_city : cities) { - auto brewery = generator.GenerateBrewery(enriched_city.city_name, - enriched_city.country_name, + auto brewery = generator.GenerateBrewery(enriched_city.location.city, + enriched_city.location.country, enriched_city.region_context); - generatedBreweries_.push_back( - {enriched_city.city_id, enriched_city.city_name, brewery}); + generatedBreweries_.push_back({enriched_city.location, brewery}); } } void BiergartenDataGenerator::LogResults() const { spdlog::info("\n=== GENERATED DATA DUMP ==="); - for (size_t i = 0; i < generatedBreweries_.size(); i++) { - const auto& entry = generatedBreweries_[i]; - spdlog::info("{}. city_id={} city=\"{}\"", i + 1, entry.city_id, - entry.city_name); + size_t index = 1; + for (const auto& entry : generatedBreweries_) { + spdlog::info("{}. 
city=\"{}\" country=\"{}\" state=\"{}\" " + "iso3166_2={} lat={} lon={}", + index, entry.location.city, entry.location.country, + entry.location.state_province, entry.location.iso3166_2, + entry.location.latitude, entry.location.longitude); spdlog::info(" brewery_name=\"{}\"", entry.brewery.name); spdlog::info(" brewery_description=\"{}\"", entry.brewery.description); + ++index; } } -int BiergartenDataGenerator::Run() { +auto BiergartenDataGenerator::Run() -> int { try { - LoadGeographicData(); auto generator = InitializeGenerator(); auto cities = QueryCitiesWithCountries(); auto enriched = EnrichWithWikipedia(cities); diff --git a/pipeline/src/data_generation/data_downloader.cpp b/pipeline/src/data_generation/data_downloader.cpp deleted file mode 100644 index 83861d4..0000000 --- a/pipeline/src/data_generation/data_downloader.cpp +++ /dev/null @@ -1,44 +0,0 @@ -#include "data_generation/data_downloader.h" - -#include - -#include -#include -#include -#include - -#include "web_client/web_client.h" - -DataDownloader::DataDownloader(std::shared_ptr web_client) - : web_client_(std::move(web_client)) {} - -DataDownloader::~DataDownloader() {} - -bool DataDownloader::FileExists(const std::string& file_path) { - return std::filesystem::exists(file_path); -} - -std::string DataDownloader::DownloadCountriesDatabase( - const std::string& cache_path, const std::string& commit) { - if (FileExists(cache_path)) { - spdlog::info("[DataDownloader] Cache hit: {}", cache_path); - return cache_path; - } - - std::string url = - "https://raw.githubusercontent.com/dr5hn/" - "countries-states-cities-database/" + - commit + "/json/countries+states+cities.json"; - - spdlog::info("[DataDownloader] Downloading: {}", url); - - web_client_->DownloadToFile(url, cache_path); - - std::ifstream file_check(cache_path, std::ios::binary | std::ios::ate); - std::streamsize size = file_check.tellg(); - file_check.close(); - - spdlog::info("[DataDownloader] OK: Download complete: {} ({:.2f} MB)", - 
cache_path, (size / (1024.0 * 1024.0))); - return cache_path; -} diff --git a/pipeline/src/database/database.cpp b/pipeline/src/database/database.cpp deleted file mode 100644 index 7d22bf3..0000000 --- a/pipeline/src/database/database.cpp +++ /dev/null @@ -1,264 +0,0 @@ -#include "database/database.h" - -#include - -#include - -void SqliteDatabase::InitializeSchema() { - std::lock_guard lock(db_mutex_); - - const char* schema = R"( - CREATE TABLE IF NOT EXISTS countries ( - id INTEGER PRIMARY KEY, - name TEXT NOT NULL, - iso2 TEXT, - iso3 TEXT - ); - - CREATE TABLE IF NOT EXISTS states ( - id INTEGER PRIMARY KEY, - country_id INTEGER NOT NULL, - name TEXT NOT NULL, - iso2 TEXT, - FOREIGN KEY(country_id) REFERENCES countries(id) - ); - - CREATE TABLE IF NOT EXISTS cities ( - id INTEGER PRIMARY KEY, - state_id INTEGER NOT NULL, - country_id INTEGER NOT NULL, - name TEXT NOT NULL, - latitude REAL, - longitude REAL, - FOREIGN KEY(state_id) REFERENCES states(id), - FOREIGN KEY(country_id) REFERENCES countries(id) - ); - )"; - - char* errMsg = nullptr; - int rc = sqlite3_exec(db_, schema, nullptr, nullptr, &errMsg); - if (rc != SQLITE_OK) { - std::string error = errMsg ? std::string(errMsg) : "Unknown error"; - sqlite3_free(errMsg); - throw std::runtime_error("Failed to create schema: " + error); - } -} - -SqliteDatabase::~SqliteDatabase() { - if (db_) { - sqlite3_close(db_); - } -} - -void SqliteDatabase::Initialize(const std::string& db_path) { - int rc = sqlite3_open(db_path.c_str(), &db_); - if (rc) { - throw std::runtime_error("Failed to open SQLite database: " + db_path); - } - spdlog::info("OK: SQLite database opened: {}", db_path); - InitializeSchema(); -} - -void SqliteDatabase::BeginTransaction() { - std::lock_guard lock(db_mutex_); - char* err = nullptr; - if (sqlite3_exec(db_, "BEGIN TRANSACTION", nullptr, nullptr, &err) != - SQLITE_OK) { - std::string msg = err ? 
err : "unknown"; - sqlite3_free(err); - throw std::runtime_error("BeginTransaction failed: " + msg); - } -} - -void SqliteDatabase::CommitTransaction() { - std::lock_guard lock(db_mutex_); - char* err = nullptr; - if (sqlite3_exec(db_, "COMMIT", nullptr, nullptr, &err) != SQLITE_OK) { - std::string msg = err ? err : "unknown"; - sqlite3_free(err); - throw std::runtime_error("CommitTransaction failed: " + msg); - } -} - -void SqliteDatabase::RollbackTransaction() { - std::lock_guard lock(db_mutex_); - char* err = nullptr; - if (sqlite3_exec(db_, "ROLLBACK", nullptr, nullptr, &err) != SQLITE_OK) { - std::string msg = err ? err : "unknown"; - sqlite3_free(err); - throw std::runtime_error("RollbackTransaction failed: " + msg); - } -} - -void SqliteDatabase::InsertCountry(int id, const std::string& name, - const std::string& iso2, - const std::string& iso3) { - std::lock_guard lock(db_mutex_); - - const char* query = R"( - INSERT OR IGNORE INTO countries (id, name, iso2, iso3) - VALUES (?, ?, ?, ?) - )"; - - sqlite3_stmt* stmt; - int rc = sqlite3_prepare_v2(db_, query, -1, &stmt, nullptr); - if (rc != SQLITE_OK) - throw std::runtime_error("Failed to prepare country insert"); - - sqlite3_bind_int(stmt, 1, id); - sqlite3_bind_text(stmt, 2, name.c_str(), -1, SQLITE_TRANSIENT); - sqlite3_bind_text(stmt, 3, iso2.c_str(), -1, SQLITE_TRANSIENT); - sqlite3_bind_text(stmt, 4, iso3.c_str(), -1, SQLITE_TRANSIENT); - - if (sqlite3_step(stmt) != SQLITE_DONE) { - throw std::runtime_error("Failed to insert country"); - } - sqlite3_finalize(stmt); -} - -void SqliteDatabase::InsertState(int id, int country_id, - const std::string& name, - const std::string& iso2) { - std::lock_guard lock(db_mutex_); - - const char* query = R"( - INSERT OR IGNORE INTO states (id, country_id, name, iso2) - VALUES (?, ?, ?, ?) 
- )"; - - sqlite3_stmt* stmt; - int rc = sqlite3_prepare_v2(db_, query, -1, &stmt, nullptr); - if (rc != SQLITE_OK) - throw std::runtime_error("Failed to prepare state insert"); - - sqlite3_bind_int(stmt, 1, id); - sqlite3_bind_int(stmt, 2, country_id); - sqlite3_bind_text(stmt, 3, name.c_str(), -1, SQLITE_TRANSIENT); - sqlite3_bind_text(stmt, 4, iso2.c_str(), -1, SQLITE_TRANSIENT); - - if (sqlite3_step(stmt) != SQLITE_DONE) { - throw std::runtime_error("Failed to insert state"); - } - sqlite3_finalize(stmt); -} - -void SqliteDatabase::InsertCity(int id, int state_id, int country_id, - const std::string& name, double latitude, - double longitude) { - std::lock_guard lock(db_mutex_); - - const char* query = R"( - INSERT OR IGNORE INTO cities (id, state_id, country_id, name, latitude, longitude) - VALUES (?, ?, ?, ?, ?, ?) - )"; - - sqlite3_stmt* stmt; - int rc = sqlite3_prepare_v2(db_, query, -1, &stmt, nullptr); - if (rc != SQLITE_OK) - throw std::runtime_error("Failed to prepare city insert"); - - sqlite3_bind_int(stmt, 1, id); - sqlite3_bind_int(stmt, 2, state_id); - sqlite3_bind_int(stmt, 3, country_id); - sqlite3_bind_text(stmt, 4, name.c_str(), -1, SQLITE_TRANSIENT); - sqlite3_bind_double(stmt, 5, latitude); - sqlite3_bind_double(stmt, 6, longitude); - - if (sqlite3_step(stmt) != SQLITE_DONE) { - throw std::runtime_error("Failed to insert city"); - } - sqlite3_finalize(stmt); -} - -std::vector SqliteDatabase::QueryCities() { - std::lock_guard lock(db_mutex_); - std::vector cities; - sqlite3_stmt* stmt = nullptr; - - const char* query = - "SELECT id, name, country_id FROM cities ORDER BY RANDOM()"; - int rc = sqlite3_prepare_v2(db_, query, -1, &stmt, nullptr); - - if (rc != SQLITE_OK) { - throw std::runtime_error("Failed to prepare query"); - } - - while (sqlite3_step(stmt) == SQLITE_ROW) { - int id = sqlite3_column_int(stmt, 0); - const char* name = - reinterpret_cast(sqlite3_column_text(stmt, 1)); - int country_id = sqlite3_column_int(stmt, 2); - 
cities.push_back({id, name ? std::string(name) : "", country_id}); - } - - sqlite3_finalize(stmt); - return cities; -} - -std::vector SqliteDatabase::QueryCountries(int limit) { - std::lock_guard lock(db_mutex_); - - std::vector countries; - sqlite3_stmt* stmt = nullptr; - - std::string query = - "SELECT id, name, iso2, iso3 FROM countries ORDER BY name"; - if (limit > 0) { - query += " LIMIT " + std::to_string(limit); - } - - int rc = sqlite3_prepare_v2(db_, query.c_str(), -1, &stmt, nullptr); - - if (rc != SQLITE_OK) { - throw std::runtime_error("Failed to prepare countries query"); - } - - while (sqlite3_step(stmt) == SQLITE_ROW) { - int id = sqlite3_column_int(stmt, 0); - const char* name = - reinterpret_cast(sqlite3_column_text(stmt, 1)); - const char* iso2 = - reinterpret_cast(sqlite3_column_text(stmt, 2)); - const char* iso3 = - reinterpret_cast(sqlite3_column_text(stmt, 3)); - countries.push_back({id, name ? std::string(name) : "", - iso2 ? std::string(iso2) : "", - iso3 ? std::string(iso3) : ""}); - } - - sqlite3_finalize(stmt); - return countries; -} - -std::vector SqliteDatabase::QueryStates(int limit) { - std::lock_guard lock(db_mutex_); - - std::vector states; - sqlite3_stmt* stmt = nullptr; - - std::string query = - "SELECT id, name, iso2, country_id FROM states ORDER BY name"; - if (limit > 0) { - query += " LIMIT " + std::to_string(limit); - } - - int rc = sqlite3_prepare_v2(db_, query.c_str(), -1, &stmt, nullptr); - - if (rc != SQLITE_OK) { - throw std::runtime_error("Failed to prepare states query"); - } - - while (sqlite3_step(stmt) == SQLITE_ROW) { - int id = sqlite3_column_int(stmt, 0); - const char* name = - reinterpret_cast(sqlite3_column_text(stmt, 1)); - const char* iso2 = - reinterpret_cast(sqlite3_column_text(stmt, 2)); - int country_id = sqlite3_column_int(stmt, 3); - states.push_back({id, name ? std::string(name) : "", - iso2 ? 
std::string(iso2) : "", country_id}); - } - - sqlite3_finalize(stmt); - return states; -} diff --git a/pipeline/src/json_handling/json_loader.cpp b/pipeline/src/json_handling/json_loader.cpp index c535358..c4d4a82 100644 --- a/pipeline/src/json_handling/json_loader.cpp +++ b/pipeline/src/json_handling/json_loader.cpp @@ -2,66 +2,82 @@ #include -#include +#include -#include "json_handling/stream_parser.h" +#include +#include +#include -void JsonLoader::LoadWorldCities(const std::string& json_path, - SqliteDatabase& db) { - constexpr size_t kBatchSize = 10000; +namespace { - auto startTime = std::chrono::high_resolution_clock::now(); - spdlog::info("\nLoading {} (streaming Boost.JSON SAX)...", json_path); +auto ReadRequiredString(const boost::json::object& object, + const char* key) -> std::string { + const boost::json::value* value = object.if_contains(key); + if (value == nullptr || !value->is_string()) { + throw std::runtime_error(std::string("Missing or invalid string field: ") + + key); + } + return std::string(value->as_string().c_str()); +} - db.BeginTransaction(); - bool transactionOpen = true; +auto ReadRequiredNumber(const boost::json::object& object, const char* key) + -> double { + const boost::json::value* value = object.if_contains(key); + if (value == nullptr || !value->is_number()) { + throw std::runtime_error(std::string("Missing or invalid numeric field: ") + + key); + } + return value->to_number(); +} - size_t citiesProcessed = 0; - try { - StreamingJsonParser::Parse( - json_path, db, - [&](const CityRecord& record) { - db.InsertCity(record.id, record.state_id, record.country_id, - record.name, record.latitude, record.longitude); - ++citiesProcessed; +} // namespace - if (citiesProcessed % kBatchSize == 0) { - db.CommitTransaction(); - db.BeginTransaction(); - } - }, - [&](size_t current, size_t /*total*/) { - if (current % kBatchSize == 0 && current > 0) { - spdlog::info(" [Progress] Parsed {} cities...", current); - } - }); - - spdlog::info(" OK: 
Parsed all cities from JSON"); - - if (transactionOpen) { - db.CommitTransaction(); - transactionOpen = false; - } - } catch (...) { - if (transactionOpen) { - db.RollbackTransaction(); - transactionOpen = false; - } - throw; +auto JsonLoader::LoadLocations(const std::string& filepath) + -> std::vector { + std::ifstream input(filepath); + if (!input.is_open()) { + throw std::runtime_error("Failed to open locations file: " + filepath); } - auto endTime = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast( - endTime - startTime); + std::stringstream buffer; + buffer << input.rdbuf(); + const std::string content = buffer.str(); - spdlog::info("\n=== World City Data Loading Summary ===\n"); - spdlog::info("Cities inserted: {}", citiesProcessed); - spdlog::info("Elapsed time: {} ms", duration.count()); - long long throughput = - (citiesProcessed > 0 && duration.count() > 0) - ? (1000LL * static_cast(citiesProcessed)) / - static_cast(duration.count()) - : 0LL; - spdlog::info("Throughput: {} cities/sec", throughput); - spdlog::info("=======================================\n"); + boost::json::error_code error; + boost::json::value root = boost::json::parse(content, error); + if (error) { + throw std::runtime_error("Failed to parse locations JSON: " + + error.message()); + } + + if (!root.is_array()) { + throw std::runtime_error( + "Invalid locations JSON: root element must be an array"); + } + + std::vector locations; + const auto& items = root.as_array(); + locations.reserve(items.size()); + + for (const auto& item : items) { + if (!item.is_object()) { + throw std::runtime_error( + "Invalid locations JSON: each entry must be an object"); + } + + const auto& object = item.as_object(); + locations.push_back(Location{ + .city = ReadRequiredString(object, "city"), + .state_province = ReadRequiredString(object, "state_province"), + .iso3166_2 = ReadRequiredString(object, "iso3166_2"), + .country = ReadRequiredString(object, "country"), + 
.iso3166_1 = ReadRequiredString(object, "iso3166_1"), + .latitude = ReadRequiredNumber(object, "latitude"), + .longitude = ReadRequiredNumber(object, "longitude"), + }); + } + + spdlog::info("[JsonLoader] Loaded {} locations from {}", locations.size(), + filepath); + return locations; } diff --git a/pipeline/src/json_handling/stream_parser.cpp b/pipeline/src/json_handling/stream_parser.cpp deleted file mode 100644 index 68dcf0e..0000000 --- a/pipeline/src/json_handling/stream_parser.cpp +++ /dev/null @@ -1,289 +0,0 @@ -#include "json_handling/stream_parser.h" - -#include - -#include -#include -#include -#include - -#include "database/database.h" - -class CityRecordHandler { - friend class boost::json::basic_parser; - - public: - static constexpr std::size_t max_array_size = static_cast(-1); - static constexpr std::size_t max_object_size = static_cast(-1); - static constexpr std::size_t max_string_size = static_cast(-1); - static constexpr std::size_t max_key_size = static_cast(-1); - - struct ParseContext { - SqliteDatabase* db = nullptr; - std::function on_city; - std::function on_progress; - size_t cities_emitted = 0; - size_t total_file_size = 0; - int countries_inserted = 0; - int states_inserted = 0; - }; - - explicit CityRecordHandler(ParseContext& ctx) : context(ctx) {} - - private: - ParseContext& context; - - int depth = 0; - bool in_countries_array = false; - bool in_country_object = false; - bool in_states_array = false; - bool in_state_object = false; - bool in_cities_array = false; - bool building_city = false; - - int current_country_id = 0; - int current_state_id = 0; - CityRecord current_city = {}; - std::string current_key; - std::string current_key_val; - std::string current_string_val; - - std::string country_info[3]; - std::string state_info[2]; - - // Boost.JSON SAX Hooks - bool on_document_begin(boost::system::error_code&) { return true; } - bool on_document_end(boost::system::error_code&) { return true; } - - bool 
on_array_begin(boost::system::error_code&) { - depth++; - if (depth == 1) { - in_countries_array = true; - } else if (depth == 3 && current_key == "states") { - in_states_array = true; - } else if (depth == 5 && current_key == "cities") { - in_cities_array = true; - } - return true; - } - - bool on_array_end(std::size_t, boost::system::error_code&) { - if (depth == 1) { - in_countries_array = false; - } else if (depth == 3) { - in_states_array = false; - } else if (depth == 5) { - in_cities_array = false; - } - depth--; - return true; - } - - bool on_object_begin(boost::system::error_code&) { - depth++; - if (depth == 2 && in_countries_array) { - in_country_object = true; - current_country_id = 0; - country_info[0].clear(); - country_info[1].clear(); - country_info[2].clear(); - } else if (depth == 4 && in_states_array) { - in_state_object = true; - current_state_id = 0; - state_info[0].clear(); - state_info[1].clear(); - } else if (depth == 6 && in_cities_array) { - building_city = true; - current_city = {}; - } - return true; - } - - bool on_object_end(std::size_t, boost::system::error_code&) { - if (depth == 6 && building_city) { - if (current_city.id > 0 && current_state_id > 0 && - current_country_id > 0) { - current_city.state_id = current_state_id; - current_city.country_id = current_country_id; - - try { - context.on_city(current_city); - context.cities_emitted++; - - if (context.on_progress && context.cities_emitted % 10000 == 0) { - context.on_progress(context.cities_emitted, - context.total_file_size); - } - } catch (const std::exception& e) { - spdlog::warn("Record parsing failed: {}", e.what()); - } - } - building_city = false; - } else if (depth == 4 && in_state_object) { - if (current_state_id > 0 && current_country_id > 0) { - try { - context.db->InsertState(current_state_id, current_country_id, - state_info[0], state_info[1]); - context.states_inserted++; - } catch (const std::exception& e) { - spdlog::warn("Record parsing failed: {}", e.what()); - 
} - } - in_state_object = false; - } else if (depth == 2 && in_country_object) { - if (current_country_id > 0) { - try { - context.db->InsertCountry(current_country_id, country_info[0], - country_info[1], country_info[2]); - context.countries_inserted++; - } catch (const std::exception& e) { - spdlog::warn("Record parsing failed: {}", e.what()); - } - } - in_country_object = false; - } - - depth--; - return true; - } - - bool on_key_part(boost::json::string_view s, std::size_t, - boost::system::error_code&) { - current_key_val.append(s.data(), s.size()); - return true; - } - - bool on_key(boost::json::string_view s, std::size_t, - boost::system::error_code&) { - current_key_val.append(s.data(), s.size()); - current_key = current_key_val; - current_key_val.clear(); - return true; - } - - bool on_string_part(boost::json::string_view s, std::size_t, - boost::system::error_code&) { - current_string_val.append(s.data(), s.size()); - return true; - } - - bool on_string(boost::json::string_view s, std::size_t, - boost::system::error_code&) { - current_string_val.append(s.data(), s.size()); - - if (building_city && current_key == "name") { - current_city.name = current_string_val; - } else if (in_state_object && current_key == "name") { - state_info[0] = current_string_val; - } else if (in_state_object && current_key == "iso2") { - state_info[1] = current_string_val; - } else if (in_country_object && current_key == "name") { - country_info[0] = current_string_val; - } else if (in_country_object && current_key == "iso2") { - country_info[1] = current_string_val; - } else if (in_country_object && current_key == "iso3") { - country_info[2] = current_string_val; - } - - current_string_val.clear(); - return true; - } - - bool on_number_part(boost::json::string_view, boost::system::error_code&) { - return true; - } - - bool on_int64(int64_t i, boost::json::string_view, - boost::system::error_code&) { - if (building_city && current_key == "id") { - current_city.id = 
static_cast(i); - } else if (in_state_object && current_key == "id") { - current_state_id = static_cast(i); - } else if (in_country_object && current_key == "id") { - current_country_id = static_cast(i); - } - return true; - } - - bool on_uint64(uint64_t u, boost::json::string_view, - boost::system::error_code& ec) { - return on_int64(static_cast(u), "", ec); - } - - bool on_double(double d, boost::json::string_view, - boost::system::error_code&) { - if (building_city) { - if (current_key == "latitude") { - current_city.latitude = d; - } else if (current_key == "longitude") { - current_city.longitude = d; - } - } - return true; - } - - bool on_bool(bool, boost::system::error_code&) { return true; } - bool on_null(boost::system::error_code&) { return true; } - bool on_comment_part(boost::json::string_view, boost::system::error_code&) { - return true; - } - bool on_comment(boost::json::string_view, boost::system::error_code&) { - return true; - } -}; - -void StreamingJsonParser::Parse( - const std::string& file_path, SqliteDatabase& db, - std::function on_city, - std::function on_progress) { - spdlog::info(" Streaming parse of {} (Boost.JSON)...", file_path); - - FILE* file = std::fopen(file_path.c_str(), "rb"); - if (!file) { - throw std::runtime_error("Failed to open JSON file: " + file_path); - } - - size_t total_size = 0; - if (std::fseek(file, 0, SEEK_END) == 0) { - long file_size = std::ftell(file); - if (file_size > 0) { - total_size = static_cast(file_size); - } - std::rewind(file); - } - - CityRecordHandler::ParseContext ctx{&db, on_city, on_progress, 0, total_size, - 0, 0}; - boost::json::basic_parser parser( - boost::json::parse_options{}, ctx); - - char buf[65536]; - size_t bytes_read; - boost::system::error_code ec; - - while ((bytes_read = std::fread(buf, 1, sizeof(buf), file)) > 0) { - char const* p = buf; - std::size_t remain = bytes_read; - - while (remain > 0) { - std::size_t consumed = parser.write_some(true, p, remain, ec); - if (ec) { - 
std::fclose(file); - throw std::runtime_error("JSON parse error: " + ec.message()); - } - p += consumed; - remain -= consumed; - } - } - - parser.write_some(false, nullptr, 0, ec); // Signal EOF - std::fclose(file); - - if (ec) { - throw std::runtime_error("JSON parse error at EOF: " + ec.message()); - } - - spdlog::info(" OK: Parsed {} countries, {} states, {} cities", - ctx.countries_inserted, ctx.states_inserted, - ctx.cities_emitted); -} diff --git a/pipeline/src/main.cpp b/pipeline/src/main.cpp index 0147ba9..2f75fa2 100644 --- a/pipeline/src/main.cpp +++ b/pipeline/src/main.cpp @@ -5,7 +5,6 @@ #include #include "biergarten_data_generator.h" -#include "database/database.h" #include "web_client/curl_web_client.h" namespace po = boost::program_options; @@ -122,9 +121,8 @@ int main(int argc, char* argv[]) { } auto webClient = std::make_shared(); - SqliteDatabase database; - BiergartenDataGenerator generator(options, webClient, database); + BiergartenDataGenerator generator(options, webClient); return generator.Run(); } catch (const std::exception& e) {