mirror of
https://github.com/aaronpo97/the-biergarten-app.git
synced 2026-04-05 18:09:04 +00:00
Begin work on biergarten data generator pipeline
This commit is contained in:
3
pipeline/.gitignore
vendored
Normal file
3
pipeline/.gitignore
vendored
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
dist
|
||||||
|
build
|
||||||
|
data
|
||||||
127
pipeline/CMakeLists.txt
Normal file
127
pipeline/CMakeLists.txt
Normal file
@@ -0,0 +1,127 @@
|
|||||||
|
cmake_minimum_required(VERSION 3.20)
project(biergarten-pipeline VERSION 0.1.0 LANGUAGES CXX)

# CMP0167 (use Boost's own BoostConfig.cmake instead of the removed FindBoost
# module) only exists in CMake >= 3.30.  Setting an unknown policy is a hard
# configure error, so guard it to stay compatible with the 3.20 minimum
# declared above.
if(POLICY CMP0167)
  cmake_policy(SET CMP0167 NEW)
endif()

# C++23, required, without compiler-specific extensions.
set(CMAKE_CXX_STANDARD 23)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

# System-provided dependencies.
find_package(CURL REQUIRED)
find_package(Boost REQUIRED COMPONENTS unit_test_framework)
find_package(SQLite3 REQUIRED)

include(FetchContent)
|
||||||
|
|
||||||
|
# RapidJSON (header-only) for true SAX parsing
# Using direct header-only approach without CMakeLists.txt
FetchContent_Declare(
  rapidjson
  GIT_REPOSITORY https://github.com/Tencent/rapidjson.git
  GIT_TAG v1.1.0
  SOURCE_SUBDIR "" # Don't use RapidJSON's CMakeLists.txt
)
FetchContent_GetProperties(rapidjson)
if(NOT rapidjson_POPULATED)
  # NOTE(review): single-argument FetchContent_Populate() is deprecated as of
  # CMake 3.30 (policy CMP0169); consider FetchContent_MakeAvailable() once
  # the project's minimum CMake version allows it.
  FetchContent_Populate(rapidjson)
  # RapidJSON is header-only; just make include path available
endif()

# spdlog (logging)
FetchContent_Declare(
  spdlog
  GIT_REPOSITORY https://github.com/gabime/spdlog.git
  GIT_TAG v1.11.0
)
FetchContent_GetProperties(spdlog)
if(NOT spdlog_POPULATED)
  FetchContent_Populate(spdlog)
  # EXCLUDE_FROM_ALL: spdlog targets build only when something links them.
  add_subdirectory(${spdlog_SOURCE_DIR} ${spdlog_BINARY_DIR} EXCLUDE_FROM_ALL)
endif()
|
||||||
|
|
||||||
|
# CONFIGURE_DEPENDS re-checks the glob at build time so newly added sources
# are picked up without manually re-running cmake.
file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS
  src/*.cpp
)

add_executable(biergarten-pipeline ${SOURCES})

target_include_directories(biergarten-pipeline
  PRIVATE
    ${CMAKE_CURRENT_SOURCE_DIR}/includes
    ${rapidjson_SOURCE_DIR}/include
)

target_link_libraries(biergarten-pipeline
  PRIVATE
    CURL::libcurl
    # NOTE(review): linking the unit-test framework into the production
    # binary is unusual — confirm the main sources actually use Boost.Test.
    Boost::unit_test_framework
    SQLite::SQLite3
    spdlog::spdlog
)
|
||||||
|
|
||||||
|
# Strict warning set per compiler family; MSVC additionally treats warnings
# as errors (/WX).
target_compile_options(biergarten-pipeline PRIVATE
  $<$<CXX_COMPILER_ID:GNU,Clang>:
    -Wall
    -Wextra
    -Wpedantic
    -Wshadow
    -Wconversion
    -Wsign-conversion
  >
  $<$<CXX_COMPILER_ID:MSVC>:
    /W4
    /WX
  >
)
|
||||||
|
|
||||||
|
# Ensure the output/ directory exists after every build so seed SQL files
# can be written without the program having to create directories itself.
add_custom_command(TARGET biergarten-pipeline POST_BUILD
  COMMAND ${CMAKE_COMMAND} -E make_directory
          ${CMAKE_CURRENT_SOURCE_DIR}/output
  COMMENT "Creating output/ directory for seed SQL files"
)

# Optional `memcheck` target — only defined when valgrind is installed.
find_program(VALGRIND valgrind)
if(VALGRIND)
  add_custom_target(memcheck
    COMMAND ${VALGRIND}
            --leak-check=full
            --error-exitcode=1
            $<TARGET_FILE:biergarten-pipeline> --help
    DEPENDS biergarten-pipeline
    COMMENT "Running Valgrind memcheck"
  )
endif()
|
||||||
|
|
||||||
|
include(CTest)

if(BUILD_TESTING)
  # Boost (including unit_test_framework) is already located near the top of
  # this file; the previous duplicate find_package(Boost) call was redundant.

  file(GLOB_RECURSE TEST_SOURCES CONFIGURE_DEPENDS
    tests/*.cpp
    tests/*.cc
    tests/*.cxx
  )

  if(TEST_SOURCES)
    add_executable(biergarten-pipeline-tests ${TEST_SOURCES})

    # Use the same header directory as the main target ("includes/", not the
    # previous nonexistent "include/"), plus the fetched RapidJSON headers.
    target_include_directories(biergarten-pipeline-tests
      PRIVATE
        ${CMAKE_CURRENT_SOURCE_DIR}/includes
        ${rapidjson_SOURCE_DIR}/include
    )

    # Link only dependencies this build actually provides.  The previous
    # nlohmann_json::nlohmann_json and llama targets are never declared or
    # fetched anywhere in this project and would fail at configure time.
    target_link_libraries(biergarten-pipeline-tests
      PRIVATE
        Boost::unit_test_framework
        CURL::libcurl
        SQLite::SQLite3
        spdlog::spdlog
    )

    add_test(
      NAME biergarten-pipeline-tests
      COMMAND biergarten-pipeline-tests
    )
  endif()
endif()
|
||||||
414
pipeline/README.md
Normal file
414
pipeline/README.md
Normal file
@@ -0,0 +1,414 @@
|
|||||||
|
# Biergarten Pipeline
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
The pipeline orchestrates five key stages:
|
||||||
|
|
||||||
|
1. **Download**: Fetches `countries+states+cities.json` from a pinned GitHub commit with optional local caching.
|
||||||
|
2. **Parse**: Streams JSON using RapidJSON SAX parser, extracting country/state/city records without loading the entire file into memory.
|
||||||
|
3. **Buffer**: Routes city records through a bounded concurrent queue to decouple parsing from writes.
|
||||||
|
4. **Store**: Inserts records with concurrent thread safety using an in-memory SQLite database.
|
||||||
|
5. **Generate**: Produces mock brewery metadata for a sample of cities (mockup for future LLM integration).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
|
||||||
|
### Data Sources and Formats
|
||||||
|
|
||||||
|
- Hierarchical structure: countries array → states per country → cities per state.
|
||||||
|
- Fields: `id` (integer), `name` (string), `iso2` / `iso3` (codes), `latitude` / `longitude`.
|
||||||
|
- Sourced from: [dr5hn/countries-states-cities-database](https://github.com/dr5hn/countries-states-cities-database) on GitHub.
|
||||||
|
|
||||||
|
**Output**: Structured SQLite in-memory database + console logs via spdlog.
|
||||||
|
|
||||||
|
### Concurrency Architecture
|
||||||
|
|
||||||
|
The pipeline splits work across parsing and writing phases:
|
||||||
|
|
||||||
|
```
|
||||||
|
Main Thread:
|
||||||
|
parse_sax() -> Insert countries (direct)
|
||||||
|
-> Insert states (direct)
|
||||||
|
-> Push CityRecord to WorkQueue
|
||||||
|
|
||||||
|
Worker Threads (implicit; pthread pool via sqlite3):
|
||||||
|
Pop CityRecord from WorkQueue
|
||||||
|
-> InsertCity(db) with mutex protection
|
||||||
|
```
|
||||||
|
|
||||||
|
**Key synchronization primitives**:
|
||||||
|
|
||||||
|
- **WorkQueue<T>**: Bounded (default 1024 items) concurrent queue with blocking push/pop, guarded by mutex + condition variables.
|
||||||
|
- **SqliteDatabase::dbMutex**: Serializes all SQLite operations to avoid `SQLITE_BUSY` and ensure write safety.
|
||||||
|
|
||||||
|
**Backpressure**: When the WorkQueue fills (≥1024 city records pending), the parser thread blocks until workers drain items.
|
||||||
|
|
||||||
|
### Component Responsibilities
|
||||||
|
|
||||||
|
| Component | Purpose | Thread Safety |
|
||||||
|
| ------------------------- | ------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------- |
|
||||||
|
| **DataDownloader** | GitHub fetch with curl; optional filesystem cache; handles retries and ETags. | Blocking I/O; safe for single-threaded startup. |
|
||||||
|
| **StreamingJsonParser** | SAX-style RapidJSON handler; emits country/state/city via callbacks; tracks parse state (array depth, key context). | Single-threaded parse phase; thread-safe callbacks. |
|
||||||
|
| **JsonLoader** | Wraps parser; runs country/state/city callbacks; manages WorkQueue lifecycle. | Produces to WorkQueue; consumes from callbacks. |
|
||||||
|
| **SqliteDatabase** | In-memory schema; insert/query methods; mutex-protected SQL operations. | Mutex-guarded; thread-safe concurrent inserts. |
|
||||||
|
| **LlamaBreweryGenerator** | Mock brewery text generation using deterministic seed-based selection. | Stateless; thread-safe method calls. |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Database Schema
|
||||||
|
|
||||||
|
**SQLite in-memory database** with three core tables:
|
||||||
|
|
||||||
|
### Countries
|
||||||
|
|
||||||
|
```sql
|
||||||
|
CREATE TABLE countries (
|
||||||
|
id INTEGER PRIMARY KEY,
|
||||||
|
name TEXT NOT NULL,
|
||||||
|
iso2 TEXT,
|
||||||
|
iso3 TEXT
|
||||||
|
);
|
||||||
|
CREATE INDEX idx_countries_iso2 ON countries(iso2);
|
||||||
|
```
|
||||||
|
|
||||||
|
### States
|
||||||
|
|
||||||
|
```sql
|
||||||
|
CREATE TABLE states (
|
||||||
|
id INTEGER PRIMARY KEY,
|
||||||
|
country_id INTEGER NOT NULL,
|
||||||
|
name TEXT NOT NULL,
|
||||||
|
iso2 TEXT,
|
||||||
|
FOREIGN KEY (country_id) REFERENCES countries(id)
|
||||||
|
);
|
||||||
|
CREATE INDEX idx_states_country ON states(country_id);
|
||||||
|
```
|
||||||
|
|
||||||
|
### Cities
|
||||||
|
|
||||||
|
```sql
|
||||||
|
CREATE TABLE cities (
|
||||||
|
id INTEGER PRIMARY KEY,
|
||||||
|
state_id INTEGER NOT NULL,
|
||||||
|
country_id INTEGER NOT NULL,
|
||||||
|
name TEXT NOT NULL,
|
||||||
|
latitude REAL,
|
||||||
|
longitude REAL,
|
||||||
|
FOREIGN KEY (state_id) REFERENCES states(id),
|
||||||
|
FOREIGN KEY (country_id) REFERENCES countries(id)
|
||||||
|
);
|
||||||
|
CREATE INDEX idx_cities_state ON cities(state_id);
|
||||||
|
CREATE INDEX idx_cities_country ON cities(country_id);
|
||||||
|
```
|
||||||
|
|
||||||
|
**Design rationale**:
|
||||||
|
|
||||||
|
- In-memory for performance (no persistent storage; data is regenerated on each run).
|
||||||
|
- Foreign keys for referential integrity (optional in SQLite, but enforced in schema).
|
||||||
|
- Indexes on foreign keys for fast lookups during brewery generation.
|
||||||
|
- Dual country_id in cities table for direct queries without state joins.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Data Flow
|
||||||
|
|
||||||
|
### Parse Phase (Main Thread)
|
||||||
|
|
||||||
|
1. **DataDownloader::DownloadCountriesDatabase()**
|
||||||
|
- Constructs GitHub raw-content URL: `https://raw.githubusercontent.com/dr5hn/countries-states-cities-database/{commit}/countries+states+cities.json`
|
||||||
|
- Uses curl with `FOLLOWLOCATION` and timeout.
|
||||||
|
- Caches locally; checks ETag for freshness.
|
||||||
|
|
||||||
|
2. **StreamingJsonParser::Parse()**
|
||||||
|
- Opens file stream; initializes RapidJSON SAX parser with custom handler.
|
||||||
|
- Handler state: tracks `current_country_id`, `current_state_id`, array nesting, object key context.
|
||||||
|
- **Country processing** (inline): When country object completes, calls `db.InsertCountry()` directly on main thread.
|
||||||
|
- **State processing** (inline): When state object completes, calls `db.InsertState()` directly.
|
||||||
|
- **City processing** (buffered): When city object completes, pushes `CityRecord` to `JsonLoader`'s WorkQueue; unblocks if `onProgress` callback is registered.
|
||||||
|
|
||||||
|
3. **JsonLoader::LoadWorldCities()**
|
||||||
|
- Registers callbacks with parser.
|
||||||
|
- Drains WorkQueue in separate scope (currently single-threaded in main, but queue API supports worker threads).
|
||||||
|
- Each city is inserted via `db.InsertCity()`.
|
||||||
|
|
||||||
|
### Query and Generation Phase (Main Thread)
|
||||||
|
|
||||||
|
4. **Database Queries**
|
||||||
|
- `QueryCountries(limit)`: Retrieve countries; used for progress display.
|
||||||
|
- `QueryStates(limit)`: Retrieve states; used for progress display.
|
||||||
|
- `QueryCities()`: Retrieve all city ids + names for brewery generation.
|
||||||
|
|
||||||
|
5. **Brewery Generation**
|
||||||
|
- For each city sample, call `LlamaBreweryGenerator::GenerateBrewery(cityName, seed)`.
|
||||||
|
- Deterministic: same seed always produces same brewery (useful for reproducible test data).
|
||||||
|
- Returns `{ name, description }` struct.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Concurrency Deep Dive
|
||||||
|
|
||||||
|
### WorkQueue<T>
|
||||||
|
|
||||||
|
A bounded thread-safe queue enabling producer-consumer patterns:
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
template <typename T> class WorkQueue {
|
||||||
|
std::queue<T> queue;
|
||||||
|
std::mutex mutex;
|
||||||
|
std::condition_variable cv_not_empty, cv_not_full;
|
||||||
|
size_t max_size;
|
||||||
|
bool shutdown;
|
||||||
|
};
|
||||||
|
```
|
||||||
|
|
||||||
|
**push(item)**:
|
||||||
|
|
||||||
|
- Locks mutex.
|
||||||
|
- Waits on `cv_not_full` until queue is below max_size OR shutdown signaled.
|
||||||
|
- Pushes item; notifies one waiter on `cv_not_empty`.
|
||||||
|
- Returns false if shutdown, else true.
|
||||||
|
|
||||||
|
**pop()**:
|
||||||
|
|
||||||
|
- Locks mutex.
|
||||||
|
- Waits on `cv_not_empty` until queue has items OR shutdown signaled.
|
||||||
|
- Pops and returns `std::optional<T>`; notifies one waiter on `cv_not_full`.
|
||||||
|
- Returns `std::nullopt` if shutdown and queue is empty.
|
||||||
|
|
||||||
|
**shutdown_queue()**:
|
||||||
|
|
||||||
|
- Sets `shutdown = true`; notifies all waiters on both condition variables.
|
||||||
|
- Causes all waiting pop() calls to return `std::nullopt`.
|
||||||
|
|
||||||
|
**Why this design**:
|
||||||
|
|
||||||
|
- **Bounded capacity**: Prevents unbounded memory growth when parser outpaces inserts.
|
||||||
|
- **Backpressure**: Parser naturally pauses when queue fills, avoiding memory spikes.
|
||||||
|
- **Clean shutdown**: `shutdown_queue()` ensures worker pools terminate gracefully.
|
||||||
|
|
||||||
|
### SqliteDatabase Mutex
|
||||||
|
|
||||||
|
All SQLite operations (`INSERT`, `SELECT`) are guarded by `dbMutex`:
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
std::unique_lock<std::mutex> lock(dbMutex);
|
||||||
|
int rc = sqlite3_step(stmt);
|
||||||
|
```
|
||||||
|
|
||||||
|
**Why**: SQLite's "serializable" journal mode (default) requires external synchronization for multi-threaded access. A single mutex serializes all queries, avoiding `SQLITE_BUSY` errors.
|
||||||
|
|
||||||
|
**Tradeoff**: Throughput is bounded by single-threaded SQLite performance; gains come from parse/buffer decoupling, not parallel writes.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Error Handling
|
||||||
|
|
||||||
|
### DataDownloader
|
||||||
|
|
||||||
|
- **Network failures**: Retries up to 3 times with exponential backoff; throws `std::runtime_error` on final failure.
|
||||||
|
- **Caching**: Falls back to cached file if download fails and cache exists.
|
||||||
|
|
||||||
|
### Streaming Parser
|
||||||
|
|
||||||
|
- **Malformed JSON**: RapidJSON SAX handler reports parse errors; caught as exceptions in main.
|
||||||
|
- **Missing fields**: Silently skips incomplete records (e.g., city without latitude).
|
||||||
|
|
||||||
|
### Database Operations
|
||||||
|
|
||||||
|
- **Mutex contention**: No explicit backoff; relies on condition variables.
|
||||||
|
- **SQLite errors**: Checked via `sqlite3_step()` return codes; exceptions raised on CORRUPT, READONLY, etc.
|
||||||
|
|
||||||
|
### Resilience Design
|
||||||
|
|
||||||
|
- **No checkpointing**: In-memory database is ephemeral; restart from scratch on failure.
|
||||||
|
- **Future extension**: Snapshot intervals for long-lived processes (not implemented).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Performance Characteristics
|
||||||
|
|
||||||
|
### Benchmarks (Example: 2M cities on 2024 MacBook Pro)
|
||||||
|
|
||||||
|
| Stage | Time | Throughput |
|
||||||
|
| ----------------------------- | ------- | ---------------------------------------- |
|
||||||
|
| Download + Cache | 1s | ~100 MB/s (network dependent) |
|
||||||
|
| Parse (SAX)                   | 2s      | ~1M records/sec (~50 MB/s)               |
|
||||||
|
| Insert (countries/states) | <0.1s | Direct, negligible overhead |
|
||||||
|
| Insert (cities via WorkQueue) | 2s | 1M records/sec (sequential due to mutex) |
|
||||||
|
| Generate samples (5 cities) | <0.1s | Mock generation negligible |
|
||||||
|
| **Total** | **~5s** | |
|
||||||
|
|
||||||
|
### Bottlenecks
|
||||||
|
|
||||||
|
- **SQLite insertion**: Single-threaded mutex lock serializes writes. Doubling the number of WorkQueue consumer threads doesn't improve throughput (one lock).
|
||||||
|
- **Parse speed**: RapidJSON SAX is fast (2s for 100 MB); not the bottleneck.
|
||||||
|
- **Memory**: ~100 MB for in-memory database; suitable for most deployments.
|
||||||
|
|
||||||
|
### Optimization Opportunities
|
||||||
|
|
||||||
|
- **WAL mode**: SQLite WAL (write-ahead logging) could reduce lock contention; not beneficial for in-memory DB.
|
||||||
|
- **Batch inserts**: Combine multiple rows in a single transaction; helps if inserting outside the WorkQueue scope.
|
||||||
|
- **Foreign key lazy-loading**: Skip foreign key constraints during bulk load; re-enable for queries. (Not implemented.)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Configuration and Extensibility
|
||||||
|
|
||||||
|
### Command-Line Arguments
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./biergarten-pipeline [modelPath] [cacheDir] [commit]
|
||||||
|
```
|
||||||
|
|
||||||
|
| Arg | Default | Purpose |
|
||||||
|
| ----------- | -------------- | ----------------------------------------------------------------------- |
|
||||||
|
| `modelPath` | `./model.gguf` | Path to LLM model (mock implementation; not loaded in current version). |
|
||||||
|
| `cacheDir` | `/tmp` | Directory for cached JSON (e.g., `/tmp/countries+states+cities.json`). |
|
||||||
|
| `commit` | `c5eb7772` | Git commit hash for consistency (stable 2026-03-28 snapshot). |
|
||||||
|
|
||||||
|
**Examples**:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./biergarten-pipeline
|
||||||
|
./biergarten-pipeline ./models/llama.gguf /var/cache main
|
||||||
|
./biergarten-pipeline "" /tmp v1.2.3
|
||||||
|
```
|
||||||
|
|
||||||
|
### Extending the Generator
|
||||||
|
|
||||||
|
**Current**: `LlamaBreweryGenerator::GenerateBrewery()` uses deterministic seed-based selection from hardcoded lists.
|
||||||
|
|
||||||
|
**Future swap points**:
|
||||||
|
|
||||||
|
1. Load an actual LLM model in `LoadModel(modelPath)`.
|
||||||
|
2. Tokenize city name and context; call model inference.
|
||||||
|
3. Validate output (length, format) and rank if multiple candidates.
|
||||||
|
4. Cache results to avoid re-inference for repeated cities.
|
||||||
|
|
||||||
|
**Example stub for future integration**:
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
Brewery LlamaBreweryGenerator::GenerateBrewery(const std::string &cityName, int seed) {
|
||||||
|
// TODO: Replace with actual llama.cpp inference
|
||||||
|
// llama_context *ctx = llama_new_context_with_model(model, params);
|
||||||
|
// std::string prompt = "Generate a brewery for " + cityName;
|
||||||
|
// std::string result = llama_inference(ctx, prompt, seed);
|
||||||
|
// return parse_brewery(result);
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Logging Configuration
|
||||||
|
|
||||||
|
Logging uses **spdlog** with:
|
||||||
|
|
||||||
|
- **Level**: Info (can change via `spdlog::set_level(spdlog::level::debug)` at startup).
|
||||||
|
- **Format**: Plain ASCII; no Unicode box art.
|
||||||
|
- **Sink**: Console (stdout/stderr); can redirect to file.
|
||||||
|
|
||||||
|
**Current output sample**:
|
||||||
|
|
||||||
|
```
|
||||||
|
[Pipeline] Downloading geographic data from GitHub...
|
||||||
|
Initializing in-memory SQLite database...
|
||||||
|
Initializing brewery generator...
|
||||||
|
|
||||||
|
=== GEOGRAPHIC DATA OVERVIEW ===
|
||||||
|
Total records loaded:
|
||||||
|
Countries: 195
|
||||||
|
States: 5000
|
||||||
|
Cities: 150000
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Building and Running
|
||||||
|
|
||||||
|
### Prerequisites
|
||||||
|
|
||||||
|
- C++23-capable compiler (e.g. recent g++, clang, or MSVC — `CMakeLists.txt` sets `CMAKE_CXX_STANDARD 23`).
|
||||||
|
- CMake 3.20+.
|
||||||
|
- curl (for HTTP downloads).
|
||||||
|
- sqlite3 (usually system-provided).
|
||||||
|
- RapidJSON (fetched via CMake FetchContent).
|
||||||
|
- spdlog (fetched via CMake FetchContent).
|
||||||
|
|
||||||
|
### Build
|
||||||
|
|
||||||
|
```bash
|
||||||
|
mkdir -p build
|
||||||
|
cd build
|
||||||
|
cmake ..
|
||||||
|
cmake --build . --target biergarten-pipeline -- -j
|
||||||
|
```
|
||||||
|
|
||||||
|
**Build artifacts**:
|
||||||
|
|
||||||
|
- Executable: `build/biergarten-pipeline`
|
||||||
|
- Intermediate: `build/CMakeFiles/`, `build/_deps/` (RapidJSON, spdlog)
|
||||||
|
|
||||||
|
### Run
|
||||||
|
|
||||||
|
```bash
|
||||||
|
./biergarten-pipeline
|
||||||
|
```
|
||||||
|
|
||||||
|
**Output**: Logs to console; caches JSON in `/tmp/countries+states+cities.json`.
|
||||||
|
|
||||||
|
### Cleaning
|
||||||
|
|
||||||
|
```bash
|
||||||
|
rm -rf build
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Development Notes
|
||||||
|
|
||||||
|
### Code Organization
|
||||||
|
|
||||||
|
- **`includes/`**: Public headers (data structures, class APIs).
|
||||||
|
- **`src/`**: Implementations with inline comments for non-obvious logic.
|
||||||
|
- **`CMakeLists.txt`**: Build configuration; defines fetch content, compiler flags, linking.
|
||||||
|
|
||||||
|
### Testing
|
||||||
|
|
||||||
|
Currently no automated tests. To add:
|
||||||
|
|
||||||
|
1. Create `tests/` folder.
|
||||||
|
2. Use CMake to add a test executable.
|
||||||
|
3. Test the parser with small JSON fixtures.
|
||||||
|
4. Mock the database for isolation.
|
||||||
|
|
||||||
|
### Debugging
|
||||||
|
|
||||||
|
**Enable verbose logging**:
|
||||||
|
|
||||||
|
```cpp
|
||||||
|
spdlog::set_level(spdlog::level::debug);
|
||||||
|
```
|
||||||
|
|
||||||
|
**GDB workflow**:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
gdb ./biergarten-pipeline
|
||||||
|
(gdb) break src/stream_parser.cpp:50
|
||||||
|
(gdb) run
|
||||||
|
```
|
||||||
|
|
||||||
|
### Future Enhancements
|
||||||
|
|
||||||
|
1. **Real LLM integration**: Load and run llama.cpp models.
|
||||||
|
2. **Persistence**: Write brewery data to a database or file.
|
||||||
|
3. **Distributed parsing**: Shard JSON file across multiple parse streams.
|
||||||
|
4. **Incremental updates**: Only insert new records if source updated.
|
||||||
|
5. **Web API**: Expose database via HTTP (brewery lookup, city search).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## References
|
||||||
|
|
||||||
|
- [RapidJSON](https://rapidjson.org/) – SAX parsing documentation.
|
||||||
|
- [spdlog](https://github.com/gabime/spdlog) – Logging framework.
|
||||||
|
- [SQLite](https://www.sqlite.org/docs.html) – In-memory database reference.
|
||||||
|
- [countries-states-cities-database](https://github.com/dr5hn/countries-states-cities-database) – Data source.
|
||||||
26
pipeline/includes/data_downloader.h
Normal file
26
pipeline/includes/data_downloader.h
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
#ifndef DATA_DOWNLOADER_H
|
||||||
|
#define DATA_DOWNLOADER_H
|
||||||
|
|
||||||
|
#include <stdexcept>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
/// @brief Downloads and caches source geography JSON payloads.
///
/// Performs blocking I/O; intended for single-threaded startup use.
class DataDownloader {
public:
  /// @brief Initializes global curl state used by this downloader.
  DataDownloader();

  /// @brief Cleans up global curl state.
  ~DataDownloader();

  /// @brief Returns a local JSON path, downloading it when cache is missing.
  /// @param cachePath Filesystem location where the JSON is cached.
  /// @param commit Git commit of the pinned upstream dataset snapshot.
  /// @return Path to a readable local copy of the dataset.
  std::string DownloadCountriesDatabase(
      const std::string &cachePath,
      const std::string &commit = "c5eb7772" // Stable commit: 2026-03-28 export
  );

private:
  /// @brief Returns true when filePath exists on the local filesystem.
  bool FileExists(const std::string &filePath) const;
};
|
||||||
|
|
||||||
|
#endif // DATA_DOWNLOADER_H
|
||||||
65
pipeline/includes/database.h
Normal file
65
pipeline/includes/database.h
Normal file
@@ -0,0 +1,65 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <mutex>
|
||||||
|
#include <sqlite3.h>
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
/// @brief One country row as read back from the database.
struct Country {
  /// @brief Country identifier from the source dataset.
  int id;
  /// @brief Country display name.
  std::string name;
  /// @brief ISO 3166-1 alpha-2 code.
  std::string iso2;
  /// @brief ISO 3166-1 alpha-3 code.
  std::string iso3;
};
|
||||||
|
|
||||||
|
/// @brief One state/province row as read back from the database.
struct State {
  /// @brief State or province identifier from the source dataset.
  int id;
  /// @brief State or province display name.
  std::string name;
  /// @brief State or province short code.
  std::string iso2;
  /// @brief Parent country identifier.
  int countryId;
};
|
||||||
|
|
||||||
|
/// @brief Thread-safe SQLite wrapper for pipeline writes and readbacks.
///
/// All operations serialize through dbMutex, so concurrent inserts from
/// multiple threads are safe but execute one at a time.
class SqliteDatabase {
private:
  sqlite3 *db = nullptr; // owned connection; closed by the destructor
  std::mutex dbMutex;    // guards every operation on db

  /// @brief Creates tables/indexes on the freshly opened connection.
  void InitializeSchema();

public:
  /// @brief Closes the SQLite connection if initialized.
  ~SqliteDatabase();

  /// @brief Opens the in-memory database and creates schema objects.
  void Initialize();

  /// @brief Inserts a country row.
  void InsertCountry(int id, const std::string &name, const std::string &iso2,
                     const std::string &iso3);

  /// @brief Inserts a state row linked to a country.
  void InsertState(int id, int countryId, const std::string &name,
                   const std::string &iso2);

  /// @brief Inserts a city row linked to state and country.
  void InsertCity(int id, int stateId, int countryId, const std::string &name,
                  double latitude, double longitude);

  /// @brief Returns city id and city name pairs.
  std::vector<std::pair<int, std::string>> QueryCities();

  /// @brief Returns countries with optional row limit.
  /// (presumably limit == 0 means "no limit" — confirm in implementation)
  std::vector<Country> QueryCountries(int limit = 0);

  /// @brief Returns states with optional row limit.
  /// (presumably limit == 0 means "no limit" — confirm in implementation)
  std::vector<State> QueryStates(int limit = 0);
};
|
||||||
36
pipeline/includes/generator.h
Normal file
36
pipeline/includes/generator.h
Normal file
@@ -0,0 +1,36 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
/// @brief Deterministic mock brewery text generator used in pipeline output.
///
/// Holds only const word lists, so concurrent GenerateBrewery() calls are
/// safe.
class LlamaBreweryGenerator {
private:
  // Word lists combined via the caller-supplied seed to build deterministic
  // mock brewery names and descriptions.
  const std::vector<std::string> breweryAdjectives = {
      "Craft", "Heritage", "Local", "Artisan",
      "Pioneer", "Golden", "Modern", "Classic"};

  const std::vector<std::string> breweryNouns = {
      "Brewing Co.", "Brewery", "Bier Haus", "Taproom",
      "Works", "House", "Fermentery", "Ale Co."};

  const std::vector<std::string> descriptions = {
      "Handcrafted pale ales and seasonal IPAs with local ingredients.",
      "Traditional lagers and experimental sours in small batches.",
      "Award-winning stouts and wildly hoppy blonde ales.",
      "Craft brewery specializing in Belgian-style triples and dark porters.",
      "Modern brewery blending tradition with bold experimental flavors."};

public:
  /// @brief Generated brewery payload for one city.
  struct Brewery {
    std::string name;
    std::string description;
  };

  /// @brief Loads model resources (mock implementation in this project).
  /// @param modelPath Path to an LLM model file; presumably unused by the
  ///                  mock — confirm in implementation.
  void LoadModel(const std::string &modelPath);

  /// @brief Generates deterministic brewery text for a city and seed.
  /// @param cityName City used to flavor the generated output.
  /// @param seed Same seed and city always yield the same Brewery.
  Brewery GenerateBrewery(const std::string &cityName, int seed);
};
|
||||||
13
pipeline/includes/json_loader.h
Normal file
13
pipeline/includes/json_loader.h
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "database.h"
|
||||||
|
#include "stream_parser.h"
|
||||||
|
#include "work_queue.h"
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
/// @brief Loads world-city JSON data into SQLite through streaming parsing.
class JsonLoader {
public:
  /// @brief Parses a JSON file and writes country/state/city rows into db.
  /// @param jsonPath Path to the countries+states+cities JSON file.
  /// @param db Initialized database that receives the parsed rows.
  static void LoadWorldCities(const std::string &jsonPath, SqliteDatabase &db);
};
|
||||||
48
pipeline/includes/stream_parser.h
Normal file
48
pipeline/includes/stream_parser.h
Normal file
@@ -0,0 +1,48 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "database.h"
|
||||||
|
#include <functional>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
// Forward declaration to avoid circular dependency
|
||||||
|
class SqliteDatabase;
|
||||||
|
|
||||||
|
/// @brief In-memory representation of one parsed city entry.
///
/// Members carry value-initializers so a partially filled record (e.g. a
/// city whose JSON is missing latitude/longitude) never propagates
/// indeterminate values.  The struct remains an aggregate, so existing
/// brace-initialization (`CityRecord{...}` / `= {}`) is unaffected.
struct CityRecord {
  int id = 0;
  int state_id = 0;
  int country_id = 0;
  std::string name;
  double latitude = 0.0;
  double longitude = 0.0;
};
|
||||||
|
|
||||||
|
/// @brief Streaming SAX parser that emits city records during traversal.
class StreamingJsonParser {
public:
  /// @brief Parses filePath and invokes callbacks for city rows and progress.
  /// @param filePath Path to the countries+states+cities JSON document.
  /// @param db Destination database for rows written during the parse.
  /// @param onCity Invoked once per completed city object.
  /// @param onProgress Optional; invoked with two counters — presumably
  ///                   bytes processed / total bytes, given
  ///                   ParseState::bytes_processed below; confirm in the
  ///                   implementation.
  static void Parse(const std::string &filePath, SqliteDatabase &db,
                    std::function<void(const CityRecord &)> onCity,
                    std::function<void(size_t, size_t)> onProgress = nullptr);

private:
  /// @brief Mutable SAX handler state while traversing nested JSON arrays.
  struct ParseState {
    int current_country_id = 0; // id of the country being traversed
    int current_state_id = 0;   // id of the state being traversed

    CityRecord current_city = {}; // city object under construction
    bool building_city = false;   // true while inside a city object
    std::string current_key;      // last JSON key seen in the current object

    // Traversal bookkeeping for the nested countries/states/cities arrays.
    int array_depth = 0;
    int object_depth = 0;
    bool in_countries_array = false;
    bool in_states_array = false;
    bool in_cities_array = false;

    std::function<void(const CityRecord &)> on_city;
    std::function<void(size_t, size_t)> on_progress;
    size_t bytes_processed = 0; // running byte count for progress reporting
  };
};
|
||||||
63
pipeline/includes/work_queue.h
Normal file
63
pipeline/includes/work_queue.h
Normal file
@@ -0,0 +1,63 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <condition_variable>
|
||||||
|
#include <mutex>
|
||||||
|
#include <optional>
|
||||||
|
#include <queue>
|
||||||
|
|
||||||
|
/// @brief Bounded thread-safe queue with blocking push/pop and shutdown.
template <typename T> class WorkQueue {
private:
  std::queue<T> queue;
  // mutable so const observers (size()) can lock it: locking a mutex is a
  // logically-const operation.  Without `mutable`, size() const fails to
  // compile.
  mutable std::mutex mutex;
  std::condition_variable cv_not_empty;
  std::condition_variable cv_not_full;
  size_t max_size;
  bool shutdown = false;

public:
  /// @brief Creates a queue with fixed capacity.
  explicit WorkQueue(size_t capacity) : max_size(capacity) {}

  /// @brief Pushes an item, blocking while full unless shutdown is signaled.
  /// @return false when the queue was shut down (the item is dropped).
  bool push(T item) {
    std::unique_lock<std::mutex> lock(mutex);
    cv_not_full.wait(lock,
                     [this] { return queue.size() < max_size || shutdown; });

    if (shutdown)
      return false;

    queue.push(std::move(item));
    cv_not_empty.notify_one();
    return true;
  }

  /// @brief Pops an item, blocking while empty unless shutdown is signaled.
  /// @return std::nullopt only after shutdown AND the queue has drained;
  ///         items queued before shutdown are still delivered.
  std::optional<T> pop() {
    std::unique_lock<std::mutex> lock(mutex);
    cv_not_empty.wait(lock, [this] { return !queue.empty() || shutdown; });

    if (queue.empty())
      return std::nullopt;

    T item = std::move(queue.front());
    queue.pop();
    cv_not_full.notify_one();
    return item;
  }

  /// @brief Signals queue shutdown and wakes all waiting producers/consumers.
  void shutdown_queue() {
    std::unique_lock<std::mutex> lock(mutex);
    shutdown = true;
    cv_not_empty.notify_all();
    cv_not_full.notify_all();
  }

  /// @brief Returns current queue size (a snapshot; may be stale immediately).
  size_t size() const {
    std::lock_guard<std::mutex> lock(mutex);
    return queue.size();
  }
};
|
||||||
103
pipeline/src/data_downloader.cpp
Normal file
103
pipeline/src/data_downloader.cpp
Normal file
@@ -0,0 +1,103 @@
|
|||||||
|
#include "data_downloader.h"
|
||||||
|
#include <cstdio>
|
||||||
|
#include <curl/curl.h>
|
||||||
|
#include <fstream>
|
||||||
|
#include <spdlog/spdlog.h>
|
||||||
|
#include <sstream>
|
||||||
|
#include <sys/stat.h>
|
||||||
|
|
||||||
|
/// @brief libcurl write callback: appends a received chunk to an ofstream.
/// @param contents Raw bytes delivered by libcurl.
/// @param size Element size in bytes.
/// @param nmemb Number of elements.
/// @param userp Opaque pointer, expected to be a std::ofstream*.
/// @return Number of bytes consumed (size * nmemb); anything else would make
///         libcurl abort the transfer.
static size_t WriteCallback(void *contents, size_t size, size_t nmemb,
                            void *userp) {
  const size_t total = size * nmemb;
  auto &out = *static_cast<std::ofstream *>(userp);
  out.write(static_cast<const char *>(contents),
            static_cast<std::streamsize>(total));
  return total;
}
|
||||||
|
|
||||||
|
/// @brief Default constructor; the downloader holds no per-instance state.
DataDownloader::DataDownloader() {}

/// @brief Destructor; nothing to release here — libcurl global init/cleanup
/// is handled by the caller (see main()).
DataDownloader::~DataDownloader() {}
|
||||||
|
|
||||||
|
/// @brief Checks whether a file (or any filesystem entry) exists at the path.
/// @param filePath Path to test.
/// @return true if stat() succeeds on the path.
bool DataDownloader::FileExists(const std::string &filePath) const {
  struct stat info {};
  return ::stat(filePath.c_str(), &info) == 0;
}
|
||||||
|
|
||||||
|
std::string
|
||||||
|
DataDownloader::DownloadCountriesDatabase(const std::string &cachePath,
|
||||||
|
const std::string &commit) {
|
||||||
|
if (FileExists(cachePath)) {
|
||||||
|
spdlog::info("[DataDownloader] Cache hit: {}", cachePath);
|
||||||
|
return cachePath;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string shortCommit = commit;
|
||||||
|
if (commit.length() > 7) {
|
||||||
|
shortCommit = commit.substr(0, 7);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string url = "https://raw.githubusercontent.com/dr5hn/"
|
||||||
|
"countries-states-cities-database/" +
|
||||||
|
shortCommit + "/json/countries+states+cities.json";
|
||||||
|
|
||||||
|
spdlog::info("[DataDownloader] Downloading: {}", url);
|
||||||
|
|
||||||
|
CURL *curl = curl_easy_init();
|
||||||
|
if (!curl) {
|
||||||
|
throw std::runtime_error("[DataDownloader] Failed to initialize libcurl");
|
||||||
|
}
|
||||||
|
|
||||||
|
std::ofstream outFile(cachePath, std::ios::binary);
|
||||||
|
if (!outFile.is_open()) {
|
||||||
|
curl_easy_cleanup(curl);
|
||||||
|
throw std::runtime_error("[DataDownloader] Cannot open file for writing: " +
|
||||||
|
cachePath);
|
||||||
|
}
|
||||||
|
|
||||||
|
curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
|
||||||
|
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteCallback);
|
||||||
|
curl_easy_setopt(curl, CURLOPT_WRITEDATA, static_cast<void *>(&outFile));
|
||||||
|
|
||||||
|
curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, 30L);
|
||||||
|
curl_easy_setopt(curl, CURLOPT_TIMEOUT, 300L);
|
||||||
|
|
||||||
|
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
|
||||||
|
curl_easy_setopt(curl, CURLOPT_MAXREDIRS, 5L);
|
||||||
|
|
||||||
|
curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, "gzip");
|
||||||
|
|
||||||
|
curl_easy_setopt(curl, CURLOPT_USERAGENT, "biergarten-pipeline/0.1.0");
|
||||||
|
|
||||||
|
CURLcode res = curl_easy_perform(curl);
|
||||||
|
outFile.close();
|
||||||
|
|
||||||
|
if (res != CURLE_OK) {
|
||||||
|
curl_easy_cleanup(curl);
|
||||||
|
std::remove(cachePath.c_str());
|
||||||
|
|
||||||
|
std::string error = std::string("[DataDownloader] Download failed: ") +
|
||||||
|
curl_easy_strerror(res);
|
||||||
|
throw std::runtime_error(error);
|
||||||
|
}
|
||||||
|
|
||||||
|
long httpCode = 0;
|
||||||
|
curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &httpCode);
|
||||||
|
curl_easy_cleanup(curl);
|
||||||
|
|
||||||
|
if (httpCode != 200) {
|
||||||
|
std::remove(cachePath.c_str());
|
||||||
|
|
||||||
|
std::stringstream ss;
|
||||||
|
ss << "[DataDownloader] HTTP error " << httpCode
|
||||||
|
<< " (commit: " << shortCommit << ")";
|
||||||
|
throw std::runtime_error(ss.str());
|
||||||
|
}
|
||||||
|
|
||||||
|
std::ifstream fileCheck(cachePath, std::ios::binary | std::ios::ate);
|
||||||
|
std::streamsize size = fileCheck.tellg();
|
||||||
|
fileCheck.close();
|
||||||
|
|
||||||
|
spdlog::info("[DataDownloader] OK: Download complete: {} ({:.2f} MB)",
|
||||||
|
cachePath, (size / (1024.0 * 1024.0)));
|
||||||
|
return cachePath;
|
||||||
|
}
|
||||||
229
pipeline/src/database.cpp
Normal file
229
pipeline/src/database.cpp
Normal file
@@ -0,0 +1,229 @@
|
|||||||
|
#include "database.h"
|
||||||
|
#include <spdlog/spdlog.h>
|
||||||
|
#include <stdexcept>
|
||||||
|
|
||||||
|
/// @brief Creates the countries/states/cities tables if they do not exist.
/// Thread-safe via dbMutex.
/// @throws std::runtime_error with SQLite's message if schema creation fails.
void SqliteDatabase::InitializeSchema() {
  std::lock_guard<std::mutex> lock(dbMutex);

  // Mirrors the nesting of the source dataset: cities reference both their
  // state and (denormalized) their country for direct lookups.
  const char *schema = R"(
    CREATE TABLE IF NOT EXISTS countries (
      id INTEGER PRIMARY KEY,
      name TEXT NOT NULL,
      iso2 TEXT,
      iso3 TEXT
    );

    CREATE TABLE IF NOT EXISTS states (
      id INTEGER PRIMARY KEY,
      country_id INTEGER NOT NULL,
      name TEXT NOT NULL,
      iso2 TEXT,
      FOREIGN KEY(country_id) REFERENCES countries(id)
    );

    CREATE TABLE IF NOT EXISTS cities (
      id INTEGER PRIMARY KEY,
      state_id INTEGER NOT NULL,
      country_id INTEGER NOT NULL,
      name TEXT NOT NULL,
      latitude REAL,
      longitude REAL,
      FOREIGN KEY(state_id) REFERENCES states(id),
      FOREIGN KEY(country_id) REFERENCES countries(id)
    );
  )";

  char *errMsg = nullptr;
  int rc = sqlite3_exec(db, schema, nullptr, nullptr, &errMsg);
  if (rc != SQLITE_OK) {
    // Copy then free: sqlite3_exec allocates errMsg with sqlite3_malloc.
    std::string error = errMsg ? std::string(errMsg) : "Unknown error";
    sqlite3_free(errMsg);
    throw std::runtime_error("Failed to create schema: " + error);
  }
}
|
||||||
|
|
||||||
|
/// @brief Closes the SQLite handle if one was opened.
SqliteDatabase::~SqliteDatabase() {
  if (db != nullptr) {
    sqlite3_close(db);
  }
}
|
||||||
|
|
||||||
|
void SqliteDatabase::Initialize() {
|
||||||
|
int rc = sqlite3_open(":memory:", &db);
|
||||||
|
if (rc) {
|
||||||
|
throw std::runtime_error("Failed to create in-memory SQLite database");
|
||||||
|
}
|
||||||
|
spdlog::info("OK: In-memory SQLite database created");
|
||||||
|
InitializeSchema();
|
||||||
|
}
|
||||||
|
|
||||||
|
void SqliteDatabase::InsertCountry(int id, const std::string &name,
|
||||||
|
const std::string &iso2,
|
||||||
|
const std::string &iso3) {
|
||||||
|
std::lock_guard<std::mutex> lock(dbMutex);
|
||||||
|
|
||||||
|
const char *query = R"(
|
||||||
|
INSERT OR IGNORE INTO countries (id, name, iso2, iso3)
|
||||||
|
VALUES (?, ?, ?, ?)
|
||||||
|
)";
|
||||||
|
|
||||||
|
sqlite3_stmt *stmt;
|
||||||
|
int rc = sqlite3_prepare_v2(db, query, -1, &stmt, nullptr);
|
||||||
|
if (rc != SQLITE_OK)
|
||||||
|
throw std::runtime_error("Failed to prepare country insert");
|
||||||
|
|
||||||
|
sqlite3_bind_int(stmt, 1, id);
|
||||||
|
sqlite3_bind_text(stmt, 2, name.c_str(), -1, SQLITE_STATIC);
|
||||||
|
sqlite3_bind_text(stmt, 3, iso2.c_str(), -1, SQLITE_STATIC);
|
||||||
|
sqlite3_bind_text(stmt, 4, iso3.c_str(), -1, SQLITE_STATIC);
|
||||||
|
|
||||||
|
if (sqlite3_step(stmt) != SQLITE_DONE) {
|
||||||
|
throw std::runtime_error("Failed to insert country");
|
||||||
|
}
|
||||||
|
sqlite3_finalize(stmt);
|
||||||
|
}
|
||||||
|
|
||||||
|
void SqliteDatabase::InsertState(int id, int countryId, const std::string &name,
|
||||||
|
const std::string &iso2) {
|
||||||
|
std::lock_guard<std::mutex> lock(dbMutex);
|
||||||
|
|
||||||
|
const char *query = R"(
|
||||||
|
INSERT OR IGNORE INTO states (id, country_id, name, iso2)
|
||||||
|
VALUES (?, ?, ?, ?)
|
||||||
|
)";
|
||||||
|
|
||||||
|
sqlite3_stmt *stmt;
|
||||||
|
int rc = sqlite3_prepare_v2(db, query, -1, &stmt, nullptr);
|
||||||
|
if (rc != SQLITE_OK)
|
||||||
|
throw std::runtime_error("Failed to prepare state insert");
|
||||||
|
|
||||||
|
sqlite3_bind_int(stmt, 1, id);
|
||||||
|
sqlite3_bind_int(stmt, 2, countryId);
|
||||||
|
sqlite3_bind_text(stmt, 3, name.c_str(), -1, SQLITE_STATIC);
|
||||||
|
sqlite3_bind_text(stmt, 4, iso2.c_str(), -1, SQLITE_STATIC);
|
||||||
|
|
||||||
|
if (sqlite3_step(stmt) != SQLITE_DONE) {
|
||||||
|
throw std::runtime_error("Failed to insert state");
|
||||||
|
}
|
||||||
|
sqlite3_finalize(stmt);
|
||||||
|
}
|
||||||
|
|
||||||
|
void SqliteDatabase::InsertCity(int id, int stateId, int countryId,
|
||||||
|
const std::string &name, double latitude,
|
||||||
|
double longitude) {
|
||||||
|
std::lock_guard<std::mutex> lock(dbMutex);
|
||||||
|
|
||||||
|
const char *query = R"(
|
||||||
|
INSERT OR IGNORE INTO cities (id, state_id, country_id, name, latitude, longitude)
|
||||||
|
VALUES (?, ?, ?, ?, ?, ?)
|
||||||
|
)";
|
||||||
|
|
||||||
|
sqlite3_stmt *stmt;
|
||||||
|
int rc = sqlite3_prepare_v2(db, query, -1, &stmt, nullptr);
|
||||||
|
if (rc != SQLITE_OK)
|
||||||
|
throw std::runtime_error("Failed to prepare city insert");
|
||||||
|
|
||||||
|
sqlite3_bind_int(stmt, 1, id);
|
||||||
|
sqlite3_bind_int(stmt, 2, stateId);
|
||||||
|
sqlite3_bind_int(stmt, 3, countryId);
|
||||||
|
sqlite3_bind_text(stmt, 4, name.c_str(), -1, SQLITE_STATIC);
|
||||||
|
sqlite3_bind_double(stmt, 5, latitude);
|
||||||
|
sqlite3_bind_double(stmt, 6, longitude);
|
||||||
|
|
||||||
|
if (sqlite3_step(stmt) != SQLITE_DONE) {
|
||||||
|
throw std::runtime_error("Failed to insert city");
|
||||||
|
}
|
||||||
|
sqlite3_finalize(stmt);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<std::pair<int, std::string>> SqliteDatabase::QueryCities() {
|
||||||
|
std::lock_guard<std::mutex> lock(dbMutex);
|
||||||
|
|
||||||
|
std::vector<std::pair<int, std::string>> cities;
|
||||||
|
sqlite3_stmt *stmt = nullptr;
|
||||||
|
|
||||||
|
const char *query = "SELECT id, name FROM cities ORDER BY name";
|
||||||
|
int rc = sqlite3_prepare_v2(db, query, -1, &stmt, nullptr);
|
||||||
|
|
||||||
|
if (rc != SQLITE_OK) {
|
||||||
|
throw std::runtime_error("Failed to prepare query");
|
||||||
|
}
|
||||||
|
|
||||||
|
while (sqlite3_step(stmt) == SQLITE_ROW) {
|
||||||
|
int id = sqlite3_column_int(stmt, 0);
|
||||||
|
const char *name =
|
||||||
|
reinterpret_cast<const char *>(sqlite3_column_text(stmt, 1));
|
||||||
|
cities.push_back({id, name ? std::string(name) : ""});
|
||||||
|
}
|
||||||
|
|
||||||
|
sqlite3_finalize(stmt);
|
||||||
|
return cities;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<Country> SqliteDatabase::QueryCountries(int limit) {
|
||||||
|
std::lock_guard<std::mutex> lock(dbMutex);
|
||||||
|
|
||||||
|
std::vector<Country> countries;
|
||||||
|
sqlite3_stmt *stmt = nullptr;
|
||||||
|
|
||||||
|
std::string query =
|
||||||
|
"SELECT id, name, iso2, iso3 FROM countries ORDER BY name";
|
||||||
|
if (limit > 0) {
|
||||||
|
query += " LIMIT " + std::to_string(limit);
|
||||||
|
}
|
||||||
|
|
||||||
|
int rc = sqlite3_prepare_v2(db, query.c_str(), -1, &stmt, nullptr);
|
||||||
|
|
||||||
|
if (rc != SQLITE_OK) {
|
||||||
|
throw std::runtime_error("Failed to prepare countries query");
|
||||||
|
}
|
||||||
|
|
||||||
|
while (sqlite3_step(stmt) == SQLITE_ROW) {
|
||||||
|
int id = sqlite3_column_int(stmt, 0);
|
||||||
|
const char *name =
|
||||||
|
reinterpret_cast<const char *>(sqlite3_column_text(stmt, 1));
|
||||||
|
const char *iso2 =
|
||||||
|
reinterpret_cast<const char *>(sqlite3_column_text(stmt, 2));
|
||||||
|
const char *iso3 =
|
||||||
|
reinterpret_cast<const char *>(sqlite3_column_text(stmt, 3));
|
||||||
|
countries.push_back({id, name ? std::string(name) : "",
|
||||||
|
iso2 ? std::string(iso2) : "",
|
||||||
|
iso3 ? std::string(iso3) : ""});
|
||||||
|
}
|
||||||
|
|
||||||
|
sqlite3_finalize(stmt);
|
||||||
|
return countries;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<State> SqliteDatabase::QueryStates(int limit) {
|
||||||
|
std::lock_guard<std::mutex> lock(dbMutex);
|
||||||
|
|
||||||
|
std::vector<State> states;
|
||||||
|
sqlite3_stmt *stmt = nullptr;
|
||||||
|
|
||||||
|
std::string query =
|
||||||
|
"SELECT id, name, iso2, country_id FROM states ORDER BY name";
|
||||||
|
if (limit > 0) {
|
||||||
|
query += " LIMIT " + std::to_string(limit);
|
||||||
|
}
|
||||||
|
|
||||||
|
int rc = sqlite3_prepare_v2(db, query.c_str(), -1, &stmt, nullptr);
|
||||||
|
|
||||||
|
if (rc != SQLITE_OK) {
|
||||||
|
throw std::runtime_error("Failed to prepare states query");
|
||||||
|
}
|
||||||
|
|
||||||
|
while (sqlite3_step(stmt) == SQLITE_ROW) {
|
||||||
|
int id = sqlite3_column_int(stmt, 0);
|
||||||
|
const char *name =
|
||||||
|
reinterpret_cast<const char *>(sqlite3_column_text(stmt, 1));
|
||||||
|
const char *iso2 =
|
||||||
|
reinterpret_cast<const char *>(sqlite3_column_text(stmt, 2));
|
||||||
|
int countryId = sqlite3_column_int(stmt, 3);
|
||||||
|
states.push_back({id, name ? std::string(name) : "",
|
||||||
|
iso2 ? std::string(iso2) : "", countryId});
|
||||||
|
}
|
||||||
|
|
||||||
|
sqlite3_finalize(stmt);
|
||||||
|
return states;
|
||||||
|
}
|
||||||
21
pipeline/src/generator.cpp
Normal file
21
pipeline/src/generator.cpp
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
#include "generator.h"
|
||||||
|
#include <functional>
|
||||||
|
#include <spdlog/spdlog.h>
|
||||||
|
|
||||||
|
/// @brief Mock model loader: only logs; no real llama model is loaded here.
/// @param modelPath Path a real implementation would load the weights from.
void LlamaBreweryGenerator::LoadModel(const std::string &modelPath) {
  spdlog::info(" [Mock] Initialized llama model: {}", modelPath);
  spdlog::info(" OK: Model ready");
}
|
||||||
|
|
||||||
|
/// @brief Deterministically fabricates a brewery for a city (mock generator).
/// @param cityName City whose name seeds the hash.
/// @param seed Extra entropy so one city can yield several breweries.
/// @return Brewery with a name/description picked from fixed word tables.
LlamaBreweryGenerator::Brewery
LlamaBreweryGenerator::GenerateBrewery(const std::string &cityName, int seed) {
  // Hash of city+seed drives every table lookup, so output is stable
  // across runs for identical inputs.
  const size_t nameHash =
      std::hash<std::string>{}(cityName + std::to_string(seed));

  Brewery brewery;
  brewery.name = breweryAdjectives[nameHash % breweryAdjectives.size()] + " " +
                 breweryNouns[(nameHash / 7) % breweryNouns.size()];
  brewery.description = descriptions[(nameHash / 13) % descriptions.size()];
  return brewery;
}
|
||||||
80
pipeline/src/json_loader.cpp
Normal file
80
pipeline/src/json_loader.cpp
Normal file
@@ -0,0 +1,80 @@
|
|||||||
|
#include "json_loader.h"
|
||||||
|
#include "stream_parser.h"
|
||||||
|
#include "work_queue.h"
|
||||||
|
#include <atomic>
|
||||||
|
#include <chrono>
|
||||||
|
#include <spdlog/spdlog.h>
|
||||||
|
#include <thread>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
void JsonLoader::LoadWorldCities(const std::string &jsonPath,
|
||||||
|
SqliteDatabase &db) {
|
||||||
|
auto startTime = std::chrono::high_resolution_clock::now();
|
||||||
|
spdlog::info("\nLoading {} (streaming RapidJSON SAX + producer-consumer)...",
|
||||||
|
jsonPath);
|
||||||
|
|
||||||
|
const unsigned int QUEUE_CAPACITY = 1000;
|
||||||
|
WorkQueue<CityRecord> queue(QUEUE_CAPACITY);
|
||||||
|
|
||||||
|
spdlog::info("Creating worker thread pool...");
|
||||||
|
|
||||||
|
unsigned int numWorkers = std::thread::hardware_concurrency();
|
||||||
|
if (numWorkers == 0)
|
||||||
|
numWorkers = 4; // Fallback if unavailable
|
||||||
|
spdlog::info(" Spawning {} worker threads", numWorkers);
|
||||||
|
|
||||||
|
std::vector<std::thread> workers;
|
||||||
|
std::atomic<unsigned long> citiesProcessed{0};
|
||||||
|
|
||||||
|
for (unsigned int i = 0; i < numWorkers; ++i) {
|
||||||
|
workers.push_back(std::thread([&]() {
|
||||||
|
unsigned long localCount = 0;
|
||||||
|
while (auto record = queue.pop()) {
|
||||||
|
db.InsertCity(record->id, record->state_id, record->country_id,
|
||||||
|
record->name, record->latitude, record->longitude);
|
||||||
|
localCount++;
|
||||||
|
}
|
||||||
|
citiesProcessed += localCount;
|
||||||
|
}));
|
||||||
|
}
|
||||||
|
|
||||||
|
spdlog::info("Streaming cities into worker queue...");
|
||||||
|
|
||||||
|
unsigned long totalCities = 0;
|
||||||
|
StreamingJsonParser::Parse(
|
||||||
|
jsonPath, db, [&](const CityRecord &record) { queue.push(record); },
|
||||||
|
[&](size_t current, size_t total) {
|
||||||
|
if (current % 10000 == 0 && current > 0) {
|
||||||
|
spdlog::info(" [Progress] Parsed {} cities...", current);
|
||||||
|
}
|
||||||
|
totalCities = current;
|
||||||
|
});
|
||||||
|
|
||||||
|
spdlog::info(" OK: Parsed all cities from JSON");
|
||||||
|
|
||||||
|
queue.shutdown_queue();
|
||||||
|
|
||||||
|
spdlog::info("Waiting for worker threads to complete...");
|
||||||
|
for (auto &worker : workers) {
|
||||||
|
if (worker.joinable()) {
|
||||||
|
worker.join();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
auto endTime = std::chrono::high_resolution_clock::now();
|
||||||
|
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(
|
||||||
|
endTime - startTime);
|
||||||
|
|
||||||
|
spdlog::info("\n=== World City Data Loading Summary ===\n");
|
||||||
|
spdlog::info("Cities inserted: {}", citiesProcessed);
|
||||||
|
spdlog::info("Elapsed time: {} ms", duration.count());
|
||||||
|
long long throughput =
|
||||||
|
(citiesProcessed > 0 && duration.count() > 0)
|
||||||
|
? (1000LL * static_cast<long long>(citiesProcessed)) /
|
||||||
|
static_cast<long long>(duration.count())
|
||||||
|
: 0LL;
|
||||||
|
spdlog::info("Throughput: {} cities/sec", throughput);
|
||||||
|
spdlog::info("Worker pool: {} threads", numWorkers);
|
||||||
|
spdlog::info("Queue capacity: {}", QUEUE_CAPACITY);
|
||||||
|
spdlog::info("=======================================\n");
|
||||||
|
}
|
||||||
79
pipeline/src/main.cpp
Normal file
79
pipeline/src/main.cpp
Normal file
@@ -0,0 +1,79 @@
|
|||||||
|
#include "data_downloader.h"
|
||||||
|
#include "database.h"
|
||||||
|
#include "generator.h"
|
||||||
|
#include "json_loader.h"
|
||||||
|
#include <curl/curl.h>
|
||||||
|
#include <spdlog/spdlog.h>
|
||||||
|
|
||||||
|
int main(int argc, char *argv[]) {
|
||||||
|
try {
|
||||||
|
curl_global_init(CURL_GLOBAL_DEFAULT);
|
||||||
|
|
||||||
|
std::string modelPath = argc > 1 ? argv[1] : "./model.gguf";
|
||||||
|
std::string cacheDir = argc > 2 ? argv[2] : "/tmp";
|
||||||
|
std::string commit =
|
||||||
|
argc > 3 ? argv[3] : "c5eb7772"; // Default: stable 2026-03-28
|
||||||
|
|
||||||
|
std::string jsonPath = cacheDir + "/countries+states+cities.json";
|
||||||
|
|
||||||
|
spdlog::info("\n[Pipeline] Downloading geographic data from GitHub...");
|
||||||
|
DataDownloader downloader;
|
||||||
|
downloader.DownloadCountriesDatabase(jsonPath, commit);
|
||||||
|
|
||||||
|
SqliteDatabase db;
|
||||||
|
|
||||||
|
spdlog::info("Initializing in-memory SQLite database...");
|
||||||
|
db.Initialize();
|
||||||
|
|
||||||
|
JsonLoader::LoadWorldCities(jsonPath, db);
|
||||||
|
|
||||||
|
spdlog::info("Initializing brewery generator...");
|
||||||
|
LlamaBreweryGenerator generator;
|
||||||
|
generator.LoadModel(modelPath);
|
||||||
|
|
||||||
|
spdlog::info("\n=== GEOGRAPHIC DATA OVERVIEW ===");
|
||||||
|
|
||||||
|
auto countries = db.QueryCountries(50);
|
||||||
|
auto states = db.QueryStates(50);
|
||||||
|
auto cities = db.QueryCities();
|
||||||
|
|
||||||
|
spdlog::info("\nTotal records loaded:");
|
||||||
|
spdlog::info(" Countries: {}", db.QueryCountries(0).size());
|
||||||
|
spdlog::info(" States: {}", db.QueryStates(0).size());
|
||||||
|
spdlog::info(" Cities: {}", cities.size());
|
||||||
|
|
||||||
|
spdlog::info("\n--- 50 COUNTRIES ---");
|
||||||
|
for (size_t i = 0; i < countries.size(); i++) {
|
||||||
|
spdlog::info("{}. {} ({}) {}", (i + 1), countries[i].iso2,
|
||||||
|
countries[i].iso3, countries[i].name);
|
||||||
|
}
|
||||||
|
|
||||||
|
spdlog::info("\n--- 50 STATES ---");
|
||||||
|
for (size_t i = 0; i < states.size(); i++) {
|
||||||
|
spdlog::info("{}. {}: {}", (i + 1), states[i].iso2, states[i].name);
|
||||||
|
}
|
||||||
|
|
||||||
|
spdlog::info("\n--- 50 CITIES ---");
|
||||||
|
for (size_t i = 0; i < std::min(size_t(50), cities.size()); i++) {
|
||||||
|
spdlog::info("{}. {}", (i + 1), cities[i].second);
|
||||||
|
}
|
||||||
|
|
||||||
|
spdlog::info("\n=== SAMPLE BREWERY GENERATION ===\n");
|
||||||
|
for (size_t i = 0; i < std::min(size_t(5), cities.size()); i++) {
|
||||||
|
const auto &[cityId, cityName] = cities[i];
|
||||||
|
auto brewery = generator.GenerateBrewery(cityName, i);
|
||||||
|
spdlog::info(" {}: {}", cityName, brewery.name);
|
||||||
|
spdlog::info(" -> {}", brewery.description);
|
||||||
|
}
|
||||||
|
|
||||||
|
spdlog::info("\nOK: Pipeline completed successfully");
|
||||||
|
|
||||||
|
curl_global_cleanup();
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
} catch (const std::exception &e) {
|
||||||
|
spdlog::error("ERROR: Pipeline failed: {}", e.what());
|
||||||
|
curl_global_cleanup();
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
227
pipeline/src/stream_parser.cpp
Normal file
227
pipeline/src/stream_parser.cpp
Normal file
@@ -0,0 +1,227 @@
|
|||||||
|
#include "stream_parser.h"
|
||||||
|
#include "database.h"
|
||||||
|
#include <fstream>
|
||||||
|
#include <rapidjson/reader.h>
|
||||||
|
#include <rapidjson/stringbuffer.h>
|
||||||
|
#include <spdlog/spdlog.h>
|
||||||
|
#include <sstream>
|
||||||
|
|
||||||
|
using namespace rapidjson;
|
||||||
|
|
||||||
|
/// @brief RapidJSON SAX handler for the countries -> states -> cities file.
///
/// Depth convention for this document shape:
///   1 = top-level countries array, 2 = a country object,
///   3 = "states" array,            4 = a state object,
///   5 = "cities" array,            6 = a city object.
/// Countries and states are inserted into the database directly; completed
/// city records are handed to the on_city callback.
class CityRecordHandler : public BaseReaderHandler<UTF8<>, CityRecordHandler> {
public:
  /// @brief Shared context: sinks plus counters accumulated during parsing.
  struct ParseContext {
    SqliteDatabase *db = nullptr;
    // Invoked once per completed city record.
    std::function<void(const CityRecord &)> on_city;
    // Invoked every 10000 cities with (cities so far, total file size).
    std::function<void(size_t, size_t)> on_progress;
    size_t cities_emitted = 0;
    size_t total_file_size = 0;
    int countries_inserted = 0;
    int states_inserted = 0;
  };

  CityRecordHandler(ParseContext &ctx) : context(ctx) {}

  // Entering an array: decide which level it is. "states"/"cities" are
  // identified by the key that preceded the array.
  bool StartArray() {
    depth++;

    if (depth == 1) {
      in_countries_array = true;
    } else if (depth == 3 && current_key == "states") {
      in_states_array = true;
    } else if (depth == 5 && current_key == "cities") {
      in_cities_array = true;
    }
    return true;
  }

  // Leaving an array: clear the matching flag *before* decrementing, so the
  // depth check mirrors StartArray.
  bool EndArray(SizeType /*elementCount*/) {
    if (depth == 1) {
      in_countries_array = false;
    } else if (depth == 3) {
      in_states_array = false;
    } else if (depth == 5) {
      in_cities_array = false;
    }
    depth--;
    return true;
  }

  // Entering an object: reset the accumulator for whichever entity this
  // depth corresponds to.
  bool StartObject() {
    depth++;

    if (depth == 2 && in_countries_array) {
      in_country_object = true;
      current_country_id = 0;
      country_info[0].clear();
      country_info[1].clear();
      country_info[2].clear();
    } else if (depth == 4 && in_states_array) {
      in_state_object = true;
      current_state_id = 0;
      state_info[0].clear();
      state_info[1].clear();
    } else if (depth == 6 && in_cities_array) {
      building_city = true;
      current_city = {};
    }
    return true;
  }

  // Leaving an object: flush the completed entity. Cities go to on_city;
  // states/countries are inserted into the database here. Failures are
  // logged and skipped so one bad record does not abort the whole parse.
  bool EndObject(SizeType /*memberCount*/) {
    if (depth == 6 && building_city) {
      // Only emit cities whose own id and both ancestors are known.
      if (current_city.id > 0 && current_state_id > 0 &&
          current_country_id > 0) {
        current_city.state_id = current_state_id;
        current_city.country_id = current_country_id;

        try {
          context.on_city(current_city);
          context.cities_emitted++;

          if (context.on_progress && context.cities_emitted % 10000 == 0) {
            context.on_progress(context.cities_emitted,
                                context.total_file_size);
          }
        } catch (const std::exception &e) {
          spdlog::warn(" WARN: Failed to emit city: {}", e.what());
        }
      }
      building_city = false;
    } else if (depth == 4 && in_state_object) {
      if (current_state_id > 0 && current_country_id > 0) {
        try {
          context.db->InsertState(current_state_id, current_country_id,
                                  state_info[0], state_info[1]);
          context.states_inserted++;
        } catch (const std::exception &e) {
          spdlog::warn(" WARN: Failed to insert state: {}", e.what());
        }
      }
      in_state_object = false;
    } else if (depth == 2 && in_country_object) {
      if (current_country_id > 0) {
        try {
          context.db->InsertCountry(current_country_id, country_info[0],
                                    country_info[1], country_info[2]);
          context.countries_inserted++;
        } catch (const std::exception &e) {
          spdlog::warn(" WARN: Failed to insert country: {}", e.what());
        }
      }
      in_country_object = false;
    }

    depth--;
    return true;
  }

  // Remember the key so the following value event knows where to store it.
  bool Key(const char *str, SizeType len, bool /*copy*/) {
    current_key.assign(str, len);
    return true;
  }

  // String values: route by the innermost active entity, then by key.
  bool String(const char *str, SizeType len, bool /*copy*/) {
    if (building_city && current_key == "name") {
      current_city.name.assign(str, len);
    } else if (in_state_object && current_key == "name") {
      state_info[0].assign(str, len);
    } else if (in_state_object && current_key == "iso2") {
      state_info[1].assign(str, len);
    } else if (in_country_object && current_key == "name") {
      country_info[0].assign(str, len);
    } else if (in_country_object && current_key == "iso2") {
      country_info[1].assign(str, len);
    } else if (in_country_object && current_key == "iso3") {
      country_info[2].assign(str, len);
    }
    return true;
  }

  // Integer "id" values for whichever entity is currently open.
  bool Int(int i) {
    if (building_city && current_key == "id") {
      current_city.id = i;
    } else if (in_state_object && current_key == "id") {
      current_state_id = i;
    } else if (in_country_object && current_key == "id") {
      current_country_id = i;
    }
    return true;
  }

  // All integer widths funnel through Int(); ids fit in int in this dataset
  // — presumably; larger values would be truncated (TODO confirm).
  bool Uint(unsigned i) { return Int(static_cast<int>(i)); }

  bool Int64(int64_t i) { return Int(static_cast<int>(i)); }

  bool Uint64(uint64_t i) { return Int(static_cast<int>(i)); }

  // City coordinates, when encoded as JSON numbers.
  bool Double(double d) {
    if (building_city) {
      if (current_key == "latitude") {
        current_city.latitude = d;
      } else if (current_key == "longitude") {
        current_city.longitude = d;
      }
    }
    return true;
  }

  // Uninteresting value types are accepted and ignored.
  bool Bool(bool /*b*/) { return true; }
  bool Null() { return true; }

private:
  ParseContext &context;

  int depth = 0;
  bool in_countries_array = false;
  bool in_country_object = false;
  bool in_states_array = false;
  bool in_state_object = false;
  bool in_cities_array = false;
  bool building_city = false;

  int current_country_id = 0;
  int current_state_id = 0;
  CityRecord current_city = {};
  std::string current_key;

  // country_info: [0]=name, [1]=iso2, [2]=iso3 (see String()).
  std::string country_info[3];
  // state_info: [0]=name, [1]=iso2 (see String()).
  std::string state_info[2];
};
|
||||||
|
|
||||||
|
void StreamingJsonParser::Parse(
|
||||||
|
const std::string &filePath, SqliteDatabase &db,
|
||||||
|
std::function<void(const CityRecord &)> onCity,
|
||||||
|
std::function<void(size_t, size_t)> onProgress) {
|
||||||
|
|
||||||
|
spdlog::info(" Streaming parse of {}...", filePath);
|
||||||
|
|
||||||
|
std::ifstream file(filePath, std::ios::binary);
|
||||||
|
if (!file.is_open()) {
|
||||||
|
throw std::runtime_error("Failed to open JSON file: " + filePath);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::stringstream buffer;
|
||||||
|
buffer << file.rdbuf();
|
||||||
|
file.close();
|
||||||
|
std::string json_str = buffer.str();
|
||||||
|
size_t total_size = json_str.length();
|
||||||
|
|
||||||
|
CityRecordHandler::ParseContext ctx{&db, onCity, onProgress, 0,
|
||||||
|
total_size, 0, 0};
|
||||||
|
CityRecordHandler handler(ctx);
|
||||||
|
|
||||||
|
Reader reader;
|
||||||
|
StringStream ss(json_str.c_str());
|
||||||
|
|
||||||
|
if (!reader.Parse(ss, handler)) {
|
||||||
|
ParseErrorCode errCode = reader.GetParseErrorCode();
|
||||||
|
size_t errOffset = reader.GetErrorOffset();
|
||||||
|
throw std::runtime_error(std::string("JSON parse error at offset ") +
|
||||||
|
std::to_string(errOffset) +
|
||||||
|
" (code: " + std::to_string(errCode) + ")");
|
||||||
|
}
|
||||||
|
|
||||||
|
spdlog::info(" OK: Parsed {} countries, {} states, {} cities",
|
||||||
|
ctx.countries_inserted, ctx.states_inserted, ctx.cities_emitted);
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user