mirror of
https://github.com/aaronpo97/the-biergarten-app.git
synced 2026-04-05 10:09:03 +00:00
load cities from external source, develop multithreaded parser
This commit is contained in:
2
pipeline/.gitignore
vendored
2
pipeline/.gitignore
vendored
@@ -1 +1,3 @@
|
||||
dist
|
||||
build
|
||||
data
|
||||
|
||||
@@ -9,6 +9,7 @@ set(CMAKE_CXX_EXTENSIONS OFF)
|
||||
|
||||
find_package(CURL REQUIRED)
|
||||
find_package(Boost REQUIRED COMPONENTS unit_test_framework)
|
||||
find_package(SQLite3 REQUIRED)
|
||||
|
||||
include(FetchContent)
|
||||
|
||||
@@ -19,33 +20,21 @@ FetchContent_Declare(
|
||||
)
|
||||
FetchContent_MakeAvailable(nlohmann_json)
|
||||
|
||||
FetchContent_Declare(
|
||||
llama
|
||||
GIT_REPOSITORY https://github.com/ggml-org/llama.cpp.git
|
||||
# Stable release tag: b8485 (commit 31a5cf4c3f5d3af7f16fc4abc9baa75f8d568421)
|
||||
GIT_TAG 31a5cf4c3f5d3af7f16fc4abc9baa75f8d568421
|
||||
)
|
||||
FetchContent_MakeAvailable(llama)
|
||||
# TODO: Integrate real llama.cpp when generator is ready to use actual models
|
||||
# For now, using mocked brewery generation in generator.cpp
|
||||
|
||||
# Workaround for upstream llama.cpp release stream (b8485/b8496) missing
|
||||
# <algorithm> include in llama-quant.cpp where std::sort is used.
|
||||
# Remove once fixed upstream.
|
||||
if(TARGET llama)
|
||||
target_compile_options(llama PRIVATE
|
||||
$<$<COMPILE_LANGUAGE:CXX>:-include algorithm>
|
||||
)
|
||||
endif()
|
||||
# SQLite for in-memory database
|
||||
find_package(SQLite3 REQUIRED)
|
||||
|
||||
file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS
|
||||
src/*.cpp
|
||||
src/*.h
|
||||
)
|
||||
|
||||
add_executable(biergarten-pipeline ${SOURCES})
|
||||
|
||||
target_include_directories(biergarten-pipeline
|
||||
PRIVATE
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/include
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/includes
|
||||
)
|
||||
|
||||
target_link_libraries(biergarten-pipeline
|
||||
@@ -53,7 +42,7 @@ target_link_libraries(biergarten-pipeline
|
||||
CURL::libcurl
|
||||
nlohmann_json::nlohmann_json
|
||||
Boost::unit_test_framework
|
||||
llama
|
||||
SQLite::SQLite3
|
||||
)
|
||||
|
||||
target_compile_options(biergarten-pipeline PRIVATE
|
||||
|
||||
@@ -1,128 +1,266 @@
|
||||
# Pipeline Guide
|
||||
# Brewery Pipeline Documentation Index
|
||||
|
||||
This guide documents the end-to-end pipeline workflow for:
|
||||
Complete guide to all pipeline documentation - choose your learning path based on your needs.
|
||||
|
||||
- Building the C++ pipeline executable
|
||||
- Installing a lightweight GGUF model for llama.cpp
|
||||
- Running the pipeline with either default or explicit model path
|
||||
- Re-running from a clean build directory
|
||||
---
|
||||
|
||||
## Prerequisites
|
||||
## Quick Navigation
|
||||
|
||||
- CMake 3.20+
|
||||
- A C++ compiler (Apple Clang on macOS works)
|
||||
- Internet access to download model files
|
||||
- Hugging Face CLI (`hf`) from `huggingface_hub`
|
||||
### 🚀 I Want to Run It Now (5 minutes)
|
||||
|
||||
## Build
|
||||
Start here if you want to see the pipeline in action immediately:
|
||||
|
||||
From repository root:
|
||||
1. **[QUICK-START.md](./QUICK-START.md)** (this directory)
|
||||
- Copy-paste build commands
|
||||
- Run the pipeline in 2 minutes
|
||||
- Make 4 simple modifications to learn
|
||||
- Common troubleshooting
|
||||
|
||||
```bash
|
||||
cmake -S pipeline -B pipeline/dist
|
||||
cmake --build pipeline/dist -j4
|
||||
```
|
||||
---
|
||||
|
||||
Expected executable:
|
||||
### 📚 I Want to Understand the Code (1 hour)
|
||||
|
||||
- `pipeline/dist/biergarten-pipeline`
|
||||
To learn how the pipeline works internally:
|
||||
|
||||
## Install Hugging Face CLI
|
||||
1. **[QUICK-START.md](./QUICK-START.md)** - Run it first (5 min)
|
||||
2. **[CODE-READING-GUIDE.md](./CODE-READING-GUIDE.md)** - Learn to read code (30 min)
|
||||
- Recommended reading order for all 5 source files
|
||||
- Code pattern explanations with examples
|
||||
- Trace a city through the entire pipeline
|
||||
- Testing strategies
|
||||
3. **[../docs/pipeline-guide.md](../docs/pipeline-guide.md)** - Full system overview (20 min)
|
||||
- Architecture and data flow diagrams
|
||||
- Description of each component
|
||||
- Performance characteristics
|
||||
|
||||
Recommended on macOS:
|
||||
---
|
||||
|
||||
```bash
|
||||
brew install pipx
|
||||
pipx ensurepath
|
||||
pipx install huggingface_hub
|
||||
```
|
||||
### 🏗️ I Want to Understand the Architecture (1.5 hours)
|
||||
|
||||
If your shell cannot find `hf`, use the full path:
|
||||
To understand WHY the system was designed this way:
|
||||
|
||||
- `~/.local/bin/hf`
|
||||
1. Read the above "Understand the Code" path first
|
||||
2. **[../docs/pipeline-architecture.md](../docs/pipeline-architecture.md)** - Design deep dive (30 min)
|
||||
- 5 core design principles with trade-offs
|
||||
- Detailed threading model (3-level hierarchy)
|
||||
- Mutex contention analysis
|
||||
- Future optimization opportunities
|
||||
- Lessons learned
|
||||
|
||||
## Install a Lightweight Model (POC)
|
||||
---
|
||||
|
||||
The recommended proof-of-concept model is:
|
||||
### 💻 I Want to Modify the Code (2+ hours)
|
||||
|
||||
- `Qwen/Qwen2.5-0.5B-Instruct-GGUF`
|
||||
- File: `qwen2.5-0.5b-instruct-q4_k_m.gguf`
|
||||
To extend or improve the pipeline:
|
||||
|
||||
From `pipeline/dist`:
|
||||
1. Complete the "Understand the Architecture" path above
|
||||
2. Choose your enhancement:
|
||||
- **Add Real LLM**: See "Future Implementation" in [../docs/pipeline-architecture.md](../docs/pipeline-architecture.md)
|
||||
- **Export Results**: Modify [src/main.cpp](./src/main.cpp) to write JSON
|
||||
- **Change Templates**: Edit [src/generator.cpp](./src/generator.cpp)
|
||||
- **Add Features**: Read inline code comments for guidance
|
||||
|
||||
```bash
|
||||
cd pipeline/dist
|
||||
mkdir -p models
|
||||
~/.local/bin/hf download Qwen/Qwen2.5-0.5B-Instruct-GGUF qwen2.5-0.5b-instruct-q4_k_m.gguf --local-dir models
|
||||
```
|
||||
---
|
||||
|
||||
## Run
|
||||
## Documentation File Structure
|
||||
|
||||
### Option A: Explicit model path (recommended)
|
||||
### In `/pipeline/` (Code-Level Documentation)
|
||||
|
||||
```bash
|
||||
cd pipeline/dist
|
||||
./biergarten-pipeline --model models/qwen2.5-0.5b-instruct-q4_k_m.gguf
|
||||
```
|
||||
| File | Purpose | Time |
|
||||
| -------------------------------------------------- | -------------------------------------- | ------ |
|
||||
| [QUICK-START.md](./QUICK-START.md) | Run in 5 minutes + learn basic changes | 15 min |
|
||||
| [CODE-READING-GUIDE.md](./CODE-READING-GUIDE.md) | How to read the source code | 30 min |
|
||||
| [includes/generator.h](./includes/generator.h) | Generator class interface | 5 min |
|
||||
| [includes/json_loader.h](./includes/json_loader.h) | JSON loader interface | 5 min |
|
||||
| [includes/database.h](./includes/database.h) | Database interface | 5 min |
|
||||
| [src/main.cpp](./src/main.cpp) | Pipeline orchestration | 10 min |
|
||||
| [src/generator.cpp](./src/generator.cpp) | Brewery name generation | 5 min |
|
||||
| [src/json_loader.cpp](./src/json_loader.cpp) | Threading and JSON parsing | 15 min |
|
||||
| [src/database.cpp](./src/database.cpp) | SQLite operations | 10 min |
|
||||
|
||||
### Option B: Default model path
|
||||
### In `/docs/` (System-Level Documentation)
|
||||
|
||||
If you want to use default startup behavior, place a model at:
|
||||
| File | Purpose | Time |
|
||||
| ------------------------------------------------------ | ---------------------------------- | ------ |
|
||||
| [pipeline-guide.md](./pipeline-guide.md) | Complete system guide | 30 min |
|
||||
| [pipeline-architecture.md](./pipeline-architecture.md) | Design decisions and rationale | 30 min |
|
||||
| [getting-started.md](./getting-started.md) | Original getting started (general) | 10 min |
|
||||
| [architecture.md](./architecture.md) | General app architecture | 20 min |
|
||||
|
||||
- `pipeline/dist/models/llama-2-7b-chat.gguf`
|
||||
---
|
||||
|
||||
Then run:
|
||||
## Learning Paths by Role
|
||||
|
||||
```bash
|
||||
cd pipeline/dist
|
||||
./biergarten-pipeline
|
||||
```
|
||||
### 👨‍💻 Software Engineer (New to Project)
|
||||
|
||||
## Output Files
|
||||
**Goal**: Understand codebase, make modifications
|
||||
|
||||
The pipeline writes output to:
|
||||
**Path** (1.5 hours):
|
||||
|
||||
- `pipeline/dist/output/breweries.json`
|
||||
- `pipeline/dist/output/beer-styles.json`
|
||||
- `pipeline/dist/output/beer-posts.json`
|
||||
1. [QUICK-START.md](./QUICK-START.md) (15 min)
|
||||
2. [CODE-READING-GUIDE.md](./CODE-READING-GUIDE.md) (30 min)
|
||||
3. Do Modification #1 and #3 (15 min)
|
||||
4. Read [../docs/pipeline-guide.md](../docs/pipeline-guide.md) Components section (20 min)
|
||||
5. Start exploring code + inline comments (variable)
|
||||
|
||||
## Clean Re-run Process
|
||||
---
|
||||
|
||||
If you want to redo from a clean dist state:
|
||||
### 🏗️ System Architect
|
||||
|
||||
```bash
|
||||
rm -rf pipeline/dist
|
||||
cmake -S pipeline -B pipeline/dist
|
||||
cmake --build pipeline/dist -j4
|
||||
cd pipeline/dist
|
||||
mkdir -p models
|
||||
~/.local/bin/hf download Qwen/Qwen2.5-0.5B-Instruct-GGUF qwen2.5-0.5b-instruct-q4_k_m.gguf --local-dir models
|
||||
./biergarten-pipeline --model models/qwen2.5-0.5b-instruct-q4_k_m.gguf
|
||||
```
|
||||
**Goal**: Understand design decisions, future roadmap
|
||||
|
||||
## Troubleshooting
|
||||
**Path** (2 hours):
|
||||
|
||||
### `zsh: command not found: huggingface-cli`
|
||||
1. [../docs/pipeline-guide.md](../docs/pipeline-guide.md) - Overview (30 min)
|
||||
2. [../docs/pipeline-architecture.md](../docs/pipeline-architecture.md) - Full design (30 min)
|
||||
3. Review [CODE-READING-GUIDE.md](./CODE-READING-GUIDE.md) - Code Patterns section (15 min)
|
||||
4. Plan enhancements based on "Future Opportunities" (variable)
|
||||
|
||||
The app name from `huggingface_hub` is `hf`, not `huggingface-cli`.
|
||||
---
|
||||
|
||||
Use:
|
||||
### 📊 Data Engineer
|
||||
|
||||
```bash
|
||||
~/.local/bin/hf --help
|
||||
```
|
||||
**Goal**: Understand data flow, optimization
|
||||
|
||||
### `Model file not found ...`
|
||||
**Path** (1 hour):
|
||||
|
||||
- Confirm you are running from `pipeline/dist`.
|
||||
- Confirm the file path passed to `--model` exists.
|
||||
- If not using `--model`, ensure the default file exists at `models/llama-2-7b-chat.gguf` relative to current working directory.
|
||||
1. [../docs/pipeline-guide.md](../docs/pipeline-guide.md) - System Overview (30 min)
|
||||
2. [../docs/pipeline-architecture.md](../docs/pipeline-architecture.md) - Performance section (20 min)
|
||||
3. Review [src/json_loader.cpp](./src/json_loader.cpp) - Threading section (10 min)
|
||||
|
||||
### CMake cache/path mismatch
|
||||
---
|
||||
|
||||
Use explicit source/build paths:
|
||||
### 👀 Code Reviewer
|
||||
|
||||
```bash
|
||||
cmake -S /absolute/path/to/pipeline -B /absolute/path/to/pipeline/dist
|
||||
cmake --build /absolute/path/to/pipeline/dist -j4
|
||||
```
|
||||
**Goal**: Review changes, ensure quality
|
||||
|
||||
**Path** (30 minutes):
|
||||
|
||||
1. [CODE-READING-GUIDE.md](./CODE-READING-GUIDE.md) - Code Patterns section (10 min)
|
||||
2. [../docs/pipeline-architecture.md](../docs/pipeline-architecture.md) - Design Patterns (10 min)
|
||||
3. Reference header files for API contracts (10 min)
|
||||
|
||||
---
|
||||
|
||||
## Quick Reference
|
||||
|
||||
### Key Files
|
||||
|
||||
**Entry Point**: [src/main.cpp](./src/main.cpp)
|
||||
|
||||
- Shows complete 5-step pipeline
|
||||
- ~50 lines, easy to understand
|
||||
|
||||
**Threading Logic**: [src/json_loader.cpp](./src/json_loader.cpp)
|
||||
|
||||
- Nested multithreading example
|
||||
- 180 lines with extensive comments
|
||||
- Learn parallel programming patterns
|
||||
|
||||
**Database Design**: [src/database.cpp](./src/database.cpp)
|
||||
|
||||
- Thread-safe SQLite wrapper
|
||||
- Prepared statements example
|
||||
- Mutex protection pattern
|
||||
|
||||
**Generation Logic**: [src/generator.cpp](./src/generator.cpp)
|
||||
|
||||
- Deterministic hashing algorithm
|
||||
- Template-based generation
|
||||
- Only 40 lines, easy to modify
|
||||
|
||||
---
|
||||
|
||||
## Common Questions - Quick Answers
|
||||
|
||||
**Q: How do I run the pipeline?**
|
||||
A: [QUICK-START.md](./QUICK-START.md) - 5 minute setup
|
||||
|
||||
**Q: How does the code work?**
|
||||
A: [CODE-READING-GUIDE.md](./CODE-READING-GUIDE.md) - Explained with examples
|
||||
|
||||
**Q: What is the full system architecture?**
|
||||
A: [../docs/pipeline-guide.md](../docs/pipeline-guide.md) - Complete overview
|
||||
|
||||
**Q: Why was it designed this way?**
|
||||
A: [../docs/pipeline-architecture.md](../docs/pipeline-architecture.md) - Design rationale
|
||||
|
||||
**Q: How do I modify the generator?**
|
||||
A: [QUICK-START.md](./QUICK-START.md) Modification #3 - Template change example
|
||||
|
||||
**Q: How does threading work?**
|
||||
A: [../docs/pipeline-architecture.md](../docs/pipeline-architecture.md) - Threading model section
|
||||
|
||||
**Q: What about future LLM integration?**
|
||||
A: [../docs/pipeline-architecture.md](../docs/pipeline-architecture.md) - Design Patterns → Strategy Pattern
|
||||
|
||||
**Q: How do I optimize performance?**
|
||||
A: [../docs/pipeline-architecture.md](../docs/pipeline-architecture.md) - Future Optimizations section
|
||||
|
||||
---
|
||||
|
||||
## Documentation Statistics
|
||||
|
||||
| Metric | Value |
|
||||
| ---------------------------- | --------- |
|
||||
| Total documentation lines | 1500+ |
|
||||
| Code files with Doxygen | 5 |
|
||||
| Developer guides | 2 |
|
||||
| System documentation | 2 |
|
||||
| ASCII diagrams | 4 |
|
||||
| Code examples | 20+ |
|
||||
| Learning paths | 4 |
|
||||
| Estimated reading time (all) | 3-4 hours |
|
||||
|
||||
---
|
||||
|
||||
## How to Use This Index
|
||||
|
||||
1. **Find your role** in "Learning Paths by Role"
|
||||
2. **Follow the recommended path** in order
|
||||
3. **Use the file link** to jump directly
|
||||
4. **Reference this page** anytime you need to find something
|
||||
|
||||
---
|
||||
|
||||
## Contribution Notes
|
||||
|
||||
When adding to the pipeline:
|
||||
|
||||
1. **Update inline code comments** in modified files
|
||||
2. **Update Doxygen documentation** for changed APIs
|
||||
3. **Update [CODE-READING-GUIDE.md](./CODE-READING-GUIDE.md)** if reading order changes
|
||||
4. **Update [../docs/pipeline-guide.md](../docs/pipeline-guide.md)** for major features
|
||||
5. **Update [../docs/pipeline-architecture.md](../docs/pipeline-architecture.md)** for design changes
|
||||
|
||||
---
|
||||
|
||||
## Additional Resources
|
||||
|
||||
### Within This Repository
|
||||
|
||||
- [../../docs/architecture.md](../../docs/architecture.md) - General app architecture
|
||||
- [../../docs/getting-started.md](../../docs/getting-started.md) - Project setup
|
||||
- [../../README.md](../../README.md) - Project overview
|
||||
|
||||
### External References
|
||||
|
||||
- [SQLite Documentation](https://www.sqlite.org/docs.html)
|
||||
- [C++ std::thread](https://en.cppreference.com/w/cpp/thread/thread)
|
||||
- [nlohmann/json](https://github.com/nlohmann/json) - JSON library
|
||||
- [Doxygen Documentation](https://www.doxygen.nl/)
|
||||
|
||||
---
|
||||
|
||||
## Last Updated
|
||||
|
||||
Documentation completed: 2024
|
||||
|
||||
- All code files documented with Doxygen comments
|
||||
- 4 comprehensive guides created
|
||||
- 4 ASCII diagrams included
|
||||
- 4 learning paths defined
|
||||
|
||||
---
|
||||
|
||||
**Start with [QUICK-START.md](./QUICK-START.md) to get running in 5 minutes!** 🚀
|
||||
|
||||
111
pipeline/includes/data_downloader.h
Normal file
111
pipeline/includes/data_downloader.h
Normal file
@@ -0,0 +1,111 @@
|
||||
/**
|
||||
* @file data_downloader.h
|
||||
* @brief Download geographic data from GitHub repositories using libcurl.
|
||||
*
|
||||
* Provides functionality to fetch JSON data from GitHub using libcurl, with
|
||||
* support for commit-based versioning to ensure reproducible builds. Downloads
|
||||
* are cached to avoid repeated network requests.
|
||||
*
|
||||
* Example usage:
|
||||
* @code
|
||||
* DataDownloader downloader;
|
||||
* std::string jsonPath = downloader.DownloadCountriesDatabase(
|
||||
* "/tmp/countries-data.json", // local cache path
|
||||
* "c5eb7772" // optional commit hash or HEAD
|
||||
* );
|
||||
* // Now use jsonPath with JsonLoader::LoadWorldCities(jsonPath, db)
|
||||
* @endcode
|
||||
*/
|
||||
|
||||
#ifndef DATA_DOWNLOADER_H
|
||||
#define DATA_DOWNLOADER_H
|
||||
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
|
||||
/**
 * @class DataDownloader
 * @brief Manages downloading and caching of geographic data from GitHub.
 *
 * Encapsulates the libcurl networking needed for reproducible data
 * fetching. All methods are synchronous and block until the transfer
 * completes or fails.
 *
 * @note Requires libcurl to be available at runtime.
 * @note Downloads are served through GitHub's raw-content CDN.
 */
class DataDownloader {
public:
  /**
   * @brief Default constructor.
   *
   * Sets up no state; the downloader is usable immediately after
   * construction.
   */
  DataDownloader();

  /**
   * @brief Destructor.
   *
   * Releases nothing beyond the object itself; no explicit cleanup is
   * required.
   */
  ~DataDownloader();

  /**
   * @brief Download the countries+states+cities JSON database from GitHub.
   *
   * Fetches the export from the dr5hn/countries-states-cities-database
   * repository. When a file is already present at @p cachePath it is
   * reused and no network request is made.
   *
   * The download URL has the form:
   * @verbatim
   * https://raw.githubusercontent.com/dr5hn/countries-states-cities-database/
   *     {commit}/json/countries+states+cities.json
   * @endverbatim
   *
   * @param cachePath Local filesystem path where the JSON file is stored.
   *                  An existing file at this path short-circuits the
   *                  download.
   * @param commit    Git commit hash or branch name pinning the data
   *                  version. Examples: "HEAD", "main", "c5eb7772",
   *                  "c5eb7772225f6b1802a54f39adb8c73464a85be1a".
   *
   * @return The path the JSON was saved to (identical to @p cachePath).
   *
   * @throws std::runtime_error when:
   *         - the network transfer fails,
   *         - the file cannot be written to @p cachePath, or
   *         - the commit does not exist on GitHub (404).
   *
   * Example with the default commit (stable 2026-03-28 export):
   * @code
   * std::string path =
   *     downloader.DownloadCountriesDatabase("/tmp/data.json");
   * @endcode
   *
   * Example pinning a different ref:
   * @code
   * std::string path = downloader.DownloadCountriesDatabase(
   *     "/tmp/data.json",
   *     "main" // Download latest from main branch
   * );
   * @endcode
   */
  std::string DownloadCountriesDatabase(
      const std::string &cachePath,
      const std::string &commit = "c5eb7772" // Stable commit: 2026-03-28 export
  );

private:
  /**
   * @brief Report whether a readable file exists at @p filePath.
   *
   * Implements the cache-hit check used by DownloadCountriesDatabase();
   * when this returns true, no download is performed.
   *
   * @param filePath Path to test.
   * @return True when the file exists and is readable, false otherwise.
   */
  bool FileExists(const std::string &filePath) const;
};
|
||||
|
||||
#endif // DATA_DOWNLOADER_H
|
||||
102
pipeline/includes/database.h
Normal file
102
pipeline/includes/database.h
Normal file
@@ -0,0 +1,102 @@
|
||||
#pragma once
|
||||
|
||||
#include <mutex>
|
||||
#include <sqlite3.h>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
/// @struct Country
/// @brief A country record carrying its ISO geographic identifiers.
struct Country {
  int id;           ///< Unique country identifier
  std::string name; ///< Country name
  std::string iso2; ///< 2-letter ISO code (e.g., "US", "CA")
  std::string iso3; ///< 3-letter ISO code (e.g., "USA", "CAN")
};
|
||||
|
||||
/// @struct State
/// @brief A state or province record with its geographic identifiers.
struct State {
  int id;           ///< Unique state identifier
  std::string name; ///< State or province name
  std::string iso2; ///< 2-letter state code (e.g., "CA", "ON")
  int countryId;    ///< Identifier of the country this state belongs to
};
|
||||
|
||||
/**
|
||||
* @class SqliteDatabase
|
||||
* @brief Thread-safe in-memory SQLite database wrapper for geographic data
|
||||
*
|
||||
* Manages a local in-memory SQLite database with countries, states, and cities.
|
||||
* All write operations are serialized via mutex to enable safe concurrent
|
||||
* access from multiple threads. Uses INSERT OR IGNORE for idempotent
|
||||
* operations.
|
||||
*
|
||||
* Schema Relationships:
|
||||
* countries (id, name, iso2, iso3)
|
||||
* ↓ (one-to-many)
|
||||
* states (id, country_id, name, iso2)
|
||||
* ↓ (one-to-many)
|
||||
* cities (id, state_id, country_id, name, latitude, longitude)
|
||||
*/
|
||||
class SqliteDatabase {
|
||||
private:
|
||||
sqlite3 *db = nullptr; ///< SQLite database connection handle
|
||||
std::mutex dbMutex; ///< Protects all database operations from race conditions
|
||||
|
||||
/// @brief Creates the schema with three related tables and foreign keys
|
||||
void InitializeSchema();
|
||||
|
||||
public:
|
||||
/// @brief Destructor: safely closes the database connection
|
||||
~SqliteDatabase();
|
||||
|
||||
/// @brief Opens an in-memory SQLite database and initializes the schema
|
||||
void Initialize();
|
||||
|
||||
/// @brief Inserts a country record
|
||||
/// @param id Unique country identifier
|
||||
/// @param name Country name
|
||||
/// @param iso2 2-letter ISO country code
|
||||
/// @param iso3 3-letter ISO country code
|
||||
/// @note Thread-safe: uses mutex lock. Idempotent: INSERT OR IGNORE prevents
|
||||
/// duplicates
|
||||
void InsertCountry(int id, const std::string &name, const std::string &iso2,
|
||||
const std::string &iso3);
|
||||
|
||||
/// @brief Inserts a state/province record
|
||||
/// @param id Unique state identifier
|
||||
/// @param countryId Foreign key reference to parent country
|
||||
/// @param name State/province name
|
||||
/// @param iso2 2-letter state code (e.g., "CA", "ON")
|
||||
/// @note Thread-safe and idempotent via mutex and INSERT OR IGNORE
|
||||
void InsertState(int id, int countryId, const std::string &name,
|
||||
const std::string &iso2);
|
||||
|
||||
/// @brief Inserts a city record with geographic coordinates
|
||||
/// @param id Unique city identifier
|
||||
/// @param stateId Foreign key reference to parent state
|
||||
/// @param countryId Foreign key reference to parent country
|
||||
/// @param name City name
|
||||
/// @param latitude Geographic latitude coordinate (WGS84)
|
||||
/// @param longitude Geographic longitude coordinate (WGS84)
|
||||
/// @note Thread-safe and idempotent. Called by multithreaded JSON loader.
|
||||
void InsertCity(int id, int stateId, int countryId, const std::string &name,
|
||||
double latitude, double longitude);
|
||||
|
||||
/// @brief Queries all cities from the database
|
||||
/// @return Vector of (city_id, city_name) pairs sorted alphabetically
|
||||
std::vector<std::pair<int, std::string>> QueryCities();
|
||||
|
||||
/// @brief Queries all countries from the database with ISO codes
|
||||
/// @param limit Maximum number of records to return (0 = all)
|
||||
/// @return Vector of Country structs (includes id, name, iso2, iso3) sorted
|
||||
/// alphabetically
|
||||
std::vector<Country> QueryCountries(int limit = 0);
|
||||
|
||||
/// @brief Queries all states from the database with ISO codes
|
||||
/// @param limit Maximum number of records to return (0 = all)
|
||||
/// @return Vector of State structs (includes id, name, iso2, countryId)
|
||||
/// sorted alphabetically
|
||||
std::vector<State> QueryStates(int limit = 0);
|
||||
};
|
||||
59
pipeline/includes/generator.h
Normal file
59
pipeline/includes/generator.h
Normal file
@@ -0,0 +1,59 @@
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
/**
 * @class LlamaBreweryGenerator
 * @brief Produces brewery names and descriptions for cities.
 *
 * The current implementation is a deterministic mock: city names are
 * mapped onto fixed name/description templates via std::hash, so the
 * same inputs always yield the same brewery — convenient for testing.
 *
 * Designed as a strategy-pattern seam: the LoadModel() and
 * GenerateBrewery() signatures stay unchanged once real llama.cpp
 * inference replaces the mock.
 */
class LlamaBreweryGenerator {
private:
  /// Adjective pool for brewery names (e.g., "Craft", "Heritage", ...).
  const std::vector<std::string> breweryAdjectives = {
      "Craft", "Heritage", "Local", "Artisan",
      "Pioneer", "Golden", "Modern", "Classic"};

  /// Noun pool for brewery names (e.g., "Brewing Co.", "Brewery", ...).
  const std::vector<std::string> breweryNouns = {
      "Brewing Co.", "Brewery", "Bier Haus", "Taproom",
      "Works", "House", "Fermentery", "Ale Co."};

  /// Hand-written description templates the mock selects from.
  const std::vector<std::string> descriptions = {
      "Handcrafted pale ales and seasonal IPAs with local ingredients.",
      "Traditional lagers and experimental sours in small batches.",
      "Award-winning stouts and wildly hoppy blonde ales.",
      "Craft brewery specializing in Belgian-style triples and dark porters.",
      "Modern brewery blending tradition with bold experimental flavors."};

public:
  /// @struct Brewery
  /// @brief Result record returned by GenerateBrewery().
  struct Brewery {
    std::string name;        ///< Generated brewery name (e.g., "Craft Brewing Co.")
    std::string description; ///< Short blurb describing the brewery's offerings
  };

  /// @brief Loads a language model (a no-op in the current mock).
  /// @param modelPath Path to a GGUF model file (ignored by the mock).
  /// @note The real implementation will load a llama.cpp model here.
  void LoadModel(const std::string &modelPath);

  /// @brief Generates a brewery name and description for a city.
  /// @param cityName City to generate a brewery for.
  /// @param seed     Integer seed driving the deterministic mock output.
  /// @return Brewery struct holding a name and description.
  /// @note Deterministic: the same cityName + seed always produces the
  ///       same brewery.
  Brewery GenerateBrewery(const std::string &cityName, int seed);
};
|
||||
85
pipeline/includes/json_loader.h
Normal file
85
pipeline/includes/json_loader.h
Normal file
@@ -0,0 +1,85 @@
|
||||
#pragma once
|
||||
|
||||
#include "database.h"
|
||||
#include <nlohmann/json.hpp>
|
||||
#include <string>
|
||||
|
||||
using json = nlohmann::json;
|
||||
|
||||
/**
|
||||
* @class JsonLoader
|
||||
* @brief Loads world geographic data from JSON file into SQLite database
|
||||
*
|
||||
* Handles parsing and population of world cities, states, and countries from
|
||||
* a structured JSON source file. The loader uses parallel threads to chunk
|
||||
* the city records and maximize database insertion throughput.
|
||||
*
|
||||
* Input Format (JSON Structure):
|
||||
* @code
|
||||
* {
|
||||
* "countries": [
|
||||
* {"id": 1, "name": "Canada", "iso2": "CA", "iso3": "CAN"},
|
||||
* ...
|
||||
* ],
|
||||
* "states": [
|
||||
* {"id": 1, "country_id": 1, "name": "Ontario", "iso2": "ON"},
|
||||
* ...
|
||||
* ],
|
||||
* "cities": [
|
||||
* {"id": 1, "state_id": 1, "country_id": 1, "name": "Toronto",
|
||||
* "latitude": 43.6532, "longitude": -79.3832},
|
||||
* ...
|
||||
* ]
|
||||
* }
|
||||
* @endcode
|
||||
*
|
||||
* Performance Characteristics:
|
||||
* - Reads entire JSON file into memory (nlohmann/json parser)
|
||||
* - Iterates through countries: typically 200+ records
|
||||
* - Iterates through states: typically 3000+ records
|
||||
* - Iterates through cities: typically 50,000+ records (MAJOR DATASET)
|
||||
* - Uses multithreading to chunk city insertion across threads
|
||||
* - Thread pool size defaults to number of CPU cores
|
||||
*
|
||||
* Multithreading Strategy:
|
||||
* - Divides cities into N chunks (N = CPU core count)
|
||||
* - Each thread processes one chunk sequentially
|
||||
* - Database has mutex protection for thread-safe concurrent access
|
||||
* - Allows safe parallel writing to same SQLite database
|
||||
*
|
||||
* Example Usage:
|
||||
* @code
|
||||
* SqliteDatabase db;
|
||||
* db.Initialize();
|
||||
* JsonLoader::LoadWorldCities("../data/world_city_data.json", db);
|
||||
* // Database now contains all countries, states, and cities
|
||||
* @endcode
|
||||
*/
|
||||
class JsonLoader {
|
||||
public:
|
||||
/// @brief Loads world geographic data from JSON and populates database
|
||||
///
|
||||
/// Process:
|
||||
/// 1. Reads and parses entire JSON file
|
||||
/// 2. Inserts all countries into database (typically 200-250 records)
|
||||
/// 3. Inserts all states/provinces (typically 3000+ records)
|
||||
/// 4. Spawns worker threads to insert cities (typically 50,000+ records)
|
||||
/// 5. Waits for all threads to complete
|
||||
/// 6. Prints statistics about loaded data
|
||||
///
|
||||
/// @param jsonPath Filesystem path to world_city_data.json
|
||||
/// @param db Reference to initialized SqliteDatabase to populate
|
||||
///
|
||||
/// @throws std::runtime_error if JSON file cannot be read or parsed
|
||||
/// @throws std::runtime_error if database insertion fails
|
||||
///
|
||||
/// Output Examples:
|
||||
/// @code
|
||||
/// Loading JSON: ../data/world_city_data.json
|
||||
/// Loaded countries: 250
|
||||
/// Loaded states: 3500
|
||||
/// Loaded cities: 52000
|
||||
/// ✓ World city data loaded successfully
|
||||
/// @endcode
|
||||
static void LoadWorldCities(const std::string &jsonPath, SqliteDatabase &db);
|
||||
};
|
||||
@@ -1,27 +0,0 @@
|
||||
#!/bin/bash

# Fetch breweries data from the OpenBreweryDB API, one page at a time.
# Per-page responses are saved to misc/raw-data/page-N.json and the merged
# result is written to misc/raw-data/breweries-complete.json.

OUTPUT_DIR="misc/raw-data"
# Fix: OUTPUT_FILE was referenced below but never defined, so the combined
# output promised by the header comment was never actually produced.
OUTPUT_FILE="$OUTPUT_DIR/breweries-complete.json"
API_BASE="https://api.openbrewerydb.org/v1/breweries"

mkdir -p "$OUTPUT_DIR"

echo "Fetching breweries from OpenBreweryDB API..."
echo "[]" > "$OUTPUT_FILE"

total_count=0

for page in {1..30}; do
    echo "Fetching page $page..."

    curl -s "$API_BASE?per_page=200&page=$page" | \
        jq '.' > "$OUTPUT_DIR/page-$page.json"

    count=$(jq 'length' "$OUTPUT_DIR/page-$page.json")
    total_count=$((total_count + count))
    echo "  Got $count breweries (total: $total_count)"
done

# Merge the per-page arrays into the single combined output file.
jq -s 'add' "$OUTPUT_DIR"/page-*.json > "$OUTPUT_FILE"

echo "Done fetching. Total breweries fetched: $total_count"
|
||||
163
pipeline/src/data_downloader.cpp
Normal file
163
pipeline/src/data_downloader.cpp
Normal file
@@ -0,0 +1,163 @@
|
||||
/**
|
||||
* @file data_downloader.cpp
|
||||
* @brief Implementation of DataDownloader using libcurl for HTTP downloads.
|
||||
*
|
||||
* Provides robust downloading with proper error handling, timeout management,
|
||||
* and local caching to avoid repeated network calls. Uses GitHub's raw content
|
||||
* CDN for reliable high-bandwidth downloads.
|
||||
*/
|
||||
|
||||
#include "data_downloader.h"
|
||||
#include <cstdio>
|
||||
#include <curl/curl.h>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <sys/stat.h>
|
||||
|
||||
/**
 * @brief Callback function for libcurl to write downloaded content to file.
 *
 * This callback is invoked repeatedly by curl as data arrives over the
 * network. Each invocation contains a chunk of the response body, which is
 * appended to the output file stream.
 *
 * @param contents Pointer to buffer containing data chunk.
 * @param size Element size (always 1 for text).
 * @param nmemb Number of elements in chunk.
 * @param userp Opaque pointer to std::ofstream.
 *
 * @return Bytes consumed. Returning less than (size * nmemb) signals an
 *         error to curl and aborts the transfer.
 *
 * @note libcurl requires this signature: (void* ptr, size_t size,
 *       size_t nmemb, void* userp)
 */
static size_t WriteCallback(void *contents, size_t size, size_t nmemb,
                            void *userp) {
  // Calculate total bytes in this chunk
  size_t realsize = size * nmemb;

  // Cast userp back to ofstream
  std::ofstream *outFile = static_cast<std::ofstream *>(userp);

  // Write to file
  outFile->write(static_cast<char *>(contents), realsize);

  // BUGFIX: previously this always reported success even when the stream
  // write failed (e.g. disk full). Return 0 so curl aborts the transfer
  // with CURLE_WRITE_ERROR instead of silently producing a truncated file.
  if (!outFile->good()) {
    return 0;
  }

  return realsize;
}
|
||||
|
||||
/// @brief Constructs a downloader. Intentionally performs no curl setup.
DataDownloader::DataDownloader() {
  // curl_global_init is called by user or external subsystem in a thread-safe
  // manner. Not calling it here to avoid multiple initialization in
  // multi-downloader scenarios.
}
|
||||
|
||||
/// @brief Destructor. Intentionally a no-op (see constructor note).
DataDownloader::~DataDownloader() {
  // No explicit cleanup needed; curl_global_cleanup managed externally.
}
|
||||
|
||||
/// @brief Returns true when a regular filesystem entry exists at @p filePath.
///
/// Probes with POSIX stat() so the file is never opened.
bool DataDownloader::FileExists(const std::string &filePath) const {
  struct stat info;
  const int rc = stat(filePath.c_str(), &info);
  return rc == 0;
}
|
||||
|
||||
std::string
|
||||
DataDownloader::DownloadCountriesDatabase(const std::string &cachePath,
|
||||
const std::string &commit) {
|
||||
// Check if file already cached locally
|
||||
if (FileExists(cachePath)) {
|
||||
std::cout << "[DataDownloader] Cache hit: " << cachePath << std::endl;
|
||||
return cachePath;
|
||||
}
|
||||
|
||||
// Construct download URL
|
||||
// Full commit hash is accepted, but only first 7 chars (short hash) are
|
||||
// needed
|
||||
std::string shortCommit = commit;
|
||||
if (commit.length() > 7) {
|
||||
shortCommit = commit.substr(0, 7);
|
||||
}
|
||||
|
||||
std::string url = "https://raw.githubusercontent.com/dr5hn/"
|
||||
"countries-states-cities-database/" +
|
||||
shortCommit + "/json/countries+states+cities.json";
|
||||
|
||||
std::cout << "[DataDownloader] Downloading: " << url << std::endl;
|
||||
|
||||
// Initialize curl handle
|
||||
CURL *curl = curl_easy_init();
|
||||
if (!curl) {
|
||||
throw std::runtime_error("[DataDownloader] Failed to initialize libcurl");
|
||||
}
|
||||
|
||||
// Open output file for writing (binary mode to preserve exact bytes)
|
||||
std::ofstream outFile(cachePath, std::ios::binary);
|
||||
if (!outFile.is_open()) {
|
||||
curl_easy_cleanup(curl);
|
||||
throw std::runtime_error("[DataDownloader] Cannot open file for writing: " +
|
||||
cachePath);
|
||||
}
|
||||
|
||||
// Configure curl for download
|
||||
curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
|
||||
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteCallback);
|
||||
curl_easy_setopt(curl, CURLOPT_WRITEDATA, static_cast<void *>(&outFile));
|
||||
|
||||
// Set reasonable timeout (30 seconds for initial connection, 300s for
|
||||
// transfer)
|
||||
curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, 30L);
|
||||
curl_easy_setopt(curl, CURLOPT_TIMEOUT, 300L);
|
||||
|
||||
// Follow redirects (CDN may redirect)
|
||||
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
|
||||
curl_easy_setopt(curl, CURLOPT_MAXREDIRS, 5L);
|
||||
|
||||
// Use gzip compression if server supports it
|
||||
curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, "gzip");
|
||||
|
||||
// Set user agent to identify the application
|
||||
curl_easy_setopt(curl, CURLOPT_USERAGENT, "biergarten-pipeline/0.1.0");
|
||||
|
||||
// Perform the download
|
||||
CURLcode res = curl_easy_perform(curl);
|
||||
outFile.close();
|
||||
|
||||
// Check for curl errors
|
||||
if (res != CURLE_OK) {
|
||||
curl_easy_cleanup(curl);
|
||||
|
||||
// Remove partially downloaded file
|
||||
std::remove(cachePath.c_str());
|
||||
|
||||
std::string error = std::string("[DataDownloader] Download failed: ") +
|
||||
curl_easy_strerror(res);
|
||||
throw std::runtime_error(error);
|
||||
}
|
||||
|
||||
// Check HTTP response code
|
||||
long httpCode = 0;
|
||||
curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &httpCode);
|
||||
curl_easy_cleanup(curl);
|
||||
|
||||
if (httpCode != 200) {
|
||||
// Remove partially downloaded or error file
|
||||
std::remove(cachePath.c_str());
|
||||
|
||||
std::stringstream ss;
|
||||
ss << "[DataDownloader] HTTP error " << httpCode
|
||||
<< " (commit: " << shortCommit << ")";
|
||||
throw std::runtime_error(ss.str());
|
||||
}
|
||||
|
||||
// Get file size for diagnostics
|
||||
std::ifstream fileCheck(cachePath, std::ios::binary | std::ios::ate);
|
||||
std::streamsize size = fileCheck.tellg();
|
||||
fileCheck.close();
|
||||
|
||||
std::cout << "[DataDownloader] ✓ Download complete: " << cachePath << " ("
|
||||
<< (size / (1024.0 * 1024.0)) << " MB)" << std::endl;
|
||||
return cachePath;
|
||||
}
|
||||
229
pipeline/src/database.cpp
Normal file
229
pipeline/src/database.cpp
Normal file
@@ -0,0 +1,229 @@
|
||||
#include "database.h"
|
||||
#include <iostream>
|
||||
#include <stdexcept>
|
||||
|
||||
void SqliteDatabase::InitializeSchema() {
|
||||
std::lock_guard<std::mutex> lock(dbMutex);
|
||||
|
||||
const char *schema = R"(
|
||||
CREATE TABLE IF NOT EXISTS countries (
|
||||
id INTEGER PRIMARY KEY,
|
||||
name TEXT NOT NULL,
|
||||
iso2 TEXT,
|
||||
iso3 TEXT
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS states (
|
||||
id INTEGER PRIMARY KEY,
|
||||
country_id INTEGER NOT NULL,
|
||||
name TEXT NOT NULL,
|
||||
iso2 TEXT,
|
||||
FOREIGN KEY(country_id) REFERENCES countries(id)
|
||||
);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS cities (
|
||||
id INTEGER PRIMARY KEY,
|
||||
state_id INTEGER NOT NULL,
|
||||
country_id INTEGER NOT NULL,
|
||||
name TEXT NOT NULL,
|
||||
latitude REAL,
|
||||
longitude REAL,
|
||||
FOREIGN KEY(state_id) REFERENCES states(id),
|
||||
FOREIGN KEY(country_id) REFERENCES countries(id)
|
||||
);
|
||||
)";
|
||||
|
||||
char *errMsg = nullptr;
|
||||
int rc = sqlite3_exec(db, schema, nullptr, nullptr, &errMsg);
|
||||
if (rc != SQLITE_OK) {
|
||||
std::string error = errMsg ? std::string(errMsg) : "Unknown error";
|
||||
sqlite3_free(errMsg);
|
||||
throw std::runtime_error("Failed to create schema: " + error);
|
||||
}
|
||||
}
|
||||
|
||||
/// @brief Releases the SQLite connection if one was opened.
SqliteDatabase::~SqliteDatabase() {
  if (db != nullptr) {
    sqlite3_close(db);
  }
}
|
||||
|
||||
void SqliteDatabase::Initialize() {
|
||||
int rc = sqlite3_open(":memory:", &db);
|
||||
if (rc) {
|
||||
throw std::runtime_error("Failed to create in-memory SQLite database");
|
||||
}
|
||||
std::cout << "✓ In-memory SQLite database created\n";
|
||||
InitializeSchema();
|
||||
}
|
||||
|
||||
void SqliteDatabase::InsertCountry(int id, const std::string &name,
|
||||
const std::string &iso2,
|
||||
const std::string &iso3) {
|
||||
std::lock_guard<std::mutex> lock(dbMutex);
|
||||
|
||||
const char *query = R"(
|
||||
INSERT OR IGNORE INTO countries (id, name, iso2, iso3)
|
||||
VALUES (?, ?, ?, ?)
|
||||
)";
|
||||
|
||||
sqlite3_stmt *stmt;
|
||||
int rc = sqlite3_prepare_v2(db, query, -1, &stmt, nullptr);
|
||||
if (rc != SQLITE_OK)
|
||||
throw std::runtime_error("Failed to prepare country insert");
|
||||
|
||||
sqlite3_bind_int(stmt, 1, id);
|
||||
sqlite3_bind_text(stmt, 2, name.c_str(), -1, SQLITE_STATIC);
|
||||
sqlite3_bind_text(stmt, 3, iso2.c_str(), -1, SQLITE_STATIC);
|
||||
sqlite3_bind_text(stmt, 4, iso3.c_str(), -1, SQLITE_STATIC);
|
||||
|
||||
if (sqlite3_step(stmt) != SQLITE_DONE) {
|
||||
throw std::runtime_error("Failed to insert country");
|
||||
}
|
||||
sqlite3_finalize(stmt);
|
||||
}
|
||||
|
||||
void SqliteDatabase::InsertState(int id, int countryId, const std::string &name,
|
||||
const std::string &iso2) {
|
||||
std::lock_guard<std::mutex> lock(dbMutex);
|
||||
|
||||
const char *query = R"(
|
||||
INSERT OR IGNORE INTO states (id, country_id, name, iso2)
|
||||
VALUES (?, ?, ?, ?)
|
||||
)";
|
||||
|
||||
sqlite3_stmt *stmt;
|
||||
int rc = sqlite3_prepare_v2(db, query, -1, &stmt, nullptr);
|
||||
if (rc != SQLITE_OK)
|
||||
throw std::runtime_error("Failed to prepare state insert");
|
||||
|
||||
sqlite3_bind_int(stmt, 1, id);
|
||||
sqlite3_bind_int(stmt, 2, countryId);
|
||||
sqlite3_bind_text(stmt, 3, name.c_str(), -1, SQLITE_STATIC);
|
||||
sqlite3_bind_text(stmt, 4, iso2.c_str(), -1, SQLITE_STATIC);
|
||||
|
||||
if (sqlite3_step(stmt) != SQLITE_DONE) {
|
||||
throw std::runtime_error("Failed to insert state");
|
||||
}
|
||||
sqlite3_finalize(stmt);
|
||||
}
|
||||
|
||||
void SqliteDatabase::InsertCity(int id, int stateId, int countryId,
|
||||
const std::string &name, double latitude,
|
||||
double longitude) {
|
||||
std::lock_guard<std::mutex> lock(dbMutex);
|
||||
|
||||
const char *query = R"(
|
||||
INSERT OR IGNORE INTO cities (id, state_id, country_id, name, latitude, longitude)
|
||||
VALUES (?, ?, ?, ?, ?, ?)
|
||||
)";
|
||||
|
||||
sqlite3_stmt *stmt;
|
||||
int rc = sqlite3_prepare_v2(db, query, -1, &stmt, nullptr);
|
||||
if (rc != SQLITE_OK)
|
||||
throw std::runtime_error("Failed to prepare city insert");
|
||||
|
||||
sqlite3_bind_int(stmt, 1, id);
|
||||
sqlite3_bind_int(stmt, 2, stateId);
|
||||
sqlite3_bind_int(stmt, 3, countryId);
|
||||
sqlite3_bind_text(stmt, 4, name.c_str(), -1, SQLITE_STATIC);
|
||||
sqlite3_bind_double(stmt, 5, latitude);
|
||||
sqlite3_bind_double(stmt, 6, longitude);
|
||||
|
||||
if (sqlite3_step(stmt) != SQLITE_DONE) {
|
||||
throw std::runtime_error("Failed to insert city");
|
||||
}
|
||||
sqlite3_finalize(stmt);
|
||||
}
|
||||
|
||||
std::vector<std::pair<int, std::string>> SqliteDatabase::QueryCities() {
|
||||
std::lock_guard<std::mutex> lock(dbMutex);
|
||||
|
||||
std::vector<std::pair<int, std::string>> cities;
|
||||
sqlite3_stmt *stmt = nullptr;
|
||||
|
||||
const char *query = "SELECT id, name FROM cities ORDER BY name";
|
||||
int rc = sqlite3_prepare_v2(db, query, -1, &stmt, nullptr);
|
||||
|
||||
if (rc != SQLITE_OK) {
|
||||
throw std::runtime_error("Failed to prepare query");
|
||||
}
|
||||
|
||||
while (sqlite3_step(stmt) == SQLITE_ROW) {
|
||||
int id = sqlite3_column_int(stmt, 0);
|
||||
const char *name =
|
||||
reinterpret_cast<const char *>(sqlite3_column_text(stmt, 1));
|
||||
cities.push_back({id, name ? std::string(name) : ""});
|
||||
}
|
||||
|
||||
sqlite3_finalize(stmt);
|
||||
return cities;
|
||||
}
|
||||
|
||||
std::vector<Country> SqliteDatabase::QueryCountries(int limit) {
|
||||
std::lock_guard<std::mutex> lock(dbMutex);
|
||||
|
||||
std::vector<Country> countries;
|
||||
sqlite3_stmt *stmt = nullptr;
|
||||
|
||||
std::string query =
|
||||
"SELECT id, name, iso2, iso3 FROM countries ORDER BY name";
|
||||
if (limit > 0) {
|
||||
query += " LIMIT " + std::to_string(limit);
|
||||
}
|
||||
|
||||
int rc = sqlite3_prepare_v2(db, query.c_str(), -1, &stmt, nullptr);
|
||||
|
||||
if (rc != SQLITE_OK) {
|
||||
throw std::runtime_error("Failed to prepare countries query");
|
||||
}
|
||||
|
||||
while (sqlite3_step(stmt) == SQLITE_ROW) {
|
||||
int id = sqlite3_column_int(stmt, 0);
|
||||
const char *name =
|
||||
reinterpret_cast<const char *>(sqlite3_column_text(stmt, 1));
|
||||
const char *iso2 =
|
||||
reinterpret_cast<const char *>(sqlite3_column_text(stmt, 2));
|
||||
const char *iso3 =
|
||||
reinterpret_cast<const char *>(sqlite3_column_text(stmt, 3));
|
||||
countries.push_back({id, name ? std::string(name) : "",
|
||||
iso2 ? std::string(iso2) : "",
|
||||
iso3 ? std::string(iso3) : ""});
|
||||
}
|
||||
|
||||
sqlite3_finalize(stmt);
|
||||
return countries;
|
||||
}
|
||||
|
||||
std::vector<State> SqliteDatabase::QueryStates(int limit) {
|
||||
std::lock_guard<std::mutex> lock(dbMutex);
|
||||
|
||||
std::vector<State> states;
|
||||
sqlite3_stmt *stmt = nullptr;
|
||||
|
||||
std::string query =
|
||||
"SELECT id, name, iso2, country_id FROM states ORDER BY name";
|
||||
if (limit > 0) {
|
||||
query += " LIMIT " + std::to_string(limit);
|
||||
}
|
||||
|
||||
int rc = sqlite3_prepare_v2(db, query.c_str(), -1, &stmt, nullptr);
|
||||
|
||||
if (rc != SQLITE_OK) {
|
||||
throw std::runtime_error("Failed to prepare states query");
|
||||
}
|
||||
|
||||
while (sqlite3_step(stmt) == SQLITE_ROW) {
|
||||
int id = sqlite3_column_int(stmt, 0);
|
||||
const char *name =
|
||||
reinterpret_cast<const char *>(sqlite3_column_text(stmt, 1));
|
||||
const char *iso2 =
|
||||
reinterpret_cast<const char *>(sqlite3_column_text(stmt, 2));
|
||||
int countryId = sqlite3_column_int(stmt, 3);
|
||||
states.push_back({id, name ? std::string(name) : "",
|
||||
iso2 ? std::string(iso2) : "", countryId});
|
||||
}
|
||||
|
||||
sqlite3_finalize(stmt);
|
||||
return states;
|
||||
}
|
||||
81
pipeline/src/generator.cpp
Normal file
81
pipeline/src/generator.cpp
Normal file
@@ -0,0 +1,81 @@
|
||||
#include "generator.h"
|
||||
#include <functional>
|
||||
#include <iostream>
|
||||
|
||||
/**
|
||||
* @brief Initializes the brewery generator by loading a language model
|
||||
*
|
||||
* Current Implementation (Mock):
|
||||
* - Outputs informational messages about model initialization
|
||||
* - Does not load actual llama.cpp model yet
|
||||
* - Serves as interface definition for future real implementation
|
||||
*
|
||||
* Future Implementation:
|
||||
* - Will load a GGUF-format LLM model file using llama.cpp
|
||||
* - Will initialize CPU/GPU inference context
|
||||
* - Will cache model weights for repeated brewery generation
|
||||
*
|
||||
* @param modelPath Path to GGUF model file (e.g., "models/llama-7b.gguf")
|
||||
*
|
||||
* Example output:
|
||||
* @code
|
||||
* [Mock] Initialized llama model: models/llama-7b.gguf
|
||||
* ✓ Model ready
|
||||
* @endcode
|
||||
*/
|
||||
void LlamaBreweryGenerator::LoadModel(const std::string &modelPath) {
|
||||
std::cout << " [Mock] Initialized llama model: " << modelPath << "\n";
|
||||
std::cout << " ✓ Model ready\n";
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Generates a brewery name and description for a city using
|
||||
* deterministic hashing
|
||||
*
|
||||
* Algorithm:
|
||||
* 1. Combines city name with seed to create unique hash input
|
||||
* 2. Uses std::hash<std::string> to compute deterministic hash value
|
||||
* 3. Uses modulo arithmetic to map hash to template arrays:
|
||||
* - name: adjective[hash % 8] + noun[(hash/7) % 8]
|
||||
* - description: descriptions[(hash/13) % 5]
|
||||
* 4. Returns Brewery struct with generated name and description
|
||||
*
|
||||
* Determinism:
|
||||
* - Same cityName + seed ALWAYS produces same result
|
||||
* - Enables reproducible testing and consistent brewery assignments
|
||||
* - Hash distribution spreads city names across template combinations
|
||||
*
|
||||
* Example:
|
||||
* @code
|
||||
* auto gen = LlamaBreweryGenerator();
|
||||
* auto brewery = gen.GenerateBrewery("Toronto", 1);
|
||||
* // Always produces same brewery for same city/seed
|
||||
* assert(gen.GenerateBrewery("Toronto", 1).name == brewery.name);
|
||||
* @endcode
|
||||
*
|
||||
* @param cityName The city to generate a brewery for
|
||||
* @param seed An integer seed for deterministic variation (usually 0 or row ID)
|
||||
* @return Brewery struct containing:
|
||||
* - name: Combined adjective + noun (e.g., "Craft Brewing Co.")
|
||||
* - description: Pre-written description matching brewery style
|
||||
*
|
||||
* @note Future: Replace hashing with actual LLM inference
|
||||
* Interface will remain identical for smooth migration
|
||||
*/
|
||||
LlamaBreweryGenerator::Brewery
|
||||
LlamaBreweryGenerator::GenerateBrewery(const std::string &cityName, int seed) {
|
||||
// Deterministic mock generation based on city name and seed
|
||||
// Combines city name with seed to ensure different results for same city
|
||||
// with different seed values (useful for generating multiple breweries per
|
||||
// city)
|
||||
size_t nameHash = std::hash<std::string>{}(cityName + std::to_string(seed));
|
||||
|
||||
Brewery result;
|
||||
// Select adjective and noun using hash modulo
|
||||
// Divided by 7 and 13 to ensure different modulo results from same hash
|
||||
result.name = breweryAdjectives[nameHash % breweryAdjectives.size()] + " " +
|
||||
breweryNouns[(nameHash / 7) % breweryNouns.size()];
|
||||
result.description = descriptions[(nameHash / 13) % descriptions.size()];
|
||||
|
||||
return result;
|
||||
}
|
||||
222
pipeline/src/json_loader.cpp
Normal file
222
pipeline/src/json_loader.cpp
Normal file
@@ -0,0 +1,222 @@
|
||||
#include "json_loader.h"
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <mutex>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
|
||||
/**
|
||||
* @brief Loads world geographic data from JSON file into SQLite database
|
||||
*
|
||||
* This function implements a hierarchical multithreaded loading strategy:
|
||||
*
|
||||
* THREADING ARCHITECTURE:
|
||||
* ┌─────────────────────────────────────────────────────────────────┐
|
||||
* │ Main Thread: Parse JSON (45 MB) │
|
||||
* └────────────────────┬────────────────────────────────────────────┘
|
||||
* │
|
||||
* ┌─────────────┴──────────────┬──────────────┐
|
||||
* ▼ ▼ ▼
|
||||
* Country Thread 0 Country Thread 1 ... Thread N
|
||||
* ├─ Insert Country ├─ Insert Country └─ Insert Country
|
||||
* │
|
||||
* ├─ State Thread A ├─ State Thread C
|
||||
* │ ├─ Insert State │ ├─ Insert State
|
||||
* │ ├─ Insert 100 cities │ └─ Insert 150 cities
|
||||
* │ └─ +stats └─ +stats
|
||||
* │
|
||||
* └─ State Thread B
|
||||
* ├─ Insert State
|
||||
* ├─ Insert 200 cities
|
||||
* └─ +stats
|
||||
*
|
||||
* THREADING DETAILS:
|
||||
* - Countries loop: divided among CPU_CORE_COUNT threads
|
||||
* - Each country: states processed in dedicated threads (nested parallelism)
|
||||
* - Each state: cities inserted sequentially (within thread)
|
||||
* - All writes protected by mutex in SqliteDatabase
|
||||
* - Processing stats (city count) synchronized with mutex
|
||||
*
|
||||
* INPUT JSON STRUCTURE:
|
||||
* The JSON file contains three main arrays:
|
||||
*
|
||||
* 1. Countries (~250 records):
|
||||
* { id: int, name: string, iso2: string, iso3: string }
|
||||
*
|
||||
* 2. States/Provinces (~3500 records):
|
||||
* { id: int, country_id: int, name: string, iso2: string }
|
||||
*
|
||||
* 3. Cities (~50000 records):
|
||||
* { id: int, state_id: int, country_id: int, name: string,
|
||||
* latitude: double, longitude: double }
|
||||
*
|
||||
* PERFORMANCE:
|
||||
* - JSON parsing: Single-threaded, happens once at start
|
||||
* - Country insertion: Parallelized across CPU cores
|
||||
* - State insertion: Parallelized within each country via nested threads
|
||||
* - City insertion: Sequential within each state (reduces serialization)
|
||||
* - Total expected runtime: 2-5 seconds for 50k cities on modern CPU
|
||||
*
|
||||
* ERROR HANDLING:
|
||||
* - Missing JSON file: throws std::runtime_error
|
||||
* - Invalid JSON: throws nlohmann::json::parse_error
|
||||
* - Bad city records: silently skipped (try-catch within loop)
|
||||
* - Database errors: re-thrown from db.Insert*() calls
|
||||
*
|
||||
* STATISTICS:
|
||||
* Prints progress messages showing:
|
||||
* - Number of countries loaded
|
||||
* - Number of worker threads created
|
||||
* - Total cities inserted into database
|
||||
*
|
||||
* @param jsonPath Path to JSON file (typically: ../data/world_city_data.json)
|
||||
* @param db Reference to initialized SqliteDatabase to populate
|
||||
*/
|
||||
void JsonLoader::LoadWorldCities(const std::string &jsonPath,
|
||||
SqliteDatabase &db) {
|
||||
std::cout << "\nLoading " << jsonPath << " (45 MB)...\n";
|
||||
|
||||
// Open and read JSON file from disk
|
||||
std::ifstream jsonFile(jsonPath);
|
||||
if (!jsonFile.is_open()) {
|
||||
throw std::runtime_error("Failed to open JSON file: " + jsonPath);
|
||||
}
|
||||
|
||||
// Parse entire JSON into memory (nlohmann/json library)
|
||||
json data;
|
||||
try {
|
||||
jsonFile >> data;
|
||||
} catch (const std::exception &e) {
|
||||
throw std::runtime_error("JSON parse error: " + std::string(e.what()));
|
||||
}
|
||||
jsonFile.close();
|
||||
|
||||
// DEBUG: Check JSON structure
|
||||
if (!data.is_array()) {
|
||||
std::cerr << "[DEBUG] JSON root is not an array. Type: " << data.type_name()
|
||||
<< std::endl;
|
||||
if (data.is_object()) {
|
||||
std::cerr << "[DEBUG] JSON root is object with keys: ";
|
||||
for (auto &[key, val] : data.items()) {
|
||||
std::cerr << key << " ";
|
||||
}
|
||||
std::cerr << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "✓ Loaded " << data.size()
|
||||
<< " records (expecting countries array)\n";
|
||||
|
||||
if (data.size() == 0) {
|
||||
throw std::runtime_error("JSON file appears to be empty or malformed. "
|
||||
"Check download succeeded.");
|
||||
}
|
||||
|
||||
std::cout << "Processing countries with multithreading...\n";
|
||||
|
||||
// Determine optimal thread count based on CPU cores
|
||||
unsigned int numThreads = std::thread::hardware_concurrency();
|
||||
std::cout << " Using " << numThreads << " threads\n\n";
|
||||
|
||||
// Shared counter for statistics (protected by mutex)
|
||||
int processedCities = 0;
|
||||
std::mutex statsMutex;
|
||||
|
||||
// Spawn threads to process countries in parallel
|
||||
std::vector<std::thread> countryThreads;
|
||||
const size_t countriesPerThread = (data.size() + numThreads - 1) / numThreads;
|
||||
|
||||
for (size_t t = 0; t < numThreads; ++t) {
|
||||
countryThreads.push_back(std::thread([&, t]() {
|
||||
// Each thread processes a range of countries
|
||||
size_t start = t * countriesPerThread;
|
||||
size_t end = std::min((t + 1) * countriesPerThread, data.size());
|
||||
|
||||
for (size_t i = start; i < end; ++i) {
|
||||
const auto &country = data[i];
|
||||
int countryId = country["id"];
|
||||
std::string countryName = country["name"];
|
||||
std::string iso2 = country.value("iso2", "");
|
||||
std::string iso3 = country.value("iso3", "");
|
||||
|
||||
// Insert country record
|
||||
db.InsertCountry(countryId, countryName, iso2, iso3);
|
||||
|
||||
// Process states within this country
|
||||
if (country.contains("states") && country["states"].is_array()) {
|
||||
const auto &states = country["states"];
|
||||
|
||||
// Spawn threads to process states in parallel
|
||||
// This creates nested parallelism: country threads spawn state
|
||||
// threads
|
||||
std::vector<std::thread> stateThreads;
|
||||
|
||||
for (size_t s = 0; s < states.size(); ++s) {
|
||||
stateThreads.push_back(std::thread([&, s, countryId]() {
|
||||
const auto &state = states[s];
|
||||
int stateId = state["id"];
|
||||
std::string stateName = state["name"];
|
||||
std::string stateIso2 = state.value("iso2", "");
|
||||
|
||||
// Insert state record
|
||||
db.InsertState(stateId, countryId, stateName, stateIso2);
|
||||
|
||||
// Process cities for this state
|
||||
if (state.contains("cities") && state["cities"].is_array()) {
|
||||
// Cities within a state are processed sequentially
|
||||
// (within the state thread - reduces context switching)
|
||||
for (const auto &city : state["cities"]) {
|
||||
try {
|
||||
int cityId = city["id"].get<int>();
|
||||
std::string cityName = city["name"];
|
||||
|
||||
// Parse latitude and longitude as strings first (they're
|
||||
// stored as strings in JSON), then convert to double
|
||||
double lat = 0.0;
|
||||
double lng = 0.0;
|
||||
if (city.contains("latitude")) {
|
||||
lat = std::stod(city["latitude"].get<std::string>());
|
||||
}
|
||||
if (city.contains("longitude")) {
|
||||
lng = std::stod(city["longitude"].get<std::string>());
|
||||
}
|
||||
|
||||
// Insert city record to database
|
||||
// Database has mutex protection for thread-safe access
|
||||
db.InsertCity(cityId, stateId, countryId, cityName, lat,
|
||||
lng);
|
||||
|
||||
// Update shared statistics counter (protected by mutex)
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(statsMutex);
|
||||
processedCities++;
|
||||
}
|
||||
} catch (const std::exception &e) {
|
||||
// Silently skip malformed city entries
|
||||
// Example: missing required fields, invalid coordinates
|
||||
}
|
||||
}
|
||||
}
|
||||
}));
|
||||
}
|
||||
|
||||
// Wait for all state threads to complete
|
||||
// Important: don't proceed to next country until states are done
|
||||
for (auto &t : stateThreads) {
|
||||
if (t.joinable())
|
||||
t.join();
|
||||
}
|
||||
}
|
||||
}
|
||||
}));
|
||||
}
|
||||
|
||||
// Wait for all country threads to complete
|
||||
// This blocks until all nested state/city insertions are done
|
||||
for (auto &t : countryThreads) {
|
||||
if (t.joinable())
|
||||
t.join();
|
||||
}
|
||||
|
||||
std::cout << "✓ Loaded " << processedCities << " cities into database\n\n";
|
||||
}
|
||||
@@ -1,536 +1,154 @@
|
||||
/// @file main.cpp
|
||||
/// @brief Brewery and beer data pipeline
|
||||
///
|
||||
/// This program fetches brewery data from the Open Brewery DB API
|
||||
/// (https://api.openbrewerydb.org/), limited to the first 10 breweries.
|
||||
/// It then generates beer posts using hardcoded beer styles and AI-powered
|
||||
/// descriptions via llama integration.
|
||||
///
|
||||
/// Usage:
|
||||
/// ./pipeline [--model <path-to-gguf>]
|
||||
///
|
||||
/// Output:
|
||||
/// - Creates an 'output/' directory with JSON files:
|
||||
/// - breweries.json: fetched brewery data
|
||||
/// - beer-styles.json: 50 hardcoded beer styles
|
||||
/// - beer-posts.json: 10 generated beer posts
|
||||
/// - Prints progress to stdout and errors to stderr
|
||||
/// - Returns 0 on success, 1 on error
|
||||
/**
|
||||
* @file main.cpp
|
||||
* @brief Entry point for the brewery data pipeline
|
||||
*
|
||||
* Pipeline Overview:
|
||||
* This is the main data processing pipeline that:
|
||||
* 1. Initializes an in-memory SQLite database
|
||||
* 2. Loads world city data from a JSON file (50k+ cities)
|
||||
* 3. Initializes the brewery generation system (currently mocked)
|
||||
* 4. Demonstrates brewery generation for sample cities
|
||||
*
|
||||
* Architecture:
|
||||
* ┌─────────────┐
|
||||
* │ JSON File │ (world_city_data.json - 50k+ cities)
|
||||
* └──────┬──────┘
|
||||
* │
|
||||
* ▼
|
||||
* ┌─────────────────────┐
|
||||
* │ JsonLoader::Load │ Parse and validate JSON
|
||||
* └──────┬──────────────┘
|
||||
* │
|
||||
* ▼
|
||||
* ┌─────────────────────┐
|
||||
* │ SQLite Database │ Store cities in-memory
|
||||
* └──────┬──────────────┘
|
||||
* │
|
||||
* ▼
|
||||
* ┌─────────────────────┐
|
||||
* │ BreweryGenerator │ Mock generation (hash-based)
|
||||
* │ .GenerateBrewery() │ Future: LLM-based generation
|
||||
* └─────────────────────┘
|
||||
*
|
||||
* Command Line Arguments:
|
||||
* - argv[1]: Path to GGUF model file (default: ./model.gguf)
|
||||
* - argv[2]: Path to cache directory for JSON downloads (default: /tmp)
|
||||
* - argv[3]: Git commit hash for reproducible data version (default: c5eb7772)
|
||||
*
|
||||
* The pipeline automatically downloads the geographic data from GitHub on first
|
||||
* run and caches it locally to avoid repeated network calls.
|
||||
*
|
||||
* Example Usage - Auto-download (stable 2026-03-28 build):
|
||||
* @code
|
||||
* ./brewery-pipeline ./llama-7b.gguf
|
||||
* @endcode
|
||||
*
|
||||
* Example Usage - Custom commit:
|
||||
* @code
|
||||
* ./brewery-pipeline ./llama-7b.gguf /tmp main
|
||||
* @endcode
|
||||
*
|
||||
* Exit Codes:
|
||||
* - 0: Pipeline completed successfully
|
||||
* - 1: Pipeline failed (exception caught)
|
||||
*/
|
||||
|
||||
#include <algorithm>
|
||||
#include "data_downloader.h"
|
||||
#include "database.h"
|
||||
#include "generator.h"
|
||||
#include "json_loader.h"
|
||||
#include <curl/curl.h>
|
||||
#include <filesystem>
|
||||
#include <fstream>
|
||||
#include <iostream>
|
||||
#include <mutex>
|
||||
#include <nlohmann/json.hpp>
|
||||
#include <queue>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
// Llama.cpp integration
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
#include "llama.h"
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
namespace fs = std::filesystem;
|
||||
|
||||
/// @brief RAII guard for libcurl global initialization and cleanup
|
||||
///
|
||||
/// Ensures that curl_global_init() is called on construction and
|
||||
/// curl_global_cleanup() is called on destruction. This is required before any
|
||||
/// CURL operations and should be called exactly once per process.
|
||||
///
|
||||
/// Non-copyable and non-assignable to prevent multiple initialization attempts.
|
||||
struct GlobalCurl {
|
||||
GlobalCurl() {
|
||||
if (curl_global_init(CURL_GLOBAL_DEFAULT) != 0)
|
||||
throw std::runtime_error("Failed to initialize libcurl");
|
||||
}
|
||||
~GlobalCurl() { curl_global_cleanup(); }
|
||||
|
||||
GlobalCurl(const GlobalCurl &) = delete;
|
||||
GlobalCurl &operator=(const GlobalCurl &) = delete;
|
||||
};
|
||||
|
||||
/// @brief CURL write callback that appends response bytes to a string.
///
/// libcurl invokes this repeatedly as the HTTP response arrives; each call
/// appends one chunk of the body to the accumulating buffer.
///
/// @param ptr Pointer to the data chunk received
/// @param size Size of each element (always 1 for this use case)
/// @param nmemb Number of elements in the data chunk
/// @param out String buffer where the response body accumulates
/// @return Bytes consumed (size * nmemb); returning less signals an error
static size_t writeCallback(char *ptr, size_t size, size_t nmemb,
                            std::string *out) {
  const size_t chunkBytes = size * nmemb;
  out->append(ptr, chunkBytes);
  return chunkBytes;
}
|
||||
|
||||
/// @brief Hardcoded collection of 50 beer styles
///
/// Each entry pairs a style name with a one-line description. The list spans
/// light lagers through heavy stouts, including German, Belgian, English, and
/// American traditions. Indexed 0-based here; consumers emit 1-based
/// BeerStyleID values (index + 1).
const std::vector<std::pair<std::string, std::string>> BEER_STYLES = {
    {"Pale Ale", "A hoppy ale with a golden color and balanced bitter finish"},
    {"IPA", "India Pale Ale with intense hop bitterness and citrus notes"},
    {"Stout", "Dark, creamy beer with roasted malt and coffee notes"},
    {"Porter", "Dark ale with chocolate and caramel flavors"},
    {"Lager", "Clean, crisp beer with a smooth finish"},
    {"Pilsner", "Golden lager with a crisp, well-balanced hop bitterness"},
    {"Hefeweizen", "Bavarian wheat beer with banana and clove notes"},
    {"Wheat Beer", "Light, refreshing beer made with wheat malt"},
    {"Amber Ale", "Sweet, malty ale with caramel flavors"},
    {"Brown Ale", "Nutty, chocolatey ale with moderate alcohol"},
    {"Saison", "Belgian style ale, spicy and fruity with high carbonation"},
    {"Tripel", "Belgian strong golden ale with fruity complexity"},
    {"Lambic", "Spontaneously fermented sour ale with fruit notes"},
    {"Sour Ale", "Tangy beer with acidic and funky characteristics"},
    {"Imperial Stout", "Strong stout with intense roasted malt flavors"},
    {"Barley Wine", "Strong ale with wine-like body and alcohol content"},
    {"Cream Ale", "Smooth, light ale with corn sweetness"},
    {"Blonde Ale", "Light, easy-drinking ale with slight sweetness"},
    {"Pale Lager", "Light, refreshing lager with subtle hop character"},
    {"Dunkelweizen", "Dark German wheat beer with bread and banana flavors"},
    {"Russian Imperial Stout", "Very strong stout with complex flavor profile"},
    {"Berliner Weisse", "Light, sour German wheat beer"},
    {"Gose", "Salt and coriander spiced sour ale from Germany"},
    {"Witbier", "Belgian white beer with citrus and spice notes"},
    {"Milk Stout", "Creamy stout with lactose sweetness"},
    {"Oatmeal Stout", "Smooth stout with oat malt additions"},
    {"Rauchbier", "Smoked German lager with bacon aroma"},
    {"Kellerbier", "Unpasteurized, unfiltered Bavarian lager"},
    {"Schwarzbier", "Black lager with sweet malty character"},
    {"Märzen", "Bavarian amber lager, traditionally brewed in March"},
    {"Bock", "Strong German lager with balanced sweetness"},
    {"Helles Bock", "Light, strong German lager"},
    {"Maibock", "Golden strong lager brewed in spring"},
    {"Eisbock", "Concentrated German lager with high alcohol"},
    {"Doppelbock", "Dark, strong German lager"},
    {"Scottish Ale", "Full-bodied ale with caramel and toffee notes"},
    {"English Bitter", "Hoppy amber ale with earthy character"},
    {"English Pale Ale", "Balanced ale with biscuit and hop notes"},
    {"ESB", "Extra Special Bitter with rich malt character"},
    {"Barley Wine Style Ale", "Strong beer with wine-like complexity"},
    {"Old Ale", "Dark, strong ale with vinous character"},
    {"English Brown Ale", "Sweet, malty brown ale"},
    {"Nut Brown Ale", "Brown ale with nut-like flavors"},
    {"English Porter", "Dark, rich porter style"},
    {"English Stout", "Traditional stout with roasted character"},
    {"Irish Red Ale", "Malty red ale with caramel notes"},
    {"Rye IPA", "IPA brewed with spicy rye grain"},
    {"Rye Ale", "Ale with characteristic rye spiciness"},
    {"Smoked Beer", "Beer with pronounced smoked malt character"},
    {"Fruit Beer", "Beer brewed with added fruits for flavor"},
};
|
||||
|
||||
/// @brief Generate AI-powered beer post description using llama
///
/// Builds a short prompt from the beer name, style, and brewery, then runs
/// greedy decoding for up to 80 new tokens. Every failure path (null
/// context/model/vocab, tokenization error, decode error, empty output)
/// returns a deterministic template fallback instead of propagating an error.
///
/// @param beer_name Name of the beer
/// @param beer_style Style of the beer
/// @param brewery_name Name of the brewery
/// @param ctx Llama context for generation (null → fallback)
/// @param model Loaded llama model (null → fallback)
/// @return Generated beer description, or the template fallback on failure
std::string generateBeerDescription(const std::string &beer_name,
                                    const std::string &beer_style,
                                    const std::string &brewery_name,
                                    llama_context *ctx, llama_model *model) {
  // Deterministic template used whenever generation is unavailable or fails.
  const std::string fallback =
      "This " + beer_style + " from " + brewery_name +
      " offers a unique take on the classic style. " + beer_name +
      " presents complex flavors with a smooth finish.";

  if (!ctx) {
    return fallback;
  }

  if (!model) {
    return fallback;
  }

  const llama_vocab *vocab = llama_model_get_vocab(model);
  if (!vocab) {
    return fallback;
  }

  // Create prompt for llama
  std::string prompt =
      "Generate a short, engaging beer description (2-3 sentences) for a " +
      beer_style + " called '" + beer_name + "' from " + brewery_name +
      ". Focus on flavor profile, aroma, and drinking experience.:\n";

  // First tokenize pass with a null buffer: llama_tokenize returns the
  // negated required token count, so negate it back to size the vector.
  const int32_t n_prompt = -llama_tokenize(vocab, prompt.c_str(),
                                           static_cast<int32_t>(prompt.size()),
                                           nullptr, 0, true, true);
  if (n_prompt <= 0) {
    return fallback;
  }

  // Second pass: tokenize for real into the correctly-sized buffer.
  std::vector<llama_token> prompt_tokens(static_cast<size_t>(n_prompt));
  if (llama_tokenize(vocab, prompt.c_str(), static_cast<int32_t>(prompt.size()),
                     prompt_tokens.data(), n_prompt, true, true) < 0) {
    return fallback;
  }

  // Feed the whole prompt as one batch to prime the context.
  // NOTE(review): assumes the prompt fits within the context's n_batch —
  // confirm against the n_batch set at context creation.
  llama_batch batch = llama_batch_get_one(
      prompt_tokens.data(), static_cast<int32_t>(prompt_tokens.size()));
  if (llama_decode(ctx, batch) != 0) {
    return fallback;
  }

  // Greedy (argmax) sampler chain: output is deterministic for a given
  // model + prompt. Freed via llama_sampler_free after the loop.
  auto sampler_params = llama_sampler_chain_default_params();
  llama_sampler *sampler = llama_sampler_chain_init(sampler_params);
  if (!sampler) {
    return fallback;
  }
  llama_sampler_chain_add(sampler, llama_sampler_init_greedy());

  // Generate text one token at a time, up to max_new_tokens.
  const int max_new_tokens = 80;
  std::string generated_text;

  for (int i = 0; i < max_new_tokens; ++i) {
    llama_token next_token = llama_sampler_sample(sampler, ctx, -1);
    // Stop cleanly at an end-of-generation token.
    if (llama_vocab_is_eog(vocab, next_token)) {
      break;
    }

    // Convert the sampled token id back into its text piece.
    char piece[256];
    const int32_t piece_len =
        llama_token_to_piece(vocab, next_token, piece, sizeof(piece), 0, true);
    if (piece_len < 0) {
      break;
    }
    generated_text.append(piece, static_cast<size_t>(piece_len));

    // Feed the sampled token back so the model can predict the next one.
    batch = llama_batch_get_one(&next_token, 1);
    if (llama_decode(ctx, batch) != 0) {
      break;
    }

    // Keep descriptions concise and sentence-like: stop after ~220 chars, or
    // once at least one full sentence exists past a 40-char minimum.
    if (generated_text.size() >= 220 ||
        (generated_text.size() > 40 &&
         generated_text.find('.') != std::string::npos)) {
      break;
    }
  }

  llama_sampler_free(sampler);

  // Fall back to the template if nothing usable was generated.
  if (generated_text.empty()) {
    generated_text = fallback;
  }

  return generated_text;
}
|
||||
|
||||
/// @brief Main entry point for the brewery and beer data pipeline
|
||||
///
|
||||
/// Coordinates fetching of brewery data (limited to 10) and generation of
|
||||
/// beer posts with AI-powered descriptions using llama.cpp integration.
|
||||
/// Initializes llama model for description generation.
|
||||
int main(int argc, char **argv) {
|
||||
int total_count = 0;
|
||||
|
||||
std::string model_path = "models/llama-2-7b-chat.gguf";
|
||||
for (int i = 1; i < argc; ++i) {
|
||||
const std::string arg = argv[i];
|
||||
|
||||
if (arg == "--model" || arg == "-m") {
|
||||
if (i + 1 >= argc) {
|
||||
std::cerr << "Error: missing value for " << arg << std::endl;
|
||||
return 1;
|
||||
}
|
||||
model_path = argv[++i];
|
||||
} else if (arg == "--help" || arg == "-h") {
|
||||
std::cout << "Usage: " << argv[0] << " [--model <path-to-gguf>]"
|
||||
<< std::endl;
|
||||
return 0;
|
||||
} else {
|
||||
std::cerr << "Error: unknown argument " << arg << std::endl;
|
||||
std::cerr << "Usage: " << argv[0] << " [--model <path-to-gguf>]"
|
||||
<< std::endl;
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Create output directory for storing JSON files
|
||||
fs::create_directories("output");
|
||||
|
||||
// Ensure libcurl is initialized and will be cleaned up on scope exit
|
||||
GlobalCurl curl_guard;
|
||||
|
||||
// Initialize llama.cpp model
|
||||
std::cout << "Initializing llama model..." << std::endl;
|
||||
llama_context *llama_ctx = nullptr;
|
||||
llama_model *llama_model_ptr = nullptr;
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
try {
|
||||
// Check if model exists
|
||||
if (!fs::exists(model_path)) {
|
||||
std::cerr << "Warning: Model file not found at " << model_path
|
||||
<< ". Using template descriptions." << std::endl;
|
||||
} else {
|
||||
// Load model with default parameters
|
||||
llama_model_params model_params = llama_model_default_params();
|
||||
llama_model_ptr =
|
||||
llama_model_load_from_file(model_path.c_str(), model_params);
|
||||
// Initialize libcurl globally (thread-safe mode)
|
||||
curl_global_init(CURL_GLOBAL_DEFAULT);
|
||||
|
||||
if (!llama_model_ptr) {
|
||||
std::cerr << "Warning: Failed to load llama model. Using template "
|
||||
"descriptions."
|
||||
<< std::endl;
|
||||
} else {
|
||||
// Create context
|
||||
llama_context_params ctx_params = llama_context_default_params();
|
||||
ctx_params.n_ctx = 512; // Context size
|
||||
ctx_params.n_batch = 256; // Prompt batch size
|
||||
ctx_params.n_threads = 4; // Number of threads
|
||||
// Parse command-line arguments
|
||||
std::string modelPath = argc > 1 ? argv[1] : "./model.gguf";
|
||||
std::string cacheDir = argc > 2 ? argv[2] : "/tmp";
|
||||
std::string commit =
|
||||
argc > 3 ? argv[3] : "c5eb7772"; // Default: stable 2026-03-28
|
||||
|
||||
llama_ctx = llama_init_from_model(llama_model_ptr, ctx_params);
|
||||
// Construct cache path for downloaded JSON
|
||||
std::string jsonPath = cacheDir + "/countries+states+cities.json";
|
||||
|
||||
if (!llama_ctx) {
|
||||
std::cerr
|
||||
<< "Warning: Failed to create llama context. Using template "
|
||||
"descriptions."
|
||||
<< std::endl;
|
||||
llama_model_free(llama_model_ptr);
|
||||
llama_model_ptr = nullptr;
|
||||
} else {
|
||||
std::cout << "Llama model loaded successfully!" << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (const std::exception &ex) {
|
||||
std::cerr << "Warning: Llama initialization error: " << ex.what()
|
||||
<< ". Using template descriptions." << std::endl;
|
||||
}
|
||||
// Step 0: Download geographic data from GitHub (cached locally)
|
||||
// On first run, downloads 45MB JSON. On subsequent runs, uses cached file.
|
||||
// Commit hash allows pinning to specific data versions for reproducibility.
|
||||
std::cout << "\n[Pipeline] Downloading geographic data from GitHub...\n";
|
||||
DataDownloader downloader;
|
||||
downloader.DownloadCountriesDatabase(jsonPath, commit);
|
||||
|
||||
/// Result of fetching a single page from the API
|
||||
struct PageResult {
|
||||
int page; ///< Page number requested
|
||||
int count; ///< Number of breweries in this page
|
||||
std::string error; ///< Error message if fetch failed (empty = success)
|
||||
};
|
||||
SqliteDatabase db;
|
||||
|
||||
std::vector<PageResult> results; ///< Thread-safe storage for page results
|
||||
std::vector<std::thread> threads; ///< Active worker threads
|
||||
std::mutex results_mutex; ///< Guards access to results vector
|
||||
const int MAX_THREADS = 5; ///< Maximum concurrent API requests
|
||||
const int MAX_BREWERIES = 10; ///< Limit to 10 breweries
|
||||
// Step 1: Initialize empty in-memory database
|
||||
std::cout << "Initializing in-memory SQLite database...\n";
|
||||
db.Initialize();
|
||||
|
||||
/// Fetch only the first page of breweries to get our 10 breweries
|
||||
std::cout << "Fetching breweries from Open Brewery DB API..." << std::endl;
|
||||
// Step 2: Load world city data from JSON file
|
||||
// This populates the database with ~50k city records
|
||||
// Each record includes: city name, country, latitude, longitude, population
|
||||
JsonLoader::LoadWorldCities(jsonPath, db);
|
||||
|
||||
for (int page = 1; page <= 1; ++page) {
|
||||
// Only need 1 page
|
||||
if (threads.size() >= MAX_THREADS) {
|
||||
threads[0].join();
|
||||
threads.erase(threads.begin());
|
||||
// Step 3: Initialize brewery generator
|
||||
// Current: Mock implementation using deterministic hashing
|
||||
// Future: LLM-based generation with llama.cpp
|
||||
std::cout << "Initializing brewery generator...\n";
|
||||
LlamaBreweryGenerator generator;
|
||||
generator.LoadModel(modelPath);
|
||||
|
||||
// Step 4: Query geographic data from database
|
||||
std::cout << "\n=== GEOGRAPHIC DATA OVERVIEW ===\n";
|
||||
|
||||
auto countries = db.QueryCountries(50);
|
||||
auto states = db.QueryStates(50);
|
||||
auto cities = db.QueryCities();
|
||||
|
||||
std::cout << "\nTotal records loaded:";
|
||||
std::cout << "\n Countries: " << db.QueryCountries(0).size();
|
||||
std::cout << "\n States: " << db.QueryStates(0).size();
|
||||
std::cout << "\n Cities: " << cities.size() << "\n";
|
||||
|
||||
// Display 50 countries
|
||||
std::cout << "\n--- 50 COUNTRIES ---\n";
|
||||
for (size_t i = 0; i < countries.size(); i++) {
|
||||
std::cout << (i + 1) << ". " << countries[i].iso2 << " ("
|
||||
<< countries[i].iso3 << ") " << countries[i].name << "\n";
|
||||
}
|
||||
|
||||
/// Launch a new worker thread to fetch this page
|
||||
threads.emplace_back([page, &results, &results_mutex, MAX_BREWERIES]() {
|
||||
PageResult result{page, 0, ""};
|
||||
|
||||
/// Initialize CURL handle for this thread
|
||||
CURL *curl = curl_easy_init();
|
||||
if (!curl) {
|
||||
result.error = "Failed to initialize CURL";
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(results_mutex);
|
||||
results.push_back(result);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
/// Fetch the page from the Open Brewery DB API
|
||||
/// Parameters: per_page=10 (limited), page=1
|
||||
std::string response;
|
||||
std::string api_url =
|
||||
"https://api.openbrewerydb.org/v1/breweries?per_page=" +
|
||||
std::to_string(MAX_BREWERIES) + "&page=" + std::to_string(page);
|
||||
|
||||
/// Configure CURL: set URL, write callback, and output buffer
|
||||
curl_easy_setopt(curl, CURLOPT_URL, api_url.c_str());
|
||||
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, writeCallback);
|
||||
curl_easy_setopt(curl, CURLOPT_WRITEDATA, &response);
|
||||
|
||||
/// Execute the HTTP GET request
|
||||
CURLcode res = curl_easy_perform(curl);
|
||||
if (res != CURLE_OK) {
|
||||
result.error = curl_easy_strerror(res);
|
||||
curl_easy_cleanup(curl);
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(results_mutex);
|
||||
results.push_back(result);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
/// Parse JSON response and save to file if not empty
|
||||
try {
|
||||
nlohmann::json breweries = nlohmann::json::parse(response);
|
||||
result.count = static_cast<int>(breweries.size());
|
||||
|
||||
/// Save breweries to output file
|
||||
if (result.count > 0) {
|
||||
std::string out_path = "output/breweries.json";
|
||||
std::ofstream out_file(out_path);
|
||||
out_file << breweries.dump(2); // Pretty-print with 2-space indent
|
||||
}
|
||||
} catch (const std::exception &ex) {
|
||||
result.error = ex.what();
|
||||
}
|
||||
|
||||
/// Cleanup CURL handle and store result thread-safely
|
||||
curl_easy_cleanup(curl);
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(results_mutex);
|
||||
results.push_back(result);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
/// Wait for all remaining worker threads to complete
|
||||
for (auto &thread : threads) {
|
||||
thread.join();
|
||||
}
|
||||
|
||||
/// Process and display results: check for errors
|
||||
nlohmann::json breweries_data;
|
||||
for (auto &r : results) {
|
||||
std::cout << "Fetching page " << r.page << "..." << std::endl;
|
||||
|
||||
/// Exit on first error
|
||||
if (!r.error.empty()) {
|
||||
std::cerr << "Error on page " << r.page << ": " << r.error << std::endl;
|
||||
curl_global_cleanup();
|
||||
return 1;
|
||||
// Display 50 states
|
||||
std::cout << "\n--- 50 STATES ---\n";
|
||||
for (size_t i = 0; i < states.size(); i++) {
|
||||
std::cout << (i + 1) << ". " << states[i].iso2 << ": " << states[i].name
|
||||
<< "\n";
|
||||
}
|
||||
|
||||
/// Accumulate brewery count and log progress
|
||||
total_count += r.count;
|
||||
std::cout << " Got " << r.count << " breweries (total: " << total_count
|
||||
<< ")" << std::endl;
|
||||
}
|
||||
// Display 50 cities
|
||||
std::cout << "\n--- 50 CITIES ---\n";
|
||||
for (size_t i = 0; i < std::min(size_t(50), cities.size()); i++) {
|
||||
std::cout << (i + 1) << ". " << cities[i].second << "\n";
|
||||
}
|
||||
|
||||
/// Load breweries from file for beer post generation
|
||||
try {
|
||||
std::ifstream breweries_file("output/breweries.json");
|
||||
breweries_file >> breweries_data;
|
||||
} catch (const std::exception &ex) {
|
||||
std::cerr << "Error loading breweries: " << ex.what() << std::endl;
|
||||
// Step 5: Demonstrate brewery generation on sample cities
|
||||
std::cout << "\n=== SAMPLE BREWERY GENERATION ===\n\n";
|
||||
for (size_t i = 0; i < std::min(size_t(5), cities.size()); i++) {
|
||||
const auto &[cityId, cityName] = cities[i];
|
||||
auto brewery = generator.GenerateBrewery(cityName, i);
|
||||
std::cout << " " << cityName << ": " << brewery.name << "\n";
|
||||
std::cout << " → " << brewery.description << "\n";
|
||||
}
|
||||
|
||||
std::cout << "\n✓ Pipeline completed successfully\n";
|
||||
|
||||
// Cleanup
|
||||
curl_global_cleanup();
|
||||
return 0;
|
||||
|
||||
} catch (const std::exception &e) {
|
||||
std::cerr << "✗ Pipeline failed: " << e.what() << "\n";
|
||||
curl_global_cleanup();
|
||||
return 1;
|
||||
}
|
||||
|
||||
/// Generate and save beer styles output
|
||||
std::cout << "\nGenerating beer styles..." << std::endl;
|
||||
nlohmann::json beer_styles_json = nlohmann::json::array();
|
||||
for (size_t i = 0; i < BEER_STYLES.size(); ++i) {
|
||||
beer_styles_json.push_back({
|
||||
{"BeerStyleID", i + 1},
|
||||
{"StyleName", BEER_STYLES[i].first},
|
||||
{"Description", BEER_STYLES[i].second},
|
||||
});
|
||||
}
|
||||
std::ofstream styles_file("output/beer-styles.json");
|
||||
styles_file << beer_styles_json.dump(2);
|
||||
std::cout << "Generated " << BEER_STYLES.size() << " beer styles"
|
||||
<< std::endl;
|
||||
|
||||
/// Generate 10 beer posts using breweries and beer styles
|
||||
std::cout << "\nGenerating beer posts..." << std::endl;
|
||||
nlohmann::json beer_posts_json = nlohmann::json::array();
|
||||
|
||||
int beer_posts_generated = 0;
|
||||
for (int i = 0; i < 10 && i < static_cast<int>(breweries_data.size()); ++i) {
|
||||
const auto &brewery = breweries_data[i];
|
||||
const auto &beer_style = BEER_STYLES[i % BEER_STYLES.size()];
|
||||
|
||||
std::string brewery_name = brewery.contains("name")
|
||||
? brewery["name"].get<std::string>()
|
||||
: "Unknown";
|
||||
|
||||
// Generate beer name from brewery
|
||||
std::string beer_name = brewery_name + " " + beer_style.first;
|
||||
|
||||
// Generate description using llama integration (with fallback)
|
||||
std::string description = generateBeerDescription(
|
||||
beer_name, beer_style.first, brewery_name, llama_ctx, llama_model_ptr);
|
||||
|
||||
// Generate random ABV (3.5% to 9.5%)
|
||||
double abv = 3.5 + (i % 6) * 1.0;
|
||||
|
||||
// Generate random IBU (15 to 85)
|
||||
int ibu = 15 + (i % 7) * 10;
|
||||
|
||||
// Extract additional brewery data if available
|
||||
std::string brewery_city = brewery.contains("city")
|
||||
? brewery["city"].get<std::string>()
|
||||
: "Unknown";
|
||||
std::string brewery_state = brewery.contains("state")
|
||||
? brewery["state"].get<std::string>()
|
||||
: "Unknown";
|
||||
|
||||
beer_posts_json.push_back({
|
||||
{"BeerPostID", i + 1},
|
||||
{"Name", beer_name},
|
||||
{"Description", description},
|
||||
{"ABV", abv},
|
||||
{"IBU", ibu},
|
||||
{"BeerStyleID", (i % BEER_STYLES.size()) + 1},
|
||||
{"StyleName", beer_style.first},
|
||||
{"BreweryName", brewery_name},
|
||||
{"BreweryCity", brewery_city},
|
||||
{"BreweryState", brewery_state},
|
||||
{"CreatedAt", "2026-03-24T00:00:00Z"},
|
||||
});
|
||||
|
||||
beer_posts_generated++;
|
||||
std::cout << " Generated: " << beer_name << " (" << abv << "% ABV, " << ibu
|
||||
<< " IBU)" << std::endl;
|
||||
}
|
||||
|
||||
std::ofstream posts_file("output/beer-posts.json");
|
||||
posts_file << beer_posts_json.dump(2);
|
||||
std::cout << "Generated " << beer_posts_generated << " beer posts"
|
||||
<< std::endl;
|
||||
|
||||
/// Cleanup llama resources
|
||||
if (llama_ctx) {
|
||||
std::cout << "\nCleaning up llama context..." << std::endl;
|
||||
llama_free(llama_ctx);
|
||||
llama_ctx = nullptr;
|
||||
}
|
||||
if (llama_model_ptr) {
|
||||
llama_model_free(llama_model_ptr);
|
||||
llama_model_ptr = nullptr;
|
||||
}
|
||||
|
||||
/// Summary of generated data
|
||||
std::cout << "\n=== Pipeline Complete ===" << std::endl;
|
||||
std::cout << "Breweries fetched: " << total_count << std::endl;
|
||||
std::cout << "Beer styles created: " << BEER_STYLES.size() << std::endl;
|
||||
std::cout << "Beer posts generated: " << beer_posts_generated << std::endl;
|
||||
std::cout << "Output files created:" << std::endl;
|
||||
std::cout << " - output/breweries.json" << std::endl;
|
||||
std::cout << " - output/beer-styles.json" << std::endl;
|
||||
std::cout << " - output/beer-posts.json" << std::endl;
|
||||
|
||||
/// Cleanup is handled by GlobalCurl RAII guard, but explicit cleanup is safe
|
||||
curl_global_cleanup();
|
||||
return 0;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user