mirror of
https://github.com/aaronpo97/the-biergarten-app.git
synced 2026-04-05 10:09:03 +00:00
Add pipeline guide and enhance CMake configuration for llama integration
This commit is contained in:
@@ -12,6 +12,7 @@ switching, shared UI components, Storybook coverage, and integration with the AP
|
|||||||
- [Testing](docs/testing.md) - Backend and frontend test commands
|
- [Testing](docs/testing.md) - Backend and frontend test commands
|
||||||
- [Environment Variables](docs/environment-variables.md) - Active configuration reference
|
- [Environment Variables](docs/environment-variables.md) - Active configuration reference
|
||||||
- [Token Validation](docs/token-validation.md) - JWT validation architecture
|
- [Token Validation](docs/token-validation.md) - JWT validation architecture
|
||||||
|
- [Pipeline Guide](pipeline/README.md) - Build, model install, and run steps for the C++ data pipeline
|
||||||
- [Legacy Website Archive](docs/archive/legacy-website-v1.md) - Archived notes for the old Next.js frontend
|
- [Legacy Website Archive](docs/archive/legacy-website-v1.md) - Archived notes for the old Next.js frontend
|
||||||
|
|
||||||
## Diagrams
|
## Diagrams
|
||||||
|
|||||||
@@ -19,6 +19,23 @@ FetchContent_Declare(
|
|||||||
)
|
)
|
||||||
FetchContent_MakeAvailable(nlohmann_json)
|
FetchContent_MakeAvailable(nlohmann_json)
|
||||||
|
|
||||||
|
FetchContent_Declare(
|
||||||
|
llama
|
||||||
|
GIT_REPOSITORY https://github.com/ggml-org/llama.cpp.git
|
||||||
|
# Stable release tag: b8485 (commit 31a5cf4c3f5d3af7f16fc4abc9baa75f8d568421)
|
||||||
|
GIT_TAG 31a5cf4c3f5d3af7f16fc4abc9baa75f8d568421
|
||||||
|
)
|
||||||
|
FetchContent_MakeAvailable(llama)
|
||||||
|
|
||||||
|
# Workaround for upstream llama.cpp release stream (b8485/b8496) missing
|
||||||
|
# <algorithm> include in llama-quant.cpp where std::sort is used.
|
||||||
|
# Remove once fixed upstream.
|
||||||
|
if(TARGET llama)
|
||||||
|
target_compile_options(llama PRIVATE
|
||||||
|
$<$<COMPILE_LANGUAGE:CXX>:-include algorithm>
|
||||||
|
)
|
||||||
|
endif()
|
||||||
|
|
||||||
file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS
|
file(GLOB_RECURSE SOURCES CONFIGURE_DEPENDS
|
||||||
src/*.cpp
|
src/*.cpp
|
||||||
src/*.h
|
src/*.h
|
||||||
@@ -36,6 +53,7 @@ target_link_libraries(biergarten-pipeline
|
|||||||
CURL::libcurl
|
CURL::libcurl
|
||||||
nlohmann_json::nlohmann_json
|
nlohmann_json::nlohmann_json
|
||||||
Boost::unit_test_framework
|
Boost::unit_test_framework
|
||||||
|
llama
|
||||||
)
|
)
|
||||||
|
|
||||||
target_compile_options(biergarten-pipeline PRIVATE
|
target_compile_options(biergarten-pipeline PRIVATE
|
||||||
@@ -95,6 +113,7 @@ if(BUILD_TESTING)
|
|||||||
Boost::unit_test_framework
|
Boost::unit_test_framework
|
||||||
CURL::libcurl
|
CURL::libcurl
|
||||||
nlohmann_json::nlohmann_json
|
nlohmann_json::nlohmann_json
|
||||||
|
llama
|
||||||
)
|
)
|
||||||
|
|
||||||
add_test(
|
add_test(
|
||||||
|
|||||||
128
pipeline/README.md
Normal file
128
pipeline/README.md
Normal file
@@ -0,0 +1,128 @@
|
|||||||
|
# Pipeline Guide
|
||||||
|
|
||||||
|
This guide documents the end-to-end pipeline workflow for:
|
||||||
|
|
||||||
|
- Building the C++ pipeline executable
|
||||||
|
- Installing a lightweight GGUF model for llama.cpp
|
||||||
|
- Running the pipeline with either default or explicit model path
|
||||||
|
- Re-running from a clean build directory
|
||||||
|
|
||||||
|
## Prerequisites
|
||||||
|
|
||||||
|
- CMake 3.20+
|
||||||
|
- A C++ compiler (Apple Clang on macOS works)
|
||||||
|
- Internet access to download model files
|
||||||
|
- Hugging Face CLI (`hf`) from `huggingface_hub`
|
||||||
|
|
||||||
|
## Build
|
||||||
|
|
||||||
|
From repository root:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cmake -S pipeline -B pipeline/dist
|
||||||
|
cmake --build pipeline/dist -j4
|
||||||
|
```
|
||||||
|
|
||||||
|
Expected executable:
|
||||||
|
|
||||||
|
- `pipeline/dist/biergarten-pipeline`
|
||||||
|
|
||||||
|
## Install Hugging Face CLI
|
||||||
|
|
||||||
|
Recommended on macOS:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
brew install pipx
|
||||||
|
pipx ensurepath
|
||||||
|
pipx install huggingface_hub
|
||||||
|
```
|
||||||
|
|
||||||
|
If your shell cannot find `hf`, use the full path:
|
||||||
|
|
||||||
|
- `~/.local/bin/hf`
|
||||||
|
|
||||||
|
## Install a Lightweight Model (POC)
|
||||||
|
|
||||||
|
The recommended proof-of-concept model is:
|
||||||
|
|
||||||
|
- `Qwen/Qwen2.5-0.5B-Instruct-GGUF`
|
||||||
|
- File: `qwen2.5-0.5b-instruct-q4_k_m.gguf`
|
||||||
|
|
||||||
|
From the repository root (the commands below change into `pipeline/dist`):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd pipeline/dist
|
||||||
|
mkdir -p models
|
||||||
|
~/.local/bin/hf download Qwen/Qwen2.5-0.5B-Instruct-GGUF qwen2.5-0.5b-instruct-q4_k_m.gguf --local-dir models
|
||||||
|
```
|
||||||
|
|
||||||
|
## Run
|
||||||
|
|
||||||
|
### Option A: Explicit model path (recommended)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd pipeline/dist
|
||||||
|
./biergarten-pipeline --model models/qwen2.5-0.5b-instruct-q4_k_m.gguf
|
||||||
|
```
|
||||||
|
|
||||||
|
### Option B: Default model path
|
||||||
|
|
||||||
|
If you want to use default startup behavior, place a model at:
|
||||||
|
|
||||||
|
- `pipeline/dist/models/llama-2-7b-chat.gguf`
|
||||||
|
|
||||||
|
Then run:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd pipeline/dist
|
||||||
|
./biergarten-pipeline
|
||||||
|
```
|
||||||
|
|
||||||
|
## Output Files
|
||||||
|
|
||||||
|
The pipeline writes output to:
|
||||||
|
|
||||||
|
- `pipeline/dist/output/breweries.json`
|
||||||
|
- `pipeline/dist/output/beer-styles.json`
|
||||||
|
- `pipeline/dist/output/beer-posts.json`
|
||||||
|
|
||||||
|
## Clean Re-run Process
|
||||||
|
|
||||||
|
To rebuild and re-run everything from a clean `pipeline/dist` directory, run the following from the repository root:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
rm -rf pipeline/dist
|
||||||
|
cmake -S pipeline -B pipeline/dist
|
||||||
|
cmake --build pipeline/dist -j4
|
||||||
|
cd pipeline/dist
|
||||||
|
mkdir -p models
|
||||||
|
~/.local/bin/hf download Qwen/Qwen2.5-0.5B-Instruct-GGUF qwen2.5-0.5b-instruct-q4_k_m.gguf --local-dir models
|
||||||
|
./biergarten-pipeline --model models/qwen2.5-0.5b-instruct-q4_k_m.gguf
|
||||||
|
```
|
||||||
|
|
||||||
|
## Troubleshooting
|
||||||
|
|
||||||
|
### `zsh: command not found: huggingface-cli`
|
||||||
|
|
||||||
|
The command-line tool installed by `huggingface_hub` is named `hf`, not `huggingface-cli`.
|
||||||
|
|
||||||
|
Use:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
~/.local/bin/hf --help
|
||||||
|
```
|
||||||
|
|
||||||
|
### `Model file not found ...`
|
||||||
|
|
||||||
|
- Confirm you are running from `pipeline/dist`.
|
||||||
|
- Confirm the file path passed to `--model` exists.
|
||||||
|
- If not using `--model`, ensure the default file exists at `models/llama-2-7b-chat.gguf` relative to current working directory.
|
||||||
|
|
||||||
|
### CMake cache/path mismatch
|
||||||
|
|
||||||
|
Use explicit source/build paths:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cmake -S /absolute/path/to/pipeline -B /absolute/path/to/pipeline/dist
|
||||||
|
cmake --build /absolute/path/to/pipeline/dist -j4
|
||||||
|
```
|
||||||
@@ -1,109 +1,536 @@
|
|||||||
|
/// @file main.cpp
|
||||||
|
/// @brief Brewery and beer data pipeline
|
||||||
|
///
|
||||||
|
/// This program fetches brewery data from the Open Brewery DB API
|
||||||
|
/// (https://api.openbrewerydb.org/), limited to the first 10 breweries.
|
||||||
|
/// It then generates beer posts using hardcoded beer styles and AI-powered
|
||||||
|
/// descriptions via llama integration.
|
||||||
|
///
|
||||||
|
/// Usage:
|
||||||
|
///   ./biergarten-pipeline [--model <path-to-gguf>]
|
||||||
|
///
|
||||||
|
/// Output:
|
||||||
|
/// - Creates an 'output/' directory with JSON files:
|
||||||
|
/// - breweries.json: fetched brewery data
|
||||||
|
/// - beer-styles.json: 50 hardcoded beer styles
|
||||||
|
///     - beer-posts.json: up to 10 generated beer posts (one per fetched brewery)
|
||||||
|
/// - Prints progress to stdout and errors to stderr
|
||||||
|
/// - Returns 0 on success, 1 on error
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
#include <curl/curl.h>
|
#include <curl/curl.h>
|
||||||
#include <nlohmann/json.hpp>
|
|
||||||
#include <iostream>
|
|
||||||
#include <fstream>
|
|
||||||
#include <string>
|
|
||||||
#include <filesystem>
|
#include <filesystem>
|
||||||
|
#include <fstream>
|
||||||
|
#include <iostream>
|
||||||
|
#include <mutex>
|
||||||
|
#include <nlohmann/json.hpp>
|
||||||
|
#include <queue>
|
||||||
|
#include <string>
|
||||||
|
#include <thread>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <future>
|
// Llama.cpp integration
|
||||||
#
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
#include "llama.h"
|
||||||
|
#ifdef __cplusplus
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace fs = std::filesystem;
|
namespace fs = std::filesystem;
|
||||||
|
|
||||||
|
/// @brief RAII guard for libcurl global initialization and cleanup
|
||||||
|
///
|
||||||
|
/// Ensures that curl_global_init() is called on construction and
|
||||||
|
/// curl_global_cleanup() is called on destruction. This is required before any
|
||||||
|
/// CURL operations and should be called exactly once per process.
|
||||||
|
///
|
||||||
|
/// Non-copyable and non-assignable to prevent multiple initialization attempts.
|
||||||
struct GlobalCurl {
|
struct GlobalCurl {
|
||||||
GlobalCurl() {
|
GlobalCurl() {
|
||||||
if (curl_global_init(CURL_GLOBAL_DEFAULT) != 0)
|
if (curl_global_init(CURL_GLOBAL_DEFAULT) != 0)
|
||||||
throw std::runtime_error("Failed to initialize libcurl");
|
throw std::runtime_error("Failed to initialize libcurl");
|
||||||
}
|
}
|
||||||
~GlobalCurl() { curl_global_cleanup(); }
|
~GlobalCurl() { curl_global_cleanup(); }
|
||||||
|
|
||||||
GlobalCurl(const GlobalCurl &) = delete;
|
GlobalCurl(const GlobalCurl &) = delete;
|
||||||
GlobalCurl &operator=(const GlobalCurl &) = delete;
|
GlobalCurl &operator=(const GlobalCurl &) = delete;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/// @brief CURL write callback that accumulates response data
///
/// Intended for use as CURLOPT_WRITEFUNCTION: each call appends one chunk of
/// the response body to the string passed through CURLOPT_WRITEDATA.
///
/// The callback is invoked from C code, so it must never let a C++ exception
/// escape. Allocation failures while appending are converted into a short
/// return value, which libcurl treats as a write error.
///
/// @param ptr Pointer to the data chunk received (not NUL-terminated)
/// @param size Size of each element
/// @param nmemb Number of elements in the data chunk
/// @param out Pointer to the std::string where data is accumulated
/// @return Number of bytes consumed (size * nmemb) on success; 0 when a
///         non-empty chunk could not be stored, which signals an error
static size_t writeCallback(char *ptr, size_t size, size_t nmemb,
                            std::string *out) {
  const size_t total = size * nmemb;
  if (total == 0) {
    return 0; // Nothing to store; 0 == total, so this is not an error.
  }
  if (ptr == nullptr || out == nullptr) {
    return 0; // Short count: report the chunk as not written.
  }

  try {
    out->append(ptr, total);
  } catch (...) {
    // Deliberately broad: no exception may unwind through libcurl's C frames.
    return 0;
  }
  return total;
}
|
||||||
|
|
||||||
|
/// @brief Hardcoded collection of 50 beer styles
///
/// Each entry is a {style name, short description} pair. The position of an
/// entry (1-based) is used as its BeerStyleID when the styles are exported,
/// so the order of this list is significant.
const std::vector<std::pair<std::string, std::string>> BEER_STYLES = {
    {"Pale Ale", "A hoppy ale with a golden color and balanced bitter finish"},
    {"IPA", "India Pale Ale with intense hop bitterness and citrus notes"},
    {"Stout", "Dark, creamy beer with roasted malt and coffee notes"},
    {"Porter", "Dark ale with chocolate and caramel flavors"},
    {"Lager", "Clean, crisp beer with a smooth finish"},
    {"Pilsner", "Golden lager with a crisp, well-balanced hop bitterness"},
    {"Hefeweizen", "Bavarian wheat beer with banana and clove notes"},
    {"Wheat Beer", "Light, refreshing beer made with wheat malt"},
    {"Amber Ale", "Sweet, malty ale with caramel flavors"},
    {"Brown Ale", "Nutty, chocolatey ale with moderate alcohol"},
    {"Saison", "Belgian style ale, spicy and fruity with high carbonation"},
    {"Tripel", "Belgian strong golden ale with fruity complexity"},
    {"Lambic", "Spontaneously fermented sour ale with fruit notes"},
    {"Sour Ale", "Tangy beer with acidic and funky characteristics"},
    {"Imperial Stout", "Strong stout with intense roasted malt flavors"},
    {"Barley Wine", "Strong ale with wine-like body and alcohol content"},
    {"Cream Ale", "Smooth, light ale with corn sweetness"},
    {"Blonde Ale", "Light, easy-drinking ale with slight sweetness"},
    {"Pale Lager", "Light, refreshing lager with subtle hop character"},
    {"Dunkelweizen", "Dark German wheat beer with bread and banana flavors"},
    {"Russian Imperial Stout", "Very strong stout with complex flavor profile"},
    {"Berliner Weisse", "Light, sour German wheat beer"},
    {"Gose", "Salt and coriander spiced sour ale from Germany"},
    {"Witbier", "Belgian white beer with citrus and spice notes"},
    {"Milk Stout", "Creamy stout with lactose sweetness"},
    {"Oatmeal Stout", "Smooth stout with oat malt additions"},
    {"Rauchbier", "Smoked German lager with bacon aroma"},
    {"Kellerbier", "Unpasteurized, unfiltered Bavarian lager"},
    {"Schwarzbier", "Black lager with sweet malty character"},
    {"Märzen", "Bavarian amber lager, traditionally brewed in March"},
    {"Bock", "Strong German lager with balanced sweetness"},
    {"Helles Bock", "Light, strong German lager"},
    {"Maibock", "Golden strong lager brewed in spring"},
    {"Eisbock", "Concentrated German lager with high alcohol"},
    {"Doppelbock", "Dark, strong German lager"},
    {"Scottish Ale", "Full-bodied ale with caramel and toffee notes"},
    {"English Bitter", "Hoppy amber ale with earthy character"},
    {"English Pale Ale", "Balanced ale with biscuit and hop notes"},
    {"ESB", "Extra Special Bitter with rich malt character"},
    {"Barley Wine Style Ale", "Strong beer with wine-like complexity"},
    {"Old Ale", "Dark, strong ale with vinous character"},
    {"English Brown Ale", "Sweet, malty brown ale"},
    {"Nut Brown Ale", "Brown ale with nut-like flavors"},
    {"English Porter", "Dark, rich porter style"},
    {"English Stout", "Traditional stout with roasted character"},
    {"Irish Red Ale", "Malty red ale with caramel notes"},
    {"Rye IPA", "IPA brewed with spicy rye grain"},
    {"Rye Ale", "Ale with characteristic rye spiciness"},
    {"Smoked Beer", "Beer with pronounced smoked malt character"},
    {"Fruit Beer", "Beer brewed with added fruits for flavor"},
};
|
||||||
|
|
||||||
|
/// @brief Generate AI-powered beer post description using llama
|
||||||
|
///
|
||||||
|
/// This function integrates with llama.cpp to generate authentic beer
|
||||||
|
/// descriptions based on the beer name, style, and brewery.
|
||||||
|
///
|
||||||
|
/// @param beer_name Name of the beer
|
||||||
|
/// @param beer_style Style of the beer
|
||||||
|
/// @param brewery_name Name of the brewery
|
||||||
|
/// @param ctx Llama context for generation
|
||||||
|
/// @return Generated beer description
|
||||||
|
std::string generateBeerDescription(const std::string &beer_name,
|
||||||
|
const std::string &beer_style,
|
||||||
|
const std::string &brewery_name,
|
||||||
|
llama_context *ctx, llama_model *model) {
|
||||||
|
const std::string fallback =
|
||||||
|
"This " + beer_style + " from " + brewery_name +
|
||||||
|
" offers a unique take on the classic style. " + beer_name +
|
||||||
|
" presents complex flavors with a smooth finish.";
|
||||||
|
|
||||||
|
if (!ctx) {
|
||||||
|
return fallback;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!model) {
|
||||||
|
return fallback;
|
||||||
|
}
|
||||||
|
|
||||||
|
const llama_vocab *vocab = llama_model_get_vocab(model);
|
||||||
|
if (!vocab) {
|
||||||
|
return fallback;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create prompt for llama
|
||||||
|
std::string prompt =
|
||||||
|
"Generate a short, engaging beer description (2-3 sentences) for a " +
|
||||||
|
beer_style + " called '" + beer_name + "' from " + brewery_name +
|
||||||
|
". Focus on flavor profile, aroma, and drinking experience.:\n";
|
||||||
|
|
||||||
|
const int32_t n_prompt = -llama_tokenize(vocab, prompt.c_str(),
|
||||||
|
static_cast<int32_t>(prompt.size()),
|
||||||
|
nullptr, 0, true, true);
|
||||||
|
if (n_prompt <= 0) {
|
||||||
|
return fallback;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<llama_token> prompt_tokens(static_cast<size_t>(n_prompt));
|
||||||
|
if (llama_tokenize(vocab, prompt.c_str(), static_cast<int32_t>(prompt.size()),
|
||||||
|
prompt_tokens.data(), n_prompt, true, true) < 0) {
|
||||||
|
return fallback;
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_batch batch = llama_batch_get_one(
|
||||||
|
prompt_tokens.data(), static_cast<int32_t>(prompt_tokens.size()));
|
||||||
|
if (llama_decode(ctx, batch) != 0) {
|
||||||
|
return fallback;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto sampler_params = llama_sampler_chain_default_params();
|
||||||
|
llama_sampler *sampler = llama_sampler_chain_init(sampler_params);
|
||||||
|
if (!sampler) {
|
||||||
|
return fallback;
|
||||||
|
}
|
||||||
|
llama_sampler_chain_add(sampler, llama_sampler_init_greedy());
|
||||||
|
|
||||||
|
// Generate text
|
||||||
|
const int max_new_tokens = 80;
|
||||||
|
std::string generated_text;
|
||||||
|
|
||||||
|
for (int i = 0; i < max_new_tokens; ++i) {
|
||||||
|
llama_token next_token = llama_sampler_sample(sampler, ctx, -1);
|
||||||
|
if (llama_vocab_is_eog(vocab, next_token)) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
char piece[256];
|
||||||
|
const int32_t piece_len =
|
||||||
|
llama_token_to_piece(vocab, next_token, piece, sizeof(piece), 0, true);
|
||||||
|
if (piece_len < 0) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
generated_text.append(piece, static_cast<size_t>(piece_len));
|
||||||
|
|
||||||
|
batch = llama_batch_get_one(&next_token, 1);
|
||||||
|
if (llama_decode(ctx, batch) != 0) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Keep descriptions concise and sentence-like.
|
||||||
|
if (generated_text.size() >= 220 ||
|
||||||
|
(generated_text.size() > 40 &&
|
||||||
|
generated_text.find('.') != std::string::npos)) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
llama_sampler_free(sampler);
|
||||||
|
|
||||||
|
// Clean up generated text
|
||||||
|
if (generated_text.empty()) {
|
||||||
|
generated_text = fallback;
|
||||||
|
}
|
||||||
|
|
||||||
|
return generated_text;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// @brief Main entry point for the brewery and beer data pipeline
|
||||||
|
///
|
||||||
|
/// Coordinates fetching of brewery data (limited to 10) and generation of
|
||||||
|
/// beer posts with AI-powered descriptions using llama.cpp integration.
|
||||||
|
/// Initializes llama model for description generation.
|
||||||
int main(int argc, char **argv) {
|
int main(int argc, char **argv) {
|
||||||
int total_count = 0;
|
int total_count = 0;
|
||||||
|
|
||||||
fs::create_directories("output");
|
std::string model_path = "models/llama-2-7b-chat.gguf";
|
||||||
|
for (int i = 1; i < argc; ++i) {
|
||||||
|
const std::string arg = argv[i];
|
||||||
|
|
||||||
GlobalCurl curl_guard;
|
if (arg == "--model" || arg == "-m") {
|
||||||
|
if (i + 1 >= argc) {
|
||||||
|
std::cerr << "Error: missing value for " << arg << std::endl;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
model_path = argv[++i];
|
||||||
|
} else if (arg == "--help" || arg == "-h") {
|
||||||
|
std::cout << "Usage: " << argv[0] << " [--model <path-to-gguf>]"
|
||||||
|
<< std::endl;
|
||||||
|
return 0;
|
||||||
|
} else {
|
||||||
|
std::cerr << "Error: unknown argument " << arg << std::endl;
|
||||||
|
std::cerr << "Usage: " << argv[0] << " [--model <path-to-gguf>]"
|
||||||
|
<< std::endl;
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
struct PageResult {
|
// Create output directory for storing JSON files
|
||||||
int page;
|
fs::create_directories("output");
|
||||||
int count;
|
|
||||||
std::string error;
|
|
||||||
};
|
|
||||||
|
|
||||||
std::vector<std::future<PageResult>> jobs;
|
// Ensure libcurl is initialized and will be cleaned up on scope exit
|
||||||
jobs.reserve(30);
|
GlobalCurl curl_guard;
|
||||||
|
|
||||||
for (int page = 1; page <= 30; ++page) {
|
// Initialize llama.cpp model
|
||||||
jobs.emplace_back(std::async(std::launch::async, [page]() -> PageResult {
|
std::cout << "Initializing llama model..." << std::endl;
|
||||||
PageResult result{page, 0, ""};
|
llama_context *llama_ctx = nullptr;
|
||||||
|
llama_model *llama_model_ptr = nullptr;
|
||||||
|
|
||||||
CURL *curl = curl_easy_init();
|
try {
|
||||||
if (!curl) {
|
// Check if model exists
|
||||||
result.error = "Failed to initialize CURL";
|
if (!fs::exists(model_path)) {
|
||||||
return result;
|
std::cerr << "Warning: Model file not found at " << model_path
|
||||||
}
|
<< ". Using template descriptions." << std::endl;
|
||||||
|
} else {
|
||||||
|
// Load model with default parameters
|
||||||
|
llama_model_params model_params = llama_model_default_params();
|
||||||
|
llama_model_ptr =
|
||||||
|
llama_model_load_from_file(model_path.c_str(), model_params);
|
||||||
|
|
||||||
std::string response;
|
if (!llama_model_ptr) {
|
||||||
std::string api_url =
|
std::cerr << "Warning: Failed to load llama model. Using template "
|
||||||
"https://api.openbrewerydb.org/v1/breweries?per_page=200&page=" + std::to_string(page);
|
"descriptions."
|
||||||
|
<< std::endl;
|
||||||
|
} else {
|
||||||
|
// Create context
|
||||||
|
llama_context_params ctx_params = llama_context_default_params();
|
||||||
|
ctx_params.n_ctx = 512; // Context size
|
||||||
|
ctx_params.n_batch = 256; // Prompt batch size
|
||||||
|
ctx_params.n_threads = 4; // Number of threads
|
||||||
|
|
||||||
curl_easy_setopt(curl, CURLOPT_URL, api_url.c_str());
|
llama_ctx = llama_init_from_model(llama_model_ptr, ctx_params);
|
||||||
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, writeCallback);
|
|
||||||
curl_easy_setopt(curl, CURLOPT_WRITEDATA, &response);
|
|
||||||
|
|
||||||
CURLcode res = curl_easy_perform(curl);
|
if (!llama_ctx) {
|
||||||
if (res != CURLE_OK) {
|
std::cerr
|
||||||
result.error = curl_easy_strerror(res);
|
<< "Warning: Failed to create llama context. Using template "
|
||||||
curl_easy_cleanup(curl);
|
"descriptions."
|
||||||
return result;
|
<< std::endl;
|
||||||
}
|
llama_model_free(llama_model_ptr);
|
||||||
|
llama_model_ptr = nullptr;
|
||||||
|
} else {
|
||||||
|
std::cout << "Llama model loaded successfully!" << std::endl;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (const std::exception &ex) {
|
||||||
|
std::cerr << "Warning: Llama initialization error: " << ex.what()
|
||||||
|
<< ". Using template descriptions." << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
try {
|
/// Result of fetching a single page from the API
|
||||||
nlohmann::json breweries = nlohmann::json::parse(response);
|
struct PageResult {
|
||||||
result.count = static_cast<int>(breweries.size());
|
int page; ///< Page number requested
|
||||||
|
int count; ///< Number of breweries in this page
|
||||||
|
std::string error; ///< Error message if fetch failed (empty = success)
|
||||||
|
};
|
||||||
|
|
||||||
if (result.count > 0) {
|
std::vector<PageResult> results; ///< Thread-safe storage for page results
|
||||||
std::string out_path = "output/page-" + std::to_string(page) + ".json";
|
std::vector<std::thread> threads; ///< Active worker threads
|
||||||
std::ofstream out_file(out_path);
|
std::mutex results_mutex; ///< Guards access to results vector
|
||||||
out_file << breweries.dump(2);
|
const int MAX_THREADS = 5; ///< Maximum concurrent API requests
|
||||||
}
|
const int MAX_BREWERIES = 10; ///< Limit to 10 breweries
|
||||||
} catch (const std::exception &ex) {
|
|
||||||
result.error = ex.what();
|
|
||||||
}
|
|
||||||
|
|
||||||
curl_easy_cleanup(curl);
|
/// Fetch only the first page of breweries to get our 10 breweries
|
||||||
return result;
|
std::cout << "Fetching breweries from Open Brewery DB API..." << std::endl;
|
||||||
}));
|
|
||||||
}
|
|
||||||
|
|
||||||
for (auto &job : jobs) {
|
for (int page = 1; page <= 1; ++page) {
|
||||||
PageResult r = job.get();
|
// Only need 1 page
|
||||||
|
if (threads.size() >= MAX_THREADS) {
|
||||||
|
threads[0].join();
|
||||||
|
threads.erase(threads.begin());
|
||||||
|
}
|
||||||
|
|
||||||
std::cout << "Fetching page " << r.page << "..." << std::endl;
|
/// Launch a new worker thread to fetch this page
|
||||||
|
threads.emplace_back([page, &results, &results_mutex, MAX_BREWERIES]() {
|
||||||
|
PageResult result{page, 0, ""};
|
||||||
|
|
||||||
if (!r.error.empty()) {
|
/// Initialize CURL handle for this thread
|
||||||
std::cerr << "Error on page " << r.page << ": " << r.error << std::endl;
|
CURL *curl = curl_easy_init();
|
||||||
curl_global_cleanup();
|
if (!curl) {
|
||||||
return 1;
|
result.error = "Failed to initialize CURL";
|
||||||
|
{
|
||||||
|
std::lock_guard<std::mutex> lock(results_mutex);
|
||||||
|
results.push_back(result);
|
||||||
|
}
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
total_count += r.count;
|
/// Fetch the page from the Open Brewery DB API
|
||||||
std::cout << " Got " << r.count << " breweries (total: " << total_count << ")" << std::endl;
|
/// Parameters: per_page=10 (limited), page=1
|
||||||
|
std::string response;
|
||||||
|
std::string api_url =
|
||||||
|
"https://api.openbrewerydb.org/v1/breweries?per_page=" +
|
||||||
|
std::to_string(MAX_BREWERIES) + "&page=" + std::to_string(page);
|
||||||
|
|
||||||
if (r.count == 0) break;
|
/// Configure CURL: set URL, write callback, and output buffer
|
||||||
}
|
curl_easy_setopt(curl, CURLOPT_URL, api_url.c_str());
|
||||||
|
curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, writeCallback);
|
||||||
|
curl_easy_setopt(curl, CURLOPT_WRITEDATA, &response);
|
||||||
|
|
||||||
curl_global_cleanup();
|
/// Execute the HTTP GET request
|
||||||
return 0;
|
CURLcode res = curl_easy_perform(curl);
|
||||||
|
if (res != CURLE_OK) {
|
||||||
|
result.error = curl_easy_strerror(res);
|
||||||
|
curl_easy_cleanup(curl);
|
||||||
|
{
|
||||||
|
std::lock_guard<std::mutex> lock(results_mutex);
|
||||||
|
results.push_back(result);
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Parse JSON response and save to file if not empty
|
||||||
|
try {
|
||||||
|
nlohmann::json breweries = nlohmann::json::parse(response);
|
||||||
|
result.count = static_cast<int>(breweries.size());
|
||||||
|
|
||||||
|
/// Save breweries to output file
|
||||||
|
if (result.count > 0) {
|
||||||
|
std::string out_path = "output/breweries.json";
|
||||||
|
std::ofstream out_file(out_path);
|
||||||
|
out_file << breweries.dump(2); // Pretty-print with 2-space indent
|
||||||
|
}
|
||||||
|
} catch (const std::exception &ex) {
|
||||||
|
result.error = ex.what();
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Cleanup CURL handle and store result thread-safely
|
||||||
|
curl_easy_cleanup(curl);
|
||||||
|
{
|
||||||
|
std::lock_guard<std::mutex> lock(results_mutex);
|
||||||
|
results.push_back(result);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Wait for all remaining worker threads to complete
|
||||||
|
for (auto &thread : threads) {
|
||||||
|
thread.join();
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Process and display results: check for errors
|
||||||
|
nlohmann::json breweries_data;
|
||||||
|
for (auto &r : results) {
|
||||||
|
std::cout << "Fetching page " << r.page << "..." << std::endl;
|
||||||
|
|
||||||
|
/// Exit on first error
|
||||||
|
if (!r.error.empty()) {
|
||||||
|
std::cerr << "Error on page " << r.page << ": " << r.error << std::endl;
|
||||||
|
curl_global_cleanup();
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Accumulate brewery count and log progress
|
||||||
|
total_count += r.count;
|
||||||
|
std::cout << " Got " << r.count << " breweries (total: " << total_count
|
||||||
|
<< ")" << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Load breweries from file for beer post generation
|
||||||
|
try {
|
||||||
|
std::ifstream breweries_file("output/breweries.json");
|
||||||
|
breweries_file >> breweries_data;
|
||||||
|
} catch (const std::exception &ex) {
|
||||||
|
std::cerr << "Error loading breweries: " << ex.what() << std::endl;
|
||||||
|
curl_global_cleanup();
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Generate and save beer styles output
|
||||||
|
std::cout << "\nGenerating beer styles..." << std::endl;
|
||||||
|
nlohmann::json beer_styles_json = nlohmann::json::array();
|
||||||
|
for (size_t i = 0; i < BEER_STYLES.size(); ++i) {
|
||||||
|
beer_styles_json.push_back({
|
||||||
|
{"BeerStyleID", i + 1},
|
||||||
|
{"StyleName", BEER_STYLES[i].first},
|
||||||
|
{"Description", BEER_STYLES[i].second},
|
||||||
|
});
|
||||||
|
}
|
||||||
|
std::ofstream styles_file("output/beer-styles.json");
|
||||||
|
styles_file << beer_styles_json.dump(2);
|
||||||
|
std::cout << "Generated " << BEER_STYLES.size() << " beer styles"
|
||||||
|
<< std::endl;
|
||||||
|
|
||||||
|
/// Generate 10 beer posts using breweries and beer styles
|
||||||
|
std::cout << "\nGenerating beer posts..." << std::endl;
|
||||||
|
nlohmann::json beer_posts_json = nlohmann::json::array();
|
||||||
|
|
||||||
|
int beer_posts_generated = 0;
|
||||||
|
for (int i = 0; i < 10 && i < static_cast<int>(breweries_data.size()); ++i) {
|
||||||
|
const auto &brewery = breweries_data[i];
|
||||||
|
const auto &beer_style = BEER_STYLES[i % BEER_STYLES.size()];
|
||||||
|
|
||||||
|
std::string brewery_name = brewery.contains("name")
|
||||||
|
? brewery["name"].get<std::string>()
|
||||||
|
: "Unknown";
|
||||||
|
|
||||||
|
// Generate beer name from brewery
|
||||||
|
std::string beer_name = brewery_name + " " + beer_style.first;
|
||||||
|
|
||||||
|
// Generate description using llama integration (with fallback)
|
||||||
|
std::string description = generateBeerDescription(
|
||||||
|
beer_name, beer_style.first, brewery_name, llama_ctx, llama_model_ptr);
|
||||||
|
|
||||||
|
// Generate random ABV (3.5% to 9.5%)
|
||||||
|
double abv = 3.5 + (i % 6) * 1.0;
|
||||||
|
|
||||||
|
// Generate random IBU (15 to 85)
|
||||||
|
int ibu = 15 + (i % 7) * 10;
|
||||||
|
|
||||||
|
// Extract additional brewery data if available
|
||||||
|
std::string brewery_city = brewery.contains("city")
|
||||||
|
? brewery["city"].get<std::string>()
|
||||||
|
: "Unknown";
|
||||||
|
std::string brewery_state = brewery.contains("state")
|
||||||
|
? brewery["state"].get<std::string>()
|
||||||
|
: "Unknown";
|
||||||
|
|
||||||
|
beer_posts_json.push_back({
|
||||||
|
{"BeerPostID", i + 1},
|
||||||
|
{"Name", beer_name},
|
||||||
|
{"Description", description},
|
||||||
|
{"ABV", abv},
|
||||||
|
{"IBU", ibu},
|
||||||
|
{"BeerStyleID", (i % BEER_STYLES.size()) + 1},
|
||||||
|
{"StyleName", beer_style.first},
|
||||||
|
{"BreweryName", brewery_name},
|
||||||
|
{"BreweryCity", brewery_city},
|
||||||
|
{"BreweryState", brewery_state},
|
||||||
|
{"CreatedAt", "2026-03-24T00:00:00Z"},
|
||||||
|
});
|
||||||
|
|
||||||
|
beer_posts_generated++;
|
||||||
|
std::cout << " Generated: " << beer_name << " (" << abv << "% ABV, " << ibu
|
||||||
|
<< " IBU)" << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::ofstream posts_file("output/beer-posts.json");
|
||||||
|
posts_file << beer_posts_json.dump(2);
|
||||||
|
std::cout << "Generated " << beer_posts_generated << " beer posts"
|
||||||
|
<< std::endl;
|
||||||
|
|
||||||
|
/// Cleanup llama resources
|
||||||
|
if (llama_ctx) {
|
||||||
|
std::cout << "\nCleaning up llama context..." << std::endl;
|
||||||
|
llama_free(llama_ctx);
|
||||||
|
llama_ctx = nullptr;
|
||||||
|
}
|
||||||
|
if (llama_model_ptr) {
|
||||||
|
llama_model_free(llama_model_ptr);
|
||||||
|
llama_model_ptr = nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Summary of generated data
|
||||||
|
std::cout << "\n=== Pipeline Complete ===" << std::endl;
|
||||||
|
std::cout << "Breweries fetched: " << total_count << std::endl;
|
||||||
|
std::cout << "Beer styles created: " << BEER_STYLES.size() << std::endl;
|
||||||
|
std::cout << "Beer posts generated: " << beer_posts_generated << std::endl;
|
||||||
|
std::cout << "Output files created:" << std::endl;
|
||||||
|
std::cout << " - output/breweries.json" << std::endl;
|
||||||
|
std::cout << " - output/beer-styles.json" << std::endl;
|
||||||
|
std::cout << " - output/beer-posts.json" << std::endl;
|
||||||
|
|
||||||
|
/// Cleanup is handled by GlobalCurl RAII guard, but explicit cleanup is safe
|
||||||
|
curl_global_cleanup();
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user