From b8ebe03921eaf7d545a85d83fe5935fb774e7bf6 Mon Sep 17 00:00:00 2001 From: Aaron Po Date: Tue, 12 May 2026 00:44:09 -0400 Subject: [PATCH] Pipeline: Add Runpod docker configuration (#222) * Begin work on Runpod docker config * Reduce docker image size * Create .dockerignore --- docs/pipeline/README.md | 161 +++++++++++++++--- docs/pipeline/diagrams/current/activity.puml | 2 +- docs/pipeline/diagrams/current/class.puml | 4 +- docs/pipeline/diagrams/planned/activity.puml | 2 +- docs/pipeline/diagrams/planned/class.puml | 4 +- tooling/pipeline/.dockerignore | 9 + tooling/pipeline/CMakeLists.txt | 142 ++++++++------- .../data_generation/llama_generator.h | 3 +- tooling/pipeline/includes/data_model/models.h | 6 +- tooling/pipeline/runpod/.dockerignore | 9 + tooling/pipeline/runpod/Dockerfile | 72 ++++++++ tooling/pipeline/runpod/README.md | 8 + tooling/pipeline/runpod/pod-template.yaml | 22 +++ tooling/pipeline/runpod/start.sh | 58 +++++++ .../application_options/parse_arguments.cc | 4 + .../data_generation/llama/llama_generator.cc | 1 + .../src/data_generation/llama/load.cc | 8 +- 17 files changed, 425 insertions(+), 90 deletions(-) create mode 100644 tooling/pipeline/.dockerignore create mode 100644 tooling/pipeline/runpod/.dockerignore create mode 100644 tooling/pipeline/runpod/Dockerfile create mode 100644 tooling/pipeline/runpod/README.md create mode 100644 tooling/pipeline/runpod/pod-template.yaml create mode 100644 tooling/pipeline/runpod/start.sh diff --git a/docs/pipeline/README.md b/docs/pipeline/README.md index aafa3a0..f08757c 100644 --- a/docs/pipeline/README.md +++ b/docs/pipeline/README.md @@ -18,6 +18,7 @@ descriptions via a local GGUF model or a deterministic mock. - [Build](#build) - [Model](#model) - [Run](#run) +- [Docker / RunPod](#docker--runpod) - [Architecture](#architecture) - [Pipeline Stages](#pipeline-stages) - [Key Components](#key-components) @@ -51,7 +52,7 @@ step. 
### Build -Requirements: C++20 compiler, CMake 3.24+, libcurl, Boost (JSON and +Requirements: C++20 compiler, CMake 3.31+, OpenSSL, Boost (JSON and ProgramOptions). SQLite is fetched from the upstream amalgamation, so no system SQLite package is required. @@ -60,6 +61,16 @@ cmake -S . -B build cmake --build build ``` +CMake automatically detects whether a compatible llama.cpp installation is +present on the system (`libllama`, `libggml`, `libggml-base`, and `llama.h` +visible on the default search paths). If found, it links against those +libraries and skips the FetchContent build. If not found, it fetches and builds +llama.cpp from source at tag `b9012`. No additional flags are required in +either case. + +Metal is enabled automatically on Apple Silicon. CUDA or HIP/ROCm is detected +automatically on Linux when the relevant toolkit is present. + ### Model > Skip this step if you only need `--mocked`. @@ -74,33 +85,124 @@ curl -L \ ### Run Run from `build/` so the copied `locations.json` and `prompts/` are available. -Each run also writes a fresh dated SQLite file such as +Each run writes a fresh dated SQLite file such as `biergarten_seed_2026-04-19T15-30-45.123456Z.sqlite` into the working directory. ```bash ./biergarten-pipeline --mocked -./biergarten-pipeline --model models/google_gemma-4-E4B-it-Q6_K.gguf --temperature 1.0 --top-p 0.95 --top-k 64 --n-ctx 8192 --seed -1 + +./biergarten-pipeline \ + --model ../models/google_gemma-4-E4B-it-Q6_K.gguf \ + --prompt-dir prompts \ + --temperature 1.0 --top-p 0.95 --top-k 64 --n-ctx 8192 --seed -1 ``` #### CLI Flags -| Flag | Purpose | -| --------------- | ------------------------------------------------------- | -| `--mocked` | Deterministic mock generator, no model required. | -| `--model, -m` | Path to a GGUF file. Required unless `--mocked` is set. | -| `--temperature` | Sampling temperature. Default: `1.0`. | -| `--top-p` | Nucleus sampling. Default: `0.95`. | -| `--top-k` | Top-k sampling. Default: `64`. 
| -| `--n-ctx` | Context window size. Default: `8192`. | -| `--seed` | Random seed. Default: `-1` (random at runtime). | -| `--help, -h` | Print usage and exit. | +| Flag | Purpose | +| --------------- | ---------------------------------------------------------------------------------------------------- | +| `--mocked` | Deterministic mock generator, no model required. | +| `--model, -m` | Path to a GGUF file. Required unless `--mocked` is set. | +| `--prompt-dir` | Directory containing prompt files (e.g. `BREWERY_GENERATION.md`). Required unless `--mocked` is set. | +| `--output, -o` | Directory for generated SQLite artifacts. Default: `output`. | +| `--log-path` | Path for application logs. Default: `pipeline.log`. | +| `--temperature` | Sampling temperature. Default: `1.0`. | +| `--top-p` | Nucleus sampling. Default: `0.95`. | +| `--top-k` | Top-k sampling. Default: `64`. | +| `--n-ctx` | Context window size. Default: `8192`. | +| `--seed` | Random seed. Default: `-1` (random at runtime). | +| `--help, -h` | Print usage and exit. | `--mocked` and `--model` are mutually exclusive. Omitting both exits with an error before the pipeline starts. Sampling flags are ignored when `--mocked` is set. The post-build step copies `prompts/` into `build/prompts/`. Rebuild after -editing `prompts/system.md`. +editing any prompt file. + +--- + +## Docker / RunPod + +The `tooling/pipeline/runpod/` directory contains a GPU-ready container +configuration for running the pipeline on RunPod or any Docker host with an +NVIDIA GPU. + +### How it works + +The container uses a two-stage build. The first stage pulls prebuilt +`libllama`, `libggml`, and backend plugin libraries (including `libggml-cuda.so` +and the CPU variant plugins) from `ghcr.io/ggml-org/llama.cpp:full-cuda`. The +second stage copies those libraries into `/usr/local/lib` and adds it to `LD_LIBRARY_PATH` so +the dynamic linker and `dlopen` calls from `ggml_backend_load_all()` can resolve +the CUDA backend plugin at runtime.
llama.cpp headers are downloaded at the matching +tag and installed into `/usr/local/include`. CMake auto-detects both and skips +the FetchContent source build entirely, keeping image build times short. + +The image itself does not set `GGML_BACKEND_PATH`; the live-mode example below +passes it explicitly, pointing at `/usr/local/lib/libggml-cuda.so`. + +### Build the image + +Run from the `tooling/pipeline/` directory (the CMake project root), not from +inside `runpod/`, so the `COPY . .` step picks up the full project context. + +```bash +docker build -t biergarten-pipeline:latest -f runpod/Dockerfile . +``` + +To monitor the full build output and confirm CMake selects the system llama.cpp: + +```bash +docker build \ + --progress=plain \ + --no-cache \ + -t biergarten-pipeline:latest \ + -f runpod/Dockerfile \ + . 2>&1 | tee build.log +``` + +Look for `[biergarten] Found system llama.cpp — skipping FetchContent` in the +output to confirm the fast path was taken. + +### Run in mocked mode + +No model or GPU required. Useful for validating the pipeline logic and SQLite +export path. + +```bash +docker run --rm \ + -e BIERGARTEN_MODE=mocked \ + -v "$PWD/output:/workspace/output" \ + -v "$PWD/logs:/workspace/logs" \ + biergarten-pipeline:latest +``` + +### Run in live mode + +Mount your GGUF model before starting. If the file is missing, the launcher +downloads it, then verifies the model path before launching the binary. + +```bash +docker run --rm \ + --runtime=nvidia \ + -e BIERGARTEN_MODE=live \ + -e GGML_BACKEND_PATH="/usr/local/lib/libggml-cuda.so" \ + -v "$PWD/models:/workspace/models" \ + -v "$PWD/output:/workspace/output" \ + -v "$PWD/logs:/workspace/logs" \ + biergarten-pipeline:latest +``` + +The model is expected at `./models/google_gemma-4-E4B-it-Q6_K.gguf` on the +host; the launcher downloads it there if absent. See [Model](#model) above for the manual command. + +### RunPod deployment + +Use a GPU pod template. Mount persistent storage for `/workspace/models`, +`/workspace/output`, and `/workspace/logs`. Set `BIERGARTEN_MODE=live` in the +template environment.
See `tooling/pipeline/runpod/pod-template.yaml` for a +starter template. --- @@ -197,16 +299,18 @@ code, latitude, and longitude for each entry. ## Tech Stack - C++20 -- CMake 3.24+ +- CMake 3.31+ - Boost.JSON, Boost.ProgramOptions, Boost.DI - spdlog -- libcurl +- cpp-httplib (with OpenSSL) - SQLite amalgamation fetched and compiled via CMake FetchContent -- llama.cpp +- llama.cpp (auto-detected from system install or fetched via FetchContent) +- Docker with NVIDIA CUDA 12.6 base image for GPU container builds +- RunPod for cloud GPU inference -The build fetches Boost.DI, spdlog, llama.cpp, and SQLite via CMake. Metal is -enabled on Apple Silicon; CUDA or HIP/ROCm is detected on Linux when the toolkit -is present. +The build fetches Boost.DI, spdlog, and SQLite via CMake. llama.cpp is fetched +only when a system installation is not detected. Metal is enabled on Apple +Silicon; CUDA or HIP/ROCm is detected on Linux when the toolkit is present. > **Code Style:** Modern C++20 throughout — RAII for ownership, > `std::unique_ptr` for injected dependencies, `std::optional` for parse @@ -218,7 +322,7 @@ is present. ## Tested Hardware -### ARM macOS - M1 Pro +### ARM macOS — M1 Pro | | | | --------- | --------------------------------- | @@ -229,7 +333,7 @@ is present. | Model | Gemma 4 E4B | | Inference | llama.cpp with Metal | -### x86_64 Linux - NVIDIA RTX 2000 +### x86_64 Linux — NVIDIA RTX 2000 | | | | --------- | ------------------------------ | @@ -240,6 +344,15 @@ is present. | Model | Gemma 4 E4B | | Inference | llama.cpp with CUDA 12.x | +### x86_64 Linux — Docker / RunPod (NVIDIA CUDA) + +| | | +| --------- | ------------------------------------------- | +| Host | RunPod GPU pod | +| Base | nvidia/cuda:12.6.3-devel-ubuntu24.04 | +| Model | Gemma 4 E4B Q6_K | +| Inference | llama.cpp prebuilt CUDA backends via dlopen | + --- ## Fixture Strategy @@ -260,8 +373,9 @@ is present. | `includes/` | Public headers and shared models. | | `src/` | Implementation files. 
| | `locations.json` | Curated city input copied into the build tree. | -| `prompts/` | System prompt used by the model-backed path. | +| `prompts/` | System prompts used by the model-backed path. | | `diagrams/` | Architecture and pipeline diagrams. | +| `tooling/pipeline/runpod/` | Dockerfile, launcher, and RunPod pod template. | | `ETHICS-AND-KNOWN-ISSUES.md` | Ethics, bias, hallucination analysis, mitigations. | --- @@ -276,6 +390,7 @@ is present. - `src/data_generation/llama/` — local inference, prompt loading, output validation. - `src/data_generation/mock/` — deterministic fallback. +- `tooling/pipeline/runpod/` — container build and runtime launcher. --- diff --git a/docs/pipeline/diagrams/current/activity.puml b/docs/pipeline/diagrams/current/activity.puml index 87caefb..6a4d8c2 100644 --- a/docs/pipeline/diagrams/current/activity.puml +++ b/docs/pipeline/diagrams/current/activity.puml @@ -29,7 +29,7 @@ if (Are arguments valid?) then (no) else (yes) endif -:Init CurlGlobalState & LlamaBackendState; +:Init OpenSSL global state & LlamaBackendState; :di::make_injector(...); :injector.create>(); :BiergartenDataGenerator::Run(); diff --git a/docs/pipeline/diagrams/current/class.puml b/docs/pipeline/diagrams/current/class.puml index 74acc97..76955e2 100644 --- a/docs/pipeline/diagrams/current/class.puml +++ b/docs/pipeline/diagrams/current/class.puml @@ -52,7 +52,7 @@ interface WebClient <> { + UrlEncode(value : const std::string&) : std::string } -class CURLWebClient { +class HttpWebClient { + Get(url : const std::string&) : std::string + UrlEncode(value : const std::string&) : std::string } @@ -130,7 +130,7 @@ BiergartenDataGenerator *-- IExportService : owns IEnrichmentService <|.. WikipediaService : implements WikipediaService *-- WebClient : owns -WebClient <|.. CURLWebClient : implements +WebClient <|.. HttpWebClient : implements DataGenerator <|.. MockGenerator : implements DataGenerator <|.. 
LlamaGenerator : implements diff --git a/docs/pipeline/diagrams/planned/activity.puml b/docs/pipeline/diagrams/planned/activity.puml index 6f92560..a6f1d18 100644 --- a/docs/pipeline/diagrams/planned/activity.puml +++ b/docs/pipeline/diagrams/planned/activity.puml @@ -13,7 +13,7 @@ if (Invalid args?) then (yes) stop else (no) endif -:Init CurlGlobalState & LlamaBackendState; +:Init OpenSSL global state & LlamaBackendState; :Build DI injector; :Initialize SqliteExportService; diff --git a/docs/pipeline/diagrams/planned/class.puml b/docs/pipeline/diagrams/planned/class.puml index fd950c9..ad1a819 100644 --- a/docs/pipeline/diagrams/planned/class.puml +++ b/docs/pipeline/diagrams/planned/class.puml @@ -356,7 +356,7 @@ package "Infrastructure: Enrichment" { + UrlEncode(value : const std::string&) : std::string } - class CURLWebClient { + class HttpWebClient { + Get(url : const std::string&) : std::string + UrlEncode(value : const std::string&) : std::string } @@ -520,7 +520,7 @@ CheckinDistributionStrategy <|.. RandomCheckinStrategy FollowGenerationStrategy <|.. RandomFollowStrategy FollowGenerationStrategy <|.. ActivityWeightedFollowStrategy EnrichmentService <|.. WikipediaService -WebClient <|.. CURLWebClient +WebClient <|.. HttpWebClient DataGenerator <|.. MockGenerator DataGenerator <|.. LlamaGenerator PromptFormatter <|.. 
Gemma4JinjaPromptFormatter diff --git a/tooling/pipeline/.dockerignore b/tooling/pipeline/.dockerignore new file mode 100644 index 0000000..5a9e811 --- /dev/null +++ b/tooling/pipeline/.dockerignore @@ -0,0 +1,9 @@ +build/ +cmake-build-debug/ +.git/ +.idea/ +**/*.sqlite +**/*.log +**/*.sqlite3 +**/*.db + diff --git a/tooling/pipeline/CMakeLists.txt b/tooling/pipeline/CMakeLists.txt index 0a569b3..9aea17d 100644 --- a/tooling/pipeline/CMakeLists.txt +++ b/tooling/pipeline/CMakeLists.txt @@ -1,41 +1,45 @@ cmake_minimum_required(VERSION 3.31) project(biergarten-pipeline) +# Set policy to allow FetchContent_Populate for header-only libraries +# that have outdated CMakeLists.txt files +cmake_policy(SET CMP0169 OLD) + # 1. Build Options option(BIERGARTEN_MOCK_ONLY "Build with mock data generators only — skips llama.cpp" OFF) -if (BIERGARTEN_MOCK_ONLY) - message(STATUS "[biergarten] MOCK_ONLY build — llama.cpp will not be compiled.") -endif () +if(BIERGARTEN_MOCK_ONLY) + message(STATUS "[biergarten] MOCK_ONLY build — llama.cpp will not be compiled.") +endif() # 2. Platform & GPU Detection -if (NOT UNIX) - message(FATAL_ERROR "[biergarten] Windows is not supported. Please use Linux (Fedora 43) or macOS (M1 Pro).") -endif () +if(NOT UNIX) + message(FATAL_ERROR "[biergarten] Windows is not supported. 
Please use Linux (Fedora 43) or macOS (M1 Pro).") +endif() -if (APPLE) - if (CMAKE_SYSTEM_PROCESSOR MATCHES "arm64") - message(STATUS "[biergarten] Apple Silicon detected — enabling Metal acceleration.") - set(GGML_METAL ON CACHE BOOL "Enable Metal for Apple Silicon" FORCE) - else () - message(STATUS "[biergarten] Intel Mac detected — using CPU / Accelerate framework.") - set(GGML_METAL OFF CACHE BOOL "Disable Metal for Intel Macs" FORCE) - endif () -else () - find_package(CUDAToolkit QUIET) - find_package(hip CONFIG QUIET) +if(APPLE) + if(CMAKE_SYSTEM_PROCESSOR MATCHES "arm64") + message(STATUS "[biergarten] Apple Silicon detected — enabling Metal acceleration.") + set(GGML_METAL ON CACHE BOOL "Enable Metal for Apple Silicon" FORCE) + else() + message(STATUS "[biergarten] Intel Mac detected — using CPU / Accelerate framework.") + set(GGML_METAL OFF CACHE BOOL "Disable Metal for Intel Macs" FORCE) + endif() +else() + find_package(CUDAToolkit QUIET) + find_package(hip CONFIG QUIET) - if (CUDAToolkit_FOUND) - message(STATUS "[biergarten] NVIDIA GPU detected — enabling CUDA acceleration.") - set(GGML_CUDA ON CACHE BOOL "Enable CUDA for NVIDIA GPUs" FORCE) - set(CMAKE_CUDA_ARCHITECTURES native) - elseif (hip_FOUND OR DEFINED ENV{ROCM_PATH} OR EXISTS "/opt/rocm") - message(STATUS "[biergarten] AMD GPU detected — enabling HIP/ROCm acceleration.") - set(GGML_HIPBLAS ON CACHE BOOL "Enable HIP for AMD GPUs" FORCE) - else () - message(STATUS "[biergarten] No NVIDIA or AMD GPU found — falling back to CPU.") - endif () -endif () + if(CUDAToolkit_FOUND) + message(STATUS "[biergarten] NVIDIA GPU detected — enabling CUDA acceleration.") + set(GGML_CUDA ON CACHE BOOL "Enable CUDA for NVIDIA GPUs" FORCE) + set(CMAKE_CUDA_ARCHITECTURES native) + elseif(hip_FOUND OR DEFINED ENV{ROCM_PATH} OR EXISTS "/opt/rocm") + message(STATUS "[biergarten] AMD GPU detected — enabling HIP/ROCm acceleration.") + set(GGML_HIPBLAS ON CACHE BOOL "Enable HIP for AMD GPUs" FORCE) + else() + message(STATUS 
"[biergarten] No NVIDIA or AMD GPU found — falling back to CPU.") + endif() +endif() # 3. Project-wide Settings set(CMAKE_CXX_STANDARD 20) @@ -51,16 +55,23 @@ include(FetchContent) find_package(Boost REQUIRED COMPONENTS json program_options) # Boost.DI (unofficial Boost extension, must declare separately from main Boost dependency) +# Header-only library, so we only fetch without invoking its CMakeLists.txt FetchContent_Declare( boost-di GIT_REPOSITORY https://github.com/boost-ext/di.git GIT_TAG v1.3.0 + GIT_SHALLOW TRUE ) -FetchContent_MakeAvailable(boost-di) -if (TARGET Boost.DI AND NOT TARGET boost::di) - add_library(boost::di ALIAS Boost.DI) -endif () +FetchContent_GetProperties(boost-di) +if(NOT boost-di_POPULATED) + FetchContent_Populate(boost-di) +endif() +add_library(boost_di INTERFACE) +add_library(boost::di ALIAS boost_di) +target_include_directories(boost_di INTERFACE + $ +) # SQLite amalgamation FetchContent_Declare( sqlite_amalgamation @@ -69,21 +80,38 @@ FetchContent_Declare( EXCLUDE_FROM_ALL ) FetchContent_MakeAvailable(sqlite_amalgamation) -if (NOT TARGET sqlite3) - add_library(sqlite3 STATIC ${sqlite_amalgamation_SOURCE_DIR}/sqlite3.c) - target_include_directories(sqlite3 PUBLIC ${sqlite_amalgamation_SOURCE_DIR}) - target_compile_definitions(sqlite3 PUBLIC SQLITE_THREADSAFE=1) -endif () +if(NOT TARGET sqlite3) + add_library(sqlite3 STATIC ${sqlite_amalgamation_SOURCE_DIR}/sqlite3.c) + target_include_directories(sqlite3 PUBLIC ${sqlite_amalgamation_SOURCE_DIR}) + target_compile_definitions(sqlite3 PUBLIC SQLITE_THREADSAFE=1) +endif() # llama.cpp — skipped for mock-only builds -if (NOT BIERGARTEN_MOCK_ONLY) - FetchContent_Declare( - llama-cpp - GIT_REPOSITORY https://github.com/ggml-org/llama.cpp.git - GIT_TAG b8742 - ) - FetchContent_MakeAvailable(llama-cpp) -endif () +if(NOT BIERGARTEN_MOCK_ONLY) + find_library(LLAMA_LIB NAMES llama) + find_library(GGML_LIB NAMES ggml) + find_library(GGML_BASE_LIB NAMES ggml-base) + find_path(LLAMA_INC_DIR NAMES 
llama.h PATH_SUFFIXES include) + + if(LLAMA_LIB AND GGML_LIB AND GGML_BASE_LIB AND LLAMA_INC_DIR) + message(STATUS "[biergarten] Found system llama.cpp — skipping FetchContent") + + add_library(llama SHARED IMPORTED) + set_target_properties(llama PROPERTIES + IMPORTED_LOCATION "${LLAMA_LIB}" + INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INC_DIR}" + INTERFACE_LINK_LIBRARIES "${GGML_LIB};${GGML_BASE_LIB}" + ) + else() + message(STATUS "[biergarten] System llama.cpp not found — fetching via FetchContent") + FetchContent_Declare( + llama-cpp + GIT_REPOSITORY https://github.com/ggml-org/llama.cpp.git + GIT_TAG b9012 + ) + FetchContent_MakeAvailable(llama-cpp) + endif() +endif() # spdlog FetchContent_Declare( @@ -153,16 +181,16 @@ target_sources(${PROJECT_NAME} PRIVATE ) # --- data_generation: llama (skipped for mock-only builds) --- -if (NOT BIERGARTEN_MOCK_ONLY) - target_sources(${PROJECT_NAME} PRIVATE - src/data_generation/llama/load.cc - src/data_generation/llama/helpers.cc - src/data_generation/llama/generate_brewery.cc - src/data_generation/llama/infer.cc - src/data_generation/llama/llama_generator.cc - src/data_generation/llama/generate_user.cc - ) -endif () +if(NOT BIERGARTEN_MOCK_ONLY) + target_sources(${PROJECT_NAME} PRIVATE + src/data_generation/llama/load.cc + src/data_generation/llama/helpers.cc + src/data_generation/llama/generate_brewery.cc + src/data_generation/llama/infer.cc + src/data_generation/llama/llama_generator.cc + src/data_generation/llama/generate_user.cc + ) +endif() # --- services: wikipedia --- target_sources(${PROJECT_NAME} PRIVATE @@ -189,8 +217,6 @@ target_sources(${PROJECT_NAME} PRIVATE # 6. 
Include Directories, Link Libraries & Compile Definitions target_include_directories(${PROJECT_NAME} PRIVATE includes - $<$>:${llama-cpp_SOURCE_DIR}/include> - $<$>:${llama-cpp_SOURCE_DIR}/common> ) target_link_libraries(${PROJECT_NAME} PRIVATE @@ -225,4 +251,4 @@ add_custom_command(TARGET ${PROJECT_NAME} POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_SOURCE_DIR}/prompts ${CMAKE_BINARY_DIR}/prompts -) \ No newline at end of file +) diff --git a/tooling/pipeline/includes/data_generation/llama_generator.h b/tooling/pipeline/includes/data_generation/llama_generator.h index c3c4447..35c1340 100644 --- a/tooling/pipeline/includes/data_generation/llama_generator.h +++ b/tooling/pipeline/includes/data_generation/llama_generator.h @@ -14,10 +14,10 @@ #include #include +#include "../services/prompting/prompt_directory.h" #include "data_generation/data_generator.h" #include "data_generation/prompt_formatting/prompt_formatter.h" #include "data_model/models.h" -#include "../services/prompting/prompt_directory.h" struct llama_model; struct llama_context; @@ -129,6 +129,7 @@ class LlamaGenerator final : public DataGenerator { uint32_t sampling_top_k_ = kDefaultSamplingTopK; std::mt19937 rng_; uint32_t n_ctx_ = kDefaultContextSize; + int n_gpu_layers_ = 0; std::unique_ptr prompt_formatter_; std::unique_ptr prompt_directory_; }; diff --git a/tooling/pipeline/includes/data_model/models.h b/tooling/pipeline/includes/data_model/models.h index 0b97f52..f08cf41 100644 --- a/tooling/pipeline/includes/data_model/models.h +++ b/tooling/pipeline/includes/data_model/models.h @@ -3,7 +3,8 @@ /** * @file data_model/models.h - * @brief Core data models: locations, application configuration, and generation inputs. + * @brief Core data models: locations, application configuration, and generation + * inputs. */ #include @@ -94,6 +95,9 @@ struct GeneratorOptions { /// @brief Use mocked generator instead of actual LLM inference. 
bool use_mocked = false; + /// @brief Number of layers to offload to GPU. + int n_gpu_layers = 0; + /// @brief Specific sampling parameters for this generator. /// If nullopt, the application should use global defaults. std::optional sampling; diff --git a/tooling/pipeline/runpod/.dockerignore b/tooling/pipeline/runpod/.dockerignore new file mode 100644 index 0000000..9359252 --- /dev/null +++ b/tooling/pipeline/runpod/.dockerignore @@ -0,0 +1,9 @@ +# Ignore model files! +*.gguf +*.bin +models/ +weights/ + +# Ignore local build folders +build/ +.git/ diff --git a/tooling/pipeline/runpod/Dockerfile b/tooling/pipeline/runpod/Dockerfile new file mode 100644 index 0000000..09fa455 --- /dev/null +++ b/tooling/pipeline/runpod/Dockerfile @@ -0,0 +1,72 @@ +# --- Stage 1: Build Environment (The "Heavy" Stage) --- +FROM nvidia/cuda:12.6.3-devel-ubuntu24.04 AS builder + +ENV DEBIAN_FRONTEND=noninteractive \ + CMAKE_GENERATOR=Ninja + +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential ca-certificates curl git libboost-json-dev \ + libboost-program-options-dev libssl-dev ninja-build pkg-config zlib1g-dev \ + && rm -rf /var/lib/apt/lists/* + +# Install modern CMake +RUN curl -L https://github.com/Kitware/CMake/releases/download/v3.31.0/cmake-3.31.0-linux-x86_64.sh -o cmake.sh && \ + sh cmake.sh --skip-license --prefix=/usr/local && rm cmake.sh + +# Get headers for C++ build +RUN curl -L https://github.com/ggml-org/llama.cpp/archive/refs/tags/b9012.tar.gz -o /tmp/llama-src.tar.gz && \ + tar -xzf /tmp/llama-src.tar.gz -C /tmp && \ + cp -r /tmp/llama.cpp-b9012/include/* /usr/local/include/ && \ + cp -r /tmp/llama.cpp-b9012/ggml/include/* /usr/local/include/ + +# Pull llama.cpp binaries to use during build if needed +COPY --from=ghcr.io/ggml-org/llama.cpp:full-cuda /app/lib*.so* /usr/local/lib/ + +WORKDIR /app +COPY . . + +# Build the C++ pipeline +RUN cmake -S . 
-B build -G Ninja -DCMAKE_BUILD_TYPE=Release && \ + cmake --build build -j$(nproc) + +# --- Stage 2: Runtime Environment (The "Slim" Stage) --- +FROM nvidia/cuda:12.6.3-runtime-ubuntu24.04 AS runtime + +# Install only necessary runtime shared libraries +RUN apt-get update && apt-get install -y --no-install-recommends \ + curl \ + ca-certificates \ + libboost-json1.83.0 \ + libboost-program-options1.83.0 \ + libgomp1 \ + libssl3 \ + zlib1g \ + && rm -rf /var/lib/apt/lists/* + +ENV APP_ROOT=/app \ + LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" + +WORKDIR /app/build + +# Copy only the compiled binaries from the builder +COPY --from=builder /app/build/biergarten-pipeline ./ + +# Copy required config files +COPY locations.json /app/build/ +COPY beer-styles.json /app/build/ + +# Copy prompt templates +COPY prompts /app/prompts + +# Copy only the necessary shared libraries from builder/llama-bin +COPY --from=ghcr.io/ggml-org/llama.cpp:full-cuda /app/lib*.so* /usr/local/lib/ + +# Co-locate plugins +RUN cp /usr/local/lib/libggml-cuda.so . 2>/dev/null || true && \ + cp /usr/local/lib/libggml-cpu*.so . 2>/dev/null || true + +# Setup Start Script +COPY ./runpod/start.sh /usr/local/bin/biergarten-start +RUN chmod +x /usr/local/bin/biergarten-start + +ENTRYPOINT ["/usr/local/bin/biergarten-start"] diff --git a/tooling/pipeline/runpod/README.md b/tooling/pipeline/runpod/README.md new file mode 100644 index 0000000..d74c275 --- /dev/null +++ b/tooling/pipeline/runpod/README.md @@ -0,0 +1,8 @@ +```bash +touch runpod/start.sh +docker build \ + --progress=plain \ + -t biergarten-pipeline:latest \ + -f runpod/Dockerfile \ + . 
2>&1 | tee build.log +``` diff --git a/tooling/pipeline/runpod/pod-template.yaml b/tooling/pipeline/runpod/pod-template.yaml new file mode 100644 index 0000000..3b8d5c0 --- /dev/null +++ b/tooling/pipeline/runpod/pod-template.yaml @@ -0,0 +1,22 @@ +name: biergarten-pipeline-live +imageName: biergarten-pipeline:latest +category: NVIDIA +containerDiskInGb: 50 +volumeInGb: 50 +volumeMountPath: /workspace +dockerEntrypoint: + - /usr/local/bin/biergarten-start +dockerStartCmd: [] +isPublic: false +isServerless: false +env: + BIERGARTEN_MODE: live + BIERGARTEN_MODEL_PATH: /workspace/models/google_gemma-4-E4B-it-Q6_K.gguf + BIERGARTEN_PROMPT_DIR: /app/prompts + BIERGARTEN_OUTPUT_DIR: /workspace/output + BIERGARTEN_LOG_PATH: /workspace/logs/pipeline.log + BIERGARTEN_TEMPERATURE: "1.0" + BIERGARTEN_TOP_P: "0.95" + BIERGARTEN_TOP_K: "64" + BIERGARTEN_N_CTX: "8192" + BIERGARTEN_SEED: "-1" diff --git a/tooling/pipeline/runpod/start.sh b/tooling/pipeline/runpod/start.sh new file mode 100644 index 0000000..8deabaa --- /dev/null +++ b/tooling/pipeline/runpod/start.sh @@ -0,0 +1,73 @@ +#!/bin/bash +# Container entrypoint: prepares directories, fetches the model when needed, +# and launches the pipeline. Configured via BIERGARTEN_* environment variables. +set -e + +MODE="${BIERGARTEN_MODE:-live}" +MODEL_PATH="${BIERGARTEN_MODEL_PATH:-/workspace/models/google_gemma-4-E4B-it-Q6_K.gguf}" +OUTPUT_DIR="${BIERGARTEN_OUTPUT_DIR:-/workspace/output}" +LOG_PATH="${BIERGARTEN_LOG_PATH:-/workspace/logs/pipeline.log}" +EXECUTABLE="/app/build/biergarten-pipeline" +PROMPT_DIR="${BIERGARTEN_PROMPT_DIR:-/app/prompts}" + +echo "--- Starting Biergarten Pipeline Environment Check ---" + +# Ensure directories exist +mkdir -p "$OUTPUT_DIR" +mkdir -p "$(dirname "$LOG_PATH")" + +# Mocked mode needs no model, prompts, or GPU, so launch right away +if [ "$MODE" = "mocked" ]; then + echo "--- Executing: $EXECUTABLE --mocked ---" + exec "$EXECUTABLE" --mocked --output "$OUTPUT_DIR" --log-path "$LOG_PATH" +elif [ "$MODE" != "live" ]; then + echo "ERROR: BIERGARTEN_MODE must be 'live' or 'mocked' (got '$MODE')." + exit 1 +fi + +mkdir -p "$(dirname "$MODEL_PATH")" + +# Download model if missing. Fetch into a .part file so an interrupted +# download is resumed instead of being mistaken for a complete model. +if [ ! -f "$MODEL_PATH" ]; then + echo "Model not found. Downloading (this may take a while)..." + + curl -fL -C - \ + -o "$MODEL_PATH.part" \ + "https://huggingface.co/bartowski/google_gemma-4-E4B-it-GGUF/resolve/main/google_gemma-4-E4B-it-Q6_K.gguf?download=true" + mv "$MODEL_PATH.part" "$MODEL_PATH" + + echo "Download complete." +fi + +# Verify model exists +if [ ! 
-f "$MODEL_PATH" ]; then + echo "ERROR: Model still not found after download attempt." + exit 1 +fi + +# Default GPU layers +GL_LAYERS="${BIERGARTEN_GL_LAYERS:-40}" + +# Build args +ARGS=( + "--model" "$MODEL_PATH" + "--prompt-dir" "$PROMPT_DIR" + "--output" "$OUTPUT_DIR" + "--log-path" "$LOG_PATH" + "--n-gpu-layers" "$GL_LAYERS" +) + +# Optional params +[[ -n "$BIERGARTEN_TEMPERATURE" ]] && ARGS+=("--temperature" "$BIERGARTEN_TEMPERATURE") +[[ -n "$BIERGARTEN_TOP_P" ]] && ARGS+=("--top-p" "$BIERGARTEN_TOP_P") +[[ -n "$BIERGARTEN_TOP_K" ]] && ARGS+=("--top-k" "$BIERGARTEN_TOP_K") +[[ -n "$BIERGARTEN_N_CTX" ]] && ARGS+=("--n-ctx" "$BIERGARTEN_N_CTX") +[[ -n "$BIERGARTEN_SEED" ]] && ARGS+=("--seed" "$BIERGARTEN_SEED") + +# Extra args +[[ -n "$BIERGARTEN_EXTRA_ARGS" ]] && ARGS+=($BIERGARTEN_EXTRA_ARGS) + +echo "--- Executing: $EXECUTABLE ${ARGS[*]} ---" + +exec "$EXECUTABLE" "${ARGS[@]}" diff --git a/tooling/pipeline/src/application_options/parse_arguments.cc b/tooling/pipeline/src/application_options/parse_arguments.cc index 64c31c3..b06c1b7 100644 --- a/tooling/pipeline/src/application_options/parse_arguments.cc +++ b/tooling/pipeline/src/application_options/parse_arguments.cc @@ -50,6 +50,8 @@ std::optional ParseArguments(const int argc, char** argv) { opt("prompt-dir", prog_opts::value()->default_value(""), "Directory containing named prompt files (e.g. BREWERY_GENERATION.md)." " Required when not using --mocked."); + opt("n-gpu-layers", prog_opts::value()->default_value(0), + "Number of layers to offload to GPU"); }; add_sampling_options(); @@ -85,6 +87,7 @@ std::optional ParseArguments(const int argc, char** argv) { const bool use_mocked = var_map["mocked"].as(); const std::string model_path = var_map["model"].as(); + const int n_gpu_layers = var_map["n-gpu-layers"].as(); // Enforce mutual exclusivity before any further configuration is applied. 
if (use_mocked && !model_path.empty()) { @@ -110,6 +113,7 @@ std::optional ParseArguments(const int argc, char** argv) { options.generator.use_mocked = use_mocked; options.generator.model_path = model_path; + options.generator.n_gpu_layers = n_gpu_layers; // Only populate sampling config when the user explicitly overrides at // least one value. Leaving it as std::nullopt lets LlamaGenerator fall diff --git a/tooling/pipeline/src/data_generation/llama/llama_generator.cc b/tooling/pipeline/src/data_generation/llama/llama_generator.cc index 646c9bb..72a888e 100644 --- a/tooling/pipeline/src/data_generation/llama/llama_generator.cc +++ b/tooling/pipeline/src/data_generation/llama/llama_generator.cc @@ -89,6 +89,7 @@ LlamaGenerator::LlamaGenerator( } n_ctx_ = sampling.n_ctx; + n_gpu_layers_ = options.generator.n_gpu_layers; this->Load(model_path); } diff --git a/tooling/pipeline/src/data_generation/llama/load.cc b/tooling/pipeline/src/data_generation/llama/load.cc index 8ce3142..0829efb 100644 --- a/tooling/pipeline/src/data_generation/llama/load.cc +++ b/tooling/pipeline/src/data_generation/llama/load.cc @@ -12,6 +12,7 @@ #include #include "data_generation/llama_generator.h" +#include "ggml-backend.h" #include "llama.h" // Maximum batch size for decode operations. Capping the batch prevents @@ -22,7 +23,12 @@ void LlamaGenerator::Load(const std::string& model_path) { context_.reset(); model_.reset(); - const llama_model_params model_params = llama_model_default_params(); + // Specifically load dynamic ggml backends (like CUDA) that are provided + // externally before attempting to load a model. + ggml_backend_load_all(); + + llama_model_params model_params = llama_model_default_params(); + model_params.n_gpu_layers = n_gpu_layers_; LlamaGenerator::ModelHandle loaded_model( llama_model_load_from_file(model_path.c_str(), model_params)); if (!loaded_model) {