From 6eaa184eaaa17e94567bb84e04586da15b64f317 Mon Sep 17 00:00:00 2001 From: Aaron Po Date: Mon, 4 May 2026 15:44:32 -0400 Subject: [PATCH] updates --- .../data_generation/llama_generator.h | 3 +- tooling/pipeline/includes/data_model/models.h | 6 +- tooling/pipeline/runpod/Dockerfile | 24 ++++--- tooling/pipeline/runpod/README.md | 70 ++----------------- tooling/pipeline/runpod/pod-template.yaml | 37 +++------- tooling/pipeline/runpod/start.sh | 36 +++++----- .../application_options/parse_arguments.cc | 4 ++ .../data_generation/llama/llama_generator.cc | 1 + .../src/data_generation/llama/load.cc | 3 +- 9 files changed, 62 insertions(+), 122 deletions(-) diff --git a/tooling/pipeline/includes/data_generation/llama_generator.h b/tooling/pipeline/includes/data_generation/llama_generator.h index c3c4447..35c1340 100644 --- a/tooling/pipeline/includes/data_generation/llama_generator.h +++ b/tooling/pipeline/includes/data_generation/llama_generator.h @@ -14,10 +14,10 @@ #include #include +#include "../services/prompting/prompt_directory.h" #include "data_generation/data_generator.h" #include "data_generation/prompt_formatting/prompt_formatter.h" #include "data_model/models.h" -#include "../services/prompting/prompt_directory.h" struct llama_model; struct llama_context; @@ -129,6 +129,7 @@ class LlamaGenerator final : public DataGenerator { uint32_t sampling_top_k_ = kDefaultSamplingTopK; std::mt19937 rng_; uint32_t n_ctx_ = kDefaultContextSize; + int n_gpu_layers_ = 0; std::unique_ptr prompt_formatter_; std::unique_ptr prompt_directory_; }; diff --git a/tooling/pipeline/includes/data_model/models.h b/tooling/pipeline/includes/data_model/models.h index 0b97f52..f08cf41 100644 --- a/tooling/pipeline/includes/data_model/models.h +++ b/tooling/pipeline/includes/data_model/models.h @@ -3,7 +3,8 @@ /** * @file data_model/models.h - * @brief Core data models: locations, application configuration, and generation inputs. + * @brief Core data models: locations, application configuration, and generation + * inputs. */ #include @@ -94,6 +95,9 @@ struct GeneratorOptions { /// @brief Use mocked generator instead of actual LLM inference. bool use_mocked = false; + /// @brief Number of layers to offload to GPU. + int n_gpu_layers = 0; + /// @brief Specific sampling parameters for this generator. /// If nullopt, the application should use global defaults. std::optional sampling; diff --git a/tooling/pipeline/runpod/Dockerfile b/tooling/pipeline/runpod/Dockerfile index c167524..587e8bc 100644 --- a/tooling/pipeline/runpod/Dockerfile +++ b/tooling/pipeline/runpod/Dockerfile @@ -26,15 +26,10 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ RUN curl -L https://github.com/Kitware/CMake/releases/download/v3.31.0/cmake-3.31.0-linux-x86_64.sh -o cmake.sh && \ sh cmake.sh --skip-license --prefix=/usr/local && rm cmake.sh -# Copy and link backends +# Copy backends to /usr/local/lib and register with ldconfig so the +# runtime linker can resolve libllama.so, libggml.so, libggml-base.so etc. COPY --from=llama-bin /app/lib*.so* /usr/local/lib/ -RUN ldconfig && \ - find /usr/local/lib -name "libggml-cuda.so*" -exec ln -s {} /usr/local/lib/libggml-cuda.so \; 2>/dev/null || true && \ - find /usr/local/lib -name "libggml-cpu.so*" -exec ln -s {} /usr/local/lib/libggml-cpu.so \; 2>/dev/null || true - -# Set Environment for the loader -ENV GGML_BACKEND_PATH="/usr/local/lib" -ENV LD_LIBRARY_PATH="/usr/local/lib:$LD_LIBRARY_PATH" +RUN ldconfig # Headers for C++ Build RUN git clone --depth 1 -b b9012 https://github.com/ggml-org/llama.cpp.git /tmp/llama-src && \ @@ -42,6 +37,8 @@ RUN git clone --depth 1 -b b9012 https://github.com/ggml-org/llama.cpp.git /tmp/ cp -r /tmp/llama-src/ggml/include/* /usr/local/include/ && \ rm -rf /tmp/llama-src +ENV LD_LIBRARY_PATH="/usr/local/lib:${LD_LIBRARY_PATH}" + WORKDIR /workspace/app COPY . . @@ -49,6 +46,17 @@ COPY . . RUN cmake -S . -B build -G Ninja -DCMAKE_BUILD_TYPE=Release && \ cmake --build build -j$(nproc) +# Co-locate GGML backend plugins with the executable. +# ggml_backend_load_all() searches the executable directory first when +# GGML_BACKEND_DIR is not set. Copying the ggml-*.so plugin files here +# ensures the loader finds them without any environment variable. +# libllama.so, libggml.so, and libggml-base.so are NOT copied here — +# those are proper shared libraries resolved via ldconfig/LD_LIBRARY_PATH. +RUN cp /usr/local/lib/libggml-cuda.so /workspace/app/build/ 2>/dev/null || true && \ + cp /usr/local/lib/libggml-cpu*.so /workspace/app/build/ 2>/dev/null || true && \ + cp /usr/local/lib/libggml-blas*.so /workspace/app/build/ 2>/dev/null || true && \ + cp /usr/local/lib/libggml-rpc*.so /workspace/app/build/ 2>/dev/null || true + # Setup Start Script COPY runpod/start.sh /usr/local/bin/biergarten-start RUN chmod +x /usr/local/bin/biergarten-start diff --git a/tooling/pipeline/runpod/README.md b/tooling/pipeline/runpod/README.md index 4fa1d9e..d74c275 100644 --- a/tooling/pipeline/runpod/README.md +++ b/tooling/pipeline/runpod/README.md @@ -1,66 +1,8 @@ -# RunPod Pod Template for Biergarten Pipeline - -This folder contains a starter RunPod pod template for the C++ pipeline in the -repository root. - -## What it does - -- Builds `biergarten-pipeline` inside the container. -- Builds the binary on first pod start, then reuses a mode-specific build - directory (`build-mocked/` or `build-live/`). -- Runs from the repository root and lets the launcher switch into the active - build directory after CMake has copied `locations.json` and `prompts/`. -- Supports two runtime modes: - - `BIERGARTEN_MODE=mocked` — fast deterministic generation, no model required. - - `BIERGARTEN_MODE=live` — uses a mounted GGUF model and the prompt files. -- Writes generated SQLite exports and logs to writable volumes. - -## Files - -- `Dockerfile` — GPU-ready build image for the application. -- `start.sh` — runtime launcher that selects mocked or live mode. -- `pod-template.yaml` — starter pod template you can adapt to the exact RunPod - import/export schema. - -## Build the image - ```bash -docker build -t biergarten-pipeline:latest -f runpod/Dockerfile . +touch runpod/start.sh +docker build \ + --progress=plain \ + -t biergarten-pipeline:latest \ + -f runpod/Dockerfile \ + . 2>&1 | tee build.log ``` - -## Run locally in mocked mode - -```bash -docker run --rm \ - --gpus all \ - -e BIERGARTEN_MODE=mocked \ - -v "$PWD/output:/workspace/output" \ - -v "$PWD/logs:/workspace/logs" \ - biergarten-pipeline:latest -``` - -## Run locally in live mode - -Mount your GGUF model at `/workspace/models/google_gemma-4-E4B-it-Q6_K.gguf` -and switch to `BIERGARTEN_MODE=live`. - -```bash -docker run --rm \ - --gpus all \ - -e BIERGARTEN_MODE=live \ - -v "$PWD/models:/workspace/models" \ - -v "$PWD/output:/workspace/output" \ - -v "$PWD/logs:/workspace/logs" \ - biergarten-pipeline:latest -``` - -## Notes for RunPod - -- Use a GPU pod for live inference. -- Mount persistent storage for `/workspace/models`, `/workspace/output`, and - `/workspace/logs`. -- If you only want deterministic seed generation, change the template's - `BIERGARTEN_MODE` to `mocked`. -- The launcher handles the build directory automatically; CMake still copies - `locations.json` and `prompts/` into the active build tree before execution. - diff --git a/tooling/pipeline/runpod/pod-template.yaml b/tooling/pipeline/runpod/pod-template.yaml index 261dd6a..3b8d5c0 100644 --- a/tooling/pipeline/runpod/pod-template.yaml +++ b/tooling/pipeline/runpod/pod-template.yaml @@ -1,24 +1,15 @@ -# Biergarten Pipeline — RunPod pod template -# -# This template is meant to be imported into RunPod or adapted to the exact -# schema used by your account/export format. It intentionally keeps the runtime -# contract simple: -# - the container boots into /workspace/app/build -# - prompts are available from build/prompts -# - generated SQLite exports and logs go to writable volumes -# - mocked mode works without a model file -# - live mode can be enabled by setting BIERGARTEN_MODE=live and mounting a GGUF model - name: biergarten-pipeline-live -image: biergarten-pipeline:latest -workingDir: /workspace/app -entrypoint: +imageName: biergarten-pipeline:latest +category: NVIDIA +containerDiskInGb: 50 +volumeInGb: 50 +volumeMountPath: /workspace +dockerEntrypoint: - /usr/local/bin/biergarten-start -resources: - gpu: 1 - containerDiskInGb: 50 - volumeInGb: 50 -environment: +dockerStartCmd: [] +isPublic: false +isServerless: false +env: BIERGARTEN_MODE: live BIERGARTEN_MODEL_PATH: /workspace/models/google_gemma-4-E4B-it-Q6_K.gguf BIERGARTEN_PROMPT_DIR: /workspace/app/build/prompts @@ -29,11 +20,3 @@ environment: BIERGARTEN_TOP_K: "64" BIERGARTEN_N_CTX: "8192" BIERGARTEN_SEED: "-1" -volumes: - - name: models - mountPath: /workspace/models - - name: output - mountPath: /workspace/output - - name: logs - mountPath: /workspace/logs - diff --git a/tooling/pipeline/runpod/start.sh b/tooling/pipeline/runpod/start.sh index e23b20c..46b8c04 100644 --- a/tooling/pipeline/runpod/start.sh +++ b/tooling/pipeline/runpod/start.sh @@ -10,44 +10,40 @@ PROMPT_DIR="/workspace/app/build/prompts" echo "--- Starting Biergarten Pipeline Environment Check ---" -# 1. Ensure Volume Mounts exist +# 1. Ensure volume mount directories exist mkdir -p "$OUTPUT_DIR" mkdir -p "$(dirname "$LOG_PATH")" -# 2. Check for Model +# 2. Check for model file if [ ! -f "$MODEL_PATH" ]; then echo "ERROR: Model not found at $MODEL_PATH" echo "Current /workspace/models contents:" - ls -lh /workspace/models + ls -lh /workspace/models 2>/dev/null || echo "(directory does not exist)" exit 1 fi -# 3. Check for Backends (Diagnostic) -echo "Loading backends from: $GGML_BACKEND_PATH" -ls -l /usr/local/lib/libggml* - -# 4. Build the command arguments +# 3. Build the command arguments ARGS=( - "--model" "$MODEL_PATH" + "--model" "$MODEL_PATH" "--prompt-dir" "$PROMPT_DIR" - "--output" "$OUTPUT_DIR" - "--log-path" "$LOG_PATH" + "--output" "$OUTPUT_DIR" + "--log-path" "$LOG_PATH" ) -# Optional Hyperparameters -[[ -n "$BIERGARTEN_TEMPERATURE" ]] && ARGS+=("--temperature" "$BIERGARTEN_TEMPERATURE") -[[ -n "$BIERGARTEN_TOP_P" ]] && ARGS+=("--top-p" "$BIERGARTEN_TOP_P") -[[ -n "$BIERGARTEN_TOP_K" ]] && ARGS+=("--top-k" "$BIERGARTEN_TOP_K") -[[ -n "$BIERGARTEN_N_CTX" ]] && ARGS+=("--n-ctx" "$BIERGARTEN_N_CTX") -[[ -n "$BIERGARTEN_SEED" ]] && ARGS+=("--seed" "$BIERGARTEN_SEED") +# Optional hyperparameters +[[ -n "$BIERGARTEN_TEMPERATURE" ]] && ARGS+=("--temperature" "$BIERGARTEN_TEMPERATURE") +[[ -n "$BIERGARTEN_TOP_P" ]] && ARGS+=("--top-p" "$BIERGARTEN_TOP_P") +[[ -n "$BIERGARTEN_TOP_K" ]] && ARGS+=("--top-k" "$BIERGARTEN_TOP_K") +[[ -n "$BIERGARTEN_N_CTX" ]] && ARGS+=("--n-ctx" "$BIERGARTEN_N_CTX") +[[ -n "$BIERGARTEN_SEED" ]] && ARGS+=("--seed" "$BIERGARTEN_SEED") [[ -n "$BIERGARTEN_GL_LAYERS" ]] && ARGS+=("--n-gpu-layers" "$BIERGARTEN_GL_LAYERS") -# Append extra custom args +# Append any extra custom args if [[ -n "$BIERGARTEN_EXTRA_ARGS" ]]; then ARGS+=($BIERGARTEN_EXTRA_ARGS) fi -echo "--- Executing: $EXECUTABLE ${ARGS[@]} ---" +echo "--- Executing: $EXECUTABLE ${ARGS[*]} ---" -# Execute the binary directly (replaces shell process) +# Execute the binary directly, replacing the shell process exec "$EXECUTABLE" "${ARGS[@]}" diff --git a/tooling/pipeline/src/application_options/parse_arguments.cc b/tooling/pipeline/src/application_options/parse_arguments.cc index 64c31c3..b06c1b7 100644 --- a/tooling/pipeline/src/application_options/parse_arguments.cc +++ b/tooling/pipeline/src/application_options/parse_arguments.cc @@ -50,6 +50,8 @@ std::optional ParseArguments(const int argc, char** argv) { opt("prompt-dir", prog_opts::value()->default_value(""), "Directory containing named prompt files (e.g. BREWERY_GENERATION.md)." " Required when not using --mocked."); + opt("n-gpu-layers", prog_opts::value()->default_value(0), + "Number of layers to offload to GPU"); }; add_sampling_options(); @@ -85,6 +87,7 @@ std::optional ParseArguments(const int argc, char** argv) { const bool use_mocked = var_map["mocked"].as(); const std::string model_path = var_map["model"].as(); + const int n_gpu_layers = var_map["n-gpu-layers"].as(); // Enforce mutual exclusivity before any further configuration is applied. if (use_mocked && !model_path.empty()) { @@ -110,6 +113,7 @@ std::optional ParseArguments(const int argc, char** argv) { options.generator.use_mocked = use_mocked; options.generator.model_path = model_path; + options.generator.n_gpu_layers = n_gpu_layers; // Only populate sampling config when the user explicitly overrides at // least one value. Leaving it as std::nullopt lets LlamaGenerator fall diff --git a/tooling/pipeline/src/data_generation/llama/llama_generator.cc b/tooling/pipeline/src/data_generation/llama/llama_generator.cc index 646c9bb..72a888e 100644 --- a/tooling/pipeline/src/data_generation/llama/llama_generator.cc +++ b/tooling/pipeline/src/data_generation/llama/llama_generator.cc @@ -89,6 +89,7 @@ LlamaGenerator::LlamaGenerator( } n_ctx_ = sampling.n_ctx; + n_gpu_layers_ = options.generator.n_gpu_layers; this->Load(model_path); } diff --git a/tooling/pipeline/src/data_generation/llama/load.cc b/tooling/pipeline/src/data_generation/llama/load.cc index 64c4127..0829efb 100644 --- a/tooling/pipeline/src/data_generation/llama/load.cc +++ b/tooling/pipeline/src/data_generation/llama/load.cc @@ -27,7 +27,8 @@ void LlamaGenerator::Load(const std::string& model_path) { // externally before attempting to load a model. ggml_backend_load_all(); - const llama_model_params model_params = llama_model_default_params(); + llama_model_params model_params = llama_model_default_params(); + model_params.n_gpu_layers = n_gpu_layers_; LlamaGenerator::ModelHandle loaded_model( llama_model_load_from_file(model_path.c_str(), model_params)); if (!loaded_model) {