From 031be8ad5deddcbaedd6b4351edcde8ebe9c8105 Mon Sep 17 00:00:00 2001 From: Aaron Po Date: Sun, 3 May 2026 13:35:58 -0400 Subject: [PATCH] Pipeline: Remove CURL as a dependency, add new HTTP module (#219) Rationale: HTTP is a supporting concern in the pipeline, used only for Wikipedia enrichment calls. libcurl's C API required significant boilerplate to wrap safely. cpp-httplib is a header-only library that covers the same functionality with far less overhead and no manual resource management. --- tooling/pipeline/CMakeLists.txt | 69 +++++++++------ .../includes/web_client/curl_web_client.h | 54 ------------ .../includes/web_client/http_web_client.h | 49 +++++++++++ tooling/pipeline/src/main.cc | 21 +++-- .../src/web_client/curl_global_state.cc | 19 ---- .../src/web_client/curl_web_client_get.cc | 87 ------------------- .../web_client/curl_web_client_url_encode.cc | 24 ----- .../src/web_client/http_web_client.cc | 61 +++++++++++++ 8 files changed, 167 insertions(+), 217 deletions(-) delete mode 100644 tooling/pipeline/includes/web_client/curl_web_client.h create mode 100644 tooling/pipeline/includes/web_client/http_web_client.h delete mode 100644 tooling/pipeline/src/web_client/curl_global_state.cc delete mode 100644 tooling/pipeline/src/web_client/curl_web_client_get.cc delete mode 100644 tooling/pipeline/src/web_client/curl_web_client_url_encode.cc create mode 100644 tooling/pipeline/src/web_client/http_web_client.cc diff --git a/tooling/pipeline/CMakeLists.txt b/tooling/pipeline/CMakeLists.txt index 8cc373c..0a569b3 100644 --- a/tooling/pipeline/CMakeLists.txt +++ b/tooling/pipeline/CMakeLists.txt @@ -42,16 +42,25 @@ set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3 -march=native -flto") -set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Og -g") +set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Og -g") # 4. 
Dependencies include(FetchContent) -find_package(CURL QUIET) -if (NOT CURL_FOUND) - message(FATAL_ERROR "[biergarten] libcurl not found. Install it (e.g. 'sudo dnf install libcurl-devel').") -endif () + +# Boost (system install — via dnf/brew) find_package(Boost REQUIRED COMPONENTS json program_options) +# Boost.DI (unofficial Boost extension, must declare separately from main Boost dependency) +FetchContent_Declare( + boost-di + GIT_REPOSITORY https://github.com/boost-ext/di.git + GIT_TAG v1.3.0 +) +FetchContent_MakeAvailable(boost-di) +if (TARGET Boost.DI AND NOT TARGET boost::di) + add_library(boost::di ALIAS Boost.DI) +endif () + # SQLite amalgamation FetchContent_Declare( sqlite_amalgamation @@ -76,17 +85,6 @@ if (NOT BIERGARTEN_MOCK_ONLY) FetchContent_MakeAvailable(llama-cpp) endif () -# Boost.DI (unofficial Boost extension, must declare separately from main Boost dependency) -FetchContent_Declare( - boost-di - GIT_REPOSITORY https://github.com/boost-ext/di.git - GIT_TAG v1.3.0 -) -FetchContent_MakeAvailable(boost-di) -if (TARGET Boost.DI AND NOT TARGET boost::di) - add_library(boost::di ALIAS Boost.DI) -endif () - # spdlog FetchContent_Declare( spdlog @@ -95,6 +93,21 @@ FetchContent_Declare( ) FetchContent_MakeAvailable(spdlog) +# cpp-httplib — header-only HTTP/HTTPS client replacing libcurl. +# OpenSSL is required for HTTPS (Wikipedia API). find_package locates +# libssl/libcrypto; HTTPLIB_REQUIRE_OPENSSL causes a hard build failure +# if OpenSSL is absent rather than silently producing an HTTP-only binary. +find_package(OpenSSL REQUIRED) +FetchContent_Declare( + cpp-httplib + GIT_REPOSITORY https://github.com/yhirose/cpp-httplib.git + GIT_TAG v0.43.2 + GIT_SHALLOW TRUE + SYSTEM +) +set(HTTPLIB_REQUIRE_OPENSSL ON CACHE BOOL "Require OpenSSL for cpp-httplib" FORCE) +FetchContent_MakeAvailable(cpp-httplib) + # 5. 
Executable & Sources add_executable(${PROJECT_NAME}) @@ -124,9 +137,7 @@ target_sources(${PROJECT_NAME} PRIVATE # --- web_client --- target_sources(${PROJECT_NAME} PRIVATE - src/web_client/curl_web_client_url_encode.cc - src/web_client/curl_web_client_get.cc - src/web_client/curl_global_state.cc + src/web_client/http_web_client.cc ) # --- data_generation: prompt_formatting --- @@ -175,7 +186,7 @@ target_sources(${PROJECT_NAME} PRIVATE src/services/prompt_directory.cc ) -# 6. Include Directories & Link Libraries +# 6. Include Directories, Link Libraries & Compile Definitions target_include_directories(${PROJECT_NAME} PRIVATE includes $<$>:${llama-cpp_SOURCE_DIR}/include> @@ -189,12 +200,20 @@ target_link_libraries(${PROJECT_NAME} PRIVATE Boost::program_options spdlog::spdlog sqlite3 - CURL::libcurl + httplib::httplib + OpenSSL::SSL + OpenSSL::Crypto ) -if (BIERGARTEN_MOCK_ONLY) - target_compile_definitions(${PROJECT_NAME} PRIVATE BIERGARTEN_MOCK_ONLY) -endif () +target_compile_definitions(${PROJECT_NAME} PRIVATE + # Defined when -DBIERGARTEN_MOCK_ONLY=ON — skips llama.cpp entirely. + # Use #ifdef BIERGARTEN_MOCK_ONLY in source to guard llama-specific code. + $<$:BIERGARTEN_MOCK_ONLY> + + # Defined for Debug configuration builds. + # Use #ifdef DEBUG in source to enable debug-only behaviour (e.g. verbose logging). + $<$:DEBUG> +) # 7. 
Runtime Assets configure_file( @@ -206,4 +225,4 @@ add_custom_command(TARGET ${PROJECT_NAME} POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_SOURCE_DIR}/prompts ${CMAKE_BINARY_DIR}/prompts -) +) \ No newline at end of file diff --git a/tooling/pipeline/includes/web_client/curl_web_client.h b/tooling/pipeline/includes/web_client/curl_web_client.h deleted file mode 100644 index 9ffca2c..0000000 --- a/tooling/pipeline/includes/web_client/curl_web_client.h +++ /dev/null @@ -1,54 +0,0 @@ -#ifndef BIERGARTEN_PIPELINE_INCLUDES_WEB_CLIENT_CURL_WEB_CLIENT_H_ -#define BIERGARTEN_PIPELINE_INCLUDES_WEB_CLIENT_CURL_WEB_CLIENT_H_ - -/** - * @file web_client/curl_web_client.h - * @brief libcurl-based WebClient implementation. - */ - -#include "web_client/web_client.h" - -/** - * @brief RAII wrapper for curl_global_init and curl_global_cleanup. - * - * Create one instance in application startup before using libcurl and keep it - * alive for application lifetime. - */ -class CurlGlobalState { - public: - /// @brief Initializes global libcurl state. - CurlGlobalState(); - - /// @brief Cleans up global libcurl state. - ~CurlGlobalState(); - - /// @brief Non-copyable type. - CurlGlobalState(const CurlGlobalState&) = delete; - - /// @brief Non-copyable type. - CurlGlobalState& operator=(const CurlGlobalState&) = delete; -}; - -/** - * @brief WebClient implementation backed by libcurl. - */ -class CURLWebClient : public WebClient { - public: - /** - * @brief Executes an HTTP GET request. - * - * @param url Request URL. - * @return Response body. - */ - std::string Get(const std::string& url) override; - - /** - * @brief URL-encodes a string value. - * - * @param value Raw value. - * @return URL-encoded string. 
-   */
-  std::string UrlEncode(const std::string& value) override;
-};
-
-#endif  // BIERGARTEN_PIPELINE_INCLUDES_WEB_CLIENT_CURL_WEB_CLIENT_H_
diff --git a/tooling/pipeline/includes/web_client/http_web_client.h b/tooling/pipeline/includes/web_client/http_web_client.h
new file mode 100644
index 0000000..38bedca
--- /dev/null
+++ b/tooling/pipeline/includes/web_client/http_web_client.h
@@ -0,0 +1,50 @@
+/**
+ * @file web_client/http_web_client.h
+ * @brief cpp-httplib implementation of the WebClient interface.
+ */
+
+#ifndef BIERGARTEN_PIPELINE_INCLUDES_WEB_CLIENT_HTTP_WEB_CLIENT_H_
+#define BIERGARTEN_PIPELINE_INCLUDES_WEB_CLIENT_HTTP_WEB_CLIENT_H_
+
+#include "web_client/web_client.h"
+
+#include <string>
+
+/**
+ * @brief WebClient implementation backed by cpp-httplib.
+ *
+ * Supports HTTP and HTTPS (requires OpenSSL; see HTTPLIB_REQUIRE_OPENSSL
+ * in CMakeLists.txt).
+ *
+ * URL parsing splits a full URL into origin (scheme://host[:port]) and
+ * path + query so that httplib::Client can be constructed correctly.
+ * A new client instance is created per request because the client is
+ * bound to a single origin at construction time.
+ */
+class HttpWebClient final : public WebClient {
+public:
+  HttpWebClient() = default;
+  ~HttpWebClient() override = default;
+
+  /**
+   * @brief Executes a blocking HTTP/HTTPS GET request against a full URL.
+   *
+   * @param url Fully-qualified URL, e.g.
+   *            "https://en.wikipedia.org/api/rest_v1/page/summary/Berlin"
+   * @return Response body on HTTP 2xx.
+   * @throws std::invalid_argument if @p url is malformed.
+   * @throws std::runtime_error on transport failure or a non-2xx status.
+   */
+  std::string Get(const std::string& url) override;
+
+  /**
+   * @brief Percent-encodes a single URI component (query parameter value or
+   * path segment). Delegates to httplib::encode_uri_component().
+   *
+   * @param value Raw string to encode.
+   * @return Percent-encoded string safe for use in a URL.
+   */
+  std::string UrlEncode(const std::string& value) override;
+};
+
+#endif  // BIERGARTEN_PIPELINE_INCLUDES_WEB_CLIENT_HTTP_WEB_CLIENT_H_
diff --git a/tooling/pipeline/src/main.cc b/tooling/pipeline/src/main.cc
index e457d54..924d5b5 100644
--- a/tooling/pipeline/src/main.cc
+++ b/tooling/pipeline/src/main.cc
@@ -8,12 +8,11 @@
 
 #include
 #include
-#include
-#include
+
 #include
 #include
 #include
-#include
+
 #include
 
 #include "biergarten_data_generator.h"
@@ -29,16 +28,21 @@
 #include "services/timer.h"
 #include "services/wikipedia_service.h"
-#include "web_client/curl_web_client.h"
+#include "web_client/http_web_client.h"
 
 namespace di = boost::di;
 
 int main(const int argc, char** argv) {
   try {
     Timer timer;
-    const CurlGlobalState curl_state;
-    const LlamaBackendState llama_backend_state;
 
     spdlog::set_pattern("[%Y-%m-%d %H:%M:%S.%e] [%^%l%$] %v");
+#ifndef BIERGARTEN_MOCK_ONLY
+    const LlamaBackendState llama_backend_state;
+#endif
+#ifdef DEBUG
+    spdlog::set_level(spdlog::level::debug);
+#endif
+
     const auto parsed_options = ParseArguments(argc, argv);
     if (!parsed_options.has_value()) {
       return 0;
@@ -61,7 +65,7 @@ int main(const int argc, char** argv) {
     }
 
     const auto injector = di::make_injector(
-        di::bind().to(),
+        di::bind().to(),
         di::bind().to(options),
         di::bind().to(),
         di::bind().to(),
@@ -69,7 +73,7 @@ int main(const int argc, char** argv) {
         di::bind().to(model_path),
         di::bind().to(
             [options, model_path, sampling, &prompt_directory](
-                const auto& inj) -> std::unique_ptr {
+                const auto& inj) -> std::unique_ptr {
               if (options.generator.use_mocked) {
                 spdlog::info(
                     "[Generator] Using MockGenerator (no model path provided)");
diff --git a/tooling/pipeline/src/web_client/curl_global_state.cc b/tooling/pipeline/src/web_client/curl_global_state.cc
deleted file mode 100644
index da7847f..0000000
--- a/tooling/pipeline/src/web_client/curl_global_state.cc
+++ 
/dev/null @@ -1,19 +0,0 @@ -/** - * @file web_client/curl_global_state.cc - * @brief CurlGlobalState constructor and destructor implementation. - */ - -#include - -#include - -#include "web_client/curl_web_client.h" - -CurlGlobalState::CurlGlobalState() { - if (curl_global_init(CURL_GLOBAL_DEFAULT) != CURLE_OK) { - throw std::runtime_error( - "[CURLWebClient] Failed to initialize libcurl globally"); - } -} - -CurlGlobalState::~CurlGlobalState() { curl_global_cleanup(); } diff --git a/tooling/pipeline/src/web_client/curl_web_client_get.cc b/tooling/pipeline/src/web_client/curl_web_client_get.cc deleted file mode 100644 index 0a8473e..0000000 --- a/tooling/pipeline/src/web_client/curl_web_client_get.cc +++ /dev/null @@ -1,87 +0,0 @@ -/** - * @file web_client/curl_web_client_get.cc - * @brief CURLWebClient::Get() implementation. - */ - -#include - -#include -#include -#include -#include -#include - -#include "web_client/curl_web_client.h" - -using CurlHandle = std::unique_ptr; - -static constexpr long kConnectionTimeout = 10; -static constexpr long kRequestTimeout = 30; -static constexpr long kMaxRedirects = 5; -static constexpr int32_t kOkHttpStatus = 200; - -static CurlHandle CreateHandle() { - CURL* handle = curl_easy_init(); - if (handle == nullptr) { - throw std::runtime_error( - "[CURLWebClient] Failed to initialize libcurl handle"); - } - return {handle, &curl_easy_cleanup}; -} - -static void SetCommonGetOptions(CURL* curl, const std::string& url) { - curl_easy_setopt(curl, CURLOPT_URL, url.c_str()); - curl_easy_setopt(curl, CURLOPT_USERAGENT, "biergarten-pipeline/0.1.0"); - curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); - curl_easy_setopt(curl, CURLOPT_MAXREDIRS, kMaxRedirects); - curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, kConnectionTimeout); - curl_easy_setopt(curl, CURLOPT_TIMEOUT, kRequestTimeout); - curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, "gzip"); -} - -// curl write callback that appends response data into a std::string -static size_t 
WriteCallbackString(void* contents, const size_t size, - const size_t nmemb, void* userp) { - const size_t real_size = size * nmemb; - auto* str = static_cast(userp); - str->append(static_cast(contents), real_size); - return real_size; -} - -std::string CURLWebClient::Get(const std::string& url) { - const CurlHandle curl = CreateHandle(); - - std::string response_string; - - SetCommonGetOptions(curl.get(), url); - - curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, WriteCallbackString); - curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &response_string); - - CURLcode curl_result = curl_easy_perform(curl.get()); - - if (curl_result != CURLE_OK) { - const auto error = std::string("[CURLWebClient] GET failed: ") + - curl_easy_strerror(curl_result); - throw std::runtime_error(error); - } - - long curl_http_code = 0; - curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &curl_http_code); - - if (curl_http_code < std::numeric_limits::min() || - curl_http_code > std::numeric_limits::max()) { - throw std::runtime_error("[CURLWebClient] Invalid HTTP status code: " + - std::to_string(curl_http_code)); - } - - const int32_t http_code = static_cast(curl_http_code); - - if (http_code != kOkHttpStatus) { - const std::string error = "[CURLWebClient] HTTP error " + - std::to_string(http_code) + " for URL " + url; - throw std::runtime_error(error); - } - - return response_string; -} diff --git a/tooling/pipeline/src/web_client/curl_web_client_url_encode.cc b/tooling/pipeline/src/web_client/curl_web_client_url_encode.cc deleted file mode 100644 index 1fb6297..0000000 --- a/tooling/pipeline/src/web_client/curl_web_client_url_encode.cc +++ /dev/null @@ -1,24 +0,0 @@ -/** - * @file web_client/curl_web_client_url_encode.cc - * @brief CURLWebClient::UrlEncode() implementation. 
- */
-
-#include
-
-#include
-#include
-
-#include "web_client/curl_web_client.h"
-
-std::string CURLWebClient::UrlEncode(const std::string& value) {
-  // A NULL handle is fine for UTF-8 encoding according to libcurl docs.
-  char* output = curl_easy_escape(nullptr, value.c_str(), 0);
-
-  if (!output) {
-    throw std::runtime_error("[CURLWebClient] curl_easy_escape failed");
-  }
-
-  std::string result(output);
-  curl_free(output);
-  return result;
-}
diff --git a/tooling/pipeline/src/web_client/http_web_client.cc b/tooling/pipeline/src/web_client/http_web_client.cc
new file mode 100644
index 0000000..aba30cf
--- /dev/null
+++ b/tooling/pipeline/src/web_client/http_web_client.cc
@@ -0,0 +1,89 @@
+/**
+ * @file web_client/http_web_client.cc
+ * @brief cpp-httplib implementation of WebClient.
+ */
+
+#include "web_client/http_web_client.h"
+
+#include <httplib.h>
+
+#include <ctime>
+#include <regex>
+#include <stdexcept>
+#include <string>
+#include <utility>
+
+namespace {
+constexpr time_t kConnectionTimeoutSeconds = 5;
+constexpr time_t kReadTimeoutSeconds = 10;
+
+constexpr int kSuccessMin = 200;
+constexpr int kSuccessMax = 300;
+
+// Identifies this tool to the server on every request. Wikimedia's
+// User-Agent policy asks API clients to send a descriptive value rather than
+// a generic library default.
+constexpr char kUserAgent[] = "biergarten-pipeline/0.1.0";
+
+// Group 1: origin (scheme://host[:port]).
+// Group 2: everything between the origin and an optional "#fragment" -- a
+// path, a bare query ("?a=b"), both, or nothing at all.
+// The fragment is matched but never captured: it is client-side only and must
+// not appear in the request target.
+const std::regex kUrlRegex(R"(^(https?://[^/?#]+)([^#]*)(?:#.*)?$)");
+
+/**
+ * @brief Splits a full URL into the pieces httplib::Client needs.
+ *
+ * @param url Absolute http:// or https:// URL.
+ * @return {origin, request target}; the target always starts with '/'.
+ * @throws std::invalid_argument if @p url is not an http(s) URL with a host.
+ */
+std::pair<std::string, std::string> SplitUrl(const std::string& url) {
+  std::smatch match;
+  if (!std::regex_match(url, match, kUrlRegex)) {
+    throw std::invalid_argument("[HttpWebClient] Malformed URL: " + url);
+  }
+
+  // "https://host" and "https://host?a=b" are valid URLs with an empty path;
+  // the request line still needs a leading '/'.
+  std::string target = match[2].str();
+  if (target.empty() || target.front() != '/') {
+    target.insert(target.begin(), '/');
+  }
+
+  return {match[1].str(), std::move(target)};
+}
+}  // namespace
+
+std::string HttpWebClient::Get(const std::string& url) {
+  const auto [origin, path] = SplitUrl(url);
+
+  // The client is bound to one origin, so it is built per request.
+  httplib::Client client(origin);
+  client.set_follow_location(true);
+  client.set_connection_timeout(kConnectionTimeoutSeconds);
+  client.set_read_timeout(kReadTimeoutSeconds);
+
+  const httplib::Headers headers{{"User-Agent", kUserAgent}};
+  const auto result = client.Get(path, headers);
+
+  // A falsy result means no HTTP response was received at all.
+  if (!result) {
+    throw std::runtime_error(
+        "[HttpWebClient] Request failed for URL: " + url +
+        " — " + httplib::to_string(result.error()));
+  }
+
+  if (result->status < kSuccessMin || result->status >= kSuccessMax) {
+    throw std::runtime_error(
+        "[HttpWebClient] HTTP " + std::to_string(result->status) +
+        " for URL: " + url);
+  }
+
+  return result->body;
+}
+
+std::string HttpWebClient::UrlEncode(const std::string& value) {
+  return httplib::encode_uri_component(value);
+}