From b52d4d5f27257875b0dd3781e96be098a844cb0d Mon Sep 17 00:00:00 2001 From: Aaron Po Date: Sun, 3 May 2026 04:07:25 -0400 Subject: [PATCH] add new http module --- tooling/pipeline/CMakeLists.txt | 34 ++++++++++- .../includes/web_client/http_web_client.h | 49 +++++++++++++++ .../src/web_client/http_web_client.cc | 61 +++++++++++++++++++ 3 files changed, 142 insertions(+), 2 deletions(-) create mode 100644 tooling/pipeline/includes/web_client/http_web_client.h create mode 100644 tooling/pipeline/src/web_client/http_web_client.cc diff --git a/tooling/pipeline/CMakeLists.txt b/tooling/pipeline/CMakeLists.txt index 8cc373c..cf9ecb8 100644 --- a/tooling/pipeline/CMakeLists.txt +++ b/tooling/pipeline/CMakeLists.txt @@ -1,3 +1,4 @@ +# tooling/pipeline/CMakeLists.txt — build script for the biergarten-pipeline project cmake_minimum_required(VERSION 3.31) project(biergarten-pipeline) @@ -46,10 +47,15 @@ set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Og -g") # 4. Dependencies include(FetchContent) + +# DEPRECATED: libcurl — to be removed once all usages are migrated to cpp-httplib. +# Tracked in: src/web_client/curl_web_client_get.cc, src/web_client/curl_web_client_url_encode.cc, +# src/web_client/curl_global_state.cc find_package(CURL QUIET) if (NOT CURL_FOUND) message(FATAL_ERROR "[biergarten] libcurl not found. Install it (e.g. 'sudo dnf install libcurl-devel').") endif () + find_package(Boost REQUIRED COMPONENTS json program_options) # SQLite amalgamation @@ -95,6 +101,22 @@ FetchContent_Declare( ) FetchContent_MakeAvailable(spdlog) +# cpp-httplib — replaces direct libcurl usage in web_client. +# OpenSSL is required for HTTPS (Wikipedia). find_package is called first so +# CMake can locate libssl/libcrypto; cpp-httplib itself is header-only so the +# CPPHTTPLIB_OPENSSL_SUPPORT compile definition is propagated via the httplib::httplib target. 
+find_package(OpenSSL REQUIRED) + +FetchContent_Declare( + cpp-httplib + GIT_REPOSITORY https://github.com/yhirose/cpp-httplib.git + GIT_TAG v0.41.0 + GIT_SHALLOW TRUE + SYSTEM +) +set(HTTPLIB_REQUIRE_OPENSSL ON CACHE BOOL "Require OpenSSL for cpp-httplib" FORCE) +FetchContent_MakeAvailable(cpp-httplib) + # 5. Executable & Sources add_executable(${PROJECT_NAME}) @@ -123,12 +145,17 @@ target_sources(${PROJECT_NAME} PRIVATE ) # --- web_client --- +# DEPRECATED: curl_web_client_* (libcurl) — being replaced by http_web_client.cc (cpp-httplib), added below. target_sources(${PROJECT_NAME} PRIVATE src/web_client/curl_web_client_url_encode.cc src/web_client/curl_web_client_get.cc src/web_client/curl_global_state.cc ) +target_sources(${PROJECT_NAME} PRIVATE + src/web_client/http_web_client.cc +) + # --- data_generation: prompt_formatting --- target_sources(${PROJECT_NAME} PRIVATE src/data_generation/prompt_formatting/gemma4_jinja_prompt_formatter.cc @@ -189,7 +216,10 @@ target_link_libraries(${PROJECT_NAME} PRIVATE Boost::program_options spdlog::spdlog sqlite3 - CURL::libcurl + httplib::httplib + OpenSSL::SSL + OpenSSL::Crypto + CURL::libcurl # DEPRECATED: remove once web_client is migrated to cpp-httplib ) if (BIERGARTEN_MOCK_ONLY) @@ -206,4 +236,4 @@ add_custom_command(TARGET ${PROJECT_NAME} POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_SOURCE_DIR}/prompts ${CMAKE_BINARY_DIR}/prompts -) +) \ No newline at end of file diff --git a/tooling/pipeline/includes/web_client/http_web_client.h b/tooling/pipeline/includes/web_client/http_web_client.h new file mode 100644 index 0000000..b4ae584 --- /dev/null +++ b/tooling/pipeline/includes/web_client/http_web_client.h @@ -0,0 +1,49 @@ +/** +* @file web_client/http_web_client.h +* @brief cpp-httplib implementation of the WebClient interface. 
+*/
+
+#ifndef BIERGARTEN_PIPELINE_INCLUDES_WEB_CLIENT_HTTP_WEB_CLIENT_H_
+#define BIERGARTEN_PIPELINE_INCLUDES_WEB_CLIENT_HTTP_WEB_CLIENT_H_
+
+#include "web_client/web_client.h"
+
+#include <string>
+
+/**
+ * @brief WebClient implementation backed by cpp-httplib.
+ *
+ * Supports HTTP and HTTPS (HTTPS requires OpenSSL; see
+ * HTTPLIB_REQUIRE_OPENSSL in CMakeLists.txt).
+ *
+ * URL parsing splits a full URL into scheme + host and path + query so that
+ * httplib::Client can be constructed correctly. A new client instance is
+ * created per request; this is intentional given the low call volume in the
+ * pipeline (Wikipedia enrichment, near-100 % cache hits).
+ */
+class HttpWebClient final : public WebClient {
+public:
+  HttpWebClient() = default;
+  ~HttpWebClient() override = default;
+
+  /**
+   * @brief Executes a blocking HTTP/HTTPS GET request against a full URL.
+   *
+   * @param url Fully-qualified URL, e.g. "https://en.wikipedia.org/api/rest_v1/page/summary/Berlin"
+   * @return Response body on HTTP 2xx.
+   * @throws std::invalid_argument if @p url is not a well-formed http(s) URL.
+   * @throws std::runtime_error if the request fails or the status is not 2xx.
+   */
+  std::string Get(const std::string& url) override;
+
+  /**
+   * @brief Percent-encodes a single URI component (query parameter value or
+   * path segment). Delegates to httplib::encode_uri_component().
+   *
+   * @param value Raw string to encode.
+   * @return Percent-encoded string safe for use in a URL.
+   */
+  std::string UrlEncode(const std::string& value) override;
+};
+
+#endif  // BIERGARTEN_PIPELINE_INCLUDES_WEB_CLIENT_HTTP_WEB_CLIENT_H_
diff --git a/tooling/pipeline/src/web_client/http_web_client.cc b/tooling/pipeline/src/web_client/http_web_client.cc
new file mode 100644
index 0000000..aba30cf
--- /dev/null
+++ b/tooling/pipeline/src/web_client/http_web_client.cc
@@ -0,0 +1,61 @@
+/**
+ * @file web_client/http_web_client.cc
+ * @brief cpp-httplib implementation of WebClient.
+ */
+
+#include "web_client/http_web_client.h"
+
+#include <httplib.h>
+
+#include <regex>
+#include <stdexcept>
+#include <string>
+#include <utility>
+
+namespace {
+// Timeouts, in seconds, applied to every request.
+constexpr time_t kConnectionTimeoutSeconds = 5;
+constexpr time_t kReadTimeoutSeconds = 10;
+
+// Status codes in the half-open range [kSuccessMin, kSuccessMax) count as success.
+constexpr int kSuccessMin = 200;
+constexpr int kSuccessMax = 300;
+
+// Groups: 1 = scheme://authority, 2 = path, 3 = ?query; a #fragment is matched but never sent.
+const std::regex kUrlRegex(R"(^(https?://[^/?#]+)(/[^?#]*)?(\?[^#]*)?(?:#.*)?$)");
+
+// Splits `url` into {origin, request target}; an empty path becomes "/".
+// Throws std::invalid_argument when `url` is not an absolute http(s) URL.
+std::pair<std::string, std::string> SplitUrl(const std::string& url) {
+  std::smatch match;
+  if (!std::regex_match(url, match, kUrlRegex)) {
+    throw std::invalid_argument("[HttpWebClient] Malformed URL: " + url);
+  }
+  return {match[1].str(), (match[2].matched ? match[2].str() : std::string("/")) + match[3].str()};
+}
+}  // namespace
+
+std::string HttpWebClient::Get(const std::string& url) {
+  const auto [origin, target] = SplitUrl(url);
+
+  // A fresh client per request; redirects are followed automatically.
+  httplib::Client client(origin);
+  client.set_follow_location(true);
+  client.set_connection_timeout(kConnectionTimeoutSeconds);
+  client.set_read_timeout(kReadTimeoutSeconds);
+
+  const auto result = client.Get(target);
+  if (!result) {  // No response object: report the error code httplib recorded.
+    throw std::runtime_error("[HttpWebClient] Request failed for URL: " + url +
+                             " — " + httplib::to_string(result.error()));
+  }
+  if (result->status < kSuccessMin || result->status >= kSuccessMax) {
+    throw std::runtime_error("[HttpWebClient] HTTP " +
+                             std::to_string(result->status) + " for URL: " + url);
+  }
+  return result->body;
+}
+
+std::string HttpWebClient::UrlEncode(const std::string& value) {
+  return httplib::encode_uri_component(value);
+}