2 Commits

Author SHA1 Message Date
Aaron Po
3c47f74fb9 Remove CURL
rationale:

HTTP requests are not a primary concern of the application, so they can be delegated to a lighter-weight library instead of interfacing with libcurl directly.
2026-05-03 13:30:21 -04:00
Aaron Po
3729b9469c add new http module 2026-05-03 13:09:02 -04:00
8 changed files with 167 additions and 217 deletions

View File

@@ -42,16 +42,25 @@ set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3 -march=native -flto") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3 -march=native -flto")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Og -g") set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Og -g")
# 4. Dependencies # 4. Dependencies
include(FetchContent) include(FetchContent)
find_package(CURL QUIET)
if (NOT CURL_FOUND) # Boost (system install — via dnf/brew)
message(FATAL_ERROR "[biergarten] libcurl not found. Install it (e.g. 'sudo dnf install libcurl-devel').")
endif ()
find_package(Boost REQUIRED COMPONENTS json program_options) find_package(Boost REQUIRED COMPONENTS json program_options)
# Boost.DI (unofficial Boost extension, must declare separately from main Boost dependency)
FetchContent_Declare(
boost-di
GIT_REPOSITORY https://github.com/boost-ext/di.git
GIT_TAG v1.3.0
)
FetchContent_MakeAvailable(boost-di)
if (TARGET Boost.DI AND NOT TARGET boost::di)
add_library(boost::di ALIAS Boost.DI)
endif ()
# SQLite amalgamation # SQLite amalgamation
FetchContent_Declare( FetchContent_Declare(
sqlite_amalgamation sqlite_amalgamation
@@ -76,17 +85,6 @@ if (NOT BIERGARTEN_MOCK_ONLY)
FetchContent_MakeAvailable(llama-cpp) FetchContent_MakeAvailable(llama-cpp)
endif () endif ()
# Boost.DI (unofficial Boost extension, must declare separately from main Boost dependency)
FetchContent_Declare(
boost-di
GIT_REPOSITORY https://github.com/boost-ext/di.git
GIT_TAG v1.3.0
)
FetchContent_MakeAvailable(boost-di)
if (TARGET Boost.DI AND NOT TARGET boost::di)
add_library(boost::di ALIAS Boost.DI)
endif ()
# spdlog # spdlog
FetchContent_Declare( FetchContent_Declare(
spdlog spdlog
@@ -95,6 +93,21 @@ FetchContent_Declare(
) )
FetchContent_MakeAvailable(spdlog) FetchContent_MakeAvailable(spdlog)
# cpp-httplib — header-only HTTP/HTTPS client replacing libcurl.
# OpenSSL is required for HTTPS (Wikipedia API). find_package locates
# libssl/libcrypto; HTTPLIB_REQUIRE_OPENSSL causes a hard build failure
# if OpenSSL is absent rather than silently producing an HTTP-only binary.
find_package(OpenSSL REQUIRED)
FetchContent_Declare(
cpp-httplib
GIT_REPOSITORY https://github.com/yhirose/cpp-httplib.git
GIT_TAG v0.43.2
GIT_SHALLOW TRUE
SYSTEM
)
set(HTTPLIB_REQUIRE_OPENSSL ON CACHE BOOL "Require OpenSSL for cpp-httplib" FORCE)
FetchContent_MakeAvailable(cpp-httplib)
# 5. Executable & Sources # 5. Executable & Sources
add_executable(${PROJECT_NAME}) add_executable(${PROJECT_NAME})
@@ -124,9 +137,7 @@ target_sources(${PROJECT_NAME} PRIVATE
# --- web_client --- # --- web_client ---
target_sources(${PROJECT_NAME} PRIVATE target_sources(${PROJECT_NAME} PRIVATE
src/web_client/curl_web_client_url_encode.cc src/web_client/http_web_client.cc
src/web_client/curl_web_client_get.cc
src/web_client/curl_global_state.cc
) )
# --- data_generation: prompt_formatting --- # --- data_generation: prompt_formatting ---
@@ -175,7 +186,7 @@ target_sources(${PROJECT_NAME} PRIVATE
src/services/prompt_directory.cc src/services/prompt_directory.cc
) )
# 6. Include Directories & Link Libraries # 6. Include Directories, Link Libraries & Compile Definitions
target_include_directories(${PROJECT_NAME} PRIVATE target_include_directories(${PROJECT_NAME} PRIVATE
includes includes
$<$<NOT:$<BOOL:${BIERGARTEN_MOCK_ONLY}>>:${llama-cpp_SOURCE_DIR}/include> $<$<NOT:$<BOOL:${BIERGARTEN_MOCK_ONLY}>>:${llama-cpp_SOURCE_DIR}/include>
@@ -189,12 +200,20 @@ target_link_libraries(${PROJECT_NAME} PRIVATE
Boost::program_options Boost::program_options
spdlog::spdlog spdlog::spdlog
sqlite3 sqlite3
CURL::libcurl httplib::httplib
OpenSSL::SSL
OpenSSL::Crypto
) )
if (BIERGARTEN_MOCK_ONLY) target_compile_definitions(${PROJECT_NAME} PRIVATE
target_compile_definitions(${PROJECT_NAME} PRIVATE BIERGARTEN_MOCK_ONLY) # Defined when -DBIERGARTEN_MOCK_ONLY=ON — skips llama.cpp entirely.
endif () # Use #ifdef BIERGARTEN_MOCK_ONLY in source to guard llama-specific code.
$<$<BOOL:${BIERGARTEN_MOCK_ONLY}>:BIERGARTEN_MOCK_ONLY>
# Defined for Debug configuration builds.
# Use #ifdef DEBUG in source to enable debug-only behaviour (e.g. verbose logging).
$<$<CONFIG:Debug>:DEBUG>
)
# 7. Runtime Assets # 7. Runtime Assets
configure_file( configure_file(
@@ -206,4 +225,4 @@ add_custom_command(TARGET ${PROJECT_NAME} POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy_directory COMMAND ${CMAKE_COMMAND} -E copy_directory
${CMAKE_SOURCE_DIR}/prompts ${CMAKE_SOURCE_DIR}/prompts
${CMAKE_BINARY_DIR}/prompts ${CMAKE_BINARY_DIR}/prompts
) )

View File

@@ -1,54 +0,0 @@
#ifndef BIERGARTEN_PIPELINE_INCLUDES_WEB_CLIENT_CURL_WEB_CLIENT_H_
#define BIERGARTEN_PIPELINE_INCLUDES_WEB_CLIENT_CURL_WEB_CLIENT_H_
/**
* @file web_client/curl_web_client.h
* @brief libcurl-based WebClient implementation.
*/
#include "web_client/web_client.h"
/**
 * @brief Scoped owner of libcurl's process-wide state.
 *
 * The constructor performs libcurl's global initialisation and the destructor
 * performs the matching global cleanup. Instantiate one object during
 * application startup, ahead of any other libcurl usage, and keep it alive
 * until the application exits.
 */
class CurlGlobalState {
 public:
  /// @brief Copy construction is disabled.
  CurlGlobalState(const CurlGlobalState&) = delete;
  /// @brief Copy assignment is disabled.
  CurlGlobalState& operator=(const CurlGlobalState&) = delete;

  /// @brief Sets up libcurl's global state.
  CurlGlobalState();
  /// @brief Tears down libcurl's global state.
  ~CurlGlobalState();
};
/**
 * @brief libcurl-backed implementation of the WebClient interface.
 */
class CURLWebClient : public WebClient {
 public:
  /**
   * @brief Produces the URL-encoded form of a string.
   *
   * @param value Text to encode.
   * @return The URL-encoded text.
   */
  std::string UrlEncode(const std::string& value) override;

  /**
   * @brief Performs an HTTP GET request.
   *
   * @param url URL to request.
   * @return Body of the response.
   */
  std::string Get(const std::string& url) override;
};
#endif // BIERGARTEN_PIPELINE_INCLUDES_WEB_CLIENT_CURL_WEB_CLIENT_H_

View File

@@ -0,0 +1,49 @@
/**
* @file web_client/http_web_client.h
* @brief cpp-httplib implementation of the WebClient interface.
*/
#ifndef BIERGARTEN_PIPELINE_INCLUDES_WEB_CLIENT_HTTP_WEB_CLIENT_H_
#define BIERGARTEN_PIPELINE_INCLUDES_WEB_CLIENT_HTTP_WEB_CLIENT_H_
#include "web_client/web_client.h"
#include <string>
/**
 * @brief WebClient implementation backed by cpp-httplib.
 *
 * Supports HTTP and HTTPS (requires OpenSSL; see HTTPLIB_REQUIRE_OPENSSL
 * in CMakeLists.txt).
 *
 * URL parsing splits a full URL into origin (scheme://host[:port]) and
 * path + query so that httplib::Client can be constructed correctly.
 * A new client instance is created per request because the client is
 * bound to a single origin at construction time.
 */
class HttpWebClient final : public WebClient {
 public:
  HttpWebClient() = default;
  ~HttpWebClient() override = default;

  /**
   * @brief Executes a blocking HTTP/HTTPS GET request against a full URL.
   *
   * @param url Fully-qualified URL, e.g.
   *            "https://en.wikipedia.org/api/rest_v1/page/summary/Berlin"
   * @return Response body when the response status is 2xx.
   * @throws std::invalid_argument if @p url is not an absolute http:// or
   *         https:// URL.
   * @throws std::runtime_error if the request fails at the transport level
   *         or the response status is outside the 2xx range.
   */
  std::string Get(const std::string& url) override;

  /**
   * @brief Percent-encodes a single URI component (query parameter value or
   *        path segment). Delegates to httplib::encode_uri_component().
   *
   * @param value Raw string to encode.
   * @return Percent-encoded string safe for use in a URL.
   */
  std::string UrlEncode(const std::string& value) override;
};
#endif  // BIERGARTEN_PIPELINE_INCLUDES_WEB_CLIENT_HTTP_WEB_CLIENT_H_

View File

@@ -8,12 +8,11 @@
#include <boost/di.hpp> #include <boost/di.hpp>
#include <boost/program_options.hpp> #include <boost/program_options.hpp>
#include <chrono>
#include <cstdint>
#include <exception> #include <exception>
#include <memory> #include <memory>
#include <optional> #include <optional>
#include <sstream>
#include <string> #include <string>
#include "biergarten_data_generator.h" #include "biergarten_data_generator.h"
@@ -29,16 +28,22 @@
#include "services/timer.h" #include "services/timer.h"
#include "services/wikipedia_service.h" #include "services/wikipedia_service.h"
#include "web_client/curl_web_client.h" #include "web_client/curl_web_client.h"
#include "web_client/http_web_client.h"
namespace di = boost::di; namespace di = boost::di;
int main(const int argc, char** argv) { int main(const int argc, char** argv) {
try { try {
Timer timer; Timer timer;
const CurlGlobalState curl_state;
const LlamaBackendState llama_backend_state;
spdlog::set_pattern("[%Y-%m-%d %H:%M:%S.%e] [%^%l%$] %v"); spdlog::set_pattern("[%Y-%m-%d %H:%M:%S.%e] [%^%l%$] %v");
#ifndef BIERGARTEN_MOCK_ONLY
const LlamaBackendState llama_backend_state;
#endif
#ifdef DEBUG
spdlog::set_level(spdlog::level::debug);
#endif
const auto parsed_options = ParseArguments(argc, argv); const auto parsed_options = ParseArguments(argc, argv);
if (!parsed_options.has_value()) { if (!parsed_options.has_value()) {
return 0; return 0;
@@ -61,7 +66,7 @@ int main(const int argc, char** argv) {
} }
const auto injector = di::make_injector( const auto injector = di::make_injector(
di::bind<WebClient>().to<CURLWebClient>(), di::bind<WebClient>().to<HttpWebClient>(),
di::bind<ApplicationOptions>().to(options), di::bind<ApplicationOptions>().to(options),
di::bind<IEnrichmentService>().to<WikipediaService>(), di::bind<IEnrichmentService>().to<WikipediaService>(),
di::bind<IExportService>().to<SqliteExportService>(), di::bind<IExportService>().to<SqliteExportService>(),
@@ -69,7 +74,7 @@ int main(const int argc, char** argv) {
di::bind<std::string>().to(model_path), di::bind<std::string>().to(model_path),
di::bind<DataGenerator>().to( di::bind<DataGenerator>().to(
[options, model_path, sampling, &prompt_directory]( [options, model_path, sampling, &prompt_directory](
const auto& inj) -> std::unique_ptr<DataGenerator> { const auto& inj) -> std::unique_ptr<DataGenerator> {
if (options.generator.use_mocked) { if (options.generator.use_mocked) {
spdlog::info( spdlog::info(
"[Generator] Using MockGenerator (no model path provided)"); "[Generator] Using MockGenerator (no model path provided)");
@@ -101,4 +106,4 @@ int main(const int argc, char** argv) {
spdlog::critical("Unhandled fatal error in main: {}", exception.what()); spdlog::critical("Unhandled fatal error in main: {}", exception.what());
return 1; return 1;
} }
} }

View File

@@ -1,19 +0,0 @@
/**
* @file web_client/curl_global_state.cc
* @brief CurlGlobalState constructor and destructor implementation.
*/
#include <curl/curl.h>
#include <stdexcept>
#include "web_client/curl_web_client.h"
/// Initialises libcurl's global state; throws std::runtime_error when
/// curl_global_init() reports anything other than CURLE_OK.
CurlGlobalState::CurlGlobalState() {
  const CURLcode init_status = curl_global_init(CURL_GLOBAL_DEFAULT);
  if (init_status == CURLE_OK) {
    return;
  }
  throw std::runtime_error(
      "[CURLWebClient] Failed to initialize libcurl globally");
}
/// Releases libcurl's global state via curl_global_cleanup().
CurlGlobalState::~CurlGlobalState() {
  curl_global_cleanup();
}

View File

@@ -1,87 +0,0 @@
/**
* @file web_client/curl_web_client_get.cc
* @brief CURLWebClient::Get() implementation.
*/
#include <curl/curl.h>
#include <cstdint>
#include <limits>
#include <memory>
#include <stdexcept>
#include <string>
#include "web_client/curl_web_client.h"
// Owning pointer to a libcurl easy handle; its deleter type is a pointer to
// curl_easy_cleanup.
using CurlHandle = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
// Value handed to CURLOPT_CONNECTTIMEOUT (libcurl interprets it as seconds).
static constexpr long kConnectionTimeout = 10;
// Value handed to CURLOPT_TIMEOUT (libcurl interprets it as seconds).
static constexpr long kRequestTimeout = 30;
// Value handed to CURLOPT_MAXREDIRS: upper bound on followed redirects.
static constexpr long kMaxRedirects = 5;
// The only HTTP status that CURLWebClient::Get() treats as success.
static constexpr int32_t kOkHttpStatus = 200;
/// Allocates a libcurl easy handle wrapped in a CurlHandle.
/// Throws std::runtime_error when curl_easy_init() returns null.
static CurlHandle CreateHandle() {
  CurlHandle owned{curl_easy_init(), &curl_easy_cleanup};
  if (!owned) {
    throw std::runtime_error(
        "[CURLWebClient] Failed to initialize libcurl handle");
  }
  return owned;
}
/// Applies the options shared by every GET request to @p curl: target URL,
/// user agent, accepted encoding, redirect policy and timeouts.
static void SetCommonGetOptions(CURL* curl, const std::string& url) {
  // Request identity.
  const char* const target = url.c_str();
  curl_easy_setopt(curl, CURLOPT_URL, target);
  curl_easy_setopt(curl, CURLOPT_USERAGENT, "biergarten-pipeline/0.1.0");
  curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, "gzip");

  // Redirect policy.
  curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
  curl_easy_setopt(curl, CURLOPT_MAXREDIRS, kMaxRedirects);

  // Time limits.
  curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, kConnectionTimeout);
  curl_easy_setopt(curl, CURLOPT_TIMEOUT, kRequestTimeout);
}
// Write callback for libcurl: treats userp as a std::string* and appends the
// size * nmemb bytes found at contents to it, then reports that byte count.
static size_t WriteCallbackString(void* contents, const size_t size,
                                  const size_t nmemb, void* userp) {
  const auto* chunk = static_cast<const char*>(contents);
  const size_t byte_count = size * nmemb;
  static_cast<std::string*>(userp)->append(chunk, byte_count);
  return byte_count;
}
std::string CURLWebClient::Get(const std::string& url) {
const CurlHandle curl = CreateHandle();
std::string response_string;
SetCommonGetOptions(curl.get(), url);
curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, WriteCallbackString);
curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &response_string);
CURLcode curl_result = curl_easy_perform(curl.get());
if (curl_result != CURLE_OK) {
const auto error = std::string("[CURLWebClient] GET failed: ") +
curl_easy_strerror(curl_result);
throw std::runtime_error(error);
}
long curl_http_code = 0;
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &curl_http_code);
if (curl_http_code < std::numeric_limits<int32_t>::min() ||
curl_http_code > std::numeric_limits<int32_t>::max()) {
throw std::runtime_error("[CURLWebClient] Invalid HTTP status code: " +
std::to_string(curl_http_code));
}
const int32_t http_code = static_cast<int32_t>(curl_http_code);
if (http_code != kOkHttpStatus) {
const std::string error = "[CURLWebClient] HTTP error " +
std::to_string(http_code) + " for URL " + url;
throw std::runtime_error(error);
}
return response_string;
}

View File

@@ -1,24 +0,0 @@
/**
* @file web_client/curl_web_client_url_encode.cc
* @brief CURLWebClient::UrlEncode() implementation.
*/
#include <curl/curl.h>
#include <stdexcept>
#include <string>
#include "web_client/curl_web_client.h"
/// Returns the result of curl_easy_escape() applied to @p value.
/// Throws std::runtime_error when libcurl returns a null result.
std::string CURLWebClient::UrlEncode(const std::string& value) {
  // No easy handle is supplied: per the libcurl docs a null handle is
  // acceptable for this call.
  char* const escaped = curl_easy_escape(nullptr, value.c_str(), 0);
  if (escaped == nullptr) {
    throw std::runtime_error("[CURLWebClient] curl_easy_escape failed");
  }
  // Copy into an owned string before handing the buffer back to libcurl.
  std::string encoded(escaped);
  curl_free(escaped);
  return encoded;
}

View File

@@ -0,0 +1,61 @@
/**
* @file web_client/http_web_client.cc
* @brief cpp-httplib implementation of WebClient.
*/
#include "web_client/http_web_client.h"
#include <httplib.h>
#include <regex>
#include <stdexcept>
#include <string>
#include <utility>
namespace {

/// Limit, in seconds, passed to httplib::Client::set_connection_timeout().
constexpr time_t kConnectionTimeoutSeconds = 5;
/// Limit, in seconds, passed to httplib::Client::set_read_timeout().
constexpr time_t kReadTimeoutSeconds = 10;
/// Response statuses in the half-open range [kSuccessMin, kSuccessMax) are
/// treated as success.
constexpr int kSuccessMin = 200;
constexpr int kSuccessMax = 300;

// Capture groups:
//   1 - origin: scheme://host[:port]
//   2 - path, starting with '/' (optional)
//   3 - query, including the leading '?' (optional)
// A trailing "#fragment" is accepted but deliberately not captured: fragments
// are resolved client-side and must not be sent in the request target.
const std::regex kUrlRegex(
    R"((https?://[^/?#]+)(/[^?#]*)?(\?[^#]*)?(?:#.*)?)");

/**
 * @brief Splits an absolute http/https URL into origin and request target.
 *
 * @param url Absolute URL, e.g. "https://host:8080/a/b?x=1#frag".
 * @return {origin, target}: origin is "scheme://host[:port]"; target is the
 *         path plus query. The path defaults to "/" when the URL has none
 *         (including the "https://host?x=1" form), and any fragment is
 *         dropped.
 * @throws std::invalid_argument if @p url is not an absolute http:// or
 *         https:// URL with a non-empty host.
 */
std::pair<std::string, std::string> SplitUrl(const std::string& url) {
  std::smatch match;
  if (!std::regex_match(url, match, kUrlRegex)) {
    throw std::invalid_argument("[HttpWebClient] Malformed URL: " + url);
  }
  std::string target = match[2].matched ? match[2].str() : std::string("/");
  if (match[3].matched) {
    target += match[3].str();
  }
  return {match[1].str(), std::move(target)};
}

}  // namespace
std::string HttpWebClient::Get(const std::string& url) {
const auto [origin, path] = SplitUrl(url);
httplib::Client client(origin);
client.set_follow_location(true);
client.set_connection_timeout(kConnectionTimeoutSeconds);
client.set_read_timeout(kReadTimeoutSeconds);
const auto result = client.Get(path);
if (!result) {
throw std::runtime_error(
"[HttpWebClient] Request failed for URL: " + url +
"" + httplib::to_string(result.error()));
}
if (result->status < kSuccessMin || result->status >= kSuccessMax) {
throw std::runtime_error(
"[HttpWebClient] HTTP " + std::to_string(result->status) +
" for URL: " + url);
}
return result->body;
}
/// Percent-encodes @p value by forwarding it to
/// httplib::encode_uri_component() and returning that result unchanged.
std::string HttpWebClient::UrlEncode(const std::string& value) {
  std::string encoded = httplib::encode_uri_component(value);
  return encoded;
}