Pipeline: Remove CURL as a dependency, add new HTTP module (#219)

Rationale: 

HTTP is a supporting concern in the pipeline, used only for Wikipedia enrichment calls. libcurl's C API required significant boilerplate to wrap safely. cpp-httplib is a header-only library that covers the same functionality with far less overhead and no manual resource management.
This commit is contained in:
2026-05-03 13:35:58 -04:00
committed by GitHub
parent f316fabcb0
commit 031be8ad5d
8 changed files with 167 additions and 217 deletions

View File

@@ -0,0 +1,61 @@
/**
* @file web_client/http_web_client.cc
* @brief cpp-httplib implementation of WebClient.
*/
#include "web_client/http_web_client.h"
#include <httplib.h>
#include <regex>
#include <stdexcept>
#include <string>
#include <utility>
namespace {

// Timeouts, in seconds, applied to every request.
constexpr time_t kConnectionTimeoutSeconds = 5;
constexpr time_t kReadTimeoutSeconds = 10;

// Half-open range [kSuccessMin, kSuccessMax) of status codes treated as
// success.
constexpr int kSuccessMin = 200;
constexpr int kSuccessMax = 300;

// Capture groups:
//   1. origin   - scheme + authority, e.g. "https://en.wikipedia.org"
//   2. path     - optional, begins with '/'
//   3. query    - optional, begins with '?'
// A trailing "#fragment" is matched but deliberately not captured: fragments
// are resolved client-side and are not part of an HTTP request target.
const std::regex kUrlRegex(
    R"(^(https?://[^/?#]+)(/[^?#]*)?(\?[^#]*)?(?:#.*)?$)");

/**
 * @brief Splits an absolute http(s) URL into its origin and request target.
 *
 * The request target is the path plus the query string, if any. An empty
 * path is normalised to "/", so "https://host?x=1" yields "/?x=1". Any
 * fragment is dropped.
 *
 * @param url Absolute URL using the http or https scheme.
 * @return Pair of {origin, request target}.
 * @throws std::invalid_argument if @p url does not match kUrlRegex.
 */
std::pair<std::string, std::string> SplitUrl(const std::string& url) {
  std::smatch match;
  if (!std::regex_match(url, match, kUrlRegex)) {
    throw std::invalid_argument("[HttpWebClient] Malformed URL: " + url);
  }

  // An absent path still needs a leading '/' to form a valid request target.
  std::string target = match[2].matched ? match[2].str() : "/";
  if (match[3].matched) {
    target += match[3].str();
  }
  return {match[1].str(), std::move(target)};
}

}  // namespace
std::string HttpWebClient::Get(const std::string& url) {
const auto [origin, path] = SplitUrl(url);
httplib::Client client(origin);
client.set_follow_location(true);
client.set_connection_timeout(kConnectionTimeoutSeconds);
client.set_read_timeout(kReadTimeoutSeconds);
const auto result = client.Get(path);
if (!result) {
throw std::runtime_error(
"[HttpWebClient] Request failed for URL: " + url +
"" + httplib::to_string(result.error()));
}
if (result->status < kSuccessMin || result->status >= kSuccessMax) {
throw std::runtime_error(
"[HttpWebClient] HTTP " + std::to_string(result->status) +
" for URL: " + url);
}
return result->body;
}
/**
 * @brief Encodes @p value by delegating to httplib::encode_uri_component.
 *
 * @param value Raw text to encode.
 * @return The encoded string produced by httplib.
 */
std::string HttpWebClient::UrlEncode(const std::string& value) {
  std::string encoded = httplib::encode_uri_component(value);
  return encoded;
}