mirror of
https://github.com/aaronpo97/the-biergarten-app.git
synced 2026-06-01 10:04:00 +00:00
Compare commits
2 Commits
b52d4d5f27
...
3c47f74fb9
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
3c47f74fb9 | ||
|
|
3729b9469c |
@@ -1,4 +1,3 @@
|
|||||||
# CMakeLists.txt (project root)
|
|
||||||
cmake_minimum_required(VERSION 3.31)
|
cmake_minimum_required(VERSION 3.31)
|
||||||
project(biergarten-pipeline)
|
project(biergarten-pipeline)
|
||||||
|
|
||||||
@@ -48,16 +47,20 @@ set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Og -g")
|
|||||||
# 4. Dependencies
|
# 4. Dependencies
|
||||||
include(FetchContent)
|
include(FetchContent)
|
||||||
|
|
||||||
# DEPRECATED: libcurl — to be removed once all usages are migrated to cpp-httplib.
|
# Boost (system install — via dnf/brew)
|
||||||
# Tracked in: web_client/curl_web_client_get.cc, web_client/curl_web_client_url_encode.cc,
|
|
||||||
# web_client/curl_global_state.cc
|
|
||||||
find_package(CURL QUIET)
|
|
||||||
if (NOT CURL_FOUND)
|
|
||||||
message(FATAL_ERROR "[biergarten] libcurl not found. Install it (e.g. 'sudo dnf install libcurl-devel').")
|
|
||||||
endif ()
|
|
||||||
|
|
||||||
find_package(Boost REQUIRED COMPONENTS json program_options)
|
find_package(Boost REQUIRED COMPONENTS json program_options)
|
||||||
|
|
||||||
|
# Boost.DI (unofficial Boost extension, must declare separately from main Boost dependency)
|
||||||
|
FetchContent_Declare(
|
||||||
|
boost-di
|
||||||
|
GIT_REPOSITORY https://github.com/boost-ext/di.git
|
||||||
|
GIT_TAG v1.3.0
|
||||||
|
)
|
||||||
|
FetchContent_MakeAvailable(boost-di)
|
||||||
|
if (TARGET Boost.DI AND NOT TARGET boost::di)
|
||||||
|
add_library(boost::di ALIAS Boost.DI)
|
||||||
|
endif ()
|
||||||
|
|
||||||
# SQLite amalgamation
|
# SQLite amalgamation
|
||||||
FetchContent_Declare(
|
FetchContent_Declare(
|
||||||
sqlite_amalgamation
|
sqlite_amalgamation
|
||||||
@@ -82,17 +85,6 @@ if (NOT BIERGARTEN_MOCK_ONLY)
|
|||||||
FetchContent_MakeAvailable(llama-cpp)
|
FetchContent_MakeAvailable(llama-cpp)
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
# Boost.DI (unofficial Boost extension, must declare separately from main Boost dependency)
|
|
||||||
FetchContent_Declare(
|
|
||||||
boost-di
|
|
||||||
GIT_REPOSITORY https://github.com/boost-ext/di.git
|
|
||||||
GIT_TAG v1.3.0
|
|
||||||
)
|
|
||||||
FetchContent_MakeAvailable(boost-di)
|
|
||||||
if (TARGET Boost.DI AND NOT TARGET boost::di)
|
|
||||||
add_library(boost::di ALIAS Boost.DI)
|
|
||||||
endif ()
|
|
||||||
|
|
||||||
# spdlog
|
# spdlog
|
||||||
FetchContent_Declare(
|
FetchContent_Declare(
|
||||||
spdlog
|
spdlog
|
||||||
@@ -101,16 +93,15 @@ FetchContent_Declare(
|
|||||||
)
|
)
|
||||||
FetchContent_MakeAvailable(spdlog)
|
FetchContent_MakeAvailable(spdlog)
|
||||||
|
|
||||||
# cpp-httplib — replaces direct libcurl usage in web_client.
|
# cpp-httplib — header-only HTTP/HTTPS client replacing libcurl.
|
||||||
# OpenSSL is required for HTTPS (Wikipedia). find_package is called first so
|
# OpenSSL is required for HTTPS (Wikipedia API). find_package locates
|
||||||
# CMake can locate libssl/libcrypto; cpp-httplib itself is header-only so the
|
# libssl/libcrypto; HTTPLIB_REQUIRE_OPENSSL causes a hard build failure
|
||||||
# CPPHTTPLIB_OPENSSL_SUPPORT compile definition is propagated via the target.
|
# if OpenSSL is absent rather than silently producing an HTTP-only binary.
|
||||||
find_package(OpenSSL REQUIRED)
|
find_package(OpenSSL REQUIRED)
|
||||||
|
|
||||||
FetchContent_Declare(
|
FetchContent_Declare(
|
||||||
cpp-httplib
|
cpp-httplib
|
||||||
GIT_REPOSITORY https://github.com/yhirose/cpp-httplib.git
|
GIT_REPOSITORY https://github.com/yhirose/cpp-httplib.git
|
||||||
GIT_TAG v0.41.0
|
GIT_TAG v0.43.2
|
||||||
GIT_SHALLOW TRUE
|
GIT_SHALLOW TRUE
|
||||||
SYSTEM
|
SYSTEM
|
||||||
)
|
)
|
||||||
@@ -145,13 +136,6 @@ target_sources(${PROJECT_NAME} PRIVATE
|
|||||||
)
|
)
|
||||||
|
|
||||||
# --- web_client ---
|
# --- web_client ---
|
||||||
# DEPRECATED: curl_web_client_* — to be replaced with cpp-httplib equivalents.
|
|
||||||
target_sources(${PROJECT_NAME} PRIVATE
|
|
||||||
src/web_client/curl_web_client_url_encode.cc
|
|
||||||
src/web_client/curl_web_client_get.cc
|
|
||||||
src/web_client/curl_global_state.cc
|
|
||||||
)
|
|
||||||
|
|
||||||
target_sources(${PROJECT_NAME} PRIVATE
|
target_sources(${PROJECT_NAME} PRIVATE
|
||||||
src/web_client/http_web_client.cc
|
src/web_client/http_web_client.cc
|
||||||
)
|
)
|
||||||
@@ -202,7 +186,7 @@ target_sources(${PROJECT_NAME} PRIVATE
|
|||||||
src/services/prompt_directory.cc
|
src/services/prompt_directory.cc
|
||||||
)
|
)
|
||||||
|
|
||||||
# 6. Include Directories & Link Libraries
|
# 6. Include Directories, Link Libraries & Compile Definitions
|
||||||
target_include_directories(${PROJECT_NAME} PRIVATE
|
target_include_directories(${PROJECT_NAME} PRIVATE
|
||||||
includes
|
includes
|
||||||
$<$<NOT:$<BOOL:${BIERGARTEN_MOCK_ONLY}>>:${llama-cpp_SOURCE_DIR}/include>
|
$<$<NOT:$<BOOL:${BIERGARTEN_MOCK_ONLY}>>:${llama-cpp_SOURCE_DIR}/include>
|
||||||
@@ -219,12 +203,17 @@ target_link_libraries(${PROJECT_NAME} PRIVATE
|
|||||||
httplib::httplib
|
httplib::httplib
|
||||||
OpenSSL::SSL
|
OpenSSL::SSL
|
||||||
OpenSSL::Crypto
|
OpenSSL::Crypto
|
||||||
CURL::libcurl # DEPRECATED: remove once web_client is migrated to cpp-httplib
|
|
||||||
)
|
)
|
||||||
|
|
||||||
if (BIERGARTEN_MOCK_ONLY)
|
target_compile_definitions(${PROJECT_NAME} PRIVATE
|
||||||
target_compile_definitions(${PROJECT_NAME} PRIVATE BIERGARTEN_MOCK_ONLY)
|
# Defined when -DBIERGARTEN_MOCK_ONLY=ON — skips llama.cpp entirely.
|
||||||
endif ()
|
# Use #ifdef BIERGARTEN_MOCK_ONLY in source to guard llama-specific code.
|
||||||
|
$<$<BOOL:${BIERGARTEN_MOCK_ONLY}>:BIERGARTEN_MOCK_ONLY>
|
||||||
|
|
||||||
|
# Defined for Debug configuration builds.
|
||||||
|
# Use #ifdef DEBUG in source to enable debug-only behaviour (e.g. verbose logging).
|
||||||
|
$<$<CONFIG:Debug>:DEBUG>
|
||||||
|
)
|
||||||
|
|
||||||
# 7. Runtime Assets
|
# 7. Runtime Assets
|
||||||
configure_file(
|
configure_file(
|
||||||
|
|||||||
@@ -1,54 +0,0 @@
|
|||||||
#ifndef BIERGARTEN_PIPELINE_INCLUDES_WEB_CLIENT_CURL_WEB_CLIENT_H_
|
|
||||||
#define BIERGARTEN_PIPELINE_INCLUDES_WEB_CLIENT_CURL_WEB_CLIENT_H_
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @file web_client/curl_web_client.h
|
|
||||||
* @brief libcurl-based WebClient implementation.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include "web_client/web_client.h"
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @brief RAII wrapper for curl_global_init and curl_global_cleanup.
|
|
||||||
*
|
|
||||||
* Create one instance in application startup before using libcurl and keep it
|
|
||||||
* alive for application lifetime.
|
|
||||||
*/
|
|
||||||
class CurlGlobalState {
|
|
||||||
public:
|
|
||||||
/// @brief Initializes global libcurl state.
|
|
||||||
CurlGlobalState();
|
|
||||||
|
|
||||||
/// @brief Cleans up global libcurl state.
|
|
||||||
~CurlGlobalState();
|
|
||||||
|
|
||||||
/// @brief Non-copyable type.
|
|
||||||
CurlGlobalState(const CurlGlobalState&) = delete;
|
|
||||||
|
|
||||||
/// @brief Non-copyable type.
|
|
||||||
CurlGlobalState& operator=(const CurlGlobalState&) = delete;
|
|
||||||
};
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @brief WebClient implementation backed by libcurl.
|
|
||||||
*/
|
|
||||||
class CURLWebClient : public WebClient {
|
|
||||||
public:
|
|
||||||
/**
|
|
||||||
* @brief Executes an HTTP GET request.
|
|
||||||
*
|
|
||||||
* @param url Request URL.
|
|
||||||
* @return Response body.
|
|
||||||
*/
|
|
||||||
std::string Get(const std::string& url) override;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @brief URL-encodes a string value.
|
|
||||||
*
|
|
||||||
* @param value Raw value.
|
|
||||||
* @return URL-encoded string.
|
|
||||||
*/
|
|
||||||
std::string UrlEncode(const std::string& value) override;
|
|
||||||
};
|
|
||||||
|
|
||||||
#endif // BIERGARTEN_PIPELINE_INCLUDES_WEB_CLIENT_CURL_WEB_CLIENT_H_
|
|
||||||
@@ -14,13 +14,13 @@
|
|||||||
/**
|
/**
|
||||||
* @brief WebClient implementation backed by cpp-httplib.
|
* @brief WebClient implementation backed by cpp-httplib.
|
||||||
*
|
*
|
||||||
* Supports HTTP and HTTPS (requires
|
* Supports HTTP and HTTPS (requires OpenSSL; see HTTPLIB_REQUIRE_OPENSSL
|
||||||
* OpenSSL; see HTTPLIB_USE_OPENSSL_IF_AVAILABLE in CMakeLists.txt).
|
* in CMakeLists.txt).
|
||||||
*
|
*
|
||||||
* URL parsing splits a full URL into scheme + host and path + query so that
|
* URL parsing splits a full URL into origin (scheme://host[:port]) and
|
||||||
* httplib::Client can be constructed correctly. A new client instance is
|
* path + query so that httplib::Client can be constructed correctly.
|
||||||
* created per request; this is intentional given the low call volume in the
|
* A new client instance is created per request because the client is
|
||||||
* pipeline (Wikipedia enrichment, near-100 % cache hits).
|
* bound to a single origin at construction time.
|
||||||
*/
|
*/
|
||||||
class HttpWebClient final : public WebClient {
|
class HttpWebClient final : public WebClient {
|
||||||
public:
|
public:
|
||||||
|
|||||||
@@ -8,12 +8,11 @@
|
|||||||
|
|
||||||
#include <boost/di.hpp>
|
#include <boost/di.hpp>
|
||||||
#include <boost/program_options.hpp>
|
#include <boost/program_options.hpp>
|
||||||
#include <chrono>
|
|
||||||
#include <cstdint>
|
|
||||||
#include <exception>
|
#include <exception>
|
||||||
#include <memory>
|
#include <memory>
|
||||||
#include <optional>
|
#include <optional>
|
||||||
#include <sstream>
|
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
#include "biergarten_data_generator.h"
|
#include "biergarten_data_generator.h"
|
||||||
@@ -29,16 +28,22 @@
|
|||||||
#include "services/timer.h"
|
#include "services/timer.h"
|
||||||
#include "services/wikipedia_service.h"
|
#include "services/wikipedia_service.h"
|
||||||
#include "web_client/curl_web_client.h"
|
#include "web_client/curl_web_client.h"
|
||||||
|
#include "web_client/http_web_client.h"
|
||||||
|
|
||||||
namespace di = boost::di;
|
namespace di = boost::di;
|
||||||
|
|
||||||
int main(const int argc, char** argv) {
|
int main(const int argc, char** argv) {
|
||||||
try {
|
try {
|
||||||
Timer timer;
|
Timer timer;
|
||||||
const CurlGlobalState curl_state;
|
|
||||||
const LlamaBackendState llama_backend_state;
|
|
||||||
spdlog::set_pattern("[%Y-%m-%d %H:%M:%S.%e] [%^%l%$] %v");
|
spdlog::set_pattern("[%Y-%m-%d %H:%M:%S.%e] [%^%l%$] %v");
|
||||||
|
|
||||||
|
#ifndef BIERGARTEN_MOCK_ONLY
|
||||||
|
const LlamaBackendState llama_backend_state;
|
||||||
|
#endif
|
||||||
|
#ifdef DEBUG
|
||||||
|
spdlog::set_level(spdlog::level::debug);
|
||||||
|
#endif
|
||||||
|
|
||||||
const auto parsed_options = ParseArguments(argc, argv);
|
const auto parsed_options = ParseArguments(argc, argv);
|
||||||
if (!parsed_options.has_value()) {
|
if (!parsed_options.has_value()) {
|
||||||
return 0;
|
return 0;
|
||||||
@@ -61,7 +66,7 @@ int main(const int argc, char** argv) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const auto injector = di::make_injector(
|
const auto injector = di::make_injector(
|
||||||
di::bind<WebClient>().to<CURLWebClient>(),
|
di::bind<WebClient>().to<HttpWebClient>(),
|
||||||
di::bind<ApplicationOptions>().to(options),
|
di::bind<ApplicationOptions>().to(options),
|
||||||
di::bind<IEnrichmentService>().to<WikipediaService>(),
|
di::bind<IEnrichmentService>().to<WikipediaService>(),
|
||||||
di::bind<IExportService>().to<SqliteExportService>(),
|
di::bind<IExportService>().to<SqliteExportService>(),
|
||||||
|
|||||||
@@ -1,19 +0,0 @@
|
|||||||
/**
|
|
||||||
* @file web_client/curl_global_state.cc
|
|
||||||
* @brief CurlGlobalState constructor and destructor implementation.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <curl/curl.h>
|
|
||||||
|
|
||||||
#include <stdexcept>
|
|
||||||
|
|
||||||
#include "web_client/curl_web_client.h"
|
|
||||||
|
|
||||||
CurlGlobalState::CurlGlobalState() {
|
|
||||||
if (curl_global_init(CURL_GLOBAL_DEFAULT) != CURLE_OK) {
|
|
||||||
throw std::runtime_error(
|
|
||||||
"[CURLWebClient] Failed to initialize libcurl globally");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
CurlGlobalState::~CurlGlobalState() { curl_global_cleanup(); }
|
|
||||||
@@ -1,87 +0,0 @@
|
|||||||
/**
|
|
||||||
* @file web_client/curl_web_client_get.cc
|
|
||||||
* @brief CURLWebClient::Get() implementation.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <curl/curl.h>
|
|
||||||
|
|
||||||
#include <cstdint>
|
|
||||||
#include <limits>
|
|
||||||
#include <memory>
|
|
||||||
#include <stdexcept>
|
|
||||||
#include <string>
|
|
||||||
|
|
||||||
#include "web_client/curl_web_client.h"
|
|
||||||
|
|
||||||
using CurlHandle = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>;
|
|
||||||
|
|
||||||
static constexpr long kConnectionTimeout = 10;
|
|
||||||
static constexpr long kRequestTimeout = 30;
|
|
||||||
static constexpr long kMaxRedirects = 5;
|
|
||||||
static constexpr int32_t kOkHttpStatus = 200;
|
|
||||||
|
|
||||||
static CurlHandle CreateHandle() {
|
|
||||||
CURL* handle = curl_easy_init();
|
|
||||||
if (handle == nullptr) {
|
|
||||||
throw std::runtime_error(
|
|
||||||
"[CURLWebClient] Failed to initialize libcurl handle");
|
|
||||||
}
|
|
||||||
return {handle, &curl_easy_cleanup};
|
|
||||||
}
|
|
||||||
|
|
||||||
static void SetCommonGetOptions(CURL* curl, const std::string& url) {
|
|
||||||
curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
|
|
||||||
curl_easy_setopt(curl, CURLOPT_USERAGENT, "biergarten-pipeline/0.1.0");
|
|
||||||
curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
|
|
||||||
curl_easy_setopt(curl, CURLOPT_MAXREDIRS, kMaxRedirects);
|
|
||||||
curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, kConnectionTimeout);
|
|
||||||
curl_easy_setopt(curl, CURLOPT_TIMEOUT, kRequestTimeout);
|
|
||||||
curl_easy_setopt(curl, CURLOPT_ACCEPT_ENCODING, "gzip");
|
|
||||||
}
|
|
||||||
|
|
||||||
// curl write callback that appends response data into a std::string
|
|
||||||
static size_t WriteCallbackString(void* contents, const size_t size,
|
|
||||||
const size_t nmemb, void* userp) {
|
|
||||||
const size_t real_size = size * nmemb;
|
|
||||||
auto* str = static_cast<std::string*>(userp);
|
|
||||||
str->append(static_cast<char*>(contents), real_size);
|
|
||||||
return real_size;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::string CURLWebClient::Get(const std::string& url) {
|
|
||||||
const CurlHandle curl = CreateHandle();
|
|
||||||
|
|
||||||
std::string response_string;
|
|
||||||
|
|
||||||
SetCommonGetOptions(curl.get(), url);
|
|
||||||
|
|
||||||
curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, WriteCallbackString);
|
|
||||||
curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &response_string);
|
|
||||||
|
|
||||||
CURLcode curl_result = curl_easy_perform(curl.get());
|
|
||||||
|
|
||||||
if (curl_result != CURLE_OK) {
|
|
||||||
const auto error = std::string("[CURLWebClient] GET failed: ") +
|
|
||||||
curl_easy_strerror(curl_result);
|
|
||||||
throw std::runtime_error(error);
|
|
||||||
}
|
|
||||||
|
|
||||||
long curl_http_code = 0;
|
|
||||||
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &curl_http_code);
|
|
||||||
|
|
||||||
if (curl_http_code < std::numeric_limits<int32_t>::min() ||
|
|
||||||
curl_http_code > std::numeric_limits<int32_t>::max()) {
|
|
||||||
throw std::runtime_error("[CURLWebClient] Invalid HTTP status code: " +
|
|
||||||
std::to_string(curl_http_code));
|
|
||||||
}
|
|
||||||
|
|
||||||
const int32_t http_code = static_cast<int32_t>(curl_http_code);
|
|
||||||
|
|
||||||
if (http_code != kOkHttpStatus) {
|
|
||||||
const std::string error = "[CURLWebClient] HTTP error " +
|
|
||||||
std::to_string(http_code) + " for URL " + url;
|
|
||||||
throw std::runtime_error(error);
|
|
||||||
}
|
|
||||||
|
|
||||||
return response_string;
|
|
||||||
}
|
|
||||||
@@ -1,24 +0,0 @@
|
|||||||
/**
|
|
||||||
* @file web_client/curl_web_client_url_encode.cc
|
|
||||||
* @brief CURLWebClient::UrlEncode() implementation.
|
|
||||||
*/
|
|
||||||
|
|
||||||
#include <curl/curl.h>
|
|
||||||
|
|
||||||
#include <stdexcept>
|
|
||||||
#include <string>
|
|
||||||
|
|
||||||
#include "web_client/curl_web_client.h"
|
|
||||||
|
|
||||||
std::string CURLWebClient::UrlEncode(const std::string& value) {
|
|
||||||
// A NULL handle is fine for UTF-8 encoding according to libcurl docs.
|
|
||||||
char* output = curl_easy_escape(nullptr, value.c_str(), 0);
|
|
||||||
|
|
||||||
if (!output) {
|
|
||||||
throw std::runtime_error("[CURLWebClient] curl_easy_escape failed");
|
|
||||||
}
|
|
||||||
|
|
||||||
std::string result(output);
|
|
||||||
curl_free(output);
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
Reference in New Issue
Block a user