Pipeline: add CURL/WebClient & Wikipedia service

Introduce a pluggable web client interface and concrete CURL implementation: adds IWebClient, CURLWebClient, and CurlGlobalState (headers + curl_web_client.cpp). DataDownloader now accepts an IWebClient and delegates downloads. Add WikipediaService for cached Wikipedia summary lookups. Refactor SqliteDatabase to return full City records and update consumers accordingly. Improve JsonLoader to use batched transactions during streaming parses. Enhance LlamaGenerator with sampling options, increased token limits, JSON extraction/validation, and other parsing helpers. Modernize CMake: set policy/version, add project_options, simplify FetchContent usage (spdlog), require Boost components (program_options/json), list pipeline sources explicitly, and tweak post-build/memcheck targets. Update README to match implementation changes and new CLI/config conventions.
This commit is contained in:
Aaron Po
2026-04-02 16:29:16 -04:00
parent ac136f7179
commit 98083ab40c
16 changed files with 1125 additions and 794 deletions

View File

@@ -1,49 +1,52 @@
cmake_minimum_required(VERSION 3.20)
project(biergarten-pipeline VERSION 0.1.0 LANGUAGES CXX)

# Policies
# CMP0167 (use the upstream BoostConfig package instead of the FindBoost
# module) only exists in CMake >= 3.30. Setting a policy that the running CMake
# does not know is a hard error, so guard it to keep the declared 3.20 minimum
# usable.
if(POLICY CMP0167)
  cmake_policy(SET CMP0167 NEW)
endif()

# Allows older dependencies to configure on newer CMake.
set(CMAKE_POLICY_VERSION_MINIMUM 3.5)

# Global Settings
set(CMAKE_CXX_STANDARD 23)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)          # plain -std=c++23, no GNU extensions
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)  # emit compile_commands.json for tooling
# -----------------------------------------------------------------------------
# Compiler Options & Warnings (Interface Library)
# -----------------------------------------------------------------------------
# project_options carries the warning flags as INTERFACE compile options, so
# any target that links it inherits them.
add_library(project_options INTERFACE)
target_compile_options(project_options INTERFACE
  # AppleClang is a distinct compiler ID from Clang and must be listed
  # explicitly, otherwise macOS toolchains get no warnings at all.
  # Each generator expression is quoted so its ';'-separated flag list stays
  # a single argument.
  "$<$<CXX_COMPILER_ID:GNU,Clang,AppleClang>:-Wall;-Wextra;-Wpedantic;-Wshadow;-Wconversion;-Wsign-conversion;-Wunused>"
  "$<$<CXX_COMPILER_ID:MSVC>:/W4;/WX;/permissive->"
)
# -----------------------------------------------------------------------------
# Dependencies
# -----------------------------------------------------------------------------
# System packages. All are REQUIRED, so configuration stops if one is missing.
find_package(CURL REQUIRED)
# NOTE(review): Boost is looked up twice (unit_test_framework here, then
# program_options/json below) -- confirm whether the test framework is really
# needed outside the BUILD_TESTING section.
find_package(Boost REQUIRED COMPONENTS unit_test_framework)
find_package(SQLite3 REQUIRED)
# Requests Boost 1.75 or newer with the program_options and json components.
find_package(Boost 1.75 REQUIRED COMPONENTS program_options json)
# Provides the FetchContent_* commands used for the source dependencies below.
include(FetchContent)
# RapidJSON (header-only, used for SAX parsing).
# Only the sources are downloaded; RapidJSON's own CMakeLists.txt is never
# added to this build.
FetchContent_Declare(
  rapidjson
  GIT_REPOSITORY https://github.com/Tencent/rapidjson.git
  GIT_TAG        v1.1.0
  SOURCE_SUBDIR  ""  # Don't use RapidJSON's CMakeLists.txt
)
FetchContent_GetProperties(rapidjson)
if(NOT rapidjson_POPULATED)
  # Populate without add_subdirectory(): consumers reference
  # ${rapidjson_SOURCE_DIR}/include directly.
  FetchContent_Populate(rapidjson)
endif()
# spdlog (Logging)
# FetchContent_MakeAvailable() populates the content and adds it to the build
# in one step, so the manual GetProperties/Populate/add_subdirectory sequence
# is redundant (and single-argument FetchContent_Populate is deprecated as of
# CMake 3.30).
# NOTE(review): unlike the manual add_subdirectory(... EXCLUDE_FROM_ALL), this
# adds spdlog's targets to the default build; FetchContent_Declare only accepts
# EXCLUDE_FROM_ALL from CMake 3.28, above this project's 3.20 minimum.
FetchContent_Declare(
  spdlog
  GIT_REPOSITORY https://github.com/gabime/spdlog.git
  GIT_TAG        v1.11.0
)
FetchContent_MakeAvailable(spdlog)
# llama.cpp (on-device LLM inference)
# Force these cache entries OFF before the llama_cpp dependency is declared
# below; presumably llama.cpp's own CMakeLists reads them to skip building its
# tests, examples and server -- verify against the pinned llama.cpp revision.
set(LLAMA_BUILD_TESTS OFF CACHE BOOL "" FORCE)
set(LLAMA_BUILD_EXAMPLES OFF CACHE BOOL "" FORCE)
set(LLAMA_BUILD_SERVER OFF CACHE BOOL "" FORCE)
FetchContent_Declare(
llama_cpp
GIT_REPOSITORY https://github.com/ggerganov/llama.cpp.git
@@ -57,90 +60,53 @@ if(TARGET llama)
)
endif()
# -----------------------------------------------------------------------------
# Main Executable
# -----------------------------------------------------------------------------
# Sources are listed explicitly rather than globbed, so adding or removing a
# file is visible in review and always triggers a re-configure. The target is
# defined exactly once: a second add_executable() with the same name is a
# configure error.
set(PIPELINE_SOURCES
  src/curl_web_client.cpp
  src/data_downloader.cpp
  src/database.cpp
  src/json_loader.cpp
  src/llama_generator.cpp
  src/mock_generator.cpp
  src/stream_parser.cpp
  src/wikipedia_service.cpp
  src/main.cpp
)
add_executable(biergarten-pipeline ${PIPELINE_SOURCES})
# Header search paths for the executable: the project's own headers plus the
# include directories of the fetched rapidjson and llama.cpp sources.
target_include_directories(biergarten-pipeline
  PRIVATE
    "${CMAKE_CURRENT_SOURCE_DIR}/includes"
    "${rapidjson_SOURCE_DIR}/include"
    "${llama_cpp_SOURCE_DIR}/include"
)

# Link dependencies. project_options contributes only compile options.
# Boost::unit_test_framework is intentionally not linked here: the test
# framework belongs to the test executable, not the production binary.
target_link_libraries(biergarten-pipeline
  PRIVATE
    project_options
    CURL::libcurl
    SQLite::SQLite3
    spdlog::spdlog
    llama
    Boost::program_options
    Boost::json
)
# Warning flags applied directly to the executable.
# AppleClang is a separate compiler ID from Clang and must be named explicitly,
# otherwise macOS builds get no warnings. Each generator expression is quoted
# so its ';'-separated flag list stays a single argument.
target_compile_options(biergarten-pipeline PRIVATE
  "$<$<CXX_COMPILER_ID:GNU,Clang,AppleClang>:-Wall;-Wextra;-Wpedantic;-Wshadow;-Wconversion;-Wsign-conversion>"
  "$<$<CXX_COMPILER_ID:MSVC>:/W4;/WX>"
)
# -----------------------------------------------------------------------------
# Post-Build Steps & Utilities
# -----------------------------------------------------------------------------
# After each build of the executable, make sure <source dir>/output exists.
# make_directory succeeds when the directory is already present, so one
# COMMAND is enough. VERBATIM keeps argument escaping identical on every
# platform, and the quoted path survives spaces in the checkout location.
add_custom_command(TARGET biergarten-pipeline POST_BUILD
  COMMAND ${CMAKE_COMMAND} -E make_directory "${CMAKE_CURRENT_SOURCE_DIR}/output"
  COMMENT "Ensuring output directory exists"
  VERBATIM
)
# Optional `memcheck` target, defined only when valgrind is found on this
# machine. It runs the built executable with --help under Valgrind exactly
# once; --error-exitcode=1 makes the target fail when Valgrind reports errors.
find_program(VALGRIND valgrind)
if(VALGRIND)
  add_custom_target(memcheck
    COMMAND ${VALGRIND} --leak-check=full --error-exitcode=1 $<TARGET_FILE:biergarten-pipeline> --help
    DEPENDS biergarten-pipeline
    COMMENT "Running Valgrind memory check"
    VERBATIM
  )
endif()
# Unit tests. include(CTest) defines BUILD_TESTING and enables testing; the
# test executable is only created when test sources exist under tests/.
include(CTest)
if(BUILD_TESTING)
  find_package(Boost REQUIRED COMPONENTS unit_test_framework)

  # Tests are discovered by glob; CONFIGURE_DEPENDS re-checks the glob at build
  # time so newly added test files are picked up.
  file(GLOB_RECURSE TEST_SOURCES CONFIGURE_DEPENDS
    tests/*.cpp
    tests/*.cc
    tests/*.cxx
  )

  if(TEST_SOURCES)
    add_executable(biergarten-pipeline-tests ${TEST_SOURCES})

    # Use the same header directory as the main executable ("includes"); the
    # previous "include" path did not match it.
    target_include_directories(biergarten-pipeline-tests
      PRIVATE
        "${CMAKE_CURRENT_SOURCE_DIR}/includes"
    )

    # NOTE(review): nlohmann_json::nlohmann_json is not located by any
    # find_package/FetchContent call visible in this file -- confirm it is
    # provided elsewhere, otherwise generation fails when tests are present.
    target_link_libraries(biergarten-pipeline-tests
      PRIVATE
        Boost::unit_test_framework
        CURL::libcurl
        nlohmann_json::nlohmann_json
    )

    add_test(
      NAME biergarten-pipeline-tests
      COMMAND biergarten-pipeline-tests
    )
  endif()
endif()