Pipeline: add CURL/WebClient & Wikipedia service

Introduce a pluggable web client interface and concrete CURL implementation: adds IWebClient, CURLWebClient, and CurlGlobalState (headers + curl_web_client.cpp). DataDownloader now accepts an IWebClient and delegates downloads. Add WikipediaService for cached Wikipedia summary lookups. Refactor SqliteDatabase to return full City records and update consumers accordingly. Improve JsonLoader to use batched transactions during streaming parses. Enhance LlamaGenerator with sampling options, increased token limits, JSON extraction/validation, and other parsing helpers. Modernize CMake: set policy/version, add project_options, simplify FetchContent usage (spdlog), require Boost components (program_options/json), list pipeline sources explicitly, and tweak post-build/memcheck targets. Update README to match implementation changes and new CLI/config conventions.
This commit is contained in:
Aaron Po
2026-04-02 16:29:16 -04:00
parent ac136f7179
commit 98083ab40c
16 changed files with 1125 additions and 794 deletions

View File

@@ -0,0 +1,26 @@
#pragma once
#include "web_client.h"
#include <memory>
/// @brief RAII guard for curl_global_init / curl_global_cleanup.
///
/// Construct one instance in main() before any curl operation is performed
/// and keep it alive for the lifetime of the application.
class CurlGlobalState {
public:
  CurlGlobalState();
  ~CurlGlobalState();

  // Copying is disabled for this guard type.
  CurlGlobalState &operator=(const CurlGlobalState &) = delete;
  CurlGlobalState(const CurlGlobalState &) = delete;
};
/// @brief Concrete IWebClient implementation.
///
/// NOTE(review): judging by the class name and the neighbouring
/// CurlGlobalState guard this is backed by libcurl; the implementation is not
/// visible from this header, so confirm against the accompanying source file.
class CURLWebClient : public IWebClient {
public:
  CURLWebClient();
  ~CURLWebClient() override;

  /// @brief Overrides IWebClient::Get for @p url.
  std::string Get(const std::string &url) override;

  /// @brief Overrides IWebClient::UrlEncode for @p value.
  std::string UrlEncode(const std::string &value) override;

  /// @brief Overrides IWebClient::DownloadToFile for @p url and @p filePath.
  void DownloadToFile(const std::string &url,
                      const std::string &filePath) override;
};

View File

@@ -1,14 +1,17 @@
#ifndef DATA_DOWNLOADER_H
#define DATA_DOWNLOADER_H
#include <memory>
#include <stdexcept>
#include <string>
#include "web_client.h"
/// @brief Downloads and caches source geography JSON payloads.
class DataDownloader {
public:
/// @brief Initializes global curl state used by this downloader.
DataDownloader();
DataDownloader(std::shared_ptr<IWebClient> webClient);
/// @brief Cleans up global curl state.
~DataDownloader();
@@ -21,6 +24,7 @@ public:
private:
bool FileExists(const std::string &filePath) const;
std::shared_ptr<IWebClient> m_webClient;
};
#endif // DATA_DOWNLOADER_H

View File

@@ -27,6 +27,15 @@ struct State {
int countryId;
};
/// @brief City record: identifier, display name and parent country id.
///
/// The integer members carry default initializers so that a
/// default-initialized City never exposes indeterminate values. The type
/// remains an aggregate (C++14 and later), so brace initialization in member
/// order `{id, name, countryId}` keeps working.
struct City {
  /// @brief City identifier from the source dataset.
  int id = 0;
  /// @brief City display name.
  std::string name;
  /// @brief Parent country identifier.
  int countryId = 0;
};
/// @brief Thread-safe SQLite wrapper for pipeline writes and readbacks.
class SqliteDatabase {
private:
@@ -60,8 +69,8 @@ public:
void InsertCity(int id, int stateId, int countryId, const std::string &name,
double latitude, double longitude);
/// @brief Returns city id and city name pairs.
std::vector<std::pair<int, std::string>> QueryCities();
/// @brief Returns city records including parent country id.
std::vector<City> QueryCities();
/// @brief Returns countries with optional row limit.
std::vector<Country> QueryCountries(int limit = 0);

View File

@@ -1,16 +1,20 @@
#pragma once
#include "data_generator.h"
#include <memory>
#include <cstdint>
#include <string>
#include "data_generator.h"
struct llama_model;
struct llama_context;
class LlamaGenerator final : public IDataGenerator {
public:
LlamaGenerator() = default;
~LlamaGenerator() override;
void setSamplingOptions(float temperature, float topP, int seed = -1);
void load(const std::string &modelPath) override;
BreweryResult generateBrewery(const std::string &cityName,
const std::string &countryName,
@@ -18,14 +22,17 @@ public:
UserResult generateUser(const std::string &locale) override;
private:
std::string infer(const std::string &prompt, int maxTokens = 5000);
std::string infer(const std::string &prompt, int maxTokens = 10000);
// Overload that allows passing a system message separately so chat-capable
// models receive a proper system role instead of having the system text
// concatenated into the user prompt (helps avoid revealing internal
// reasoning or instructions in model output).
std::string infer(const std::string &systemPrompt, const std::string &prompt,
int maxTokens = 5000);
int maxTokens = 10000);
llama_model *model_ = nullptr;
llama_context *context_ = nullptr;
float sampling_temperature_ = 0.8f;
float sampling_top_p_ = 0.92f;
uint32_t sampling_seed_ = 0xFFFFFFFFu;
};

View File

@@ -0,0 +1,19 @@
#pragma once
#include <string>
/// @brief Abstract web client interface.
class IWebClient {
public:
  virtual ~IWebClient() = default;

  /// @brief Downloads the content at @p url into the file at @p filePath.
  /// @throws On error.
  virtual void DownloadToFile(const std::string &url,
                              const std::string &filePath) = 0;

  /// @brief Performs a GET request against @p url.
  /// @return The response body as a string.
  /// @throws On error.
  virtual std::string Get(const std::string &url) = 0;

  /// @brief URL-encodes @p value.
  /// @return The encoded string.
  virtual std::string UrlEncode(const std::string &value) = 0;
};

View File

@@ -0,0 +1,24 @@
#pragma once
#include <memory>
#include <string>
#include <string_view>
#include <unordered_map>
#include "web_client.h"
/// @brief Cached Wikipedia summary lookups for (city, country) pairs.
class WikipediaService {
public:
  /// @brief Builds the service around the supplied web client.
  /// @param client Shared web client handle stored by the service.
  explicit WikipediaService(std::shared_ptr<IWebClient> client);

  /// @brief Looks up the Wikipedia summary extract for a city and country.
  /// @param city    City name.
  /// @param country Country name.
  /// @return The summary extract text.
  [[nodiscard]] std::string
  GetSummary(std::string_view city, std::string_view country);

private:
  /// @brief Helper taking a single query string.
  /// NOTE(review): presumably performs the actual fetch through client_;
  /// the definition is not visible from this header.
  std::string FetchExtract(std::string_view query);

  /// Web client handed to the constructor.
  std::shared_ptr<IWebClient> client_;
  /// String-to-string cache (presumably lookup key to extract; confirm in
  /// the implementation).
  std::unordered_map<std::string, std::string> cache_;
};