format codebase

This commit is contained in:
Aaron Po
2026-04-02 21:46:46 -04:00
parent ba165d8aa7
commit 3af053f0eb
31 changed files with 1479 additions and 1445 deletions

View File

@@ -9,22 +9,23 @@
/// @brief Downloads and caches source geography JSON payloads.
///
/// Declares a shared WebClient member (web_client_); the constructor takes a
/// std::shared_ptr<WebClient>. DownloadCountriesDatabase() defaults `commit`
/// to "c5eb7772".
/// NOTE(review): the constructor/destructor comments below refer to global
/// curl state, yet the constructor only receives a WebClient here — confirm
/// those comments still match the implementation.
class DataDownloader {
public:
/// @brief Initializes global curl state used by this downloader.
explicit DataDownloader(std::shared_ptr<WebClient> web_client);
public:
/// @brief Initializes global curl state used by this downloader.
explicit DataDownloader(std::shared_ptr<WebClient> web_client);
/// @brief Cleans up global curl state.
~DataDownloader();
/// @brief Cleans up global curl state.
~DataDownloader();
/// @brief Returns a local JSON path, downloading it when cache is missing.
std::string DownloadCountriesDatabase(
const std::string &cache_path,
const std::string &commit = "c5eb7772" // Stable commit: 2026-03-28 export
);
/// @brief Returns a local JSON path, downloading it when cache is missing.
std::string DownloadCountriesDatabase(
const std::string& cache_path,
const std::string& commit =
"c5eb7772" // Stable commit: 2026-03-28 export
);
private:
static bool FileExists(const std::string &file_path);
std::shared_ptr<WebClient> web_client_;
private:
static bool FileExists(const std::string& file_path);
std::shared_ptr<WebClient> web_client_;
};
#endif // BIERGARTEN_PIPELINE_DATA_GENERATION_DATA_DOWNLOADER_H_

View File

@@ -4,26 +4,26 @@
#include <string>
/// @brief Name/description pair; the return type of
/// DataGenerator::GenerateBrewery().
struct BreweryResult {
std::string name;
std::string description;
std::string name;
std::string description;
};
/// @brief Username/bio pair; the return type of DataGenerator::GenerateUser().
struct UserResult {
std::string username;
std::string bio;
std::string username;
std::string bio;
};
/// @brief Abstract interface for producing brewery and user records.
///
/// Every operation is pure virtual and the destructor is virtual, so
/// implementations can be destroyed through a DataGenerator pointer.
/// - Load(model_path): receives a model path; what loading means is left to
///   the implementation.
/// - GenerateBrewery(city_name, country_name, region_context): returns a
///   BreweryResult (name + description).
/// - GenerateUser(locale): returns a UserResult (username + bio).
class DataGenerator {
public:
virtual ~DataGenerator() = default;
public:
virtual ~DataGenerator() = default;
virtual void Load(const std::string &model_path) = 0;
virtual void Load(const std::string& model_path) = 0;
virtual BreweryResult GenerateBrewery(const std::string &city_name,
const std::string &country_name,
const std::string &region_context) = 0;
virtual BreweryResult GenerateBrewery(const std::string& city_name,
const std::string& country_name,
const std::string& region_context) = 0;
virtual UserResult GenerateUser(const std::string &locale) = 0;
virtual UserResult GenerateUser(const std::string& locale) = 0;
};
#endif // BIERGARTEN_PIPELINE_DATA_GENERATION_DATA_GENERATOR_H_

View File

@@ -10,32 +10,32 @@ struct llama_model;
struct llama_context;
/// @brief DataGenerator implementation that holds a llama_model and a
/// llama_context (both initially nullptr).
///
/// Member defaults: sampling temperature 0.8, top_p 0.92, seed 0xFFFFFFFF.
/// Both Infer() overloads default max_tokens to 10000.
/// NOTE(review): SetSamplingOptions() takes `int seed = -1` while the stored
/// seed is a uint32_t initialised to 0xFFFFFFFFu — presumably -1 selects a
/// random/default seed; confirm in the implementation.
/// NOTE(review): model_ and context_ are raw pointers and the class declares a
/// destructor but no copy/move operations. If the destructor releases them,
/// copying a LlamaGenerator would release them twice — confirm against the
/// .cpp and consider deleting copy construction/assignment.
class LlamaGenerator final : public DataGenerator {
public:
LlamaGenerator() = default;
~LlamaGenerator() override;
public:
LlamaGenerator() = default;
~LlamaGenerator() override;
void SetSamplingOptions(float temperature, float top_p, int seed = -1);
void SetSamplingOptions(float temperature, float top_p, int seed = -1);
void Load(const std::string &model_path) override;
BreweryResult GenerateBrewery(const std::string &city_name,
const std::string &country_name,
const std::string &region_context) override;
UserResult GenerateUser(const std::string &locale) override;
void Load(const std::string& model_path) override;
BreweryResult GenerateBrewery(const std::string& city_name,
const std::string& country_name,
const std::string& region_context) override;
UserResult GenerateUser(const std::string& locale) override;
private:
std::string Infer(const std::string &prompt, int max_tokens = 10000);
// Overload that allows passing a system message separately so chat-capable
// models receive a proper system role instead of having the system text
// concatenated into the user prompt (helps avoid revealing internal
// reasoning or instructions in model output).
std::string Infer(const std::string &system_prompt, const std::string &prompt,
int max_tokens = 10000);
private:
std::string Infer(const std::string& prompt, int max_tokens = 10000);
// Overload that allows passing a system message separately so chat-capable
// models receive a proper system role instead of having the system text
// concatenated into the user prompt (helps avoid revealing internal
// reasoning or instructions in model output).
std::string Infer(const std::string& system_prompt,
const std::string& prompt, int max_tokens = 10000);
llama_model *model_ = nullptr;
llama_context *context_ = nullptr;
float sampling_temperature_ = 0.8f;
float sampling_top_p_ = 0.92f;
uint32_t sampling_seed_ = 0xFFFFFFFFu;
llama_model* model_ = nullptr;
llama_context* context_ = nullptr;
float sampling_temperature_ = 0.8f;
float sampling_top_p_ = 0.92f;
uint32_t sampling_seed_ = 0xFFFFFFFFu;
};
#endif // BIERGARTEN_PIPELINE_DATA_GENERATION_LLAMA_GENERATOR_H_

View File

@@ -12,18 +12,17 @@ typedef int llama_token;
// Free-function declarations, all suffixed "Public". Only the signatures are
// visible here, so the notes below are limited to what the signatures show.
// - PrepareRegionContextPublic: takes a string_view and returns a std::string;
//   max_chars defaults to 700 (presumably an upper bound on the result length
//   — confirm against the definition).
// - ParseTwoLineResponsePublic: returns a pair of strings; error_message is
//   presumably used when `raw` cannot be parsed — confirm.
// - ToChatPromptPublic: two overloads, without and with a system_prompt.
// - AppendTokenPiecePublic: takes `output` by non-const reference (presumably
//   appended to — confirm).
std::string PrepareRegionContextPublic(std::string_view region_context,
std::size_t max_chars = 700);
std::pair<std::string, std::string>
ParseTwoLineResponsePublic(const std::string& raw,
const std::string& error_message);
std::pair<std::string, std::string> ParseTwoLineResponsePublic(
const std::string& raw, const std::string& error_message);
std::string ToChatPromptPublic(const llama_model *model,
std::string ToChatPromptPublic(const llama_model* model,
const std::string& user_prompt);
std::string ToChatPromptPublic(const llama_model *model,
std::string ToChatPromptPublic(const llama_model* model,
const std::string& system_prompt,
const std::string& user_prompt);
void AppendTokenPiecePublic(const llama_vocab *vocab, llama_token token,
void AppendTokenPiecePublic(const llama_vocab* vocab, llama_token token,
std::string& output);
std::string ValidateBreweryJsonPublic(const std::string& raw,

View File

@@ -1,27 +1,28 @@
#ifndef BIERGARTEN_PIPELINE_DATA_GENERATION_MOCK_GENERATOR_H_
#define BIERGARTEN_PIPELINE_DATA_GENERATION_MOCK_GENERATOR_H_
#include "data_generation/data_generator.h"
#include <string>
#include <vector>
#include "data_generation/data_generator.h"
/// @brief DataGenerator implementation that declares static string tables
/// instead of holding a model.
///
/// Declares five static std::vector<std::string> tables (brewery adjectives,
/// brewery nouns, brewery descriptions, usernames, bios) and a static
/// DeterministicHash(a, b) helper returning std::size_t.
/// NOTE(review): presumably the hash indexes into the tables so the output is
/// reproducible for a given input — confirm in the .cpp.
class MockGenerator final : public DataGenerator {
public:
void Load(const std::string &model_path) override;
BreweryResult GenerateBrewery(const std::string &city_name,
const std::string &country_name,
const std::string &region_context) override;
UserResult GenerateUser(const std::string &locale) override;
public:
void Load(const std::string& model_path) override;
BreweryResult GenerateBrewery(const std::string& city_name,
const std::string& country_name,
const std::string& region_context) override;
UserResult GenerateUser(const std::string& locale) override;
private:
static std::size_t DeterministicHash(const std::string &a,
const std::string &b);
private:
static std::size_t DeterministicHash(const std::string& a,
const std::string& b);
static const std::vector<std::string> kBreweryAdjectives;
static const std::vector<std::string> kBreweryNouns;
static const std::vector<std::string> kBreweryDescriptions;
static const std::vector<std::string> kUsernames;
static const std::vector<std::string> kBios;
static const std::vector<std::string> kBreweryAdjectives;
static const std::vector<std::string> kBreweryNouns;
static const std::vector<std::string> kBreweryDescriptions;
static const std::vector<std::string> kUsernames;
static const std::vector<std::string> kBios;
};
#endif // BIERGARTEN_PIPELINE_DATA_GENERATION_MOCK_GENERATOR_H_

View File

@@ -1,83 +1,84 @@
#ifndef BIERGARTEN_PIPELINE_DATABASE_DATABASE_H_
#define BIERGARTEN_PIPELINE_DATABASE_DATABASE_H_
#include <mutex>
#include <sqlite3.h>
#include <mutex>
#include <string>
#include <vector>
/// @brief Country record; element type of the vector returned by
/// SqliteDatabase::QueryCountries().
struct Country {
/// @brief Country identifier from the source dataset.
int id;
/// @brief Country display name.
std::string name;
/// @brief ISO 3166-1 alpha-2 code.
std::string iso2;
/// @brief ISO 3166-1 alpha-3 code.
std::string iso3;
/// @brief Country identifier from the source dataset.
int id;
/// @brief Country display name.
std::string name;
/// @brief ISO 3166-1 alpha-2 code.
std::string iso2;
/// @brief ISO 3166-1 alpha-3 code.
std::string iso3;
};
/// @brief State/province record; element type of the vector returned by
/// SqliteDatabase::QueryStates().
struct State {
/// @brief State or province identifier from the source dataset.
int id;
/// @brief State or province display name.
std::string name;
/// @brief State or province short code.
std::string iso2;
/// @brief Parent country identifier.
int country_id;
/// @brief State or province identifier from the source dataset.
int id;
/// @brief State or province display name.
std::string name;
/// @brief State or province short code.
std::string iso2;
/// @brief Parent country identifier.
int country_id;
};
/// @brief City record; element type of the vector returned by
/// SqliteDatabase::QueryCities(). Carries no state id or coordinates.
struct City {
/// @brief City identifier from the source dataset.
int id;
/// @brief City display name.
std::string name;
/// @brief Parent country identifier.
int country_id;
/// @brief City identifier from the source dataset.
int id;
/// @brief City display name.
std::string name;
/// @brief Parent country identifier.
int country_id;
};
/// @brief Thread-safe SQLite wrapper for pipeline writes and readbacks.
///
/// Holds one sqlite3 handle (db_, initially nullptr) and a std::mutex
/// (db_mutex_). Initialize() defaults db_path to ":memory:".
/// QueryCountries() and QueryStates() take a `limit` that defaults to 0.
/// NOTE(review): presumably limit == 0 means "no limit" — confirm in the
/// implementation.
/// NOTE(review): InsertCity() accepts state_id, latitude and longitude, but
/// the City struct returned by QueryCities() has no such fields — confirm this
/// asymmetry is intended.
class SqliteDatabase {
private:
sqlite3 *db_ = nullptr;
std::mutex db_mutex_;
private:
sqlite3* db_ = nullptr;
std::mutex db_mutex_;
void InitializeSchema();
void InitializeSchema();
public:
/// @brief Closes the SQLite connection if initialized.
~SqliteDatabase();
public:
/// @brief Closes the SQLite connection if initialized.
~SqliteDatabase();
/// @brief Opens the SQLite database at db_path and creates schema objects.
void Initialize(const std::string &db_path = ":memory:");
/// @brief Opens the SQLite database at db_path and creates schema objects.
void Initialize(const std::string& db_path = ":memory:");
/// @brief Starts a database transaction for batched writes.
void BeginTransaction();
/// @brief Starts a database transaction for batched writes.
void BeginTransaction();
/// @brief Commits the active database transaction.
void CommitTransaction();
/// @brief Commits the active database transaction.
void CommitTransaction();
/// @brief Inserts a country row.
void InsertCountry(int id, const std::string &name, const std::string &iso2,
const std::string &iso3);
/// @brief Inserts a country row.
void InsertCountry(int id, const std::string& name, const std::string& iso2,
const std::string& iso3);
/// @brief Inserts a state row linked to a country.
void InsertState(int id, int country_id, const std::string &name,
const std::string &iso2);
/// @brief Inserts a state row linked to a country.
void InsertState(int id, int country_id, const std::string& name,
const std::string& iso2);
/// @brief Inserts a city row linked to state and country.
void InsertCity(int id, int state_id, int country_id, const std::string &name,
double latitude, double longitude);
/// @brief Inserts a city row linked to state and country.
void InsertCity(int id, int state_id, int country_id,
const std::string& name, double latitude, double longitude);
/// @brief Returns city records including parent country id.
std::vector<City> QueryCities();
/// @brief Returns city records including parent country id.
std::vector<City> QueryCities();
/// @brief Returns countries with optional row limit.
std::vector<Country> QueryCountries(int limit = 0);
/// @brief Returns countries with optional row limit.
std::vector<Country> QueryCountries(int limit = 0);
/// @brief Returns states with optional row limit.
std::vector<State> QueryStates(int limit = 0);
/// @brief Returns states with optional row limit.
std::vector<State> QueryStates(int limit = 0);
};
#endif // BIERGARTEN_PIPELINE_DATABASE_DATABASE_H_

View File

@@ -1,15 +1,17 @@
#ifndef BIERGARTEN_PIPELINE_JSON_HANDLING_JSON_LOADER_H_
#define BIERGARTEN_PIPELINE_JSON_HANDLING_JSON_LOADER_H_
#include <string>
#include "database/database.h"
#include "json_handling/stream_parser.h"
#include <string>
/// @brief Loads world-city JSON data into SQLite through streaming parsing.
///
/// Exposes a single static entry point and declares no instance state.
class JsonLoader {
public:
/// @brief Parses a JSON file and writes country/state/city rows into db.
static void LoadWorldCities(const std::string &json_path, SqliteDatabase &db);
public:
/// @brief Parses a JSON file and writes country/state/city rows into db.
static void LoadWorldCities(const std::string& json_path,
SqliteDatabase& db);
};
#endif // BIERGARTEN_PIPELINE_JSON_HANDLING_JSON_LOADER_H_

View File

@@ -1,51 +1,52 @@
#ifndef BIERGARTEN_PIPELINE_JSON_HANDLING_STREAM_PARSER_H_
#define BIERGARTEN_PIPELINE_JSON_HANDLING_STREAM_PARSER_H_
#include "database/database.h"
#include <functional>
#include <string>
#include "database/database.h"
// Forward declaration to avoid circular dependency
// NOTE(review): this header also includes "database/database.h", so the
// forward declaration looks redundant — confirm which of the two is intended
// and drop the other.
class SqliteDatabase;
/// @brief In-memory representation of one parsed city entry.
///
/// Carries the same six values that SqliteDatabase::InsertCity() accepts
/// (id, state_id, country_id, name, latitude, longitude). The fields have no
/// default member initializers; StreamingJsonParser::ParseState
/// value-initializes its instance with `= {}`.
struct CityRecord {
int id;
int state_id;
int country_id;
std::string name;
double latitude;
double longitude;
int id;
int state_id;
int country_id;
std::string name;
double latitude;
double longitude;
};
/// @brief Streaming SAX parser that emits city records during traversal.
///
/// Parse() is static. on_city receives a const CityRecord&; on_progress
/// defaults to nullptr (an empty std::function) and receives two size_t
/// values.
/// NOTE(review): the two on_progress arguments are presumably bytes processed
/// and total bytes (ParseState tracks bytes_processed) — confirm in the
/// implementation.
/// ParseState is a private nested struct holding the current country/state
/// ids, the city under construction, the current key, array/object depth
/// counters, three "inside array" flags, and copies of both callbacks.
class StreamingJsonParser {
public:
/// @brief Parses file_path and invokes callbacks for city rows and progress.
static void Parse(const std::string &file_path, SqliteDatabase &db,
std::function<void(const CityRecord &)> on_city,
std::function<void(size_t, size_t)> on_progress = nullptr);
public:
/// @brief Parses file_path and invokes callbacks for city rows and progress.
static void Parse(const std::string& file_path, SqliteDatabase& db,
std::function<void(const CityRecord&)> on_city,
std::function<void(size_t, size_t)> on_progress = nullptr);
private:
/// @brief Mutable SAX handler state while traversing nested JSON arrays.
struct ParseState {
int current_country_id = 0;
int current_state_id = 0;
private:
/// @brief Mutable SAX handler state while traversing nested JSON arrays.
struct ParseState {
int current_country_id = 0;
int current_state_id = 0;
CityRecord current_city = {};
bool building_city = false;
std::string current_key;
CityRecord current_city = {};
bool building_city = false;
std::string current_key;
int array_depth = 0;
int object_depth = 0;
bool in_countries_array = false;
bool in_states_array = false;
bool in_cities_array = false;
int array_depth = 0;
int object_depth = 0;
bool in_countries_array = false;
bool in_states_array = false;
bool in_cities_array = false;
std::function<void(const CityRecord &)> on_city;
std::function<void(size_t, size_t)> on_progress;
size_t bytes_processed = 0;
};
std::function<void(const CityRecord&)> on_city;
std::function<void(size_t, size_t)> on_progress;
size_t bytes_processed = 0;
};
};
#endif // BIERGARTEN_PIPELINE_JSON_HANDLING_STREAM_PARSER_H_

View File

@@ -1,29 +1,30 @@
#ifndef BIERGARTEN_PIPELINE_WEB_CLIENT_CURL_WEB_CLIENT_H_
#define BIERGARTEN_PIPELINE_WEB_CLIENT_CURL_WEB_CLIENT_H_
#include "web_client/web_client.h"
#include <memory>
#include "web_client/web_client.h"
// RAII for curl_global_init/cleanup.
// An instance of this class should be created in main() before any curl
// operations and exist for the lifetime of the application.
// Copy construction and copy assignment are deleted, so an instance cannot be
// copied.
class CurlGlobalState {
public:
CurlGlobalState();
~CurlGlobalState();
CurlGlobalState(const CurlGlobalState &) = delete;
CurlGlobalState &operator=(const CurlGlobalState &) = delete;
public:
CurlGlobalState();
~CurlGlobalState();
CurlGlobalState(const CurlGlobalState&) = delete;
CurlGlobalState& operator=(const CurlGlobalState&) = delete;
};
// WebClient implementation that overrides DownloadToFile, Get and UrlEncode.
// No data members are declared in this header.
// NOTE(review): presumably backed by libcurl and dependent on a live
// CurlGlobalState (declared in this same header) — confirm in the .cpp.
class CURLWebClient : public WebClient {
public:
CURLWebClient();
~CURLWebClient() override;
public:
CURLWebClient();
~CURLWebClient() override;
void DownloadToFile(const std::string &url,
const std::string &file_path) override;
std::string Get(const std::string &url) override;
std::string UrlEncode(const std::string &value) override;
void DownloadToFile(const std::string& url,
const std::string& file_path) override;
std::string Get(const std::string& url) override;
std::string UrlEncode(const std::string& value) override;
};
#endif // BIERGARTEN_PIPELINE_WEB_CLIENT_CURL_WEB_CLIENT_H_

View File

@@ -4,19 +4,19 @@
#include <string>
// Abstract web client interface. All three operations are pure virtual and
// the destructor is virtual, so implementations can be owned and destroyed
// through a WebClient pointer.
class WebClient {
public:
virtual ~WebClient() = default;
public:
virtual ~WebClient() = default;
// Downloads content from a URL to a file. Throws on error.
virtual void DownloadToFile(const std::string &url,
const std::string &file_path) = 0;
// Downloads content from a URL to a file. Throws on error.
virtual void DownloadToFile(const std::string& url,
const std::string& file_path) = 0;
// Performs a GET request and returns the response body as a string. Throws on
// error.
virtual std::string Get(const std::string &url) = 0;
// Performs a GET request and returns the response body as a string. Throws
// on error.
virtual std::string Get(const std::string& url) = 0;
// URL-encodes a string.
virtual std::string UrlEncode(const std::string &value) = 0;
// URL-encodes a string.
virtual std::string UrlEncode(const std::string& value) = 0;
};
#endif // BIERGARTEN_PIPELINE_WEB_CLIENT_WEB_CLIENT_H_

View File

@@ -10,18 +10,18 @@
/// @brief Provides cached Wikipedia summary lookups for city and country pairs.
///
/// Holds a shared WebClient (client_) and a string-to-string unordered_map
/// (cache_). GetSummary() is [[nodiscard]].
/// NOTE(review): no mutex or other synchronization member is declared next to
/// cache_ — presumably the service is meant for single-threaded use; confirm
/// before sharing an instance across threads.
class WikipediaService {
public:
/// @brief Creates a new Wikipedia service with the provided web client.
explicit WikipediaService(std::shared_ptr<WebClient> client);
public:
/// @brief Creates a new Wikipedia service with the provided web client.
explicit WikipediaService(std::shared_ptr<WebClient> client);
/// @brief Returns the Wikipedia summary extract for city and country.
[[nodiscard]] std::string GetSummary(std::string_view city,
std::string_view country);
/// @brief Returns the Wikipedia summary extract for city and country.
[[nodiscard]] std::string GetSummary(std::string_view city,
std::string_view country);
private:
std::string FetchExtract(std::string_view query);
std::shared_ptr<WebClient> client_;
std::unordered_map<std::string, std::string> cache_;
private:
std::string FetchExtract(std::string_view query);
std::shared_ptr<WebClient> client_;
std::unordered_map<std::string, std::string> cache_;
};
#endif // BIERGARTEN_PIPELINE_WIKIPEDIA_WIKIPEDIA_SERVICE_H_