mirror of
https://github.com/aaronpo97/the-biergarten-app.git
synced 2026-04-05 10:09:03 +00:00
Refactor web client interface and related components
This commit is contained in:
@@ -1,4 +1,5 @@
|
|||||||
#pragma once
|
#ifndef BIERGARTEN_PIPELINE_BIERGARTEN_DATA_GENERATOR_H_
|
||||||
|
#define BIERGARTEN_PIPELINE_BIERGARTEN_DATA_GENERATOR_H_
|
||||||
|
|
||||||
#include <memory>
|
#include <memory>
|
||||||
#include <string>
|
#include <string>
|
||||||
@@ -15,20 +16,20 @@
|
|||||||
* @brief Program options for the Biergarten pipeline application.
|
* @brief Program options for the Biergarten pipeline application.
|
||||||
*/
|
*/
|
||||||
struct ApplicationOptions {
|
struct ApplicationOptions {
|
||||||
/// @brief Path to the LLM model file (gguf format); mutually exclusive with useMocked.
|
/// @brief Path to the LLM model file (gguf format); mutually exclusive with use_mocked.
|
||||||
std::string modelPath;
|
std::string model_path;
|
||||||
|
|
||||||
/// @brief Use mocked generator instead of LLM; mutually exclusive with modelPath.
|
/// @brief Use mocked generator instead of LLM; mutually exclusive with model_path.
|
||||||
bool useMocked = false;
|
bool use_mocked = false;
|
||||||
|
|
||||||
/// @brief Directory for cached JSON and database files.
|
/// @brief Directory for cached JSON and database files.
|
||||||
std::string cacheDir;
|
std::string cache_dir;
|
||||||
|
|
||||||
/// @brief LLM sampling temperature (0.0 to 1.0, higher = more random).
|
/// @brief LLM sampling temperature (0.0 to 1.0, higher = more random).
|
||||||
float temperature = 0.8f;
|
float temperature = 0.8f;
|
||||||
|
|
||||||
/// @brief LLM nucleus sampling top-p parameter (0.0 to 1.0, higher = more random).
|
/// @brief LLM nucleus sampling top-p parameter (0.0 to 1.0, higher = more random).
|
||||||
float topP = 0.92f;
|
float top_p = 0.92f;
|
||||||
|
|
||||||
/// @brief Random seed for sampling (-1 for random, otherwise non-negative).
|
/// @brief Random seed for sampling (-1 for random, otherwise non-negative).
|
||||||
int seed = -1;
|
int seed = -1;
|
||||||
@@ -37,6 +38,8 @@ struct ApplicationOptions {
|
|||||||
std::string commit = "c5eb7772";
|
std::string commit = "c5eb7772";
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#endif // BIERGARTEN_PIPELINE_BIERGARTEN_DATA_GENERATOR_H_
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Main data generator class for the Biergarten pipeline.
|
* @brief Main data generator class for the Biergarten pipeline.
|
||||||
@@ -50,11 +53,11 @@ public:
|
|||||||
* @brief Construct a BiergartenDataGenerator with injected dependencies.
|
* @brief Construct a BiergartenDataGenerator with injected dependencies.
|
||||||
*
|
*
|
||||||
* @param options Application configuration options.
|
* @param options Application configuration options.
|
||||||
* @param webClient HTTP client for downloading data.
|
* @param web_client HTTP client for downloading data.
|
||||||
* @param database SQLite database instance.
|
* @param database SQLite database instance.
|
||||||
*/
|
*/
|
||||||
BiergartenDataGenerator(const ApplicationOptions &options,
|
BiergartenDataGenerator(const ApplicationOptions &options,
|
||||||
std::shared_ptr<IWebClient> webClient,
|
std::shared_ptr<WebClient> web_client,
|
||||||
SqliteDatabase &database);
|
SqliteDatabase &database);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -75,7 +78,7 @@ private:
|
|||||||
const ApplicationOptions options_;
|
const ApplicationOptions options_;
|
||||||
|
|
||||||
/// @brief Shared HTTP client dependency.
|
/// @brief Shared HTTP client dependency.
|
||||||
std::shared_ptr<IWebClient> webClient_;
|
std::shared_ptr<WebClient> webClient_;
|
||||||
|
|
||||||
/// @brief Database dependency.
|
/// @brief Database dependency.
|
||||||
SqliteDatabase &database_;
|
SqliteDatabase &database_;
|
||||||
@@ -87,7 +90,7 @@ private:
|
|||||||
*
|
*
|
||||||
* @return A unique_ptr to the initialized generator.
|
* @return A unique_ptr to the initialized generator.
|
||||||
*/
|
*/
|
||||||
std::unique_ptr<IDataGenerator> InitializeGenerator();
|
std::unique_ptr<DataGenerator> InitializeGenerator();
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Download and load geographic data if not cached.
|
* @brief Download and load geographic data if not cached.
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
#ifndef DATA_DOWNLOADER_H
|
#ifndef BIERGARTEN_PIPELINE_DATA_GENERATION_DATA_DOWNLOADER_H_
|
||||||
#define DATA_DOWNLOADER_H
|
#define BIERGARTEN_PIPELINE_DATA_GENERATION_DATA_DOWNLOADER_H_
|
||||||
|
|
||||||
#include <memory>
|
#include <memory>
|
||||||
#include <stdexcept>
|
#include <stdexcept>
|
||||||
@@ -11,20 +11,20 @@
|
|||||||
class DataDownloader {
|
class DataDownloader {
|
||||||
public:
|
public:
|
||||||
/// @brief Initializes global curl state used by this downloader.
|
/// @brief Initializes global curl state used by this downloader.
|
||||||
explicit DataDownloader(std::shared_ptr<IWebClient> webClient);
|
explicit DataDownloader(std::shared_ptr<WebClient> web_client);
|
||||||
|
|
||||||
/// @brief Cleans up global curl state.
|
/// @brief Cleans up global curl state.
|
||||||
~DataDownloader();
|
~DataDownloader();
|
||||||
|
|
||||||
/// @brief Returns a local JSON path, downloading it when cache is missing.
|
/// @brief Returns a local JSON path, downloading it when cache is missing.
|
||||||
std::string DownloadCountriesDatabase(
|
std::string DownloadCountriesDatabase(
|
||||||
const std::string &cachePath,
|
const std::string &cache_path,
|
||||||
const std::string &commit = "c5eb7772" // Stable commit: 2026-03-28 export
|
const std::string &commit = "c5eb7772" // Stable commit: 2026-03-28 export
|
||||||
);
|
);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
static bool FileExists(const std::string &filePath) ;
|
static bool FileExists(const std::string &file_path);
|
||||||
std::shared_ptr<IWebClient> m_webClient;
|
std::shared_ptr<WebClient> web_client_;
|
||||||
};
|
};
|
||||||
|
|
||||||
#endif // DATA_DOWNLOADER_H
|
#endif // BIERGARTEN_PIPELINE_DATA_GENERATION_DATA_DOWNLOADER_H_
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
#pragma once
|
#ifndef BIERGARTEN_PIPELINE_DATA_GENERATION_DATA_GENERATOR_H_
|
||||||
|
#define BIERGARTEN_PIPELINE_DATA_GENERATION_DATA_GENERATOR_H_
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
@@ -12,15 +13,17 @@ struct UserResult {
|
|||||||
std::string bio;
|
std::string bio;
|
||||||
};
|
};
|
||||||
|
|
||||||
class IDataGenerator {
|
class DataGenerator {
|
||||||
public:
|
public:
|
||||||
virtual ~IDataGenerator() = default;
|
virtual ~DataGenerator() = default;
|
||||||
|
|
||||||
virtual void load(const std::string &modelPath) = 0;
|
virtual void Load(const std::string &model_path) = 0;
|
||||||
|
|
||||||
virtual BreweryResult generateBrewery(const std::string &cityName,
|
virtual BreweryResult GenerateBrewery(const std::string &city_name,
|
||||||
const std::string &countryName,
|
const std::string &country_name,
|
||||||
const std::string &regionContext) = 0;
|
const std::string &region_context) = 0;
|
||||||
|
|
||||||
virtual UserResult generateUser(const std::string &locale) = 0;
|
virtual UserResult GenerateUser(const std::string &locale) = 0;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#endif // BIERGARTEN_PIPELINE_DATA_GENERATION_DATA_GENERATOR_H_
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
#pragma once
|
#ifndef BIERGARTEN_PIPELINE_DATA_GENERATION_LLAMA_GENERATOR_H_
|
||||||
|
#define BIERGARTEN_PIPELINE_DATA_GENERATION_LLAMA_GENERATOR_H_
|
||||||
|
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
#include <string>
|
#include <string>
|
||||||
@@ -8,27 +9,27 @@
|
|||||||
struct llama_model;
|
struct llama_model;
|
||||||
struct llama_context;
|
struct llama_context;
|
||||||
|
|
||||||
class LlamaGenerator final : public IDataGenerator {
|
class LlamaGenerator final : public DataGenerator {
|
||||||
public:
|
public:
|
||||||
LlamaGenerator() = default;
|
LlamaGenerator() = default;
|
||||||
~LlamaGenerator() override;
|
~LlamaGenerator() override;
|
||||||
|
|
||||||
void setSamplingOptions(float temperature, float topP, int seed = -1);
|
void SetSamplingOptions(float temperature, float top_p, int seed = -1);
|
||||||
|
|
||||||
void load(const std::string &modelPath) override;
|
void Load(const std::string &model_path) override;
|
||||||
BreweryResult generateBrewery(const std::string &cityName,
|
BreweryResult GenerateBrewery(const std::string &city_name,
|
||||||
const std::string &countryName,
|
const std::string &country_name,
|
||||||
const std::string &regionContext) override;
|
const std::string &region_context) override;
|
||||||
UserResult generateUser(const std::string &locale) override;
|
UserResult GenerateUser(const std::string &locale) override;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
std::string infer(const std::string &prompt, int maxTokens = 10000);
|
std::string Infer(const std::string &prompt, int max_tokens = 10000);
|
||||||
// Overload that allows passing a system message separately so chat-capable
|
// Overload that allows passing a system message separately so chat-capable
|
||||||
// models receive a proper system role instead of having the system text
|
// models receive a proper system role instead of having the system text
|
||||||
// concatenated into the user prompt (helps avoid revealing internal
|
// concatenated into the user prompt (helps avoid revealing internal
|
||||||
// reasoning or instructions in model output).
|
// reasoning or instructions in model output).
|
||||||
std::string infer(const std::string &systemPrompt, const std::string &prompt,
|
std::string Infer(const std::string &system_prompt, const std::string &prompt,
|
||||||
int maxTokens = 10000);
|
int max_tokens = 10000);
|
||||||
|
|
||||||
llama_model *model_ = nullptr;
|
llama_model *model_ = nullptr;
|
||||||
llama_context *context_ = nullptr;
|
llama_context *context_ = nullptr;
|
||||||
@@ -36,3 +37,5 @@ private:
|
|||||||
float sampling_top_p_ = 0.92f;
|
float sampling_top_p_ = 0.92f;
|
||||||
uint32_t sampling_seed_ = 0xFFFFFFFFu;
|
uint32_t sampling_seed_ = 0xFFFFFFFFu;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#endif // BIERGARTEN_PIPELINE_DATA_GENERATION_LLAMA_GENERATOR_H_
|
||||||
|
|||||||
@@ -1,19 +1,20 @@
|
|||||||
#pragma once
|
#ifndef BIERGARTEN_PIPELINE_DATA_GENERATION_MOCK_GENERATOR_H_
|
||||||
|
#define BIERGARTEN_PIPELINE_DATA_GENERATION_MOCK_GENERATOR_H_
|
||||||
|
|
||||||
#include "data_generation/data_generator.h"
|
#include "data_generation/data_generator.h"
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
class MockGenerator final : public IDataGenerator {
|
class MockGenerator final : public DataGenerator {
|
||||||
public:
|
public:
|
||||||
void load(const std::string &modelPath) override;
|
void Load(const std::string &model_path) override;
|
||||||
BreweryResult generateBrewery(const std::string &cityName,
|
BreweryResult GenerateBrewery(const std::string &city_name,
|
||||||
const std::string &countryName,
|
const std::string &country_name,
|
||||||
const std::string &regionContext) override;
|
const std::string &region_context) override;
|
||||||
UserResult generateUser(const std::string &locale) override;
|
UserResult GenerateUser(const std::string &locale) override;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
static std::size_t deterministicHash(const std::string &a,
|
static std::size_t DeterministicHash(const std::string &a,
|
||||||
const std::string &b);
|
const std::string &b);
|
||||||
|
|
||||||
static const std::vector<std::string> kBreweryAdjectives;
|
static const std::vector<std::string> kBreweryAdjectives;
|
||||||
@@ -22,3 +23,5 @@ private:
|
|||||||
static const std::vector<std::string> kUsernames;
|
static const std::vector<std::string> kUsernames;
|
||||||
static const std::vector<std::string> kBios;
|
static const std::vector<std::string> kBios;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#endif // BIERGARTEN_PIPELINE_DATA_GENERATION_MOCK_GENERATOR_H_
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
#pragma once
|
#ifndef BIERGARTEN_PIPELINE_DATABASE_DATABASE_H_
|
||||||
|
#define BIERGARTEN_PIPELINE_DATABASE_DATABASE_H_
|
||||||
|
|
||||||
#include <mutex>
|
#include <mutex>
|
||||||
#include <sqlite3.h>
|
#include <sqlite3.h>
|
||||||
@@ -24,7 +25,7 @@ struct State {
|
|||||||
/// @brief State or province short code.
|
/// @brief State or province short code.
|
||||||
std::string iso2;
|
std::string iso2;
|
||||||
/// @brief Parent country identifier.
|
/// @brief Parent country identifier.
|
||||||
int countryId;
|
int country_id;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct City {
|
struct City {
|
||||||
@@ -33,14 +34,14 @@ struct City {
|
|||||||
/// @brief City display name.
|
/// @brief City display name.
|
||||||
std::string name;
|
std::string name;
|
||||||
/// @brief Parent country identifier.
|
/// @brief Parent country identifier.
|
||||||
int countryId;
|
int country_id;
|
||||||
};
|
};
|
||||||
|
|
||||||
/// @brief Thread-safe SQLite wrapper for pipeline writes and readbacks.
|
/// @brief Thread-safe SQLite wrapper for pipeline writes and readbacks.
|
||||||
class SqliteDatabase {
|
class SqliteDatabase {
|
||||||
private:
|
private:
|
||||||
sqlite3 *db = nullptr;
|
sqlite3 *db_ = nullptr;
|
||||||
std::mutex dbMutex;
|
std::mutex db_mutex_;
|
||||||
|
|
||||||
void InitializeSchema();
|
void InitializeSchema();
|
||||||
|
|
||||||
@@ -48,8 +49,8 @@ public:
|
|||||||
/// @brief Closes the SQLite connection if initialized.
|
/// @brief Closes the SQLite connection if initialized.
|
||||||
~SqliteDatabase();
|
~SqliteDatabase();
|
||||||
|
|
||||||
/// @brief Opens the SQLite database at dbPath and creates schema objects.
|
/// @brief Opens the SQLite database at db_path and creates schema objects.
|
||||||
void Initialize(const std::string &dbPath = ":memory:");
|
void Initialize(const std::string &db_path = ":memory:");
|
||||||
|
|
||||||
/// @brief Starts a database transaction for batched writes.
|
/// @brief Starts a database transaction for batched writes.
|
||||||
void BeginTransaction();
|
void BeginTransaction();
|
||||||
@@ -62,11 +63,11 @@ public:
|
|||||||
const std::string &iso3);
|
const std::string &iso3);
|
||||||
|
|
||||||
/// @brief Inserts a state row linked to a country.
|
/// @brief Inserts a state row linked to a country.
|
||||||
void InsertState(int id, int countryId, const std::string &name,
|
void InsertState(int id, int country_id, const std::string &name,
|
||||||
const std::string &iso2);
|
const std::string &iso2);
|
||||||
|
|
||||||
/// @brief Inserts a city row linked to state and country.
|
/// @brief Inserts a city row linked to state and country.
|
||||||
void InsertCity(int id, int stateId, int countryId, const std::string &name,
|
void InsertCity(int id, int state_id, int country_id, const std::string &name,
|
||||||
double latitude, double longitude);
|
double latitude, double longitude);
|
||||||
|
|
||||||
/// @brief Returns city records including parent country id.
|
/// @brief Returns city records including parent country id.
|
||||||
@@ -78,3 +79,5 @@ public:
|
|||||||
/// @brief Returns states with optional row limit.
|
/// @brief Returns states with optional row limit.
|
||||||
std::vector<State> QueryStates(int limit = 0);
|
std::vector<State> QueryStates(int limit = 0);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#endif // BIERGARTEN_PIPELINE_DATABASE_DATABASE_H_
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
#pragma once
|
#ifndef BIERGARTEN_PIPELINE_JSON_HANDLING_JSON_LOADER_H_
|
||||||
|
#define BIERGARTEN_PIPELINE_JSON_HANDLING_JSON_LOADER_H_
|
||||||
|
|
||||||
#include "database/database.h"
|
#include "database/database.h"
|
||||||
#include "json_handling/stream_parser.h"
|
#include "json_handling/stream_parser.h"
|
||||||
@@ -8,5 +9,7 @@
|
|||||||
class JsonLoader {
|
class JsonLoader {
|
||||||
public:
|
public:
|
||||||
/// @brief Parses a JSON file and writes country/state/city rows into db.
|
/// @brief Parses a JSON file and writes country/state/city rows into db.
|
||||||
static void LoadWorldCities(const std::string &jsonPath, SqliteDatabase &db);
|
static void LoadWorldCities(const std::string &json_path, SqliteDatabase &db);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#endif // BIERGARTEN_PIPELINE_JSON_HANDLING_JSON_LOADER_H_
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
#pragma once
|
#ifndef BIERGARTEN_PIPELINE_JSON_HANDLING_STREAM_PARSER_H_
|
||||||
|
#define BIERGARTEN_PIPELINE_JSON_HANDLING_STREAM_PARSER_H_
|
||||||
|
|
||||||
#include "database/database.h"
|
#include "database/database.h"
|
||||||
#include <functional>
|
#include <functional>
|
||||||
@@ -20,10 +21,10 @@ struct CityRecord {
|
|||||||
/// @brief Streaming SAX parser that emits city records during traversal.
|
/// @brief Streaming SAX parser that emits city records during traversal.
|
||||||
class StreamingJsonParser {
|
class StreamingJsonParser {
|
||||||
public:
|
public:
|
||||||
/// @brief Parses filePath and invokes callbacks for city rows and progress.
|
/// @brief Parses file_path and invokes callbacks for city rows and progress.
|
||||||
static void Parse(const std::string &filePath, SqliteDatabase &db,
|
static void Parse(const std::string &file_path, SqliteDatabase &db,
|
||||||
std::function<void(const CityRecord &)> onCity,
|
std::function<void(const CityRecord &)> on_city,
|
||||||
std::function<void(size_t, size_t)> onProgress = nullptr);
|
std::function<void(size_t, size_t)> on_progress = nullptr);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
/// @brief Mutable SAX handler state while traversing nested JSON arrays.
|
/// @brief Mutable SAX handler state while traversing nested JSON arrays.
|
||||||
@@ -46,3 +47,5 @@ private:
|
|||||||
size_t bytes_processed = 0;
|
size_t bytes_processed = 0;
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#endif // BIERGARTEN_PIPELINE_JSON_HANDLING_STREAM_PARSER_H_
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
#pragma once
|
#ifndef BIERGARTEN_PIPELINE_WEB_CLIENT_CURL_WEB_CLIENT_H_
|
||||||
|
#define BIERGARTEN_PIPELINE_WEB_CLIENT_CURL_WEB_CLIENT_H_
|
||||||
|
|
||||||
#include "web_client/web_client.h"
|
#include "web_client/web_client.h"
|
||||||
#include <memory>
|
#include <memory>
|
||||||
@@ -14,13 +15,15 @@ public:
|
|||||||
CurlGlobalState &operator=(const CurlGlobalState &) = delete;
|
CurlGlobalState &operator=(const CurlGlobalState &) = delete;
|
||||||
};
|
};
|
||||||
|
|
||||||
class CURLWebClient : public IWebClient {
|
class CURLWebClient : public WebClient {
|
||||||
public:
|
public:
|
||||||
CURLWebClient();
|
CURLWebClient();
|
||||||
~CURLWebClient() override;
|
~CURLWebClient() override;
|
||||||
|
|
||||||
void DownloadToFile(const std::string &url,
|
void DownloadToFile(const std::string &url,
|
||||||
const std::string &filePath) override;
|
const std::string &file_path) override;
|
||||||
std::string Get(const std::string &url) override;
|
std::string Get(const std::string &url) override;
|
||||||
std::string UrlEncode(const std::string &value) override;
|
std::string UrlEncode(const std::string &value) override;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#endif // BIERGARTEN_PIPELINE_WEB_CLIENT_CURL_WEB_CLIENT_H_
|
||||||
|
|||||||
@@ -1,14 +1,15 @@
|
|||||||
#pragma once
|
#ifndef BIERGARTEN_PIPELINE_WEB_CLIENT_WEB_CLIENT_H_
|
||||||
|
#define BIERGARTEN_PIPELINE_WEB_CLIENT_WEB_CLIENT_H_
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
class IWebClient {
|
class WebClient {
|
||||||
public:
|
public:
|
||||||
virtual ~IWebClient() = default;
|
virtual ~WebClient() = default;
|
||||||
|
|
||||||
// Downloads content from a URL to a file. Throws on error.
|
// Downloads content from a URL to a file. Throws on error.
|
||||||
virtual void DownloadToFile(const std::string &url,
|
virtual void DownloadToFile(const std::string &url,
|
||||||
const std::string &filePath) = 0;
|
const std::string &file_path) = 0;
|
||||||
|
|
||||||
// Performs a GET request and returns the response body as a string. Throws on
|
// Performs a GET request and returns the response body as a string. Throws on
|
||||||
// error.
|
// error.
|
||||||
@@ -17,3 +18,5 @@ public:
|
|||||||
// URL-encodes a string.
|
// URL-encodes a string.
|
||||||
virtual std::string UrlEncode(const std::string &value) = 0;
|
virtual std::string UrlEncode(const std::string &value) = 0;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#endif // BIERGARTEN_PIPELINE_WEB_CLIENT_WEB_CLIENT_H_
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
#pragma once
|
#ifndef BIERGARTEN_PIPELINE_WIKIPEDIA_WIKIPEDIA_SERVICE_H_
|
||||||
|
#define BIERGARTEN_PIPELINE_WIKIPEDIA_WIKIPEDIA_SERVICE_H_
|
||||||
|
|
||||||
#include <memory>
|
#include <memory>
|
||||||
#include <string>
|
#include <string>
|
||||||
@@ -11,7 +12,7 @@
|
|||||||
class WikipediaService {
|
class WikipediaService {
|
||||||
public:
|
public:
|
||||||
/// @brief Creates a new Wikipedia service with the provided web client.
|
/// @brief Creates a new Wikipedia service with the provided web client.
|
||||||
explicit WikipediaService(std::shared_ptr<IWebClient> client);
|
explicit WikipediaService(std::shared_ptr<WebClient> client);
|
||||||
|
|
||||||
/// @brief Returns the Wikipedia summary extract for city and country.
|
/// @brief Returns the Wikipedia summary extract for city and country.
|
||||||
[[nodiscard]] std::string GetSummary(std::string_view city,
|
[[nodiscard]] std::string GetSummary(std::string_view city,
|
||||||
@@ -19,6 +20,8 @@ public:
|
|||||||
|
|
||||||
private:
|
private:
|
||||||
std::string FetchExtract(std::string_view query);
|
std::string FetchExtract(std::string_view query);
|
||||||
std::shared_ptr<IWebClient> client_;
|
std::shared_ptr<WebClient> client_;
|
||||||
std::unordered_map<std::string, std::string> cache_;
|
std::unordered_map<std::string, std::string> cache_;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#endif // BIERGARTEN_PIPELINE_WIKIPEDIA_WIKIPEDIA_SERVICE_H_
|
||||||
|
|||||||
@@ -14,57 +14,57 @@
|
|||||||
|
|
||||||
BiergartenDataGenerator::BiergartenDataGenerator(
|
BiergartenDataGenerator::BiergartenDataGenerator(
|
||||||
const ApplicationOptions &options,
|
const ApplicationOptions &options,
|
||||||
std::shared_ptr<IWebClient> webClient,
|
std::shared_ptr<WebClient> web_client,
|
||||||
SqliteDatabase &database)
|
SqliteDatabase &database)
|
||||||
: options_(options), webClient_(webClient), database_(database) {}
|
: options_(options), webClient_(web_client), database_(database) {}
|
||||||
|
|
||||||
std::unique_ptr<IDataGenerator> BiergartenDataGenerator::InitializeGenerator() {
|
std::unique_ptr<DataGenerator> BiergartenDataGenerator::InitializeGenerator() {
|
||||||
spdlog::info("Initializing brewery generator...");
|
spdlog::info("Initializing brewery generator...");
|
||||||
|
|
||||||
std::unique_ptr<IDataGenerator> generator;
|
std::unique_ptr<DataGenerator> generator;
|
||||||
if (options_.modelPath.empty()) {
|
if (options_.model_path.empty()) {
|
||||||
generator = std::make_unique<MockGenerator>();
|
generator = std::make_unique<MockGenerator>();
|
||||||
spdlog::info("[Generator] Using MockGenerator (no model path provided)");
|
spdlog::info("[Generator] Using MockGenerator (no model path provided)");
|
||||||
} else {
|
} else {
|
||||||
auto llamaGenerator = std::make_unique<LlamaGenerator>();
|
auto llama_generator = std::make_unique<LlamaGenerator>();
|
||||||
llamaGenerator->setSamplingOptions(options_.temperature, options_.topP,
|
llama_generator->SetSamplingOptions(options_.temperature, options_.top_p,
|
||||||
options_.seed);
|
options_.seed);
|
||||||
spdlog::info(
|
spdlog::info(
|
||||||
"[Generator] Using LlamaGenerator: {} (temperature={}, top-p={}, "
|
"[Generator] Using LlamaGenerator: {} (temperature={}, top-p={}, "
|
||||||
"seed={})",
|
"seed={})",
|
||||||
options_.modelPath, options_.temperature, options_.topP,
|
options_.model_path, options_.temperature, options_.top_p,
|
||||||
options_.seed);
|
options_.seed);
|
||||||
generator = std::move(llamaGenerator);
|
generator = std::move(llama_generator);
|
||||||
}
|
}
|
||||||
generator->load(options_.modelPath);
|
generator->Load(options_.model_path);
|
||||||
|
|
||||||
return generator;
|
return generator;
|
||||||
}
|
}
|
||||||
|
|
||||||
void BiergartenDataGenerator::LoadGeographicData() {
|
void BiergartenDataGenerator::LoadGeographicData() {
|
||||||
std::string jsonPath = options_.cacheDir + "/countries+states+cities.json";
|
std::string json_path = options_.cache_dir + "/countries+states+cities.json";
|
||||||
std::string dbPath = options_.cacheDir + "/biergarten-pipeline.db";
|
std::string db_path = options_.cache_dir + "/biergarten-pipeline.db";
|
||||||
|
|
||||||
bool hasJsonCache = std::filesystem::exists(jsonPath);
|
bool has_json_cache = std::filesystem::exists(json_path);
|
||||||
bool hasDbCache = std::filesystem::exists(dbPath);
|
bool has_db_cache = std::filesystem::exists(db_path);
|
||||||
|
|
||||||
spdlog::info("Initializing SQLite database at {}...", dbPath);
|
spdlog::info("Initializing SQLite database at {}...", db_path);
|
||||||
database_.Initialize(dbPath);
|
database_.Initialize(db_path);
|
||||||
|
|
||||||
if (hasDbCache && hasJsonCache) {
|
if (has_db_cache && has_json_cache) {
|
||||||
spdlog::info("[Pipeline] Cache hit: skipping download and parse");
|
spdlog::info("[Pipeline] Cache hit: skipping download and parse");
|
||||||
} else {
|
} else {
|
||||||
spdlog::info("\n[Pipeline] Downloading geographic data from GitHub...");
|
spdlog::info("\n[Pipeline] Downloading geographic data from GitHub...");
|
||||||
DataDownloader downloader(webClient_);
|
DataDownloader downloader(webClient_);
|
||||||
downloader.DownloadCountriesDatabase(jsonPath, options_.commit);
|
downloader.DownloadCountriesDatabase(json_path, options_.commit);
|
||||||
|
|
||||||
JsonLoader::LoadWorldCities(jsonPath, database_);
|
JsonLoader::LoadWorldCities(json_path, database_);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void BiergartenDataGenerator::GenerateSampleBreweries() {
|
void BiergartenDataGenerator::GenerateSampleBreweries() {
|
||||||
auto generator = InitializeGenerator();
|
auto generator = InitializeGenerator();
|
||||||
WikipediaService wikipediaService(webClient_);
|
WikipediaService wikipedia_service(webClient_);
|
||||||
|
|
||||||
spdlog::info("\n=== GEOGRAPHIC DATA OVERVIEW ===");
|
spdlog::info("\n=== GEOGRAPHIC DATA OVERVIEW ===");
|
||||||
|
|
||||||
@@ -73,10 +73,10 @@ void BiergartenDataGenerator::GenerateSampleBreweries() {
|
|||||||
auto cities = database_.QueryCities();
|
auto cities = database_.QueryCities();
|
||||||
|
|
||||||
// Build a quick map of country id -> name for per-city lookups.
|
// Build a quick map of country id -> name for per-city lookups.
|
||||||
auto allCountries = database_.QueryCountries(0);
|
auto all_countries = database_.QueryCountries(0);
|
||||||
std::unordered_map<int, std::string> countryMap;
|
std::unordered_map<int, std::string> country_map;
|
||||||
for (const auto &c : allCountries)
|
for (const auto &c : all_countries)
|
||||||
countryMap[c.id] = c.name;
|
country_map[c.id] = c.name;
|
||||||
|
|
||||||
spdlog::info("\nTotal records loaded:");
|
spdlog::info("\nTotal records loaded:");
|
||||||
spdlog::info(" Countries: {}", database_.QueryCountries(0).size());
|
spdlog::info(" Countries: {}", database_.QueryCountries(0).size());
|
||||||
@@ -84,28 +84,28 @@ void BiergartenDataGenerator::GenerateSampleBreweries() {
|
|||||||
spdlog::info(" Cities: {}", cities.size());
|
spdlog::info(" Cities: {}", cities.size());
|
||||||
|
|
||||||
generatedBreweries_.clear();
|
generatedBreweries_.clear();
|
||||||
const size_t sampleCount = std::min(size_t(30), cities.size());
|
const size_t sample_count = std::min(size_t(30), cities.size());
|
||||||
|
|
||||||
spdlog::info("\n=== SAMPLE BREWERY GENERATION ===");
|
spdlog::info("\n=== SAMPLE BREWERY GENERATION ===");
|
||||||
for (size_t i = 0; i < sampleCount; i++) {
|
for (size_t i = 0; i < sample_count; i++) {
|
||||||
const auto &city = cities[i];
|
const auto &city = cities[i];
|
||||||
const int cityId = city.id;
|
const int city_id = city.id;
|
||||||
const std::string cityName = city.name;
|
const std::string city_name = city.name;
|
||||||
|
|
||||||
std::string localCountry;
|
std::string local_country;
|
||||||
const auto countryIt = countryMap.find(city.countryId);
|
const auto country_it = country_map.find(city.country_id);
|
||||||
if (countryIt != countryMap.end()) {
|
if (country_it != country_map.end()) {
|
||||||
localCountry = countryIt->second;
|
local_country = country_it->second;
|
||||||
}
|
}
|
||||||
|
|
||||||
const std::string regionContext =
|
const std::string region_context =
|
||||||
wikipediaService.GetSummary(cityName, localCountry);
|
wikipedia_service.GetSummary(city_name, local_country);
|
||||||
spdlog::debug("[Pipeline] Region context for {}: {}", cityName,
|
spdlog::debug("[Pipeline] Region context for {}: {}", city_name,
|
||||||
regionContext);
|
region_context);
|
||||||
|
|
||||||
auto brewery =
|
auto brewery =
|
||||||
generator->generateBrewery(cityName, localCountry, regionContext);
|
generator->GenerateBrewery(city_name, local_country, region_context);
|
||||||
generatedBreweries_.push_back({cityId, cityName, brewery});
|
generatedBreweries_.push_back({city_id, city_name, brewery});
|
||||||
}
|
}
|
||||||
|
|
||||||
spdlog::info("\n=== GENERATED DATA DUMP ===");
|
spdlog::info("\n=== GENERATED DATA DUMP ===");
|
||||||
|
|||||||
@@ -6,41 +6,41 @@
|
|||||||
#include <sstream>
|
#include <sstream>
|
||||||
#include <stdexcept>
|
#include <stdexcept>
|
||||||
|
|
||||||
DataDownloader::DataDownloader(std::shared_ptr<IWebClient> webClient)
|
DataDownloader::DataDownloader(std::shared_ptr<WebClient> web_client)
|
||||||
: m_webClient(std::move(webClient)) {}
|
: web_client_(std::move(web_client)) {}
|
||||||
|
|
||||||
DataDownloader::~DataDownloader() {}
|
DataDownloader::~DataDownloader() {}
|
||||||
|
|
||||||
bool DataDownloader::FileExists(const std::string &filePath) {
|
bool DataDownloader::FileExists(const std::string &file_path) {
|
||||||
return std::filesystem::exists(filePath);
|
return std::filesystem::exists(file_path);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string
|
std::string
|
||||||
DataDownloader::DownloadCountriesDatabase(const std::string &cachePath,
|
DataDownloader::DownloadCountriesDatabase(const std::string &cache_path,
|
||||||
const std::string &commit) {
|
const std::string &commit) {
|
||||||
if (FileExists(cachePath)) {
|
if (FileExists(cache_path)) {
|
||||||
spdlog::info("[DataDownloader] Cache hit: {}", cachePath);
|
spdlog::info("[DataDownloader] Cache hit: {}", cache_path);
|
||||||
return cachePath;
|
return cache_path;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string shortCommit = commit;
|
std::string short_commit = commit;
|
||||||
if (commit.length() > 7) {
|
if (commit.length() > 7) {
|
||||||
shortCommit = commit.substr(0, 7);
|
short_commit = commit.substr(0, 7);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string url = "https://raw.githubusercontent.com/dr5hn/"
|
std::string url = "https://raw.githubusercontent.com/dr5hn/"
|
||||||
"countries-states-cities-database/" +
|
"countries-states-cities-database/" +
|
||||||
shortCommit + "/json/countries+states+cities.json";
|
short_commit + "/json/countries+states+cities.json";
|
||||||
|
|
||||||
spdlog::info("[DataDownloader] Downloading: {}", url);
|
spdlog::info("[DataDownloader] Downloading: {}", url);
|
||||||
|
|
||||||
m_webClient->DownloadToFile(url, cachePath);
|
web_client_->DownloadToFile(url, cache_path);
|
||||||
|
|
||||||
std::ifstream fileCheck(cachePath, std::ios::binary | std::ios::ate);
|
std::ifstream file_check(cache_path, std::ios::binary | std::ios::ate);
|
||||||
std::streamsize size = fileCheck.tellg();
|
std::streamsize size = file_check.tellg();
|
||||||
fileCheck.close();
|
file_check.close();
|
||||||
|
|
||||||
spdlog::info("[DataDownloader] OK: Download complete: {} ({:.2f} MB)",
|
spdlog::info("[DataDownloader] OK: Download complete: {} ({:.2f} MB)",
|
||||||
cachePath, (size / (1024.0 * 1024.0)));
|
cache_path, (size / (1024.0 * 1024.0)));
|
||||||
return cachePath;
|
return cache_path;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -180,14 +180,14 @@ std::string toChatPrompt(const llama_model *model,
|
|||||||
}
|
}
|
||||||
|
|
||||||
std::string toChatPrompt(const llama_model *model,
|
std::string toChatPrompt(const llama_model *model,
|
||||||
const std::string &systemPrompt,
|
const std::string &system_prompt,
|
||||||
const std::string &userPrompt) {
|
const std::string &userPrompt) {
|
||||||
const char *tmpl = llama_model_chat_template(model, nullptr);
|
const char *tmpl = llama_model_chat_template(model, nullptr);
|
||||||
if (tmpl == nullptr) {
|
if (tmpl == nullptr) {
|
||||||
return systemPrompt + "\n\n" + userPrompt;
|
return system_prompt + "\n\n" + userPrompt;
|
||||||
}
|
}
|
||||||
|
|
||||||
const llama_chat_message messages[2] = {{"system", systemPrompt.c_str()},
|
const llama_chat_message messages[2] = {{"system", system_prompt.c_str()},
|
||||||
{"user", userPrompt.c_str()}};
|
{"user", userPrompt.c_str()}};
|
||||||
|
|
||||||
std::vector<char> buffer(std::max<std::size_t>(
|
std::vector<char> buffer(std::max<std::size_t>(
|
||||||
@@ -381,13 +381,13 @@ LlamaGenerator::~LlamaGenerator() {
|
|||||||
llama_backend_free();
|
llama_backend_free();
|
||||||
}
|
}
|
||||||
|
|
||||||
void LlamaGenerator::setSamplingOptions(float temperature, float topP,
|
void LlamaGenerator::SetSamplingOptions(float temperature, float top_p,
|
||||||
int seed) {
|
int seed) {
|
||||||
if (temperature < 0.0f) {
|
if (temperature < 0.0f) {
|
||||||
throw std::runtime_error(
|
throw std::runtime_error(
|
||||||
"LlamaGenerator: sampling temperature must be >= 0");
|
"LlamaGenerator: sampling temperature must be >= 0");
|
||||||
}
|
}
|
||||||
if (!(topP > 0.0f && topP <= 1.0f)) {
|
if (!(top_p > 0.0f && top_p <= 1.0f)) {
|
||||||
throw std::runtime_error(
|
throw std::runtime_error(
|
||||||
"LlamaGenerator: sampling top-p must be in (0, 1]");
|
"LlamaGenerator: sampling top-p must be in (0, 1]");
|
||||||
}
|
}
|
||||||
@@ -397,13 +397,13 @@ void LlamaGenerator::setSamplingOptions(float temperature, float topP,
|
|||||||
}
|
}
|
||||||
|
|
||||||
sampling_temperature_ = temperature;
|
sampling_temperature_ = temperature;
|
||||||
sampling_top_p_ = topP;
|
sampling_top_p_ = top_p;
|
||||||
sampling_seed_ = (seed < 0) ? static_cast<uint32_t>(LLAMA_DEFAULT_SEED)
|
sampling_seed_ = (seed < 0) ? static_cast<uint32_t>(LLAMA_DEFAULT_SEED)
|
||||||
: static_cast<uint32_t>(seed);
|
: static_cast<uint32_t>(seed);
|
||||||
}
|
}
|
||||||
|
|
||||||
void LlamaGenerator::load(const std::string &modelPath) {
|
void LlamaGenerator::Load(const std::string &model_path) {
|
||||||
if (modelPath.empty())
|
if (model_path.empty())
|
||||||
throw std::runtime_error("LlamaGenerator: model path must not be empty");
|
throw std::runtime_error("LlamaGenerator: model path must not be empty");
|
||||||
|
|
||||||
if (context_ != nullptr) {
|
if (context_ != nullptr) {
|
||||||
@@ -417,27 +417,27 @@ void LlamaGenerator::load(const std::string &modelPath) {
|
|||||||
|
|
||||||
llama_backend_init();
|
llama_backend_init();
|
||||||
|
|
||||||
llama_model_params modelParams = llama_model_default_params();
|
llama_model_params model_params = llama_model_default_params();
|
||||||
model_ = llama_model_load_from_file(modelPath.c_str(), modelParams);
|
model_ = llama_model_load_from_file(model_path.c_str(), model_params);
|
||||||
if (model_ == nullptr) {
|
if (model_ == nullptr) {
|
||||||
throw std::runtime_error(
|
throw std::runtime_error(
|
||||||
"LlamaGenerator: failed to load model from path: " + modelPath);
|
"LlamaGenerator: failed to load model from path: " + model_path);
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_context_params contextParams = llama_context_default_params();
|
llama_context_params context_params = llama_context_default_params();
|
||||||
contextParams.n_ctx = 2048;
|
context_params.n_ctx = 2048;
|
||||||
|
|
||||||
context_ = llama_init_from_model(model_, contextParams);
|
context_ = llama_init_from_model(model_, context_params);
|
||||||
if (context_ == nullptr) {
|
if (context_ == nullptr) {
|
||||||
llama_model_free(model_);
|
llama_model_free(model_);
|
||||||
model_ = nullptr;
|
model_ = nullptr;
|
||||||
throw std::runtime_error("LlamaGenerator: failed to create context");
|
throw std::runtime_error("LlamaGenerator: failed to create context");
|
||||||
}
|
}
|
||||||
|
|
||||||
spdlog::info("[LlamaGenerator] Loaded model: {}", modelPath);
|
spdlog::info("[LlamaGenerator] Loaded model: {}", model_path);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string LlamaGenerator::infer(const std::string &prompt, int maxTokens) {
|
std::string LlamaGenerator::Infer(const std::string &prompt, int max_tokens) {
|
||||||
if (model_ == nullptr || context_ == nullptr)
|
if (model_ == nullptr || context_ == nullptr)
|
||||||
throw std::runtime_error("LlamaGenerator: model not loaded");
|
throw std::runtime_error("LlamaGenerator: model not loaded");
|
||||||
|
|
||||||
@@ -447,19 +447,19 @@ std::string LlamaGenerator::infer(const std::string &prompt, int maxTokens) {
|
|||||||
|
|
||||||
llama_memory_clear(llama_get_memory(context_), true);
|
llama_memory_clear(llama_get_memory(context_), true);
|
||||||
|
|
||||||
const std::string formattedPrompt = toChatPrompt(model_, prompt);
|
const std::string formatted_prompt = toChatPrompt(model_, prompt);
|
||||||
|
|
||||||
std::vector<llama_token> promptTokens(formattedPrompt.size() + 8);
|
std::vector<llama_token> promptTokens(formatted_prompt.size() + 8);
|
||||||
int32_t tokenCount = llama_tokenize(
|
int32_t tokenCount = llama_tokenize(
|
||||||
vocab, formattedPrompt.c_str(),
|
vocab, formatted_prompt.c_str(),
|
||||||
static_cast<int32_t>(formattedPrompt.size()), promptTokens.data(),
|
static_cast<int32_t>(formatted_prompt.size()), promptTokens.data(),
|
||||||
static_cast<int32_t>(promptTokens.size()), true, true);
|
static_cast<int32_t>(promptTokens.size()), true, true);
|
||||||
|
|
||||||
if (tokenCount < 0) {
|
if (tokenCount < 0) {
|
||||||
promptTokens.resize(static_cast<std::size_t>(-tokenCount));
|
promptTokens.resize(static_cast<std::size_t>(-tokenCount));
|
||||||
tokenCount = llama_tokenize(
|
tokenCount = llama_tokenize(
|
||||||
vocab, formattedPrompt.c_str(),
|
vocab, formatted_prompt.c_str(),
|
||||||
static_cast<int32_t>(formattedPrompt.size()), promptTokens.data(),
|
static_cast<int32_t>(formatted_prompt.size()), promptTokens.data(),
|
||||||
static_cast<int32_t>(promptTokens.size()), true, true);
|
static_cast<int32_t>(promptTokens.size()), true, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -472,18 +472,18 @@ std::string LlamaGenerator::infer(const std::string &prompt, int maxTokens) {
|
|||||||
throw std::runtime_error("LlamaGenerator: invalid context or batch size");
|
throw std::runtime_error("LlamaGenerator: invalid context or batch size");
|
||||||
}
|
}
|
||||||
|
|
||||||
const int32_t effectiveMaxTokens = std::max(1, std::min(maxTokens, nCtx - 1));
|
const int32_t effective_max_tokens = std::max(1, std::min(max_tokens, nCtx - 1));
|
||||||
int32_t promptBudget = std::min(nBatch, nCtx - effectiveMaxTokens);
|
const int32_t prompt_budget = std::min(nBatch, nCtx - effective_max_tokens);
|
||||||
promptBudget = std::max<int32_t>(1, promptBudget);
|
prompt_budget = std::max<int32_t>(1, prompt_budget);
|
||||||
|
|
||||||
promptTokens.resize(static_cast<std::size_t>(tokenCount));
|
promptTokens.resize(static_cast<std::size_t>(tokenCount));
|
||||||
if (tokenCount > promptBudget) {
|
if (tokenCount > prompt_budget) {
|
||||||
spdlog::warn(
|
spdlog::warn(
|
||||||
"LlamaGenerator: prompt too long ({} tokens), truncating to {} tokens "
|
"LlamaGenerator: prompt too long ({} tokens), truncating to {} tokens "
|
||||||
"to fit n_batch/n_ctx limits",
|
"to fit n_batch/n_ctx limits",
|
||||||
tokenCount, promptBudget);
|
tokenCount, prompt_budget);
|
||||||
promptTokens.resize(static_cast<std::size_t>(promptBudget));
|
promptTokens.resize(static_cast<std::size_t>(prompt_budget));
|
||||||
tokenCount = promptBudget;
|
tokenCount = prompt_budget;
|
||||||
}
|
}
|
||||||
|
|
||||||
const llama_batch promptBatch = llama_batch_get_one(
|
const llama_batch promptBatch = llama_batch_get_one(
|
||||||
@@ -491,11 +491,11 @@ std::string LlamaGenerator::infer(const std::string &prompt, int maxTokens) {
|
|||||||
if (llama_decode(context_, promptBatch) != 0)
|
if (llama_decode(context_, promptBatch) != 0)
|
||||||
throw std::runtime_error("LlamaGenerator: prompt decode failed");
|
throw std::runtime_error("LlamaGenerator: prompt decode failed");
|
||||||
|
|
||||||
llama_sampler_chain_params samplerParams =
|
llama_sampler_chain_params sampler_params =
|
||||||
llama_sampler_chain_default_params();
|
llama_sampler_chain_default_params();
|
||||||
using SamplerPtr =
|
using SamplerPtr =
|
||||||
std::unique_ptr<llama_sampler, decltype(&llama_sampler_free)>;
|
std::unique_ptr<llama_sampler, decltype(&llama_sampler_free)>;
|
||||||
SamplerPtr sampler(llama_sampler_chain_init(samplerParams),
|
SamplerPtr sampler(llama_sampler_chain_init(sampler_params),
|
||||||
&llama_sampler_free);
|
&llama_sampler_free);
|
||||||
if (!sampler)
|
if (!sampler)
|
||||||
throw std::runtime_error("LlamaGenerator: failed to initialize sampler");
|
throw std::runtime_error("LlamaGenerator: failed to initialize sampler");
|
||||||
@@ -507,29 +507,29 @@ std::string LlamaGenerator::infer(const std::string &prompt, int maxTokens) {
|
|||||||
llama_sampler_chain_add(sampler.get(),
|
llama_sampler_chain_add(sampler.get(),
|
||||||
llama_sampler_init_dist(sampling_seed_));
|
llama_sampler_init_dist(sampling_seed_));
|
||||||
|
|
||||||
std::vector<llama_token> generatedTokens;
|
std::vector<llama_token> generated_tokens;
|
||||||
generatedTokens.reserve(static_cast<std::size_t>(maxTokens));
|
generated_tokens.reserve(static_cast<std::size_t>(max_tokens));
|
||||||
|
|
||||||
for (int i = 0; i < effectiveMaxTokens; ++i) {
|
for (int i = 0; i < effective_max_tokens; ++i) {
|
||||||
const llama_token next = llama_sampler_sample(sampler.get(), context_, -1);
|
const llama_token next = llama_sampler_sample(sampler.get(), context_, -1);
|
||||||
if (llama_vocab_is_eog(vocab, next))
|
if (llama_vocab_is_eog(vocab, next))
|
||||||
break;
|
break;
|
||||||
generatedTokens.push_back(next);
|
generated_tokens.push_back(next);
|
||||||
llama_token token = next;
|
llama_token token = next;
|
||||||
const llama_batch oneTokenBatch = llama_batch_get_one(&token, 1);
|
const llama_batch one_token_batch = llama_batch_get_one(&token, 1);
|
||||||
if (llama_decode(context_, oneTokenBatch) != 0)
|
if (llama_decode(context_, one_token_batch) != 0)
|
||||||
throw std::runtime_error(
|
throw std::runtime_error(
|
||||||
"LlamaGenerator: decode failed during generation");
|
"LlamaGenerator: decode failed during generation");
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string output;
|
std::string output;
|
||||||
for (const llama_token token : generatedTokens)
|
for (const llama_token token : generated_tokens)
|
||||||
appendTokenPiece(vocab, token, output);
|
appendTokenPiece(vocab, token, output);
|
||||||
return output;
|
return output;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string LlamaGenerator::infer(const std::string &systemPrompt,
|
std::string LlamaGenerator::Infer(const std::string &system_prompt,
|
||||||
const std::string &prompt, int maxTokens) {
|
const std::string &prompt, int max_tokens) {
|
||||||
if (model_ == nullptr || context_ == nullptr)
|
if (model_ == nullptr || context_ == nullptr)
|
||||||
throw std::runtime_error("LlamaGenerator: model not loaded");
|
throw std::runtime_error("LlamaGenerator: model not loaded");
|
||||||
|
|
||||||
@@ -539,20 +539,20 @@ std::string LlamaGenerator::infer(const std::string &systemPrompt,
|
|||||||
|
|
||||||
llama_memory_clear(llama_get_memory(context_), true);
|
llama_memory_clear(llama_get_memory(context_), true);
|
||||||
|
|
||||||
const std::string formattedPrompt =
|
const std::string formatted_prompt =
|
||||||
toChatPrompt(model_, systemPrompt, prompt);
|
toChatPrompt(model_, system_prompt, prompt);
|
||||||
|
|
||||||
std::vector<llama_token> promptTokens(formattedPrompt.size() + 8);
|
std::vector<llama_token> promptTokens(formatted_prompt.size() + 8);
|
||||||
int32_t tokenCount = llama_tokenize(
|
int32_t tokenCount = llama_tokenize(
|
||||||
vocab, formattedPrompt.c_str(),
|
vocab, formatted_prompt.c_str(),
|
||||||
static_cast<int32_t>(formattedPrompt.size()), promptTokens.data(),
|
static_cast<int32_t>(formatted_prompt.size()), promptTokens.data(),
|
||||||
static_cast<int32_t>(promptTokens.size()), true, true);
|
static_cast<int32_t>(promptTokens.size()), true, true);
|
||||||
|
|
||||||
if (tokenCount < 0) {
|
if (tokenCount < 0) {
|
||||||
promptTokens.resize(static_cast<std::size_t>(-tokenCount));
|
promptTokens.resize(static_cast<std::size_t>(-tokenCount));
|
||||||
tokenCount = llama_tokenize(
|
tokenCount = llama_tokenize(
|
||||||
vocab, formattedPrompt.c_str(),
|
vocab, formatted_prompt.c_str(),
|
||||||
static_cast<int32_t>(formattedPrompt.size()), promptTokens.data(),
|
static_cast<int32_t>(formatted_prompt.size()), promptTokens.data(),
|
||||||
static_cast<int32_t>(promptTokens.size()), true, true);
|
static_cast<int32_t>(promptTokens.size()), true, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -565,18 +565,18 @@ std::string LlamaGenerator::infer(const std::string &systemPrompt,
|
|||||||
throw std::runtime_error("LlamaGenerator: invalid context or batch size");
|
throw std::runtime_error("LlamaGenerator: invalid context or batch size");
|
||||||
}
|
}
|
||||||
|
|
||||||
const int32_t effectiveMaxTokens = std::max(1, std::min(maxTokens, nCtx - 1));
|
const int32_t effective_max_tokens = std::max(1, std::min(max_tokens, nCtx - 1));
|
||||||
int32_t promptBudget = std::min(nBatch, nCtx - effectiveMaxTokens);
|
int32_t prompt_budget = std::min(nBatch, nCtx - effective_max_tokens);
|
||||||
promptBudget = std::max<int32_t>(1, promptBudget);
|
prompt_budget = std::max<int32_t>(1, prompt_budget);
|
||||||
|
|
||||||
promptTokens.resize(static_cast<std::size_t>(tokenCount));
|
promptTokens.resize(static_cast<std::size_t>(tokenCount));
|
||||||
if (tokenCount > promptBudget) {
|
if (tokenCount > prompt_budget) {
|
||||||
spdlog::warn(
|
spdlog::warn(
|
||||||
"LlamaGenerator: prompt too long ({} tokens), truncating to {} tokens "
|
"LlamaGenerator: prompt too long ({} tokens), truncating to {} tokens "
|
||||||
"to fit n_batch/n_ctx limits",
|
"to fit n_batch/n_ctx limits",
|
||||||
tokenCount, promptBudget);
|
tokenCount, prompt_budget);
|
||||||
promptTokens.resize(static_cast<std::size_t>(promptBudget));
|
promptTokens.resize(static_cast<std::size_t>(prompt_budget));
|
||||||
tokenCount = promptBudget;
|
tokenCount = prompt_budget;
|
||||||
}
|
}
|
||||||
|
|
||||||
const llama_batch promptBatch = llama_batch_get_one(
|
const llama_batch promptBatch = llama_batch_get_one(
|
||||||
@@ -584,11 +584,11 @@ std::string LlamaGenerator::infer(const std::string &systemPrompt,
|
|||||||
if (llama_decode(context_, promptBatch) != 0)
|
if (llama_decode(context_, promptBatch) != 0)
|
||||||
throw std::runtime_error("LlamaGenerator: prompt decode failed");
|
throw std::runtime_error("LlamaGenerator: prompt decode failed");
|
||||||
|
|
||||||
llama_sampler_chain_params samplerParams =
|
llama_sampler_chain_params sampler_params =
|
||||||
llama_sampler_chain_default_params();
|
llama_sampler_chain_default_params();
|
||||||
using SamplerPtr =
|
using SamplerPtr =
|
||||||
std::unique_ptr<llama_sampler, decltype(&llama_sampler_free)>;
|
std::unique_ptr<llama_sampler, decltype(&llama_sampler_free)>;
|
||||||
SamplerPtr sampler(llama_sampler_chain_init(samplerParams),
|
SamplerPtr sampler(llama_sampler_chain_init(sampler_params),
|
||||||
&llama_sampler_free);
|
&llama_sampler_free);
|
||||||
if (!sampler)
|
if (!sampler)
|
||||||
throw std::runtime_error("LlamaGenerator: failed to initialize sampler");
|
throw std::runtime_error("LlamaGenerator: failed to initialize sampler");
|
||||||
@@ -600,34 +600,34 @@ std::string LlamaGenerator::infer(const std::string &systemPrompt,
|
|||||||
llama_sampler_chain_add(sampler.get(),
|
llama_sampler_chain_add(sampler.get(),
|
||||||
llama_sampler_init_dist(sampling_seed_));
|
llama_sampler_init_dist(sampling_seed_));
|
||||||
|
|
||||||
std::vector<llama_token> generatedTokens;
|
std::vector<llama_token> generated_tokens;
|
||||||
generatedTokens.reserve(static_cast<std::size_t>(maxTokens));
|
generated_tokens.reserve(static_cast<std::size_t>(max_tokens));
|
||||||
|
|
||||||
for (int i = 0; i < effectiveMaxTokens; ++i) {
|
for (int i = 0; i < effective_max_tokens; ++i) {
|
||||||
const llama_token next = llama_sampler_sample(sampler.get(), context_, -1);
|
const llama_token next = llama_sampler_sample(sampler.get(), context_, -1);
|
||||||
if (llama_vocab_is_eog(vocab, next))
|
if (llama_vocab_is_eog(vocab, next))
|
||||||
break;
|
break;
|
||||||
generatedTokens.push_back(next);
|
generated_tokens.push_back(next);
|
||||||
llama_token token = next;
|
llama_token token = next;
|
||||||
const llama_batch oneTokenBatch = llama_batch_get_one(&token, 1);
|
const llama_batch one_token_batch = llama_batch_get_one(&token, 1);
|
||||||
if (llama_decode(context_, oneTokenBatch) != 0)
|
if (llama_decode(context_, one_token_batch) != 0)
|
||||||
throw std::runtime_error(
|
throw std::runtime_error(
|
||||||
"LlamaGenerator: decode failed during generation");
|
"LlamaGenerator: decode failed during generation");
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string output;
|
std::string output;
|
||||||
for (const llama_token token : generatedTokens)
|
for (const llama_token token : generated_tokens)
|
||||||
appendTokenPiece(vocab, token, output);
|
appendTokenPiece(vocab, token, output);
|
||||||
return output;
|
return output;
|
||||||
}
|
}
|
||||||
|
|
||||||
BreweryResult
|
BreweryResult
|
||||||
LlamaGenerator::generateBrewery(const std::string &cityName,
|
LlamaGenerator::GenerateBrewery(const std::string &city_name,
|
||||||
const std::string &countryName,
|
const std::string &country_name,
|
||||||
const std::string &regionContext) {
|
const std::string &region_context) {
|
||||||
const std::string safeRegionContext = PrepareRegionContext(regionContext);
|
const std::string safe_region_context = PrepareRegionContext(region_context);
|
||||||
|
|
||||||
const std::string systemPrompt =
|
const std::string system_prompt =
|
||||||
"You are a copywriter for a craft beer travel guide. "
|
"You are a copywriter for a craft beer travel guide. "
|
||||||
"Your writing is vivid, specific to place, and avoids generic beer "
|
"Your writing is vivid, specific to place, and avoids generic beer "
|
||||||
"cliches. "
|
"cliches. "
|
||||||
@@ -639,18 +639,18 @@ LlamaGenerator::generateBrewery(const std::string &cityName,
|
|||||||
std::string prompt =
|
std::string prompt =
|
||||||
"Write a brewery name and place-specific description for a craft "
|
"Write a brewery name and place-specific description for a craft "
|
||||||
"brewery in " +
|
"brewery in " +
|
||||||
cityName +
|
city_name +
|
||||||
(countryName.empty() ? std::string("")
|
(country_name.empty() ? std::string("")
|
||||||
: std::string(", ") + countryName) +
|
: std::string(", ") + country_name) +
|
||||||
(safeRegionContext.empty()
|
(safe_region_context.empty()
|
||||||
? std::string(".")
|
? std::string(".")
|
||||||
: std::string(". Regional context: ") + safeRegionContext);
|
: std::string(". Regional context: ") + safe_region_context);
|
||||||
|
|
||||||
const int maxAttempts = 3;
|
const int maxAttempts = 3;
|
||||||
std::string raw;
|
std::string raw;
|
||||||
std::string lastError;
|
std::string lastError;
|
||||||
for (int attempt = 0; attempt < maxAttempts; ++attempt) {
|
for (int attempt = 0; attempt < maxAttempts; ++attempt) {
|
||||||
raw = infer(systemPrompt, prompt, 384);
|
raw = Infer(system_prompt, prompt, 384);
|
||||||
spdlog::debug("LlamaGenerator: raw output (attempt {}): {}", attempt + 1,
|
spdlog::debug("LlamaGenerator: raw output (attempt {}): {}", attempt + 1,
|
||||||
raw);
|
raw);
|
||||||
|
|
||||||
@@ -671,12 +671,12 @@ LlamaGenerator::generateBrewery(const std::string &cityName,
|
|||||||
"{\"name\": \"string\", \"description\": \"string\"}."
|
"{\"name\": \"string\", \"description\": \"string\"}."
|
||||||
"\nDo not include markdown, comments, or extra keys."
|
"\nDo not include markdown, comments, or extra keys."
|
||||||
"\n\nLocation: " +
|
"\n\nLocation: " +
|
||||||
cityName +
|
city_name +
|
||||||
(countryName.empty() ? std::string("")
|
(country_name.empty() ? std::string("")
|
||||||
: std::string(", ") + countryName) +
|
: std::string(", ") + country_name) +
|
||||||
(safeRegionContext.empty()
|
(safe_region_context.empty()
|
||||||
? std::string("")
|
? std::string("")
|
||||||
: std::string("\nRegional context: ") + safeRegionContext);
|
: std::string("\nRegional context: ") + safe_region_context);
|
||||||
}
|
}
|
||||||
|
|
||||||
spdlog::error("LlamaGenerator: malformed brewery response after {} attempts: "
|
spdlog::error("LlamaGenerator: malformed brewery response after {} attempts: "
|
||||||
@@ -685,8 +685,8 @@ LlamaGenerator::generateBrewery(const std::string &cityName,
|
|||||||
throw std::runtime_error("LlamaGenerator: malformed brewery response");
|
throw std::runtime_error("LlamaGenerator: malformed brewery response");
|
||||||
}
|
}
|
||||||
|
|
||||||
UserResult LlamaGenerator::generateUser(const std::string &locale) {
|
UserResult LlamaGenerator::GenerateUser(const std::string &locale) {
|
||||||
const std::string systemPrompt =
|
const std::string system_prompt =
|
||||||
"You generate plausible social media profiles for craft beer "
|
"You generate plausible social media profiles for craft beer "
|
||||||
"enthusiasts. "
|
"enthusiasts. "
|
||||||
"Respond with exactly two lines: "
|
"Respond with exactly two lines: "
|
||||||
@@ -701,7 +701,7 @@ UserResult LlamaGenerator::generateUser(const std::string &locale) {
|
|||||||
const int maxAttempts = 3;
|
const int maxAttempts = 3;
|
||||||
std::string raw;
|
std::string raw;
|
||||||
for (int attempt = 0; attempt < maxAttempts; ++attempt) {
|
for (int attempt = 0; attempt < maxAttempts; ++attempt) {
|
||||||
raw = infer(systemPrompt, prompt, 128);
|
raw = Infer(system_prompt, prompt, 128);
|
||||||
spdlog::debug("LlamaGenerator (user): raw output (attempt {}): {}",
|
spdlog::debug("LlamaGenerator (user): raw output (attempt {}): {}",
|
||||||
attempt + 1, raw);
|
attempt + 1, raw);
|
||||||
|
|
||||||
|
|||||||
@@ -64,11 +64,11 @@ const std::vector<std::string> MockGenerator::kBios = {
|
|||||||
"Always ready to trade recommendations for underrated local breweries.",
|
"Always ready to trade recommendations for underrated local breweries.",
|
||||||
"Keeping a running list of must-try collab releases and tap takeovers."};
|
"Keeping a running list of must-try collab releases and tap takeovers."};
|
||||||
|
|
||||||
void MockGenerator::load(const std::string & /*modelPath*/) {
|
void MockGenerator::Load(const std::string & /*modelPath*/) {
|
||||||
spdlog::info("[MockGenerator] No model needed");
|
spdlog::info("[MockGenerator] No model needed");
|
||||||
}
|
}
|
||||||
|
|
||||||
std::size_t MockGenerator::deterministicHash(const std::string &a,
|
std::size_t MockGenerator::DeterministicHash(const std::string &a,
|
||||||
const std::string &b) {
|
const std::string &b) {
|
||||||
std::size_t seed = std::hash<std::string>{}(a);
|
std::size_t seed = std::hash<std::string>{}(a);
|
||||||
const std::size_t mixed = std::hash<std::string>{}(b);
|
const std::size_t mixed = std::hash<std::string>{}(b);
|
||||||
@@ -77,14 +77,14 @@ std::size_t MockGenerator::deterministicHash(const std::string &a,
|
|||||||
return seed;
|
return seed;
|
||||||
}
|
}
|
||||||
|
|
||||||
BreweryResult MockGenerator::generateBrewery(const std::string &cityName,
|
BreweryResult MockGenerator::GenerateBrewery(const std::string &city_name,
|
||||||
const std::string &countryName,
|
const std::string &country_name,
|
||||||
const std::string &regionContext) {
|
const std::string &region_context) {
|
||||||
const std::string locationKey =
|
const std::string location_key =
|
||||||
countryName.empty() ? cityName : cityName + "," + countryName;
|
country_name.empty() ? city_name : city_name + "," + country_name;
|
||||||
const std::size_t hash = regionContext.empty()
|
const std::size_t hash = region_context.empty()
|
||||||
? std::hash<std::string>{}(locationKey)
|
? std::hash<std::string>{}(location_key)
|
||||||
: deterministicHash(locationKey, regionContext);
|
: DeterministicHash(location_key, region_context);
|
||||||
|
|
||||||
BreweryResult result;
|
BreweryResult result;
|
||||||
result.name = kBreweryAdjectives[hash % kBreweryAdjectives.size()] + " " +
|
result.name = kBreweryAdjectives[hash % kBreweryAdjectives.size()] + " " +
|
||||||
@@ -94,7 +94,7 @@ BreweryResult MockGenerator::generateBrewery(const std::string &cityName,
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
UserResult MockGenerator::generateUser(const std::string &locale) {
|
UserResult MockGenerator::GenerateUser(const std::string &locale) {
|
||||||
const std::size_t hash = std::hash<std::string>{}(locale);
|
const std::size_t hash = std::hash<std::string>{}(locale);
|
||||||
|
|
||||||
UserResult result;
|
UserResult result;
|
||||||
|
|||||||
@@ -3,7 +3,7 @@
|
|||||||
#include <stdexcept>
|
#include <stdexcept>
|
||||||
|
|
||||||
void SqliteDatabase::InitializeSchema() {
|
void SqliteDatabase::InitializeSchema() {
|
||||||
std::lock_guard<std::mutex> lock(dbMutex);
|
std::lock_guard<std::mutex> lock(db_mutex_);
|
||||||
|
|
||||||
const char *schema = R"(
|
const char *schema = R"(
|
||||||
CREATE TABLE IF NOT EXISTS countries (
|
CREATE TABLE IF NOT EXISTS countries (
|
||||||
@@ -34,7 +34,7 @@ void SqliteDatabase::InitializeSchema() {
|
|||||||
)";
|
)";
|
||||||
|
|
||||||
char *errMsg = nullptr;
|
char *errMsg = nullptr;
|
||||||
int rc = sqlite3_exec(db, schema, nullptr, nullptr, &errMsg);
|
int rc = sqlite3_exec(db_, schema, nullptr, nullptr, &errMsg);
|
||||||
if (rc != SQLITE_OK) {
|
if (rc != SQLITE_OK) {
|
||||||
std::string error = errMsg ? std::string(errMsg) : "Unknown error";
|
std::string error = errMsg ? std::string(errMsg) : "Unknown error";
|
||||||
sqlite3_free(errMsg);
|
sqlite3_free(errMsg);
|
||||||
@@ -43,24 +43,24 @@ void SqliteDatabase::InitializeSchema() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
SqliteDatabase::~SqliteDatabase() {
|
SqliteDatabase::~SqliteDatabase() {
|
||||||
if (db) {
|
if (db_) {
|
||||||
sqlite3_close(db);
|
sqlite3_close(db_);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void SqliteDatabase::Initialize(const std::string &dbPath) {
|
void SqliteDatabase::Initialize(const std::string &db_path) {
|
||||||
int rc = sqlite3_open(dbPath.c_str(), &db);
|
int rc = sqlite3_open(db_path.c_str(), &db_);
|
||||||
if (rc) {
|
if (rc) {
|
||||||
throw std::runtime_error("Failed to open SQLite database: " + dbPath);
|
throw std::runtime_error("Failed to open SQLite database: " + db_path);
|
||||||
}
|
}
|
||||||
spdlog::info("OK: SQLite database opened: {}", dbPath);
|
spdlog::info("OK: SQLite database opened: {}", db_path);
|
||||||
InitializeSchema();
|
InitializeSchema();
|
||||||
}
|
}
|
||||||
|
|
||||||
void SqliteDatabase::BeginTransaction() {
|
void SqliteDatabase::BeginTransaction() {
|
||||||
std::lock_guard<std::mutex> lock(dbMutex);
|
std::lock_guard<std::mutex> lock(db_mutex_);
|
||||||
char *err = nullptr;
|
char *err = nullptr;
|
||||||
if (sqlite3_exec(db, "BEGIN TRANSACTION", nullptr, nullptr, &err) !=
|
if (sqlite3_exec(db_, "BEGIN TRANSACTION", nullptr, nullptr, &err) !=
|
||||||
SQLITE_OK) {
|
SQLITE_OK) {
|
||||||
std::string msg = err ? err : "unknown";
|
std::string msg = err ? err : "unknown";
|
||||||
sqlite3_free(err);
|
sqlite3_free(err);
|
||||||
@@ -69,9 +69,9 @@ void SqliteDatabase::BeginTransaction() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void SqliteDatabase::CommitTransaction() {
|
void SqliteDatabase::CommitTransaction() {
|
||||||
std::lock_guard<std::mutex> lock(dbMutex);
|
std::lock_guard<std::mutex> lock(db_mutex_);
|
||||||
char *err = nullptr;
|
char *err = nullptr;
|
||||||
if (sqlite3_exec(db, "COMMIT", nullptr, nullptr, &err) != SQLITE_OK) {
|
if (sqlite3_exec(db_, "COMMIT", nullptr, nullptr, &err) != SQLITE_OK) {
|
||||||
std::string msg = err ? err : "unknown";
|
std::string msg = err ? err : "unknown";
|
||||||
sqlite3_free(err);
|
sqlite3_free(err);
|
||||||
throw std::runtime_error("CommitTransaction failed: " + msg);
|
throw std::runtime_error("CommitTransaction failed: " + msg);
|
||||||
@@ -81,7 +81,7 @@ void SqliteDatabase::CommitTransaction() {
|
|||||||
void SqliteDatabase::InsertCountry(int id, const std::string &name,
|
void SqliteDatabase::InsertCountry(int id, const std::string &name,
|
||||||
const std::string &iso2,
|
const std::string &iso2,
|
||||||
const std::string &iso3) {
|
const std::string &iso3) {
|
||||||
std::lock_guard<std::mutex> lock(dbMutex);
|
std::lock_guard<std::mutex> lock(db_mutex_);
|
||||||
|
|
||||||
const char *query = R"(
|
const char *query = R"(
|
||||||
INSERT OR IGNORE INTO countries (id, name, iso2, iso3)
|
INSERT OR IGNORE INTO countries (id, name, iso2, iso3)
|
||||||
@@ -89,7 +89,7 @@ void SqliteDatabase::InsertCountry(int id, const std::string &name,
|
|||||||
)";
|
)";
|
||||||
|
|
||||||
sqlite3_stmt *stmt;
|
sqlite3_stmt *stmt;
|
||||||
int rc = sqlite3_prepare_v2(db, query, -1, &stmt, nullptr);
|
int rc = sqlite3_prepare_v2(db_, query, -1, &stmt, nullptr);
|
||||||
if (rc != SQLITE_OK)
|
if (rc != SQLITE_OK)
|
||||||
throw std::runtime_error("Failed to prepare country insert");
|
throw std::runtime_error("Failed to prepare country insert");
|
||||||
|
|
||||||
@@ -104,9 +104,9 @@ void SqliteDatabase::InsertCountry(int id, const std::string &name,
|
|||||||
sqlite3_finalize(stmt);
|
sqlite3_finalize(stmt);
|
||||||
}
|
}
|
||||||
|
|
||||||
void SqliteDatabase::InsertState(int id, int countryId, const std::string &name,
|
void SqliteDatabase::InsertState(int id, int country_id, const std::string &name,
|
||||||
const std::string &iso2) {
|
const std::string &iso2) {
|
||||||
std::lock_guard<std::mutex> lock(dbMutex);
|
std::lock_guard<std::mutex> lock(db_mutex_);
|
||||||
|
|
||||||
const char *query = R"(
|
const char *query = R"(
|
||||||
INSERT OR IGNORE INTO states (id, country_id, name, iso2)
|
INSERT OR IGNORE INTO states (id, country_id, name, iso2)
|
||||||
@@ -114,12 +114,12 @@ void SqliteDatabase::InsertState(int id, int countryId, const std::string &name,
|
|||||||
)";
|
)";
|
||||||
|
|
||||||
sqlite3_stmt *stmt;
|
sqlite3_stmt *stmt;
|
||||||
int rc = sqlite3_prepare_v2(db, query, -1, &stmt, nullptr);
|
int rc = sqlite3_prepare_v2(db_, query, -1, &stmt, nullptr);
|
||||||
if (rc != SQLITE_OK)
|
if (rc != SQLITE_OK)
|
||||||
throw std::runtime_error("Failed to prepare state insert");
|
throw std::runtime_error("Failed to prepare state insert");
|
||||||
|
|
||||||
sqlite3_bind_int(stmt, 1, id);
|
sqlite3_bind_int(stmt, 1, id);
|
||||||
sqlite3_bind_int(stmt, 2, countryId);
|
sqlite3_bind_int(stmt, 2, country_id);
|
||||||
sqlite3_bind_text(stmt, 3, name.c_str(), -1, SQLITE_STATIC);
|
sqlite3_bind_text(stmt, 3, name.c_str(), -1, SQLITE_STATIC);
|
||||||
sqlite3_bind_text(stmt, 4, iso2.c_str(), -1, SQLITE_STATIC);
|
sqlite3_bind_text(stmt, 4, iso2.c_str(), -1, SQLITE_STATIC);
|
||||||
|
|
||||||
@@ -129,10 +129,10 @@ void SqliteDatabase::InsertState(int id, int countryId, const std::string &name,
|
|||||||
sqlite3_finalize(stmt);
|
sqlite3_finalize(stmt);
|
||||||
}
|
}
|
||||||
|
|
||||||
void SqliteDatabase::InsertCity(int id, int stateId, int countryId,
|
void SqliteDatabase::InsertCity(int id, int state_id, int country_id,
|
||||||
const std::string &name, double latitude,
|
const std::string &name, double latitude,
|
||||||
double longitude) {
|
double longitude) {
|
||||||
std::lock_guard<std::mutex> lock(dbMutex);
|
std::lock_guard<std::mutex> lock(db_mutex_);
|
||||||
|
|
||||||
const char *query = R"(
|
const char *query = R"(
|
||||||
INSERT OR IGNORE INTO cities (id, state_id, country_id, name, latitude, longitude)
|
INSERT OR IGNORE INTO cities (id, state_id, country_id, name, latitude, longitude)
|
||||||
@@ -140,13 +140,13 @@ void SqliteDatabase::InsertCity(int id, int stateId, int countryId,
|
|||||||
)";
|
)";
|
||||||
|
|
||||||
sqlite3_stmt *stmt;
|
sqlite3_stmt *stmt;
|
||||||
int rc = sqlite3_prepare_v2(db, query, -1, &stmt, nullptr);
|
int rc = sqlite3_prepare_v2(db_, query, -1, &stmt, nullptr);
|
||||||
if (rc != SQLITE_OK)
|
if (rc != SQLITE_OK)
|
||||||
throw std::runtime_error("Failed to prepare city insert");
|
throw std::runtime_error("Failed to prepare city insert");
|
||||||
|
|
||||||
sqlite3_bind_int(stmt, 1, id);
|
sqlite3_bind_int(stmt, 1, id);
|
||||||
sqlite3_bind_int(stmt, 2, stateId);
|
sqlite3_bind_int(stmt, 2, state_id);
|
||||||
sqlite3_bind_int(stmt, 3, countryId);
|
sqlite3_bind_int(stmt, 3, country_id);
|
||||||
sqlite3_bind_text(stmt, 4, name.c_str(), -1, SQLITE_STATIC);
|
sqlite3_bind_text(stmt, 4, name.c_str(), -1, SQLITE_STATIC);
|
||||||
sqlite3_bind_double(stmt, 5, latitude);
|
sqlite3_bind_double(stmt, 5, latitude);
|
||||||
sqlite3_bind_double(stmt, 6, longitude);
|
sqlite3_bind_double(stmt, 6, longitude);
|
||||||
@@ -158,12 +158,12 @@ void SqliteDatabase::InsertCity(int id, int stateId, int countryId,
|
|||||||
}
|
}
|
||||||
|
|
||||||
std::vector<City> SqliteDatabase::QueryCities() {
|
std::vector<City> SqliteDatabase::QueryCities() {
|
||||||
std::lock_guard<std::mutex> lock(dbMutex);
|
std::lock_guard<std::mutex> lock(db_mutex_);
|
||||||
std::vector<City> cities;
|
std::vector<City> cities;
|
||||||
sqlite3_stmt *stmt = nullptr;
|
sqlite3_stmt *stmt = nullptr;
|
||||||
|
|
||||||
const char *query = "SELECT id, name, country_id FROM cities ORDER BY name";
|
const char *query = "SELECT id, name, country_id FROM cities ORDER BY name";
|
||||||
int rc = sqlite3_prepare_v2(db, query, -1, &stmt, nullptr);
|
int rc = sqlite3_prepare_v2(db_, query, -1, &stmt, nullptr);
|
||||||
|
|
||||||
if (rc != SQLITE_OK) {
|
if (rc != SQLITE_OK) {
|
||||||
throw std::runtime_error("Failed to prepare query");
|
throw std::runtime_error("Failed to prepare query");
|
||||||
@@ -173,8 +173,8 @@ std::vector<City> SqliteDatabase::QueryCities() {
|
|||||||
int id = sqlite3_column_int(stmt, 0);
|
int id = sqlite3_column_int(stmt, 0);
|
||||||
const char *name =
|
const char *name =
|
||||||
reinterpret_cast<const char *>(sqlite3_column_text(stmt, 1));
|
reinterpret_cast<const char *>(sqlite3_column_text(stmt, 1));
|
||||||
int countryId = sqlite3_column_int(stmt, 2);
|
int country_id = sqlite3_column_int(stmt, 2);
|
||||||
cities.push_back({id, name ? std::string(name) : "", countryId});
|
cities.push_back({id, name ? std::string(name) : "", country_id});
|
||||||
}
|
}
|
||||||
|
|
||||||
sqlite3_finalize(stmt);
|
sqlite3_finalize(stmt);
|
||||||
@@ -182,7 +182,7 @@ std::vector<City> SqliteDatabase::QueryCities() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
std::vector<Country> SqliteDatabase::QueryCountries(int limit) {
|
std::vector<Country> SqliteDatabase::QueryCountries(int limit) {
|
||||||
std::lock_guard<std::mutex> lock(dbMutex);
|
std::lock_guard<std::mutex> lock(db_mutex_);
|
||||||
|
|
||||||
std::vector<Country> countries;
|
std::vector<Country> countries;
|
||||||
sqlite3_stmt *stmt = nullptr;
|
sqlite3_stmt *stmt = nullptr;
|
||||||
@@ -193,7 +193,7 @@ std::vector<Country> SqliteDatabase::QueryCountries(int limit) {
|
|||||||
query += " LIMIT " + std::to_string(limit);
|
query += " LIMIT " + std::to_string(limit);
|
||||||
}
|
}
|
||||||
|
|
||||||
int rc = sqlite3_prepare_v2(db, query.c_str(), -1, &stmt, nullptr);
|
int rc = sqlite3_prepare_v2(db_, query.c_str(), -1, &stmt, nullptr);
|
||||||
|
|
||||||
if (rc != SQLITE_OK) {
|
if (rc != SQLITE_OK) {
|
||||||
throw std::runtime_error("Failed to prepare countries query");
|
throw std::runtime_error("Failed to prepare countries query");
|
||||||
@@ -217,7 +217,7 @@ std::vector<Country> SqliteDatabase::QueryCountries(int limit) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
std::vector<State> SqliteDatabase::QueryStates(int limit) {
|
std::vector<State> SqliteDatabase::QueryStates(int limit) {
|
||||||
std::lock_guard<std::mutex> lock(dbMutex);
|
std::lock_guard<std::mutex> lock(db_mutex_);
|
||||||
|
|
||||||
std::vector<State> states;
|
std::vector<State> states;
|
||||||
sqlite3_stmt *stmt = nullptr;
|
sqlite3_stmt *stmt = nullptr;
|
||||||
@@ -228,7 +228,7 @@ std::vector<State> SqliteDatabase::QueryStates(int limit) {
|
|||||||
query += " LIMIT " + std::to_string(limit);
|
query += " LIMIT " + std::to_string(limit);
|
||||||
}
|
}
|
||||||
|
|
||||||
int rc = sqlite3_prepare_v2(db, query.c_str(), -1, &stmt, nullptr);
|
int rc = sqlite3_prepare_v2(db_, query.c_str(), -1, &stmt, nullptr);
|
||||||
|
|
||||||
if (rc != SQLITE_OK) {
|
if (rc != SQLITE_OK) {
|
||||||
throw std::runtime_error("Failed to prepare states query");
|
throw std::runtime_error("Failed to prepare states query");
|
||||||
@@ -240,9 +240,9 @@ std::vector<State> SqliteDatabase::QueryStates(int limit) {
|
|||||||
reinterpret_cast<const char *>(sqlite3_column_text(stmt, 1));
|
reinterpret_cast<const char *>(sqlite3_column_text(stmt, 1));
|
||||||
const char *iso2 =
|
const char *iso2 =
|
||||||
reinterpret_cast<const char *>(sqlite3_column_text(stmt, 2));
|
reinterpret_cast<const char *>(sqlite3_column_text(stmt, 2));
|
||||||
int countryId = sqlite3_column_int(stmt, 3);
|
int country_id = sqlite3_column_int(stmt, 3);
|
||||||
states.push_back({id, name ? std::string(name) : "",
|
states.push_back({id, name ? std::string(name) : "",
|
||||||
iso2 ? std::string(iso2) : "", countryId});
|
iso2 ? std::string(iso2) : "", country_id});
|
||||||
}
|
}
|
||||||
|
|
||||||
sqlite3_finalize(stmt);
|
sqlite3_finalize(stmt);
|
||||||
|
|||||||
@@ -5,12 +5,12 @@
|
|||||||
#include "json_handling/json_loader.h"
|
#include "json_handling/json_loader.h"
|
||||||
#include "json_handling/stream_parser.h"
|
#include "json_handling/stream_parser.h"
|
||||||
|
|
||||||
void JsonLoader::LoadWorldCities(const std::string &jsonPath,
|
void JsonLoader::LoadWorldCities(const std::string &json_path,
|
||||||
SqliteDatabase &db) {
|
SqliteDatabase &db) {
|
||||||
constexpr size_t kBatchSize = 10000;
|
constexpr size_t kBatchSize = 10000;
|
||||||
|
|
||||||
auto startTime = std::chrono::high_resolution_clock::now();
|
auto startTime = std::chrono::high_resolution_clock::now();
|
||||||
spdlog::info("\nLoading {} (streaming RapidJSON SAX)...", jsonPath);
|
spdlog::info("\nLoading {} (streaming RapidJSON SAX)...", json_path);
|
||||||
|
|
||||||
db.BeginTransaction();
|
db.BeginTransaction();
|
||||||
bool transactionOpen = true;
|
bool transactionOpen = true;
|
||||||
@@ -18,7 +18,7 @@ void JsonLoader::LoadWorldCities(const std::string &jsonPath,
|
|||||||
size_t citiesProcessed = 0;
|
size_t citiesProcessed = 0;
|
||||||
try {
|
try {
|
||||||
StreamingJsonParser::Parse(
|
StreamingJsonParser::Parse(
|
||||||
jsonPath, db,
|
json_path, db,
|
||||||
[&](const CityRecord &record) {
|
[&](const CityRecord &record) {
|
||||||
db.InsertCity(record.id, record.state_id, record.country_id,
|
db.InsertCity(record.id, record.state_id, record.country_id,
|
||||||
record.name, record.latitude, record.longitude);
|
record.name, record.latitude, record.longitude);
|
||||||
|
|||||||
@@ -232,15 +232,15 @@ private:
|
|||||||
};
|
};
|
||||||
|
|
||||||
void StreamingJsonParser::Parse(
|
void StreamingJsonParser::Parse(
|
||||||
const std::string &filePath, SqliteDatabase &db,
|
const std::string &file_path, SqliteDatabase &db,
|
||||||
std::function<void(const CityRecord &)> onCity,
|
std::function<void(const CityRecord &)> on_city,
|
||||||
std::function<void(size_t, size_t)> onProgress) {
|
std::function<void(size_t, size_t)> on_progress) {
|
||||||
|
|
||||||
spdlog::info(" Streaming parse of {} (Boost.JSON)...", filePath);
|
spdlog::info(" Streaming parse of {} (Boost.JSON)...", file_path);
|
||||||
|
|
||||||
FILE *file = std::fopen(filePath.c_str(), "rb");
|
FILE *file = std::fopen(file_path.c_str(), "rb");
|
||||||
if (!file) {
|
if (!file) {
|
||||||
throw std::runtime_error("Failed to open JSON file: " + filePath);
|
throw std::runtime_error("Failed to open JSON file: " + file_path);
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t total_size = 0;
|
size_t total_size = 0;
|
||||||
@@ -252,7 +252,7 @@ void StreamingJsonParser::Parse(
|
|||||||
std::rewind(file);
|
std::rewind(file);
|
||||||
}
|
}
|
||||||
|
|
||||||
CityRecordHandler::ParseContext ctx{&db, onCity, onProgress, 0,
|
CityRecordHandler::ParseContext ctx{&db, on_city, on_progress, 0,
|
||||||
total_size, 0, 0};
|
total_size, 0, 0};
|
||||||
boost::json::basic_parser<CityRecordHandler> parser(
|
boost::json::basic_parser<CityRecordHandler> parser(
|
||||||
boost::json::parse_options{}, ctx);
|
boost::json::parse_options{}, ctx);
|
||||||
|
|||||||
@@ -61,21 +61,21 @@ bool ParseArguments(int argc, char **argv, ApplicationOptions &options) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Check for mutually exclusive --mocked and --model flags
|
// Check for mutually exclusive --mocked and --model flags
|
||||||
bool useMocked = vm["mocked"].as<bool>();
|
bool use_mocked = vm["mocked"].as<bool>();
|
||||||
std::string modelPath = vm["model"].as<std::string>();
|
std::string model_path = vm["model"].as<std::string>();
|
||||||
|
|
||||||
if (useMocked && !modelPath.empty()) {
|
if (use_mocked && !model_path.empty()) {
|
||||||
spdlog::error("ERROR: --mocked and --model are mutually exclusive");
|
spdlog::error("ERROR: --mocked and --model are mutually exclusive");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!useMocked && modelPath.empty()) {
|
if (!use_mocked && model_path.empty()) {
|
||||||
spdlog::error("ERROR: Either --mocked or --model must be specified");
|
spdlog::error("ERROR: Either --mocked or --model must be specified");
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Warn if sampling parameters are provided with --mocked
|
// Warn if sampling parameters are provided with --mocked
|
||||||
if (useMocked) {
|
if (use_mocked) {
|
||||||
bool hasTemperature = vm["temperature"].defaulted() == false;
|
bool hasTemperature = vm["temperature"].defaulted() == false;
|
||||||
bool hasTopP = vm["top-p"].defaulted() == false;
|
bool hasTopP = vm["top-p"].defaulted() == false;
|
||||||
bool hasSeed = vm["seed"].defaulted() == false;
|
bool hasSeed = vm["seed"].defaulted() == false;
|
||||||
@@ -85,11 +85,11 @@ bool ParseArguments(int argc, char **argv, ApplicationOptions &options) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
options.useMocked = useMocked;
|
options.use_mocked = use_mocked;
|
||||||
options.modelPath = modelPath;
|
options.model_path = model_path;
|
||||||
options.cacheDir = vm["cache-dir"].as<std::string>();
|
options.cache_dir = vm["cache-dir"].as<std::string>();
|
||||||
options.temperature = vm["temperature"].as<float>();
|
options.temperature = vm["temperature"].as<float>();
|
||||||
options.topP = vm["top-p"].as<float>();
|
options.top_p = vm["top-p"].as<float>();
|
||||||
options.seed = vm["seed"].as<int>();
|
options.seed = vm["seed"].as<int>();
|
||||||
// commit is always pinned to c5eb7772
|
// commit is always pinned to c5eb7772
|
||||||
|
|
||||||
|
|||||||
@@ -63,13 +63,13 @@ CURLWebClient::CURLWebClient() {}
|
|||||||
CURLWebClient::~CURLWebClient() {}
|
CURLWebClient::~CURLWebClient() {}
|
||||||
|
|
||||||
void CURLWebClient::DownloadToFile(const std::string &url,
|
void CURLWebClient::DownloadToFile(const std::string &url,
|
||||||
const std::string &filePath) {
|
const std::string &file_path) {
|
||||||
auto curl = create_handle();
|
auto curl = create_handle();
|
||||||
|
|
||||||
std::ofstream outFile(filePath, std::ios::binary);
|
std::ofstream outFile(file_path, std::ios::binary);
|
||||||
if (!outFile.is_open()) {
|
if (!outFile.is_open()) {
|
||||||
throw std::runtime_error("[CURLWebClient] Cannot open file for writing: " +
|
throw std::runtime_error("[CURLWebClient] Cannot open file for writing: " +
|
||||||
filePath);
|
file_path);
|
||||||
}
|
}
|
||||||
|
|
||||||
set_common_get_options(curl.get(), url, 30L, 300L);
|
set_common_get_options(curl.get(), url, 30L, 300L);
|
||||||
@@ -81,7 +81,7 @@ void CURLWebClient::DownloadToFile(const std::string &url,
|
|||||||
outFile.close();
|
outFile.close();
|
||||||
|
|
||||||
if (res != CURLE_OK) {
|
if (res != CURLE_OK) {
|
||||||
std::remove(filePath.c_str());
|
std::remove(file_path.c_str());
|
||||||
std::string error = std::string("[CURLWebClient] Download failed: ") +
|
std::string error = std::string("[CURLWebClient] Download failed: ") +
|
||||||
curl_easy_strerror(res);
|
curl_easy_strerror(res);
|
||||||
throw std::runtime_error(error);
|
throw std::runtime_error(error);
|
||||||
@@ -91,7 +91,7 @@ void CURLWebClient::DownloadToFile(const std::string &url,
|
|||||||
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &httpCode);
|
curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &httpCode);
|
||||||
|
|
||||||
if (httpCode != 200) {
|
if (httpCode != 200) {
|
||||||
std::remove(filePath.c_str());
|
std::remove(file_path.c_str());
|
||||||
std::stringstream ss;
|
std::stringstream ss;
|
||||||
ss << "[CURLWebClient] HTTP error " << httpCode << " for URL " << url;
|
ss << "[CURLWebClient] HTTP error " << httpCode << " for URL " << url;
|
||||||
throw std::runtime_error(ss.str());
|
throw std::runtime_error(ss.str());
|
||||||
|
|||||||
Reference in New Issue
Block a user