Add timeout to Wikipedia enrichment to avoid exceeding rate limits, add mock enrichment (#224)

* Add timeout for enrichment, refactor json deserialization

* Add location count to application options and as a cli arg

* Add mock enrichment process
This commit is contained in:
2026-05-14 19:15:51 -04:00
committed by GitHub
parent b7c0b1c8d4
commit 2ee7b3d2a2
19 changed files with 261 additions and 147 deletions

View File

@@ -12,8 +12,8 @@
#include "data_generation/data_generator.h"
#include "data_model/generated_models.h"
#include "services/enrichment/enrichment_service.h"
#include "services/database/export_service.h"
#include "services/enrichment/enrichment_service.h"
/**
* @brief Main data generator class for the Biergarten pipeline.
@@ -32,7 +32,8 @@ class BiergartenDataGenerator {
*/
BiergartenDataGenerator(std::unique_ptr<IEnrichmentService> context_service,
std::unique_ptr<DataGenerator> generator,
std::unique_ptr<IExportService> exporter);
std::unique_ptr<IExportService> exporter,
const ApplicationOptions& application_options);
/**
* @brief Run the data generation pipeline.
@@ -56,12 +57,14 @@ class BiergartenDataGenerator {
/// @brief Storage backend for generated brewery records.
std::unique_ptr<IExportService> exporter_;
const ApplicationOptions application_options_;
/**
* @brief Load locations from JSON and sample cities.
*
* @return Vector of sampled locations capped at 50 entries.
*/
static std::vector<Location> QueryCitiesWithCountries();
std::vector<Location> QueryCitiesWithCountries();
/**
* @brief Generate breweries for enriched cities.

View File

@@ -83,6 +83,9 @@ struct SamplingOptions {
/// @brief Random seed (-1 for random, otherwise non-negative).
int seed = -1;
/// @brief Number of layers to offload to GPU.
int n_gpu_layers = 0;
};
/**
@@ -95,8 +98,7 @@ struct GeneratorOptions {
/// @brief Use mocked generator instead of actual LLM inference.
bool use_mocked = false;
/// @brief Number of layers to offload to GPU.
int n_gpu_layers = 0;
/// @brief Specific sampling parameters for this generator.
/// If nullopt, the application should use global defaults.
@@ -116,6 +118,10 @@ struct PipelineOptions {
/// @brief Path for application logs.
std::filesystem::path log_path;
/// @brief Number of locations to sample from the dataset.
/// Sampling more locations yields more users and more breweries.
uint32_t location_count;
};
/**

View File

@@ -0,0 +1,17 @@
//
// Created by aaronpo on 13/05/2026.
//
#ifndef BIERGARTEN_PIPELINE_INCLUDES_SERVICES_ENRICHMENT_MOCK_ENRICHMENT_H_
#define BIERGARTEN_PIPELINE_INCLUDES_SERVICES_ENRICHMENT_MOCK_ENRICHMENT_H_
#include <string>
#include "enrichment_service.h"
/// @brief Enrichment service stand-in that performs no lookups.
///
/// Implements IEnrichmentService with a context that is always empty.
class MockEnrichmentService final : public IEnrichmentService {
 public:
  /// @brief Returns the (always empty) context for a location.
  /// @param loc Location to describe; ignored by this implementation.
  /// @return An empty string.
  [[nodiscard]] std::string GetLocationContext(
      const Location& /*loc*/) override {
    return {};
  }
};
#endif // BIERGARTEN_PIPELINE_INCLUDES_SERVICES_ENRICHMENT_MOCK_ENRICHMENT_H_

View File

@@ -15,10 +15,10 @@
#include "web_client/web_client.h"
/// @brief Provides Wikipedia summary lookups backed by cached raw extracts.
class WikipediaService final : public IEnrichmentService {
class WikipediaEnrichmentService final : public IEnrichmentService {
public:
/// @brief Creates a new Wikipedia service with the provided web client.
explicit WikipediaService(std::unique_ptr<WebClient> client);
explicit WikipediaEnrichmentService(std::unique_ptr<WebClient> client);
/// @brief Returns the Wikipedia-derived context for a location.
[[nodiscard]] std::string GetLocationContext(const Location& loc) override;

View File

@@ -42,7 +42,7 @@ public:
* @param value Raw string to encode.
* @return Percent-encoded string safe for use in a URL.
*/
std::string UrlEncode(const std::string& value) override;
std::string EncodeURL(const std::string& value) override;
};

View File

@@ -30,7 +30,7 @@ class WebClient {
* @param value Raw string value.
* @return Encoded value safe for URL usage.
*/
virtual std::string UrlEncode(const std::string& value) = 0;
virtual std::string EncodeURL(const std::string& value) = 0;
};
#endif // BIERGARTEN_PIPELINE_INCLUDES_WEB_CLIENT_WEB_CLIENT_H_