Add timeout to Wikipedia enrichment to avoid breaking rate limits, add mock enrichment (#224)

* Add timeout for enrichment, refactor JSON deserialization
* Add location count to application options and as a CLI arg
* Add mock enrichment process
2026-07-16 17:47:22 +00:00 · 2026-05-14 19:15:51 -04:00
parent b7c0b1c8d4
commit 2ee7b3d2a2
19 changed files with 261 additions and 147 deletions
--- a/tooling/pipeline/includes/data_model/models.h
+++ b/tooling/pipeline/includes/data_model/models.h
@@ -83,6 +83,9 @@ struct SamplingOptions {

  /// @brief Random seed (-1 for random, otherwise non-negative).
  int seed = -1;
+
+  /// @brief Number of layers to offload to GPU.
+  int n_gpu_layers = 0;
 };

 /**
@@ -95,8 +98,7 @@ struct GeneratorOptions {
  /// @brief Use mocked generator instead of actual LLM inference.
  bool use_mocked = false;

-  /// @brief Number of layers to offload to GPU.
-  int n_gpu_layers = 0;
+

  /// @brief Specific sampling parameters for this generator.
  /// If nullopt, the application should use global defaults.
@@ -116,6 +118,10 @@ struct PipelineOptions {

  /// @brief Path for application logs.
  std::filesystem::path log_path;
+
+  /// @brief Number of locations to sample from the dataset.
+  /// More locations produce more users and more breweries.
+  uint32_t location_count;
 };

 /**