Pipeline: add CURL/WebClient & Wikipedia service

Introduce a pluggable web client interface and a concrete CURL implementation: add IWebClient, CURLWebClient, and CurlGlobalState (headers + curl_web_client.cpp). DataDownloader now accepts an IWebClient and delegates downloads. Add WikipediaService for cached Wikipedia summary lookups. Refactor SqliteDatabase to return full City records and update consumers accordingly. Improve JsonLoader to use batched transactions during streaming parses. Enhance LlamaGenerator with sampling options, increased token limits, JSON extraction/validation, and other parsing helpers. Modernize CMake: set policy/version, add project_options, simplify FetchContent usage (spdlog), require Boost components (program_options/json), list pipeline sources explicitly, and tweak post-build/memcheck targets. Update README to match implementation changes and new CLI/config conventions.
This commit is contained in:
Aaron Po
2026-04-02 16:29:16 -04:00
parent ac136f7179
commit 98083ab40c
16 changed files with 1125 additions and 794 deletions

View File

@@ -1,16 +1,20 @@
#pragma once
#include "data_generator.h"
#include <memory>
#include <cstdint>
#include <string>
#include "data_generator.h"
struct llama_model;
struct llama_context;
class LlamaGenerator final : public IDataGenerator {
public:
LlamaGenerator() = default;
~LlamaGenerator() override;
void setSamplingOptions(float temperature, float topP, int seed = -1);
void load(const std::string &modelPath) override;
BreweryResult generateBrewery(const std::string &cityName,
const std::string &countryName,
@@ -18,14 +22,17 @@ public:
UserResult generateUser(const std::string &locale) override;
private:
std::string infer(const std::string &prompt, int maxTokens = 5000);
std::string infer(const std::string &prompt, int maxTokens = 10000);
// Overload that allows passing a system message separately so chat-capable
// models receive a proper system role instead of having the system text
// concatenated into the user prompt (helps avoid revealing internal
// reasoning or instructions in model output).
std::string infer(const std::string &systemPrompt, const std::string &prompt,
int maxTokens = 5000);
int maxTokens = 10000);
llama_model *model_ = nullptr;
llama_context *context_ = nullptr;
float sampling_temperature_ = 0.8f;
float sampling_top_p_ = 0.92f;
uint32_t sampling_seed_ = 0xFFFFFFFFu;
};