Add timeout to wikipedia enrichment to avoid breaking rate limits, add mock enrichment (#224)

* Add timeout for enrichment, refactor json deserialization

* Add location count to application options and as a cli arg

* Add mock enrichment process
This commit is contained in:
2026-05-14 19:15:51 -04:00
committed by GitHub
parent b7c0b1c8d4
commit 2ee7b3d2a2
19 changed files with 261 additions and 147 deletions

View File

@@ -0,0 +1,112 @@
/**
* @file wikipedia/fetch_extract.cc
*/
#include <spdlog/spdlog.h>
#include <boost/json.hpp>
#include <chrono>
#include <format>
#include <string>
#include <string_view>
#include <thread>
#include "services/enrichment/wikipedia_service.h"
using namespace boost;
std::string WikipediaEnrichmentService::FetchExtract(std::string_view query) {
const std::string cache_key(query);
// 1. Cache Lookup
if (const auto cache_it = this->extract_cache_.find(cache_key);
cache_it != this->extract_cache_.end()) {
spdlog::debug("Wikipedia: Cache hit for {}!", cache_key);
return cache_it->second;
}
const std::string encoded = this->client_->EncodeURL(cache_key);
const std::string url = std::format(
"https://en.wikipedia.org/w/"
"api.php?action=query&titles={}&prop=extracts&explaintext=1&format=json",
encoded);
const std::string body = this->client_->Get(url);
{
using namespace std::literals::chrono_literals;
std::this_thread::sleep_for(1s);
}
// 2. Parse JSON
system::error_code ec;
json::value doc = json::parse(body, ec);
if (ec) {
spdlog::warn("WikipediaService: JSON parse error for '{}': {}", query,
ec.message());
return {};
}
// 3. Safe Extraction
const json::object* obj = doc.if_object();
if (obj == nullptr) {
spdlog::warn("WikipediaService: Expected root object for '{}'", query);
return {};
}
const json::value* query_ptr = obj->if_contains("query");
const json::value* pages_ptr =
((query_ptr != nullptr) && query_ptr->is_object())
? query_ptr->get_object().if_contains("pages")
: nullptr;
if ((pages_ptr == nullptr) || !pages_ptr->is_object()) {
spdlog::warn("WikipediaService: Missing query.pages for '{}'", query);
return {};
}
const json::object& pages = pages_ptr->get_object();
if (pages.empty()) {
spdlog::warn("WikipediaService: No pages returned for '{}'", query);
this->extract_cache_.emplace(cache_key, "");
return {};
}
// Wikipedia returns the page under a dynamic ID key; we just want the first
// one
const json::value& page_val = pages.begin()->value();
if (!page_val.is_object()) {
spdlog::warn("WikipediaService: Unexpected page format for '{}'", query);
return {};
}
const json::object& page = page_val.get_object();
// Handle 404/Missing status
if (page.contains("missing")) {
spdlog::warn("WikipediaService: Page '{}' does not exist", query);
this->extract_cache_.emplace(cache_key, "");
return {};
}
const json::value* extract_ptr = page.if_contains("extract");
if ((extract_ptr == nullptr) || !extract_ptr->is_string()) {
spdlog::warn("WikipediaService: No extract string found for '{}'", query);
this->extract_cache_.emplace(cache_key, "");
return {};
}
// 4. Success
std::string extract(extract_ptr->as_string());
spdlog::info("WikipediaService: Fetched {} chars for '{}'", extract.size(),
query);
this->extract_cache_.insert_or_assign(cache_key, extract);
return extract;
}