mirror of
https://github.com/aaronpo97/the-biergarten-app.git
synced 2026-06-01 10:04:00 +00:00
fix: address critical correctness, reliability, and design issues in pipeline
CORRECTNESS FIXES:
- json_loader: Add RollbackTransaction() and call it on exception instead of CommitTransaction(). Prevents partial data corruption on parse/disk errors.
- wikipedia_service: Fix invalid MediaWiki API parameter explaintext=true -> explaintext=1. Now returns plain text instead of HTML markup in contexts.
- helpers: Fix ParseTwoLineResponse filter to only remove known thinking tags (<think>, <reasoning>, <reflect>) instead of any <...> pattern. Prevents silently removing legitimate output like <username>content</username>.

RELIABILITY & DESIGN IMPROVEMENTS:
- load/main: Make n_ctx (context window size) configurable via --n-ctx flag (default 2048, range 1-32768) to support larger models like Qwen3-14B.
- generate_brewery: Prevent retry prompt growth by extracting location context into constant and using compact retry format (error + schema + location only). Avoids token truncation on final retry attempts.
- database: Fix data representativeness by changing QueryCities from ORDER BY name (alphabetic bias) to ORDER BY RANDOM() for unbiased sampling. Convert all SQLITE_STATIC to SQLITE_TRANSIENT to prevent use-after-free risks.

POLISH:
- infer: Advance sampling seed between generation calls to improve diversity across brewery and user generation.
- data_downloader: Remove unnecessary commit hash truncation; use full hash.
- json_loader: Fix misleading log message from "RapidJSON" to "Boost.JSON".
This commit is contained in:
@@ -1,12 +1,12 @@
|
||||
#include <spdlog/spdlog.h>
|
||||
|
||||
#include <boost/program_options.hpp>
|
||||
#include <iostream>
|
||||
#include <memory>
|
||||
|
||||
#include <boost/program_options.hpp>
|
||||
#include <spdlog/spdlog.h>
|
||||
|
||||
#include "biergarten_data_generator.h"
|
||||
#include "web_client/curl_web_client.h"
|
||||
#include "database/database.h"
|
||||
#include "web_client/curl_web_client.h"
|
||||
|
||||
namespace po = boost::program_options;
|
||||
|
||||
@@ -18,101 +18,117 @@ namespace po = boost::program_options;
|
||||
* @param options Output ApplicationOptions struct.
|
||||
* @return true if parsing succeeded and should proceed, false otherwise.
|
||||
*/
|
||||
bool ParseArguments(int argc, char **argv, ApplicationOptions &options) {
|
||||
// If no arguments provided, display usage and exit
|
||||
if (argc == 1) {
|
||||
std::cout << "Biergarten Pipeline - Geographic Data Pipeline with Brewery Generation\n\n";
|
||||
std::cout << "Usage: biergarten-pipeline [options]\n\n";
|
||||
std::cout << "Options:\n";
|
||||
std::cout << " --mocked Use mocked generator for brewery/user data\n";
|
||||
std::cout << " --model, -m PATH Path to LLM model file (gguf) for generation\n";
|
||||
std::cout << " --cache-dir, -c DIR Directory for cached JSON (default: /tmp)\n";
|
||||
std::cout << " --temperature TEMP LLM sampling temperature 0.0-1.0 (default: 0.8)\n";
|
||||
std::cout << " --top-p VALUE Nucleus sampling parameter 0.0-1.0 (default: 0.92)\n";
|
||||
std::cout << " --seed SEED Random seed: -1 for random (default: -1)\n";
|
||||
std::cout << " --help, -h Show this help message\n\n";
|
||||
std::cout << "Note: --mocked and --model are mutually exclusive. Exactly one must be provided.\n";
|
||||
std::cout << "Data source is always pinned to commit c5eb7772 (stable 2026-03-28).\n";
|
||||
return false;
|
||||
}
|
||||
bool ParseArguments(int argc, char** argv, ApplicationOptions& options) {
|
||||
// If no arguments provided, display usage and exit
|
||||
if (argc == 1) {
|
||||
std::cout << "Biergarten Pipeline - Geographic Data Pipeline with "
|
||||
"Brewery Generation\n\n";
|
||||
std::cout << "Usage: biergarten-pipeline [options]\n\n";
|
||||
std::cout << "Options:\n";
|
||||
std::cout << " --mocked Use mocked generator for "
|
||||
"brewery/user data\n";
|
||||
std::cout << " --model, -m PATH Path to LLM model file (gguf) for "
|
||||
"generation\n";
|
||||
std::cout << " --cache-dir, -c DIR Directory for cached JSON (default: "
|
||||
"/tmp)\n";
|
||||
std::cout << " --temperature TEMP LLM sampling temperature 0.0-1.0 "
|
||||
"(default: 0.8)\n";
|
||||
std::cout << " --top-p VALUE Nucleus sampling parameter 0.0-1.0 "
|
||||
"(default: 0.92)\n";
|
||||
std::cout << " --n-ctx SIZE Context window size in tokens "
|
||||
"(default: 2048)\n";
|
||||
std::cout << " --seed SEED Random seed: -1 for random "
|
||||
"(default: -1)\n";
|
||||
std::cout << " --help, -h Show this help message\n\n";
|
||||
std::cout << "Note: --mocked and --model are mutually exclusive. Exactly "
|
||||
"one must be provided.\n";
|
||||
std::cout << "Data source is always pinned to commit c5eb7772 (stable "
|
||||
"2026-03-28).\n";
|
||||
return false;
|
||||
}
|
||||
|
||||
po::options_description desc("Pipeline Options");
|
||||
desc.add_options()("help,h", "Produce help message")(
|
||||
"mocked", po::bool_switch(),
|
||||
"Use mocked generator for brewery/user data")(
|
||||
"model,m", po::value<std::string>()->default_value(""),
|
||||
"Path to LLM model (gguf)")(
|
||||
"cache-dir,c", po::value<std::string>()->default_value("/tmp"),
|
||||
"Directory for cached JSON")(
|
||||
"temperature", po::value<float>()->default_value(0.8f),
|
||||
"Sampling temperature (higher = more random)")(
|
||||
"top-p", po::value<float>()->default_value(0.92f),
|
||||
"Nucleus sampling top-p in (0,1] (higher = more random)")(
|
||||
"seed", po::value<int>()->default_value(-1),
|
||||
"Sampler seed: -1 for random, otherwise non-negative integer");
|
||||
po::options_description desc("Pipeline Options");
|
||||
desc.add_options()("help,h", "Produce help message")(
|
||||
"mocked", po::bool_switch(),
|
||||
"Use mocked generator for brewery/user data")(
|
||||
"model,m", po::value<std::string>()->default_value(""),
|
||||
"Path to LLM model (gguf)")(
|
||||
"cache-dir,c", po::value<std::string>()->default_value("/tmp"),
|
||||
"Directory for cached JSON")(
|
||||
"temperature", po::value<float>()->default_value(0.8f),
|
||||
"Sampling temperature (higher = more random)")(
|
||||
"top-p", po::value<float>()->default_value(0.92f),
|
||||
"Nucleus sampling top-p in (0,1] (higher = more random)")(
|
||||
"n-ctx", po::value<uint32_t>()->default_value(2048),
|
||||
"Context window size in tokens (1-32768)")(
|
||||
"seed", po::value<int>()->default_value(-1),
|
||||
"Sampler seed: -1 for random, otherwise non-negative integer");
|
||||
|
||||
po::variables_map vm;
|
||||
po::store(po::parse_command_line(argc, argv, desc), vm);
|
||||
po::notify(vm);
|
||||
po::variables_map vm;
|
||||
po::store(po::parse_command_line(argc, argv, desc), vm);
|
||||
po::notify(vm);
|
||||
|
||||
if (vm.count("help")) {
|
||||
std::cout << desc << "\n";
|
||||
return false;
|
||||
}
|
||||
if (vm.count("help")) {
|
||||
std::cout << desc << "\n";
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check for mutually exclusive --mocked and --model flags
|
||||
bool use_mocked = vm["mocked"].as<bool>();
|
||||
std::string model_path = vm["model"].as<std::string>();
|
||||
// Check for mutually exclusive --mocked and --model flags
|
||||
bool use_mocked = vm["mocked"].as<bool>();
|
||||
std::string model_path = vm["model"].as<std::string>();
|
||||
|
||||
if (use_mocked && !model_path.empty()) {
|
||||
spdlog::error("ERROR: --mocked and --model are mutually exclusive");
|
||||
return false;
|
||||
}
|
||||
if (use_mocked && !model_path.empty()) {
|
||||
spdlog::error("ERROR: --mocked and --model are mutually exclusive");
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!use_mocked && model_path.empty()) {
|
||||
spdlog::error("ERROR: Either --mocked or --model must be specified");
|
||||
return false;
|
||||
}
|
||||
if (!use_mocked && model_path.empty()) {
|
||||
spdlog::error("ERROR: Either --mocked or --model must be specified");
|
||||
return false;
|
||||
}
|
||||
|
||||
// Warn if sampling parameters are provided with --mocked
|
||||
if (use_mocked) {
|
||||
bool hasTemperature = vm["temperature"].defaulted() == false;
|
||||
bool hasTopP = vm["top-p"].defaulted() == false;
|
||||
bool hasSeed = vm["seed"].defaulted() == false;
|
||||
// Warn if sampling parameters are provided with --mocked
|
||||
if (use_mocked) {
|
||||
bool hasTemperature = vm["temperature"].defaulted() == false;
|
||||
bool hasTopP = vm["top-p"].defaulted() == false;
|
||||
bool hasSeed = vm["seed"].defaulted() == false;
|
||||
|
||||
if (hasTemperature || hasTopP || hasSeed) {
|
||||
spdlog::warn("WARNING: Sampling parameters (--temperature, --top-p, --seed) are ignored when using --mocked");
|
||||
}
|
||||
}
|
||||
if (hasTemperature || hasTopP || hasSeed) {
|
||||
spdlog::warn(
|
||||
"WARNING: Sampling parameters (--temperature, --top-p, --seed) "
|
||||
"are ignored when using --mocked");
|
||||
}
|
||||
}
|
||||
|
||||
options.use_mocked = use_mocked;
|
||||
options.model_path = model_path;
|
||||
options.cache_dir = vm["cache-dir"].as<std::string>();
|
||||
options.temperature = vm["temperature"].as<float>();
|
||||
options.top_p = vm["top-p"].as<float>();
|
||||
options.seed = vm["seed"].as<int>();
|
||||
// commit is always pinned to c5eb7772
|
||||
options.use_mocked = use_mocked;
|
||||
options.model_path = model_path;
|
||||
options.cache_dir = vm["cache-dir"].as<std::string>();
|
||||
options.temperature = vm["temperature"].as<float>();
|
||||
options.top_p = vm["top-p"].as<float>();
|
||||
options.n_ctx = vm["n-ctx"].as<uint32_t>();
|
||||
options.seed = vm["seed"].as<int>();
|
||||
// commit is always pinned to c5eb7772
|
||||
|
||||
return true;
|
||||
return true;
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
try {
|
||||
const CurlGlobalState curl_state;
|
||||
int main(int argc, char* argv[]) {
|
||||
try {
|
||||
const CurlGlobalState curl_state;
|
||||
|
||||
ApplicationOptions options;
|
||||
if (!ParseArguments(argc, argv, options)) {
|
||||
return 0;
|
||||
}
|
||||
ApplicationOptions options;
|
||||
if (!ParseArguments(argc, argv, options)) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
auto webClient = std::make_shared<CURLWebClient>();
|
||||
SqliteDatabase database;
|
||||
auto webClient = std::make_shared<CURLWebClient>();
|
||||
SqliteDatabase database;
|
||||
|
||||
BiergartenDataGenerator generator(options, webClient, database);
|
||||
return generator.Run();
|
||||
BiergartenDataGenerator generator(options, webClient, database);
|
||||
return generator.Run();
|
||||
|
||||
} catch (const std::exception &e) {
|
||||
spdlog::error("ERROR: Application failed: {}", e.what());
|
||||
return 1;
|
||||
}
|
||||
} catch (const std::exception& e) {
|
||||
spdlog::error("ERROR: Application failed: {}", e.what());
|
||||
return 1;
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user