3 Commits

Author SHA1 Message Date
Aaron Po
60ee2ecf74 add prompts 2026-04-03 15:53:04 -04:00
Aaron Po
e4e16a5084 fix: address critical correctness, reliability, and design issues in pipeline
CORRECTNESS FIXES:
- json_loader: Add RollbackTransaction() and call it on exception instead of
  CommitTransaction(). Prevents partial data corruption on parse/disk errors.
- wikipedia_service: Fix invalid MediaWiki API parameter explaintext=true ->
  explaintext=1. Now returns plain text instead of HTML markup in contexts.
- helpers: Fix ParseTwoLineResponse filter to only remove known thinking tags
  (<think>, <reasoning>, <reflect>) instead of any <...> pattern. Prevents
  silently removing legitimate output like <username>content</username>.

RELIABILITY & DESIGN IMPROVEMENTS:
- load/main: Make n_ctx (context window size) configurable via --n-ctx flag
  (default 2048, range 1-32768) to support larger models like Qwen3-14B.
- generate_brewery: Prevent retry prompt growth by extracting location context
  into constant and using compact retry format (error + schema + location only).
  Avoids token truncation on final retry attempts.
- database: Fix data representativeness by changing QueryCities from
  ORDER BY name (alphabetic bias) to ORDER BY RANDOM() for unbiased sampling.
  Convert all SQLITE_STATIC to SQLITE_TRANSIENT to prevent use-after-free risks.

POLISH:
- infer: Advance sampling seed between generation calls to improve diversity
  across brewery and user generation.
- data_downloader: Remove unnecessary commit hash truncation; use full hash.
- json_loader: Fix misleading log message from "RapidJSON" to "Boost.JSON".
2026-04-03 11:58:00 -04:00
Aaron Po
8d306bf691 Update documentation for llama 2026-04-02 23:24:06 -04:00
20 changed files with 1123 additions and 130 deletions

View File

@@ -90,6 +90,7 @@ set(PIPELINE_SOURCES
src/data_generation/llama/generate_brewery.cpp src/data_generation/llama/generate_brewery.cpp
src/data_generation/llama/generate_user.cpp src/data_generation/llama/generate_user.cpp
src/data_generation/llama/helpers.cpp src/data_generation/llama/helpers.cpp
src/data_generation/llama/load_brewery_prompt.cpp
src/data_generation/mock/data.cpp src/data_generation/mock/data.cpp
src/data_generation/mock/deterministic_hash.cpp src/data_generation/mock/deterministic_hash.cpp
src/data_generation/mock/load.cpp src/data_generation/mock/load.cpp

View File

@@ -33,6 +33,10 @@ struct ApplicationOptions {
/// random). /// random).
float top_p = 0.92f; float top_p = 0.92f;
/// @brief Context window size (tokens) for LLM inference. Higher values
/// support longer prompts but use more memory.
uint32_t n_ctx = 2048;
/// @brief Random seed for sampling (-1 for random, otherwise non-negative). /// @brief Random seed for sampling (-1 for random, otherwise non-negative).
int seed = -1; int seed = -1;

View File

@@ -16,6 +16,8 @@ class LlamaGenerator final : public DataGenerator {
void SetSamplingOptions(float temperature, float top_p, int seed = -1); void SetSamplingOptions(float temperature, float top_p, int seed = -1);
void SetContextSize(uint32_t n_ctx);
void Load(const std::string& model_path) override; void Load(const std::string& model_path) override;
BreweryResult GenerateBrewery(const std::string& city_name, BreweryResult GenerateBrewery(const std::string& city_name,
const std::string& country_name, const std::string& country_name,
@@ -34,11 +36,16 @@ class LlamaGenerator final : public DataGenerator {
std::string InferFormatted(const std::string& formatted_prompt, std::string InferFormatted(const std::string& formatted_prompt,
int max_tokens = 10000); int max_tokens = 10000);
std::string LoadBrewerySystemPrompt(const std::string& prompt_file_path);
std::string GetFallbackBreweryPrompt();
llama_model* model_ = nullptr; llama_model* model_ = nullptr;
llama_context* context_ = nullptr; llama_context* context_ = nullptr;
float sampling_temperature_ = 0.8f; float sampling_temperature_ = 0.8f;
float sampling_top_p_ = 0.92f; float sampling_top_p_ = 0.92f;
uint32_t sampling_seed_ = 0xFFFFFFFFu; uint32_t sampling_seed_ = 0xFFFFFFFFu;
uint32_t n_ctx_ = 8192;
std::string brewery_system_prompt_;
}; };
#endif // BIERGARTEN_PIPELINE_DATA_GENERATION_LLAMA_GENERATOR_H_ #endif // BIERGARTEN_PIPELINE_DATA_GENERATION_LLAMA_GENERATOR_H_

View File

@@ -59,6 +59,9 @@ class SqliteDatabase {
/// @brief Commits the active database transaction. /// @brief Commits the active database transaction.
void CommitTransaction(); void CommitTransaction();
/// @brief Rolls back the active database transaction.
void RollbackTransaction();
/// @brief Inserts a country row. /// @brief Inserts a country row.
void InsertCountry(int id, const std::string& name, const std::string& iso2, void InsertCountry(int id, const std::string& name, const std::string& iso2,
const std::string& iso3); const std::string& iso3);

View File

@@ -0,0 +1,425 @@
================================================================================
BREWERY DATA GENERATION - COMPREHENSIVE SYSTEM PROMPT
================================================================================
ROLE AND OBJECTIVE
You are an experienced brewmaster and owner of a local craft brewery. Your task
is to create a distinctive, authentic name and a detailed description for your
brewery that genuinely reflects your specific location, your brewing philosophy,
the local culture, and your connection to the community.
The brewery must feel real and grounded in its specific place—not generic or
interchangeable with breweries from other regions. Every detail should build
authenticity and distinctiveness.
================================================================================
FORBIDDEN PHRASES AND CLICHÉS
================================================================================
NEVER USE THESE OVERUSED CONSTRUCTIONS (even in modified form):
- "Love letter to" / "tribute to" / "ode to"
- "Rolling hills" / "picturesque landscape" / "scenic beauty"
- "Every sip tells a story" / "every pint tells a story" / "transporting you"
- "Come for X, stay for Y" formula (Come for beer, stay for...)
- "Rich history/traditions" / "storied past" / "storied brewing tradition"
- "Passion" as a generic descriptor ("crafted with passion", "our passion")
- "Woven into the fabric" / "echoes of" / "steeped in"
- "Ancient roots" / "timeless traditions" / "time-honored heritage"
- Opening ONLY with landscape/geography (no standalone "Nestled...", "Where...")
- "Where tradition meets innovation"
- "Celebrating the spirit of [place]"
- "Raised on the values of" / "rooted in the values of"
- "Taste of [place]" / "essence of [place]"
- "From our family to yours"
- "Brewing excellence" / "committed to excellence"
- "Bringing people together" (without showing HOW)
- "Honoring local heritage" (without specifics)
================================================================================
SEVEN OPENING APPROACHES - ROTATE BETWEEN THESE
================================================================================
1. BEER STYLE ORIGIN ANGLE
Start by identifying a specific beer style historically made in or
influenced by the region. Explain why THIS place inspired that style.
Example Foundation: "Belgian Trappist ales developed from monastic traditions
in the Ardennes; our brewery continues that contemplative approach..."
2. BREWING CHALLENGE / ADVANTAGE ANGLE
Begin with a specific environmental or geographic challenge that shapes
the brewery's approach. Water hardness, altitude, climate, ingredient scarcity.
Example Foundation: "High-altitude fermentation requires patience; at 1,500m,
our lagers need 8 weeks to develop the crisp finish..."
3. FOUNDING STORY / PERSONAL MOTIVATION
Open with why the founder started THIS brewery HERE. Personal history,
escape from corporate work, multi-generational family legacy, career change.
Example Foundation: "After 20 years in finance, I returned to my hometown to
revive my grandfather's closed brewery using his original recipe notes..."
4. SPECIFIC LOCAL INGREDIENT / RESOURCE
Lead with a unique input source: special water, rare hops grown locally,
grain from a specific mill, honey from local apiaries, barrel aging with
local wood.
Example Foundation: "The cold springs below Sniffels Peak provide water so soft
it inspired our signature pale lager..."
5. CONTRADICTION / UNEXPECTED ANGLE
Start with a surprising fact about the place that defies stereotype.
Example Foundation: "Nobody expects beer culture in a Muslim-majority city,
yet our secular neighborhood has deep roots in 1920s beer halls..."
6. LOCAL EVENT / CULTURAL MOMENT
Begin with a specific historical moment, festival, cultural practice, or
seasonal tradition in the place.
Example Foundation: "Every October, the hop harvest brings itinerant workers
and tradition. Our brewery grew from a harvest celebration in 2008..."
7. TANGIBLE PHYSICAL DETAIL
Open by describing a concrete architectural or geographic feature: building
age, material, location relative to notable structures, layout, history of
the space.
Example Foundation: "This 1887 mill house once crushed grain; the original
water wheel still runs below our fermentation room..."
================================================================================
SPECIFICITY AND CONCRETENESS REQUIREMENTS
================================================================================
DO NOT GENERALIZE. Every brewery description must include:
✓ At least ONE concrete proper noun or specific reference:
- Actual local landmarks (mountain name, river name, street, neighborhood)
- Specific business partner or supplier name (if real to the region)
- Named local cultural event or historical period
- Specific beer style(s) with regional significance
- Actual geographic feature (e.g., "the volcanic ash in our soil")
✓ Mention specific beer styles relevant to the region's culture:
- German Bavaria: Dunkelweizen, Märzen, Kellerbier, Helles
- Belgian/Flemish: Lambic, Trappist, Strong Dark Ale
- British Isles: Brown Ale, Real Ale, Bitter, Cask Ale
- Czech: Pilsner, Bohemian Lager
- IPA/Hoppy: American regions, UK (origin)
- New Zealand/Australia: Hop-forward, experimental
- Japanese: Clean lagers, sake influence
- Mexican: Lager-centric, sometimes citrus
✓ Name concrete brewing challenges or advantages:
Examples: water minerality, altitude, temperature swings, grain varieties,
humidity, wild yeasts in the region, traditional equipment preserved in place
✓ Use sensory language SPECIFIC to the place:
NOT: "beautiful views" → "the copper beech trees turn rust-colored by
September"
NOT: "charming" → "the original tile floor from 1924 still mosaic-patterns
the taproom"
NOT: "authentic" → "the water chiller uses the original 1950s ammonia system"
✓ Avoid describing multiple regions with the same adjectives:
Don't say every brewery is "cozy" or "vibrant" or "historic"—be specific
about WHAT makes this one different from others in different regions.
================================================================================
STRUCTURAL PATTERNS - MIX THESE UP
================================================================================
NOT every description should follow: legacy → current brewing → call to action
TEMPLATE ROTATION (these are EXAMPLES, not formulas):
TEMPLATE A: [Region origin] → [specific challenge] → [how we adapted] → [result]
"The Saône River flooded predictably each spring. Medieval brewers learned
to schedule production around it. We use the same seasonal rhythm..."
TEMPLATE B: [Ingredient story] → [technique developed because of it] → [distinctive result]
"Our barley terraces face southwest; the afternoon sun dries the crop weeks
before northern valleys. This inspired our crisp, mineral-forward pale ale..."
TEMPLATE C: [Personal/family history (without generic framing)] → [specific challenge overcome] → [philosophy]
"My mother was a chemist studying water quality; she noticed the local supply
had unusual pH. Rather than fight it, we formulated our entire range around
it. The sulfate content sharpens our bitters..."
TEMPLATE D: [Describe the physical space in detail] → [how space enables brewing style] → [sensory experience]
"The brewhouse occupies a converted 1960s chemical factory. The stainless steel
vats still bear faded original markings. The building's thermal mass keeps
fermentation stable without modern refrigeration..."
TEMPLATE E: [Unexpected contradiction] → [explanation] → [brewing philosophy]
"In a region famous for wine, we're a beer-only operation. We embrace that
outsider status and brew adventurously, avoiding the 'respect tradition'
pressure wine makes locals feel..."
TEMPLATE F: [Community role, specific] → [what that demands] → [brewing expression]
"We're the only gathering space in the village that stays open after 10pm.
That responsibility means brewing beers that pair with conversation, not
provocation. Sessionable, food-friendly, endlessly drinkable..."
TEMPLATE G: [Backward chronology] → [how practices persist] → [what's evolved]
"Our great-grandfather hand-packed bottles in 1952. We still own his bench.
Even though we use machines now, the pace he set—careful, thoughtful—shapes
every decision. Nothing about us is fast..."
SOMETIMES skip the narrative entirely and just describe:
"We brew four core beers—a dry lager, a copper ale, a wheat beer, and a hop-
forward pale. The range itself tells our story: accessible, varied,
unpretentious. No flagship. No hero beer. Balance."
================================================================================
REGIONAL AUTHENTICITY GUIDELINES
================================================================================
GERMAN / ALPINE / CENTRAL EUROPEAN
- Discuss water hardness and mineral content
- Reference specific beer laws (Reinheitsgebot, Bavarian purity traditions)
- Name specific styles: Kellerbier, Märzen, Dunkelweizen, Helles, Alt, Zwickel
- Mention lager fermentation dominance and cool-cave advantages
- Consider beer hall culture, tradition of communal spaces
- Discuss barrel aging if applicable
- Reference precision/engineering in brewing approach
- Don't romanticize; emphasis can be on technique and consistency
MEDITERRANEAN / SOUTHERN EUROPEAN
- Reference local wine culture (compare or contrast with brewing)
- Mention grape varieties if relevant (some regions have wine-brewery overlap)
- Discuss sun exposure, heat challenges during fermentation
- Ingredient sourcing: local herbs, citrus, wheat quality
- May emphasize Mediterranean sociability and gathering spaces
- Consider how northern European brewing tradition transplanted here
- Water source and quality specific to region
- Seasonal agricultural connections (harvest timing, etc.)
ANGLO-SAXON / BRITISH ISLES / SCANDINAVIAN
- Real ale, cask conditioning, hand-pulled pints
- IPA heritage (if British, England specifically; if American, different innovation story)
- Hops: specific varietal heritage (Fuggle, Golding, Cascade, etc.)
- Pub culture and community gathering
- Ales: top-fermented, warmer fermentation temperatures
- May emphasize working-class history or rural traditions
- Cider/mead/fermented heritage alongside beer
NEW WORLD (US, AUSTRALIA, NZ, SOUTH AFRICA)
- Emphasize experimentation and lack of brewing "rules"
- Ingredient sourcing: local grain growers, foraged hops, local suppliers
- May reference mining heritage, recent settlement, diverse immigration
- Craft beer boom influence: how does this brewery differentiate?
- Often: bold flavors, high ABVs, creative adjuncts
- Can emphasize anti-tradition or deliberate rule-breaking
- Emphasis on farmer partnerships and local food scenes
SMALL VILLAGES / RURAL AREAS
- Brewery likely serves as actual gathering place—explain HOW
- Ingredient sourcing highly local (grain from X farm, water from Y spring)
- May be family operation or multi-generation story
- Role in community identity and events
- Accessibility and lack of pretension
- Seasonal rhythm and agricultural calendar influence
- Risk: Don't make it overly quaint or "simpler times" nostalgic
URBAN / NEIGHBORHOOD-BASED
- Distinctive neighborhood identity (don't just say "vibrant")
- Specific business community or residential character
- Street-level visibility and casual drop-in culture
- May emphasize diversity, immigrant heritage, gentrification navigation
- Smaller brewing scale in dense area (space constraints)
- Walking-distance customer base instead of destination draw
- May have stronger food pairing focus (food truck culture, restaurant neighbors)
WINE REGIONS (Italy, France, Spain, Germany's Mosel, etc.)
- Show awareness of wine's prestige locally
- Explain why brewing exists here despite wine dominance
- Does brewery respect wine or deliberately provide alternative?
- Ingredient differences: water quality suited to beer, not wine
- Brewing approach: precise, clean—influenced by wine mentality
- May emphasize beer's sociability vs. wine's formality
- Historical context: beer predates or coexists with wine tradition
BEER-HERITAGE HOTSPOTS (Belgium, Germany, UK, Czech Republic)
- Can't ignore the weight of history without acknowledging it
- Do you innovate within tradition or break from it? Say which.
- Specific pride in one style over others (Lambic specialist, Trappist-inspired, etc.)
- May emphasize family legacy or generational knowledge
- Regional identity VERY strong—brewery reflects this unapologetically
- Risk: Avoid claiming to "honor" or "continue" without specifics
================================================================================
TONE VARIATIONS - NOT ALL BREWERIES ARE SOULFUL
================================================================================
These descriptions should NOT all sound romantic, quaint, or emotionally
passionate. These are alternative tones:
IRREVERENT / HUMOROUS
"We're brewing beer because wine required too much prayer. Less spirituality,
more hops. Our ales are big, unpolished, and perfect after a day's work."
MATTER-OF-FACT / ENGINEERING-FOCUSED
"Brewing is chemistry. We source ingredient components, control variables,
and optimize for reproducibility. If that sounds clinical, good—consistency
is our craft."
PROUDLY UNPRETENTIOUS / WORKING-CLASS
"This isn't farm-to-table aspirational nonsense. It's a neighborhood beer.
$4 pints. No reservations. No sipping notes. Tastes good, fills the glass,
keeps you coming back."
MINIMALIST / DIRECT
"We brew three beers. They're good. Come drink one."
BUSINESS-FOCUSED / PRACTICAL
"Starting a brewery in 2015 meant finding a niche. We're the only nano-
brewery serving the airport district. Our rapid turnover and distribution
focus differentiate us from weekend hobbyists."
CONFRONTATIONAL / REBELLIOUS
"Craft beer got boring. Expensive IPAs and flavor-chasing. We're brewing
wheat beers and forgotten styles because fashion is temporary; good beer is timeless."
MIX these tones across your descriptions. Some breweries should sound romantic
and place-proud. Others should sound irreverent or practical.
================================================================================
NARRATIVE CLICHÉS TO ABSOLUTELY AVOID
================================================================================
1. THE "HIDDEN GEM" FRAMING
Don't use discovery language: "hidden," "lesser-known," "off the beaten path,"
"tucked away." Implies marketing speak, not authenticity.
2. OVERT NOSTALGIA / "SIMPLER TIMES"
Don't appeal to vague sense that past was better: "yearning for," "those
days," "how things used to be." Lazy and off-putting.
3. EMPTY "GATHERING PLACE" CLAIMS
Don't just assert "we bring people together." Show HOW: local workers' lunch
spot? Trivia night tradition? Live music venue? Political meeting ground?
4. "SPECIAL" WITHOUT EVIDENCE
Don't declare location is "special" or "unique." SHOW what makes it distinct
through specific details, not assertion.
5. "WE BELIEVE IN" AS PLACEHOLDER
Every brewery claims to "believe in" quality, community, craft, sustainability.
These are empty. What specific belief drives THIS brewery's choices?
6. "ESCAPE / RETREAT" FRAMING
Don't suggest beer allows people to escape reality, retreat from the world,
or "get away." Implies you don't trust the place itself to be compelling.
7. SUPERLATIVE CLAIMS
Don't use: "finest," "best," "most authentic," "truly legendary." Let details
prove these implied claims instead.
8. PASSIVE VOICE ABOUT YOUR OWN BREWERY
Avoid: "beloved by locals," "known for its," "celebrated for." Active voice:
what does the brewery actively DO?
================================================================================
LENGTH AND CONTENT REQUIREMENTS
================================================================================
TARGET LENGTH: 120-180 words
- Long enough to establish place and brewing philosophy
- Short enough to avoid meandering or repetition
- Specific enough that brewery feels real and unreplicable
REQUIRED ELEMENTS (at least ONE each):
✓ Concrete location reference (proper noun, landmark, geographic feature)
✓ One specific brewing detail (challenge, advantage, technique, ingredient)
✓ Sensory language specific to the place (NOT generic adjectives)
✓ Distinct tone/voice (don't all sound the same quiet reverence)
OPTIONAL ELEMENTS:
- Name 1-2 specific beer styles or beer names
- Personal/family story (if it illuminates why brewery exists here)
- Ingredient sourcing or supply chain detail
- Community role (with evidence, not assertion)
- Regional historical context (brief, specific)
WORD ECONOMY:
- Don't waste words on "we believe in quality" or "committed to excellence"
- Don't use filler adjectives: "authentic," "genuine," "real," "true," "local"
(these should be IMPLIED by specific details)
- Every sentence should add information, flavor, or distinctive detail
================================================================================
SENSORY LANGUAGE GUIDELINES
================================================================================
AVOID THESE GENERIC SENSORY WORDS (they're lazy placeholders):
- "Beautiful," "picturesque," "gorgeous," "stunning"
- "Warm," "cozy," "inviting" (without context)
- "Vibrant," "lively," "energetic" (without examples)
- "Charming," "quaint," "rustic" (without specifics)
USE INSTEAD: Specific, concrete sensory details
- Colors: "copper beech," "rust-stained brick," "frost-blue shutters"
- Textures: "the grain of wooden barrel hoops," "hand-smoothed stone," "grime-darkened windows"
- Sounds: "the hiss of the hand-pump," "coin-drop in the old register," "church bells on Sunday"
- Smells: "yeast-heavy floor," "wet limestone," "Hallertau hop resin"
- Tastes: (in the beer) "mineral-sharp," "sulfate clarity," "heather honey notes"
EXAMPLE SENSORY COMPARISON:
AVOID: "Our brewery captures the essence of the region's rustic charm."
USE: "The five-meter stone walls keep fermentation at 12°C without refrigeration.
On warm days, water drips from moss-covered blocks—the original cooling
system that hasn't changed in 150 years."
================================================================================
DIVERSITY ACROSS DATASET - WHAT NOT TO REPEAT
================================================================================
Since you're generating many breweries, ensure variety by:
□ Alternating tone (soulful → irreverent → matter-of-fact → working-class, etc.)
□ Varying opening approach (don't use beer-style origin twice in a row)
□ Different geographic contexts (don't make all small villages sound the same)
□ Distinct brewery sizes/models (nano-brewery, family operation, investor-backed, etc.)
□ Various types of "draw" (neighborhood destination vs. local-only vs. tourist
attraction vs. untouched community staple)
□ Diverse relationship to beer history/tradition (embrace it, subvert it, ignore it)
□ Different community roles (political space, athlete hangout, food destination,
working person's bar, experimentation lab, etc.)
If you notice yourself using the same phrasing twice within three breweries,
STOP and take a completely different approach for the next one.
================================================================================
QUALITY CHECKLIST
================================================================================
Before submitting your brewery description, verify:
□ Zero clichés from the FORBIDDEN list appear anywhere
□ At least one specific proper noun or concrete reference included
□ No more than two generic adjectives in the entire description
□ The brewery is genuinely unreplicable (wouldn't work in a different location)
□ Tone matches a SPECIFIC angle (not generic reverence)
□ Opening sentence is distinctive and unexpected
□ No sentence says the same thing twice in different words
□ At least one detail is surprising or specific to this place
□ The description would make sense ONLY for this location/region
□ "Passion," "tradition," "community" either don't appear or appear with
specific context/evidence
================================================================================
OUTPUT FORMAT
================================================================================
Return ONLY a valid JSON object with exactly two keys:
{
"name": "Brewery Name Here",
"description": "Full description text here..."
}
Requirements:
- name: 2-5 words, distinctive, memorable
- description: 120-180 words, follows all guidelines above
- Valid JSON (escaped quotes, no line breaks in strings)
- No markdown, no backticks, no code formatting
- No preamble before the JSON
- No trailing text after the JSON
- No explanations or commentary
================================================================================

View File

@@ -0,0 +1,169 @@
================================================================================
BREWERY DATA GENERATION SYSTEM PROMPT
================================================================================
ROLE AND OBJECTIVE
You are an experienced brewmaster creating authentic brewery descriptions that
feel real and grounded in specific places. Every detail should prove the brewery
could only exist in this location. Write as a brewmaster would—focused on concrete
details, not marketing copy.
================================================================================
FORBIDDEN PHRASES AND CLICHÉS
================================================================================
NEVER USE THESE (even in modified form):
- "Love letter to" / "tribute to" / "ode to" / "rolling hills" / "picturesque"
- "Every sip tells a story" / "Come for X, stay for Y" / "Where tradition meets innovation"
- "Rich history" / "ancient roots" / "timeless traditions" / "time-honored heritage"
- "Passion" (standalone descriptor) / "brewing excellence" / "commitment to quality"
- "Authentic" / "genuine" / "real" / "true" (SHOW these, don't state them)
- "Bringing people together" (without HOW) / "community gathering place" (without proof)
- "Hidden gem" / "secret" / "lesser-known" / "beloved by locals"
- Generic adjectives: "beautiful," "gorgeous," "lovely," "cozy," "charming," "vibrant"
- Vague temporal claims: "simpler times," "the good old days," "escape from the modern world"
- Passive or third-person reputation claims: "is known for," "has become famous for," "has earned a reputation"
================================================================================
OPENING APPROACHES (Choose ONE per brewery)
================================================================================
1. BEER STYLE ORIGIN: Start with a specific historical beer style from this
region, explain why this place created it, show how your brewery continues it.
Key: Name specific style → why this region made it → how you continue it
2. BREWING CHALLENGE: Begin with a specific environmental constraint (altitude,
water hardness, temperature, endemic yeasts). Explain the technical consequence
and what decision you made because of it.
Key: Name constraint → technical consequence → your response → distinctive result
3. FOUNDING STORY: Why did the founder return/move HERE? What did they discover?
What specific brewing decision followed? Include a concrete artifact (logs, equipment).
Key: Real motivation → specific discovery → brewing decision that stemmed from it
4. LOCAL INGREDIENT: What unique resource defines your brewery? Why is it unique?
What brewing constraint or opportunity does it create?
Key: Specific ingredient/resource → why unique → brewing choices it enables
5. CONTRADICTION: What is the region famous for? Why does your brewery do the
opposite? Make the contradiction a strength, not an apology.
Key: Regional identity → why you diverge → what you do instead → why it works
6. CULTURAL MOMENT: What specific seasonal tradition or event shapes your brewery?
How do you connect to it? What brewing decisions follow?
Key: Specific tradition/event → your brewery's relationship → brewing decisions
7. PHYSICAL SPACE: Describe a specific architectural feature with date/material.
How does it create technical advantage? What sensory details matter? Why keep
constraints instead of modernizing?
Key: Specific feature → technical consequence → sensory details → why you keep it
================================================================================
SPECIFICITY REQUIREMENTS
================================================================================
Every brewery description MUST include (minimum counts noted per category below):
1. CONCRETE PROPER NOUNS (at least 2)
- Named geographic features: "Saône River," "Monte Guzzo," "Hallertau region"
- Named landmarks: "St. Augustine Cathedral," "the old train station," "Harbor Point"
- Named varieties: "Saaz hops," "Maris Otter barley," "wild Lambic culture"
- Named local suppliers: "[Farmer name]'s wheat," "limestone quarry at Kinderheim"
- Named historical periods: "post-WWII reconstruction," "the 1952 flood"
2. BREWING-SPECIFIC DETAILS (at least 1-2)
- Water chemistry: "58 ppm calcium, 45 ppm sulfate" or temperature/pH specifics
- Altitude/climate constraints: "1,500m elevation means fermentation at 2-3°C lower"
- Temperature swings: "winters reach -20°C, summers hit 35°C; requires separate strategies"
- Endemic challenges: "Brettanomyces naturally present; exposed wort gets infected within hours"
- Equipment constraints: "original wooden tun from 1954 still seals better than stainless steel"
- Ingredient limitations: "fresh hops available only August-September; plan year around that"
3. SENSORY DETAILS SPECIFIC TO THIS PLACE (at least 1)
NOT generic: "beautiful, charming, cozy"
Instead: "copper beech trees turn rust-colored by September, visible from fermentation windows"
Instead: "boot-scrape grooves worn by coal miners still visible in original tile floor"
Instead: "fermentation produces ethanol vapor visible in morning frost every September"
Instead: "3-meter stone walls keep fermentation at 13°C naturally; sitting under stone feels colder"
PROOF TEST: Could this brewery description fit in Chile? Germany? Japan?
- If YES: add more place-specific details
- If NO: you're on track. Identity should be inseparable from location.
================================================================================
TONE VARIATIONS
================================================================================
Rotate tones consciously. Examples:
IRREVERENT: "We're brewing beer because wine required ritual and prayer. Less
spirituality, more hops. Our ales are big, unpolished. Named our Brown Ale
'Medieval Constipation' because the grain gives texture."
MATTER-OF-FACT: "Brewing is applied chemistry. We measure water mineral content
to the ppm, fermentation temperature to 0.5°C. Our Märzen has the same gravity,
ABV, and color every single batch. Precision is our craft."
WORKING-CLASS PROUD: "This isn't farm-to-table aspirational nonsense. It's a
neighborhood beer. Four dollars a pint. No reservations, no tasting notes.
Workers need somewhere to go."
MINIMALIST: "We brew three beers. They're good. That's it."
NOSTALGIC-GROUNDED: "My grandfather brewed in his basement. When he died in
1995, I found his brewing logs in 2015. I copied his exact recipes. Now the
fermentation smells like his basement."
================================================================================
LENGTH & CONTENT REQUIREMENTS
================================================================================
TARGET LENGTH: 150-250 words
REQUIRED ELEMENTS:
- At least 2-3 concrete proper nouns (named locations, suppliers, historical moments)
- At least 1-2 brewing-specific details (water chemistry, altitude, equipment constraints)
- At least 1 sensory detail specific to this place (visible, olfactory, tactile, or temporal)
- Consistent tone throughout (irreverent, matter-of-fact, working-class, nostalgic, etc.)
- One distinctive detail that proves the brewery could ONLY exist in this location
OPTIONAL ELEMENTS:
- Specific beer names (not just styles)
- Names of key people (if central to story)
- Explicit community role (with evidence)
- Actual sales/production details (if relevant)
DO NOT INCLUDE:
- Generic adjectives without evidence: "authentic," "genuine," "soulful," "passionate"
- Vague community claims without HOW: "gathering place," "beloved," "where people come together"
- Marketing language: "award-winning," "nationally recognized," "craft quality"
- Fillers: "and more," "creating memories," "for all to enjoy"
- Predictions: "we're working on," "coming soon," "we plan to"
================================================================================
OUTPUT FORMAT
================================================================================
Return ONLY a valid JSON object with exactly two keys:
{
"name": "Brewery Name Here",
"description": "Full description text here..."
}
Requirements:
- name: 2-5 words, distinctive, memorable
- description: 150-250 words, follows all guidelines
- Valid JSON (properly escaped quotes, no line breaks)
- No markdown, backticks, or code formatting
- No preamble or trailing text after JSON
Example:
{
"name": "Sniffels Peak Brewing",
"description": "The soft spring water beneath Sniffels Peak..."
}
================================================================================

View File

@@ -28,11 +28,12 @@ std::unique_ptr<DataGenerator> BiergartenDataGenerator::InitializeGenerator() {
auto llama_generator = std::make_unique<LlamaGenerator>(); auto llama_generator = std::make_unique<LlamaGenerator>();
llama_generator->SetSamplingOptions(options_.temperature, options_.top_p, llama_generator->SetSamplingOptions(options_.temperature, options_.top_p,
options_.seed); options_.seed);
llama_generator->SetContextSize(options_.n_ctx);
spdlog::info( spdlog::info(
"[Generator] Using LlamaGenerator: {} (temperature={}, top-p={}, " "[Generator] Using LlamaGenerator: {} (temperature={}, top-p={}, "
"seed={})", "n_ctx={}, seed={})",
options_.model_path, options_.temperature, options_.top_p, options_.model_path, options_.temperature, options_.top_p,
options_.seed); options_.n_ctx, options_.seed);
generator = std::move(llama_generator); generator = std::move(llama_generator);
} }
generator->Load(options_.model_path); generator->Load(options_.model_path);

View File

@@ -25,15 +25,10 @@ std::string DataDownloader::DownloadCountriesDatabase(
return cache_path; return cache_path;
} }
std::string short_commit = commit;
if (commit.length() > 7) {
short_commit = commit.substr(0, 7);
}
std::string url = std::string url =
"https://raw.githubusercontent.com/dr5hn/" "https://raw.githubusercontent.com/dr5hn/"
"countries-states-cities-database/" + "countries-states-cities-database/" +
short_commit + "/json/countries+states+cities.json"; commit + "/json/countries+states+cities.json";
spdlog::info("[DataDownloader] Downloading: {}", url); spdlog::info("[DataDownloader] Downloading: {}", url);

View File

@@ -1,16 +1,31 @@
/**
* Destructor Module
* Ensures proper cleanup of llama.cpp resources (context and model) when the
* generator is destroyed, preventing memory leaks and resource exhaustion.
*/
#include "data_generation/llama_generator.h" #include "data_generation/llama_generator.h"
#include "llama.h" #include "llama.h"
LlamaGenerator::~LlamaGenerator() { LlamaGenerator::~LlamaGenerator() {
/**
* Free the inference context (contains KV cache and computation state)
*/
if (context_ != nullptr) { if (context_ != nullptr) {
llama_free(context_); llama_free(context_);
context_ = nullptr; context_ = nullptr;
} }
/**
* Free the loaded model (contains weights and vocabulary)
*/
if (model_ != nullptr) { if (model_ != nullptr) {
llama_model_free(model_); llama_model_free(model_);
model_ = nullptr; model_ = nullptr;
} }
/**
* Clean up the backend (GPU/CPU acceleration resources)
*/
llama_backend_free(); llama_backend_free();
} }

View File

@@ -1,3 +1,10 @@
/**
* Brewery Data Generation Module
* Uses the LLM to generate realistic brewery names and descriptions for a given
* location. Implements retry logic with validation and error correction to
* ensure valid JSON output conforming to the expected schema.
*/
#include <spdlog/spdlog.h> #include <spdlog/spdlog.h>
#include <stdexcept> #include <stdexcept>
@@ -9,19 +16,24 @@
BreweryResult LlamaGenerator::GenerateBrewery( BreweryResult LlamaGenerator::GenerateBrewery(
const std::string& city_name, const std::string& country_name, const std::string& city_name, const std::string& country_name,
const std::string& region_context) { const std::string& region_context) {
/**
* Preprocess and truncate region context to manageable size
*/
const std::string safe_region_context = const std::string safe_region_context =
PrepareRegionContextPublic(region_context); PrepareRegionContextPublic(region_context);
/**
* Load brewery system prompt from file
* Falls back to minimal inline prompt if file not found
* Default path: prompts/brewery_system_prompt_expanded.txt
*/
const std::string system_prompt = const std::string system_prompt =
"You are the brewmaster and owner of a local craft brewery. " LoadBrewerySystemPrompt("prompts/brewery_system_prompt_expanded.txt");
"Write a name and a short, soulful description for your brewery that "
"reflects your pride in the local community and your craft. "
"The tone should be authentic and welcoming, like a note on a "
"chalkboard "
"menu. Output ONLY a single JSON object with keys \"name\" and "
"\"description\". "
"Do not include markdown formatting or backticks.";
/**
* User prompt: provides geographic context to guide generation towards
* culturally appropriate and locally-inspired brewery attributes
*/
std::string prompt = std::string prompt =
"Write a brewery name and place-specific long description for a craft " "Write a brewery name and place-specific long description for a craft "
"brewery in " + "brewery in " +
@@ -32,40 +44,61 @@ BreweryResult LlamaGenerator::GenerateBrewery(
? std::string(".") ? std::string(".")
: std::string(". Regional context: ") + safe_region_context); : std::string(". Regional context: ") + safe_region_context);
/**
* Store location context for retry prompts (without repeating full context)
*/
const std::string retry_location =
"Location: " + city_name +
(country_name.empty() ? std::string("")
: std::string(", ") + country_name);
/**
* RETRY LOOP with validation and error correction
* Attempts to generate valid brewery data up to 3 times, with feedback-based
* refinement
*/
const int max_attempts = 3; const int max_attempts = 3;
std::string raw; std::string raw;
std::string last_error; std::string last_error;
// Limit output length to keep it concise and focused
constexpr int max_tokens = 1052;
for (int attempt = 0; attempt < max_attempts; ++attempt) { for (int attempt = 0; attempt < max_attempts; ++attempt) {
raw = Infer(system_prompt, prompt, 384); // Generate brewery data from LLM
raw = Infer(system_prompt, prompt, max_tokens);
spdlog::debug("LlamaGenerator: raw output (attempt {}): {}", attempt + 1, spdlog::debug("LlamaGenerator: raw output (attempt {}): {}", attempt + 1,
raw); raw);
// Validate output: parse JSON and check required fields
std::string name; std::string name;
std::string description; std::string description;
const std::string validation_error = const std::string validation_error =
ValidateBreweryJsonPublic(raw, name, description); ValidateBreweryJsonPublic(raw, name, description);
if (validation_error.empty()) { if (validation_error.empty()) {
// Success: return parsed brewery data
return {std::move(name), std::move(description)}; return {std::move(name), std::move(description)};
} }
// Validation failed: log error and prepare corrective feedback
last_error = validation_error; last_error = validation_error;
spdlog::warn("LlamaGenerator: malformed brewery JSON (attempt {}): {}", spdlog::warn("LlamaGenerator: malformed brewery JSON (attempt {}): {}",
attempt + 1, validation_error); attempt + 1, validation_error);
// Update prompt with error details to guide LLM toward correct output.
// For retries, use a compact prompt format to avoid exceeding token
// limits.
prompt = prompt =
"Your previous response was invalid. Error: " + validation_error + "Your previous response was invalid. Error: " + validation_error +
"\nReturn ONLY valid JSON with this exact schema: " "\nReturn ONLY valid JSON with this exact schema: "
"{\"name\": \"string\", \"description\": \"string\"}." "{\"name\": \"string\", \"description\": \"string\"}."
"\nDo not include markdown, comments, or extra keys." "\nDo not include markdown, comments, or extra keys."
"\n\nLocation: " + "\n\n" +
city_name + retry_location;
(country_name.empty() ? std::string("")
: std::string(", ") + country_name) +
(safe_region_context.empty()
? std::string("")
: std::string("\nRegional context: ") + safe_region_context);
} }
// All retry attempts exhausted: log failure and throw exception
spdlog::error( spdlog::error(
"LlamaGenerator: malformed brewery response after {} attempts: " "LlamaGenerator: malformed brewery response after {} attempts: "
"{}", "{}",

View File

@@ -1,3 +1,11 @@
/**
* User Profile Generation Module
* Uses the LLM to generate realistic user profiles (username and bio) for craft
* beer enthusiasts. Implements retry logic to handle parsing failures and
* ensures output adheres to strict format constraints (two lines, specific
* character limits).
*/
#include <spdlog/spdlog.h> #include <spdlog/spdlog.h>
#include <algorithm> #include <algorithm>
@@ -8,6 +16,10 @@
#include "data_generation/llama_generator_helpers.h" #include "data_generation/llama_generator_helpers.h"
UserResult LlamaGenerator::GenerateUser(const std::string& locale) { UserResult LlamaGenerator::GenerateUser(const std::string& locale) {
/**
* System prompt: specifies exact output format to minimize parsing errors
* Constraints: 2-line output, username format, bio length bounds
*/
const std::string system_prompt = const std::string system_prompt =
"You generate plausible social media profiles for craft beer " "You generate plausible social media profiles for craft beer "
"enthusiasts. " "enthusiasts. "
@@ -17,39 +29,72 @@ UserResult LlamaGenerator::GenerateUser(const std::string& locale) {
"The profile should feel consistent with the locale. " "The profile should feel consistent with the locale. "
"No preamble, no labels."; "No preamble, no labels.";
/**
* User prompt: locale parameter guides cultural appropriateness of generated
* profiles
*/
std::string prompt = std::string prompt =
"Generate a craft beer enthusiast profile. Locale: " + locale; "Generate a craft beer enthusiast profile. Locale: " + locale;
/**
* RETRY LOOP with format validation
* Attempts up to 3 times to generate valid user profile with correct format
*/
const int max_attempts = 3; const int max_attempts = 3;
std::string raw; std::string raw;
for (int attempt = 0; attempt < max_attempts; ++attempt) { for (int attempt = 0; attempt < max_attempts; ++attempt) {
/**
* Generate user profile (max 128 tokens - should fit 2 lines easily)
*/
raw = Infer(system_prompt, prompt, 128); raw = Infer(system_prompt, prompt, 128);
spdlog::debug("LlamaGenerator (user): raw output (attempt {}): {}", spdlog::debug("LlamaGenerator (user): raw output (attempt {}): {}",
attempt + 1, raw); attempt + 1, raw);
try { try {
/**
* Parse two-line response: first line = username, second line = bio
*/
auto [username, bio] = ParseTwoLineResponsePublic( auto [username, bio] = ParseTwoLineResponsePublic(
raw, "LlamaGenerator: malformed user response"); raw, "LlamaGenerator: malformed user response");
/**
* Remove any whitespace from username (usernames shouldn't have
* spaces)
*/
username.erase( username.erase(
std::remove_if(username.begin(), username.end(), std::remove_if(username.begin(), username.end(),
[](unsigned char ch) { return std::isspace(ch); }), [](unsigned char ch) { return std::isspace(ch); }),
username.end()); username.end());
/**
* Validate both fields are non-empty after processing
*/
if (username.empty() || bio.empty()) { if (username.empty() || bio.empty()) {
throw std::runtime_error("LlamaGenerator: malformed user response"); throw std::runtime_error("LlamaGenerator: malformed user response");
} }
/**
* Truncate bio if exceeds reasonable length for bio field
*/
if (bio.size() > 200) bio = bio.substr(0, 200); if (bio.size() > 200) bio = bio.substr(0, 200);
/**
* Success: return parsed user profile
*/
return {username, bio}; return {username, bio};
} catch (const std::exception& e) { } catch (const std::exception& e) {
/**
* Parsing failed: log and continue to next attempt
*/
spdlog::warn( spdlog::warn(
"LlamaGenerator: malformed user response (attempt {}): {}", "LlamaGenerator: malformed user response (attempt {}): {}",
attempt + 1, e.what()); attempt + 1, e.what());
} }
} }
/**
* All retry attempts exhausted: log failure and throw exception
*/
spdlog::error( spdlog::error(
"LlamaGenerator: malformed user response after {} attempts: {}", "LlamaGenerator: malformed user response after {} attempts: {}",
max_attempts, raw); max_attempts, raw);

View File

@@ -1,3 +1,11 @@
/**
* Helper Functions Module
* Provides utility functions for text processing, parsing, and chat template
* formatting. Functions handle whitespace normalization, response parsing, and
* conversion of prompts to proper chat format using the model's built-in
* template.
*/
#include <algorithm> #include <algorithm>
#include <array> #include <array>
#include <boost/json.hpp> #include <boost/json.hpp>
@@ -12,6 +20,9 @@
namespace { namespace {
/**
* String trimming: removes leading and trailing whitespace
*/
std::string Trim(std::string value) { std::string Trim(std::string value) {
auto not_space = [](unsigned char ch) { return !std::isspace(ch); }; auto not_space = [](unsigned char ch) { return !std::isspace(ch); };
@@ -23,6 +34,10 @@ std::string Trim(std::string value) {
return value; return value;
} }
/**
* Normalize whitespace: collapses multiple spaces/tabs/newlines into single
* spaces
*/
std::string CondenseWhitespace(std::string text) { std::string CondenseWhitespace(std::string text) {
std::string out; std::string out;
out.reserve(text.size()); out.reserve(text.size());
@@ -44,6 +59,10 @@ std::string CondenseWhitespace(std::string text) {
return Trim(std::move(out)); return Trim(std::move(out));
} }
/**
* Truncate region context to fit within max length while preserving word
* boundaries
*/
std::string PrepareRegionContext(std::string_view region_context, std::string PrepareRegionContext(std::string_view region_context,
std::size_t max_chars) { std::size_t max_chars) {
std::string normalized = CondenseWhitespace(std::string(region_context)); std::string normalized = CondenseWhitespace(std::string(region_context));
@@ -61,6 +80,9 @@ std::string PrepareRegionContext(std::string_view region_context,
return normalized; return normalized;
} }
/**
* Remove common bullet points, numbers, and field labels added by LLM in output
*/
std::string StripCommonPrefix(std::string line) { std::string StripCommonPrefix(std::string line) {
line = Trim(std::move(line)); line = Trim(std::move(line));
@@ -102,6 +124,10 @@ std::string StripCommonPrefix(std::string line) {
return Trim(std::move(line)); return Trim(std::move(line));
} }
/**
* Parse two-line response from LLM: normalize line endings, strip formatting,
* filter spurious output, and combine remaining lines if needed
*/
std::pair<std::string, std::string> ParseTwoLineResponse( std::pair<std::string, std::string> ParseTwoLineResponse(
const std::string& raw, const std::string& error_message) { const std::string& raw, const std::string& error_message) {
std::string normalized = raw; std::string normalized = raw;
@@ -121,7 +147,17 @@ std::pair<std::string, std::string> ParseTwoLineResponse(
std::transform(low.begin(), low.end(), low.begin(), [](unsigned char c) { std::transform(low.begin(), low.end(), low.begin(), [](unsigned char c) {
return static_cast<char>(std::tolower(c)); return static_cast<char>(std::tolower(c));
}); });
if (!l.empty() && l.front() == '<' && low.back() == '>') continue; // Filter known thinking tags like <think>...</think>, but be conservative
// to avoid removing legitimate output. Only filter specific known
// patterns.
if (!l.empty() && l.front() == '<' && low.back() == '>') {
// Only filter if it's a known thinking tag: <think>, <reasoning>, etc.
if (low.find("think") != std::string::npos ||
low.find("reasoning") != std::string::npos ||
low.find("reflect") != std::string::npos) {
continue;
}
}
if (low.rfind("okay,", 0) == 0 || low.rfind("hmm", 0) == 0) continue; if (low.rfind("okay,", 0) == 0 || low.rfind("hmm", 0) == 0) continue;
filtered.push_back(std::move(l)); filtered.push_back(std::move(l));
} }
@@ -140,6 +176,9 @@ std::pair<std::string, std::string> ParseTwoLineResponse(
return {first, second}; return {first, second};
} }
/**
* Apply model's chat template to user-only prompt, formatting it for the model
*/
std::string ToChatPrompt(const llama_model* model, std::string ToChatPrompt(const llama_model* model,
const std::string& user_prompt) { const std::string& user_prompt) {
const char* tmpl = llama_model_chat_template(model, nullptr); const char* tmpl = llama_model_chat_template(model, nullptr);
@@ -173,6 +212,10 @@ std::string ToChatPrompt(const llama_model* model,
return std::string(buffer.data(), static_cast<std::size_t>(required)); return std::string(buffer.data(), static_cast<std::size_t>(required));
} }
/**
* Apply model's chat template to system+user prompt pair, formatting for the
* model
*/
std::string ToChatPrompt(const llama_model* model, std::string ToChatPrompt(const llama_model* model,
const std::string& system_prompt, const std::string& system_prompt,
const std::string& user_prompt) { const std::string& user_prompt) {

View File

@@ -1,3 +1,10 @@
/**
* Text Generation / Inference Module
* Core module that performs LLM inference: converts text prompts into tokens,
* runs the neural network forward pass, samples the next token, and converts
* output tokens back to text. Supports both simple and system+user prompts.
*/
#include <spdlog/spdlog.h> #include <spdlog/spdlog.h>
#include <algorithm> #include <algorithm>
@@ -22,21 +29,37 @@ std::string LlamaGenerator::Infer(const std::string& system_prompt,
std::string LlamaGenerator::InferFormatted(const std::string& formatted_prompt, std::string LlamaGenerator::InferFormatted(const std::string& formatted_prompt,
int max_tokens) { int max_tokens) {
/**
* Validate that model and context are loaded
*/
if (model_ == nullptr || context_ == nullptr) if (model_ == nullptr || context_ == nullptr)
throw std::runtime_error("LlamaGenerator: model not loaded"); throw std::runtime_error("LlamaGenerator: model not loaded");
/**
* Get vocabulary for tokenization and token-to-text conversion
*/
const llama_vocab* vocab = llama_model_get_vocab(model_); const llama_vocab* vocab = llama_model_get_vocab(model_);
if (vocab == nullptr) if (vocab == nullptr)
throw std::runtime_error("LlamaGenerator: vocab unavailable"); throw std::runtime_error("LlamaGenerator: vocab unavailable");
/**
* Clear KV cache to ensure clean inference state (no residual context)
*/
llama_memory_clear(llama_get_memory(context_), true); llama_memory_clear(llama_get_memory(context_), true);
/**
* TOKENIZATION PHASE
* Convert text prompt into token IDs (integers) that the model understands
*/
std::vector<llama_token> prompt_tokens(formatted_prompt.size() + 8); std::vector<llama_token> prompt_tokens(formatted_prompt.size() + 8);
int32_t token_count = llama_tokenize( int32_t token_count = llama_tokenize(
vocab, formatted_prompt.c_str(), vocab, formatted_prompt.c_str(),
static_cast<int32_t>(formatted_prompt.size()), prompt_tokens.data(), static_cast<int32_t>(formatted_prompt.size()), prompt_tokens.data(),
static_cast<int32_t>(prompt_tokens.size()), true, true); static_cast<int32_t>(prompt_tokens.size()), true, true);
/**
* If buffer too small, negative return indicates required size
*/
if (token_count < 0) { if (token_count < 0) {
prompt_tokens.resize(static_cast<std::size_t>(-token_count)); prompt_tokens.resize(static_cast<std::size_t>(-token_count));
token_count = llama_tokenize( token_count = llama_tokenize(
@@ -48,16 +71,31 @@ std::string LlamaGenerator::InferFormatted(const std::string& formatted_prompt,
if (token_count < 0) if (token_count < 0)
throw std::runtime_error("LlamaGenerator: prompt tokenization failed"); throw std::runtime_error("LlamaGenerator: prompt tokenization failed");
/**
* CONTEXT SIZE VALIDATION
* Validate and compute effective token budgets based on context window
* constraints
*/
const int32_t n_ctx = static_cast<int32_t>(llama_n_ctx(context_)); const int32_t n_ctx = static_cast<int32_t>(llama_n_ctx(context_));
const int32_t n_batch = static_cast<int32_t>(llama_n_batch(context_)); const int32_t n_batch = static_cast<int32_t>(llama_n_batch(context_));
if (n_ctx <= 1 || n_batch <= 0) if (n_ctx <= 1 || n_batch <= 0)
throw std::runtime_error("LlamaGenerator: invalid context or batch size"); throw std::runtime_error("LlamaGenerator: invalid context or batch size");
/**
* Clamp generation limit to available context window, reserve space for
* output
*/
const int32_t effective_max_tokens = const int32_t effective_max_tokens =
std::max(1, std::min(max_tokens, n_ctx - 1)); std::max(1, std::min(max_tokens, n_ctx - 1));
/**
* Prompt can use remaining context after reserving space for generation
*/
int32_t prompt_budget = std::min(n_batch, n_ctx - effective_max_tokens); int32_t prompt_budget = std::min(n_batch, n_ctx - effective_max_tokens);
prompt_budget = std::max<int32_t>(1, prompt_budget); prompt_budget = std::max<int32_t>(1, prompt_budget);
/**
* Truncate prompt if necessary to fit within constraints
*/
prompt_tokens.resize(static_cast<std::size_t>(token_count)); prompt_tokens.resize(static_cast<std::size_t>(token_count));
if (token_count > prompt_budget) { if (token_count > prompt_budget) {
spdlog::warn( spdlog::warn(
@@ -68,11 +106,21 @@ std::string LlamaGenerator::InferFormatted(const std::string& formatted_prompt,
token_count = prompt_budget; token_count = prompt_budget;
} }
/**
* PROMPT PROCESSING PHASE
* Create a batch containing all prompt tokens and feed through the model
* This computes internal representations and fills the KV cache
*/
const llama_batch prompt_batch = llama_batch_get_one( const llama_batch prompt_batch = llama_batch_get_one(
prompt_tokens.data(), static_cast<int32_t>(prompt_tokens.size())); prompt_tokens.data(), static_cast<int32_t>(prompt_tokens.size()));
if (llama_decode(context_, prompt_batch) != 0) if (llama_decode(context_, prompt_batch) != 0)
throw std::runtime_error("LlamaGenerator: prompt decode failed"); throw std::runtime_error("LlamaGenerator: prompt decode failed");
/**
* SAMPLER CONFIGURATION PHASE
* Set up the probabilistic token selection pipeline (sampler chain)
* Samplers are applied in sequence: temperature -> top-p -> distribution
*/
llama_sampler_chain_params sampler_params = llama_sampler_chain_params sampler_params =
llama_sampler_chain_default_params(); llama_sampler_chain_default_params();
using SamplerPtr = using SamplerPtr =
@@ -82,21 +130,48 @@ std::string LlamaGenerator::InferFormatted(const std::string& formatted_prompt,
if (!sampler) if (!sampler)
throw std::runtime_error("LlamaGenerator: failed to initialize sampler"); throw std::runtime_error("LlamaGenerator: failed to initialize sampler");
/**
* Temperature: scales logits before softmax (controls randomness)
*/
llama_sampler_chain_add(sampler.get(), llama_sampler_chain_add(sampler.get(),
llama_sampler_init_temp(sampling_temperature_)); llama_sampler_init_temp(sampling_temperature_));
/**
* Top-P: nucleus sampling - filters to most likely tokens summing to top_p
* probability
*/
llama_sampler_chain_add(sampler.get(), llama_sampler_chain_add(sampler.get(),
llama_sampler_init_top_p(sampling_top_p_, 1)); llama_sampler_init_top_p(sampling_top_p_, 1));
/**
* Distribution sampler: selects actual token using configured seed for
* reproducibility
*/
llama_sampler_chain_add(sampler.get(), llama_sampler_chain_add(sampler.get(),
llama_sampler_init_dist(sampling_seed_)); llama_sampler_init_dist(sampling_seed_));
/**
* TOKEN GENERATION LOOP
* Iteratively generate tokens one at a time until max_tokens or
* end-of-sequence
*/
std::vector<llama_token> generated_tokens; std::vector<llama_token> generated_tokens;
generated_tokens.reserve(static_cast<std::size_t>(effective_max_tokens)); generated_tokens.reserve(static_cast<std::size_t>(effective_max_tokens));
for (int i = 0; i < effective_max_tokens; ++i) { for (int i = 0; i < effective_max_tokens; ++i) {
/**
* Sample next token using configured sampler chain and model logits
* Index -1 means use the last output position from previous batch
*/
const llama_token next = const llama_token next =
llama_sampler_sample(sampler.get(), context_, -1); llama_sampler_sample(sampler.get(), context_, -1);
/**
* Stop if model predicts end-of-generation token (EOS/EOT)
*/
if (llama_vocab_is_eog(vocab, next)) break; if (llama_vocab_is_eog(vocab, next)) break;
generated_tokens.push_back(next); generated_tokens.push_back(next);
/**
* Feed the sampled token back into model for next iteration
* (autoregressive)
*/
llama_token token = next; llama_token token = next;
const llama_batch one_token_batch = llama_batch_get_one(&token, 1); const llama_batch one_token_batch = llama_batch_get_one(&token, 1);
if (llama_decode(context_, one_token_batch) != 0) if (llama_decode(context_, one_token_batch) != 0)
@@ -104,8 +179,18 @@ std::string LlamaGenerator::InferFormatted(const std::string& formatted_prompt,
"LlamaGenerator: decode failed during generation"); "LlamaGenerator: decode failed during generation");
} }
/**
* DETOKENIZATION PHASE
* Convert generated token IDs back to text using vocabulary
*/
std::string output; std::string output;
for (const llama_token token : generated_tokens) for (const llama_token token : generated_tokens)
AppendTokenPiecePublic(vocab, token, output); AppendTokenPiecePublic(vocab, token, output);
/**
* Advance seed for next generation to improve output diversity
*/
sampling_seed_ = (sampling_seed_ == 0xFFFFFFFFu) ? 0 : sampling_seed_ + 1;
return output; return output;
} }

View File

@@ -1,3 +1,10 @@
/**
* Model Loading Module
* This module handles loading a pre-trained LLM model from disk and
* initializing the llama.cpp context for inference. It performs one-time setup
* required before any inference operations can be performed.
*/
#include <spdlog/spdlog.h> #include <spdlog/spdlog.h>
#include <stdexcept> #include <stdexcept>
@@ -7,6 +14,9 @@
#include "llama.h" #include "llama.h"
void LlamaGenerator::Load(const std::string& model_path) { void LlamaGenerator::Load(const std::string& model_path) {
/**
* Validate input and clean up any previously loaded model/context
*/
if (model_path.empty()) if (model_path.empty())
throw std::runtime_error("LlamaGenerator: model path must not be empty"); throw std::runtime_error("LlamaGenerator: model path must not be empty");
@@ -19,6 +29,9 @@ void LlamaGenerator::Load(const std::string& model_path) {
model_ = nullptr; model_ = nullptr;
} }
/**
* Initialize the llama backend (one-time setup for GPU/CPU acceleration)
*/
llama_backend_init(); llama_backend_init();
llama_model_params model_params = llama_model_default_params(); llama_model_params model_params = llama_model_default_params();
@@ -29,7 +42,8 @@ void LlamaGenerator::Load(const std::string& model_path) {
} }
llama_context_params context_params = llama_context_default_params(); llama_context_params context_params = llama_context_default_params();
context_params.n_ctx = 2048; context_params.n_ctx = n_ctx_;
context_params.n_batch = n_ctx_; // Set batch size equal to context window
context_ = llama_init_from_model(model_, context_params); context_ = llama_init_from_model(model_, context_params);
if (context_ == nullptr) { if (context_ == nullptr) {

View File

@@ -0,0 +1,74 @@
#include <fstream>
#include <filesystem>
#include <spdlog/spdlog.h>
#include "data_generation/llama_generator.h"
namespace fs = std::filesystem;
std::string LlamaGenerator::LoadBrewerySystemPrompt(
    const std::string& prompt_file_path) {
  /**
   * Load the brewery system prompt from disk, searching the given path and up
   * to two parent directories (handles running from a build subdirectory).
   * The result — including the inline fallback — is cached in
   * brewery_system_prompt_ so the filesystem is probed at most once per
   * generator instance, avoiding repeated disk reads and warning spam on
   * every brewery generation call.
   */
  if (!brewery_system_prompt_.empty()) {
    return brewery_system_prompt_;
  }

  // Candidate locations, tried in order.
  const std::vector<std::string> paths_to_try = {
      prompt_file_path,         // As provided
      "../" + prompt_file_path,  // One level up
      "../../" + prompt_file_path,  // Two levels up
  };

  for (const auto& path : paths_to_try) {
    std::ifstream prompt_file(path);
    if (!prompt_file.is_open()) {
      continue;  // Not found at this location; try the next candidate.
    }
    // Slurp the whole file; ifstream closes automatically via RAII.
    std::string prompt((std::istreambuf_iterator<char>(prompt_file)),
                       std::istreambuf_iterator<char>());
    if (!prompt.empty()) {
      spdlog::info(
          "LlamaGenerator: Loaded brewery system prompt from '{}' ({} chars)",
          path, prompt.length());
      brewery_system_prompt_ = std::move(prompt);
      return brewery_system_prompt_;
    }
  }

  spdlog::warn(
      "LlamaGenerator: Could not open brewery system prompt file at any of the "
      "expected locations. Using fallback inline prompt.");
  // Cache the fallback too, so later calls neither re-probe the filesystem
  // nor repeat the warning.
  brewery_system_prompt_ = GetFallbackBreweryPrompt();
  return brewery_system_prompt_;
}
// Fallback: minimal inline prompt if file fails to load
std::string LlamaGenerator::GetFallbackBreweryPrompt() {
  /**
   * Hard-coded brewery system prompt, used only when no prompt file could be
   * opened on disk. Covers the same ground as the expanded on-disk prompt in
   * condensed form: banned boilerplate phrases, opening strategies,
   * specificity requirements, length/tone guidance, and output format.
   */
  static const std::string kFallbackPrompt =
      "You are an experienced brewmaster and owner of a local craft brewery. "
      "Create a distinctive, authentic name and detailed description that "
      "genuinely reflects your specific location, brewing philosophy, local "
      "culture, and community connection. The brewery must feel real and "
      "grounded—not generic or interchangeable.\n\n"
      "AVOID REPETITIVE PHRASES - Never use:\n"
      "Love letter to, tribute to, rolling hills, picturesque, every sip "
      "tells a story, Come for X stay for Y, rich history, passion, woven "
      "into, ancient roots, timeless, where tradition meets innovation\n\n"
      "OPENING APPROACHES - Choose ONE:\n"
      "1. Start with specific beer style and its regional origins\n"
      "2. Begin with specific brewing challenge (water, altitude, climate)\n"
      "3. Open with founding story or personal motivation\n"
      "4. Lead with specific local ingredient or resource\n"
      "5. Start with unexpected angle or contradiction\n"
      "6. Open with local event, tradition, or cultural moment\n"
      "7. Begin with tangible architectural or geographic detail\n\n"
      "BE SPECIFIC - Include:\n"
      "- At least ONE concrete proper noun (landmark, river, neighborhood)\n"
      "- Specific beer styles relevant to the REGION'S culture\n"
      "- Concrete brewing challenges or advantages\n"
      "- Sensory details SPECIFIC to place—not generic adjectives\n\n"
      "LENGTH: 150-250 words. TONE: Can be soulful, irreverent, "
      "matter-of-fact, unpretentious, or minimalist.\n\n"
      "Output ONLY a raw JSON object with keys name and description. "
      "No markdown, backticks, preamble, or trailing text.";
  return kFallbackPrompt;
}

View File

@@ -1,3 +1,10 @@
/**
* Sampling Configuration Module
* Configures the hyperparameters that control probabilistic token selection
* during text generation. These settings affect the randomness, diversity, and
* quality of generated output.
*/
#include <stdexcept> #include <stdexcept>
#include "data_generation/llama_generator.h" #include "data_generation/llama_generator.h"
@@ -5,21 +12,54 @@
void LlamaGenerator::SetSamplingOptions(float temperature, float top_p, void LlamaGenerator::SetSamplingOptions(float temperature, float top_p,
int seed) { int seed) {
/**
* Validate temperature: controls randomness in output distribution
* 0.0 = deterministic (always pick highest probability token)
* Higher values = more random/diverse output
*/
if (temperature < 0.0f) { if (temperature < 0.0f) {
throw std::runtime_error( throw std::runtime_error(
"LlamaGenerator: sampling temperature must be >= 0"); "LlamaGenerator: sampling temperature must be >= 0");
} }
/**
* Validate top-p (nucleus sampling): only sample from top cumulative
* probability e.g., top-p=0.9 means sample from tokens that make up 90% of
* probability mass
*/
if (!(top_p > 0.0f && top_p <= 1.0f)) { if (!(top_p > 0.0f && top_p <= 1.0f)) {
throw std::runtime_error( throw std::runtime_error(
"LlamaGenerator: sampling top-p must be in (0, 1]"); "LlamaGenerator: sampling top-p must be in (0, 1]");
} }
/**
* Validate seed: for reproducible results (-1 uses random seed)
*/
if (seed < -1) { if (seed < -1) {
throw std::runtime_error( throw std::runtime_error(
"LlamaGenerator: seed must be >= 0, or -1 for random"); "LlamaGenerator: seed must be >= 0, or -1 for random");
} }
/**
* Store sampling parameters for use during token generation
*/
sampling_temperature_ = temperature; sampling_temperature_ = temperature;
sampling_top_p_ = top_p; sampling_top_p_ = top_p;
sampling_seed_ = (seed < 0) ? static_cast<uint32_t>(LLAMA_DEFAULT_SEED) sampling_seed_ = (seed < 0) ? static_cast<uint32_t>(LLAMA_DEFAULT_SEED)
: static_cast<uint32_t>(seed); : static_cast<uint32_t>(seed);
} }
void LlamaGenerator::SetContextSize(uint32_t n_ctx) {
  /**
   * Configure the context window size applied when the model is loaded
   * (Load() copies n_ctx_ into the llama context parameters, so call this
   * before Load() — it does not resize an already-created context).
   * @throws std::runtime_error if n_ctx is outside [1, 32768].
   */
  constexpr uint32_t kMaxContextSize = 32768;
  const bool in_range = (n_ctx >= 1) && (n_ctx <= kMaxContextSize);
  if (!in_range) {
    throw std::runtime_error(
        "LlamaGenerator: context size must be in range [1, 32768]");
  }
  n_ctx_ = n_ctx;
}

View File

@@ -80,6 +80,16 @@ void SqliteDatabase::CommitTransaction() {
} }
} }
void SqliteDatabase::RollbackTransaction() {
  /**
   * Abort the current transaction, discarding all uncommitted changes.
   *
   * Serialized on db_mutex_ like the other database operations.
   *
   * @throws std::runtime_error if SQLite reports a rollback failure.
   */
  std::lock_guard<std::mutex> lock(db_mutex_);
  char* errMsg = nullptr;
  const int rc = sqlite3_exec(db_, "ROLLBACK", nullptr, nullptr, &errMsg);
  if (rc != SQLITE_OK) {
    // Capture the SQLite-owned message before freeing it.
    const std::string detail = errMsg ? errMsg : "unknown";
    sqlite3_free(errMsg);
    throw std::runtime_error("RollbackTransaction failed: " + detail);
  }
}
void SqliteDatabase::InsertCountry(int id, const std::string& name, void SqliteDatabase::InsertCountry(int id, const std::string& name,
const std::string& iso2, const std::string& iso2,
const std::string& iso3) { const std::string& iso3) {
@@ -96,9 +106,9 @@ void SqliteDatabase::InsertCountry(int id, const std::string& name,
throw std::runtime_error("Failed to prepare country insert"); throw std::runtime_error("Failed to prepare country insert");
sqlite3_bind_int(stmt, 1, id); sqlite3_bind_int(stmt, 1, id);
sqlite3_bind_text(stmt, 2, name.c_str(), -1, SQLITE_STATIC); sqlite3_bind_text(stmt, 2, name.c_str(), -1, SQLITE_TRANSIENT);
sqlite3_bind_text(stmt, 3, iso2.c_str(), -1, SQLITE_STATIC); sqlite3_bind_text(stmt, 3, iso2.c_str(), -1, SQLITE_TRANSIENT);
sqlite3_bind_text(stmt, 4, iso3.c_str(), -1, SQLITE_STATIC); sqlite3_bind_text(stmt, 4, iso3.c_str(), -1, SQLITE_TRANSIENT);
if (sqlite3_step(stmt) != SQLITE_DONE) { if (sqlite3_step(stmt) != SQLITE_DONE) {
throw std::runtime_error("Failed to insert country"); throw std::runtime_error("Failed to insert country");
@@ -123,8 +133,8 @@ void SqliteDatabase::InsertState(int id, int country_id,
sqlite3_bind_int(stmt, 1, id); sqlite3_bind_int(stmt, 1, id);
sqlite3_bind_int(stmt, 2, country_id); sqlite3_bind_int(stmt, 2, country_id);
sqlite3_bind_text(stmt, 3, name.c_str(), -1, SQLITE_STATIC); sqlite3_bind_text(stmt, 3, name.c_str(), -1, SQLITE_TRANSIENT);
sqlite3_bind_text(stmt, 4, iso2.c_str(), -1, SQLITE_STATIC); sqlite3_bind_text(stmt, 4, iso2.c_str(), -1, SQLITE_TRANSIENT);
if (sqlite3_step(stmt) != SQLITE_DONE) { if (sqlite3_step(stmt) != SQLITE_DONE) {
throw std::runtime_error("Failed to insert state"); throw std::runtime_error("Failed to insert state");
@@ -150,7 +160,7 @@ void SqliteDatabase::InsertCity(int id, int state_id, int country_id,
sqlite3_bind_int(stmt, 1, id); sqlite3_bind_int(stmt, 1, id);
sqlite3_bind_int(stmt, 2, state_id); sqlite3_bind_int(stmt, 2, state_id);
sqlite3_bind_int(stmt, 3, country_id); sqlite3_bind_int(stmt, 3, country_id);
sqlite3_bind_text(stmt, 4, name.c_str(), -1, SQLITE_STATIC); sqlite3_bind_text(stmt, 4, name.c_str(), -1, SQLITE_TRANSIENT);
sqlite3_bind_double(stmt, 5, latitude); sqlite3_bind_double(stmt, 5, latitude);
sqlite3_bind_double(stmt, 6, longitude); sqlite3_bind_double(stmt, 6, longitude);
@@ -165,7 +175,8 @@ std::vector<City> SqliteDatabase::QueryCities() {
std::vector<City> cities; std::vector<City> cities;
sqlite3_stmt* stmt = nullptr; sqlite3_stmt* stmt = nullptr;
const char* query = "SELECT id, name, country_id FROM cities ORDER BY name"; const char* query =
"SELECT id, name, country_id FROM cities ORDER BY RANDOM()";
int rc = sqlite3_prepare_v2(db_, query, -1, &stmt, nullptr); int rc = sqlite3_prepare_v2(db_, query, -1, &stmt, nullptr);
if (rc != SQLITE_OK) { if (rc != SQLITE_OK) {

View File

@@ -11,7 +11,7 @@ void JsonLoader::LoadWorldCities(const std::string& json_path,
constexpr size_t kBatchSize = 10000; constexpr size_t kBatchSize = 10000;
auto startTime = std::chrono::high_resolution_clock::now(); auto startTime = std::chrono::high_resolution_clock::now();
spdlog::info("\nLoading {} (streaming RapidJSON SAX)...", json_path); spdlog::info("\nLoading {} (streaming Boost.JSON SAX)...", json_path);
db.BeginTransaction(); db.BeginTransaction();
bool transactionOpen = true; bool transactionOpen = true;
@@ -44,7 +44,8 @@ void JsonLoader::LoadWorldCities(const std::string& json_path,
} }
} catch (...) { } catch (...) {
if (transactionOpen) { if (transactionOpen) {
db.CommitTransaction(); db.RollbackTransaction();
transactionOpen = false;
} }
throw; throw;
} }

View File

@@ -1,12 +1,12 @@
#include <spdlog/spdlog.h>
#include <boost/program_options.hpp>
#include <iostream> #include <iostream>
#include <memory> #include <memory>
#include <boost/program_options.hpp>
#include <spdlog/spdlog.h>
#include "biergarten_data_generator.h" #include "biergarten_data_generator.h"
#include "web_client/curl_web_client.h"
#include "database/database.h" #include "database/database.h"
#include "web_client/curl_web_client.h"
namespace po = boost::program_options; namespace po = boost::program_options;
@@ -18,21 +18,32 @@ namespace po = boost::program_options;
* @param options Output ApplicationOptions struct. * @param options Output ApplicationOptions struct.
* @return true if parsing succeeded and should proceed, false otherwise. * @return true if parsing succeeded and should proceed, false otherwise.
*/ */
bool ParseArguments(int argc, char **argv, ApplicationOptions &options) { bool ParseArguments(int argc, char** argv, ApplicationOptions& options) {
// If no arguments provided, display usage and exit // If no arguments provided, display usage and exit
if (argc == 1) { if (argc == 1) {
std::cout << "Biergarten Pipeline - Geographic Data Pipeline with Brewery Generation\n\n"; std::cout << "Biergarten Pipeline - Geographic Data Pipeline with "
"Brewery Generation\n\n";
std::cout << "Usage: biergarten-pipeline [options]\n\n"; std::cout << "Usage: biergarten-pipeline [options]\n\n";
std::cout << "Options:\n"; std::cout << "Options:\n";
std::cout << " --mocked Use mocked generator for brewery/user data\n"; std::cout << " --mocked Use mocked generator for "
std::cout << " --model, -m PATH Path to LLM model file (gguf) for generation\n"; "brewery/user data\n";
std::cout << " --cache-dir, -c DIR Directory for cached JSON (default: /tmp)\n"; std::cout << " --model, -m PATH Path to LLM model file (gguf) for "
std::cout << " --temperature TEMP LLM sampling temperature 0.0-1.0 (default: 0.8)\n"; "generation\n";
std::cout << " --top-p VALUE Nucleus sampling parameter 0.0-1.0 (default: 0.92)\n"; std::cout << " --cache-dir, -c DIR Directory for cached JSON (default: "
std::cout << " --seed SEED Random seed: -1 for random (default: -1)\n"; "/tmp)\n";
std::cout << " --temperature TEMP LLM sampling temperature 0.0-1.0 "
"(default: 0.8)\n";
std::cout << " --top-p VALUE Nucleus sampling parameter 0.0-1.0 "
"(default: 0.92)\n";
std::cout << " --n-ctx SIZE Context window size in tokens "
"(default: 8192)\n";
std::cout << " --seed SEED Random seed: -1 for random "
"(default: -1)\n";
std::cout << " --help, -h Show this help message\n\n"; std::cout << " --help, -h Show this help message\n\n";
std::cout << "Note: --mocked and --model are mutually exclusive. Exactly one must be provided.\n"; std::cout << "Note: --mocked and --model are mutually exclusive. Exactly "
std::cout << "Data source is always pinned to commit c5eb7772 (stable 2026-03-28).\n"; "one must be provided.\n";
std::cout << "Data source is always pinned to commit c5eb7772 (stable "
"2026-03-28).\n";
return false; return false;
} }
@@ -48,6 +59,8 @@ bool ParseArguments(int argc, char **argv, ApplicationOptions &options) {
"Sampling temperature (higher = more random)")( "Sampling temperature (higher = more random)")(
"top-p", po::value<float>()->default_value(0.92f), "top-p", po::value<float>()->default_value(0.92f),
"Nucleus sampling top-p in (0,1] (higher = more random)")( "Nucleus sampling top-p in (0,1] (higher = more random)")(
"n-ctx", po::value<uint32_t>()->default_value(8192),
"Context window size in tokens (1-32768)")(
"seed", po::value<int>()->default_value(-1), "seed", po::value<int>()->default_value(-1),
"Sampler seed: -1 for random, otherwise non-negative integer"); "Sampler seed: -1 for random, otherwise non-negative integer");
@@ -81,7 +94,9 @@ bool ParseArguments(int argc, char **argv, ApplicationOptions &options) {
bool hasSeed = vm["seed"].defaulted() == false; bool hasSeed = vm["seed"].defaulted() == false;
if (hasTemperature || hasTopP || hasSeed) { if (hasTemperature || hasTopP || hasSeed) {
spdlog::warn("WARNING: Sampling parameters (--temperature, --top-p, --seed) are ignored when using --mocked"); spdlog::warn(
"WARNING: Sampling parameters (--temperature, --top-p, --seed) "
"are ignored when using --mocked");
} }
} }
@@ -90,13 +105,14 @@ bool ParseArguments(int argc, char **argv, ApplicationOptions &options) {
options.cache_dir = vm["cache-dir"].as<std::string>(); options.cache_dir = vm["cache-dir"].as<std::string>();
options.temperature = vm["temperature"].as<float>(); options.temperature = vm["temperature"].as<float>();
options.top_p = vm["top-p"].as<float>(); options.top_p = vm["top-p"].as<float>();
options.n_ctx = vm["n-ctx"].as<uint32_t>();
options.seed = vm["seed"].as<int>(); options.seed = vm["seed"].as<int>();
// commit is always pinned to c5eb7772 // commit is always pinned to c5eb7772
return true; return true;
} }
int main(int argc, char *argv[]) { int main(int argc, char* argv[]) {
try { try {
const CurlGlobalState curl_state; const CurlGlobalState curl_state;
@@ -111,7 +127,7 @@ int main(int argc, char *argv[]) {
BiergartenDataGenerator generator(options, webClient, database); BiergartenDataGenerator generator(options, webClient, database);
return generator.Run(); return generator.Run();
} catch (const std::exception &e) { } catch (const std::exception& e) {
spdlog::error("ERROR: Application failed: {}", e.what()); spdlog::error("ERROR: Application failed: {}", e.what());
return 1; return 1;
} }

View File

@@ -11,7 +11,7 @@ std::string WikipediaService::FetchExtract(std::string_view query) {
const std::string encoded = client_->UrlEncode(std::string(query)); const std::string encoded = client_->UrlEncode(std::string(query));
const std::string url = const std::string url =
"https://en.wikipedia.org/w/api.php?action=query&titles=" + encoded + "https://en.wikipedia.org/w/api.php?action=query&titles=" + encoded +
"&prop=extracts&explaintext=true&format=json"; "&prop=extracts&explaintext=1&format=json";
const std::string body = client_->Get(url); const std::string body = client_->Get(url);
@@ -19,6 +19,7 @@ std::string WikipediaService::FetchExtract(std::string_view query) {
boost::json::value doc = boost::json::parse(body, ec); boost::json::value doc = boost::json::parse(body, ec);
if (!ec && doc.is_object()) { if (!ec && doc.is_object()) {
try {
auto& pages = doc.at("query").at("pages").get_object(); auto& pages = doc.at("query").at("pages").get_object();
if (!pages.empty()) { if (!pages.empty()) {
auto& page = pages.begin()->value().get_object(); auto& page = pages.begin()->value().get_object();
@@ -29,6 +30,16 @@ std::string WikipediaService::FetchExtract(std::string_view query) {
return extract; return extract;
} }
} }
} catch (const std::exception& e) {
spdlog::warn(
"WikipediaService: failed to parse response structure for '{}': "
"{}",
query, e.what());
return {};
}
} else if (ec) {
spdlog::warn("WikipediaService: JSON parse error for '{}': {}", query,
ec.message());
} }
return {}; return {};
@@ -55,7 +66,7 @@ std::string WikipediaService::GetSummary(std::string_view city,
regionQuery += country; regionQuery += country;
} }
const std::string beerQuery = "beer in " + std::string(city); const std::string beerQuery = "beer in " + std::string(country);
try { try {
const std::string regionExtract = FetchExtract(regionQuery); const std::string regionExtract = FetchExtract(regionQuery);