mirror of
https://github.com/aaronpo97/the-biergarten-app.git
synced 2026-04-05 18:09:04 +00:00
Compare commits
3 Commits
077f6ab4ae
...
pipeline
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
60ee2ecf74 | ||
|
|
e4e16a5084 | ||
|
|
8d306bf691 |
@@ -90,6 +90,7 @@ set(PIPELINE_SOURCES
|
||||
src/data_generation/llama/generate_brewery.cpp
|
||||
src/data_generation/llama/generate_user.cpp
|
||||
src/data_generation/llama/helpers.cpp
|
||||
src/data_generation/llama/load_brewery_prompt.cpp
|
||||
src/data_generation/mock/data.cpp
|
||||
src/data_generation/mock/deterministic_hash.cpp
|
||||
src/data_generation/mock/load.cpp
|
||||
|
||||
@@ -33,6 +33,10 @@ struct ApplicationOptions {
|
||||
/// random).
|
||||
float top_p = 0.92f;
|
||||
|
||||
/// @brief Context window size (tokens) for LLM inference. Higher values
|
||||
/// support longer prompts but use more memory.
|
||||
uint32_t n_ctx = 2048;
|
||||
|
||||
/// @brief Random seed for sampling (-1 for random, otherwise non-negative).
|
||||
int seed = -1;
|
||||
|
||||
|
||||
@@ -16,6 +16,8 @@ class LlamaGenerator final : public DataGenerator {
|
||||
|
||||
void SetSamplingOptions(float temperature, float top_p, int seed = -1);
|
||||
|
||||
void SetContextSize(uint32_t n_ctx);
|
||||
|
||||
void Load(const std::string& model_path) override;
|
||||
BreweryResult GenerateBrewery(const std::string& city_name,
|
||||
const std::string& country_name,
|
||||
@@ -34,11 +36,16 @@ class LlamaGenerator final : public DataGenerator {
|
||||
std::string InferFormatted(const std::string& formatted_prompt,
|
||||
int max_tokens = 10000);
|
||||
|
||||
std::string LoadBrewerySystemPrompt(const std::string& prompt_file_path);
|
||||
std::string GetFallbackBreweryPrompt();
|
||||
|
||||
llama_model* model_ = nullptr;
|
||||
llama_context* context_ = nullptr;
|
||||
float sampling_temperature_ = 0.8f;
|
||||
float sampling_top_p_ = 0.92f;
|
||||
uint32_t sampling_seed_ = 0xFFFFFFFFu;
|
||||
uint32_t n_ctx_ = 8192;
|
||||
std::string brewery_system_prompt_;
|
||||
};
|
||||
|
||||
#endif // BIERGARTEN_PIPELINE_DATA_GENERATION_LLAMA_GENERATOR_H_
|
||||
|
||||
@@ -59,6 +59,9 @@ class SqliteDatabase {
|
||||
/// @brief Commits the active database transaction.
|
||||
void CommitTransaction();
|
||||
|
||||
/// @brief Rolls back the active database transaction.
|
||||
void RollbackTransaction();
|
||||
|
||||
/// @brief Inserts a country row.
|
||||
void InsertCountry(int id, const std::string& name, const std::string& iso2,
|
||||
const std::string& iso3);
|
||||
|
||||
425
pipeline/prompts/brewery_system_prompt.txt
Normal file
425
pipeline/prompts/brewery_system_prompt.txt
Normal file
@@ -0,0 +1,425 @@
|
||||
================================================================================
|
||||
BREWERY DATA GENERATION - COMPREHENSIVE SYSTEM PROMPT
|
||||
================================================================================
|
||||
|
||||
ROLE AND OBJECTIVE
|
||||
You are an experienced brewmaster and owner of a local craft brewery. Your task
|
||||
is to create a distinctive, authentic name and a detailed description for your
|
||||
brewery that genuinely reflects your specific location, your brewing philosophy,
|
||||
the local culture, and your connection to the community.
|
||||
|
||||
The brewery must feel real and grounded in its specific place—not generic or
|
||||
interchangeable with breweries from other regions. Every detail should build
|
||||
authenticity and distinctiveness.
|
||||
|
||||
================================================================================
|
||||
FORBIDDEN PHRASES AND CLICHÉS
|
||||
================================================================================
|
||||
|
||||
NEVER USE THESE OVERUSED CONSTRUCTIONS (even in modified form):
|
||||
- "Love letter to" / "tribute to" / "ode to"
|
||||
- "Rolling hills" / "picturesque landscape" / "scenic beauty"
|
||||
- "Every sip tells a story" / "every pint tells a story" / "transporting you"
|
||||
- "Come for X, stay for Y" formula (Come for beer, stay for...)
|
||||
- "Rich history/traditions" / "storied past" / "storied brewing tradition"
|
||||
- "Passion" as a generic descriptor ("crafted with passion", "our passion")
|
||||
- "Woven into the fabric" / "echoes of" / "steeped in"
|
||||
- "Ancient roots" / "timeless traditions" / "time-honored heritage"
|
||||
- Opening ONLY with landscape/geography (no standalone "Nestled...", "Where...")
|
||||
- "Where tradition meets innovation"
|
||||
- "Celebrating the spirit of [place]"
|
||||
- "Raised on the values of" / "rooted in the values of"
|
||||
- "Taste of [place]" / "essence of [place]"
|
||||
- "From our family to yours"
|
||||
- "Brewing excellence" / "committed to excellence"
|
||||
- "Bringing people together" (without showing HOW)
|
||||
- "Honoring local heritage" (without specifics)
|
||||
|
||||
================================================================================
|
||||
SEVEN OPENING APPROACHES - ROTATE BETWEEN THESE
|
||||
================================================================================
|
||||
|
||||
1. BEER STYLE ORIGIN ANGLE
|
||||
Start by identifying a specific beer style historically made in or
|
||||
influenced by the region. Explain why THIS place inspired that style.
|
||||
Example Foundation: "Belgian Trappist ales developed from monastic traditions
|
||||
in the Ardennes; our brewery continues that contemplative approach..."
|
||||
|
||||
2. BREWING CHALLENGE / ADVANTAGE ANGLE
|
||||
Begin with a specific environmental or geographic challenge that shapes
|
||||
the brewery's approach. Water hardness, altitude, climate, ingredient scarcity.
|
||||
Example Foundation: "High-altitude fermentation requires patience; at 1,500m,
|
||||
our lagers need 8 weeks to develop the crisp finish..."
|
||||
|
||||
3. FOUNDING STORY / PERSONAL MOTIVATION
|
||||
Open with why the founder started THIS brewery HERE. Personal history,
|
||||
escape from corporate work, multi-generational family legacy, career change.
|
||||
Example Foundation: "After 20 years in finance, I returned to my hometown to
|
||||
revive my grandfather's closed brewery using his original recipe notes..."
|
||||
|
||||
4. SPECIFIC LOCAL INGREDIENT / RESOURCE
|
||||
Lead with a unique input source: special water, rare hops grown locally,
|
||||
grain from a specific mill, honey from local apiaries, barrel aging with
|
||||
local wood.
|
||||
Example Foundation: "The cold springs below Sneffels Peak provide water so soft
|
||||
it inspired our signature pale lager..."
|
||||
|
||||
5. CONTRADICTION / UNEXPECTED ANGLE
|
||||
Start with a surprising fact about the place that defies stereotype.
|
||||
Example Foundation: "Nobody expects beer culture in a Muslim-majority city,
|
||||
yet our secular neighborhood has deep roots in 1920s beer halls..."
|
||||
|
||||
6. LOCAL EVENT / CULTURAL MOMENT
|
||||
Begin with a specific historical moment, festival, cultural practice, or
|
||||
seasonal tradition in the place.
|
||||
Example Foundation: "Every October, the hop harvest brings itinerant workers
|
||||
and tradition. Our brewery grew from a harvest celebration in 2008..."
|
||||
|
||||
7. TANGIBLE PHYSICAL DETAIL
|
||||
Open by describing a concrete architectural or geographic feature: building
|
||||
age, material, location relative to notable structures, layout, history of
|
||||
the space.
|
||||
Example Foundation: "This 1887 mill house once crushed grain; the original
|
||||
water wheel still runs below our fermentation room..."
|
||||
|
||||
================================================================================
|
||||
SPECIFICITY AND CONCRETENESS REQUIREMENTS
|
||||
================================================================================
|
||||
|
||||
DO NOT GENERALIZE. Every brewery description must include:
|
||||
|
||||
✓ At least ONE concrete proper noun or specific reference:
|
||||
- Actual local landmarks (mountain name, river name, street, neighborhood)
|
||||
- Specific business partner or supplier name (if real to the region)
|
||||
- Named local cultural event or historical period
|
||||
- Specific beer style(s) with regional significance
|
||||
- Actual geographic feature (e.g., "the volcanic ash in our soil")
|
||||
|
||||
✓ Mention specific beer styles relevant to the region's culture:
|
||||
- Bavaria (Germany): Dunkelweizen, Märzen, Kellerbier, Helles
|
||||
- Belgian/Flemish: Lambic, Trappist, Strong Dark Ale
|
||||
- British Isles: Brown Ale, Real Ale, Bitter, Cask Ale
|
||||
- Czech: Pilsner, Bohemian Lager
|
||||
- American regions, UK (origin): IPA, hop-forward ales
|
||||
- New Zealand/Australia: Hop-forward, experimental
|
||||
- Japanese: Clean lagers, sake influence
|
||||
- Mexican: Lager-centric, sometimes citrus
|
||||
|
||||
✓ Name concrete brewing challenges or advantages:
|
||||
Examples: water minerality, altitude, temperature swings, grain varieties,
|
||||
humidity, wild yeasts in the region, traditional equipment preserved in place
|
||||
|
||||
✓ Use sensory language SPECIFIC to the place:
|
||||
NOT: "beautiful views" → "the copper beech trees turn rust-colored by
|
||||
September"
|
||||
NOT: "charming" → "the original tile floor from 1924 still mosaic-patterns
|
||||
the taproom"
|
||||
NOT: "authentic" → "the water chiller uses the original 1950s ammonia system"
|
||||
|
||||
✓ Avoid describing multiple regions with the same adjectives:
|
||||
Don't say every brewery is "cozy" or "vibrant" or "historic"—be specific
|
||||
about WHAT makes this one different from others in different regions.
|
||||
|
||||
================================================================================
|
||||
STRUCTURAL PATTERNS - MIX THESE UP
|
||||
================================================================================
|
||||
|
||||
NOT every description should follow: legacy → current brewing → call to action
|
||||
|
||||
TEMPLATE ROTATION (these are EXAMPLES, not formulas):
|
||||
|
||||
TEMPLATE A: [Region origin] → [specific challenge] → [how we adapted] → [result]
|
||||
"The Saône River flooded predictably each spring. Medieval brewers learned
|
||||
to schedule production around it. We use the same seasonal rhythm..."
|
||||
|
||||
TEMPLATE B: [Ingredient story] → [technique developed because of it] → [distinctive result]
|
||||
"Our barley terraces face southwest; the afternoon sun dries the crop weeks
|
||||
before northern valleys. This inspired our crisp, mineral-forward pale ale..."
|
||||
|
||||
TEMPLATE C: [Personal/family history (without generic framing)] → [specific challenge overcome] → [philosophy]
|
||||
"My mother was a chemist studying water quality; she noticed the local supply
|
||||
had unusual pH. Rather than fight it, we formulated our entire range around
|
||||
it. The sulfate content sharpens our bitters..."
|
||||
|
||||
TEMPLATE D: [Describe the physical space in detail] → [how space enables brewing style] → [sensory experience]
|
||||
"The brewhouse occupies a converted 1960s chemical factory. The stainless steel
|
||||
vats still bear faded original markings. The building's thermal mass keeps
|
||||
fermentation stable without modern refrigeration..."
|
||||
|
||||
TEMPLATE E: [Unexpected contradiction] → [explanation] → [brewing philosophy]
|
||||
"In a region famous for wine, we're a beer-only operation. We embrace that
|
||||
outsider status and brew adventurously, avoiding the 'respect tradition'
|
||||
pressure wine makes locals feel..."
|
||||
|
||||
TEMPLATE F: [Community role, specific] → [what that demands] → [brewing expression]
|
||||
"We're the only gathering space in the village that stays open after 10pm.
|
||||
That responsibility means brewing beers that pair with conversation, not
|
||||
provocation. Sessionable, food-friendly, endlessly drinkable..."
|
||||
|
||||
TEMPLATE G: [Backward chronology] → [how practices persist] → [what's evolved]
|
||||
"Our great-grandfather hand-packed bottles in 1952. We still own his bench.
|
||||
Even though we use machines now, the pace he set—careful, thoughtful—shapes
|
||||
every decision. Nothing about us is fast..."
|
||||
|
||||
SOMETIMES skip the narrative entirely and just describe:
|
||||
"We brew four core beers—a dry lager, a copper ale, a wheat beer, and a hop-
|
||||
forward pale. The range itself tells our story: accessible, varied,
|
||||
unpretentious. No flagship. No hero beer. Balance."
|
||||
|
||||
================================================================================
|
||||
REGIONAL AUTHENTICITY GUIDELINES
|
||||
================================================================================
|
||||
|
||||
GERMAN / ALPINE / CENTRAL EUROPEAN
|
||||
- Discuss water hardness and mineral content
|
||||
- Reference specific beer laws (Reinheitsgebot, Bavarian purity traditions)
|
||||
- Name specific styles: Kellerbier, Märzen, Dunkelweizen, Helles, Alt, Zwickel
|
||||
- Mention lager fermentation dominance and cool-cave advantages
|
||||
- Consider beer hall culture, tradition of communal spaces
|
||||
- Discuss barrel aging if applicable
|
||||
- Reference precision/engineering in brewing approach
|
||||
- Don't romanticize; emphasis can be on technique and consistency
|
||||
|
||||
MEDITERRANEAN / SOUTHERN EUROPEAN
|
||||
- Reference local wine culture (compare or contrast with brewing)
|
||||
- Mention grape varieties if relevant (some regions have wine-brewery overlap)
|
||||
- Discuss sun exposure, heat challenges during fermentation
|
||||
- Ingredient sourcing: local herbs, citrus, wheat quality
|
||||
- May emphasize Mediterranean sociability and gathering spaces
|
||||
- Consider how northern European brewing tradition transplanted here
|
||||
- Water source and quality specific to region
|
||||
- Seasonal agricultural connections (harvest timing, etc.)
|
||||
|
||||
ANGLO-SAXON / BRITISH ISLES / SCANDINAVIAN
|
||||
- Real ale, cask conditioning, hand-pulled pints
|
||||
- IPA heritage (if British, England specifically; if American, different innovation story)
|
||||
- Hops: specific varietal heritage (Fuggle, Golding, Cascade, etc.)
|
||||
- Pub culture and community gathering
|
||||
- Ales: top-fermented, warmer fermentation temperatures
|
||||
- May emphasize working-class history or rural traditions
|
||||
- Cider, mead, and other fermented-drink heritage alongside beer
|
||||
|
||||
NEW WORLD (US, AUSTRALIA, NZ, SOUTH AFRICA)
|
||||
- Emphasize experimentation and lack of brewing "rules"
|
||||
- Ingredient sourcing: local grain growers, foraged hops, local suppliers
|
||||
- May reference mining heritage, recent settlement, diverse immigration
|
||||
- Craft beer boom influence: how does this brewery differentiate?
|
||||
- Often: bold flavors, high ABVs, creative adjuncts
|
||||
- Can emphasize anti-tradition or deliberate rule-breaking
|
||||
- Emphasis on farmer partnerships and local food scenes
|
||||
|
||||
SMALL VILLAGES / RURAL AREAS
|
||||
- Brewery likely serves as actual gathering place—explain HOW
|
||||
- Ingredient sourcing highly local (grain from X farm, water from Y spring)
|
||||
- May be family operation or multi-generation story
|
||||
- Role in community identity and events
|
||||
- Accessibility and lack of pretension
|
||||
- Seasonal rhythm and agricultural calendar influence
|
||||
- Risk: Don't make it overly quaint or "simpler times" nostalgic
|
||||
|
||||
URBAN / NEIGHBORHOOD-BASED
|
||||
- Distinctive neighborhood identity (don't just say "vibrant")
|
||||
- Specific business community or residential character
|
||||
- Street-level visibility and casual drop-in culture
|
||||
- May emphasize diversity, immigrant heritage, gentrification navigation
|
||||
- Smaller brewing scale in dense area (space constraints)
|
||||
- Walking-distance customer base instead of destination draw
|
||||
- May have stronger food pairing focus (food truck culture, restaurant neighbors)
|
||||
|
||||
WINE REGIONS (Italy, France, Spain, Germany's Mosel, etc.)
|
||||
- Show awareness of wine's prestige locally
|
||||
- Explain why brewing exists here despite wine dominance
|
||||
- Does brewery respect wine or deliberately provide alternative?
|
||||
- Ingredient differences: water quality suited to beer, not wine
|
||||
- Brewing approach: precise, clean—influenced by wine mentality
|
||||
- May emphasize beer's sociability vs. wine's formality
|
||||
- Historical context: beer predates or coexists with wine tradition
|
||||
|
||||
BEER-HERITAGE HOTSPOTS (Belgium, Germany, UK, Czech Republic)
|
||||
- Can't ignore the weight of history—acknowledge it explicitly
|
||||
- Do you innovate within tradition or break from it? Say which.
|
||||
- Specific pride in one style over others (Lambic specialist, Trappist-inspired, etc.)
|
||||
- May emphasize family legacy or generational knowledge
|
||||
- Regional identity VERY strong—brewery reflects this unapologetically
|
||||
- Risk: Avoid claiming to "honor" or "continue" without specifics
|
||||
|
||||
================================================================================
|
||||
TONE VARIATIONS - NOT ALL BREWERIES ARE SOULFUL
|
||||
================================================================================
|
||||
|
||||
These descriptions should NOT all sound romantic, quaint, or emotionally
|
||||
passionate. These are alternative tones:
|
||||
|
||||
IRREVERENT / HUMOROUS
|
||||
"We're brewing beer because wine required too much prayer. Less spirituality,
|
||||
more hops. Our ales are big, unpolished, and perfect after a day's work."
|
||||
|
||||
MATTER-OF-FACT / ENGINEERING-FOCUSED
|
||||
"Brewing is chemistry. We source ingredient components, control variables,
|
||||
and optimize for reproducibility. If that sounds clinical, good—consistency
|
||||
is our craft."
|
||||
|
||||
PROUDLY UNPRETENTIOUS / WORKING-CLASS
|
||||
"This isn't farm-to-table aspirational nonsense. It's a neighborhood beer.
|
||||
$4 pints. No reservations. No sipping notes. Tastes good, fills the glass,
|
||||
keeps you coming back."
|
||||
|
||||
MINIMALIST / DIRECT
|
||||
"We brew three beers. They're good. Come drink one."
|
||||
|
||||
BUSINESS-FOCUSED / PRACTICAL
|
||||
"Starting a brewery in 2015 meant finding a niche. We're the only nano-
|
||||
brewery serving the airport district. Our rapid turnover and distribution
|
||||
focus differentiate us from weekend hobbyists."
|
||||
|
||||
CONFRONTATIONAL / REBELLIOUS
|
||||
"Craft beer got boring. Expensive IPAs and flavor-chasing. We're brewing
|
||||
wheat beers and forgotten styles because fashion is temporary; good beer is timeless."
|
||||
|
||||
MIX these tones across your descriptions. Some breweries should sound romantic
|
||||
and place-proud. Others should sound irreverent or practical.
|
||||
|
||||
================================================================================
|
||||
NARRATIVE CLICHÉS TO ABSOLUTELY AVOID
|
||||
================================================================================
|
||||
|
||||
1. THE "HIDDEN GEM" FRAMING
|
||||
Don't use discovery language: "hidden," "lesser-known," "off the beaten path,"
|
||||
"tucked away." Implies marketing speak, not authenticity.
|
||||
|
||||
2. OVERT NOSTALGIA / "SIMPLER TIMES"
|
||||
Don't appeal to a vague sense that the past was better: "yearning for," "those
|
||||
days," "how things used to be." Lazy and off-putting.
|
||||
|
||||
3. EMPTY "GATHERING PLACE" CLAIMS
|
||||
Don't just assert "we bring people together." Show HOW: local workers' lunch
|
||||
spot? Trivia night tradition? Live music venue? Political meeting ground?
|
||||
|
||||
4. "SPECIAL" WITHOUT EVIDENCE
|
||||
Don't declare that the location is "special" or "unique." SHOW what makes it distinct
|
||||
through specific details, not assertion.
|
||||
|
||||
5. "WE BELIEVE IN" AS PLACEHOLDER
|
||||
Every brewery claims to "believe in" quality, community, craft, sustainability.
|
||||
These are empty. What specific belief drives THIS brewery's choices?
|
||||
|
||||
6. "ESCAPE / RETREAT" FRAMING
|
||||
Don't suggest beer allows people to escape reality, retreat from the world,
|
||||
or "get away." Implies you don't trust the place itself to be compelling.
|
||||
|
||||
7. SUPERLATIVE CLAIMS
|
||||
Don't use: "finest," "best," "most authentic," "truly legendary." Let details
|
||||
prove these implied claims instead.
|
||||
|
||||
8. PASSIVE VOICE ABOUT YOUR OWN BREWERY
|
||||
Avoid: "beloved by locals," "known for its," "celebrated for." Use active voice:
|
||||
what does the brewery actively DO?
|
||||
|
||||
================================================================================
|
||||
LENGTH AND CONTENT REQUIREMENTS
|
||||
================================================================================
|
||||
|
||||
TARGET LENGTH: 120-180 words
|
||||
- Long enough to establish place and brewing philosophy
|
||||
- Short enough to avoid meandering or repetition
|
||||
- Specific enough that brewery feels real and unreplicable
|
||||
|
||||
REQUIRED ELEMENTS (at least ONE each):
|
||||
✓ Concrete location reference (proper noun, landmark, geographic feature)
|
||||
✓ One specific brewing detail (challenge, advantage, technique, ingredient)
|
||||
✓ Sensory language specific to the place (NOT generic adjectives)
|
||||
✓ Distinct tone/voice (descriptions shouldn't all share the same quiet reverence)
|
||||
|
||||
OPTIONAL ELEMENTS:
|
||||
- Name 1-2 specific beer styles or beer names
|
||||
- Personal/family story (if it illuminates why brewery exists here)
|
||||
- Ingredient sourcing or supply chain detail
|
||||
- Community role (with evidence, not assertion)
|
||||
- Regional historical context (brief, specific)
|
||||
|
||||
WORD ECONOMY:
|
||||
- Don't waste words on "we believe in quality" or "committed to excellence"
|
||||
- Don't use filler adjectives: "authentic," "genuine," "real," "true," "local"
|
||||
(these should be IMPLIED by specific details)
|
||||
- Every sentence should add information, flavor, or distinctive detail
|
||||
|
||||
================================================================================
|
||||
SENSORY LANGUAGE GUIDELINES
|
||||
================================================================================
|
||||
|
||||
AVOID THESE GENERIC SENSORY WORDS (they're lazy placeholders):
|
||||
- "Beautiful," "picturesque," "gorgeous," "stunning"
|
||||
- "Warm," "cozy," "inviting" (without context)
|
||||
- "Vibrant," "lively," "energetic" (without examples)
|
||||
- "Charming," "quaint," "rustic" (without specifics)
|
||||
|
||||
USE INSTEAD: Specific, concrete sensory details
|
||||
- Colors: "copper beech," "rust-stained brick," "frost-blue shutters"
|
||||
- Textures: "the grain of wooden barrel hoops," "hand-smoothed stone," "grime-darkened windows"
|
||||
- Sounds: "the hiss of the hand-pump," "coin-drop in the old register," "church bells on Sunday"
|
||||
- Smells: "yeast-heavy floor," "wet limestone," "Hallertau hop resin"
|
||||
- Tastes: (in the beer) "mineral-sharp," "sulfate clarity," "heather honey notes"
|
||||
|
||||
EXAMPLE SENSORY COMPARISON:
|
||||
AVOID: "Our brewery captures the essence of the region's rustic charm."
|
||||
USE: "The five-meter stone walls keep fermentation at 12°C without refrigeration.
|
||||
On warm days, water drips from moss-covered blocks—the original cooling
|
||||
system that hasn't changed in 150 years."
|
||||
|
||||
================================================================================
|
||||
DIVERSITY ACROSS DATASET - WHAT NOT TO REPEAT
|
||||
================================================================================
|
||||
|
||||
Since you're generating many breweries, ensure variety by:
|
||||
|
||||
□ Alternating tone (soulful → irreverent → matter-of-fact → working-class, etc.)
|
||||
□ Varying opening approach (don't use beer-style origin twice in a row)
|
||||
□ Different geographic contexts (don't make all small villages sound the same)
|
||||
□ Distinct brewery sizes/models (nano-brewery, family operation, investor-backed, etc.)
|
||||
□ Various types of "draw" (neighborhood destination vs. local-only vs. tourist
|
||||
attraction vs. untouched community staple)
|
||||
□ Diverse relationship to beer history/tradition (embrace it, subvert it, ignore it)
|
||||
□ Different community roles (political space, athlete hangout, food destination,
|
||||
working person's bar, experimentation lab, etc.)
|
||||
|
||||
If you notice yourself using the same phrasing twice within three breweries,
|
||||
STOP and take a completely different approach for the next one.
|
||||
|
||||
================================================================================
|
||||
QUALITY CHECKLIST
|
||||
================================================================================
|
||||
|
||||
Before submitting your brewery description, verify:
|
||||
|
||||
□ Zero clichés from the FORBIDDEN list appear anywhere
|
||||
□ At least one specific proper noun or concrete reference included
|
||||
□ No more than two generic adjectives in the entire description
|
||||
□ The brewery is genuinely unreplicable (wouldn't work in a different location)
|
||||
□ Tone matches a SPECIFIC angle (not generic reverence)
|
||||
□ Opening sentence is distinctive and unexpected
|
||||
□ No two sentences say the same thing in different words
|
||||
□ At least one detail is surprising or specific to this place
|
||||
□ The description would make sense ONLY for this location/region
|
||||
□ "Passion," "tradition," "community" either don't appear or appear with
|
||||
specific context/evidence
|
||||
|
||||
================================================================================
|
||||
OUTPUT FORMAT
|
||||
================================================================================
|
||||
|
||||
Return ONLY a valid JSON object with exactly two keys:
|
||||
{
|
||||
"name": "Brewery Name Here",
|
||||
"description": "Full description text here..."
|
||||
}
|
||||
|
||||
Requirements:
|
||||
- name: 2-5 words, distinctive, memorable
|
||||
- description: 120-180 words, follows all guidelines above
|
||||
- Valid JSON (escaped quotes, no line breaks in strings)
|
||||
- No markdown, no backticks, no code formatting
|
||||
- No preamble before the JSON
|
||||
- No trailing text after the JSON
|
||||
- No explanations or commentary
|
||||
|
||||
================================================================================
|
||||
169
pipeline/prompts/brewery_system_prompt_expanded.txt
Normal file
169
pipeline/prompts/brewery_system_prompt_expanded.txt
Normal file
@@ -0,0 +1,169 @@
|
||||
================================================================================
|
||||
BREWERY DATA GENERATION SYSTEM PROMPT
|
||||
================================================================================
|
||||
|
||||
ROLE AND OBJECTIVE
|
||||
You are an experienced brewmaster creating authentic brewery descriptions that
|
||||
feel real and grounded in specific places. Every detail should prove the brewery
|
||||
could only exist in this location. Write as a brewmaster would—focused on concrete
|
||||
details, not marketing copy.
|
||||
|
||||
================================================================================
|
||||
FORBIDDEN PHRASES AND CLICHÉS
|
||||
================================================================================
|
||||
|
||||
NEVER USE THESE (even in modified form):
|
||||
- "Love letter to" / "tribute to" / "ode to" / "rolling hills" / "picturesque"
|
||||
- "Every sip tells a story" / "Come for X, stay for Y" / "Where tradition meets innovation"
|
||||
- "Rich history" / "ancient roots" / "timeless traditions" / "time-honored heritage"
|
||||
- "Passion" (standalone descriptor) / "brewing excellence" / "commitment to quality"
|
||||
- "Authentic" / "genuine" / "real" / "true" (SHOW these, don't state them)
|
||||
- "Bringing people together" (without HOW) / "community gathering place" (without proof)
|
||||
- "Hidden gem" / "secret" / "lesser-known" / "beloved by locals"
|
||||
- Generic adjectives: "beautiful," "gorgeous," "lovely," "cozy," "charming," "vibrant"
|
||||
- Vague temporal claims: "simpler times," "the good old days," "escape from the modern world"
|
||||
- Unsupported reputation claims (often passive): "is known for," "has become famous for," "has earned a reputation"
|
||||
|
||||
================================================================================
|
||||
OPENING APPROACHES (Choose ONE per brewery)
|
||||
================================================================================
|
||||
|
||||
1. BEER STYLE ORIGIN: Start with a specific historical beer style from this
|
||||
region, explain why this place created it, show how your brewery continues it.
|
||||
Key: Name specific style → why this region made it → how you continue it
|
||||
|
||||
2. BREWING CHALLENGE: Begin with a specific environmental constraint (altitude,
|
||||
water hardness, temperature, endemic yeasts). Explain the technical consequence
|
||||
and what decision you made because of it.
|
||||
Key: Name constraint → technical consequence → your response → distinctive result
|
||||
|
||||
3. FOUNDING STORY: Why did the founder return/move HERE? What did they discover?
|
||||
What specific brewing decision followed? Include a concrete artifact (logs, equipment).
|
||||
Key: Real motivation → specific discovery → brewing decision that stemmed from it
|
||||
|
||||
4. LOCAL INGREDIENT: What unique resource defines your brewery? Why is it unique?
|
||||
What brewing constraint or opportunity does it create?
|
||||
Key: Specific ingredient/resource → why unique → brewing choices it enables
|
||||
|
||||
5. CONTRADICTION: What is the region famous for? Why does your brewery do the
|
||||
opposite? Make the contradiction a strength, not an apology.
|
||||
Key: Regional identity → why you diverge → what you do instead → why it works
|
||||
|
||||
6. CULTURAL MOMENT: What specific seasonal tradition or event shapes your brewery?
|
||||
How do you connect to it? What brewing decisions follow?
|
||||
Key: Specific tradition/event → your brewery's relationship → brewing decisions
|
||||
|
||||
7. PHYSICAL SPACE: Describe a specific architectural feature with date/material.
|
||||
How does it create technical advantage? What sensory details matter? Why keep
|
||||
constraints instead of modernizing?
|
||||
Key: Specific feature → technical consequence → sensory details → why you keep it
|
||||
|
||||
================================================================================
|
||||
SPECIFICITY REQUIREMENTS
|
||||
================================================================================
|
||||
|
||||
Every brewery description MUST include all three categories below (the minimum for each is noted in its heading):
|
||||
|
||||
1. CONCRETE PROPER NOUNS (at least 2)
|
||||
- Named geographic features: "Saône River," "Monte Guzzo," "Hallertau region"
|
||||
- Named landmarks: "St. Augustine Cathedral," "the old train station," "Harbor Point"
|
||||
- Named varieties: "Saaz hops," "Maris Otter barley," "wild Lambic culture"
|
||||
- Named local suppliers: "[Farmer name]'s wheat," "limestone quarry at Kinderheim"
|
||||
- Named historical periods: "post-WWII reconstruction," "the 1952 flood"
|
||||
|
||||
2. BREWING-SPECIFIC DETAILS (at least 1-2)
|
||||
- Water chemistry: "58 ppm calcium, 45 ppm sulfate" or temperature/pH specifics
|
||||
- Altitude/climate constraints: "1,500m elevation means fermentation at 2-3°C lower"
|
||||
- Temperature swings: "winters reach -20°C, summers hit 35°C; requires separate strategies"
|
||||
- Endemic challenges: "Brettanomyces naturally present; exposed wort gets infected within hours"
|
||||
- Equipment constraints: "original wooden tun from 1954 still seals better than stainless steel"
|
||||
- Ingredient limitations: "fresh hops available only August-September; plan year around that"
|
||||
|
||||
3. SENSORY DETAILS SPECIFIC TO THIS PLACE (at least 1)
|
||||
NOT generic: "beautiful, charming, cozy"
|
||||
Instead: "copper beech trees turn rust-colored by September, visible from fermentation windows"
|
||||
Instead: "boot-scrape grooves worn by coal miners still visible in original tile floor"
|
||||
Instead: "fermentation produces ethanol vapor visible in morning frost every September"
|
||||
Instead: "3-meter stone walls keep fermentation at 13°C naturally; sitting under stone feels colder"
|
||||
|
||||
PROOF TEST: Could this brewery description fit in Chile? Germany? Japan?
|
||||
- If YES: add more place-specific details
|
||||
- If NO: you're on track. Identity should be inseparable from location.
|
||||
|
||||
|
||||
================================================================================
|
||||
TONE VARIATIONS
|
||||
================================================================================
|
||||
|
||||
Rotate tones consciously. Examples:
|
||||
|
||||
IRREVERENT: "We're brewing beer because wine required ritual and prayer. Less
|
||||
spirituality, more hops. Our ales are big, unpolished. Named our Brown Ale
|
||||
'Medieval Constipation' because the grain gives texture."
|
||||
|
||||
MATTER-OF-FACT: "Brewing is applied chemistry. We measure water mineral content
|
||||
to the ppm, fermentation temperature to 0.5°C. Our Märzen has the same gravity,
|
||||
ABV, and color every single batch. Precision is our craft."
|
||||
|
||||
WORKING-CLASS PROUD: "This isn't farm-to-table aspirational nonsense. It's a
|
||||
neighborhood beer. Four dollars a pint. No reservations, no tasting notes.
|
||||
Workers need somewhere to go."
|
||||
|
||||
MINIMALIST: "We brew three beers. They're good. That's it."
|
||||
|
||||
NOSTALGIC-GROUNDED: "My grandfather brewed in his basement. He died in 1995;
|
||||
I found his brewing logs in 2015 and copied his exact recipes. Now the
|
||||
fermentation smells like his basement."
|
||||
|
||||
|
||||
================================================================================
|
||||
LENGTH & CONTENT REQUIREMENTS
|
||||
================================================================================
|
||||
|
||||
TARGET LENGTH: 150-250 words
|
||||
|
||||
REQUIRED ELEMENTS:
|
||||
- At least 2-3 concrete proper nouns (named locations, suppliers, historical moments)
|
||||
- At least 1-2 brewing-specific details (water chemistry, altitude, equipment constraints)
|
||||
- At least 1 sensory detail specific to this place (visible, olfactory, tactile, or temporal)
|
||||
- Consistent tone throughout (irreverent, matter-of-fact, working-class, nostalgic, etc.)
|
||||
- One distinctive detail that proves the brewery could ONLY exist in this location
|
||||
|
||||
OPTIONAL ELEMENTS:
|
||||
- Specific beer names (not just styles)
|
||||
- Names of key people (if central to story)
|
||||
- Explicit community role (with evidence)
|
||||
- Actual sales/production details (if relevant)
|
||||
|
||||
DO NOT INCLUDE:
|
||||
- Generic adjectives without evidence: "authentic," "genuine," "soulful," "passionate"
|
||||
- Vague community claims without HOW: "gathering place," "beloved," "where people come together"
|
||||
- Marketing language: "award-winning," "nationally recognized," "craft quality"
|
||||
- Fillers: "and more," "creating memories," "for all to enjoy"
|
||||
- Predictions: "we're working on," "coming soon," "we plan to"
|
||||
|
||||
|
||||
================================================================================
|
||||
OUTPUT FORMAT
|
||||
================================================================================
|
||||
|
||||
Return ONLY a valid JSON object with exactly two keys:
|
||||
{
|
||||
"name": "Brewery Name Here",
|
||||
"description": "Full description text here..."
|
||||
}
|
||||
|
||||
Requirements:
|
||||
- name: 2-5 words, distinctive, memorable
|
||||
- description: 150-250 words, follows all guidelines
|
||||
- Valid JSON (properly escaped quotes, no line breaks inside string values)
|
||||
- No markdown, backticks, or code formatting
|
||||
- No preamble before the JSON and no trailing text after it
|
||||
|
||||
Example:
|
||||
{
|
||||
"name": "Sniffels Peak Brewing",
|
||||
"description": "The soft spring water beneath Sniffels Peak..."
|
||||
}
|
||||
|
||||
================================================================================
|
||||
@@ -28,11 +28,12 @@ std::unique_ptr<DataGenerator> BiergartenDataGenerator::InitializeGenerator() {
|
||||
auto llama_generator = std::make_unique<LlamaGenerator>();
|
||||
llama_generator->SetSamplingOptions(options_.temperature, options_.top_p,
|
||||
options_.seed);
|
||||
llama_generator->SetContextSize(options_.n_ctx);
|
||||
spdlog::info(
|
||||
"[Generator] Using LlamaGenerator: {} (temperature={}, top-p={}, "
|
||||
"seed={})",
|
||||
"n_ctx={}, seed={})",
|
||||
options_.model_path, options_.temperature, options_.top_p,
|
||||
options_.seed);
|
||||
options_.n_ctx, options_.seed);
|
||||
generator = std::move(llama_generator);
|
||||
}
|
||||
generator->Load(options_.model_path);
|
||||
|
||||
@@ -25,15 +25,10 @@ std::string DataDownloader::DownloadCountriesDatabase(
|
||||
return cache_path;
|
||||
}
|
||||
|
||||
std::string short_commit = commit;
|
||||
if (commit.length() > 7) {
|
||||
short_commit = commit.substr(0, 7);
|
||||
}
|
||||
|
||||
std::string url =
|
||||
"https://raw.githubusercontent.com/dr5hn/"
|
||||
"countries-states-cities-database/" +
|
||||
short_commit + "/json/countries+states+cities.json";
|
||||
commit + "/json/countries+states+cities.json";
|
||||
|
||||
spdlog::info("[DataDownloader] Downloading: {}", url);
|
||||
|
||||
|
||||
@@ -1,16 +1,31 @@
|
||||
/**
|
||||
* Destructor Module
|
||||
* Ensures proper cleanup of llama.cpp resources (context and model) when the
|
||||
* generator is destroyed, preventing memory leaks and resource exhaustion.
|
||||
*/
|
||||
|
||||
#include "data_generation/llama_generator.h"
|
||||
#include "llama.h"
|
||||
|
||||
/**
 * Tears down the llama.cpp state owned by this generator. The context is
 * released before the model, and the backend is shut down last.
 */
LlamaGenerator::~LlamaGenerator() {
  // Inference context: holds the KV cache and computation state.
  if (context_) {
    llama_free(context_);
    context_ = nullptr;
  }

  // Loaded model: holds the weights and vocabulary.
  if (model_) {
    llama_model_free(model_);
    model_ = nullptr;
  }

  // Backend (GPU/CPU acceleration resources) is released unconditionally.
  llama_backend_free();
}
|
||||
|
||||
@@ -1,3 +1,10 @@
|
||||
/**
|
||||
* Brewery Data Generation Module
|
||||
* Uses the LLM to generate realistic brewery names and descriptions for a given
|
||||
* location. Implements retry logic with validation and error correction to
|
||||
* ensure valid JSON output conforming to the expected schema.
|
||||
*/
|
||||
|
||||
#include <spdlog/spdlog.h>
|
||||
|
||||
#include <stdexcept>
|
||||
@@ -9,19 +16,24 @@
|
||||
BreweryResult LlamaGenerator::GenerateBrewery(
|
||||
const std::string& city_name, const std::string& country_name,
|
||||
const std::string& region_context) {
|
||||
/**
|
||||
* Preprocess and truncate region context to manageable size
|
||||
*/
|
||||
const std::string safe_region_context =
|
||||
PrepareRegionContextPublic(region_context);
|
||||
|
||||
/**
|
||||
* Load brewery system prompt from file
|
||||
* Falls back to minimal inline prompt if file not found
|
||||
* Default path: prompts/brewery_system_prompt_expanded.txt
|
||||
*/
|
||||
const std::string system_prompt =
|
||||
"You are the brewmaster and owner of a local craft brewery. "
|
||||
"Write a name and a short, soulful description for your brewery that "
|
||||
"reflects your pride in the local community and your craft. "
|
||||
"The tone should be authentic and welcoming, like a note on a "
|
||||
"chalkboard "
|
||||
"menu. Output ONLY a single JSON object with keys \"name\" and "
|
||||
"\"description\". "
|
||||
"Do not include markdown formatting or backticks.";
|
||||
LoadBrewerySystemPrompt("prompts/brewery_system_prompt_expanded.txt");
|
||||
|
||||
/**
|
||||
* User prompt: provides geographic context to guide generation towards
|
||||
* culturally appropriate and locally-inspired brewery attributes
|
||||
*/
|
||||
std::string prompt =
|
||||
"Write a brewery name and place-specific long description for a craft "
|
||||
"brewery in " +
|
||||
@@ -32,40 +44,61 @@ BreweryResult LlamaGenerator::GenerateBrewery(
|
||||
? std::string(".")
|
||||
: std::string(". Regional context: ") + safe_region_context);
|
||||
|
||||
/**
|
||||
* Store location context for retry prompts (without repeating full context)
|
||||
*/
|
||||
const std::string retry_location =
|
||||
"Location: " + city_name +
|
||||
(country_name.empty() ? std::string("")
|
||||
: std::string(", ") + country_name);
|
||||
|
||||
/**
|
||||
* RETRY LOOP with validation and error correction
|
||||
* Attempts to generate valid brewery data up to 3 times, with feedback-based
|
||||
* refinement
|
||||
*/
|
||||
const int max_attempts = 3;
|
||||
std::string raw;
|
||||
std::string last_error;
|
||||
|
||||
// Limit output length to keep it concise and focused
|
||||
constexpr int max_tokens = 1052;
|
||||
for (int attempt = 0; attempt < max_attempts; ++attempt) {
|
||||
raw = Infer(system_prompt, prompt, 384);
|
||||
// Generate brewery data from LLM
|
||||
raw = Infer(system_prompt, prompt, max_tokens);
|
||||
spdlog::debug("LlamaGenerator: raw output (attempt {}): {}", attempt + 1,
|
||||
raw);
|
||||
|
||||
// Validate output: parse JSON and check required fields
|
||||
|
||||
std::string name;
|
||||
std::string description;
|
||||
const std::string validation_error =
|
||||
ValidateBreweryJsonPublic(raw, name, description);
|
||||
if (validation_error.empty()) {
|
||||
// Success: return parsed brewery data
|
||||
return {std::move(name), std::move(description)};
|
||||
}
|
||||
|
||||
// Validation failed: log error and prepare corrective feedback
|
||||
|
||||
last_error = validation_error;
|
||||
spdlog::warn("LlamaGenerator: malformed brewery JSON (attempt {}): {}",
|
||||
attempt + 1, validation_error);
|
||||
|
||||
// Update prompt with error details to guide LLM toward correct output.
|
||||
// For retries, use a compact prompt format to avoid exceeding token
|
||||
// limits.
|
||||
prompt =
|
||||
"Your previous response was invalid. Error: " + validation_error +
|
||||
"\nReturn ONLY valid JSON with this exact schema: "
|
||||
"{\"name\": \"string\", \"description\": \"string\"}."
|
||||
"\nDo not include markdown, comments, or extra keys."
|
||||
"\n\nLocation: " +
|
||||
city_name +
|
||||
(country_name.empty() ? std::string("")
|
||||
: std::string(", ") + country_name) +
|
||||
(safe_region_context.empty()
|
||||
? std::string("")
|
||||
: std::string("\nRegional context: ") + safe_region_context);
|
||||
"\n\n" +
|
||||
retry_location;
|
||||
}
|
||||
|
||||
// All retry attempts exhausted: log failure and throw exception
|
||||
spdlog::error(
|
||||
"LlamaGenerator: malformed brewery response after {} attempts: "
|
||||
"{}",
|
||||
|
||||
@@ -1,3 +1,11 @@
|
||||
/**
|
||||
* User Profile Generation Module
|
||||
* Uses the LLM to generate realistic user profiles (username and bio) for craft
|
||||
* beer enthusiasts. Implements retry logic to handle parsing failures and
|
||||
* ensures output adheres to strict format constraints (two lines, specific
|
||||
* character limits).
|
||||
*/
|
||||
|
||||
#include <spdlog/spdlog.h>
|
||||
|
||||
#include <algorithm>
|
||||
@@ -8,6 +16,10 @@
|
||||
#include "data_generation/llama_generator_helpers.h"
|
||||
|
||||
UserResult LlamaGenerator::GenerateUser(const std::string& locale) {
|
||||
/**
|
||||
* System prompt: specifies exact output format to minimize parsing errors
|
||||
* Constraints: 2-line output, username format, bio length bounds
|
||||
*/
|
||||
const std::string system_prompt =
|
||||
"You generate plausible social media profiles for craft beer "
|
||||
"enthusiasts. "
|
||||
@@ -17,39 +29,72 @@ UserResult LlamaGenerator::GenerateUser(const std::string& locale) {
|
||||
"The profile should feel consistent with the locale. "
|
||||
"No preamble, no labels.";
|
||||
|
||||
/**
|
||||
* User prompt: locale parameter guides cultural appropriateness of generated
|
||||
* profiles
|
||||
*/
|
||||
std::string prompt =
|
||||
"Generate a craft beer enthusiast profile. Locale: " + locale;
|
||||
|
||||
/**
|
||||
* RETRY LOOP with format validation
|
||||
* Attempts up to 3 times to generate valid user profile with correct format
|
||||
*/
|
||||
const int max_attempts = 3;
|
||||
std::string raw;
|
||||
for (int attempt = 0; attempt < max_attempts; ++attempt) {
|
||||
/**
|
||||
* Generate user profile (max 128 tokens - should fit 2 lines easily)
|
||||
*/
|
||||
raw = Infer(system_prompt, prompt, 128);
|
||||
spdlog::debug("LlamaGenerator (user): raw output (attempt {}): {}",
|
||||
attempt + 1, raw);
|
||||
|
||||
try {
|
||||
/**
|
||||
* Parse two-line response: first line = username, second line = bio
|
||||
*/
|
||||
auto [username, bio] = ParseTwoLineResponsePublic(
|
||||
raw, "LlamaGenerator: malformed user response");
|
||||
|
||||
/**
|
||||
* Remove any whitespace from username (usernames shouldn't have
|
||||
* spaces)
|
||||
*/
|
||||
username.erase(
|
||||
std::remove_if(username.begin(), username.end(),
|
||||
[](unsigned char ch) { return std::isspace(ch); }),
|
||||
username.end());
|
||||
|
||||
/**
|
||||
* Validate both fields are non-empty after processing
|
||||
*/
|
||||
if (username.empty() || bio.empty()) {
|
||||
throw std::runtime_error("LlamaGenerator: malformed user response");
|
||||
}
|
||||
|
||||
/**
|
||||
* Truncate bio if exceeds reasonable length for bio field
|
||||
*/
|
||||
if (bio.size() > 200) bio = bio.substr(0, 200);
|
||||
|
||||
/**
|
||||
* Success: return parsed user profile
|
||||
*/
|
||||
return {username, bio};
|
||||
} catch (const std::exception& e) {
|
||||
/**
|
||||
* Parsing failed: log and continue to next attempt
|
||||
*/
|
||||
spdlog::warn(
|
||||
"LlamaGenerator: malformed user response (attempt {}): {}",
|
||||
attempt + 1, e.what());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* All retry attempts exhausted: log failure and throw exception
|
||||
*/
|
||||
spdlog::error(
|
||||
"LlamaGenerator: malformed user response after {} attempts: {}",
|
||||
max_attempts, raw);
|
||||
|
||||
@@ -1,3 +1,11 @@
|
||||
/**
|
||||
* Helper Functions Module
|
||||
* Provides utility functions for text processing, parsing, and chat template
|
||||
* formatting. Functions handle whitespace normalization, response parsing, and
|
||||
* conversion of prompts to proper chat format using the model's built-in
|
||||
* template.
|
||||
*/
|
||||
|
||||
#include <algorithm>
|
||||
#include <array>
|
||||
#include <boost/json.hpp>
|
||||
@@ -12,6 +20,9 @@
|
||||
|
||||
namespace {
|
||||
|
||||
/**
|
||||
* String trimming: removes leading and trailing whitespace
|
||||
*/
|
||||
std::string Trim(std::string value) {
|
||||
auto not_space = [](unsigned char ch) { return !std::isspace(ch); };
|
||||
|
||||
@@ -23,6 +34,10 @@ std::string Trim(std::string value) {
|
||||
return value;
|
||||
}
|
||||
|
||||
/**
|
||||
* Normalize whitespace: collapses multiple spaces/tabs/newlines into single
|
||||
* spaces
|
||||
*/
|
||||
std::string CondenseWhitespace(std::string text) {
|
||||
std::string out;
|
||||
out.reserve(text.size());
|
||||
@@ -44,6 +59,10 @@ std::string CondenseWhitespace(std::string text) {
|
||||
return Trim(std::move(out));
|
||||
}
|
||||
|
||||
/**
|
||||
* Truncate region context to fit within max length while preserving word
|
||||
* boundaries
|
||||
*/
|
||||
std::string PrepareRegionContext(std::string_view region_context,
|
||||
std::size_t max_chars) {
|
||||
std::string normalized = CondenseWhitespace(std::string(region_context));
|
||||
@@ -61,6 +80,9 @@ std::string PrepareRegionContext(std::string_view region_context,
|
||||
return normalized;
|
||||
}
|
||||
|
||||
/**
|
||||
* Remove common bullet points, numbers, and field labels added by LLM in output
|
||||
*/
|
||||
std::string StripCommonPrefix(std::string line) {
|
||||
line = Trim(std::move(line));
|
||||
|
||||
@@ -102,6 +124,10 @@ std::string StripCommonPrefix(std::string line) {
|
||||
return Trim(std::move(line));
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse two-line response from LLM: normalize line endings, strip formatting,
|
||||
* filter spurious output, and combine remaining lines if needed
|
||||
*/
|
||||
std::pair<std::string, std::string> ParseTwoLineResponse(
|
||||
const std::string& raw, const std::string& error_message) {
|
||||
std::string normalized = raw;
|
||||
@@ -121,7 +147,17 @@ std::pair<std::string, std::string> ParseTwoLineResponse(
|
||||
std::transform(low.begin(), low.end(), low.begin(), [](unsigned char c) {
|
||||
return static_cast<char>(std::tolower(c));
|
||||
});
|
||||
if (!l.empty() && l.front() == '<' && low.back() == '>') continue;
|
||||
// Filter known thinking tags like <think>...</think>, but be conservative
|
||||
// to avoid removing legitimate output. Only filter specific known
|
||||
// patterns.
|
||||
if (!l.empty() && l.front() == '<' && low.back() == '>') {
|
||||
// Only filter if it's a known thinking tag: <think>, <reasoning>, etc.
|
||||
if (low.find("think") != std::string::npos ||
|
||||
low.find("reasoning") != std::string::npos ||
|
||||
low.find("reflect") != std::string::npos) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
if (low.rfind("okay,", 0) == 0 || low.rfind("hmm", 0) == 0) continue;
|
||||
filtered.push_back(std::move(l));
|
||||
}
|
||||
@@ -140,6 +176,9 @@ std::pair<std::string, std::string> ParseTwoLineResponse(
|
||||
return {first, second};
|
||||
}
|
||||
|
||||
/**
|
||||
* Apply model's chat template to user-only prompt, formatting it for the model
|
||||
*/
|
||||
std::string ToChatPrompt(const llama_model* model,
|
||||
const std::string& user_prompt) {
|
||||
const char* tmpl = llama_model_chat_template(model, nullptr);
|
||||
@@ -173,6 +212,10 @@ std::string ToChatPrompt(const llama_model* model,
|
||||
return std::string(buffer.data(), static_cast<std::size_t>(required));
|
||||
}
|
||||
|
||||
/**
|
||||
* Apply model's chat template to system+user prompt pair, formatting for the
|
||||
* model
|
||||
*/
|
||||
std::string ToChatPrompt(const llama_model* model,
|
||||
const std::string& system_prompt,
|
||||
const std::string& user_prompt) {
|
||||
|
||||
@@ -1,3 +1,10 @@
|
||||
/**
|
||||
* Text Generation / Inference Module
|
||||
* Core module that performs LLM inference: converts text prompts into tokens,
|
||||
* runs the neural network forward pass, samples the next token, and converts
|
||||
* output tokens back to text. Supports both simple and system+user prompts.
|
||||
*/
|
||||
|
||||
#include <spdlog/spdlog.h>
|
||||
|
||||
#include <algorithm>
|
||||
@@ -22,21 +29,37 @@ std::string LlamaGenerator::Infer(const std::string& system_prompt,
|
||||
|
||||
std::string LlamaGenerator::InferFormatted(const std::string& formatted_prompt,
|
||||
int max_tokens) {
|
||||
/**
|
||||
* Validate that model and context are loaded
|
||||
*/
|
||||
if (model_ == nullptr || context_ == nullptr)
|
||||
throw std::runtime_error("LlamaGenerator: model not loaded");
|
||||
|
||||
/**
|
||||
* Get vocabulary for tokenization and token-to-text conversion
|
||||
*/
|
||||
const llama_vocab* vocab = llama_model_get_vocab(model_);
|
||||
if (vocab == nullptr)
|
||||
throw std::runtime_error("LlamaGenerator: vocab unavailable");
|
||||
|
||||
/**
|
||||
* Clear KV cache to ensure clean inference state (no residual context)
|
||||
*/
|
||||
llama_memory_clear(llama_get_memory(context_), true);
|
||||
|
||||
/**
|
||||
* TOKENIZATION PHASE
|
||||
* Convert text prompt into token IDs (integers) that the model understands
|
||||
*/
|
||||
std::vector<llama_token> prompt_tokens(formatted_prompt.size() + 8);
|
||||
int32_t token_count = llama_tokenize(
|
||||
vocab, formatted_prompt.c_str(),
|
||||
static_cast<int32_t>(formatted_prompt.size()), prompt_tokens.data(),
|
||||
static_cast<int32_t>(prompt_tokens.size()), true, true);
|
||||
|
||||
/**
|
||||
* If buffer too small, negative return indicates required size
|
||||
*/
|
||||
if (token_count < 0) {
|
||||
prompt_tokens.resize(static_cast<std::size_t>(-token_count));
|
||||
token_count = llama_tokenize(
|
||||
@@ -48,16 +71,31 @@ std::string LlamaGenerator::InferFormatted(const std::string& formatted_prompt,
|
||||
if (token_count < 0)
|
||||
throw std::runtime_error("LlamaGenerator: prompt tokenization failed");
|
||||
|
||||
/**
|
||||
* CONTEXT SIZE VALIDATION
|
||||
* Validate and compute effective token budgets based on context window
|
||||
* constraints
|
||||
*/
|
||||
const int32_t n_ctx = static_cast<int32_t>(llama_n_ctx(context_));
|
||||
const int32_t n_batch = static_cast<int32_t>(llama_n_batch(context_));
|
||||
if (n_ctx <= 1 || n_batch <= 0)
|
||||
throw std::runtime_error("LlamaGenerator: invalid context or batch size");
|
||||
|
||||
/**
|
||||
* Clamp generation limit to available context window, reserve space for
|
||||
* output
|
||||
*/
|
||||
const int32_t effective_max_tokens =
|
||||
std::max(1, std::min(max_tokens, n_ctx - 1));
|
||||
/**
|
||||
* Prompt can use remaining context after reserving space for generation
|
||||
*/
|
||||
int32_t prompt_budget = std::min(n_batch, n_ctx - effective_max_tokens);
|
||||
prompt_budget = std::max<int32_t>(1, prompt_budget);
|
||||
|
||||
/**
|
||||
* Truncate prompt if necessary to fit within constraints
|
||||
*/
|
||||
prompt_tokens.resize(static_cast<std::size_t>(token_count));
|
||||
if (token_count > prompt_budget) {
|
||||
spdlog::warn(
|
||||
@@ -68,11 +106,21 @@ std::string LlamaGenerator::InferFormatted(const std::string& formatted_prompt,
|
||||
token_count = prompt_budget;
|
||||
}
|
||||
|
||||
/**
|
||||
* PROMPT PROCESSING PHASE
|
||||
* Create a batch containing all prompt tokens and feed through the model
|
||||
* This computes internal representations and fills the KV cache
|
||||
*/
|
||||
const llama_batch prompt_batch = llama_batch_get_one(
|
||||
prompt_tokens.data(), static_cast<int32_t>(prompt_tokens.size()));
|
||||
if (llama_decode(context_, prompt_batch) != 0)
|
||||
throw std::runtime_error("LlamaGenerator: prompt decode failed");
|
||||
|
||||
/**
|
||||
* SAMPLER CONFIGURATION PHASE
|
||||
* Set up the probabilistic token selection pipeline (sampler chain)
|
||||
* Samplers are applied in sequence: temperature -> top-p -> distribution
|
||||
*/
|
||||
llama_sampler_chain_params sampler_params =
|
||||
llama_sampler_chain_default_params();
|
||||
using SamplerPtr =
|
||||
@@ -82,21 +130,48 @@ std::string LlamaGenerator::InferFormatted(const std::string& formatted_prompt,
|
||||
if (!sampler)
|
||||
throw std::runtime_error("LlamaGenerator: failed to initialize sampler");
|
||||
|
||||
/**
|
||||
* Temperature: scales logits before softmax (controls randomness)
|
||||
*/
|
||||
llama_sampler_chain_add(sampler.get(),
|
||||
llama_sampler_init_temp(sampling_temperature_));
|
||||
/**
|
||||
* Top-P: nucleus sampling - filters to most likely tokens summing to top_p
|
||||
* probability
|
||||
*/
|
||||
llama_sampler_chain_add(sampler.get(),
|
||||
llama_sampler_init_top_p(sampling_top_p_, 1));
|
||||
/**
|
||||
* Distribution sampler: selects actual token using configured seed for
|
||||
* reproducibility
|
||||
*/
|
||||
llama_sampler_chain_add(sampler.get(),
|
||||
llama_sampler_init_dist(sampling_seed_));
|
||||
|
||||
/**
|
||||
* TOKEN GENERATION LOOP
|
||||
* Iteratively generate tokens one at a time until max_tokens or
|
||||
* end-of-sequence
|
||||
*/
|
||||
std::vector<llama_token> generated_tokens;
|
||||
generated_tokens.reserve(static_cast<std::size_t>(effective_max_tokens));
|
||||
|
||||
for (int i = 0; i < effective_max_tokens; ++i) {
|
||||
/**
|
||||
* Sample next token using configured sampler chain and model logits
|
||||
* Index -1 means use the last output position from previous batch
|
||||
*/
|
||||
const llama_token next =
|
||||
llama_sampler_sample(sampler.get(), context_, -1);
|
||||
/**
|
||||
* Stop if model predicts end-of-generation token (EOS/EOT)
|
||||
*/
|
||||
if (llama_vocab_is_eog(vocab, next)) break;
|
||||
generated_tokens.push_back(next);
|
||||
/**
|
||||
* Feed the sampled token back into model for next iteration
|
||||
* (autoregressive)
|
||||
*/
|
||||
llama_token token = next;
|
||||
const llama_batch one_token_batch = llama_batch_get_one(&token, 1);
|
||||
if (llama_decode(context_, one_token_batch) != 0)
|
||||
@@ -104,8 +179,18 @@ std::string LlamaGenerator::InferFormatted(const std::string& formatted_prompt,
|
||||
"LlamaGenerator: decode failed during generation");
|
||||
}
|
||||
|
||||
/**
|
||||
* DETOKENIZATION PHASE
|
||||
* Convert generated token IDs back to text using vocabulary
|
||||
*/
|
||||
std::string output;
|
||||
for (const llama_token token : generated_tokens)
|
||||
AppendTokenPiecePublic(vocab, token, output);
|
||||
|
||||
/**
|
||||
* Advance seed for next generation to improve output diversity
|
||||
*/
|
||||
sampling_seed_ = (sampling_seed_ == 0xFFFFFFFFu) ? 0 : sampling_seed_ + 1;
|
||||
|
||||
return output;
|
||||
}
|
||||
|
||||
@@ -1,3 +1,10 @@
|
||||
/**
|
||||
* Model Loading Module
|
||||
* This module handles loading a pre-trained LLM model from disk and
|
||||
* initializing the llama.cpp context for inference. It performs one-time setup
|
||||
* required before any inference operations can be performed.
|
||||
*/
|
||||
|
||||
#include <spdlog/spdlog.h>
|
||||
|
||||
#include <stdexcept>
|
||||
@@ -7,6 +14,9 @@
|
||||
#include "llama.h"
|
||||
|
||||
void LlamaGenerator::Load(const std::string& model_path) {
|
||||
/**
|
||||
* Validate input and clean up any previously loaded model/context
|
||||
*/
|
||||
if (model_path.empty())
|
||||
throw std::runtime_error("LlamaGenerator: model path must not be empty");
|
||||
|
||||
@@ -19,6 +29,9 @@ void LlamaGenerator::Load(const std::string& model_path) {
|
||||
model_ = nullptr;
|
||||
}
|
||||
|
||||
/**
|
||||
* Initialize the llama backend (one-time setup for GPU/CPU acceleration)
|
||||
*/
|
||||
llama_backend_init();
|
||||
|
||||
llama_model_params model_params = llama_model_default_params();
|
||||
@@ -29,7 +42,8 @@ void LlamaGenerator::Load(const std::string& model_path) {
|
||||
}
|
||||
|
||||
llama_context_params context_params = llama_context_default_params();
|
||||
context_params.n_ctx = 2048;
|
||||
context_params.n_ctx = n_ctx_;
|
||||
context_params.n_batch = n_ctx_; // Set batch size equal to context window
|
||||
|
||||
context_ = llama_init_from_model(model_, context_params);
|
||||
if (context_ == nullptr) {
|
||||
|
||||
74
pipeline/src/data_generation/llama/load_brewery_prompt.cpp
Normal file
74
pipeline/src/data_generation/llama/load_brewery_prompt.cpp
Normal file
@@ -0,0 +1,74 @@
|
||||
#include <fstream>
|
||||
#include <filesystem>
|
||||
#include <spdlog/spdlog.h>
|
||||
|
||||
#include "data_generation/llama_generator.h"
|
||||
|
||||
namespace fs = std::filesystem;
|
||||
|
||||
std::string LlamaGenerator::LoadBrewerySystemPrompt(
|
||||
const std::string& prompt_file_path) {
|
||||
// Return cached version if already loaded
|
||||
if (!brewery_system_prompt_.empty()) {
|
||||
return brewery_system_prompt_;
|
||||
}
|
||||
|
||||
// Try multiple path locations
|
||||
std::vector<std::string> paths_to_try = {
|
||||
prompt_file_path, // As provided
|
||||
"../" + prompt_file_path, // One level up
|
||||
"../../" + prompt_file_path, // Two levels up
|
||||
};
|
||||
|
||||
for (const auto& path : paths_to_try) {
|
||||
std::ifstream prompt_file(path);
|
||||
if (prompt_file.is_open()) {
|
||||
std::string prompt((std::istreambuf_iterator<char>(prompt_file)),
|
||||
std::istreambuf_iterator<char>());
|
||||
prompt_file.close();
|
||||
|
||||
if (!prompt.empty()) {
|
||||
spdlog::info(
|
||||
"LlamaGenerator: Loaded brewery system prompt from '{}' ({} chars)",
|
||||
path, prompt.length());
|
||||
brewery_system_prompt_ = prompt;
|
||||
return brewery_system_prompt_;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
spdlog::warn(
|
||||
"LlamaGenerator: Could not open brewery system prompt file at any of the "
|
||||
"expected locations. Using fallback inline prompt.");
|
||||
return GetFallbackBreweryPrompt();
|
||||
}
|
||||
|
||||
// Fallback: minimal inline prompt if file fails to load
|
||||
std::string LlamaGenerator::GetFallbackBreweryPrompt() {
|
||||
return "You are an experienced brewmaster and owner of a local craft brewery. "
|
||||
"Create a distinctive, authentic name and detailed description that "
|
||||
"genuinely reflects your specific location, brewing philosophy, local "
|
||||
"culture, and community connection. The brewery must feel real and "
|
||||
"grounded—not generic or interchangeable.\n\n"
|
||||
"AVOID REPETITIVE PHRASES - Never use:\n"
|
||||
"Love letter to, tribute to, rolling hills, picturesque, every sip "
|
||||
"tells a story, Come for X stay for Y, rich history, passion, woven "
|
||||
"into, ancient roots, timeless, where tradition meets innovation\n\n"
|
||||
"OPENING APPROACHES - Choose ONE:\n"
|
||||
"1. Start with specific beer style and its regional origins\n"
|
||||
"2. Begin with specific brewing challenge (water, altitude, climate)\n"
|
||||
"3. Open with founding story or personal motivation\n"
|
||||
"4. Lead with specific local ingredient or resource\n"
|
||||
"5. Start with unexpected angle or contradiction\n"
|
||||
"6. Open with local event, tradition, or cultural moment\n"
|
||||
"7. Begin with tangible architectural or geographic detail\n\n"
|
||||
"BE SPECIFIC - Include:\n"
|
||||
"- At least ONE concrete proper noun (landmark, river, neighborhood)\n"
|
||||
"- Specific beer styles relevant to the REGION'S culture\n"
|
||||
"- Concrete brewing challenges or advantages\n"
|
||||
"- Sensory details SPECIFIC to place—not generic adjectives\n\n"
|
||||
"LENGTH: 150-250 words. TONE: Can be soulful, irreverent, "
|
||||
"matter-of-fact, unpretentious, or minimalist.\n\n"
|
||||
"Output ONLY a raw JSON object with keys name and description. "
|
||||
"No markdown, backticks, preamble, or trailing text.";
|
||||
}
|
||||
@@ -1,3 +1,10 @@
|
||||
/**
|
||||
* Sampling Configuration Module
|
||||
* Configures the hyperparameters that control probabilistic token selection
|
||||
* during text generation. These settings affect the randomness, diversity, and
|
||||
* quality of generated output.
|
||||
*/
|
||||
|
||||
#include <stdexcept>
|
||||
|
||||
#include "data_generation/llama_generator.h"
|
||||
@@ -5,21 +12,54 @@
|
||||
|
||||
/**
 * Validates and stores the sampling parameters used during token generation.
 *
 * @param temperature Controls randomness of the output distribution; must be
 *                    >= 0 (0.0 = always pick the highest-probability token,
 *                    higher values = more random/diverse output). NaN is
 *                    rejected.
 * @param top_p       Nucleus sampling threshold; must be in (0, 1]. E.g.
 *                    0.9 samples from the tokens making up 90% of the
 *                    probability mass. NaN is rejected.
 * @param seed        Sampler seed; must be >= 0, or -1, which maps to
 *                    LLAMA_DEFAULT_SEED.
 * @throws std::runtime_error if any argument is outside its valid range.
 */
void LlamaGenerator::SetSamplingOptions(float temperature, float top_p,
                                        int seed) {
  /**
   * Validate temperature. The check is written as a negated ">=" (matching
   * the top-p check below) so that NaN, which fails every ordered
   * comparison, is rejected instead of slipping through a "< 0" test.
   */
  if (!(temperature >= 0.0f)) {
    throw std::runtime_error(
        "LlamaGenerator: sampling temperature must be >= 0");
  }

  /**
   * Validate top-p (nucleus sampling): must lie in the half-open range (0, 1]
   */
  if (!(top_p > 0.0f && top_p <= 1.0f)) {
    throw std::runtime_error(
        "LlamaGenerator: sampling top-p must be in (0, 1]");
  }

  /**
   * Validate seed: -1 is the only negative value accepted
   */
  if (seed < -1) {
    throw std::runtime_error(
        "LlamaGenerator: seed must be >= 0, or -1 for random");
  }

  /**
   * Store sampling parameters for use during token generation
   */
  sampling_temperature_ = temperature;
  sampling_top_p_ = top_p;
  sampling_seed_ = (seed < 0) ? static_cast<uint32_t>(LLAMA_DEFAULT_SEED)
                              : static_cast<uint32_t>(seed);
}
|
||||
|
||||
/**
 * Validates and records the context window size (in tokens).
 *
 * The value is only stored here in `n_ctx_`.
 *
 * @param n_ctx Requested context size; must be in the range [1, 32768].
 * @throws std::runtime_error if n_ctx is outside that range.
 */
void LlamaGenerator::SetContextSize(uint32_t n_ctx) {
  constexpr uint32_t kMaxContextTokens = 32768;

  const bool within_limits = (n_ctx >= 1) && (n_ctx <= kMaxContextTokens);
  if (!within_limits) {
    throw std::runtime_error(
        "LlamaGenerator: context size must be in range [1, 32768]");
  }

  n_ctx_ = n_ctx;
}
|
||||
|
||||
@@ -80,6 +80,16 @@ void SqliteDatabase::CommitTransaction() {
|
||||
}
|
||||
}
|
||||
|
||||
void SqliteDatabase::RollbackTransaction() {
|
||||
std::lock_guard<std::mutex> lock(db_mutex_);
|
||||
char* err = nullptr;
|
||||
if (sqlite3_exec(db_, "ROLLBACK", nullptr, nullptr, &err) != SQLITE_OK) {
|
||||
std::string msg = err ? err : "unknown";
|
||||
sqlite3_free(err);
|
||||
throw std::runtime_error("RollbackTransaction failed: " + msg);
|
||||
}
|
||||
}
|
||||
|
||||
void SqliteDatabase::InsertCountry(int id, const std::string& name,
|
||||
const std::string& iso2,
|
||||
const std::string& iso3) {
|
||||
@@ -96,9 +106,9 @@ void SqliteDatabase::InsertCountry(int id, const std::string& name,
|
||||
throw std::runtime_error("Failed to prepare country insert");
|
||||
|
||||
sqlite3_bind_int(stmt, 1, id);
|
||||
sqlite3_bind_text(stmt, 2, name.c_str(), -1, SQLITE_STATIC);
|
||||
sqlite3_bind_text(stmt, 3, iso2.c_str(), -1, SQLITE_STATIC);
|
||||
sqlite3_bind_text(stmt, 4, iso3.c_str(), -1, SQLITE_STATIC);
|
||||
sqlite3_bind_text(stmt, 2, name.c_str(), -1, SQLITE_TRANSIENT);
|
||||
sqlite3_bind_text(stmt, 3, iso2.c_str(), -1, SQLITE_TRANSIENT);
|
||||
sqlite3_bind_text(stmt, 4, iso3.c_str(), -1, SQLITE_TRANSIENT);
|
||||
|
||||
if (sqlite3_step(stmt) != SQLITE_DONE) {
|
||||
throw std::runtime_error("Failed to insert country");
|
||||
@@ -123,8 +133,8 @@ void SqliteDatabase::InsertState(int id, int country_id,
|
||||
|
||||
sqlite3_bind_int(stmt, 1, id);
|
||||
sqlite3_bind_int(stmt, 2, country_id);
|
||||
sqlite3_bind_text(stmt, 3, name.c_str(), -1, SQLITE_STATIC);
|
||||
sqlite3_bind_text(stmt, 4, iso2.c_str(), -1, SQLITE_STATIC);
|
||||
sqlite3_bind_text(stmt, 3, name.c_str(), -1, SQLITE_TRANSIENT);
|
||||
sqlite3_bind_text(stmt, 4, iso2.c_str(), -1, SQLITE_TRANSIENT);
|
||||
|
||||
if (sqlite3_step(stmt) != SQLITE_DONE) {
|
||||
throw std::runtime_error("Failed to insert state");
|
||||
@@ -150,7 +160,7 @@ void SqliteDatabase::InsertCity(int id, int state_id, int country_id,
|
||||
sqlite3_bind_int(stmt, 1, id);
|
||||
sqlite3_bind_int(stmt, 2, state_id);
|
||||
sqlite3_bind_int(stmt, 3, country_id);
|
||||
sqlite3_bind_text(stmt, 4, name.c_str(), -1, SQLITE_STATIC);
|
||||
sqlite3_bind_text(stmt, 4, name.c_str(), -1, SQLITE_TRANSIENT);
|
||||
sqlite3_bind_double(stmt, 5, latitude);
|
||||
sqlite3_bind_double(stmt, 6, longitude);
|
||||
|
||||
@@ -165,7 +175,8 @@ std::vector<City> SqliteDatabase::QueryCities() {
|
||||
std::vector<City> cities;
|
||||
sqlite3_stmt* stmt = nullptr;
|
||||
|
||||
const char* query = "SELECT id, name, country_id FROM cities ORDER BY name";
|
||||
const char* query =
|
||||
"SELECT id, name, country_id FROM cities ORDER BY RANDOM()";
|
||||
int rc = sqlite3_prepare_v2(db_, query, -1, &stmt, nullptr);
|
||||
|
||||
if (rc != SQLITE_OK) {
|
||||
|
||||
@@ -11,7 +11,7 @@ void JsonLoader::LoadWorldCities(const std::string& json_path,
|
||||
constexpr size_t kBatchSize = 10000;
|
||||
|
||||
auto startTime = std::chrono::high_resolution_clock::now();
|
||||
spdlog::info("\nLoading {} (streaming RapidJSON SAX)...", json_path);
|
||||
spdlog::info("\nLoading {} (streaming Boost.JSON SAX)...", json_path);
|
||||
|
||||
db.BeginTransaction();
|
||||
bool transactionOpen = true;
|
||||
@@ -44,7 +44,8 @@ void JsonLoader::LoadWorldCities(const std::string& json_path,
|
||||
}
|
||||
} catch (...) {
|
||||
if (transactionOpen) {
|
||||
db.CommitTransaction();
|
||||
db.RollbackTransaction();
|
||||
transactionOpen = false;
|
||||
}
|
||||
throw;
|
||||
}
|
||||
|
||||
@@ -1,12 +1,12 @@
|
||||
#include <spdlog/spdlog.h>
|
||||
|
||||
#include <boost/program_options.hpp>
|
||||
#include <iostream>
|
||||
#include <memory>
|
||||
|
||||
#include <boost/program_options.hpp>
|
||||
#include <spdlog/spdlog.h>
|
||||
|
||||
#include "biergarten_data_generator.h"
|
||||
#include "web_client/curl_web_client.h"
|
||||
#include "database/database.h"
|
||||
#include "web_client/curl_web_client.h"
|
||||
|
||||
namespace po = boost::program_options;
|
||||
|
||||
@@ -18,21 +18,32 @@ namespace po = boost::program_options;
|
||||
* @param options Output ApplicationOptions struct.
|
||||
* @return true if parsing succeeded and should proceed, false otherwise.
|
||||
*/
|
||||
bool ParseArguments(int argc, char **argv, ApplicationOptions &options) {
|
||||
bool ParseArguments(int argc, char** argv, ApplicationOptions& options) {
|
||||
// If no arguments provided, display usage and exit
|
||||
if (argc == 1) {
|
||||
std::cout << "Biergarten Pipeline - Geographic Data Pipeline with Brewery Generation\n\n";
|
||||
std::cout << "Biergarten Pipeline - Geographic Data Pipeline with "
|
||||
"Brewery Generation\n\n";
|
||||
std::cout << "Usage: biergarten-pipeline [options]\n\n";
|
||||
std::cout << "Options:\n";
|
||||
std::cout << " --mocked Use mocked generator for brewery/user data\n";
|
||||
std::cout << " --model, -m PATH Path to LLM model file (gguf) for generation\n";
|
||||
std::cout << " --cache-dir, -c DIR Directory for cached JSON (default: /tmp)\n";
|
||||
std::cout << " --temperature TEMP LLM sampling temperature 0.0-1.0 (default: 0.8)\n";
|
||||
std::cout << " --top-p VALUE Nucleus sampling parameter 0.0-1.0 (default: 0.92)\n";
|
||||
std::cout << " --seed SEED Random seed: -1 for random (default: -1)\n";
|
||||
std::cout << " --mocked Use mocked generator for "
|
||||
"brewery/user data\n";
|
||||
std::cout << " --model, -m PATH Path to LLM model file (gguf) for "
|
||||
"generation\n";
|
||||
std::cout << " --cache-dir, -c DIR Directory for cached JSON (default: "
|
||||
"/tmp)\n";
|
||||
std::cout << " --temperature TEMP LLM sampling temperature 0.0-1.0 "
|
||||
"(default: 0.8)\n";
|
||||
std::cout << " --top-p VALUE Nucleus sampling parameter 0.0-1.0 "
|
||||
"(default: 0.92)\n";
|
||||
std::cout << " --n-ctx SIZE Context window size in tokens "
|
||||
"(default: 4096)\n";
|
||||
std::cout << " --seed SEED Random seed: -1 for random "
|
||||
"(default: -1)\n";
|
||||
std::cout << " --help, -h Show this help message\n\n";
|
||||
std::cout << "Note: --mocked and --model are mutually exclusive. Exactly one must be provided.\n";
|
||||
std::cout << "Data source is always pinned to commit c5eb7772 (stable 2026-03-28).\n";
|
||||
std::cout << "Note: --mocked and --model are mutually exclusive. Exactly "
|
||||
"one must be provided.\n";
|
||||
std::cout << "Data source is always pinned to commit c5eb7772 (stable "
|
||||
"2026-03-28).\n";
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -48,6 +59,8 @@ bool ParseArguments(int argc, char **argv, ApplicationOptions &options) {
|
||||
"Sampling temperature (higher = more random)")(
|
||||
"top-p", po::value<float>()->default_value(0.92f),
|
||||
"Nucleus sampling top-p in (0,1] (higher = more random)")(
|
||||
"n-ctx", po::value<uint32_t>()->default_value(8192),
|
||||
"Context window size in tokens (1-32768)")(
|
||||
"seed", po::value<int>()->default_value(-1),
|
||||
"Sampler seed: -1 for random, otherwise non-negative integer");
|
||||
|
||||
@@ -81,7 +94,9 @@ bool ParseArguments(int argc, char **argv, ApplicationOptions &options) {
|
||||
bool hasSeed = vm["seed"].defaulted() == false;
|
||||
|
||||
if (hasTemperature || hasTopP || hasSeed) {
|
||||
spdlog::warn("WARNING: Sampling parameters (--temperature, --top-p, --seed) are ignored when using --mocked");
|
||||
spdlog::warn(
|
||||
"WARNING: Sampling parameters (--temperature, --top-p, --seed) "
|
||||
"are ignored when using --mocked");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -90,13 +105,14 @@ bool ParseArguments(int argc, char **argv, ApplicationOptions &options) {
|
||||
options.cache_dir = vm["cache-dir"].as<std::string>();
|
||||
options.temperature = vm["temperature"].as<float>();
|
||||
options.top_p = vm["top-p"].as<float>();
|
||||
options.n_ctx = vm["n-ctx"].as<uint32_t>();
|
||||
options.seed = vm["seed"].as<int>();
|
||||
// commit is always pinned to c5eb7772
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
int main(int argc, char* argv[]) {
|
||||
try {
|
||||
const CurlGlobalState curl_state;
|
||||
|
||||
@@ -111,7 +127,7 @@ int main(int argc, char *argv[]) {
|
||||
BiergartenDataGenerator generator(options, webClient, database);
|
||||
return generator.Run();
|
||||
|
||||
} catch (const std::exception &e) {
|
||||
} catch (const std::exception& e) {
|
||||
spdlog::error("ERROR: Application failed: {}", e.what());
|
||||
return 1;
|
||||
}
|
||||
|
||||
@@ -11,7 +11,7 @@ std::string WikipediaService::FetchExtract(std::string_view query) {
|
||||
const std::string encoded = client_->UrlEncode(std::string(query));
|
||||
const std::string url =
|
||||
"https://en.wikipedia.org/w/api.php?action=query&titles=" + encoded +
|
||||
"&prop=extracts&explaintext=true&format=json";
|
||||
"&prop=extracts&explaintext=1&format=json";
|
||||
|
||||
const std::string body = client_->Get(url);
|
||||
|
||||
@@ -19,6 +19,7 @@ std::string WikipediaService::FetchExtract(std::string_view query) {
|
||||
boost::json::value doc = boost::json::parse(body, ec);
|
||||
|
||||
if (!ec && doc.is_object()) {
|
||||
try {
|
||||
auto& pages = doc.at("query").at("pages").get_object();
|
||||
if (!pages.empty()) {
|
||||
auto& page = pages.begin()->value().get_object();
|
||||
@@ -29,6 +30,16 @@ std::string WikipediaService::FetchExtract(std::string_view query) {
|
||||
return extract;
|
||||
}
|
||||
}
|
||||
} catch (const std::exception& e) {
|
||||
spdlog::warn(
|
||||
"WikipediaService: failed to parse response structure for '{}': "
|
||||
"{}",
|
||||
query, e.what());
|
||||
return {};
|
||||
}
|
||||
} else if (ec) {
|
||||
spdlog::warn("WikipediaService: JSON parse error for '{}': {}", query,
|
||||
ec.message());
|
||||
}
|
||||
|
||||
return {};
|
||||
@@ -55,7 +66,7 @@ std::string WikipediaService::GetSummary(std::string_view city,
|
||||
regionQuery += country;
|
||||
}
|
||||
|
||||
const std::string beerQuery = "beer in " + std::string(city);
|
||||
const std::string beerQuery = "beer in " + std::string(country);
|
||||
|
||||
try {
|
||||
const std::string regionExtract = FetchExtract(regionQuery);
|
||||
|
||||
Reference in New Issue
Block a user