updates for gemma-4-E4B-it-Q6_K.gguf

This commit is contained in:
Aaron Po
2026-04-09 23:59:38 -04:00
parent b53f9e5582
commit 7ca651a886
4 changed files with 88 additions and 230 deletions

View File

@@ -171,76 +171,68 @@ static std::pair<std::string, std::string> ParseTwoLineResponse(
if (first.empty() || second.empty()) throw std::runtime_error(error_message);
return {first, second};
}
/**
* Apply model's chat template to user-only prompt, formatting it for the model
*/
static std::string ToChatPrompt(const llama_model* model,
const std::string& user_prompt) {
const char* tmpl = llama_model_chat_template(model, nullptr);
if (tmpl == nullptr) {
return user_prompt;
}
const llama_chat_message message{"user", user_prompt.c_str()};
std::vector<char> buffer(
std::max<std::size_t>(1024, user_prompt.size() * 4));
int32_t required =
llama_chat_apply_template(tmpl, &message, 1, true, buffer.data(),
static_cast<int32_t>(buffer.size()));
if (required < 0) {
throw std::runtime_error("LlamaGenerator: failed to apply chat template");
}
if (required >= static_cast<int32_t>(buffer.size())) {
buffer.resize(static_cast<std::size_t>(required) + 1);
required =
llama_chat_apply_template(tmpl, &message, 1, true, buffer.data(),
static_cast<int32_t>(buffer.size()));
if (required < 0) {
throw std::runtime_error(
"LlamaGenerator: failed to apply chat template");
}
}
return std::string(buffer.data(), static_cast<std::size_t>(required));
}
/**
* Apply model's chat template to system+user prompt pair, formatting for the
* model
*/
static std::string ToChatPrompt(const llama_model* model,
const std::string& system_prompt,
const std::string& user_prompt) {
std::string ToChatPrompt(const llama_model* model,
const std::string& system_prompt,
const std::string& user_prompt) {
const char* tmpl = llama_model_chat_template(model, nullptr);
if (tmpl == nullptr) {
// No template found, fallback to raw text
return system_prompt + "\n\n" + user_prompt;
}
const llama_chat_message messages[2] = {{"system", system_prompt.c_str()},
{"user", user_prompt.c_str()}};
const std::array<llama_chat_message, 2> messages = {
{{"system", system_prompt.c_str()}, {"user", user_prompt.c_str()}}};
std::vector<char> buffer(std::max<std::size_t>(
1024, (system_prompt.size() + user_prompt.size()) * 4));
int32_t required =
llama_chat_apply_template(tmpl, messages, 2, true, buffer.data(),
llama_chat_apply_template(tmpl, messages.data(), 2, true, buffer.data(),
static_cast<int32_t>(buffer.size()));
// FALLBACK: If the template fails (e.g., Gemma rejecting the "system" role),
// combine the system and user prompts into a single "user" message.
if (required < 0) {
throw std::runtime_error("LlamaGenerator: failed to apply chat template");
std::string combined_prompt = system_prompt + "\n\n" + user_prompt;
const std::array<llama_chat_message, 1> fallback_msg = {
{{"user", combined_prompt.c_str()}}};
required = llama_chat_apply_template(tmpl, fallback_msg.data(), 1, true,
buffer.data(),
static_cast<int32_t>(buffer.size()));
// THE FIX: Ultimate fallback. If the GGUF's internal template is
// completely unparseable (which happens with complex Jinja macros),
// degrade gracefully to raw text instead of throwing a runtime_error.
if (required < 0) {
return combined_prompt;
}
if (required >= static_cast<int32_t>(buffer.size())) {
buffer.resize(static_cast<std::size_t>(required) + 1);
required = llama_chat_apply_template(
tmpl, fallback_msg.data(), 1, true, buffer.data(),
static_cast<int32_t>(buffer.size()));
if (required < 0) {
return combined_prompt;
}
}
return std::string(buffer.data(), static_cast<std::size_t>(required));
}
// Standard buffer resize if the original "system" + "user" array succeeded
// but needed more space
if (required >= static_cast<int32_t>(buffer.size())) {
buffer.resize(static_cast<std::size_t>(required) + 1);
required =
llama_chat_apply_template(tmpl, messages, 2, true, buffer.data(),
static_cast<int32_t>(buffer.size()));
required = llama_chat_apply_template(tmpl, messages.data(), 2, true,
buffer.data(),
static_cast<int32_t>(buffer.size()));
// Final safety net on resize
if (required < 0) {
throw std::runtime_error(
"LlamaGenerator: failed to apply chat template");
return system_prompt + "\n\n" + user_prompt;
}
}
@@ -416,7 +408,7 @@ std::pair<std::string, std::string> ParseTwoLineResponsePublic(
std::string ToChatPromptPublic(const llama_model* model,
const std::string& user_prompt) {
return ToChatPrompt(model, user_prompt);
return ToChatPrompt(model, user_prompt, "");
}
std::string ToChatPromptPublic(const llama_model* model,

View File

@@ -32,7 +32,7 @@ void LlamaGenerator::Load(const std::string& model_path) {
llama_context_params context_params = llama_context_default_params();
context_params.n_ctx = n_ctx_;
context_params.n_batch = std::min(n_ctx_, static_cast<uint32_t>(512));
context_params.n_batch = std::min(n_ctx_, static_cast<uint32_t>(5000));
context_ = llama_init_from_model(model_, context_params);
if (context_ == nullptr) {