@inproceedings{hua-etal-2025-flaw,
title = "Flaw or Artifact? Rethinking Prompt Sensitivity in Evaluating {LLM}s",
author = "Hua, Andong and
Tang, Kenan and
Gu, Chenhe and
Gu, Jindong and
Wong, Eric and
Qin, Yao",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.emnlp-main.1006/",
doi = "10.18653/v1/2025.emnlp-main.1006",
pages = "19889--19899",
ISBN = "979-8-89176-332-6",
abstract = "Prompt sensitivity, referring to the phenomenon where paraphrasing (that is, repeating something written or spoken using different words) leads to significant changes in large language model performance, has been widely accepted as a core limitation of large language models. In this work, we revisit this issue and ask: Is the widely reported high prompt sensitivity truly an inherent weakness of large language models, or is it largely an artifact of evaluation processes? To answer this question, we systematically evaluate seven large language models (for example, the GPT and Gemini families) across six benchmarks, including both multiple-choice and open-ended tasks on twelve diverse prompt templates. We find that much of the prompt sensitivity stems from heuristic evaluation methods, including log-likelihood scoring and rigid answer matching, which often overlook semantically correct responses expressed through alternative phrasings, such as synonyms or paraphrases. When we adopt large language model as a judge evaluations, we observe a substantial reduction in performance variance and a consistently higher correlation in model rankings across prompts. Our findings suggest that modern large language models are more robust to prompt templates than previously believed, and that prompt sensitivity may be more an artifact of evaluation than a flaw in the models."
}<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="hua-etal-2025-flaw">
<titleInfo>
<title>Flaw or Artifact? Rethinking Prompt Sensitivity in Evaluating LLMs</title>
</titleInfo>
<name type="personal">
<namePart type="given">Andong</namePart>
<namePart type="family">Hua</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kenan</namePart>
<namePart type="family">Tang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Chenhe</namePart>
<namePart type="family">Gu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jindong</namePart>
<namePart type="family">Gu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Eric</namePart>
<namePart type="family">Wong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yao</namePart>
<namePart type="family">Qin</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2025-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing</title>
</titleInfo>
<name type="personal">
<namePart type="given">Christos</namePart>
<namePart type="family">Christodoulopoulos</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Tanmoy</namePart>
<namePart type="family">Chakraborty</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Carolyn</namePart>
<namePart type="family">Rose</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Violet</namePart>
<namePart type="family">Peng</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-8-89176-332-6</identifier>
</relatedItem>
<abstract>Prompt sensitivity, referring to the phenomenon where paraphrasing (that is, repeating something written or spoken using different words) leads to significant changes in large language model performance, has been widely accepted as a core limitation of large language models. In this work, we revisit this issue and ask: Is the widely reported high prompt sensitivity truly an inherent weakness of large language models, or is it largely an artifact of evaluation processes? To answer this question, we systematically evaluate seven large language models (for example, the GPT and Gemini families) across six benchmarks, including both multiple-choice and open-ended tasks on twelve diverse prompt templates. We find that much of the prompt sensitivity stems from heuristic evaluation methods, including log-likelihood scoring and rigid answer matching, which often overlook semantically correct responses expressed through alternative phrasings, such as synonyms or paraphrases. When we adopt large language model as a judge evaluations, we observe a substantial reduction in performance variance and a consistently higher correlation in model rankings across prompts. Our findings suggest that modern large language models are more robust to prompt templates than previously believed, and that prompt sensitivity may be more an artifact of evaluation than a flaw in the models.</abstract>
<identifier type="citekey">hua-etal-2025-flaw</identifier>
<identifier type="doi">10.18653/v1/2025.emnlp-main.1006</identifier>
<location>
<url>https://aclanthology.org/2025.emnlp-main.1006/</url>
</location>
<part>
<date>2025-11</date>
<extent unit="page">
<start>19889</start>
<end>19899</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Flaw or Artifact? Rethinking Prompt Sensitivity in Evaluating LLMs
%A Hua, Andong
%A Tang, Kenan
%A Gu, Chenhe
%A Gu, Jindong
%A Wong, Eric
%A Qin, Yao
%Y Christodoulopoulos, Christos
%Y Chakraborty, Tanmoy
%Y Rose, Carolyn
%Y Peng, Violet
%S Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing
%D 2025
%8 November
%I Association for Computational Linguistics
%C Suzhou, China
%@ 979-8-89176-332-6
%F hua-etal-2025-flaw
%X Prompt sensitivity, referring to the phenomenon where paraphrasing (that is, repeating something written or spoken using different words) leads to significant changes in large language model performance, has been widely accepted as a core limitation of large language models. In this work, we revisit this issue and ask: Is the widely reported high prompt sensitivity truly an inherent weakness of large language models, or is it largely an artifact of evaluation processes? To answer this question, we systematically evaluate seven large language models (for example, the GPT and Gemini families) across six benchmarks, including both multiple-choice and open-ended tasks on twelve diverse prompt templates. We find that much of the prompt sensitivity stems from heuristic evaluation methods, including log-likelihood scoring and rigid answer matching, which often overlook semantically correct responses expressed through alternative phrasings, such as synonyms or paraphrases. When we adopt large language model as a judge evaluations, we observe a substantial reduction in performance variance and a consistently higher correlation in model rankings across prompts. Our findings suggest that modern large language models are more robust to prompt templates than previously believed, and that prompt sensitivity may be more an artifact of evaluation than a flaw in the models.
%R 10.18653/v1/2025.emnlp-main.1006
%U https://aclanthology.org/2025.emnlp-main.1006/
%U https://doi.org/10.18653/v1/2025.emnlp-main.1006
%P 19889-19899
Markdown (Informal)
[Flaw or Artifact? Rethinking Prompt Sensitivity in Evaluating LLMs](https://aclanthology.org/2025.emnlp-main.1006/) (Hua et al., EMNLP 2025)
ACL