@inproceedings{popovic-2021-agree,
title = "Agree to Disagree: Analysis of Inter-Annotator Disagreements in Human Evaluation of Machine Translation Output",
author = "Popovi{\'c}, Maja",
editor = "Bisazza, Arianna and
Abend, Omri",
booktitle = "Proceedings of the 25th Conference on Computational Natural Language Learning",
month = nov,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.conll-1.18/",
doi = "10.18653/v1/2021.conll-1.18",
pages = "234--243",
abstract = "This work describes an analysis of inter-annotator disagreements in human evaluation of machine translation output. The errors in the analysed texts were marked by multiple annotators under guidance of different quality criteria: adequacy, comprehension, and an unspecified generic mixture of adequacy and fluency. Our results show that different criteria result in different disagreements, and indicate that a clear definition of quality criterion can improve the inter-annotator agreement. Furthermore, our results show that for certain linguistic phenomena which are not limited to one or two words (such as word ambiguity or gender) but span over several words or even entire phrases (such as negation or relative clause), disagreements do not necessarily represent {\textquotedblleft}errors{\textquotedblright} or {\textquotedblleft}noise{\textquotedblright} but are rather inherent to the evaluation process. {\%}These disagreements are caused by differences in error perception and/or the fact that there is no single correct translation of a text so that multiple solutions are possible. On the other hand, for some other phenomena (such as omission or verb forms) agreement can be easily improved by providing more precise and detailed instructions to the evaluators."
}
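The BibTeX record above can also be read programmatically. Below is a minimal sketch using the bibtexparser package (assuming its v1.x API; the local file name popovic-2021-agree.bib is hypothetical), shown only as one possible way to pull the fields out of the entry, not as part of the Anthology export itself.

# Minimal sketch, not part of the Anthology export: parse the BibTeX entry above.
# Assumes bibtexparser v1.x and a hypothetical local file popovic-2021-agree.bib.
import bibtexparser

with open("popovic-2021-agree.bib") as bibfile:
    database = bibtexparser.load(bibfile)    # returns a BibDatabase

entry = database.entries[0]                  # each entry becomes a plain dict
print(entry["ID"])                           # popovic-2021-agree
print(entry["title"])                        # Agree to Disagree: ...
print(entry["doi"], entry["pages"])          # 10.18653/v1/2021.conll-1.18 234--243

Note that field values keep their LaTeX escaping (e.g. {\textquotedblleft}) unless a conversion step such as bibtexparser.customization.convert_to_unicode is applied when parsing.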
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="popovic-2021-agree">
    <titleInfo>
      <title>Agree to Disagree: Analysis of Inter-Annotator Disagreements in Human Evaluation of Machine Translation Output</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Maja</namePart>
      <namePart type="family">Popović</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2021-11</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 25th Conference on Computational Natural Language Learning</title>
      </titleInfo>
      <name type="personal">
        <namePart type="given">Arianna</namePart>
        <namePart type="family">Bisazza</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <name type="personal">
        <namePart type="given">Omri</namePart>
        <namePart type="family">Abend</namePart>
        <role>
          <roleTerm authority="marcrelator" type="text">editor</roleTerm>
        </role>
      </name>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Online</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>This work describes an analysis of inter-annotator disagreements in human evaluation of machine translation output. The errors in the analysed texts were marked by multiple annotators under guidance of different quality criteria: adequacy, comprehension, and an unspecified generic mixture of adequacy and fluency. Our results show that different criteria result in different disagreements, and indicate that a clear definition of quality criterion can improve the inter-annotator agreement. Furthermore, our results show that for certain linguistic phenomena which are not limited to one or two words (such as word ambiguity or gender) but span over several words or even entire phrases (such as negation or relative clause), disagreements do not necessarily represent “errors” or “noise” but are rather inherent to the evaluation process. These disagreements are caused by differences in error perception and/or the fact that there is no single correct translation of a text so that multiple solutions are possible. On the other hand, for some other phenomena (such as omission or verb forms) agreement can be easily improved by providing more precise and detailed instructions to the evaluators.</abstract>
    <identifier type="citekey">popovic-2021-agree</identifier>
    <identifier type="doi">10.18653/v1/2021.conll-1.18</identifier>
    <location>
      <url>https://aclanthology.org/2021.conll-1.18/</url>
    </location>
    <part>
      <date>2021-11</date>
      <extent unit="page">
        <start>234</start>
        <end>243</end>
      </extent>
    </part>
  </mods>
</modsCollection>
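The MODS record above lives in the http://www.loc.gov/mods/v3 namespace, so any XPath-style lookup has to carry that namespace explicitly. A minimal sketch with Python's standard-library xml.etree.ElementTree (the file name popovic-2021-agree.xml is again a hypothetical assumption):

# Minimal sketch: pull the title, DOI and page range out of the MODS record above.
# Assumes the XML has been saved locally as popovic-2021-agree.xml.
import xml.etree.ElementTree as ET

NS = {"m": "http://www.loc.gov/mods/v3"}
root = ET.parse("popovic-2021-agree.xml").getroot()
mods = root.find("m:mods", NS)

title = mods.findtext("m:titleInfo/m:title", namespaces=NS)
doi = next(i.text for i in mods.findall("m:identifier", NS)
           if i.get("type") == "doi")
extent = mods.find("m:part/m:extent", NS)
print(title)
print(doi, extent.findtext("m:start", namespaces=NS),
      extent.findtext("m:end", namespaces=NS))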
%0 Conference Proceedings
%T Agree to Disagree: Analysis of Inter-Annotator Disagreements in Human Evaluation of Machine Translation Output
%A Popović, Maja
%Y Bisazza, Arianna
%Y Abend, Omri
%S Proceedings of the 25th Conference on Computational Natural Language Learning
%D 2021
%8 November
%I Association for Computational Linguistics
%C Online
%F popovic-2021-agree
%X This work describes an analysis of inter-annotator disagreements in human evaluation of machine translation output. The errors in the analysed texts were marked by multiple annotators under guidance of different quality criteria: adequacy, comprehension, and an unspecified generic mixture of adequacy and fluency. Our results show that different criteria result in different disagreements, and indicate that a clear definition of quality criterion can improve the inter-annotator agreement. Furthermore, our results show that for certain linguistic phenomena which are not limited to one or two words (such as word ambiguity or gender) but span over several words or even entire phrases (such as negation or relative clause), disagreements do not necessarily represent “errors” or “noise” but are rather inherent to the evaluation process. These disagreements are caused by differences in error perception and/or the fact that there is no single correct translation of a text so that multiple solutions are possible. On the other hand, for some other phenomena (such as omission or verb forms) agreement can be easily improved by providing more precise and detailed instructions to the evaluators.
%R 10.18653/v1/2021.conll-1.18
%U https://aclanthology.org/2021.conll-1.18/
%U https://doi.org/10.18653/v1/2021.conll-1.18
%P 234-243
Markdown (Informal)
[Agree to Disagree: Analysis of Inter-Annotator Disagreements in Human Evaluation of Machine Translation Output](https://aclanthology.org/2021.conll-1.18/) (Popović, CoNLL 2021)
ACL
Maja Popović. 2021. Agree to Disagree: Analysis of Inter-Annotator Disagreements in Human Evaluation of Machine Translation Output. In Proceedings of the 25th Conference on Computational Natural Language Learning, pages 234–243, Online. Association for Computational Linguistics.