@inproceedings{wu-etal-2023-scene,
title = "Scene Graph Enhanced Pseudo-Labeling for Referring Expression Comprehension",
author = "Wu, Cantao and
Cai, Yi and
Li, Liuwu and
Wang, Jiexin",
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2023",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-emnlp.802/",
doi = "10.18653/v1/2023.findings-emnlp.802",
pages = "11978--11990",
abstract = "Referring Expression Comprehension (ReC) is a task that involves localizing objects in images based on natural language expressions. Most ReC methods typically approach the task as a supervised learning problem. However, the need for costly annotations, such as clear image-text pairs or region-text pairs, hinders the scalability of existing approaches. In this work, we propose a novel scene graph-based framework that automatically generates high-quality pseudo region-query pairs. Our method harnesses scene graphs to capture the relationships between objects in images and generate expressions enriched with relation information. To ensure accurate mapping between visual regions and text, we introduce an external module that employs a calibration algorithm to filter out ambiguous queries. Additionally, we employ a rewriter module to enhance the diversity of our generated pseudo queries through rewriting. Extensive experiments demonstrate that our method outperforms previous pseudo-labeling methods by about 10{\%}, 12{\%}, and 11{\%} on RefCOCO, RefCOCO+, and RefCOCOg, respectively. Furthermore, it surpasses the state-of-the-art unsupervised approach by more than 15{\%} on the RefCOCO dataset."
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="wu-etal-2023-scene">
<titleInfo>
<title>Scene Graph Enhanced Pseudo-Labeling for Referring Expression Comprehension</title>
</titleInfo>
<name type="personal">
<namePart type="given">Cantao</namePart>
<namePart type="family">Wu</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Yi</namePart>
<namePart type="family">Cai</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Liuwu</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jiexin</namePart>
<namePart type="family">Wang</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2023-12</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2023</title>
</titleInfo>
<name type="personal">
<namePart type="given">Houda</namePart>
<namePart type="family">Bouamor</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Juan</namePart>
<namePart type="family">Pino</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kalika</namePart>
<namePart type="family">Bali</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Singapore</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Referring Expression Comprehension (ReC) is a task that involves localizing objects in images based on natural language expressions. Most ReC methods typically approach the task as a supervised learning problem. However, the need for costly annotations, such as clear image-text pairs or region-text pairs, hinders the scalability of existing approaches. In this work, we propose a novel scene graph-based framework that automatically generates high-quality pseudo region-query pairs. Our method harnesses scene graphs to capture the relationships between objects in images and generate expressions enriched with relation information. To ensure accurate mapping between visual regions and text, we introduce an external module that employs a calibration algorithm to filter out ambiguous queries. Additionally, we employ a rewriter module to enhance the diversity of our generated pseudo queries through rewriting. Extensive experiments demonstrate that our method outperforms previous pseudo-labeling methods by about 10%, 12%, and 11% on RefCOCO, RefCOCO+, and RefCOCOg, respectively. Furthermore, it surpasses the state-of-the-art unsupervised approach by more than 15% on the RefCOCO dataset.</abstract>
<identifier type="citekey">wu-etal-2023-scene</identifier>
<identifier type="doi">10.18653/v1/2023.findings-emnlp.802</identifier>
<location>
<url>https://aclanthology.org/2023.findings-emnlp.802/</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>11978</start>
<end>11990</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Scene Graph Enhanced Pseudo-Labeling for Referring Expression Comprehension
%A Wu, Cantao
%A Cai, Yi
%A Li, Liuwu
%A Wang, Jiexin
%Y Bouamor, Houda
%Y Pino, Juan
%Y Bali, Kalika
%S Findings of the Association for Computational Linguistics: EMNLP 2023
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F wu-etal-2023-scene
%X Referring Expression Comprehension (ReC) is a task that involves localizing objects in images based on natural language expressions. Most ReC methods typically approach the task as a supervised learning problem. However, the need for costly annotations, such as clear image-text pairs or region-text pairs, hinders the scalability of existing approaches. In this work, we propose a novel scene graph-based framework that automatically generates high-quality pseudo region-query pairs. Our method harnesses scene graphs to capture the relationships between objects in images and generate expressions enriched with relation information. To ensure accurate mapping between visual regions and text, we introduce an external module that employs a calibration algorithm to filter out ambiguous queries. Additionally, we employ a rewriter module to enhance the diversity of our generated pseudo queries through rewriting. Extensive experiments demonstrate that our method outperforms previous pseudo-labeling methods by about 10%, 12%, and 11% on RefCOCO, RefCOCO+, and RefCOCOg, respectively. Furthermore, it surpasses the state-of-the-art unsupervised approach by more than 15% on the RefCOCO dataset.
%R 10.18653/v1/2023.findings-emnlp.802
%U https://aclanthology.org/2023.findings-emnlp.802/
%U https://doi.org/10.18653/v1/2023.findings-emnlp.802
%P 11978-11990
Markdown (Informal)
[Scene Graph Enhanced Pseudo-Labeling for Referring Expression Comprehension](https://aclanthology.org/2023.findings-emnlp.802/) (Wu et al., Findings 2023)
ACL