Skip to content

Commit 07c6e7a

Browse files
committed
corefud.Link2Cluster can convert bridging, added params and documentation
1 parent 987ef5b commit 07c6e7a

File tree

1 file changed

+85
-12
lines changed

1 file changed

+85
-12
lines changed

udapi/block/corefud/link2cluster.py

Lines changed: 85 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,17 +2,66 @@
22
from udapi.core.block import Block
33

44
class Link2Cluster(Block):
5-
"""Block corefud.Link2Cluster converts link-based coreference annotation to the (cluster-based) CorefUD format."""
5+
"""Block corefud.Link2Cluster converts link-based coreference annotation to the (cluster-based) CorefUD format.
66
7-
def __init__(self, id_attr='proiel-id', ante_attr='antecedent-proiel-id', delete_orig_attrs=True, **kwargs):
7+
Params:
8+
id_attr: name of the attribute in MISC that stores the original-format IDs of nodes
9+
ante_attr: name of the attribute in MISC that stores the ID of the antecedent
10+
of the current node (in the same format as `id_attr`).
11+
delete_orig_attrs: Should we delete the MISC attributes that were used for the conversion?
12+
(i.e. id_attr and ante_attr, plus possibly also infstat_attr, coreftype_attr,
13+
bridge_attr, bridge_relation_attr if these are used). Default=True.
14+
infstat_attr: name of the attribute in MISC that stores the information status of a given mention.
15+
Will be stored in `mention.other['infstat']`. Use None for ignoring this.
16+
coreftype_attr: name of the attribute in MISC that stores the coreference type of a given mention.
17+
Will be stored in `mention.other['coreftype']`. Use None for ignoring this.
18+
bridge_attr: name of the attribute in MISC that stores the ID of the bridging antecedent
19+
of the current node/mention (in the same format as `id_attr`).
20+
Default=None, i.e. ignore this parameter.
21+
bridge_relation_attr: name of the attribute in MISC that stores the bridging relation type
22+
(e.g. "part" or "subset"). Default=None, i.e. ignore this parameter.
23+
eid_counter: use a global counter of entity.eid and start with a given number. Default=1.
24+
The main goal of this parameter is to make eid unique across multiple documents.
25+
If you use eid_counter=0, this feature will be turned off,
26+
so entities will be created using `root.document.create_coref_entity()`,
27+
with no eid parameter, so that the eid will start from "e1" in each document processed by this block.
28+
"""
29+
def __init__(self, id_attr='proiel-id', ante_attr='antecedent-proiel-id', delete_orig_attrs=True,
             infstat_attr='information-status', coreftype_attr='coreftype',
             bridge_attr=None, bridge_relation_attr=None, eid_counter=1, **kwargs):
    """Remember the conversion settings on the block instance.

    Every named parameter is stored under an attribute of the same name;
    `eid_counter` is converted with `int()` first. Any additional keyword
    arguments are forwarded unchanged to the parent-class constructor.
    """
    super().__init__(**kwargs)

    # MISC attribute names that carry the coreference links themselves.
    self.id_attr, self.ante_attr = id_attr, ante_attr
    # Whether the MISC attributes consumed by the conversion get deleted.
    self.delete_orig_attrs = delete_orig_attrs

    # Optional per-mention MISC attributes.
    self.infstat_attr, self.coreftype_attr = infstat_attr, coreftype_attr

    # Optional MISC attributes describing bridging links (both default to None).
    self.bridge_attr, self.bridge_relation_attr = bridge_attr, bridge_relation_attr

    # Stored as an integer; _new_entity treats a falsy value as "no explicit eid".
    self.eid_counter = int(eid_counter)
41+
42+
def _new_entity(self, doc):
    """Create a coreference entity in `doc` and return it.

    When `self.eid_counter` is non-zero, the entity is created with the
    explicit eid "e<counter>" and the counter is incremented afterwards.
    When the counter is zero, `doc.create_coref_entity()` is called with
    no eid argument and the counter is left untouched.
    """
    if self.eid_counter:
        eid = f"e{self.eid_counter}"
        new_entity = doc.create_coref_entity(eid=eid)
        # Advance only after the entity was created successfully.
        self.eid_counter += 1
        return new_entity
    return doc.create_coref_entity()
48+
49+
def _new_mention(self, entity, node):
    """Create a single-word mention of `entity` headed by `node` and return it.

    For each of the configured MISC attributes (`self.infstat_attr`, then
    `self.coreftype_attr`), a non-empty value found in `node.misc` is copied
    to `mention.other` under the key 'infstat' or 'coreftype' respectively.
    If `self.delete_orig_attrs` is set, the copied attribute is removed
    from `node.misc`. An attribute name that is falsy (e.g. None) is skipped.
    """
    new_mention = entity.create_mention(head=node, words=[node])
    copy_plan = (
        (self.infstat_attr, 'infstat'),
        (self.coreftype_attr, 'coreftype'),
    )
    for misc_attr, other_key in copy_plan:
        if not misc_attr:
            continue
        value = node.misc[misc_attr]
        if not value:
            continue
        new_mention.other[other_key] = value
        if self.delete_orig_attrs:
            del node.misc[misc_attr]
    return new_mention
1260

1361
def process_document(self, doc):
1462
id2node = {}
1563
links = []
64+
bridges = []
1665
for node in doc.nodes_and_empty:
1766
this_id = node.misc[self.id_attr]
1867
if this_id != '':
@@ -26,6 +75,16 @@ def process_document(self, doc):
2675
if self.delete_orig_attrs:
2776
for attr in (self.id_attr, self.ante_attr):
2877
del node.misc[attr]
78+
if self.bridge_attr:
79+
bridge_id = node.misc[self.bridge_attr]
80+
if bridge_id != '':
81+
if bridge_id == this_id:
82+
logging.warning(f"{node} has a self-reference bridging {self.bridge_attr}={bridge_id}")
83+
else:
84+
bridges.append([bridge_id, this_id, node.misc[self.bridge_relation_attr]])
85+
if self.delete_orig_attrs:
86+
for attr in (self.bridge_attr, self.bridge_relation_attr):
87+
del node.misc[attr]
2988

3089
# It seems faster&simpler to process the links in any order and implement entity merging,
3190
# rather than trying to sort the links so that no entity merging is needed.
@@ -36,14 +95,9 @@ def process_document(self, doc):
3695
ante_node, this_node = id2node[ante_id], id2node[this_id]
3796
if not this_node.coref_mentions and not ante_node.coref_mentions:
3897
# None of the nodes is part of any mention/entity. Let's create them.
39-
entity = this_node.root.document.create_coref_entity()
40-
m_ante = entity.create_mention(head=ante_node, words=[ante_node])
41-
m_this = entity.create_mention(head=this_node, words=[this_node])
42-
for node, mention in ((ante_node, m_ante), (this_node, m_this)):
43-
if node.misc['information-status']:
44-
mention.other['infstat'] = node.misc['information-status']
45-
if self.delete_orig_attrs:
46-
del node.misc['information-status']
98+
entity = self._new_entity(this_node.root.document)
99+
self._new_mention(entity, ante_node)
100+
self._new_mention(entity, this_node)
47101
elif this_node.coref_mentions and ante_node.coref_mentions:
48102
# Both of the nodes are part of mentions in different entities.
49103
# Let's merge the two entities (i.e. "steal" all mentions from the "ante" entity to "this" entity).
@@ -59,6 +113,25 @@ def process_document(self, doc):
59113
else:
60114
# Only one of the nodes is part of an entity. Let's add the second one to this entity.
61115
if ante_node.coref_mentions:
62-
ante_node.coref_entities[0].create_mention(head=this_node, words=[this_node])
116+
self._new_mention(ante_node.coref_entities[0], this_node)
63117
else:
64-
this_node.coref_entities[0].create_mention(head=ante_node, words=[ante_node])
118+
self._new_mention(this_node.coref_entities[0], ante_node)
119+
120+
# Bridging
121+
for ante_id, this_id, relation in bridges:
122+
if ante_id not in id2node:
123+
logging.warning(f"{ante_id} is referenced in {self.bridge_attr}, but not in {self.id_attr}")
124+
else:
125+
ante_node, this_node = id2node[ante_id], id2node[this_id]
126+
if ante_node.coref_mentions:
127+
m_ante = next(m for m in ante_node.coref_mentions if m.head is ante_node)
128+
e_ante = m_ante.entity
129+
else:
130+
e_ante = self._new_entity(ante_node.root.document)
131+
m_ante = self._new_mention(e_ante, ante_node)
132+
if this_node.coref_mentions:
133+
m_this = next(m for m in this_node.coref_mentions if m.head is this_node)
134+
else:
135+
e_this = self._new_entity(this_node.root.document)
136+
m_this = self._new_mention(e_this, this_node)
137+
m_this.bridging.append((e_ante, relation))

0 commit comments

Comments
 (0)