Skip to content

Commit a5f5d90

Browse files
committed
WIP: corefud.Link2Cluster for converting link-based coreference annotation to CorefUD
1 parent 0269841 commit a5f5d90

File tree

1 file changed

+37
-0
lines changed

1 file changed

+37
-0
lines changed

udapi/block/corefud/link2cluster.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
from udapi.core.block import Block
2+
3+
class Link2Cluster(Block):
    """Block corefud.Link2Cluster converts link-based coreference annotation to the (cluster-based) CorefUD format.

    Params:
    id_attr: name of the MISC attribute that stores the ID of a node
    ante_attr: name of the MISC attribute that stores the ID of the node's antecedent
       (the value is looked up among the IDs stored under `id_attr`)
    """

    def __init__(self, id_attr='external-id', ante_attr='antecedent-id', **kwargs):
        super().__init__(**kwargs)
        self.id_attr = id_attr
        # process_document() reads self.ante_attr, so the parameter must be stored under that name.
        self.ante_attr = ante_attr

    def process_document(self, doc):
        """Create coreference entities and single-word mentions from the antecedent links in `doc`."""
        id2node = {}
        links = []
        for node in doc.nodes:
            # An empty string is treated as "attribute not set".
            this_id = node.misc[self.id_attr]
            if this_id != '':
                id2node[this_id] = node
                ante_id = node.misc[self.ante_attr]
                if ante_id != '':
                    links.append([ante_id, this_id])

        # sorted(...,reverse=True) converts both cataphora and anaphora to a pair (this, ante) where ante < this.
        # Note that an antecedent ID which was not seen as any node's ID raises KeyError here.
        node_links = [sorted([id2node[link[0]], id2node[link[1]]], reverse=True) for link in links]

        # sort() makes sure the links are sorted by the "this" node (i.e. the anaphor, not the antecedent).
        node_links.sort()

        # Thanks to this sorting, we can assert that this_node is not part of any mention/entity when iterating
        # and we can prevent the need for merging two entities.
        # NOTE(review): confirm that `mentions` and `entities` are the right Node attribute names
        # in the udapi version in use (they may be called `coref_mentions`/`coref_entities`).
        for this_node, ante_node in node_links:
            assert not this_node.mentions
            if ante_node.mentions:
                # The antecedent already belongs to an entity, so just add the anaphor to it.
                ante_node.entities[0].create_mention(head=this_node, words=[this_node])
            else:
                # Neither node is in an entity yet: create a new one with both single-word mentions.
                entity = this_node.root.document.create_coref_entity()
                entity.create_mention(head=ante_node, words=[ante_node])
                entity.create_mention(head=this_node, words=[this_node])

0 commit comments

Comments
 (0)