|
| 1 | +from udapi.core.block import Block |
| 2 | + |
class Link2Cluster(Block):
    """Block corefud.Link2Cluster converts link-based coreference annotation to the (cluster-based) CorefUD format.

    Every node may carry its own identifier and the identifier of its
    antecedent as MISC attributes. Each such link is converted into mentions
    of a shared coreference entity.
    """

    def __init__(self, id_attr='external-id', ante_attr='antecedent-id', **kwargs):
        """Create the block.

        Args:
            id_attr: name of the MISC attribute holding the node's own identifier.
            ante_attr: name of the MISC attribute holding the identifier of
                the node's antecedent.
            **kwargs: passed on unchanged to the parent ``Block`` constructor.
        """
        super().__init__(**kwargs)
        self.id_attr = id_attr
        # Must be stored under the name ``ante_attr``: that is the attribute
        # process_document() reads when looking up the antecedent id.
        self.ante_attr = ante_attr

    def process_document(self, doc):
        """Convert all antecedent links found in ``doc`` into coreference entities.

        Links that point to the node itself, or to an identifier that no node
        in the document carries, are reported with a warning and skipped.

        Args:
            doc: the document whose nodes are scanned for link annotation.
        """
        import logging  # function-scope import keeps this block self-contained

        id2node = {}
        links = []
        for node in doc.nodes:
            this_id = node.misc[self.id_attr]
            if this_id == '':
                continue
            id2node[this_id] = node
            ante_id = node.misc[self.ante_attr]
            if ante_id == '':
                continue
            if ante_id == this_id:
                # A node cannot be its own antecedent; converting such a link
                # would put two mentions of one entity on the same node.
                logging.warning("%s has a self-reference %s=%s", node, self.ante_attr, ante_id)
                continue
            links.append((ante_id, this_id))

        node_links = []
        for ante_id, this_id in links:
            # The antecedent may be defined later in the document than the
            # node referring to it, so this check is only possible once all
            # identifiers have been collected.
            if ante_id not in id2node:
                logging.warning("%s is referenced in %s, but not in %s",
                                ante_id, self.ante_attr, self.id_attr)
                continue
            # sorted(..., reverse=True) converts both cataphora and anaphora
            # to a pair (this, ante) where ante < this.
            node_links.append(sorted([id2node[ante_id], id2node[this_id]], reverse=True))

        # sort() makes sure the links are sorted by the "this" node
        # (i.e. the anaphor, not the antecedent).
        node_links.sort()

        # Thanks to this sorting, this_node is expected not to be part of any
        # mention/entity yet, which avoids the need for merging two entities.
        # NOTE(review): input where one node ends up as "this" in two links
        # (e.g. a cataphoric link combined with an anaphoric one) violates
        # this expectation and trips the assertion below.
        for this_node, ante_node in node_links:
            assert not this_node.mentions, \
                "%s is already part of a mention; merging entities is not supported" % this_node
            if ante_node.mentions:
                ante_node.entities[0].create_mention(head=this_node, words=[this_node])
            else:
                entity = this_node.root.document.create_coref_entity()
                entity.create_mention(head=ante_node, words=[ante_node])
                entity.create_mention(head=this_node, words=[this_node])
0 commit comments