2
2
from udapi .core .block import Block
3
3
4
4
class Link2Cluster (Block ):
5
- """Block corefud.Link2Cluster converts link-based coreference annotation to the (cluster-based) CorefUD format."""
5
+ """Block corefud.Link2Cluster converts link-based coreference annotation to the (cluster-based) CorefUD format.
6
6
7
- def __init__ (self , id_attr = 'proiel-id' , ante_attr = 'antecedent-proiel-id' , delete_orig_attrs = True , ** kwargs ):
7
+ Params:
8
+ id_attr: name of the attribute in MISC that stores the original-format IDs of nodes
9
+ ante_attr: name of the attribute in MISC that stores the ID of the antecedent
10
+ of the current node (in the same format as `id_attr`).
11
+ delete_orig_attrs: Should we delete the MISC attributes that were used for the conversion?
12
+ (i.e. id_attr and ante_attr, plus possibly also infstat_attr, coreftype_attr,
13
+ bridge_attr, bridge_relation_attr if these are used). Default=True.
14
+ infstat_attr: name of the attribute in MISC that stores the information status of a given mention
15
+ Will be stored in `mention.other['infstat']`. Use None for ignoring this.
16
+ coreftype_attr: name of the attribute in MISC that stores the coreference type of a given mention
17
+ Will be stored in `mention.other['coreftype']`. Use None for ignoring this.
18
+ bridge_attr: name of the attribute in MISC that stores the ID of the bridging antecedent
19
+ of the current node/mention (in the same format as `id_attr`).
20
+ Default=None, i.e. ignore this parameter.
21
+ bridge_relation_attr: name of the attribute in MISC that stores the bridging relation type
22
+ (e.g. "part" or "subset"). Default=None, i.e. ignore this parameter.
23
+ eid_counter: use a global counter of entity.eid and start with a given number. Default=1.
24
+ The main goal of this parameter is to make eid unique across multiple documents.
25
+ If you use eid_counter=0, this feature will be turned off,
26
+ so entities will be created using `root.document.create_coref_entity()`,
27
+ with no eid parameter, so that the eid will start from "e1" in each document processed by this block.
28
+ """
29
def __init__(self, id_attr='proiel-id', ante_attr='antecedent-proiel-id', delete_orig_attrs=True,
             infstat_attr='information-status', coreftype_attr='coreftype',
             bridge_attr=None, bridge_relation_attr=None, eid_counter=1, **kwargs):
    """Store the conversion settings; remaining kwargs go to the parent Block.

    Args:
        id_attr: MISC attribute holding the original-format node ID.
        ante_attr: MISC attribute holding the ID of the node's antecedent.
        delete_orig_attrs: whether the MISC attributes used for the
            conversion are deleted afterwards.
        infstat_attr: MISC attribute with the information status
            (None = ignore).
        coreftype_attr: MISC attribute with the coreference type
            (None = ignore).
        bridge_attr: MISC attribute with the ID of the bridging antecedent
            (None = ignore).
        bridge_relation_attr: MISC attribute with the bridging relation
            type (None = ignore).
        eid_counter: first number of the global entity-id counter;
            a falsy value (0) switches the global counter off.
    """
    super().__init__(**kwargs)

    # Names of the MISC attributes read for coreference links.
    self.id_attr = id_attr
    self.ante_attr = ante_attr

    # Optional per-mention attributes copied to `mention.other`.
    self.infstat_attr = infstat_attr
    self.coreftype_attr = coreftype_attr

    # Optional bridging attributes.
    self.bridge_attr = bridge_attr
    self.bridge_relation_attr = bridge_relation_attr

    self.delete_orig_attrs = delete_orig_attrs
    # Coerced to int, so a string such as "1" is accepted as well.
    self.eid_counter = int(eid_counter)
41
+
42
def _new_entity(self, doc):
    """Create and return a new coreference entity in `doc`.

    When `self.eid_counter` is falsy (0), the entity is created without
    an explicit eid. Otherwise the entity gets the eid "e<counter>" and
    the counter is incremented, so the numbering continues across all
    documents processed by this block instance.
    """
    if not self.eid_counter:
        return doc.create_coref_entity()
    # No whitespace may leak into the id: the eid is exactly "e" + number.
    entity = doc.create_coref_entity(eid=f"e{self.eid_counter}")
    self.eid_counter += 1
    return entity
48
+
49
def _new_mention(self, entity, node):
    """Create a one-word mention headed by `node` in `entity` and return it.

    If the configured information-status / coreference-type MISC attributes
    are set and non-empty on `node`, their values are copied to
    `mention.other['infstat']` / `mention.other['coreftype']`. The source
    MISC attribute is deleted afterwards when `self.delete_orig_attrs` is set.
    """
    mention = entity.create_mention(head=node, words=[node])
    # (MISC attribute name, key in mention.other), in processing order.
    copied = (
        (self.infstat_attr, 'infstat'),
        (self.coreftype_attr, 'coreftype'),
    )
    for misc_attr, other_key in copied:
        if not misc_attr:
            # This attribute is switched off (None or empty name).
            continue
        value = node.misc[misc_attr]
        if not value:
            continue
        mention.other[other_key] = value
        if self.delete_orig_attrs:
            del node.misc[misc_attr]
    return mention
12
60
13
61
def process_document (self , doc ):
14
62
id2node = {}
15
63
links = []
64
+ bridges = []
16
65
for node in doc .nodes_and_empty :
17
66
this_id = node .misc [self .id_attr ]
18
67
if this_id != '' :
@@ -26,6 +75,16 @@ def process_document(self, doc):
26
75
if self .delete_orig_attrs :
27
76
for attr in (self .id_attr , self .ante_attr ):
28
77
del node .misc [attr ]
78
+ if self .bridge_attr :
79
+ bridge_id = node .misc [self .bridge_attr ]
80
+ if bridge_id != '' :
81
+ if bridge_id == this_id :
82
+ logging .warning (f"{ node } has a self-reference bridging { self .bridge_attr } ={ bridge_id } " )
83
+ else :
84
+ bridges .append ([bridge_id , this_id , node .misc [self .bridge_relation_attr ]])
85
+ if self .delete_orig_attrs :
86
+ for attr in (self .bridge_attr , self .bridge_relation_attr ):
87
+ del node .misc [attr ]
29
88
30
89
# It seems faster&simpler to process the links in any order and implement entity merging,
31
90
# rather than trying to sort the links so that no entity merging is needed.
@@ -36,14 +95,9 @@ def process_document(self, doc):
36
95
ante_node , this_node = id2node [ante_id ], id2node [this_id ]
37
96
if not this_node .coref_mentions and not ante_node .coref_mentions :
38
97
# None of the nodes is part of any mention/entity. Let's create them.
39
- entity = this_node .root .document .create_coref_entity ()
40
- m_ante = entity .create_mention (head = ante_node , words = [ante_node ])
41
- m_this = entity .create_mention (head = this_node , words = [this_node ])
42
- for node , mention in ((ante_node , m_ante ), (this_node , m_this )):
43
- if node .misc ['information-status' ]:
44
- mention .other ['infstat' ] = node .misc ['information-status' ]
45
- if self .delete_orig_attrs :
46
- del node .misc ['information-status' ]
98
+ entity = self ._new_entity (this_node .root .document )
99
+ self ._new_mention (entity , ante_node )
100
+ self ._new_mention (entity , this_node )
47
101
elif this_node .coref_mentions and ante_node .coref_mentions :
48
102
# Both of the nodes are part of mentions in different entities.
49
103
# Let's merge the two entities (i.e. "steal" all mentions from the "ante" entity to "this" entity).
@@ -59,6 +113,25 @@ def process_document(self, doc):
59
113
else :
60
114
# Only one of the nodes is part of an entity. Let's add the second one to this entity.
61
115
if ante_node .coref_mentions :
62
- ante_node .coref_entities [0 ]. create_mention ( head = this_node , words = [ this_node ] )
116
+ self . _new_mention ( ante_node .coref_entities [0 ], this_node )
63
117
else :
64
- this_node .coref_entities [0 ].create_mention (head = ante_node , words = [ante_node ])
118
+ self ._new_mention (this_node .coref_entities [0 ], ante_node )
119
+
120
+ # Bridging
121
+ for ante_id , this_id , relation in bridges :
122
+ if ante_id not in id2node :
123
+ logging .warning (f"{ ante_id } is referenced in { self .bridge_attr } , but not in { self .id_attr } " )
124
+ else :
125
+ ante_node , this_node = id2node [ante_id ], id2node [this_id ]
126
+ if ante_node .coref_mentions :
127
+ m_ante = next (m for m in ante_node .coref_mentions if m .head is ante_node )
128
+ e_ante = m_ante .entity
129
+ else :
130
+ e_ante = self ._new_entity (ante_node .root .document )
131
+ m_ante = self ._new_mention (e_ante , ante_node )
132
+ if this_node .coref_mentions :
133
+ m_this = next (m for m in this_node .coref_mentions if m .head is this_node )
134
+ else :
135
+ e_this = self ._new_entity (this_node .root .document )
136
+ m_this = self ._new_mention (e_this , this_node )
137
+ m_this .bridging .append ((e_ante , relation ))
0 commit comments