4
4
class Stats (Block ):
5
5
"""Block corefud.Stats prints various coreference-related statistics."""
6
6
7
- def __init__ (self , m_len_max = 5 , c_len_max = 5 , report_mentions = True , report_entities = True ,
7
+ def __init__ (self , m_len_max = 5 , e_len_max = 5 , report_mentions = True , report_entities = True ,
8
8
report_details = True , selected_upos = 'NOUN PRON PROPN DET ADJ VERB ADV NUM' ,
9
- exclude_singletons = False , exclude_nonsingletons = False , style = 'human' , ** kwargs ):
9
+ exclude_singletons = False , exclude_nonsingletons = False , style = 'human' ,
10
+ per_doc = False , ** kwargs ):
10
11
super ().__init__ (** kwargs )
11
12
self .m_len_max = m_len_max
12
- self .c_len_max = c_len_max
13
+ self .e_len_max = e_len_max
13
14
self .report_mentions = report_mentions
14
15
self .report_entities = report_entities
15
16
self .report_details = report_details
16
17
self .exclude_singletons = exclude_singletons
17
18
self .exclude_nonsingletons = exclude_nonsingletons
18
19
self .style = style
19
- if style not in 'tex human' .split ():
20
- raise ValueError (f'Unknown style f{ style } ' )
20
+ if style not in 'tex tex-table tex-doc human' .split ():
21
+ raise ValueError (f'Unknown style { style } ' )
22
+ self .per_doc = per_doc
23
+ self ._header_printed = False
21
24
22
25
self .counter = Counter ()
23
26
self .mentions = 0
@@ -31,6 +34,7 @@ def __init__(self, m_len_max=5, c_len_max=5, report_mentions=True, report_entiti
31
34
32
35
def process_document (self , doc ):
33
36
self .total_nodes += len (list (doc .nodes ))
37
+ self .counter ['documents' ] += 1
34
38
for entity in doc .coref_entities :
35
39
len_mentions = len (entity .mentions )
36
40
if len_mentions == 1 :
@@ -41,7 +45,7 @@ def process_document(self, doc):
41
45
continue
42
46
self .longest_entity = max (len_mentions , self .longest_entity )
43
47
self .counter ['c_total_len' ] += len_mentions
44
- self .counter [f"c_len_{ min (len_mentions , self .c_len_max )} " ] += 1
48
+ self .counter [f"c_len_{ min (len_mentions , self .e_len_max )} " ] += 1
45
49
46
50
self .entities += 1
47
51
if not self .report_mentions and not self .report_details :
@@ -69,7 +73,32 @@ def process_document(self, doc):
69
73
heads += 0 if any (d ['parent' ] in mwords for d in w .deps ) else 1
70
74
self .counter ['m_nontreelet' ] += 1 if heads > 1 else 0
71
75
72
- def process_end (self ):
76
+
77
def after_process_document(self, doc):
    """Report and reset the statistics after each document in per-document mode.

    When ``self.per_doc`` is false this hook does nothing, so the counters
    keep accumulating over all processed documents. Otherwise it calls
    ``process_end(skip=False, doc=doc)`` to print the statistics gathered
    for `doc` and then clears every accumulator so that the next document
    starts from zero.
    """
    if not self.per_doc:
        return

    self.process_end(skip=False, doc=doc)

    # Start the next document with a clean slate.
    self.counter = Counter()
    for attr in ('mentions', 'entities', 'singletons', 'total_nodes',
                 'longest_mention', 'longest_entity', 'm_words'):
        setattr(self, attr, 0)
88
+
89
+ def process_end (self , skip = True , doc = None ):
90
+ if not self ._header_printed :
91
+ self ._header_printed = True
92
+ self .print_header ()
93
+ if self .per_doc :
94
+ if skip :
95
+ self .print_footer ()
96
+ return
97
+ else :
98
+ print (f"{ doc [0 ].trees [0 ].newdoc :15} " , end = '&' if self .style .startswith ('tex' ) else '\n ' )
99
+ elif self .style .startswith ('tex-' ):
100
+ print (f"{ self .counter ['documents' ]:4} documents &" )
101
+
73
102
mentions_nonzero = 1 if self .mentions == 0 else self .mentions
74
103
entities_nonzero = 1 if self .entities == 0 else self .entities
75
104
total_nodes_nonzero = 1 if self .total_nodes == 0 else self .total_nodes
@@ -80,17 +109,18 @@ def process_end(self):
80
109
('entities_per1k' , f"{ 1000 * self .entities / total_nodes_nonzero :6.0f} " ),
81
110
('longest_entity' , f"{ self .longest_entity :6} " ),
82
111
('avg_entity' , f"{ self .counter ['c_total_len' ] / entities_nonzero :5.1f} " )]
83
- for i in range (1 , self .c_len_max + 1 ):
112
+ for i in range (1 , self .e_len_max + 1 ):
84
113
percent = 100 * self .counter [f"c_len_{ i } " ] / entities_nonzero
85
- columns .append ((f"c_len_{ i } { '' if i < self .c_len_max else '+' } " , f"{ percent :5.1f} " ))
114
+ columns .append ((f"c_len_{ i } { '' if i < self .e_len_max else '+' } " , f"{ percent :5.1f} " ))
86
115
if self .report_mentions :
87
116
columns += [('mentions' , f"{ self .mentions :7,} " ),
88
117
('mentions_per1k' , f"{ 1000 * self .mentions / total_nodes_nonzero :6.0f} " ),
89
118
('longest_mention' , f"{ self .longest_mention :6} " ),
90
119
('avg_mention' , f"{ self .counter ['m_total_len' ] / mentions_nonzero :5.1f} " )]
91
- for i in range (0 , self .m_len_max + 1 ):
92
- percent = 100 * self .counter [f"m_len_{ i } " ] / mentions_nonzero
93
- columns .append ((f"m_len_{ i } { '' if i < self .m_len_max else '+' } " , f"{ percent :5.1f} " ))
120
+ if self .m_len_max :
121
+ for i in range (0 , self .m_len_max + 1 ):
122
+ percent = 100 * self .counter [f"m_len_{ i } " ] / mentions_nonzero
123
+ columns .append ((f"m_len_{ i } { '' if i < self .m_len_max else '+' } " , f"{ percent :5.1f} " ))
94
124
if self .report_details :
95
125
columns += [('with_empty' , f"{ 100 * self .counter ['m_with_empty' ] / mentions_nonzero :5.1f} " ),
96
126
('with_gaps' , f"{ 100 * self .counter ['m_with_gaps' ] / mentions_nonzero :5.1f} " ),
@@ -102,8 +132,88 @@ def process_end(self):
102
132
for upos in upos_list :
103
133
columns .append (('head_upos=' + upos , f"{ 100 * self .counter ['m_head_upos_' + upos ] / mentions_nonzero :5.1f} " ))
104
134
105
- if self .style == 'tex' :
106
- print (" & " .join (c [1 ] for c in columns ))
135
+ if self .style . startswith ( 'tex' ) :
136
+ print (" & " .join (c [1 ] for c in columns ), end = " \\ \\ \n " )
107
137
elif self .style == 'human' :
108
138
for c in columns :
109
139
print (f"{ c [0 ]:>15} = { c [1 ].strip ():>10} " )
140
+ if not self .per_doc :
141
+ self .print_footer ()
142
+
143
def print_header(self):
    """Print the LaTeX table header for the ``tex-*`` output styles.

    Nothing is printed unless ``self.style`` starts with ``tex-``.
    For the ``tex-doc`` style a standalone-document preamble is printed
    first. The header itself has four lines: the ``tabular`` column
    specification and three heading rows (column groups, column names,
    units). Each enabled report (entities, mentions, details) contributes
    its own section to all four lines, and ``\\cmidrule`` underlines are
    appended to the first two heading rows.
    """
    if not self.style.startswith('tex-'):
        return

    if self.style == 'tex-doc':
        print("\n".join((
            r'\documentclass{standalone}',
            r'\usepackage[utf8]{inputenc}\usepackage{booktabs}\usepackage{underscore}',
            r'\title{Udapi coreference statistics}',
            r'\begin{document}',
            r'\def\MC#1#2{\multicolumn{#1}{c}{#2}}',
        )))

    percent_cell = r'& [\%] '

    def cmidrule(trim, first, last):
        # booktabs partial rule under the columns first..last (1-based).
        return rf'\cmidrule({trim}){{{first}-{last}}}'

    # The four header lines, built section by section.
    colspec = r'\begin{tabular}{@{}l '
    group_row = " " * 15
    name_row = ("document" if self.per_doc else "dataset ") + " " * 7
    unit_row = " " * 15
    upos_list = []

    if self.report_entities:
        colspec += "rrrr "
        group_row += r'& \MC{4}{entities} '
        name_row += r'& total & per 1k & \MC{2}{length} '
        unit_row += r'& count & words & max & avg. '
        if self.e_len_max:
            entity_lengths = range(1, self.e_len_max + 1)
            colspec += "r" * len(entity_lengths) + " "
            group_row += r'& \MC{' + str(self.e_len_max) + r'}{distribution of entity lengths}'
            # The last bucket collects all longer entities, hence the "+".
            name_row += "".join(
                f"& {i:4}" + ("+ " if i == self.e_len_max else " ")
                for i in entity_lengths)
            unit_row += percent_cell * len(entity_lengths)

    if self.report_mentions:
        colspec += "rrrr "
        group_row += r'& \MC{4}{mentions} '
        name_row += r'& total & per 1k & \MC{2}{length} '
        unit_row += r'& count & words & max & avg. '
        if self.m_len_max:
            # Mention lengths start at 0, so there are m_len_max + 1 buckets.
            mention_lengths = range(0, self.m_len_max + 1)
            colspec += "r" * len(mention_lengths) + " "
            group_row += (r'& \MC{' + str(self.m_len_max + 1)
                          + r'}{distribution of mention lengths}' + " " * 7)
            name_row += "".join(
                f"& {i:4}" + ("+ " if i == self.m_len_max else " ")
                for i in mention_lengths)
            unit_row += percent_cell * len(mention_lengths)

    if self.report_details:
        # NOTE(review): four "r" columns are declared here, but only three
        # headings, three unit cells and a three-column \cmidrule follow.
        # Confirm against the data row whether "rrr " was intended.
        colspec += "rrrr "
        group_row += r'& \MC{3}{mention type} '
        name_row += r'&w/empty& w/gap&non-tree'
        unit_row += percent_cell * 3
        if self.selected_upos:
            upos_list = self.selected_upos + ['other']
        else:
            # Strip the 12-character 'm_head_upos_' prefix from the counter keys.
            upos_list = [key[12:] for key in self.counter if key.startswith('m_head_upos_')]
        colspec += "@{~}r" * len(upos_list)
        group_row += r"& \MC{" + str(len(upos_list)) + r"}{distribution of head UPOS}"
        name_row += ''.join(f'&{upos:7}' for upos in upos_list)
        unit_row += percent_cell * len(upos_list)

    colspec += r'@{}}\toprule'
    group_row += r'\\'
    name_row += r'\\'
    unit_row += r'\\\midrule'

    # Column 1 holds the document/dataset name; sections follow from column 2.
    last_col = 1
    if self.report_entities:
        last_col += 4
        group_row += r'\cmidrule(lr){2-5}'
        name_row += r'\cmidrule(lr){4-5}'
        if self.e_len_max:
            last_col += self.e_len_max
            group_row += cmidrule('lr', 6, last_col)
    if self.report_mentions:
        group_row += cmidrule('lr', last_col + 1, last_col + 4)
        name_row += cmidrule('lr', last_col + 3, last_col + 4)
        last_col += 4
        if self.m_len_max:
            group_row += cmidrule('lr', last_col + 1, last_col + self.m_len_max + 1)
            last_col += self.m_len_max + 1
    if self.report_details:
        group_row += cmidrule('lr', last_col + 1, last_col + 3)
        group_row += cmidrule('l', last_col + 4, last_col + 3 + len(upos_list))

    print("\n".join((colspec, group_row, name_row, unit_row)))
213
+
214
def print_footer(self):
    """Close the LaTeX table (and document) opened for the ``tex-*`` styles.

    Prints ``\\bottomrule\\end{tabular}`` when ``self.style`` starts with
    ``tex-`` and additionally ``\\end{document}`` for the ``tex-doc`` style.
    Other styles produce no footer.
    """
    if self.style.startswith('tex-'):
        print(r'\bottomrule\end{tabular}')
        if self.style == 'tex-doc':
            print(r'\end{document}')
0 commit comments