Skip to content

Commit be5c58e

Browse files
committed
nice LaTeX tables using corefud.Stats style=tex-doc
udapy -q read.Conllu split_docs=1 corefud.Stats per_doc=1 style=tex-doc < in.conllu > out.tex
1 parent f981c83 commit be5c58e

File tree

1 file changed

+124
-14
lines changed

1 file changed

+124
-14
lines changed

udapi/block/corefud/stats.py

Lines changed: 124 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -4,20 +4,23 @@
44
class Stats(Block):
55
"""Block corefud.Stats prints various coreference-related statistics."""
66

7-
def __init__(self, m_len_max=5, c_len_max=5, report_mentions=True, report_entities=True,
7+
def __init__(self, m_len_max=5, e_len_max=5, report_mentions=True, report_entities=True,
88
report_details=True, selected_upos='NOUN PRON PROPN DET ADJ VERB ADV NUM',
9-
exclude_singletons=False, exclude_nonsingletons=False, style='human', **kwargs):
9+
exclude_singletons=False, exclude_nonsingletons=False, style='human',
10+
per_doc=False, **kwargs):
1011
super().__init__(**kwargs)
1112
self.m_len_max = m_len_max
12-
self.c_len_max = c_len_max
13+
self.e_len_max = e_len_max
1314
self.report_mentions = report_mentions
1415
self.report_entities = report_entities
1516
self.report_details = report_details
1617
self.exclude_singletons = exclude_singletons
1718
self.exclude_nonsingletons = exclude_nonsingletons
1819
self.style = style
19-
if style not in 'tex human'.split():
20-
raise ValueError(f'Unknown style f{style}')
20+
if style not in 'tex tex-table tex-doc human'.split():
21+
raise ValueError(f'Unknown style {style}')
22+
self.per_doc = per_doc
23+
self._header_printed = False
2124

2225
self.counter = Counter()
2326
self.mentions = 0
@@ -31,6 +34,7 @@ def __init__(self, m_len_max=5, c_len_max=5, report_mentions=True, report_entiti
3134

3235
def process_document(self, doc):
3336
self.total_nodes += len(list(doc.nodes))
37+
self.counter['documents'] += 1
3438
for entity in doc.coref_entities:
3539
len_mentions = len(entity.mentions)
3640
if len_mentions == 1:
@@ -41,7 +45,7 @@ def process_document(self, doc):
4145
continue
4246
self.longest_entity = max(len_mentions, self.longest_entity)
4347
self.counter['c_total_len'] += len_mentions
44-
self.counter[f"c_len_{min(len_mentions, self.c_len_max)}"] += 1
48+
self.counter[f"c_len_{min(len_mentions, self.e_len_max)}"] += 1
4549

4650
self.entities += 1
4751
if not self.report_mentions and not self.report_details:
@@ -69,7 +73,32 @@ def process_document(self, doc):
6973
heads += 0 if any(d['parent'] in mwords for d in w.deps) else 1
7074
self.counter['m_nontreelet'] += 1 if heads > 1 else 0
7175

72-
def process_end(self):
76+
77+
def after_process_document(self, doc):
    """Emit the statistics of one document and start over when ``per_doc`` is set.

    When ``self.per_doc`` is false this hook does nothing, so the counters
    keep accumulating until ``process_end`` is called without arguments.

    Args:
        doc: the document that has just been processed; it is handed over
            to ``process_end`` unchanged.
    """
    if not self.per_doc:
        return
    # skip=False asks process_end to print a row for this document
    # rather than only closing the table.
    self.process_end(skip=False, doc=doc)
    # Reset every accumulator so the next document starts from zero.
    self.counter = Counter()
    self.mentions = self.entities = self.singletons = self.total_nodes = 0
    self.longest_mention = self.longest_entity = self.m_words = 0
88+
89+
def process_end(self, skip=True, doc=None):
90+
if not self._header_printed:
91+
self._header_printed = True
92+
self.print_header()
93+
if self.per_doc:
94+
if skip:
95+
self.print_footer()
96+
return
97+
else:
98+
print(f"{doc[0].trees[0].newdoc:15}", end='&' if self.style.startswith('tex') else '\n')
99+
elif self.style.startswith('tex-'):
100+
print(f"{self.counter['documents']:4} documents &")
101+
73102
mentions_nonzero = 1 if self.mentions == 0 else self.mentions
74103
entities_nonzero = 1 if self.entities == 0 else self.entities
75104
total_nodes_nonzero = 1 if self.total_nodes == 0 else self.total_nodes
@@ -80,17 +109,18 @@ def process_end(self):
80109
('entities_per1k', f"{1000 * self.entities / total_nodes_nonzero:6.0f}"),
81110
('longest_entity', f"{self.longest_entity:6}"),
82111
('avg_entity', f"{self.counter['c_total_len'] / entities_nonzero:5.1f}")]
83-
for i in range(1, self.c_len_max + 1):
112+
for i in range(1, self.e_len_max + 1):
84113
percent = 100 * self.counter[f"c_len_{i}"] / entities_nonzero
85-
columns.append((f"c_len_{i}{'' if i < self.c_len_max else '+'}", f"{percent:5.1f}"))
114+
columns.append((f"c_len_{i}{'' if i < self.e_len_max else '+'}", f"{percent:5.1f}"))
86115
if self.report_mentions:
87116
columns += [('mentions', f"{self.mentions:7,}"),
88117
('mentions_per1k', f"{1000 * self.mentions / total_nodes_nonzero:6.0f}"),
89118
('longest_mention', f"{self.longest_mention:6}"),
90119
('avg_mention', f"{self.counter['m_total_len'] / mentions_nonzero:5.1f}")]
91-
for i in range(0, self.m_len_max + 1):
92-
percent = 100 * self.counter[f"m_len_{i}"] / mentions_nonzero
93-
columns.append((f"m_len_{i}{'' if i < self.m_len_max else '+'}", f"{percent:5.1f}"))
120+
if self.m_len_max:
121+
for i in range(0, self.m_len_max + 1):
122+
percent = 100 * self.counter[f"m_len_{i}"] / mentions_nonzero
123+
columns.append((f"m_len_{i}{'' if i < self.m_len_max else '+'}", f"{percent:5.1f}"))
94124
if self.report_details:
95125
columns += [('with_empty', f"{100 * self.counter['m_with_empty'] / mentions_nonzero:5.1f}"),
96126
('with_gaps', f"{100 * self.counter['m_with_gaps'] / mentions_nonzero:5.1f}"),
@@ -102,8 +132,88 @@ def process_end(self):
102132
for upos in upos_list:
103133
columns.append(('head_upos=' + upos, f"{100 * self.counter['m_head_upos_' + upos] / mentions_nonzero:5.1f}"))
104134

105-
if self.style == 'tex':
106-
print(" & ".join(c[1] for c in columns))
135+
if self.style.startswith('tex'):
136+
print(" & ".join(c[1] for c in columns), end=" \\\\\n")
107137
elif self.style == 'human':
108138
for c in columns:
109139
print(f"{c[0]:>15} = {c[1].strip():>10}")
140+
if not self.per_doc:
141+
self.print_footer()
142+
143+
def print_header(self):
    """Print the LaTeX table header for the ``tex-*`` styles.

    Nothing is printed unless ``self.style`` starts with ``'tex-'``.
    For ``'tex-doc'`` a standalone-document preamble is printed first.
    Then four lines are printed: the ``tabular`` column specification
    followed by three header rows (group titles, column titles, units),
    with booktabs ``\\cmidrule`` ranges appended to the first two rows.

    The column groups follow ``report_entities``, ``report_mentions`` and
    ``report_details``; the length-distribution groups are omitted when
    ``e_len_max`` / ``m_len_max`` is falsy.  The head-UPOS group is
    omitted when there is no UPOS column to show, so that neither
    ``\\MC{0}{...}`` nor a reversed ``\\cmidrule`` range is emitted.
    """
    if not self.style.startswith('tex-'):
        return
    if self.style == 'tex-doc':
        # NOTE(review): the \MC macro definition is assumed to belong to the
        # tex-doc preamble; the original nesting should be confirmed.
        print(r'\documentclass{standalone}')
        print(r'\usepackage[utf8]{inputenc}\usepackage{booktabs}\usepackage{underscore}')
        print(r'\title{Udapi coreference statistics}')
        print(r'\begin{document}')
        print(r'\def\MC#1#2{\multicolumn{#1}{c}{#2}}')
    # lines[0]: tabular column spec, lines[1]: group titles,
    # lines[2]: column titles, lines[3]: units.
    lines = [r'\begin{tabular}{@{}l ', " "*15, ("document" if self.per_doc else "dataset ") + " "*7, " "*15]
    upos_list = []
    if self.report_entities:
        lines[0] += "rrrr "
        lines[1] += r'& \MC{4}{entities} '
        lines[2] += r'& total & per 1k & \MC{2}{length} '
        lines[3] += r'& count & words & max & avg. '
        if self.e_len_max:
            for i in range(1, self.e_len_max + 1):
                lines[0] += "r"
                lines[2] += f"& {i:4}" + ("+ " if i==self.e_len_max else " ")
                lines[3] += r'& [\%] '
            lines[0] += " "
            lines[1] += r'& \MC{' + str(self.e_len_max) + r'}{distribution of entity lengths}'
    if self.report_mentions:
        lines[0] += "rrrr "
        lines[1] += r'& \MC{4}{mentions} '
        lines[2] += r'& total & per 1k & \MC{2}{length} '
        lines[3] += r'& count & words & max & avg. '
        if self.m_len_max:
            # Mention lengths start at 0, hence m_len_max + 1 columns.
            for i in range(0, self.m_len_max + 1):
                lines[0] += "r"
                lines[2] += f"& {i:4}" + ("+ " if i==self.m_len_max else " ")
                lines[3] += r'& [\%] '
            lines[0] += " "
            lines[1] += r'& \MC{' + str(self.m_len_max + 1) + r'}{distribution of mention lengths}' + " "*7
    if self.report_details:
        # NOTE(review): four "r" specifiers are declared here although only
        # three "mention type" header cells follow; kept as in the original.
        lines[0] += "rrrr "
        lines[1] += r'& \MC{3}{mention type} '
        lines[2] += r'&w/empty& w/gap&non-tree'
        lines[3] += r'& [\%] ' * 3
        if self.selected_upos:
            upos_list = self.selected_upos + ['other']
        else:
            # Strip the 12-character 'm_head_upos_' prefix from counter keys.
            upos_list = [x[12:] for x in self.counter if x.startswith('m_head_upos_')]
        if upos_list:
            lines[0] += "@{~}r" * len(upos_list)
            lines[1] += r"& \MC{" + str(len(upos_list)) + r"}{distribution of head UPOS}"
            lines[2] += ''.join(f'&{upos:7}' for upos in upos_list)
            lines[3] += r'& [\%] ' * len(upos_list)
    lines[0] += r'@{}}\toprule'
    # last_col tracks the index of the last table column emitted so far;
    # column 1 is the document/dataset label.
    last_col = 1
    lines[1] += r'\\'
    lines[2] += r'\\'
    lines[3] += r'\\\midrule'
    if self.report_entities:
        last_col += 4
        lines[1] += r'\cmidrule(lr){2-5}'
        lines[2] += r'\cmidrule(lr){4-5}'
        if self.e_len_max:
            last_col += self.e_len_max
            lines[1] += r'\cmidrule(lr){6-' + str(last_col) + '}'
    if self.report_mentions:
        lines[1] += r'\cmidrule(lr){' + f"{last_col+1}-{last_col+4}" + '}'
        lines[2] += r'\cmidrule(lr){' + f"{last_col+3}-{last_col+4}" + '}'
        last_col += 4
        if self.m_len_max:
            lines[1] += r'\cmidrule(lr){' + f"{last_col+1}-{last_col+self.m_len_max+1}" + '}'
            last_col += self.m_len_max + 1
    if self.report_details:
        lines[1] += r'\cmidrule(lr){' + f"{last_col+1}-{last_col+3}" + '}'
        if upos_list:
            lines[1] += r'\cmidrule(l){' + f"{last_col+4}-{last_col+3+len(upos_list)}" + '}'
    print("\n".join(lines))
213+
214+
def print_footer(self):
    """Close the LaTeX table opened for the ``tex-*`` styles.

    Prints the bottom rule and the end of the ``tabular`` environment,
    followed by ``\\end{document}`` when the style is ``'tex-doc'``.
    Styles that do not start with ``'tex-'`` print nothing.
    """
    if self.style.startswith('tex-'):
        closing = [r'\bottomrule\end{tabular}']
        if self.style == 'tex-doc':
            closing.append(r'\end{document}')
        print("\n".join(closing))

0 commit comments

Comments
 (0)