Skip to content

Commit c03419c

Browse files
committed
Blocks to work with PARSEME MWE corpora.
1 parent 19fe229 commit c03419c

File tree

3 files changed

+204
-0
lines changed

3 files changed

+204
-0
lines changed

udapi/block/mwe/normalize.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
"""Block that takes PARSEME-like annotation of multiword expressions from MISC
2+
and normalizes it so that the type is always annotated at the first word of
3+
the expression."""
4+
from udapi.core.block import Block
5+
import logging
6+
import re
7+
8+
class Normalize(Block):
9+
10+
def collect_mwes(self, root):
    """
    Collect PARSEME-style multiword expression annotations from MISC.

    The expected annotation is in the style of Parseme (see
    https://gitlab.com/parseme/corpora/-/wikis/home#annotation and download
    the data from http://hdl.handle.net/11372/LRT-5124), except that there
    are only ten columns and the annotation from the eleventh column is
    copied to the tenth (MISC) as the attribute Mwe (e.g., Mwe=1:LVC.cause).

    Args:
        root: root of the tree; its ``descendants`` are scanned and, for
            each of them, ``ord`` and ``misc['Mwe']`` are read.

    Returns:
        A tuple ``(mwes, mwes_by_nodes)``. ``mwes`` maps each MWE id (kept
        as the string found in the annotation) to a dict with the keys
        ``'nodes'`` (ords of the member nodes, in the order they were
        visited) and ``'type'`` (the annotated type, or ``''`` if no member
        carries one; if several members carry a type, the last one wins).
        ``mwes_by_nodes`` maps the ord of every visited node to the list of
        ids of the MWEs it belongs to (an empty list for other nodes).
    """
    # Each item is either NUMBER:TYPE, or just NUMBER.
    # Number identifies this MWE among all MWEs in the sentence.
    # Type is a main uppercase string (VID, LVC etc.), optionally
    # followed by a subtype ('LVC.cause').
    # See https://gitlab.com/parseme/corpora/-/wikis/home
    item_pattern = re.compile(r"^([0-9]+)(?::([A-Za-z\.]+))?$")
    mwes = {}  # for each mwe id, its type and list of node ids
    mwes_by_nodes = {}  # for each node id, a list of mwe ids
    for node in root.descendants:
        memberships = []
        mwes_by_nodes[node.ord] = memberships
        annotation = node.misc['Mwe']
        if not annotation:
            continue
        # A node may belong to multiple multiword expressions.
        for item in annotation.split(';'):
            match = item_pattern.match(item)
            if not match:
                # Unparseable items are reported and otherwise ignored.
                logging.warning("Cannot parse Mwe=%s", item)
                continue
            mwe_id, mwe_type = match.groups()
            entry = mwes.setdefault(mwe_id, {'nodes': [], 'type': ''})
            if mwe_type:
                entry['type'] = mwe_type
            entry['nodes'].append(node.ord)
            memberships.append(mwe_id)
    return (mwes, mwes_by_nodes)
47+
48+
def process_tree(self, root):
    """
    Rewrite Mwe annotations so that the type sits on the first MWE member.

    Collects annotations of multiword expressions from MISC of the nodes.
    Then saves them back but makes sure that the type is annotated at the
    first word of the expression (as opposed to the syntactic head or to
    any other word). "First" means the first member met while iterating
    over ``root.descendants``.

    An expression for which no type was found is written back as a bare
    id on all its members. Writing ``ID:`` with an empty type would yield
    an item that ``collect_mwes`` cannot parse on a later run.

    Args:
        root: root of the tree whose nodes get their ``misc['Mwe']``
            overwritten in place.
    """
    (mwes, mwes_by_nodes) = self.collect_mwes(root)
    for node in root.descendants:
        # There may be multiple MWEs this node is member of.
        annotations = []
        for mwe_id in mwes_by_nodes[node.ord]:
            mwe = mwes[mwe_id]
            if node.ord == mwe['nodes'][0] and mwe['type']:
                annotations.append("%s:%s" % (mwe_id, mwe['type']))
            else:
                annotations.append(mwe_id)
        # Always overwrite: a node with no parseable MWE membership gets
        # the empty string, which erases whatever annotation was there.
        node.misc['Mwe'] = ';'.join(annotations)

udapi/block/mwe/possessives.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
"""Block that takes PARSEME-like annotation of multiword expressions from MISC,
2+
looks for dependent possessive pronouns and reports how they are treated."""
3+
from udapi.core.block import Block
4+
import logging
5+
import re
6+
7+
class Possessives(Block):
8+
9+
def collect_mwes(self, root):
    """
    Collect PARSEME-style multiword expression annotations from MISC.

    The expected annotation is in the style of Parseme (see
    https://gitlab.com/parseme/corpora/-/wikis/home#annotation and download
    the data from http://hdl.handle.net/11372/LRT-5124), except that there
    are only ten columns and the annotation from the eleventh column is
    copied to the tenth (MISC) as the attribute Mwe (e.g., Mwe=1:LVC.cause).

    Args:
        root: root of the tree; its ``descendants`` are scanned and, for
            each of them, ``ord`` and ``misc['Mwe']`` are read.

    Returns:
        A tuple ``(mwes, mwes_by_nodes)``. ``mwes`` maps each MWE id (kept
        as the string found in the annotation) to a dict with the keys
        ``'nodes'`` (ords of the member nodes, in the order they were
        visited) and ``'type'`` (the annotated type, or ``''`` if no member
        carries one; if several members carry a type, the last one wins).
        ``mwes_by_nodes`` maps the ord of every visited node to the list of
        ids of the MWEs it belongs to (an empty list for other nodes).
    """
    # Each item is either NUMBER:TYPE, or just NUMBER.
    # Number identifies this MWE among all MWEs in the sentence.
    # Type is a main uppercase string (VID, LVC etc.), optionally
    # followed by a subtype ('LVC.cause').
    # See https://gitlab.com/parseme/corpora/-/wikis/home
    item_pattern = re.compile(r"^([0-9]+)(?::([A-Za-z\.]+))?$")
    mwes = {}  # for each mwe id, its type and list of node ids
    mwes_by_nodes = {}  # for each node id, a list of mwe ids
    for node in root.descendants:
        memberships = []
        mwes_by_nodes[node.ord] = memberships
        annotation = node.misc['Mwe']
        if not annotation:
            continue
        # A node may belong to multiple multiword expressions.
        for item in annotation.split(';'):
            match = item_pattern.match(item)
            if not match:
                # Unparseable items are reported and otherwise ignored.
                logging.warning("Cannot parse Mwe=%s", item)
                continue
            mwe_id, mwe_type = match.groups()
            entry = mwes.setdefault(mwe_id, {'nodes': [], 'type': ''})
            if mwe_type:
                entry['type'] = mwe_type
            entry['nodes'].append(node.ord)
            memberships.append(mwe_id)
    return (mwes, mwes_by_nodes)
46+
47+
def process_tree(self, root):
    """
    Report how possessive pronouns relate to each multiword expression.

    Collects annotations of multiword expressions from MISC of the nodes.
    Then surveys the possessive pronouns: for every MWE it gathers the
    descendants of the MWE heads (members whose parent is not a member of
    the same MWE) and looks among them for nodes with ``upos == 'PRON'``
    and ``feats['Poss'] == 'Yes'``. If any are found, one line is printed
    to standard output: ``in``, ``out`` or ``both`` (whether the pronouns
    found are members of the MWE, non-members, or a mix), followed by the
    forms of the gathered nodes, non-members in parentheses.

    Args:
        root: root of the tree to survey; nothing in the tree is modified.
    """
    (mwes, mwes_by_nodes) = self.collect_mwes(root)
    nodes = root.descendants
    for mwe_id in mwes:
        members = [x for x in nodes if mwe_id in mwes_by_nodes[x.ord]]
        member_set = set(members)
        heads = [x for x in members if x.parent not in member_set]
        # NOTE(review): only what `descendants` yields is gathered here. If
        # that excludes the head node itself, heads are never part of the
        # printed expression -- confirm whether that is intended.
        subtree = set()
        for head in heads:
            subtree.update(head.descendants)
        subtree_nodes = sorted(subtree)
        # Is there a possessive pronoun?
        possprons = [x for x in subtree_nodes
                     if x.upos == 'PRON' and x.feats['Poss'] == 'Yes']
        if not possprons:
            continue
        inside = any(mwe_id in mwes_by_nodes[x.ord] for x in possprons)
        outside = any(mwe_id not in mwes_by_nodes[x.ord] for x in possprons)
        if inside and outside:
            observation = 'both'
        elif inside:
            observation = 'in'
        else:
            observation = 'out'
        expression = ' '.join(
            x.form if mwe_id in mwes_by_nodes[x.ord] else '(' + x.form + ')'
            for x in subtree_nodes)
        print("%s: %s" % (observation, expression))

udapi/block/mwe/tosubdeprels.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
"""Block that takes PARSEME-like annotation of multiword expressions from MISC
2+
and projects it to subtypes of dependency relation labels. The motivation is
3+
that a parser could learn to predict the multiword expressions."""
4+
from udapi.core.block import Block
5+
import logging
6+
import re
7+
8+
class ToSubDeprels(Block):
9+
10+
def collect_mwes(self, root):
    """
    Collect PARSEME-style multiword expression annotations from MISC.

    The expected annotation is in the style of Parseme (see
    https://gitlab.com/parseme/corpora/-/wikis/home#annotation and download
    the data from http://hdl.handle.net/11372/LRT-5124), except that there
    are only ten columns and the annotation from the eleventh column is
    copied to the tenth (MISC) as the attribute Mwe (e.g., Mwe=1:LVC.cause).

    Args:
        root: root of the tree; its ``descendants`` are scanned and, for
            each of them, ``ord`` and ``misc['Mwe']`` are read.

    Returns:
        A tuple ``(mwes, mwes_by_nodes)``. ``mwes`` maps each MWE id (kept
        as the string found in the annotation) to a dict with the keys
        ``'nodes'`` (ords of the member nodes, in the order they were
        visited) and ``'type'`` (the annotated type, or ``''`` if no member
        carries one; if several members carry a type, the last one wins).
        ``mwes_by_nodes`` maps the ord of every visited node to the list of
        ids of the MWEs it belongs to (an empty list for other nodes).
    """
    # Each item is either NUMBER:TYPE, or just NUMBER.
    # Number identifies this MWE among all MWEs in the sentence.
    # Type is a main uppercase string (VID, LVC etc.), optionally
    # followed by a subtype ('LVC.cause').
    # See https://gitlab.com/parseme/corpora/-/wikis/home
    item_pattern = re.compile(r"^([0-9]+)(?::([A-Za-z\.]+))?$")
    mwes = {}  # for each mwe id, its type and list of node ids
    mwes_by_nodes = {}  # for each node id, a list of mwe ids
    for node in root.descendants:
        memberships = []
        mwes_by_nodes[node.ord] = memberships
        annotation = node.misc['Mwe']
        if not annotation:
            continue
        # A node may belong to multiple multiword expressions.
        for item in annotation.split(';'):
            match = item_pattern.match(item)
            if not match:
                # Unparseable items are reported and otherwise ignored.
                logging.warning("Cannot parse Mwe=%s", item)
                continue
            mwe_id, mwe_type = match.groups()
            entry = mwes.setdefault(mwe_id, {'nodes': [], 'type': ''})
            if mwe_type:
                entry['type'] = mwe_type
            entry['nodes'].append(node.ord)
            memberships.append(mwe_id)
    return (mwes, mwes_by_nodes)
47+
48+
def process_tree(self, root):
    """
    Project MWE types onto the dependency relations inside each MWE.

    Collects annotations of multiword expressions from MISC of the nodes.
    Then saves the type of the MWE as a subtype of the deprels inside:
    for every node whose parent is a member of the same MWE, the MWE type
    (lowercased, with dots removed, e.g. ``LVC.cause`` -> ``lvccause``) is
    appended to ``deprel`` after a colon. A node that shares several MWEs
    with its parent gets one suffix per shared MWE.

    An MWE for which no type was found is skipped. Appending an empty
    subtype would leave a dangling colon at the end of the relation label.

    Args:
        root: root of the tree whose nodes get their ``deprel`` extended
            in place.
    """
    (mwes, mwes_by_nodes) = self.collect_mwes(root)
    # Now we hopefully know the type of every multiword expression in the sentence.
    for node in root.descendants:
        own_mwes = mwes_by_nodes[node.ord]
        if not own_mwes:
            continue
        parent_ord = node.parent.ord
        # A parent with ord 0 or less is not looked up (nodes hanging
        # directly under the root have no parent inside the sentence).
        if parent_ord <= 0:
            continue
        parent_mwes = mwes_by_nodes[parent_ord]
        for mwe_id in own_mwes:
            # Add the MWE type to the DEPREL if the parent is also in the same MWE.
            if mwe_id not in parent_mwes:
                continue
            subtype = mwes[mwe_id]['type'].lower().replace('.', '')
            if subtype:
                node.deprel += ':' + subtype

0 commit comments

Comments
 (0)