diff --git a/CHANGES.txt b/CHANGES.txt index 67ced748..49dfd40e 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -2,6 +2,11 @@ Udapi Change Log ---------------- See https://github.com/udapi/udapi-python/commits/master for details. +0.4.0 2024-03-28 + - support for CorefUD 1.3 + - edits by Dan Zeman in block.ud.* + - requires Python 3.9+ (difficult to test older versions in Circle-CI) + 0.3.0 2022-04-06 - support for CorefUD 1.0 (new CoNLL-U format for coreference annotation) - edits by Dan Zeman in block.ud.* diff --git a/bin/udapy b/bin/udapy index 528e3577..30cb2595 100755 --- a/bin/udapy +++ b/bin/udapy @@ -71,6 +71,20 @@ else: logging.basicConfig(format='%(asctime)-15s [%(levelname)7s] %(funcName)s - %(message)s', level=level) +# Global flag to track if an unhandled exception occurred +_unhandled_exception_occurred = False + +def _custom_excepthook(exc_type, exc_value, traceback): + global _unhandled_exception_occurred + _unhandled_exception_occurred = True + + # Call the default excepthook to allow normal error reporting + sys.__excepthook__(exc_type, exc_value, traceback) + +# Override the default excepthook +sys.excepthook = _custom_excepthook + + # Process and provide the scenario. if __name__ == "__main__": @@ -86,7 +100,13 @@ if __name__ == "__main__": # Udapi documents have a many cyclic references, so running GC is quite slow. if not args.gc: gc.disable() - atexit.register(os._exit, 0) + # When an exception/error has happened, udapy should exit with a non-zero exit code, + # so that users can use `udapy ... || echo "Error detected"` (or Makefile reports errors). + # However, we cannot use `atexit.register(lambda: os._exit(1 if sys.exc_info()[0] else 0))` + # because the Python has already exited the exception-handling block + # (the exception/error has been already reported and sys.exc_info()[0] is None). + # We thus keep record whether _unhandled_exception_occurred. 
+ atexit.register(lambda: os._exit(1 if _unhandled_exception_occurred else 0)) atexit.register(sys.stderr.flush) if args.save: args.scenario = args.scenario + ['write.Conllu'] diff --git a/bin/udapy.bat b/bin/udapy.bat new file mode 100644 index 00000000..013e08e7 --- /dev/null +++ b/bin/udapy.bat @@ -0,0 +1,4 @@ +@REM The Python launcher "py" must be accessible via the PATH environment variable. +@REM We assume that this batch script lies next to udapy in udapi-python/bin. +@REM The PYTHONPATH environment variable must contain path to udapi-python. +py %~dp$PATH:0\udapy %* diff --git a/setup.cfg b/setup.cfg index a14145ab..3ac1ebf2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = udapi -version = 0.3.0 +version = 0.4.0 author = Martin Popel author_email = popel@ufal.mff.cuni.cz description = Python framework for processing Universal Dependencies data @@ -14,7 +14,7 @@ classifiers = [options] packages = find: -python_requires = >=3.6 +python_requires = >=3.9 include_package_data = True scripts = bin/udapy diff --git a/tutorial/udapi-tutorial-dz.odt b/tutorial/udapi-tutorial-dz.odt new file mode 100644 index 00000000..a954e9f3 Binary files /dev/null and b/tutorial/udapi-tutorial-dz.odt differ diff --git a/tutorial/udapi-tutorial-dz.pdf b/tutorial/udapi-tutorial-dz.pdf new file mode 100644 index 00000000..91312187 Binary files /dev/null and b/tutorial/udapi-tutorial-dz.pdf differ diff --git a/udapi/block/corefud/fixparentheses.py b/udapi/block/corefud/fixparentheses.py old mode 100755 new mode 100644 index 9baeca98..bc8e6504 --- a/udapi/block/corefud/fixparentheses.py +++ b/udapi/block/corefud/fixparentheses.py @@ -1,31 +1,31 @@ -from udapi.core.block import Block - - -class FixParentheses(Block): - """Find mentions that contain opening parenthesis but do not contain the closing one (or the other way around). 
- If the missing parenthesis is an immediate neighbour of the mention span, add it to the span.""" - - def __init__(self, mark=True, **kwargs): - super().__init__(**kwargs) - self.mark = mark - - def process_coref_mention(self, mention): - words = [word.lemma for word in mention.words] - pairs = ['()', '[]', '{}'] - for pair in pairs: - if pair[0] in words: - if not pair[1] in words and pair[1] in [node.lemma for node in mention.head.root.descendants]: - if mention.words[-1].ord == int(mention.words[-1].ord) and mention.words[-1].next_node and \ - mention.words[-1].next_node.lemma == pair[1]: - next_node = mention.words[-1].next_node - mention.words.append(next_node) - if self.mark: - next_node.misc['Mark'] = 1 - - elif pair[1] in words and pair[0] in [node.lemma for node in mention.head.root.descendants]: - if mention.words[0].ord == int(mention.words[0].ord) and mention.words[0].prev_node \ - and mention.words[0].prev_node.lemma == pair[0]: - prev_node = mention.words[0].prev_node - mention.words.append(prev_node) - if self.mark: - prev_node.misc['Mark'] = 1 +from udapi.core.block import Block + + +class FixParentheses(Block): + """Find mentions that contain opening parenthesis but do not contain the closing one (or the other way around). 
+ If the missing parenthesis is an immediate neighbour of the mention span, add it to the span.""" + + def __init__(self, mark=True, **kwargs): + super().__init__(**kwargs) + self.mark = mark + + def process_coref_mention(self, mention): + words = [word.lemma for word in mention.words] + pairs = ['()', '[]', '{}'] + for pair in pairs: + if pair[0] in words: + if not pair[1] in words and pair[1] in [node.lemma for node in mention.head.root.descendants]: + if mention.words[-1].ord == int(mention.words[-1].ord) and mention.words[-1].next_node and \ + mention.words[-1].next_node.lemma == pair[1]: + next_node = mention.words[-1].next_node + mention.words.append(next_node) + if self.mark: + next_node.misc['Mark'] = 1 + + elif pair[1] in words and pair[0] in [node.lemma for node in mention.head.root.descendants]: + if mention.words[0].ord == int(mention.words[0].ord) and mention.words[0].prev_node \ + and mention.words[0].prev_node.lemma == pair[0]: + prev_node = mention.words[0].prev_node + mention.words.append(prev_node) + if self.mark: + prev_node.misc['Mark'] = 1 diff --git a/udapi/block/corefud/link2cluster.py b/udapi/block/corefud/link2cluster.py index 3f9f9bb3..08296531 100644 --- a/udapi/block/corefud/link2cluster.py +++ b/udapi/block/corefud/link2cluster.py @@ -2,17 +2,66 @@ from udapi.core.block import Block class Link2Cluster(Block): - """Block corefud.Link2Cluster converts link-based coreference annotation to the (cluster-based) CorefUD format.""" + """Block corefud.Link2Cluster converts link-based coreference annotation to the (cluster-based) CorefUD format. - def __init__(self, id_attr='proiel-id', ante_attr='antecedent-proiel-id', delete_orig_attrs=True, **kwargs): + Params: + id_attr: name of the attribute in MISC that stores the original-format IDs of nodes + ante_attr: name of the attribute in MISC that stores the ID of the antecedent + of the current node (in the same format as `id_attr`). 
+ delete_orig_attrs: Should we delete the MISC attributes that were used for the conversion? + (i.e. id_attr and ante_attr, plus possibly also infstat_attr, coreftype_attr, + bridge_attr, bridge_relation_attr if these are used). Default=True. + infstat_attr: name of the attribute in MISC that stores the information status of a given mention + Will be stored in `mention.other['infstat']`. Use None for ignoring this. + coreftype_attr: name of the attribute in MISC that stores the coreference type of a given mention + Will be stored in `mention.other['coreftype']`. Use None for ignoring this. + bridge_attr: name of the attribute in MISC that stores the ID of the bridging antecedent + of the current node/mention (in the same format as `id_attr`). + Default=None, i.e. ignore this parameter. + bridge_relation_attr: name of the attribute in MISC that stores the bridging relation type + (e.g. "part" or "subset"). Default=None, i.e. ignore this parameter. + eid_counter: use a global counter of entity.eid and start with a given number. Default=1. + The main goal of this parameter is to make eid unique across multiple documents. + If you use eid_counter=0, this feature will be turned off, + so entities will be created using `root.document.create_coref_entity()`, + with no eid parameter, so that the eid will start from "e1" in each document processed by this block. 
+ """ + def __init__(self, id_attr='proiel-id', ante_attr='antecedent-proiel-id', delete_orig_attrs=True, + infstat_attr='information-status', coreftype_attr='coreftype', + bridge_attr=None, bridge_relation_attr=None, eid_counter=1, **kwargs): super().__init__(**kwargs) self.id_attr = id_attr self.ante_attr = ante_attr self.delete_orig_attrs = delete_orig_attrs + self.infstat_attr = infstat_attr + self.coreftype_attr = coreftype_attr + self.bridge_attr = bridge_attr + self.bridge_relation_attr = bridge_relation_attr + self.eid_counter = int(eid_counter) + + def _new_entity(self, doc): + if not self.eid_counter: + return doc.create_coref_entity() + entity = doc.create_coref_entity(eid=f"e{self.eid_counter}") + self.eid_counter += 1 + return entity + + def _new_mention(self, entity, node): + mention = entity.create_mention(head=node, words=[node]) + if self.infstat_attr and node.misc[self.infstat_attr]: + mention.other['infstat'] = node.misc[self.infstat_attr] + if self.delete_orig_attrs: + del node.misc[self.infstat_attr] + if self.coreftype_attr and node.misc[self.coreftype_attr]: + mention.other['coreftype'] = node.misc[self.coreftype_attr] + if self.delete_orig_attrs: + del node.misc[self.coreftype_attr] + return mention def process_document(self, doc): id2node = {} links = [] + bridges = [] for node in doc.nodes_and_empty: this_id = node.misc[self.id_attr] if this_id != '': @@ -26,6 +75,16 @@ def process_document(self, doc): if self.delete_orig_attrs: for attr in (self.id_attr, self.ante_attr): del node.misc[attr] + if self.bridge_attr: + bridge_id = node.misc[self.bridge_attr] + if bridge_id != '': + if bridge_id == this_id: + logging.warning(f"{node} has a self-reference bridging {self.bridge_attr}={bridge_id}") + else: + bridges.append([bridge_id, this_id, node.misc[self.bridge_relation_attr]]) + if self.delete_orig_attrs: + for attr in (self.bridge_attr, self.bridge_relation_attr): + del node.misc[attr] # It seems faster&simpler to process the links in any 
order and implement entity merging, # rather than trying to sort the links so that no entity merging is needed. @@ -36,14 +95,9 @@ def process_document(self, doc): ante_node, this_node = id2node[ante_id], id2node[this_id] if not this_node.coref_mentions and not ante_node.coref_mentions: # None of the nodes is part of any mention/entity. Let's create them. - entity = this_node.root.document.create_coref_entity() - m_ante = entity.create_mention(head=ante_node, words=[ante_node]) - m_this = entity.create_mention(head=this_node, words=[this_node]) - for node, mention in ((ante_node, m_ante), (this_node, m_this)): - if node.misc['information-status']: - mention.other['infstat'] = node.misc['information-status'] - if self.delete_orig_attrs: - del node.misc['information-status'] + entity = self._new_entity(this_node.root.document) + self._new_mention(entity, ante_node) + self._new_mention(entity, this_node) elif this_node.coref_mentions and ante_node.coref_mentions: # Both of the nodes are part of mentions in different entities. # Let's merge the two entities (i.e. "steal" all mentions from the "ante" entity to "this" entity). @@ -59,6 +113,25 @@ def process_document(self, doc): else: # Only one of the nodes is part of an entity. Let's add the second one to this entity. 
if ante_node.coref_mentions: - ante_node.coref_entities[0].create_mention(head=this_node, words=[this_node]) + self._new_mention(ante_node.coref_entities[0], this_node) else: - this_node.coref_entities[0].create_mention(head=ante_node, words=[ante_node]) + self._new_mention(this_node.coref_entities[0], ante_node) + + # Bridging + for ante_id, this_id, relation in bridges: + if ante_id not in id2node: + logging.warning(f"{ante_id} is referenced in {self.bridge_attr}, but not in {self.id_attr}") + else: + ante_node, this_node = id2node[ante_id], id2node[this_id] + if ante_node.coref_mentions: + m_ante = next(m for m in ante_node.coref_mentions if m.head is ante_node) + e_ante = m_ante.entity + else: + e_ante = self._new_entity(ante_node.root.document) + m_ante = self._new_mention(e_ante, ante_node) + if this_node.coref_mentions: + m_this = next(m for m in this_node.coref_mentions if m.head is this_node) + else: + e_this = self._new_entity(this_node.root.document) + m_this = self._new_mention(e_this, this_node) + m_this.bridging.append((e_ante, relation)) diff --git a/udapi/block/corefud/markpairs.py b/udapi/block/corefud/markpairs.py new file mode 100644 index 00000000..cc63b387 --- /dev/null +++ b/udapi/block/corefud/markpairs.py @@ -0,0 +1,138 @@ +from udapi.core.block import Block +import udapi.core.coref +import itertools +from collections import Counter +import logging + +class MarkPairs(Block): + """Find pairs of coreference mentions within the same sentence with given properties. + Mark these pairs of mentions (using `misc["Mark"]`), so they can be further + processed or printed. 
+ + Usage: + # Find pairs of mentions of the same entity within the same sentence: + cat my.conllu | udapy -TM corefud.MarkPairs same_entity=1 | less -R + + Properties: + same_entity - both mentions belong to the same entity (cluster) + both_continuous - both mentions have continuous spans + both_discontinuous - both mentions have discontinuous spans + nested - span of one mention is nested (a subset of) in the span of the other mention + crossing - spans are crossing (i.e. intersecting, but neither is subset of the other) + interleaved - spans are interleaved (i.e. not intersecting, but neither span precedes the other) + same_head - the same node is a head of both mentions + same_span - both mentions have the same span (which is invalid according to UD's validate.py) + same_subspan - at least one of the mentions is discontinuous and one of its subspans + is also a subspan (or span) of the other mention + + + You can combine any number of properties. + Each property can have one of the three values: + include - this is the default value: include pairs with this property, i.e. ignore the property + exclude - exclude (from the marking) pairs of mentions with this property + only - pairs of mentions without this property will be excluded + + As a shortcut, you can use -1 and 1 instead of exclude and only, so e.g. 
+ nested=only same_head=exclude + can be written as + nested=1 same_head=-1 + """ + + def __init__(self, same_entity=0, both_continuous=0, both_discontinuous=0, + nested=0, crossing=0, interleaved=0, + same_head=0, same_span=0, same_subspan=0, + print_form=False, print_total=True, log=True, mark=True, **kwargs): + super().__init__(**kwargs) + + + self.same_entity = self._convert(same_entity) + self.both_continuous = self._convert(both_continuous) + self.both_discontinuous = self._convert(both_discontinuous) + self.nested = self._convert(nested) + self.crossing = self._convert(crossing) + self.interleaved = self._convert(interleaved) + self.same_head = self._convert(same_head) + self.same_span = self._convert(same_span) + self.same_subspan = self._convert(same_subspan) + + self.print_form = print_form + self.print_total = print_total + self.log = log + self.mark = mark + self.counter = Counter() + + def _convert(self, value): + if value in {-1, 0, 1}: + return value + if value == 'include': + return 0 + if value == 'only': + return 1 + if value == 'exclude': + return -1 + raise ValueError('unknown value ' + value) + + def _ok(self, condition, value): + if value == 0: + return True + return (condition and value == 1) or (not condition and value==-1) + + def _print(self, mention): + if self.print_form: + return mention.entity.eid + ':' + ' '.join([w.form for w in mention.words]) + else: + return mention.entity.eid + ':' + mention.span + + def process_tree(self, tree): + mentions = set() + for node in tree.descendants_and_empty: + for m in node.coref_mentions: + mentions.add(m) + self.counter['mentions'] += len(mentions) + + for mA, mB in itertools.combinations(mentions, 2): + self.counter['pairs'] += 1 + if not self._ok(mA.entity == mB.entity, self.same_entity): + continue + if not self._ok(mA.head == mB.head, self.same_head): + continue + + if self.both_continuous or self.both_discontinuous or self.same_span or self.same_subspan: + sA, sB = mA.span, mB.span + cA, cB 
= ',' not in sA, ',' not in sB + if not self._ok(cA and cB, self.both_continuous): + continue + if not self._ok(not cA and not cB, self.both_discontinuous): + continue + if not self._ok(sA == sB, self.same_span): + continue + if not self._ok(set(sA.split(',')).intersection(set(sB.split(','))), self.same_subspan): + continue + + if self.nested or self.crossing or self.interleaved: + wA, wB = set(mA.words), set(mB.words) + if not self._ok(wA <= wB or wB <= wA, self.nested): + continue + if not self._ok(wA.intersection(wB) and not wA <= wB and not wB <= wA, self.crossing): + continue + if self.interleaved: + a_precedes_b = mA.words[0] < mB.words[0] and mA.words[-1] < mB.words[0] + b_precedes_a = mB.words[0] < mA.words[0] and mB.words[-1] < mA.words[0] + if not self._ok(not wA.intersection(wB) and not a_precedes_b and not b_precedes_a, self.interleaved): + continue + + self.counter['matching'] += 1 + if self.mark: + for w in mA.words + mB.words: + w.misc['Mark'] = 1 + mA.words[0].misc['Mark'] = f"{self._print(mA)}+{self._print(mB)}" + if self.log: + logging.info(f"Found mentions at {tree.sent_id}: {self._print(mA)} + {self._print(mB)}") + + def after_process_document(self, doc): + if self.print_total: + #if self.max_trees and seen_trees > self.max_trees: + # print(f'######## Only first {self.max_trees} matching mentions printed. 
Use max_trees=0 to see all.') + msg = f'######## Mentions = {self.counter["mentions"]}, matching/all pairs = {self.counter["matching"]} / {self.counter["pairs"]}' + logging.info(msg) + doc.meta["corefud.MarkPairs"] = msg diff --git a/udapi/block/corefud/stats.py b/udapi/block/corefud/stats.py index e05815a6..5368cabc 100644 --- a/udapi/block/corefud/stats.py +++ b/udapi/block/corefud/stats.py @@ -4,13 +4,16 @@ class Stats(Block): """Block corefud.Stats prints various coreference-related statistics.""" - def __init__(self, m_len_max=5, e_len_max=5, report_mentions=True, report_entities=True, - report_details=True, selected_upos='NOUN PRON PROPN DET ADJ VERB ADV NUM', + def __init__(self, m_len_max=5, e_len_max=5, + report_basics=False, report_mentions=True, report_entities=True, + report_details=True, selected_upos='NOUN PRON PROPN DET ADJ VERB ADV NUM _', exclude_singletons=False, exclude_nonsingletons=False, style='human', - per_doc=False, max_rows_per_page=50, **kwargs): + per_doc=False, max_rows_per_page=50, docname='newdoc', docname_len=15, + **kwargs): super().__init__(**kwargs) self.m_len_max = m_len_max self.e_len_max = e_len_max + self.report_basics = report_basics self.report_mentions = report_mentions self.report_entities = report_entities self.report_details = report_details @@ -21,6 +24,10 @@ def __init__(self, m_len_max=5, e_len_max=5, report_mentions=True, report_entiti raise ValueError(f'Unknown style {style}') self.per_doc = per_doc self.max_rows_per_page = max_rows_per_page + if docname not in 'newdoc filename'.split(): + raise ValueError(f'Unknown docname {docname}') + self.docname = docname + self.docname_len = docname_len self._header_printed = False self._lines_printed = None @@ -75,6 +82,12 @@ def process_document(self, doc): heads += 0 if any(d['parent'] in mwords for d in w.deps) else 1 self.counter['m_nontreelet'] += 1 if heads > 1 else 0 + if self.report_basics: + for tree in doc.trees: + self.counter['newdocs'] += 1 if tree.newdoc else 0 + 
self.counter['sents'] += 1 + self.counter['words'] += len(tree.descendants) + self.counter['empty'] += len(tree.empty_nodes) def after_process_document(self, doc): if self.per_doc: @@ -97,7 +110,8 @@ def process_end(self, skip=True, doc=None): self.print_footer() return else: - print(f"{doc[0].trees[0].newdoc:15}", end='&' if self.style.startswith('tex') else '\n') + docname = doc.meta['loaded_from'] if self.docname == 'filename' else doc[0].trees[0].newdoc + print(f"{docname:{self.docname_len}}", end='&' if self.style.startswith('tex') else '\n') elif self.style.startswith('tex-'): print(f"{self.counter['documents']:4} documents &") self._lines_printed += 1 @@ -107,6 +121,11 @@ def process_end(self, skip=True, doc=None): total_nodes_nonzero = 1 if self.total_nodes == 0 else self.total_nodes columns =[ ] + if self.report_basics: + columns += [('docs', f"{self.counter['newdocs']:7,}"), + ('sents', f"{self.counter['sents']:7,}"), + ('words', f"{self.counter['words']:7,}"), + ('empty', f"{self.counter['empty']:7,}"),] if self.report_entities: columns += [('entities', f"{self.entities:7,}"), ('entities_per1k', f"{1000 * self.entities / total_nodes_nonzero:6.0f}"), @@ -156,7 +175,15 @@ def print_header(self): print(r'\title{Udapi coreference statistics}') print(r'\begin{document}') print(r'\def\MC#1#2{\multicolumn{#1}{c}{#2}}') - lines = [r'\begin{mypage}\begin{tabular}{@{}l ', " "*15, ("document" if self.per_doc else "dataset ") + " "*7, " "*15] + lines = [r'\begin{mypage}\begin{tabular}{@{}l ', + " " * self.docname_len, + ("document" if self.per_doc else "dataset ") + " " * (self.docname_len-8), + " " * self.docname_len] + if self.report_basics: + lines[0] += "rrrr " + lines[1] += r'& \MC{4}{total number of} ' + lines[2] += r'& & & & ' + lines[3] += r'& docs & sents & words & empty n.' 
if self.report_entities: lines[0] += "rrrr " lines[1] += r'& \MC{4}{entities} ' @@ -199,10 +226,13 @@ def print_header(self): lines[1] += r'\\' lines[2] += r'\\' lines[3] += r'\\\midrule' - if self.report_entities: + if self.report_basics: last_col += 4 lines[1] += r'\cmidrule(lr){2-5}' - lines[2] += r'\cmidrule(lr){4-5}' + if self.report_entities: + lines[1] += r'\cmidrule(lr){' + f"{last_col+1}-{last_col+4}" + '}' + lines[2] += r'\cmidrule(lr){' + f"{last_col+3}-{last_col+4}" + '}' + last_col += 4 if self.e_len_max: last_col += self.e_len_max lines[1] += r'\cmidrule(lr){6-' + str(last_col) + '}' diff --git a/udapi/block/eval/f1.py b/udapi/block/eval/f1.py index ca5510e4..e4889770 100644 --- a/udapi/block/eval/f1.py +++ b/udapi/block/eval/f1.py @@ -126,7 +126,7 @@ def process_tree(self, tree): i, j, c, un_pred, un_gold, common = 0, 0, 0, [], [], [] while i < len(pred_tokens) and j < len(gold_tokens): if c == len(nf_common): - common += find_lcs(pred_tokens[i+1:], gold_tokens[j+1:]) + common += find_lcs(pred_tokens[i:], gold_tokens[j:]) break while nf_common[c] != pred_tokens[i]: un_pred.append(pred_tokens[i]) @@ -156,6 +156,13 @@ def process_tree(self, tree): self._pred[x] += 1 self._total[x] += 1 + @property + def f1(self): + pred, gold = self.pred or 1, self.gold or 1 # prevent division by zero + precision = self.correct / pred + recall = self.correct / gold + return 2 * precision * recall / ((precision + recall) or 1) + def process_end(self): # Redirect the default filehandle to the file specified by self.files self.before_process_document(None) diff --git a/udapi/block/msf/case.py b/udapi/block/msf/case.py new file mode 100644 index 00000000..7d362c7f --- /dev/null +++ b/udapi/block/msf/case.py @@ -0,0 +1,448 @@ +""" +Morphosyntactic features (UniDive): +Derive a MS Case feature from morphological case and adposition. 
+""" +from udapi.core.block import Block +import logging + +class Case(Block): + + adposmap = { + 'v+Loc': 'Ine', + 'uvnitř+Gen': 'Ine', + 'uvnitř+': 'Ine', + 'mezi_uvnitř+Gen': 'Ine', # annotation error? + 'uprostřed+Gen': 'Ces', + 'mezi+Ins': 'Int', + 'mezi+Nom': 'Int', # annotation error + 'mezi+Voc': 'Int', # annotation error + 'vně+Gen': 'Ext', + 'stranou+Gen': 'Ext', + 'stranou+Dat': 'Ext', + 'na+Loc': 'Ade', + 'na_mimo+Loc': 'Ade', # annotation error? + 'na_úroveň+Gen': 'Ade', + 'na_úroveň+': 'Ade', + 'v_proces+Gen': 'Ade', # ??? + 'v_rámec+Gen': 'Ade', # ??? + 'v_rámec+': 'Ade', # ??? + 'v_řada+Gen': 'Ade', # ??? + 'z_oblast+Gen': 'Ade', # ??? + 'vedle+Gen': 'Apu', + 'u+Gen': 'Chz', + 'kolem+Gen': 'Cir', + 'kol+Gen': 'Cir', + 'dokola+Gen': 'Cir', + 'okolo+Gen': 'Cir', + 'v_oblast+Gen': 'Cir', + 'v_oblast+': 'Cir', + 'blízko+Dat': 'Prx', + 'blízko+Gen': 'Prx', + 'blízko+': 'Prx', + 'nedaleko+Gen': 'Prx', + 'daleko+Gen': 'Prx', # lemma of 'nedaleko' + 'poblíž+Gen': 'Prx', + 'daleko_od+Gen': 'Dst', + 'nad+Ins': 'Sup', + 'pod+Ins': 'Sub', + 'vespod+Gen': 'Sub', + 'před+Ins': 'Ant', + 'vpředu+Gen': 'Ant', + 'na_čelo+Gen': 'Ant', + 'v_čelo+Gen': 'Ant', + 'v_čelo+': 'Ant', + 'za+Ins': 'Pst', + 'naproti+Dat': 'Opp', + 'od+Gen': 'Abl', + 'od+Dat': 'Abl', # annotation error + 'směr_od+Gen': 'Abl', + 'z_strana+Gen': 'Abl', + 'z_strana+': 'Abl', + 'z+Gen': 'Ela', + 'z+Nom': 'Ela', # annotation error + 'z+Dat': 'Ela', # annotation error + 'zevnitř+Gen': 'Ela', + 'zprostřed+Gen': 'Cne', + 's+Gen': 'Del', + 'zpod+Gen': 'Sbe', + 'zpoza+Gen': 'Pse', + 'po+Loc': 'Per', + 'cesta+Gen': 'Per', + 'cesta+Ins': 'Per', + 'napříč+Gen': 'Crs', + 'napříč+Ins': 'Crs', + 'podél+Gen': 'Lng', + 'skrz+Acc': 'Inx', + 'přes+Acc': 'Spx', + 'přes+Nom': 'Spx', # annotation error + 'ob+Acc': 'Cix', + 'po+Acc': 'Ter', + 'po+Nom': 'Ter', # annotation error + 'po+Gen': 'Ter', # annotation error + 'do+Gen': 'Ill', + 'do+Acc': 'Ill', # annotation error + 'do_/+Gen': 'Ill', + 'dovnitř+Gen': 'Ill', + 
'doprostřed+Gen': 'Cnl', + 'mezi+Acc': 'Itl', + 'na+Acc': 'All', + 'na+Nom': 'All', # annotation error + 'na+Gen': 'All', # annotation error + 'k+Dat': 'Apl', + 'k+Nom': 'Apl', # annotation error + 'vstříc+Dat': 'Apl', + 'do_oblast+Gen': 'Apl', + 'směr+': 'Apl', + 'směr_k+Dat': 'Apl', + 'směr_k+': 'Apl', + 'směr_na+Acc': 'Apl', + 'v_směr_k+Dat': 'Apl', + 'nad+Acc': 'Spl', + 'nad+Nom': 'Spl', # annotation error + 'pod+Acc': 'Sbl', + 'před+Acc': 'Anl', + 'před+Gen': 'Anl', # annotation error + 'za+Acc': 'Psl', + 'dík_za+Acc': 'Psl', # annotation error? + 'dokud': 'Tan', + 'nežli': 'Tan', + 'v+Acc': 'Tem', + 'v+Nom': 'Tem', # annotation error + 'v+Gen': 'Tem', # annotation error + 'při_příležitost+Gen': 'Tem', + 'současně_s+Ins': 'Tem', + 'u_příležitost+Gen': 'Tem', + 'v_období+Gen': 'Tpx', + 'počátkem+Gen': 'Din', + 'počátek+Gen': 'Din', + 'počínat+Ins': 'Din', + 'počínat+': 'Din', + 'začátkem+Gen': 'Din', + 'začátek+Gen': 'Din', + 'během+Gen': 'Dur', + 'postupem+Gen': 'Dur', + 'postup+Gen': 'Dur', + 'při+Loc': 'Dur', + 'v_průběh+Gen': 'Dur', + 'za+Gen': 'Der', + 'koncem+Gen': 'Dtr', + 'konec+Gen': 'Dtr', + 'k_konec+Gen': 'Dtr', + 'končit+Ins': 'Dtr', + 'závěrem+Gen': 'Dtr', + 'závěr+Gen': 'Dtr', + 'na_závěr+Gen': 'Dtr', + 'v_závěr+Gen': 'Dtr', + 'jakmile': 'Tps', + 'jen_co': 'Tps', + 'před_po+Loc': 'Tps', + 'počínaje+Ins': 'Teg', + 'jménem+Nom': 'Atr', + 'jméno+Nom': 'Atr', + 'zdali': 'Atr', + 'že': 'Atr', + 'z_řada+Gen': 'Gen', + 's+Ins': 'Com', + 's+Nom': 'Com', # annotation error + 'spolu_s+Ins': 'Com', + 'spolu_s+': 'Com', + 'společně_s+Ins': 'Com', + 'společně_s+': 'Com', + 'v_čelo_s+Ins': 'Com', + 'v_spolupráce_s+Ins': 'Com', + 'bez+Gen': 'Abe', + 'včetně+Gen': 'Inc', + 'nad_rámec+Gen': 'Add', + 'kromě+Gen': 'Exc', + 'krom+Gen': 'Exc', + 'mimo+Acc': 'Exc', + 'mimo+Gen': 'Exc', + 'vyjma+Gen': 'Exc', + 'až_na+Acc': 'Exc', + 's_výjimka+Gen': 'Exc', + 's_výjimka+': 'Exc', + 'místo+Gen': 'Sbs', + 'místo+Ins': 'Sbs', # něčím místo něčím jiným + 'místo+Loc': 'Sbs', # 
annotation error + 'místo_do+Gen': 'Sbs', + 'místo_k+Dat': 'Sbs', + 'místo_na+Acc': 'Sbs', + 'místo_na+': 'Sbs', + 'místo_po+Loc': 'Sbs', + 'místo_v+Acc': 'Sbs', + 'místo_v+': 'Sbs', + 'místo_za+Acc': 'Sbs', + 'namísto+Gen': 'Sbs', + 'namísto_do+Gen': 'Sbs', + 'v_zastoupení+Gen': 'Sbs', + 'výměna_za+Acc': 'Sbs', + 'jako': 'Ess', + 'jako+': 'Ess', + 'jako+Nom': 'Ess', + 'jako+Acc': 'Ess', + 'jako+Dat': 'Ess', + 'jako_u+Gen': 'Ess', + 'jako_v+Loc': 'Ess', + 'formou+Gen': 'Ess', + 'forma+Gen': 'Ess', + 'v_forma+Gen': 'Ess', + 'v_podoba+Gen': 'Ess', + 'v_podoba+': 'Ess', + 'shoda+Gen': 'Equ', + 'v_shoda_s+Ins': 'Equ', + 'do_soulad_s+Ins': 'Sem', + 'na_způsob+Gen': 'Sem', + 'po_vzor+Gen': 'Sem', + 'úměrně+Dat': 'Sem', + 'úměrně_k+Dat': 'Sem', + 'úměrně_s+Ins': 'Sem', + 'v_analogie_s+Ins': 'Sem', + 'v_duch+Gen': 'Sem', + 'v_smysl+Gen': 'Sem', + 'oproti+Dat': 'Dsm', + 'na_rozdíl_od+Gen': 'Dsm', + 'na_rozdíl_od+': 'Dsm', + 'než': 'Cmp', + 'než+Nom': 'Cmp', + 'než+Gen': 'Cmp', + 'než+Acc': 'Cmp', + 'než_nad+Ins': 'Cmp', + 'než_v+Acc': 'Cmp', + 'než_v+Loc': 'Cmp', + 'v_poměr_k+Dat': 'Cmp', + 'v_poměr_k+': 'Cmp', + 'v_porovnání_k+Dat': 'Cmp', + 'v_porovnání_s+Ins': 'Cmp', + 'v_porovnání_s+': 'Cmp', + 'v_srovnání_s+Ins': 'Cmp', + 'v_srovnání_s+': 'Cmp', + 'o+Acc': 'Dif', + 'o+Nom': 'Dif', # annotation error + 'o+Gen': 'Dif', # annotation error + 'o+Dat': 'Dif', # annotation error + 'o_o+Acc': 'Dif', # annotation error + 'kdežto': 'Cmt', + 'přičemž': 'Cmt', + 'zatímco': 'Cmt', + 'díky+Dat': 'Cau', + 'dík+Dat': 'Cau', + 'kvůli+Dat': 'Cau', + 'vinou+Gen': 'Cau', + 'vlivem+Gen': 'Cau', + 'vliv+Gen': 'Cau', + 'vliv+': 'Cau', + 'vinou+Gen': 'Cau', + 'vina+Gen': 'Cau', + 'zásluhou+Gen': 'Cau', + 'zásluha+Gen': 'Cau', + 'z_důvod+Gen': 'Cau', + 'v_důsledek+Gen': 'Cau', + 'jelikož': 'Cau', + 'ježto': 'Cau', + 'poněvadž': 'Cau', + 'protože': 'Cau', + 'takže': 'Cau', + 'následek+Gen': 'Cau', + 'aby': 'Pur', + 'jméno+Gen': 'Pur', + 'pro_případ+Gen': 'Pur', + 'v_jméno+Gen': 'Pur', + 
'v_zájem+Gen': 'Pur', + 'za_účel+Gen': 'Pur', + 'na_základ+Gen': 'Cns', + 'pod_vliv+Gen': 'Cns', + 's_ohled_na+Acc': 'Cns', + 's_přihlédnutí_k+Dat': 'Cns', + 's_přihlédnutí_na+Acc': 'Cns', + 'v_souvislost_s+Ins': 'Cns', + 'v_souvislost_s+': 'Cns', + 'v_světlo+Gen': 'Cns', + 'vzhledem_k+Dat': 'Cns', + 'v_soulad_s+Ins': 'Cns', + 'v_soulad_s+': 'Cns', + 'z_titul+Gen': 'Cns', + 'ať': 'Ign', + 'bez_ohled_na+Acc': 'Ign', + 'nehledě_k+Dat': 'Ign', + 'nehledě_na+Acc': 'Ign', + 'navzdory+Dat': 'Ccs', + 'vzdor+Dat': 'Ccs', + 'v_rozpor_s+Ins': 'Ccs', + 'ač': 'Ccs', + 'ačkoli': 'Ccs', + 'byť': 'Ccs', + 'přestože': 'Ccs', + 'třebaže': 'Ccs', + 'jestli': 'Cnd', + 'jestliže': 'Cnd', + 'ledaže': 'Cnd', + 'li': 'Cnd', + 'pakliže': 'Cnd', + 'pokud': 'Cnd', + 'pokud+Nom': 'Cnd', + 'zda': 'Cnd', + 'v_případ+Gen': 'Cnd', + 'v_případ+': 'Cnd', + 'v_závislost_na+Loc': 'Cnd', + 'v_závislost_s+Ins': 'Cnd', + 'o+Loc': 'The', + 'ohledně+Gen': 'The', + 'stran+Gen': 'The', + 'co_do+Gen': 'The', + 'na_téma+Gen': 'The', + 'na_téma+Nom': 'The', + 'na_téma+': 'The', + 'na_úsek+Gen': 'The', + 'po_stránka+Gen': 'The', + 'v_obor+Gen': 'The', + 'v_otázka+Gen': 'The', + 'v_spojení_s+Ins': 'The', + 'v_věc+Gen': 'The', + 'v_vztah_k+Dat': 'The', + 'v_vztah_k+': 'The', + 'v_záležitost+Gen': 'The', + 'v_znamení+Gen': 'The', + 'z_hledisko+Gen': 'The', + 'z_hledisko+': 'The', + 'podle+Gen': 'Quo', + 'dle+Gen': 'Quo', + 'pomocí+Gen': 'Ins', + 's_pomoc+Gen': 'Ins', + 'prostřednictvím+Gen': 'Ins', + 'prostřednictví+Gen': 'Ins', + 'prostřednictví+Ins': 'Ins', # annotation error + 'prostřednictví+': 'Ins', + 'za_pomoc+Gen': 'Ins', + 'pro+Acc': 'Ben', + 'pro+Nom': 'Ben', # annotation error + 'pro+Gen': 'Ben', # annotation error + 'pro+Ins': 'Ben', # annotation error + 'napospas+Dat': 'Ben', + 'k_prospěch+Gen': 'Ben', + 'na_úkor+Gen': 'Ben', + 'na_vrub+Gen': 'Ben', + 'v_prospěch+Gen': 'Ben', + 'v_neprospěch+Gen': 'Ben', + 'v_služba+Gen': 'Ben', + 'proti+Dat': 'Adv', + 'proti+Gen': 'Adv', + 'kontra+Nom': 'Adv', + 
'versus+Nom': 'Adv', + 'vůči+Dat': 'Adv', + # subordinators + 'dokud': 'Tan', + 'nežli': 'Tan', + 'jakmile': 'Tps', + 'jen_co': 'Tps', + 'zdali': 'Atr', + 'že': 'Atr', + 'jako': 'Ess', + 'než': 'Cmp', + 'kdežto': 'Cmt', + 'přičemž': 'Cmt', + 'zatímco': 'Cmt', + 'jelikož': 'Cau', + 'ježto': 'Cau', + 'poněvadž': 'Cau', + 'protože': 'Cau', + 'takže': 'Cau', + 'aby': 'Pur', + 'ať': 'Ign', + 'ač': 'Ccs', + 'ačkoli': 'Ccs', + 'byť': 'Ccs', + 'přestože': 'Ccs', + 'třebaže': 'Ccs', + 'jestli': 'Cnd', + 'jestliže': 'Cnd', + 'ledaže': 'Cnd', + 'li': 'Cnd', + 'pakliže': 'Cnd', + 'pokud': 'Cnd', + 'zda': 'Cnd', + # coordinators + 'a': 'Conj', + 'i': 'Conj', + 'ani': 'Nnor', + 'nebo': 'Disj', + 'či': 'Disj', + 'ale': 'Advs', + 'avšak': 'Advs', + 'však': 'Advs', + 'nýbrž': 'Advs', + 'neboť': 'Reas', + 'tedy': 'Cnsq', + 'tak': 'Cnsq' + } + + def process_node(self, node): + """ + Derives a case value from preposition and morphological case. Stores it + as MSFCase in MISC. + """ + # Do not do anything for function words. + # Specifically for Case, also skip 'det' and 'amod' modifiers (congruent attributes) + # because their Case is only agreement feature inherited from the head noun. + if node.udeprel in ['case', 'mark', 'cc', 'aux', 'cop', 'punct']: + node.misc['MSFFunc'] = 'Yes' + return + elif node.udeprel in ['det', 'amod']: + node.misc['MSFFunc'] = 'No' + return + else: + node.misc['MSFFunc'] = 'No' + # Get all case markers (adpositions) attached to the current node. + adpositions = [] + for c in node.children: + if c.udeprel == 'case': + lemma = c.lemma + # If it has outgoing 'fixed' relations, it is a multiword adposition. + fixedchildren = [x.lemma for x in c.children if x.udeprel == 'fixed'] + if fixedchildren: + lemma += '_' + '_'.join(fixedchildren) + adpositions.append(lemma) + # We assume that all features were copied from FEATS to MISC in mwe.MsfInit. + # They may have been further processed there, so we take the input from there. 
+ msfcase = node.misc['MSFCase'] + if adpositions: + adpostring = '_'.join(adpositions) + caseadpostring = adpostring + '+' + msfcase + if caseadpostring in self.adposmap: + msfcase = self.adposmap[caseadpostring] + else: + logging.warn(f"No Case value found for '{caseadpostring}'.") + msfcase = caseadpostring + # Omer wants to collect cases from both adpositions and subordinators + # but we will consider subordinators only if we do not have any case + # from morphology or adpositions. + if not msfcase: + subordinators = [] + for c in node.children: + if c.udeprel == 'mark': + lemma = c.lemma + # If it has outgoing 'fixed' relations, it is a multiword adposition. + fixedchildren = [x.lemma for x in c.children if x.udeprel == 'fixed'] + if fixedchildren: + lemma += '_' + '_'.join(fixedchildren) + subordinators.append(lemma) + if subordinators: + subordstring = '_'.join(subordinators) + if subordstring in self.adposmap: + msfcase = self.adposmap[subordstring] + # To lump coordinators with all the above makes even less sense but for + # the moment we do it. + if not msfcase: + coordinators = [] + for c in node.children: + if c.udeprel == 'cc': + lemma = c.lemma + # If it has outgoing 'fixed' relations, it is a multiword adposition. 
+ fixedchildren = [x.lemma for x in c.children if x.udeprel == 'fixed'] + if fixedchildren: + lemma += '_' + '_'.join(fixedchildren) + coordinators.append(lemma) + if coordinators: + coordstring = '_'.join(coordinators) + if coordstring in self.adposmap: + msfcase = self.adposmap[coordstring] + node.misc['MSFCase'] = msfcase diff --git a/udapi/block/msf/createabstract.py b/udapi/block/msf/createabstract.py new file mode 100644 index 00000000..fbdf73e5 --- /dev/null +++ b/udapi/block/msf/createabstract.py @@ -0,0 +1,45 @@ +""" +Morphosyntactic features (UniDive): +Create abstract nodes representing dropped arguments of predicates (if verbal +morphology signals that the subject is third person singular, and there is no +subject node, create an abstract node and copy the features there). +""" +from udapi.core.block import Block +import re + +class CreateAbstract(Block): + + def process_node(self, node): + """ + If a node has MSFVerbForm=Fin and at least one of the agreement features + MSFNumber, MSFPerson, MSFGender, MSFAnimacy, MSFPolite, assume that these + features characterize the subject (this block is not suitable for languages + with polypersonal agreement). Check that the subject is present. If not, + create an abstract node to represent it. + """ + if node.misc['MSFVerbForm'] == 'Fin' and any([node.misc[x] for x in ['MSFNumber', 'MSFPerson', 'MSFGender', 'MSFAnimacy', 'MSFPolite']]): + # Current node is a finite predicate. Does it have a subject? If not, create an abstract one. + if not any([x.udeprel in ['nsubj', 'csubj'] for x in node.children]): + # There could already be an abstract subject. We have to look for it in the enhanced graph. + if not any([re.match(r"^[nc]subj", edep['deprel']) for edep in node.deps]): + # Create an abstract subject. 
+ subject = node.create_empty_child('nsubj') + subject.upos = 'PRON' + subject.feats['PronType'] = 'Prs' + subject.misc['MSFPronType'] = 'Prs' + subject.feats['Case'] = 'Nom' + subject.misc['MSFCase'] = 'Nom' + for f in ['Number', 'Person', 'Gender', 'Animacy', 'Polite']: + msf = 'MSF' + f + if node.misc[msf]: + subject.feats[f] = node.misc[msf] + subject.misc[msf] = node.misc[msf] + subject.misc['MSFFunc'] = 'No' + # Regardless of whether it had a subject or not, the agreement features + # should be removed from the verb. + ###!!! We also may want to check if the pre-existing subject has all the features. + node.misc['MSFNumber'] = '' + node.misc['MSFPerson'] = '' + node.misc['MSFGender'] = '' + node.misc['MSFAnimacy'] = '' + node.misc['MSFPolite'] = '' diff --git a/udapi/block/msf/init.py b/udapi/block/msf/init.py new file mode 100644 index 00000000..ceca12af --- /dev/null +++ b/udapi/block/msf/init.py @@ -0,0 +1,53 @@ +""" +Morphosyntactic features (UniDive): +Initialization. Copies features from FEATS as MSF* attributes to MISC. +""" +from udapi.core.block import Block +import re + +class Init(Block): + + + def process_node(self, node): + """ + For every feature in FEATS, creates its MSF* counterpart in MISC. + """ + for f in node.feats: + # Only selected features will be copied. Certain features are not + # interesting for the morphosyntactic annotation. + if f not in ['Abbr', 'AdpType', 'Emph', 'Foreign', 'NameType', 'Style', 'Typo', 'Variant']: + node.misc['MSF'+f] = node.feats[f] + # We are particularly interested in the Case feature but some nominals + # lack it (e.g. acronyms or numbers). If there is a preposition, it may + # indicate the expected case of the nominal. + if not node.feats['Case']: + # Not any 'case' dependent is helpful. Here we really need single-word + # adposition. 
+ adpositions = [x for x in node.children if x.udeprel == 'case' and x.upos == 'ADP'] + if len(adpositions) == 1: + fixed = [x for x in adpositions[0].children if x.udeprel == 'fixed'] + if not fixed and adpositions[0].feats['Case']: + node.misc['MSFCase'] = adpositions[0].feats['Case'] + # If we did not find a preposition to help us, we may be able to read + # the case off an adjectival modifier or determiner. + if not node.misc['MSFCase']: + modifiers = [x for x in node.children if x.udeprel in ['amod', 'det'] and x.feats['Case']] + if modifiers: + node.misc['MSFCase'] = modifiers[0].feats['Case'] + # Finally, if the above did not help, we may guess the case from the deprel of the node itself. + if not node.misc['MSFCase']: + if node.udeprel == 'nsubj': + node.misc['MSFCase'] = 'Nom' + elif node.udeprel == 'obj': + node.misc['MSFCase'] = 'Acc' + # If the node contains Phrase features in MISC (periphrastic verb forms + # detected by Lenka's code), replace the MS features with them. + phrasefeatures = [x for x in node.misc if re.match(r"^Phrase[A-Z]", x)] + for pf in phrasefeatures: + msf = pf + if msf == 'PhraseForm': + msf = 'MSFVerbForm' + else: + msf = re.sub(r"Phrase", 'MSF', pf) + node.misc[msf] = node.misc[pf] + node.misc[pf] = '' diff --git a/udapi/block/msf/numphrase.py b/udapi/block/msf/numphrase.py new file mode 100644 index 00000000..22f68c9d --- /dev/null +++ b/udapi/block/msf/numphrase.py @@ -0,0 +1,36 @@ +""" +Morphosyntactic features (UniDive): +Case in Number Phrases like 'pět mužů' (five men) in Czech. +""" +from udapi.core.block import Block + +class NumPhrase(Block): + + + def process_node(self, node): + """ + Nouns with a 'nummod:gov' dependent are morphologically in genitive, + but the case of the whole phrase (number + counted noun) is different, + probably nominative or accusative. 
+ """ + quantifiers = [x for x in node.children if x.deprel in ['nummod:gov', 'det:numgov']] + current_case = node.misc['MSFCase'] + if (current_case == 'Gen' or current_case == '') and quantifiers: + quantifier_case = quantifiers[0].misc['MSFCase'] + # The quantifier may lack the case feature (e.g. numbers expressed by digits) + # but we may be able to guess it from a preposition or other factors. + if quantifier_case == '': + # Not any 'case' dependent is helpful. Here we really need single-word + # adposition. + adpositions = [x for x in node.children if x.udeprel == 'case' and x.upos == 'ADP'] + if len(adpositions) == 1: + fixed = [x for x in adpositions[0].children if x.udeprel == 'fixed'] + if not fixed and adpositions[0].feats['Case']: + quantifier_case = adpositions[0].feats['Case'] + # Finally, if the above did not help, we may guess the case from the deprel of the node itself. + if quantifier_case == '': + if node.udeprel == 'nsubj': + quantifier_case = 'Nom' + elif node.udeprel == 'obj': + quantifier_case = 'Acc' + node.misc['MSFCase'] = quantifier_case diff --git a/udapi/block/msf/phrase.py b/udapi/block/msf/phrase.py new file mode 100644 index 00000000..90ea5d2d --- /dev/null +++ b/udapi/block/msf/phrase.py @@ -0,0 +1,139 @@ +""" +Morphosyntactic features (UniDive): +An abstract block as a base for derivation of blocks that discover periphrastic +verb forms and save them as Phrase features in MISC. This block provides the +methods that save the features in MISC. It is based on the Writer module by +Lenka Krippnerová. +""" +from udapi.core.block import Block +import logging + +class Phrase(Block): + + def process_node(self, node): + """ + Override this in a derived class! 
+ """ + logging.fatal('process_node() not implemented.') + + dictionary = { + 'person': 'PhrasePerson', + 'number': 'PhraseNumber', + 'mood': 'PhraseMood', + 'tense': 'PhraseTense', + 'voice': 'PhraseVoice', + 'aspect':'PhraseAspect', + 'form': 'PhraseForm', + 'reflex': 'PhraseReflex', + 'polarity': 'PhrasePolarity', + 'gender':'PhraseGender', + 'animacy':'PhraseAnimacy', + 'ords':'Phrase', + 'expl':'PhraseExpl', + } + + # a dictionary where the key is the lemma of a negative particle and the value is a list of the lemmas of their possible children that have a 'fixed' relation + # we do not want to include these negative particles in the phrase; these are expressions like "never", etc. + negation_fixed = { + # Belarusian + 'ні' : ['раз'], + 'ня' : ['толькі'], + + # Upper Sorbian + 'nic' : ['naposledku'], + + # Polish + 'nie' : ['mało'], + + # Pomak + 'néma' : ['kak'], + + # Slovenian + 'ne' : ['le'], + + # Russian and Old East Slavic + 'не' : ['то', 'токмо'], + 'ни' : ['в', 'раз', 'шатко'], + 'нет' : ['нет'] + } + + def write_node_info(self, node, + tense = None, + person = None, + number = None, + mood = None, + voice = None, + form = None, + reflex = None, + polarity = None, + ords = None, + gender = None, + animacy = None, + aspect = None, + expl=None): + arguments = locals() + del arguments['self'] # delete self and node from arguments, + del arguments['node'] # we want only grammatical categories + for key,val in arguments.items(): + if val != None: + node.misc[self.dictionary[key]] = val + + def has_fixed_children(self, node): + """ + Returns True if the node has any children with the 'fixed' relation and the node's lemma along with the child's lemma are listed in self.negation_fixed. 
+ """ + fixed_children = [x for x in node.children if x.udeprel == 'fixed'] + + if fixed_children: + if fixed_children[0].lemma in self.negation_fixed.get(node.lemma, []): + return True + return False + + def get_polarity(self, nodes): + """ + Returns 'Neg' if there is exactly one node with Polarity='Neg' among the given nodes. + Returns an empty string if there are zero or more than one such nodes. + """ + neg_count = 0 + for node in nodes: + if node.feats['Polarity'] == 'Neg': + neg_count += 1 + + if neg_count == 1: + return 'Neg' + + # neg_count can be zero or two, in either case we want to return an empty string so that the PhrasePolarity attribute is not generated + else: + return '' + + def get_negative_particles(self, nodes): + """ + Returns a list of all negative particles found among the children + of the specified nodes, except for negative particles with fixed children specified in self.negation_fixed. + """ + neg_particles = [] + for node in nodes: + neg = [x for x in node.children if x.upos == 'PART' and x.feats['Polarity'] == 'Neg' and x.udeprel == 'advmod' and not self.has_fixed_children(x)] + if neg: + neg_particles += neg + return neg_particles + + + def get_is_reflex(self,node,refl): + if node.feats['Voice'] == 'Mid': + return 'Yes' + if len(refl) == 0: + return node.feats['Reflex'] + return 'Yes' + + def is_expl_pass(self,refl): + if len(refl) == 0: + return False + return refl[0].deprel == 'expl:pass' + + def get_voice(self,node,refl): + voice = node.feats['Voice'] + if self.is_expl_pass(refl): + return 'Pass' + return voice + diff --git a/udapi/block/msf/removefunc.py b/udapi/block/msf/removefunc.py new file mode 100644 index 00000000..e169a2de --- /dev/null +++ b/udapi/block/msf/removefunc.py @@ -0,0 +1,17 @@ +""" +Morphosyntactic features (UniDive): +Cleanup. Removes MSF* features from MISC for function nodes (MSFFunc=Yes). 
+""" +from udapi.core.block import Block + +class RemoveFunc(Block): + + + def process_node(self, node): + """ + Removes MSF* features if MSFFunc=Yes. + """ + if node.misc['MSFFunc'] == 'Yes': + msfeats = [x for x in node.misc if x.startswith('MSF')] + for msf in msfeats: + node.misc[msf] = '' diff --git a/udapi/block/msf/romance/romance.py b/udapi/block/msf/romance/romance.py new file mode 100644 index 00000000..dd2393f7 --- /dev/null +++ b/udapi/block/msf/romance/romance.py @@ -0,0 +1,523 @@ + +import udapi.block.msf.phrase +from enum import Enum + +class Aspect(str, Enum): + IMP = 'Imp' + IMPPROG = 'ImpProg' + PERF = 'Perf' + PERFPROG = 'PerfProg' + PROG = 'Prog' + PQP = 'Pqp' + +class Tense(str, Enum): + FUT = 'Fut' + FUTFUT = 'FutFut' + PAST = 'Past' + PASTFUT = 'PastFut' + PASTPRES = 'PastPres' + PRES = 'Pres' + +class Romance(udapi.block.msf.phrase.Phrase): + + def process_node(self, node): + + cop = [x for x in node.children if x.udeprel == 'cop'] + + # only expl or expl:pv, no expl:impers or expl:pass + refl = [x for x in node.children if x.lemma == 'se' and x.upos == 'PRON' and x.udeprel == 'expl' and x.udeprel != 'expl:impers' and x.udeprel != 'expl:pass'] + + if refl: + expl='Pv' + else: + expl=None + + if cop: + auxes = [x for x in node.children if x.udeprel == 'aux'] + if auxes: + self.process_periphrastic_verb_forms(cop[0], auxes, refl, auxes + cop, node) + else: + # no auxiliaries, only cop + self.process_copulas(node,cop,auxes,refl,expl) + return + + if node.upos == 'VERB': + auxes = [x for x in node.children if x.udeprel == 'aux'] + aux_pass = [x for x in node.children if x.deprel == 'aux:pass'] + auxes_without_pass = [x for x in node.children if x.udeprel == 'aux' and x.deprel != 'aux:pass'] + + # infinitive with a subject is a subjunctive + subj = [x for x in node.children if x.udeprel == 'subj'] + if node.feats['VerbForm'] == 'Inf' and subj: + self.write_node_info(node, + person=node.feats['Person'], + number=node.feats['Number'], + mood='Sub', 
+ form='Fin', + tense=Tense.FUT.value, + gender=node.feats['Gender'], + voice=node.feats['Voice'], + expl=expl, + ords=[node.ord] + ) + return + + if not auxes: + phrase_ords = [node.ord] + [r.ord for r in refl] + phrase_ords.sort() + + # presente -> PhraseTense=Pres, PhraseAspect='' + # Futuro do presente -> PhraseTense=Fut, PhraseAspect='' + aspect = '' + tense = node.feats['Tense'] + + if node.feats['Mood'] == 'Ind': + + # pretérito imperfeito -> PhraseTense=Past, PhraseAspect=Imp + if node.feats['Tense'] == 'Imp': + tense=Tense.PAST.value + aspect=Aspect.IMP.value + + # pretérito perfeito -> PhraseTense=Past, PhraseAspect=Perf + if node.feats['Tense'] == 'Past': + aspect=Aspect.PERF.value + + # pretérito mais que perfeito simples -> PhraseTense=Past, PhraseAspect=Pqp + if node.feats['Tense'] == 'Pqp': + tense=Tense.PAST.value + aspect=Aspect.PQP.value + + # subjunctive presente -> PhraseTense=Pres, PhraseAspect='' + # subjunctive futuro -> PhraseTense=Fut, PhraseAspect='' + if node.feats['Mood'] == 'Sub': + + if node.feats['Tense'] == 'Past': + aspect=Aspect.IMP.value + + # subjunctive pretérito imperfeito -> PhraseTense=Past, PhraseAspect=Imp + if node.feats['Tense'] == 'Imp': + tense=Tense.PAST.value + aspect=Aspect.IMP.value + + # Futuro do pretérito (cnd) -> PhraseTense=Pres, PhraseAspect='', PhraseMood=Cnd + if node.feats['Mood'] == 'Cnd': + aspect='' + tense=Tense.PRES.value + + + self.write_node_info(node, + person=node.feats['Person'], + aspect=aspect, + number=node.feats['Number'], + mood=node.feats['Mood'], + form=node.feats['VerbForm'], + tense=tense, + gender=node.feats['Gender'], + voice=node.feats['Voice'], + expl=expl, + ords=phrase_ords + ) + + + else: + # no passive auxiliaries + if not aux_pass: + self.process_periphrastic_verb_forms(node, auxes, refl, auxes, node) + + # head verb has one passive auxiliary and no more other auxiliaries + # TODO complete the tenses and aspects for individual verb forms + elif not auxes_without_pass: + 
phrase_ords = [node.ord] + [x.ord for x in auxes] + [r.ord for r in refl] + phrase_ords.sort() + + self.write_node_info(node, + person=aux_pass[0].feats['Person'], + number=aux_pass[0].feats['Number'], + mood=aux_pass[0].feats['Mood'], + form='Fin', + tense=aux_pass[0].feats['Tense'], + gender=node.feats['Gender'], + voice='Pass', + expl=expl, + ords=phrase_ords + ) + + # head verb has passive auxiliary and also other auxiliaries + else: + self.process_periphrastic_verb_forms(aux_pass[0], auxes_without_pass, refl, auxes, node) + + + def process_periphrastic_verb_forms(self, node, auxes, refl, all_auxes, head_node): + """ + Parameters + - node: if there is no passive then the node is the head verb, if the head verb is in the passive, then the node is the passive auxiliary + - auxes: list of all auxiliaries except the passive auxes + - refl: list of reflexives which should be included into the periphrastic phrase + - all_auxes: list of all auxiliaries (passive auxes are included) + - head_node: the node which should have the Phrase* attributes, i. e. 
the head of the phrase + + annotates periphrastic verb forms with the Phrase* attributes + """ + + if refl: + expl='Pv' + else: + expl=None + + if len(auxes) == 1: + # Cnd + if ((auxes[0].lemma == 'ter' and node.feats['VerbForm'] == 'Part') or (auxes[0].lemma == 'estar' and node.feats['VerbForm'] == 'Ger')) and auxes[0].feats['Mood'] == 'Cnd': + phrase_ords = [head_node.ord] + [x.ord for x in all_auxes] + [r.ord for r in refl] + [r.ord for r in refl] + phrase_ords.sort() + + # aux estar cond + gerund -> PhraseTense=Pres, PhraseAspect=Prog, PhraseMood=Cnd + if auxes[0].lemma == 'estar': + tense=Tense.PRES.value + aspect=Aspect.PROG.value + + # Futuro do pretérito composto -> PhraseTense=Past, PhraseAspect=Perf, PhraseMood=Cnd + else: + tense=Tense.PAST.value + aspect=Aspect.PERF.value + + self.write_node_info(head_node, + tense=tense, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + aspect=aspect, + mood='Cnd', + form='Fin', + expl=expl, + voice=head_node.feats['Voice'], + ords=phrase_ords) + return + + # Auxiliary 'estar' followed by a gerund + if auxes[0].lemma == 'estar' and node.feats['VerbForm'] == 'Ger': + phrase_ords = [head_node.ord] + [x.ord for x in all_auxes] + [r.ord for r in refl] + phrase_ords.sort() + + # pretérito imperfeito (aux estar) -> PhraseTense=Past, PhraseAspect=ImpProg + # subjunctive pretérito imperfeito (aux estar) -> PhraseTense=Past, PhraseAspect=ImpProg, PhraseMood=Sub + if auxes[0].feats['Tense'] == 'Imp': + tense=Tense.PAST.value + aspect=Aspect.IMPPROG.value + + # pretérito perfeito (aux estar) -> PhraseTense=Past, PhraseAspect=PerfProg + elif auxes[0].feats['Tense'] == 'Past': + tense=Tense.PAST.value + aspect=Aspect.PERFPROG.value + + # conditional (aux estar) -> PhraseTense=Pres, PhraseAspect=Prog, PhraseMood=Cnd + elif auxes[0].feats['Mood'] == 'Cnd': + tense=Tense.PRES.value + aspect=Aspect.PROG.value + + # presente (aux estar) -> PhraseTense=Pres, PhraseAspect=Prog + # futuro do presente (aux estar) -> 
PhraseTense=Fut, PhraseAspect=Prog + # subjunctive presente (aux estar) -> PhraseTense=Pres, PhraseAspect=Prog, PhraseMood=Sub + # subjunctive futuro (aux estar) -> PhraseTense=Fut, PhraseAspect=Prog, PhraseMood=Sub + else: + tense=auxes[0].feats['Tense'] + aspect=Aspect.PROG.value + + self.write_node_info(head_node, + tense=tense, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood=auxes[0].feats['Mood'], + form='Fin', + voice=head_node.feats['Voice'], + aspect=aspect, + expl=expl, + ords=phrase_ords) + + # Auxiliary 'ter' followed by a participle + if auxes[0].lemma == 'ter' and node.feats['VerbForm'] == 'Part': + phrase_ords = [head_node.ord] + [x.ord for x in all_auxes] + [r.ord for r in refl] + phrase_ords.sort() + + # futuro do presente composto (aux ter) -> PhraseTense=Fut, PhraseAspect=Perf + aspect=Aspect.PERF.value + tense=auxes[0].feats['Tense'] + + # pretérito perfeito composto (aux ter) -> PhraseTense=PastPres, PhraseAspect=Perf + # subjonctive pretérito perfeito composto (aux ter) -> PhraseTense=PastPres, PhraseAspect=Perf, PhraseMood=Sub + if auxes[0].feats['Tense'] == 'Pres': + tense=Tense.PASTPRES.value + + # pretérito mais que perfeito composto (aux ter/haver) -> PhraseTense=Past, PhraseAspect=Pqp + # subjonctive pretérito mais-que-perfeito composto (aux ter) -> PhraseTense=Past, PhraseAspect=Pqp, PhraseMood=Sub + elif auxes[0].feats['Tense'] in ['Imp', 'Past']: # TODO prej neni v Past, jenom Imp + tense=Tense.PAST.value + aspect=Aspect.PQP.value + + self.write_node_info(head_node, + tense=tense, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood=auxes[0].feats['Mood'], + aspect=aspect, + form='Fin', + voice=head_node.feats['Voice'], + expl=expl, + ords=phrase_ords) + + if auxes[0].lemma == 'haver' and auxes[0].feats['Tense'] == 'Imp' and node.feats['VerbForm'] == 'Part': + phrase_ords = [head_node.ord] + [x.ord for x in all_auxes] + [r.ord for r in refl] + phrase_ords.sort() + + 
self.write_node_info(head_node, + tense=Tense.PAST.value, + aspect=Aspect.PERF.value, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood=auxes[0].feats['Mood'], + form='Fin', + voice=head_node.feats['Voice'], + expl=expl, + ords=phrase_ords) + + if auxes[0].lemma == 'vir' and auxes[0].feats['Tense'] in ['Pres', 'Imp', 'Past'] and node.feats['VerbForm'] == 'Ger': + phrase_ords = [head_node.ord] + [x.ord for x in all_auxes] + [r.ord for r in refl] + phrase_ords.sort() + + # aux Pres (vir) + gerund -> PhraseTense=PastPres, PraseAspect=Prog + if auxes[0].feats['Tense'] == 'Pres': + tense=Tense.PASTPRES.value + + + elif auxes[0].feats['Tense'] in ['Imp', 'Past']: + tense=Tense.PAST.value + + self.write_node_info(head_node, + tense=tense, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood=auxes[0].feats['Mood'], + form='Fin', + aspect=Aspect.PROG.value, + voice=head_node.feats['Voice'], + expl=expl, + ords=phrase_ords) + + + # auxiliary 'ir' followed by infinitive + if auxes[0].lemma == 'ir' and node.feats['VerbForm'] == 'Inf': + phrase_ords = [head_node.ord] + [x.ord for x in all_auxes] + [r.ord for r in refl] + phrase_ords.sort() + + tense=node.feats['Tense'] + aspect='' + + # Futuro perifrástico -> PhraseTense=Fut, PhraseAspect='' + if auxes[0].feats['Tense'] == 'Pres': + tense=Tense.FUT.value + aspect='' + + # Futuro perifrástico passado imp -> PhraseTense=PastFut, PhraseAspect=Imp + elif auxes[0].feats['Tense'] == 'Imp': + tense=Tense.PASTFUT.value + aspect=Aspect.IMP.value + + # Futuro perifrástico in the future -> PhraseTense=FutFut, PhraseAspect='' + elif auxes[0].feats['Tense'] == 'Fut': + tense=Tense.FUTFUT.value + aspect='' + + # Futuro perifrástico passado perf -> PhraseTense=PastFut, PhraseAspect=Perf + elif auxes[0].feats['Tense'] == 'Past': + tense=Tense.PASTFUT.value + aspect=Aspect.PERF.value + + + + self.write_node_info(head_node, + tense=tense, + aspect=aspect, + number=auxes[0].feats['Number'], + 
person=auxes[0].feats['Person'], + mood=auxes[0].feats['Mood'], + form='Fin', + voice=head_node.feats['Voice'], + expl=expl, + ords=phrase_ords) + + if auxes[0].lemma == 'ir' and node.feats['VerbForm'] == 'Ger': + phrase_ords = [head_node.ord] + [x.ord for x in all_auxes] + [r.ord for r in refl] + phrase_ords.sort() + + # aux Pres (ir) + gerund -> PhraseTense=Pres, PhraseAspect=Prog + tense = auxes[0].feats['Tense'] + aspect = Aspect.PROG.value + + # aux Imp (ir) + gerund -> PhraseTense=Past, PhraseAspect=Prog + if auxes[0].feats['Tense'] == 'Imp': + tense=Tense.PAST.value + aspect=Aspect.PROG.value + + self.write_node_info(head_node, + tense=tense, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood=auxes[0].feats['Mood'], + aspect=aspect, + form='Fin', + voice=head_node.feats['Voice'], + expl=expl, + ords=phrase_ords) + + elif len(auxes) == 2: + # auxiliry 'ir' followed by auxiliary 'estar' in infinitive and a gerund + if auxes[0].lemma == 'ir' and auxes[1].lemma == 'estar' and node.feats['VerbForm'] == 'Ger': + phrase_ords = [head_node.ord] + [x.ord for x in all_auxes] + [r.ord for r in refl] + phrase_ords.sort() + + # Futuro perifrástico -> PhraseTense=Fut, PhraseAspect=Prog + if auxes[0].feats['Tense'] == 'Pres': + tense=Tense.FUT.value + aspect=Aspect.PROG.value + + # Futuro perifrástico passado imp -> PhraseTense=PastFut, PhraseAspect=ImpProg + if auxes[0].feats['Tense'] == 'Imp': + tense=Tense.PASTFUT.value + aspect=Aspect.IMPPROG.value + + # Futuro perifrástico in the future -> PhraseTense=FutFut, PhraseAspect=Prog + if auxes[0].feats['Tense'] == 'Fut': + tense=Tense.FUTFUT.value + aspect=Aspect.PROG.value + + if auxes[0].feats['Tense'] == 'Past': + tense=Tense.PASTFUT.value + aspect=Aspect.PERFPROG.value + + self.write_node_info(head_node, + tense=tense, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood=auxes[0].feats['Mood'], + form='Fin', + aspect=aspect, + voice=head_node.feats['Voice'], + expl=expl, 
+ ords=phrase_ords) + + # auxiliriy 'ir' in present or future tense followed by auxiliary 'ter' in infinitive and a participle + if auxes[0].lemma == 'ir' and (auxes[0].feats['Tense'] in ['Pres', 'Fut']) and auxes[1].lemma == 'ter' and node.feats['VerbForm'] == 'Part': + phrase_ords = [head_node.ord] + [x.ord for x in all_auxes] + [r.ord for r in refl] + phrase_ords.sort() + + # Futuro perifrástico -> PhraseTense=FutFut, PhraseAspect=Perf + if auxes[0].feats['Tense'] == 'Fut': + tense=Tense.FUTFUT.value + aspect=Aspect.PERF.value + + # aux Pres (ir) + aux ter inf + pp -> PhraseTense=Fut, PhraseAspect=Perf + if auxes[0].feats['Tense'] == 'Pres': + tense=Tense.FUT.value + aspect=Aspect.PERF.value + + self.write_node_info(head_node, + tense=tense, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood=auxes[0].feats['Mood'], + aspect=aspect, + form='Fin', + voice=head_node.feats['Voice'], + expl=expl, + ords=phrase_ords) + + + + # Cnd (only ter), Sub and Past,Pres,Fut tenses: 2 auxes - ter + estar + if auxes[0].lemma in ['ter', 'haver'] and auxes[1].lemma == 'estar' and node.feats['VerbForm'] == 'Ger': + phrase_ords = [head_node.ord] + [x.ord for x in all_auxes] + [r.ord for r in refl] + phrase_ords.sort() + + tense = auxes[0].feats['Tense'] + aspect = Aspect.PERFPROG.value + + # aux ter cond + estar pp + gerund -> PhraseTense=Past, PhraseAspect=Prog, PhraseMood=Cnd + if auxes[0].feats['Mood'] == 'Cnd': + tense=Tense.PAST.value + aspect=Aspect.PROG.value + + # Pretérito perfeito composto -> PhraseTense=PastPres, PhraseAspect=PerfProg + # subjonctive Pretérito perfeito composto -> PhraseTense=PastPres, PhraseAspect=PerfProg, PhraseMood=Sub + elif auxes[0].feats['Tense'] == 'Pres': + tense=Tense.PASTPRES.value + + # Pretérito mais que perfeito composto -> PhraseTense=Past, PhraseAspect=ImpProg + # subjonctive Pretérito mais que perfeito composto -> PhraseTense=Past, PhraseAspect=ImpProg, PhraseMood=Sub + elif auxes[0].feats['Tense'] in ['Imp', 
'Past']: + tense=Tense.PAST.value + aspect=Aspect.IMPPROG.value + + # Futuro do presente composto -> PhraseTense=Fut, PhraseAspect=PerfProg + elif auxes[0].feats['Tense'] == 'Fut' and auxes[0].lemma == 'ter': + tense=Tense.FUT.value + + self.write_node_info(head_node, + tense=tense, + number=auxes[0].feats['Number'], + person=auxes[0].feats['Person'], + mood=auxes[0].feats['Mood'], + form='Fin', + aspect=aspect, + voice=head_node.feats['Voice'], + expl=expl, + ords=phrase_ords, + ) + return + + def process_copulas(self, node, cop, auxes, refl, expl): + + if not auxes: + tense = cop[0].feats['Tense'] + number=cop[0].feats['Number'] + person=cop[0].feats['Person'] + mood=cop[0].feats['Mood'] + + if cop[0].feats['Tense'] in ['Pres', 'Fut']: + if cop[0].lemma == 'ser': + aspect=Aspect.PERF.value + elif cop[0].lemma == 'estar': + aspect=Aspect.IMP.value + + elif cop[0].feats['Tense'] == 'Imp': + tense=Tense.PAST.value + aspect=Aspect.IMP.value + + elif cop[0].feats['Tense'] == 'Past': + aspect=Aspect.PERF.value + else: + # i.e. 
copulas in infinitive + aspect='' + + else: + tense = auxes[0].feats['Tense'] + number=auxes[0].feats['Number'] + person=auxes[0].feats['Person'] + mood=auxes[0].feats['Mood'] + aspect='' + + + if auxes[0].lemma == 'estar': + aspect=Aspect.IMPPROG.value + + phrase_ords = [node.ord] + [x.ord for x in cop] + [x.ord for x in auxes] + [r.ord for r in refl] + phrase_ords.sort() + + self.write_node_info(node, + tense=tense, + number=number, + person=person, + mood=mood, + form='Fin', + aspect=aspect, + voice=node.feats['Voice'], + expl=expl, + ords=phrase_ords, + ) diff --git a/udapi/block/msf/slavic/conditional.py b/udapi/block/msf/slavic/conditional.py new file mode 100644 index 00000000..89eafd6c --- /dev/null +++ b/udapi/block/msf/slavic/conditional.py @@ -0,0 +1,85 @@ +""" +Morphosyntactic features (UniDive, Lenka Krippnerová): +This block detects conditional verb forms in Slavic languages and saves their +features as Phrase* attributes in MISC of their head word. +""" + +import udapi.block.msf.phrase + +class Conditional(udapi.block.msf.phrase.Phrase): + + def process_node(self, node): + if (node.feats['VerbForm'] == 'Part' or node.feats['VerbForm'] == 'PartRes') or node.feats['VerbForm'] == 'Fin': + # in most Slavic languages, the verb has feats['VerbForm'] == 'Part' but in Polish the verb has feats['VerbForm'] == 'Fin' + + aux_cnd = [x for x in node.children if x.feats['Mood'] == 'Cnd' or x.deprel == 'aux:cnd'] # list for auxiliary verbs for forming the conditional mood + cop = [x for x in node.children if x.udeprel == 'cop'] # in some cases it may happen that the cop follows the noun, we don't want to these cases in this branch + # in Polish the auxiliary verbs for conditional mood have deprel == 'aux:cnd', in other languages the auxiliary verbs have x.feats['Mood'] == 'Cnd' + + # the conditional mood can be formed using the auxiliary verb or some conjunctions (such as 'aby, kdyby...' 
in Czech) + # so x.udeprel == 'aux' can't be required because it doesn't meet the conjunctions + + if aux_cnd and not cop: + aux = [x for x in node.children if x.udeprel == 'aux' or x.feats['Mood'] == 'Cnd'] # all auxiliary verbs and conjuctions with feats['Mood'] == 'Cnd' + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + aux + refl + + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + auxVerb = aux_cnd[0] + + person='3' # TODO there is a problem in russian etc. (same as in past tense) + if auxVerb.feats['Person'] != '': + person=auxVerb.feats['Person'] + + + self.write_node_info(node, + person=person, + number=node.feats['Number'], + mood='Cnd', + form='Fin', + aspect=node.feats['Aspect'], + reflex=self.get_is_reflex(node,refl), + polarity=self.get_polarity(phrase_nodes), + voice=self.get_voice(node, refl), + ords=phrase_ords, + gender=node.feats['Gender'], + animacy=node.feats['Animacy'] + ) + return + + + cop = [x for x in node.children if x.udeprel == 'cop' and (x.feats['VerbForm'] == 'Part' or x.feats['VerbForm'] == 'Fin')] + aux_cnd = [x for x in node.children if x.feats['Mood'] == 'Cnd' or x.deprel=='aux:cnd'] + + if cop and aux_cnd: + # there can be a copula with Mood='Cnd' (i. e. 
in Old East Slavonic), we don't want to count these copula in phrase_ords twice, so there is x.udeprel != 'cop' in aux list + aux = [x for x in node.children if (x.udeprel == 'aux' or x.feats['Mood'] == 'Cnd') and x.udeprel != 'cop'] + prep = [x for x in node.children if x.upos == 'ADP'] + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + aux + prep + refl + cop + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + copVerb = cop[0] + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + self.write_node_info(node, + aspect=copVerb.feats['Aspect'], + person=copVerb.feats['Person'], + number=copVerb.feats['Number'], + mood='Cnd', + form='Fin', + voice=self.get_voice(copVerb, refl), + polarity=self.get_polarity(phrase_nodes), + reflex=self.get_is_reflex(node, refl), + ords=phrase_ords, + gender=copVerb.feats['Gender'], + animacy=copVerb.feats['Animacy'] + ) \ No newline at end of file diff --git a/udapi/block/msf/slavic/converb.py b/udapi/block/msf/slavic/converb.py new file mode 100644 index 00000000..6b725d56 --- /dev/null +++ b/udapi/block/msf/slavic/converb.py @@ -0,0 +1,91 @@ +""" +Morphosyntactic features (UniDive, Lenka Krippnerová): +This block detects converb (transgressive) forms in Slavic languages and saves their +features as Phrase* attributes in MISC of their head word. 
+""" + +import udapi.block.msf.phrase + +class Converb(udapi.block.msf.phrase.Phrase): + + def process_node(self, node): + # condition node.upos == 'VERB' to prevent copulas from entering this branch + if node.feats['VerbForm'] == 'Conv' and node.upos == 'VERB': + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + person=node.feats['Person'], + number=node.feats['Number'], + form='Conv', + tense=node.feats['Tense'], + aspect=node.feats['Aspect'], + polarity=self.get_polarity(phrase_nodes), + reflex=self.get_is_reflex(node,refl), + ords=phrase_ords, + gender=node.feats['Gender'], + animacy=node.feats['Animacy'], + voice=self.get_voice(node, refl) + ) + + # passive voice + elif node.upos == 'ADJ': + aux = [x for x in node.children if x.udeprel == 'aux' and x.feats['VerbForm'] == 'Conv'] + + if aux: + auxVerb = aux[0] + + phrase_nodes = [node] + aux + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + person=auxVerb.feats['Person'], + number=auxVerb.feats['Number'], + form='Conv', + tense=auxVerb.feats['Tense'], + aspect=node.feats['Aspect'], + polarity=self.get_polarity(phrase_nodes), + ords=phrase_ords, + gender=auxVerb.feats['Gender'], + animacy=auxVerb.feats['Animacy'], + voice='Pass' + ) + + # copulas + else: + cop = [x for x in node.children if x.udeprel == 'cop' and x.feats['VerbForm'] == 'Conv'] + + if cop: + prep = [x for x in node.children if x.upos == 'ADP'] + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + copVerb = cop[0] + + phrase_nodes = [node] + cop + prep + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + phrase_ords = 
[x.ord for x in phrase_nodes] + phrase_ords.sort() + + + self.write_node_info(node, + aspect=copVerb.feats['Aspect'], + person=copVerb.feats['Person'], + number=copVerb.feats['Number'], + tense=copVerb.feats['Tense'], + gender=copVerb.feats['Gender'], + animacy=copVerb.feats['Animacy'], + form='Conv', + polarity=self.get_polarity(phrase_nodes), + ords=phrase_ords, + voice=self.get_voice(copVerb, refl) + ) diff --git a/udapi/block/msf/slavic/future.py b/udapi/block/msf/slavic/future.py new file mode 100644 index 00000000..02452c36 --- /dev/null +++ b/udapi/block/msf/slavic/future.py @@ -0,0 +1,200 @@ +""" +Morphosyntactic features (UniDive, Lenka Krippnerová): +This block detects future tense forms in Slavic languages and saves their +features as Phrase* attributes in MISC of their head word. +""" + +import udapi.block.msf.phrase + +class Future(udapi.block.msf.phrase.Phrase): + + def process_node(self, node): + # future tense for Serbian and Croatian + aux = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] == 'Pres' and (x.lemma == 'hteti' or x.lemma == 'htjeti')] + if node.upos != 'AUX' and aux: + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + aux_other = [x for x in node.children if x.udeprel == 'aux'] # adding aux for passive voice + cop = [x for x in node.children if x.deprel == 'cop'] + + phrase_nodes = [node] + refl + aux_other + cop + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + + if not cop: + self.write_node_info(node, + tense='Fut', + person=aux[0].feats['Person'], + number=aux[0].feats['Number'], + mood='Ind', + voice=node.feats['Voice'], + aspect=node.feats['Aspect'], # srbstina ani chorvatstina vidy nema + form='Fin', + polarity=self.get_polarity(phrase_nodes), + reflex=self.get_is_reflex(node,refl), + gender=node.feats['Gender'], + animacy=node.feats['Animacy'], + ords=phrase_ords + ) + else: 
+ prep = [x for x in node.children if x.upos == 'ADP'] + phrase_nodes += prep + phrase_ords += [x.ord for x in prep] + phrase_ords.sort() + + self.write_node_info(node, + tense='Fut', + person=aux[0].feats['Person'], + number=aux[0].feats['Number'], + mood='Ind', + voice=node.feats['Voice'], + aspect=node.feats['Aspect'], + form='Fin', + polarity=self.get_polarity(phrase_nodes), + reflex=self.get_is_reflex(node,refl), + gender=node.feats['Gender'], + animacy=node.feats['Animacy'], + ords=phrase_ords + ) + + return + + # Macedonian forms the future tense with the auxiliary word ќе and a verb in the present tense + # Bulgarian forms the future tense with the auxiliary word ще and a verb in the present tense + aux = [x for x in node.children if x.lemma == 'ќе' or x.lemma == 'ще'] + + if node.feats['Tense'] == 'Pres' and aux: + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + refl + aux + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + tense='Fut', + person=node.feats['Person'], + number=node.feats['Number'], + mood='Ind', + voice=node.feats['Voice'], + aspect=node.feats['Aspect'], + form='Fin', + polarity=self.get_polarity(phrase_nodes), + reflex=self.get_is_reflex(node,refl), + ords=phrase_ords + ) + return + + # future tense of perfect verbs + # Upper Sorbian forms the future tense in this way, however, the feats[Aspect] are not listed in the data + # in some languages (e.g. in Russian) these verbs have the Tense Fut, in others (e.g. 
in Czech) they have the Tense Pres + """if node.feats['Aspect'] == 'Perf' and (node.feats['Tense'] == 'Pres' or node.feats['Tense'] == 'Fut') and node.feats['VerbForm'] != 'Conv': + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + tense='Fut', + person=node.feats['Person'], + number=node.feats['Number'], + mood='Ind', + voice=self.get_voice(node,refl), + form='Fin', + aspect='Perf', + polarity=self.get_polarity(phrase_nodes), + reflex=self.get_is_reflex(node,refl), + ords=phrase_ords + ) + return""" + + + # future tense of imperfect verbs and passive voice + # in some languages the verb is in the infinitive, in some it is in the l-participle + # the condition node.upos == 'ADJ' is due to the passive voice - the n-participle is marked as ADJ, but the auxiliary verb is not cop, but aux + if node.upos == 'VERB' or node.upos == 'ADJ': + + aux = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] == 'Fut'] + + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + aux + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + if aux: + auxVerb = aux[0] + self.write_node_info(node, + tense='Fut', + person=auxVerb.feats['Person'], + number=auxVerb.feats['Number'], + mood='Ind', + voice=self.get_voice(node,refl), + aspect=node.feats['Aspect'], + form='Fin', + polarity=self.get_polarity(phrase_nodes), + reflex=self.get_is_reflex(node,refl), + ords=phrase_ords, + gender=node.feats['Gender'], + animacy=node.feats['Animacy'] + ) + return + + # simple future tense - e.g. 
in Serbian, the future tense can be formed by combining a verb with a full meaning and an auxiliary verb into one word, i.e. without an auxiliary verb + # or verbs like pojede, půjdeme... in Czech + + if not aux and node.feats['Tense'] == 'Fut': + + self.write_node_info(node, + tense='Fut', + person=node.feats['Person'], + number=node.feats['Number'], + mood='Ind', + voice=self.get_voice(node,refl), + aspect=node.feats['Aspect'], + form='Fin', + polarity=self.get_polarity(phrase_nodes), + reflex=self.get_is_reflex(node,refl), + ords=phrase_ords + ) + return + + + cop = [x for x in node.children if x.udeprel == 'cop' and x.feats['Tense'] == 'Fut'] + if cop: + copVerb = cop[0] + aux = [x for x in node.children if x.udeprel == 'aux' and x.feats['Mood']=='Ind'] + prep = [x for x in node.children if x.upos == 'ADP'] + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + cop + aux + prep + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + aspect=copVerb.feats['Aspect'], + tense='Fut', + person=copVerb.feats['Person'], + number=copVerb.feats['Number'], + mood='Ind', + form='Fin', + voice=self.get_voice(copVerb, refl), + polarity=self.get_polarity(phrase_nodes), + ords=phrase_ords + ) + diff --git a/udapi/block/msf/slavic/imperative.py b/udapi/block/msf/slavic/imperative.py new file mode 100644 index 00000000..d4fedd50 --- /dev/null +++ b/udapi/block/msf/slavic/imperative.py @@ -0,0 +1,86 @@ +""" +Morphosyntactic features (UniDive, Lenka Krippnerová): +This block detects imperative verb forms in Slavic languages and saves their +features as Phrase* attributes in MISC of their head word. 
+""" + +import udapi.block.msf.phrase + +class Imperative(udapi.block.msf.phrase.Phrase): + + def process_node(self, node): + # the condition node.upos == 'VERB' ensures that copulas do not enter this branch + if node.feats['Mood'] == 'Imp' and node.upos == 'VERB': + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + person=node.feats['Person'], + number=node.feats['Number'], + aspect=node.feats['Aspect'], + mood='Imp', + form='Fin', + voice='Act', + polarity=self.get_polarity(phrase_nodes), + reflex=self.get_is_reflex(node,refl), + ords=phrase_ords + ) + return + + # verbs in the passive forms are marked as ADJ + if node.upos == 'ADJ' and node.feats['Voice'] == 'Pass': + aux = [x for x in node.children if x.udeprel == 'aux' and x.feats['Mood'] == 'Imp'] + if aux: + phrase_nodes = [node] + aux + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + person=aux[0].feats['Person'], + number=aux[0].feats['Number'], + mood='Imp', + voice='Pass', + aspect=node.feats['Aspect'], + form='Fin', + polarity=self.get_polarity(phrase_nodes), + ords=phrase_ords, + gender=node.feats['Gender'], + animacy=node.feats['Animacy'] + ) + return + + + cop = [x for x in node.children if x.udeprel == 'cop' and x.feats['Mood'] == 'Imp'] + if cop: + prep = [x for x in node.children if x.upos == 'ADP'] + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + copVerb = cop[0] + + phrase_nodes = [node] + cop + prep + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + 
aspect=copVerb.feats['Aspect'], + person=copVerb.feats['Person'], + number=copVerb.feats['Number'], + mood='Imp', + form='Fin', + voice=self.get_voice(copVerb, refl), + reflex=self.get_is_reflex(node, refl), + polarity=self.get_polarity(phrase_nodes), + ords=phrase_ords + ) diff --git a/udapi/block/msf/slavic/infinitive.py b/udapi/block/msf/slavic/infinitive.py new file mode 100644 index 00000000..f39a2646 --- /dev/null +++ b/udapi/block/msf/slavic/infinitive.py @@ -0,0 +1,103 @@ +""" +Morphosyntactic features (UniDive, Lenka Krippnerová): +This block detects infinitive verb forms in Slavic languages and saves their +features as Phrase* attributes in MISC of their head word. +""" + +import udapi.block.msf.phrase + +class Infinitive(udapi.block.msf.phrase.Phrase): + + def process_node(self,node): + if node.feats['VerbForm'] == 'Inf' and node.upos == 'VERB': + aux = [x for x in node.children if x.udeprel == 'aux'] + if not aux: # the list of auxiliary verbs must be empty - we don't want to mark infinitives which are part of any other phrase (for example the infinitive is part of the future tense in Czech) + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + + self.write_node_info(node, + aspect=node.feats['Aspect'], + voice=self.get_voice(node,refl), + form='Inf', + polarity=self.get_polarity(phrase_nodes), + reflex=self.get_is_reflex(node,refl), + ords=phrase_ords + ) + return + + if node.upos == 'ADJ' and node.feats['Voice'] == 'Pass': + aux = [x for x in node.children if x.udeprel == 'aux' and x.feats['VerbForm'] == 'Inf'] + aux_forb = [x for x in node.children if x.udeprel == 'aux' and x.feats['VerbForm'] != 'Inf'] + if aux and not aux_forb: + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = 
[node] + aux + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + aspect=node.feats['Aspect'], + voice='Pass', + form='Inf', + polarity=self.get_polarity(phrase_nodes), + reflex=self.get_is_reflex(node, refl), + ords=phrase_ords, + gender=node.feats['Gender'], + animacy=node.feats['Animacy'], + number=node.feats['Number'] + ) + return + + + + cop = [x for x in node.children if x.udeprel == 'cop' and x.feats['VerbForm'] == 'Inf'] + aux_forb = [x for x in node.children if x.udeprel == 'aux'] + if cop and not aux_forb: + prep = [x for x in node.children if x.upos == 'ADP'] + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + cop + prep + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + aspect=cop[0].feats['Aspect'], + voice=self.get_voice(cop[0], refl), + form='Inf', + polarity=self.get_polarity(phrase_nodes), + reflex=self.get_is_reflex(node, refl), + ords=phrase_ords + ) + + # there is a rare verb form called supine in Slovenian, it is used instead of infinitive as the argument of motion verbs + if node.feats['VerbForm'] == 'Sup': + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + aspect=node.feats['Aspect'], + voice='Act', + form='Sup', + polarity=self.get_polarity(phrase_nodes), + reflex=self.get_is_reflex(node, refl), + ords=phrase_ords + ) diff --git a/udapi/block/msf/slavic/past.py b/udapi/block/msf/slavic/past.py new file mode 100644 index 00000000..423bff45 --- /dev/null +++ 
b/udapi/block/msf/slavic/past.py @@ -0,0 +1,207 @@ +""" +Morphosyntactic features (UniDive, Lenka Krippnerová): +This block detects past tense forms in Slavic languages and saves their +features as Phrase* attributes in MISC of their head word. +""" + +import udapi.block.msf.phrase + +class Past(udapi.block.msf.phrase.Phrase): + + def get_person_for_langs_with_simple_past(self, node, person): + """ + returns the person which is known from subject, languages with the simple past tense (e. g. Russian) do not express person in these verb forms + if the person was not taken from the subject, the third person would be filled in automatically due to languages with a compound past but simple forms for the third person (e. g. Czech) + """ + subj = [x for x in node.children if x.udeprel == 'nsubj'] + if subj: + subj = subj[0] + if subj.feats['Person'] != '': + person = subj.feats['Person'] + return person + + def process_node(self, node): + + past_tenses = ['Past', 'Imp', 'Pqp'] + cop = [x for x in node.children if x.udeprel == 'cop' and (x.feats['Tense'] in past_tenses)] + + # there is person 0 in Polish and Ukrainian which is for impersonal statements + # in Polish, verbs with Person=0 have also Tense=Past, in Ukrainian the tense is not specified + if node.feats['Person'] == '0': + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + tense=node.feats['Tense'], + person=node.feats['Person'], + number=node.feats['Number'], + mood=node.feats['Mood'], + voice='Act', #In Polish, impersonal statements are annotated with Voice=Act. 
In Ukrainian, the Voice feature is missing; therefore, we decided to annotate these phrases with PhraseVoice=Act + aspect=node.feats['Aspect'], + form=node.feats['VerbForm'], + polarity=self.get_polarity(phrase_nodes), + reflex=self.get_is_reflex(node,refl), + ords=phrase_ords, + gender=node.feats['Gender'], + animacy=node.feats['Animacy'] + ) + + # compound past tense + if (node.feats['VerbForm'] == 'Part' or node.feats['VerbForm'] == 'PartRes') and node.upos == 'VERB' and node.feats['Voice'] != 'Pass': + aux = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] == 'Pres'] + aux_pqp = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] in past_tenses] + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + aux + refl + aux_pqp + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + aux_cnd = [x for x in node.children if (x.feats['Mood'] == 'Cnd' or x.deprel == 'aux:cnd') and x.udeprel != 'conj'] # we don't want to mark l-participles in the conditional as past tense + if not aux_cnd: + if aux: + person = aux[0].feats['Person'] + + elif not aux: + person = '3' + + if aux_pqp: + person = aux_pqp[0].feats['Person'] + + # in Slovenian, the participles are not annotated as Tense='Past', the Tense feature is missing here + # but in Bulgarian, there are cases where the participles are annotated as Tense='Imp' + tense = 'Past' + if node.feats['Tense'] == 'Imp': + tense = 'Imp' + if node.feats['Tense'] == 'Pqp': + tense = 'Pqp' + + self.write_node_info(node, + tense=tense, + person=person, + number=node.feats['Number'], + mood='Ind', + voice=self.get_voice(node,refl), + aspect=node.feats['Aspect'], + form='Fin', + polarity=self.get_polarity(phrase_nodes), + reflex=self.get_is_reflex(node,refl), + ords=phrase_ords, + gender=node.feats['Gender'], + animacy=node.feats['Animacy'] + ) + + + # the 
past tense of some Slavic languages is formed only by a verb without an auxiliary verb (e.g. Polish) + # or imperfect (special case of the past tense) e.g. in Bulgarian or Croatian + elif (node.feats['Tense'] in past_tenses) and node.upos == 'VERB' and node.feats['VerbForm'] != 'Conv': + + # the past tense is formed only by a content verb, not with an auxiliary + aux_forb = [x for x in node.children if x.udeprel == 'aux'] + + if not aux_forb: + + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + tense=node.feats['Tense'], + person=node.feats['Person'], + number=node.feats['Number'], + mood='Ind', + voice=self.get_voice(node,refl), + aspect=node.feats['Aspect'], + form=node.feats['VerbForm'], + polarity=self.get_polarity(phrase_nodes), + reflex=self.get_is_reflex(node,refl), + ords=phrase_ords, + gender=node.feats['Gender'], + animacy=node.feats['Animacy'] + ) + + + + # passive + elif node.upos == 'ADJ' and node.feats['Voice'] == 'Pass' and not cop: + aux_past_tense = [x for x in node.children if x.udeprel == 'aux' and (x.feats['Tense'] in past_tenses)] + aux_cnd = [x for x in node.children if x.feats['Mood'] == 'Cnd' or x.deprel == 'aux:cnd'] # we don't want to mark l-participles in the conditional as past tense + if not aux_cnd: + if aux_past_tense: + aux_pres_tense = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] == 'Pres'] # e. g. 
the auxiliary 'jsem' in the phrase 'byl jsem přinucen' + + phrase_nodes = [node] + aux_past_tense + aux_pres_tense + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + person = '3' + if aux_pres_tense: + person = aux_pres_tense[0].feats['Person'] + person = self.get_person_for_langs_with_simple_past(node, person) + + self.write_node_info(node, + tense=aux_past_tense[0].feats['Tense'], + person=person, + number=aux_past_tense[0].feats['Number'], + mood='Ind', + voice='Pass', + form='Fin', + aspect=node.feats['Aspect'], + polarity=self.get_polarity(phrase_nodes), + ords=phrase_ords, + gender=node.feats['Gender'], + animacy=node.feats['Animacy'] + ) + + else: + aux_cnd = [x for x in node.children if x.feats['Mood'] == 'Cnd' or x.deprel == 'aux:cnd'] # we don't want to mark l-participles in the conditional as past tense + if cop and not aux_cnd: + aux_past_tense = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] == 'Pres'] + prep = [x for x in node.children if x.upos == 'ADP'] + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + aux_past_tense + cop + prep + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + person = '3' + if aux_past_tense: + person = aux_past_tense[0].feats['Person'] + + # In ru, be, uk, the person is not expressed in past tense and the verbform is Fin, not Part + if cop[0].feats['VerbForm'] == 'Fin': + person = '' + + self.write_node_info(node, + aspect=cop[0].feats['Aspect'], + tense=cop[0].feats['Tense'], + person=person, + number=cop[0].feats['Number'], + mood='Ind', + voice=self.get_voice(cop[0], refl), + form='Fin', + reflex=self.get_is_reflex(node,refl), + polarity=self.get_polarity(phrase_nodes), + ords=phrase_ords, + gender=cop[0].feats['Gender'], + 
animacy=cop[0].feats['Animacy'] + ) diff --git a/udapi/block/msf/slavic/preprocessor.py b/udapi/block/msf/slavic/preprocessor.py new file mode 100644 index 00000000..804a081f --- /dev/null +++ b/udapi/block/msf/slavic/preprocessor.py @@ -0,0 +1,83 @@ +""" +Morphosyntactic features (UniDive, Lenka Krippnerová): +This block serves as a preprocessor for Slavic languages before the other blocks +are applied to detect periphrastic verb forms. It improves harmonization of +annotations across the treebanks by addressing some known divergences. +""" + +from udapi.core.block import Block + +class Preprocessor(Block): + + def process_node(self,node): + + # in Ukrainian the active verb forms are not marked as PhraseVoice=Act + if (node.upos == 'VERB' or (node.upos == 'AUX' and node.feats['VerbForm'] == 'Fin')) and node.feats['Voice'] == '': + node.feats['Voice'] = 'Act' + + # in some languages, participles are annotated with UPOS=VERB, while in others they are annotated with UPOS=ADJ + # we change the UPOS to ADJ when a participle expresses case + if node.upos == 'VERB' and node.feats['VerbForm'] == 'Part' and node.feats['Case'] != '': + node.upos = 'ADJ' + + # in Polish, the conditional mood for auxiliary verbs is marked as deprel == 'aux:cnd' and not as in the last Slavic languages feats['Mood'] == 'Cnd' + if node.deprel == 'aux:cnd': + node.feats['Mood'] = 'Cnd' + + # unify polarities - some languages mark only Neg (Russian), some mark both Neg and Pos (Czech) + if node.feats['Polarity'] == 'Pos': + node.feats['Polarity'] = '' + + # In Ukrainian, there is no explicit annotation of reflexive verbs + # We decided to unify the annotation of reflexive verbs with Russian and Belarusian, where reflexive verbs are formed similarly + # We add the feature Voice=Mid to reflexive verbs + if node.upos == 'VERB' and (node.lemma.endswith('сь') or node.lemma.endswith('ся')): + node.feats['Voice'] = 'Mid' + + # makedonstina tvori budouci cas pomoci pomocneho slova ќе, u nejz neni nijak 
vyznaceno, ze se podili na tvorbe budouciho casu + # stejne tak bulharstina pomoci pomocneho slova ще + # makedonstina a bulharstina + if node.feats['Tense'] == 'Pres': + aux = [x for x in node.children if x.lemma == 'ќе' or x.lemma == 'ще'] + if len(aux) == 1: + aux[0].feats['Tense'] = 'Fut' + + # in Czech and in Old Church Slavonic, the participles are sometimes marked with the plural gender + if node.feats['Gender'] == 'Fem,Neut' or node.feats['Gender'] == 'Fem,Masc': + subj = [x for x in node.children if x.udeprel == 'nsubj'] + + # for relative pronouns, only one gender is indicated + if len(subj) == 1: + conj = [x for x in subj[0].children if x.deprel == 'conj'] + if len(conj) == 0: + node.feats['Gender'] = subj[0].feats['Gender'] + node.feats['Number'] = subj[0].feats['Number'] + + # participles in passive are sometimes annotated as VERB, sometimes as ADJ + if node.upos == 'VERB' and node.feats['Voice'] == 'Pass': + node.upos = 'ADJ' + + # there are cases where the node has deprel=='expl:pv' or 'expl:pass' or 'expl:impers' and Reflex is not Yes (i.e. 
Macedonian treebank) + # we add the Reflex=Yes feature + if node.deprel == 'expl:pv' or node.deprel == 'expl:pass' or node.deprel == 'expl:impers': + node.feats['Reflex'] = 'Yes' + + # fixing the mistake in Macedonian treebank (mk_mtb-ud-test.conllu), in sent_id=other0010, there is personal pronoun 'ми' marked as expl:pv, it should be iobj + if node.deprel == 'expl:pv' and node.lemma == 'ми' and node.feats['PronType'] == 'Prs': + node.deprel = '' + node.udeprel = 'iobj' + + # in Old Church Slavonic, there is feature Mood=Sub, but this is a notation for conditional mood + if node.feats['Mood'] == 'Sub': + node.feats['Mood'] = 'Cnd' + + # although infinitives in Old Church Slavonic are annotated with Tense=Pres, they do not convey tense; therefore, we remove this annotation + if node.feats['VerbForm'] == 'Inf': + node.feats['Tense'] = '' + + # in the russian Syntagrus corpus, the negative particles have no Polarity=Neg feature + if node.lemma == 'не' and node.upos == 'PART' and node.udeprel == 'advmod': + node.feats['Polarity'] = 'Neg' + + # TODO maybe we want to set Tense=Fut for the perfective verbs with Tense=Pres? This could solve the problem with the simplified detection of the future tense in Czech + # but there are many verbs with no Aspect value, so the problem is still there diff --git a/udapi/block/msf/slavic/present.py b/udapi/block/msf/slavic/present.py new file mode 100644 index 00000000..9a743a9e --- /dev/null +++ b/udapi/block/msf/slavic/present.py @@ -0,0 +1,128 @@ +""" +Morphosyntactic features (UniDive, Lenka Krippnerová): +This block detects present tense forms in Slavic languages and saves their +features as Phrase* attributes in MISC of their head word. 
+""" + +import udapi.block.msf.phrase + +class Present(udapi.block.msf.phrase.Phrase): + + def process_node(self,node): + # the condition VerbForm == 'Fin' ensures that there are no transgressives between the found verbs + # the aspect is not always given in Czech treebanks, so we can't rely on the fact that the imperfect aspect is specified + if node.feats['Tense'] == 'Pres' and node.upos == 'VERB' and node.feats['VerbForm'] == 'Fin': #and node.feats['Aspect']=='Imp': + + aux_forb = [x for x in node.children if x.upos == 'AUX' and (x.lemma == 'ќе' or x.lemma == 'ще' or x.feats['Mood'] == 'Cnd')] # forbidden auxiliaries for present tense (these auxiliaries are used for the future tense or the conditional mood) + + if not aux_forb: + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + tense='Pres', + person=node.feats['Person'], + number=node.feats['Number'], + mood='Ind', + aspect=node.feats['Aspect'], + voice=self.get_voice(node,refl), + form='Fin', + polarity=self.get_polarity(phrase_nodes), + reflex=self.get_is_reflex(node,refl), + ords=phrase_ords + ) + return + + # passive voice + if node.upos == 'ADJ' and node.feats['Voice'] == 'Pass': + aux = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] == 'Pres' and x.lemma != 'hteti' and x.lemma != 'htjeti'] + aux_forb = [x for x in node.children if x.udeprel == 'aux' and x.feats['Tense'] != 'Pres'] # we don't want the past passive (e. g. 
'byl jsem poučen' in Czech) + + if aux and not aux_forb: + phrase_nodes = [node] + aux + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + auxVerb = aux[0] + + self.write_node_info(node, + tense='Pres', + person=auxVerb.feats['Person'], + number=auxVerb.feats['Number'], + mood='Ind', + aspect=node.feats['Aspect'], + form='Fin', + voice='Pass', + polarity=self.get_polarity(phrase_nodes), + ords=phrase_ords, + gender=node.feats['Gender'], + animacy=node.feats['Animacy'] + ) + return + + # participles + # in some languages, participles are used as attributes (they express case and degree) + if node.upos == 'ADJ' and node.feats['VerbForm'] == 'Part': + aux_forb = [x for x in node.children if x.udeprel == 'aux'] + cop = [x for x in node.children if x.udeprel == 'cop'] + + if not aux_forb and not cop: + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + aspect=node.feats['Aspect'], + tense=node.feats['Tense'], + number=node.feats['Number'], + form='Part', + voice=self.get_voice(node, refl), + reflex=self.get_is_reflex(node, refl), + polarity=self.get_polarity(phrase_nodes), + ords=phrase_ords + ) + return + + cop = [x for x in node.children if x.udeprel == 'cop' and x.feats['Tense'] == 'Pres'] + aux_forb = [x for x in node.children if x.upos == 'AUX' and x.feats['Tense'] != 'Pres'] # in Serbian this can be a future tense + + if cop and not aux_forb: + aux = [x for x in node.children if x.udeprel == "aux" and x.feats['Mood'] == 'Ind' and x.feats['Tense'] == 'Pres'] + prep = [x for x in node.children if x.upos == 'ADP'] + refl = [x for x in node.children if x.feats['Reflex'] == 'Yes' and x.udeprel == 'expl'] + + phrase_nodes = [node] + cop + 
aux + prep + refl + neg = self.get_negative_particles(phrase_nodes) + phrase_nodes += neg + + copVerb = cop[0] + + phrase_ords = [x.ord for x in phrase_nodes] + phrase_ords.sort() + + self.write_node_info(node, + aspect=copVerb.feats['Aspect'], + tense='Pres', + person=copVerb.feats['Person'], + number=copVerb.feats['Number'], + mood='Ind', + form='Fin', + voice=self.get_voice(copVerb, refl), + reflex=self.get_is_reflex(node, refl), + polarity=self.get_polarity(phrase_nodes), + ords=phrase_ords + ) diff --git a/udapi/block/read/conll.py b/udapi/block/read/conll.py index f64cd9ff..d0aef1ee 100644 --- a/udapi/block/read/conll.py +++ b/udapi/block/read/conll.py @@ -79,22 +79,24 @@ def parse_node_line(self, line, root, nodes, parents, mwts): # but it allows for arbitrary columns node = root.create_child() for (n_attribute, attribute_name) in enumerate(self.node_attributes): + value = fields[n_attribute] if attribute_name == 'head': try: - parents.append(int(fields[n_attribute])) + parents.append(int(value)) except ValueError as exception: - if not self.strict and fields[n_attribute] == '_': + if not self.strict and value == '_': if self.empty_parent == 'warn': logging.warning("Empty parent/head index in '%s'", line) parents.append(0) else: raise exception elif attribute_name == 'ord': - setattr(node, 'ord', int(fields[n_attribute])) + if int(value) != node._ord: + raise ValueError(f"Node {node} ord mismatch: {value}, but expecting {node._ord} at:\n{line}") elif attribute_name == 'deps': - setattr(node, 'raw_deps', fields[n_attribute]) - elif attribute_name != '_' and fields[n_attribute] != '_': - setattr(node, attribute_name, fields[n_attribute]) + setattr(node, 'raw_deps', value) + elif attribute_name != '_' and value != '_': + setattr(node, attribute_name, value) nodes.append(node) @@ -134,11 +136,10 @@ def read_tree_from_lines(self, lines): if node is parent: if self.fix_cycles: logging.warning("Ignoring a cycle (attaching to the root instead):\n%s", node) - 
node._parent = root - root._children.append(node) + parent = root else: raise ValueError(f"Detected a cycle: {node} attached to itself") - elif node.children: + elif node._children: climbing = parent._parent while climbing: if climbing is node: diff --git a/udapi/block/read/conllu.py b/udapi/block/read/conllu.py index bb76bfee..b485c17d 100644 --- a/udapi/block/read/conllu.py +++ b/udapi/block/read/conllu.py @@ -201,7 +201,7 @@ def read_tree_from_lines(self, lines): root._children.append(node) else: raise ValueError(f"Detected a cycle: {node} attached to itself") - elif node.children: + elif node._children: climbing = parent._parent while climbing: if climbing is node: diff --git a/udapi/block/read/conllup.py b/udapi/block/read/conllup.py new file mode 100644 index 00000000..16d83d07 --- /dev/null +++ b/udapi/block/read/conllup.py @@ -0,0 +1,107 @@ +"""Conllup is a reader block for the CoNLL-UPlus format. + +Columns which don't have standardize attributes in Udapi/CoNLL-U +are stored in MISC (as key=value pairs). + +This code has been only tested on Hungarian KorKor files for CorefUD so far. +However, in the end, it is not used there (xtsv files are used instead conllup). +""" +import logging +import re + +import udapi.block.read.conll +from udapi.core.root import Root +from udapi.core.node import Node + +RE_GLOBAL_COLUMNS = re.compile(r'^# global.columns\s*=\s*(.+)') +COLUMN_MAP = { + 'ID': 'ord', +} +NORMAL_ATTRS = 'form lemma upos xpos feats deprel misc'.split() + +class Conllup(udapi.block.read.conll.Conll): + """A reader of the CoNLL-UPlus files.""" + + def __init__(self, attributes='autodetect', save_global_columns=False, **kwargs): + """Create the Conllup reader object. + + Args: + attributes: comma-separated list of column names in the input files + (can be used if the global.columns header is missing or needs to be overriden). + Default='autodetect' which means the column names will be loaded from the global.columns header. 
+ For ignoring a column, use "_" as its name. + save_global_columns: keep the "global.columns" header in root.comments. Default=False. + Note that when saving the output to CoNLL-U, the comment is not needed + and it may be even misleading. It could be helpful only once write.Conllup is implemented + (with the possibility to use the same columns as in the input file). + """ + super().__init__(**kwargs) + self.save_global_columns = save_global_columns + if attributes == 'autodetect': + self.node_attributes = None + else: + self.node_attributes = attributes.split(',') + + def parse_comment_line(self, line, root): + if self.node_attributes is None: + global_columns_match = RE_GLOBAL_COLUMNS.match(line) + if global_columns_match is None: + return super().parse_comment_line(line, root) + global_columns = global_columns_match.group(1) + self.node_attributes = [COLUMN_MAP.get(v, v.lower()) for v in global_columns.split(" ")] + if self.save_global_columns: + root.comment += line[1:] + '\n' + return + return super().parse_comment_line(line, root) + + def parse_node_line(self, line, root, nodes, parents, mwts): + fields = line.split('\t') + if len(fields) != len(self.node_attributes): + if self.strict: + raise RuntimeError('Wrong number of columns in %r' % line) + fields.extend(['_'] * (len(self.node_attributes) - len(fields))) + + # multi-word tokens will be processed later + if '-' in fields[0]: + mwts.append(fields) + return + if '.' 
in fields[0]: + raise NotImplementedError("Empty nodes in CoNLL-UPlus not implement yet in read.Conllup") + + # This implementation is slower than in read.Conllu, + # but it allows for arbitrary columns + node = root.create_child() + nonstandard_attrs = [] + for (n_attribute, attribute_name) in enumerate(self.node_attributes): + value = fields[n_attribute] + if attribute_name == 'head': + if value == '???': + value = 0 + try: + parents.append(int(value)) + except ValueError as exception: + if not self.strict and value == '_': + if self.empty_parent == 'warn': + logging.warning("Empty parent/head index in '%s'", line) + parents.append(0) + else: + raise exception + elif attribute_name == 'ord': + if int(value) != node._ord: + raise ValueError(f"Node {node} ord mismatch: {value}, but expecting {node._ord} at:\n{line}") + elif attribute_name == 'deps': + setattr(node, 'raw_deps', value) + elif value == '_' and attribute_name != 'form': + pass + elif attribute_name == '_': + pass + elif attribute_name in NORMAL_ATTRS: + setattr(node, attribute_name, value) + else: + nonstandard_attrs.append([attribute_name, value]) + + # This needs to be done after node.misc is created (if "misc" in node.attributes) + for attribute_name, value in nonstandard_attrs: + node.misc[attribute_name.capitalize()] = value + + nodes.append(node) diff --git a/udapi/block/ud/ar/fixedeprels.py b/udapi/block/ud/ar/fixedeprels.py index ad093e1c..99db7fa2 100644 --- a/udapi/block/ud/ar/fixedeprels.py +++ b/udapi/block/ud/ar/fixedeprels.py @@ -1,6 +1,5 @@ """Block to fix case-enhanced dependency relations in Arabic.""" from udapi.core.block import Block -import logging import re class FixEdeprels(Block): @@ -21,7 +20,7 @@ class FixEdeprels(Block): 'مِثلَ': [], 'لِأَنَّ': [], 'كَمَا': [], - 'فِي_حِينَ': [], +# 'فِي_حِينَ': [], 'فَ': [] } @@ -398,6 +397,7 @@ class FixEdeprels(Block): 'كَمَا': 'كَمَا', # remove morphological case; kamā = as 'كَي': 'لِكَي', # kay = in order to 'لَ': 'لِ:gen', + 'لَ_عَلَّ': 
'لِ:gen', 'لِ': 'لِ:gen', # li = to 'لِ_أَجَلّ': 'لِ:gen', 'لِ_إِلَى': 'لِ:gen', @@ -552,7 +552,6 @@ def process_node(self, node): for edep in node.deps: m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):', edep['deprel']) if m: - bdeprel = m.group(1) solved = False # Arabic clauses often start with وَ wa "and", which does not add # much to the meaning but sometimes gets included in the enhanced @@ -564,10 +563,12 @@ def process_node(self, node): # If one of the following expressions occurs followed by another preposition # or by morphological case, remove the additional case marking. For example, # 'jako_v' becomes just 'jako'. + re_prefix = r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):' + re_suffix = r'([_:].+)?$' for x in self.outermost: exceptions = self.outermost[x] - m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):'+x+r'([_:].+)?$', edep['deprel']) - if m and m.group(2) and not x+m.group(2) in exceptions: + m = re.match(re_prefix + x + re_suffix, edep['deprel']) + if m and (not m.group(2) or not (x + m.group(2)) in exceptions): edep['deprel'] = m.group(1)+':'+x solved = True break diff --git a/udapi/block/ud/cs/addmwt.py b/udapi/block/ud/cs/addmwt.py index 4c203ddc..c1b3783a 100644 --- a/udapi/block/ud/cs/addmwt.py +++ b/udapi/block/ud/cs/addmwt.py @@ -1,17 +1,26 @@ """Block ud.cs.AddMwt for heuristic detection of multi-word tokens.""" import udapi.block.ud.addmwt +import re +import logging +# Define static rules for 'aby', 'kdyby' and similar forms. 
MWTS = { - 'abych': {'form': 'aby bych', 'feats': '_ Mood=Cnd|Number=Sing|Person=1|VerbForm=Fin'}, - 'kdybych': {'form': 'když bych', 'feats': '_ Mood=Cnd|Number=Sing|Person=1|VerbForm=Fin'}, - 'abys': {'form': 'aby bys', 'feats': '_ Mood=Cnd|Number=Sing|Person=2|VerbForm=Fin'}, - 'kdybys': {'form': 'když bys', 'feats': '_ Mood=Cnd|Number=Sing|Person=2|VerbForm=Fin'}, - 'aby': {'form': 'aby by', 'feats': '_ Mood=Cnd|Person=3|VerbForm=Fin'}, - 'kdyby': {'form': 'když by', 'feats': '_ Mood=Cnd|Person=3|VerbForm=Fin'}, - 'abychom': {'form': 'aby bychom', 'feats': '_ Mood=Cnd|Number=Plur|Person=1|VerbForm=Fin'}, - 'kdybychom': {'form': 'když bychom', 'feats': '_ Mood=Cnd|Number=Plur|Person=1|VerbForm=Fin'}, - 'abyste': {'form': 'aby byste', 'feats': '_ Mood=Cnd|Number=Plur|Person=2|VerbForm=Fin'}, - 'kdybyste': {'form': 'když byste', 'feats': '_ Mood=Cnd|Number=Plur|Person=2|VerbForm=Fin'}, + 'abych': {'form': 'aby bych', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Sing|Person=1|VerbForm=Fin'}, + 'kdybych': {'form': 'když bych', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Sing|Person=1|VerbForm=Fin'}, + 'abys': {'form': 'aby bys', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Sing|Person=2|VerbForm=Fin'}, + 'kdybys': {'form': 'když bys', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Sing|Person=2|VerbForm=Fin'}, + 'aby': {'form': 'aby by', 'feats': '_ Aspect=Imp|Mood=Cnd|VerbForm=Fin'}, + 'kdyby': {'form': 'když by', 'feats': '_ Aspect=Imp|Mood=Cnd|VerbForm=Fin'}, + 'abychom': {'form': 'aby bychom', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=1|VerbForm=Fin'}, + 'kdybychom': {'form': 'když bychom', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=1|VerbForm=Fin'}, + # Old Czech 'abychme' == Modern Czech 'abychom' + 'abychme': {'form': 'aby bychme', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=1|VerbForm=Fin'}, + 'kdybychme': {'form': 'když bychme', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=1|VerbForm=Fin'}, + 'abyste': {'form': 'aby byste', 'feats': '_ 
Aspect=Imp|Mood=Cnd|Number=Plur|Person=2|VerbForm=Fin'}, + 'kdybyste': {'form': 'když byste', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Plur|Person=2|VerbForm=Fin'}, + # Old Czech 'abyšta' == dual number; 2nd or 3rd person, the one example in data so far is 3rd. + 'abyšta': {'form': 'aby byšta', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Dual|Person=3|VerbForm=Fin'}, + 'kdybyšta': {'form': 'když byšta', 'feats': '_ Aspect=Imp|Mood=Cnd|Number=Dual|Person=3|VerbForm=Fin'}, } for v in MWTS.values(): v['upos'] = 'SCONJ AUX' @@ -25,49 +34,153 @@ person = '1' elif 'Person=2' in v['feats']: person = '2' - v['xpos'] = 'J,------------- Vc-%s---%s-------' % (number, person) v['deprel'] = '* aux' v['lemma'] = v['form'].split()[0] + ' být' v['main'] = 0 v['shape'] = 'siblings' +# Define static rules for 'nač', 'oč', 'zač' (but not 'proč'). +# Add them to the already existing dictionary MWTS. # nač -> na + co -for prep in 'na za o'.split(): +for prep in 'na o za'.split(): MWTS[prep + 'č'] = { 'form': prep + ' co', 'lemma': prep + ' co', 'upos': 'ADP PRON', + 'xpos': 'RR--4---------- PQ--4----------', + 'feats': 'AdpType=Prep|Case=Acc Animacy=Inan|Case=Acc|PronType=Int,Rel', 'deprel': 'case *', 'main': 1, 'shape': 'subtree', } + class AddMwt(udapi.block.ud.addmwt.AddMwt): """Detect and mark MWTs (split them into words and add the words to the tree).""" def multiword_analysis(self, node): """Return a dict with MWT info or None if `node` does not represent a multiword token.""" + # Avoid adding a MWT if the current node already is part of an MWT. + if node.multiword_token: + return None analysis = MWTS.get(node.form.lower(), None) if analysis is not None: return analysis - - # There is no VerbType=verbconj in the UD_Czech data. - # The purpose of this rule is rather to show that - # it is possible to write such "dynamic" rules - # (which cannot be included in static MWTS). 
- if node.form.lower().endswith('ť') and node.feats['VerbType'] == 'verbconj': - return { - 'form': node.form.lower()[:-1] + ' neboť', - 'lemma': '* neboť', - 'upos': '* CCONJ', - 'xpos': 'Vt-S---3P-NA--2 J^-------------', - 'feats': '* _', - 'deprel': '* cc', - 'main': 0, - 'shape': 'subtree', - } + # If the node did not match any of the static rules defined in MWTS, + # check it against the "dynamic" rules below. The enclitic 'ť' will be + # separated from its host but only if it has been marked by an annotator + # in MISC. (These are annotation conventions used for Old Czech in the + # Hičkok project.) + if node.misc['AddMwt'] != '': + subtokens = node.misc['AddMwt'].split() + if len(subtokens) != 2: + logging.warning("MISC 'AddMwt=%s' has unexpected number of subtokens." % node.misc['AddMwt']) + return None + token_from_subtokens = ''.join(subtokens) + if subtokens[1] == 'jsi': + node.misc['AddMwt'] = '' + return { + 'form': subtokens[0] + ' jsi', + 'lemma': '* být', + 'upos': '* AUX', + 'xpos': '* VB-S---2P-AAI--', + 'feats': '* Aspect=Imp|Mood=Ind|Number=Sing|Person=2|Polarity=Pos|Tense=Pres|VerbForm=Fin|Voice=Act', + 'deprel': '* aux', + 'main': 0, + 'shape': 'subtree' if node.upos in ['VERB'] else 'siblings', + } + if subtokens[1] == 'i': + node.misc['AddMwt'] = '' + return { + 'form': subtokens[0] + ' i', + 'lemma': '* i', + 'upos': '* CCONJ', + 'xpos': '* J^-------------', + 'feats': '* _', + 'deprel': '* cc', + 'main': 0, + 'shape': 'subtree', + } + if subtokens[1] in ['ť', 'tě', 'ti']: + if token_from_subtokens != node.form: + logging.warning("Concatenation of MISC 'AddMwt=%s' does not yield the FORM '%s'." 
% (node.misc['AddMwt'], node.form)) + return None + node.misc['AddMwt'] = '' + return { + 'form': subtokens[0] + ' ' + subtokens[1], + 'lemma': '* ť', + 'upos': '* PART', + 'xpos': '* TT-------------', + 'feats': '* _', + 'deprel': '* discourse', + 'main': 0, + 'shape': 'subtree', + } + # Contractions of prepositions and pronouns almost could be processed + # regardless of AddMwt instructions by the annotator, but we still + # require it to be on the safe side. For example, both 'přědeň' and + # 'přěden' are attested in Old Czech but then we do not want to catch + # 'on' (besides the wanted 'oň'). Another reason si that the pronoun + # could be masculine or neuter. We pick Gender=Masc and Animacy=Anim + # by default, unless the original token was annotated as Animacy=Inan + # or Gender=Neut. + m = re.match(r"^(na|nade|o|pro|přěde|ski?rz[eě]|za)[nň](ž?)$", node.form.lower()) + if m: + node.misc['AddMwt'] = '' + # Remove vocalization from 'přěde' (přěd něj) but keep it in 'skrze' + # (skrze něj). + if m.group(1) == 'přěde': + pform = 'přěd' + plemma = 'před' + adptype = 'Voc' + at = 'V' + elif re.match(r"^ski?rz[eě]$", m.group(1).lower()): + pform = m.group(1) + plemma = 'skrz' + adptype = 'Voc' + at = 'V' + else: + pform = m.group(1) + plemma = m.group(1) + adptype = 'Prep' + at = 'R' + # In UD PDT, Gender=Masc,Neut, and in PDT it is PEZS4--3 / P4ZS4---. 
+ if node.feats['Gender'] == 'Neut': + gender = 'Neut' + animacy = '' + g = 'N' + elif node.feats['Animacy'] == 'Inan': + gender = 'Masc' + animacy = 'Animacy=Inan|' + g = 'I' + else: + gender = 'Masc' + animacy = 'Animacy=Anim|' + g = 'M' + if m.group(2).lower() == 'ž': + return { + 'form': pform + ' nějž', + 'lemma': plemma + ' jenž', + 'upos': 'ADP PRON', + 'xpos': 'R'+at+'--4---------- P4'+g+'S4---------2', + 'feats': 'AdpType='+adptype+'|Case=Acc '+animacy+'Case=Acc|Gender='+gender+'|Number=Sing|PrepCase=Pre|PronType=Rel', + 'deprel': 'case *', + 'main': 1, + 'shape': 'subtree', + } + else: + return { + 'form': pform + ' něj', + 'lemma': plemma + ' on', + 'upos': 'ADP PRON', + 'xpos': 'R'+at+'--4---------- PE'+g+'S4--3-------', + 'feats': 'AdpType='+adptype+'|Case=Acc '+animacy+'Case=Acc|Gender='+gender+'|Number=Sing|Person=3|PrepCase=Pre|PronType=Prs', + 'deprel': 'case *', + 'main': 1, + 'shape': 'subtree', + } return None def postprocess_mwt(self, mwt): diff --git a/udapi/block/ud/cs/fixedeprels.py b/udapi/block/ud/cs/fixedeprels.py index f2f76b4b..bd85e1b4 100644 --- a/udapi/block/ud/cs/fixedeprels.py +++ b/udapi/block/ud/cs/fixedeprels.py @@ -1,6 +1,5 @@ """Block to fix case-enhanced dependency relations in Czech.""" from udapi.core.block import Block -import logging import re class FixEdeprels(Block): @@ -12,18 +11,25 @@ class FixEdeprels(Block): # by all the inner cases. # The list in the value contains exceptions that should be left intact. outermost = { + 'aby': [], 'ač': [], 'ačkoli': [], # 'ačkoliv' se převede na 'ačkoli' dole + 'ačkoliv': [], # ... 
ale možná ne když je doprovázeno předložkou + 'ať': [], 'byť': [], 'i_když': [], 'jak': [], 'jakkoli': [], # 'jakkoliv' se převede na 'jakkoli' dole 'jako': [], 'jakoby': ['jakoby_pod:ins'], # these instances in FicTree should be spelled 'jako by' + 'když': [], 'než': ['než_aby'], + 'nežli': [], + 'pokud': [], 'protože': [], 'takže': [], - 'třebaže': [] + 'třebaže': [], + 'že': [] } # Secondary prepositions sometimes have the lemma of the original part of @@ -32,35 +38,52 @@ class FixEdeprels(Block): # case. And include all other prepositions that have unambiguous morphological # case, even if they are not secondary. unambiguous = { + 'á': 'na:acc', # "á konto té záležitosti", ovšem "á konto" není ani spojeno jako složená předložka (význam = "na konto") 'abi': 'aby', 'aby_na': 'na:loc', 'ačkoliv': 'ačkoli', 'ať': 'ať', # remove morphological case 'ať_forma': 'formou:gen', + 'ať_na': 'na:loc', + 'ať_s': 's:ins', 'ať_v': 'v:loc', + 'ať_v_oblast': 'v_oblasti:gen', 'ať_z': 'z:gen', + 'ať_z_hledisko': 'z_hlediska:gen', 'ať_z_strana': 'ze_strany:gen', 'až_do': 'do:gen', 'až_o': 'o:acc', 'během': 'během:gen', 'bez': 'bez:gen', 'bez_ohled_na': 'bez_ohledu_na:acc', + 'bez_na': 'bez_ohledu_na:acc', ###!!! 
a temporary hack to silence the validator about (https://github.com/UniversalDependencies/UD_Czech-PDT/issues/10#issuecomment-2710721703) 'bez_zřetel_k': 'bez_zřetele_k:dat', 'bez_zřetel_na': 'bez_zřetele_na:acc', + 'blízko': 'blízko:dat', + 'blízko_k': 'blízko:dat', 'blíž': 'blízko:dat', + 'blíže': 'blízko:dat', + 'bok_po_bok_s': 'bok_po_boku_s:ins', 'cesta': 'cestou:gen', + 'coby': 'coby', # remove morphological case 'daleko': 'nedaleko:gen', 'daleko_od': 'od:gen', 'dík': 'díky:dat', 'díky': 'díky:dat', 'dle': 'dle:gen', 'do': 'do:gen', + 'do_čelo': 'do_čela:gen', 'do_k': 'k:dat', 'do_oblast': 'do_oblasti:gen', 'do_rozpor_s': 'do_rozporu_s:ins', + 'do_ruka': 'do_rukou:gen', 'do_soulad_s': 'do_souladu_s:ins', + 'důsledkem': 'v_důsledku:gen', 'forma': 'formou:gen', + 'formou': 'formou:gen', + 'hledět_na': 'nehledě_na:acc', 'i_když': 'i_když', # remove morphological case + 'i_pro': 'pro:acc', 'jak_aby': 'jak', 'jak_ad': 'jak', 'jakkoliv': 'jakkoli', @@ -68,33 +91,52 @@ class FixEdeprels(Block): 'jako_kupříkladu': 'jako', 'jakoby': 'jako', 'jakoby_pod': 'pod:ins', + 'jakožto': 'jako', 'jelikož_do': 'jelikož', + 'jenom': 'jen', + 'jesli': 'jestli', 'jestli_že': 'jestliže', + 'jménem': 'jménem:gen', 'k': 'k:dat', 'k_konec': 'ke_konci:gen', + 'k_prospěch': 'ku_prospěchu:gen', 'kdykoliv': 'kdykoli', 'kol': 'kolem:gen', 'kolem': 'kolem:gen', + 'kolem_dokola': 'kolem:gen', + 'koncem': 'koncem:gen', 'konec': 'koncem:gen', 'krom': 'kromě:gen', 'kromě': 'kromě:gen', + 'kvůli': 'kvůli:dat', + 'leda_když': 'ledaže', + 'li_jako': 'li', 'liž': 'li', 'mezi_uvnitř': 'uvnitř:gen', + 'na:ins': 'na:acc', 'na_báze': 'na_bázi:gen', 'na_čelo': 'na_čele:gen', 'na_mimo': 'na:loc', # na kurtě i mimo něj 'na_než': 'na:acc', # na víc než čtyři a půl kilometru 'na_od': 'na_rozdíl_od:gen', + 'na_počátek': 'na_počátku:gen', + 'na_počest': 'na_počest:gen', # appears also with :dat but the meaning is same 'na_podklad': 'na_podkladě:gen', 'na_rozdíl_od': 'na_rozdíl_od:gen', + 'na_strana': 
'na_straně:gen', + 'na_účet': 'na_účet:gen', 'na_újma': 'gen', # 'nebude na újmu' is a multi-word predicate but 'na újmu' is probably not used as an independent oblique modifier 'na_úroveň': 'na_úrovni:gen', + 'na_úroveň_okolo': 'na_úrovni:gen', 'na_úsek': 'na_úseku:gen', + 'na_začátek': 'na_začátku:gen', 'na_základ': 'na_základě:gen', 'na_základna': 'na_základně:gen', 'na_závěr': 'na_závěr:gen', + 'na_zda': 'na:loc', # na tom, zda a v jaké formě... 'namísto': 'namísto:gen', 'namísto_do': 'do:gen', + 'napospas': 'napospas:dat', 'narozdíl_od': 'na_rozdíl_od:gen', 'následek': 'následkem:gen', 'navzdory': 'navzdory:dat', @@ -104,39 +146,58 @@ class FixEdeprels(Block): 'o_jako': 'jako', 'o_o': 'o:acc', 'od': 'od:gen', + 'od_počínaje': 'počínaje:ins', # od brambor počínaje a základní zeleninou konče 'ohledně': 'ohledně:gen', 'okolo': 'okolo:gen', 'oproti': 'oproti:dat', 'po_v': 'po:loc', + 'po_bok': 'po_boku:gen', 'po_doba': 'po_dobu:gen', + 'po_stránka': 'po_stránce:gen', 'po_vzor': 'po_vzoru:gen', 'poblíž': 'poblíž:gen', 'počátek': 'počátkem:gen', + 'počátkem': 'počátkem:gen', + 'počínaje': 'počínaje:ins', 'počínat': 'počínaje:ins', + 'počínat_od': 'počínaje:ins', 'pod_dojem': 'pod_dojmem:gen', + 'pod_tlak': 'pod_tlakem:gen', 'pod_vliv': 'pod_vlivem:gen', + 'pod_záminka': 'pod_záminkou:gen', + 'pod_záminka_že': 'pod_záminkou_že', + 'podél': 'podél:gen', 'podle': 'podle:gen', 'pomoc': 'pomocí:gen', 'pomocí': 'pomocí:gen', 'postup': 'postupem:gen', 'pouze_v': 'v:loc', 'pro': 'pro:acc', + 'pro_aby': 'pro:acc', 'prostřednictví': 'prostřednictvím:gen', 'prostřednictvím': 'prostřednictvím:gen', 'proti': 'proti:dat', + 'proto_aby': 'aby', 'protože': 'protože', # remove morphological case 'před_během': 'během:gen', # před a během utkání 'před_po': 'po:loc', # před a po vyloučení Schindlera 'přes': 'přes:acc', + 'přes_přes': 'přes:acc', # annotation error 'přestože': 'přestože', # remove morphological case 'při': 'při:loc', + 'při_pro': 'při:loc', 'při_příležitost': 
'při_příležitosti:gen', + 'ruka_v_ruka_s': 'ruku_v_ruce_s:ins', + 's_cíl': 's_cílem', # s cílem projednat X 's_ohled_k': 's_ohledem_k:dat', 's_ohled_na': 's_ohledem_na:acc', 's_pomoc': 's_pomocí:gen', + 's_postup': 'postupem:gen', 's_přihlédnutí_k': 's_přihlédnutím_k:dat', 's_přihlédnutí_na': 's_přihlédnutím_na:acc', 's_výjimka': 's_výjimkou:gen', + 's_výjimka_z': 's_výjimkou:gen', + 's_výjimka_že': 's_výjimkou_že', 's_vyloučení': 's_vyloučením:gen', 's_zřetel_k': 'se_zřetelem_k:dat', 's_zřetel_na': 'se_zřetelem_na:acc', @@ -146,20 +207,29 @@ class FixEdeprels(Block): 'směr_k': 'směrem_k:dat', 'směr_na': 'směrem_na:acc', 'směr_od': 'směrem_od:gen', + 'směr_přes': 'směrem_přes:acc', + 'směr_z': 'směrem_z:gen', 'společně_s': 'společně_s:ins', 'spolu': 'spolu_s:ins', 'spolu_s': 'spolu_s:ins', + 'spolu_se': 'spolu_s:ins', 'stranou': 'stranou:gen', + 'stranou_od': 'stranou:gen', 'takže': 'takže', # remove morphological case 'takže_a': 'takže', 'třebaže': 'třebaže', # remove morphological case + 'tvář_v_tvář': 'tváří_v_tvář:dat', 'u': 'u:gen', 'u_příležitost': 'u_příležitosti:gen', 'uprostřed': 'uprostřed:gen', 'uvnitř': 'uvnitř:gen', + 'v:ins': 'v:loc', # ve skutečností (překlep) 'v_analogie_s': 'v_analogii_s:ins', + 'v_blízkost': 'v_blízkosti:gen', + 'v_čas': 'v_čase:gen', 'v_čelo': 'v_čele:gen', 'v_čelo_s': 'v_čele_s:ins', + 'v_doba': 'v_době:gen', 'v_dohoda_s': 'v_dohodě_s:ins', 'v_duch': 'v_duchu:gen', 'v_důsledek': 'v_důsledku:gen', @@ -170,12 +240,14 @@ class FixEdeprels(Block): 'v_konfrontace_s': 'v_konfrontaci_s:ins', 'v_kontext_s': 'v_kontextu_s:ins', 'v_na': 'na:loc', + 'v_neprospěch': 'v_neprospěch:gen', 'v_oblast': 'v_oblasti:gen', 'v_oblast_s': 's:ins', 'v_obor': 'v_oboru:gen', 'v_otázka': 'v_otázce:gen', 'v_podoba': 'v_podobě:gen', 'v_poměr_k': 'v_poměru_k:dat', + 'v_porovnání_s': 'v_porovnání_s:ins', 'v_proces': 'v_procesu:gen', 'v_prospěch': 've_prospěch:gen', 'v_protiklad_k': 'v_protikladu_k:dat', @@ -183,27 +255,34 @@ class FixEdeprels(Block): 
'v_případ': 'v_případě:gen', 'v_případ_že': 'v_případě_že', 'v_rámec': 'v_rámci:gen', + 'v_reakce_na': 'v_reakci_na:acc', 'v_rozpor_s': 'v_rozporu_s:ins', 'v_řada': 'v_řadě:gen', 'v_shoda_s': 've_shodě_s:ins', 'v_služba': 've_službách:gen', 'v_směr': 've_směru:gen', 'v_směr_k': 've_směru_k:dat', + 'v_směr_na': 've_směru_k:dat', # same meaning as ve_směru_na:acc 'v_smysl': 've_smyslu:gen', 'v_součinnost_s': 'v_součinnosti_s:ins', 'v_souhlas_s': 'v_souhlasu_s:ins', 'v_soulad_s': 'v_souladu_s:ins', 'v_souvislost_s': 'v_souvislosti_s:ins', 'v_spojení_s': 've_spojení_s:ins', + 'v_spojení_se': 've_spojení_s:ins', 'v_spojený_s': 've_spojení_s:ins', 'v_spojitost_s': 've_spojitosti_s:ins', 'v_spolupráce_s': 've_spolupráci_s:ins', 'v_s_spolupráce': 've_spolupráci_s:ins', 'v_srovnání_s': 've_srovnání_s:ins', 'v_srovnání_se': 've_srovnání_s:ins', + 'v_stav': 've_stavu:gen', + 'v_stín': 've_stínu:gen', 'v_světlo': 've_světle:gen', + 'v_úroveň': 'v_úrovni:gen', 'v_věc': 've_věci:gen', 'v_vztah_k': 've_vztahu_k:dat', + 'v_vztah_s': 've_vztahu_k:dat', 'v_zájem': 'v_zájmu:gen', 'v_záležitost': 'v_záležitosti:gen', 'v_závěr': 'v_závěru:gen', @@ -212,9 +291,12 @@ class FixEdeprels(Block): 'v_znamení': 've_znamení:gen', 'včetně': 'včetně:gen', 'vedle': 'vedle:gen', + 'versus': 'versus:nom', 'vina': 'vinou:gen', 'vliv': 'vlivem:gen', + 'vlivem': 'vlivem:gen', 'vůči': 'vůči:dat', + 'výměna_za': 'výměnou_za:acc', 'vzhledem': 'vzhledem_k:dat', 'vzhledem_k': 'vzhledem_k:dat', 'z': 'z:gen', @@ -225,6 +307,7 @@ class FixEdeprels(Block): 'z_strana': 'ze_strany:gen', 'z_nedostatek': 'z_nedostatku:gen', 'z_titul': 'z_titulu:gen', + 'z_začátek': 'ze_začátku:gen', 'za_pomoc': 'za_pomoci:gen', 'za_účast': 'za_účasti:gen', 'za_účel': 'za_účelem:gen', @@ -262,13 +345,14 @@ def process_node(self, node): for edep in node.deps: m = re.match(r'^(obl(?::arg)?|nmod|advcl|acl(?::relcl)?):', edep['deprel']) if m: - bdeprel = m.group(1) solved = False # Issues caused by errors in the original annotation must 
be fixed early. # Especially if acl|advcl occurs with a preposition that unambiguously # receives a morphological case in the subsequent steps, and then gets # flagged as solved. edep['deprel'] = re.sub(r'^advcl:do(?::gen)?$', r'obl:do:gen', edep['deprel']) # od nevidím do nevidím ###!!! Ale měli bychom opravit i závislost v základním stromu! + edep['deprel'] = re.sub(r'^advcl:pro(?::acc)?$', r'advcl:aby', edep['deprel']) # byl by pro, abychom... ###!!! Opravit i konverzi stromu. + edep['deprel'] = re.sub(r'^advcl:s(?::ins)?$', r'advcl', edep['deprel']) ###!!! "seděli jsme tam s Člověče, nezlob se!" Měla by se opravit konverze stromu. edep['deprel'] = re.sub(r'^acl:k(?::dat)?$', r'acl', edep['deprel']) edep['deprel'] = re.sub(r'^advcl:k(?::dat)?$', r'obl:k:dat', edep['deprel']) ###!!! Ale měli bychom opravit i závislost v základním stromu! edep['deprel'] = re.sub(r'^advcl:místo(?::gen)?$', r'obl:místo:gen', edep['deprel']) # 'v poslední době se množí bysem místo bych' @@ -280,6 +364,9 @@ def process_node(self, node): edep['deprel'] = re.sub(r'^acl:v$', r'nmod:v:loc', edep['deprel']) edep['deprel'] = re.sub(r'^advcl:v$', r'obl:v:loc', edep['deprel']) edep['deprel'] = re.sub(r'^advcl:v_duchu?(?::gen)?$', r'obl:v_duchu:gen', edep['deprel']) + edep['deprel'] = re.sub(r'^nmod:když.*$', r'nmod', edep['deprel']) # nadějí když ne na zbohatnutí, tak alespoň na dobrou obživu ###!!! perhaps "když" or "když ne" should be analyzed as "cc" here! + edep['deprel'] = re.sub(r'^obl:ačkoli.*$', r'obl', edep['deprel']) # nadějí když ne na zbohatnutí, tak alespoň na dobrou obživu ###!!! perhaps "když" or "když ne" should be analyzed as "cc" here! + edep['deprel'] = re.sub(r'^obl:jestli(?::gen)?$', r'obl:gen', edep['deprel']) # nevím, jestli osmého nebo devátého září # Removing 'až' must be done early. The remainder may be 'počátek' # and we will want to convert it to 'počátkem:gen'. 
edep['deprel'] = re.sub(r'^(nmod|obl(?::arg)?):až_(.+):(gen|dat|acc|loc|ins)', r'\1:\2:\3', edep['deprel']) diff --git a/udapi/block/ud/cs/markfeatsbugs.py b/udapi/block/ud/cs/markfeatsbugs.py index 7e1f8ffb..17570ee2 100644 --- a/udapi/block/ud/cs/markfeatsbugs.py +++ b/udapi/block/ud/cs/markfeatsbugs.py @@ -7,7 +7,6 @@ Windows: python udapy read.Conllu files="a.conllu,b.conllu" merge=1 ud.cs.MarkFeatsBugs write.TextModeTreesHtml files="bugs.html" marked_only=1 layout=compact attributes=form,lemma,upos,xpos,feats,deprel,misc """ import udapi.block.ud.markfeatsbugs -import logging import re class MarkFeatsBugs(udapi.block.ud.markfeatsbugs.MarkFeatsBugs): @@ -30,7 +29,7 @@ def process_node(self, node): pass # NOUNS ################################################################ elif node.upos == 'NOUN': - self.check_required_features(node, ['Gender', 'Number', 'Case', 'Polarity']) + self.check_required_features(node, ['Gender', 'Number', 'Case']) if node.feats['VerbForm'] == 'Vnoun': # verbal nouns: bytí, dělání, ... 
self.check_allowed_features(node, { @@ -38,8 +37,8 @@ def process_node(self, node): 'Gender': ['Neut'], 'Number': ['Sing', 'Dual', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'Polarity': ['Pos', 'Neg'], - 'Foreign': ['Yes'] + 'Foreign': ['Yes'], + 'Abbr': ['Yes'] }) elif node.feats['Gender'] == 'Masc': self.check_required_features(node, ['Animacy']) @@ -48,18 +47,18 @@ def process_node(self, node): 'Animacy': ['Anim', 'Inan'], 'Number': ['Sing', 'Dual', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'Polarity': ['Pos', 'Neg'], - 'Foreign': ['Yes']}) + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) else: self.check_allowed_features(node, { 'Gender': ['Masc', 'Fem', 'Neut'], 'Number': ['Sing', 'Dual', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'Polarity': ['Pos', 'Neg'], - 'Foreign': ['Yes']}) + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) # PROPER NOUNS ######################################################### elif node.upos == 'PROPN': - self.check_required_features(node, ['Gender', 'Number', 'Case', 'Polarity']) + self.check_required_features(node, ['Gender', 'Number', 'Case']) if node.feats['Gender'] == 'Masc': self.check_required_features(node, ['Animacy']) self.check_allowed_features(node, { @@ -67,17 +66,17 @@ def process_node(self, node): 'Animacy': ['Anim', 'Inan'], 'Number': ['Sing', 'Dual', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'Polarity': ['Pos', 'Neg'], - 'NameType': ['Giv', 'Sur', 'Geo'], - 'Foreign': ['Yes']}) + 'NameType': ['Giv', 'Sur', 'Geo', 'Nat'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) else: self.check_allowed_features(node, { 'Gender': ['Masc', 'Fem', 'Neut'], 'Number': ['Sing', 'Dual', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'Polarity': ['Pos', 'Neg'], - 'NameType': ['Giv', 'Sur', 'Geo'], - 'Foreign': ['Yes']}) + 'NameType': ['Giv', 'Sur', 'Geo', 'Nat'], + 'Foreign': ['Yes'], + 'Abbr': ['Yes']}) # ADJECTIVES 
########################################################### elif node.upos == 'ADJ': if node.feats['Poss'] == 'Yes': # possessive adjectives @@ -90,7 +89,8 @@ def process_node(self, node): 'Animacy': ['Anim', 'Inan'], 'Number': ['Sing', 'Dual', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'NameType': ['Giv', 'Sur'], # for possessive adjectives derived from personal names + 'NameType': ['Giv', 'Sur', 'Nat'], # for possessive adjectives derived from personal names + 'Emph': ['Yes'], 'Foreign': ['Yes']}) else: self.check_required_features(node, ['Poss', 'Gender[psor]', 'Gender', 'Number', 'Case']) @@ -100,31 +100,37 @@ def process_node(self, node): 'Gender': ['Masc', 'Fem', 'Neut'], 'Number': ['Sing', 'Dual', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'NameType': ['Giv', 'Sur'], # for possessive adjectives derived from personal names + 'NameType': ['Giv', 'Sur', 'Nat'], # for possessive adjectives derived from personal names + 'Emph': ['Yes'], 'Foreign': ['Yes']}) - elif node.feats['NumType'] == 'Ord': # ordinal numerals are a subtype of adjectives + elif node.feats['NumType'] == 'Ord' or node.feats['NumType'] == 'Mult': # ordinal numerals are a subtype of adjectives; same for some multiplicative numerals (dvojí, trojí) if node.feats['Gender'] == 'Masc': self.check_required_features(node, ['NumType', 'Gender', 'Animacy', 'Number', 'Case']) self.check_allowed_features(node, { - 'NumType': ['Ord'], + 'NumType': ['Ord', 'Mult'], 'Gender': ['Masc', 'Fem', 'Neut'], 'Animacy': ['Anim', 'Inan'], 'Number': ['Sing', 'Dual', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Variant': ['Short'], # sedmer (Mult Short) duch tvój; pól čtverta (Ord Short) komára + 'Emph': ['Yes'], 'Foreign': ['Yes']}) else: self.check_required_features(node, ['NumType', 'Gender', 'Number', 'Case']) self.check_allowed_features(node, { - 'NumType': ['Ord'], + 'NumType': ['Ord', 'Mult'], 'Gender': ['Masc', 'Fem', 'Neut'], 
'Number': ['Sing', 'Dual', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Variant': ['Short'], + 'Emph': ['Yes'], 'Foreign': ['Yes']}) elif node.feats['VerbForm'] == 'Part': # participles (except l-participles) are a subtype of adjectives self.check_required_features(node, ['VerbForm', 'Voice']) - if node.feats['Voice'] == 'Act': # active participles have tense, passives don't + if node.feats['Voice'] == 'Act': # active participles have tense, passives don't but they have degree if node.feats['Gender'] == 'Masc': - self.check_required_features(node, ['VerbForm', 'Aspect', 'Voice', 'Tense', 'Gender', 'Animacy', 'Number', 'Case', 'Polarity']) + # Aspect is not required in general because it is omitted for participles of biaspectual verbs (e.g. 'analyzující'). + self.check_required_features(node, ['VerbForm', 'Voice', 'Tense', 'Gender', 'Animacy', 'Number', 'Case', 'Polarity']) self.check_allowed_features(node, { 'VerbForm': ['Part'], 'Aspect': ['Imp', 'Perf'], @@ -136,9 +142,11 @@ def process_node(self, node): 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], 'Polarity': ['Pos', 'Neg'], 'Variant': ['Short'], + 'Emph': ['Yes'], 'Foreign': ['Yes']}) else: - self.check_required_features(node, ['VerbForm', 'Aspect', 'Voice', 'Tense', 'Gender', 'Number', 'Case', 'Polarity']) + # Aspect is not required in general because it is omitted for participles of biaspectual verbs (e.g. 'analyzující'). 
+ self.check_required_features(node, ['VerbForm', 'Voice', 'Tense', 'Gender', 'Number', 'Case', 'Polarity']) self.check_allowed_features(node, { 'VerbForm': ['Part'], 'Aspect': ['Imp', 'Perf'], @@ -149,10 +157,12 @@ def process_node(self, node): 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], 'Polarity': ['Pos', 'Neg'], 'Variant': ['Short'], + 'Emph': ['Yes'], 'Foreign': ['Yes']}) else: if node.feats['Gender'] == 'Masc': - self.check_required_features(node, ['VerbForm', 'Aspect', 'Voice', 'Gender', 'Animacy', 'Number', 'Case', 'Polarity']) + # Aspect is not required in general because it is omitted for participles of biaspectual verbs (e.g. 'analyzovaný'). + self.check_required_features(node, ['VerbForm', 'Voice', 'Gender', 'Animacy', 'Number', 'Case', 'Polarity', 'Degree']) self.check_allowed_features(node, { 'VerbForm': ['Part'], 'Aspect': ['Imp', 'Perf'], @@ -162,10 +172,13 @@ def process_node(self, node): 'Number': ['Sing', 'Dual', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], 'Polarity': ['Pos', 'Neg'], + 'Degree': ['Pos', 'Cmp', 'Sup'], 'Variant': ['Short'], + 'Emph': ['Yes'], 'Foreign': ['Yes']}) else: - self.check_required_features(node, ['VerbForm', 'Aspect', 'Voice', 'Gender', 'Number', 'Case', 'Polarity']) + # Aspect is not required in general because it is omitted for participles of biaspectual verbs (e.g. 'analyzovaný'). 
+ self.check_required_features(node, ['VerbForm', 'Voice', 'Gender', 'Number', 'Case', 'Polarity', 'Degree']) self.check_allowed_features(node, { 'VerbForm': ['Part'], 'Aspect': ['Imp', 'Perf'], @@ -174,29 +187,11 @@ def process_node(self, node): 'Number': ['Sing', 'Dual', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], 'Polarity': ['Pos', 'Neg'], + 'Degree': ['Pos', 'Cmp', 'Sup'], 'Variant': ['Short'], + 'Emph': ['Yes'], 'Foreign': ['Yes']}) - elif node.feats['Variant'] == 'Short': # short (nominal) forms of adjectives have no degree - if node.feats['Gender'] == 'Masc': - self.check_required_features(node, ['Gender', 'Animacy', 'Number', 'Case', 'Polarity', 'Variant']) - self.check_allowed_features(node, { - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Animacy': ['Anim', 'Inan'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'Polarity': ['Pos', 'Neg'], - 'Variant': ['Short'], - 'Foreign': ['Yes']}) - else: - self.check_required_features(node, ['Gender', 'Number', 'Case', 'Polarity', 'Variant']) - self.check_allowed_features(node, { - 'Gender': ['Masc', 'Fem', 'Neut'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], - 'Polarity': ['Pos', 'Neg'], - 'Variant': ['Short'], - 'Foreign': ['Yes']}) - else: # regular adjectives + else: # regular adjectives, including short forms if node.feats['Gender'] == 'Masc': self.check_required_features(node, ['Gender', 'Animacy', 'Number', 'Case', 'Degree', 'Polarity']) self.check_allowed_features(node, { @@ -206,6 +201,8 @@ def process_node(self, node): 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], 'Degree': ['Pos', 'Cmp', 'Sup'], 'Polarity': ['Pos', 'Neg'], + 'Variant': ['Short'], + 'Emph': ['Yes'], 'Foreign': ['Yes']}) else: self.check_required_features(node, ['Gender', 'Number', 'Case', 'Degree', 'Polarity']) @@ -215,6 +212,8 @@ def process_node(self, node): 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 
'Voc', 'Loc', 'Ins'], 'Degree': ['Pos', 'Cmp', 'Sup'], 'Polarity': ['Pos', 'Neg'], + 'Variant': ['Short'], + 'Emph': ['Yes'], 'Foreign': ['Yes']}) # PRONOUNS ############################################################# elif node.upos == 'PRON': @@ -235,16 +234,19 @@ def process_node(self, node): 'PronType': ['Prs'], 'Person': ['3'] }) - elif node.feats['Variant'] == 'Short': # ho, mu - # The short (clitic) forms do not have PrepCase. - self.check_adjective_like(node, ['PronType', 'Person'], { + elif re.match(r"^(ho|mu)$", node.form.lower()): + # The short (clitic) forms do not have PrepCase in Modern Czech. + # Old Czech has also 'jmu' (besides 'jemu' and 'mu') and 'jho' + # (besides 'jeho' and 'ho'); it should not have Variant=Short + # and it should have PrepCase=Npr (the next block). + self.check_adjective_like(node, ['PronType', 'Person', 'Variant'], { 'PronType': ['Prs'], 'Person': ['3'], 'Variant': ['Short'] }) else: # jeho, něho, jemu, němu, jej, něj, něm, jím, ním, jí, ní, ji, ni, je, ně # Mostly only two gender groups and no animacy: - # Masc,Neut ... jeho, jemu, jej, něm, jím + # Masc,Neut ... jeho, jho, jemu, jmu, jej, něm, jím # Fem ... jí, ji, ní # Neut ... je # No gender in dual and plural: @@ -264,18 +266,22 @@ def process_node(self, node): 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], 'Variant': ['Short'] }) - elif re.search(r'k[dt]o', node.lemma): # kdo (kto), kdož, někdo, nikdo + elif re.search(r'k[dt][oe]', node.lemma): # kdo (kto), kdož, někdo, nikdo # There is no Number. Někdo and nikdo behave like singular; - # kdo is by default singular as well but it also occurs as a subject - # of plural verbs. + # kdo is by default singular as well but it also occurs as subject + # of plural verbs ("ti, kdo nepřišli včas, byli vyloučeni"). + # In Old Czech, "nikde" is a variant of the pronoun "nikdo" (nobody) + # (while in New Czech, "nikde" (nowhere) is a pronominal adverb only). 
+ # Old Czech data disambiguate Int from Rel (Int is used only in direct questions with; indirect questions like "Ptal ses, kdo to je?" use Rel.) + # New Czech data, in particular PDT, use Int,Rel regardless of context. self.check_required_features(node, ['PronType', 'Gender', 'Animacy', 'Case']) self.check_allowed_features(node, { - 'PronType': ['Int,Rel', 'Rel', 'Ind', 'Neg'], + 'PronType': ['Int,Rel', 'Int', 'Rel', 'Ind', 'Neg'], 'Gender': ['Masc'], 'Animacy': ['Anim'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Loc', 'Ins'] }) - elif re.match(r'^(co|což|něco|nicož)$', node.lemma): + elif re.match(r'^(co|což|což?koliv?|něco|lečco|lecco|nic|nicož)$', node.lemma): # Although these pronouns behave by default as neuter singular, # no Gender and Number is annotated. However, quite unusually, # there is Animacy=Inan without Gender. @@ -284,9 +290,11 @@ def process_node(self, node): ###!!! animacy. For now, let's at least make animacy an optional ###!!! feature (I see that we already do not fill it in the Old ###!!! Czech data). + # Old Czech data disambiguate Int from Rel (Int is used only in direct questions with; indirect questions like "Ptal ses, co to je?" use Rel.) + # New Czech data, in particular PDT, use Int,Rel regardless of context. self.check_required_features(node, ['PronType', 'Case']) self.check_allowed_features(node, { - 'PronType': ['Int,Rel', 'Rel', 'Ind', 'Neg'], + 'PronType': ['Int,Rel', 'Int', 'Rel', 'Ind', 'Neg'], 'Animacy': ['Inan'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Loc', 'Ins'] }) @@ -294,6 +302,9 @@ def process_node(self, node): # Unlike 'jenžto', this relative pronoun does not inflect, it # always occurs in a nominative position, but the context can # be any gender and number. + # Update from the Hičkok project: 'ješto' is lemmatized to + # 'jenžto' (see below), meaning that this branch should not be + # needed for the new data. 
self.check_required_features(node, ['PronType', 'Case']) self.check_allowed_features(node, { 'PronType': ['Rel'], @@ -312,10 +323,24 @@ def process_node(self, node): # Unlike 'on', 'jenž' has the feature PrepCase everywhere, even # in the nominative, although there is no prepositional counter- # part (but similarly the locative has no prepositionless form). - self.check_adjective_like(node, ['PronType', 'PrepCase'], { - 'PronType': ['Rel'], - 'PrepCase': ['Npr', 'Pre'] - }) + # Update from the Hičkok project: In Old Czech, both 'jenž' and + # 'jenžto' (or its variant 'ješto') can be used uninflected, + # accompanied by a resumptive pronoun which provides the inflection. + # In this case, the Hičkok data will not annotate Gender, Animacy, + # Number and Case of the relative pronoun. Therefore, we require + # the full set of features if any of them is present; otherwise, + # we only expect PronType and PrepCase. + if node.feats['Gender'] != '' or node.feats['Animacy'] != '' or node.feats['Number'] != '' or node.feats['Case'] != '': + self.check_adjective_like(node, ['PronType', 'PrepCase'], { + 'PronType': ['Rel'], + 'PrepCase': ['Npr', 'Pre'] + }) + else: + self.check_required_features(node, ['PronType', 'PrepCase']) + self.check_allowed_features(node, { + 'PronType': ['Rel'], + 'PrepCase': ['Npr'] + }) else: # What remains is the relative pronoun 'an'. It behaves similarly # to 'jenž' but it does not have the PrepCase feature and it @@ -334,6 +359,7 @@ def process_node(self, node): self.check_allowed_features(node, { 'PronType': ['Rel'], 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], 'Number': ['Sing', 'Dual', 'Plur'], 'Case': ['Nom'] }) @@ -341,45 +367,129 @@ def process_node(self, node): elif node.upos == 'DET': # Possessive determiners 'jeho' and 'jejich' (formerly 'jich') do not inflect, i.e., no Gender, Number, Case. # Note that the possessive determiner 'její' (formerly 'jejie') does inflect, although it also has the lemma 'jeho'. 
- if re.match(r'^(jeho|jejich|jich)(ž(to)?)?$', node.form.lower()): + if re.match(r'^(je?ho|jejich|j[ií]ch)$', node.form.lower()): self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]']) self.check_allowed_features(node, { - 'PronType': ['Prs', 'Rel'], + 'PronType': ['Prs'], 'Poss': ['Yes'], 'Person': ['3'], 'Number[psor]': ['Sing', 'Dual', 'Plur'], - 'Gender[psor]': ['Masc,Neut'] + 'Gender[psor]': ['Masc', 'Neut', 'Masc,Neut'], + 'Gender': ['Masc', 'Fem', 'Neut'], # uninflected in modern Czech, but old Czech annotations sometime indicate the modified gender by context + 'Animacy': ['Anim', 'Inan'], # uninflected in modern Czech, but old Czech annotations sometime indicate the modified gender by context + 'Number': ['Sing', 'Dual', 'Plur'], # uninflected in modern Czech, but old Czech annotations sometime indicate the modified number by context + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] # uninflected in modern Czech, but old Czech annotations sometime indicate the case by context + # PrepCase is not allowed when it is a possessive determiner because no n-form can be used (jeho dům VS. na jeho dům). + # Compare with genitive/accusative of the pronoun "on", there the form changes after preposition and PrepCase must be annotated + # (jeho se bojím VS. bez něho se neobejdu). }) - elif re.match(r'^(její|jejie|jejího|jejieho|jejímu|jejiemu|jejím|jejiem|jejiej|jejíma|jejiema|jejích|jejiech|jejími|jejiemi)(ž(to)?)?$', node.form.lower()): + # Relative possessive determiners 'jehož' and 'jejichž' behave similarly + # to the personal possessive determiners but they do not have Person. 
+ elif re.match(r'^(jeho|jejich|j[ií]ch)ž(e|to)?$', node.form.lower()): + self.check_required_features(node, ['PronType', 'Poss', 'Number[psor]']) + self.check_allowed_features(node, { + 'PronType': ['Rel'], + 'Poss': ['Yes'], + 'Number[psor]': ['Sing', 'Dual', 'Plur'], + 'Gender[psor]': ['Masc', 'Neut', 'Masc,Neut'], + 'Gender': ['Masc', 'Fem', 'Neut'], # uninflected in modern Czech, but old Czech annotations sometime indicate the modified gender by context + 'Animacy': ['Anim', 'Inan'], # uninflected in modern Czech, but old Czech annotations sometime indicate the modified gender by context + 'Number': ['Sing', 'Dual', 'Plur'], # uninflected in modern Czech, but old Czech annotations sometime indicate the modified number by context + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] # uninflected in modern Czech, but old Czech annotations sometime indicate the case by context + # PrepCase is not allowed when it is a possessive determiner (muž, jehož manželka zahynula při nehodě) because no n-form can be used + # (after preposition: muž, na jehož manželku jste si stěžoval). Compare with genitive/accusative of the relative pronoun "jenž", + # there the form changes after preposition and PrepCase must be annotated (muž, jehož se bojím VS. muž, bez něhož se neobejdeme). + }) + # Feminine personal possessive determiner. + elif re.match(r'^(její|jeje|jejie|jejího|jejieho|jejímu|jejiemu|jejím|jejiem|jejiej|jejíma|jejiema|jejích|jejiech|jejími|jejiemi)$', node.form.lower()): # The feminine possessive 'její' slightly inflects, unlike 'jeho' and 'jejich'. - # Congruent gender is annotated only in singular. Masculine and - # neuter are merged even in nominative. Feminine singular does - # not distinguish case in PDT but we need it in Old Czech at - # least for 'jejiej'. 
- if node.feats['Number'] == 'Sing': + # Congruent gender: + # - in PDT, only in singular; masculine and neuter are merged even in nominative + # - in Old Czech data, gender is disambiguated by context (no merging), even in dual and plural + # Case: + # - in PDT, not distinguished in feminine singular (její bota, její boty, její botě, její botu...) + # - in Old Czech data, distinguished always (and needed at least for 'jejiej') + if self.pdt20: + if node.feats['Number'] == 'Sing': + self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]', 'Gender[psor]', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Poss': ['Yes'], + 'Person': ['3'], + 'Number[psor]': ['Sing'], + 'Gender[psor]': ['Fem'], + 'Gender': ['Masc,Neut', 'Fem'], + 'Number': ['Sing'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + else: + self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]', 'Gender[psor]', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Prs'], + 'Poss': ['Yes'], + 'Person': ['3'], + 'Number[psor]': ['Sing'], + 'Gender[psor]': ['Fem'], + 'Number': ['Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + else: self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]', 'Gender[psor]', 'Gender', 'Number', 'Case']) self.check_allowed_features(node, { - 'PronType': ['Prs', 'Rel'], + 'PronType': ['Prs'], 'Poss': ['Yes'], 'Person': ['3'], 'Number[psor]': ['Sing'], 'Gender[psor]': ['Fem'], - 'Gender': ['Masc,Neut', 'Fem'], - 'Number': ['Sing'], + 'Gender': ['Masc', 'Neut', 'Fem'], + 'Animacy': ['Anim', 'Inan'], # only for Gender=Masc + 'Number': ['Sing', 'Dual', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] }) + # Feminine relative possessive determiner. 
+ elif re.match(r'^(její|jeje|jejie|jejího|jejieho|jejímu|jejiemu|jejím|jejiem|jejiej|jejíma|jejiema|jejích|jejiech|jejími|jejiemi)(ž(e|to)?)$', node.form.lower()): + # The feminine possessive 'jejíž' slightly inflects, unlike 'jehož' and 'jejichž'. + # Congruent gender: + # - in PDT, only in singular; masculine and neuter are merged even in nominative + # - in Old Czech data, gender is disambiguated by context (no merging), even in dual and plural + # Case: + # - in PDT, not distinguished in feminine singular (jejíž bota, jejíž boty, jejíž botě, jejíž botu...) + # - in Old Czech data, distinguished always (and needed at least for 'jejiejž') + if self.pdt20: + if node.feats['Number'] == 'Sing': + self.check_required_features(node, ['PronType', 'Poss', 'Number[psor]', 'Gender[psor]', 'Gender', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Rel'], + 'Poss': ['Yes'], + 'Number[psor]': ['Sing'], + 'Gender[psor]': ['Fem'], + 'Gender': ['Masc,Neut', 'Fem'], + 'Number': ['Sing'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) + else: + self.check_required_features(node, ['PronType', 'Poss', 'Number[psor]', 'Gender[psor]', 'Number', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Rel'], + 'Poss': ['Yes'], + 'Number[psor]': ['Sing'], + 'Gender[psor]': ['Fem'], + 'Number': ['Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) else: - self.check_required_features(node, ['PronType', 'Poss', 'Person', 'Number[psor]', 'Gender[psor]', 'Number', 'Case']) + self.check_required_features(node, ['PronType', 'Poss', 'Number[psor]', 'Gender[psor]', 'Gender', 'Number', 'Case']) self.check_allowed_features(node, { - 'PronType': ['Prs', 'Rel'], + 'PronType': ['Rel'], 'Poss': ['Yes'], - 'Person': ['3'], 'Number[psor]': ['Sing'], 'Gender[psor]': ['Fem'], - 'Number': ['Dual', 'Plur'], + 'Gender': ['Masc', 'Neut', 'Fem'], + 'Animacy': ['Anim', 'Inan'], # only for Gender=Masc + 'Number': ['Sing', 'Dual', 
'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] }) - elif node.feats['Poss'] == 'Yes': # 'můj', 'tvůj', 'svůj' + elif re.match(r'^(můj|tvůj|svůj)$', node.lemma): if node.feats['Reflex'] == 'Yes': self.check_adjective_like(node, ['PronType', 'Poss', 'Reflex'], { 'PronType': ['Prs'], @@ -393,11 +503,41 @@ def process_node(self, node): 'Person': ['1', '2'], 'Number[psor]': ['Sing', 'Plur'] }) - elif re.match(r'^(samý)$', node.lemma): + elif re.match(r'^(ně|lec|ni)?číž?(koliv?)?$', node.lemma): + self.check_adjective_like(node, ['PronType', 'Poss'], { + 'PronType': ['Int', 'Rel', 'Ind', 'Neg'], + 'Poss': ['Yes'] + }) + elif re.match(r'^(sám|samý)$', node.lemma): + # The above condition looks at both lemma options, although only one lemma is assumed. + # However, in New Czech data the one lemma is "samý" while in Old Czech data it is "sám". # Unlike other determiners, it allows Variant=Short: sám, sama, samu, samo, sami, samy. self.check_adjective_like(node, ['PronType'], {'PronType': ['Emp'], 'Variant': ['Short']}) + elif node.lemma == 'veškerý': + # In Old Czech, this determiner also allows Variant=Short: veškeren, veškera, veškeru, veškero, veškeři, veškery. + self.check_adjective_like(node, ['PronType'], {'PronType': ['Tot'], 'Variant': ['Short']}) + elif node.lemma == 'žádný': + # In Old Czech, this determiner also allows Variant=Short: žáden, žádna, žádnu, žádno, žádni, žádny. + self.check_adjective_like(node, ['PronType'], {'PronType': ['Neg'], 'Variant': ['Short']}) + elif node.feats['NumType'] == 'Card': # pronominal quantifiers 'mnoho', 'málo', 'několik' etc. + if node.lemma == 'nejeden': + self.check_adjective_like(node, ['PronType', 'NumType'], {'PronType': ['Ind'], 'NumType': ['Card']}) + else: + # Lemmas 'hodně' and 'málo' have Degree even if used as quantifiers and not adverbs: + # hodně, více, nejvíce; málo, méně, nejméně + # Lemmas 'mnoho' and 'málo' can be negated (nemnoho, nemálo). 
+ self.check_required_features(node, ['PronType', 'NumType', 'Case']) + self.check_allowed_features(node, { + 'PronType': ['Ind', 'Int', 'Rel', 'Dem'], + 'NumType': ['Card'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'], + 'Degree': ['Pos', 'Cmp', 'Sup'], + 'Polarity': ['Pos', 'Neg'] + }) else: - self.check_adjective_like(node, ['PronType'], {'PronType': ['Dem', 'Int,Rel', 'Rel', 'Ind', 'Neg', 'Tot']}) + # Old Czech data disambiguate Int from Rel (Int is used only in direct questions with; indirect questions like "Ptal ses, kde to je?" use Rel.) + # New Czech data, in particular PDT, use Int,Rel regardless of context. + self.check_adjective_like(node, ['PronType'], {'PronType': ['Dem', 'Int,Rel', 'Int', 'Rel', 'Ind', 'Neg', 'Tot']}) # NUMERALS ############################################################# elif node.upos == 'NUM': self.check_required_features(node, ['NumType', 'NumForm']) @@ -412,6 +552,8 @@ def process_node(self, node): # 'dva', 'oba' have Gender, Number=Dual(Plur in modern Czech), Case: dva, dvě, dvou, dvěma. # 'tři', 'čtyři' have Number=Plur, Case: tři, třech, třem, třemi. # 'pět' and more have Number=Plur, Case: pět, pěti. + # 'půl' has no Number and Case, although it behaves syntactically similarly to 'pět' (but genitive is still 'půl', not '*půli'). + # 'sto', 'tisíc', 'milión', 'miliarda' etc. have Gender (+ possibly Animacy) and Number (depending on their form). 
if node.lemma == 'jeden': self.check_required_features(node, ['NumType', 'NumForm', 'Number', 'Case']) self.check_allowed_features(node, { @@ -427,6 +569,7 @@ def process_node(self, node): if self.pdt20: self.check_allowed_features(node, { 'NumType': ['Card'], + 'PronType': ['Tot'], # for 'oba' 'NumForm': ['Word'], 'Gender': ['Masc', 'Masc,Neut', 'Fem', 'Fem,Neut', 'Neut'], # similarly to determiners, genders are merged in some slots of the paradigm 'Number': ['Dual', 'Plur'], @@ -435,31 +578,63 @@ def process_node(self, node): else: self.check_allowed_features(node, { 'NumType': ['Card'], + 'PronType': ['Tot'], # for 'oba' 'NumForm': ['Word'], 'Gender': ['Masc', 'Fem', 'Neut'], 'Animacy': ['Anim', 'Inan'], 'Number': ['Dual', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] }) + elif node.lemma == 'půl': + self.check_required_features(node, ['NumType', 'NumForm']) + self.check_allowed_features(node, { + 'NumType': ['Card'], + 'NumForm': ['Word'] + }) + elif re.match(r'^(sto|tisíc|.+ili[oó]n|.+iliarda)$', node.lemma): + self.check_required_features(node, ['NumType', 'NumForm', 'Number', 'Case']) + self.check_allowed_features(node, { + 'NumType': ['Card', 'Sets'], + 'NumForm': ['Word'], + 'Gender': ['Masc', 'Fem', 'Neut'], + 'Animacy': ['Anim', 'Inan'], + 'Number': ['Sing', 'Dual', 'Plur'], + 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] + }) else: + # In PDT, cardinal numerals higher than four in nominative/accusative/vocative + # have Number=Sing instead of Plur! It may be motivated by the default + # agreement they trigger on verbs (but they don't have Gender=Neut). + # It does not make much sense but we must allow Sing before a better + # approach is defined and implemented in the data. + # On the other hand, we may want to allow Dual for "stě". 
self.check_required_features(node, ['NumType', 'NumForm', 'Number', 'Case']) self.check_allowed_features(node, { - 'NumType': ['Card'], + 'NumType': ['Card', 'Sets'], 'NumForm': ['Word'], - 'Number': ['Plur'], + 'Number': ['Sing', 'Dual', 'Plur'], 'Case': ['Nom', 'Gen', 'Dat', 'Acc', 'Voc', 'Loc', 'Ins'] }) # VERBS AND AUXILIARIES ################################################ - elif re.match(r'^(VERB|AUX)$', node.upos): - self.check_required_features(node, ['Aspect', 'VerbForm']) - if node.feats['VerbForm'] == 'Inf': + elif node.upos in ['VERB', 'AUX']: + # There are only three lemmas recognized as AUX in Czech. This is not + # about features and it would be caught by the UD validator, but it + # is error in morphology, so let's report it here as well. + if node.upos == 'AUX' and node.lemma not in ['být', 'bývat', 'bývávat']: + self.bug(node, 'NonAuxLemma') + # All Czech verbs (and some adjectives and nouns) must have VerbForm. + # Almost all verbs have lexical Aspect but we cannot require it + # because there are a few biaspectual verbs (e.g. 'analyzovat') that + # do not have the feature. + self.check_required_features(node, ['VerbForm']) + if node.feats['VerbForm'] in ['Inf', 'Sup']: # There is no voice. For some reason, PDT does not annotate that # the infinitive form is active (while a passive infinitive is # a combination of the infinitive with a passive participle). self.check_required_features(node, ['Polarity']) self.check_allowed_features(node, { 'Aspect': ['Imp', 'Perf'], - 'VerbForm': ['Inf'], + 'VerbForm': ['Inf', 'Sup'], 'Polarity': ['Pos', 'Neg'] }) elif node.feats['VerbForm'] == 'Fin': @@ -467,24 +642,46 @@ def process_node(self, node): # imperatives (although passive imperatives are a combination # of the active imperative and a passive participle). It is # also not annotated at the conditional auxiliary 'bych', 'bys', 'by', 'bychom', 'byste'. 
+ # Conditional "by" has no person and number (it is typically + # 3rd person but it could be other persons, too, as in "ty by + # ses bál"). if node.feats['Mood'] == 'Cnd': - self.check_required_features(node, ['Mood', 'Person']) - self.check_allowed_features(node, { - 'Aspect': ['Imp', 'Perf'], - 'VerbForm': ['Fin'], - 'Mood': ['Cnd'], - 'Person': ['1', '2', '3'], - 'Number': ['Sing', 'Dual', 'Plur'] # optional: it is not annotated in the third person - }) + if node.form.lower() == 'by': + self.check_required_features(node, ['Mood']) + self.check_allowed_features(node, { + 'Aspect': ['Imp'], + 'VerbForm': ['Fin'], + 'Mood': ['Cnd'] + }) + elif node.form.lower() == 'byšta': + self.check_required_features(node, ['Mood', 'Person', 'Number']) + self.check_allowed_features(node, { + 'Aspect': ['Imp'], + 'VerbForm': ['Fin'], + 'Mood': ['Cnd'], + 'Person': ['2', '3'], + 'Number': ['Dual'] + }) + else: + self.check_required_features(node, ['Mood', 'Person', 'Number']) + self.check_allowed_features(node, { + 'Aspect': ['Imp'], + 'VerbForm': ['Fin'], + 'Mood': ['Cnd'], + 'Person': ['1', '2'], + 'Number': ['Sing', 'Dual', 'Plur'] + }) elif node.feats['Mood'] == 'Imp': self.check_required_features(node, ['Mood', 'Person', 'Number', 'Polarity']) self.check_allowed_features(node, { 'Aspect': ['Imp', 'Perf'], 'VerbForm': ['Fin'], 'Mood': ['Imp'], + 'Voice': ['Act'], # optional in Old Czech data, not used with imperatives in Modern Czech data (at least not yet) 'Person': ['1', '2', '3'], # 3rd person imperative occasionally occurs in old Czech (but the form is identical to 2nd person) 'Number': ['Sing', 'Dual', 'Plur'], - 'Polarity': ['Pos', 'Neg'] + 'Polarity': ['Pos', 'Neg'], + 'Emph': ['Yes'] }) else: # indicative self.check_required_features(node, ['Mood', 'Voice', 'Tense', 'Person', 'Number', 'Polarity']) @@ -524,43 +721,134 @@ def process_node(self, node): 'Polarity': ['Pos', 'Neg'] }) else: # converb - self.check_required_features(node, ['Tense', 'Number', 'Voice', 
'Polarity']) - self.check_allowed_features(node, { - 'Aspect': ['Imp', 'Perf'], - 'VerbForm': ['Conv'], - 'Tense': ['Past', 'Pres'], - 'Voice': ['Act'], - 'Number': ['Sing', 'Dual', 'Plur'], - 'Gender': ['Masc', 'Fem', 'Neut'], # annotated only in singular, and no animacy - 'Polarity': ['Pos', 'Neg'] - }) + # Old Czech data annotate converb gender by context rather than form + # (because the form was different than in Modern Czech) and for + # masculines they also include animacy. In Modern Czech animacy is + # currently not annotated and Masc,Neut gender is merged. + if node.feats['Number'] == 'Sing': + if node.feats['Gender'] == 'Masc': + self.check_required_features(node, ['Tense', 'Gender', 'Animacy', 'Number', 'Voice', 'Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Conv'], + 'Tense': ['Past', 'Pres'], + 'Voice': ['Act'], # passive participle is ADJ, so we will not encounter it under VERB + 'Number': ['Sing'], + 'Gender': ['Masc'], + 'Animacy': ['Anim', 'Inan'], + 'Polarity': ['Pos', 'Neg'] + }) + else: + self.check_required_features(node, ['Tense', 'Gender', 'Number', 'Voice', 'Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Conv'], + 'Tense': ['Past', 'Pres'], + 'Voice': ['Act'], # passive participle is ADJ, so we will not encounter it under VERB + 'Number': ['Sing'], + 'Gender': ['Fem', 'Neut'], + 'Polarity': ['Pos', 'Neg'] + }) + else: + self.check_required_features(node, ['Tense', 'Number', 'Voice', 'Polarity']) + self.check_allowed_features(node, { + 'Aspect': ['Imp', 'Perf'], + 'VerbForm': ['Conv'], + 'Tense': ['Past', 'Pres'], + 'Voice': ['Act'], + 'Number': ['Dual', 'Plur'], + 'Polarity': ['Pos', 'Neg'] + }) # ADVERBS ############################################################## elif node.upos == 'ADV': - if node.feats['PronType'] != '': - # Pronominal adverbs are neither compared nor negated. 
+ if node.feats['NumType'] != '': + # Adverbial multiplicative numerals (jednou, dvakrát, třikrát) + # belong here. They have also pronominal counterparts (kolikrát, + # tolikrát, několikrát). There are also adverbial ordinal numerals + # (zaprvé, poprvé, zadruhé, podruhé). + # Old Czech data disambiguate Int from Rel (Int is used only in direct questions with question mark; indirect questions like "Ptal ses, kde to je?" use Rel.) + # New Czech data, in particular PDT, use Int,Rel regardless of context. self.check_allowed_features(node, { - 'PronType': ['Dem', 'Int,Rel', 'Ind', 'Neg', 'Tot'] - }) - elif node.feats['Degree'] != '': - # Adverbs that are compared can also be negated. - self.check_required_features(node, ['Degree', 'Polarity']) - self.check_allowed_features(node, { - 'Degree': ['Pos', 'Cmp', 'Sup'], - 'Polarity': ['Pos', 'Neg'] + 'NumType': ['Mult', 'Ord'], + 'PronType': ['Dem', 'Int', 'Rel', 'Int,Rel', 'Ind'] }) + elif self.pdt20: + if node.feats['PronType'] != '': + # Pronominal adverbs in PDT are neither compared nor negated. + # New Czech data, in particular PDT, use Int,Rel regardless of context. + self.check_allowed_features(node, { + 'PronType': ['Dem', 'Int,Rel', 'Ind', 'Neg', 'Tot'] + }) + elif node.feats['Degree'] != '': + # Adverbs that are compared can also be negated. + self.check_required_features(node, ['Degree', 'Polarity']) + self.check_allowed_features(node, { + 'Degree': ['Pos', 'Cmp', 'Sup'], + 'Polarity': ['Pos', 'Neg'] + }) + else: + # The remaining adverbs are neither pronominal, nor compared or + # negated. + self.check_allowed_features(node, {}) else: - # The remaining adverbs are neither pronominal, nor compared or - # negated. - self.check_allowed_features(node, {}) + if node.feats['PronType'] == 'Tot': + # Total adverbs in Old Czech can be negated: vždy, nevždy. + # Then for consistence with other adverbs, we also require + # Degree, although it will be always Pos. 
+ self.check_required_features(node, ['Degree', 'Polarity']) + self.check_allowed_features(node, { + 'PronType': ['Tot'], + 'Degree': ['Pos'], + 'Polarity': ['Pos', 'Neg'] + }) + elif node.feats['PronType'] != '': + # Other pronominal adverbs are neither compared nor negated. + # Old Czech data disambiguate Int from Rel (Int is used only in direct questions with question mark; indirect questions like "Ptal ses, kde to je?" use Rel.) + self.check_allowed_features(node, { + 'PronType': ['Dem', 'Int', 'Rel', 'Ind', 'Neg'] + }) + else: + # All other adverbs should have both Degree and Polarity, + # although for some of them the values will always be Pos. + self.check_required_features(node, ['Degree', 'Polarity']) + self.check_allowed_features(node, { + 'Degree': ['Pos', 'Cmp', 'Sup'], + 'Polarity': ['Pos', 'Neg'], + 'Emph': ['Yes'], + 'Abbr': ['Yes'] + }) # ADPOSITIONS ########################################################## elif node.upos == 'ADP': self.check_required_features(node, ['AdpType', 'Case']) self.check_allowed_features(node, { 'AdpType': ['Prep', 'Voc'], - 'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Ins'] + 'Case': ['Gen', 'Dat', 'Acc', 'Loc', 'Ins'], + 'Abbr': ['Yes'] + }) + # SUBORDINATING CONJUNCTIONS ########################################### + elif node.upos == 'SCONJ': + self.check_allowed_features(node, { + 'Emph': ['Yes'] + }) + # COORDINATING CONJUNCTIONS ############################################ + elif node.upos == 'CCONJ': + self.check_allowed_features(node, { + 'Emph': ['Yes'] + }) + # PARTICLES ############################################################ + elif node.upos == 'PART': + # "t." 
= "totiž" + self.check_allowed_features(node, { + 'Abbr': ['Yes'] }) # THE REST: NO FEATURES ################################################ + # (OR UNDEFINED UPOS) ################################################## else: + if not node.upos in ['INTJ', 'PUNCT', 'SYM', 'X']: + bugmsg = 'UnknownUpos' + if node.upos: + bugmsg += node.upos + self.bug(node, bugmsg) self.check_allowed_features(node, {}) def check_adjective_like(self, node, r0, a0): @@ -575,7 +863,7 @@ def check_adjective_like(self, node, r0, a0): caller in parameters r0 (list) and a0 (dict). """ required_features = [] - allowed_featurs = {} + allowed_features = {} full_set = node.upos == 'ADJ' or not self.pdt20 if full_set: # Even in the full set, animacy is only distinguished for the diff --git a/udapi/block/ud/fixadvmodbyupos.py b/udapi/block/ud/fixadvmodbyupos.py index 781e7586..916910b5 100644 --- a/udapi/block/ud/fixadvmodbyupos.py +++ b/udapi/block/ud/fixadvmodbyupos.py @@ -29,10 +29,59 @@ def process_node(self, node): node.deprel = 'discourse' else: node.deprel = 'dep' - ###!!! The following are not advmod so they should probably have their own block or this block should have a different name. + ###!!! The following are not advmod so they should probably have their + ###!!! own block or this block should have a different name. 
elif node.udeprel == 'expl': if node.upos == 'AUX': node.deprel = 'aux' + elif node.upos == 'ADP': + node.deprel = 'case' + elif node.upos == 'ADV': + node.deprel = 'advmod' + elif node.upos == 'CCONJ': + node.deprel = 'cc' + elif node.udeprel in ['aux', 'cop']: + if node.upos != 'AUX': + node.deprel = 'dep' + elif node.udeprel == 'case': + if node.upos == 'DET': + node.deprel = 'det' + elif node.upos == 'PRON': + node.deprel = 'nmod' elif node.udeprel == 'mark': - if node.upos == 'PRON': + if node.upos in ['PRON', 'DET']: node.deprel = 'nsubj' # it could be also obj, iobj, obl or nmod; just guessing what might be more probable + elif node.upos == 'NOUN': + node.deprel = 'obl' + elif node.upos == 'INTJ': + node.deprel = 'discourse' + elif node.udeprel == 'cc': + if node.upos == 'AUX': + node.deprel = 'aux' + elif node.upos == 'DET': + node.deprel = 'det' + elif node.upos == 'INTJ': + node.deprel = 'discourse' + elif node.udeprel == 'det': + if node.upos == 'NOUN': + node.deprel = 'nmod' + elif node.upos == 'ADJ': + node.deprel = 'amod' + elif node.upos == 'ADV': + node.deprel = 'advmod' + elif node.upos == 'AUX': + node.deprel = 'aux' + elif node.upos == 'VERB': + node.deprel = 'dep' + elif node.upos == 'SCONJ': + node.deprel = 'mark' + elif node.upos == 'X': + node.deprel = 'dep' + elif node.udeprel == 'nummod': + if node.upos == 'PRON': + node.deprel = 'nmod' + elif node.upos == 'DET': + node.deprel = 'det' + elif node.udeprel == 'punct': + if node.upos != 'PUNCT': + node.deprel = 'dep' diff --git a/udapi/block/ud/fixpseudocop.py b/udapi/block/ud/fixpseudocop.py index ecc5f0bd..f4d9a1ec 100644 --- a/udapi/block/ud/fixpseudocop.py +++ b/udapi/block/ud/fixpseudocop.py @@ -2,7 +2,6 @@ but they should be treated as normal verbs (with secondary predication) instead.""" from udapi.core.block import Block -import logging import re class FixPseudoCop(Block): diff --git a/udapi/block/ud/fixroot.py b/udapi/block/ud/fixroot.py new file mode 100644 index 00000000..be972d8b 
--- /dev/null +++ b/udapi/block/ud/fixroot.py @@ -0,0 +1,37 @@ +""" +Block ud.FixRoot will ensure that the tree is free of common root-related errors. +Simple heuristics are used; it is likely that human inspection would lead to +a different solution. Nevertheless, if a quick fix is needed to pass the +validation, this block can be helpful. + +WARNING: The block currently ignores enhanced dependencies. +""" +import re +from udapi.core.block import Block + + +class FixRoot(Block): + """ + Fixes the following validation errors: + - Only one node must be attached directly to the artificial root node. + => If the root has multiple children, keep the first one. Attach the other + ones to the first one. Change their deprel to 'parataxis'. + - The node attached as a child of the artificial root node must have the + 'root' relation (or its subtype). + => If the root child has another deprel, change it to 'root'. + - The node attached as a child of the artificial root node is the only one + allowed to have the 'root' relation (or its subtype). + => If another node has that deprel, change it to 'parataxis'. + """ + + def process_tree(self, root): + rchildren = root.children + if len(rchildren) > 1: + for i in range(len(rchildren)-1): + rchildren[i+1].parent = rchildren[0] + rchildren[i+1].deprel = 'parataxis' + if rchildren[0].udeprel != 'root': + rchildren[0].deprel = 'root' + for n in root.descendants: + if not n.parent == root and n.udeprel == 'root': + n.deprel = 'parataxis' diff --git a/udapi/block/ud/jointoken.py b/udapi/block/ud/jointoken.py new file mode 100644 index 00000000..43d2b30d --- /dev/null +++ b/udapi/block/ud/jointoken.py @@ -0,0 +1,97 @@ +""" +Block ud.JoinToken will join a given token with the preceding one. +""" +from udapi.core.block import Block +import logging + + +class JoinToken(Block): + """ + Merge two tokens into one. A MISC attribute is used to mark the tokens that + should join the preceding token. 
(The attribute may have been set by an + annotator or by a previous block that tests the specific conditions under + which joining is desired.) Joining cannot be done across sentence + boundaries; if necessary, apply util.JoinSentence first. Multiword tokens + are currently not supported: None of the nodes to be merged can belong to + a MWT. (The block ud.JoinAsMwt may be of some help, but it works differently.) + Merging is simple if there is no space between the tokens (see SpaceAfter=No + at the first token). If there is a space, there are three options in theory: + + 1. Keep the tokens as two nodes but apply the UD goeswith relation + (see https://universaldependencies.org/u/overview/typos.html) and + the related annotation rules. + 2. Join them into one token that contains a space. Such "words with + spaces" can be exceptionally allowed in UD if they are registered + in the given language. + 3. Remove the space without any trace. Not recommended in UD unless the + underlying text was created directly for UD and can be thus considered + part of the annotation. + + At present, this block does not support merging with spaces at all, but + in the future one or more of the options may be added. + """ + + def __init__(self, misc_name='JoinToken', misc_value=None, **kwargs): + """ + Args: + misc_name: name of the MISC attribute that can trigger the joining + default: JoinToken + misc_value: value of the MISC attribute to trigger the joining; + if not specified, then simple occurrence of the attribute with any value will cause the joining + MISC attributes that have triggered sentence joining will be removed from their node. + """ + super().__init__(**kwargs) + self.misc_name = misc_name + self.misc_value = misc_value + + def process_node(self, node): + """ + The JoinToken (or equivalent) attribute in MISC will trigger action. 
+ Either the current node will be merged with the previous node and the + attribute will be removed from MISC, or a warning will be issued that + the merging cannot be done and the attribute will stay in MISC. Note + that multiword token lines and empty nodes are not even scanned for + the attribute, so if it is there, it will stay there but no warning + will be printed. + """ + if node.misc[self.misc_name] == '': + return + if self.misc_value and node.misc[self.misc_name] != self.misc_value: + return + prevnode = node.prev_node + if not prevnode: + logging.warning("MISC %s cannot be used at the first token of a sentence." % self.misc_name) + node.misc['Bug'] = 'JoiningTokenNotSupportedHere' + return + if node.multiword_token or prevnode.multiword_token: + logging.warning("MISC %s cannot be used if one of the nodes belongs to a multiword token." % self.misc_name) + node.misc['Bug'] = 'JoiningTokenNotSupportedHere' + return + if prevnode.misc['SpaceAfter'] != 'No': + logging.warning("MISC %s cannot be used if there is space between the tokens." % self.misc_name) + node.misc['Bug'] = 'JoiningTokensWithSpaceNotSupported' + return + ###!!! This block currently must not be applied on data containing + ###!!! enhanced dependencies. We must first implement adjustments of + ###!!! the enhanced structure. + if prevnode.deps or node.deps: + logging.fatal('At present this block cannot be applied to data with enhanced dependencies.') + # If the first token depends on the second token, re-attach it to the + # second token's parent to prevent cycles. + if prevnode in node.descendants: + prevnode.parent = node.parent + prevnode.deprel = node.deprel + # Re-attach all children of the second token to the first token. + for c in node.children: + c.parent = prevnode + # Concatenate the word forms of the two tokens. Assume that morphological + # annotation, including the lemma, is already updated accordingly (we + # cannot guess it anyway). 
+ prevnode.form += node.form + # Remove SpaceAfter=No from the first token unless the second token has + # this attribute, too (meaning that there is no space between the second + # token and whatever comes next). + prevnode.misc['SpaceAfter'] = node.misc['SpaceAfter'] + # Remove the current node. The joining instruction was in its MISC, so + # it will disappear together with the node. + node.remove() diff --git a/udapi/block/ud/markbugs.py b/udapi/block/ud/markbugs.py index 2204eb4f..ee58084a 100644 --- a/udapi/block/ud/markbugs.py +++ b/udapi/block/ud/markbugs.py @@ -118,7 +118,7 @@ def process_node(self, node): if upos == i_upos and not feats[i_feat]: # Some languages do not distinguish finite and non-finite forms of verbs. # The VerbForm feature is not obligatory in those languages. - if i_feat != 'VerbForm' or not node.root.zone.split('_')[0] in {'id', 'jv', 'tl', 'hil', 'ifb'}: + if i_feat != 'VerbForm' or not node.root.zone.split('_')[0] in {'id', 'jv', 'tl', 'hil', 'ifb', 'naq'}: self.log(node, 'no-' + i_feat, 'upos=%s but %s feature is missing' % (upos, i_feat)) if feats['VerbForm'] == 'Fin': @@ -127,15 +127,19 @@ def process_node(self, node): if not feats['Mood']: self.log(node, 'finverb-mood', 'VerbForm=Fin but Mood feature is missing') - if feats['Degree'] and upos not in ('ADJ', 'ADV'): - self.log(node, 'degree-upos', - 'Degree=%s upos!=ADJ|ADV (but %s)' % (feats['Degree'], upos)) - subject_children = [n for n in node.children if 'subj' in n.udeprel and n.sdeprel != 'outer'] if len(subject_children) > 1: self.log(node, 'multi-subj', 'More than one (non-outer) [nc]subj child') - object_children = [n for n in node.children if n.udeprel in ('obj', 'ccomp')] + # Since "ccomp" is considered a clausal counterpart of "obj" in UD v2, + # one may conclude that "obj" and "ccomp" are mutually exclusive. + # However, this has always be a gray zone and people have occasionally + # brought up examples where they would want the two relations to co-occur. 
+ # Also, there is no clausal counterpart for "iobj", which may cause some + # of the problems. It is probably safer not to consider "ccomp" in this + # test. Nevertheless, two "obj" under the same parent are definitely an + # error. + object_children = [n for n in node.children if n.udeprel == 'obj'] if len(object_children) > 1: self.log(node, 'multi-obj', 'More than one obj|ccomp child') @@ -150,7 +154,7 @@ def process_node(self, node): # so there should be no false alarms. Some errors are not reported, i.e. the cases # when advmod incorrectly depends on a function word ("right before midnight"). if parent.udeprel in ('aux', 'cop', 'mark', 'clf', 'case'): - if udeprel not in ('conj', 'cc', 'punct', 'fixed', 'goeswith', 'advmod'): + if udeprel not in ('conj', 'cc', 'punct', 'fixed', 'goeswith', 'advmod', 'reparandum'): self.log(node, parent.deprel + '-child', 'parent.deprel=%s deprel!=conj|cc|punct|fixed|goeswith' % parent.deprel) @@ -180,14 +184,6 @@ def process_node(self, node): if upos == 'PUNCT' and node.is_nonprojective_gap() and not parent.is_nonprojective_gap(): self.log(node, 'punct-nonproj-gap', 'upos=PUNCT and causing a non-projectivity') - # http://universaldependencies.org/u/dep/cc.html says - # "cc is the relation between a conjunct and a preceding - # [coordinating conjunction](http://universaldependencies.org/u/pos/CCONJ)." - # No other upos is allowed in the documentation, although e.g. PART is common in the data. - # There are clear cases of adverbs in role of cc (e.g. "respektive" in Swedish and Czech). 
- if udeprel == 'cc' and upos not in ('CCONJ', 'ADV'): - self.log(node, 'cc-upos', "deprel=cc upos!=CCONJ (but %s): " % upos) - if udeprel == 'cop': lemma = node.lemma if node.lemma != '_' else form self.cop_nodes[lemma].append(node) diff --git a/udapi/block/ud/markfeatsbugs.py b/udapi/block/ud/markfeatsbugs.py index 1bb8188b..26c5624d 100644 --- a/udapi/block/ud/markfeatsbugs.py +++ b/udapi/block/ud/markfeatsbugs.py @@ -8,8 +8,6 @@ Usage (Czech example): cat *.conllu | udapy -HAMX layout=compact ud.cs.MarkFeatsBugs > bugs.html """ from udapi.core.block import Block -import logging -import re class MarkFeatsBugs(Block): diff --git a/udapi/block/ud/ro/fixfixed.py b/udapi/block/ud/ro/fixfixed.py new file mode 100644 index 00000000..14d16464 --- /dev/null +++ b/udapi/block/ud/ro/fixfixed.py @@ -0,0 +1,20 @@ +"""Block ud.ro.FixFixed + +Author: Dan Zeman +""" +import logging + +from udapi.core.block import Block + + +class FixFixed(Block): + """Block for fixing annotation of some 'fixed' expressions.""" + + def process_node(self, node): + fixchildren = [x for x in node.children if x.udeprel=='fixed'] + nfc = len(fixchildren) + if nfc > 0: + if node.udeprel == 'advmod' and node.feats['ExtPos'] == '': + node.feats['ExtPos'] = 'ADV' + elif node.feats['ExtPos'] == '': + logging.info('Another case: '+node.lemma+' '+' '.join([x.form for x in fixchildren])) diff --git a/udapi/block/ud/setspaceafterfromtext.py b/udapi/block/ud/setspaceafterfromtext.py index c5321221..ec7ab658 100644 --- a/udapi/block/ud/setspaceafterfromtext.py +++ b/udapi/block/ud/setspaceafterfromtext.py @@ -14,6 +14,10 @@ class SetSpaceAfterFromText(Block): """Block for setting of the SpaceAfter=No MISC attribute according to the sentence text.""" def process_tree(self, root): + # Empty nodes cannot have 'SpaceAfter=No', so make sure the file is valid. 
+ for empty_node in root.empty_nodes: + del empty_node.misc['SpaceAfter'] + text = root.text if text is None: raise ValueError('Tree %s has no text, cannot use ud.SetSpaceAfterFromText' % root) diff --git a/udapi/block/ud/splittoken.py b/udapi/block/ud/splittoken.py new file mode 100644 index 00000000..16c60a38 --- /dev/null +++ b/udapi/block/ud/splittoken.py @@ -0,0 +1,107 @@ +""" +Block ud.SplitToken will split a given token into multiple tokens. +""" +from udapi.core.block import Block +import re +import logging + + +class SplitToken(Block): + """ + Split a token into two or more. A MISC attribute is used to mark the tokens + that should be split. (The attribute may have been set by an annotator or + by a previous block that tests the specific conditions under which splitting + is desired.) Multiword tokens are currently not supported: The node to be + split cannot belong to a MWT. Note that the result will not be a MWT either + (use the block ud.AddMwt if that is desired). There will be simply a new + attribute SpaceAfter=No, possibly accompanied by CorrectSpaceAfter=Yes + (indicating that this was an error in the source text). + """ + + def __init__(self, misc_name='SplitToken', **kwargs): + """ + Args: + misc_name: name of the MISC attribute that can trigger the splitting + default: SplitToken + The value of the attribute should indicate where to split the token. + It should be a string that is identical to node.form except that + there is one or more spaces where the token should be split. + """ + super().__init__(**kwargs) + self.misc_name = misc_name + + def process_node(self, node): + """ + The SplitToken (or equivalent) attribute in MISC will trigger action. + Either the current node will be split to multiple nodes and the + attribute will be removed from MISC, or a warning will be issued that + the splitting cannot be done and the attribute will stay in MISC. 
Note + that multiword token lines and empty nodes are not even scanned for + the attribute, so if it is there, it will stay there but no warning + will be printed. + """ + value = node.misc[self.misc_name] + if value == '': + return + if node.multiword_token: + logging.warning(f"MISC {self.misc_name} cannot be used if the node belongs to a multiword token.") + node.misc['Bug'] = 'SplittingTokenNotSupportedHere' + return + ###!!! This block currently must not be applied on data containing + ###!!! enhanced dependencies. We must first implement adjustments of + ###!!! the enhanced structure. + if node.deps: + logging.fatal('At present this block cannot be applied to data with enhanced dependencies.') + # Verify that the value of the MISC attribute can be used as specification + # of the split. + if re.match(r'^\s', value) or re.search(r'\s$', value) or re.search(r'\s\s', value): + logging.warning(f"MISC {self.misc_name} is '{value}'; leading spaces, trailing spaces or multiple consecutive spaces are not allowed.") + node.misc['Bug'] = f'{self.misc_name}BadValue' + return + if re.search(r'\s', node.form): + logging.warning(f"MISC {self.misc_name} cannot be used with nodes whose forms contain a space (here '{node.form}').") + node.misc['Bug'] = 'SplittingTokenNotSupportedHere' + return + if re.sub(r' ', '', value) != node.form: + logging.warning(f"MISC {self.misc_name} value '{value}' does not match the word form '{node.form}'.") + node.misc['Bug'] = f'{self.misc_name}BadValue' + return + # Do the split. + space_after = node.misc['SpaceAfter'] + forms = value.split(' ') + # Optionally, SplitTokenMorpho in MISC can have the morphological annotation + # of the new tokens. 
For example: + # SplitTokenMorpho=LEMMA=popisovat\tUPOS=VERB\tFEATS=Aspect=Imp\\pMood=Ind\\pNumber=Sing\\pPerson=3\\pPolarity=Pos\\pTense=Pres\\pVerbForm=Fin\\pVoice=Act + if node.misc['SplitTokenMorpho'] != '': + morphoblocks = [''] + node.misc['SplitTokenMorpho'].split(' ') + del node.misc['SplitTokenMorpho'] + else: + morphoblocks = ['' for x in forms] + node.form = forms[0] + last_node = node + for form, morpho in zip(forms[1:], morphoblocks[1:]): + last_node.misc['SpaceAfter'] = 'No' + last_node.misc['CorrectSpaceAfter'] = 'Yes' + lemma = form + upos = node.upos + feats = str(node.feats) + xpos = node.xpos + if morpho != '': + cols = morpho.split('\\t') + for c in cols: + colname, value = c.split('=', 1) + if colname == 'LEMMA': + lemma = value + elif colname == 'UPOS': + upos = value + elif colname == 'FEATS': + feats = re.sub(r'\\p', '|', value) + elif colname == 'XPOS': + xpos = value + else: + logging.fatal(f"c = {c}") + new_node = node.create_child(form=form, lemma=lemma, upos=upos, feats=feats, xpos=xpos, deprel='dep') + new_node.shift_after_node(last_node) + last_node = new_node + last_node.misc['SpaceAfter'] = space_after + del node.misc[self.misc_name] diff --git a/udapi/block/udpipe/base.py b/udapi/block/udpipe/base.py index d94f8cc5..069fc9fb 100644 --- a/udapi/block/udpipe/base.py +++ b/udapi/block/udpipe/base.py @@ -121,13 +121,12 @@ class Base(Block): # pylint: disable=too-many-arguments def __init__(self, model=None, model_alias=None, online=False, tokenize=True, tag=True, parse=True, resegment=False, - delete_nodes=False, **kwargs): - """Create the udpipe.En block object.""" + ranges=False, delete_nodes=False, **kwargs): super().__init__(**kwargs) self.model, self.model_alias, self.online = model, model_alias, online self._tool = None self.tokenize, self.tag, self.parse, self.resegment = tokenize, tag, parse, resegment - self.delete_nodes = delete_nodes + self.ranges, self.delete_nodes = ranges, delete_nodes @property def tool(self): @@ -148,7 
+147,10 @@ def tool(self): return self._tool def process_document(self, doc): - tok, tag, par, reseg = self.tokenize, self.tag, self.parse, self.resegment + tok, tag, par, reseg, ranges = self.tokenize, self.tag, self.parse, self.resegment, self.ranges + if self.zones == "all" and self.online: + self.tool.process_document(doc, tok, tag, par, reseg, ranges) + return old_bundles = doc.bundles new_bundles = [] for bundle in old_bundles: @@ -160,7 +162,7 @@ def process_document(self, doc): subroot.remove() if tok: new_trees = self.tool.tokenize_tag_parse_tree(tree, resegment=reseg, - tag=tag, parse=par) + tag=tag, parse=par, ranges=ranges) if self.resegment and len(new_trees) > 1: orig_bundle_id = bundle.bundle_id bundle.bundle_id = orig_bundle_id + '-1' diff --git a/udapi/block/util/eval.py b/udapi/block/util/eval.py index df6aaabf..6e4f2ac9 100644 --- a/udapi/block/util/eval.py +++ b/udapi/block/util/eval.py @@ -30,7 +30,7 @@ class Eval(Block): def __init__(self, doc=None, bundle=None, tree=None, node=None, start=None, end=None, before_doc=None, after_doc=None, before_bundle=None, after_bundle=None, coref_mention=None, coref_entity=None, empty_nodes=False, - expand_code=True, **kwargs): + expand_code=True, mwt=None, **kwargs): super().__init__(**kwargs) self.doc = doc self.bundle = bundle @@ -38,6 +38,7 @@ def __init__(self, doc=None, bundle=None, tree=None, node=None, start=None, end= self.node = node self.start = start self.end = end + self.mwt = mwt self.before_doc = before_doc self.after_doc = after_doc self.before_bundle = before_bundle @@ -70,7 +71,7 @@ def process_document(self, document): if self.doc: exec(self.expand_eval_code(self.doc)) - if self.bundle or self.before_bundle or self.after_bundle or self.tree or self.node: + if self.bundle or self.before_bundle or self.after_bundle or self.tree or self.node or self.mwt: for bundle in doc.bundles: # TODO if self._should_process_bundle(bundle): self.process_bundle(bundle) @@ -96,7 +97,7 @@ def 
process_bundle(self, bundle): if self.bundle: exec(self.expand_eval_code(self.bundle)) - if self.tree or self.node: + if self.tree or self.node or self.mwt: trees = bundle.trees for tree in trees: if self._should_process_tree(tree): @@ -121,6 +122,11 @@ def process_tree(self, tree): this = node exec(self.expand_eval_code(self.node)) + if self.mwt: + for mwt in tree.multiword_tokens: + this = mwt + exec(self.expand_eval_code(self.mwt)) + def process_start(self): if self.start: exec(self.expand_eval_code(self.start)) diff --git a/udapi/block/util/joinsentence.py b/udapi/block/util/joinsentence.py new file mode 100644 index 00000000..578f3865 --- /dev/null +++ b/udapi/block/util/joinsentence.py @@ -0,0 +1,77 @@ +""" +Block util.JoinSentence will join a given sentence with the preceding one. +""" +import logging +from udapi.core.block import Block + +class JoinSentence(Block): + """ + Joins a sentence with the preceding one. There are two ways how to indicate + the sentences that this block should process. + + Method 1: Parameter sent_id provides the id of the sentence that should be + merged with the preceding one. At most one sentence pair from the input will + be merged, even if there are multiple sentences with the given id. + + Method 2: A MISC attribute can be specified that, if found, will trigger + joining of the current sentence to the previous one. With this approach, + multiple sentence pairs can be merged during one run. + """ + + def __init__(self, sent_id=None, misc_name=None, misc_value=None, **kwargs): + """ + Args: + sent_id: which sentence should be appended to the previous one + misc_name: name of the MISC attribute that can trigger the joining (cannot be combined with sent_id and word_id) + misc_value: value of the MISC attribute to trigger the joining; if not specified, then simple occurrence of the attribute with any value will cause the joining + MISC attributes that have triggered sentence joining will be removed from their node. 
+ """ + super().__init__(**kwargs) + if misc_name: + if sent_id: + logging.fatal('Cannot combine misc_value with sent_id') + else: + if not sent_id: + logging.fatal('Missing parameter sent_id') + self.sent_id = sent_id + self.misc_name = misc_name + self.misc_value = misc_value + + def process_document(self, document): + previous_tree = None + for bundle_no, bundle in enumerate(document.bundles): + # In general, a bundle may contain multiple trees in different zones. + # In UD data, we always expect just one zone (labeled '') per bundle. + # This code could be extended to join all zones but we do not try to do it at present. + if len(bundle.trees) != 1: + logging.fatal('Cannot process bundles that have less or more than 1 zone') + if not bundle.has_tree(zone=''): + logging.fatal('Cannot process bundles that do not have the zone with empty zone id') + if self.misc_name: + root = bundle.get_tree() + # The MISC attribute we are looking for should logically occur + # on the first node of the sentence but we can take it from any node. + join_commands = [n for n in root.descendants if n.misc[self.misc_name] and self.misc_value == None or n.misc[self.misc_name] == self.misc_value] + if join_commands: + if not previous_tree: + logging.fatal('Cannot join the first sentence as there is no previous sentence') + previous_tree.steal_nodes(root.descendants) + previous_tree.text = previous_tree.compute_text() + # Remove from the node the MISC attribute that triggered the sentence split. + for n in join_commands: + n.misc[self.misc_name] = '' + # Remove the current bundle. It will also update the numbers of the remaining bundles. 
+ bundle.remove() + else: + previous_tree = root + elif bundle.bundle_id == self.sent_id: + logging.info('Found!') + if not previous_tree: + logging.fatal('Cannot join the first sentence as there is no previous sentence') + root = bundle.get_tree() + previous_tree.steal_nodes(root.descendants) + previous_tree.text = previous_tree.compute_text() + # Remove the current bundle. It will also update the numbers of the remaining bundles. + bundle.remove() + # We have found our sentence. No need to process the rest of the document. + break diff --git a/udapi/block/util/markmwtbugsatnodes.py b/udapi/block/util/markmwtbugsatnodes.py new file mode 100644 index 00000000..ebc2ef4e --- /dev/null +++ b/udapi/block/util/markmwtbugsatnodes.py @@ -0,0 +1,25 @@ +"""util.MarkMwtBugsAtNodes copies Bug attributes from MISC of multiword tokens to MISC of member nodes. + Otherwise they will be ignored when write.TextModeTrees marked_only=1 is called.""" + +from udapi.core.block import Block + +class MarkMwtBugsAtNodes(Block): + """ + If a node belongs to a multiword token and the MWT has Bug in MISC, copy + the Bug to the node so that filtering trees with bugs works. + The same bug note will be copied to all nodes in the MWT. + """ + + ###!!! Do we want to do the same thing also with ToDo attributes? 
+ def bug(self, node, bugstring): + bugs = [] + if node.misc['Bug']: + bugs = node.misc['Bug'].split('+') + if not bugstring in bugs: + bugs.append(bugstring) + node.misc['Bug'] = '+'.join(bugs) + + def process_node(self, node): + if node.multiword_token: + if node.multiword_token.misc['Bug']: + self.bug(node, node.multiword_token.misc['Bug']) diff --git a/udapi/block/util/normalize.py b/udapi/block/util/normalize.py index b150d551..4cce4ab8 100644 --- a/udapi/block/util/normalize.py +++ b/udapi/block/util/normalize.py @@ -1,9 +1,10 @@ """util.Normalize normalizes the ordering of various attributes in CoNLL-U.""" from udapi.core.block import Block +from pathlib import Path class Normalize(Block): """Normalize the ordering of attributes in the FEATS and MISC columns. - + The attribute-value pairs in the FEATS column in CoNLL-U files must be sorted alphabetically (case-insensitive) according to the guidelines (https://universaldependencies.org/format.html#morphological-annotation). @@ -20,7 +21,8 @@ class Normalize(Block): util.Eval node='node.misc["NonExistentAttribute"] = None' """ - def __init__(self, feats=True, misc=True, sent_id=False, empty_node_ord=False, start_sent_id=1, sent_id_prefix="", **kwargs): + def __init__(self, feats=True, misc=True, sent_id=False, empty_node_ord=False, start_sent_id=1, sent_id_prefix="", + sent_id_from_filename=False, sent_id_reset_at_newdoc=False, newdoc_from_filename=False, **kwargs): """ Args: `feats`: normalize the ordering of FEATS. Default=True. @@ -29,6 +31,9 @@ def __init__(self, feats=True, misc=True, sent_id=False, empty_node_ord=False, s `empty_node_ord`: normalize ord attributes of empty nodes. Default=False. `start_sent_id`: the first sent_id number `sent_id_prefix`: a string to be prepended before the integer sent_id. Default=empty string. + `sent_id_from_filename`: add Path(doc.meta["loaded_from"]).stem before the `sent_id_prefix`. Default=False. 
+ `sent_id_reset_at_newdoc`: reset the sent_id counter to 1 for each new document. Default=False. + `newdoc_from_filename`: set newdoc to Path(doc.meta["loaded_from"]).stem. Default=False. """ super().__init__(**kwargs) self.feats = feats @@ -37,13 +42,28 @@ def __init__(self, feats=True, misc=True, sent_id=False, empty_node_ord=False, s self.empty_node_ord = empty_node_ord self.next_sent_id = start_sent_id self.sent_id_prefix = sent_id_prefix - if sent_id_prefix or start_sent_id != 1: + self.sent_id_from_filename = sent_id_from_filename + self.sent_id_reset_at_newdoc = sent_id_reset_at_newdoc + self.newdoc_from_filename = newdoc_from_filename + if sent_id_reset_at_newdoc and not sent_id_from_filename: + raise ValueError("Cannot use sent_id_reset_at_newdoc without sent_id_from_filename") + if sent_id_prefix or start_sent_id != 1 or sent_id_from_filename: self.sent_id = True + # TODO: normalize also the order of standardized comments like text, sent_id,... def process_bundle(self, bundle): + is_newdoc = any(tree.newdoc for tree in bundle.trees) + if self.newdoc_from_filename and is_newdoc: + tree = next(tree for tree in bundle.trees if tree.newdoc) + tree.newdoc = Path(bundle.document.meta["loaded_from"]).stem if self.sent_id: - bundle.bundle_id = self.sent_id_prefix + str(self.next_sent_id) + if self.sent_id_reset_at_newdoc and is_newdoc: + self.next_sent_id = 1 + prefix = self.sent_id_prefix + if self.sent_id_from_filename: + prefix = Path(bundle.document.meta["loaded_from"]).stem + prefix + bundle.bundle_id = prefix + str(self.next_sent_id) self.next_sent_id += 1 for tree in bundle: diff --git a/udapi/block/util/see.py b/udapi/block/util/see.py index aa7131b7..9a895b88 100644 --- a/udapi/block/util/see.py +++ b/udapi/block/util/see.py @@ -51,7 +51,7 @@ class See(Block): """Print statistics about the nodes specified by the parameter `node`.""" - def __init__(self, node, n=5, stats=STATS, **kwargs): + def __init__(self, node, n=5, stats=STATS, empty=False, 
**kwargs): """Args: `node`: Python expression to be evaluated for each node and if True, the node will be considered "matching". @@ -62,6 +62,7 @@ def __init__(self, node, n=5, stats=STATS, **kwargs): `children` = number of children nodes, `p_lemma` = lemma of a parent node, etc). See `udapi.core.Node.get_attrs` for a full list of statistics. + `empty`: apply the code also on empty nodes """ super().__init__(**kwargs) self.node = node @@ -73,11 +74,13 @@ def __init__(self, node, n=5, stats=STATS, **kwargs): self.match[stat] = Counter() self.every[stat] = Counter() self.overall = Counter() + self.empty = empty def process_tree(self, root): self.overall['trees'] += 1 tree_match = False - for node in root.descendants: + nodes = root.descendants_and_empty if self.empty else root.descendants + for node in nodes: matching = self.process_node(node) self.overall['nodes'] += 1 if matching: diff --git a/udapi/block/util/splitsentence.py b/udapi/block/util/splitsentence.py index 2886cd5d..b6ca57d8 100644 --- a/udapi/block/util/splitsentence.py +++ b/udapi/block/util/splitsentence.py @@ -10,35 +10,87 @@ class SplitSentence(Block): If the sent_id of the current sentence matches the parameter, splits the sentence into two. The first token of the second sentence is also given as a parameter. + + Alternatively, a MISC attribute can be specified that triggers sentence + splitting at the given token. With this approach, multiple sentence splits + can be performed during one run. 
""" - def __init__(self, sent_id=None, word_id=None, **kwargs): + def __init__(self, sent_id=None, word_id=None, misc_name=None, misc_value=None, **kwargs): """ Args: sent_id: which sentence should be split (new ids will have A and B appended) word_id: which word should be the first word of the second sentence (tokens and words will be renumbered) + misc_name: name of the MISC attribute that can trigger the split (cannot be combined with sent_id and word_id) + misc_value: value of the MISC attribute to trigger the split; if not specified, then simple occurrence of the attribute with any value will cause the split + MISC attributes that have triggered sentence split will be removed from their node. """ super().__init__(**kwargs) - if not sent_id: - logging.fatal('Missing parameter sent_id') - if not word_id: - logging.fatal('Missing parameter word_id') + if misc_name: + if sent_id or word_id: + logging.fatal('Cannot combine misc_value with sent_id or word_id') + else: + if not sent_id: + logging.fatal('Missing parameter sent_id') + if not word_id: + logging.fatal('Missing parameter word_id') self.sent_id = sent_id self.word_id = word_id + self.misc_name = misc_name + self.misc_value = misc_value def process_document(self, document): for bundle_no, bundle in enumerate(document.bundles): - if bundle.bundle_id == self.sent_id: + # In general, a bundle may contain multiple trees in different zones. + # In UD data, we always expect just one zone (labeled '') per bundle. + # This code could be extended to split all zones but we do not try to do it at present. + # (The zones may be translations to other languages and it is not likely that we would + # want to split each translation at the same position.) 
+            if len(bundle.trees) != 1:
+                logging.fatal('Cannot process bundles that have less or more than 1 zone')
+            if not bundle.has_tree(zone=''):
+                logging.fatal('Cannot process bundles that do not have the zone with empty zone id')
+            if self.misc_name:
+                root = bundle.get_tree()
+                split_points = [n for n in root.descendants if n.ord > 1 and n.misc[self.misc_name] and (self.misc_value is None or n.misc[self.misc_name] == self.misc_value)]
+                if split_points:
+                    # Create as many new bundles as there are split points.
+                    n_new = len(split_points)
+                    current_bid = bundle.bundle_id
+                    idletter = 'B' # a letter will be added to bundle ids to distinguish them
+                    for i in range(n_new):
+                        new_bundle = document.create_bundle()
+                        new_bundle.bundle_id = current_bid + idletter
+                        new_root = Root(zone='')
+                        new_bundle.add_tree(new_root)
+                        # Identify nodes to move to the new bundle.
+                        first_node_id = split_points[i].ord
+                        if i < n_new - 1:
+                            next_first_node_id = split_points[i+1].ord
+                            nodes_to_move = [n for n in root.descendants if n.ord >= first_node_id and n.ord < next_first_node_id]
+                        else:
+                            nodes_to_move = [n for n in root.descendants if n.ord >= first_node_id]
+                        new_root.steal_nodes(nodes_to_move)
+                        self.make_zeros_roots(new_root)
+                        new_root.text = new_root.compute_text()
+                        # The new bundle was created at the end of the document.
+                        # Move it to the position right after the current bundle.
+                        document.bundles.pop()
+                        document.bundles.insert(bundle_no + i + 1, new_bundle)
+                        idletter = chr(ord(idletter) + 1)
+                        # Remove from the node the MISC attribute that triggered the sentence split.
+                        split_points[i].misc[self.misc_name] = ''
+                    # Update the id of the current bundle, fix its zero-dependents and recompute sentence text.
+                    bundle.bundle_id += 'A'
+                    self.make_zeros_roots(root)
+                    root.text = root.compute_text()
+                    # Update the bundle numbers of the new bundles and all bundles after them.
+ updated_no = bundle_no + 1 + for b in document.bundles[(bundle_no+1):]: + b.number = updated_no + updated_no += 1 + elif bundle.bundle_id == self.sent_id: logging.info('Found!') - # In general, a bundle may contain multiple trees in different zones. - # In UD data, we always expect just one zone (labeled '') per bundle. - # This code could be extended to split all zones but we do not try to do it at present. - # (The zones may be translations to other languages and it is not likely that we would - # want to split each translation at the same position.) - if len(bundle.trees) != 1: - logging.fatal('Cannot process bundles that have less or more than 1 zone') - if not bundle.has_tree(zone=''): - logging.fatal('Cannot process bundles that do not have the zone with empty zone id') root = bundle.get_tree() nodes_to_move = [n for n in root.descendants if n.ord >= self.word_id] if len(nodes_to_move) == 0: @@ -60,22 +112,23 @@ def process_document(self, document): new_root.steal_nodes(nodes_to_move) # The steal_nodes() method does not make sure that all nodes newly attached # to the artificial root have the 'root' relation. Fix it. - n_root = 0 - for n in root.descendants: - if n.parent.is_root(): - n.deprel = 'root' - n_root += 1 - if n_root > 1: - logging.warning('More than one 0:root relation in the first part of the sentence.') - n_root = 0 - for n in new_root.descendants: - if n.parent.is_root(): - n.deprel = 'root' - n_root += 1 - if n_root > 1: - logging.warning('More than one 0:root relation in the second part of the sentence.') + self.make_zeros_roots(root) + self.make_zeros_roots(new_root) # Update the sentence text attributes of the new sentences. root.text = root.compute_text() new_root.text = new_root.compute_text() # We have found our sentence. No need to process the rest of the document. 
break + + def make_zeros_roots(self, root): + """ + The steal_nodes() method does not make sure that all nodes newly attached + to the artificial root have the 'root' relation. Fix it. + """ + n_root = 0 + for n in root.descendants: + if n.parent.is_root(): + n.deprel = 'root' + n_root += 1 + if n_root > 1: + logging.warning('More than one 0:root relation in newly segmented sentence %s.' % root.bundle.bundle_id) diff --git a/udapi/block/write/conllu.py b/udapi/block/write/conllu.py index abe20963..2573b5ae 100644 --- a/udapi/block/write/conllu.py +++ b/udapi/block/write/conllu.py @@ -134,10 +134,10 @@ def process_tree(self, tree): # pylint: disable=too-many-branches '_' if node._feats is None else str(node.feats), head, node.deprel, node.raw_deps, '_' if node._misc is None else str(node.misc)))) - # Empty sentences are not allowed in CoNLL-U, + # Empty sentences (sentences with no non-empty nodes) are not allowed in CoNLL-U, # but with print_empty_trees==1 (which is the default), # we will print an artificial node, so we can print the comments. - if not nodes: + if not tree._descendants: print("1\t_\t_\t_\t_\t_\t0\t_\t_\tEmpty=Yes") # Empty line separates trees in CoNLL-U (and is required after the last tree as well) diff --git a/udapi/block/write/textmodetrees.py b/udapi/block/write/textmodetrees.py index 41539670..885f797f 100644 --- a/udapi/block/write/textmodetrees.py +++ b/udapi/block/write/textmodetrees.py @@ -14,6 +14,7 @@ 'upos': 'red', 'deprel': 'blue', 'ord': 'green', + 'misc[Entity]': 'magenta', } # Too many instance variables, arguments, branches... 
@@ -244,7 +245,7 @@ def should_print_tree(self, root, allnodes): return False return self.comment_mark_re.search(root.comment) - def process_tree(self, root): + def process_tree(self, root, force_print=False): """Print the tree to (possibly redirected) sys.stdout.""" if self.print_empty: if root.is_root(): @@ -256,7 +257,7 @@ def process_tree(self, root): allnodes.sort() else: allnodes = root.descendants(add_self=1) - if not self.should_print_tree(root, allnodes): + if not force_print and not self.should_print_tree(root, allnodes): return self._index_of = {allnodes[i].ord: i for i in range(len(allnodes))} self.lines = [''] * len(allnodes) @@ -353,7 +354,8 @@ def before_process_document(self, document): os.environ["FORCE_COLOR"] = "1" if self.print_doc_meta: for key, value in sorted(document.meta.items()): - print('%s = %s' % (key, value)) + if key[0] != '_': + print('%s = %s' % (key, value)) def _add(self, idx, text): self.lines[idx] += text diff --git a/udapi/block/write/textmodetreeshtml.py b/udapi/block/write/textmodetreeshtml.py index 5ccceb78..0ad39da4 100644 --- a/udapi/block/write/textmodetreeshtml.py +++ b/udapi/block/write/textmodetreeshtml.py @@ -26,7 +26,7 @@ class TextModeTreesHtml(TextModeTrees): This block is a subclass of `TextModeTrees`, see its documentation for more info. """ - def __init__(self, color=True, title='Udapi visualization', zones_in_rows=True, **kwargs): + def __init__(self, color=True, title='Udapi visualization', zones_in_rows=True, whole_bundle=True, **kwargs): """Create new TextModeTreesHtml block object. Args: see `TextModeTrees`. @@ -35,10 +35,14 @@ def __init__(self, color=True, title='Udapi visualization', zones_in_rows=True, (see the `mark` parameter) to be more eye-catching. title: What title metadata to use for the html? + zones_in_rows: print trees from the same bundle side by side (i.e. in the same row). 
+ whole_bundle: always print the whole bundle (all its trees) if any of the trees is marked + (relevant only with marked_only=True and zones_in_rows=True) """ super().__init__(color=color, **kwargs) self.title = title self.zones_in_rows = zones_in_rows + self.whole_bundle = whole_bundle def before_process_document(self, document): # TextModeTrees.before_process_document changes the color property, @@ -97,10 +101,12 @@ def process_bundle(self, bundle): if self.should_print_tree(tree, allnodes): marked_trees.append(tree) if marked_trees: + if self.whole_bundle: + marked_trees = bundle print("
") - self.process_tree(tree) + self.process_tree(tree, force_print=True) print(" | ") print("