Skip to content

Commit ab6dab3

Browse files
committed
Support conversion from Penn Treebank to Universal Dependencies
1 parent b6dcb2a commit ab6dab3

File tree

2 files changed

+24
-12
lines changed

2 files changed

+24
-12
lines changed

hanlp/datasets/parsing/loaders/_ctb_utils.py

Lines changed: 22 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -73,18 +73,30 @@
7373
# _make_splits(CTB9_ACADEMIA_SPLITS)
7474

7575

76-
def convert_to_stanford_dependency_330(src, dst, language='zh'):
77-
cprint(f'Converting {os.path.basename(src)} to {os.path.basename(dst)} using Stanford Parser Version 3.3.0. '
76+
def convert_to_dependency(src, dst, language='zh', version='3.3.0', conllx=True, ud=False):
77+
cprint(f'Converting {os.path.basename(src)} to {os.path.basename(dst)} using Stanford Parser Version {version}. '
7878
f'It might take a while [blink][yellow]...[/yellow][/blink]')
79-
sp_home = 'https://nlp.stanford.edu/software/stanford-parser-full-2013-11-12.zip'
79+
if version == '3.3.0':
80+
sp_home = 'https://nlp.stanford.edu/software/stanford-parser-full-2013-11-12.zip'
81+
elif version == '4.2.0':
82+
sp_home = 'https://nlp.stanford.edu/software/stanford-parser-4.2.0.zip'
83+
else:
84+
raise ValueError(f'Unsupported version {version}')
8085
sp_home = get_resource(sp_home)
8186
# jar_path = get_resource(f'{sp_home}#stanford-parser.jar')
82-
jclass = 'edu.stanford.nlp.trees.international.pennchinese.ChineseGrammaticalStructure' if language == 'zh' \
83-
else 'edu.stanford.nlp.trees.EnglishGrammaticalStructure'
84-
code, out, err = get_exitcode_stdout_stderr(
85-
f'java -cp {sp_home}/* {jclass} '
86-
f'-basic -keepPunct -conllx '
87-
f'-treeFile {src}')
87+
if ud:
88+
jclass = 'edu.stanford.nlp.trees.international.pennchinese.UniversalChineseGrammaticalStructure' if language == 'zh' \
89+
else 'edu.stanford.nlp.trees.ud.UniversalDependenciesConverter'
90+
else:
91+
jclass = 'edu.stanford.nlp.trees.international.pennchinese.ChineseGrammaticalStructure' if language == 'zh' \
92+
else 'edu.stanford.nlp.trees.EnglishGrammaticalStructure'
93+
cmd = f'java -cp {sp_home}/* {jclass} ' \
94+
f'-treeFile {src}'
95+
if conllx:
96+
cmd += ' -conllx'
97+
if not ud:
98+
cmd += f' -basic -keepPunct'
99+
code, out, err = get_exitcode_stdout_stderr(cmd)
88100
with open(dst, 'w') as f:
89101
f.write(out)
90102
if code:
@@ -180,7 +192,7 @@ def make_ctb_tasks(chtbs, out_root, part):
180192
erase=False)
181193
remove_all_ec(par_path)
182194
dep_path = join(out_root, 'dep', f'{part}.conllx')
183-
convert_to_stanford_dependency_330(par_path, dep_path)
195+
convert_to_dependency(par_path, dep_path)
184196
sents = list(read_conll(dep_path))
185197
with open(dep_path, 'w') as out:
186198
for sent in sents:

hanlp/datasets/srl/ontonotes5/_utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from hanlp_common.io import eprint, save_json
1313

1414
from hanlp.common.transform import NormalizeToken
15-
from hanlp.datasets.parsing.loaders._ctb_utils import remove_all_ec, convert_to_stanford_dependency_330
15+
from hanlp.datasets.parsing.loaders._ctb_utils import remove_all_ec, convert_to_dependency
1616
from hanlp.datasets.parsing.ptb import PTB_TOKEN_MAPPING
1717
from hanlp.utils.io_util import merge_files, get_resource, pushd, run_cmd, read_tsv_as_sents, replace_ext, \
1818
get_exitcode_stdout_stderr
@@ -538,7 +538,7 @@ def batch_remove_empty_category_if_necessary(json_files):
538538

539539
def make_dep_conllx(con_txt_file, output_file, language='en'):
    """Convert a constituency-tree text file into a CoNLL-X dependency file.

    Resolves ``con_txt_file`` through the resource cache, then delegates the
    actual tree-to-dependency conversion (Stanford Parser based) to
    ``convert_to_dependency``, writing the result to ``output_file``.

    Args:
        con_txt_file: Path or resource identifier of the bracketed-tree input.
        output_file: Destination path for the CoNLL-X output.
        language: Treebank language code passed through to the converter
            (``'en'`` or ``'zh'``).
    """
    resolved_input = get_resource(con_txt_file)
    convert_to_dependency(resolved_input, output_file, language=language)
542542

543543

544544
def make_dep_conllx_if_necessary(con_txt_file: str, language='en'):

0 commit comments

Comments
 (0)