Skip to content

Commit ab6dab3

Browse files
committed
Support conversion from Penn Treebank to Universal Dependencies
1 parent b6dcb2a commit ab6dab3

File tree

2 files changed

+24
-12
lines changed

2 files changed

+24
-12
lines changed

hanlp/datasets/parsing/loaders/_ctb_utils.py

Lines changed: 22 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -73,18 +73,30 @@
7373
# _make_splits(CTB9_ACADEMIA_SPLITS)
7474

7575

76-
def convert_to_stanford_dependency_330(src, dst, language='zh'):
77-
cprint(f'Converting {os.path.basename(src)} to {os.path.basename(dst)} using Stanford Parser Version 3.3.0. '
76+
def convert_to_dependency(src, dst, language='zh', version='3.3.0', conllx=True, ud=False):
77+
cprint(f'Converting {os.path.basename(src)} to {os.path.basename(dst)} using Stanford Parser Version {version}. '
7878
f'It might take a while [blink][yellow]...[/yellow][/blink]')
79-
sp_home = 'https://nlp.stanford.edu/software/stanford-parser-full-2013-11-12.zip'
79+
if version == '3.3.0':
80+
sp_home = 'https://nlp.stanford.edu/software/stanford-parser-full-2013-11-12.zip'
81+
elif version == '4.2.0':
82+
sp_home = 'https://nlp.stanford.edu/software/stanford-parser-4.2.0.zip'
83+
else:
84+
raise ValueError(f'Unsupported version {version}')
8085
sp_home = get_resource(sp_home)
8186
# jar_path = get_resource(f'{sp_home}#stanford-parser.jar')
82-
jclass = 'edu.stanford.nlp.trees.international.pennchinese.ChineseGrammaticalStructure' if language == 'zh' \
83-
else 'edu.stanford.nlp.trees.EnglishGrammaticalStructure'
84-
code, out, err = get_exitcode_stdout_stderr(
85-
f'java -cp {sp_home}/* {jclass} '
86-
f'-basic -keepPunct -conllx '
87-
f'-treeFile {src}')
87+
if ud:
88+
jclass = 'edu.stanford.nlp.trees.international.pennchinese.UniversalChineseGrammaticalStructure' if language == 'zh' \
89+
else 'edu.stanford.nlp.trees.ud.UniversalDependenciesConverter'
90+
else:
91+
jclass = 'edu.stanford.nlp.trees.international.pennchinese.ChineseGrammaticalStructure' if language == 'zh' \
92+
else 'edu.stanford.nlp.trees.EnglishGrammaticalStructure'
93+
cmd = f'java -cp {sp_home}/* {jclass} ' \
94+
f'-treeFile {src}'
95+
if conllx:
96+
cmd += ' -conllx'
97+
if not ud:
98+
cmd += f' -basic -keepPunct'
99+
code, out, err = get_exitcode_stdout_stderr(cmd)
88100
with open(dst, 'w') as f:
89101
f.write(out)
90102
if code:
@@ -180,7 +192,7 @@ def make_ctb_tasks(chtbs, out_root, part):
180192
erase=False)
181193
remove_all_ec(par_path)
182194
dep_path = join(out_root, 'dep', f'{part}.conllx')
183-
convert_to_stanford_dependency_330(par_path, dep_path)
195+
convert_to_dependency(par_path, dep_path)
184196
sents = list(read_conll(dep_path))
185197
with open(dep_path, 'w') as out:
186198
for sent in sents:

hanlp/datasets/srl/ontonotes5/_utils.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from hanlp_common.io import eprint, save_json
1313

1414
from hanlp.common.transform import NormalizeToken
15-
from hanlp.datasets.parsing.loaders._ctb_utils import remove_all_ec, convert_to_stanford_dependency_330
15+
from hanlp.datasets.parsing.loaders._ctb_utils import remove_all_ec, convert_to_dependency
1616
from hanlp.datasets.parsing.ptb import PTB_TOKEN_MAPPING
1717
from hanlp.utils.io_util import merge_files, get_resource, pushd, run_cmd, read_tsv_as_sents, replace_ext, \
1818
get_exitcode_stdout_stderr
@@ -538,7 +538,7 @@ def batch_remove_empty_category_if_necessary(json_files):
538538

539539
def make_dep_conllx(con_txt_file, output_file, language='en'):
    """Convert a constituency-tree text file into a CoNLL-X dependency file.

    Resolves ``con_txt_file`` through the resource cache, then delegates the
    actual tree-to-dependency conversion (Stanford Parser based) to
    ``convert_to_dependency``, writing the result to ``output_file``.

    Args:
        con_txt_file: Path or resource identifier of the bracketed-tree input.
        output_file: Destination path for the CoNLL-X output.
        language: Treebank language code passed through to the converter
            (``'en'`` or ``'zh'``).
    """
    resolved_input = get_resource(con_txt_file)
    convert_to_dependency(resolved_input, output_file, language=language)
542542

543543

544544
def make_dep_conllx_if_necessary(con_txt_file: str, language='en'):

0 commit comments

Comments
 (0)