|
73 | 73 | # _make_splits(CTB9_ACADEMIA_SPLITS)
|
74 | 74 |
|
75 | 75 |
|
76 |
| -def convert_to_stanford_dependency_330(src, dst, language='zh'): |
77 |
| - cprint(f'Converting {os.path.basename(src)} to {os.path.basename(dst)} using Stanford Parser Version 3.3.0. ' |
| 76 | +def convert_to_dependency(src, dst, language='zh', version='3.3.0', conllx=True, ud=False): |
| 77 | + cprint(f'Converting {os.path.basename(src)} to {os.path.basename(dst)} using Stanford Parser Version {version}. ' |
78 | 78 | f'It might take a while [blink][yellow]...[/yellow][/blink]')
|
79 |
| - sp_home = 'https://nlp.stanford.edu/software/stanford-parser-full-2013-11-12.zip' |
| 79 | + if version == '3.3.0': |
| 80 | + sp_home = 'https://nlp.stanford.edu/software/stanford-parser-full-2013-11-12.zip' |
| 81 | + elif version == '4.2.0': |
| 82 | + sp_home = 'https://nlp.stanford.edu/software/stanford-parser-4.2.0.zip' |
| 83 | + else: |
| 84 | + raise ValueError(f'Unsupported version {version}') |
80 | 85 | sp_home = get_resource(sp_home)
|
81 | 86 | # jar_path = get_resource(f'{sp_home}#stanford-parser.jar')
|
82 |
| - jclass = 'edu.stanford.nlp.trees.international.pennchinese.ChineseGrammaticalStructure' if language == 'zh' \ |
83 |
| - else 'edu.stanford.nlp.trees.EnglishGrammaticalStructure' |
84 |
| - code, out, err = get_exitcode_stdout_stderr( |
85 |
| - f'java -cp {sp_home}/* {jclass} ' |
86 |
| - f'-basic -keepPunct -conllx ' |
87 |
| - f'-treeFile {src}') |
| 87 | + if ud: |
| 88 | + jclass = 'edu.stanford.nlp.trees.international.pennchinese.UniversalChineseGrammaticalStructure' if language == 'zh' \ |
| 89 | + else 'edu.stanford.nlp.trees.ud.UniversalDependenciesConverter' |
| 90 | + else: |
| 91 | + jclass = 'edu.stanford.nlp.trees.international.pennchinese.ChineseGrammaticalStructure' if language == 'zh' \ |
| 92 | + else 'edu.stanford.nlp.trees.EnglishGrammaticalStructure' |
| 93 | + cmd = f'java -cp {sp_home}/* {jclass} ' \ |
| 94 | + f'-treeFile {src}' |
| 95 | + if conllx: |
| 96 | + cmd += ' -conllx' |
| 97 | + if not ud: |
| 98 | + cmd += f' -basic -keepPunct' |
| 99 | + code, out, err = get_exitcode_stdout_stderr(cmd) |
88 | 100 | with open(dst, 'w') as f:
|
89 | 101 | f.write(out)
|
90 | 102 | if code:
|
@@ -180,7 +192,7 @@ def make_ctb_tasks(chtbs, out_root, part):
|
180 | 192 | erase=False)
|
181 | 193 | remove_all_ec(par_path)
|
182 | 194 | dep_path = join(out_root, 'dep', f'{part}.conllx')
|
183 |
| - convert_to_stanford_dependency_330(par_path, dep_path) |
| 195 | + convert_to_dependency(par_path, dep_path) |
184 | 196 | sents = list(read_conll(dep_path))
|
185 | 197 | with open(dep_path, 'w') as out:
|
186 | 198 | for sent in sents:
|
|
0 commit comments