Skip to content

Commit 8569dbc

Browse files
committed
* Add initial stuff for Chinese parsing
1 parent 67ce96c commit 8569dbc

File tree

10 files changed

+302
-16
lines changed

10 files changed

+302
-16
lines changed

bin/init_model.py

Lines changed: 3 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -36,11 +36,8 @@
3636
from preshed.counter import PreshCounter
3737

3838
from spacy.parts_of_speech import NOUN, VERB, ADJ
39+
from spacy.util import get_lang_class
3940

40-
import spacy.en
41-
import spacy.de
42-
import spacy.fi
43-
import spacy.it
4441

4542
try:
4643
unicode
@@ -197,13 +194,6 @@ def setup_vocab(get_lex_attr, tag_map, src_dir, dst_dir):
197194

198195

199196
def main(lang_id, lang_data_dir, corpora_dir, model_dir):
200-
languages = {
201-
'en': spacy.en.English.default_lex_attrs(),
202-
'de': spacy.de.German.default_lex_attrs(),
203-
'fi': spacy.fi.Finnish.default_lex_attrs(),
204-
'it': spacy.it.Italian.default_lex_attrs(),
205-
}
206-
207197
model_dir = Path(model_dir)
208198
lang_data_dir = Path(lang_data_dir) / lang_id
209199
corpora_dir = Path(corpora_dir) / lang_id
@@ -216,7 +206,8 @@ def main(lang_id, lang_data_dir, corpora_dir, model_dir):
216206

217207
tag_map = json.load((lang_data_dir / 'tag_map.json').open())
218208
setup_tokenizer(lang_data_dir, model_dir / 'tokenizer')
219-
setup_vocab(languages[lang_id], tag_map, corpora_dir, model_dir / 'vocab')
209+
setup_vocab(get_lang_class(lang_id).default_lex_attrs(), tag_map, corpora_dir,
210+
model_dir / 'vocab')
220211

221212
if (lang_data_dir / 'gazetteer.json').exists():
222213
copyfile(str(lang_data_dir / 'gazetteer.json'),

bin/parser/train.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,6 @@
1313
import re
1414

1515
import spacy.util
16-
from spacy.en import English
17-
from spacy.de import German
1816

1917
from spacy.syntax.util import Config
2018
from spacy.gold import read_json_file
@@ -207,7 +205,7 @@ def write_parses(Language, dev_loc, model_dir, out_loc):
207205

208206

209207
@plac.annotations(
210-
language=("The language to train", "positional", None, str, ['en','de']),
208+
language=("The language to train", "positional", None, str, ['en','de', 'zh']),
211209
train_loc=("Location of training file or directory"),
212210
dev_loc=("Location of development file or directory"),
213211
model_dir=("Location of output model directory",),
@@ -223,7 +221,7 @@ def write_parses(Language, dev_loc, model_dir, out_loc):
223221
)
224222
def main(language, train_loc, dev_loc, model_dir, n_sents=0, n_iter=15, out_loc="", verbose=False,
225223
debug=False, corruption_level=0.0, gold_preproc=False, eval_only=False, pseudoprojective=False):
226-
lang = {'en':English, 'de':German}.get(language)
224+
lang = spacy.util.get_lang_class(language)
227225

228226
if not eval_only:
229227
gold_train = list(read_json_file(train_loc))

lang_data/zh/gazetteer.json

Lines changed: 194 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,194 @@
1+
{
2+
"Reddit": [
3+
"PRODUCT",
4+
{},
5+
[
6+
[{"lower": "reddit"}]
7+
]
8+
],
9+
"SeptemberElevenAttacks": [
10+
"EVENT",
11+
{},
12+
[
13+
[
14+
{"orth": "9/11"}
15+
],
16+
[
17+
{"lower": "september"},
18+
{"orth": "11"}
19+
]
20+
]
21+
],
22+
"Linux": [
23+
"PRODUCT",
24+
{},
25+
[
26+
[{"lower": "linux"}]
27+
]
28+
],
29+
"Haskell": [
30+
"PRODUCT",
31+
{},
32+
[
33+
[{"lower": "haskell"}]
34+
]
35+
],
36+
"HaskellCurry": [
37+
"PERSON",
38+
{},
39+
[
40+
[
41+
{"lower": "haskell"},
42+
{"lower": "curry"}
43+
]
44+
]
45+
],
46+
"Javascript": [
47+
"PRODUCT",
48+
{},
49+
[
50+
[{"lower": "javascript"}]
51+
]
52+
],
53+
"CSS": [
54+
"PRODUCT",
55+
{},
56+
[
57+
[{"lower": "css"}],
58+
[{"lower": "css3"}]
59+
]
60+
],
61+
"displaCy": [
62+
"PRODUCT",
63+
{},
64+
[
65+
[{"lower": "displacy"}]
66+
]
67+
],
68+
"spaCy": [
69+
"PRODUCT",
70+
{},
71+
[
72+
[{"orth": "spaCy"}]
73+
]
74+
],
75+
76+
"HTML": [
77+
"PRODUCT",
78+
{},
79+
[
80+
[{"lower": "html"}],
81+
[{"lower": "html5"}]
82+
]
83+
],
84+
"Python": [
85+
"PRODUCT",
86+
{},
87+
[
88+
[{"orth": "Python"}]
89+
]
90+
],
91+
"Ruby": [
92+
"PRODUCT",
93+
{},
94+
[
95+
[{"orth": "Ruby"}]
96+
]
97+
],
98+
"Digg": [
99+
"PRODUCT",
100+
{},
101+
[
102+
[{"lower": "digg"}]
103+
]
104+
],
105+
"FoxNews": [
106+
"ORG",
107+
{},
108+
[
109+
[{"orth": "Fox"}, {"orth": "News"}]
111+
]
112+
],
113+
"Google": [
114+
"ORG",
115+
{},
116+
[
117+
[{"lower": "google"}]
118+
]
119+
],
120+
"Mac": [
121+
"PRODUCT",
122+
{},
123+
[
124+
[{"lower": "mac"}]
125+
]
126+
],
127+
"Wikipedia": [
128+
"PRODUCT",
129+
{},
130+
[
131+
[{"lower": "wikipedia"}]
132+
]
133+
],
134+
"Windows": [
135+
"PRODUCT",
136+
{},
137+
[
138+
[{"orth": "Windows"}]
139+
]
140+
],
141+
"Dell": [
142+
"ORG",
143+
{},
144+
[
145+
[{"lower": "dell"}]
146+
]
147+
],
148+
"Facebook": [
149+
"ORG",
150+
{},
151+
[
152+
[{"lower": "facebook"}]
153+
]
154+
],
155+
"Blizzard": [
156+
"ORG",
157+
{},
158+
[
159+
[{"orth": "Blizzard"}]
160+
]
161+
],
162+
"Ubuntu": [
163+
"ORG",
164+
{},
165+
[
166+
[{"orth": "Ubuntu"}]
167+
]
168+
],
169+
"Youtube": [
170+
"PRODUCT",
171+
{},
172+
[
173+
[{"lower": "youtube"}]
174+
]
175+
],
176+
"false_positives": [
177+
null,
178+
{},
179+
[
180+
[{"orth": "Shit"}],
181+
[{"orth": "Weed"}],
182+
[{"orth": "Cool"}],
183+
[{"orth": "Btw"}],
184+
[{"orth": "Bah"}],
185+
[{"orth": "Bullshit"}],
186+
[{"orth": "Lol"}],
187+
[{"orth": "Yo"}, {"lower": "dawg"}],
188+
[{"orth": "Yay"}],
189+
[{"orth": "Ahh"}],
190+
[{"orth": "Yea"}]
192+
]
193+
]
194+
}

lang_data/zh/infix.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
\.\.\.
2+
(?<=[a-z])\.(?=[A-Z])
3+
(?<=[a-zA-Z])-(?=[a-zA-Z])
4+
(?<=[a-zA-Z])--(?=[a-zA-Z])
5+
(?<=[0-9])-(?=[0-9])
6+
(?<=[A-Za-z]),(?=[A-Za-z])

lang_data/zh/morphs.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{}

lang_data/zh/prefix.txt

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
,
2+
"
3+
(
4+
[
5+
{
6+
*
7+
<
8+
$
9+
£
10+
11+
'
12+
``
13+
`
14+
#
15+
US$
16+
C$
17+
A$
18+
a-
19+
20+
....
21+
...

lang_data/zh/specials.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{}

lang_data/zh/suffix.txt

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
,
2+
\"
3+
\)
4+
\]
5+
\}
6+
\*
7+
\!
8+
\?
9+
%
10+
\$
11+
>
12+
:
13+
;
14+
'
15+
16+
''
17+
's
18+
'S
19+
’s
20+
’S
21+
22+
\.\.
23+
\.\.\.
24+
\.\.\.\.
25+
(?<=[a-z0-9)\]"'%])\.
26+
(?<=[0-9])km

lang_data/zh/tag_map.json

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
{
2+
"NR": {"pos": "PROPN"},
3+
"AD": {"pos": "ADV"},
4+
"NN": {"pos": "NOUN"},
5+
"CD": {"pos": "NUM"},
6+
"DEG": {"pos": "PART"},
7+
"PN": {"pos": "PRON"},
8+
"M": {"pos": "PART"},
9+
"JJ": {"pos": "ADJ"},
10+
"DEC": {"pos": "PART"},
11+
"NT": {"pos": "NOUN"},
12+
"DT": {"pos": "DET"},
13+
"LC": {"pos": "PART"},
14+
"CC": {"pos": "CONJ"},
15+
"AS": {"pos": "PART"},
16+
"SP": {"pos": "PART"},
17+
"IJ": {"pos": "INTJ"},
18+
"OD": {"pos": "NUM"},
19+
"MSP": {"pos": "PART"},
20+
"CS": {"pos": "SCONJ"},
21+
"ETC": {"pos": "PART"},
22+
"DEV": {"pos": "PART"},
23+
"BA": {"pos": "AUX"},
24+
"SB": {"pos": "AUX"},
25+
"DER": {"pos": "PART"},
26+
"LB": {"pos": "AUX"},
27+
"P": {"pos": "ADP"},
28+
"URL": {"pos": "SYM"},
29+
"FRAG": {"pos": "X"},
30+
"X": {"pos": "X"},
31+
"ON": {"pos": "X"},
32+
"FW": {"pos": "X"},
33+
"VC": {"pos": "VERB"},
34+
"VV": {"pos": "VERB"},
35+
"VA": {"pos": "VERB"},
36+
"VE": {"pos": "VERB"},
37+
"PU": {"pos": "PUNCT"},
38+
"_SP": {"pos": "SPACE"},
39+
"NP": {"pos": "X"},
40+
"_": {"pos": "X"},
41+
"VP": {"pos": "X"},
42+
"CHAR": {"pos": "X"}
43+
}

spacy/zh/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
from ..language import Language


class Chinese(Language):
    """Chinese language pipeline class.

    Subclasses ``Language`` without overrides; ``lang = u'zh'`` is the
    language ID string. NOTE(review): presumably this is the key that
    ``spacy.util.get_lang_class('zh')`` (used by the init_model.py and
    train.py changes in this commit) resolves to this class, and that
    selects the ``lang_data/zh`` resources — confirm against
    ``spacy.util.get_lang_class``.
    """
    lang = u'zh'

0 commit comments

Comments
 (0)