Skip to content

Commit 727ce6d

Browse files
authored
Remove English exceptions with mismatched features (explosion#10873)
Remove English contraction tokenizer-exception entries with mismatched morphological features, which generated incorrect single-token forms such as "theses" and "thisre".
1 parent 41389ff commit 727ce6d

File tree

2 files changed

+45
-34
lines changed

2 files changed

+45
-34
lines changed

spacy/lang/en/tokenizer_exceptions.py

Lines changed: 36 additions & 34 deletions
Original file line number | Diff line number | Diff line change
@@ -35,7 +35,7 @@
3535

3636
_exc[orth + "m"] = [
3737
{ORTH: orth, NORM: pron},
38-
{ORTH: "m", "tenspect": 1, "number": 1},
38+
{ORTH: "m"},
3939
]
4040

4141
_exc[orth + "'ma"] = [
@@ -139,26 +139,27 @@
139139

140140
# W-words, relative pronouns, prepositions etc.
141141

142-
for word in [
143-
"who",
144-
"what",
145-
"when",
146-
"where",
147-
"why",
148-
"how",
149-
"there",
150-
"that",
151-
"this",
152-
"these",
153-
"those",
142+
for word, morph in [
143+
("who", None),
144+
("what", None),
145+
("when", None),
146+
("where", None),
147+
("why", None),
148+
("how", None),
149+
("there", None),
150+
("that", "Number=Sing|Person=3"),
151+
("this", "Number=Sing|Person=3"),
152+
("these", "Number=Plur|Person=3"),
153+
("those", "Number=Plur|Person=3"),
154154
]:
155155
for orth in [word, word.title()]:
156-
_exc[orth + "'s"] = [
157-
{ORTH: orth, NORM: word},
158-
{ORTH: "'s", NORM: "'s"},
159-
]
156+
if morph != "Number=Plur|Person=3":
157+
_exc[orth + "'s"] = [
158+
{ORTH: orth, NORM: word},
159+
{ORTH: "'s", NORM: "'s"},
160+
]
160161

161-
_exc[orth + "s"] = [{ORTH: orth, NORM: word}, {ORTH: "s"}]
162+
_exc[orth + "s"] = [{ORTH: orth, NORM: word}, {ORTH: "s"}]
162163

163164
_exc[orth + "'ll"] = [
164165
{ORTH: orth, NORM: word},
@@ -182,25 +183,26 @@
182183
{ORTH: "ve", NORM: "have"},
183184
]
184185

185-
_exc[orth + "'re"] = [
186-
{ORTH: orth, NORM: word},
187-
{ORTH: "'re", NORM: "are"},
188-
]
186+
if morph != "Number=Sing|Person=3":
187+
_exc[orth + "'re"] = [
188+
{ORTH: orth, NORM: word},
189+
{ORTH: "'re", NORM: "are"},
190+
]
189191

190-
_exc[orth + "re"] = [
191-
{ORTH: orth, NORM: word},
192-
{ORTH: "re", NORM: "are"},
193-
]
192+
_exc[orth + "re"] = [
193+
{ORTH: orth, NORM: word},
194+
{ORTH: "re", NORM: "are"},
195+
]
194196

195-
_exc[orth + "'ve"] = [
196-
{ORTH: orth, NORM: word},
197-
{ORTH: "'ve"},
198-
]
197+
_exc[orth + "'ve"] = [
198+
{ORTH: orth, NORM: word},
199+
{ORTH: "'ve"},
200+
]
199201

200-
_exc[orth + "ve"] = [
201-
{ORTH: orth},
202-
{ORTH: "ve", NORM: "have"},
203-
]
202+
_exc[orth + "ve"] = [
203+
{ORTH: orth},
204+
{ORTH: "ve", NORM: "have"},
205+
]
204206

205207
_exc[orth + "'d"] = [
206208
{ORTH: orth, NORM: word},

spacy/tests/lang/en/test_tokenizer.py

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -167,3 +167,12 @@ def test_issue3521(en_tokenizer, word):
167167
tok = en_tokenizer(word)[1]
168168
# 'not' and 'would' should be stopwords, also in their abbreviated forms
169169
assert tok.is_stop
170+
171+
172+
@pytest.mark.issue(10699)
173+
@pytest.mark.parametrize("text", ["theses", "thisre"])
174+
def test_issue10699(en_tokenizer, text):
175+
"""Test that 'theses' and 'thisre' are excluded from the contractions
176+
generated by the English tokenizer exceptions."""
177+
tokens = en_tokenizer(text)
178+
assert len(tokens) == 1

0 commit comments

Comments (0)