Skip to content

Commit 727ce6d

Browse files
authored
Remove English exceptions with mismatched features (explosion#10873)
Remove English contraction tokenizer-exception entries with mismatched morphological features, which generated incorrect single-token forms such as "theses" and "thisre".
1 parent 41389ff commit 727ce6d

File tree

2 files changed

+45
-34
lines changed

2 files changed

+45
-34
lines changed

spacy/lang/en/tokenizer_exceptions.py

Lines changed: 36 additions & 34 deletions
Original file line number | Diff line number | Diff line change
@@ -35,7 +35,7 @@
3535

3636
_exc[orth + "m"] = [
3737
{ORTH: orth, NORM: pron},
38-
{ORTH: "m", "tenspect": 1, "number": 1},
38+
{ORTH: "m"},
3939
]
4040

4141
_exc[orth + "'ma"] = [
@@ -139,26 +139,27 @@
139139

140140
# W-words, relative pronouns, prepositions etc.
141141

142-
for word in [
143-
"who",
144-
"what",
145-
"when",
146-
"where",
147-
"why",
148-
"how",
149-
"there",
150-
"that",
151-
"this",
152-
"these",
153-
"those",
142+
for word, morph in [
143+
("who", None),
144+
("what", None),
145+
("when", None),
146+
("where", None),
147+
("why", None),
148+
("how", None),
149+
("there", None),
150+
("that", "Number=Sing|Person=3"),
151+
("this", "Number=Sing|Person=3"),
152+
("these", "Number=Plur|Person=3"),
153+
("those", "Number=Plur|Person=3"),
154154
]:
155155
for orth in [word, word.title()]:
156-
_exc[orth + "'s"] = [
157-
{ORTH: orth, NORM: word},
158-
{ORTH: "'s", NORM: "'s"},
159-
]
156+
if morph != "Number=Plur|Person=3":
157+
_exc[orth + "'s"] = [
158+
{ORTH: orth, NORM: word},
159+
{ORTH: "'s", NORM: "'s"},
160+
]
160161

161-
_exc[orth + "s"] = [{ORTH: orth, NORM: word}, {ORTH: "s"}]
162+
_exc[orth + "s"] = [{ORTH: orth, NORM: word}, {ORTH: "s"}]
162163

163164
_exc[orth + "'ll"] = [
164165
{ORTH: orth, NORM: word},
@@ -182,25 +183,26 @@
182183
{ORTH: "ve", NORM: "have"},
183184
]
184185

185-
_exc[orth + "'re"] = [
186-
{ORTH: orth, NORM: word},
187-
{ORTH: "'re", NORM: "are"},
188-
]
186+
if morph != "Number=Sing|Person=3":
187+
_exc[orth + "'re"] = [
188+
{ORTH: orth, NORM: word},
189+
{ORTH: "'re", NORM: "are"},
190+
]
189191

190-
_exc[orth + "re"] = [
191-
{ORTH: orth, NORM: word},
192-
{ORTH: "re", NORM: "are"},
193-
]
192+
_exc[orth + "re"] = [
193+
{ORTH: orth, NORM: word},
194+
{ORTH: "re", NORM: "are"},
195+
]
194196

195-
_exc[orth + "'ve"] = [
196-
{ORTH: orth, NORM: word},
197-
{ORTH: "'ve"},
198-
]
197+
_exc[orth + "'ve"] = [
198+
{ORTH: orth, NORM: word},
199+
{ORTH: "'ve"},
200+
]
199201

200-
_exc[orth + "ve"] = [
201-
{ORTH: orth},
202-
{ORTH: "ve", NORM: "have"},
203-
]
202+
_exc[orth + "ve"] = [
203+
{ORTH: orth},
204+
{ORTH: "ve", NORM: "have"},
205+
]
204206

205207
_exc[orth + "'d"] = [
206208
{ORTH: orth, NORM: word},

spacy/tests/lang/en/test_tokenizer.py

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -167,3 +167,12 @@ def test_issue3521(en_tokenizer, word):
167167
tok = en_tokenizer(word)[1]
168168
# 'not' and 'would' should be stopwords, also in their abbreviated forms
169169
assert tok.is_stop
170+
171+
172+
@pytest.mark.issue(10699)
173+
@pytest.mark.parametrize("text", ["theses", "thisre"])
174+
def test_issue10699(en_tokenizer, text):
175+
"""Test that 'theses' and 'thisre' are excluded from the contractions
176+
generated by the English tokenizer exceptions."""
177+
tokens = en_tokenizer(text)
178+
assert len(tokens) == 1

0 commit comments

Comments (0)