
Commit 357be26

rmitsch and adrianeboyd authored

Fuzz tokenizer.explain: draft for fuzzy tests. (explosion#10771)

* Fuzz tokenizer.explain: draft for fuzzy tests.
* Fuzz tokenizer.explain: xignoring tokenizer.explain() tests. Removed deadline modification. Removed LANGUAGES_WITHOUT_TOKENIZERS.
* Fuzz tokenizer.explain: changed tokenizer initialization to avoid failures in Azure runs.
* Fuzz tokenizer.explain: type hint for tokenizer in test.

Co-authored-by: Adriane Boyd <adrianeboyd@gmail.com>

1 parent 99aeaf9 · commit 357be26

File tree

1 file changed: +51 −2 lines

spacy/tests/tokenizer/test_explain.py

Lines changed: 51 additions & 2 deletions
@@ -1,7 +1,13 @@
-import pytest
 import re
-from spacy.util import get_lang_class
+import string
+
+import hypothesis
+import hypothesis.strategies
+import pytest
+
+import spacy
 from spacy.tokenizer import Tokenizer
+from spacy.util import get_lang_class
 
 # Only include languages with no external dependencies
 # "is" seems to confuse importlib, so we're also excluding it for now
@@ -77,3 +83,46 @@ def test_tokenizer_explain_special_matcher(en_vocab):
     tokens = [t.text for t in tokenizer("a/a.")]
     explain_tokens = [t[1] for t in tokenizer.explain("a/a.")]
     assert tokens == explain_tokens
+
+
+@hypothesis.strategies.composite
+def sentence_strategy(draw: hypothesis.strategies.DrawFn, max_n_words: int = 4) -> str:
+    """
+    Composite strategy for fuzzily generating sentences with varying punctuation.
+
+    draw (hypothesis.strategies.DrawFn): Protocol for drawing function allowing to fuzzily pick from hypothesis'
+        strategies.
+    max_n_words (int): Max. number of words in generated sentence.
+    RETURNS (str): Fuzzily generated sentence.
+    """
+
+    punctuation_and_space_regex = "|".join(
+        [*[re.escape(p) for p in string.punctuation], r"\s"]
+    )
+    sentence = [
+        [
+            draw(hypothesis.strategies.text(min_size=1)),
+            draw(hypothesis.strategies.from_regex(punctuation_and_space_regex)),
+        ]
+        for _ in range(
+            draw(hypothesis.strategies.integers(min_value=2, max_value=max_n_words))
+        )
+    ]
+
+    return " ".join([token for token_pair in sentence for token in token_pair])
+
+
+@pytest.mark.xfail
+@pytest.mark.parametrize("lang", LANGUAGES)
+@hypothesis.given(sentence=sentence_strategy())
+def test_tokenizer_explain_fuzzy(lang: str, sentence: str) -> None:
+    """
+    Tests whether the output of tokenizer.explain() matches the tokenizer output. Input generated by hypothesis.
+    lang (str): Language to test.
+    sentence (str): Fuzzily generated sentence to tokenize.
+    """
+
+    tokenizer: Tokenizer = spacy.blank(lang).tokenizer
+    tokens = [t.text for t in tokenizer(sentence) if not t.is_space]
+    debug_tokens = [t[1] for t in tokenizer.explain(sentence)]
+    assert tokens == debug_tokens, f"{tokens}, {debug_tokens}, {sentence}"
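
For readers who want to poke at the new strategy outside of pytest, here is a minimal sketch (not part of the commit) that samples one fuzzily generated sentence via hypothesis's .example() debugging helper and runs the same tokenizer-vs-explain comparison by hand for a single language. It assumes the sentence_strategy definition from the diff above is in scope and that spacy and hypothesis are installed:

    import spacy

    # Draw a single sample from the composite strategy defined above.
    # .example() is hypothesis's helper for inspecting strategies
    # interactively; in the actual test, @hypothesis.given drives the draws.
    sentence = sentence_strategy().example()

    # Mirror the assertion in test_tokenizer_explain_fuzzy for English:
    # token texts produced by the tokenizer must match the token texts
    # reported by tokenizer.explain(), ignoring whitespace-only tokens.
    tokenizer = spacy.blank("en").tokenizer
    tokens = [t.text for t in tokenizer(sentence) if not t.is_space]
    debug_tokens = [t[1] for t in tokenizer.explain(sentence)]
    assert tokens == debug_tokens, f"{tokens}, {debug_tokens}, {sentence}"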
