@@ -1,7 +1,13 @@
-import pytest
 import re
-from spacy.util import get_lang_class
+import string
+
+import hypothesis
+import hypothesis.strategies
+import pytest
+
+import spacy
 from spacy.tokenizer import Tokenizer
+from spacy.util import get_lang_class
 
 # Only include languages with no external dependencies
 # "is" seems to confuse importlib, so we're also excluding it for now
@@ -77,3 +83,46 @@ def test_tokenizer_explain_special_matcher(en_vocab):
     tokens = [t.text for t in tokenizer("a/a.")]
     explain_tokens = [t[1] for t in tokenizer.explain("a/a.")]
     assert tokens == explain_tokens
+
+
+@hypothesis.strategies.composite
+def sentence_strategy(draw: hypothesis.strategies.DrawFn, max_n_words: int = 4) -> str:
+    """
+    Composite strategy for fuzzily generating a sentence with varying punctuation.
+
+    draw (hypothesis.strategies.DrawFn): Protocol for the drawing function that fuzzily picks from hypothesis'
+        strategies.
+    max_n_words (int): Max. number of words in the generated sentence.
+    RETURNS (str): Fuzzily generated sentence.
+    """
+
+    punctuation_and_space_regex = "|".join(
+        [*[re.escape(p) for p in string.punctuation], r"\s"]
+    )
+    sentence = [
+        [
+            draw(hypothesis.strategies.text(min_size=1)),
+            draw(hypothesis.strategies.from_regex(punctuation_and_space_regex)),
+        ]
+        for _ in range(
+            draw(hypothesis.strategies.integers(min_value=2, max_value=max_n_words))
+        )
+    ]
+
+    return " ".join([token for token_pair in sentence for token in token_pair])
+
+
+@pytest.mark.xfail
+@pytest.mark.parametrize("lang", LANGUAGES)
+@hypothesis.given(sentence=sentence_strategy())
+def test_tokenizer_explain_fuzzy(lang: str, sentence: str) -> None:
+    """
+    Tests whether the output of tokenizer.explain() matches the tokenizer output. Input is generated by hypothesis.
+    lang (str): Language to test.
+    sentence (str): Fuzzily generated sentence to tokenize.
+    """
+
+    tokenizer: Tokenizer = spacy.blank(lang).tokenizer
+    tokens = [t.text for t in tokenizer(sentence) if not t.is_space]
+    debug_tokens = [t[1] for t in tokenizer.explain(sentence)]
+    assert tokens == debug_tokens, f"{tokens}, {debug_tokens}, {sentence}"
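For context on what the new test asserts: spaCy's Tokenizer.explain() returns (rule name, token text) pairs describing which tokenizer rule produced each token, so taking t[1] from each pair yields the token texts to compare against the regular tokenizer output. A minimal standalone sketch of the same check, using a single fixed input instead of a hypothesis-generated one (not part of this diff; assumes spacy is installed):

import spacy

nlp = spacy.blank("en")
tokenizer = nlp.tokenizer

text = "a/a."  # fixed example input standing in for a fuzzily generated sentence
tokens = [t.text for t in tokenizer(text) if not t.is_space]
# Each explain() entry is (rule_name, token_text), e.g. ("TOKEN", "a") or ("SUFFIX", ".").
debug_tokens = [t[1] for t in tokenizer.explain(text)]
assert tokens == debug_tokens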