
Commit 08cb085

Merge remote-tracking branch 'upstream/develop' into fix/various
2 parents: e972ecb + b7cb9d9

File tree: 13 files changed (+58, -65 lines)

spacy/errors.py

Lines changed: 2 additions & 0 deletions
@@ -456,6 +456,8 @@ class Errors:
              "issue tracker: http://github.com/explosion/spaCy/issues")

     # TODO: fix numbering after merging develop into master
+    E900 = ("Could not run the full 'nlp' pipeline for evaluation. If you specified "
+            "frozen components, make sure they were already initialized and trained. ")
     E901 = ("Failed to remove existing output directory: {path}. If your "
             "config and the components you train change between runs, a "
             "non-empty output directory can lead to stale pipeline data. To "

spacy/language.py

Lines changed: 3 additions & 0 deletions
@@ -1034,6 +1034,9 @@ def select_pipes(
                 )
             )
             disable = to_disable
+        # DisabledPipes will restore the pipes in 'disable' when it's done, so we need to exclude
+        # those pipes that were already disabled.
+        disable = [d for d in disable if d not in self._disabled]
         return DisabledPipes(self, disable)

     def make_doc(self, text: str) -> Doc:
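
A quick illustration of why this filter matters: `DisabledPipes` re-enables every name in its list on exit, so without the exclusion it would resurrect pipes the user had disabled before entering the context. A minimal sketch, using the stock `sentencizer` factory as a stand-in component:

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("sentencizer")
nlp.disable_pipes("sentencizer")  # disabled outside of any context manager
assert not nlp.has_pipe("sentencizer")

with nlp.select_pipes(disable="sentencizer"):
    assert not nlp.has_pipe("sentencizer")

# With the exclusion above, leaving the block no longer restores the pipe.
assert not nlp.has_pipe("sentencizer")
```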

spacy/ml/models/tok2vec.py

Lines changed: 3 additions & 3 deletions
@@ -177,7 +177,7 @@ def CharacterEmbed(
     rows: int,
     nM: int,
     nC: int,
-    also_use_static_vectors: bool,
+    include_static_vectors: bool,
     feature: Union[int, str] = "LOWER",
 ) -> Model[List[Doc], List[Floats2d]]:
     """Construct an embedded representation based on character embeddings, using
@@ -204,13 +204,13 @@ def CharacterEmbed(
     nC (int): The number of UTF-8 bytes to embed per word. Recommended values
         are between 3 and 8, although it may depend on the length of words in the
         language.
-    also_use_static_vectors (bool): Whether to also use static word vectors.
+    include_static_vectors (bool): Whether to also use static word vectors.
         Requires a vectors table to be loaded in the Doc objects' vocab.
     """
     feature = intify_attr(feature)
     if feature is None:
         raise ValueError(Errors.E911(feat=feature))
-    if also_use_static_vectors:
+    if include_static_vectors:
         model = chain(
             concatenate(
                 chain(_character_embed.CharacterEmbed(nM=nM, nC=nC), list2ragged()),
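
For reference, a hedged sketch of calling the layer directly with the renamed flag, assuming `CharacterEmbed` is re-exported from `spacy.ml.models` (as the tests below suggest) and reusing the morphologizer's default sizes:

```python
from spacy.ml.models import CharacterEmbed

# include_static_vectors replaces also_use_static_vectors; when True, a
# vectors table must be loaded in the Doc objects' vocab.
model = CharacterEmbed(width=128, rows=7000, nM=64, nC=8, include_static_vectors=False)
```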

spacy/pipeline/morphologizer.pyx

Lines changed: 1 addition & 1 deletion
@@ -32,7 +32,7 @@ width = 128
 rows = 7000
 nM = 64
 nC = 8
-also_use_static_vectors = false
+include_static_vectors = false

 [model.tok2vec.encode]
 @architectures = "spacy.MaxoutWindowEncoder.v1"

spacy/tests/doc/test_doc_api.py

Lines changed: 3 additions & 6 deletions
@@ -608,14 +608,11 @@ def test_doc_init_iob():
     doc = Doc(Vocab(), words=words, ents=ents)


-@pytest.mark.xfail
-def test_doc_set_ents_spans(en_tokenizer):
+def test_doc_set_ents_invalid_spans(en_tokenizer):
     doc = en_tokenizer("Some text about Colombia and the Czech Republic")
     spans = [Span(doc, 3, 4, label="GPE"), Span(doc, 6, 8, label="GPE")]
     with doc.retokenize() as retokenizer:
         for span in spans:
             retokenizer.merge(span)
-    # If this line is uncommented, it works:
-    # print(spans)
-    doc.ents = spans
-    assert [ent.text for ent in doc.ents] == ["Colombia", "Czech Republic"]
+    with pytest.raises(IndexError):
+        doc.ents = spans

spacy/tests/doc/test_retokenize_merge.py

Lines changed: 1 addition & 0 deletions
@@ -336,6 +336,7 @@ def test_doc_retokenize_spans_sentence_update_after_merge(en_tokenizer):
         attrs = {"lemma": "none", "ent_type": "none"}
         retokenizer.merge(doc[0:2], attrs=attrs)
         retokenizer.merge(doc[-2:], attrs=attrs)
+    sent1, sent2 = list(doc.sents)
     assert len(sent1) == init_len - 1
     assert len(sent2) == init_len2 - 1


spacy/tests/pipeline/test_pipe_methods.py

Lines changed: 13 additions & 0 deletions
@@ -129,13 +129,26 @@ def test_enable_pipes_method(nlp, name):

 @pytest.mark.parametrize("name", ["my_component"])
 def test_disable_pipes_context(nlp, name):
+    """Test that an enabled component stays enabled after running the context manager."""
     nlp.add_pipe("new_pipe", name=name)
     assert nlp.has_pipe(name)
     with nlp.select_pipes(disable=name):
         assert not nlp.has_pipe(name)
     assert nlp.has_pipe(name)


+@pytest.mark.parametrize("name", ["my_component"])
+def test_disable_pipes_context_restore(nlp, name):
+    """Test that a disabled component stays disabled after running the context manager."""
+    nlp.add_pipe("new_pipe", name=name)
+    assert nlp.has_pipe(name)
+    nlp.disable_pipes(name)
+    assert not nlp.has_pipe(name)
+    with nlp.select_pipes(disable=name):
+        assert not nlp.has_pipe(name)
+    assert not nlp.has_pipe(name)
+
+
 def test_select_pipes_list_arg(nlp):
     for name in ["c1", "c2", "c3"]:
         nlp.add_pipe("new_pipe", name=name)

spacy/tests/pipeline/test_tok2vec.py

Lines changed: 2 additions & 2 deletions
@@ -63,8 +63,8 @@ def test_tok2vec_batch_sizes(batch_size, width, embed_size):
     [
         (8, MultiHashEmbed, {"rows": [100, 100], "attrs": ["SHAPE", "LOWER"], "include_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 2}),
         (8, MultiHashEmbed, {"rows": [100, 20], "attrs": ["ORTH", "PREFIX"], "include_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 6}),
-        (8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8, "also_use_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}),
-        (8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2, "also_use_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 3}),
+        (8, CharacterEmbed, {"rows": 100, "nM": 64, "nC": 8, "include_static_vectors": False}, MaxoutWindowEncoder, {"window_size": 1, "maxout_pieces": 3, "depth": 3}),
+        (8, CharacterEmbed, {"rows": 100, "nM": 16, "nC": 2, "include_static_vectors": False}, MishWindowEncoder, {"window_size": 1, "depth": 3}),
     ],
 )
 # fmt: on

spacy/tokens/span.pxd

Lines changed: 0 additions & 1 deletion
@@ -16,5 +16,4 @@ cdef class Span:
     cdef public _vector
     cdef public _vector_norm

-    cpdef int _recalculate_indices(self) except -1
     cpdef np.ndarray to_array(self, object features)

spacy/tokens/span.pyx

Lines changed: 0 additions & 17 deletions
@@ -150,7 +150,6 @@ cdef class Span:

         DOCS: https://nightly.spacy.io/api/span#len
         """
-        self._recalculate_indices()
         if self.end < self.start:
             return 0
         return self.end - self.start
@@ -167,7 +166,6 @@

         DOCS: https://nightly.spacy.io/api/span#getitem
         """
-        self._recalculate_indices()
         if isinstance(i, slice):
             start, end = normalize_slice(len(self), i.start, i.stop, i.step)
             return Span(self.doc, start + self.start, end + self.start)
@@ -188,7 +186,6 @@

         DOCS: https://nightly.spacy.io/api/span#iter
         """
-        self._recalculate_indices()
         for i in range(self.start, self.end):
             yield self.doc[i]

@@ -339,19 +336,6 @@
                 output[i-self.start, j] = get_token_attr(&self.doc.c[i], feature)
         return output

-    cpdef int _recalculate_indices(self) except -1:
-        if self.end > self.doc.length \
-                or self.doc.c[self.start].idx != self.start_char \
-                or (self.doc.c[self.end-1].idx + self.doc.c[self.end-1].lex.length) != self.end_char:
-            start = token_by_start(self.doc.c, self.doc.length, self.start_char)
-            if self.start == -1:
-                raise IndexError(Errors.E036.format(start=self.start_char))
-            end = token_by_end(self.doc.c, self.doc.length, self.end_char)
-            if end == -1:
-                raise IndexError(Errors.E037.format(end=self.end_char))
-            self.start = start
-            self.end = end + 1
-
     @property
     def vocab(self):
         """RETURNS (Vocab): The Span's Doc's vocab."""
@@ -520,7 +504,6 @@

         DOCS: https://nightly.spacy.io/api/span#root
         """
-        self._recalculate_indices()
         if "root" in self.doc.user_span_hooks:
             return self.doc.user_span_hooks["root"](self)
         # This should probably be called 'head', and the other one called
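
With the lazy recalculation gone, a `Span` no longer repairs its token offsets after the underlying `Doc` changes. A small sketch of the resulting contract: rebuild spans (for example via `Doc.char_span`) after retokenizing instead of reusing stale ones:

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("the Czech Republic")
span = doc[1:3]  # "Czech Republic" as two tokens
with doc.retokenize() as retokenizer:
    retokenizer.merge(span)
# 'span' is stale now; re-derive it from character offsets instead.
fresh = doc.char_span(4, 18, label="GPE")
assert fresh.text == "Czech Republic"
```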

spacy/training/loop.py

Lines changed: 4 additions & 1 deletion
@@ -249,7 +249,10 @@ def create_evaluation_callback(

     def evaluate() -> Tuple[float, Dict[str, float]]:
         dev_examples = list(dev_corpus(nlp))
-        scores = nlp.evaluate(dev_examples)
+        try:
+            scores = nlp.evaluate(dev_examples)
+        except KeyError as e:
+            raise KeyError(Errors.E900) from e
         # Calculate a weighted sum based on score_weights for the main score.
         # We can only consider scores that are ints/floats, not dicts like
         # entity scores per type etc.
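
The `from e` chaining keeps the original `KeyError` attached as `__cause__`, so the friendlier E900 hint appears first without hiding the real traceback. A generic sketch of the pattern, with hypothetical names rather than spaCy API:

```python
def evaluate_safely(run_evaluation):
    try:
        return run_evaluation()
    except KeyError as e:
        # Chained re-raise: the original KeyError remains visible as __cause__.
        raise KeyError("evaluation failed; were all components initialized?") from e
```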

website/docs/usage/embeddings-transformers.md

Lines changed: 3 additions & 3 deletions
@@ -522,9 +522,9 @@ word vector tables using the `include_static_vectors` flag.
 [tagger.model.tok2vec.embed]
 @architectures = "spacy.MultiHashEmbed.v1"
 width = 128
-attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
-rows = [7000, 3500, 3500, 3500]
-also_use_static_vectors = true
+attrs = ["LOWER","PREFIX","SUFFIX","SHAPE"]
+rows = [5000,2500,2500,2500]
+include_static_vectors = true
 ```

 <Infobox title="How it works" emoji="💡">

website/docs/usage/processing-pipelines.md

Lines changed: 23 additions & 31 deletions
@@ -1403,9 +1403,9 @@ especially useful it you want to pass in a string instead of calling

 This example shows the implementation of a pipeline component that fetches
 country meta data via the [REST Countries API](https://restcountries.eu), sets
-entity annotations for countries, merges entities into one token and sets custom
-attributes on the `Doc`, `Span` and `Token` – for example, the capital,
-latitude/longitude coordinates and even the country flag.
+entity annotations for countries and sets custom attributes on the `Doc` and
+`Span` – for example, the capital, latitude/longitude coordinates and even the
+country flag.

 ```python
 ### {executable="true"}
@@ -1427,54 +1427,46 @@ class RESTCountriesComponent:
         # Set up the PhraseMatcher with Doc patterns for each country name
         self.matcher = PhraseMatcher(nlp.vocab)
         self.matcher.add("COUNTRIES", [nlp.make_doc(c) for c in self.countries.keys()])
-        # Register attribute on the Token. We'll be overwriting this based on
+        # Register attributes on the Span. We'll be overwriting this based on
         # the matches, so we're only setting a default value, not a getter.
-        Token.set_extension("is_country", default=False)
-        Token.set_extension("country_capital", default=False)
-        Token.set_extension("country_latlng", default=False)
-        Token.set_extension("country_flag", default=False)
-        # Register attributes on Doc and Span via a getter that checks if one of
-        # the contained tokens is set to is_country == True.
+        Span.set_extension("is_country", default=None)
+        Span.set_extension("country_capital", default=None)
+        Span.set_extension("country_latlng", default=None)
+        Span.set_extension("country_flag", default=None)
+        # Register attribute on Doc via a getter that checks if the Doc
+        # contains a country entity
         Doc.set_extension("has_country", getter=self.has_country)
-        Span.set_extension("has_country", getter=self.has_country)

     def __call__(self, doc):
         spans = []  # keep the spans for later so we can merge them afterwards
         for _, start, end in self.matcher(doc):
             # Generate Span representing the entity & set label
             entity = Span(doc, start, end, label=self.label)
+            # Set custom attributes on entity. Can be extended with other data
+            # returned by the API, like currencies, country code, calling code etc.
+            entity._.set("is_country", True)
+            entity._.set("country_capital", self.countries[entity.text]["capital"])
+            entity._.set("country_latlng", self.countries[entity.text]["latlng"])
+            entity._.set("country_flag", self.countries[entity.text]["flag"])
             spans.append(entity)
-            # Set custom attribute on each token of the entity
-            # Can be extended with other data returned by the API, like
-            # currencies, country code, flag, calling code etc.
-            for token in entity:
-                token._.set("is_country", True)
-                token._.set("country_capital", self.countries[entity.text]["capital"])
-                token._.set("country_latlng", self.countries[entity.text]["latlng"])
-                token._.set("country_flag", self.countries[entity.text]["flag"])
-        # Iterate over all spans and merge them into one token
-        with doc.retokenize() as retokenizer:
-            for span in spans:
-                retokenizer.merge(span)
         # Overwrite doc.ents and add entity – be careful not to replace!
         doc.ents = list(doc.ents) + spans
         return doc  # don't forget to return the Doc!

-    def has_country(self, tokens):
-        """Getter for Doc and Span attributes. Since the getter is only called
-        when we access the attribute, we can refer to the Token's 'is_country'
+    def has_country(self, doc):
+        """Getter for Doc attributes. Since the getter is only called
+        when we access the attribute, we can refer to the Span's 'is_country'
         attribute here, which is already set in the processing step."""
-        return any([t._.get("is_country") for t in tokens])
+        return any([entity._.get("is_country") for entity in doc.ents])

 nlp = English()
 nlp.add_pipe("rest_countries", config={"label": "GPE"})
 doc = nlp("Some text about Colombia and the Czech Republic")
 print("Pipeline", nlp.pipe_names)  # pipeline contains component name
 print("Doc has countries", doc._.has_country)  # Doc contains countries
-for token in doc:
-    if token._.is_country:
-        print(token.text, token._.country_capital, token._.country_latlng, token._.country_flag)
-print("Entities", [(e.text, e.label_) for e in doc.ents])
+for ent in doc.ents:
+    if ent._.is_country:
+        print(ent.text, ent.label_, ent._.country_capital, ent._.country_latlng, ent._.country_flag)
 ```

 In this case, all data can be fetched on initialization in one request. However,
