Skip to content

Commit 8d7b353

Browse files
committed
Revert "Remove Span._recalculate_indices"
This reverts commit 727370c.
1 parent a2fa5f4 commit 8d7b353

File tree

4 files changed

+24
-4
lines changed

4 files changed

+24
-4
lines changed

spacy/tests/doc/test_doc_api.py

Lines changed: 6 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -608,11 +608,14 @@ def test_doc_init_iob():
608608
doc = Doc(Vocab(), words=words, ents=ents)
609609

610610

611-
def test_doc_set_ents_invalid_spans(en_tokenizer):
611+
@pytest.mark.xfail
612+
def test_doc_set_ents_spans(en_tokenizer):
612613
doc = en_tokenizer("Some text about Colombia and the Czech Republic")
613614
spans = [Span(doc, 3, 4, label="GPE"), Span(doc, 6, 8, label="GPE")]
614615
with doc.retokenize() as retokenizer:
615616
for span in spans:
616617
retokenizer.merge(span)
617-
with pytest.raises(IndexError):
618-
doc.ents = spans
618+
# If this line is uncommented, it works:
619+
# print(spans)
620+
doc.ents = spans
621+
assert [ent.text for ent in doc.ents] == ["Colombia", "Czech Republic"]

spacy/tests/doc/test_retokenize_merge.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -336,7 +336,6 @@ def test_doc_retokenize_spans_sentence_update_after_merge(en_tokenizer):
336336
attrs = {"lemma": "none", "ent_type": "none"}
337337
retokenizer.merge(doc[0:2], attrs=attrs)
338338
retokenizer.merge(doc[-2:], attrs=attrs)
339-
sent1, sent2 = list(doc.sents)
340339
assert len(sent1) == init_len - 1
341340
assert len(sent2) == init_len2 - 1
342341

spacy/tokens/span.pxd

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,4 +16,5 @@ cdef class Span:
1616
cdef public _vector
1717
cdef public _vector_norm
1818

19+
cpdef int _recalculate_indices(self) except -1
1920
cpdef np.ndarray to_array(self, object features)

spacy/tokens/span.pyx

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,7 @@ cdef class Span:
150150
151151
DOCS: https://nightly.spacy.io/api/span#len
152152
"""
153+
self._recalculate_indices()
153154
if self.end < self.start:
154155
return 0
155156
return self.end - self.start
@@ -166,6 +167,7 @@ cdef class Span:
166167
167168
DOCS: https://nightly.spacy.io/api/span#getitem
168169
"""
170+
self._recalculate_indices()
169171
if isinstance(i, slice):
170172
start, end = normalize_slice(len(self), i.start, i.stop, i.step)
171173
return Span(self.doc, start + self.start, end + self.start)
@@ -186,6 +188,7 @@ cdef class Span:
186188
187189
DOCS: https://nightly.spacy.io/api/span#iter
188190
"""
191+
self._recalculate_indices()
189192
for i in range(self.start, self.end):
190193
yield self.doc[i]
191194

@@ -336,6 +339,19 @@ cdef class Span:
336339
output[i-self.start, j] = get_token_attr(&self.doc.c[i], feature)
337340
return output
338341

342+
cpdef int _recalculate_indices(self) except -1:
    """Re-sync the span's token indices with its stored character offsets.

    The cached `start`/`end` token indices are treated as stale when `end`
    exceeds the doc length, or when the tokens at those indices no longer
    begin/end at `start_char`/`end_char`. In that case the token boundaries
    are looked up again from the character offsets and written back to
    `self.start` / `self.end`.

    RAISES (IndexError): E036 if no token is found for `start_char`, E037 if
        no token is found for `end_char`. Declared `except -1` so the
        exception propagates through C-level callers.
    """
    if self.end > self.doc.length \
    or self.doc.c[self.start].idx != self.start_char \
    or (self.doc.c[self.end-1].idx + self.doc.c[self.end-1].lex.length) != self.end_char:
        # NOTE(review): token_by_start / token_by_end are assumed to return
        # -1 when no token matches the character offset -- confirm against
        # their definitions.
        start = token_by_start(self.doc.c, self.doc.length, self.start_char)
        # Check the freshly looked-up value, not the stale `self.start`;
        # otherwise a failed lookup goes unnoticed and -1 is stored below.
        if start == -1:
            raise IndexError(Errors.E036.format(start=self.start_char))
        end = token_by_end(self.doc.c, self.doc.length, self.end_char)
        if end == -1:
            raise IndexError(Errors.E037.format(end=self.end_char))
        # Only commit the new indices once both lookups have succeeded.
        self.start = start
        # token_by_end yields the index of the last token; `end` is exclusive.
        self.end = end + 1
354+
339355
@property
340356
def vocab(self):
341357
"""RETURNS (Vocab): The Span's Doc's vocab."""
@@ -504,6 +520,7 @@ cdef class Span:
504520
505521
DOCS: https://nightly.spacy.io/api/span#root
506522
"""
523+
self._recalculate_indices()
507524
if "root" in self.doc.user_span_hooks:
508525
return self.doc.user_span_hooks["root"](self)
509526
# This should probably be called 'head', and the other one called

0 commit comments

Comments
 (0)