Skip to content

Commit a7d7e80

Browse files
Duygu Altinok and svlandeg authored
EntityRuler improve disk load error message (explosion#9658)
* added error string
* added serialization test
* added more to if statements
* wrote file to tempdir
* added tempdir
* changed parameter a bit
* Update spacy/tests/pipeline/test_entity_ruler.py

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
1 parent 9ac6d49 commit a7d7e80

File tree

3 files changed

+33
-2
lines changed

3 files changed

+33
-2
lines changed

spacy/errors.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -888,6 +888,7 @@ class Errors(metaclass=ErrorsWithCodes):
888888
E1021 = ("`pos` value \"{pp}\" is not a valid Universal Dependencies tag. "
889889
"Non-UD tags should use the `tag` property.")
890890
E1022 = ("Words must be of type str or int, but input is of type '{wtype}'")
891+
E1023 = ("Couldn't read EntityRuler from the {path}. This file doesn't exist.")
891892

892893

893894
# Deprecated model shortcuts, only used in errors and warnings

spacy/pipeline/entityruler.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -431,10 +431,16 @@ def from_disk(
431431
path = ensure_path(path)
432432
self.clear()
433433
depr_patterns_path = path.with_suffix(".jsonl")
434-
if depr_patterns_path.is_file():
434+
if path.suffix == ".jsonl": # user provides a jsonl
435+
if path.is_file:
436+
patterns = srsly.read_jsonl(path)
437+
self.add_patterns(patterns)
438+
else:
439+
raise ValueError(Errors.E1023.format(path=path))
440+
elif depr_patterns_path.is_file():
435441
patterns = srsly.read_jsonl(depr_patterns_path)
436442
self.add_patterns(patterns)
437-
else:
443+
elif path.is_dir(): # path is a valid directory
438444
cfg = {}
439445
deserializers_patterns = {
440446
"patterns": lambda p: self.add_patterns(
@@ -451,6 +457,8 @@ def from_disk(
451457
self.nlp.vocab, attr=self.phrase_matcher_attr
452458
)
453459
from_disk(path, deserializers_patterns, {})
460+
else: # path is not a valid directory or file
461+
raise ValueError(Errors.E146.format(path=path))
454462
return self
455463

456464
def to_disk(

spacy/tests/pipeline/test_entity_ruler.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
from spacy.language import Language
66
from spacy.pipeline import EntityRuler
77
from spacy.errors import MatchPatternError
8+
from spacy.tests.util import make_tempdir
9+
810
from thinc.api import NumpyOps, get_current_ops
911

1012

@@ -238,3 +240,23 @@ def test_entity_ruler_multiprocessing(nlp, n_process):
238240
for doc in nlp.pipe(texts, n_process=2):
239241
for ent in doc.ents:
240242
assert ent.ent_id_ == "1234"
243+
244+
245+
def test_entity_ruler_serialize_jsonl(nlp, patterns):
246+
ruler = nlp.add_pipe("entity_ruler")
247+
ruler.add_patterns(patterns)
248+
with make_tempdir() as d:
249+
ruler.to_disk(d / "test_ruler.jsonl")
250+
ruler.from_disk(d / "test_ruler.jsonl") # read from an existing jsonl file
251+
with pytest.raises(ValueError):
252+
ruler.from_disk(d / "non_existing.jsonl") # read from a bad jsonl file
253+
254+
255+
def test_entity_ruler_serialize_dir(nlp, patterns):
256+
ruler = nlp.add_pipe("entity_ruler")
257+
ruler.add_patterns(patterns)
258+
with make_tempdir() as d:
259+
ruler.to_disk(d / "test_ruler")
260+
ruler.from_disk(d / "test_ruler") # read from an existing directory
261+
with pytest.raises(ValueError):
262+
ruler.from_disk(d / "non_existing_dir") # read from a bad directory

0 commit comments

Comments
 (0)