Skip to content

Commit 55cf492

Browse files
Duygu Altinok and svlandeg authored
Feat/debug data warn spread ents (explosion#9960)
* added check for crossing boundaries
* formatted blacked
* Rephrasing slightly

Co-authored-by: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
1 parent 56dcb39 commit 55cf492

File tree

1 file changed

+12
-0
lines changed

1 file changed

+12
-0
lines changed

spacy/cli/debug_data.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,7 @@ def debug_data(
203203
has_low_data_warning = False
204204
has_no_neg_warning = False
205205
has_ws_ents_error = False
206+
has_boundary_cross_ents_warning = False
206207

207208
msg.divider("Named Entity Recognition")
208209
msg.info(f"{len(model_labels)} label(s)")
@@ -242,12 +243,20 @@ def debug_data(
242243
msg.warn(f"No examples for texts WITHOUT new label '{label}'")
243244
has_no_neg_warning = True
244245

246+
if gold_train_data["boundary_cross_ents"]:
247+
msg.warn(
248+
f"{gold_train_data['boundary_cross_ents']} entity span(s) crossing sentence boundaries"
249+
)
250+
has_boundary_cross_ents_warning = True
251+
245252
if not has_low_data_warning:
246253
msg.good("Good amount of examples for all labels")
247254
if not has_no_neg_warning:
248255
msg.good("Examples without occurrences available for all labels")
249256
if not has_ws_ents_error:
250257
msg.good("No entities consisting of or starting/ending with whitespace")
258+
if not has_boundary_cross_ents_warning:
259+
msg.good("No entities crossing sentence boundaries")
251260

252261
if has_low_data_warning:
253262
msg.text(
@@ -565,6 +574,7 @@ def _compile_gold(
565574
"words": Counter(),
566575
"roots": Counter(),
567576
"ws_ents": 0,
577+
"boundary_cross_ents": 0,
568578
"n_words": 0,
569579
"n_misaligned_words": 0,
570580
"words_missing_vectors": Counter(),
@@ -602,6 +612,8 @@ def _compile_gold(
602612
if label.startswith(("B-", "U-")):
603613
combined_label = label.split("-")[1]
604614
data["ner"][combined_label] += 1
615+
if gold[i].is_sent_start and label.startswith(("I-", "L-")):
616+
data["boundary_cross_ents"] += 1
605617
elif label == "-":
606618
data["ner"]["-"] += 1
607619
if "textcat" in factory_names or "textcat_multilabel" in factory_names:

0 commit comments

Comments
 (0)