@@ -203,6 +203,7 @@ def debug_data(
203
203
has_low_data_warning = False
204
204
has_no_neg_warning = False
205
205
has_ws_ents_error = False
206
+ has_boundary_cross_ents_warning = False
206
207
207
208
msg .divider ("Named Entity Recognition" )
208
209
msg .info (f"{ len (model_labels )} label(s)" )
@@ -242,12 +243,20 @@ def debug_data(
242
243
msg .warn (f"No examples for texts WITHOUT new label '{ label } '" )
243
244
has_no_neg_warning = True
244
245
246
+ if gold_train_data ["boundary_cross_ents" ]:
247
+ msg .warn (
248
+ f"{ gold_train_data ['boundary_cross_ents' ]} entity span(s) crossing sentence boundaries"
249
+ )
250
+ has_boundary_cross_ents_warning = True
251
+
245
252
if not has_low_data_warning :
246
253
msg .good ("Good amount of examples for all labels" )
247
254
if not has_no_neg_warning :
248
255
msg .good ("Examples without occurrences available for all labels" )
249
256
if not has_ws_ents_error :
250
257
msg .good ("No entities consisting of or starting/ending with whitespace" )
258
+ if not has_boundary_cross_ents_warning :
259
+ msg .good ("No entities crossing sentence boundaries" )
251
260
252
261
if has_low_data_warning :
253
262
msg .text (
@@ -565,6 +574,7 @@ def _compile_gold(
565
574
"words" : Counter (),
566
575
"roots" : Counter (),
567
576
"ws_ents" : 0 ,
577
+ "boundary_cross_ents" : 0 ,
568
578
"n_words" : 0 ,
569
579
"n_misaligned_words" : 0 ,
570
580
"words_missing_vectors" : Counter (),
@@ -602,6 +612,8 @@ def _compile_gold(
602
612
if label .startswith (("B-" , "U-" )):
603
613
combined_label = label .split ("-" )[1 ]
604
614
data ["ner" ][combined_label ] += 1
615
+ if gold [i ].is_sent_start and label .startswith (("I-" , "L-" )):
616
+ data ["boundary_cross_ents" ] += 1
605
617
elif label == "-" :
606
618
data ["ner" ]["-" ] += 1
607
619
if "textcat" in factory_names or "textcat_multilabel" in factory_names :
0 commit comments