Skip to content

Commit d2e4330

Browse files
committed
Merge remote-tracking branch 'upstream/master' into improve-saving-strategy-sentencepiece-tokenizer
2 parents b100bb5 + 637e817 commit d2e4330

21 files changed

+3496
-214
lines changed

.github/workflows/add-model-like.yml

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
name: Add model like runner
2+
3+
on:
4+
push:
5+
branches:
6+
- master
7+
pull_request:
8+
paths:
9+
- "src/**"
10+
- "tests/**"
11+
- ".github/**"
12+
types: [opened, synchronize, reopened]
13+
14+
jobs:
15+
run_tests_templates:
16+
runs-on: ubuntu-latest
17+
steps:
18+
- uses: actions/checkout@v2
19+
20+
- name: Loading cache.
21+
uses: actions/cache@v2
22+
id: cache
23+
with:
24+
path: ~/.cache/pip
25+
key: v1-tests_model_like-${{ hashFiles('setup.py') }}
26+
restore-keys: |
27+
v1-tests_model_like-${{ hashFiles('setup.py') }}
28+
v1-tests_model_like
29+
30+
- name: Install dependencies
31+
run: |
32+
pip install --upgrade pip!=21.3
33+
sudo apt -y update && sudo apt install -y libsndfile1-dev
34+
pip install .[dev]
35+
36+
- name: Create model files
37+
run: |
38+
transformers-cli add-new-model-like --config_file tests/fixtures/add_distilbert_like_config.json --path_to_repo .
39+
make style
40+
make fix-copies
41+
42+
- name: Run all PyTorch modeling tests
43+
run: |
44+
python -m pytest -n 2 --dist=loadfile -s --make-reports=tests_new_models tests/test_modeling_bert_new.py
45+
46+
- name: Run style changes
47+
run: |
48+
make style && make quality && make repo-consistency
49+
50+
- name: Failure short reports
51+
if: ${{ always() }}
52+
run: cat reports/tests_new_models_failures_short.txt
53+
54+
- name: Test suite reports artifacts
55+
if: ${{ always() }}
56+
uses: actions/upload-artifact@v2
57+
with:
58+
name: run_all_tests_new_models_test_reports
59+
path: reports

examples/pytorch/token-classification/run_ner.py

Lines changed: 28 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
AutoTokenizer,
3737
DataCollatorForTokenClassification,
3838
HfArgumentParser,
39+
PretrainedConfig,
3940
PreTrainedTokenizerFast,
4041
Trainer,
4142
TrainingArguments,
@@ -296,20 +297,12 @@ def get_label_list(labels):
296297

297298
if isinstance(features[label_column_name].feature, ClassLabel):
298299
label_list = features[label_column_name].feature.names
299-
# No need to convert the labels since they are already ints.
300-
label_to_id = {i: i for i in range(len(label_list))}
300+
label_keys = list(range(len(label_list)))
301301
else:
302302
label_list = get_label_list(raw_datasets["train"][label_column_name])
303-
label_to_id = {l: i for i, l in enumerate(label_list)}
304-
num_labels = len(label_list)
303+
label_keys = label_list
305304

306-
# Map that sends B-Xxx label to its I-Xxx counterpart
307-
b_to_i_label = []
308-
for idx, label in enumerate(label_list):
309-
if label.startswith("B-") and label.replace("B-", "I-") in label_list:
310-
b_to_i_label.append(label_list.index(label.replace("B-", "I-")))
311-
else:
312-
b_to_i_label.append(idx)
305+
num_labels = len(label_list)
313306

314307
# Load pretrained model and tokenizer
315308
#
@@ -319,8 +312,6 @@ def get_label_list(labels):
319312
config = AutoConfig.from_pretrained(
320313
model_args.config_name if model_args.config_name else model_args.model_name_or_path,
321314
num_labels=num_labels,
322-
label2id=label_to_id,
323-
id2label={i: l for l, i in label_to_id.items()},
324315
finetuning_task=data_args.task_name,
325316
cache_dir=model_args.cache_dir,
326317
revision=model_args.model_revision,
@@ -363,6 +354,30 @@ def get_label_list(labels):
363354
"requirement"
364355
)
365356

357+
if model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id:
358+
label_name_to_id = {k: v for k, v in model.config.label2id.items()}
359+
if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)):
360+
label_to_id = {k: int(label_name_to_id[k]) for k in label_keys}
361+
else:
362+
logger.warning(
363+
"Your model seems to have been trained with labels, but they don't match the dataset: "
364+
f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}."
365+
"\nIgnoring the model labels as a result.",
366+
)
367+
else:
368+
label_to_id = {k: i for i, k in enumerate(label_keys)}
369+
370+
model.config.label2id = label_to_id
371+
model.config.id2label = {i: l for l, i in label_to_id.items()}
372+
373+
# Map that sends B-Xxx label to its I-Xxx counterpart
374+
b_to_i_label = []
375+
for idx, label in enumerate(label_list):
376+
if label.startswith("B-") and label.replace("B-", "I-") in label_list:
377+
b_to_i_label.append(label_list.index(label.replace("B-", "I-")))
378+
else:
379+
b_to_i_label.append(idx)
380+
366381
# Preprocessing the dataset
367382
# Padding strategy
368383
padding = "max_length" if data_args.pad_to_max_length else False

examples/pytorch/token-classification/run_ner_no_trainer.py

Lines changed: 28 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
AutoModelForTokenClassification,
4343
AutoTokenizer,
4444
DataCollatorForTokenClassification,
45+
PretrainedConfig,
4546
SchedulerType,
4647
default_data_collator,
4748
get_scheduler,
@@ -321,20 +322,12 @@ def get_label_list(labels):
321322

322323
if isinstance(features[label_column_name].feature, ClassLabel):
323324
label_list = features[label_column_name].feature.names
324-
# No need to convert the labels since they are already ints.
325-
label_to_id = {i: i for i in range(len(label_list))}
325+
label_keys = list(range(len(label_list)))
326326
else:
327327
label_list = get_label_list(raw_datasets["train"][label_column_name])
328-
label_to_id = {l: i for i, l in enumerate(label_list)}
329-
num_labels = len(label_list)
328+
label_keys = label_list
330329

331-
# Map that sends B-Xxx label to its I-Xxx counterpart
332-
b_to_i_label = []
333-
for idx, label in enumerate(label_list):
334-
if label.startswith("B-") and label.replace("B-", "I-") in label_list:
335-
b_to_i_label.append(label_list.index(label.replace("B-", "I-")))
336-
else:
337-
b_to_i_label.append(idx)
330+
num_labels = len(label_list)
338331

339332
# Load pretrained model and tokenizer
340333
#
@@ -372,6 +365,30 @@ def get_label_list(labels):
372365

373366
model.resize_token_embeddings(len(tokenizer))
374367

368+
if model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id:
369+
label_name_to_id = {k: v for k, v in model.config.label2id.items()}
370+
if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)):
371+
label_to_id = {k: int(label_name_to_id[k]) for k in label_keys}
372+
else:
373+
logger.warning(
374+
"Your model seems to have been trained with labels, but they don't match the dataset: "
375+
f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}."
376+
"\nIgnoring the model labels as a result.",
377+
)
378+
else:
379+
label_to_id = {k: i for i, k in enumerate(label_keys)}
380+
381+
model.config.label2id = label_to_id
382+
model.config.id2label = {i: l for l, i in label_to_id.items()}
383+
384+
# Map that sends B-Xxx label to its I-Xxx counterpart
385+
b_to_i_label = []
386+
for idx, label in enumerate(label_list):
387+
if label.startswith("B-") and label.replace("B-", "I-") in label_list:
388+
b_to_i_label.append(label_list.index(label.replace("B-", "I-")))
389+
else:
390+
b_to_i_label.append(idx)
391+
375392
# Preprocessing the datasets.
376393
# First we tokenize all the texts.
377394
padding = "max_length" if args.pad_to_max_length else False

0 commit comments

Comments
 (0)