Commit f60d059

perceptron loss
1 parent 01e3776 commit f60d059

4 files changed, +94 -78 lines changed


neural_ner/crf.py

Lines changed: 22 additions & 22 deletions

@@ -5,7 +5,6 @@
 from data_utils.sentence_utils import Constants
 import numpy as np

-
 class CRF_Loss(nn.Module):
     def __init__(self, tagset_size, config):
         super(CRF_Loss, self).__init__()
@@ -20,7 +19,8 @@ def __init__(self, tagset_size, config):
         self.transitions.data[:, self.start_tag] = -10000
         self.config = config

-    def get_log_p_z(self, emissions, mask, seq_len):
+    def get_log_p_z(self, emissions, mask):
+        seq_len = emissions.shape[1]
         log_alpha = emissions[:, 0].clone()
         log_alpha += self.transitions[self.start_tag, : self.start_tag].unsqueeze(0)

@@ -36,7 +36,8 @@ def get_log_p_z(self, emissions, mask, seq_len):
         log_alpha += self.transitions[: self.start_tag, self.end_tag].unsqueeze(0)
         return torch.logsumexp(log_alpha.squeeze(1), 1)

-    def get_log_p_Y_X(self, emissions, mask, seq_len, orig_tags):
+    def get_log_p_Y_X(self, emissions, mask, orig_tags):
+        seq_len = emissions.shape[1]
         tags = orig_tags.clone()
         tags[tags < 0] = 0

@@ -61,16 +62,11 @@ def get_log_p_Y_X(self, emissions, mask, seq_len, orig_tags):

     def log_likelihood(self, emissions, tags):
         mask = tags.ne(Constants.TAG_PAD_ID).float()
-        seq_len = emissions.shape[1]
-        log_z = self.get_log_p_z(emissions, mask, seq_len)
-        log_p_y_x = self.get_log_p_Y_X(emissions, mask, seq_len, tags)
+        log_z = self.get_log_p_z(emissions, mask)
+        log_p_y_x = self.get_log_p_Y_X(emissions, mask, tags)
         return log_p_y_x - log_z

-    def forward(self, emissions, tags):
-        return self.log_likelihood(emissions, tags)
-
-    def viterbi_decode(self, emissions, lengths):
-        mask = self.get_mask(lengths)
+    def viterbi_decode(self, emissions, mask):
         seq_len = emissions.shape[1]

         log_prob = emissions[:, 0].clone()
@@ -101,7 +97,7 @@ def viterbi_decode(self, emissions, lengths):
         best_scores = torch.cat(best_scores_list, 1).float()
         best_paths = torch.cat(best_paths_list, 1)

-        _, max_indices_from_scores = torch.max(best_scores, 2)
+        max_scores, max_indices_from_scores = torch.max(best_scores, 2)

         valid_index_tensor = torch.tensor(0).long()
         padding_tensor = torch.tensor(Constants.TAG_PAD_ID).long()
@@ -132,14 +128,18 @@ def viterbi_decode(self, emissions, lengths):
             labels = torch.where(mask[:, idx] != 1.0, padding_tensor, labels)
             all_labels = torch.cat((all_labels, labels.view(-1, 1).long()), 1)

-        return best_scores, torch.flip(all_labels, [1])
+        last_tag_indices = mask.sum(1, dtype=torch.long) - 1
+        sentence_score = max_scores.gather(1, last_tag_indices.view(-1, 1)).squeeze(1)
+
+        return sentence_score, torch.flip(all_labels, [1])

-    def get_mask(self, lengths):
-        seq_lens = lengths.view(-1, 1)
-        max_len = torch.max(seq_lens)
-        range_tensor = torch.arange(max_len).unsqueeze(0)
-        range_tensor = range_tensor.expand(seq_lens.size(0), range_tensor.size(1))
-        if self.config.is_cuda:
-            range_tensor = range_tensor.cuda()
-        mask = (range_tensor < seq_lens).float()
-        return mask
+    def structural_perceptron_loss(self, emissions, tags):
+        mask = tags.ne(Constants.TAG_PAD_ID).float()
+
+        best_scores, pred = self.viterbi_decode(emissions, mask)
+        log_p_y_x = self.get_log_p_Y_X(emissions, mask, tags)
+
+        delta = torch.sum(tags.ne(pred).float()*mask, 1)
+
+        margin_loss = torch.clamp(best_scores + delta - log_p_y_x, min=0.0)
+        return margin_loss
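
Note: the new structural_perceptron_loss is a structured hinge (max-margin) objective. The Viterbi score of the decoded path, plus a Hamming cost counting mislabeled non-padding tokens, is compared against the score of the gold sequence, and only the positive part contributes to the loss. A minimal standalone sketch of that margin computation; the tensors and names below are invented for illustration, not taken from the repo:

import torch

# Toy batch of two sentences (values are made up).
pred_score = torch.tensor([12.3, 8.1])   # score of the best Viterbi path, per sentence
gold_score = torch.tensor([11.9, 9.0])   # score of the gold tag sequence, per sentence
hamming = torch.tensor([2.0, 0.0])       # count of mislabeled, non-padding tokens

# Structured hinge: max(0, score(y_hat) + Delta(y, y_hat) - score(y))
margin_loss = torch.clamp(pred_score + hamming - gold_score, min=0.0)
print(margin_loss)  # tensor([2.4000, 0.0000]); the second sentence already satisfies the margin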

neural_ner/model.py

Lines changed: 8 additions & 54 deletions

@@ -9,6 +9,7 @@
 import numpy as np

 from crf import CRF_Loss
+from model_utils import get_mask, init_lstm_wt, init_linear_wt, get_word_embd

 print('pytorch version', torch.__version__)

@@ -19,53 +20,8 @@
 if torch.cuda.is_available():
     torch.cuda.manual_seed_all(123)

-def init_lstm_wt(lstm):
-    for names in lstm._all_weights:
-        for name in names:
-            if name.startswith('weight_'):
-                wt = getattr(lstm, name)
-                drange = np.sqrt(6. / (np.sum(wt.size())))
-                wt.data.uniform_(-drange, drange)
-
-            elif name.startswith('bias_'):
-                # set forget bias to 1
-                bias = getattr(lstm, name)
-                n = bias.size(0)
-                start, end = n // 4, n // 2
-                bias.data.fill_(0.)
-                bias.data[start:end].fill_(1.)
-
-def init_linear_wt(linear):
-    drange = np.sqrt(6. / (np.sum(linear.weight.size())))
-    linear.weight.data.uniform_(-drange, drange)
-
-    if linear.bias is not None:
-        linear.bias.data.fill_(0.)
-
-def get_word_embd(vocab, config):
-    word_to_vector = vocab.glove_vectors
-    word_emb_matrix = np.random.uniform(low=-1.0, high=1.0,
-                                        size=(len(vocab.word_to_id), config.word_emdb_dim))
-    pretrained_init = 0
-    for w, wid in vocab.word_to_id.items():
-        if w in word_to_vector:
-            word_emb_matrix[wid, :] = word_to_vector[w]
-            pretrained_init += 1
-
-    "Total words %i (%i pretrained initialization)" % (
-        len(vocab.word_to_id), pretrained_init
-    )
-    return word_emb_matrix
-
-def test_one_batch(batch, model):
-    model.eval()
-    logits = model(batch)
-    lengths = batch['words_lens']
-    pred = model.predict(logits, lengths)
-    return logits, pred
-
 def get_model(vocab, config, model_file_path, is_eval=False):
-    model = NER_SOFTMAX_CHAR_CRF(vocab, config)
+    model = NER_SOFTMAX_CHAR_CRF(vocab, config)

     if is_eval:
         model = model.eval()
@@ -194,18 +150,16 @@ def forward(self, batch):
         emissions = self.featurizer(batch)
         return emissions

-    def crf_loss(self, emissions, target):
-        a = self.crf(emissions, target)
-        loss = -1 * a
-        loss = loss.mean()
-        return loss
-
     def get_loss(self, logits, y, s_lens):
-        loss = self.crf_loss(logits, y)
+        #loss = -1 * self.crf.log_likelihood(logits, y)
+        loss = self.crf.structural_perceptron_loss(logits, y)
+        loss = loss / s_lens.float()
+        loss = loss.mean()
         if self.config.is_l2_loss:
             loss += self.get_l2_loss()
         return loss

     def predict(self, emissions, lengths):
-        best_scores, pred = self.crf.viterbi_decode(emissions, lengths)
+        mask = get_mask(lengths, self.config)
+        best_scores, pred = self.crf.viterbi_decode(emissions, mask)
         return pred
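
Note: get_loss now normalizes the per-sentence loss by sentence length before taking the batch mean. A quick sketch of that normalization with invented numbers:

import torch

per_sentence_loss = torch.tensor([4.0, 0.0, 3.0])  # e.g. output of structural_perceptron_loss
s_lens = torch.tensor([8, 5, 10])                   # tokens per sentence

loss = (per_sentence_loss / s_lens.float()).mean()  # length-normalize, then average over the batch
print(loss)  # tensor(0.2667)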

neural_ner/model_utils.py

Lines changed: 55 additions & 0 deletions

@@ -0,0 +1,55 @@
+from __future__ import unicode_literals, print_function, division
+
+import torch
+import numpy as np
+
+def get_mask(lengths, config):
+    seq_lens = lengths.view(-1, 1)
+    max_len = torch.max(seq_lens)
+    range_tensor = torch.arange(max_len).unsqueeze(0)
+    range_tensor = range_tensor.expand(seq_lens.size(0), range_tensor.size(1))
+    if config.is_cuda:
+        range_tensor = range_tensor.cuda()
+    mask = (range_tensor < seq_lens).float()
+    return mask
+
+def init_lstm_wt(lstm):
+    for names in lstm._all_weights:
+        for name in names:
+            if name.startswith('weight_'):
+                wt = getattr(lstm, name)
+                drange = np.sqrt(6. / (np.sum(wt.size())))
+                wt.data.uniform_(-drange, drange)
+
+            elif name.startswith('bias_'):
+                # set forget bias to 1
+                bias = getattr(lstm, name)
+                n = bias.size(0)
+                start, end = n // 4, n // 2
+                bias.data.fill_(0.)
+                bias.data[start:end].fill_(1.)
+
+
+def init_linear_wt(linear):
+    drange = np.sqrt(6. / (np.sum(linear.weight.size())))
+    linear.weight.data.uniform_(-drange, drange)
+
+    if linear.bias is not None:
+        linear.bias.data.fill_(0.)
+
+
+def get_word_embd(vocab, config):
+    word_to_vector = vocab.glove_vectors
+    word_emb_matrix = np.random.uniform(low=-1.0, high=1.0,
+                                        size=(len(vocab.word_to_id), config.word_emdb_dim))
+    pretrained_init = 0
+    for w, wid in vocab.word_to_id.items():
+        if w in word_to_vector:
+            word_emb_matrix[wid, :] = word_to_vector[w]
+            pretrained_init += 1
+
+    "Total words %i (%i pretrained initialization)" % (
+        len(vocab.word_to_id), pretrained_init
+    )
+    return word_emb_matrix
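
Note: get_mask builds a float padding mask by comparing a position index against each sentence length. A small usage sketch; the config stub with only is_cuda is a hypothetical stand-in for the repo's real config object:

import torch
from model_utils import get_mask

class _Cfg:
    is_cuda = False  # hypothetical minimal config; the real one carries more fields

lengths = torch.tensor([3, 1, 2])
print(get_mask(lengths, _Cfg()))
# tensor([[1., 1., 1.],
#         [1., 0., 0.],
#         [1., 1., 0.]])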

neural_ner/process_training.py

Lines changed: 9 additions & 2 deletions

@@ -9,7 +9,7 @@

 from data_utils.batcher import DatasetConll2003
 from data_utils.vocab import Vocab
-from model import get_model, test_one_batch
+from model import get_model
 from train_utils import setup_train_dir, save_model, write_summary, \
     get_param_norm, get_grad_norm, Evaluter

@@ -22,6 +22,13 @@ def __init__(self, config, model_file_path):
         self.vocab = Vocab(config)
         self.model = get_model(self.vocab, config, model_file_path)

+    def test_one_batch(self, batch):
+        self.model.eval()
+        logits = self.model(batch)
+        lengths = batch['words_lens']
+        pred = self.model.predict(logits, lengths)
+        return logits, pred
+
     def train_one_batch(self, batch, optimizer, params):
         self.model.train()
         optimizer.zero_grad()
@@ -126,7 +133,7 @@ def evaluate(self, data_type, num_samples=None):
             s_lengths = batch['words_lens']
             y = batch['tags']

-            logits, pred = test_one_batch(batch, self.model)
+            logits, pred = self.test_one_batch(batch)
             loss = self.model.get_loss(logits, y, s_lengths)

             curr_batch_size = len(batch['raw_sentence'])
