 raw_data_path = 'data/train.txt'
 tokenized_data_path = 'data/tokenized/'
-raw = True  # whether to build the tokenized dataset from scratch
+raw = False  # whether to build the tokenized dataset from scratch
 epochs = 5
 batch_size = 12
 lr = 1.5e-4
 warmup_steps = 2000
-log_step = 250
+log_step = 1
 stride = 768
 gradient_accumulation = 1
 fp16 = False  # do not enable this on GPUs that do not support half precision
@@ -67,12 +67,19 @@ def main():
     model.to(device)
     multi_gpu = False
     total_tokens = 0
+    full_line = ''
     print('calculating total steps')
     for i in tqdm(range(num_pieces)):
         with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f:
-            total_tokens += len(f.read().split())
-    num_chunks = total_tokens // stride
-    total_steps = int(num_chunks * epochs / batch_size / gradient_accumulation)
+            full_line += f.read()
+    full_line = [int(item) for item in full_line.split()]
+    len_full_line = len(full_line)
+    samples = []
+    start_point = 0
+    while start_point + n_ctx < len_full_line:
+        samples.append(full_line[start_point: start_point + n_ctx])
+        start_point += stride
+    total_steps = int(len(samples) * epochs / batch_size / gradient_accumulation)
     print('total steps = {}'.format(total_steps))
     optimizer = pytorch_transformers.AdamW(model.parameters(), lr=lr, correct_bias=True)
     scheduler = pytorch_transformers.WarmupLinearSchedule(optimizer, warmup_steps=warmup_steps,
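
The old code estimated steps from a token count (`total_tokens // stride`) and later re-chunked each tokenized piece separately inside the epoch loop; the new code concatenates all pieces once, slides an `n_ctx`-wide window over the full token stream every `stride` tokens, and derives `total_steps` from the samples it will actually train on. A minimal standalone sketch of that counting logic (the helper name and the example numbers are illustrative, not part of the commit):

```python
# Illustrative only (not part of the commit): how many n_ctx-sized windows the
# while-loop above produces for a given token count and stride.
def count_windows(num_tokens, n_ctx, stride):
    windows = 0
    start = 0
    while start + n_ctx < num_tokens:
        windows += 1
        start += stride
    return windows

# e.g. 1,000,000 tokens with n_ctx = 1024 and stride = 768 -> 1301 windows,
# so total_steps = 1301 * epochs / batch_size / gradient_accumulation.
print(count_windows(1_000_000, 1024, 768))
```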
@@ -93,67 +100,53 @@ def main():
         print('epoch {}'.format(epoch + 1))
         now = datetime.now()
         print('time: {}'.format(now))
-        x = np.linspace(0, num_pieces - 1, num_pieces, dtype=np.int32)
-        random.shuffle(x)
-        piece_num = 0
-        for i, j in enumerate(x):
-            with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(j), 'r') as f:
+        running_loss = 0
+        random.shuffle(samples)
+        for step in range(len(samples) // batch_size):
+
+            # prepare data
+            batch = samples[step * batch_size: (step + 1) * batch_size]
+            batch_labels = []
+            batch_inputs = []
+            for ids in batch:
+                int_ids_for_labels = [int(x) for x in ids]
+                int_ids_for_inputs = [int(x) for x in ids]
+                batch_labels.append(int_ids_for_labels)
+                batch_inputs.append(int_ids_for_inputs)
+            batch_labels = torch.tensor(batch_labels).long().to(device)
+            batch_inputs = torch.tensor(batch_inputs).long().to(device)
+
+            # forward pass
+            outputs = model.forward(input_ids=batch_inputs, labels=batch_labels)
+            loss, logits = outputs[:2]
+
+            # get loss
+            if multi_gpu:
+                loss = loss.mean()
+            if gradient_accumulation > 1:
+                loss = loss / gradient_accumulation
+
+            # loss backward
+            if fp16:
+                with amp.scale_loss(loss, optimizer) as scaled_loss:
+                    scaled_loss.backward()
+                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_grad_norm)
+            else:
+                loss.backward()
+                torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
+
+            # optimizer step
+            if (step + 1) % gradient_accumulation == 0:
+                running_loss += loss.item()
+                scheduler.step()
+                optimizer.step()
+                optimizer.zero_grad()
+            if (step + 1) % log_step == 0:
+                print('step {} of epoch {}, loss {}'.format(
+                    (step + 1) // gradient_accumulation,
+                    epoch + 1,
+                    running_loss * gradient_accumulation ** 2 / log_step))
                 running_loss = 0
-                line = f.read()
-                tokens = line.split()
-                tokens = [int(token) for token in tokens]
-                start_point = 0
-                chunks = []
-                while start_point < len(tokens) - n_ctx:
-                    chunks.append(tokens[start_point: start_point + n_ctx])
-                    start_point += stride
-                random.shuffle(chunks)
-                for step in range(len(chunks) // batch_size):
-
-                    # prepare data
-                    batch = chunks[step * batch_size: (step + 1) * batch_size]
-                    batch_labels = []
-                    batch_inputs = []
-                    for ids in batch:
-                        int_ids_for_labels = [int(x) for x in ids]
-                        int_ids_for_inputs = [int(x) for x in ids]
-                        batch_labels.append(int_ids_for_labels)
-                        batch_inputs.append(int_ids_for_inputs)
-                    batch_labels = torch.tensor(batch_labels).long().to(device)
-                    batch_inputs = torch.tensor(batch_inputs).long().to(device)
-
-                    # forward pass
-                    outputs = model.forward(input_ids=batch_inputs, labels=batch_labels)
-                    loss, logits = outputs[:2]
-
-                    # get loss
-                    if multi_gpu:
-                        loss = loss.mean()
-                    if gradient_accumulation > 1:
-                        loss = loss / gradient_accumulation
-
-                    # loss backward
-                    if fp16:
-                        with amp.scale_loss(loss, optimizer) as scaled_loss:
-                            scaled_loss.backward()
-                            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_grad_norm)
-                    else:
-                        loss.backward()
-                        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
-
-                    # optimizer step
-                    if (step + 1) % gradient_accumulation == 0:
-                        running_loss += loss.item()
-                        scheduler.step()
-                        optimizer.step()
-                        optimizer.zero_grad()
-                    if (step + 1) % log_step == 0:
-                        print('step {} of piece {} of epoch {}, loss {}'.format(
-                            (step + 1) // gradient_accumulation,
-                            piece_num, epoch + 1,
-                            running_loss * gradient_accumulation ** 2 / log_step))
-                        running_loss = 0
-            piece_num += 1
 
         print('saving model for epoch {}'.format(epoch + 1))
         if not os.path.exists('./model/model_epoch{}'.format(epoch + 1)):
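
For context on what each step of the new loop does: the samples are plain lists of token ids, and `labels` is passed identical to `input_ids` because the GPT-2 LM head in `pytorch_transformers` shifts the targets internally and returns `(loss, logits, ...)` when labels are given. A minimal, self-contained sketch under that assumption (tiny hypothetical config, random ids, no scheduler or fp16):

```python
# Minimal sketch of one training step, assuming pytorch_transformers' GPT-2 API.
# The tiny config and random ids below are illustrative, not the project's setup.
import torch
from pytorch_transformers import GPT2Config, GPT2LMHeadModel, AdamW

config = GPT2Config(n_positions=128, n_ctx=128, n_embd=64, n_layer=2, n_head=2)
model = GPT2LMHeadModel(config)
optimizer = AdamW(model.parameters(), lr=1.5e-4, correct_bias=True)

# two fake "samples" of 32 token ids each, standing in for n_ctx-sized windows
batch_inputs = torch.randint(0, config.vocab_size, (2, 32)).long()
batch_labels = batch_inputs.clone()   # labels == inputs for LM training

outputs = model(input_ids=batch_inputs, labels=batch_labels)
loss, logits = outputs[:2]            # the model shifts labels internally
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
optimizer.step()
optimizer.zero_grad()
print(loss.item())
```

Note also the logging line: with this commit's `log_step = 1` and `gradient_accumulation = 1`, `running_loss * gradient_accumulation ** 2 / log_step` reduces to the raw loss of the most recent step; the extra factors only matter when accumulation or less frequent logging is turned back on.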