Project Source
Config.py
# create a configuration
def get_config():
    return {
        "batch_size": 8,
        "num_epochs": 20,
        "lr": 10**-4,
        "seq_len": 350,
        "d_model": 512,
        "lang_src": "en",
        "lang_tgt": "it",
        "model_folder": "weights",
        "model_basename": "tmodel_",
        "preload": None,
        "tokenizer_file": "tokenizer_{0}.json",
        "experiment_name": "runs/tmodel_",
    }
# build the path of the weights file for a given epoch
def get_weights_file_path(config, epoch: str):
    model_folder = config["model_folder"]
    model_basename = config["model_basename"]
    model_filename = f"{model_basename}{epoch}.pt"
    return f"{model_folder}/{model_filename}"
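A quick check of the two helpers (hypothetical usage; the epoch string is just an example):

config = get_config()
print(get_weights_file_path(config, "05"))   # weights/tmodel_05.pt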
Dataset.py
from typing import Any

import torch
import torch.nn as nn
from torch.utils.data import Dataset
class BillingualDataset(Dataset):
    def __init__(self, dataset, tokenizer_src, tokenizer_tgt, src_lang, tgt_lang, seq_len):
        super().__init__()
self.dataset = dataset
self.tokenizer_src = tokenizer_src
self.tokenizer_tgt = tokenizer_tgt
self.src_lang = src_lang
self.tgt_lang = tgt_lang
self.seq_len = seq_len
self.sos_token = torch.tensor([
tokenizer_src.token_to_id('[SOS]')
], dtype=torch.int64)
self.eos_token = torch.tensor([
tokenizer_src.token_to_id('[EOS]')
], dtype=torch.int64)
self.pad_token = torch.tensor([
tokenizer_src.token_to_id('[PAD]')
], dtype=torch.int64)
def __len__(self):
return len(self.dataset)
def __getitem__(self, index) -> Any:
src_target_pair = self.dataset[index]
src_text = src_target_pair["translation"][self.src_lang]
tgt_text = src_target_pair["translation"][self.tgt_lang]
enc_input_tokens = self.tokenizer_src.encode(src_text).ids
dec_input_tokens = self.tokenizer_tgt.encode(tgt_text).ids
        # number of [PAD] tokens needed to reach seq_len
        enc_num_padding = self.seq_len - len(enc_input_tokens) - 2  # room for [SOS] and [EOS]
        dec_num_padding = self.seq_len - len(dec_input_tokens) - 1  # room for [SOS] only

        encoder_input = torch.cat([
            self.sos_token,
            torch.tensor(enc_input_tokens, dtype=torch.int64),
            self.eos_token,
            torch.tensor([self.pad_token.item()] * enc_num_padding, dtype=torch.int64),
        ])
        decoder_input = torch.cat([
            self.sos_token,
            torch.tensor(dec_input_tokens, dtype=torch.int64),
            torch.tensor([self.pad_token.item()] * dec_num_padding, dtype=torch.int64),
        ])
        label = torch.cat([
            torch.tensor(dec_input_tokens, dtype=torch.int64),
            self.eos_token,
            torch.tensor([self.pad_token.item()] * dec_num_padding, dtype=torch.int64),
        ])

        return {
            "encoder_input": encoder_input,   # (seq_len)
            "decoder_input": decoder_input,   # (seq_len)
            "encoder_mask": (encoder_input != self.pad_token).unsqueeze(0).unsqueeze(0).int(),          # (1, 1, seq_len)
            "decoder_mask": (decoder_input != self.pad_token).unsqueeze(0) & casual_mask(decoder_input.size(0)),  # (1, seq_len, seq_len)
            "label": label,                   # (seq_len)
            "src_text": src_text,
            "tgt_text": tgt_text,
        }

def casual_mask(size):
    # mask out future positions: True where the decoder is allowed to attend (diagonal and below)
    mask = torch.triu(torch.ones(1, size, size), diagonal=1).type(torch.int)
    return mask == 0
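To see what the mask does, printing it for a toy sequence of length 4 gives a lower-triangular pattern; True marks the positions the decoder may attend to:

print(casual_mask(4))
# tensor([[[ True, False, False, False],
#          [ True,  True, False, False],
#          [ True,  True,  True, False],
#          [ True,  True,  True,  True]]])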
Model.py
# import libraries
import torch
import math
import torch.nn as nn
class InputEmbeddings(nn.Module):
    def __init__(self, d_model: int, vocab_size: int) -> None:
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.embedding = nn.Embedding(vocab_size, d_model)

    def forward(self, x):
        # scale the embeddings by sqrt(d_model), as in the original Transformer paper
        return self.embedding(x) * math.sqrt(self.d_model)
class PositionalEncoding(nn.Module):
    def __init__(self, d_model: int, sen_len: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model
        self.sen_len = sen_len
        self.dropout = nn.Dropout(dropout)
        # formula: PE(pos, 2i) = sin(pos / 10000^(2i/d_model)), PE(pos, 2i+1) = cos(pos / 10000^(2i/d_model))
        pe = torch.zeros(sen_len, d_model)
        position = torch.arange(0, sen_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)  # (1, sen_len, d_model)
        self.register_buffer("pe", pe)

    def forward(self, x):
        # add the (fixed) positional encodings to the token embeddings
        x = x + self.pe[:, :x.shape[1], :]
        return self.dropout(x)
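A minimal sketch (not part of Model.py) to confirm the buffer shape and the broadcast in forward, assuming the class above:

pos = PositionalEncoding(d_model=512, sen_len=350, dropout=0.1)
x = torch.zeros(1, 10, 512)     # a dummy batch with 10 positions
print(pos.pe.shape)             # torch.Size([1, 350, 512])
print(pos(x).shape)             # torch.Size([1, 10, 512])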
class LayerNormalization(nn.Module):
    def __init__(self, eps: float = 1e-6) -> None:
        super().__init__()
        self.eps = eps
        self.alpha = nn.Parameter(torch.ones(1))  # multiplicative
        self.beta = nn.Parameter(torch.zeros(1))  # additive

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        std = x.std(dim=-1, keepdim=True)
        return self.alpha * (x - mean) / (std + self.eps) + self.beta
class FeedForwardBlock(nn.Module):
    def __init__(self, d_model: int, d_ff: int, dropout: float) -> None:
        super().__init__()
        self.linear_01 = nn.Linear(d_model, d_ff)
        self.dropout = nn.Dropout(dropout)
        self.linear_02 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        # (Batch, sen_len, d_model) --> (Batch, sen_len, d_ff) --> (Batch, sen_len, d_model)
        return self.linear_02(self.dropout(torch.relu(self.linear_01(x))))
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model: int, heads: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model
        self.heads = heads
        self.d_k = d_model // heads              # dimension of each head
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.w_o = nn.Linear(d_model, d_model)   # output projection
        self.dropout = nn.Dropout(dropout)

    @staticmethod
    def attention(query, key, value, mask, dropout: nn.Dropout):
        d_k = query.shape[-1]
        # scaled dot-product attention
        attention_scores = (query @ key.transpose(-2, -1)) / math.sqrt(d_k)
        if mask is not None:
            attention_scores.masked_fill_(mask == 0, -1e9)
        attention_scores = attention_scores.softmax(dim=-1)
        if dropout is not None:
            attention_scores = dropout(attention_scores)
        return (attention_scores @ value), attention_scores

    def forward(self, q, k, v, mask):
        query = self.w_q(q)
        key = self.w_k(k)
        value = self.w_v(v)
        # (Batch, sen_len, d_model) --> (Batch, sen_len, heads, d_k) --> (Batch, heads, sen_len, d_k)
        query = query.view(query.shape[0], query.shape[1], self.heads, self.d_k).transpose(1, 2)
        key = key.view(key.shape[0], key.shape[1], self.heads, self.d_k).transpose(1, 2)
        value = value.view(value.shape[0], value.shape[1], self.heads, self.d_k).transpose(1, 2)
        x, self.attention_scores = MultiHeadAttention.attention(query, key, value, mask, self.dropout)
        # (Batch, heads, sen_len, d_k) --> (Batch, sen_len, heads, d_k) --> (Batch, sen_len, d_model)
        x = x.transpose(1, 2).contiguous().view(x.shape[0], -1, self.heads * self.d_k)
        return self.w_o(x)
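A small shape check for the attention block (hypothetical tensors, not part of the project files):

mha = MultiHeadAttention(d_model=512, heads=8, dropout=0.1)
x = torch.rand(2, 10, 512)          # (Batch, sen_len, d_model)
out = mha(x, x, x, mask=None)       # self-attention
print(out.shape)                    # torch.Size([2, 10, 512])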
class ResidualConnection(nn.Module):
    def __init__(self, dropout: float) -> None:
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.norm = LayerNormalization()

    def forward(self, x, sublayer):
        # pre-norm residual: add the sublayer output back to its input
        return x + self.dropout(sublayer(self.norm(x)))
class EncoderBlock(nn.Module):
    def __init__(self, self_attention_block: MultiHeadAttention,
                 feed_forward_network: FeedForwardBlock, dropout: float):
        super().__init__()
        self.self_attention_block = self_attention_block
        self.feed_forward_block = feed_forward_network
        self.residual_connection = nn.ModuleList([ResidualConnection(dropout) for _ in range(2)])

    def forward(self, x, src_mask):
        x = self.residual_connection[0](x, lambda x: self.self_attention_block(x, x, x, src_mask))
        x = self.residual_connection[1](x, self.feed_forward_block)
        return x
class Encoder(nn.Module):
    def __init__(self, layers: nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization()

    def forward(self, x, mask):
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)
class DecoderBlock(nn.Module):
    def __init__(self, self_attention_block: MultiHeadAttention, cross_attention_block: MultiHeadAttention,
                 feed_forward_block: FeedForwardBlock, dropout: float) -> None:
        super().__init__()
        self.self_attention_block = self_attention_block
        self.cross_attention_block = cross_attention_block
        self.feed_forward_block = feed_forward_block
        self.residual_connections = nn.ModuleList([ResidualConnection(dropout) for _ in range(3)])

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        x = self.residual_connections[0](x, lambda x: self.self_attention_block(x, x, x, tgt_mask))
        x = self.residual_connections[1](x, lambda x: self.cross_attention_block(x, encoder_output, encoder_output, src_mask))
        x = self.residual_connections[2](x, self.feed_forward_block)
        return x
class Decoder(nn.Module):
    def __init__(self, layers: nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization()

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        for layer in self.layers:
            x = layer(x, encoder_output, src_mask, tgt_mask)
        return self.norm(x)
class ProjectionLayer(nn.Module):
    def __init__(self, d_model: int, vocab_size: int) -> None:
        super().__init__()
        self.linear = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        # (Batch, sen_len, d_model) --> (Batch, sen_len, vocab_size)
        return torch.log_softmax(self.linear(x), dim=-1)

class Transformer(nn.Module):
    def __init__(self, encoder: Encoder, decoder: Decoder, src_embed: InputEmbeddings, tgt_embed: InputEmbeddings,
                 src_pos: PositionalEncoding, tgt_pos: PositionalEncoding, proj: ProjectionLayer) -> None:
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_embed = src_embed
        self.tgt_embed = tgt_embed
        self.src_pos = src_pos
        self.tgt_pos = tgt_pos
        self.projection = proj

    def encode(self, src, src_mask):
        src = self.src_embed(src)
        src = self.src_pos(src)
        return self.encoder(src, src_mask)

    def decode(self, encoder_output, src_mask, tgt, tgt_mask):
        tgt = self.tgt_embed(tgt)
        tgt = self.tgt_pos(tgt)
        return self.decoder(tgt, encoder_output, src_mask, tgt_mask)

    def project(self, x):
        return self.projection(x)
def build_transformer(src_vocab_size: int, tgt_vocab_size: int, src_sen_len: int, tgt_sen_len: int,
                      d_model: int = 512, N: int = 6, heads: int = 8, dropout: float = 0.1, d_ff: int = 2048) -> Transformer:
    # embeddings and positional encodings for both languages
    src_embed = InputEmbeddings(d_model, src_vocab_size)
    tgt_embed = InputEmbeddings(d_model, tgt_vocab_size)
    src_pos = PositionalEncoding(d_model, src_sen_len, dropout)
    tgt_pos = PositionalEncoding(d_model, tgt_sen_len, dropout)

    encoder_blocks = []
    for _ in range(N):
        encoder_block = EncoderBlock(MultiHeadAttention(d_model, heads, dropout),
                                     FeedForwardBlock(d_model, d_ff, dropout), dropout)
        encoder_blocks.append(encoder_block)

    decoder_blocks = []
    for _ in range(N):
        decoder_block = DecoderBlock(MultiHeadAttention(d_model, heads, dropout),
                                     MultiHeadAttention(d_model, heads, dropout),
                                     FeedForwardBlock(d_model, d_ff, dropout), dropout)
        decoder_blocks.append(decoder_block)

    encoder = Encoder(nn.ModuleList(encoder_blocks))
    decoder = Decoder(nn.ModuleList(decoder_blocks))
    transformer = Transformer(encoder, decoder, src_embed, tgt_embed, src_pos, tgt_pos,
                              ProjectionLayer(d_model, tgt_vocab_size))

    # Xavier-initialize all weight matrices
    for p in transformer.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return transformer
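A hypothetical smoke test for build_transformer; the vocabulary sizes here are made up:

model = build_transformer(src_vocab_size=20000, tgt_vocab_size=22000,
                          src_sen_len=350, tgt_sen_len=350)
total_params = sum(p.numel() for p in model.parameters())
print(f"{total_params:,} trainable parameters")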
Train.py
import torch
import warnings
import torch.nn as nn
from pathlib import Path
from tqdm import tqdm
from torch.utils.data import DataLoader, random_split
from torch.utils.tensorboard import SummaryWriter
from datasets import load_dataset            # Hugging Face datasets (assumed corpus loader)
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.trainers import WordLevelTrainer
from tokenizers.pre_tokenizers import Whitespace

# local modules, named after the files in this write-up
from Config import get_config, get_weights_file_path
from Dataset import BillingualDataset
from Model import build_transformer
def get_all_sentences(dataset, lang):
    for item in dataset:
        yield item['translation'][lang]
def get_or_build_tokenizer(config, dataset, lang):
    # tokenizer path
    tokenizer_path = Path(config["tokenizer_file"].format(lang))
    if not Path.exists(tokenizer_path):
        # train a word-level tokenizer on the raw sentences
        tokenizer = Tokenizer(WordLevel(unk_token='[UNK]'))
        tokenizer.pre_tokenizer = Whitespace()
        trainer = WordLevelTrainer(special_tokens=['[UNK]', '[PAD]', '[SOS]', '[EOS]'], min_frequency=2)
        tokenizer.train_from_iterator(get_all_sentences(dataset, lang), trainer=trainer)
        tokenizer.save(str(tokenizer_path))
    else:
        tokenizer = Tokenizer.from_file(str(tokenizer_path))
    return tokenizer
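A sketch of how the tokenizer helper is used, assuming config and raw_dataset come from get_config and get_dataset; the printed ids are only indicative:

tokenizer = get_or_build_tokenizer(config, raw_dataset, config["lang_src"])
ids = tokenizer.encode("the cat sat on the mat").ids
print(ids)                              # word-level token ids, e.g. [14, 203, 1005, 25, 14, 3311]
print(tokenizer.token_to_id("[PAD]"))   # id of the padding token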
def get_dataset(config):
    # the corpus name is an assumption here; swap in the translation dataset the project actually uses
    raw_dataset = load_dataset("opus_books", f'{config["lang_src"]}-{config["lang_tgt"]}', split="train")

    tokenizer_src = get_or_build_tokenizer(config, raw_dataset, config["lang_src"])
    tokenizer_tgt = get_or_build_tokenizer(config, raw_dataset, config["lang_tgt"])

    # keep 90% of the data for training and 10% for validation
    train_ds_size = int(0.9 * len(raw_dataset))
    val_ds_size = len(raw_dataset) - train_ds_size
    train_ds_raw, val_ds_raw = random_split(raw_dataset, [train_ds_size, val_ds_size])

    train_ds = BillingualDataset(train_ds_raw, tokenizer_src, tokenizer_tgt,
                                 config["lang_src"], config["lang_tgt"], config["seq_len"])
    val_ds = BillingualDataset(val_ds_raw, tokenizer_src, tokenizer_tgt,
                               config["lang_src"], config["lang_tgt"], config["seq_len"])

    # track the longest sentence on each side (useful for choosing seq_len)
    max_len_src = 0
    max_len_tgt = 0
    for item in raw_dataset:
        src_ids = tokenizer_src.encode(item["translation"][config["lang_src"]]).ids
        tgt_ids = tokenizer_tgt.encode(item["translation"][config["lang_tgt"]]).ids
        max_len_src = max(max_len_src, len(src_ids))
        max_len_tgt = max(max_len_tgt, len(tgt_ids))
    print(f"Max length of source sentences: {max_len_src}")
    print(f"Max length of target sentences: {max_len_tgt}")

    train_dataloader = DataLoader(train_ds, batch_size=config["batch_size"], shuffle=True)
    val_dataloader = DataLoader(val_ds, batch_size=1, shuffle=True)
    return train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt
def get_model(config, vocab_src_len, vocab_tgt_len):
    model = build_transformer(vocab_src_len, vocab_tgt_len, config["seq_len"], config["seq_len"], config["d_model"])
    return model
def train_model(config):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device {device}")
    Path(config["model_folder"]).mkdir(parents=True, exist_ok=True)

    train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt = get_dataset(config)
    model = get_model(config, tokenizer_src.get_vocab_size(), tokenizer_tgt.get_vocab_size())
    model = model.to(device)
    # Tensorboard
    writer = SummaryWriter(config["experiment_name"])
    optimizer = torch.optim.Adam(model.parameters(), lr=config["lr"], eps=1e-9)

    initial_epoch = 0
    global_step = 0
    if config["preload"]:
        # resume from a saved checkpoint
        model_filename = get_weights_file_path(config, config["preload"])
        state = torch.load(model_filename)
        initial_epoch = state["epoch"] + 1
        model.load_state_dict(state["model_state_dict"])
        optimizer.load_state_dict(state["optimizer_state_dict"])
        global_step = state["global_step"]

    loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer_src.token_to_id("[PAD]"),
                                  label_smoothing=0.1).to(device)

    for epoch in range(initial_epoch, config["num_epochs"]):
        model.train()
        batch_iterator = tqdm(train_dataloader, desc=f"Processing epoch {epoch:02d}")
        for batch in batch_iterator:
            encoder_input = batch["encoder_input"].to(device)  # (Batch, seq_len)
            decoder_input = batch["decoder_input"].to(device)  # (Batch, seq_len)
            encoder_mask = batch["encoder_mask"].to(device)    # (Batch, 1, 1, seq_len)
            decoder_mask = batch["decoder_mask"].to(device)    # (Batch, 1, seq_len, seq_len)
            label = batch["label"].to(device)                  # (Batch, seq_len)

            # forward pass through encoder, decoder and projection
            encoder_output = model.encode(encoder_input, encoder_mask)
            decoder_output = model.decode(encoder_output, encoder_mask, decoder_input, decoder_mask)
            proj_output = model.project(decoder_output)        # (Batch, seq_len, tgt_vocab_size)

            loss = loss_fn(proj_output.view(-1, tokenizer_tgt.get_vocab_size()), label.view(-1))
            batch_iterator.set_postfix({"loss": f"{loss.item():6.3f}"})
            writer.add_scalar("train loss", loss.item(), global_step)
            writer.flush()

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            global_step += 1

        # save a checkpoint at the end of every epoch
        model_filename = get_weights_file_path(config, f"{epoch:02d}")
        torch.save(
            {
                "epoch": epoch,
                "model_state_dict": model.state_dict(),
                "optimizer_state_dict": optimizer.state_dict(),
                "global_step": global_step
            },
            model_filename
        )
if __name__ == "__main__":
warnings.filterwarnings("ignore")
config = get_config()
train_model(config)
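Since every checkpoint stores the epoch, the model state, the optimizer state and the global step, training can be resumed by setting preload before calling train_model; a minimal sketch with a hypothetical epoch number:

config = get_config()
config["preload"] = "05"   # loads weights/tmodel_05.pt and continues from epoch 6
train_model(config)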
Project Structure