
Building a Plain Seq2Seq Model for Language Translation

by Josh
July 29, 2025
in AI, Analytics and Automation


import random
import os
import re
import unicodedata
import zipfile

import requests
import torch
import torch.nn as nn
import torch.optim as optim
import tokenizers
import tqdm

#
# Data preparation
#

# Download the dataset provided by Anki (https://www.manythings.org/anki/) with requests
if not os.path.exists("fra-eng.zip"):
    url = "http://storage.googleapis.com/download.tensorflow.org/data/fra-eng.zip"
    response = requests.get(url)
    with open("fra-eng.zip", "wb") as f:
        f.write(response.content)

# Normalize text
# Each line of the file is in the format "<english>\t<french>"
# We convert the text to lowercase and normalize unicode (NFKC)
def normalize(line):
    """Normalize a line of text and split into two at the tab character"""
    line = unicodedata.normalize("NFKC", line.strip().lower())
    eng, fra = line.split("\t")
    return eng.lower().strip(), fra.lower().strip()

text_pairs = []
with zipfile.ZipFile("fra-eng.zip", "r") as zip_ref:
    for line in zip_ref.read("fra.txt").decode("utf-8").splitlines():
        eng, fra = normalize(line)
        text_pairs.append((eng, fra))
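At this point a quick sanity check that the pairs look as expected is useful (this check is an addition, not part of the original listing; the exact pair printed depends on the line chosen):

# Sanity check (added for illustration): count the pairs and peek at one
print(f"Loaded {len(text_pairs)} sentence pairs")
print(random.choice(text_pairs))  # e.g. something like ('go.', 'va !')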

 

#
# Tokenization with BPE
#

if os.path.exists("en_tokenizer.json") and os.path.exists("fr_tokenizer.json"):
    en_tokenizer = tokenizers.Tokenizer.from_file("en_tokenizer.json")
    fr_tokenizer = tokenizers.Tokenizer.from_file("fr_tokenizer.json")
else:
    en_tokenizer = tokenizers.Tokenizer(tokenizers.models.BPE())
    fr_tokenizer = tokenizers.Tokenizer(tokenizers.models.BPE())

    # Configure the pre-tokenizer to split on whitespace and punctuation,
    # adding a space at the beginning of the sentence
    en_tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.ByteLevel(add_prefix_space=True)
    fr_tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.ByteLevel(add_prefix_space=True)

    # Configure the decoder so that the word boundary symbol "Ġ" is removed
    en_tokenizer.decoder = tokenizers.decoders.ByteLevel()
    fr_tokenizer.decoder = tokenizers.decoders.ByteLevel()

    # Train BPE for English and French using the same trainer
    VOCAB_SIZE = 8000
    trainer = tokenizers.trainers.BpeTrainer(
        vocab_size=VOCAB_SIZE,
        special_tokens=["[start]", "[end]", "[pad]"],
        show_progress=True
    )
    en_tokenizer.train_from_iterator([x[0] for x in text_pairs], trainer=trainer)
    fr_tokenizer.train_from_iterator([x[1] for x in text_pairs], trainer=trainer)

    en_tokenizer.enable_padding(pad_id=en_tokenizer.token_to_id("[pad]"), pad_token="[pad]")
    fr_tokenizer.enable_padding(pad_id=fr_tokenizer.token_to_id("[pad]"), pad_token="[pad]")

    # Save the trained tokenizers (the padding settings are saved with them)
    en_tokenizer.save("en_tokenizer.json", pretty=True)
    fr_tokenizer.save("fr_tokenizer.json", pretty=True)

 

# Test the tokenizers
print("Sample tokenization:")
en_sample, fr_sample = random.choice(text_pairs)
encoded = en_tokenizer.encode(en_sample)
print(f"Original: {en_sample}")
print(f"Tokens: {encoded.tokens}")
print(f"IDs: {encoded.ids}")
print(f"Decoded: {en_tokenizer.decode(encoded.ids)}")
print()

encoded = fr_tokenizer.encode("[start] " + fr_sample + " [end]")
print(f"Original: {fr_sample}")
print(f"Tokens: {encoded.tokens}")
print(f"IDs: {encoded.ids}")
print(f"Decoded: {fr_tokenizer.decode(encoded.ids)}")
print()
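Because enable_padding was configured before the tokenizers were saved, encode_batch pads every sequence in a batch to the length of its longest member, which is what keeps the collate function defined later so simple. A quick check (an added illustration, not part of the original; the sentences are arbitrary):

# Padding check (added for illustration): encode_batch pads to the longest member
batch = en_tokenizer.encode_batch(["the weather is nice today", "hi"])
print([len(enc.ids) for enc in batch])  # both lengths are equal
print(batch[1].tokens)                  # the short sentence ends in [pad] tokens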

 

#
# Create a PyTorch dataset for the BPE-encoded translation pairs
#

class TranslationDataset(torch.utils.data.Dataset):
    def __init__(self, text_pairs):
        self.text_pairs = text_pairs

    def __len__(self):
        return len(self.text_pairs)

    def __getitem__(self, idx):
        eng, fra = self.text_pairs[idx]
        return eng, "[start] " + fra + " [end]"


def collate_fn(batch):
    """Tokenize a batch of string pairs into two padded tensors of token IDs"""
    en_str, fr_str = zip(*batch)
    en_enc = en_tokenizer.encode_batch(en_str, add_special_tokens=True)
    fr_enc = fr_tokenizer.encode_batch(fr_str, add_special_tokens=True)
    en_ids = [enc.ids for enc in en_enc]
    fr_ids = [enc.ids for enc in fr_enc]
    return torch.tensor(en_ids), torch.tensor(fr_ids)


BATCH_SIZE = 32
dataset = TranslationDataset(text_pairs)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

# Test the dataset
for en_ids, fr_ids in dataloader:
    print(f"English: {en_ids}")
    print(f"French: {fr_ids}")
    break
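Each batch is a pair of integer tensors whose second dimension varies from batch to batch, because padding only extends to the longest sentence within that batch. A quick shape check (added, not in the original):

# Shape check (added for illustration)
print(en_ids.shape)  # e.g. torch.Size([32, 9]); the second dim varies per batch
print(fr_ids.shape)  # e.g. torch.Size([32, 12])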

 

#
# Create an LSTM seq2seq model for translation
#

class EncoderLSTM(nn.Module):
    """A stacked LSTM encoder with an embedding layer"""

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers=1, dropout=0.1):
        """
        A plain LSTM is used. No bidirectional LSTM.

        Args:
            vocab_size: The size of the input vocabulary
            embedding_dim: The dimension of the embedding vectors
            hidden_dim: The dimension of the hidden state
            num_layers: The number of recurrent layers (layers of the stacked LSTM)
            dropout: The dropout rate, applied to all LSTM layers except the last one
        """
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers,
                            batch_first=True, dropout=dropout if num_layers > 1 else 0)

    def forward(self, input_seq):
        # input_seq = [batch_size, seq_len] -> embedded = [batch_size, seq_len, embedding_dim]
        embedded = self.embedding(input_seq)
        # outputs = [batch_size, seq_len, hidden_dim]
        # hidden = cell = [num_layers, batch_size, hidden_dim]
        outputs, (hidden, cell) = self.lstm(embedded)
        return outputs, hidden, cell


class DecoderLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers=1, dropout=0.1):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers,
                            batch_first=True, dropout=dropout if num_layers > 1 else 0)
        # project the LSTM output (hidden_dim) onto the vocabulary
        self.out = nn.Linear(hidden_dim, vocab_size)

    def forward(self, input_seq, hidden, cell):
        # input_seq = [batch_size, seq_len] -> embedded = [batch_size, seq_len, embedding_dim]
        # hidden = cell = [num_layers, batch_size, hidden_dim]
        embedded = self.embedding(input_seq)
        # output = [batch_size, seq_len, hidden_dim]
        output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        prediction = self.out(output)
        return prediction, hidden, cell


class Seq2SeqLSTM(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input_seq, target_seq):
        """Given the partial target sequence, predict the next token"""
        # input_seq = [batch_size, seq_len]
        # target_seq = [batch_size, seq_len]
        batch_size, target_len = target_seq.shape
        # for storing the output logits
        outputs = []
        # encoder forward pass
        _enc_out, hidden, cell = self.encoder(input_seq)
        # start decoding from the [start] token
        dec_in = target_seq[:, :1]
        # decoder forward pass, one token at a time
        for t in range(target_len - 1):
            # last token and hidden states -> next token
            pred, hidden, cell = self.decoder(dec_in, hidden, cell)
            # store the prediction for this position
            pred = pred[:, -1:, :]
            outputs.append(pred)
            # feed only the newest predicted token back in;
            # the hidden state already carries the rest of the prefix
            dec_in = pred.argmax(dim=2)
        outputs = torch.cat(outputs, dim=1)
        return outputs
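Note that this forward pass is free-running: the decoder consumes its own argmax predictions even during training. A common alternative is teacher forcing, where the ground-truth target tokens are fed as decoder input so every position is predicted in a single LSTM pass. A minimal sketch of that variant (the subclass name is hypothetical; it is not part of the original model, but its output lines up with the same loss computation used in the training loop below):

# Teacher-forcing variant of the forward pass (a sketch, not in the original)
class Seq2SeqLSTMTeacherForcing(Seq2SeqLSTM):
    def forward(self, input_seq, target_seq):
        # encode once, then run the decoder over the whole shifted target in one pass
        _enc_out, hidden, cell = self.encoder(input_seq)
        dec_in = target_seq[:, :-1]  # every target token except the final one
        pred, _hidden, _cell = self.decoder(dec_in, hidden, cell)
        return pred  # logits aligned with target positions 1..target_len-1

Teacher forcing usually converges faster, since mistakes made early in training are not fed back into the decoder.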

 

 

# Initialize model parameters
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
enc_vocab = len(en_tokenizer.get_vocab())
dec_vocab = len(fr_tokenizer.get_vocab())
emb_dim = 256
hidden_dim = 256
num_layers = 2
dropout = 0.1

# Create model
encoder = EncoderLSTM(enc_vocab, emb_dim, hidden_dim, num_layers, dropout).to(device)
decoder = DecoderLSTM(dec_vocab, emb_dim, hidden_dim, num_layers, dropout).to(device)
model = Seq2SeqLSTM(encoder, decoder).to(device)
print(model)

print("Model created with:")
print(f"  Input vocabulary size: {enc_vocab}")
print(f"  Output vocabulary size: {dec_vocab}")
print(f"  Embedding dimension: {emb_dim}")
print(f"  Hidden dimension: {hidden_dim}")
print(f"  Number of layers: {num_layers}")
print(f"  Dropout: {dropout}")
print(f"  Total parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad)}")
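As a rough check on that count (assuming both vocabularies reach the full 8,000 entries, which BPE training may undershoot slightly): each embedding table is 8000 x 256, about 2.05M weights; each 2-layer LSTM contributes 2 x 4 x (256*256 + 256*256 + 2*256), about 1.05M; and the decoder's output projection adds 256 x 8000 + 8000, about 2.06M. That totals roughly 8.3M parameters across encoder and decoder.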

 

# Train unless seq2seq.pth already exists
if os.path.exists("seq2seq.pth"):
    model.load_state_dict(torch.load("seq2seq.pth"))
else:
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    loss_fn = nn.CrossEntropyLoss(ignore_index=fr_tokenizer.token_to_id("[pad]"))
    N_EPOCHS = 30

    for epoch in range(N_EPOCHS):
        model.train()
        epoch_loss = 0
        for en_ids, fr_ids in tqdm.tqdm(dataloader, desc="Training"):
            # Move the "sentences" to the device
            en_ids = en_ids.to(device)
            fr_ids = fr_ids.to(device)
            # zero the gradients, then forward pass
            optimizer.zero_grad()
            outputs = model(en_ids, fr_ids)
            # compute the loss: flatten the 3D logits and the 2D targets
            loss = loss_fn(outputs.reshape(-1, dec_vocab), fr_ids[:, 1:].reshape(-1))
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        print(f"Epoch {epoch+1}/{N_EPOCHS}; Avg loss {epoch_loss/len(dataloader)}; Latest loss {loss.item()}")
        torch.save(model.state_dict(), f"seq2seq-epoch-{epoch+1}.pth")
        # Evaluate every 5 epochs
        if (epoch+1) % 5 != 0:
            continue
        model.eval()
        epoch_loss = 0
        with torch.no_grad():
            for en_ids, fr_ids in tqdm.tqdm(dataloader, desc="Evaluating"):
                en_ids = en_ids.to(device)
                fr_ids = fr_ids.to(device)
                outputs = model(en_ids, fr_ids)
                loss = loss_fn(outputs.reshape(-1, dec_vocab), fr_ids[:, 1:].reshape(-1))
                epoch_loss += loss.item()
        print(f"Eval loss: {epoch_loss/len(dataloader)}")

    # Save the final model
    torch.save(model.state_dict(), "seq2seq.pth")
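One common refinement for recurrent models, not used in the loop above, is gradient clipping, which guards against the exploding gradients LSTMs are prone to. It would slot in between the backward pass and the optimizer step:

# Optional gradient clipping (an addition; not in the original training loop)
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # cap the global grad norm
optimizer.step()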

 

# Test on a few samples
model.eval()
N_SAMPLES = 5
MAX_LEN = 60
with torch.no_grad():
    start_id = fr_tokenizer.token_to_id("[start]")
    end_id = fr_tokenizer.token_to_id("[end]")
    for en, true_fr in random.sample(text_pairs, N_SAMPLES):
        en_ids = torch.tensor(en_tokenizer.encode(en).ids).unsqueeze(0).to(device)
        _output, hidden, cell = model.encoder(en_ids)
        pred_ids = [start_id]
        for _ in range(MAX_LEN):
            # feed only the latest token; the hidden state carries the rest of the prefix
            decoder_input = torch.tensor([[pred_ids[-1]]]).to(device)
            output, hidden, cell = model.decoder(decoder_input, hidden, cell)
            output = output[:, -1, :].argmax(dim=1)
            pred_ids.append(output.item())
            # stop early if the predicted token is the end token
            if pred_ids[-1] == end_id:
                break
        # Decode the predicted IDs
        pred_fr = fr_tokenizer.decode(pred_ids)
        print(f"English: {en}")
        print(f"French: {true_fr}")
        print(f"Predicted: {pred_fr}")
        print()
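Greedy argmax decoding always picks the single most likely token, so the output is deterministic. If varied translations are wanted, a temperature-sampling step can replace the argmax (a sketch; last_logits is a hypothetical name standing for the [1, vocab] logits of the final decoder step):

# Sampling instead of argmax (a sketch, not in the original)
temperature = 0.8  # below 1 sharpens the distribution, above 1 flattens it
probs = torch.softmax(last_logits / temperature, dim=-1)  # last_logits: hypothetical
next_id = torch.multinomial(probs, num_samples=1).item()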


