
Building a Seq2Seq Model with Attention for Language Translation

by Josh
August 16, 2025
in AI, Analytics and Automation


import random
import os
import re
import unicodedata
import zipfile

import matplotlib.pyplot as plt
import numpy as np
import requests
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import tokenizers
import tqdm

 

 

#
# Data preparation
#

# Download dataset provided by Anki: https://www.manythings.org/anki/ with requests
if not os.path.exists("fra-eng.zip"):
    url = "http://storage.googleapis.com/download.tensorflow.org/data/fra-eng.zip"
    response = requests.get(url)
    with open("fra-eng.zip", "wb") as f:
        f.write(response.content)

# Normalize text
# each line of the file is in the format "<english>\t<french>"
# We convert text to lowercase and normalize unicode (NFKC)
def normalize(line):
    """Normalize a line of text and split into two at the tab character"""
    line = unicodedata.normalize("NFKC", line.strip().lower())
    eng, fra = line.split("\t")
    return eng.lower().strip(), fra.lower().strip()

text_pairs = []
with zipfile.ZipFile("fra-eng.zip", "r") as zip_ref:
    for line in zip_ref.read("fra.txt").decode("utf-8").splitlines():
        eng, fra = normalize(line)
        text_pairs.append((eng, fra))

 

#
# Tokenization with BPE
#

if os.path.exists("en_tokenizer.json") and os.path.exists("fr_tokenizer.json"):
    en_tokenizer = tokenizers.Tokenizer.from_file("en_tokenizer.json")
    fr_tokenizer = tokenizers.Tokenizer.from_file("fr_tokenizer.json")
else:
    en_tokenizer = tokenizers.Tokenizer(tokenizers.models.BPE())
    fr_tokenizer = tokenizers.Tokenizer(tokenizers.models.BPE())

    # Configure pre-tokenizer to split on whitespace and punctuation, add space at beginning of the sentence
    en_tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.ByteLevel(add_prefix_space=True)
    fr_tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.ByteLevel(add_prefix_space=True)

    # Configure decoder: so that the word boundary symbol "Ġ" will be removed
    en_tokenizer.decoder = tokenizers.decoders.ByteLevel()
    fr_tokenizer.decoder = tokenizers.decoders.ByteLevel()

    # Train BPE for English and French using the same trainer
    VOCAB_SIZE = 8000
    trainer = tokenizers.trainers.BpeTrainer(
        vocab_size=VOCAB_SIZE,
        special_tokens=["[start]", "[end]", "[pad]"],
        show_progress=True
    )
    en_tokenizer.train_from_iterator([x[0] for x in text_pairs], trainer=trainer)
    fr_tokenizer.train_from_iterator([x[1] for x in text_pairs], trainer=trainer)

    en_tokenizer.enable_padding(pad_id=en_tokenizer.token_to_id("[pad]"), pad_token="[pad]")
    fr_tokenizer.enable_padding(pad_id=fr_tokenizer.token_to_id("[pad]"), pad_token="[pad]")

    # Save the trained tokenizers
    en_tokenizer.save("en_tokenizer.json", pretty=True)
    fr_tokenizer.save("fr_tokenizer.json", pretty=True)

# Test the tokenizer
print("Sample tokenization:")
en_sample, fr_sample = random.choice(text_pairs)
encoded = en_tokenizer.encode(en_sample)
print(f"Original: {en_sample}")
print(f"Tokens: {encoded.tokens}")
print(f"IDs: {encoded.ids}")
print(f"Decoded: {en_tokenizer.decode(encoded.ids)}")
print()

encoded = fr_tokenizer.encode("[start] " + fr_sample + " [end]")
print(f"Original: {fr_sample}")
print(f"Tokens: {encoded.tokens}")
print(f"IDs: {encoded.ids}")
print(f"Decoded: {fr_tokenizer.decode(encoded.ids)}")
print()

 

#
# Create PyTorch dataset for the BPE-encoded translation pairs
#

class TranslationDataset(torch.utils.data.Dataset):
    def __init__(self, text_pairs):
        self.text_pairs = text_pairs

    def __len__(self):
        return len(self.text_pairs)

    def __getitem__(self, idx):
        eng, fra = self.text_pairs[idx]
        return eng, "[start] " + fra + " [end]"


def collate_fn(batch):
    en_str, fr_str = zip(*batch)
    en_enc = en_tokenizer.encode_batch(en_str, add_special_tokens=True)
    fr_enc = fr_tokenizer.encode_batch(fr_str, add_special_tokens=True)
    en_ids = [enc.ids for enc in en_enc]
    fr_ids = [enc.ids for enc in fr_enc]
    return torch.tensor(en_ids), torch.tensor(fr_ids)


BATCH_SIZE = 32
dataset = TranslationDataset(text_pairs)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
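
# Note: collate_fn can call torch.tensor() on the lists of token ids because
# enable_padding() was set on both tokenizers, so encode_batch pads every
# sequence in a batch to the length of the longest one using the [pad] token.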

 

 

#
# Create seq2seq model with attention for translation
#

class EncoderRNN(nn.Module):
    """An RNN encoder with an embedding layer"""
    def __init__(self, vocab_size, embedding_dim, hidden_dim, dropout=0.1):
        """
        Args:
            vocab_size: The size of the input vocabulary
            embedding_dim: The dimension of the embedding vector
            hidden_dim: The dimension of the hidden state
            dropout: The dropout rate
        """
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input_seq):
        # input_seq = [batch_size, seq_len] -> embedded = [batch_size, seq_len, embedding_dim]
        embedded = self.dropout(self.embedding(input_seq))
        # outputs = [batch_size, seq_len, hidden_dim]
        # hidden = [1, batch_size, hidden_dim]
        outputs, hidden = self.rnn(embedded)
        return outputs, hidden

 

 

class BahdanauAttention(nn.Module):
    """Bahdanau Attention https://arxiv.org/pdf/1409.0473.pdf

    The forward function takes query and keys only; query has shape (B, 1, H) and keys have shape (B, S, H)
    """
    def __init__(self, hidden_size):
        super().__init__()
        self.Wa = nn.Linear(hidden_size, hidden_size)
        self.Ua = nn.Linear(hidden_size, hidden_size)
        self.Va = nn.Linear(hidden_size, 1)

    def forward(self, query, keys):
        """Bahdanau Attention

        Args:
            query: [B, 1, H]
            keys: [B, S, H]

        Returns:
            context: [B, 1, H]
            weights: [B, 1, S]
        """
        B, S, H = keys.shape
        assert query.shape == (B, 1, H)
        scores = self.Va(torch.tanh(self.Wa(query) + self.Ua(keys)))
        scores = scores.transpose(1, 2)  # scores = [B, 1, S]

        weights = F.softmax(scores, dim=-1)
        context = torch.bmm(weights, keys)
        return context, weights
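
# The layers above implement the additive (Bahdanau) scoring function:
#   e_i = Va^T tanh(Wa q + Ua k_i),  alpha = softmax(e),  context = sum_i alpha_i k_i
# so the context vector is a weighted average of the encoder outputs.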

 

 

class DecoderRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, dropout=0.1):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.dropout = nn.Dropout(dropout)
        self.attention = BahdanauAttention(hidden_dim)
        self.gru = nn.GRU(embedding_dim + hidden_dim, hidden_dim, batch_first=True)
        self.out_proj = nn.Linear(hidden_dim, vocab_size)

    def forward(self, input_seq, hidden, enc_out):
        """Single token input, single token output"""
        # input_seq = [batch_size, 1] -> embedded = [batch_size, 1, embedding_dim]
        embedded = self.dropout(self.embedding(input_seq))
        # hidden = [1, batch_size, hidden_dim]
        # context = [batch_size, 1, hidden_dim]
        context, attn_weights = self.attention(hidden.transpose(0, 1), enc_out)
        # rnn_input = [batch_size, 1, embedding_dim + hidden_dim]
        rnn_input = torch.cat([embedded, context], dim=-1)
        # rnn_output = [batch_size, 1, hidden_dim]
        rnn_output, hidden = self.gru(rnn_input, hidden)
        output = self.out_proj(rnn_output)
        return output, hidden

 

 

class Seq2SeqRNN(nn.Module):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input_seq, target_seq):
        """Run the decoder with teacher forcing to predict each next token of the target sequence"""
        # input_seq = [batch_size, seq_len]
        # target_seq = [batch_size, seq_len]
        batch_size, target_len = target_seq.shape
        device = target_seq.device
        # list for storing the output logits
        outputs = []
        # encoder forward pass
        enc_out, hidden = self.encoder(input_seq)
        dec_hidden = hidden
        # decoder forward pass
        for t in range(target_len - 1):
            # during training, use the ground truth token as the input (teacher forcing)
            dec_in = target_seq[:, t].unsqueeze(1)
            # last target token and hidden states -> next token
            dec_out, dec_hidden = self.decoder(dec_in, dec_hidden, enc_out)
            # store the prediction
            outputs.append(dec_out)
        outputs = torch.cat(outputs, dim=1)
        return outputs
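
# Note on alignment: the loop above runs target_len - 1 steps, so outputs[:, t]
# is the prediction for target_seq[:, t + 1]; the training loss below therefore
# compares outputs against fr_ids[:, 1:].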

 

 

# Model hyperparameters
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
enc_vocab = len(en_tokenizer.get_vocab())
dec_vocab = len(fr_tokenizer.get_vocab())
emb_dim = 256
hidden_dim = 256
dropout = 0.1

# Create model
encoder = EncoderRNN(enc_vocab, emb_dim, hidden_dim, dropout).to(device)
decoder = DecoderRNN(dec_vocab, emb_dim, hidden_dim, dropout).to(device)
model = Seq2SeqRNN(encoder, decoder).to(device)
print(model)

print("Model created with:")
print(f"  Input vocabulary size: {enc_vocab}")
print(f"  Output vocabulary size: {dec_vocab}")
print(f"  Embedding dimension: {emb_dim}")
print(f"  Hidden dimension: {hidden_dim}")
print(f"  Dropout: {dropout}")
print(f"  Total parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad)}")

# Optionally initialize the weight matrices with a small normal distribution
#for name, param in model.named_parameters():
#    if param.dim() > 1:
#        nn.init.normal_(param.data, mean=0, std=0.01)

 

# Train unless seq2seq_attn.pth already exists
if os.path.exists("seq2seq_attn.pth"):
    model.load_state_dict(torch.load("seq2seq_attn.pth"))
else:
    optimizer = optim.Adam(model.parameters(), lr=0.0005)
    loss_fn = nn.CrossEntropyLoss()  # optionally: ignore_index=fr_tokenizer.token_to_id("[pad]")
    N_EPOCHS = 100

    for epoch in range(N_EPOCHS):
        model.train()
        epoch_loss = 0
        for en_ids, fr_ids in tqdm.tqdm(dataloader, desc="Training"):
            # Move the "sentences" to device
            en_ids = en_ids.to(device)
            fr_ids = fr_ids.to(device)
            # zero the grad, then forward pass
            optimizer.zero_grad()
            outputs = model(en_ids, fr_ids)
            # compute the loss: flatten the logits to 2D and the targets to 1D
            loss = loss_fn(outputs.reshape(-1, dec_vocab), fr_ids[:, 1:].reshape(-1))
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        print(f"Epoch {epoch+1}/{N_EPOCHS}; Avg loss {epoch_loss/len(dataloader)}; Latest loss {loss.item()}")
        torch.save(model.state_dict(), f"seq2seq_attn-epoch-{epoch+1}.pth")
        # Evaluate every 5 epochs
        if (epoch+1) % 5 != 0:
            continue
        model.eval()
        epoch_loss = 0
        with torch.no_grad():
            for en_ids, fr_ids in tqdm.tqdm(dataloader, desc="Evaluating"):
                en_ids = en_ids.to(device)
                fr_ids = fr_ids.to(device)
                outputs = model(en_ids, fr_ids)
                loss = loss_fn(outputs.reshape(-1, dec_vocab), fr_ids[:, 1:].reshape(-1))
                epoch_loss += loss.item()
        print(f"Eval loss: {epoch_loss/len(dataloader)}")
    torch.save(model.state_dict(), "seq2seq_attn.pth")

 

# Test for a few samples
model.eval()
N_SAMPLES = 5
MAX_LEN = 60
with torch.no_grad():
    start_token = torch.tensor([fr_tokenizer.token_to_id("[start]")]).to(device)
    for en, true_fr in random.sample(text_pairs, N_SAMPLES):
        en_ids = torch.tensor(en_tokenizer.encode(en).ids).unsqueeze(0).to(device)
        enc_out, hidden = model.encoder(en_ids)
        pred_ids = []
        prev_token = start_token.unsqueeze(0)
        for _ in range(MAX_LEN):
            output, hidden = model.decoder(prev_token, hidden, enc_out)
            output = output.argmax(dim=2)
            pred_ids.append(output.item())
            prev_token = output
            # early stop if the predicted token is the end token
            if pred_ids[-1] == fr_tokenizer.token_to_id("[end]"):
                break
        # Decode the predicted IDs
        pred_fr = fr_tokenizer.decode(pred_ids)
        print(f"English: {en}")
        print(f"French: {true_fr}")
        print(f"Predicted: {pred_fr}")
        print()
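
The listing imports matplotlib and numpy but never uses them. As an optional follow-up, one way to inspect what the model attends to is to plot the attention weights gathered during greedy decoding. The sketch below is illustrative only, assuming the model, tokenizers, and device defined above; the plot_attention helper is not part of the original code.

# Optional: visualize attention weights for one sentence (illustrative sketch)
def plot_attention(en_sentence, max_len=60):
    """Greedy-decode one sentence and plot its attention weight matrix."""
    model.eval()
    with torch.no_grad():
        en_ids = torch.tensor(en_tokenizer.encode(en_sentence).ids).unsqueeze(0).to(device)
        enc_out, hidden = model.encoder(en_ids)
        prev_token = torch.tensor([[fr_tokenizer.token_to_id("[start]")]]).to(device)
        pred_ids, weights = [], []
        for _ in range(max_len):
            # re-run the attention module with the current hidden state to capture its weights
            _, w = model.decoder.attention(hidden.transpose(0, 1), enc_out)
            weights.append(w.squeeze(0).squeeze(0).cpu().numpy())
            output, hidden = model.decoder(prev_token, hidden, enc_out)
            prev_token = output.argmax(dim=2)
            pred_ids.append(prev_token.item())
            if pred_ids[-1] == fr_tokenizer.token_to_id("[end]"):
                break
        # rows = decoding steps, columns = source token positions
        attn_matrix = np.stack(weights)
        plt.imshow(attn_matrix, aspect="auto", cmap="viridis")
        plt.xlabel("English token position")
        plt.ylabel("French decoding step")
        plt.colorbar()
        plt.title(en_sentence + " -> " + fr_tokenizer.decode(pred_ids))
        plt.show()

plot_attention("how are you?")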


