Custom Transformer model issue

  • Thread starter: Md. Shaykhul Islam (Guest)
Code:
import math
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
from tqdm import tqdm
import sentencepiece as spm

# Define PositionalEncoding, MultiHeadAttention, PositionwiseFeedforward,
# TransformerEncoderLayer, TransformerDecoderLayer, and Transformer classes
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)
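        # Note: pe now has shape (max_len, 1, d_model), and forward() below indexes
        # pe[:x.size(0)], so this module expects sequence-first input of shape
        # (seq_len, batch, d_model).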

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads, dropout=0.1):
        super(MultiHeadAttention, self).__init__()
        assert d_model % num_heads == 0
        self.d_model = d_model
        self.num_heads = num_heads
        self.depth = d_model // num_heads

        self.query = nn.Linear(d_model, d_model)
        self.key = nn.Linear(d_model, d_model)
        self.value = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, query, key, value, mask=None):
        batch_size = query.size(0)
        seq_len = query.size(1)
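        # Note: dim 0 is read as the batch and dim 1 as the sequence, so this layer
        # expects batch-first input of shape (batch, seq_len, d_model). key and value
        # are also reshaped with the query's seq_len below, so cross-attention with
        # different source and target lengths would fail here.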

        query = self.query(query).view(batch_size, seq_len, self.num_heads, self.depth).transpose(1, 2)
        key = self.key(key).view(batch_size, seq_len, self.num_heads, self.depth).transpose(1, 2)
        value = self.value(value).view(batch_size, seq_len, self.num_heads, self.depth).transpose(1, 2)

        scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.depth)

        if mask is not None:
            mask = mask.unsqueeze(1).unsqueeze(1)  # Ensure mask is broadcastable
            scores = scores.masked_fill(mask == 0, float('-inf'))

        attention_weights = F.softmax(scores, dim=-1)
        attention_weights = self.dropout(attention_weights)
        context = torch.matmul(attention_weights, value)
        context = context.transpose(1, 2).contiguous().view(batch_size, seq_len, self.d_model)
        return context, attention_weights


class PositionwiseFeedforward(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super(PositionwiseFeedforward, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.dropout(F.relu(self.fc1(x)))
        x = self.fc2(x)
        return x

class TransformerEncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super(TransformerEncoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads, dropout)
        self.ffn = PositionwiseFeedforward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, src, src_mask=None):
        src2, _ = self.self_attn(src, src, src, src_mask)
        src = src + self.dropout1(src2)
        src = self.norm1(src)
        src2 = self.ffn(src)
        src = src + self.dropout2(src2)
        src = self.norm2(src)
        return src

class TransformerDecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout=0.1):
        super(TransformerDecoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads, dropout)
        self.src_attn = MultiHeadAttention(d_model, num_heads, dropout)
        self.ffn = PositionwiseFeedforward(d_model, d_ff, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)

    def forward(self, tgt, memory, tgt_mask=None, memory_mask=None):
        tgt2, self_attention_weights = self.self_attn(tgt, tgt, tgt, tgt_mask)
        tgt = tgt + self.dropout1(tgt2)
        tgt = self.norm1(tgt)
        tgt2, src_attention_weights = self.src_attn(tgt, memory, memory, memory_mask)
        tgt = tgt + self.dropout2(tgt2)
        tgt = self.norm2(tgt)
        tgt2 = self.ffn(tgt)
        tgt = tgt + self.dropout3(tgt2)
        tgt = self.norm3(tgt)
        return tgt, self_attention_weights, src_attention_weights

class Transformer(nn.Module):
    def __init__(self, num_encoder_layers, num_decoder_layers, vocab_size, d_model, num_heads, d_ff, dropout=0.1):
        super(Transformer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        self.encoder_layers = nn.ModuleList([TransformerEncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_encoder_layers)])
        self.decoder_layers = nn.ModuleList([TransformerDecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_decoder_layers)])
        self.fc_out = nn.Linear(d_model, vocab_size)
        self.d_model = d_model

    def forward(self, src, tgt, src_mask=None, tgt_mask=None):
        src = self.embedding(src) * math.sqrt(self.d_model)
        src = self.pos_encoder(src)
        tgt = self.embedding(tgt) * math.sqrt(self.d_model)
        tgt = self.pos_encoder(tgt)

        for layer in self.encoder_layers:
            src = layer(src, src_mask)
        
        memory = src

        for layer in self.decoder_layers:
            tgt, _, _ = layer(tgt, memory, tgt_mask, src_mask)

        output = self.fc_out(tgt)
        return output

class ConversationDataset(Dataset):
    def __init__(self, conversations, sp_model):
        self.conversations = conversations
        self.sp_model = sp_model

    def __len__(self):
        return len(self.conversations)

    def __getitem__(self, idx):
        input_text, target_text = self.conversations[idx]
        input_tensor = self.sp_model.encode(input_text, out_type=int)
        target_tensor = self.sp_model.encode(target_text, out_type=int)
        return input_tensor, target_tensor

def pad_sequence(seq, max_len, pad_value):
    return seq + [pad_value] * (max_len - len(seq))

def collate_fn(batch, pad_token=0):
    input_seqs, target_seqs = zip(*batch)
    max_input_len = max(len(seq) for seq in input_seqs)
    max_target_len = max(len(seq) for seq in target_seqs)
    input_seqs = [pad_sequence(seq, max_input_len, pad_token) for seq in input_seqs]
    target_seqs = [pad_sequence(seq, max_target_len, pad_token) for seq in target_seqs]
    input_seqs = torch.tensor(input_seqs, dtype=torch.long)
    target_seqs = torch.tensor(target_seqs, dtype=torch.long)
    return input_seqs, target_seqs

def train_model(model, dataloader, num_epochs, learning_rate, vocab_size):
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)
    criterion = nn.CrossEntropyLoss(ignore_index=0)
    scheduler = ReduceLROnPlateau(optimizer, 'min', patience=2, verbose=True)

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        pbar = tqdm(dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}")
        for src, tgt in pbar:
            optimizer.zero_grad()

            src = src.transpose(0, 1)
            tgt_input = tgt[:, :-1].transpose(0, 1)
            tgt_output = tgt[:, 1:].transpose(0, 1)

            src_mask = generate_square_subsequent_mask(src.size(0)).to(src.device)
            tgt_mask = generate_square_subsequent_mask(tgt_input.size(0)).to(tgt_input.device)

            src_mask = src_mask.unsqueeze(0)
            tgt_mask = tgt_mask.unsqueeze(0)
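            # Note: src and tgt_input were transposed to (seq_len, batch) above, while
            # MultiHeadAttention reads dim 0 as the batch; these (1, L, L) masks are
            # also unsqueezed twice more inside it, giving 5-D masks that cannot
            # broadcast against the 4-D attention scores. This is where the
            # RuntimeError in the output below originates.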

            output = model(src, tgt_input, src_mask=src_mask, tgt_mask=tgt_mask)
            output = output.transpose(0, 1).contiguous().view(-1, vocab_size)
            tgt_output = tgt_output.contiguous().view(-1)

            loss = criterion(output, tgt_output)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            pbar.set_postfix({'loss': total_loss / len(pbar)})
        
        scheduler.step(total_loss / len(dataloader))


def generate_square_subsequent_mask(sz):
    mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
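    # Note: this is an additive mask (0.0 = attend, -inf = block), while
    # MultiHeadAttention masks positions where mask == 0, i.e. exactly the
    # positions this mask means to keep; the two conventions disagree.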
    return mask


def generate_response(model, sp_model, input_text, max_len=20):
    model.eval()

    with torch.no_grad():
        src_tokens = sp_model.encode(input_text, out_type=int)
        src_tensor = torch.tensor([src_tokens], dtype=torch.long)
        src_mask = (src_tensor != 0).unsqueeze(-2)

        memory = model.encoder(src_tensor, src_mask)
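        # Note: this Transformer defines encoder_layers / decoder_layers (ModuleLists),
        # not .encoder / .decoder, so this call and model.decoder(...) below would
        # raise AttributeError once training runs.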
        
        # Start decoding with "<s>" token
        tgt_token = [sp_model.bos_id()]

        for i in range(max_len):
            tgt_tensor = torch.tensor([tgt_token], dtype=torch.long)
            tgt_mask = (tgt_tensor != 0).unsqueeze(-2)
            
            output = model.decoder(tgt_tensor, memory, tgt_mask, src_mask)
            output = torch.argmax(output, dim=-1)
            token = output[0, -1].item()
            
            if token == sp_model.eos_id():
                break
            
            tgt_token.append(token)

        output_text = sp_model.decode_ids(tgt_token)
        return output_text

# Load SentencePiece model
sp_model = spm.SentencePieceProcessor(model_file='m.model')

# Example conversation dataset
conversations = [
    ("hello how are you", "i am fine"),
    ("what is your name", "my name is bot"),
    ("how old are you", "i am 2 years old")
]

# Create dataset and dataloader
dataset = ConversationDataset(conversations, sp_model)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True, collate_fn=collate_fn)

# Model parameters
num_encoder_layers = 6
num_decoder_layers = 6
vocab_size = sp_model.get_piece_size()  # Get actual vocab size from SentencePiece model
d_model = 512
num_heads = 8
d_ff = 2048
dropout = 0.1
learning_rate = 0.0001
num_epochs = 10

# Initialize and train the model
model = Transformer(num_encoder_layers, num_decoder_layers, vocab_size, d_model, num_heads, d_ff, dropout)
train_model(model, dataloader, num_epochs, learning_rate, vocab_size)

# Test inference
input_text = "hello how are you"
response = generate_response(model, sp_model, input_text)
print(f"Input: {input_text}\nResponse: {response}")
%Run ai.py
/home/shaykhul/.local/lib/python3.11/site-packages/torch/optim/lr_scheduler.py:28: UserWarning: The verbose parameter is deprecated. Please use get_last_lr() to access the learning rate.
  warnings.warn("The verbose parameter is deprecated. Please use get_last_lr() "
Epoch 1/10: 0%| | 0/2 [00:00<?, ?it/s]
Traceback (most recent call last):
  File "/home/shaykhul/Desktop/ai.py", line 272, in <module>
    train_model(model, dataloader, num_epochs, learning_rate, vocab_size)
  File "/home/shaykhul/Desktop/ai.py", line 196, in train_model
    output = model(src, tgt_input, src_mask=src_mask, tgt_mask=tgt_mask)
  File "/home/shaykhul/.local/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/shaykhul/.local/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/shaykhul/Desktop/ai.py", line 137, in forward
    src = layer(src, src_mask)
  File "/home/shaykhul/.local/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/shaykhul/.local/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/shaykhul/Desktop/ai.py", line 87, in forward
    src2, _ = self.self_attn(src, src, src, src_mask)
  File "/home/shaykhul/.local/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/shaykhul/.local/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/shaykhul/Desktop/ai.py", line 55, in forward
    scores = scores.masked_fill(mask == 0, float('-inf'))
RuntimeError: The size of tensor a (13) must match the size of tensor b (2) at non-singleton dimension 4

I can't fix this error. I have tried different approaches, and even ChatGPT, but nothing fixed it. The error comes from a dimension mismatch in MultiHeadAttention. I want to build a simple personal chatbot (an AI girlfriend), so please help me fix the error. I want to build my own custom Transformer model and train it on my own datasets.
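For reference, below is a minimal sketch of one way to build broadcast-compatible masks; it is not a drop-in fix. It assumes batch-first tensors throughout (so the transpose(0, 1) calls in train_model and the extra unsqueeze calls inside MultiHeadAttention would be dropped, PositionalEncoding would be adapted to batch-first input, e.g. storing pe as (1, max_len, d_model) and adding pe[:, :x.size(1)], and the key/value reshape in MultiHeadAttention would use the key's own length for cross-attention), a pad id of 0, and helper names make_pad_mask / make_causal_mask that are invented here. The encoder gets a padding mask of shape (batch, 1, 1, src_len), and the decoder self-attention gets a combined causal-plus-padding mask of shape (batch, 1, tgt_len, tgt_len); both broadcast against attention scores of shape (batch, num_heads, q_len, k_len).

Code:
import torch

def make_pad_mask(seq, pad_id=0):
    # (batch, seq_len) -> (batch, 1, 1, seq_len); True where a real (non-pad) token sits
    return (seq != pad_id).unsqueeze(1).unsqueeze(2)

def make_causal_mask(seq, pad_id=0):
    # (batch, seq_len) -> (batch, 1, seq_len, seq_len);
    # True where attention is allowed: not padding and not a future position
    seq_len = seq.size(1)
    causal = torch.tril(torch.ones(seq_len, seq_len, device=seq.device)).bool()
    return make_pad_mask(seq, pad_id) & causal

# Corresponding batch-first training step (sketch):
#   src, tgt: (batch, src_len), (batch, tgt_len) -- no transpose(0, 1)
#   tgt_input, tgt_output = tgt[:, :-1], tgt[:, 1:]
#   src_mask = make_pad_mask(src)           # (batch, 1, 1, src_len)
#   tgt_mask = make_causal_mask(tgt_input)  # (batch, 1, tgt_len-1, tgt_len-1)
#   output = model(src, tgt_input, src_mask=src_mask, tgt_mask=tgt_mask)
#   loss = criterion(output.reshape(-1, vocab_size), tgt_output.reshape(-1))

With boolean masks shaped like this, the existing scores.masked_fill(mask == 0, float('-inf')) keeps its intended meaning, and generate_square_subsequent_mask is no longer needed.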