---
license: other
language:
- ur
- hi
tags:
- pytorch
- transliterations
- urdu
- hindi
- RekhtaLabs
- Sequence2Sequence
- Transformers
---

![Rekhta Lab Logo](https://www.rekhta.org/Content/Images/RekhtaLogo.png)

# Hindi to Urdu Transliteration Model (Character-Level)

This is a lightweight Transformer-based model trained for **character-level transliteration** of **Hindi poetry into Urdu script**. The model is specially tuned for literary and poetic text, making it ideal for applications involving shayari, nazm, or ghazals.

# Live Inference

https://rekhtalabs.org/demo/transliterate

## Model Overview

| Feature                 | Value                    |
|-------------------------|--------------------------|
| **Architecture**        | Transformer (BART-style) |
| **Tokenizer**           | Character-level          |
| **Embedding Size**      | 256                      |
| **Hidden Size**         | 256 (`d_model`)          |
| **Feedforward Size**    | 512 (`dim_feedforward`)  |
| **Encoder Layers**      | 3 (`num_layers`)         |
| **Decoder Layers**      | 3 (`num_layers`)         |
| **Attention Heads**     | 4 (`nhead`)              |
| **Max Sequence Length** | 128 (`max_len`)          |

---

## Usage

Download the model snapshot:

```python
from huggingface_hub import snapshot_download

path = snapshot_download(
    repo_id="rekhtalabs/hi-2-ur-translit",
    local_dir="./hi-2-ur-translit",
    local_dir_use_symlinks=False
)
```

Then, from a shell, enter the downloaded directory and install the requirements:

```bash
cd hi-2-ur-translit
pip install -r requirements.txt
```

```python
import torch
import sentencepiece as spm
from torch import nn
from collections import OrderedDict


class PositionalEncoding(nn.Module):
    """Sinusoidal positional encoding added to token embeddings."""

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * (-torch.log(torch.tensor(10000.0)) / d_model)
        )
        pe[:, 0::2] = torch.sin(position.float() * div_term)
        pe[:, 1::2] = torch.cos(position.float() * div_term)
        self.pe = pe.unsqueeze(0)  # (1, max_len, d_model)

    def forward(self, x):
        # Add the encoding for the first x.size(1) positions.
        return x + self.pe[:, :x.size(1)].to(x.device)


class Transformer(nn.Module):
    """Sequence-to-sequence Transformer (BART-style) over token ids."""

    def __init__(self, src_vocab_size, tgt_vocab_size,
                 d_model=256, nhead=4, num_layers=3,
                 dim_feedforward=512, max_len=128):
        super().__init__()
        self.src_tok_emb = nn.Embedding(src_vocab_size, d_model)
        self.tgt_tok_emb = nn.Embedding(tgt_vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, max_len)
        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            dim_feedforward=dim_feedforward,
            batch_first=True
        )
        self.out = nn.Linear(d_model, tgt_vocab_size)

    def forward(self, src, tgt):
        src = self.pos_encoder(self.src_tok_emb(src))
        tgt = self.pos_encoder(self.tgt_tok_emb(tgt))
        # Causal mask: each target position may only attend to earlier positions.
        tgt_mask = nn.Transformer.generate_square_subsequent_mask(tgt.size(1)).to(src.device)
        out = self.transformer(src, tgt, tgt_mask=tgt_mask)
        return self.out(out)  # (batch, tgt_len, tgt_vocab_size) logits


sp_nastaaliq = spm.SentencePieceProcessor(model_file='nastaaliq_bpe.model')
sp_devanagari = spm.SentencePieceProcessor(model_file='devanagari_bpe.model')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = Transformer(
    src_vocab_size=sp_devanagari.get_piece_size(),
    tgt_vocab_size=sp_nastaaliq.get_piece_size()
).to(device)

checkpoint = torch.load("h2u_2.0.pt", map_location=device)
state_dict = checkpoint["model_state_dict"]

# Strip any "module." prefix (left behind by DataParallel training) from keys.
new_state_dict = OrderedDict()
for k, v in state_dict.items():
    new_k = k.replace("module.", "")
    new_state_dict[new_k] = v

model.load_state_dict(new_state_dict)
model.eval()


def transliterate_hindi_to_urdu(text_hindi, max_len=128):
    """Greedy-decode the Urdu (Nastaliq) transliteration of a Hindi string.

    NOTE: renamed from ``transliterate_urdu_to_hindi`` — the function encodes
    Devanagari input and decodes Nastaliq output, i.e. Hindi -> Urdu.
    Token ids 2 and 3 are used as BOS and EOS respectively.
    """
    # Truncate so that BOS + text + EOS fits within max_len tokens.
    src_ids = [2] + sp_devanagari.encode(text_hindi)[:max_len - 2] + [3]
    src_tensor = torch.tensor(src_ids).unsqueeze(0).to(device)

    tgt_ids = [2]  # start with BOS
    for _ in range(max_len):
        tgt_tensor = torch.tensor(tgt_ids).unsqueeze(0).to(device)
        with torch.no_grad():
            output = model(src_tensor, tgt_tensor)
        # Greedy choice for the next token from the last decoder position.
        next_token_id = torch.argmax(output[0, -1, :]).item()
        if next_token_id == 3:  # EOS
            break
        tgt_ids.append(next_token_id)

    return sp_nastaaliq.decode(tgt_ids[1:])  # drop BOS before decoding


res = transliterate_hindi_to_urdu("थम गए हों बहते बहते चम्पई रुख़्सार पर")
print(res)
```

## Output

```text
تھم گئے ہوں بہتے بہتے چمپئی رخسار پر
```

---

## Dataset

- Trained on approximately **1,300,000 Hindi–Urdu Ghazal and Nazm pairs**
- Sourced and curated for transliteration.
- Character-level alignment ensured for quality

---