konishant committed on
Commit
be5ccfa
·
verified ·
1 Parent(s): 471e44e

rekhta hi2ur model

Browse files
config.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "custom-transliterator",
3
+ "architectures": [
4
+ "CustomTransformerModel"
5
+ ],
6
+ "hidden_size": 256,
7
+ "num_attention_heads": 4,
8
+ "num_hidden_layers": 3,
9
+ "dim_feedforward": 512,
10
+ "max_position_embeddings": 256,
11
+ "src_vocab_size": 139,
12
+ "tgt_vocab_size": 141,
13
+ "pad_token_id": 0,
14
+ "bos_token_id": 2,
15
+ "eos_token_id": 3,
16
+ "unk_token_id": 1
17
+ }
devanagari_bpe.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5cebd156b47d0c23766a98e970db86873baf44ada068c7074ed998436bd5e9c7
3
+ size 239237
devanagari_bpe.vocab ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [PAD] 0
2
+ [UNK] 0
3
+ [CLS] 0
4
+ [SEP] 0
5
+ ▁ -1.56837
6
+ ा -2.72961
7
+ े -3.09795
8
+ क -3.12604
9
+ र -3.14134
10
+ ह -3.15577
11
+ म -3.41228
12
+ ी -3.46207
13
+ त -3.56156
14
+ न -3.57827
15
+ स -3.57865
16
+ ़ -3.58176
17
+ ं -3.72041
18
+ ो -3.76092
19
+ ि -3.78206
20
+ ज -3.89364
21
+ ल -3.89917
22
+ - -3.91544
23
+ द -3.97711
24
+ ब -4.00613
25
+ ् -4.03211
26
+ ु -4.11664
27
+ ै -4.1828
28
+ ग -4.3143
29
+ ए -4.42049
30
+ य -4.42315
31
+ ँ -4.63556
32
+ ख -4.65368
33
+ प -4.67017
34
+ ू -4.76717
35
+ व -4.82514
36
+ श -4.83062
37
+ आ -4.94678
38
+ अ -5.0527
39
+ फ -5.13204
40
+ भ -5.30514
41
+ च -5.38654
42
+ इ -5.45387
43
+ उ -5.55407
44
+ थ -5.62956
45
+ ई -5.6659
46
+ ' -5.66818
47
+ झ -5.88977
48
+ ड -5.92759
49
+ ट -6.11794
50
+ छ -6.12487
51
+ ौ -6.2094
52
+ ओ -6.35333
53
+ औ -6.65035
54
+ ठ -6.868
55
+ ध -6.86917
56
+ ऐ -7.01473
57
+ घ -7.14654
58
+ ढ -7.29567
59
+ ऊ -8.06043
60
+ , -9.41882
61
+ ! -9.8114
62
+ . -10.3114
63
+ ‘ -10.5707
64
+ ॉ -10.7803
65
+ ’ -10.8678
66
+ ? -10.8859
67
+ ष -11.0155
68
+ ण -11.4962
69
+ ृ -11.5722
70
+ ( -12.4031
71
+ ) -12.4188
72
+ ञ -12.6159
73
+ ऑ -12.7605
74
+ : -13.1119
75
+ 2 -13.734
76
+ 1 -14.0549
77
+ a -14.1693
78
+ 3 -14.4469
79
+ । -14.4877
80
+ ऋ -14.5747
81
+ e -14.6213
82
+ ॅ -14.6213
83
+ 4 -14.67
84
+ ۔ -14.7754
85
+ r -14.8932
86
+ ङ -14.9577
87
+ u -15.0267
88
+ ‍ -15.2679
89
+ * -15.3632
90
+ 5 -15.3632
91
+ i -15.3632
92
+ ٖ -15.3632
93
+ 6 -15.4686
94
+ 7 -15.4686
95
+ ; -15.4686
96
+ b -15.4686
97
+ h -15.4686
98
+ s -15.4686
99
+ 8 -15.5863
100
+ २ -15.5863
101
+ n -15.7199
102
+ v -15.7199
103
+ l -15.874
104
+ 0 -16.0563
105
+ t -16.0563
106
+ z -16.0563
107
+ 9 -16.2795
108
+ T -16.2795
109
+ d -16.2795
110
+ k -16.2795
111
+ o -16.2795
112
+ p -16.2795
113
+ أ -16.2795
114
+ ३ -16.2795
115
+ _ -16.5672
116
+ ˈ -16.5672
117
+ “ -16.5672
118
+ + -16.9726
119
+ y -16.9726
120
+ ، -16.9726
121
+ ٍ -16.9726
122
+ ऩ -16.9726
123
+ — -16.9726
124
+ " -17.6658
125
+ H -17.6658
126
+ I -17.6658
127
+ K -17.6658
128
+ L -17.6658
129
+ O -17.6658
130
+ U -17.6658
131
+ f -17.6658
132
+ | -17.6658
133
+ £ -17.6658
134
+ س -17.6658
135
+ ع -17.6658
136
+ ِ -17.6658
137
+ ٓ -17.6658
138
+ ک -17.6658
139
+ ः -17.6658
h2u_2.0.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51c6c40da98a2bc82c3a88200d46fdc00014ae92d4e5a0f1cad9b6b3ac3dcd00
3
+ size 48872290
nastaaliq_bpe.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0e70a3a6728c54b310f196f1e3dda86ea92fac6573a0473fc4529c65f05c696
3
+ size 239191
nastaaliq_bpe.vocab ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [PAD] 0
2
+ [UNK] 0
3
+ [CLS] 0
4
+ [SEP] 0
5
+ ▁ -1.43725
6
+ ا -2.49726
7
+ ی -2.61728
8
+ ہ -2.921
9
+ و -3.05754
10
+ ر -3.0802
11
+ ک -3.08283
12
+ ے -3.12091
13
+ ن -3.31991
14
+ م -3.35435
15
+ ت -3.45322
16
+ ں -3.52174
17
+ س -3.63601
18
+ ب -3.71838
19
+ ھ -3.7338
20
+ ل -3.84319
21
+ د -3.87154
22
+ ج -4.17633
23
+ گ -4.346
24
+ پ -4.47736
25
+ ش -4.76856
26
+ ئ -4.78603
27
+ چ -4.97538
28
+ آ -5.09129
29
+ ز -5.11549
30
+ خ -5.14642
31
+ ح -5.30285
32
+ ق -5.304
33
+ ف -5.32229
34
+ ع -5.4835
35
+ ٹ -5.71517
36
+ ص -5.91753
37
+ ڑ -5.97114
38
+ ط -6.11523
39
+ غ -6.14773
40
+ ظ -6.60046
41
+ ؔ -6.76266
42
+ ڈ -6.90553
43
+ ؤ -6.92384
44
+ ض -7.01754
45
+ ذ -7.02157
46
+ ۂ -7.43851
47
+ ث -7.59653
48
+ ٔ -9.09115
49
+ ٰ -9.32497
50
+ ، -9.33314
51
+ ' -9.48661
52
+ ! -9.74387
53
+ ً -10.0546
54
+ ژ -10.1661
55
+ ۔ -10.5828
56
+ ؟ -10.8042
57
+ أ -10.8382
58
+ . -11.4494
59
+ ( -12.3291
60
+ ) -12.3343
61
+ ۓ -12.4488
62
+ ۃ -12.7346
63
+ ي -12.7899
64
+ ء -13.0172
65
+ : -13.0588
66
+ - -13.6901
67
+ ك -13.8409
68
+ ؐ -13.9386
69
+ ۲ -14.0468
70
+ َ -14.0758
71
+ ه -14.2009
72
+ ّ -14.2348
73
+ 2 -14.4666
74
+ ُ -14.5576
75
+ ۳ -14.5576
76
+ 1 -14.6577
77
+ ؑ -14.6577
78
+ ۱ -14.7689
79
+ ؓ -14.9631
80
+ ٴ -14.9631
81
+ ۴ -15.1172
82
+ * -15.2995
83
+ 3 -15.4049
84
+ , -15.5227
85
+ ۶ -15.5227
86
+ 4 -15.6562
87
+ ۵ -15.6562
88
+ i -15.8104
89
+ ِ -15.8104
90
+ ۷ -15.8104
91
+ ۸ -15.8104
92
+ ः -15.8104
93
+ ‘ -15.8104
94
+ ’ -15.8104
95
+ a -15.9927
96
+ e -15.9927
97
+ r -15.9927
98
+ s -15.9927
99
+ ؛ -15.9927
100
+ ْ -15.9927
101
+ 0 -16.2158
102
+ A -16.2158
103
+ b -16.2158
104
+ d -16.2158
105
+ n -16.2158
106
+ u -16.2158
107
+ 5 -16.5035
108
+ 7 -16.5035
109
+ I -16.5035
110
+ R -16.5035
111
+ ٍ -16.5035
112
+ “ -16.5035
113
+ 8 -16.909
114
+ 9 -16.909
115
+ C -16.909
116
+ E -16.909
117
+ G -16.909
118
+ L -16.909
119
+ U -16.909
120
+ t -16.909
121
+ y -16.909
122
+ ة -16.909
123
+ ٌ -16.909
124
+ ۹ -16.909
125
+ + -17.6021
126
+ 6 -17.6021
127
+ ? -17.6021
128
+ B -17.6021
129
+ M -17.6021
130
+ O -17.6021
131
+ T -17.6021
132
+ Z -17.6021
133
+ [ -17.6021
134
+ f -17.6021
135
+ h -17.6021
136
+ o -17.6021
137
+ z -17.6021
138
+ ؎ -17.6021
139
+ ٓ -17.6021
140
+ ٖ -17.6021
141
+ ۰ -17.6021
readme.md ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: other
3
+ language:
4
+ - ur
5
+ - hi
6
+ tags:
7
+ - pytorch
8
+ - transliterations
9
+ - urdu
10
+ - hindi
11
+ - RekhtaLabs
12
+ - Sequence2Sequence
13
+ - Transformers
14
+ ---
15
+ ![Rekhta Lab Logo](https://www.rekhta.org/Content/Images/RekhtaLogo.png)
16
+ # Hindi to Urdu Transliteration Model (Character-Level)
17
+
18
+
19
+
20
+ This is a lightweight Transformer-based model trained for **character-level transliteration** of **Hindi poetry into Urdu script**. The model is specially tuned for literary and poetic text, making it ideal for applications involving shayari, nazm, or ghazals.
21
+
22
+ # Live Inference
23
+ https://rekhtalabs.org/demo/transliterate
24
+
25
+
26
+
27
+ ## Model Overview
28
+ | Feature | Value |
29
+ |-------------------------|--------------------------|
30
+ | **Architecture** | Transformer (BART-style) |
31
+ | **Tokenizer** | Character-level |
32
+ | **Embedding Size** | 256 |
33
+ | **Hidden Size** | 256 (`d_model`) |
34
+ | **Feedforward Size** | 512 (`dim_feedforward`) |
35
+ | **Encoder Layers** | 3 (`num_layers`) |
36
+ | **Decoder Layers** | 3 (`num_layers`) |
37
+ | **Attention Heads** | 4 (`nhead`) |
38
+ | **Max Sequence Length** | 128 (`max_len`) |
39
+
40
+ ---
41
+
42
+
43
+
44
+ ## Usage
45
+
46
+ ```python
47
+ from huggingface_hub import snapshot_download
48
+
49
+ path = snapshot_download(
50
+ repo_id="rekhtalabs/hi-2-ur-translit",
51
+ local_dir="./hi-2-ur-translit",
52
+ local_dir_use_symlinks=False
53
+ )
54
+ ```
+
+ ```bash
+ cd hi-2-ur-translit
+ ```
57
+
58
+ ```bash
59
+ pip install -r requirements.txt
60
+ ```
61
+ ```python
62
+ import torch
63
+ import sentencepiece as spm
64
+ from torch import nn
65
+ from collections import OrderedDict
66
+
67
+
68
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings (Vaswani et al., 2017) to token embeddings."""

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        # Geometric frequency progression: 10000^(-2i/d_model) for each even dim i.
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-torch.log(torch.tensor(10000.0)) / d_model))
        pe[:, 0::2] = torch.sin(position.float() * div_term)
        pe[:, 1::2] = torch.cos(position.float() * div_term)
        # Register as a NON-persistent buffer: the table then follows
        # model.to(device) (the original plain attribute stayed on CPU and was
        # copied to the input's device on every forward), while staying out of
        # the state_dict so checkpoints saved without it still load strictly.
        self.register_buffer("pe", pe.unsqueeze(0), persistent=False)

    def forward(self, x):
        # x: (batch, seq_len, d_model); slice the table to the actual sequence
        # length. The .to() is a no-op once the buffer lives on x's device.
        return x + self.pe[:, :x.size(1)].to(x.device)
80
+
81
+
82
class Transformer(nn.Module):
    """Sequence-to-sequence Transformer for character-level transliteration.

    Source/target token embeddings plus sinusoidal positions feed a standard
    nn.Transformer encoder/decoder stack; a final linear head projects decoder
    states onto the target vocabulary.
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, d_model=256, nhead=4, num_layers=3, dim_feedforward=512, max_len=128):
        super().__init__()
        # Attribute names (and creation order) must match the checkpoint's
        # state_dict keys — do not rename.
        self.src_tok_emb = nn.Embedding(src_vocab_size, d_model)
        self.tgt_tok_emb = nn.Embedding(tgt_vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, max_len)
        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            dim_feedforward=dim_feedforward,
            batch_first=True,
        )
        self.out = nn.Linear(d_model, tgt_vocab_size)

    def forward(self, src, tgt):
        # Embed tokens and add positional information.
        src_emb = self.pos_encoder(self.src_tok_emb(src))
        tgt_emb = self.pos_encoder(self.tgt_tok_emb(tgt))
        # Causal mask keeps each decoder position from attending to later ones.
        causal_mask = nn.Transformer.generate_square_subsequent_mask(tgt_emb.size(1)).to(src_emb.device)
        decoded = self.transformer(src_emb, tgt_emb, tgt_mask=causal_mask)
        return self.out(decoded)
104
+
105
+
106
# Character-level SentencePiece tokenizers, one per script.
sp_nastaaliq = spm.SentencePieceProcessor(model_file='nastaaliq_bpe.model')
sp_devanagari = spm.SentencePieceProcessor(model_file='devanagari_bpe.model')

# Prefer GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Transformer(
    src_vocab_size=sp_devanagari.get_piece_size(),
    tgt_vocab_size=sp_nastaaliq.get_piece_size()
).to(device)

# The checkpoint keys carry a "module." prefix (saved from a DataParallel/DDP
# wrapper, presumably — verify against the training script), which must be
# stripped before load_state_dict.
checkpoint = torch.load("h2u_2.0.pt", map_location=device)
state_dict = checkpoint["model_state_dict"]
new_state_dict = OrderedDict(
    (key.replace("module.", ""), value) for key, value in state_dict.items()
)
model.load_state_dict(new_state_dict)
model.eval()
125
+
126
+
127
def transliterate_urdu_to_hindi(text_urdu, max_len=128, bos_id=2, eos_id=3):
    """Transliterate Devanagari (Hindi) text into Nastaaliq (Urdu) script.

    NOTE: despite the legacy name, the direction is Hindi -> Urdu — the input
    is encoded with the Devanagari tokenizer and the output decoded with the
    Nastaaliq one. The name and first parameter are kept for backward
    compatibility; prefer the `transliterate_hindi_to_urdu` alias below.

    Args:
        text_urdu: Hindi (Devanagari) input text.
        max_len: cap on source length and on generated target tokens.
        bos_id: begin-of-sequence token id (matches config.json's bos_token_id).
        eos_id: end-of-sequence token id (matches config.json's eos_token_id).

    Returns:
        The transliterated Urdu (Nastaaliq) string.
    """
    # Truncate to leave room for the BOS/EOS frame around the source ids.
    src_ids = [bos_id] + sp_devanagari.encode(text_urdu)[:max_len - 2] + [eos_id]
    src_tensor = torch.tensor(src_ids).unsqueeze(0).to(device)

    tgt_ids = [bos_id]
    for _ in range(max_len):
        tgt_tensor = torch.tensor(tgt_ids).unsqueeze(0).to(device)
        with torch.no_grad():
            output = model(src_tensor, tgt_tensor)
        # Greedy decoding: take the argmax logit at the last position.
        next_token_id = torch.argmax(output[0, -1, :]).item()
        if next_token_id == eos_id:
            break
        tgt_ids.append(next_token_id)

    # Drop the leading BOS before detokenizing.
    return sp_nastaaliq.decode(tgt_ids[1:])


# Correctly-named alias: the original name states the direction backwards.
transliterate_hindi_to_urdu = transliterate_urdu_to_hindi
144
+
145
+
146
# Demo: transliterate one Hindi (Devanagari) line and print the Urdu result.
res = transliterate_urdu_to_hindi("थम गए हों बहते बहते चम्पई रुख़्सार पर")
print(res)
148
+
149
+
150
+
151
+ ```
152
+ ## Output
153
+ ```python
154
+ تھم گئے ہوں بہتے بہتے چمپئی رخسار پر
155
+ ```
156
+
157
+ ---
158
+
159
+ ## Dataset
160
+
161
+ - Trained on approximately **1,300,000 Hindi-Urdu Ghazal and Nazm Pairs**
162
+ - Sourced and curated for transliteration.
163
+ - Character-level alignment ensured for quality
164
+
165
+ ---
requirements.txt ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ filelock==3.18.0
2
+ fsspec==2025.7.0
3
+ Jinja2==3.1.6
4
+ MarkupSafe==3.0.2
5
+ mpmath==1.3.0
6
+ networkx==3.4.2
7
+ nvidia-cublas-cu12==12.6.4.1
8
+ nvidia-cuda-cupti-cu12==12.6.80
9
+ nvidia-cuda-nvrtc-cu12==12.6.77
10
+ nvidia-cuda-runtime-cu12==12.6.77
11
+ nvidia-cudnn-cu12==9.5.1.17
12
+ nvidia-cufft-cu12==11.3.0.4
13
+ nvidia-cufile-cu12==1.11.1.6
14
+ nvidia-curand-cu12==10.3.7.77
15
+ nvidia-cusolver-cu12==11.7.1.2
16
+ nvidia-cusparse-cu12==12.5.4.2
17
+ nvidia-cusparselt-cu12==0.6.3
18
+ nvidia-nccl-cu12==2.26.2
19
+ nvidia-nvjitlink-cu12==12.6.85
20
+ nvidia-nvtx-cu12==12.6.77
21
+ sentencepiece==0.2.0
22
+ sympy==1.14.0
23
+ torch==2.7.1
24
+ triton==3.3.1
25
+ typing_extensions==4.14.1