rekhta hi2ur model
Browse files- config.json +17 -0
- devanagari_bpe.model +3 -0
- devanagari_bpe.vocab +139 -0
- h2u_2.0.pt +3 -0
- nastaaliq_bpe.model +3 -0
- nastaaliq_bpe.vocab +141 -0
- readme.md +165 -0
- requirements.txt +25 -0
config.json
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_type": "custom-transliterator",
|
| 3 |
+
"architectures": [
|
| 4 |
+
"CustomTransformerModel"
|
| 5 |
+
],
|
| 6 |
+
"hidden_size": 256,
|
| 7 |
+
"num_attention_heads": 4,
|
| 8 |
+
"num_hidden_layers": 3,
|
| 9 |
+
"dim_feedforward": 512,
|
| 10 |
+
"max_position_embeddings": 256,
|
| 11 |
+
"src_vocab_size": 139,
|
| 12 |
+
"tgt_vocab_size": 141,
|
| 13 |
+
"pad_token_id": 0,
|
| 14 |
+
"bos_token_id": 2,
|
| 15 |
+
"eos_token_id": 3,
|
| 16 |
+
"unk_token_id": 1
|
| 17 |
+
}
|
devanagari_bpe.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5cebd156b47d0c23766a98e970db86873baf44ada068c7074ed998436bd5e9c7
|
| 3 |
+
size 239237
|
devanagari_bpe.vocab
ADDED
|
@@ -0,0 +1,139 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[PAD] 0
|
| 2 |
+
[UNK] 0
|
| 3 |
+
[CLS] 0
|
| 4 |
+
[SEP] 0
|
| 5 |
+
▁ -1.56837
|
| 6 |
+
ा -2.72961
|
| 7 |
+
े -3.09795
|
| 8 |
+
क -3.12604
|
| 9 |
+
र -3.14134
|
| 10 |
+
ह -3.15577
|
| 11 |
+
म -3.41228
|
| 12 |
+
ी -3.46207
|
| 13 |
+
त -3.56156
|
| 14 |
+
न -3.57827
|
| 15 |
+
स -3.57865
|
| 16 |
+
़ -3.58176
|
| 17 |
+
ं -3.72041
|
| 18 |
+
ो -3.76092
|
| 19 |
+
ि -3.78206
|
| 20 |
+
ज -3.89364
|
| 21 |
+
ल -3.89917
|
| 22 |
+
- -3.91544
|
| 23 |
+
द -3.97711
|
| 24 |
+
ब -4.00613
|
| 25 |
+
् -4.03211
|
| 26 |
+
ु -4.11664
|
| 27 |
+
ै -4.1828
|
| 28 |
+
ग -4.3143
|
| 29 |
+
ए -4.42049
|
| 30 |
+
य -4.42315
|
| 31 |
+
ँ -4.63556
|
| 32 |
+
ख -4.65368
|
| 33 |
+
प -4.67017
|
| 34 |
+
ू -4.76717
|
| 35 |
+
व -4.82514
|
| 36 |
+
श -4.83062
|
| 37 |
+
आ -4.94678
|
| 38 |
+
अ -5.0527
|
| 39 |
+
फ -5.13204
|
| 40 |
+
भ -5.30514
|
| 41 |
+
च -5.38654
|
| 42 |
+
इ -5.45387
|
| 43 |
+
उ -5.55407
|
| 44 |
+
थ -5.62956
|
| 45 |
+
ई -5.6659
|
| 46 |
+
' -5.66818
|
| 47 |
+
झ -5.88977
|
| 48 |
+
ड -5.92759
|
| 49 |
+
ट -6.11794
|
| 50 |
+
छ -6.12487
|
| 51 |
+
ौ -6.2094
|
| 52 |
+
ओ -6.35333
|
| 53 |
+
औ -6.65035
|
| 54 |
+
ठ -6.868
|
| 55 |
+
ध -6.86917
|
| 56 |
+
ऐ -7.01473
|
| 57 |
+
घ -7.14654
|
| 58 |
+
ढ -7.29567
|
| 59 |
+
ऊ -8.06043
|
| 60 |
+
, -9.41882
|
| 61 |
+
! -9.8114
|
| 62 |
+
. -10.3114
|
| 63 |
+
‘ -10.5707
|
| 64 |
+
ॉ -10.7803
|
| 65 |
+
’ -10.8678
|
| 66 |
+
? -10.8859
|
| 67 |
+
ष -11.0155
|
| 68 |
+
ण -11.4962
|
| 69 |
+
ृ -11.5722
|
| 70 |
+
( -12.4031
|
| 71 |
+
) -12.4188
|
| 72 |
+
ञ -12.6159
|
| 73 |
+
ऑ -12.7605
|
| 74 |
+
: -13.1119
|
| 75 |
+
2 -13.734
|
| 76 |
+
1 -14.0549
|
| 77 |
+
a -14.1693
|
| 78 |
+
3 -14.4469
|
| 79 |
+
। -14.4877
|
| 80 |
+
ऋ -14.5747
|
| 81 |
+
e -14.6213
|
| 82 |
+
ॅ -14.6213
|
| 83 |
+
4 -14.67
|
| 84 |
+
۔ -14.7754
|
| 85 |
+
r -14.8932
|
| 86 |
+
ङ -14.9577
|
| 87 |
+
u -15.0267
|
| 88 |
+
-15.2679
|
| 89 |
+
* -15.3632
|
| 90 |
+
5 -15.3632
|
| 91 |
+
i -15.3632
|
| 92 |
+
ٖ -15.3632
|
| 93 |
+
6 -15.4686
|
| 94 |
+
7 -15.4686
|
| 95 |
+
; -15.4686
|
| 96 |
+
b -15.4686
|
| 97 |
+
h -15.4686
|
| 98 |
+
s -15.4686
|
| 99 |
+
8 -15.5863
|
| 100 |
+
२ -15.5863
|
| 101 |
+
n -15.7199
|
| 102 |
+
v -15.7199
|
| 103 |
+
l -15.874
|
| 104 |
+
0 -16.0563
|
| 105 |
+
t -16.0563
|
| 106 |
+
z -16.0563
|
| 107 |
+
9 -16.2795
|
| 108 |
+
T -16.2795
|
| 109 |
+
d -16.2795
|
| 110 |
+
k -16.2795
|
| 111 |
+
o -16.2795
|
| 112 |
+
p -16.2795
|
| 113 |
+
أ -16.2795
|
| 114 |
+
३ -16.2795
|
| 115 |
+
_ -16.5672
|
| 116 |
+
ˈ -16.5672
|
| 117 |
+
“ -16.5672
|
| 118 |
+
+ -16.9726
|
| 119 |
+
y -16.9726
|
| 120 |
+
، -16.9726
|
| 121 |
+
ٍ -16.9726
|
| 122 |
+
ऩ -16.9726
|
| 123 |
+
— -16.9726
|
| 124 |
+
" -17.6658
|
| 125 |
+
H -17.6658
|
| 126 |
+
I -17.6658
|
| 127 |
+
K -17.6658
|
| 128 |
+
L -17.6658
|
| 129 |
+
O -17.6658
|
| 130 |
+
U -17.6658
|
| 131 |
+
f -17.6658
|
| 132 |
+
| -17.6658
|
| 133 |
+
£ -17.6658
|
| 134 |
+
س -17.6658
|
| 135 |
+
ع -17.6658
|
| 136 |
+
ِ -17.6658
|
| 137 |
+
ٓ -17.6658
|
| 138 |
+
ک -17.6658
|
| 139 |
+
ः -17.6658
|
h2u_2.0.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:51c6c40da98a2bc82c3a88200d46fdc00014ae92d4e5a0f1cad9b6b3ac3dcd00
|
| 3 |
+
size 48872290
|
nastaaliq_bpe.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d0e70a3a6728c54b310f196f1e3dda86ea92fac6573a0473fc4529c65f05c696
|
| 3 |
+
size 239191
|
nastaaliq_bpe.vocab
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[PAD] 0
|
| 2 |
+
[UNK] 0
|
| 3 |
+
[CLS] 0
|
| 4 |
+
[SEP] 0
|
| 5 |
+
▁ -1.43725
|
| 6 |
+
ا -2.49726
|
| 7 |
+
ی -2.61728
|
| 8 |
+
ہ -2.921
|
| 9 |
+
و -3.05754
|
| 10 |
+
ر -3.0802
|
| 11 |
+
ک -3.08283
|
| 12 |
+
ے -3.12091
|
| 13 |
+
ن -3.31991
|
| 14 |
+
م -3.35435
|
| 15 |
+
ت -3.45322
|
| 16 |
+
ں -3.52174
|
| 17 |
+
س -3.63601
|
| 18 |
+
ب -3.71838
|
| 19 |
+
ھ -3.7338
|
| 20 |
+
ل -3.84319
|
| 21 |
+
د -3.87154
|
| 22 |
+
ج -4.17633
|
| 23 |
+
گ -4.346
|
| 24 |
+
پ -4.47736
|
| 25 |
+
ش -4.76856
|
| 26 |
+
ئ -4.78603
|
| 27 |
+
چ -4.97538
|
| 28 |
+
آ -5.09129
|
| 29 |
+
ز -5.11549
|
| 30 |
+
خ -5.14642
|
| 31 |
+
ح -5.30285
|
| 32 |
+
ق -5.304
|
| 33 |
+
ف -5.32229
|
| 34 |
+
ع -5.4835
|
| 35 |
+
ٹ -5.71517
|
| 36 |
+
ص -5.91753
|
| 37 |
+
ڑ -5.97114
|
| 38 |
+
ط -6.11523
|
| 39 |
+
غ -6.14773
|
| 40 |
+
ظ -6.60046
|
| 41 |
+
ؔ -6.76266
|
| 42 |
+
ڈ -6.90553
|
| 43 |
+
ؤ -6.92384
|
| 44 |
+
ض -7.01754
|
| 45 |
+
ذ -7.02157
|
| 46 |
+
ۂ -7.43851
|
| 47 |
+
ث -7.59653
|
| 48 |
+
ٔ -9.09115
|
| 49 |
+
ٰ -9.32497
|
| 50 |
+
، -9.33314
|
| 51 |
+
' -9.48661
|
| 52 |
+
! -9.74387
|
| 53 |
+
ً -10.0546
|
| 54 |
+
ژ -10.1661
|
| 55 |
+
۔ -10.5828
|
| 56 |
+
؟ -10.8042
|
| 57 |
+
أ -10.8382
|
| 58 |
+
. -11.4494
|
| 59 |
+
( -12.3291
|
| 60 |
+
) -12.3343
|
| 61 |
+
ۓ -12.4488
|
| 62 |
+
ۃ -12.7346
|
| 63 |
+
ي -12.7899
|
| 64 |
+
ء -13.0172
|
| 65 |
+
: -13.0588
|
| 66 |
+
- -13.6901
|
| 67 |
+
ك -13.8409
|
| 68 |
+
ؐ -13.9386
|
| 69 |
+
۲ -14.0468
|
| 70 |
+
َ -14.0758
|
| 71 |
+
ه -14.2009
|
| 72 |
+
ّ -14.2348
|
| 73 |
+
2 -14.4666
|
| 74 |
+
ُ -14.5576
|
| 75 |
+
۳ -14.5576
|
| 76 |
+
1 -14.6577
|
| 77 |
+
ؑ -14.6577
|
| 78 |
+
۱ -14.7689
|
| 79 |
+
ؓ -14.9631
|
| 80 |
+
ٴ -14.9631
|
| 81 |
+
۴ -15.1172
|
| 82 |
+
* -15.2995
|
| 83 |
+
3 -15.4049
|
| 84 |
+
, -15.5227
|
| 85 |
+
۶ -15.5227
|
| 86 |
+
4 -15.6562
|
| 87 |
+
۵ -15.6562
|
| 88 |
+
i -15.8104
|
| 89 |
+
ِ -15.8104
|
| 90 |
+
۷ -15.8104
|
| 91 |
+
۸ -15.8104
|
| 92 |
+
ः -15.8104
|
| 93 |
+
‘ -15.8104
|
| 94 |
+
’ -15.8104
|
| 95 |
+
a -15.9927
|
| 96 |
+
e -15.9927
|
| 97 |
+
r -15.9927
|
| 98 |
+
s -15.9927
|
| 99 |
+
؛ -15.9927
|
| 100 |
+
ْ -15.9927
|
| 101 |
+
0 -16.2158
|
| 102 |
+
A -16.2158
|
| 103 |
+
b -16.2158
|
| 104 |
+
d -16.2158
|
| 105 |
+
n -16.2158
|
| 106 |
+
u -16.2158
|
| 107 |
+
5 -16.5035
|
| 108 |
+
7 -16.5035
|
| 109 |
+
I -16.5035
|
| 110 |
+
R -16.5035
|
| 111 |
+
ٍ -16.5035
|
| 112 |
+
“ -16.5035
|
| 113 |
+
8 -16.909
|
| 114 |
+
9 -16.909
|
| 115 |
+
C -16.909
|
| 116 |
+
E -16.909
|
| 117 |
+
G -16.909
|
| 118 |
+
L -16.909
|
| 119 |
+
U -16.909
|
| 120 |
+
t -16.909
|
| 121 |
+
y -16.909
|
| 122 |
+
ة -16.909
|
| 123 |
+
ٌ -16.909
|
| 124 |
+
۹ -16.909
|
| 125 |
+
+ -17.6021
|
| 126 |
+
6 -17.6021
|
| 127 |
+
? -17.6021
|
| 128 |
+
B -17.6021
|
| 129 |
+
M -17.6021
|
| 130 |
+
O -17.6021
|
| 131 |
+
T -17.6021
|
| 132 |
+
Z -17.6021
|
| 133 |
+
[ -17.6021
|
| 134 |
+
f -17.6021
|
| 135 |
+
h -17.6021
|
| 136 |
+
o -17.6021
|
| 137 |
+
z -17.6021
|
| 138 |
+
؎ -17.6021
|
| 139 |
+
ٓ -17.6021
|
| 140 |
+
ٖ -17.6021
|
| 141 |
+
۰ -17.6021
|
readme.md
ADDED
|
@@ -0,0 +1,165 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: other
|
| 3 |
+
language:
|
| 4 |
+
- ur
|
| 5 |
+
- hi
|
| 6 |
+
tags:
|
| 7 |
+
- pytorch
|
| 8 |
+
- transliterations
|
| 9 |
+
- urdu
|
| 10 |
+
- hindi
|
| 11 |
+
- RekhtaLabs
|
| 12 |
+
- Sequence2Sequence
|
| 13 |
+
- Transformers
|
| 14 |
+
---
|
| 15 |
+

|
| 16 |
+
# Hindi to Urdu Transliteration Model (Character-Level)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
This is a lightweight Transformer-based model trained for **character-level transliteration** of **Hindi poetry into Urdu script**. The model is specially tuned for literary and poetic text, making it ideal for applications involving shayari, nazm, or ghazals.
|
| 21 |
+
|
| 22 |
+
## Live Inference
|
| 23 |
+
https://rekhtalabs.org/demo/transliterate
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
## Model Overview
|
| 28 |
+
| Feature | Value |
|
| 29 |
+
|-------------------------|--------------------------|
|
| 30 |
+
| **Architecture** | Transformer (BART-style) |
|
| 31 |
+
| **Tokenizer** | Character-level |
|
| 32 |
+
| **Embedding Size** | 256 |
|
| 33 |
+
| **Hidden Size** | 256 (`d_model`) |
|
| 34 |
+
| **Feedforward Size** | 512 (`dim_feedforward`) |
|
| 35 |
+
| **Encoder Layers** | 3 (`num_layers`) |
|
| 36 |
+
| **Decoder Layers** | 3 (`num_layers`) |
|
| 37 |
+
| **Attention Heads** | 4 (`nhead`) |
|
| 38 |
+
| **Max Sequence Length** | 128 (`max_len`) |
|
| 39 |
+
|
| 40 |
+
---
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
## Usage
|
| 45 |
+
|
| 46 |
+
```python
|
| 47 |
+
from huggingface_hub import snapshot_download
|
| 48 |
+
|
| 49 |
+
# Download every file of the model repository into ./hi-2-ur-translit and
# keep the returned local path. `local_dir_use_symlinks` is intentionally not
# passed: recent huggingface_hub releases deprecate and ignore it (files are
# always materialised as real files in `local_dir`), so it only produced a
# deprecation warning.
path = snapshot_download(
    repo_id="rekhtalabs/hi-2-ur-translit",
    local_dir="./hi-2-ur-translit",
)
|
| 54 |
+
|
| 55 |
+
```

```bash
cd hi-2-ur-translit
|
| 56 |
+
```
|
| 57 |
+
|
| 58 |
+
```bash
|
| 59 |
+
pip install -r requirements.txt
|
| 60 |
+
```
|
| 61 |
+
```python
|
| 62 |
+
import torch
|
| 63 |
+
import sentencepiece as spm
|
| 64 |
+
from torch import nn
|
| 65 |
+
from collections import OrderedDict
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch of embeddings.

    A ``(1, max_len, d_model)`` table is built once in ``__init__``: even
    feature columns hold ``sin(position * div_term)`` and odd columns hold
    ``cos(position * div_term)``.

    Args:
        d_model: Size of the feature (last) dimension of the embeddings.
        max_len: Number of positions to precompute; inputs longer than this
            cannot be encoded.
    """

    def __init__(self, d_model, max_len=5000):
        super().__init__()
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-torch.log(torch.tensor(10000.0)) / d_model))
        angles = position.float() * div_term

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For an odd d_model there is one fewer odd column than even column,
        # so drop the surplus frequency instead of failing on a shape mismatch.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Registered as a buffer so the table follows ``module.to(device)``;
        # ``persistent=False`` keeps it out of ``state_dict`` so checkpoints
        # saved when ``pe`` was a plain attribute still load with strict keys.
        self.register_buffer("pe", pe.unsqueeze(0), persistent=False)

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions.

        ``x`` is indexed as (batch, sequence, feature): the table is sliced
        along dimension 1 and broadcast over dimension 0.
        """
        # ``.to`` is a no-op when the buffer already lives on ``x``'s device.
        return x + self.pe[:, : x.size(1)].to(x.device)
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
class Transformer(nn.Module):
    """Sequence-to-sequence model wrapping ``torch.nn.Transformer``.

    Source and target token ids are embedded separately, given positional
    encodings by a shared ``PositionalEncoding`` module, passed through an
    encoder-decoder ``nn.Transformer`` (``batch_first=True``) and projected
    to target-vocabulary logits.

    Args:
        src_vocab_size: Number of rows in the source embedding table.
        tgt_vocab_size: Number of rows in the target embedding table and
            width of the output logits.
        d_model: Embedding / model width.
        nhead: Number of attention heads.
        num_layers: Layer count used for both the encoder and the decoder.
        dim_feedforward: Width of the feed-forward sublayers.
        max_len: Number of positions precomputed by the positional encoder.
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, d_model=256, nhead=4, num_layers=3, dim_feedforward=512, max_len=128):
        super().__init__()
        # Attribute names are part of the checkpoint's key layout; keep them.
        self.src_tok_emb = nn.Embedding(src_vocab_size, d_model)
        self.tgt_tok_emb = nn.Embedding(tgt_vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, max_len)

        core_options = dict(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            dim_feedforward=dim_feedforward,
            batch_first=True,
        )
        self.transformer = nn.Transformer(**core_options)
        self.out = nn.Linear(d_model, tgt_vocab_size)

    def forward(self, src, tgt):
        """Return logits over the target vocabulary for every target position.

        ``src`` and ``tgt`` are token-id tensors laid out batch-first; a
        square subsequent (causal) mask is applied to the target side.
        """
        src_states = self.pos_encoder(self.src_tok_emb(src))
        tgt_states = self.pos_encoder(self.tgt_tok_emb(tgt))

        target_length = tgt_states.size(1)
        causal_mask = nn.Transformer.generate_square_subsequent_mask(target_length)
        causal_mask = causal_mask.to(src_states.device)

        decoded = self.transformer(src_states, tgt_states, tgt_mask=causal_mask)
        return self.out(decoded)
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
# SentencePiece tokenizers: Devanagari is the source (Hindi) side and
# Nastaliq is the target (Urdu) side of the model built below.
sp_nastaaliq = spm.SentencePieceProcessor(model_file='nastaaliq_bpe.model')
sp_devanagari = spm.SentencePieceProcessor(model_file='devanagari_bpe.model')

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Transformer(
    src_vocab_size=sp_devanagari.get_piece_size(),
    tgt_vocab_size=sp_nastaaliq.get_piece_size(),
).to(device)

# Load the trained weights onto the selected device.
checkpoint = torch.load("h2u_2.0.pt", map_location=device)
state_dict = checkpoint["model_state_dict"]

# Strip only a *leading* "module." from each key (presumably left by a
# DataParallel-style wrapper at training time -- confirm against the training
# script). `str.replace` would also have removed the substring from the
# middle of a key.
new_state_dict = OrderedDict(
    (key.removeprefix("module."), value) for key, value in state_dict.items()
)
model.load_state_dict(new_state_dict)
model.eval()  # inference mode: disables dropout
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
# Special-token ids used to frame sequences; they match the positions of
# [CLS] (2) and [SEP] (3) in both vocab files and the bos/eos ids in
# config.json.
BOS_ID = 2
EOS_ID = 3


def transliterate_hindi_to_urdu(text_hindi, max_len=128):
    """Transliterate Devanagari (Hindi) text into Nastaliq (Urdu) script.

    The input is encoded with the Devanagari tokenizer, framed with BOS/EOS
    and decoded greedily (argmax at each step) until EOS is produced or
    ``max_len`` steps have run.

    Args:
        text_hindi: Text in Devanagari script.
        max_len: Maximum source length (including BOS/EOS) and maximum
            number of decoding steps. Should not exceed the ``max_len`` the
            model's positional encoding was built with.

    Returns:
        The decoded Urdu string.
    """
    # Reserve two positions for BOS/EOS; never let the slice bound go negative.
    src_budget = max(max_len - 2, 0)
    src_ids = [BOS_ID] + sp_devanagari.encode(text_hindi)[:src_budget] + [EOS_ID]
    src_tensor = torch.tensor(src_ids).unsqueeze(0).to(device)

    tgt_ids = [BOS_ID]
    # Gradients are never needed here, so disable them once for the whole loop.
    with torch.no_grad():
        for _ in range(max_len):
            tgt_tensor = torch.tensor(tgt_ids).unsqueeze(0).to(device)
            output = model(src_tensor, tgt_tensor)
            # Greedy choice from the logits of the last target position.
            next_token_id = torch.argmax(output[0, -1, :]).item()
            if next_token_id == EOS_ID:
                break
            tgt_ids.append(next_token_id)

    # Drop the leading BOS before turning ids back into text.
    return sp_nastaaliq.decode(tgt_ids[1:])


def transliterate_urdu_to_hindi(text_urdu, max_len=128):
    """Backward-compatible wrapper kept under the original (misleading) name.

    Despite the name, the direction is Hindi -> Urdu: ``text_urdu`` is
    Devanagari input. Prefer ``transliterate_hindi_to_urdu``.
    """
    return transliterate_hindi_to_urdu(text_urdu, max_len=max_len)


res = transliterate_hindi_to_urdu("थम गए हों बहते बहते चम्पई रुख़्सार पर")
print(res)
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
```
|
| 152 |
+
## Output
|
| 153 |
+
```text
|
| 154 |
+
تھم گئے ہوں بہتے بہتے چمپئی رخسار پر
|
| 155 |
+
```
|
| 156 |
+
|
| 157 |
+
---
|
| 158 |
+
|
| 159 |
+
## Dataset
|
| 160 |
+
|
| 161 |
+
- Trained on approximately **1,300,000 Hindi-Urdu ghazal and nazm pairs**
|
| 162 |
+
- Sourced and curated for transliteration.
|
| 163 |
+
- Character-level alignment ensured for quality
|
| 164 |
+
|
| 165 |
+
---
|
requirements.txt
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
filelock==3.18.0
|
| 2 |
+
fsspec==2025.7.0
|
| 3 |
+
Jinja2==3.1.6
|
| 4 |
+
MarkupSafe==3.0.2
|
| 5 |
+
mpmath==1.3.0
|
| 6 |
+
networkx==3.4.2
|
| 7 |
+
nvidia-cublas-cu12==12.6.4.1
|
| 8 |
+
nvidia-cuda-cupti-cu12==12.6.80
|
| 9 |
+
nvidia-cuda-nvrtc-cu12==12.6.77
|
| 10 |
+
nvidia-cuda-runtime-cu12==12.6.77
|
| 11 |
+
nvidia-cudnn-cu12==9.5.1.17
|
| 12 |
+
nvidia-cufft-cu12==11.3.0.4
|
| 13 |
+
nvidia-cufile-cu12==1.11.1.6
|
| 14 |
+
nvidia-curand-cu12==10.3.7.77
|
| 15 |
+
nvidia-cusolver-cu12==11.7.1.2
|
| 16 |
+
nvidia-cusparse-cu12==12.5.4.2
|
| 17 |
+
nvidia-cusparselt-cu12==0.6.3
|
| 18 |
+
nvidia-nccl-cu12==2.26.2
|
| 19 |
+
nvidia-nvjitlink-cu12==12.6.85
|
| 20 |
+
nvidia-nvtx-cu12==12.6.77
|
| 21 |
+
sentencepiece==0.2.0
|
| 22 |
+
sympy==1.14.0
|
| 23 |
+
torch==2.7.1
|
| 24 |
+
triton==3.3.1
|
| 25 |
+
typing_extensions==4.14.1
|