hasankursun committed
Commit da0ab8a · verified · 1 parent: efcc88b

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -1,35 +1,36 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
1_Pooling/config.json ADDED
@@ -0,0 +1,10 @@
+ {
+   "word_embedding_dimension": 1024,
+   "pooling_mode_cls_token": true,
+   "pooling_mode_mean_tokens": false,
+   "pooling_mode_max_tokens": false,
+   "pooling_mode_mean_sqrt_len_tokens": false,
+   "pooling_mode_weightedmean_tokens": false,
+   "pooling_mode_lasttoken": false,
+   "include_prompt": true
+ }
2_Normalize/config.json ADDED
@@ -0,0 +1 @@
+ {}
LICENSE ADDED
@@ -0,0 +1,59 @@
+ Creative Commons Attribution-NonCommercial 4.0 International License
+
+ Copyright (c) 2025
+
+ This work is licensed under the Creative Commons Attribution-NonCommercial 4.0
+ International License. To view a copy of this license, visit
+ http://creativecommons.org/licenses/by-nc/4.0/ or send a letter to
+ Creative Commons, PO Box 1866, Mountain View, CA 94042, USA.
+
+ You are free to:
+ - Share — copy and redistribute the material in any medium or format
+ - Adapt — remix, transform, and build upon the material
+
+ Under the following terms:
+ - Attribution — You must give appropriate credit, provide a link to the license,
+   and indicate if changes were made. You may do so in any reasonable manner, but
+   not in any way that suggests the licensor endorses you or your use.
+ - NonCommercial — You may not use the material for commercial purposes.
+ - No additional restrictions — You may not apply legal terms or technological
+   measures that legally restrict others from doing anything the license permits.
+
+ ---
+
+ ## Acknowledgments
+
+ This model builds upon several foundational works and contributions:
+
+ ### Base Architecture
+ - **XLM-RoBERTa**: This model uses XLM-RoBERTa as its base architecture
+   - Original paper: "Unsupervised Cross-lingual Representation Learning at Scale"
+   - Authors: Conneau et al.
+   - License: MIT License
+
+ ### Training Methodology
+ We are grateful to the Beijing Academy of Artificial Intelligence (BAAI) for their
+ contributions to embedding research:
+
+ - **RetroMAE**: Self-supervised pre-training methodology
+   - Paper: "RetroMAE: Pre-Training Retrieval-oriented Language Models Via Masked Auto-Encoder"
+   - Authors: BAAI
+   - arXiv: https://arxiv.org/abs/2205.12035
+
+ - **BGE-M3**: Multi-lingual embedding research
+   - Paper: "BGE M3-Embedding: Multi-Lingual, Multi-Functionality, Multi-Granularity Text Embeddings"
+   - Authors: BAAI
+   - arXiv: https://arxiv.org/abs/2402.03216
+
+ ### Matryoshka Representation Learning
+ - Paper: "Matryoshka Representation Learning"
+ - Authors: Kusupati et al.
+ - Year: 2022
+ - arXiv: https://arxiv.org/abs/2205.13147
+
+ ### Training Framework
+ - Sentence Transformers: https://www.sbert.net
+ - License: Apache 2.0
+
+ Users are encouraged to cite this model and the foundational works when using
+ it in research or applications.
README.md CHANGED
@@ -1,3 +1,439 @@
  ---
+ library_name: sentence-transformers
+ pipeline_tag: sentence-similarity
+ tags:
+ - sentence-transformers
+ - feature-extraction
+ - sentence-similarity
+ - matryoshka
+ - multilingual
+ - embeddings
+ - xlm-roberta
+ language:
+ - multilingual
+ - en
+ - ar
+ - de
+ - es
+ - fr
+ - zh
+ - ru
+ - tr
+ - ko
+ - ja
+ - it
+ - pt
+ - nl
  license: cc-by-nc-4.0
+ base_model: xlm-roberta-base
+ metrics:
+ - cosine_accuracy
+ - cosine_precision
+ - cosine_recall
+ - cosine_f1
+ - cosine_ap
+ - dot_accuracy
+ - dot_precision
+ - dot_recall
+ - dot_f1
+ - dot_ap
+ - manhattan_accuracy
+ - manhattan_precision
+ - manhattan_recall
+ - manhattan_f1
+ - manhattan_ap
+ - euclidean_accuracy
+ - euclidean_precision
+ - euclidean_recall
+ - euclidean_f1
+ - euclidean_ap
+ model-index:
+ - name: Matryoshka Text Embedding v1
+   results:
+   - task:
+       type: information-retrieval
+       name: Information Retrieval
+     dataset:
+       name: SciFact
+       type: scifact
+       config: default
+       split: test
+       revision: d56462d0e63a25450459c4f213e49ffdb866f7f9
+     metrics:
+     - type: ndcg_at_10
+       value: 0.63084
+       name: NDCG@10
+     - type: ndcg_at_1
+       value: 0.51
+       name: NDCG@1
+     - type: ndcg_at_3
+       value: 0.578
+       name: NDCG@3
+     - type: ndcg_at_5
+       value: 0.60648
+       name: NDCG@5
+   - task:
+       type: semantic-similarity
+       name: Semantic Similarity
+     dataset:
+       name: STSBenchmark
+       type: stsbenchmark
+       config: default
+       split: test
+       revision: b0fddb56ed78048fa8b90373c8a3cfc37b684831
+     metrics:
+     - type: spearman
+       value: 0.850616
+       name: Spearman
+     - type: pearson
+       value: 0.838067
+       name: Pearson
+   - task:
+       type: semantic-similarity
+       name: Semantic Similarity
+     dataset:
+       name: STS17
+       type: sts17-crosslingual-sts
+       config: en-en
+       split: test
+       revision: faeb762787bd10488a50c8b5be4a3b82e411949c
+     metrics:
+     - type: spearman
+       value: 0.873981
+       name: Spearman (en-en)
+   - task:
+       type: semantic-similarity
+       name: Semantic Similarity
+     dataset:
+       name: STS17
+       type: sts17-crosslingual-sts
+       config: es-es
+       split: test
+       revision: faeb762787bd10488a50c8b5be4a3b82e411949c
+     metrics:
+     - type: spearman
+       value: 0.88079
+       name: Spearman (es-es)
+   - task:
+       type: semantic-similarity
+       name: Semantic Similarity
+     dataset:
+       name: STS17
+       type: sts17-crosslingual-sts
+       config: ko-ko
+       split: test
+       revision: faeb762787bd10488a50c8b5be4a3b82e411949c
+     metrics:
+     - type: spearman
+       value: 0.821019
+       name: Spearman (ko-ko)
+   - task:
+       type: semantic-similarity
+       name: Semantic Similarity
+     dataset:
+       name: STS17
+       type: sts17-crosslingual-sts
+       config: ar-ar
+       split: test
+       revision: faeb762787bd10488a50c8b5be4a3b82e411949c
+     metrics:
+     - type: spearman
+       value: 0.805643
+       name: Spearman (ar-ar)
+   - task:
+       type: semantic-similarity
+       name: Semantic Similarity
+     dataset:
+       name: STS17
+       type: sts17-crosslingual-sts
+       config: en-de
+       split: test
+       revision: faeb762787bd10488a50c8b5be4a3b82e411949c
+     metrics:
+     - type: spearman
+       value: 0.824516
+       name: Spearman (en-de)
+   - task:
+       type: semantic-similarity
+       name: Semantic Similarity
+     dataset:
+       name: STS17
+       type: sts17-crosslingual-sts
+       config: nl-en
+       split: test
+       revision: faeb762787bd10488a50c8b5be4a3b82e411949c
+     metrics:
+     - type: spearman
+       value: 0.819011
+       name: Spearman (nl-en)
+   - task:
+       type: semantic-similarity
+       name: Semantic Similarity
+     dataset:
+       name: STS17
+       type: sts17-crosslingual-sts
+       config: it-en
+       split: test
+       revision: faeb762787bd10488a50c8b5be4a3b82e411949c
+     metrics:
+     - type: spearman
+       value: 0.815176
+       name: Spearman (it-en)
+   - task:
+       type: semantic-similarity
+       name: Semantic Similarity
+     dataset:
+       name: STS17
+       type: sts17-crosslingual-sts
+       config: fr-en
+       split: test
+       revision: faeb762787bd10488a50c8b5be4a3b82e411949c
+     metrics:
+     - type: spearman
+       value: 0.815679
+       name: Spearman (fr-en)
+   - task:
+       type: semantic-similarity
+       name: Semantic Similarity
+     dataset:
+       name: STS17
+       type: sts17-crosslingual-sts
+       config: en-tr
+       split: test
+       revision: faeb762787bd10488a50c8b5be4a3b82e411949c
+     metrics:
+     - type: spearman
+       value: 0.748444
+       name: Spearman (en-tr)
+   - task:
+       type: semantic-similarity
+       name: Semantic Similarity
+     dataset:
+       name: STS17
+       type: sts17-crosslingual-sts
+       config: es-en
+       split: test
+       revision: faeb762787bd10488a50c8b5be4a3b82e411949c
+     metrics:
+     - type: spearman
+       value: 0.766019
+       name: Spearman (es-en)
+   - task:
+       type: semantic-similarity
+       name: Semantic Similarity
+     dataset:
+       name: STS17
+       type: sts17-crosslingual-sts
+       config: en-ar
+       split: test
+       revision: faeb762787bd10488a50c8b5be4a3b82e411949c
+     metrics:
+     - type: spearman
+       value: 0.71912
+       name: Spearman (en-ar)
  ---
+
+ # Matryoshka Text Embedding v1
+
+ A multilingual text embedding model trained with Matryoshka Representation Learning, supporting flexible embedding dimensions from 64D to 1024D.
+
+ ## Model Overview
+
+ This model implements Matryoshka Representation Learning, which lets you truncate embeddings to smaller dimensions with only a modest loss in quality, so you can trade accuracy against speed and storage to fit your workload.
+
+ ### Key Features
+
+ - **Flexible Dimensions**: Choose from 7 embedding sizes (64D, 128D, 256D, 384D, 512D, 768D, 1024D)
+ - **Multilingual Support**: Trained on 100+ languages
+ - **Base Architecture**: XLM-RoBERTa
+ - **Max Sequence Length**: 8192 tokens
+
+ ## Quick Start
+
+ ### Installation
+
+ ```bash
+ pip install sentence-transformers
+ ```
+
+ ### Basic Usage
+
+ ```python
+ from sentence_transformers import SentenceTransformer
+
+ # Load the model
+ model = SentenceTransformer('matryoshka-text-embedding-v1')
+
+ # Full precision (1024D)
+ embeddings = model.encode(["Your text here"])
+
+ # Balanced mode (512D) - recommended for most use cases
+ embeddings = model.encode(["Your text here"], truncate_dim=512)
+
+ # Fast mode (256D) - for high-throughput applications
+ embeddings = model.encode(["Your text here"], truncate_dim=256)
+
+ # Ultra-fast mode (128D) - for real-time applications
+ embeddings = model.encode(["Your text here"], truncate_dim=128)
+ ```
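+
+ Because the embeddings are Matryoshka-trained, you can also encode once at full precision and derive smaller vectors offline. A minimal sketch of that idea (the slice-then-renormalize helper below is illustrative, not this model's internal code; the `truncate_dim` argument remains the supported path):
+
+ ```python
+ import numpy as np
+
+ # Encode once at full precision (1024D), reusing `model` from above
+ full = model.encode(["Your text here"])
+
+ def truncate(embeddings: np.ndarray, dim: int) -> np.ndarray:
+     """Keep the first `dim` components and re-normalize to unit length,
+     so cosine and inner-product scores stay comparable."""
+     cut = embeddings[:, :dim]
+     return cut / np.linalg.norm(cut, axis=1, keepdims=True)
+
+ emb_512 = truncate(full, 512)  # matches truncate_dim=512 up to normalization
+ emb_128 = truncate(full, 128)
+ ```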
+
+ ## Performance Benchmarks
+
+ ### SciFact (Scientific Document Retrieval)
+
+ | Dimension | NDCG@10 | Relative Performance |
+ |-----------|---------|----------------------|
+ | **1024D** | 0.6308 | 100.0% |
+ | **768D** | 0.6277 | 99.5% |
+ | **512D** | 0.6114 | 96.9% |
+ | **384D** | 0.6035 | 95.7% |
+ | **256D** | 0.5614 | 89.0% |
+ | **128D** | 0.4732 | 75.0% |
+ | **64D** | 0.3317 | 52.6% |
+
294
+ ### STSBenchmark (English Semantic Similarity)
295
+
296
+ - **Spearman**: 0.8506 (1024D)
297
+ - **Pearson**: 0.8381 (1024D)
298
+
299
+ ### STS17 (Multilingual Semantic Similarity)
300
+
301
+ **Average Spearman Correlation across languages: 0.8096**
302
+
303
+ Performance by language pair (1024D):
304
+ - Spanish (es-es): 0.8808
305
+ - English (en-en): 0.8740
306
+ - German (en-de): 0.8245
307
+ - Korean (ko-ko): 0.8210
308
+ - French (fr-en): 0.8157
309
+ - Italian (it-en): 0.8152
310
+ - Dutch (nl-en): 0.8190
311
+ - Arabic (ar-ar): 0.8056
312
+ - Turkish (en-tr): 0.7484
313
+ - Spanish-English (es-en): 0.7660
314
+ - English-Arabic (en-ar): 0.7191
+
+ ## Use Cases
+
+ ### High Accuracy Applications (768D-1024D)
+ - Scientific literature search
+ - Legal document retrieval
+ - Medical information systems
+
+ ### Balanced Production (512D) - Recommended
+ - General web search
+ - E-commerce product search
+ - Content recommendation engines
+ - Knowledge base retrieval
+
+ ### High-Throughput Systems (256D-384D)
+ - Real-time search APIs
+ - Large-scale document indexing
+ - Social media search
+
+ ### Mobile & Edge Devices (64D-128D)
+ - Mobile applications
+ - IoT devices
+ - Browser-based search
+ - Resource-constrained environments
+
340
+ ## Advanced Usage
341
+
342
+ ### Semantic Search
343
+
344
+ ```python
345
+ import numpy as np
346
+ from sentence_transformers import util
347
+
348
+ # Index documents with 512D (optimal balance)
349
+ documents = [
350
+ "Artificial intelligence is transforming healthcare.",
351
+ "Machine learning models require large datasets.",
352
+ "Quantum computing promises exponential speedups."
353
+ ]
354
+
355
+ doc_embeddings = model.encode(documents, truncate_dim=512)
356
+
357
+ # Search with same dimension
358
+ query = "How is AI used in medicine?"
359
+ query_embedding = model.encode(query, truncate_dim=512)
360
+
361
+ # Compute similarities
362
+ similarities = util.cos_sim(query_embedding, doc_embeddings)
363
+ top_result = np.argmax(similarities)
364
+
365
+ print(f"Most relevant: {documents[top_result]}")
366
+ ```
+
+ ### Integration with FAISS
+
+ ```python
+ import faiss
+
+ # Create 512D embeddings (reusing `model`, `documents`, and `query` from above)
+ embeddings = model.encode(documents, truncate_dim=512).astype('float32')
+
+ # Build a FAISS inner-product index; on L2-normalized vectors,
+ # inner product equals cosine similarity
+ dimension = 512
+ index = faiss.IndexFlatIP(dimension)
+ faiss.normalize_L2(embeddings)
+ index.add(embeddings)
+
+ # Search (k must not exceed the number of indexed documents)
+ query_embedding = model.encode(query, truncate_dim=512).astype('float32').reshape(1, -1)
+ faiss.normalize_L2(query_embedding)
+ distances, indices = index.search(query_embedding, k=3)
+ ```
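+
+ Note that truncated embeddings are only comparable within a single dimension: an index built at 512D must be queried with 512D vectors, so store the chosen dimension alongside the index. `IndexFlatIP` over L2-normalized vectors performs exact cosine-similarity search; for larger corpora, an approximate index such as `IndexHNSWFlat` trades a little recall for speed.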
+
+ ## Technical Details
+
+ ### Architecture
+ - **Base**: XLM-RoBERTa transformer encoder
+ - **Embedding Dimensions**: 1024 (full) with 7 supported truncation levels
+ - **Max Sequence Length**: 8192 tokens
+ - **Vocabulary Size**: 250,002 tokens
+ - **Parameters**: ~568M
+
+ ### Training
+ - **Technique**: Matryoshka Representation Learning (see the sketch below)
+ - **Languages**: 100+ languages
+ - **Max Input Length**: 8192 tokens
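+
+ The exact training recipe is not published. As a rough sketch, Matryoshka training is typically set up with sentence-transformers' `MatryoshkaLoss`, which applies a base contrastive loss at every nested dimension; the single training pair and the reuse of the released checkpoint below are illustrative assumptions, not the authors' actual setup:
+
+ ```python
+ from torch.utils.data import DataLoader
+ from sentence_transformers import SentenceTransformer, InputExample, losses
+
+ model = SentenceTransformer('matryoshka-text-embedding-v1')
+
+ # Hypothetical positive pair (query, relevant document)
+ train_examples = [InputExample(texts=["How is AI used in medicine?",
+                                       "Artificial intelligence is transforming healthcare."])]
+ train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=1)
+
+ # Wrap a contrastive loss so it is optimized at every Matryoshka dimension
+ base_loss = losses.MultipleNegativesRankingLoss(model)
+ train_loss = losses.MatryoshkaLoss(model, base_loss,
+                                    matryoshka_dims=[1024, 768, 512, 384, 256, 128, 64])
+
+ model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=0)
+ ```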
+
+ ## Model Files
+
+ - `model.safetensors` - Model weights
+ - `config.json` - Model configuration
+ - `tokenizer.json` - Tokenizer data
+ - `lumees_config.json` - Matryoshka-specific configuration
+
+ ## License
+
+ This model is released under the **CC-BY-NC-4.0** (Creative Commons Attribution-NonCommercial 4.0 International) license.
+
+ See the [LICENSE](LICENSE) file for full details and acknowledgments.
+
+ ## Acknowledgments
+
+ This model builds upon important foundational work:
+
+ - **XLM-RoBERTa**: Base architecture for multilingual representations
+ - **BAAI**: For their contributions through the RetroMAE and BGE-M3 papers
+ - **Matryoshka Representation Learning**: Training methodology (Kusupati et al., 2022)
+
+ ## Citation
+
+ If you use this model in your research or application, please cite:
+
+ ```bibtex
+ @misc{matryoshka-text-embedding-v1,
+   title={Matryoshka Text Embedding v1},
+   author={Hasan Kurşun and Kerem Berkay Yanık},
+   year={2025},
+   url={https://huggingface.co/matryoshka-text-embedding-v1},
+   organization={Lumees},
+   contact={[email protected]},
+   website={https://lumees.io}
+ }
+ ```
README_HF.md ADDED
@@ -0,0 +1,89 @@
+ ---
+ library_name: sentence-transformers
+ pipeline_tag: sentence-similarity
+ tags:
+ - sentence-transformers
+ - feature-extraction
+ - sentence-similarity
+ - matryoshka
+ - multilingual
+ - embeddings
+ language:
+ - multilingual
+ - en
+ - ar
+ - de
+ - es
+ - fr
+ - zh
+ - ru
+ - tr
+ - ko
+ - ja
+ license: cc-by-nc-4.0
+ base_model: xlm-roberta-base
+ ---
+
+ # Matryoshka Text Embedding v1
+
+ **Matryoshka Embedding Model with Flexible Dimensions**
+
+ This is a [sentence-transformers](https://www.SBERT.net) model trained with Matryoshka Representation Learning,
+ allowing flexible dimension truncation from 64D to 1024D.
+
+ ## Model Details
+
+ - **Model Type**: Sentence Transformer with Matryoshka Representation Learning
+ - **Base Architecture**: XLM-RoBERTa
+ - **Dimensions**: 64, 128, 256, 384, 512, 768, 1024
+ - **Max Sequence Length**: 8192 tokens
+ - **Languages**: 100+ languages
+ - **Output Dimensionality**: 1024 (with 7 truncation options)
+
+ ## Usage
+ ```python
+ from sentence_transformers import SentenceTransformer
+
+ model = SentenceTransformer('matryoshka-text-embedding-v1')
+
+ # Full precision
+ embeddings = model.encode(["Hello World"])
+
+ # Optimized for production (recommended)
+ embeddings = model.encode(["Hello World"], truncate_dim=512)
+ ```
+
+ See [README.md](README.md) for detailed documentation.
+
+ ## Performance (Self-Reported)
+
+ | Task | Metric | Score |
+ |------|--------|-------|
+ | SciFact | NDCG@10 | 0.6308 |
+ | STS17 | Spearman (avg. across pairs) | 0.8096 |
+ | STSBenchmark | Spearman | 0.8506 |
+
+ ## License
+
+ CC-BY-NC-4.0 - see the LICENSE file for details and acknowledgments.
+
+ ## Acknowledgments
+
+ This model builds upon:
+ - **XLM-RoBERTa**: Base architecture
+ - **BAAI**: RetroMAE and BGE-M3 research contributions
+ - **Matryoshka Representation Learning**: Training methodology
+
+ ## Citation
+
+ ```bibtex
+ @misc{matryoshka-text-embedding-v1,
+   title={Matryoshka Text Embedding v1},
+   author={Hasan Kurşun and Kerem Berkay Yanık},
+   year={2025},
+   url={https://huggingface.co/matryoshka-text-embedding-v1},
+   organization={Lumees},
+   contact={[email protected]},
+   website={https://lumees.io}
+ }
+ ```
config.json ADDED
@@ -0,0 +1,27 @@
+ {
+   "architectures": [
+     "XLMRobertaModel"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "bos_token_id": 0,
+   "classifier_dropout": null,
+   "dtype": "float32",
+   "eos_token_id": 2,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 1024,
+   "initializer_range": 0.02,
+   "intermediate_size": 4096,
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 8194,
+   "model_type": "xlm-roberta",
+   "num_attention_heads": 16,
+   "num_hidden_layers": 24,
+   "output_past": true,
+   "pad_token_id": 1,
+   "position_embedding_type": "absolute",
+   "transformers_version": "4.57.1",
+   "type_vocab_size": 1,
+   "use_cache": true,
+   "vocab_size": 250002
+ }
config_sentence_transformers.json ADDED
@@ -0,0 +1,14 @@
+ {
+   "__version__": {
+     "sentence_transformers": "5.1.2",
+     "transformers": "4.57.1",
+     "pytorch": "2.6.0+cu124"
+   },
+   "model_type": "SentenceTransformer",
+   "prompts": {
+     "query": "",
+     "document": ""
+   },
+   "default_prompt_name": null,
+   "similarity_fn_name": "cosine"
+ }
lumees_config.json ADDED
@@ -0,0 +1,18 @@
+ {
+   "model_name": "matryoshka-text-embedding-v1",
+   "model_type": "matryoshka",
+   "version": "1.0.0",
+   "matryoshka_dimensions": [
+     64,
+     128,
+     256,
+     384,
+     512,
+     768,
+     1024
+   ],
+   "default_dimension": 1024,
+   "embedding_dimension": 1024,
+   "max_sequence_length": 8192,
+   "is_matryoshka": true
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:58aa06c15337c73f189f161594b536c74a4be096df6143864e09c9e853676bb0
+ size 2271064456
modules.json ADDED
@@ -0,0 +1,20 @@
+ [
+   {
+     "idx": 0,
+     "name": "0",
+     "path": "",
+     "type": "sentence_transformers.models.Transformer"
+   },
+   {
+     "idx": 1,
+     "name": "1",
+     "path": "1_Pooling",
+     "type": "sentence_transformers.models.Pooling"
+   },
+   {
+     "idx": 2,
+     "name": "2",
+     "path": "2_Normalize",
+     "type": "sentence_transformers.models.Normalize"
+   }
+ ]
sentence_bert_config.json ADDED
@@ -0,0 +1,4 @@
+ {
+   "max_seq_length": 8192,
+   "do_lower_case": false
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,51 @@
+ {
+   "bos_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "cls_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": {
+     "content": "<mask>",
+     "lstrip": true,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e4f7e21bec3fb0044ca0bb2d50eb5d4d8c596273c422baef84466d2c73748b9c
+ size 17083053
tokenizer_config.json ADDED
@@ -0,0 +1,63 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<pad>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "250001": {
+       "content": "<mask>",
+       "lstrip": true,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "bos_token": "<s>",
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "<s>",
+   "eos_token": "</s>",
+   "extra_special_tokens": {},
+   "mask_token": "<mask>",
+   "max_length": 8192,
+   "model_max_length": 8192,
+   "pad_to_multiple_of": null,
+   "pad_token": "<pad>",
+   "pad_token_type_id": 0,
+   "padding_side": "right",
+   "sep_token": "</s>",
+   "sp_model_kwargs": {},
+   "stride": 0,
+   "tokenizer_class": "XLMRobertaTokenizerFast",
+   "truncation_side": "right",
+   "truncation_strategy": "longest_first",
+   "unk_token": "<unk>"
+ }