Update README.md
Browse files
README.md
CHANGED
|
@@ -216,6 +216,88 @@ print(similarity)
|
|
| 216 |
```
|
| 217 |
</details>
|
| 218 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 219 |
## Training & Evaluation
|
| 220 |
|
| 221 |
Please refer to our [technical report of jina-code-embeddings](https://arxiv.org/abs/2508.21290) for training details and benchmarks.
|
|
|
|
| 216 |
```
|
| 217 |
</details>
|
| 218 |
|
| 219 |
+
<details>
|
| 220 |
+
<summary>via <a href="https://github.com/vllm-project/vllm">vLLM</a></summary>
|
| 221 |
+
|
| 222 |
+
```python
|
| 223 |
+
|
| 224 |
+
import torch
|
| 225 |
+
import torch.nn.functional as F
|
| 226 |
+
from vllm import LLM
|
| 227 |
+
|
| 228 |
+
# Instruction prefixes, keyed by task name and then by role.
# "query" is prepended to the search-side text and "passage" to each
# candidate text before they are embedded (see the usage further down).
INSTRUCTION_CONFIG = {
    "nl2code": {
        "query": "Find the most relevant code snippet given the following query:\n",
        "passage": "Candidate code snippet:\n",
    },
    "qa": {
        "query": "Find the most relevant answer given the following question:\n",
        "passage": "Candidate answer:\n",
    },
    "code2code": {
        "query": "Find an equivalent code snippet given the following code snippet:\n",
        "passage": "Candidate code snippet:\n",
    },
    "code2nl": {
        "query": "Find the most relevant comment given the following code snippet:\n",
        "passage": "Candidate comment:\n",
    },
    "code2completion": {
        "query": "Find the most relevant completion given the following start of code snippet:\n",
        "passage": "Candidate completion:\n",
    },
}
|
| 250 |
+
|
| 251 |
+
def add_instruction(instruction: str, text: str) -> str:
    """Return *text* with *instruction* prepended.

    The two strings are concatenated as-is; no separator is inserted, so
    any newline or space between them must already be part of
    *instruction*.
    """
    return f"{instruction}{text}"
|
| 253 |
+
|
| 254 |
+
def cosine_similarity(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    """Return the pairwise cosine-similarity matrix between rows of *x* and *y*.

    Each row of both inputs is L2-normalized, then the result is
    ``x @ y.T``: entry ``(i, j)`` is the cosine similarity between row
    ``i`` of *x* and row ``j`` of *y*.

    A 1-D input is treated as a single row, so one vector can be scored
    against a matrix of candidates without reshaping it first.
    """
    # Promote lone vectors to shape (1, dim); inputs that already have
    # two or more dimensions pass through unchanged.
    x = torch.atleast_2d(x)
    y = torch.atleast_2d(y)
    x = F.normalize(x, p=2, dim=1)
    y = F.normalize(y, p=2, dim=1)
    return x @ y.T
|
| 258 |
+
|
| 259 |
+
# Build the queries and documents. Each text is prefixed with the
# "nl2code" instruction for its role (query vs. passage).
queries = [
    add_instruction(INSTRUCTION_CONFIG["nl2code"]["query"], "print hello world in python"),
    add_instruction(INSTRUCTION_CONFIG["nl2code"]["query"], "initialize array of 5 zeros in c++"),
]
documents = [
    add_instruction(INSTRUCTION_CONFIG["nl2code"]["passage"], "print('Hello World!')"),
    add_instruction(INSTRUCTION_CONFIG["nl2code"]["passage"], "int arr[5] = {0, 0, 0, 0, 0};"),
]
# Queries come first so the embeddings can be split by position below.
all_inputs = queries + documents

# vLLM embedding model
llm = LLM(
    model="jinaai/jina-code-embeddings-0.5b",
    hf_overrides={"architectures": ["Qwen2ForCausalLM"]},
    task="embed",
)

# Encode with vLLM (one output per input, in input order)
outputs = llm.encode(all_inputs)

# Collect embeddings into a single tensor, one row per input
emb_list = [out.outputs.data.detach() for out in outputs]
embeddings = torch.stack(emb_list, dim=0)

# Split into query and passage embeddings
n_q = len(queries)
query_embeddings = embeddings[:n_q]
passage_embeddings = embeddings[n_q:]

# Cosine similarity matrix (queries x documents)
scores = cosine_similarity(query_embeddings, passage_embeddings)
print(scores)
# tensor([[0.8171, 0.1230],
#         [0.1207, 0.5513]])
|
| 297 |
+
```
|
| 298 |
+
|
| 299 |
+
</details>
|
| 300 |
+
|
| 301 |
## Training & Evaluation
|
| 302 |
|
| 303 |
Please refer to our [technical report of jina-code-embeddings](https://arxiv.org/abs/2508.21290) for training details and benchmarks.
|