michael-guenther committed on
Commit
01983f3
·
verified ·
1 Parent(s): 422967f

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +82 -0
README.md CHANGED
@@ -216,6 +216,88 @@ print(similarity)
216
  ```
217
  </details>
218
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
219
  ## Training & Evaluation
220
 
221
  Please refer to our [technical report of jina-code-embeddings](https://arxiv.org/abs/2508.21290) for training details and benchmarks.
 
216
  ```
217
  </details>
218
 
219
+ <details>
220
+ <summary>via <a href="https://github.com/vllm-project/vllm">vLLM</a></summary>
221
+
222
+ ```python
223
+
224
+ import torch
225
+ import torch.nn.functional as F
226
+ from vllm import LLM
227
+
228
+ INSTRUCTION_CONFIG = {
229
+ "nl2code": {
230
+ "query": "Find the most relevant code snippet given the following query:\n",
231
+ "passage": "Candidate code snippet:\n"
232
+ },
233
+ "qa": {
234
+ "query": "Find the most relevant answer given the following question:\n",
235
+ "passage": "Candidate answer:\n"
236
+ },
237
+ "code2code": {
238
+ "query": "Find an equivalent code snippet given the following code snippet:\n",
239
+ "passage": "Candidate code snippet:\n"
240
+ },
241
+ "code2nl": {
242
+ "query": "Find the most relevant comment given the following code snippet:\n",
243
+ "passage": "Candidate comment:\n"
244
+ },
245
+ "code2completion": {
246
+ "query": "Find the most relevant completion given the following start of code snippet:\n",
247
+ "passage": "Candidate completion:\n"
248
+ }
249
+ }
250
+
251
+ def add_instruction(instruction, text):
252
+ return f"{instruction}{text}"
253
+
254
+ def cosine_similarity(x, y):
255
+ x = F.normalize(x, p=2, dim=1)
256
+ y = F.normalize(y, p=2, dim=1)
257
+ return x @ y.T
258
+
259
+ # Build the queries and documents
260
+ queries = [
261
+ add_instruction(INSTRUCTION_CONFIG["nl2code"]["query"], "print hello world in python"),
262
+ add_instruction(INSTRUCTION_CONFIG["nl2code"]["query"], "initialize array of 5 zeros in c++"),
263
+ ]
264
+ documents = [
265
+ add_instruction(INSTRUCTION_CONFIG["nl2code"]["passage"], "print('Hello World!')"),
266
+ add_instruction(INSTRUCTION_CONFIG["nl2code"]["passage"], "int arr[5] = {0, 0, 0, 0, 0};"),
267
+ ]
268
+ all_inputs = queries + documents
269
+
270
+ # vLLM embedding model
271
+ llm = LLM(
272
+ model="jinaai/jina-code-embeddings-0.5b",
273
+ hf_overrides={"architectures": ["Qwen2ForCausalLM"]},
274
+ task="embed"
275
+ )
276
+
277
+ # Encode with vLLM
278
+ outputs = llm.encode(all_inputs)
279
+
280
+ # Collect embeddings into a single tensor
281
+ emb_list = []
282
+ for out in outputs:
283
+ vec = out.outputs.data.detach()
284
+ emb_list.append(vec)
285
+ embeddings = torch.stack(emb_list, dim=0)
286
+
287
+ # Split into query and passage embeddings
288
+ n_q = len(queries)
289
+ query_embeddings = embeddings[:n_q]
290
+ passage_embeddings = embeddings[n_q:]
291
+
292
+ # Cosine similarity matrix (queries x documents)
293
+ scores = cosine_similarity(query_embeddings, passage_embeddings)
294
+ print(scores)
295
+ # tensor([[0.8171, 0.1230],
296
+ # [0.1207, 0.5513]])
297
+ ```
298
+
299
+ </details>
300
+
301
  ## Training & Evaluation
302
 
303
  Please refer to our [technical report of jina-code-embeddings](https://arxiv.org/abs/2508.21290) for training details and benchmarks.