Codefuse Embeddings
C2LLM: Advanced Code Embeddings for Deep Semantic Understanding
C2LLM (Code Contrastive Large Language Model) is a powerful new model for generating code embeddings, designed to capture the deep semantics of source code. It is built on Qwen2.5-Coder, inheriting its exceptional code comprehension capabilities. Instead of conventional mean pooling or last-token pooling, C2LLM uses PMA (Pooling by Multi-head Attention), which lets the model dynamically focus on the most critical parts of the code and produce a more informative and robust embedding. C2LLM is designed to be a go-to model for tasks like code search and Retrieval-Augmented Generation (RAG).
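To make the pooling step concrete, the sketch below shows the general PMA idea (a single learned seed query attending over all token hidden states, as in the Set Transformer). It is an illustrative toy module under that assumption, not C2LLM's actual implementation; the class name PMA and its parameters are chosen here for illustration.
import torch
import torch.nn as nn

class PMA(nn.Module):
    # Illustrative Pooling-by-Multi-head-Attention sketch (not C2LLM's code).
    # One learned seed query attends over the token hidden states, so the
    # pooled embedding can weight the most informative tokens instead of
    # averaging them uniformly or taking only the last token.
    def __init__(self, hidden_dim: int, num_heads: int = 8):
        super().__init__()
        self.seed = nn.Parameter(torch.randn(1, 1, hidden_dim))
        self.attn = nn.MultiheadAttention(hidden_dim, num_heads, batch_first=True)

    def forward(self, token_states, key_padding_mask=None):
        # token_states: (batch, seq_len, hidden_dim)
        query = self.seed.expand(token_states.size(0), -1, -1)
        pooled, _ = self.attn(query, token_states, token_states,
                              key_padding_mask=key_padding_mask)
        return pooled.squeeze(1)  # (batch, hidden_dim)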
Usage with Transformers
from transformers import AutoModel
import torch
model_path = "codefuse-ai/C2LLM-7B"
# Load the model (trust_remote_code pulls in the repo's custom encode method)
model = AutoModel.from_pretrained(model_path, torch_dtype=torch.bfloat16, trust_remote_code=True)
# Prepare the data
sentences = ['''int r = (int) params >> 8 & 0xff;
int p = (int) params & 0xff;
byte[] derived1 = SCrypt.scrypt(passwd.getBytes("UTF-8"), salt, N, r, p, 32);
if (derived0.length != derived1.length) return false;
int result = 0;
for (int i = 0; i < derived0.length; i++) {
result |= derived0[i] ^ derived1[i];
}
return result == 0;
} catch (UnsupportedEncodingException e) {
throw new IllegalStateException("JVM doesn't support UTF-8?");
} catch (GeneralSecurityException e) {
throw new IllegalStateException("JVM doesn't support SHA1PRNG or HMAC_SHA256?");
}
}''',
'''
}
if (tempFrom > tempTo) {
return new RangeInfo(inclusive ? tempTo : tempTo + 1, tempFrom + 1, true);
}
return new RangeInfo(tempFrom, inclusive ? tempTo + 1 : tempTo, false);
}''']
# Get the embeddings
embeddings = model.encode(sentences)
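As a quick sanity check, you can compare the two snippet embeddings with cosine similarity. The lines below assume encode returns a 2-D tensor (or an array convertible to one) of shape (num_sentences, hidden_dim).
import torch.nn.functional as F
# Cosine similarity between the two snippet embeddings (assumes a 2-D tensor)
emb = torch.as_tensor(embeddings)
score = F.cosine_similarity(emb[0:1], emb[1:2])
print(score)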
Usage with Sentence Transformers
from sentence_transformers import SentenceTransformer
# Load the model
model = SentenceTransformer("codefuse-ai/C2LLM-7B", trust_remote_code=True)
# Prepare the data
sentences = ['''int r = (int) params >> 8 & 0xff;
int p = (int) params & 0xff;
byte[] derived1 = SCrypt.scrypt(passwd.getBytes("UTF-8"), salt, N, r, p, 32);
if (derived0.length != derived1.length) return false;
int result = 0;
for (int i = 0; i < derived0.length; i++) {
result |= derived0[i] ^ derived1[i];
}
return result == 0;
} catch (UnsupportedEncodingException e) {
throw new IllegalStateException("JVM doesn't support UTF-8?");
} catch (GeneralSecurityException e) {
throw new IllegalStateException("JVM doesn't support SHA1PRNG or HMAC_SHA256?");
}
}''',
'''
}
if (tempFrom > tempTo) {
return new RangeInfo(inclusive ? tempTo : tempTo + 1, tempFrom + 1, true);
}
return new RangeInfo(tempFrom, inclusive ? tempTo + 1 : tempTo, false);
}''']
# Get the embeddings
embeddings = model.encode(sentences)
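For a code-search flavour of the same API, you can embed a natural-language query and rank the snippets against it. model.similarity is the built-in similarity helper in sentence-transformers v3+; the query string here is only an illustrative example.
# Rank the code snippets against a natural-language query (illustrative)
query_embedding = model.encode(["compare two derived keys in constant time"])
scores = model.similarity(query_embedding, embeddings)  # cosine by default
print(scores)  # higher score = better match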
Evaluation with MTEB
import mteb
from mteb.cache import ResultCache
model_name = "codefuse-ai/C2LLM-7B"
# Load the model (falls back to SentenceTransformer(model_name) if the model
# has no dedicated MTEB implementation)
model = mteb.get_model(model_name)
# Select tasks
tasks = mteb.get_tasks(tasks=[
    "AppsRetrieval",
    "CodeSearchNetCCRetrieval",
    "CodeEditSearchRetrieval",
    "CodeSearchNetRetrieval",
    "CodeFeedbackMT",
    "CodeFeedbackST",
    "CodeTransOceanContest",
    "CodeTransOceanDL",
    "COIRCodeSearchNetRetrieval",
    "CosQA",
    "StackOverflowQA",
    "SyntheticText2SQL",
])
# Cache the result
cache = ResultCache("./c2llm_results")
# Evaluate
results = mteb.evaluate(model, tasks=tasks, cache=cache, encode_kwargs={"batch_size": 16})
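Once the run finishes you can inspect the per-task scores. The loop below assumes results is iterable over per-task result objects with a readable repr; check the MTEB docs for the exact return type of mteb.evaluate.
# Print one line per evaluated task (assumes an iterable of task results)
for task_result in results:
    print(task_result)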
Contact
Jin Qin ([email protected]), Zihan Liao ([email protected]), Ziyin Zhang ([email protected]), Hang Yu ([email protected]), Peng Di ([email protected])