UniME-V2
Collection
A collection of UniME-V2's data and model weights
•
6 items
•
Updated
•
1
Tiancheng Gu*,
Kaicheng Yang*,
Kaichen Zhang,
Xiang An,
Ziyong Feng,
Yueyi Zhang,
Weidong Cai,
Jiankang Deng,
Lidong Bing
# Fetch the UniME-v2 sources and work from the repository root.
git clone https://github.com/deepglint/UniME-v2.git
cd UniME-v2
# Create and activate a dedicated conda environment (Python 3.10) named uniMEv2.
conda create -n uniMEv2 python=3.10 -y
conda activate uniMEv2
# Install the Python dependencies listed in the repository's requirements.txt.
pip install -r requirements.txt
# Optional: Install Flash Attention for acceleration
# NOTE: per its filename, the wheel below targets CUDA 12, torch 2.4, CPython 3.10
# and Linux x86_64; choose a different release asset if your setup differs.
# wget https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
# pip install flash_attn-2.7.4.post1+cu12torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
import torch
from torch.nn import functional as F
from utils.utils import init_model_and_processor, prepare_stage_data, parse_answer_index
# Quick-start demo for UniME-V2.
#
# With ``embedding = True`` the script embeds one image and one sentence and
# prints their similarity score. With ``embedding = False`` it asks the rerank
# model to choose, among several candidate sentences, the one that matches the
# image, and prints the parsed answer index.
device = "cuda"
embedding = False  # True: run the embedding model; False: run the rerank model


def _to_device(batch):
    """Return a copy of *batch* with every tensor value moved to ``device``."""
    return {
        key: value.to(device) if isinstance(value, torch.Tensor) else value
        for key, value in batch.items()
    }


if embedding:
    model_name = "models/UniME-V2_qwen2VL_2B"
    # Alternative embedding checkpoints:
    # model_name="models/UniME-V2_qwen2VL_7B"
    # model_name="models/UniME-V2_LLaVA_onevision_8B"
    text = "A man is crossing the street with a red car parked nearby."
    image_path = "Figures/demo.png"
else:
    model_name = "models/UniME-v2-rerank_qwen25VL_7B"
    # Candidate captions for the rerank model; index 0 is the target text.
    text = [
        "A man is crossing the street with a red car parked nearby.",
        "A woman is walking her dog with a blue bicycle leaning nearby.",
        "A child is riding a scooter past a green truck stopped nearby.",
        "A couple is waiting for the bus beside a yellow taxi parked nearby.",
        "A jogger is running along the path with a black motorcycle parked nearby.",
    ]
    image_path = "Figures/demo.png"

model, processor = init_model_and_processor(model_name, device, embedding=embedding)

if embedding:
    inputs_image, inputs_txt = prepare_stage_data(
        model_name, processor, text, image_path, embedding=embedding
    )
    inputs_image = _to_device(inputs_image)
    inputs_txt = _to_device(inputs_txt)

    with torch.no_grad():
        # Use the last entry of ``hidden_states`` at the final sequence
        # position as the embedding of each input.
        emb_text = model(
            **inputs_txt, output_hidden_states=True, return_dict=True
        ).hidden_states[-1][:, -1, :]
        emb_image = model(
            **inputs_image, output_hidden_states=True, return_dict=True
        ).hidden_states[-1][:, -1, :]

        # After L2 normalisation the matrix product is the cosine similarity.
        emb_text = F.normalize(emb_text, dim=-1)
        emb_image = F.normalize(emb_image, dim=-1)
        Score = emb_image @ emb_text.T

    # ``.item()`` requires a single-element tensor (one image vs. one text).
    print("Score: ", Score.item())  # qwen2VL 2B : Score: 0.62109375
else:
    inputs = prepare_stage_data(
        model_name, processor, text, image_path, embedding=embedding
    )
    inputs = _to_device(inputs)

    with torch.no_grad():
        # Greedy decoding (do_sample=False); only the generated token ids
        # (``.sequences``) of the returned object are used below.
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=128,
            output_scores=True,
            return_dict_in_generate=True,
            do_sample=False,
        ).sequences

    # Strip the prompt tokens so that only newly generated tokens are decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs['input_ids'], generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    # parse_answer_index presumably extracts the chosen candidate index from
    # the generated answer text -- confirm against utils.utils.
    print("Rerank Answer: ", parse_answer_index(output_text[0]))  # qwen25VL 7B: Rerank Answer: 0
If you find this repository useful, please use the following BibTeX entry for citation.
@misc{gu2025unimev2mllmasajudgeuniversalmultimodal,
title={UniME-V2: MLLM-as-a-Judge for Universal Multimodal Embedding Learning},
author={Tiancheng Gu and Kaicheng Yang and Kaichen Zhang and Xiang An and Ziyong Feng and Yueyi Zhang and Weidong Cai and Jiankang Deng and Lidong Bing},
year={2025},
eprint={2510.13515},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2510.13515},
}
@inproceedings{unime,
title={Breaking the Modality Barrier: Universal Embedding Learning with Multimodal LLMs},
author={Gu, Tiancheng and Yang, Kaicheng and Feng, Ziyong and Wang, Xingjun and Zhang, Yanzhao and Long, Dingkun and Chen, Yingda and Cai, Weidong and Deng, Jiankang},
booktitle={ACM MM},
year={2025}
}