---
license: apache-2.0
base_model:
- Qwen/Qwen2.5-VL-3B-Instruct
---

# Tevatron usage

For training and encoding examples with Tevatron, see: https://github.com/texttron/tevatron/tree/main/examples/multimodal

# Load the model

```python
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
from peft import PeftModel, PeftConfig


def get_model(peft_model_name):
    # Load the base model, apply the PEFT adapter, and merge it into the
    # base weights for inference.
    config = PeftConfig.from_pretrained(peft_model_name)
    base_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(config.base_model_name_or_path)
    model = PeftModel.from_pretrained(base_model, peft_model_name)
    model = model.merge_and_unload()
    model.eval()
    return model


model = get_model('Tevatron/unified-retriever-v0.1').to('cuda:0')
processor = AutoProcessor.from_pretrained('Tevatron/unified-retriever-v0.1')
```

# Encode text query

```python
import torch
from qwen_vl_utils import process_vision_info


def get_embedding(last_hidden_state: torch.Tensor) -> torch.Tensor:
    # Pool the last token's hidden state and L2-normalize it.
    reps = last_hidden_state[:, -1]
    reps = torch.nn.functional.normalize(reps, p=2, dim=-1)
    return reps


queries = ["Where can we see Llama?", "What is the LLaMA AI model?"]
query_messages = []
for query in queries:
    message = [
        {
            'role': 'user',
            'content': [
                {'type': 'text', 'text': f'Query: {query}'},
            ]
        }
    ]
    query_messages.append(message)

query_texts = [
    processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=False) + "<|endoftext|>"
    for msg in query_messages
]
query_image_inputs, query_video_inputs = process_vision_info(query_messages)
query_inputs = processor(text=query_texts, images=query_image_inputs, videos=query_video_inputs,
                         padding='longest', return_tensors='pt').to('cuda:0')

with torch.no_grad():
    output = model(**query_inputs, return_dict=True, output_hidden_states=True)
query_embeddings = get_embedding(output.hidden_states[-1])
```

> [!NOTE]
> To encode textual documents, use the same code as the query encoding above, but without the `'Query: '` prefix.
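As a concrete sketch of the note above, the snippet below reuses the query pipeline to encode plain-text documents, dropping the `'Query: '` prefix. The `passages` list and the `passage_*` variable names are illustrative, not part of the original example.

```python
# Sketch: encode plain-text documents with the same pipeline as queries,
# but without the 'Query: ' prefix. The sample passages are illustrative.
passages = [
    "Llamas are domesticated South American camelids.",
    "LLaMA is a family of large language models released by Meta AI.",
]

passage_messages = []
for passage in passages:
    message = [
        {
            'role': 'user',
            'content': [
                {'type': 'text', 'text': passage},  # no 'Query: ' prefix for documents
            ]
        }
    ]
    passage_messages.append(message)

passage_texts = [
    processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=False) + "<|endoftext|>"
    for msg in passage_messages
]
# process_vision_info returns (None, None) for text-only messages.
passage_image_inputs, passage_video_inputs = process_vision_info(passage_messages)
passage_inputs = processor(text=passage_texts, images=passage_image_inputs, videos=passage_video_inputs,
                           padding='longest', return_tensors='pt').to('cuda:0')

with torch.no_grad():
    output = model(**passage_inputs, return_dict=True, output_hidden_states=True)
passage_embeddings = get_embedding(output.hidden_states[-1])
```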
# Encode Document Screenshot

```python
import requests
from io import BytesIO
from PIL import Image

# URLs of the example document screenshots
url1 = "https://huggingface.co/Tevatron/dse-phi3-docmatix-v2/resolve/main/animal-llama.png"
url2 = "https://huggingface.co/Tevatron/dse-phi3-docmatix-v2/resolve/main/meta-llama.png"

response1 = requests.get(url1)
response2 = requests.get(url2)

doc_image1 = Image.open(BytesIO(response1.content))
doc_image2 = Image.open(BytesIO(response2.content))

doc_images = [doc_image1, doc_image2]

doc_messages = []
for doc in doc_images:
    message = [
        {
            'role': 'user',
            'content': [
                {'type': 'text', 'text': ''},
                {'type': 'image', 'image': doc, 'resized_height': 784, 'resized_width': 784}
            ]
        }
    ]
    doc_messages.append(message)

doc_texts = [
    processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=False) + "<|endoftext|>"
    for msg in doc_messages
]
doc_image_inputs, doc_video_inputs = process_vision_info(doc_messages)
doc_inputs = processor(text=doc_texts, images=doc_image_inputs, videos=doc_video_inputs,
                       padding='longest', return_tensors='pt').to('cuda:0')

with torch.no_grad():
    output = model(**doc_inputs, return_dict=True, output_hidden_states=True)
doc_embeddings = get_embedding(output.hidden_states[-1])
```

# Compute Similarity

```python
from torch.nn.functional import cosine_similarity

num_queries = query_embeddings.size(0)
num_passages = doc_embeddings.size(0)

for i in range(num_queries):
    query_embedding = query_embeddings[i].unsqueeze(0)
    similarities = cosine_similarity(query_embedding, doc_embeddings)
    print(f"Similarities for Query {i+1}: {similarities.cpu().float().numpy()}")

# Similarities for Query 1: [0.3282001 0.17449486]
# Similarities for Query 2: [0.08133292 0.30867738]
```
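Since `get_embedding` L2-normalizes each vector, the dot product of two embeddings equals their cosine similarity, so all query-document scores can also be computed in a single matrix multiplication and ranked with `torch.topk`. A minimal ranking sketch under that assumption (the `scores`, `k`, and loop variable names are illustrative):

```python
import torch

# Embeddings are already L2-normalized, so a dot product is the cosine
# similarity; score every query against every document in one matmul.
scores = query_embeddings @ doc_embeddings.T  # shape: (num_queries, num_docs)

# Rank documents per query, capping k at the number of documents.
k = min(2, doc_embeddings.size(0))
top_scores, top_indices = torch.topk(scores, k=k, dim=1)

for i, (idx_row, score_row) in enumerate(zip(top_indices, top_scores)):
    ranked = ", ".join(f"doc {j} ({s:.4f})" for j, s in zip(idx_row.tolist(), score_row.tolist()))
    print(f"Query {i + 1}: {ranked}")
```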