## Dependencies

```bash
pip install torch torchvision
pip install transformers==4.46.0
```
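As an optional sanity check, you can confirm that the pinned `transformers` version (4.46.0, assumed above) is the one active in your environment before loading the model:

```python
import torch
import transformers

# Print the installed versions; transformers should report 4.46.0
print("torch:", torch.__version__)
print("transformers:", transformers.__version__)
```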
## Inference function

Below is the `inference_radvlm` function, which facilitates multi-turn interactions with the model. It handles both single-turn and multi-turn conversations, managing the chat history so that context is preserved across exchanges.
```python
import requests
from PIL import Image
from numpy import asarray
import torch
from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration
import re


def inference_radvlm(model, processor, image, prompt, chat_history=None, max_new_tokens=1500):
    """
    Generate a response using RadVLM in either single-turn or multi-turn mode.

    Args:
        model: The RadVLM model.
        processor: The processor for RadVLM (provides apply_chat_template and tokenization).
        image: A PIL Image or NumPy array representing the input image.
        prompt: The user prompt for this turn.
        chat_history: A list of (user_msg, assistant_msg) tuples representing the conversation so far.
            If None or empty, single-turn mode is used. Even in single-turn mode, this function
            returns chat_history so that you can continue in subsequent turns.
        max_new_tokens: The maximum number of new tokens to generate.

    Returns:
        response (str): The assistant's response for this turn.
        chat_history (list): The updated chat_history including this turn's (prompt, response).
    """
    # Initialize chat history if not provided
    if chat_history is None:
        chat_history = []

    # Build the chat history
    conversation = []
    for idx, (user_text, assistant_text) in enumerate(chat_history):
        if idx == 0:
            conversation.append({
                "role": "user",
                "content": [
                    {"type": "text", "text": user_text},
                    {"type": "image"},
                ],
            })
        else:
            conversation.append({
                "role": "user",
                "content": [
                    {"type": "text", "text": user_text},
                ],
            })
        conversation.append({
            "role": "assistant",
            "content": [
                {"type": "text", "text": assistant_text},
            ],
        })

    # Add the current user prompt
    if len(chat_history) == 0:
        # First turn includes the image
        conversation.append({
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image"},
            ],
        })
    else:
        # Subsequent turns without the image
        conversation.append({
            "role": "user",
            "content": [{"type": "text", "text": prompt}],
        })

    # Apply the chat template to create the full prompt
    full_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

    # Prepare model inputs
    inputs = processor(images=image, text=full_prompt, return_tensors="pt", padding=True).to(
        model.device, torch.float16
    )

    # Generate the response
    with torch.inference_mode():
        output = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)

    # Decode the output
    full_response = processor.decode(output[0], skip_special_tokens=True)
    response = re.split(r"(user|assistant)", full_response)[-1].strip()

    # Update chat history
    chat_history.append((prompt, response))

    return response, chat_history
```
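For reference, the returned `chat_history` is simply a list of `(user, assistant)` string pairs, as described in the docstring. After two exchanges it would look like the sketch below; the assistant strings are illustrative placeholders, not actual model output:

```python
# Illustrative structure only -- assistant replies are placeholders
chat_history = [
    ("What can you say about this X-ray?", "<assistant reply from turn 1>"),
    ("Is there something concerning in the lungs area?", "<assistant reply from turn 2>"),
]
```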
## Quick-Start: Multi-turn Demo

Below is a demonstration of how to use the `inference_radvlm` function in a multi-turn conversation.
```python
import torch
from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration
from PIL import Image
import requests
from io import BytesIO
import numpy as np

# Initialize the model and processor
model_id = "KrauthammerLab/RadVLM"
model = LlavaOnevisionForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
).to("cuda")  # Use 'cuda' if a GPU is available, else 'cpu'

processor = AutoProcessor.from_pretrained(model_id)

image_url = "https://prod-images-static.radiopaedia.org/images/29923576/fed73420497c8622734f21ce20fc91_gallery.jpeg"
image = Image.open(requests.get(image_url, stream=True).raw)

# Initialize chat history
chat_history = []

# First user prompt with image from URL
user_prompt_1 = "What can you say about this X-ray?"
response_1, chat_history = inference_radvlm(model, processor, image, user_prompt_1, chat_history)
print("RadVLM:", response_1)

# Second user prompt, continuing the conversation
user_prompt_2 = "Is there something concerning in the lungs area?"
response_2, chat_history = inference_radvlm(model, processor, image, user_prompt_2, chat_history)
print("RadVLM:", response_2)

# Third user prompt
user_prompt_3 = "What about the cardiac silhouette? Is it normal?"
response_3, chat_history = inference_radvlm(model, processor, image, user_prompt_3, chat_history)
print("RadVLM:", response_3)
```