Add initial model files

Browse files

Files changed (7) hide show

README.md +0 -151
model-00001-of-00004.safetensors +1 -1
model-00002-of-00004.safetensors +1 -1
model-00003-of-00004.safetensors +1 -1
model-00004-of-00004.safetensors +1 -1
preprocessor_config.json +1 -1
video_processor/preprocessor_config.json +2 -2

README.md DELETED Viewed

@@ -1,151 +0,0 @@
----
-license: apache-2.0
----
-## Inference function
-The `inference_radvlm` function below handles both single-turn and multi-turn conversations with the model, managing the chat history so that context is maintained across exchanges.
-```python
-import requests
-from PIL import Image
-from numpy import asarray
-import torch
-from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration
-import re
-def inference_radvlm(model, processor, image, prompt, chat_history=None, max_new_tokens=1500):
-    """
-    Generate a response using RadVLM in either single-turn or multi-turn mode.
-    Args:
-        model: The RadVLM model.
-        processor: The processor for RadVLM (provides apply_chat_template and tokenization).
-        image: A PIL Image or NumPy array representing the input image.
-        prompt: The user prompt for this turn.
-        chat_history: A list of (user_msg, assistant_msg) tuples representing the conversation so far.
-                      If None or empty, single-turn mode is used. Even in single-turn mode,
-                      this function returns chat_history so that you can continue in subsequent turns.
-        max_new_tokens: The maximum number of new tokens to generate.
-    Returns:
-        response (str): The assistant's response for this turn.
-        chat_history (list): The updated chat_history including this turn's (prompt, response).
-    """
-    # Initialize chat history if not provided
-    if chat_history is None:
-        chat_history = []
-    # Build the chat history
-    conversation = []
-    for idx, (user_text, assistant_text) in enumerate(chat_history):
-        if idx == 0:
-            conversation.append({
-                "role": "user",
-                "content": [
-                    {"type": "text", "text": user_text},
-                    {"type": "image"},
-                ],
-            })
-        else:
-            conversation.append({
-                "role": "user",
-                "content": [
-                    {"type": "text", "text": user_text},
-                ],
-            })
-        conversation.append({
-            "role": "assistant",
-            "content": [
-                {"type": "text", "text": assistant_text},
-            ],
-        })
-    # Add the current user prompt
-    if len(chat_history) == 0:
-        # First turn includes the image
-        conversation.append({
-            "role": "user",
-            "content": [
-                {"type": "text", "text": prompt},
-                {"type": "image"},
-            ],
-        })
-    else:
-        # Subsequent turns without the image
-        conversation.append({
-            "role": "user",
-            "content": [{"type": "text", "text": prompt}],
-        })
-    # Apply the chat template to create the full prompt
-    full_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
-    # Prepare model inputs
-    inputs = processor(images=image, text=full_prompt, return_tensors="pt", padding=True).to(
-        model.device, torch.float16
-    )
-    # Generate the response
-    with torch.inference_mode():
-        output = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)
-    # Decode the output
-    full_response = processor.decode(output[0], skip_special_tokens=True)
-    response = re.split(r"(user|assistant)", full_response)[-1].strip()
-    # Update chat history
-    chat_history.append((prompt, response))
-    return response, chat_history
-```
-## Quick-Start: Multi-turn Demo
-The example below shows how to use the `inference_radvlm` function in a multi-turn conversation.
-```python
-import torch
-from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration
-from PIL import Image
-import requests
-from io import BytesIO
-import numpy as np
-# Initialize the model and processor
-model_id = "KrauthammerLab/RadVLM"
-model = LlavaOnevisionForConditionalGeneration.from_pretrained(
-    model_id,
-    torch_dtype=torch.float16,
-    low_cpu_mem_usage=True,
-).to('cuda')  # Use 'cuda' if GPU is available, else 'cpu'
-processor = AutoProcessor.from_pretrained(model_id)
-image_url = "https://prod-images-static.radiopaedia.org/images/29923576/fed73420497c8622734f21ce20fc91_gallery.jpeg"
-image = Image.open(requests.get(image_url, stream=True).raw)
-# Initialize chat history
-chat_history = []
-# First user prompt with image from URL
-user_prompt_1 = "What can you say about this X-ray?"
-response_1, chat_history = inference_radvlm(model, processor, image, user_prompt_1, chat_history)
-print("RadVLM:", response_1)
-# Second user prompt, continuing the conversation
-user_prompt_2 = "Is there something concerning in the lungs area?"
-response_2, chat_history = inference_radvlm(model, processor, image, user_prompt_2, chat_history)
-print("RadVLM:", response_2)
-# Third user prompt
-user_prompt_3 = "What about the cardiac silhouette? Is it normal?"
-response_3, chat_history = inference_radvlm(model, processor, image, user_prompt_3, chat_history)
-print("Assistant:", response_3)
-```

model-00001-of-00004.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:27d2827804c4a9e3a7681bdd332056dc03d57860b6111eecc19f3ff8cc7eda58
 size 4911200360

 version https://git-lfs.github.com/spec/v1
+oid sha256:7a6a3ea28103da54b9d4473f6a9370cda7e29ffe0fd6e6ed02f90df3ead00dd5
 size 4911200360

model-00002-of-00004.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:400798c136e00856c8662d4815a9651e75166cbcc60e58f7d7a2bae55b0da197
 size 4991497664

 version https://git-lfs.github.com/spec/v1
+oid sha256:fe9a1367724aa73e914b18dbf46e3871dcaee1c696d65fc461b409f75c1bc7f7
 size 4991497664

model-00003-of-00004.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:aa3a22979f60f0340d1f98385e735745889374f6a5de9df82cce03d08bb2389b
 size 4932752752

 version https://git-lfs.github.com/spec/v1
+oid sha256:92593fc2123c5ec87b99e4c39de84bf9d89fc9a0e8a9f6fe9af0116e01422b90
 size 4932752752

model-00004-of-00004.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b1b6599630b7e4ad66281eac22c6a1c43f391e40140c7886a2ef3ad9d7dfa830
 size 1226266240

 version https://git-lfs.github.com/spec/v1
+oid sha256:f36298a9f93ea35ffe2f5ccee964564ea3d0e2914088a4025065abe3e43d3f7b
 size 1226266240

preprocessor_config.json CHANGED Viewed

@@ -1,5 +1,5 @@
 {
-  "do_convert_rgb": true,
   "do_normalize": true,
   "do_pad": true,
   "do_rescale": true,

 {
+  "do_convert_rgb": null,
   "do_normalize": true,
   "do_pad": true,
   "do_rescale": true,

video_processor/preprocessor_config.json CHANGED Viewed

@@ -1,5 +1,5 @@
 {
-  "do_convert_rgb": true,
   "do_normalize": true,
   "do_rescale": true,
   "do_resize": true,
@@ -14,7 +14,7 @@
     0.5,
     0.5
   ],
-  "processor_class": "SiglipProcessor",
   "resample": 3,
   "rescale_factor": 0.00392156862745098,
   "size": {

 {
+  "do_convert_rgb": null,
   "do_normalize": true,
   "do_rescale": true,
   "do_resize": true,
     0.5,
     0.5
   ],
+  "processor_class": "LlavaProcessor",
   "resample": 3,
   "rescale_factor": 0.00392156862745098,
   "size": {