NicoZenith committed on
Commit 90d1f85 · 1 Parent(s): 468a30d

Add initial model files

README.md DELETED
@@ -1,151 +0,0 @@
- ---
- license: apache-2.0
- ---
-
- ## Inference function
-
- Below is the `inference_radvlm` function, which handles both single-turn and multi-turn conversations with the model, managing the chat history to maintain context across exchanges.
-
- ```
- import requests
- from PIL import Image
- from numpy import asarray
- import torch
- from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration
- import re
-
- def inference_radvlm(model, processor, image, prompt, chat_history=None, max_new_tokens=1500):
-     """
-     Generate a response using RadVLM in either single-turn or multi-turn mode.
-
-     Args:
-         model: The RadVLM model.
-         processor: The processor for RadVLM (provides apply_chat_template and tokenization).
-         image: A PIL Image or NumPy array representing the input image.
-         prompt: The user prompt for this turn.
-         chat_history: A list of (user_msg, assistant_msg) tuples representing the conversation so far.
-             If None or empty, single-turn mode is used. Even in single-turn mode,
-             this function returns chat_history so that you can continue in subsequent turns.
-         max_new_tokens: The maximum number of new tokens to generate.
-
-     Returns:
-         response (str): The assistant's response for this turn.
-         chat_history (list): The updated chat_history including this turn's (prompt, response).
-     """
-
-     # Initialize chat history if not provided
-     if chat_history is None:
-         chat_history = []
-
-     # Build the chat history
-     conversation = []
-     for idx, (user_text, assistant_text) in enumerate(chat_history):
-         if idx == 0:
-             conversation.append({
-                 "role": "user",
-                 "content": [
-                     {"type": "text", "text": user_text},
-                     {"type": "image"},
-                 ],
-             })
-         else:
-             conversation.append({
-                 "role": "user",
-                 "content": [
-                     {"type": "text", "text": user_text},
-                 ],
-             })
-         conversation.append({
-             "role": "assistant",
-             "content": [
-                 {"type": "text", "text": assistant_text},
-             ],
-         })
-
-     # Add the current user prompt
-     if len(chat_history) == 0:
-         # First turn includes the image
-         conversation.append({
-             "role": "user",
-             "content": [
-                 {"type": "text", "text": prompt},
-                 {"type": "image"},
-             ],
-         })
-     else:
-         # Subsequent turns without the image
-         conversation.append({
-             "role": "user",
-             "content": [{"type": "text", "text": prompt}],
-         })
-
-     # Apply the chat template to create the full prompt
-     full_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
-
-     # Prepare model inputs
-     inputs = processor(images=image, text=full_prompt, return_tensors="pt", padding=True).to(
-         model.device, torch.float16
-     )
-
-     # Generate the response
-     with torch.inference_mode():
-         output = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)
-
-     # Decode the output
-     full_response = processor.decode(output[0], skip_special_tokens=True)
-     response = re.split(r"(user|assistant)", full_response)[-1].strip()
-
-     # Update chat history
-     chat_history.append((prompt, response))
-
-     return response, chat_history
-
- ```
-
- ## Quick-Start: Multi-turn Demo
- Below is a demonstration of how to use the `inference_radvlm` function in a multi-turn conversation.
-
- ```
- import torch
- from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration
- from PIL import Image
- import requests
- from io import BytesIO
- import numpy as np
-
- # Initialize the model and processor
- model_id = "KrauthammerLab/RadVLM"
- model = LlavaOnevisionForConditionalGeneration.from_pretrained(
-     model_id,
-     torch_dtype=torch.float16,
-     low_cpu_mem_usage=True,
- ).to('cuda')  # Use 'cuda' if a GPU is available, else 'cpu'
-
- processor = AutoProcessor.from_pretrained(model_id)
-
- # Load a chest X-ray image from a URL
- image_url = "https://prod-images-static.radiopaedia.org/images/29923576/fed73420497c8622734f21ce20fc91_gallery.jpeg"
- image = Image.open(requests.get(image_url, stream=True).raw)
-
- # Initialize chat history
- chat_history = []
-
- # First user prompt with image from URL
- user_prompt_1 = "What can you say about this X-ray?"
- response_1, chat_history = inference_radvlm(model, processor, image, user_prompt_1, chat_history)
-
- print("RadVLM:", response_1)
-
- # Second user prompt, continuing the conversation
- user_prompt_2 = "Is there something concerning in the lungs area?"
- response_2, chat_history = inference_radvlm(model, processor, image, user_prompt_2, chat_history)
-
- print("RadVLM:", response_2)
-
- # Third user prompt
- user_prompt_3 = "What about the cardiac silhouette? Is it normal?"
- response_3, chat_history = inference_radvlm(model, processor, image, user_prompt_3, chat_history)
-
- print("RadVLM:", response_3)
- ```
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:27d2827804c4a9e3a7681bdd332056dc03d57860b6111eecc19f3ff8cc7eda58
+ oid sha256:7a6a3ea28103da54b9d4473f6a9370cda7e29ffe0fd6e6ed02f90df3ead00dd5
  size 4911200360
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:400798c136e00856c8662d4815a9651e75166cbcc60e58f7d7a2bae55b0da197
+ oid sha256:fe9a1367724aa73e914b18dbf46e3871dcaee1c696d65fc461b409f75c1bc7f7
  size 4991497664
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:aa3a22979f60f0340d1f98385e735745889374f6a5de9df82cce03d08bb2389b
+ oid sha256:92593fc2123c5ec87b99e4c39de84bf9d89fc9a0e8a9f6fe9af0116e01422b90
  size 4932752752
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:b1b6599630b7e4ad66281eac22c6a1c43f391e40140c7886a2ef3ad9d7dfa830
+ oid sha256:f36298a9f93ea35ffe2f5ccee964564ea3d0e2914088a4025065abe3e43d3f7b
  size 1226266240
preprocessor_config.json CHANGED
@@ -1,5 +1,5 @@
  {
- "do_convert_rgb": true,
+ "do_convert_rgb": null,
  "do_normalize": true,
  "do_pad": true,
  "do_rescale": true,
video_processor/preprocessor_config.json CHANGED
@@ -1,5 +1,5 @@
  {
- "do_convert_rgb": true,
+ "do_convert_rgb": null,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
@@ -14,7 +14,7 @@
  0.5,
  0.5
  ],
- "processor_class": "SiglipProcessor",
+ "processor_class": "LlavaProcessor",
  "resample": 3,
  "rescale_factor": 0.00392156862745098,
  "size": {