prithivMLmods committed · verified
Commit cae2745 · Parent: 05f9a90

Update app.py

Files changed (1):
  1. app.py +13 -74
app.py CHANGED
@@ -29,8 +29,6 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
-# --- Model Loading ---
-
 # Load Qwen2.5-VL-7B-Instruct
 MODEL_ID_M = "Qwen/Qwen2.5-VL-7B-Instruct"
 processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
@@ -59,38 +57,14 @@ model_q = Qwen2_5_VLForConditionalGeneration.from_pretrained(
 ).to(device).eval()
 
 # Load prithivMLmods/DeepCaption-VLA-7B
-MODEL_ID_DC = "prithivMLmods/DeepCaption-VLA-7B"
-processor_dc = AutoProcessor.from_pretrained(MODEL_ID_DC, trust_remote_code=True)
-model_dc = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    MODEL_ID_DC,
+MODEL_ID_D = "prithivMLmods/DeepCaption-VLA-7B"
+processor_d = AutoProcessor.from_pretrained(MODEL_ID_D, trust_remote_code=True)
+model_d = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    MODEL_ID_D,
     trust_remote_code=True,
     torch_dtype=torch.float16
 ).to(device).eval()
 
-
-# --- System Prompt for DeepCaption-VLA-7B ---
-CAPTION_SYSTEM_PROMPT = """
-You are an AI assistant that rigorously follows this response protocol:
-
-1. For every input image, your primary task is to write a **precise caption**. The caption must capture the **essence of the image** in clear, concise, and contextually accurate language.
-
-2. Along with the caption, provide a structured set of **attributes** that describe the visual elements. Attributes should include details such as objects, people, actions, colors, environment, mood, and other notable characteristics.
-
-3. Always include a **class_name** field. This must represent the **core theme or main subject** of the image in a compact format.
-   - Use the syntax: `{class_name==write_the_core_theme}`
-   - Example: `{class_name==dog_playing}` or `{class_name==city_sunset}`
-
-4. Maintain the following strict format in your output:
-   - **Caption:** <one-sentence description>
-   - **Attributes:** <comma-separated list of visual attributes>
-   - **{class_name==core_theme}**
-
-5. Ensure captions are **precise, neutral, and descriptive**, avoiding unnecessary elaboration or subjective interpretation unless explicitly required.
-
-6. Do not reference the rules or instructions in the output. Only return the formatted caption, attributes, and class_name.
-""".strip()
-
-
 def downsample_video(video_path):
     """
     Downsamples the video to evenly spaced frames.
@@ -100,7 +74,6 @@ def downsample_video(video_path):
     total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
     fps = vidcap.get(cv2.CAP_PROP_FPS)
     frames = []
-    # Use a denser sampling for better video understanding
     frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
     for i in frame_indices:
         vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
@@ -124,9 +97,6 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     Generates responses using the selected model for image input.
     Yields raw text and Markdown-formatted text.
     """
-    processor = None
-    model = None
-
     if model_name == "Qwen2.5-VL-7B-Instruct":
         processor = processor_m
         model = model_m
@@ -137,10 +107,8 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         processor = processor_q
         model = model_q
     elif model_name == "DeepCaption-VLA-7B":
-        processor = processor_dc
-        model = model_dc
-        # Prepend system prompt for this model
-        text = f"{CAPTION_SYSTEM_PROMPT}\n\n{text}"
+        processor = processor_d
+        model = model_d
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
@@ -165,21 +133,10 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         truncation=False,
         max_length=MAX_INPUT_TOKEN_LENGTH
     ).to(device)
-
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-    generation_kwargs = {
-        **inputs,
-        "streamer": streamer,
-        "max_new_tokens": max_new_tokens,
-        "temperature": temperature,
-        "top_p": top_p,
-        "top_k": top_k,
-        "repetition_penalty": repetition_penalty,
-        "do_sample": True,
-    }
+    generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
-
     buffer = ""
     for new_text in streamer:
         buffer += new_text
@@ -197,9 +154,6 @@ def generate_video(model_name: str, text: str, video_path: str,
     Generates responses using the selected model for video input.
     Yields raw text and Markdown-formatted text.
     """
-    processor = None
-    model = None
-
     if model_name == "Qwen2.5-VL-7B-Instruct":
         processor = processor_m
         model = model_m
@@ -210,10 +164,8 @@ def generate_video(model_name: str, text: str, video_path: str,
         processor = processor_q
         model = model_q
     elif model_name == "DeepCaption-VLA-7B":
-        processor = processor_dc
-        model = model_dc
-        # Prepend system prompt for this model
-        text = f"{CAPTION_SYSTEM_PROMPT}\n\n{text}"
+        processor = processor_d
+        model = model_d
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
@@ -223,19 +175,14 @@ def generate_video(model_name: str, text: str, video_path: str,
         return
 
     frames = downsample_video(video_path)
-    # Create the message structure with a system prompt and user query
     messages = [
         {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
         {"role": "user", "content": [{"type": "text", "text": text}]}
     ]
-
-    # Add each frame to the user content
     for frame in frames:
         image, timestamp = frame
-        messages[1]["content"].append({"type": "text", "text": f"Frame at {timestamp}s:"})
+        messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
         messages[1]["content"].append({"type": "image", "image": image})
-
-    # Prepare inputs for the model
     inputs = processor.apply_chat_template(
         messages,
         tokenize=True,
@@ -245,7 +192,6 @@ def generate_video(model_name: str, text: str, video_path: str,
         truncation=False,
         max_length=MAX_INPUT_TOKEN_LENGTH
     ).to(device)
-
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {
         **inputs,
@@ -259,14 +205,12 @@ def generate_video(model_name: str, text: str, video_path: str,
     }
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
-
     buffer = ""
     for new_text in streamer:
         buffer += new_text
         time.sleep(0.01)
         yield buffer, buffer
 
-
 # Define examples for image and video inference
 image_examples = [
     ["Provide a detailed caption for the image..", "images/A.jpg"],
@@ -333,12 +277,7 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
                 markdown_output = gr.Markdown()
 
             model_choice = gr.Radio(
-                choices=[
-                    "Qwen2.5-VL-7B-Instruct",
-                    "Qwen2.5-VL-3B-Instruct",
-                    "Qwen2.5-VL-7B-Abliterated-Caption-it",
-                    "DeepCaption-VLA-7B"
-                ],
+                choices=["Qwen2.5-VL-7B-Instruct", "Qwen2.5-VL-3B-Instruct", "Qwen2.5-VL-7B-Abliterated-Caption-it", "DeepCaption-VLA-7B"],
                 label="Select Model",
                 value="Qwen2.5-VL-7B-Instruct"
            )
@@ -346,7 +285,7 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
     gr.Markdown("> [Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct): The Qwen2.5-VL-7B-Instruct model is a multimodal AI model developed by Alibaba Cloud that excels at understanding both text and images. It's a Vision-Language Model (VLM) designed to handle various visual understanding tasks, including image understanding, video analysis, and even multilingual support.")
     gr.Markdown("> [Qwen2.5-VL-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct): Qwen2.5-VL-3B-Instruct is an instruction-tuned vision-language model from Alibaba Cloud, built upon the Qwen2-VL series. It excels at understanding and generating text related to both visual and textual inputs, making it capable of tasks like image captioning, visual question answering, and object localization. The model also supports long video understanding and structured data extraction")
     gr.Markdown("> [Qwen2.5-VL-7B-Abliterated-Caption-it](https://huggingface.co/prithivMLmods/Qwen2.5-VL-7B-Abliterated-Caption-it): Qwen2.5-VL-7B-Abliterated-Caption-it is a fine-tuned version of Qwen2.5-VL-7B-Instruct, optimized for Abliterated Captioning / Uncensored Captioning. This model excels at generating detailed, context-rich, and high-fidelity captions across diverse image categories and variational aspect ratios, offering robust visual understanding without filtering or censorship.")
-    gr.Markdown("> [prithivMLmods/DeepCaption-VLA-7B](https://huggingface.co/prithivMLmods/DeepCaption-VLA-7B): DeepCaption-VLA-7B is a fine-tuned model based on Qwen2.5-VL, designed for generating precise, structured captions and attributes for images. It follows a strict protocol to provide a main caption, a list of visual attributes, and a core class name, making it ideal for detailed and organized visual analysis.")
+    gr.Markdown("> [DeepCaption-VLA-7B](https://huggingface.co/prithivMLmods/DeepCaption-VLA-7B): This is a fine-tuned version of Qwen2.5-VL-7B-Instruct, specialized for Image Captioning and Vision Language Attribution. [1] It is designed to generate precise, highly descriptive captions, focusing on visual properties and object attributes across a wide variety of images. [1]")
     gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")
 
     image_submit.click(
@@ -361,4 +300,4 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
     )
 
 if __name__ == "__main__":
-    demo.queue(max_size=50).launch(share=True, show_error=True)
+    demo.queue(max_size=50).launch(share=True, mcp_server=True, ssr_mode=False, show_error=True)
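
For reference, here is a minimal sketch (illustrative, not part of this commit) of how the renamed DeepCaption-VLA-7B loader and chat-template call from app.py can be exercised on a single image outside the Gradio UI. The image path, prompt text, and max_new_tokens value are assumptions; the transformers calls mirror the ones already used in the diff above.

import torch
from PIL import Image
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Same loading pattern as app.py, using the renamed MODEL_ID_D / processor_d / model_d.
MODEL_ID_D = "prithivMLmods/DeepCaption-VLA-7B"
processor_d = AutoProcessor.from_pretrained(MODEL_ID_D, trust_remote_code=True)
model_d = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID_D,
    trust_remote_code=True,
    torch_dtype=torch.float16,
).to(device).eval()

# "images/A.jpg" mirrors the example asset referenced in the Space; any RGB image works.
image = Image.open("images/A.jpg").convert("RGB")
messages = [{
    "role": "user",
    "content": [
        {"type": "image", "image": image},
        {"type": "text", "text": "Provide a detailed caption for the image."},
    ],
}]

# apply_chat_template with tokenize=True / return_dict=True processes the embedded
# PIL image along with the text (same call pattern as generate_image in app.py).
inputs = processor_d.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(device)

with torch.inference_mode():
    output_ids = model_d.generate(**inputs, max_new_tokens=256)

# Drop the prompt tokens so only the newly generated caption is decoded.
caption = processor_d.batch_decode(
    output_ids[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True
)[0]
print(caption)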
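
The streaming path in generate_image() follows the standard TextIteratorStreamer pattern from transformers: generation runs in a background thread while the streamer yields decoded text incrementally. A compact sketch of that pattern, reusing the processor_d, model_d, and inputs names from the sketch above (again illustrative, not taken from the commit):

from threading import Thread
from transformers import TextIteratorStreamer

# The processor's decode() delegates to its tokenizer, so it can be passed directly,
# as app.py does.
streamer = TextIteratorStreamer(processor_d, skip_prompt=True, skip_special_tokens=True)
generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": 256}

thread = Thread(target=model_d.generate, kwargs=generation_kwargs)
thread.start()

buffer = ""
for new_text in streamer:  # chunks arrive while generate() is still running
    buffer += new_text
    print(new_text, end="", flush=True)
thread.join()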