prithivMLmods committed · verified
Commit cae2745 · Parent: 05f9a90

Update app.py

Files changed (1):
  1. app.py +13 -74
app.py CHANGED
@@ -29,8 +29,6 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
-# --- Model Loading ---
-
 # Load Qwen2.5-VL-7B-Instruct
 MODEL_ID_M = "Qwen/Qwen2.5-VL-7B-Instruct"
 processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
@@ -59,38 +57,14 @@ model_q = Qwen2_5_VLForConditionalGeneration.from_pretrained(
 ).to(device).eval()
 
 # Load prithivMLmods/DeepCaption-VLA-7B
-MODEL_ID_DC = "prithivMLmods/DeepCaption-VLA-7B"
-processor_dc = AutoProcessor.from_pretrained(MODEL_ID_DC, trust_remote_code=True)
-model_dc = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    MODEL_ID_DC,
+MODEL_ID_D = "prithivMLmods/DeepCaption-VLA-7B"
+processor_d = AutoProcessor.from_pretrained(MODEL_ID_D, trust_remote_code=True)
+model_d = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    MODEL_ID_D,
     trust_remote_code=True,
     torch_dtype=torch.float16
 ).to(device).eval()
 
-
-# --- System Prompt for DeepCaption-VLA-7B ---
-CAPTION_SYSTEM_PROMPT = """
-You are an AI assistant that rigorously follows this response protocol:
-
-1. For every input image, your primary task is to write a **precise caption**. The caption must capture the **essence of the image** in clear, concise, and contextually accurate language.
-
-2. Along with the caption, provide a structured set of **attributes** that describe the visual elements. Attributes should include details such as objects, people, actions, colors, environment, mood, and other notable characteristics.
-
-3. Always include a **class_name** field. This must represent the **core theme or main subject** of the image in a compact format.
-   - Use the syntax: `{class_name==write_the_core_theme}`
-   - Example: `{class_name==dog_playing}` or `{class_name==city_sunset}`
-
-4. Maintain the following strict format in your output:
-   - **Caption:** <one-sentence description>
-   - **Attributes:** <comma-separated list of visual attributes>
-   - **{class_name==core_theme}**
-
-5. Ensure captions are **precise, neutral, and descriptive**, avoiding unnecessary elaboration or subjective interpretation unless explicitly required.
-
-6. Do not reference the rules or instructions in the output. Only return the formatted caption, attributes, and class_name.
-""".strip()
-
-
 def downsample_video(video_path):
     """
     Downsamples the video to evenly spaced frames.
@@ -100,7 +74,6 @@ def downsample_video(video_path):
     total_frames = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
     fps = vidcap.get(cv2.CAP_PROP_FPS)
     frames = []
-    # Use a denser sampling for better video understanding
     frame_indices = np.linspace(0, total_frames - 1, 10, dtype=int)
     for i in frame_indices:
         vidcap.set(cv2.CAP_PROP_POS_FRAMES, i)
@@ -124,9 +97,6 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     Generates responses using the selected model for image input.
     Yields raw text and Markdown-formatted text.
     """
-    processor = None
-    model = None
-
     if model_name == "Qwen2.5-VL-7B-Instruct":
         processor = processor_m
         model = model_m
@@ -137,10 +107,8 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         processor = processor_q
         model = model_q
     elif model_name == "DeepCaption-VLA-7B":
-        processor = processor_dc
-        model = model_dc
-        # Prepend system prompt for this model
-        text = f"{CAPTION_SYSTEM_PROMPT}\n\n{text}"
+        processor = processor_d
+        model = model_d
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
@@ -165,21 +133,10 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         truncation=False,
         max_length=MAX_INPUT_TOKEN_LENGTH
     ).to(device)
-
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-    generation_kwargs = {
-        **inputs,
-        "streamer": streamer,
-        "max_new_tokens": max_new_tokens,
-        "temperature": temperature,
-        "top_p": top_p,
-        "top_k": top_k,
-        "repetition_penalty": repetition_penalty,
-        "do_sample": True,
-    }
+    generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": max_new_tokens}
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
-
     buffer = ""
     for new_text in streamer:
         buffer += new_text
@@ -197,9 +154,6 @@ def generate_video(model_name: str, text: str, video_path: str,
     Generates responses using the selected model for video input.
     Yields raw text and Markdown-formatted text.
     """
-    processor = None
-    model = None
-
     if model_name == "Qwen2.5-VL-7B-Instruct":
         processor = processor_m
         model = model_m
@@ -210,10 +164,8 @@ def generate_video(model_name: str, text: str, video_path: str,
         processor = processor_q
         model = model_q
     elif model_name == "DeepCaption-VLA-7B":
-        processor = processor_dc
-        model = model_dc
-        # Prepend system prompt for this model
-        text = f"{CAPTION_SYSTEM_PROMPT}\n\n{text}"
+        processor = processor_d
+        model = model_d
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
@@ -223,19 +175,14 @@ def generate_video(model_name: str, text: str, video_path: str,
         return
 
     frames = downsample_video(video_path)
-    # Create the message structure with a system prompt and user query
     messages = [
         {"role": "system", "content": [{"type": "text", "text": "You are a helpful assistant."}]},
         {"role": "user", "content": [{"type": "text", "text": text}]}
     ]
-
-    # Add each frame to the user content
     for frame in frames:
         image, timestamp = frame
-        messages[1]["content"].append({"type": "text", "text": f"Frame at {timestamp}s:"})
+        messages[1]["content"].append({"type": "text", "text": f"Frame {timestamp}:"})
         messages[1]["content"].append({"type": "image", "image": image})
-
-    # Prepare inputs for the model
     inputs = processor.apply_chat_template(
         messages,
         tokenize=True,
@@ -245,7 +192,6 @@ def generate_video(model_name: str, text: str, video_path: str,
         truncation=False,
         max_length=MAX_INPUT_TOKEN_LENGTH
     ).to(device)
-
     streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
     generation_kwargs = {
         **inputs,
@@ -259,14 +205,12 @@ def generate_video(model_name: str, text: str, video_path: str,
     }
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
-
     buffer = ""
     for new_text in streamer:
         buffer += new_text
         time.sleep(0.01)
         yield buffer, buffer
 
-
 # Define examples for image and video inference
 image_examples = [
     ["Provide a detailed caption for the image..", "images/A.jpg"],
@@ -333,12 +277,7 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
                 markdown_output = gr.Markdown()
 
             model_choice = gr.Radio(
-                choices=[
-                    "Qwen2.5-VL-7B-Instruct",
-                    "Qwen2.5-VL-3B-Instruct",
-                    "Qwen2.5-VL-7B-Abliterated-Caption-it",
-                    "DeepCaption-VLA-7B"
-                ],
+                choices=["Qwen2.5-VL-7B-Instruct", "Qwen2.5-VL-3B-Instruct", "Qwen2.5-VL-7B-Abliterated-Caption-it", "DeepCaption-VLA-7B"],
                 label="Select Model",
                 value="Qwen2.5-VL-7B-Instruct"
            )
@@ -346,7 +285,7 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
     gr.Markdown("> [Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct): The Qwen2.5-VL-7B-Instruct model is a multimodal AI model developed by Alibaba Cloud that excels at understanding both text and images. It's a Vision-Language Model (VLM) designed to handle various visual understanding tasks, including image understanding, video analysis, and even multilingual support.")
     gr.Markdown("> [Qwen2.5-VL-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct): Qwen2.5-VL-3B-Instruct is an instruction-tuned vision-language model from Alibaba Cloud, built upon the Qwen2-VL series. It excels at understanding and generating text related to both visual and textual inputs, making it capable of tasks like image captioning, visual question answering, and object localization. The model also supports long video understanding and structured data extraction")
     gr.Markdown("> [Qwen2.5-VL-7B-Abliterated-Caption-it](https://huggingface.co/prithivMLmods/Qwen2.5-VL-7B-Abliterated-Caption-it): Qwen2.5-VL-7B-Abliterated-Caption-it is a fine-tuned version of Qwen2.5-VL-7B-Instruct, optimized for Abliterated Captioning / Uncensored Captioning. This model excels at generating detailed, context-rich, and high-fidelity captions across diverse image categories and variational aspect ratios, offering robust visual understanding without filtering or censorship.")
-    gr.Markdown("> [prithivMLmods/DeepCaption-VLA-7B](https://huggingface.co/prithivMLmods/DeepCaption-VLA-7B): DeepCaption-VLA-7B is a fine-tuned model based on Qwen2.5-VL, designed for generating precise, structured captions and attributes for images. It follows a strict protocol to provide a main caption, a list of visual attributes, and a core class name, making it ideal for detailed and organized visual analysis.")
+    gr.Markdown("> [DeepCaption-VLA-7B](https://huggingface.co/prithivMLmods/DeepCaption-VLA-7B): This is a fine-tuned version of Qwen2.5-VL-7B-Instruct, specialized for Image Captioning and Vision Language Attribution. [1] It is designed to generate precise, highly descriptive captions, focusing on visual properties and object attributes across a wide variety of images. [1]")
     gr.Markdown(">⚠️note: all the models in space are not guaranteed to perform well in video inference use cases.")
 
     image_submit.click(
@@ -361,4 +300,4 @@ with gr.Blocks(css=css, theme="bethecloud/storj_theme") as demo:
     )
 
 if __name__ == "__main__":
-    demo.queue(max_size=50).launch(share=True, show_error=True)
+    demo.queue(max_size=50).launch(share=True, mcp_server=True, ssr_mode=False, show_error=True)
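
For reference, here is a minimal sketch (illustrative, not part of this commit) of how the renamed DeepCaption-VLA-7B loader and chat-template call from app.py can be exercised on a single image outside the Gradio UI. The image path, prompt text, and max_new_tokens value are assumptions; the transformers calls mirror the ones already used in the diff above.

import torch
from PIL import Image
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Same loading pattern as app.py, using the renamed MODEL_ID_D / processor_d / model_d.
MODEL_ID_D = "prithivMLmods/DeepCaption-VLA-7B"
processor_d = AutoProcessor.from_pretrained(MODEL_ID_D, trust_remote_code=True)
model_d = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID_D,
    trust_remote_code=True,
    torch_dtype=torch.float16,
).to(device).eval()

# "images/A.jpg" mirrors the example asset referenced in the Space; any RGB image works.
image = Image.open("images/A.jpg").convert("RGB")
messages = [{
    "role": "user",
    "content": [
        {"type": "image", "image": image},
        {"type": "text", "text": "Provide a detailed caption for the image."},
    ],
}]

# apply_chat_template with tokenize=True / return_dict=True processes the embedded
# PIL image along with the text (same call pattern as generate_image in app.py).
inputs = processor_d.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt",
).to(device)

with torch.inference_mode():
    output_ids = model_d.generate(**inputs, max_new_tokens=256)

# Drop the prompt tokens so only the newly generated caption is decoded.
caption = processor_d.batch_decode(
    output_ids[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True
)[0]
print(caption)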
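
The streaming path in generate_image() follows the standard TextIteratorStreamer pattern from transformers: generation runs in a background thread while the streamer yields decoded text incrementally. A compact sketch of that pattern, reusing the processor_d, model_d, and inputs names from the sketch above (again illustrative, not taken from the commit):

from threading import Thread
from transformers import TextIteratorStreamer

# The processor's decode() delegates to its tokenizer, so it can be passed directly,
# as app.py does.
streamer = TextIteratorStreamer(processor_d, skip_prompt=True, skip_special_tokens=True)
generation_kwargs = {**inputs, "streamer": streamer, "max_new_tokens": 256}

thread = Thread(target=model_d.generate, kwargs=generation_kwargs)
thread.start()

buffer = ""
for new_text in streamer:  # chunks arrive while generate() is still running
    buffer += new_text
    print(new_text, end="", flush=True)
thread.join()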