Spaces:

enpeizhao
/

VLM_ODD_Online_Demo

Sleeping

App Files Files Community

enpeizhao committed on Jul 28

Commit

b651759

1 Parent(s): 98ba949

fix video inference and add images

Browse files

Files changed (2) hide show

app.py +98 -9
packages.txt +1 -0

app.py CHANGED Viewed

@@ -128,7 +128,7 @@ def process_video_frames(video_path, prompt):
             print(f"Qwen-VL style processing failed: {e}")
             # Process first frame with text prompt
             first_frame = frames[0]
-            inputs = processor(images=first_frame, text=prompt, return_tensors="pt").to(device)
             # Generate response
             with torch.no_grad():
@@ -141,6 +141,60 @@ def process_video_frames(video_path, prompt):
     except Exception as e:
         return f"Error processing video: {str(e)}"
 def video_qa(video, prompt):
     """Main function for Gradio interface"""
     if video is None:
@@ -181,33 +235,68 @@ def video_qa(video, prompt):
     except Exception as e:
         return f"Error processing video: {str(e)}"
 # Create Gradio interface
 with gr.Blocks() as demo:
-    gr.Markdown("# Video Question Answering with Custom VLM")
     gr.Markdown(f"Model: {MODEL_ID}")
     with gr.Row():
         with gr.Column():
-            video_input = gr.Video(label="Upload Video", interactive=True)
-            text_input = gr.Textbox(label="Question", placeholder="What is happening in this video?")
             submit_btn = gr.Button("Process")
         with gr.Column():
             output_text = gr.Textbox(label="Answer", lines=10)
-    # Examples
     gr.Examples(
         examples=[
-            [None, "Describe what you see in the video"],
             [None, "What objects are present in the scene?"]
         ],
-        inputs=[video_input, text_input],
         outputs=output_text
     )
     submit_btn.click(
-        fn=video_qa,
-        inputs=[video_input, text_input],
         outputs=output_text
     )

             print(f"Qwen-VL style processing failed: {e}")
             # Process first frame with text prompt
             first_frame = frames[0]
+            inputs = processor(text=prompt, videos=[first_frame], return_tensors="pt").to(device)
             # Generate response
             with torch.no_grad():
     except Exception as e:
         return f"Error processing video: {str(e)}"
+def process_media(media, prompt):
+    """
+    Generic processing function supporting an image (PIL.Image) or a video (file path).
+
+    Args:
+        media: Either a PIL ``Image.Image`` or a string path to an existing
+            video file.
+        prompt: The question/instruction text sent along with the frames.
+
+    Returns:
+        A string: the decoded model response, the response prefixed with
+        "[Processed first frame only]" when the fallback path was used, or
+        a human-readable error message. Errors from the model calls are
+        reported through the return value rather than raised.
+
+    Relies on the module-level ``model``, ``processor``, ``tokenizer`` and
+    ``device`` objects.
+    """
+    if model is None or processor is None or tokenizer is None:
+        return "Model not loaded properly"
+    # Determine the input type
+    if isinstance(media, Image.Image):
+        # Single image: wrapped as a one-frame list so both input kinds
+        # share the same frame-based code path below
+        frames = [media]
+    elif isinstance(media, str) and os.path.exists(media):
+        # Video path: extract frames (at most 8 are requested)
+        frames = extract_frames(media, max_frames=8)
+        if not frames:
+            return "No frames extracted from video"
+    else:
+        return "Unsupported media type"
+    # Build the chat message; the frames are sent in the "video" content
+    # slot even when the input was a single image
+    messages = [
+        {
+            "role": "user",
+            "content": [
+                {"type": "video", "video": frames},
+                {"type": "text", "text": prompt},
+            ],
+        }
+    ]
+    try:
+        # Qwen-VL style processing
+        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        inputs = processor(text=text, videos=frames, return_tensors="pt")
+        inputs = inputs.to(device)
+        with torch.no_grad():
+            generated_ids = model.generate(**inputs, max_new_tokens=512)
+        # Drop the leading prompt tokens from each sequence so that only the
+        # newly generated tokens are decoded
+        generated_ids = [
+            output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, generated_ids)
+        ]
+        response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        return response
+    except Exception as e:
+        # Fallback: retry with the raw prompt (no chat template) and only the
+        # first frame, with a shorter generation budget
+        print(f"Qwen-VL style processing failed: {e}")
+        first_frame = frames[0]
+        try:
+            inputs = processor(text=prompt, videos=[first_frame], return_tensors="pt").to(device)
+            with torch.no_grad():
+                outputs = model.generate(**inputs, max_new_tokens=100)
+            # Unlike the primary path, the full output sequence is decoded here
+            response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+            return f"[Processed first frame only] {response}"
+        except Exception as e2:
+            return f"Error processing media: {str(e2)}"
 def video_qa(video, prompt):
     """Main function for Gradio interface"""
     if video is None:
     except Exception as e:
         return f"Error processing video: {str(e)}"
+def media_qa(media, prompt):
+    """Main function for the Gradio interface; supports an image or a video.
+
+    Args:
+        media: The uploaded media. A string path to an existing file goes
+            through the copy/convert pipeline below; any other value is
+            handed to ``process_media`` unchanged.
+        prompt: The user's question about the media.
+
+    Returns:
+        A string with the model's answer, or a human-readable message when
+        input is missing or processing fails. Exceptions are reported via
+        the return value rather than raised.
+    """
+    if media is None:
+        return "Please upload an image or video"
+    if not prompt:
+        return "Please enter a question"
+    # Anything that is not an existing file path is processed directly
+    if not (isinstance(media, str) and os.path.exists(media)):
+        try:
+            return process_media(media, prompt)
+        except Exception as e:
+            return f"Error processing image: {str(e)}"
+    # File-path flow (same steps as the original video_qa): copy the upload
+    # to a temp file, try to convert it, then run inference on the result.
+    # NOTE(review): the UI wires this callback to a gr.File input; if that
+    # component delivers file paths, uploaded images also take this branch.
+    # Confirm that convert_video_format/extract_frames handle still images.
+    try:
+        # Every temp file that gets created is recorded here so that cleanup
+        # removes all of them, whichever one ends up being used for inference
+        temp_paths = []
+        try:
+            for _ in range(2):
+                with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp_file:
+                    temp_paths.append(tmp_file.name)
+            input_path, output_path = temp_paths
+            with open(input_path, "wb") as f:
+                with open(media, "rb") as uploaded_file:
+                    f.write(uploaded_file.read())
+            # Fall back to the unconverted copy when conversion fails; the
+            # choice is kept in its own variable so output_path still gets
+            # cleaned up afterwards
+            if convert_video_format(input_path, output_path):
+                source_path = output_path
+            else:
+                source_path = input_path
+            return process_media(source_path, prompt)
+        finally:
+            for path in temp_paths:
+                if os.path.exists(path):
+                    os.unlink(path)
+    except Exception as e:
+        return f"Error processing video: {str(e)}"
 # Create Gradio interface
 # Layout: a title and the model id on top, then one row with an input
 # column (file upload, question box, submit button) and an output column
 # (answer text box).
 with gr.Blocks() as demo:
+    gr.Markdown("# Image/Video Question Answering with Custom VLM")
     gr.Markdown(f"Model: {MODEL_ID}")
     with gr.Row():
         with gr.Column():
+            media_input = gr.File(label="Upload Image or Video", file_types=["image", "video"], interactive=True)
+            text_input = gr.Textbox(label="Question", placeholder="What is happening in this image or video?")
             submit_btn = gr.Button("Process")
         with gr.Column():
             output_text = gr.Textbox(label="Answer", lines=10)
     # Example rows supply only a question; the media slot is None
     gr.Examples(
         examples=[
+            [None, "Describe what you see in the image or video"],
             [None, "What objects are present in the scene?"]
         ],
+        inputs=[media_input, text_input],
         outputs=output_text
     )
     # The button passes the uploaded file and the question to media_qa and
     # routes its return value to the answer box
     submit_btn.click(
+        fn=media_qa,
+        inputs=[media_input, text_input],
         outputs=output_text
     )

packages.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ ffmpeg