enpeizhao committed on
Commit b651759 · 1 Parent(s): 98ba949

fix video inference and add images

Files changed (2)
  1. app.py +98 -9
  2. packages.txt +1 -0
app.py CHANGED
@@ -128,7 +128,7 @@ def process_video_frames(video_path, prompt):
        print(f"Qwen-VL style processing failed: {e}")
        # Process first frame with text prompt
        first_frame = frames[0]
-       inputs = processor(images=first_frame, text=prompt, return_tensors="pt").to(device)
+       inputs = processor(text=prompt, videos=[first_frame], return_tensors="pt").to(device)

        # Generate response
        with torch.no_grad():
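
The one-line change above is the core of the "fix video inference" part of this commit: the first-frame fallback used to send the frame down the processor's image path (`images=first_frame`) and now wraps it in a list and routes it through the video path (`videos=[first_frame]`), matching how the rest of the pipeline calls the processor. A minimal sketch of the new call, assuming a Qwen2-VL-style processor (the checkpoint name below is illustrative, not necessarily this Space's actual `MODEL_ID`):

```python
# Hedged sketch: assumes a Qwen2-VL-style AutoProcessor; the checkpoint is
# illustrative, not necessarily the one this Space loads.
from PIL import Image
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
first_frame = Image.new("RGB", (224, 224))  # stand-in for an extracted frame

messages = [{"role": "user", "content": [
    {"type": "video"},  # the template expands this into video placeholder tokens
    {"type": "text", "text": "What is happening in this video?"},
]}]
prompt = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Old call (image path): processor(images=first_frame, text=prompt, ...)
# New call (video path): a single frame passed as a one-frame video.
inputs = processor(text=prompt, videos=[first_frame], return_tensors="pt")
```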
@@ -141,6 +141,60 @@ def process_video_frames(video_path, prompt):
    except Exception as e:
        return f"Error processing video: {str(e)}"

+ def process_media(media, prompt):
+     """
+     Generic handler supporting an image (PIL.Image) or a video (file path)
+     """
+     if model is None or processor is None or tokenizer is None:
+         return "Model not loaded properly"
+
+     # Determine the input type
+     if isinstance(media, Image.Image):
+         # A single image
+         frames = [media]
+     elif isinstance(media, str) and os.path.exists(media):
+         # A video path: extract frames
+         frames = extract_frames(media, max_frames=8)
+         if not frames:
+             return "No frames extracted from video"
+     else:
+         return "Unsupported media type"
+
+     # Build the chat messages
+     messages = [
+         {
+             "role": "user",
+             "content": [
+                 {"type": "video", "video": frames},
+                 {"type": "text", "text": prompt},
+             ],
+         }
+     ]
+
+     try:
+         # Qwen-VL style processing
+         text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+         inputs = processor(text=text, videos=frames, return_tensors="pt")
+         inputs = inputs.to(device)
+         with torch.no_grad():
+             generated_ids = model.generate(**inputs, max_new_tokens=512)
+         generated_ids = [
+             output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, generated_ids)
+         ]
+         response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+         return response
+     except Exception as e:
+         print(f"Qwen-VL style processing failed: {e}")
+         first_frame = frames[0]
+         try:
+             inputs = processor(text=prompt, videos=[first_frame], return_tensors="pt").to(device)
+             with torch.no_grad():
+                 outputs = model.generate(**inputs, max_new_tokens=100)
+             response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+             return f"[Processed first frame only] {response}"
+         except Exception as e2:
+             return f"Error processing media: {str(e2)}"
+
    def video_qa(video, prompt):
        """Main function for Gradio interface"""
        if video is None:
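
The new `process_media` above depends on an `extract_frames` helper that sits outside this diff. A hypothetical reconstruction, assuming OpenCV and uniform sampling (both assumptions; only the call signature `extract_frames(path, max_frames=8)` comes from the commit):

```python
# Hypothetical sketch of extract_frames; cv2 and the uniform-sampling
# strategy are assumptions, only the signature appears in the diff.
import cv2
from PIL import Image

def extract_frames(video_path, max_frames=8):
    """Uniformly sample up to max_frames RGB frames from a video file."""
    cap = cv2.VideoCapture(video_path)
    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    if total <= 0:
        cap.release()
        return []
    step = max(1, total // max_frames)
    frames = []
    for idx in range(0, total, step):
        cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
        ok, frame = cap.read()
        if not ok:
            break
        # OpenCV decodes BGR; convert to the RGB PIL images the processor expects
        frames.append(Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
        if len(frames) >= max_frames:
            break
    cap.release()
    return frames
```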
@@ -181,33 +235,68 @@ def video_qa(video, prompt):
    except Exception as e:
        return f"Error processing video: {str(e)}"

+ def media_qa(media, prompt):
+     """Main function for the Gradio interface; supports images or videos"""
+     if media is None:
+         return "Please upload an image or video"
+     if not prompt:
+         return "Please enter a question"
+
+     # Check whether the input is a video file path
+     if isinstance(media, str) and os.path.exists(media):
+         # Video pipeline (same as the original video_qa)
+         try:
+             with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp_input:
+                 input_path = tmp_input.name
+             with tempfile.NamedTemporaryFile(delete=False, suffix=".mp4") as tmp_output:
+                 output_path = tmp_output.name
+             try:
+                 with open(input_path, "wb") as f:
+                     with open(media, "rb") as uploaded_file:
+                         f.write(uploaded_file.read())
+                 if not convert_video_format(input_path, output_path):
+                     output_path = input_path
+                 result = process_media(output_path, prompt)
+                 return result
+             finally:
+                 for path in [input_path, output_path]:
+                     if os.path.exists(path):
+                         os.unlink(path)
+         except Exception as e:
+             return f"Error processing video: {str(e)}"
+     else:
+         # Process the image directly
+         try:
+             return process_media(media, prompt)
+         except Exception as e:
+             return f"Error processing image: {str(e)}"
+
    # Create Gradio interface
    with gr.Blocks() as demo:
-       gr.Markdown("# Video Question Answering with Custom VLM")
+       gr.Markdown("# Image/Video Question Answering with Custom VLM")
        gr.Markdown(f"Model: {MODEL_ID}")

        with gr.Row():
            with gr.Column():
-               video_input = gr.Video(label="Upload Video", interactive=True)
-               text_input = gr.Textbox(label="Question", placeholder="What is happening in this video?")
+               media_input = gr.File(label="Upload Image or Video", file_types=["image", "video"], interactive=True)
+               text_input = gr.Textbox(label="Question", placeholder="What is happening in this image or video?")
                submit_btn = gr.Button("Process")

            with gr.Column():
                output_text = gr.Textbox(label="Answer", lines=10)

-       # Examples
        gr.Examples(
            examples=[
-               [None, "Describe what you see in the video"],
+               [None, "Describe what you see in the image or video"],
                [None, "What objects are present in the scene?"]
            ],
-           inputs=[video_input, text_input],
+           inputs=[media_input, text_input],
            outputs=output_text
        )

        submit_btn.click(
-           fn=video_qa,
-           inputs=[video_input, text_input],
+           fn=media_qa,
+           inputs=[media_input, text_input],
            outputs=output_text
        )

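One idiom in the new `process_media` worth flagging for reviewers: `model.generate` returns the prompt tokens followed by the newly generated tokens, so the list comprehension slices each output by its input length to decode the answer alone. A self-contained demo of the same trimming (GPT-2 is just an arbitrary small stand-in model):

```python
# Demo of prompt-trimming after generate(); GPT-2 is an arbitrary small model.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tok("The capital of France is", return_tensors="pt")
with torch.no_grad():
    out = model.generate(**inputs, max_new_tokens=5, pad_token_id=tok.eos_token_id)

# Slice off the echoed prompt so only the new tokens are decoded.
trimmed = [o[len(i):] for i, o in zip(inputs.input_ids, out)]
print(tok.batch_decode(trimmed, skip_special_tokens=True)[0])
```
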
packages.txt ADDED
@@ -0,0 +1 @@
+ ffmpeg
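
`packages.txt` installs ffmpeg at the system level, which is presumably what the `convert_video_format` helper (called in `media_qa` but not shown in this diff) shells out to. A hypothetical sketch under that assumption:

```python
# Hypothetical convert_video_format; the function is called in the diff but
# not defined there, so this ffmpeg re-encode is an assumption.
import subprocess

def convert_video_format(input_path, output_path):
    """Re-encode a video to H.264 MP4; return True on success."""
    result = subprocess.run(
        ["ffmpeg", "-y", "-i", input_path,
         "-c:v", "libx264", "-pix_fmt", "yuv420p", output_path],
        capture_output=True,
    )
    return result.returncode == 0
```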