whyumesh committed · Commit 4142d39 · verified · 1 Parent(s): 124292b

Update app.py

Files changed (1):
  1. app.py +106 -67

app.py CHANGED
@@ -1,21 +1,38 @@
 import gradio as gr
 from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
 from qwen_vl_utils import process_vision_info
-import torch
 from PIL import Image
 import cv2
 import numpy as np
 import os
 
+import torch
+
+# Optimize for A100
+torch.backends.cuda.matmul.allow_tf32 = True
+torch.backends.cudnn.allow_tf32 = True
+
+# Set the default tensor type to cuda
+if torch.cuda.is_available():
+    torch.set_default_tensor_type('torch.cuda.FloatTensor')
+
+
+
 def load_model():
-    model = Qwen2VLForConditionalGeneration.from_pretrained(
-        "Qwen/Qwen2-VL-2B-Instruct",
-        torch_dtype=torch.float16
-    ).to("cuda")
-    processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
-    return model, processor
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    try:
+        model = Qwen2VLForConditionalGeneration.from_pretrained(
+            "Qwen/Qwen2-VL-2B-Instruct",
+            torch_dtype=torch.float16,  # Use float16 for faster inference on GPU
+            device_map="auto"  # This will automatically handle multi-GPU setups
+        )
+        processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
+        return model, processor, device
+    except Exception as e:
+        print(f"Error loading model: {e}")
+        return None, None, None
 
-model, processor = load_model()
+model, processor, device = load_model()
 
 SYSTEM_PROMPT = """You are an expert technical analyst specializing in identifying bugs, fixing errors, and explaining code functions from visual inputs. When presented with an image or video:
 1. If you see code, analyze it for potential bugs or errors, and suggest fixes.
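A note on the GPU setup introduced above: torch.set_default_tensor_type('torch.cuda.FloatTensor') routes every newly created tensor to the GPU and has been deprecated in recent PyTorch releases. A minimal sketch of the same A100 tuning with the current APIs (assumes PyTorch 2.1+; torch.set_default_device is the suggested stand-in, not part of this commit):

import torch

# TF32 trades a sliver of float32 matmul precision for large speedups on Ampere GPUs (A100)
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

# Suggested replacement for the deprecated set_default_tensor_type call:
# newly created tensors default to the GPU without forcing a dtype
if torch.cuda.is_available():
    torch.set_default_device("cuda")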
@@ -54,71 +71,93 @@ def analyze_image(image, prompt):
     return generate_response(messages)
 
 def analyze_video(video_path, prompt, max_frames=16, frame_interval=30, max_resolution=224):
-    cap = cv2.VideoCapture(video_path)
-    frames = []
-    frame_count = 0
-
-    while len(frames) < max_frames:
-        ret, frame = cap.read()
-        if not ret:
-            break
+    try:
+        cap = cv2.VideoCapture(video_path)
+        if not cap.isOpened():
+            return "Error: Could not open video file."
 
-        if frame_count % frame_interval == 0:
-            h, w = frame.shape[:2]
-            if h > w:
-                new_h, new_w = max_resolution, int(w * max_resolution / h)
-            else:
-                new_h, new_w = int(h * max_resolution / w), max_resolution
-            frame = cv2.resize(frame, (new_w, new_h))
+        frames = []
+        frame_count = 0
+
+        while len(frames) < max_frames:
+            ret, frame = cap.read()
+            if not ret:
+                break
 
-            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-            frame = Image.fromarray(frame)
+            if frame_count % frame_interval == 0:
+                h, w = frame.shape[:2]
+                if h > w:
+                    new_h, new_w = max_resolution, int(w * max_resolution / h)
+                else:
+                    new_h, new_w = int(h * max_resolution / w), max_resolution
+                frame = cv2.resize(frame, (new_w, new_h))
+
+                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+                frame = Image.fromarray(frame)
+
+                frames.append(frame)
 
-            frames.append(frame)
-
-        frame_count += 1
+            frame_count += 1
 
-    cap.release()
-
-    messages = [
-        {"role": "system", "content": SYSTEM_PROMPT},
-        {
-            "role": "user",
-            "content": [
-                {"type": "video", "video": frames},
-                {"type": "text", "text": f"Based on the system instructions, {prompt}"},
-            ],
-        }
-    ]
-
-    return generate_response(messages)
+        return generate_response([
+            {"role": "system", "content": SYSTEM_PROMPT},
+            {
+                "role": "user",
+                "content": [
+                    {"type": "video", "video": frames},
+                    {"type": "text", "text": f"Based on the system instructions, {prompt}"},
+                ],
+            }
+        ])
+    except Exception as e:
+        return f"Error processing video: {e}"
+    finally:
+        if 'cap' in locals():
+            cap.release()
+
 
 def generate_response(messages):
-    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    image_inputs, video_inputs = process_vision_info(messages)
-
-    inputs = processor(
-        text=[text],
-        images=image_inputs,
-        videos=video_inputs,
-        padding=True,
-        return_tensors="pt",
-    ).to("cuda")
-
-    del image_inputs, video_inputs
-    torch.cuda.empty_cache()
-
-    with torch.no_grad():
-        generated_ids = model.generate(**inputs, max_new_tokens=512)  # Increased token limit for more detailed responses
-    generated_ids_trimmed = [
-        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-    ]
-    output_text = processor.batch_decode(
-        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
-    )
-
-    return output_text[0]
-
+    try:
+        text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        image_inputs, video_inputs = process_vision_info(messages)
+
+        inputs = processor(
+            text=[text],
+            images=image_inputs,
+            videos=video_inputs,
+            padding=True,
+            return_tensors="pt"
+        )
+
+        # Move inputs to GPU
+        inputs = {k: v.to(device) for k, v in inputs.items()}
+
+        with torch.no_grad():
+            generated_ids = model.generate(
+                **inputs,
+                max_new_tokens=512,
+                do_sample=True,
+                top_k=20,
+                top_p=0.9,
+                temperature=0.7
+            )
+
+        generated_ids_trimmed = [
+            out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs["input_ids"], generated_ids)
+        ]
+        output_text = processor.batch_decode(
+            generated_ids_trimmed,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False
+        )
+
+        # Clear CUDA cache
+        torch.cuda.empty_cache()
+
+        return output_text[0]
+    except Exception as e:
+        return f"Error generating response: {e}"
+
 # Gradio interface
 iface = gr.Interface(
     fn=process_content,
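With the defaults above, analyze_video keeps one frame per frame_interval reads, capped at max_frames, so a 30 fps clip is sampled at roughly one frame per second over its first ~16 seconds. A standalone sketch of that sampling rule (sample_indices is a hypothetical helper for illustration, not part of the commit):

def sample_indices(total_frames, max_frames=16, frame_interval=30):
    # Frame indices the loop above keeps: every frame_interval-th frame,
    # stopping after max_frames of them or at the end of the video.
    return list(range(0, total_frames, frame_interval))[:max_frames]

print(sample_indices(600))  # 20 s clip at 30 fps -> [0, 30, 60, ..., 450]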
 
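One detail worth noting in the reworked generate_response: model.generate returns the prompt tokens followed by the completion, so the out_ids[len(in_ids):] slice drops the echoed prompt before decoding. A toy illustration with plain lists standing in for tensors (values are made up):

input_ids = [101, 7592, 2088]            # tokens fed into generate()
generated = [101, 7592, 2088, 345, 678]  # prompt echoed back, then new tokens
completion = generated[len(input_ids):]  # -> [345, 678]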