John Ho committed
Commit f18bd0f · 1 Parent(s): bd916de

testing more efficient model loading

Files changed (2):
  1. README.md +3 -3
  2. app.py +47 -31
README.md CHANGED
@@ -1,13 +1,13 @@
 ---
-title: Name for you Space App
-emoji: 📚
+title: Camera Motion Detection
+emoji: 📸
 colorFrom: blue
 colorTo: yellow
 sdk: gradio
 sdk_version: 5.32.0
 app_file: app.py
 pinned: false
-short_description: short description for your Space App
+short_description: Demo of the camera motion detection as part of CameraBench
 ---

 # The HuggingFace Space Template
app.py CHANGED
@@ -20,10 +20,16 @@ subprocess.run(
 )
 # --- now we got Flash Attention ---#

-# The model is trained on 8.0 FPS which we recommend for optimal inference
-
-DTYPE = torch.bfloat16 if torch.cuda.is_available() else torch.float16
-DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+# Set target DEVICE and DTYPE
+# For maximum memory efficiency, use bfloat16 if your GPU supports it, otherwise float16.
+DTYPE = (
+    torch.bfloat16
+    if torch.cuda.is_available() and torch.cuda.is_bfloat16_supported()
+    else torch.float16
+)
+# DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+# Use "auto" to let accelerate handle device placement (GPU, CPU, disk)
+DEVICE = "auto"
 logger.info(f"Device: {DEVICE}, dtype: {DTYPE}")


@@ -60,9 +66,20 @@ def load_model(
             device_map=DEVICE,
         )
     )
+    # Set model to evaluation mode for inference (disables dropout, etc.)
+    model.eval()
     return model


+def load_processor(model_name="Qwen/Qwen2.5-VL-7B-Instruct"):
+    return AutoProcessor.from_pretrained(
+        model_name,
+        device_map=DEVICE,
+        use_fast=True,
+        torch_dtype=DTYPE,
+    )
+
+
 @spaces.GPU(duration=120)
 def inference(
     video_path: str,
@@ -70,13 +87,10 @@ def inference(
     use_flash_attention: bool = True,
 ):
     # default processor
-    processor = AutoProcessor.from_pretrained(
-        "Qwen/Qwen2.5-VL-7B-Instruct",
-        device_map=DEVICE,
-        use_fast=True,
-        torch_dtype=DTYPE,
-    )
+    processor = load_processor()
     model = load_model(use_flash_attention=use_flash_attention)
+
+    # The model is trained on 8.0 FPS which we recommend for optimal inference
     fps = get_fps_ffmpeg(video_path)
     logger.info(f"{os.path.basename(video_path)} FPS: {fps}")
     messages = [
@@ -99,28 +113,30 @@ def inference(
     image_inputs, video_inputs, video_kwargs = process_vision_info(
         messages, return_video_kwargs=True
     )
-    inputs = processor(
-        text=[text],
-        images=image_inputs,
-        videos=video_inputs,
-        # fps=fps,
-        padding=True,
-        return_tensors="pt",
-        **video_kwargs,
-    )
-    inputs = inputs.to(DEVICE)

-    # Inference
-    generated_ids = model.generate(**inputs, max_new_tokens=128)
-    generated_ids_trimmed = [
-        out_ids[len(in_ids) :]
-        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
-    ]
-    output_text = processor.batch_decode(
-        generated_ids_trimmed,
-        skip_special_tokens=True,
-        clean_up_tokenization_spaces=False,
-    )
+    with torch.no_grad():
+        inputs = processor(
+            text=[text],
+            images=image_inputs,
+            videos=video_inputs,
+            # fps=fps,
+            padding=True,
+            return_tensors="pt",
+            **video_kwargs,
+        )
+        # inputs = inputs.to(DEVICE)
+
+        # Inference
+        generated_ids = model.generate(**inputs, max_new_tokens=128)
+        generated_ids_trimmed = [
+            out_ids[len(in_ids) :]
+            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+        output_text = processor.batch_decode(
+            generated_ids_trimmed,
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False,
+        )
     return output_text


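For readers skimming the commit, the pattern being tested distills to: pick bfloat16 when the GPU supports it, let accelerate place the weights via device_map="auto", reuse one model and one processor, and run generation without autograd. Below is a minimal sketch of that pattern, assuming the same transformers classes app.py already uses (Qwen2_5_VLForConditionalGeneration, AutoProcessor); the lru_cache wrappers and the generate_text helper are illustrative additions, not part of this commit.

# Sketch only: same model/processor names as app.py, with hypothetical caching helpers.
from functools import lru_cache

import torch
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration

MODEL_NAME = "Qwen/Qwen2.5-VL-7B-Instruct"

# Prefer bfloat16 when the GPU supports it, otherwise fall back to float16.
DTYPE = (
    torch.bfloat16
    if torch.cuda.is_available() and torch.cuda.is_bfloat16_supported()
    else torch.float16
)


@lru_cache(maxsize=1)
def get_model():
    # device_map="auto" lets accelerate decide placement (GPU, CPU, or disk offload).
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        MODEL_NAME,
        torch_dtype=DTYPE,
        device_map="auto",
    )
    model.eval()  # inference only: disables dropout and similar training-time behavior
    return model


@lru_cache(maxsize=1)
def get_processor():
    return AutoProcessor.from_pretrained(MODEL_NAME, use_fast=True)


def generate_text(prompt_inputs, max_new_tokens=128):
    # prompt_inputs: the processor's output, assumed already on the model's device.
    model = get_model()
    with torch.no_grad():  # no gradients are needed at inference time
        generated_ids = model.generate(**prompt_inputs, max_new_tokens=max_new_tokens)
    return get_processor().batch_decode(generated_ids, skip_special_tokens=True)

As a design note, torch.inference_mode() is a stricter alternative to torch.no_grad() for pure inference paths; either avoids building the autograd graph during generate().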