prithivMLmods committed
Commit b604e8c · verified · 1 Parent(s): 3ed9f62

Update app.py

Files changed (1):
  1. app.py +13 -7
app.py CHANGED
@@ -32,7 +32,7 @@ enhancer_long = pipeline("summarization", model="prithivMLmods/t5-Flan-Prompt-En
 MAX_SEED = np.iinfo(np.int32).max
 MAX_IMAGE_SIZE = 2048
 
-# Qwen2VL caption function – updated to request plain text caption instead of JSON
+# Qwen2VL caption function – updated with no_grad and autocast contexts, and explicit device moves
 @spaces.GPU
 def qwen_caption(image):
     # Convert image to PIL if needed
@@ -48,7 +48,7 @@ def qwen_caption(image):
             ],
         }
     ]
-
+
     text = qwen_processor.apply_chat_template(
         messages, tokenize=False, add_generation_prompt=True
     )
@@ -59,11 +59,17 @@ def qwen_caption(image):
         videos=video_inputs,
         padding=True,
         return_tensors="pt",
-    ).to(device)
-
-    generated_ids = qwen_model.generate(**inputs, max_new_tokens=1024)
+    )
+    # Explicitly move each tensor to the device (this turns the BatchFeature into a plain dict)
+    inputs = {k: v.to(device) for k, v in inputs.items()}
+
+    # Wrap generation in no_grad and autocast contexts to prevent extra memory usage and potential caching issues
+    with torch.no_grad():
+        with torch.amp.autocast(device_type="cuda", dtype=torch.float16):
+            generated_ids = qwen_model.generate(**inputs, max_new_tokens=1024)
+
     generated_ids_trimmed = [
-        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs["input_ids"], generated_ids)
     ]
     output_text = qwen_processor.batch_decode(
         generated_ids_trimmed,
@@ -118,7 +124,7 @@ def process_workflow(image, text_prompt, use_enhancer, seed, randomize_seed, wid
 
 custom_css = """
 .input-group, .output-group {
-
+    /* You can add styling here if needed */
 }
 .submit-btn {
     background: linear-gradient(90deg, #4B79A1 0%, #283E51 100%) !important;
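
For reference, below is a minimal standalone sketch of the generation pattern this commit introduces. It is not the app's exact code: the checkpoint name, the prompt string, and the caption helper are illustrative assumptions; the device handling, the fp16 autocast wrapping, the prompt-token trimming, and the 1024-token budget come from the diff above.

# A minimal sketch, assuming a Qwen2-VL checkpoint served through
# transformers; "Qwen/Qwen2-VL-2B-Instruct" and the prompt text are
# illustrative, not taken from this repo.
import torch
from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
from qwen_vl_utils import process_vision_info

device = "cuda"  # the Space runs under @spaces.GPU, so CUDA is assumed
qwen_model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-2B-Instruct", torch_dtype=torch.float16
).to(device)
qwen_processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

def caption(image) -> str:
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": "Describe this image in plain text."},
            ],
        }
    ]
    text = qwen_processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = qwen_processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # The processor returns a BatchFeature; this comprehension yields a plain
    # dict, which is why the diff switches from inputs.input_ids to
    # inputs["input_ids"] afterwards.
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # no_grad skips autograd bookkeeping during inference; autocast runs the
    # forward pass in fp16. Note that device_type is an argument of
    # torch.amp.autocast; the older torch.cuda.amp.autocast does not accept it.
    with torch.no_grad():
        with torch.amp.autocast(device_type="cuda", dtype=torch.float16):
            generated_ids = qwen_model.generate(**inputs, max_new_tokens=1024)

    # Slice off the prompt tokens so only newly generated text is decoded.
    trimmed = [
        out_ids[len(in_ids) :]
        for in_ids, out_ids in zip(inputs["input_ids"], generated_ids)
    ]
    return qwen_processor.batch_decode(
        trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]

The explicit per-tensor .to(device) is functionally equivalent to calling .to(device) on the BatchFeature itself, as the old code did; the commit's version just makes the device placement, and the resulting plain-dict access pattern, explicit.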