Aditi755 committed
Commit 7a9faec · verified · 1 Parent(s): 9b3e431

Update app.py

Files changed (1)
  1. app.py +78 -26
app.py CHANGED

@@ -3,24 +3,69 @@ import torch
 from transformers import LlavaNextVideoProcessor, LlavaNextVideoForConditionalGeneration
 import tempfile
 import os
+import cv2
+import numpy as np
 
 # Load model and processor
 model_id = "llava-hf/LLaVA-NeXT-Video-7B-hf"
-processor = LlavaNextVideoProcessor.from_pretrained(model_id)
-model = LlavaNextVideoForConditionalGeneration.from_pretrained(
-    model_id,
-    torch_dtype=torch.float16,
-    device_map="auto"
-)
 
-def analyze_dance_video(video_file, pose_scores = 0.85, music_info="Unknown"):
+# Initialize processor and model with error handling
+try:
+    processor = LlavaNextVideoProcessor.from_pretrained(model_id)
+    model = LlavaNextVideoForConditionalGeneration.from_pretrained(
+        model_id,
+        torch_dtype=torch.float16,
+        device_map="auto",
+        low_cpu_mem_usage=True
+    )
+except Exception as e:
+    print(f"Error loading model: {e}")
+    processor = None
+    model = None
+
+def process_video_file(video_path):
+    """Convert video file to the format expected by the model"""
+    try:
+        # Read video using OpenCV
+        cap = cv2.VideoCapture(video_path)
+        frames = []
+
+        while True:
+            ret, frame = cap.read()
+            if not ret:
+                break
+            # Convert BGR to RGB
+            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+            frames.append(frame)
+
+        cap.release()
+
+        # Convert to numpy array and normalize
+        video_frames = np.array(frames)
+        return video_frames
+
+    except Exception as e:
+        print(f"Error processing video: {e}")
+        return None
+
+def analyze_dance_video(video_file, pose_scores="0.85", music_info="Unknown"):
     """
     Analyze dance video with pose scores and music information
     """
+    if model is None or processor is None:
+        return "Error: Model not loaded properly. Please check the logs."
+
+    if video_file is None:
+        return "Please upload a video file."
+
     try:
+        # Process the video file
+        video_frames = process_video_file(video_file)
+        if video_frames is None:
+            return "Error: Could not process video file."
+
         # Prepare the prompt
-        prompt = f"""
-You are an expert dance instructor. Analyze this dance performance video.
+        prompt = f"""USER: You are an expert dance instructor. Analyze this dance performance video.
 
 Additional Data:
 - MediaPipe Pose Scores: {pose_scores}
@@ -37,16 +82,20 @@ def analyze_dance_video(video_file, pose_scores = 0.85, music_info="Unknown"):
 7. The accuracy and technique of the dancer's poses, considering the pose scores.
 8. The fluidity and smoothness of transitions between moves.
 9. Specific areas where the dancer can improve.
+
 
 Give constructive feedback in a friendly, encouraging tone.
-        """
+ASSISTANT:"""
 
-        # Process video
+        # Process video with the model
         inputs = processor(
            text=prompt,
-            videos=video_file,
+            videos=[video_frames],  # Note: videos expects a list
            return_tensors="pt"
-        ).to(model.device, torch.float16)
+        )
+
+        # Move inputs to the same device as model
+        inputs = {k: v.to(model.device) if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
 
         # Generate response
         with torch.no_grad():
@@ -55,14 +104,16 @@ def analyze_dance_video(video_file, pose_scores = 0.85, music_info="Unknown"):
                max_new_tokens=500,
                do_sample=True,
                temperature=0.7,
-                pad_token_id=processor.tokenizer.eos_token_id
+                pad_token_id=processor.tokenizer.eos_token_id,
+                eos_token_id=processor.tokenizer.eos_token_id
            )
 
         # Decode response
         response = processor.decode(output[0], skip_special_tokens=True)
 
-        # Extract just the generated part (after the prompt)
-        response = response.split("Give constructive feedback in a friendly, encouraging tone.")[-1].strip()
+        # Extract just the generated part (after ASSISTANT:)
+        if "ASSISTANT:" in response:
+            response = response.split("ASSISTANT:")[-1].strip()
 
         return response
 
@@ -77,17 +128,18 @@ with gr.Blocks(title="AI Dance Instructor") as demo:
     with gr.Row():
         with gr.Column():
             video_input = gr.Video(
-                label="Upload Dance Video",
-                format="mp4"
+                label="Upload Dance Video"
             )
             pose_scores = gr.Textbox(
                label="MediaPipe Pose Scores",
-                placeholder="Enter pose scores data from MediaPipe...",
-                lines=5
+                placeholder="Enter pose scores data from MediaPipe (e.g., 0.85)...",
+                value="0.85",
+                lines=3
             )
             music_info = gr.Textbox(
                label="Music Information",
                placeholder="BPM, genre, rhythm details...",
+                value="120 BPM, Pop music",
                lines=3
             )
             analyze_btn = gr.Button("Analyze Dance", variant="primary")
@@ -106,17 +158,17 @@ with gr.Blocks(title="AI Dance Instructor") as demo:
         outputs=[feedback_output]
     )
 
-    # Add API endpoint for Next.js integration
+    # Add API endpoint info
     gr.Markdown("### API Usage")
     gr.Markdown("""
-    **For Next.js Integration:**
+    **For Next.js Integration, use the API endpoint:**
     ```javascript
+    const formData = new FormData();
+    formData.append('data', JSON.stringify([videoFile, poseScores, musicInfo]));
+
    const response = await fetch('https://your-space-name.hf.space/api/predict', {
        method: 'POST',
-        headers: { 'Content-Type': 'application/json' },
-        body: JSON.stringify({
-            data: [videoFile, poseScores, musicInfo]
-        })
+        body: formData
    });
    ```
    """)