import gradio as gr
import whisper
import cv2
import numpy as np
import moviepy.editor as mp
from moviepy.video.fx import resize
from transformers import pipeline, AutoTokenizer, AutoModel
import torch
import re
import os
import shutil
import tempfile
from typing import List, Dict, Tuple
import json
import librosa
from textblob import TextBlob
import emoji
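# Assumed dependencies: the imports above correspond to these PyPI packages,
# which would need to be listed in the Space's requirements.txt (package names
# are the usual distributions for these modules, not verified against the
# original Space): gradio, openai-whisper, opencv-python, numpy, moviepy,
# transformers, torch, librosa, textblob, emoji.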
class AIVideoClipper:
    def __init__(self):
        # Initialize models
        print("Loading models...")
        self.whisper_model = whisper.load_model("base")  # Using the base model for the free tier
        self.sentiment_analyzer = pipeline(
            "sentiment-analysis",
            model="cardiffnlp/twitter-roberta-base-sentiment-latest"
        )
        self.emotion_analyzer = pipeline(
            "text-classification",
            model="j-hartmann/emotion-english-distilroberta-base"
        )

        # Viral keywords and patterns
        self.viral_keywords = [
            "wow", "amazing", "incredible", "unbelievable", "shocking", "surprise",
            "secret", "trick", "hack", "tip", "mistake", "fail", "success",
            "breakthrough", "discovery", "reveal", "expose", "truth", "lie",
            "before", "after", "transformation", "change", "upgrade", "improve",
            "money", "rich", "poor", "expensive", "cheap", "free", "save",
            "love", "hate", "angry", "happy", "sad", "funny", "laugh", "cry",
            "first time", "last time", "never", "always", "everyone", "nobody",
            "finally", "suddenly", "immediately", "instantly", "quickly"
        ]
        self.hook_patterns = [
            r"you won't believe",
            r"this will change",
            r"nobody talks about",
            r"the truth about",
            r"what happens when",
            r"here's what",
            r"this is why",
            r"the secret",
            r"watch this",
            r"wait for it"
        ]
    def extract_audio_features(self, audio_path: str) -> Dict:
        """Extract audio features for engagement analysis."""
        y, sr = librosa.load(audio_path)

        # Extract features
        tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
        spectral_centroids = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
        spectral_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)[0]
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)

        return {
            'tempo': float(tempo),
            'spectral_centroid_mean': float(np.mean(spectral_centroids)),
            'spectral_rolloff_mean': float(np.mean(spectral_rolloff)),
            'mfcc_mean': float(np.mean(mfccs)),
            'energy_variance': float(np.var(librosa.feature.rms(y=y)[0]))
        }
    def transcribe_video(self, video_path: str) -> List[Dict]:
        """Transcribe the video and return segments with timestamps."""
        print("Transcribing video...")
        result = self.whisper_model.transcribe(video_path, word_timestamps=True)

        segments = []
        for segment in result["segments"]:
            segments.append({
                'start': segment['start'],
                'end': segment['end'],
                'text': segment['text'].strip(),
                'words': segment.get('words', [])
            })
        return segments
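    # Each segment returned above has this shape (timestamps in seconds; the
    # text and numbers here are illustrative only, not real output):
    #   {'start': 12.4, 'end': 17.9, 'text': "Here's what nobody talks about...",
    #    'words': [{'word': " Here's", 'start': 12.4, 'end': 12.7, ...}, ...]}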
    def calculate_virality_score(self, text: str, audio_features: Dict,
                                 segment_duration: float) -> float:
        """Calculate a virality score for a text segment."""
        score = 0.0
        text_lower = text.lower()

        # Sentiment analysis (the cardiffnlp model returns lowercase labels:
        # 'positive', 'neutral', 'negative'); truncate over-long inputs so the
        # tokenizer never exceeds the model's maximum length
        sentiment = self.sentiment_analyzer(text, truncation=True)[0]
        if sentiment['label'].lower() == 'positive' and sentiment['score'] > 0.8:
            score += 2.0
        elif sentiment['label'].lower() == 'negative' and sentiment['score'] > 0.8:
            score += 1.5

        # Emotion analysis
        emotion = self.emotion_analyzer(text, truncation=True)[0]
        high_engagement_emotions = ['surprise', 'excitement', 'anger', 'joy']
        if emotion['label'].lower() in high_engagement_emotions and emotion['score'] > 0.7:
            score += 2.0

        # Viral keywords
        for keyword in self.viral_keywords:
            if keyword in text_lower:
                score += 1.0

        # Hook patterns
        for pattern in self.hook_patterns:
            if re.search(pattern, text_lower):
                score += 3.0

        # Audio engagement features
        if audio_features['tempo'] > 120:  # Higher tempo = more engaging
            score += 1.0
        if audio_features['energy_variance'] > 0.01:  # Energy variation
            score += 1.0

        # Segment duration (30-60 seconds is ideal for clips)
        if 25 <= segment_duration <= 65:
            score += 2.0
        elif 15 <= segment_duration <= 90:
            score += 1.0

        # Text length (not too short, not too long)
        word_count = len(text.split())
        if 20 <= word_count <= 100:
            score += 1.0

        return min(score, 10.0)  # Cap at 10
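    # Worked example of the scoring above (illustrative text, not real output):
    # "You won't believe this simple money trick", with tempo > 120 and a
    # 30-second duration, would collect the hook pattern "you won't believe"
    # (+3.0), the keywords "money" and "trick" (+2.0), the tempo bonus (+1.0),
    # and the 25-65s duration bonus (+2.0) = 8.0 before any sentiment/emotion
    # bonus, capped at 10.0.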
    def find_best_moments(self, segments: List[Dict], audio_features: Dict,
                          clip_duration: int = 30) -> List[Dict]:
        """Find the best moments for short clips."""
        print("Analyzing segments for viral potential...")

        scored_segments = []
        for i, segment in enumerate(segments):
            # Group segments into potential clips
            clip_segments = [segment]
            current_duration = segment['end'] - segment['start']

            # Extend the clip to reach the desired duration
            j = i + 1
            while j < len(segments) and current_duration < clip_duration:
                next_segment = segments[j]
                if next_segment['end'] - segment['start'] <= clip_duration * 1.5:
                    clip_segments.append(next_segment)
                    current_duration = next_segment['end'] - segment['start']
                    j += 1
                else:
                    break

            # Calculate the virality score for the combined text
            combined_text = " ".join([s['text'] for s in clip_segments])
            virality_score = self.calculate_virality_score(
                combined_text, audio_features, current_duration
            )

            scored_segments.append({
                'start': segment['start'],
                'end': clip_segments[-1]['end'],
                'text': combined_text,
                'duration': current_duration,
                'virality_score': virality_score,
                'segments': clip_segments
            })

        # Sort by virality score, then remove overlapping candidates
        scored_segments.sort(key=lambda x: x['virality_score'], reverse=True)

        final_segments = []
        for segment in scored_segments:
            overlap = False
            for existing in final_segments:
                if (segment['start'] < existing['end'] and
                        segment['end'] > existing['start']):
                    overlap = True
                    break
            if not overlap:
                final_segments.append(segment)
                if len(final_segments) >= 5:  # Limit to the top 5 clips
                    break

        return final_segments
    def add_emojis_to_text(self, text: str) -> str:
        """Add relevant emojis to the text based on its content."""
        emoji_map = {
            'money': '💰', 'rich': '💰', 'dollar': '💵',
            'love': '❤️', 'heart': '❤️', 'like': '👍',
            'fire': '🔥', 'hot': '🔥', 'amazing': '🔥',
            'laugh': '😂', 'funny': '😂', 'lol': '😂',
            'wow': '😱', 'omg': '😱', 'shocking': '😱',
            'cool': '😎', 'awesome': '😎', 'great': '😎',
            'think': '🤔', 'question': '❓', 'why': '🤔',
            'warning': '⚠️', 'careful': '⚠️', 'danger': '⚠️',
            'success': '✅', 'win': '🏆', 'winner': '🏆',
            'music': '🎵', 'song': '🎵', 'sound': '🔊'
        }

        # Process each distinct word once so repeated words don't collect
        # duplicate emojis, and keep the original casing in the replacement
        clean_words = {re.sub(r'[^\w]', '', w) for w in text.lower().split()}
        for clean_word in clean_words:
            if clean_word in emoji_map:
                text = re.sub(
                    rf"\b{re.escape(clean_word)}\b",
                    lambda m: f"{m.group(0)} {emoji_map[clean_word]}",
                    text,
                    flags=re.IGNORECASE
                )
        return text
    def create_clip(self, video_path: str, start_time: float, end_time: float,
                    text: str, output_path: str, add_subtitles: bool = True) -> str:
        """Create a short clip from the video."""
        print(f"Creating clip: {start_time:.1f}s - {end_time:.1f}s")

        # Load video
        video = mp.VideoFileClip(video_path).subclip(start_time, end_time)

        # Resize to 9:16 aspect ratio (1080x1920)
        target_width = 1080
        target_height = 1920

        # Calculate scaling to fit the video in the frame
        scale_w = target_width / video.w
        scale_h = target_height / video.h
        scale = min(scale_w, scale_h)

        # Resize video
        video_resized = video.resize(scale)

        # Create a background (blurred copy) when the resized clip doesn't fill the frame
        if video_resized.h < target_height or video_resized.w < target_width:
            background = video.resize((target_width, target_height))
            background = background.fl_image(lambda frame: cv2.GaussianBlur(frame, (21, 21), 0))

            # Overlay the main video in the center
            final_video = mp.CompositeVideoClip([
                background,
                video_resized.set_position('center')
            ], size=(target_width, target_height))
        else:
            final_video = video_resized

        # Add subtitles if requested
        if add_subtitles and text:
            # Add emojis to the text
            text_with_emojis = self.add_emojis_to_text(text)

            # Create text clip
            txt_clip = mp.TextClip(
                text_with_emojis,
                fontsize=60,
                color='white',
                stroke_color='black',
                stroke_width=3,
                size=(target_width - 100, None),
                method='caption'
            ).set_position(('center', 0.8), relative=True).set_duration(final_video.duration)

            final_video = mp.CompositeVideoClip([final_video, txt_clip])

        # Write the final video
        final_video.write_videofile(
            output_path,
            codec='libx264',
            audio_codec='aac',
            temp_audiofile='temp-audio.m4a',
            remove_temp=True,
            fps=30,
            preset='ultrafast'  # Faster encoding for the free tier
        )

        # Clean up
        video.close()
        final_video.close()

        return output_path
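
# Minimal sketch of using AIVideoClipper directly, outside the Gradio UI
# ("input.mp4" and "clip_1.mp4" are placeholder paths, not files shipped with
# this Space):
#
#   clipper = AIVideoClipper()
#   features = clipper.extract_audio_features("input.mp4")
#   segments = clipper.transcribe_video("input.mp4")
#   moments = clipper.find_best_moments(segments, features, clip_duration=30)
#   clipper.create_clip("input.mp4", moments[0]['start'], moments[0]['end'],
#                       moments[0]['text'], "clip_1.mp4")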
def process_video(video_file, clip_duration, num_clips, add_subtitles):
    """Main function to process a video and create clips."""
    if video_file is None:
        return "Please upload a video file.", [], []

    clipper = AIVideoClipper()

    try:
        # Create a temporary working directory
        with tempfile.TemporaryDirectory() as temp_dir:
            # gr.File(type="filepath") passes a path string; older Gradio
            # versions pass a tempfile-like object with a .name attribute
            video_path = video_file if isinstance(video_file, str) else video_file.name

            # Extract audio features
            print("Extracting audio features...")
            audio_features = clipper.extract_audio_features(video_path)

            # Transcribe video
            segments = clipper.transcribe_video(video_path)

            if not segments:
                return "Could not transcribe video. Please check the audio quality.", [], []

            # Find the best moments
            best_moments = clipper.find_best_moments(segments, audio_features, clip_duration)
            best_moments = best_moments[:num_clips]  # Limit to the requested number

            if not best_moments:
                return "No suitable clips found. Try adjusting parameters.", [], []

            # Create clips
            output_videos = []
            clip_info = []

            for i, moment in enumerate(best_moments):
                output_path = os.path.join(temp_dir, f"clip_{i+1}.mp4")

                try:
                    clipper.create_clip(
                        video_path,
                        moment['start'],
                        moment['end'],
                        moment['text'],
                        output_path,
                        add_subtitles
                    )

                    # Move to a location that survives the temporary directory
                    # (shutil.move also works across filesystems, unlike os.rename)
                    permanent_path = f"clip_{i+1}_{abs(hash(video_path))}_{i}.mp4"
                    shutil.move(output_path, permanent_path)
                    output_videos.append(permanent_path)

                    clip_info.append({
                        'clip_number': i + 1,
                        'start_time': f"{moment['start']:.1f}s",
                        'end_time': f"{moment['end']:.1f}s",
                        'duration': f"{moment['duration']:.1f}s",
                        'virality_score': f"{moment['virality_score']:.2f}/10",
                        'text_preview': moment['text'][:100] + "..." if len(moment['text']) > 100 else moment['text']
                    })
                except Exception as e:
                    print(f"Error creating clip {i+1}: {str(e)}")
                    continue

            success_msg = f"Successfully created {len(output_videos)} clips!"
            return success_msg, output_videos, clip_info

    except Exception as e:
        return f"Error processing video: {str(e)}", [], []
# Create the Gradio interface
def create_interface():
    with gr.Blocks(title="AI Video Clipper", theme=gr.themes.Soft()) as demo:
        gr.Markdown(
            """
            # 🎬 AI Video Clipper

            Transform your long videos into viral short clips automatically!
            Upload your video and let AI find the most engaging moments.

            **Features:**
            - 🤖 AI-powered moment detection
            - 📱 Auto 9:16 aspect ratio conversion
            - 📝 Automatic subtitles with emojis
            - 📊 Virality scoring
            - 🎯 Multi-language support
            """
        )

        with gr.Row():
            with gr.Column():
                video_input = gr.File(
                    label="Upload Video",
                    file_types=[".mp4", ".avi", ".mov", ".mkv", ".webm"],
                    type="filepath"
                )

                with gr.Row():
                    clip_duration = gr.Slider(
                        minimum=15,
                        maximum=90,
                        value=30,
                        step=5,
                        label="Target Clip Duration (seconds)"
                    )
                    num_clips = gr.Slider(
                        minimum=1,
                        maximum=5,
                        value=3,
                        step=1,
                        label="Number of Clips to Generate"
                    )

                add_subtitles = gr.Checkbox(
                    label="Add Subtitles with Emojis",
                    value=True
                )

                process_btn = gr.Button(
                    "🚀 Create Clips",
                    variant="primary",
                    size="lg"
                )

            with gr.Column():
                status_output = gr.Textbox(
                    label="Status",
                    interactive=False,
                    lines=2
                )

                clips_output = gr.Gallery(
                    label="Generated Clips",
                    show_label=True,
                    elem_id="gallery",
                    columns=1,
                    rows=3,
                    height="auto",
                    allow_preview=True,
                    show_download_button=True
                )

        with gr.Row():
            info_output = gr.JSON(
                label="Clip Analysis",
                visible=True
            )

        # Usage tips section
        gr.Markdown("### 📺 Tips for Best Results:")
        gr.Markdown("""
        - Upload videos with clear speech (podcasts, interviews, and tutorials work great!)
        - Longer videos (5+ minutes) provide more clip opportunities
        - Videos with engaging content and emotional moments score higher
        - Good audio quality improves transcription accuracy
        """)

        process_btn.click(
            process_video,
            inputs=[video_input, clip_duration, num_clips, add_subtitles],
            outputs=[status_output, clips_output, info_output]
        )

    return demo
# Launch the app
if __name__ == "__main__":
    demo = create_interface()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False
    )