Spaces:
Running
Running
import { | |
AutoProcessor, | |
AutoModelForImageTextToText, | |
load_image, | |
TextStreamer, | |
} from "https://cdn.jsdelivr.net/npm/@huggingface/[email protected]"; | |
/**
 * Browser controller for the video-captioning page.
 *
 * Flow: the user picks a video, evenly spaced frames are grabbed through an
 * off-screen <video> + <canvas>, each frame is captioned by the FastVLM ONNX
 * model, and a final summary is generated from the last frame plus all
 * per-frame captions.
 */
class VideoCaptionApp {
  /** Looks up the page elements, wires the UI events and probes for WebGPU. */
  constructor() {
    this.videoFile = null;
    // Object URL currently bound to the preview player (revoked on re-upload).
    this.previewUrl = null;
    this.model = null;
    this.processor = null;
    this.isProcessing = false;
    this.initializeElements();
    this.attachEventListeners();
    this.checkWebGPUSupport();
  }

  /** Caches every DOM node the app touches, keyed by its element id. */
  initializeElements() {
    const ids = [
      'videoPlayer',
      'videoInput',
      'uploadArea',
      'processBtn',
      'frameCount',
      'deviceSelect',
      'results',
      'frameCaptions',
      'summaryText',
      'progressOverlay',
      'progressCircle',
      'progressText',
      'progressStatus',
      'controls',
      'copyBtn',
      'finalCaption',
    ];
    this.elements = Object.fromEntries(
      ids.map((id) => [id, document.getElementById(id)]),
    );
  }

  /** Registers click / drag-and-drop / change handlers on the cached elements. */
  attachEventListeners() {
    const { uploadArea, videoInput, processBtn, copyBtn } = this.elements;

    uploadArea.addEventListener('click', () => {
      if (!this.isProcessing) {
        videoInput.click();
      }
    });

    uploadArea.addEventListener('dragover', (e) => {
      // preventDefault is required for the later `drop` event to fire.
      e.preventDefault();
      if (!this.isProcessing) {
        uploadArea.classList.add('drag-over');
      }
    });

    uploadArea.addEventListener('dragleave', () => {
      uploadArea.classList.remove('drag-over');
    });

    uploadArea.addEventListener('drop', (e) => {
      e.preventDefault();
      uploadArea.classList.remove('drag-over');
      if (this.isProcessing || e.dataTransfer.files.length === 0) {
        return;
      }
      const [file] = e.dataTransfer.files;
      if (file.type.startsWith('video/')) {
        this.handleVideoUpload(file);
      }
    });

    videoInput.addEventListener('change', (e) => {
      if (e.target.files.length > 0) {
        this.handleVideoUpload(e.target.files[0]);
      }
    });

    processBtn.addEventListener('click', () => {
      if (!this.isProcessing && this.videoFile) {
        this.processVideo();
      }
    });

    copyBtn.addEventListener('click', () => {
      this.copyResults();
    });
  }

  /**
   * Falls back to the "cpu" device option when WebGPU is not usable.
   *
   * `navigator.gpu` can exist while no adapter is available, so an adapter is
   * actually requested. Never rejects: the constructor does not await it.
   *
   * @returns {Promise<void>}
   */
  async checkWebGPUSupport() {
    let hasWebGPU = false;
    try {
      hasWebGPU = Boolean(navigator.gpu && (await navigator.gpu.requestAdapter()));
    } catch (error) {
      console.warn('WebGPU adapter probe failed:', error);
    }
    if (hasWebGPU) {
      return;
    }
    const { deviceSelect } = this.elements;
    const webgpuOption = deviceSelect.querySelector('option[value="webgpu"]');
    if (webgpuOption) {
      webgpuOption.disabled = true;
    }
    deviceSelect.value = 'cpu';
  }

  /**
   * Stores the chosen file, shows it in the preview player and reveals the
   * processing controls.
   *
   * @param {File} file - Video file picked or dropped by the user.
   */
  handleVideoUpload(file) {
    // Release the previous preview URL so repeated uploads do not leak blobs.
    if (this.previewUrl) {
      URL.revokeObjectURL(this.previewUrl);
    }
    this.videoFile = file;
    this.previewUrl = URL.createObjectURL(file);
    this.elements.videoPlayer.src = this.previewUrl;
    this.elements.uploadArea.style.display = 'none';
    this.elements.controls.style.display = 'block';
    this.elements.results.style.display = 'none';
  }

  /**
   * Updates the circular progress indicator and its labels.
   *
   * @param {number} percent - Progress in percent; clamped to [0, 100].
   * @param {string} status - Status line shown next to the indicator.
   */
  updateProgress(percent, status) {
    const clamped = Math.min(100, Math.max(0, percent));
    // 45 is the radius used for the dash math; presumably it matches the
    // SVG circle's `r` attribute in the page markup.
    const circumference = 2 * Math.PI * 45;
    const offset = circumference - (clamped / 100) * circumference;
    const { progressCircle, progressText, progressStatus } = this.elements;
    progressCircle.style.strokeDasharray = `${circumference} ${circumference}`;
    progressCircle.style.strokeDashoffset = String(offset);
    progressText.textContent = `${Math.round(clamped)}%`;
    progressStatus.textContent = status;
  }

  /**
   * Grabs `numFrames` evenly spaced JPEG frames from a video URL.
   *
   * Frames are taken at `i * duration / numFrames` seconds and scaled down
   * (keeping the aspect ratio) to fit inside 1280x720. Progress is reported
   * in the 0-20 % band.
   *
   * @param {string} videoUrl - URL (typically a blob: URL) of the video.
   * @param {number} [numFrames=8] - Positive integer number of frames.
   * @returns {Promise<Array<{blob: Blob, timestamp: number}>>} Frames in order.
   * @throws {RangeError} If `numFrames` is not a positive integer.
   * @throws {Error} If the video cannot be loaded, sought or encoded.
   */
  async extractFramesFromVideo(videoUrl, numFrames = 8) {
    if (!Number.isInteger(numFrames) || numFrames < 1) {
      throw new RangeError(`numFrames must be a positive integer, got ${numFrames}`);
    }

    const video = document.createElement('video');
    video.crossOrigin = 'anonymous';
    video.muted = true;

    // Resolves on the next `successEvent`, rejects if the element errors first.
    const waitFor = (successEvent, failureMessage, trigger) =>
      new Promise((resolve, reject) => {
        const onSuccess = () => {
          video.removeEventListener('error', onError);
          resolve();
        };
        const onError = () => {
          video.removeEventListener(successEvent, onSuccess);
          reject(new Error(failureMessage, { cause: video.error }));
        };
        // Listeners go on before the trigger so the event cannot be missed.
        video.addEventListener(successEvent, onSuccess, { once: true });
        video.addEventListener('error', onError, { once: true });
        trigger();
      });

    try {
      await waitFor('loadedmetadata', 'Failed to load video', () => {
        video.src = videoUrl;
        video.load();
      });

      const { duration, videoWidth, videoHeight } = video;
      if (!Number.isFinite(duration)) {
        throw new Error('Video duration is unknown; cannot sample frames');
      }
      if (!videoWidth || !videoHeight) {
        throw new Error('File has no decodable video track');
      }

      // One uniform factor for both axes so portrait / non-16:9 video is not
      // stretched; never upscale.
      const scale = Math.min(1, 1280 / videoWidth, 720 / videoHeight);
      const canvas = document.createElement('canvas');
      canvas.width = Math.max(1, Math.round(videoWidth * scale));
      canvas.height = Math.max(1, Math.round(videoHeight * scale));
      const ctx = canvas.getContext('2d');

      const interval = duration / numFrames;
      const frames = [];
      for (let i = 0; i < numFrames; i++) {
        const timestamp = i * interval;
        await waitFor('seeked', `Failed to seek to ${timestamp}s`, () => {
          video.currentTime = timestamp;
        });
        ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
        const blob = await new Promise((blobResolve) => {
          canvas.toBlob(blobResolve, 'image/jpeg', 0.85);
        });
        if (!blob) {
          throw new Error(`Could not encode frame ${i + 1}`);
        }
        frames.push({ blob, timestamp });
        this.updateProgress(
          (frames.length / numFrames) * 20,
          `Extracting frame ${frames.length}/${numFrames}...`,
        );
      }
      return frames;
    } finally {
      // Detach the media resource so the browser can free the decoder.
      video.removeAttribute('src');
      video.load();
    }
  }

  /**
   * Loads the processor and the model for "onnx-community/FastVLM-0.5B-ONNX",
   * using the `webgpu` device when that option is selected and the library
   * default otherwise. Progress is reported in the 25-60 % band.
   *
   * @returns {Promise<void>}
   * @throws Re-throws any loading error after logging it.
   */
  async initializeModel() {
    const device = this.elements.deviceSelect.value;
    const model_id = "onnx-community/FastVLM-0.5B-ONNX";
    this.updateProgress(25, 'Loading AI model...');
    try {
      this.processor = await AutoProcessor.from_pretrained(model_id);
      this.updateProgress(50, 'Initializing model...');
      const modelOptions = {
        dtype: {
          embed_tokens: "fp16",
          vision_encoder: "q4",
          decoder_model_merged: "q4",
        },
        ...(device === 'webgpu' ? { device: 'webgpu' } : {}),
      };
      this.model = await AutoModelForImageTextToText.from_pretrained(model_id, modelOptions);
      this.updateProgress(60, 'Model ready');
    } catch (error) {
      console.error('Model initialization error:', error);
      throw error;
    }
  }

  /**
   * Runs one image + text prompt through the model and returns the reply.
   *
   * @param {Blob} blob - Image to attach to the prompt.
   * @param {string} content - User message; must contain the `<image>` marker.
   * @param {number} maxNewTokens - Generation length limit.
   * @param {(textSoFar: string) => void} [onText] - Called with the text
   *   accumulated so far whenever the streamer emits a chunk.
   * @returns {Promise<string>} Trimmed generated text.
   */
  async generateFromImage(blob, content, maxNewTokens, onText) {
    const imageUrl = URL.createObjectURL(blob);
    try {
      const image = await load_image(imageUrl);
      const prompt = this.processor.apply_chat_template(
        [{ role: "user", content }],
        { add_generation_prompt: true },
      );
      const inputs = await this.processor(image, prompt, {
        add_special_tokens: false,
      });
      let generated = '';
      const streamer = new TextStreamer(this.processor.tokenizer, {
        skip_prompt: true,
        // Drop control tokens so they neither show up in the UI nor get
        // re-embedded into the summary prompt.
        skip_special_tokens: true,
        callback_function: (chunk) => {
          generated += chunk;
          if (onText) {
            onText(generated);
          }
        },
      });
      await this.model.generate({
        ...inputs,
        max_new_tokens: maxNewTokens,
        do_sample: false,
        streamer,
      });
      return generated.trim();
    } finally {
      // Revoke even when generation throws.
      URL.revokeObjectURL(imageUrl);
    }
  }

  /**
   * Full pipeline: extract frames, load the model on first use, caption each
   * frame, then produce the overall summary. All failures are reported to the
   * user through an alert; the returned promise does not reject.
   *
   * @returns {Promise<void>}
   */
  async processVideo() {
    if (this.isProcessing || !this.videoFile) {
      return;
    }
    this.isProcessing = true;
    const { processBtn, progressOverlay, results, frameCaptions, summaryText, frameCount } =
      this.elements;
    processBtn.classList.add('loading');
    progressOverlay.classList.add('active');
    results.style.display = 'none';
    frameCaptions.textContent = '';
    summaryText.textContent = '';

    try {
      // An unparsable or non-positive field value falls back to the default
      // of 8 frames instead of stalling extraction.
      const requested = Number.parseInt(frameCount.value, 10);
      const numFrames = Number.isInteger(requested) && requested > 0 ? requested : 8;

      this.updateProgress(0, 'Starting...');
      const videoURL = URL.createObjectURL(this.videoFile);
      let frames;
      try {
        frames = await this.extractFramesFromVideo(videoURL, numFrames);
      } finally {
        URL.revokeObjectURL(videoURL);
      }

      if (!this.model) {
        await this.initializeModel();
      }

      const allCaptions = [];
      const totalSteps = frames.length;
      for (const [index, frame] of frames.entries()) {
        const frameNumber = index + 1;
        this.updateProgress(
          60 + (index / totalSteps) * 30,
          `Analyzing frame ${frameNumber}/${totalSteps}...`,
        );
        const caption = await this.generateFromImage(
          frame.blob,
          `<image>This is frame ${frameNumber} of ${numFrames} from a video at ${frame.timestamp.toFixed(1)}s. Describe what's happening in this frame, focusing on actions, objects, and any notable changes.`,
          256,
        );
        const entry = { frame: frameNumber, timestamp: frame.timestamp, caption };
        allCaptions.push(entry);
        this.displayFrameCaption(entry);
      }

      this.updateProgress(95, 'Generating video summary...');
      await this.generateVideoSummary(frames[frames.length - 1], allCaptions);
      this.updateProgress(100, 'Complete!');
      // Leave "Complete!" visible briefly before revealing the results.
      setTimeout(() => {
        progressOverlay.classList.remove('active');
        results.style.display = 'block';
      }, 500);
    } catch (error) {
      console.error('Processing error:', error);
      // Hide the overlay first, otherwise the page stays blocked after the alert.
      progressOverlay.classList.remove('active');
      alert('An error occurred while processing the video. Please try again.');
    } finally {
      this.isProcessing = false;
      processBtn.classList.remove('loading');
    }
  }

  /**
   * Appends one caption card to the frame-caption list.
   *
   * Built with DOM nodes and `textContent`, so model output is never parsed
   * as HTML.
   *
   * @param {{frame: number, timestamp: number, caption: string}} captionData
   */
  displayFrameCaption(captionData) {
    const makeNode = (tag, className, text) => {
      const node = document.createElement(tag);
      node.className = className;
      if (text !== undefined) {
        node.textContent = text;
      }
      return node;
    };

    const header = makeNode('div', 'frame-header');
    header.appendChild(makeNode('span', 'frame-number', `Frame ${captionData.frame}`));
    header.appendChild(makeNode('span', 'frame-time', this.formatTime(captionData.timestamp)));

    const captionElement = makeNode('div', 'frame-caption-item');
    captionElement.appendChild(header);
    captionElement.appendChild(makeNode('p', 'frame-text', captionData.caption));
    this.elements.frameCaptions.appendChild(captionElement);
  }

  /**
   * Generates the whole-video caption from the last frame and the per-frame
   * captions, streaming the text into the summary element as it arrives.
   *
   * @param {{blob: Blob}} lastFrame - Frame attached to the summary prompt.
   * @param {Array<{caption: string}>} allCaptions - Per-frame captions.
   * @returns {Promise<void>}
   */
  async generateVideoSummary(lastFrame, allCaptions) {
    const { summaryText } = this.elements;
    const joinedCaptions = allCaptions.map((fc) => fc.caption).join('; ');
    const summary = await this.generateFromImage(
      lastFrame.blob,
      `<image>Based on this frame and knowing that the video shows: ${joinedCaptions}, provide a comprehensive caption for the entire video.`,
      512,
      (textSoFar) => {
        summaryText.textContent = textSoFar;
      },
    );
    summaryText.textContent = summary;
  }

  /**
   * Formats a duration as `m:ss`.
   *
   * @param {number} seconds - Non-negative duration in seconds.
   * @returns {string} e.g. `75.9` -> `"1:15"`.
   */
  formatTime(seconds) {
    const mins = Math.floor(seconds / 60);
    const secs = Math.floor(seconds % 60);
    return `${mins}:${String(secs).padStart(2, '0')}`;
  }

  /**
   * Copies all frame captions plus the summary to the clipboard and flashes
   * the "copied" state on the button for two seconds. Clipboard failures are
   * logged, not thrown.
   *
   * @returns {Promise<void>}
   */
  async copyResults() {
    const { frameCaptions, summaryText, copyBtn } = this.elements;
    const captionBlock = Array.from(frameCaptions.querySelectorAll('.frame-caption-item'))
      .map((el) => el.querySelector('.frame-text').textContent)
      .join('\n\n');
    const fullText = `Frame Captions:\n${captionBlock}\n\nVideo Summary:\n${summaryText.textContent}`;
    try {
      await navigator.clipboard.writeText(fullText);
      copyBtn.classList.add('copied');
      setTimeout(() => {
        copyBtn.classList.remove('copied');
      }, 2000);
    } catch (err) {
      console.error('Failed to copy:', err);
    }
  }
}
// Start the app once the document has been parsed, so every element id that
// the constructor looks up already exists.
const startVideoCaptionApp = () => {
  new VideoCaptionApp();
};
document.addEventListener('DOMContentLoaded', startVideoCaptionApp);