Spaces:
Running
Running
| import { | |
| AutoProcessor, | |
| AutoModelForImageTextToText, | |
| load_image, | |
| TextStreamer, | |
| } from "https://cdn.jsdelivr.net/npm/@huggingface/[email protected]"; | |
/**
 * Controller for the video-captioning page.
 *
 * Wires the upload / process / copy controls to a pipeline that:
 *   1. samples evenly spaced frames from the uploaded video,
 *   2. lazily loads the FastVLM ONNX model through Transformers.js,
 *   3. captions each sampled frame, and
 *   4. asks the model for an overall summary based on the last frame plus
 *      all per-frame captions.
 *
 * The class expects the page to contain the element ids looked up in
 * {@link VideoCaptionApp#initializeElements}.
 */
class VideoCaptionApp {
  /**
   * Scale a frame size down (never up) so that it fits inside a bounding box
   * while keeping its aspect ratio.
   *
   * @param {number} width - Source width in pixels (> 0).
   * @param {number} height - Source height in pixels (> 0).
   * @param {number} [maxWidth=1280] - Maximum output width.
   * @param {number} [maxHeight=720] - Maximum output height.
   * @returns {{width: number, height: number}} Integer output size, at least 1x1.
   */
  static fitFrameSize(width, height, maxWidth = 1280, maxHeight = 720) {
    const scale = Math.min(1, maxWidth / width, maxHeight / height);
    return {
      width: Math.max(1, Math.round(width * scale)),
      height: Math.max(1, Math.round(height * scale)),
    };
  }

  /** Look up the DOM, attach the UI handlers and probe for WebGPU. */
  constructor() {
    this.videoFile = null;
    // Object URL currently assigned to the preview player (revoked on replace).
    this.videoURL = null;
    this.model = null;
    this.processor = null;
    this.isProcessing = false;
    this.initializeElements();
    this.attachEventListeners();
    this.checkWebGPUSupport();
  }

  /** Cache references to every DOM element the app reads or writes. */
  initializeElements() {
    this.elements = {
      videoPlayer: document.getElementById('videoPlayer'),
      videoInput: document.getElementById('videoInput'),
      uploadArea: document.getElementById('uploadArea'),
      processBtn: document.getElementById('processBtn'),
      frameCount: document.getElementById('frameCount'),
      deviceSelect: document.getElementById('deviceSelect'),
      results: document.getElementById('results'),
      frameCaptions: document.getElementById('frameCaptions'),
      summaryText: document.getElementById('summaryText'),
      progressOverlay: document.getElementById('progressOverlay'),
      progressCircle: document.getElementById('progressCircle'),
      progressText: document.getElementById('progressText'),
      progressStatus: document.getElementById('progressStatus'),
      controls: document.getElementById('controls'),
      copyBtn: document.getElementById('copyBtn'),
      finalCaption: document.getElementById('finalCaption'),
    };
  }

  /**
   * Register click / drag-and-drop / change handlers.
   * Upload interactions are ignored while a video is being processed.
   */
  attachEventListeners() {
    const { uploadArea, videoInput, processBtn, copyBtn } = this.elements;

    uploadArea.addEventListener('click', () => {
      if (!this.isProcessing) {
        videoInput.click();
      }
    });

    uploadArea.addEventListener('dragover', (e) => {
      // preventDefault is required for the element to accept a drop.
      e.preventDefault();
      if (!this.isProcessing) {
        uploadArea.classList.add('drag-over');
      }
    });

    uploadArea.addEventListener('dragleave', () => {
      uploadArea.classList.remove('drag-over');
    });

    uploadArea.addEventListener('drop', (e) => {
      e.preventDefault();
      uploadArea.classList.remove('drag-over');
      if (!this.isProcessing && e.dataTransfer.files.length > 0) {
        const file = e.dataTransfer.files[0];
        if (file.type.startsWith('video/')) {
          this.handleVideoUpload(file);
        }
      }
    });

    videoInput.addEventListener('change', (e) => {
      if (e.target.files.length > 0) {
        this.handleVideoUpload(e.target.files[0]);
      }
    });

    processBtn.addEventListener('click', () => {
      if (!this.isProcessing && this.videoFile) {
        this.processVideo();
      }
    });

    copyBtn.addEventListener('click', () => {
      this.copyResults();
    });
  }

  /**
   * Disable the WebGPU device option and fall back to CPU when the browser
   * does not expose `navigator.gpu`.
   *
   * @returns {Promise<void>}
   */
  async checkWebGPUSupport() {
    if (navigator.gpu) {
      return;
    }
    const { deviceSelect } = this.elements;
    const webgpuOption = deviceSelect.querySelector('option[value="webgpu"]');
    // The option may be absent from the markup; only the fallback matters then.
    if (webgpuOption) {
      webgpuOption.disabled = true;
    }
    deviceSelect.value = 'cpu';
  }

  /**
   * Accept a video file: show it in the preview player and reveal the controls.
   *
   * @param {File|Blob} file - The selected or dropped video.
   */
  handleVideoUpload(file) {
    // Release the URL of a previously previewed file so its data can be freed.
    if (this.videoURL) {
      URL.revokeObjectURL(this.videoURL);
    }
    this.videoFile = file;
    this.videoURL = URL.createObjectURL(file);
    this.elements.videoPlayer.src = this.videoURL;
    this.elements.uploadArea.style.display = 'none';
    this.elements.controls.style.display = 'block';
    this.elements.results.style.display = 'none';
  }

  /**
   * Update the circular progress indicator and its status line.
   *
   * @param {number} percent - Completion in the range 0-100.
   * @param {string} status - Human-readable status text.
   */
  updateProgress(percent, status) {
    // 45 is the radius used for the circumference; it has to match the
    // radius of the progress circle in the page markup.
    const circumference = 2 * Math.PI * 45;
    const offset = circumference - (percent / 100) * circumference;
    this.elements.progressCircle.style.strokeDasharray = `${circumference} ${circumference}`;
    this.elements.progressCircle.style.strokeDashoffset = offset;
    this.elements.progressText.textContent = `${Math.round(percent)}%`;
    this.elements.progressStatus.textContent = status;
  }

  /**
   * Seek a video element and wait until the seek has completed.
   *
   * @param {HTMLVideoElement} video - Element with metadata already loaded.
   * @param {number} time - Target position in seconds.
   * @returns {Promise<void>} Resolves on `seeked`, rejects on `error`.
   */
  seekTo(video, time) {
    return new Promise((resolve, reject) => {
      const onSeeked = () => {
        video.removeEventListener('error', onError);
        resolve();
      };
      const onError = () => {
        video.removeEventListener('seeked', onSeeked);
        reject(new Error(`Failed to seek video to ${time}s`));
      };
      // Listeners are attached before the seek is started so the event
      // cannot be missed.
      video.addEventListener('seeked', onSeeked, { once: true });
      video.addEventListener('error', onError, { once: true });
      video.currentTime = time;
    });
  }

  /**
   * Draw the current video frame onto a canvas and encode it as JPEG.
   *
   * @param {HTMLVideoElement} video - Source element positioned on the frame.
   * @param {HTMLCanvasElement} canvas - Pre-sized scratch canvas.
   * @returns {Promise<Blob>} The encoded frame.
   */
  captureFrame(video, canvas) {
    const ctx = canvas.getContext('2d');
    ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
    return new Promise((resolve, reject) => {
      canvas.toBlob((blob) => {
        // toBlob reports failure by passing null rather than throwing.
        if (blob) {
          resolve(blob);
        } else {
          reject(new Error('Failed to encode video frame as JPEG'));
        }
      }, 'image/jpeg', 0.85);
    });
  }

  /**
   * Sample `numFrames` evenly spaced frames from a video.
   *
   * Frame `i` is taken at `i * duration / numFrames`, so the first frame is at
   * 0s and the last one is one interval before the end. Progress for this
   * phase is reported in the 0-20% range.
   *
   * @param {string} videoUrl - URL (typically an object URL) of the video.
   * @param {number} [numFrames=8] - Number of frames to capture (integer >= 1).
   * @returns {Promise<Array<{blob: Blob, timestamp: number}>>} Captured frames.
   * @throws {RangeError} If `numFrames` is not a positive integer.
   * @throws {Error} If the video cannot be loaded, has no usable duration or
   *   picture, or a frame cannot be seeked to or encoded.
   */
  async extractFramesFromVideo(videoUrl, numFrames = 8) {
    // Without this check a NaN or zero count would leave the caller waiting
    // forever because no frame would ever be produced.
    if (!Number.isInteger(numFrames) || numFrames < 1) {
      throw new RangeError(`numFrames must be a positive integer, got ${numFrames}`);
    }

    const video = document.createElement('video');
    video.crossOrigin = 'anonymous';
    video.muted = true;

    await new Promise((resolve, reject) => {
      video.addEventListener('loadedmetadata', resolve, { once: true });
      video.addEventListener(
        'error',
        () => reject(new Error('Failed to load video for frame extraction')),
        { once: true },
      );
      video.src = videoUrl;
      video.load();
    });

    const duration = video.duration;
    if (!Number.isFinite(duration) || duration <= 0) {
      throw new Error('Video duration is unavailable; cannot sample frames');
    }
    if (!video.videoWidth || !video.videoHeight) {
      throw new Error('Video has no picture to sample frames from');
    }

    // One scratch canvas is reused for every frame; each blob is fully
    // encoded before the next frame is drawn.
    const canvas = document.createElement('canvas');
    const size = VideoCaptionApp.fitFrameSize(video.videoWidth, video.videoHeight);
    canvas.width = size.width;
    canvas.height = size.height;

    const interval = duration / numFrames;
    const frames = [];
    for (let i = 0; i < numFrames; i++) {
      const timestamp = i * interval;
      await this.seekTo(video, timestamp);
      const blob = await this.captureFrame(video, canvas);
      frames.push({ blob, timestamp });
      this.updateProgress(
        (frames.length / numFrames) * 20,
        `Extracting frame ${frames.length}/${numFrames}...`,
      );
    }
    return frames;
  }

  /**
   * Load the processor and model for the device chosen in the UI.
   * Progress for this phase is reported in the 25-60% range.
   *
   * @returns {Promise<void>}
   * @throws Re-throws whatever the Transformers.js loaders throw, after logging.
   */
  async initializeModel() {
    const device = this.elements.deviceSelect.value;
    const model_id = "onnx-community/FastVLM-0.5B-ONNX";
    this.updateProgress(25, 'Loading AI model...');
    try {
      this.processor = await AutoProcessor.from_pretrained(model_id);
      this.updateProgress(50, 'Initializing model...');
      const modelOptions = {
        dtype: {
          embed_tokens: "fp16",
          vision_encoder: "q4",
          decoder_model_merged: "q4",
        },
      };
      // Only an explicit WebGPU choice sets a device; otherwise the library
      // default is used.
      if (device === 'webgpu') {
        modelOptions.device = 'webgpu';
      }
      this.model = await AutoModelForImageTextToText.from_pretrained(model_id, modelOptions);
      this.updateProgress(60, 'Model ready');
    } catch (error) {
      console.error('Model initialization error:', error);
      throw error;
    }
  }

  /**
   * Run one single-turn, single-image generation and return the streamed text.
   *
   * @param {Blob} blob - Encoded image passed to the model.
   * @param {string} content - User message (including the `<image>` marker).
   * @param {number} maxNewTokens - Generation length limit.
   * @param {(textSoFar: string) => void} [onUpdate] - Called with the
   *   accumulated text each time the streamer delivers a chunk.
   * @returns {Promise<string>} The full streamed text (untrimmed).
   */
  async generateFromImage(blob, content, maxNewTokens, onUpdate) {
    const imageUrl = URL.createObjectURL(blob);
    try {
      const image = await load_image(imageUrl);
      const prompt = this.processor.apply_chat_template(
        [{ role: "user", content }],
        { add_generation_prompt: true },
      );
      const inputs = await this.processor(image, prompt, {
        add_special_tokens: false,
      });

      let text = '';
      // NOTE(review): special tokens are intentionally left in the streamed
      // text to preserve existing output; confirm whether end-of-turn markers
      // should be stripped with skip_special_tokens: true.
      const streamer = new TextStreamer(this.processor.tokenizer, {
        skip_prompt: true,
        skip_special_tokens: false,
        callback_function: (chunk) => {
          text += chunk;
          if (onUpdate) {
            onUpdate(text);
          }
        },
      });

      await this.model.generate({
        ...inputs,
        max_new_tokens: maxNewTokens,
        do_sample: false,
        streamer,
      });
      return text;
    } finally {
      // Always release the temporary URL, even when loading or generation fails.
      URL.revokeObjectURL(imageUrl);
    }
  }

  /**
   * Run the whole pipeline for the currently selected video: extract frames,
   * load the model if needed, caption every frame and generate the summary.
   *
   * Failures are logged and reported with an alert; they never propagate to
   * the caller. The call is a no-op while another run is in progress or when
   * no video has been selected.
   *
   * @returns {Promise<void>}
   */
  async processVideo() {
    if (this.isProcessing || !this.videoFile) {
      return;
    }
    this.isProcessing = true;
    this.elements.processBtn.classList.add('loading');
    this.elements.progressOverlay.classList.add('active');
    this.elements.results.style.display = 'none';
    this.elements.frameCaptions.innerHTML = '';

    let videoURL = null;
    try {
      videoURL = URL.createObjectURL(this.videoFile);
      const numFrames = Number.parseInt(this.elements.frameCount.value, 10);
      this.updateProgress(0, 'Starting...');
      const frames = await this.extractFramesFromVideo(videoURL, numFrames);

      if (!this.model) {
        await this.initializeModel();
      }

      const allCaptions = [];
      const totalSteps = frames.length;
      for (let i = 0; i < totalSteps; i++) {
        // Captioning occupies the 60-90% band of the progress indicator.
        const progress = 60 + (i / totalSteps) * 30;
        this.updateProgress(progress, `Analyzing frame ${i + 1}/${totalSteps}...`);

        const { blob, timestamp } = frames[i];
        const captionText = await this.generateFromImage(
          blob,
          `<image>This is frame ${i + 1} of ${numFrames} from a video at ${timestamp.toFixed(1)}s. Describe what's happening in this frame, focusing on actions, objects, and any notable changes.`,
          256,
        );

        const captionData = {
          frame: i + 1,
          timestamp,
          caption: captionText.trim(),
        };
        allCaptions.push(captionData);
        this.displayFrameCaption(captionData);
      }

      this.updateProgress(95, 'Generating video summary...');
      await this.generateVideoSummary(frames[frames.length - 1], allCaptions);
      this.updateProgress(100, 'Complete!');

      // Leave "Complete!" visible briefly before revealing the results.
      setTimeout(() => {
        this.elements.progressOverlay.classList.remove('active');
        this.elements.results.style.display = 'block';
      }, 500);
    } catch (error) {
      console.error('Processing error:', error);
      // Hide the overlay so the page is usable again after a failure.
      this.elements.progressOverlay.classList.remove('active');
      alert('An error occurred while processing the video. Please try again.');
    } finally {
      if (videoURL) {
        URL.revokeObjectURL(videoURL);
      }
      this.isProcessing = false;
      this.elements.processBtn.classList.remove('loading');
    }
  }

  /**
   * Append one frame caption card to the results list.
   *
   * The caption is model output and is therefore inserted as text, never as
   * markup.
   *
   * @param {{frame: number, timestamp: number, caption: string}} captionData
   */
  displayFrameCaption(captionData) {
    const item = document.createElement('div');
    item.className = 'frame-caption-item';

    const header = document.createElement('div');
    header.className = 'frame-header';

    const frameNumber = document.createElement('span');
    frameNumber.className = 'frame-number';
    frameNumber.textContent = `Frame ${captionData.frame}`;

    const frameTime = document.createElement('span');
    frameTime.className = 'frame-time';
    frameTime.textContent = this.formatTime(captionData.timestamp);

    header.appendChild(frameNumber);
    // Keeps the inter-element whitespace the previous template markup had, so
    // inline layouts still render a gap between the two labels.
    header.appendChild(document.createTextNode(' '));
    header.appendChild(frameTime);

    const frameText = document.createElement('p');
    frameText.className = 'frame-text';
    frameText.textContent = captionData.caption;

    item.appendChild(header);
    item.appendChild(frameText);
    this.elements.frameCaptions.appendChild(item);
  }

  /**
   * Generate the overall video caption from the last sampled frame and the
   * per-frame captions, streaming the text into the summary element.
   *
   * @param {{blob: Blob, timestamp: number}} lastFrame - Last sampled frame.
   * @param {Array<{caption: string}>} allCaptions - Per-frame captions in order.
   * @returns {Promise<void>}
   */
  async generateVideoSummary(lastFrame, allCaptions) {
    const captionList = allCaptions.map((fc) => fc.caption).join('; ');
    await this.generateFromImage(
      lastFrame.blob,
      `<image>Based on this frame and knowing that the video shows: ${captionList}, provide a comprehensive caption for the entire video.`,
      512,
      (textSoFar) => {
        this.elements.summaryText.textContent = textSoFar;
      },
    );
  }

  /**
   * Format a duration as `m:ss`.
   *
   * @param {number} seconds - Non-negative duration in seconds.
   * @returns {string} Minutes and zero-padded whole seconds.
   */
  formatTime(seconds) {
    const mins = Math.floor(seconds / 60);
    const secs = Math.floor(seconds % 60);
    return `${mins}:${secs.toString().padStart(2, '0')}`;
  }

  /**
   * Copy all frame captions and the summary to the clipboard and flash the
   * "copied" state on the copy button for two seconds. Clipboard failures are
   * logged and otherwise ignored.
   *
   * @returns {Promise<void>}
   */
  async copyResults() {
    const captionItems = this.elements.frameCaptions.querySelectorAll('.frame-caption-item');
    const frameCaptions = Array.from(captionItems)
      .map((el) => el.querySelector('.frame-text').textContent)
      .join('\n\n');
    const summary = this.elements.summaryText.textContent;
    const fullText = `Frame Captions:\n${frameCaptions}\n\nVideo Summary:\n${summary}`;
    try {
      await navigator.clipboard.writeText(fullText);
      this.elements.copyBtn.classList.add('copied');
      setTimeout(() => {
        this.elements.copyBtn.classList.remove('copied');
      }, 2000);
    } catch (err) {
      console.error('Failed to copy:', err);
    }
  }
}
// Start the app as soon as the DOM is available. When this module is
// evaluated after parsing has finished (for example via a dynamic import),
// DOMContentLoaded has already fired, so the app is started immediately
// instead of waiting for an event that will never come.
const startVideoCaptionApp = () => {
  new VideoCaptionApp();
};
if (document.readyState === 'loading') {
  document.addEventListener('DOMContentLoaded', startVideoCaptionApp, { once: true });
} else {
  startVideoCaptionApp();
}