import {
  AutoProcessor,
  AutoModelForImageTextToText,
  load_image,
  TextStreamer,
} from "https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.7.2";

/** Hugging Face model repository used for both the processor and the model. */
const MODEL_ID = "onnx-community/FastVLM-0.5B-ONNX";

/**
 * Placeholder that marks where the image goes in the user message.
 * NOTE(review): taken from the FastVLM model card usage example — confirm if
 * the model or processor is ever swapped.
 */
const IMAGE_PLACEHOLDER = '<image>';

/** Upper bounds for extracted frames; larger videos are scaled down to fit. */
const MAX_FRAME_WIDTH = 1280;
const MAX_FRAME_HEIGHT = 720;

/**
 * Radius used to compute the progress ring circumference.
 * Presumably matches the `r` attribute of the SVG circle — verify in the markup.
 */
const PROGRESS_RING_RADIUS = 45;

/**
 * Computes the largest size that fits inside the given bounds while keeping
 * the aspect ratio. Never upscales.
 *
 * @param {number} width - Source width in pixels.
 * @param {number} height - Source height in pixels.
 * @param {number} maxWidth - Maximum allowed width.
 * @param {number} maxHeight - Maximum allowed height.
 * @returns {{width: number, height: number} | null} Target size, or null when
 *   the source has no drawable area (e.g. an audio-only file).
 */
function fitWithin(width, height, maxWidth, maxHeight) {
  if (!(width > 0) || !(height > 0)) {
    return null;
  }
  const scale = Math.min(1, maxWidth / width, maxHeight / height);
  return {
    width: Math.max(1, Math.round(width * scale)),
    height: Math.max(1, Math.round(height * scale)),
  };
}

/**
 * Builds an Error describing why a media element failed.
 *
 * @param {HTMLVideoElement} video - Element whose `error` property is inspected.
 * @returns {Error}
 */
function mediaError(video) {
  const detail = video.error && video.error.message ? `: ${video.error.message}` : '';
  return new Error(`Unable to load or decode the video${detail}`);
}

/**
 * Points the video element at `videoUrl` and waits until its metadata is known.
 *
 * @param {HTMLVideoElement} video
 * @param {string} videoUrl
 * @returns {Promise<void>} Rejects if the element reports an error first.
 */
function loadVideoMetadata(video, videoUrl) {
  return new Promise((resolve, reject) => {
    const cleanup = () => {
      video.removeEventListener('loadedmetadata', onLoaded);
      video.removeEventListener('error', onError);
    };
    const onLoaded = () => {
      cleanup();
      resolve();
    };
    const onError = () => {
      cleanup();
      reject(mediaError(video));
    };
    video.addEventListener('loadedmetadata', onLoaded);
    video.addEventListener('error', onError);
    video.src = videoUrl;
    video.load();
  });
}

/**
 * Seeks the video and resolves once the `seeked` event has fired.
 *
 * @param {HTMLVideoElement} video
 * @param {number} time - Target position in seconds.
 * @returns {Promise<void>} Rejects if the element reports an error while seeking.
 */
function seekVideo(video, time) {
  return new Promise((resolve, reject) => {
    const cleanup = () => {
      video.removeEventListener('seeked', onSeeked);
      video.removeEventListener('error', onError);
    };
    const onSeeked = () => {
      cleanup();
      resolve();
    };
    const onError = () => {
      cleanup();
      reject(mediaError(video));
    };
    // Listeners are attached before the seek starts so the event cannot be missed.
    video.addEventListener('seeked', onSeeked);
    video.addEventListener('error', onError);
    video.currentTime = time;
  });
}

/**
 * Encodes the current canvas content as a JPEG blob.
 *
 * @param {HTMLCanvasElement} canvas
 * @returns {Promise<Blob>} Rejects when the browser cannot encode the canvas.
 */
function canvasToJpegBlob(canvas) {
  return new Promise((resolve, reject) => {
    canvas.toBlob((blob) => {
      if (blob) {
        resolve(blob);
      } else {
        reject(new Error('Failed to encode the video frame as an image.'));
      }
    }, 'image/jpeg', 0.85);
  });
}

/**
 * Browser UI controller: lets the user pick a video, samples frames from it,
 * captions each frame with a vision-language model and then asks the model
 * for an overall summary.
 */
class VideoCaptionApp {
  constructor() {
    this.videoFile = null;
    this.videoUrl = null;
    this.model = null;
    this.processor = null;
    this.isProcessing = false;
    this.initializeElements();
    this.attachEventListeners();
    // Fire-and-forget: checkWebGPUSupport handles its own failures.
    void this.checkWebGPUSupport();
  }

  /** Looks up every DOM element the app uses and caches it on `this.elements`. */
  initializeElements() {
    this.elements = {
      videoPlayer: document.getElementById('videoPlayer'),
      videoInput: document.getElementById('videoInput'),
      uploadArea: document.getElementById('uploadArea'),
      processBtn: document.getElementById('processBtn'),
      frameCount: document.getElementById('frameCount'),
      deviceSelect: document.getElementById('deviceSelect'),
      results: document.getElementById('results'),
      frameCaptions: document.getElementById('frameCaptions'),
      summaryText: document.getElementById('summaryText'),
      progressOverlay: document.getElementById('progressOverlay'),
      progressCircle: document.getElementById('progressCircle'),
      progressText: document.getElementById('progressText'),
      progressStatus: document.getElementById('progressStatus'),
      controls: document.getElementById('controls'),
      copyBtn: document.getElementById('copyBtn'),
      finalCaption: document.getElementById('finalCaption'),
    };
  }

  /** Wires upload (click / drag-and-drop / file input), process and copy handlers. */
  attachEventListeners() {
    const { uploadArea, videoInput, processBtn, copyBtn } = this.elements;

    uploadArea.addEventListener('click', () => {
      if (!this.isProcessing) {
        videoInput.click();
      }
    });

    uploadArea.addEventListener('dragover', (e) => {
      e.preventDefault();
      if (!this.isProcessing) {
        uploadArea.classList.add('drag-over');
      }
    });

    uploadArea.addEventListener('dragleave', () => {
      uploadArea.classList.remove('drag-over');
    });

    uploadArea.addEventListener('drop', (e) => {
      e.preventDefault();
      uploadArea.classList.remove('drag-over');
      if (this.isProcessing || e.dataTransfer.files.length === 0) {
        return;
      }
      const file = e.dataTransfer.files[0];
      if (file.type.startsWith('video/')) {
        this.handleVideoUpload(file);
      }
    });

    videoInput.addEventListener('change', (e) => {
      if (e.target.files.length > 0) {
        this.handleVideoUpload(e.target.files[0]);
      }
    });

    processBtn.addEventListener('click', () => {
      if (!this.isProcessing && this.videoFile) {
        this.processVideo();
      }
    });

    copyBtn.addEventListener('click', () => {
      this.copyResults();
    });
  }

  /**
   * Disables the WebGPU option and falls back to 'cpu' when no GPU adapter
   * can be obtained. `navigator.gpu` existing is not sufficient:
   * `requestAdapter()` may still resolve to null.
   *
   * @returns {Promise<void>} Never rejects.
   */
  async checkWebGPUSupport() {
    let supported = false;
    if (navigator.gpu) {
      try {
        supported = (await navigator.gpu.requestAdapter()) !== null;
      } catch (err) {
        console.warn('WebGPU adapter request failed:', err);
      }
    }
    if (supported) {
      return;
    }

    const select = this.elements.deviceSelect;
    if (!select) {
      return;
    }
    const webgpuOption = select.querySelector('option[value="webgpu"]');
    if (webgpuOption) {
      webgpuOption.disabled = true;
    }
    select.value = 'cpu';
  }

  /**
   * Stores the chosen file, previews it in the player and switches the UI
   * from the upload area to the controls.
   *
   * @param {File} file - Video file picked or dropped by the user.
   */
  handleVideoUpload(file) {
    // Release the preview URL of any previously selected file.
    if (this.videoUrl) {
      URL.revokeObjectURL(this.videoUrl);
    }
    this.videoFile = file;
    this.videoUrl = URL.createObjectURL(file);
    this.elements.videoPlayer.src = this.videoUrl;
    this.elements.uploadArea.style.display = 'none';
    this.elements.controls.style.display = 'block';
    this.elements.results.style.display = 'none';
  }

  /**
   * Updates the circular progress indicator and its labels.
   *
   * @param {number} percent - Progress in the range 0–100.
   * @param {string} status - Status line shown next to the ring.
   */
  updateProgress(percent, status) {
    const circumference = 2 * Math.PI * PROGRESS_RING_RADIUS;
    const offset = circumference - (percent / 100) * circumference;
    const { progressCircle, progressText, progressStatus } = this.elements;
    progressCircle.style.strokeDasharray = `${circumference} ${circumference}`;
    progressCircle.style.strokeDashoffset = offset;
    progressText.textContent = `${Math.round(percent)}%`;
    progressStatus.textContent = status;
  }

  /**
   * Samples `numFrames` evenly spaced frames (starting at t = 0) from a video
   * and encodes each as a JPEG blob. Frames are scaled down, keeping aspect
   * ratio, to fit within MAX_FRAME_WIDTH × MAX_FRAME_HEIGHT. Reports progress
   * in the 0–20% range.
   *
   * @param {string} videoUrl - URL (typically an object URL) of the video.
   * @param {number} [numFrames=8] - Number of frames to extract; positive integer.
   * @returns {Promise<Array<{blob: Blob, timestamp: number}>>} Frames in
   *   chronological order with their timestamps in seconds.
   * @throws {RangeError} If `numFrames` is not a positive integer.
   * @throws {Error} If the video cannot be loaded, has no finite duration,
   *   has no picture, or a frame cannot be encoded.
   */
  async extractFramesFromVideo(videoUrl, numFrames = 8) {
    if (!Number.isInteger(numFrames) || numFrames < 1) {
      throw new RangeError(`numFrames must be a positive integer, got ${numFrames}`);
    }

    const video = document.createElement('video');
    video.crossOrigin = 'anonymous';
    video.muted = true;

    try {
      await loadVideoMetadata(video, videoUrl);

      const { duration } = video;
      if (!Number.isFinite(duration) || duration <= 0) {
        throw new Error('The video duration is unknown; cannot sample frames.');
      }

      const size = fitWithin(
        video.videoWidth,
        video.videoHeight,
        MAX_FRAME_WIDTH,
        MAX_FRAME_HEIGHT,
      );
      if (size === null) {
        throw new Error('The selected file has no video picture to sample.');
      }

      const canvas = document.createElement('canvas');
      canvas.width = size.width;
      canvas.height = size.height;
      const ctx = canvas.getContext('2d');

      const interval = duration / numFrames;
      const frames = [];
      for (let i = 0; i < numFrames; i++) {
        const timestamp = i * interval;
        await seekVideo(video, timestamp);
        ctx.drawImage(video, 0, 0, canvas.width, canvas.height);
        const blob = await canvasToJpegBlob(canvas);
        frames.push({ blob, timestamp });
        this.updateProgress(
          (frames.length / numFrames) * 20,
          `Extracting frame ${frames.length}/${numFrames}...`,
        );
      }
      return frames;
    } finally {
      // Detach the source so the browser can release the decoder.
      video.removeAttribute('src');
      video.load();
    }
  }

  /**
   * Loads the processor and the model for the device chosen in the UI.
   * `this.processor` and `this.model` are only assigned once both loads have
   * succeeded, so a failed attempt leaves no half-initialized state.
   *
   * @returns {Promise<void>}
   * @throws Re-throws whatever the loaders throw, after logging it.
   */
  async initializeModel() {
    const device = this.elements.deviceSelect.value;
    this.updateProgress(25, 'Loading AI model...');
    try {
      const processor = await AutoProcessor.from_pretrained(MODEL_ID);
      this.updateProgress(50, 'Initializing model...');

      const modelOptions = {
        dtype: {
          embed_tokens: "fp16",
          vision_encoder: "q4",
          decoder_model_merged: "q4",
        },
      };
      // Only 'webgpu' is passed through; any other choice uses the library default.
      if (device === 'webgpu') {
        modelOptions.device = 'webgpu';
      }
      const model = await AutoModelForImageTextToText.from_pretrained(MODEL_ID, modelOptions);

      this.processor = processor;
      this.model = model;
      this.updateProgress(60, 'Model ready');
    } catch (error) {
      console.error('Model initialization error:', error);
      throw error;
    }
  }

  /**
   * Runs the model on one image with one user instruction and returns the
   * generated text.
   *
   * @param {Blob} blob - Encoded image to describe.
   * @param {string} instruction - User instruction (without image placeholder).
   * @param {number} maxNewTokens - Generation length limit.
   * @param {(textSoFar: string) => void} [onText] - Called with the
   *   accumulated text each time the streamer emits a chunk.
   * @returns {Promise<string>} The trimmed generated text.
   */
  async generateFromImage(blob, instruction, maxNewTokens, onText) {
    const frameUrl = URL.createObjectURL(blob);
    try {
      const image = await load_image(frameUrl);
      const prompt = this.processor.apply_chat_template(
        [{ role: "user", content: `${IMAGE_PLACEHOLDER}${instruction}` }],
        { add_generation_prompt: true },
      );
      const inputs = await this.processor(image, prompt, {
        add_special_tokens: false,
      });

      let generated = '';
      const streamer = new TextStreamer(this.processor.tokenizer, {
        skip_prompt: true,
        // The text is shown to the user, so control tokens are left out.
        skip_special_tokens: true,
        callback_function: (text) => {
          generated += text;
          if (onText) {
            onText(generated);
          }
        },
      });

      await this.model.generate({
        ...inputs,
        max_new_tokens: maxNewTokens,
        do_sample: false,
        streamer,
      });
      return generated.trim();
    } finally {
      // Revoke even when loading or generation fails.
      URL.revokeObjectURL(frameUrl);
    }
  }

  /**
   * Full pipeline for the selected video: extract frames, load the model if
   * needed, caption every frame, generate the summary and reveal the results.
   * Errors are logged and reported with an alert; the UI is always returned
   * to an idle state.
   *
   * @returns {Promise<void>}
   */
  async processVideo() {
    const numFrames = Number.parseInt(this.elements.frameCount.value, 10);
    if (!Number.isInteger(numFrames) || numFrames < 1) {
      alert('Please choose a valid number of frames.');
      return;
    }

    this.isProcessing = true;
    this.elements.processBtn.classList.add('loading');
    this.elements.progressOverlay.classList.add('active');
    this.elements.results.style.display = 'none';
    this.elements.frameCaptions.textContent = '';
    this.elements.summaryText.textContent = '';

    let videoURL = null;
    try {
      videoURL = URL.createObjectURL(this.videoFile);
      this.updateProgress(0, 'Starting...');
      const frames = await this.extractFramesFromVideo(videoURL, numFrames);

      if (!this.model) {
        await this.initializeModel();
      }

      const allCaptions = [];
      const totalSteps = frames.length;
      for (let i = 0; i < totalSteps; i++) {
        const frame = frames[i];
        this.updateProgress(
          60 + (i / totalSteps) * 30,
          `Analyzing frame ${i + 1}/${totalSteps}...`,
        );

        const caption = await this.generateFromImage(
          frame.blob,
          `This is frame ${i + 1} of ${numFrames} from a video at ${frame.timestamp.toFixed(1)}s. Describe what's happening in this frame, focusing on actions, objects, and any notable changes.`,
          256,
        );

        const entry = { frame: i + 1, timestamp: frame.timestamp, caption };
        allCaptions.push(entry);
        this.displayFrameCaption(entry);
      }

      this.updateProgress(95, 'Generating video summary...');
      await this.generateVideoSummary(frames[frames.length - 1], allCaptions);

      this.updateProgress(100, 'Complete!');
      // Short delay so the user sees the ring reach 100% before it disappears.
      setTimeout(() => {
        this.elements.progressOverlay.classList.remove('active');
        this.elements.results.style.display = 'block';
      }, 500);
    } catch (error) {
      console.error('Processing error:', error);
      // Hide the overlay first; otherwise it would stay on screen after a failure.
      this.elements.progressOverlay.classList.remove('active');
      alert('An error occurred while processing the video. Please try again.');
    } finally {
      if (videoURL) {
        URL.revokeObjectURL(videoURL);
      }
      this.isProcessing = false;
      this.elements.processBtn.classList.remove('loading');
    }
  }

  /**
   * Appends one caption entry to the captions list. Nodes are built with
   * `textContent` so model output is never parsed as HTML. The caption lives
   * in a `.frame-text` element, which is what `copyResults` reads.
   *
   * @param {{frame: number, timestamp: number, caption: string}} captionData
   */
  displayFrameCaption(captionData) {
    const item = document.createElement('div');
    item.className = 'frame-caption-item';

    const header = document.createElement('div');
    header.className = 'frame-header';
    header.textContent = `Frame ${captionData.frame} ${this.formatTime(captionData.timestamp)}`;

    const body = document.createElement('div');
    body.className = 'frame-text';
    body.textContent = captionData.caption;

    item.append(header, body);
    this.elements.frameCaptions.appendChild(item);
  }

  /**
   * Asks the model for a caption of the whole video, based on the last frame
   * plus the text of all per-frame captions, streaming the text into the
   * summary element as it is generated.
   *
   * @param {{blob: Blob}} lastFrame - Frame supplied to the model as the image.
   * @param {Array<{caption: string}>} allCaptions - Per-frame captions.
   * @returns {Promise<void>}
   */
  async generateVideoSummary(lastFrame, allCaptions) {
    const observed = allCaptions.map((fc) => fc.caption).join('; ');
    await this.generateFromImage(
      lastFrame.blob,
      `Based on this frame and knowing that the video shows: ${observed}, provide a comprehensive caption for the entire video.`,
      512,
      (textSoFar) => {
        this.elements.summaryText.textContent = textSoFar;
      },
    );
  }

  /**
   * Formats a duration as `m:ss`.
   *
   * @param {number} seconds
   * @returns {string}
   */
  formatTime(seconds) {
    const mins = Math.floor(seconds / 60);
    const secs = Math.floor(seconds % 60);
    return `${mins}:${secs.toString().padStart(2, '0')}`;
  }

  /**
   * Copies all frame captions and the summary to the clipboard and briefly
   * marks the copy button as 'copied'. Clipboard failures are logged.
   *
   * @returns {Promise<void>}
   */
  async copyResults() {
    const items = Array.from(
      this.elements.frameCaptions.querySelectorAll('.frame-caption-item'),
    );
    const frameCaptions = items
      .map((el) => {
        // Fall back to the whole item text if the caption element is missing.
        const textEl = el.querySelector('.frame-text');
        return textEl ? textEl.textContent : el.textContent.trim();
      })
      .join('\n\n');
    const summary = this.elements.summaryText.textContent;
    const fullText = `Frame Captions:\n${frameCaptions}\n\nVideo Summary:\n${summary}`;

    try {
      await navigator.clipboard.writeText(fullText);
      this.elements.copyBtn.classList.add('copied');
      setTimeout(() => {
        this.elements.copyBtn.classList.remove('copied');
      }, 2000);
    } catch (err) {
      console.error('Failed to copy:', err);
    }
  }
}

/** Creates the app once the DOM is available. */
function startVideoCaptionApp() {
  new VideoCaptionApp();
}

// The module may be evaluated after DOMContentLoaded has already fired
// (e.g. when injected dynamically), so check the ready state first.
if (document.readyState === 'loading') {
  document.addEventListener('DOMContentLoaded', startVideoCaptionApp, { once: true });
} else {
  startVideoCaptionApp();
}