// Uploaded by akhaliq (HF Staff): "Upload index.js with huggingface_hub"
// Commit 5276df5 (verified)
import {
AutoProcessor,
AutoModelForImageTextToText,
load_image,
TextStreamer,
} from "https://cdn.jsdelivr.net/npm/@huggingface/[email protected]";
/**
 * Browser UI controller that samples still frames from an uploaded video,
 * captions each frame with an image-text-to-text model, and then asks the
 * model for an overall summary.
 *
 * Expects the DOM to contain the element ids looked up in
 * `initializeElements`, and expects `AutoProcessor`,
 * `AutoModelForImageTextToText`, `load_image` and `TextStreamer` to be in
 * module scope.
 */
class VideoCaptionApp {
  constructor() {
    this.videoFile = null;
    // Object URL currently assigned to the preview player (revoked on replace).
    this.videoObjectUrl = null;
    this.model = null;
    this.processor = null;
    this.isProcessing = false;
    this.initializeElements();
    this.attachEventListeners();
    this.checkWebGPUSupport();
  }

  /**
   * Scale a width/height pair down (never up) so that it fits inside the
   * given bounds while keeping the aspect ratio.
   *
   * @param {number} width - Source width in pixels (must be > 0).
   * @param {number} height - Source height in pixels (must be > 0).
   * @param {number} maxWidth - Maximum allowed width.
   * @param {number} maxHeight - Maximum allowed height.
   * @returns {{width: number, height: number}} Integer dimensions, each >= 1.
   */
  static fitWithin(width, height, maxWidth, maxHeight) {
    const scale = Math.min(1, maxWidth / width, maxHeight / height);
    return {
      width: Math.max(1, Math.round(width * scale)),
      height: Math.max(1, Math.round(height * scale)),
    };
  }

  /** Look up and cache every DOM element the app interacts with. */
  initializeElements() {
    const ids = [
      'videoPlayer',
      'videoInput',
      'uploadArea',
      'processBtn',
      'frameCount',
      'deviceSelect',
      'results',
      'frameCaptions',
      'summaryText',
      'progressOverlay',
      'progressCircle',
      'progressText',
      'progressStatus',
      'controls',
      'copyBtn',
      'finalCaption',
    ];
    // Each key of `this.elements` is identical to the element id it was looked up by.
    this.elements = Object.fromEntries(ids.map((id) => [id, document.getElementById(id)]));
  }

  /** Wire up upload (click / drag & drop / file input), process and copy handlers. */
  attachEventListeners() {
    const { uploadArea, videoInput, processBtn, copyBtn } = this.elements;

    uploadArea.addEventListener('click', () => {
      if (!this.isProcessing) {
        videoInput.click();
      }
    });

    uploadArea.addEventListener('dragover', (e) => {
      // preventDefault is required for the element to be a valid drop target.
      e.preventDefault();
      if (!this.isProcessing) {
        uploadArea.classList.add('drag-over');
      }
    });

    uploadArea.addEventListener('dragleave', () => {
      uploadArea.classList.remove('drag-over');
    });

    uploadArea.addEventListener('drop', (e) => {
      e.preventDefault();
      uploadArea.classList.remove('drag-over');
      if (this.isProcessing || e.dataTransfer.files.length === 0) {
        return;
      }
      const [file] = e.dataTransfer.files;
      if (file.type.startsWith('video/')) {
        this.handleVideoUpload(file);
      }
    });

    videoInput.addEventListener('change', (e) => {
      if (e.target.files.length > 0) {
        this.handleVideoUpload(e.target.files[0]);
      }
    });

    processBtn.addEventListener('click', () => {
      if (!this.isProcessing && this.videoFile) {
        this.processVideo();
      }
    });

    copyBtn.addEventListener('click', () => {
      this.copyResults();
    });
  }

  /**
   * Disable the "webgpu" device option and fall back to "cpu" when WebGPU is
   * not usable. `navigator.gpu` can exist while no adapter is available, so
   * an adapter is actually requested.
   *
   * @returns {Promise<void>}
   */
  async checkWebGPUSupport() {
    let isSupported = false;
    if (navigator.gpu) {
      try {
        isSupported = (await navigator.gpu.requestAdapter()) != null;
      } catch (err) {
        console.warn('WebGPU adapter request failed:', err);
      }
    }
    if (isSupported) {
      return;
    }
    const { deviceSelect } = this.elements;
    const webgpuOption = deviceSelect.querySelector('option[value="webgpu"]');
    if (webgpuOption) {
      webgpuOption.disabled = true;
    }
    deviceSelect.value = 'cpu';
  }

  /**
   * Remember the chosen file, show it in the preview player and reveal the
   * processing controls.
   *
   * @param {File} file - The video file selected or dropped by the user.
   */
  handleVideoUpload(file) {
    // Release the URL of a previously loaded video so its data can be freed.
    if (this.videoObjectUrl) {
      URL.revokeObjectURL(this.videoObjectUrl);
    }
    this.videoFile = file;
    this.videoObjectUrl = URL.createObjectURL(file);
    this.elements.videoPlayer.src = this.videoObjectUrl;
    this.elements.uploadArea.style.display = 'none';
    this.elements.controls.style.display = 'block';
    this.elements.results.style.display = 'none';
  }

  /**
   * Update the circular progress indicator and its status line.
   *
   * @param {number} percent - Progress in the range 0-100.
   * @param {string} status - Human-readable status message.
   */
  updateProgress(percent, status) {
    // 45 is used as the radius of the progress ring when computing its circumference.
    const circumference = 2 * Math.PI * 45;
    const offset = circumference - (percent / 100) * circumference;
    const { progressCircle, progressText, progressStatus } = this.elements;
    progressCircle.style.strokeDasharray = `${circumference} ${circumference}`;
    progressCircle.style.strokeDashoffset = offset;
    progressText.textContent = `${Math.round(percent)}%`;
    progressStatus.textContent = status;
  }

  /**
   * Sample `numFrames` evenly spaced JPEG frames from a video.
   *
   * Frames are taken at `i * duration / numFrames` seconds and scaled down to
   * fit within 1280x720 while keeping the aspect ratio. Reports 0-20% progress.
   *
   * @param {string} videoUrl - URL (typically an object URL) of the video.
   * @param {number} [numFrames=8] - Number of frames to extract (positive integer).
   * @returns {Promise<Array<{blob: Blob, timestamp: number}>>} Extracted frames in order.
   * @throws {RangeError} If `numFrames` is not a positive integer.
   * @throws {Error} If the video fails to load/seek, has no finite duration,
   *   has no video track, or a frame cannot be encoded.
   */
  async extractFramesFromVideo(videoUrl, numFrames = 8) {
    // Without this guard a zero/NaN count would produce a promise that never settles.
    if (!Number.isInteger(numFrames) || numFrames <= 0) {
      throw new RangeError(`numFrames must be a positive integer, got ${numFrames}`);
    }

    const video = document.createElement('video');
    video.crossOrigin = 'anonymous';
    video.muted = true;

    // Resolve on the next `eventName`, reject if the element errors first.
    const waitForEvent = (eventName) =>
      new Promise((resolve, reject) => {
        const cleanup = () => {
          video.removeEventListener(eventName, onEvent);
          video.removeEventListener('error', onError);
        };
        const onEvent = () => {
          cleanup();
          resolve();
        };
        const onError = () => {
          cleanup();
          reject(new Error(`Video failed while waiting for "${eventName}"`, { cause: video.error }));
        };
        video.addEventListener(eventName, onEvent);
        video.addEventListener('error', onError);
      });

    try {
      const metadataLoaded = waitForEvent('loadedmetadata');
      video.src = videoUrl;
      video.load();
      await metadataLoaded;

      const { duration, videoWidth, videoHeight } = video;
      if (!Number.isFinite(duration) || duration <= 0) {
        throw new Error('Video has no finite duration; cannot sample frames.');
      }
      if (videoWidth === 0 || videoHeight === 0) {
        throw new Error('Video has no decodable video track.');
      }

      const interval = duration / numFrames;
      const { width, height } = VideoCaptionApp.fitWithin(videoWidth, videoHeight, 1280, 720);
      const canvas = document.createElement('canvas');
      canvas.width = width;
      canvas.height = height;
      const ctx = canvas.getContext('2d');
      if (!ctx) {
        throw new Error('2D canvas context is unavailable.');
      }

      const frames = [];
      for (let i = 0; i < numFrames; i++) {
        const timestamp = i * interval;
        // Subscribe before seeking so the event cannot be missed.
        const seeked = waitForEvent('seeked');
        video.currentTime = timestamp;
        await seeked;

        ctx.drawImage(video, 0, 0, width, height);
        const blob = await new Promise((blobResolve) => {
          canvas.toBlob(blobResolve, 'image/jpeg', 0.85);
        });
        if (!blob) {
          throw new Error(`Failed to encode frame ${i + 1} as JPEG.`);
        }

        frames.push({ blob, timestamp });
        const captureCount = frames.length;
        this.updateProgress(
          (captureCount / numFrames) * 20,
          `Extracting frame ${captureCount}/${numFrames}...`,
        );
      }
      return frames;
    } finally {
      // Detach the source so the browser can release the decoder.
      video.removeAttribute('src');
      video.load();
    }
  }

  /**
   * Load the processor and model, using WebGPU when selected in the UI.
   * Reports 25-60% progress.
   *
   * @returns {Promise<void>}
   * @throws Re-throws any loading error after logging it.
   */
  async initializeModel() {
    const device = this.elements.deviceSelect.value;
    const model_id = 'onnx-community/FastVLM-0.5B-ONNX';
    this.updateProgress(25, 'Loading AI model...');
    try {
      this.processor = await AutoProcessor.from_pretrained(model_id);
      this.updateProgress(50, 'Initializing model...');
      const modelOptions = {
        dtype: {
          embed_tokens: 'fp16',
          vision_encoder: 'q4',
          decoder_model_merged: 'q4',
        },
      };
      // Any other selection leaves `device` unset so the library default applies.
      if (device === 'webgpu') {
        modelOptions.device = 'webgpu';
      }
      this.model = await AutoModelForImageTextToText.from_pretrained(model_id, modelOptions);
      this.updateProgress(60, 'Model ready');
    } catch (error) {
      console.error('Model initialization error:', error);
      throw error;
    }
  }

  /**
   * Read the requested frame count from the UI, falling back to 8 when the
   * field does not contain a positive integer.
   *
   * @returns {number} A positive integer frame count.
   */
  readFrameCount() {
    const fallback = 8;
    const parsed = Number.parseInt(this.elements.frameCount.value, 10);
    if (Number.isInteger(parsed) && parsed > 0) {
      return parsed;
    }
    console.warn(`Invalid frame count "${this.elements.frameCount.value}", using ${fallback}.`);
    return fallback;
  }

  /**
   * Run one single-turn generation for an image plus user prompt.
   *
   * @param {*} image - Image as returned by `load_image`.
   * @param {string} content - User message content (including the `<image>` tag).
   * @param {number} maxNewTokens - Generation length limit.
   * @param {(textSoFar: string) => void} [onText] - Called with the accumulated
   *   text each time the streamer emits a chunk.
   * @returns {Promise<string>} The full generated text.
   */
  async generateText(image, content, maxNewTokens, onText) {
    const prompt = this.processor.apply_chat_template([{ role: 'user', content }], {
      add_generation_prompt: true,
    });
    const inputs = await this.processor(image, prompt, {
      add_special_tokens: false,
    });
    let text = '';
    const streamer = new TextStreamer(this.processor.tokenizer, {
      skip_prompt: true,
      // Keep chat-control tokens (e.g. end-of-turn markers) out of the text:
      // it is shown to the user and re-embedded into the summary prompt.
      skip_special_tokens: true,
      callback_function: (chunk) => {
        text += chunk;
        onText?.(text);
      },
    });
    await this.model.generate({
      ...inputs,
      max_new_tokens: maxNewTokens,
      do_sample: false,
      streamer,
    });
    return text;
  }

  /**
   * Full pipeline: extract frames, load the model if needed, caption every
   * frame, generate the summary and reveal the results. On failure the error
   * is logged, the progress overlay is dismissed and the user is alerted.
   *
   * @returns {Promise<void>}
   */
  async processVideo() {
    if (this.isProcessing || !this.videoFile) {
      return;
    }
    const { elements } = this;
    this.isProcessing = true;
    elements.processBtn.classList.add('loading');
    elements.progressOverlay.classList.add('active');
    elements.results.style.display = 'none';
    elements.frameCaptions.innerHTML = '';
    elements.summaryText.textContent = '';

    let videoURL = null;
    try {
      videoURL = URL.createObjectURL(this.videoFile);
      const numFrames = this.readFrameCount();
      this.updateProgress(0, 'Starting...');
      const frames = await this.extractFramesFromVideo(videoURL, numFrames);
      if (!this.model) {
        await this.initializeModel();
      }

      const allCaptions = [];
      const totalSteps = frames.length;
      for (const [index, frame] of frames.entries()) {
        const frameNumber = index + 1;
        this.updateProgress(
          60 + (index / totalSteps) * 30,
          `Analyzing frame ${frameNumber}/${totalSteps}...`,
        );

        const frameUrl = URL.createObjectURL(frame.blob);
        let captionText;
        try {
          const image = await load_image(frameUrl);
          captionText = await this.generateText(
            image,
            `<image>This is frame ${frameNumber} of ${numFrames} from a video at ${frame.timestamp.toFixed(1)}s. Describe what's happening in this frame, focusing on actions, objects, and any notable changes.`,
            256,
          );
        } finally {
          URL.revokeObjectURL(frameUrl);
        }

        const captionData = {
          frame: frameNumber,
          timestamp: frame.timestamp,
          caption: captionText.trim(),
        };
        allCaptions.push(captionData);
        this.displayFrameCaption(captionData);
      }

      this.updateProgress(95, 'Generating video summary...');
      await this.generateVideoSummary(frames[frames.length - 1], allCaptions);
      this.updateProgress(100, 'Complete!');
      // Leave "Complete!" visible briefly before switching to the results.
      setTimeout(() => {
        elements.progressOverlay.classList.remove('active');
        elements.results.style.display = 'block';
      }, 500);
    } catch (error) {
      console.error('Processing error:', error);
      // Dismiss the overlay, otherwise it keeps covering the UI after a failure.
      elements.progressOverlay.classList.remove('active');
      alert('An error occurred while processing the video. Please try again.');
    } finally {
      if (videoURL !== null) {
        URL.revokeObjectURL(videoURL);
      }
      this.isProcessing = false;
      elements.processBtn.classList.remove('loading');
    }
  }

  /**
   * Append one frame caption card to the results list.
   *
   * The card is built from DOM nodes with `textContent` so that model output
   * is always rendered as text and never parsed as HTML.
   *
   * @param {{frame: number, timestamp: number, caption: string}} captionData
   */
  displayFrameCaption(captionData) {
    const makeNode = (tag, className, text) => {
      const node = document.createElement(tag);
      node.className = className;
      if (text !== undefined) {
        node.textContent = text;
      }
      return node;
    };

    const header = makeNode('div', 'frame-header');
    header.appendChild(makeNode('span', 'frame-number', `Frame ${captionData.frame}`));
    header.appendChild(makeNode('span', 'frame-time', this.formatTime(captionData.timestamp)));

    const captionElement = makeNode('div', 'frame-caption-item');
    captionElement.appendChild(header);
    captionElement.appendChild(makeNode('p', 'frame-text', captionData.caption));

    this.elements.frameCaptions.appendChild(captionElement);
  }

  /**
   * Generate an overall caption from the last frame plus all per-frame
   * captions, streaming the text into the summary element as it is produced.
   *
   * @param {{blob: Blob, timestamp: number}} lastFrame - Frame shown to the model.
   * @param {Array<{caption: string}>} allCaptions - Per-frame captions, in order.
   * @returns {Promise<void>}
   */
  async generateVideoSummary(lastFrame, allCaptions) {
    const frameUrl = URL.createObjectURL(lastFrame.blob);
    try {
      const image = await load_image(frameUrl);
      const captionList = allCaptions.map((fc) => fc.caption).join('; ');
      await this.generateText(
        image,
        `<image>Based on this frame and knowing that the video shows: ${captionList}, provide a comprehensive caption for the entire video.`,
        512,
        (textSoFar) => {
          this.elements.summaryText.textContent = textSoFar;
        },
      );
    } finally {
      URL.revokeObjectURL(frameUrl);
    }
  }

  /**
   * Format a duration as `m:ss`.
   *
   * @param {number} seconds - Non-negative number of seconds.
   * @returns {string} e.g. `75` -> `"1:15"`.
   */
  formatTime(seconds) {
    const mins = Math.floor(seconds / 60);
    const secs = Math.floor(seconds % 60);
    return `${mins}:${String(secs).padStart(2, '0')}`;
  }

  /**
   * Copy all frame captions and the summary to the clipboard and briefly mark
   * the copy button as "copied". Clipboard failures are logged, not thrown.
   *
   * @returns {Promise<void>}
   */
  async copyResults() {
    const { frameCaptions: captionList, summaryText, copyBtn } = this.elements;
    const frameCaptions = Array.from(captionList.querySelectorAll('.frame-caption-item'))
      .map((el) => el.querySelector('.frame-text').textContent)
      .join('\n\n');
    const summary = summaryText.textContent;
    const fullText = `Frame Captions:\n${frameCaptions}\n\nVideo Summary:\n${summary}`;
    try {
      await navigator.clipboard.writeText(fullText);
      copyBtn.classList.add('copied');
      setTimeout(() => {
        copyBtn.classList.remove('copied');
      }, 2000);
    } catch (err) {
      console.error('Failed to copy:', err);
    }
  }
}
// Start the app once the DOM is available. If this module is evaluated after
// the document has already been parsed (e.g. it was loaded asynchronously),
// DOMContentLoaded has already fired and will not fire again, so start now.
if (document.readyState === 'loading') {
  document.addEventListener('DOMContentLoaded', () => {
    new VideoCaptionApp();
  });
} else {
  new VideoCaptionApp();
}