// Screen-VLA backend/server.js (VLA Data Generator backend)
const express = require('express');
const cors = require('cors');
const multer = require('multer');
const path = require('path');
const { GoogleGenerativeAI } = require('@google/generative-ai');
const app = express();
const PORT = process.env.PORT || 7860; // Use HF Spaces port
// Get API key from environment variables
const API_KEY = process.env.GEMINI_API_KEY || process.env.API_KEY;
if (!API_KEY) {
  console.error('Error: GEMINI_API_KEY or API_KEY environment variable is required');
  process.exit(1);
}
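// Example (local dev): GEMINI_API_KEY=your-key node server.js
// On Hugging Face Spaces, set GEMINI_API_KEY as a Space secret so it is exposed
// to the app as an environment variable.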
console.log('Initializing Google Generative AI...');
const genAI = new GoogleGenerativeAI(API_KEY);
// Middleware
app.use(cors());
app.use(express.json({ limit: '50mb' }));
// Serve static files from frontend build
app.use(express.static(path.join(__dirname, '../dist')));
// Configure multer for file uploads
const upload = multer({
  storage: multer.memoryStorage(),
  limits: {
    fileSize: 50 * 1024 * 1024, // 50MB limit
  }
});
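// Note: `upload` is configured for in-memory multipart uploads but is not attached
// to any route in this file; the current endpoints accept base64 frames in the JSON body.
// It is presumably reserved for future file-upload endpoints.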
// Health check endpoint
app.get('/health', (req, res) => {
  res.json({ status: 'ok', message: 'VLA Backend Server is running' });
});
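// Quick check (assuming the default port 7860):
//   curl http://localhost:7860/health
//   -> { "status": "ok", "message": "VLA Backend Server is running" }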
// Generate overall goal endpoint
app.post('/api/generate-goal', async (req, res) => {
  try {
    const { frames, videoDuration } = req.body;
    if (!frames || !Array.isArray(frames)) {
      return res.status(400).json({ error: 'Frames array is required' });
    }
    const model = genAI.getGenerativeModel({
      model: "gemini-1.5-flash"
    });
    // Build the prompt for the overall-goal analysis
    const prompt = `Analyze these video frames and generate an overall goal for the user's actions.
Video duration: ${videoDuration} seconds
Frames: ${frames.length} total
Please provide a concise overall goal description of what the user is trying to accomplish in this video.`;
    const result = await model.generateContent([
      { text: prompt },
      ...frames.map(frame => ({
        inlineData: {
          data: frame.split(',')[1], // Remove the data URL prefix
          mimeType: 'image/jpeg'
        }
      }))
    ]);
    const response = result.response;
    const text = response.text();
    res.json({ goal: text });
  } catch (error) {
    console.error('Error generating goal:', error);
    res.status(500).json({ error: 'Failed to generate goal' });
  }
});
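// Example request (illustrative values; frames are expected to be JPEG data URLs,
// since the handler strips everything before the first comma):
//   POST /api/generate-goal
//   { "frames": ["data:image/jpeg;base64,..."], "videoDuration": 12.5 }
// Response: { "goal": "<model-generated description>" }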
// Generate tasks and interactions endpoint
app.post('/api/generate-tasks', async (req, res) => {
  try {
    const { frames, goal, videoDuration, totalFrames } = req.body;
    if (!frames || !Array.isArray(frames) || !goal) {
      return res.status(400).json({ error: 'Frames array and goal are required' });
    }
    const model = genAI.getGenerativeModel({
      model: "gemini-1.5-flash"
    });
    // Build the prompt for tasks and interactions
    const prompt = `Based on the overall goal: "${goal}"
Analyze these ${frames.length} video frames and generate detailed tasks and interactions.
Video duration: ${videoDuration} seconds
Total frames: ${totalFrames}
Please provide a JSON response with tasks and interactions following this structure:
{
  "tasks": [
    {
      "task_id": "task_1",
      "description": "Description of the task",
      "start_frame": 0,
      "end_frame": 10,
      "interactions": [
        {
          "interaction_id": "interaction_1",
          "type": "click|scroll|type|drag",
          "description": "What action is being performed",
          "frame_number": 5,
          "coordinates": {"x": 100, "y": 200},
          "target_element": "Description of UI element"
        }
      ]
    }
  ]
}`;
    const result = await model.generateContent([
      { text: prompt },
      ...frames.map(frame => ({
        inlineData: {
          data: frame.split(',')[1],
          mimeType: 'image/jpeg'
        }
      }))
    ]);
    const response = result.response;
    const text = response.text();
    // Try to parse the model output as JSON (either inside a ```json fence or as a bare object)
    try {
      const jsonMatch = text.match(/```json\s*([\s\S]*?)\s*```/) || text.match(/\{[\s\S]*\}/);
      if (jsonMatch) {
        const jsonStr = jsonMatch[1] || jsonMatch[0];
        const parsedData = JSON.parse(jsonStr);
        res.json(parsedData);
      } else {
        // Fallback if no JSON block was found: wrap the raw text in a single task
        res.json({
          tasks: [{
            task_id: "task_1",
            description: text,
            start_frame: 0,
            end_frame: frames.length - 1,
            interactions: []
          }]
        });
      }
    } catch (parseError) {
      console.error('JSON parsing error:', parseError);
      res.json({
        tasks: [{
          task_id: "task_1",
          description: text,
          start_frame: 0,
          end_frame: frames.length - 1,
          interactions: []
        }]
      });
    }
  } catch (error) {
    console.error('Error generating tasks:', error);
    res.status(500).json({ error: 'Failed to generate tasks and interactions' });
  }
});
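// Example request (illustrative values):
//   POST /api/generate-tasks
//   { "frames": ["data:image/jpeg;base64,..."], "goal": "Fill out the signup form",
//     "videoDuration": 12.5, "totalFrames": 30 }
// On success the response follows the task/interaction schema requested in the prompt
// above; otherwise it falls back to a single task wrapping the raw model text.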
// Serve frontend for all other routes
app.get('*', (req, res) => {
  res.sendFile(path.join(__dirname, '../dist/index.html'));
});
app.listen(PORT, '0.0.0.0', () => {
  console.log(`VLA Data Generator running on port ${PORT}`);
  console.log(`API Key configured: ${API_KEY ? 'Yes' : 'No'}`);
});