// Source: Hugging Face Spaces - Nirav-Madhani / Screen-VLA (Space status: Sleeping)
// File size: 5,745 Bytes
// Revision: 256cef9

const express = require('express');
const cors = require('cors');
const multer = require('multer');
const path = require('path');
const { GoogleGenerativeAI } = require('@google/generative-ai');

// Express application instance shared by every route below.
const app = express();

// Listen on the port given by the environment, otherwise 7860 (HF Spaces port).
const PORT = process.env.PORT ? process.env.PORT : 7860;

// The Gemini key may arrive under either variable name; the first non-empty
// one wins. Without a key the process logs an error and exits with status 1.
const API_KEY = [process.env.GEMINI_API_KEY, process.env.API_KEY].find(Boolean);
if (!API_KEY) {
    console.error('Error: GEMINI_API_KEY or API_KEY environment variable is required');
    process.exit(1);
}

// Client used by the /api routes to obtain generative models.
console.log('Initializing Google Generative AI...');
const genAI = new GoogleGenerativeAI(API_KEY);

// Global middleware, registered in this order: CORS, JSON body parsing, static assets.
const jsonBodyParser = express.json({ limit: '50mb' });
// Directory holding the built frontend, resolved relative to this file.
const frontendBuildDir = path.join(__dirname, '../dist');

app.use(cors());
app.use(jsonBodyParser);
app.use(express.static(frontendBuildDir));

// Multer instance that buffers uploads in memory and rejects files over 50MB.
const MAX_UPLOAD_BYTES = 50 * 1024 * 1024;
const upload = multer({
    storage: multer.memoryStorage(),
    limits: { fileSize: MAX_UPLOAD_BYTES },
});

// GET /health - replies with a fixed JSON body so callers can verify the server is up.
app.get('/health', (_req, res) => {
    const payload = { status: 'ok', message: 'VLA Backend Server is running' };
    res.json(payload);
});

// ---------------------------------------------------------------------------
// Gemini-backed generation endpoints and the helpers they share.
// ---------------------------------------------------------------------------

// Model used by both generation endpoints.
const GEMINI_MODEL = 'gemini-1.5-flash';

// MIME type assumed when a frame does not declare an image type itself.
const DEFAULT_FRAME_MIME_TYPE = 'image/jpeg';

// 400 message for a frames array that is empty or contains unusable entries.
const INVALID_FRAMES_MESSAGE = 'Frames must be a non-empty array of base64-encoded image strings';

// Example structure embedded verbatim in the tasks prompt.
const TASKS_RESPONSE_EXAMPLE = `{
  "tasks": [
    {
      "task_id": "task_1",
      "description": "Description of the task",
      "start_frame": 0,
      "end_frame": 10,
      "interactions": [
        {
          "interaction_id": "interaction_1",
          "type": "click|scroll|type|drag",
          "description": "What action is being performed",
          "frame_number": 5,
          "coordinates": {"x": 100, "y": 200},
          "target_element": "Description of UI element"
        }
      ]
    }
  ]
}`;

/**
 * Convert one client-supplied frame into an inline-data part for generateContent.
 *
 * Accepts either a data URL ("data:<mime>;base64,<payload>") or a bare base64
 * string. A declared image MIME type is honoured; anything else falls back to
 * DEFAULT_FRAME_MIME_TYPE.
 *
 * @param {*} frame - One entry of the request's `frames` array.
 * @returns {{inlineData: {data: string, mimeType: string}}|null} The part, or
 *     null when the entry is not a string or carries no payload.
 */
function frameToInlinePart(frame) {
    if (typeof frame !== 'string') {
        return null;
    }

    // Everything before the first comma is the data-URL header (if any);
    // everything after it is the payload. No comma means bare base64.
    const commaIndex = frame.indexOf(',');
    const header = commaIndex === -1 ? '' : frame.slice(0, commaIndex);
    const data = (commaIndex === -1 ? frame : frame.slice(commaIndex + 1)).trim();
    if (data.length === 0) {
        return null;
    }

    const declared = /^data:([^;,]+)/i.exec(header);
    const declaredType = declared ? declared[1].trim().toLowerCase() : '';
    const mimeType = declaredType.startsWith('image/') ? declaredType : DEFAULT_FRAME_MIME_TYPE;

    return { inlineData: { data, mimeType } };
}

/**
 * Convert the whole frames array into inline-data parts.
 *
 * @param {Array} frames - The request's `frames` array (already known to be an array).
 * @returns {Array|null} One part per frame, or null when the array is empty or
 *     any entry is unusable, so the caller can answer 400 instead of failing later.
 */
function framesToInlineParts(frames) {
    if (frames.length === 0) {
        return null;
    }
    const parts = frames.map(frameToInlinePart);
    return parts.includes(null) ? null : parts;
}

/**
 * Send a text prompt plus image parts to the model and return its text reply.
 *
 * @param {string} prompt - Instruction text placed first in the request.
 * @param {Array} imageParts - Parts produced by framesToInlineParts.
 * @returns {Promise<string>} The text of the model response.
 */
async function generateTextFromFrames(prompt, imageParts) {
    const model = genAI.getGenerativeModel({ model: GEMINI_MODEL });
    const result = await model.generateContent([{ text: prompt }, ...imageParts]);
    return result.response.text();
}

/** Build the prompt for the overall-goal request (text unchanged from the original template). */
function buildGoalPrompt(frameCount, videoDuration) {
    return [
        "Analyze these video frames and generate an overall goal for the user's actions. ",
        `Video duration: ${videoDuration} seconds`,
        `Frames: ${frameCount} total`,
        '',
        'Please provide a concise overall goal description of what the user is trying to accomplish in this video.',
    ].join('\n');
}

/** Build the prompt for the tasks/interactions request (text unchanged from the original template). */
function buildTasksPrompt(goal, frameCount, videoDuration, totalFrames) {
    return [
        `Based on the overall goal: "${goal}"`,
        '        ',
        `Analyze these ${frameCount} video frames and generate detailed tasks and interactions.`,
        `Video duration: ${videoDuration} seconds`,
        `Total frames: ${totalFrames}`,
        '',
        'Please provide a JSON response with tasks and interactions following this structure:',
        TASKS_RESPONSE_EXAMPLE,
    ].join('\n');
}

/**
 * Extract a JSON object from the model's reply.
 *
 * Tries the contents of a ```json fenced block first, then the outermost
 * brace-delimited span, so a malformed fence does not hide usable JSON.
 *
 * @param {string} text - Raw model reply.
 * @returns {Object|null} The parsed value, or null when nothing parses to an object.
 */
function parseModelJson(text) {
    const candidates = [];
    const fenced = text.match(/```json\s*([\s\S]*?)\s*```/);
    if (fenced && fenced[1]) {
        candidates.push(fenced[1]);
    }
    const braced = text.match(/\{[\s\S]*\}/);
    if (braced) {
        candidates.push(braced[0]);
    }

    let lastError = null;
    for (const candidate of candidates) {
        try {
            const parsed = JSON.parse(candidate);
            if (parsed !== null && typeof parsed === 'object') {
                return parsed;
            }
        } catch (parseError) {
            lastError = parseError;
        }
    }

    if (lastError) {
        console.error('JSON parsing error:', lastError);
    }
    return null;
}

/** Single-task payload returned when the model reply contains no parseable JSON. */
function buildFallbackTasks(text, frameCount) {
    return {
        tasks: [{
            task_id: 'task_1',
            description: text,
            start_frame: 0,
            end_frame: frameCount - 1,
            interactions: [],
        }],
    };
}

// Generate overall goal endpoint.
// Body: { frames: string[], videoDuration }. Replies { goal } on success,
// 400 for a missing/invalid frames array, 500 when generation fails.
app.post('/api/generate-goal', async (req, res) => {
    try {
        // Guard against a missing body so the request fails as 400, not 500.
        const { frames, videoDuration } = req.body || {};

        if (!frames || !Array.isArray(frames)) {
            return res.status(400).json({ error: 'Frames array is required' });
        }

        const imageParts = framesToInlineParts(frames);
        if (!imageParts) {
            return res.status(400).json({ error: INVALID_FRAMES_MESSAGE });
        }

        const prompt = buildGoalPrompt(frames.length, videoDuration);
        const text = await generateTextFromFrames(prompt, imageParts);

        res.json({ goal: text });
    } catch (error) {
        console.error('Error generating goal:', error);
        res.status(500).json({ error: 'Failed to generate goal' });
    }
});

// Generate tasks and interactions endpoint.
// Body: { frames: string[], goal: string, videoDuration, totalFrames }.
// Replies with the JSON object parsed from the model output, or a single
// fallback task wrapping the raw text when no JSON can be parsed.
app.post('/api/generate-tasks', async (req, res) => {
    try {
        const { frames, goal, videoDuration, totalFrames } = req.body || {};

        const hasGoal = typeof goal === 'string' && goal.trim().length > 0;
        if (!frames || !Array.isArray(frames) || !hasGoal) {
            return res.status(400).json({ error: 'Frames array and goal are required' });
        }

        const imageParts = framesToInlineParts(frames);
        if (!imageParts) {
            return res.status(400).json({ error: INVALID_FRAMES_MESSAGE });
        }

        const prompt = buildTasksPrompt(goal, frames.length, videoDuration, totalFrames);
        const text = await generateTextFromFrames(prompt, imageParts);

        const parsedData = parseModelJson(text);
        res.json(parsedData || buildFallbackTasks(text, frames.length));
    } catch (error) {
        console.error('Error generating tasks:', error);
        res.status(500).json({ error: 'Failed to generate tasks and interactions' });
    }
});

// Fallback for every GET not matched above: reply with the frontend's index.html.
function serveFrontend(_req, res) {
    const indexFile = path.join(__dirname, '../dist/index.html');
    res.sendFile(indexFile);
}

app.get('*', serveFrontend);

// Logs the port and whether an API key is present once the server is listening.
function onListening() {
    const keyStatus = API_KEY ? 'Yes' : 'No';
    console.log(`VLA Data Generator running on port ${PORT}`);
    console.log(`API Key configured: ${keyStatus}`);
}

// Bind to all interfaces (0.0.0.0) on the configured port.
app.listen(PORT, '0.0.0.0', onListening);