Spaces:
Build error
Build error
| const express = require('express'); | |
| const bodyParser = require('body-parser'); | |
| const multer = require('multer'); | |
| const cheerio = require('cheerio'); | |
| const { minifyHtml } = require('./minify'); | |
| const { removeMedia } = require('./removeMedia'); | |
const app = express();

// Upper bound on request payloads. It is kept in two forms because the body
// parsers below take a size string while multer takes a byte count.
const MAX_SIZE = '50mb';
const MAX_SIZE_BYTES = 50 * 1024 * 1024;

// Multipart handling for uploads. No storage engine is configured here; the
// routes read the uploaded file from `req.file.buffer`.
const upload = multer({
  limits: {
    fileSize: MAX_SIZE_BYTES, // largest accepted uploaded file
    fieldSize: MAX_SIZE_BYTES, // largest accepted non-file field value
  },
});

// Static assets are served from ./public before any body parsing happens.
app.use(express.static('public'));

// One JSON parser and one URL-encoded parser are enough: a request body is
// parsed at most once, so registering body-parser's equivalents alongside
// Express's built-in ones would only add middleware that never does anything.
app.use(express.json({ limit: MAX_SIZE }));
app.use(
  express.urlencoded({
    extended: true,
    limit: MAX_SIZE,
    parameterLimit: 50000,
  })
);
/**
 * Runs the enabled HTML reduction steps in a fixed order and reports how each
 * one went.
 *
 * Order: minification, Cheerio parsing, script removal, style removal, media
 * removal, head cleaning, repeating-element collapsing, text truncation.
 * Every optional step is best-effort: a failure is recorded in the status
 * report and logged, and the remaining steps still run. Only a failure to
 * parse the document with Cheerio ends processing early.
 *
 * @param {string} html - Source markup.
 * @param {object} [options] - Step switches; a step runs only when its flag is truthy.
 * @param {boolean} [options.minifyHtml]
 * @param {boolean} [options.removeScripts]
 * @param {boolean} [options.removeStyles]
 * @param {boolean} [options.removeMedia]
 * @param {boolean} [options.cleanHead]
 * @param {boolean} [options.handleRepeatingElements]
 * @param {boolean} [options.truncateText]
 * @param {number} [options.truncateLength] - Passed through to truncateText.
 * @returns {{html: string, json: string, status: object}} Resulting markup, a
 *   pretty-printed JSON description of the remaining elements, and the
 *   per-step `{ success, error }` report.
 */
function compressHtmlForLlm(html, options = {}) {
  const operationStatus = {
    minification: { success: false, error: null },
    cheerioLoad: { success: false, error: null },
    headCleaning: { success: false, error: null },
    scriptRemoval: { success: false, error: null },
    styleRemoval: { success: false, error: null },
    mediaRemoval: { success: false, error: null },
    repeatingElements: { success: false, error: null },
    textTruncation: { success: false, error: null }
  };

  // Error text stored in the report is capped at 100 characters. Accepts an
  // Error, a plain string, or nothing at all without throwing.
  const briefMessage = (err) => {
    const text = typeof err === 'string' ? err : err?.message;
    return String(text ?? 'Unknown error').substring(0, 100);
  };

  // Runs one DOM step, marking it successful or recording why it failed.
  const runStep = (statusKey, label, step) => {
    try {
      step();
      operationStatus[statusKey].success = true;
    } catch (err) {
      operationStatus[statusKey].error = briefMessage(err);
      console.warn(`${label} failed:`, err);
    }
  };

  let processed = html;

  // Step 1: Minification. Guarded like every other step so that a throwing
  // minifier degrades to "not minified" instead of aborting the pipeline.
  if (options.minifyHtml) {
    try {
      const minifyResult = minifyHtml(html, {
        removeScripts: options.removeScripts,
        removeStyles: options.removeStyles
      });
      if (minifyResult.success) {
        processed = minifyResult.minifiedHtml;
        operationStatus.minification.success = true;
      } else {
        operationStatus.minification.error =
          minifyResult.error?.message || 'Minification failed';
      }
    } catch (err) {
      operationStatus.minification.error = briefMessage(err);
      console.warn('Minification failed:', err);
    }
  }

  // Step 2: Load with Cheerio. Nothing below can run without a document, so
  // this is the one failure that returns early. The result keeps the same
  // shape as the normal return value (an empty structure list).
  let $ = null;
  try {
    $ = cheerio.load(processed, {
      decodeEntities: false,
      xmlMode: false,
      lowerCaseTags: true
    });
    operationStatus.cheerioLoad.success = true;
  } catch (err) {
    operationStatus.cheerioLoad.error = briefMessage(err);
    console.error('Cheerio load failed:', err);
    return { html: processed, json: '[]', status: operationStatus };
  }

  // Step 3: Remove scripts
  if (options.removeScripts) {
    runStep('scriptRemoval', 'Script removal', () => {
      $('script').remove();
    });
  }

  // Step 4: Remove styles
  if (options.removeStyles) {
    runStep('styleRemoval', 'Style removal', () => {
      $('style').remove();
      $('link[rel="stylesheet"]').remove();
    });
  }

  // Step 5: Remove media. Handled separately because removeMedia reports
  // failure through its return value as well as by throwing.
  if (options.removeMedia) {
    try {
      const mediaResult = removeMedia($);
      if (mediaResult.success) {
        operationStatus.mediaRemoval.success = true;
      } else {
        operationStatus.mediaRemoval.error = briefMessage(mediaResult.error);
        console.warn('Media removal failed:', mediaResult.error);
      }
    } catch (err) {
      operationStatus.mediaRemoval.error = briefMessage(err);
      console.warn('Media removal failed:', err);
    }
  }

  // Step 6: Clean head
  if (options.cleanHead) {
    runStep('headCleaning', 'Head cleaning', () => cleanHead($));
  }

  // Step 7: Handle repeating elements
  if (options.handleRepeatingElements) {
    runStep('repeatingElements', 'Repeating element handling', () =>
      handleRepeatingElements($)
    );
  }

  // Step 8: Truncate text
  if (options.truncateText) {
    runStep('textTruncation', 'Text truncation', () =>
      truncateText($, options.truncateLength)
    );
  }

  // Serialize the document; fall back to the pre-DOM markup if that fails.
  let finalHtml = '';
  try {
    finalHtml = $.html();
  } catch (err) {
    console.error('Final HTML generation failed:', err);
    finalHtml = processed;
  }

  const structure = generateStructureJson($);
  return {
    html: finalHtml,
    json: JSON.stringify(structure, null, 2),
    status: operationStatus
  };
}
/**
 * Strips non-essential nodes from every <head> in the document.
 *
 * All <link> and <script> elements inside the head are removed. A <meta>
 * element is kept only when it
 *   - carries a `charset` attribute (the `<meta charset="...">` form, which
 *     has no `name` attribute and would otherwise be dropped),
 *   - has a `name` of charset, viewport, description or keywords
 *     (case-insensitive), or
 *   - has a `property` containing "og:" (case-insensitive).
 *
 * @param {Function} $ - Cheerio document handle; it is modified in place.
 */
function cleanHead($) {
  const keptMetaNames = ['charset', 'viewport', 'description', 'keywords'];

  $('head').each((_, head) => {
    const $head = $(head);
    $head.find('link').remove();
    $head.find('script').remove();

    $head.find('meta').each((__, meta) => {
      const $meta = $(meta);
      const name = $meta.attr('name')?.toLowerCase();
      const property = $meta.attr('property')?.toLowerCase();

      const declaresCharset = $meta.attr('charset') !== undefined;
      const hasKeptName = keptMetaNames.includes(name);
      const isOpenGraph = property?.includes('og:') ?? false;

      if (!declaresCharset && !hasKeptName && !isOpenGraph) {
        $meta.remove();
      }
    });
  });
}
/**
 * Thins out long runs of look-alike siblings.
 *
 * For every element with more than three children that areElementsSimilar
 * judges to be alike, only the first child, the last child and the child at
 * index floor(count / 2) survive; every other child is removed.
 *
 * @param {Function} $ - Cheerio document handle; it is modified in place.
 */
function handleRepeatingElements($) {
  $('*').each((_, node) => {
    const siblings = $(node).children();
    const total = siblings.length;

    if (total <= 3 || !areElementsSimilar(siblings, $)) {
      return;
    }

    const lastPosition = total - 1;
    const middlePosition = Math.floor(total / 2);

    siblings.each((position, sibling) => {
      const isEdge = position === 0 || position === lastPosition;
      if (!isEdge && position !== middlePosition) {
        $(sibling).remove();
      }
    });
  });
}
/**
 * Shortens the text of leaf elements (elements with no child elements) by
 * keeping the start and end of the text and replacing the middle with "...".
 *
 * Text is only rewritten when the result is actually shorter than the
 * original, i.e. when it exceeds the limit by more than the three characters
 * the ellipsis adds. A limit that is not a positive number leaves the
 * document untouched.
 *
 * @param {Function} $ - Cheerio document handle; it is modified in place.
 * @param {number} [truncateLength=100] - Number of original characters to
 *   keep per element. For an odd limit the extra character goes to the tail.
 */
function truncateText($, truncateLength = 100) {
  const limit = Math.floor(Number(truncateLength));
  if (!Number.isFinite(limit) || limit <= 0) {
    return;
  }

  const ellipsis = '...';
  const headLength = Math.floor(limit / 2);
  const tailLength = limit - headLength;

  $('*').each((_, elem) => {
    const $elem = $(elem);
    if ($elem.children().length > 0) {
      return;
    }

    const text = $elem.text();
    // Skip text that would not get shorter once the ellipsis is inserted.
    if (text.length <= limit + ellipsis.length) {
      return;
    }

    const head = text.substring(0, headLength);
    const tail = text.substring(text.length - tailLength);
    $elem.text(`${head}${ellipsis}${tail}`);
  });
}
/**
 * Decides whether a group of sibling elements looks like a repeated pattern.
 *
 * Each element is compared with the first one: it counts as a match when
 * both the tag name and the raw `class` attribute are identical. Groups of
 * fewer than four elements are never considered similar.
 *
 * @param {object} elements - Array-like Cheerio selection of siblings.
 * @param {Function} $ - Cheerio document handle used to read attributes.
 * @returns {boolean} True when more than 70% of the elements match the first.
 */
function areElementsSimilar(elements, $) {
  const total = elements.length;
  if (total < 4) {
    return false;
  }

  const reference = elements[0];
  const referenceTag = reference.tagName;
  const referenceClasses = $(reference).attr('class');

  let matches = 0;
  for (let index = 0; index < total; index += 1) {
    const candidate = elements[index];
    const sameTag = candidate.tagName === referenceTag;
    if (sameTag && $(candidate).attr('class') === referenceClasses) {
      matches += 1;
    }
  }

  return matches / total > 0.7;
}
/**
 * Builds a flat, document-order description of every element.
 *
 * Each entry holds the tag name, a copy of the attributes (omitted when there
 * are none), the element's own trimmed text with child elements excluded
 * (omitted when empty; text over 50 characters is reduced to its first and
 * last 25 characters joined by "..."), the number of child elements, and the
 * selector produced by generateSelector.
 *
 * @param {Function} $ - Cheerio document handle.
 * @returns {Array<object>} One entry per element, or an empty array if
 *   anything throws while building the list.
 */
function generateStructureJson($) {
  const shorten = (text) => {
    if (text.length <= 50) {
      return text;
    }
    return text.substring(0, 25) + '...' + text.substring(text.length - 25);
  };

  try {
    const entries = [];

    $('*').each((_, el) => {
      const $el = $(el);
      const attributes = { ...($el.attr() || {}) };
      const hasAttributes = Object.keys(attributes).length > 0;

      // Clone first so that dropping the children does not touch the document.
      const ownText = $el.clone().children().remove().end().text().trim();

      entries.push({
        tag: el.tagName,
        attributes: hasAttributes ? attributes : undefined,
        textContent: shorten(ownText) || undefined,
        childrenCount: $el.children().length,
        selector: generateSelector($, el)
      });
    });

    return entries;
  } catch (err) {
    console.error('Structure generation failed:', err);
    return [];
  }
}
/**
 * Produces a short CSS-style selector for an element: `tag#id` when the
 * element has an id, otherwise `tag.class1.class2` when it has class names,
 * otherwise just the tag name.
 *
 * The class attribute is split on whitespace and empty pieces are discarded,
 * so leading, trailing or repeated whitespace never yields empty class
 * segments such as `div..a.b.`. Id and class values are inserted as-is, without
 * CSS escaping.
 *
 * @param {Function} $ - Cheerio document handle used to read attributes.
 * @param {object} element - Element node to describe.
 * @returns {string} The selector; on failure the bare tag name, or "unknown"
 *   when even that is unavailable.
 */
function generateSelector($, element) {
  try {
    const $el = $(element);
    const tag = element.tagName;

    const id = $el.attr('id');
    if (id) {
      return `${tag}#${id}`;
    }

    const classNames = ($el.attr('class') ?? '').split(/\s+/).filter(Boolean);
    if (classNames.length > 0) {
      return `${tag}.${classNames.join('.')}`;
    }

    return tag;
  } catch (err) {
    console.warn('Selector generation failed:', err);
    // `element` itself may be the reason for the failure, so read it defensively.
    return element?.tagName || 'unknown';
  }
}
/**
 * Compares the original and processed markup by element count and by string
 * length.
 *
 * Reductions are reported as percentages with two decimals (for example
 * "42.50%"). When the original count or length is zero there is nothing to
 * compare against and the corresponding reduction is "N/A" rather than a
 * "NaN%" / "-Infinity%" string.
 *
 * @param {string} html - Markup before processing.
 * @param {string} processed - Markup after processing.
 * @returns {{elementReduction: string, sizeReduction: string,
 *   originalElements: (number|string), remainingElements: (number|string),
 *   originalSize: (number|string), processedSize: (number|string)}}
 *   On failure every field that could not be determined is "N/A".
 */
function computeStats(html, processed) {
  const reduction = (after, before) =>
    before > 0 ? `${((1 - after / before) * 100).toFixed(2)}%` : 'N/A';

  try {
    const originalElements = cheerio.load(html)('*').length;
    const remainingElements = cheerio.load(processed)('*').length;
    const originalSize = html.length;
    const processedSize = processed.length;

    return {
      elementReduction: reduction(remainingElements, originalElements),
      sizeReduction: reduction(processedSize, originalSize),
      originalElements,
      remainingElements,
      originalSize,
      processedSize
    };
  } catch (err) {
    console.error('Stats computation failed:', err);
    // The inputs themselves may be what failed, so read them defensively.
    return {
      elementReduction: 'N/A',
      sizeReduction: 'N/A',
      originalElements: 'N/A',
      remainingElements: 'N/A',
      originalSize: html?.length ?? 'N/A',
      processedSize: processed?.length ?? 'N/A'
    };
  }
}
/**
 * Checks that an extractor script defines something named `extract`.
 *
 * Accepted forms are a function declaration (`function extract(`, with any
 * whitespace before the parenthesis) and a variable binding
 * (`const|let|var extract =`). This is a textual check only; it does not
 * parse the script or prove that `extract` is callable.
 *
 * @param {string} scriptContent - Source text of the extractor script.
 * @throws {TypeError} If scriptContent is not a string.
 * @throws {Error} If no definition of `extract` is found.
 */
function validateScript(scriptContent) {
  if (typeof scriptContent !== 'string') {
    throw new TypeError('Script must be a string');
  }

  const declaresFunction = /\bfunction\s+extract\s*\(/.test(scriptContent);
  const bindsVariable = /\b(?:const|let|var)\s+extract\s*=/.test(scriptContent);

  if (!declaresFunction && !bindsVariable) {
    throw new Error('Script must contain a function named "extract"');
  }
}
/**
 * Runs a caller-supplied extractor script against an HTML string.
 *
 * The script must define `extract(input, cheerio)` and that function must
 * return an object with `success`, `data` and `error` fields. Any failure —
 * validation, a throw from the script, or a malformed return value — is
 * reported as `{ success: false, data: null, error: <message> }`; this
 * function itself does not throw.
 *
 * @param {string} html - Markup passed to the script as `input`.
 * @param {string} scriptContent - Source text of the extractor script.
 * @returns {{success: boolean, data: *, error: *}} The script's result, or
 *   the failure object described above.
 */
function executeCheerioScript(html, scriptContent) {
  try {
    validateScript(scriptContent);

    // SECURITY: `new Function` compiles and runs the supplied source inside
    // this process, with no isolation from the rest of the program. The text
    // check in validateScript is not a safeguard. Only run scripts from
    // trusted callers, or move execution into an isolated environment.
    const extractorFunction = new Function('input', 'cheerio', `
${scriptContent}
return extract(input, cheerio);
`);
    const result = extractorFunction(html, cheerio);

    if (!result || typeof result !== 'object') {
      throw new Error('Extract function must return an object');
    }
    if (!('success' in result && 'data' in result && 'error' in result)) {
      throw new Error('Return object must contain success, data, and error fields');
    }
    return result;
  } catch (err) {
    // Scripts can throw anything, including strings or null.
    return {
      success: false,
      data: null,
      error: err?.message ?? String(err)
    };
  }
}
/**
 * POST /process — compresses HTML and returns the result with statistics.
 *
 * The HTML comes from the uploaded `htmlFile` when present, otherwise from
 * the `html` body field. Each processing switch is enabled by the string
 * "true" (form posts) or the boolean true (JSON posts). `truncateLength`
 * must parse to a positive integer; anything else falls back to 100.
 *
 * Responds 400 when no usable HTML string is supplied and 500 on an
 * unexpected error.
 */
app.post('/process', upload.single('htmlFile'), (req, res) => {
  try {
    const startTime = Date.now();
    const body = req.body ?? {};

    const htmlContent = req.file
      ? req.file.buffer.toString('utf8')
      : body.html || '';
    // A JSON body can carry a non-string `html`; treat that as missing input
    // instead of letting `.trim()` throw and surface as a 500.
    if (typeof htmlContent !== 'string' || !htmlContent.trim()) {
      return res.status(400).json({ error: 'No HTML content provided.' });
    }

    const isEnabled = (value) => value === true || value === 'true';
    const requestedLength = Number.parseInt(body.truncateLength, 10);

    const options = {
      cleanHead: isEnabled(body.cleanHead),
      removeScripts: isEnabled(body.removeScripts),
      removeStyles: isEnabled(body.removeStyles),
      handleRepeatingElements: isEnabled(body.handleRepeatingElements),
      truncateText: isEnabled(body.truncateText),
      // NaN, zero and negative values all fall back to the default.
      truncateLength: requestedLength > 0 ? requestedLength : 100,
      minifyHtml: isEnabled(body.minifyHtml),
      removeMedia: isEnabled(body.removeMedia)
    };

    const processed = compressHtmlForLlm(htmlContent, options);
    const stats = computeStats(htmlContent, processed.html);

    return res.json({
      success: true,
      result: processed,
      stats: {
        processingTime: `${Date.now() - startTime}ms`,
        elementReduction: stats.elementReduction,
        sizeReduction: stats.sizeReduction,
        originalElements: stats.originalElements,
        remainingElements: stats.remainingElements,
        originalSize: `${stats.originalSize} chars`,
        processedSize: `${stats.processedSize} chars`
      },
      options,
      operationStatus: processed.status
    });
  } catch (err) {
    console.error('Processing failed:', err);
    return res.status(500).json({
      error: 'Internal server error.',
      details: String(err?.message ?? err).substring(0, 100)
    });
  }
});
/**
 * POST /extract — runs a caller-supplied extractor script against HTML.
 *
 * The HTML comes from the uploaded `htmlFile` when present, otherwise from
 * the `html` body field; the script comes from the `script` body field and is
 * handed to executeCheerioScript, which executes it as code in this process.
 *
 * Responds 400 when the HTML or the script is missing and 500 on an
 * unexpected error. Failures inside the script itself are reported in a
 * normal response with `success: false`.
 */
app.post('/extract', upload.single('htmlFile'), (req, res) => {
  try {
    const startTime = Date.now();
    const body = req.body ?? {};

    const htmlContent = req.file
      ? req.file.buffer.toString('utf8')
      : body.html || '';
    const extractorScript = body.script;

    // A JSON body can carry a non-string `html`; treat that as missing input
    // instead of letting `.trim()` throw and surface as a 500.
    if (typeof htmlContent !== 'string' || !htmlContent.trim()) {
      return res.status(400).json({ error: 'No HTML content provided.' });
    }
    if (!extractorScript) {
      return res.status(400).json({ error: 'No extractor script provided.' });
    }

    const result = executeCheerioScript(htmlContent, extractorScript);

    return res.json({
      success: result.success,
      data: result.data,
      error: result.error,
      processingTime: `${Date.now() - startTime}ms`
    });
  } catch (err) {
    console.error('Extraction failed:', err);
    return res.status(500).json({
      success: false,
      error: 'Internal server error.',
      details: String(err?.message ?? err).substring(0, 100)
    });
  }
});
// Listening port: the PORT environment variable when set, otherwise 3000.
const PORT = process.env.PORT || 3000;

// Logs the local URL once the server is accepting connections.
function announceServerReady() {
  console.log(`Server running on http://localhost:${PORT}`);
}

app.listen(PORT, announceServerReady);