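"""Web Content Extractor.

Fetches up to five article URLs, extracts the title, headings, and main text
with BeautifulSoup, renders each article to a PDF with FPDF, and serves the
results through a Gradio interface.

Dependencies: gradio, requests, beautifulsoup4, fpdf
"""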
import gradio as gr
import requests
from bs4 import BeautifulSoup
from fpdf import FPDF
import os
import re
from urllib.parse import urlparse
from typing import List, Tuple
import tempfile

class ArticleExtractor:
    def __init__(self):
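        # A browser-like User-Agent avoids the simplistic bot blocking some
        # sites apply to the default requests User-Agent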
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

    def clean_text(self, text: str) -> str:
        """Clean extracted text by removing extra whitespace and special characters."""
        # Remove extra whitespace and newlines
        text = re.sub(r'\s+', ' ', text).strip()
        # Remove special characters but keep basic punctuation
        text = re.sub(r'[^\w\s.,!?-]', '', text)
        return text

    def extract_content(self, url: str) -> Tuple[str, List[str], str]:
        """Extract title, headings, and main content from a webpage."""
        try:
            response = requests.get(url, headers=self.headers, timeout=10)
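            # Raise on 4xx/5xx so failed fetches are reported by the
            # except block below instead of producing an empty PDF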
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # Extract title (soup.title.string is None for an empty <title> tag)
            title = soup.title.string if soup.title and soup.title.string else "No title found"
            title = self.clean_text(title)

            # Remove non-content elements first so navigation, header, and
            # footer headings are not mistaken for article headings
            for unwanted in soup.find_all(['script', 'style', 'nav', 'header', 'footer', 'aside']):
                unwanted.decompose()

            # Extract headings
            headings = []
            for heading in soup.find_all(['h1', 'h2', 'h3']):
                heading_text = self.clean_text(heading.get_text())
                if heading_text and len(heading_text) > 5:  # Filter out very short headings
                    headings.append(heading_text)

            # Prefer a semantic <article>/<main> container, falling back to
            # common class-name patterns for the main content block
            article = soup.find('article') or soup.find('main') or soup.find('div', class_=re.compile(r'content|article|post'))
            
            if article:
                paragraphs = article.find_all('p')
            else:
                paragraphs = soup.find_all('p')

            content_parts = []
            for p in paragraphs:
                text = self.clean_text(p.get_text())
                if text and len(text) > 50:  # Filter out short paragraphs
                    content_parts.append(text)

            content = '\n\n'.join(content_parts)

            return title, headings, content

        except Exception as e:
            return f"Error: {str(e)}", [], "Failed to extract content"

    def create_pdf(self, url: str, output_dir: str) -> str:
        """Create a PDF document from extracted web content."""
        title, headings, content = self.extract_content(url)
        
        # Create PDF
        pdf = FPDF()
        pdf.add_page()
        
        # The built-in PDF core fonts only cover Latin-1, so replace any
        # unsupported characters instead of letting FPDF raise UnicodeEncodeError
        def to_latin1(text: str) -> str:
            return text.encode('latin-1', 'replace').decode('latin-1')

        # Add title
        pdf.set_font('Arial', 'B', 16)
        pdf.cell(0, 10, to_latin1(title[:80]), ln=True)  # Truncate very long titles
        pdf.ln(10)

        # Add headings
        pdf.set_font('Arial', 'B', 12)
        for heading in headings:
            pdf.multi_cell(0, 10, to_latin1(heading))
            pdf.ln(5)

        # Add body content
        pdf.set_font('Arial', '', 11)
        pdf.multi_cell(0, 10, to_latin1(content))
        
        # Generate a filename from the URL; include the path so multiple
        # articles from the same host do not overwrite each other
        parsed = urlparse(url)
        slug = re.sub(r'\W+', '_', f"{parsed.netloc}{parsed.path}").strip('_') or "page"
        filename = f"article_{slug[:60]}.pdf"
        filepath = os.path.join(output_dir, filename)
        
        # Save PDF
        pdf.output(filepath)
        return filepath

def process_urls(urls: str) -> List[str]:
    """Process multiple URLs and return paths to generated PDFs."""
    # Create temporary directory for PDFs
    temp_dir = tempfile.mkdtemp()
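    # Note: the temp directory is deliberately not cleaned up here; the PDFs
    # must outlive this call so Gradio can serve them for download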
    
    # Split on lines, drop blanks, and cap the input at 5 URLs
    url_list = [url.strip() for url in urls.splitlines() if url.strip()][:5]
    
    extractor = ArticleExtractor()
    pdf_paths = []
    
    for url in url_list:
        try:
            pdf_path = extractor.create_pdf(url, temp_dir)
            pdf_paths.append(pdf_path)
        except Exception as e:
            print(f"Error processing {url}: {str(e)}")
    
    return pdf_paths

# Create Gradio interface
def gradio_interface(urls: str) -> List[str]:
    """Gradio interface function."""
    return process_urls(urls)

# Set up the Gradio app
iface = gr.Interface(
    fn=gradio_interface,
    inputs=gr.Textbox(
        lines=5,
        placeholder="Enter up to 5 URLs (one per line)",
        label="URLs"
    ),
    outputs=gr.File(
        label="Downloaded PDFs",
        file_count="multiple"
    ),
    title="Web Content Extractor",
    description="Extract article content from web pages and download as PDFs. Enter up to 5 URLs, one per line.",
    examples=[
        ["https://example.com/article1\nhttps://example.com/article2"]
    ]
)

# Launch the app
if __name__ == "__main__":
    iface.launch()
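
# A minimal headless usage sketch (assuming the dependencies above are
# installed): call process_urls directly instead of launching the UI.
# The exact temp-dir path in the output is system-dependent.
#
#   paths = process_urls("https://example.com/article1\nhttps://example.com/article2")
#   for p in paths:
#       print(p)  # e.g. /tmp/tmpXXXXXXXX/article_example_com_article1.pdf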