import gradio as gr
import requests
from bs4 import BeautifulSoup
from fpdf import FPDF
import os
import re
from urllib.parse import urlparse
from typing import List, Tuple
import tempfile
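# Third-party dependencies (pip package names are an assumption based on the
# imports above): gradio, requests, beautifulsoup4, fpdf.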


class ArticleExtractor:
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

    def clean_text(self, text: str) -> str:
        """Clean extracted text by removing extra whitespace and special characters."""
        # Remove extra whitespace and newlines
        text = re.sub(r'\s+', ' ', text).strip()
        # Remove special characters but keep basic punctuation
        text = re.sub(r'[^\w\s.,!?-]', '', text)
        return text

    def extract_content(self, url: str) -> Tuple[str, List[str], str]:
        """Extract title, headings, and main content from a webpage."""
        try:
            response = requests.get(url, headers=self.headers, timeout=10)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # Extract title (guard against a missing or empty <title> tag)
            title = soup.title.string if soup.title and soup.title.string else "No title found"
            title = self.clean_text(title)

            # Extract headings
            headings = []
            for heading in soup.find_all(['h1', 'h2', 'h3']):
                heading_text = self.clean_text(heading.get_text())
                if heading_text and len(heading_text) > 5:  # Filter out very short headings
                    headings.append(heading_text)

            # Extract main content (paragraphs)
            # Remove unwanted elements
            for unwanted in soup.find_all(['script', 'style', 'nav', 'header', 'footer', 'aside']):
                unwanted.decompose()

            # Find article content or main content
            content = ""
            article = soup.find('article') or soup.find('main') or soup.find('div', class_=re.compile(r'content|article|post'))
            if article:
                paragraphs = article.find_all('p')
            else:
                paragraphs = soup.find_all('p')

            content_parts = []
            for p in paragraphs:
                text = self.clean_text(p.get_text())
                if text and len(text) > 50:  # Filter out short paragraphs
                    content_parts.append(text)

            content = '\n\n'.join(content_parts)
            return title, headings, content
        except Exception as e:
            return f"Error: {str(e)}", [], "Failed to extract content"

    def create_pdf(self, url: str, output_dir: str) -> str:
        """Create a PDF document from extracted web content."""
        title, headings, content = self.extract_content(url)

        # Create PDF
        pdf = FPDF()
        pdf.add_page()

        # Set up fonts
        pdf.set_font('Arial', 'B', 16)

        # Add title
        pdf.cell(0, 10, title[:80], ln=True)  # Truncate very long titles
        pdf.ln(10)

        # Add headings
        pdf.set_font('Arial', 'B', 12)
        for heading in headings:
            pdf.multi_cell(0, 10, heading)
            pdf.ln(5)

        # Add content
        pdf.set_font('Arial', '', 11)
        pdf.multi_cell(0, 10, content)

        # Generate filename from URL
        filename = f"article_{urlparse(url).netloc.replace('.', '_')}.pdf"
        filepath = os.path.join(output_dir, filename)

        # Save PDF
        pdf.output(filepath)
        return filepath


def process_urls(urls: str) -> List[str]:
    """Process multiple URLs and return paths to generated PDFs."""
    # Create temporary directory for PDFs
    temp_dir = tempfile.mkdtemp()

    # Split and clean URLs
    url_list = [url.strip() for url in urls.split('\n') if url.strip()]

    # Limit to 5 URLs
    url_list = url_list[:5]

    extractor = ArticleExtractor()
    pdf_paths = []

    for url in url_list:
        try:
            pdf_path = extractor.create_pdf(url, temp_dir)
            pdf_paths.append(pdf_path)
        except Exception as e:
            print(f"Error processing {url}: {str(e)}")

    return pdf_paths


# Create Gradio interface
def gradio_interface(urls: str) -> List[str]:
    """Gradio interface function."""
    return process_urls(urls)


# Set up the Gradio app
iface = gr.Interface(
    fn=gradio_interface,
    inputs=gr.Textbox(
        lines=5,
        placeholder="Enter up to 5 URLs (one per line)",
        label="URLs"
    ),
    outputs=gr.File(
        label="Downloaded PDFs",
        file_count="multiple"
    ),
    title="Web Content Extractor",
    description="Extract article content from web pages and download as PDFs. Enter up to 5 URLs, one per line.",
    examples=[
        ["https://example.com/article1\nhttps://example.com/article2"]
    ]
)

# Launch the app
if __name__ == "__main__":
    iface.launch()
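
# Usage sketch (not part of the Space UI): process_urls can also be driven
# directly from Python; it takes a newline-separated string of URLs and returns
# the paths of the generated PDFs in a temporary directory. The URLs below are
# the same placeholders used in the interface examples, not tested pages.
#
#     paths = process_urls("https://example.com/article1\nhttps://example.com/article2")
#     print(paths)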