import gradio as gr
from transformers import pipeline, MarianMTModel, AutoTokenizer
import feedparser
from datetime import datetime, timedelta
import json
import os
import logging
import hashlib
import time
from email.utils import parsedate_to_datetime
import pytz
from bs4 import BeautifulSoup
# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
# News sources and their RSS feeds
NEWS_SOURCES = {
    "Technology": {
        "TechCrunch": "https://techcrunch.com/feed/",
        "Wired": "https://www.wired.com/feed/rss",
        "The Verge": "https://www.theverge.com/rss/index.xml"
    },
    "Business": {
        "Financial Times": "https://www.ft.com/rss/home",
        "Business Insider": "https://www.businessinsider.com/rss",
        "Forbes": "https://www.forbes.com/real-time/feed2/"
    },
    "Science": {
        "Science Daily": "https://www.sciencedaily.com/rss/all.xml",
        "Nature": "http://feeds.nature.com/nature/rss/current",
        "Scientific American": "http://rss.sciam.com/ScientificAmerican-Global"
    },
    "World News": {
        "Reuters": "http://feeds.reuters.com/reuters/topNews",
        "BBC": "http://feeds.bbci.co.uk/news/world/rss.xml",
        "CNN": "http://rss.cnn.com/rss/edition_world.rss"
    }
}
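# Note: RSS feed availability changes over time; feeds.reuters.com, for example, appears to have
# been retired. Feeds that cannot be fetched simply contribute no articles (see fetch_news_from_rss).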
# Language codes and their corresponding MarianMT model names
LANGUAGE_CODES = {
    "English": {"code": "en", "model": None},  # No translation needed for English
    "Spanish": {"code": "es", "model": "Helsinki-NLP/opus-mt-en-es"},
    "French": {"code": "fr", "model": "Helsinki-NLP/opus-mt-en-fr"},
    "German": {"code": "de", "model": "Helsinki-NLP/opus-mt-en-de"},
    "Italian": {"code": "it", "model": "Helsinki-NLP/opus-mt-en-it"},
    "Portuguese": {"code": "pt", "model": "Helsinki-NLP/opus-mt-en-pt"},
    "Dutch": {"code": "nl", "model": "Helsinki-NLP/opus-mt-en-nl"},
    "Russian": {"code": "ru", "model": "Helsinki-NLP/opus-mt-en-ru"},
    "Chinese": {"code": "zh", "model": "Helsinki-NLP/opus-mt-en-zh"},
    "Japanese": {"code": "ja", "model": "Helsinki-NLP/opus-mt-en-jap"},
    "Arabic": {"code": "ar", "model": "Helsinki-NLP/opus-mt-en-ar"}
}
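# Each translation model is an OPUS-MT checkpoint downloaded from the Hugging Face Hub on the first
# from_pretrained() call; the MarianMT tokenizers additionally require the sentencepiece package.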
# Initialize global variables
summarizer = None
translators = {}

class NewsCache:
    def __init__(self):
        self.summaries = {}
        self.translations = {}
        self.max_cache_size = 1000

    def store_summary(self, content_hash, summary, language=None):
        cache_key = f"{content_hash}_{language}" if language else content_hash
        if len(self.summaries) >= self.max_cache_size:
            self.summaries.pop(next(iter(self.summaries)))
        self.summaries[cache_key] = summary

    def get_summary(self, content_hash, language=None):
        cache_key = f"{content_hash}_{language}" if language else content_hash
        return self.summaries.get(cache_key)

news_cache = NewsCache()
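# Summaries are cached under an MD5 hash of the article text plus the target language; when the
# cache is full, the oldest entry is evicted first (dict insertion order gives simple FIFO behaviour).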
def get_content_hash(content):
    """Generate a hash for the content"""
    return hashlib.md5(content.encode()).hexdigest()

def parse_date(date_str):
    """Parse an RFC 2822 date string into a UTC datetime object"""
    if not date_str:
        return None
    try:
        parsed = parsedate_to_datetime(date_str)
        # Convert timezone-aware datetimes to UTC; assume UTC for naive ones
        if parsed.tzinfo is None:
            return parsed.replace(tzinfo=pytz.UTC)
        return parsed.astimezone(pytz.UTC)
    except (TypeError, ValueError):
        return None
def fetch_news_from_rss(categories):
    """Fetch news from RSS feeds based on user interests"""
    articles = []
    cutoff_time = datetime.now(pytz.UTC) - timedelta(hours=8)
    for category in categories:
        if category in NEWS_SOURCES:
            for source, feed_url in NEWS_SOURCES[category].items():
                try:
                    feed = feedparser.parse(feed_url)
                    for entry in feed.entries:
                        published = parse_date(entry.get('published'))
                        if published and published > cutoff_time:
                            articles.append({
                                'title': entry.get('title', ''),
                                'description': BeautifulSoup(entry.get('description', ''), 'html.parser').get_text(),
                                'link': entry.get('link', ''),
                                'published': entry.get('published', ''),
                                'category': category,
                                'source': source
                            })
                except Exception as e:
                    logging.error(f"Error fetching from {feed_url}: {e}")
                    continue
    return articles
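# Note: feedparser.parse() rarely raises on a broken feed; it usually returns a result with an empty
# entries list and sets feed.bozo on parse errors, so checking feed.bozo above would be a reasonable
# way to surface dead sources explicitly rather than silently returning no articles.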
def initialize_models():
    """Initialize the summarization and translation models"""
    global summarizer, translators
    try:
        # Initialize summarizer
        summarizer = pipeline(
            "summarization",
            model="facebook/bart-large-cnn",
            device=-1  # Use CPU
        )
        # Initialize translators for each language
        for lang, info in LANGUAGE_CODES.items():
            if info["model"]:  # Skip English as it doesn't need translation
                try:
                    model = MarianMTModel.from_pretrained(info["model"])
                    tokenizer = AutoTokenizer.from_pretrained(info["model"])
                    translators[lang] = (model, tokenizer)
                    logging.info(f"Initialized translator for {lang}")
                except Exception as e:
                    logging.error(f"Error initializing translator for {lang}: {e}")
        return True
    except Exception as e:
        logging.error(f"Error initializing models: {e}")
        return False
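# Loading every translation model up front keeps all the OPUS-MT checkpoints (several hundred MB
# each) in memory even if the user only reads English summaries. Below is a minimal sketch of an
# optional lazy-loading alternative; get_translator() is a hypothetical helper that is not wired
# into the rest of the app, and translate_text() would need to call it instead of reading the
# translators dict directly.
def get_translator(language):
    """Load and cache the MarianMT model/tokenizer for a language on first use (sketch)."""
    if language in translators:
        return translators[language]
    info = LANGUAGE_CODES.get(language)
    if not info or not info["model"]:
        return None  # English or unknown language: no translation model needed
    try:
        model = MarianMTModel.from_pretrained(info["model"])
        tokenizer = AutoTokenizer.from_pretrained(info["model"])
        translators[language] = (model, tokenizer)
        return translators[language]
    except Exception as e:
        logging.error(f"Error lazily loading translator for {language}: {e}")
        return None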
def translate_text(text, target_language):
    """Translate text to the target language"""
    if target_language == "English" or not text:
        return text
    try:
        if target_language not in translators:
            logging.error(f"Translator not found for {target_language}")
            return text
        model, tokenizer = translators[target_language]
        # Split long text into rough chunks (by characters, not tokens); the tokenizer still
        # truncates each chunk to the model's 512-token limit
        max_length = 512
        chunks = [text[i:i + max_length] for i in range(0, len(text), max_length)]
        translated_chunks = []
        for chunk in chunks:
            inputs = tokenizer(chunk, return_tensors="pt", truncation=True, max_length=512)
            translated = model.generate(**inputs)
            translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)
            translated_chunks.append(translated_text)
        return " ".join(translated_chunks)
    except Exception as e:
        logging.error(f"Translation error: {e}")
        return text
def generate_summary(text, title="", category="", language="English"):
    """Generate a summary with translation support"""
    if not summarizer:
        if not initialize_models():
            return None
    try:
        # Check cache first
        content_hash = get_content_hash(text)
        cached_summary = news_cache.get_summary(content_hash, language)
        if cached_summary:
            return cached_summary
        # Generate the English summary first. bart-large-cnn is not instruction-tuned, so this
        # template mainly adds context around the (truncated) article text.
        prompted_text = f"""
        Analyze and summarize this {category} news article titled "{title}".
        Focus on providing:
        1. Main story/announcement/finding
        2. Key details and implications
        3. Relevant context or background
        4. Any significant numbers, statistics, or quotes
        5. Future implications or next steps (if mentioned)
        Article text:
        {text[:1024]}
        Please provide a clear, concise summary that a general audience can understand:"""
        result = summarizer(prompted_text,
                            max_length=200,
                            min_length=50,
                            do_sample=False,
                            truncation=True)
        if result and len(result) > 0:
            summary = result[0]['summary_text']
            # Post-process summary
            summary = summary.replace(" .", ".").replace(" ,", ",")
            sentences = summary.split(". ")
            formatted_summary = "\n• " + "\n• ".join(filter(None, sentences))
            # Translate if needed
            if language != "English":
                formatted_summary = translate_text(formatted_summary, language)
            news_cache.store_summary(content_hash, formatted_summary, language)
            return formatted_summary
        return None
    except Exception as e:
        logging.error(f"Summarization error: {e}")
        return None
def get_personalized_summary(name, progress=gr.Progress()):
    """Generate a personalized news summary in the user's preferred language"""
    start_time = time.time()
    logging.info(f"Starting summary generation for user: {name}")
    if not name:
        return "Please enter your name!"
    try:
        with open(f"user_preferences/preferences_{name}.json", "r") as f:
            preferences = json.load(f)
    except FileNotFoundError:
        return "Please set your preferences first!"
    except Exception as e:
        return f"Error loading preferences: {e}"
    user_language = preferences.get("language", "English")
    # Fetch articles with progress
    progress(0.2, desc="Fetching recent news...")
    articles = fetch_news_from_rss(preferences["interests"])
    if not articles:
        return translate_text("No recent news articles found from the last 8 hours. Please try again later.", user_language)
    # Process articles with a time limit
    progress(0.4, desc="Analyzing and summarizing...")
    summaries = []
    total_articles = len(articles)
    max_processing_time = 60  # seconds
    for i, article in enumerate(articles):
        if time.time() - start_time > max_processing_time:
            logging.warning("Processing time exceeded maximum limit")
            break
        try:
            # Scale per-article progress into the remaining 0.4-1.0 range
            progress(0.4 + (i + 1) / total_articles * 0.6)
            title = article['title']
            content = article['description']
            category = article['category']
            link = article['link']
            published = parse_date(article['published'])
            published_str = published.strftime('%Y-%m-%d %H:%M UTC') if published else 'Recently'
            if not content:
                continue
            summary = generate_summary(content, title, category, user_language)
            if not summary:
                continue
            # Translate title and category if needed
            if user_language != "English":
                title = translate_text(title, user_language)
                category = translate_text(category, user_language)
                published_str = translate_text(published_str, user_language)
            formatted_summary = f"""
📰 {title}
📂 {translate_text("Category", user_language)}: {category}
⏰ {translate_text("Published", user_language)}: {published_str}
{summary}
🔗 {translate_text("Read more", user_language)}: {link}
---"""
            summaries.append(formatted_summary)
        except Exception as e:
            logging.error(f"Error processing article: {e}")
            continue
    if not summaries:
        return translate_text("Unable to generate summaries for recent news. Please try again.", user_language)
    progress(1.0, desc="Done!")
    return "\n".join(summaries)
# Gradio interface
with gr.Blocks(title="Enhanced News Summarizer") as demo:
    gr.Markdown("# 📰 Enhanced AI News Summarizer")
    with gr.Tab("Set Preferences"):
        name_input = gr.Textbox(label="Your Name")
        language_dropdown = gr.Dropdown(
            choices=list(LANGUAGE_CODES.keys()),
            label="Preferred Language",
            value="English"
        )
        interests_checkboxes = gr.CheckboxGroup(
            choices=list(NEWS_SOURCES.keys()),
            label="News Interests (Select multiple)"
        )
        save_button = gr.Button("Save Preferences")
        preferences_output = gr.Textbox(label="Status")

        def save_preferences(name, language, interests):
            if not name or not language or not interests:
                return "Please fill in all required fields!"
            preferences = {
                "name": name,
                "language": language,
                "interests": interests,
                "last_updated": datetime.now().isoformat()
            }
            try:
                os.makedirs('user_preferences', exist_ok=True)
                with open(f"user_preferences/preferences_{name}.json", "w") as f:
                    json.dump(preferences, f)
                return f"Preferences saved for {name}!"
            except Exception as e:
                logging.error(f"Error saving preferences: {e}")
                return f"Error saving preferences: {e}"

        save_button.click(
            save_preferences,
            inputs=[name_input, language_dropdown, interests_checkboxes],
            outputs=[preferences_output]
        )
    with gr.Tab("Get News Summary"):
        name_check = gr.Textbox(label="Enter your name to get summary")
        get_summary_button = gr.Button("Get Summary")
        summary_output = gr.Textbox(
            label="Your Personalized News Summary",
            lines=20
        )
        get_summary_button.click(
            get_personalized_summary,
            inputs=[name_check],
            outputs=[summary_output]
        )
if __name__ == "__main__":
    if initialize_models():
        demo.launch()
    else:
        print("Failed to initialize models. Please check the logs.")