Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,5 +1,5 @@
|
|
1 |
import gradio as gr
|
2 |
-
from transformers import pipeline,
|
3 |
import feedparser
|
4 |
from datetime import datetime, timedelta
|
5 |
import json
|
@@ -20,6 +20,30 @@ logging.basicConfig(
|
|
20 |
format='%(asctime)s - %(levelname)s - %(message)s'
|
21 |
)
|
22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
# Language codes and their corresponding MarianMT model names
|
24 |
LANGUAGE_CODES = {
|
25 |
"English": {"code": "en", "model": None}, # No translation needed for English
|
@@ -35,8 +59,6 @@ LANGUAGE_CODES = {
|
|
35 |
"Arabic": {"code": "ar", "model": "Helsinki-NLP/opus-mt-en-ar"}
|
36 |
}
|
37 |
|
38 |
-
# [Previous NEWS_SOURCES definition remains the same...]
|
39 |
-
|
40 |
# Initialize global variables
|
41 |
summarizer = None
|
42 |
translators = {}
|
@@ -49,11 +71,8 @@ class NewsCache:
|
|
49 |
|
50 |
def store_summary(self, content_hash, summary, language=None):
|
51 |
cache_key = f"{content_hash}_{language}" if language else content_hash
|
52 |
-
|
53 |
if len(self.summaries) >= self.max_cache_size:
|
54 |
-
# Remove oldest entry if cache is full
|
55 |
self.summaries.pop(next(iter(self.summaries)))
|
56 |
-
|
57 |
self.summaries[cache_key] = summary
|
58 |
|
59 |
def get_summary(self, content_hash, language=None):
|
@@ -62,6 +81,44 @@ class NewsCache:
|
|
62 |
|
63 |
news_cache = NewsCache()
|
64 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
65 |
def initialize_models():
|
66 |
"""Initialize the summarization and translation models"""
|
67 |
global summarizer, translators
|
@@ -78,7 +135,7 @@ def initialize_models():
|
|
78 |
for lang, info in LANGUAGE_CODES.items():
|
79 |
if info["model"]: # Skip English as it doesn't need translation
|
80 |
try:
|
81 |
-
model =
|
82 |
tokenizer = AutoTokenizer.from_pretrained(info["model"])
|
83 |
translators[lang] = (model, tokenizer)
|
84 |
logging.info(f"Initialized translator for {lang}")
|
@@ -258,7 +315,6 @@ def get_personalized_summary(name, progress=gr.Progress()):
|
|
258 |
progress(1.0, desc="Done!")
|
259 |
return "\n".join(summaries)
|
260 |
|
261 |
-
|
262 |
# Gradio interface
|
263 |
with gr.Blocks(title="Enhanced News Summarizer") as demo:
|
264 |
gr.Markdown("# 📰 Enhanced AI News Summarizer")
|
@@ -318,7 +374,7 @@ with gr.Blocks(title="Enhanced News Summarizer") as demo:
|
|
318 |
)
|
319 |
|
320 |
if __name__ == "__main__":
|
321 |
-
if
|
322 |
demo.launch()
|
323 |
else:
|
324 |
print("Failed to initialize summarizer. Please check the logs.")
|
|
|
1 |
import gradio as gr
|
2 |
+
from transformers import pipeline, MarianMTModel, AutoTokenizer
|
3 |
import feedparser
|
4 |
from datetime import datetime, timedelta
|
5 |
import json
|
|
|
20 |
format='%(asctime)s - %(levelname)s - %(message)s'
|
21 |
)
|
22 |
|
23 |
+
# News sources and their RSS feeds
|
24 |
+
NEWS_SOURCES = {
|
25 |
+
"Technology": {
|
26 |
+
"TechCrunch": "https://techcrunch.com/feed/",
|
27 |
+
"Wired": "https://www.wired.com/feed/rss",
|
28 |
+
"The Verge": "https://www.theverge.com/rss/index.xml"
|
29 |
+
},
|
30 |
+
"Business": {
|
31 |
+
"Financial Times": "https://www.ft.com/rss/home",
|
32 |
+
"Business Insider": "https://www.businessinsider.com/rss",
|
33 |
+
"Forbes": "https://www.forbes.com/real-time/feed2/"
|
34 |
+
},
|
35 |
+
"Science": {
|
36 |
+
"Science Daily": "https://www.sciencedaily.com/rss/all.xml",
|
37 |
+
"Nature": "http://feeds.nature.com/nature/rss/current",
|
38 |
+
"Scientific American": "http://rss.sciam.com/ScientificAmerican-Global"
|
39 |
+
},
|
40 |
+
"World News": {
|
41 |
+
"Reuters": "http://feeds.reuters.com/reuters/topNews",
|
42 |
+
"BBC": "http://feeds.bbci.co.uk/news/world/rss.xml",
|
43 |
+
"CNN": "http://rss.cnn.com/rss/edition_world.rss"
|
44 |
+
}
|
45 |
+
}
|
46 |
+
|
47 |
# Language codes and their corresponding MarianMT model names
|
48 |
LANGUAGE_CODES = {
|
49 |
"English": {"code": "en", "model": None}, # No translation needed for English
|
|
|
59 |
"Arabic": {"code": "ar", "model": "Helsinki-NLP/opus-mt-en-ar"}
|
60 |
}
|
61 |
|
|
|
|
|
62 |
# Initialize global variables
|
63 |
summarizer = None
|
64 |
translators = {}
|
|
|
71 |
|
72 |
def store_summary(self, content_hash, summary, language=None):
    """Store ``summary`` in the cache, evicting the oldest entry when full.

    Args:
        content_hash: Hash string identifying the summarized content.
        summary: The summary text to cache.
        language: Optional language name. When truthy, the cache key is
            ``"<content_hash>_<language>"``; otherwise it is ``content_hash``.
    """
    cache_key = f"{content_hash}_{language}" if language else content_hash
    # Overwriting an existing key does not grow the cache, so only evict
    # when a genuinely new key is about to be inserted into a full cache.
    is_new_key = cache_key not in self.summaries
    if is_new_key and len(self.summaries) >= self.max_cache_size:
        # assumes self.summaries is a plain dict (insertion-ordered), so the
        # first key yielded is the oldest entry - TODO confirm in __init__
        self.summaries.pop(next(iter(self.summaries)))
    self.summaries[cache_key] = summary
|
77 |
|
78 |
def get_summary(self, content_hash, language=None):
|
|
|
81 |
|
82 |
news_cache = NewsCache()
|
83 |
|
84 |
+
def get_content_hash(content):
    """Return the hexadecimal MD5 digest of ``content``.

    ``content`` is encoded with the default ``str.encode()`` codec (UTF-8)
    before hashing.
    """
    encoded = content.encode()
    digest = hashlib.md5(encoded)
    return digest.hexdigest()
|
87 |
+
|
88 |
+
def parse_date(date_str):
    """Parse an RFC 2822 date string into a timezone-aware UTC datetime.

    Args:
        date_str: Date text such as ``"Mon, 01 Jan 2024 10:00:00 +0500"``.

    Returns:
        An aware ``datetime`` in UTC, or ``None`` when ``date_str`` is
        missing, not a string, or cannot be parsed.
    """
    # Local import: the file-level import list is not guaranteed to expose
    # ``timezone``, and this keeps the function self-contained.
    from datetime import timezone

    if not isinstance(date_str, str) or not date_str.strip():
        return None

    try:
        parsed = parsedate_to_datetime(date_str)
    except (TypeError, ValueError, IndexError):
        # Malformed dates raise TypeError or ValueError depending on the
        # Python version; treat all of them as "no usable date".
        return None

    if parsed.tzinfo is None:
        # No usable offset in the input: assume the timestamp is already UTC.
        return parsed.replace(tzinfo=timezone.utc)
    # Convert instead of relabelling so the instant in time is preserved
    # (10:00 +0500 is 05:00 UTC, not 10:00 UTC).
    return parsed.astimezone(timezone.utc)
|
94 |
+
|
95 |
+
def fetch_news_from_rss(categories, max_age_hours=8):
    """Fetch recent articles from the RSS feeds of the requested categories.

    Args:
        categories: Iterable of category names. Names that are not keys of
            ``NEWS_SOURCES`` are ignored; ``None`` is treated as empty.
        max_age_hours: Keep only entries published within this many hours
            before now (UTC). Defaults to 8, the previously hard-coded window.

    Returns:
        A list of dicts with the keys ``title``, ``description`` (text
        extracted from the entry's HTML description), ``link``,
        ``published`` (the raw string from the feed), ``category`` and
        ``source``.
    """
    articles = []
    cutoff_time = datetime.now(pytz.UTC) - timedelta(hours=max_age_hours)

    for category in categories or ():
        # Unknown categories yield an empty mapping and are skipped.
        for source, feed_url in NEWS_SOURCES.get(category, {}).items():
            try:
                feed = feedparser.parse(feed_url)
                for entry in feed.entries:
                    published = parse_date(entry.get('published'))
                    # Skip entries with no parseable date or older than the cutoff.
                    if not published or published <= cutoff_time:
                        continue
                    articles.append({
                        'title': entry.get('title', ''),
                        'description': BeautifulSoup(entry.get('description', ''), 'html.parser').get_text(),
                        'link': entry.get('link', ''),
                        'published': entry.get('published', ''),
                        'category': category,
                        'source': source
                    })
            except Exception as e:
                # Best effort: one failing feed must not abort the others.
                logging.error(f"Error fetching from {feed_url}: {e}")

    return articles
|
121 |
+
|
122 |
def initialize_models():
|
123 |
"""Initialize the summarization and translation models"""
|
124 |
global summarizer, translators
|
|
|
135 |
for lang, info in LANGUAGE_CODES.items():
|
136 |
if info["model"]: # Skip English as it doesn't need translation
|
137 |
try:
|
138 |
+
model = MarianMTModel.from_pretrained(info["model"])
|
139 |
tokenizer = AutoTokenizer.from_pretrained(info["model"])
|
140 |
translators[lang] = (model, tokenizer)
|
141 |
logging.info(f"Initialized translator for {lang}")
|
|
|
315 |
progress(1.0, desc="Done!")
|
316 |
return "\n".join(summaries)
|
317 |
|
|
|
318 |
# Gradio interface
|
319 |
with gr.Blocks(title="Enhanced News Summarizer") as demo:
|
320 |
gr.Markdown("# 📰 Enhanced AI News Summarizer")
|
|
|
374 |
)
|
375 |
|
376 |
if __name__ == "__main__":
|
377 |
+
if initialize_models():
|
378 |
demo.launch()
|
379 |
else:
|
380 |
print("Failed to initialize summarizer. Please check the logs.")
|