import os

# Cache directories must be writable on Spaces, and HF_HOME has to be set
# before transformers is imported, otherwise the default cache location is
# already locked in when the library loads.
nltk_data_dir = "/tmp/nltk_data"
hf_cache_dir = "/tmp/huggingface"
os.makedirs(nltk_data_dir, exist_ok=True)
os.makedirs(hf_cache_dir, exist_ok=True)
os.environ["NLTK_DATA"] = nltk_data_dir
os.environ["HF_HOME"] = hf_cache_dir

import string

import requests
from collections import Counter

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

# NLTK resources for tokenization and stopword filtering
nltk.download('punkt', download_dir=nltk_data_dir)
nltk.download('stopwords', download_dir=nltk_data_dir)
nltk.download('averaged_perceptron_tagger', download_dir=nltk_data_dir)
nltk.download('punkt_tab', download_dir=nltk_data_dir)
# 1. Fetch news articles via NewsAPI
def get_news(query, api_key, num_articles=5):
    url = 'https://newsapi.org/v2/everything'
    # Pass the query through params so requests URL-encodes it correctly
    params = {'q': query, 'apiKey': api_key, 'language': 'en', 'pageSize': num_articles}
    response = requests.get(url, params=params, timeout=10)
    if response.status_code == 200:
        return response.json()['articles']
    return []
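# Illustrative call (the key below is a placeholder; NewsAPI /v2/everything
# articles carry fields such as 'source', 'author', 'title', 'description',
# 'url', 'urlToImage', 'publishedAt', 'content'):
#   articles = get_news("climate", api_key="YOUR_NEWSAPI_KEY", num_articles=2)
#   articles[0]['title'] -> headline string of the first match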
# 2. Sentiment analysis with Hugging Face
tone_analyzer = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english", revision="714eb0f")

def analyze_sentiment(text):
    # truncation=True keeps long inputs within the model's 512-token limit
    return tone_analyzer(text, truncation=True)[0]
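# Illustrative call (the pipeline returns a label and a confidence score;
# the exact score value here is made up):
#   analyze_sentiment("The product launch was a huge success")
#   -> {'label': 'POSITIVE', 'score': 0.99}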
# 3. Topic classification. cardiffnlp/tweet-topic-21-multi is a multi-label
# topic model with its own label set, so label names are read from the model
# config to guarantee they match the checkpoint's classes.
category_model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/tweet-topic-21-multi")
category_tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/tweet-topic-21-multi")

def classify_category(text):
    inputs = category_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = category_model(**inputs)
    # Take the single highest-scoring topic
    predicted_class = torch.argmax(outputs.logits, dim=1).item()
    return category_model.config.id2label[predicted_class]
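# Illustrative call (label names come from the checkpoint's config, e.g.
# 'sports'; the exact label set depends on the model version):
#   classify_category("The team clinched the title in the final minutes")
#   -> 'sports'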
# 4. Summarization
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def split_text(text, max_tokens=512):
    # Chunk by whitespace-separated words as a rough proxy for model tokens,
    # keeping each chunk comfortably under BART's 1024-token input limit
    words = text.split()
    return [' '.join(words[i:i+max_tokens]) for i in range(0, len(words), max_tokens)]

def summarize_text(text):
    chunks = split_text(text)
    summaries = [summarizer(chunk, max_length=100, min_length=30, do_sample=False)[0]['summary_text']
                 for chunk in chunks]
    return ' '.join(summaries)
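# Illustrative chunking behavior (split_text counts whitespace words, not
# real subword tokens):
#   split_text("one two three", max_tokens=2) -> ['one two', 'three']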
# 5. Extract trending words
def extract_trending_words(texts):
    text = ' '.join(texts).lower()
    words = word_tokenize(text)
    # Build the stopword set once instead of re-reading it for every word
    stop_words = set(stopwords.words('english'))
    words = [word for word in words
             if word not in stop_words and word not in string.punctuation and len(word) > 1]
    word_freq = Counter(words)
    return word_freq.most_common(10)
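# Illustrative call (counts depend on NLTK's English stopword list):
#   extract_trending_words(["AI is booming", "AI funding is booming"])
#   -> [('ai', 2), ('booming', 2), ('funding', 1)]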
# 6. Main news-analysis pipeline
def analyze_news(query, api_key, num_articles=5):
    articles = get_news(query, api_key, num_articles)
    if not articles:
        return []
    news_results = []
    for article in articles:
        title = article.get('title', 'No Title')
        description = article.get('description', '') or ''
        url = article.get('url', '#')
        text = title + " " + description
        sentiment = analyze_sentiment(text)['label']
        category = classify_category(text)
        summary = summarize_text(text)
        news_results.append({
            "title": title,
            "url": url,
            "sentiment": sentiment,
            "category": category,
            "summary": summary
        })
    return news_results
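# 7. Example entry point (a minimal sketch, not part of the pipeline above:
# NEWS_API_KEY is an assumed environment variable, and this block also
# exercises extract_trending_words, which analyze_news never calls itself)
if __name__ == "__main__":
    api_key = os.environ.get("NEWS_API_KEY", "")
    results = analyze_news("artificial intelligence", api_key, num_articles=3)
    for item in results:
        print(f"[{item['sentiment']} | {item['category']}] {item['title']}")
        print(f"  {item['summary']}")
        print(f"  {item['url']}")
    trending = extract_trending_words([item['summary'] for item in results])
    print("Trending words:", trending)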