# news-analyzer / main.py
import os

# Cache directories must be configured BEFORE importing nltk/transformers:
# both libraries read NLTK_DATA / HF_HOME at import time, so setting the
# environment variables afterwards has no effect on where data is cached.
nltk_data_dir = "/tmp/nltk_data"
hf_cache_dir = "/tmp/huggingface"
os.makedirs(nltk_data_dir, exist_ok=True)
os.makedirs(hf_cache_dir, exist_ok=True)
os.environ["NLTK_DATA"] = nltk_data_dir
os.environ["HF_HOME"] = hf_cache_dir

import string
from collections import Counter

import requests
import torch
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer

# NLTK resources: punkt/punkt_tab for tokenization, stopwords for filtering.
# (averaged_perceptron_tagger is downloaded but not used below.)
nltk.download('punkt', download_dir=nltk_data_dir)
nltk.download('stopwords', download_dir=nltk_data_dir)
nltk.download('averaged_perceptron_tagger', download_dir=nltk_data_dir)
nltk.download('punkt_tab', download_dir=nltk_data_dir)
# 1. Fetch articles from NewsAPI
def get_news(query, api_key, num_articles=5):
    # Pass the query via `params` so requests URL-encodes it properly
    # (spaces, quotes, and other special characters in the search term).
    url = 'https://newsapi.org/v2/everything'
    params = {
        'q': query,
        'apiKey': api_key,
        'language': 'en',
        'pageSize': num_articles,
    }
    response = requests.get(url, params=params, timeout=10)
    if response.status_code == 200:
        return response.json()['articles']
    return []
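
# Each element of the returned list is a NewsAPI article dict; only the
# fields consumed below are relevant here. A sketch of the shape, limited
# to the keys this script actually reads:
#   {'title': '...', 'description': '...', 'url': 'https://...'}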
# 2. Sentiment analysis with Hugging Face
tone_analyzer = pipeline("sentiment-analysis",
                         model="distilbert-base-uncased-finetuned-sst-2-english",
                         revision="714eb0f")

def analyze_sentiment(text):
    # truncation=True keeps long inputs within the model's 512-token limit.
    return tone_analyzer(text, truncation=True)[0]
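
# Sketch of the pipeline's output, assuming the standard SST-2 label set:
#   analyze_sentiment("Markets rallied today")
#   -> {'label': 'POSITIVE', 'score': 0.99...}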
# 3. Topic classification
category_model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/tweet-topic-21-multi")
category_tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/tweet-topic-21-multi")

def classify_category(text):
    inputs = category_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():  # inference only; no gradients needed
        outputs = category_model(**inputs)
    predicted_class = torch.argmax(outputs.logits, dim=1).item()
    # Use the checkpoint's own id2label mapping so the returned label always
    # matches the model's classes. (The model is trained multi-label; argmax
    # simply picks the single highest-scoring topic.)
    return category_model.config.id2label[predicted_class]
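
# Sketch: the returned string comes from the checkpoint's id2label mapping,
# e.g. 'sports' or 'science_&_technology' for tweet-topic-21-multi.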
# 4. Summarization
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def split_text(text, max_tokens=512):
    # Rough chunking on whitespace-separated words, used as a cheap proxy for
    # model tokens to stay well under BART's 1024-token input limit.
    words = text.split()
    return [' '.join(words[i:i+max_tokens]) for i in range(0, len(words), max_tokens)]

def summarize_text(text):
    chunks = split_text(text)
    summaries = [summarizer(chunk, max_length=100, min_length=30, do_sample=False)[0]['summary_text']
                 for chunk in chunks]
    return ' '.join(summaries)
# 5. Trending-word extraction
def extract_trending_words(texts):
    text = ' '.join(texts).lower()
    words = word_tokenize(text)
    # Build the stopword set once instead of calling stopwords.words() per token.
    stop_words = set(stopwords.words('english'))
    words = [word for word in words
             if word not in stop_words and word not in string.punctuation and len(word) > 1]
    word_freq = Counter(words)
    return word_freq.most_common(10)
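
# Sketch of the output: Counter.most_common(10) yields (word, count) pairs,
# e.g. [('ai', 12), ('model', 9), ...] (counts here are illustrative).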
# 6. Main news-analysis pipeline
def analyze_news(query, api_key, num_articles=5):
    articles = get_news(query, api_key, num_articles)
    if not articles:
        return []
    news_results = []
    for article in articles:
        # NewsAPI can return explicit nulls, so `or` guards against None values
        # as well as missing keys.
        title = article.get('title') or 'No Title'
        description = article.get('description') or ''
        url = article.get('url') or '#'
        text = f"{title} {description}"
        sentiment = analyze_sentiment(text)['label']
        category = classify_category(text)
        summary = summarize_text(text)
        news_results.append({
            "title": title,
            "url": url,
            "sentiment": sentiment,
            "category": category,
            "summary": summary,
        })
    return news_results
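
# Minimal usage sketch. NEWSAPI_KEY is assumed to be supplied via the
# environment; "YOUR_NEWSAPI_KEY" is a placeholder, not a real credential.
# extract_trending_words is never called inside analyze_news, so it is wired
# in here over the fetched summaries to show how the pieces fit together.
if __name__ == "__main__":
    api_key = os.environ.get("NEWSAPI_KEY", "YOUR_NEWSAPI_KEY")
    results = analyze_news("artificial intelligence", api_key, num_articles=3)
    for item in results:
        print(f"{item['title']} [{item['category']} / {item['sentiment']}]")
        print(f"  {item['summary']}")
        print(f"  {item['url']}")
    trending = extract_trending_words([item['summary'] for item in results])
    print("Trending words:", trending)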