# news-analyzer / main.py
import os

# Cache directories must be configured BEFORE importing nltk/transformers:
# both libraries read NLTK_DATA / HF_HOME at import time, so setting the
# environment variables afterwards has no effect on where data is cached.
nltk_data_dir = "/tmp/nltk_data"
hf_cache_dir = "/tmp/huggingface"
os.makedirs(nltk_data_dir, exist_ok=True)
os.makedirs(hf_cache_dir, exist_ok=True)
os.environ["NLTK_DATA"] = nltk_data_dir
os.environ["HF_HOME"] = hf_cache_dir

import string
from collections import Counter

import requests
import torch
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer

# NLTK resources: punkt/punkt_tab for tokenization, stopwords for filtering.
# (averaged_perceptron_tagger is downloaded but not used below.)
nltk.download('punkt', download_dir=nltk_data_dir)
nltk.download('stopwords', download_dir=nltk_data_dir)
nltk.download('averaged_perceptron_tagger', download_dir=nltk_data_dir)
nltk.download('punkt_tab', download_dir=nltk_data_dir)
# 1. Fetch articles from NewsAPI
def get_news(query, api_key, num_articles=5):
    # Pass the query via `params` so requests URL-encodes it properly
    # (spaces, quotes, and other special characters in the search term).
    url = 'https://newsapi.org/v2/everything'
    params = {
        'q': query,
        'apiKey': api_key,
        'language': 'en',
        'pageSize': num_articles,
    }
    response = requests.get(url, params=params, timeout=10)
    if response.status_code == 200:
        return response.json()['articles']
    return []
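
# Each element of the returned list is a NewsAPI article dict; only the
# fields consumed below are relevant here. A sketch of the shape, limited
# to the keys this script actually reads:
#   {'title': '...', 'description': '...', 'url': 'https://...'}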
# 2. Sentiment analysis with Hugging Face
tone_analyzer = pipeline("sentiment-analysis",
                         model="distilbert-base-uncased-finetuned-sst-2-english",
                         revision="714eb0f")

def analyze_sentiment(text):
    # truncation=True keeps long inputs within the model's 512-token limit.
    return tone_analyzer(text, truncation=True)[0]
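
# Sketch of the pipeline's output, assuming the standard SST-2 label set:
#   analyze_sentiment("Markets rallied today")
#   -> {'label': 'POSITIVE', 'score': 0.99...}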
# 3. Topic classification
category_model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/tweet-topic-21-multi")
category_tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/tweet-topic-21-multi")

def classify_category(text):
    inputs = category_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():  # inference only; no gradients needed
        outputs = category_model(**inputs)
    predicted_class = torch.argmax(outputs.logits, dim=1).item()
    # Use the checkpoint's own id2label mapping so the returned label always
    # matches the model's classes. (The model is trained multi-label; argmax
    # simply picks the single highest-scoring topic.)
    return category_model.config.id2label[predicted_class]
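
# Sketch: the returned string comes from the checkpoint's id2label mapping,
# e.g. 'sports' or 'science_&_technology' for tweet-topic-21-multi.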
# 4. Summarization
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def split_text(text, max_tokens=512):
    # Rough chunking on whitespace-separated words, used as a cheap proxy for
    # model tokens to stay well under BART's 1024-token input limit.
    words = text.split()
    return [' '.join(words[i:i+max_tokens]) for i in range(0, len(words), max_tokens)]

def summarize_text(text):
    chunks = split_text(text)
    summaries = [summarizer(chunk, max_length=100, min_length=30, do_sample=False)[0]['summary_text']
                 for chunk in chunks]
    return ' '.join(summaries)
# 5. Trending-word extraction
def extract_trending_words(texts):
    text = ' '.join(texts).lower()
    words = word_tokenize(text)
    # Build the stopword set once instead of calling stopwords.words() per token.
    stop_words = set(stopwords.words('english'))
    words = [word for word in words
             if word not in stop_words and word not in string.punctuation and len(word) > 1]
    word_freq = Counter(words)
    return word_freq.most_common(10)
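
# Sketch of the output: Counter.most_common(10) yields (word, count) pairs,
# e.g. [('ai', 12), ('model', 9), ...] (counts here are illustrative).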
# 6. Main news-analysis pipeline
def analyze_news(query, api_key, num_articles=5):
    articles = get_news(query, api_key, num_articles)
    if not articles:
        return []
    news_results = []
    for article in articles:
        # NewsAPI can return explicit nulls, so `or` guards against None values
        # as well as missing keys.
        title = article.get('title') or 'No Title'
        description = article.get('description') or ''
        url = article.get('url') or '#'
        text = f"{title} {description}"
        sentiment = analyze_sentiment(text)['label']
        category = classify_category(text)
        summary = summarize_text(text)
        news_results.append({
            "title": title,
            "url": url,
            "sentiment": sentiment,
            "category": category,
            "summary": summary,
        })
    return news_results
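
# Minimal usage sketch. NEWSAPI_KEY is assumed to be supplied via the
# environment; "YOUR_NEWSAPI_KEY" is a placeholder, not a real credential.
# extract_trending_words is never called inside analyze_news, so it is wired
# in here over the fetched summaries to show how the pieces fit together.
if __name__ == "__main__":
    api_key = os.environ.get("NEWSAPI_KEY", "YOUR_NEWSAPI_KEY")
    results = analyze_news("artificial intelligence", api_key, num_articles=3)
    for item in results:
        print(f"{item['title']} [{item['category']} / {item['sentiment']}]")
        print(f"  {item['summary']}")
        print(f"  {item['url']}")
    trending = extract_trending_words([item['summary'] for item in results])
    print("Trending words:", trending)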