Spaces:

ozgurunlu
/

m-check

Sleeping

m-check / news_checker.py

Ozgur Unlu

news fix

61f73d5 5 months ago

8.42 kB

	import os
	from newsapi import NewsApiClient
	from dotenv import load_dotenv
	import pandas as pd
	from datetime import datetime, timedelta
	from transformers import pipeline, AutoTokenizer, AutoModel
	import torch
	import numpy as np
	from sklearn.metrics.pairwise import cosine_similarity
	import spacy
	import re

	# Load variables from a local .env file (if one exists) into the process
	# environment at import time, so that NewsChecker.__init__ can find
	# NEWS_API_KEY through os.getenv.
	load_dotenv()

	class NewsChecker:
	    """Flag recent negative news that is semantically close to a marketing text.

	    The checker extracts key terms from the text with spaCy, searches NewsAPI
	    for recent English articles that mention them, and keeps the articles that
	    are both similar to the text (cosine similarity of mean-pooled DistilBERT
	    embeddings) and classified as negative by a sentiment pipeline.

	    The class-level constants hold the tunable values; override them on a
	    subclass or on an instance to change the behaviour.
	    """

	    # Model identifiers loaded by __init__.
	    SENTIMENT_MODEL = 'distilbert-base-uncased-finetuned-sst-2-english'
	    EMBEDDING_MODEL = 'distilbert-base-uncased'
	    SPACY_MODEL = 'en_core_web_sm'

	    # Generic marketing words that are never useful as search terms.
	    STOP_TERMS = frozenset({'introduction', 'collection', 'products', 'items', 'things'})

	    MAX_PHRASE_WORDS = 3            # longest noun phrase kept as a key term
	    MAX_QUERY_TERMS = 5             # number of key terms OR-ed into the search query
	    LOOKBACK_DAYS = 7               # how far back the news search reaches
	    PAGE_SIZE = 50                  # articles requested from NewsAPI
	    SIMILARITY_THRESHOLD = 0.6      # an article must score strictly above this
	    NEGATIVE_SCORE_THRESHOLD = 0.7  # NEGATIVE label must score strictly above this
	    TOP_RESULTS = 3                 # articles listed in the warning message

	    def __init__(self):
	        """Read the API key and load the NewsAPI client and the NLP models.

	        Initialization is best-effort: a failure is printed rather than
	        raised, and every component that was not reached stays ``None``.
	        """
	        # Define every attribute up front so that a partial failure below
	        # cannot leave the instance without them.
	        self.newsapi = None
	        self.sentiment_analyzer = None
	        self.tokenizer = None
	        self.model = None
	        self.nlp = None

	        self.api_key = os.getenv('NEWS_API_KEY')
	        if not self.api_key:
	            print("WARNING: NEWS_API_KEY not found in environment variables")
	        else:
	            print("NEWS_API_KEY found in environment variables")

	        try:
	            self.newsapi = NewsApiClient(api_key=self.api_key)
	            # Sentiment analyzer
	            self.sentiment_analyzer = pipeline('sentiment-analysis', model=self.SENTIMENT_MODEL)
	            # Semantic similarity model
	            self.tokenizer = AutoTokenizer.from_pretrained(self.EMBEDDING_MODEL)
	            self.model = AutoModel.from_pretrained(self.EMBEDDING_MODEL)
	            # spaCy model for keyword extraction
	            self.nlp = spacy.load(self.SPACY_MODEL)
	            print("Models initialized successfully")
	        except Exception as e:
	            # Deliberate catch-all: the object stays usable and later calls
	            # report their own errors and return empty results.
	            print(f"Error initializing clients: {str(e)}")

	    def extract_key_terms(self, text):
	        """Extract key product and topic terms from ``text``.

	        Short noun phrases come first, followed by nouns and proper nouns that
	        are not already a substring of a collected term. Terms are lower-cased,
	        stripped of punctuation (hyphens are kept), filtered against
	        ``STOP_TERMS`` and must be longer than two characters.

	        Returns:
	            A list of unique terms in order of first appearance. The order is
	            deterministic, so slicing the first few terms is repeatable.
	        """
	        doc = self.nlp(text)

	        key_terms = []

	        # Noun phrases, limited to a few words each.
	        for chunk in doc.noun_chunks:
	            if len(chunk.text.split()) <= self.MAX_PHRASE_WORDS:
	                key_terms.append(chunk.text.lower())

	        # Single nouns / proper nouns not already covered by a collected term.
	        for token in doc:
	            if token.pos_ not in ('NOUN', 'PROPN'):
	                continue
	            word = token.text.lower()
	            if not any(word in term for term in key_terms):
	                key_terms.append(word)

	        cleaned_terms = []
	        for term in key_terms:
	            cleaned = re.sub(r'[^\w\s-]', '', term).strip()
	            # Test both forms so punctuation cannot smuggle a stop term through.
	            if term in self.STOP_TERMS or cleaned in self.STOP_TERMS:
	                continue
	            if len(cleaned) > 2:
	                cleaned_terms.append(cleaned)

	        # dict.fromkeys de-duplicates while keeping first-seen order; a set
	        # would make the "first N terms" used for the query arbitrary.
	        return list(dict.fromkeys(cleaned_terms))

	    def get_embedding(self, text):
	        """Get an embedding for ``text`` using the DistilBERT model.

	        Returns:
	            The mean of the model's last hidden state over dimension 1
	            (presumably the token axis of a (batch, tokens, hidden) tensor),
	            or ``None`` if tokenization or the forward pass fails.
	        """
	        try:
	            inputs = self.tokenizer(
	                text, padding=True, truncation=True, max_length=512, return_tensors="pt"
	            )

	            # Inference only: no gradients needed.
	            with torch.no_grad():
	                outputs = self.model(**inputs)

	            return outputs.last_hidden_state.mean(dim=1)
	        except Exception as e:
	            print(f"Error getting embedding: {str(e)}")
	            return None

	    def calculate_similarity(self, text1_embedding, text2_embedding):
	        """Calculate the cosine similarity between two embedding tensors.

	        Both embeddings are flattened to a single row first. The computation
	        stays in torch, so tensors that require grad or live on another device
	        are handled as well.

	        Returns:
	            The similarity as a Python float, or ``0.0`` if it cannot be
	            computed (for example when an embedding is ``None``).
	        """
	        try:
	            emb1 = text1_embedding.reshape(1, -1)
	            emb2 = text2_embedding.reshape(1, -1)
	            return torch.nn.functional.cosine_similarity(emb1, emb2).item()
	        except Exception as e:
	            print(f"Error calculating similarity: {str(e)}")
	            return 0.0

	    def is_negative_news(self, title, description):
	        """Check whether a news article has negative sentiment.

	        Title and description are combined for context; missing or empty
	        parts are left out instead of being rendered as the text "None".

	        Returns:
	            ``True`` only if the analyzer labels the text ``NEGATIVE`` with a
	            score above ``NEGATIVE_SCORE_THRESHOLD``; ``False`` otherwise,
	            including when there is no text or the analysis fails.
	        """
	        try:
	            text = ' '.join(part for part in (title, description) if part)
	            if not text:
	                return False

	            # truncation=True lets the pipeline's tokenizer cut over-long
	            # input to the model limit instead of failing on it.
	            result = self.sentiment_analyzer(text, truncation=True)[0]

	            return (
	                result['label'] == 'NEGATIVE'
	                and result['score'] > self.NEGATIVE_SCORE_THRESHOLD
	            )
	        except Exception as e:
	            print(f"Error in sentiment analysis: {str(e)}")
	            return False

	    def _score_article(self, article, key_terms, marketing_embedding):
	        """Return a result row for ``article`` if it is relevant and negative, else ``None``."""
	        title = article.get('title')
	        description = article.get('description')
	        if not title or not description:
	            return None

	        # The article must literally mention at least one key term.
	        article_text = f"{title.lower()} {description.lower()}"
	        if not any(term in article_text for term in key_terms):
	            return None

	        article_embedding = self.get_embedding(article_text)
	        if article_embedding is None:
	            return None

	        similarity = self.calculate_similarity(marketing_embedding, article_embedding)
	        # Similarity is tested first so sentiment analysis only runs on
	        # articles that are already close to the marketing text.
	        if similarity <= self.SIMILARITY_THRESHOLD:
	            return None
	        if not self.is_negative_news(title, description):
	            return None

	        return {
	            'title': title,
	            'description': description,
	            'similarity': similarity,
	        }

	    def get_recent_news(self, marketing_text):
	        """Find recent negative news related to ``marketing_text``.

	        Returns:
	            A DataFrame with ``title``, ``description`` and ``similarity``
	            columns, sorted by descending similarity. The DataFrame is empty
	            when no API key is configured, no key terms or embedding could be
	            produced, the API does not answer with status ``ok``, nothing
	            matches, or any error occurs (errors are printed, not raised).
	        """
	        if not self.api_key:
	            print("Cannot fetch news: No API key configured")
	            return pd.DataFrame()

	        try:
	            key_terms = self.extract_key_terms(marketing_text)
	            if not key_terms:
	                return pd.DataFrame()

	            # Quote each term and OR the first few together.
	            search_query = ' OR '.join(
	                f'"{term}"' for term in key_terms[:self.MAX_QUERY_TERMS]
	            )
	            print(f"Searching news with query: {search_query}")

	            since = (datetime.now() - timedelta(days=self.LOOKBACK_DAYS)).strftime('%Y-%m-%d')

	            # Embed the marketing text before spending an API request.
	            marketing_embedding = self.get_embedding(marketing_text)
	            if marketing_embedding is None:
	                return pd.DataFrame()

	            response = self.newsapi.get_everything(
	                q=search_query,
	                from_param=since,
	                language='en',
	                sort_by='relevancy',
	                page_size=self.PAGE_SIZE,
	            )

	            if response['status'] != 'ok':
	                return pd.DataFrame()

	            relevant_news = []
	            for article in response['articles']:
	                row = self._score_article(article, key_terms, marketing_embedding)
	                if row is not None:
	                    relevant_news.append(row)

	            relevant_news.sort(key=lambda item: item['similarity'], reverse=True)
	            return pd.DataFrame(relevant_news)

	        except Exception as e:
	            print(f"Error fetching news: {str(e)}")
	            return pd.DataFrame()

	    def check_content_against_news(self, marketing_text):
	        """Summarize whether negative news might affect ``marketing_text``.

	        Returns:
	            A dict with ``status`` (``'pass'`` or ``'warning'``) and
	            ``message``. A warning lists up to ``TOP_RESULTS`` of the most
	            similar negative articles, one per line.
	        """
	        news_df = self.get_recent_news(marketing_text)

	        if news_df.empty:
	            return {
	                'status': 'pass',
	                'message': 'No relevant negative news found.'
	            }

	        # news_df is non-empty here, so its head is non-empty as well.
	        top_news = news_df.head(self.TOP_RESULTS)
	        lines = [
	            f"- {row['title']} (Similarity: {row['similarity']:.2f})\n"
	            for _, row in top_news.iterrows()
	        ]
	        message = (
	            'Found semantically relevant negative news that might impact your marketing:\n'
	            + ''.join(lines)
	        )

	        return {
	            'status': 'warning',
	            'message': message
	        }