# m-check / news_checker.py
# Ozgur Unlu
# news fix
# 61f73d5
import os
from newsapi import NewsApiClient
from dotenv import load_dotenv
import pandas as pd
from datetime import datetime, timedelta
from transformers import pipeline, AutoTokenizer, AutoModel
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import spacy
import re
# Load environment variables via python-dotenv at import time, before
# NewsChecker.__init__ reads NEWS_API_KEY with os.getenv.
load_dotenv()
class NewsChecker:
    """Flag recent negative news that is semantically close to marketing copy.

    The checker extracts key terms from the marketing text with spaCy, queries
    NewsAPI for recent articles that mention those terms, and keeps the
    articles that are both similar to the marketing text (cosine similarity of
    mean-pooled DistilBERT embeddings) and classified as negative by a
    sentiment-analysis pipeline.
    """

    # Generic marketing words that are dropped from the extracted key terms.
    _GENERIC_TERMS = frozenset(
        {'introduction', 'collection', 'products', 'items', 'things'}
    )
    # Noun phrases longer than this many words are ignored.
    MAX_PHRASE_WORDS = 3
    # Number of leading key terms used to build the NewsAPI query.
    MAX_QUERY_TERMS = 5
    # How many days back the news search reaches.
    LOOKBACK_DAYS = 7
    # Number of articles requested from NewsAPI per search.
    PAGE_SIZE = 50
    # Minimum cosine similarity for an article to count as relevant.
    SIMILARITY_THRESHOLD = 0.6
    # Minimum classifier score for a NEGATIVE label to be trusted.
    NEGATIVE_CONFIDENCE = 0.7
    # Number of articles listed in the warning message.
    TOP_ARTICLES = 3

    def __init__(self):
        """Read the API key and load the NewsAPI client and NLP models.

        Initialisation errors are reported on stdout and not raised. Every
        client/model attribute is pre-set to ``None`` so that a failed
        initialisation leaves the instance in a well-defined state instead of
        with missing attributes.
        """
        self.api_key = os.getenv('NEWS_API_KEY')
        if not self.api_key:
            print("WARNING: NEWS_API_KEY not found in environment variables")
        else:
            print("NEWS_API_KEY found in environment variables")

        self.newsapi = None
        self.sentiment_analyzer = None
        self.tokenizer = None
        self.model = None
        self.nlp = None

        try:
            self.newsapi = NewsApiClient(api_key=self.api_key)
            # Sentiment classifier used by is_negative_news.
            self.sentiment_analyzer = pipeline(
                'sentiment-analysis',
                model='distilbert-base-uncased-finetuned-sst-2-english',
            )
            # Encoder used by get_embedding for semantic similarity.
            self.tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
            self.model = AutoModel.from_pretrained('distilbert-base-uncased')
            # spaCy pipeline used by extract_key_terms.
            self.nlp = spacy.load('en_core_web_sm')
            print("Models initialized successfully")
        except Exception as e:
            print(f"Error initializing clients: {str(e)}")

    def extract_key_terms(self, text):
        """Extract key product and topic terms from the text.

        Collects lower-cased noun phrases of at most ``MAX_PHRASE_WORDS``
        words, then NOUN/PROPN tokens that are not already a substring of a
        collected term. Terms are stripped of punctuation, and generic
        marketing words and terms of two characters or fewer are dropped.

        Returns:
            A list of unique terms in order of first appearance. The order is
            deterministic because callers slice the leading terms to build the
            search query. Returns ``[]`` when the text is empty or the spaCy
            model is unavailable.
        """
        if self.nlp is None or not text:
            return []

        doc = self.nlp(text)

        key_terms = [
            chunk.text.lower()
            for chunk in doc.noun_chunks
            if len(chunk.text.split()) <= self.MAX_PHRASE_WORDS
        ]

        for token in doc:
            if token.pos_ not in ('NOUN', 'PROPN'):
                continue
            word = token.text.lower()
            # Skip words already covered by a collected phrase or word.
            if not any(word in term for term in key_terms):
                key_terms.append(word)

        cleaned_terms = []
        for term in key_terms:
            cleaned = re.sub(r'[^\w\s-]', '', term).strip()
            # Filter on the cleaned form so that punctuation cannot let a
            # generic word slip through.
            if len(cleaned) > 2 and cleaned not in self._GENERIC_TERMS:
                cleaned_terms.append(cleaned)

        # dict.fromkeys removes duplicates while keeping first-seen order.
        return list(dict.fromkeys(cleaned_terms))

    def get_embedding(self, text):
        """Get embedding for a text using DistilBERT.

        Returns:
            The mean of the model's last hidden state over the token axis, or
            ``None`` if tokenisation or the forward pass fails (the error is
            printed).
        """
        try:
            inputs = self.tokenizer(
                text,
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors="pt",
            )
            with torch.no_grad():
                outputs = self.model(**inputs)
            # Mean over the token axis gives one vector per input text.
            return outputs.last_hidden_state.mean(dim=1)
        except Exception as e:
            print(f"Error getting embedding: {str(e)}")
            return None

    def calculate_similarity(self, text1_embedding, text2_embedding):
        """Calculate cosine similarity between two embeddings.

        Returns:
            The similarity as a plain ``float``, or ``0.0`` if the computation
            fails (the error is printed).
        """
        try:
            # detach/cpu make the conversion safe for tensors that track
            # gradients or live on an accelerator.
            emb1 = text1_embedding.detach().cpu().numpy().reshape(1, -1)
            emb2 = text2_embedding.detach().cpu().numpy().reshape(1, -1)
            return float(cosine_similarity(emb1, emb2)[0][0])
        except Exception as e:
            print(f"Error calculating similarity: {str(e)}")
            return 0.0

    def is_negative_news(self, title, description):
        """Check if the news article has negative sentiment.

        Title and description are classified together. Returns ``True`` only
        for a NEGATIVE label with a score above ``NEGATIVE_CONFIDENCE``, and
        ``False`` if the classifier fails (the error is printed).
        """
        try:
            text = f"{title} {description}"
            result = self.sentiment_analyzer(text)[0]
            return bool(
                result['label'] == 'NEGATIVE'
                and result['score'] > self.NEGATIVE_CONFIDENCE
            )
        except Exception as e:
            print(f"Error in sentiment analysis: {str(e)}")
            return False

    def get_recent_news(self, marketing_text):
        """Fetch recent negative news related to the marketing text.

        Returns:
            A ``pandas.DataFrame`` with columns ``title``, ``description`` and
            ``similarity``, sorted by similarity in descending order. The frame
            is empty when no API key or client is available, no key terms can
            be extracted, the embedding fails, the request fails, or nothing
            matches.
        """
        if not self.api_key:
            print("Cannot fetch news: No API key configured")
            return pd.DataFrame()
        if self.newsapi is None:
            print("Cannot fetch news: News API client not initialized")
            return pd.DataFrame()

        try:
            key_terms = self.extract_key_terms(marketing_text)
            if not key_terms:
                return pd.DataFrame()

            # Only the leading terms go into the query; all terms are used for
            # the local keyword filter below.
            search_query = ' OR '.join(
                [f'"{term}"' for term in key_terms[:self.MAX_QUERY_TERMS]]
            )
            print(f"Searching news with query: {search_query}")

            since = (
                datetime.now() - timedelta(days=self.LOOKBACK_DAYS)
            ).strftime('%Y-%m-%d')

            marketing_embedding = self.get_embedding(marketing_text)
            if marketing_embedding is None:
                return pd.DataFrame()

            response = self.newsapi.get_everything(
                q=search_query,
                from_param=since,
                language='en',
                sort_by='relevancy',
                page_size=self.PAGE_SIZE,
            )
            if response.get('status') != 'ok':
                return pd.DataFrame()

            relevant_news = []
            for article in response.get('articles') or []:
                title = article.get('title')
                description = article.get('description')
                if not title or not description:
                    continue

                # Cheap keyword filter before running the models.
                article_text = f"{title.lower()} {description.lower()}"
                if not any(term in article_text for term in key_terms):
                    continue

                article_embedding = self.get_embedding(article_text)
                if article_embedding is None:
                    continue

                similarity = self.calculate_similarity(
                    marketing_embedding, article_embedding
                )
                if (
                    similarity > self.SIMILARITY_THRESHOLD
                    and self.is_negative_news(title, description)
                ):
                    relevant_news.append({
                        'title': title,
                        'description': description,
                        'similarity': similarity,
                    })

            relevant_news.sort(key=lambda item: item['similarity'], reverse=True)
            return pd.DataFrame(relevant_news)
        except Exception as e:
            print(f"Error fetching news: {str(e)}")
            return pd.DataFrame()

    def check_content_against_news(self, marketing_text):
        """Summarise whether negative news may affect the marketing text.

        Returns:
            A dict with ``status`` (``'pass'`` or ``'warning'``) and
            ``message``. A warning lists up to ``TOP_ARTICLES`` of the most
            similar negative articles with their similarity scores.
        """
        news_df = self.get_recent_news(marketing_text)
        if news_df.empty:
            return {
                'status': 'pass',
                'message': 'No relevant negative news found.'
            }

        # The frame is already sorted by similarity, so head() gives the top.
        top_news = news_df.head(self.TOP_ARTICLES)
        entries = [
            f"- {row['title']} (Similarity: {row['similarity']:.2f})\n"
            for _, row in top_news.iterrows()
        ]
        message = (
            'Found semantically relevant negative news that might impact your marketing:\n'
            + ''.join(entries)
        )
        return {
            'status': 'warning',
            'message': message
        }