|
import os |
|
from newsapi import NewsApiClient |
|
from dotenv import load_dotenv |
|
import pandas as pd |
|
from datetime import datetime, timedelta |
|
from transformers import pipeline, AutoTokenizer, AutoModel |
|
import torch |
|
import numpy as np |
|
from sklearn.metrics.pairwise import cosine_similarity |
|
import spacy |
|
import re |
|
|
|
# Load variables from a local .env file (if one exists) into the process
# environment at import time, so that NewsChecker.__init__ can read
# NEWS_API_KEY through os.getenv.
load_dotenv()
|
|
|
class NewsChecker:
    """Flag recent negative news that is semantically close to a marketing text.

    The checker extracts key terms from the text with spaCy, searches NewsAPI
    for recent English articles mentioning them, and keeps the articles that
    are both similar to the text (cosine similarity of mean-pooled DistilBERT
    embeddings) and classified as negative by the sentiment pipeline.

    Every failure is reported with ``print`` and degrades to "no news found".
    As a consequence, a missing API key or a model that failed to load makes
    ``check_content_against_news`` return ``'pass'``.

    The class-level constants below hold the tuning knobs; override them on a
    subclass or an instance to change the behavior.
    """

    # Generic nouns that are never useful as search terms.
    GENERIC_TERMS = frozenset(
        {'introduction', 'collection', 'products', 'items', 'things'}
    )
    # Number of key terms OR-ed together in the NewsAPI query.
    MAX_QUERY_TERMS = 5
    # How far back (in days) the news search reaches.
    LOOKBACK_DAYS = 7
    # Number of articles requested from NewsAPI.
    PAGE_SIZE = 50
    # An article must be strictly more similar than this to be reported.
    SIMILARITY_THRESHOLD = 0.6
    # A NEGATIVE label must score strictly above this to count.
    NEGATIVE_SCORE_THRESHOLD = 0.7
    # Number of articles listed in the warning message.
    MAX_REPORTED_ARTICLES = 3

    def __init__(self):
        """Read NEWS_API_KEY and load the NewsAPI client and the NLP models.

        Initialization is best-effort: a failure is printed rather than
        raised, and whatever could not be created stays ``None``.
        """
        self.api_key = os.getenv('NEWS_API_KEY')
        if not self.api_key:
            print("WARNING: NEWS_API_KEY not found in environment variables")
        else:
            print("NEWS_API_KEY found in environment variables")

        # Define every collaborator up front so that a failed load leaves the
        # instance in a well-defined state instead of with missing attributes.
        self.newsapi = None
        self.sentiment_analyzer = None
        self.tokenizer = None
        self.model = None
        self.nlp = None

        try:
            self.newsapi = NewsApiClient(api_key=self.api_key)

            self.sentiment_analyzer = pipeline(
                'sentiment-analysis',
                model='distilbert-base-uncased-finetuned-sst-2-english',
            )

            self.tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
            self.model = AutoModel.from_pretrained('distilbert-base-uncased')

            self.nlp = spacy.load('en_core_web_sm')
            print("Models initialized successfully")
        except Exception as e:
            print(f"Error initializing clients: {str(e)}")

    @staticmethod
    def _contains_word(term, word):
        """Return True if *word* occurs in *term* as a whole word."""
        pattern = rf'(?<!\w){re.escape(word)}(?!\w)'
        return re.search(pattern, term) is not None

    def extract_key_terms(self, text):
        """Extract key product and topic terms from the text.

        Returns a list of unique, lower-cased terms in order of first
        appearance: noun chunks of at most three words first, then single
        nouns / proper nouns not already covered by a collected term.
        """
        doc = self.nlp(text)

        # Short noun chunks make the most specific search phrases.
        key_terms = [
            chunk.text.lower()
            for chunk in doc.noun_chunks
            if len(chunk.text.split()) <= 3
        ]

        for token in doc:
            if token.pos_ not in ('NOUN', 'PROPN'):
                continue
            word = token.text.lower()
            # Whole-word comparison: a plain substring test would wrongly
            # treat e.g. "car" as already covered by "smart card".
            if not any(self._contains_word(term, word) for term in key_terms):
                key_terms.append(word)

        cleaned_terms = []
        for term in key_terms:
            if term in self.GENERIC_TERMS:
                continue
            # Keep only word characters, whitespace and hyphens.
            cleaned = re.sub(r'[^\w\s-]', '', term).strip()
            if len(cleaned) > 2:
                cleaned_terms.append(cleaned)

        # De-duplicate while preserving order. A set would make the subset of
        # terms used for the news query vary from run to run.
        return list(dict.fromkeys(cleaned_terms))

    def get_embedding(self, text):
        """Get embedding for a text using DistilBERT.

        Returns the mean of the model's ``last_hidden_state`` over dim 1
        (presumably the token axis - confirm against the model docs), or
        ``None`` if tokenization or inference fails.
        """
        try:
            inputs = self.tokenizer(
                text,
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors="pt",
            )

            # Inference only: no gradients needed.
            with torch.no_grad():
                outputs = self.model(**inputs)

            return outputs.last_hidden_state.mean(dim=1)
        except Exception as e:
            print(f"Error getting embedding: {str(e)}")
            return None

    def calculate_similarity(self, text1_embedding, text2_embedding):
        """Calculate cosine similarity between two embeddings.

        Both arguments are tensors as returned by ``get_embedding``. Returns
        a Python float, or ``0.0`` if the computation fails.
        """
        try:
            # detach()/cpu() make the conversion safe for tensors that track
            # gradients or live on another device. Each embedding is
            # flattened to a single row.
            emb1 = text1_embedding.detach().cpu().numpy().reshape(1, -1)
            emb2 = text2_embedding.detach().cpu().numpy().reshape(1, -1)

            return float(cosine_similarity(emb1, emb2)[0][0])
        except Exception as e:
            print(f"Error calculating similarity: {str(e)}")
            return 0.0

    def is_negative_news(self, title, description):
        """Check if the news article has negative sentiment.

        Returns True only when the classifier labels the combined title and
        description ``NEGATIVE`` with a score above
        ``NEGATIVE_SCORE_THRESHOLD``. Returns False if the analysis fails.
        """
        try:
            text = f"{title} {description}"
            # truncation=True keeps over-long inputs from raising, which
            # would otherwise be silently reported as "not negative".
            result = self.sentiment_analyzer(text, truncation=True)[0]

            return (
                result['label'] == 'NEGATIVE'
                and result['score'] > self.NEGATIVE_SCORE_THRESHOLD
            )
        except Exception as e:
            print(f"Error in sentiment analysis: {str(e)}")
            return False

    def get_recent_news(self, marketing_text):
        """Fetch recent negative news related to *marketing_text*.

        Returns a DataFrame with ``title``, ``description`` and ``similarity``
        columns, sorted by descending similarity. The DataFrame is empty when
        there is no API key, no key terms, no matching article, or when any
        step fails (the error is printed).
        """
        if not self.api_key:
            print("Cannot fetch news: No API key configured")
            return pd.DataFrame()

        try:
            key_terms = self.extract_key_terms(marketing_text)
            if not key_terms:
                return pd.DataFrame()

            search_query = ' OR '.join(
                f'"{term}"' for term in key_terms[:self.MAX_QUERY_TERMS]
            )
            print(f"Searching news with query: {search_query}")

            since = datetime.now() - timedelta(days=self.LOOKBACK_DAYS)
            week_ago = since.strftime('%Y-%m-%d')

            # Embed the marketing text once; it is compared to every article.
            marketing_embedding = self.get_embedding(marketing_text)
            if marketing_embedding is None:
                return pd.DataFrame()

            response = self.newsapi.get_everything(
                q=search_query,
                from_param=week_ago,
                language='en',
                sort_by='relevancy',
                page_size=self.PAGE_SIZE,
            )

            if response['status'] != 'ok':
                return pd.DataFrame()

            relevant_news = []
            for article in response.get('articles') or []:
                # .get(): one article lacking a field must not abort the
                # whole scan through the outer except.
                title = article.get('title')
                description = article.get('description')
                if not title or not description:
                    continue

                # Cheap lexical pre-filter before the costly embedding.
                article_text = f"{title.lower()} {description.lower()}"
                if not any(term in article_text for term in key_terms):
                    continue

                article_embedding = self.get_embedding(article_text)
                if article_embedding is None:
                    continue

                similarity = self.calculate_similarity(
                    marketing_embedding, article_embedding
                )

                if (
                    similarity > self.SIMILARITY_THRESHOLD
                    and self.is_negative_news(title, description)
                ):
                    relevant_news.append({
                        'title': title,
                        'description': description,
                        'similarity': similarity,
                    })

            relevant_news.sort(key=lambda item: item['similarity'], reverse=True)
            return pd.DataFrame(relevant_news)

        except Exception as e:
            print(f"Error fetching news: {str(e)}")
            return pd.DataFrame()

    def check_content_against_news(self, marketing_text):
        """Report whether relevant negative news exists for *marketing_text*.

        Returns a dict with ``status`` (``'pass'`` or ``'warning'``) and a
        human-readable ``message``. A warning lists up to
        ``MAX_REPORTED_ARTICLES`` of the most similar articles.
        """
        news_df = self.get_recent_news(marketing_text)

        if news_df.empty:
            return {
                'status': 'pass',
                'message': 'No relevant negative news found.'
            }

        # news_df is non-empty here, so there is at least one row to report.
        top_news = news_df.head(self.MAX_REPORTED_ARTICLES)
        lines = [
            f"- {title} (Similarity: {similarity:.2f})\n"
            for title, similarity in zip(top_news['title'], top_news['similarity'])
        ]
        message = (
            'Found semantically relevant negative news that might impact your marketing:\n'
            + ''.join(lines)
        )

        return {
            'status': 'warning',
            'message': message
        }