chuckiykyk's picture
Upload 36 files
72f802a verified
"""
News API Collector
Collects industry news and trends for feature validation
"""
import asyncio
import os
import re
from datetime import datetime, timedelta, timezone
from typing import List, Dict, Any, Optional

import aiohttp
from dotenv import load_dotenv
from newsapi import NewsApiClient
from textblob import TextBlob

from utils.logger import get_logger
from utils.nlp_query_enhancer import NLPQueryEnhancer
# Load environment variables via python-dotenv at import time, before
# NewsCollector looks up NEWS_API_KEY / NEWSAPI_KEY with os.getenv.
load_dotenv()
# Module-level logger from the project's utils.logger helper; besides the usual
# debug/info/warning/error calls, this file relies on its log_* helper methods.
logger = get_logger(__name__)
class NewsCollector:
    """Collects and analyzes news for market trends.

    Articles are fetched through the NewsAPI client and enriched with TextBlob
    noun-phrase keywords plus a blended TextBlob/keyword sentiment score. When
    no API key is configured ``news_api`` stays ``None`` and ``search_news``
    returns an error payload instead of raising.
    """

    # Patterns used by clean_text, compiled once.
    _URL_PATTERN = re.compile(r"http\S+|www\S+")
    _NON_WORD_PATTERN = re.compile(r"[^\w\s]")

    # Classification thresholds applied to the blended polarity of an article.
    _NEWS_POSITIVE_THRESHOLD = 0.05
    _NEWS_NEGATIVE_THRESHOLD = -0.05

    # Vocabulary scanned by optimize_search_query.
    _TECH_TERMS = (
        'ai', 'artificial intelligence', 'machine learning', 'ml', 'app',
        'mobile', 'web', 'platform', 'software', 'tool', 'system',
    )
    _BUSINESS_TERMS = (
        'fitness', 'health', 'nutrition', 'training', 'workout', 'exercise',
        'diet', 'wellness', 'personal trainer', 'coaching',
    )

    # Words ignored when get_trend_analysis builds its basic fallback queries.
    _QUERY_STOP_WORDS = frozenset({'and', 'the', 'for', 'with', 'that', 'this'})

    # Broad searches tried by get_trend_analysis when nothing else returns results.
    _FALLBACK_QUERIES = (
        "fitness app",
        "health technology",
        "mobile app market",
        "wellness industry",
        "AI technology",
        "personal training",
        "nutrition app",
    )

    # Negative keywords with weights
    _NEGATIVE_KEYWORDS = {
        # Strong negative
        'bankruptcy': -0.9, 'fraud': -0.9, 'scandal': -0.8, 'crisis': -0.8,
        'collapse': -0.8, 'failure': -0.7, 'lawsuit': -0.7, 'breach': -0.7,
        'hack': -0.7, 'cyberattack': -0.7, 'data breach': -0.8,
        # Moderate negative
        'decline': -0.6, 'loss': -0.6, 'drop': -0.5, 'fall': -0.5,
        'concern': -0.5, 'risk': -0.5, 'problem': -0.5, 'issue': -0.4,
        'challenge': -0.4, 'difficulty': -0.4, 'struggle': -0.5,
        'layoff': -0.6, 'shutdown': -0.7, 'closure': -0.6,
        # Business negative
        'recession': -0.7, 'downturn': -0.6, 'crash': -0.8,
        'volatile': -0.5, 'uncertainty': -0.4, 'unstable': -0.5,
        'disappointing': -0.5, 'missed expectations': -0.6,
        'below forecast': -0.5, 'underperform': -0.5,
    }

    # Positive keywords with weights
    _POSITIVE_KEYWORDS = {
        # Strong positive
        'breakthrough': 0.8, 'innovation': 0.7, 'revolutionary': 0.8,
        'success': 0.7, 'achievement': 0.6, 'milestone': 0.6,
        'record': 0.6, 'best': 0.5, 'excellent': 0.6,
        # Growth/business positive
        'growth': 0.6, 'increase': 0.5, 'rise': 0.5, 'surge': 0.6,
        'expansion': 0.6, 'profit': 0.5, 'revenue': 0.4, 'gain': 0.5,
        'boost': 0.5, 'improve': 0.5, 'enhance': 0.4,
        'opportunity': 0.4, 'potential': 0.4, 'promising': 0.5,
        # Market positive
        'bullish': 0.6, 'optimistic': 0.5, 'confident': 0.5,
        'strong': 0.4, 'robust': 0.5, 'solid': 0.4,
        'outperform': 0.5, 'exceed expectations': 0.6,
        'beat forecast': 0.6, 'above estimates': 0.5,
    }

    # (compiled pattern, weight) pairs, negatives first. Each keyword is anchored
    # at a word start so inflections still match ("gains", "improved") while
    # mid-word hits do not ("against" is not "gain", "enterprise" is not "rise").
    _KEYWORD_PATTERNS = tuple(
        (re.compile(r"\b" + re.escape(keyword)), weight)
        for keyword, weight in {**_NEGATIVE_KEYWORDS, **_POSITIVE_KEYWORDS}.items()
    )

    def __init__(self):
        """Initialize News API client"""
        self.news_api = None
        self.session = None
        self.nlp_enhancer = NLPQueryEnhancer()
        self.setup_news_client()

    def setup_news_client(self):
        """Setup News API client with credentials.

        Reads the key from NEWS_API_KEY or NEWSAPI_KEY. On a missing key or any
        construction error the failure is logged and ``self.news_api`` is left
        as ``None``; nothing is raised.
        """
        logger.log_processing_step("NewsAPI client setup", "started")
        try:
            # Try both possible environment variable names
            api_key = os.getenv("NEWS_API_KEY") or os.getenv("NEWSAPI_KEY")
            if not api_key:
                error_msg = "No NewsAPI key found in environment variables (NEWS_API_KEY or NEWSAPI_KEY)"
                logger.log_api_failure("NewsAPI", error_msg)
                self.news_api = None
                return
            logger.log_api_attempt("NewsAPI", f"with key ending in ...{api_key[-4:]}")
            self.news_api = NewsApiClient(api_key=api_key)
            logger.log_api_success("NewsAPI", "client initialized")
            logger.log_processing_step("NewsAPI client setup", "completed")
        except Exception as e:
            error_msg = f"NewsAPI client setup failed: {str(e)}"
            logger.log_api_failure("NewsAPI", error_msg)
            logger.log_processing_step("NewsAPI client setup", "failed")
            self.news_api = None

    async def __aenter__(self):
        """Async context manager entry: open the aiohttp session."""
        connector = aiohttp.TCPConnector(
            limit=10,
            limit_per_host=5,
            ttl_dns_cache=300,
            use_dns_cache=True,
            keepalive_timeout=30,
            enable_cleanup_closed=True
        )
        timeout = aiohttp.ClientTimeout(total=30)
        self.session = aiohttp.ClientSession(
            connector=connector,
            timeout=timeout
        )
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context manager exit with proper cleanup"""
        await self.cleanup()

    async def cleanup(self):
        """Clean up session and connections properly.

        Errors while closing are logged at debug level and swallowed; the
        session reference is always cleared.
        """
        try:
            if self.session and not self.session.closed:
                await self.session.close()
                logger.debug("News collector session closed properly")
        except Exception as e:
            logger.debug(f"Error during news collector cleanup: {str(e)}")
        finally:
            self.session = None
            # Give time for connections to close
            await asyncio.sleep(0.1)

    def clean_text(self, text: str) -> str:
        """Clean and preprocess text for analysis.

        Args:
            text: Raw text; may be empty or None

        Returns:
            Text with URLs removed, punctuation replaced by spaces and
            whitespace collapsed; "" for empty input
        """
        if not text:
            return ""
        # Remove URLs first so their punctuation does not leave fragments behind
        without_urls = self._URL_PATTERN.sub('', text)
        # Replace anything that is neither a word character nor whitespace
        words_only = self._NON_WORD_PATTERN.sub(' ', without_urls)
        # Collapse runs of whitespace (also trims both ends)
        return ' '.join(words_only.split())

    def extract_keywords(self, text: str, min_length: int = 4, max_keywords: int = 10) -> List[str]:
        """Extract important keywords from text.

        Args:
            text: Text to analyze
            min_length: Minimum character length of a noun phrase to keep
            max_keywords: Maximum number of phrases returned

        Returns:
            Noun phrases found by TextBlob, in TextBlob's order; [] for empty
            input or when extraction fails (the error is logged)
        """
        if not text:
            return []
        try:
            # Use TextBlob for noun phrase extraction
            blob = TextBlob(text)
            keywords = [phrase for phrase in blob.noun_phrases if len(phrase) >= min_length]  # type: ignore
            return keywords[:max_keywords]
        except Exception as e:
            logger.error(f"Error extracting keywords: {str(e)}")
            return []

    @staticmethod
    def _mentions_term(term: str, text: str) -> bool:
        """Return True if ``term`` occurs in ``text`` as a whole word (optional plural "s")."""
        return re.search(rf"\b{re.escape(term)}s?\b", text) is not None

    def optimize_search_query(self, feature_description: str) -> List[str]:
        """
        Optimize search query by breaking down complex descriptions into searchable terms

        Known terms are matched as whole words, so "training" no longer counts
        as a mention of "ai" and "html" is not a mention of "ml".

        Args:
            feature_description: Original feature description
        Returns:
            List of optimized search queries (unique, at most 8)
        """
        description_lower = feature_description.lower()
        key_terms = [
            term
            for term in (*self._TECH_TERMS, *self._BUSINESS_TERMS)
            if self._mentions_term(term, description_lower)
        ]

        queries: List[str] = []
        if key_terms:
            mentions_fitness = 'fitness' in key_terms
            mentions_nutrition = 'nutrition' in key_terms

            # Strategy 1: Core concept combinations
            if 'ai' in key_terms or 'artificial intelligence' in key_terms:
                if mentions_fitness:
                    queries.extend([
                        'AI fitness app',
                        'artificial intelligence fitness',
                        'AI personal trainer',
                        'smart fitness technology'
                    ])
                if mentions_nutrition:
                    queries.extend([
                        'AI nutrition app',
                        'artificial intelligence diet',
                        'smart meal planning'
                    ])

            # Strategy 2: Industry-specific terms
            if mentions_fitness:
                queries.extend([
                    'fitness app market',
                    'digital fitness trends',
                    'fitness technology',
                    'workout app industry'
                ])
            if mentions_nutrition:
                queries.extend([
                    'nutrition app market',
                    'diet tracking technology',
                    'meal planning apps'
                ])

            # Strategy 3: Broader market terms
            queries.extend([
                'health tech startup',
                'wellness app market',
                'fitness industry trends',
                'health technology news'
            ])

        # Fallback: Extract noun phrases from original description
        try:
            blob = TextBlob(feature_description)
            noun_phrases = [phrase for phrase in blob.noun_phrases if len(phrase.split()) <= 3]  # type: ignore
            # Add top noun phrases as queries, skipping very short ones
            queries.extend(phrase for phrase in noun_phrases[:5] if len(phrase) > 3)
        except Exception as e:
            # Best effort: the pattern-based queries above are still returned
            logger.debug(f"Noun phrase extraction failed: {str(e)}")

        # Remove duplicates (keeping first occurrence) and cap the list
        return list(dict.fromkeys(queries))[:8]

    def create_search_variations(self, base_query: str, year: Optional[int] = None) -> List[str]:
        """
        Create variations of a search query for better coverage
        Args:
            base_query: Base search query
            year: Year used for the temporal variation; defaults to the
                current year so the query does not go stale
        Returns:
            List of query variations, starting with the base query itself
        """
        if year is None:
            year = datetime.now().year
        suffixes = (
            # Industry context
            "industry", "market", "trends", "startup", "technology",
            # Temporal context
            str(year), "latest", "new",
        )
        return [base_query, *(f"{base_query} {suffix}" for suffix in suffixes)]

    @staticmethod
    def _neutral_sentiment(error: str) -> Dict[str, Any]:
        """Build the neutral sentiment payload returned when analysis is not possible."""
        return {
            "polarity": 0.0,
            "subjectivity": 0.0,
            "classification": "neutral",
            "confidence": 0.0,
            "keyword_sentiment": 0.0,
            "debug_info": {"error": error}
        }

    def _analyze_article_sentiment(self, text: str) -> Dict[str, Any]:
        """
        Enhanced sentiment analysis specifically tuned for news articles
        Args:
            text: Article text (title + description)
        Returns:
            Dict with polarity, subjectivity, classification, confidence,
            keyword_sentiment and debug_info; a neutral payload carrying the
            error text when the input is empty or analysis fails
        """
        if not text:
            return self._neutral_sentiment("Empty text")
        try:
            # Basic TextBlob analysis
            blob = TextBlob(text)
            textblob_polarity = blob.sentiment.polarity  # type: ignore
            textblob_subjectivity = blob.sentiment.subjectivity  # type: ignore

            # Keyword-based sentiment analysis for news content
            keyword_sentiment = self._analyze_keyword_sentiment(text.lower())

            # Blend the two signals: 60% TextBlob polarity, 40% keyword score
            combined_polarity = (textblob_polarity * 0.6) + (keyword_sentiment * 0.4)

            # Thresholds are deliberately tight (+/-0.05) for news articles
            if combined_polarity >= self._NEWS_POSITIVE_THRESHOLD:
                classification = "positive"
            elif combined_polarity <= self._NEWS_NEGATIVE_THRESHOLD:
                classification = "negative"
            else:
                classification = "neutral"

            # Confidence grows with distance from neutral, capped at 1.0
            confidence = min(abs(combined_polarity) * 2, 1.0)

            return {
                "polarity": combined_polarity,
                "subjectivity": textblob_subjectivity,
                "classification": classification,
                "confidence": confidence,
                "keyword_sentiment": keyword_sentiment,
                "debug_info": {
                    "textblob_polarity": textblob_polarity,
                    "keyword_sentiment": keyword_sentiment,
                    "thresholds": {
                        "positive": self._NEWS_POSITIVE_THRESHOLD,
                        "negative": self._NEWS_NEGATIVE_THRESHOLD
                    }
                }
            }
        except Exception as e:
            logger.error(f"Error in enhanced sentiment analysis: {str(e)}")
            return self._neutral_sentiment(str(e))

    def _analyze_keyword_sentiment(self, text: str) -> float:
        """
        Analyze sentiment based on business/news-specific keywords

        Each keyword counts once no matter how often it occurs, and must start
        at a word boundary (see _KEYWORD_PATTERNS).

        Args:
            text: Lowercase text to analyze
        Returns:
            Sentiment score from keyword analysis (-1.0 to 1.0)
        """
        sentiment_score = 0.0
        total_weight = 0.0
        for pattern, weight in self._KEYWORD_PATTERNS:
            if pattern.search(text):
                sentiment_score += weight
                total_weight += abs(weight)

        # Normalize by total weight (floored at 1.0) to prevent extreme scores
        if total_weight > 0:
            sentiment_score = sentiment_score / max(total_weight, 1.0)

        # Ensure score is within bounds
        return max(-1.0, min(1.0, sentiment_score))

    def _run_search_strategies(self, query: str, days_back: int, language: str) -> Dict[str, Any]:
        """Query NewsAPI with progressively looser strategies until one returns results.

        Returns the last response received (possibly empty). If every attempt
        raised, the last exception is re-raised so the caller reports an API
        failure instead of an apparently successful empty search.
        """
        now = datetime.now()
        from_date = (now - timedelta(days=days_back)).strftime("%Y-%m-%d")
        to_date = now.strftime("%Y-%m-%d")

        words = query.split()
        search_strategies = [
            # Strategy 1: Exact query
            {"q": query, "sort_by": "relevancy"},
            # Strategy 2: If query has multiple words, try OR search
            {"q": " OR ".join(words) if len(words) > 1 else query, "sort_by": "relevancy"},
            # Strategy 3: Try with quotes for exact phrase (if not already quoted)
            {"q": f'"{query}"' if '"' not in query else query, "sort_by": "publishedAt"},
        ]

        response = None
        last_error = None
        attempted = set()
        for number, strategy in enumerate(search_strategies, start=1):
            # A single-word query makes strategy 2 identical to strategy 1;
            # skip repeats rather than spend another API request on them.
            signature = (strategy["q"], strategy["sort_by"])
            if signature in attempted:
                continue
            attempted.add(signature)

            try:
                response = self.news_api.get_everything(
                    from_param=from_date,
                    to=to_date,
                    language=language,
                    **strategy
                )
            except Exception as strategy_error:
                last_error = strategy_error
                logger.warning(f"Search strategy {number} failed: {str(strategy_error)}")
                continue

            # Check if we got results
            if response and response.get("totalResults", 0) > 0:
                successful_strategy = number
                logger.info(f"Search strategy {successful_strategy} successful for query: {strategy['q']}")
                break
            elif number == 1:  # Log if first strategy failed
                logger.info(f"Search strategy 1 returned 0 results, trying alternative strategies")

        if response is None:
            if last_error is not None:
                raise last_error
            response = {"articles": [], "totalResults": 0}
        return response

    def _process_article(self, article: Dict[str, Any]) -> Dict[str, Any]:
        """Normalize one raw NewsAPI article and attach keywords and sentiment."""
        # Fields may be present but null; `or` keeps "None" out of the analyzed
        # text and gives every field a consistent string type.
        source_info = article.get("source") or {}
        processed_article = {
            "title": article.get("title") or "",
            "source": source_info.get("name") or "Unknown",
            "author": article.get("author") or "",
            "published_at": article.get("publishedAt") or "",
            "description": article.get("description") or "",
            "url": article.get("url") or "",
            "content": article.get("content") or ""
        }
        full_text = f"{processed_article['title']} {processed_article['description']}"
        processed_article["keywords"] = self.extract_keywords(full_text)
        processed_article["sentiment"] = self._analyze_article_sentiment(full_text)
        return processed_article

    async def search_news(self, query: str, days_back: int = 30, language: str = "en") -> Dict[str, Any]:
        """
        Search for news articles related to a query with enhanced search strategies
        Args:
            query: Search query
            days_back: Number of days to look back
            language: Language code
        Returns:
            News search results, or a dict with "error"/"error_type" (and
            empty "articles") when the client is missing or the request fails
        """
        logger.log_processing_step(f"News search for '{query}'", "started")
        if not self.news_api:
            error_msg = "NewsAPI client not initialized - no valid API key available"
            logger.log_api_failure("NewsAPI", error_msg)
            logger.log_processing_step(f"News search for '{query}'", "failed")
            return {
                "query": query,
                "error": error_msg,
                "error_type": "no_api_client",
                "total_results": 0,
                "articles": [],
                "searched_at": datetime.now().isoformat()
            }

        try:
            logger.log_api_attempt("NewsAPI", f"searching for '{query}' ({days_back} days back)")
            # NOTE(review): the client call is made synchronously inside this
            # coroutine, so it presumably blocks the event loop while waiting.
            response = self._run_search_strategies(query, days_back, language)

            articles = response.get("articles") or []
            total_results = response.get("totalResults", 0)
            logger.log_api_success("NewsAPI", f"found {total_results} total results, processing {len(articles)} articles")

            processed_results = {
                "query": query,
                "total_results": total_results,
                "days_back": days_back,
                "articles": [],
                "sources": {},
                "keywords": {},
                "searched_at": datetime.now().isoformat()
            }
            source_counts = processed_results["sources"]
            keyword_counts = processed_results["keywords"]
            # Sentiment distribution, logged for debugging only
            sentiment_debug = {"positive": 0, "negative": 0, "neutral": 0}

            for article in articles:
                processed_article = self._process_article(article)
                processed_results["articles"].append(processed_article)

                source = processed_article["source"]
                source_counts[source] = source_counts.get(source, 0) + 1
                for keyword in processed_article["keywords"]:
                    keyword_counts[keyword] = keyword_counts.get(keyword, 0) + 1

                classification = processed_article["sentiment"].get("classification", "neutral")
                sentiment_debug[classification] += 1

            logger.info(f"Sentiment distribution for '{query}': {sentiment_debug}")
            logger.log_data_collection(f"NewsAPI articles for '{query}'", len(processed_results['articles']))
            logger.log_processing_step(f"News search for '{query}'", "completed")
            return processed_results
        except Exception as e:
            error_msg = f"NewsAPI search failed for '{query}': {str(e)}"
            logger.log_api_failure("NewsAPI", error_msg)
            logger.log_processing_step(f"News search for '{query}'", "failed")
            # Return detailed error information
            return {
                "query": query,
                "error": error_msg,
                "error_type": "api_request_failed",
                "error_details": str(e),
                "total_results": 0,
                "articles": [],
                "searched_at": datetime.now().isoformat()
            }

    def _basic_queries(self, feature_description: str) -> List[str]:
        """Build simple queries from the description's longer words.

        Used when the NLP enhancer yields nothing. Falls back to the raw
        description when fewer than two usable words remain.
        """
        key_words = [
            word
            for word in feature_description.lower().split()
            if len(word) > 3 and word not in self._QUERY_STOP_WORDS
        ]
        if len(key_words) < 2:
            return [feature_description]
        return [
            ' '.join(key_words[:2]),
            ' '.join(key_words[:3]),
            f"{key_words[0]} market",
            f"{key_words[0]} industry"
        ]

    async def get_trend_analysis(self, feature_description: str, target_market: str = "", days_back: int = 30) -> Dict[str, Any]:
        """
        Analyze trends for a specific feature using enhanced NLP
        Args:
            feature_description: Description of the feature
            target_market: Target market description
            days_back: Number of days to look back
        Returns:
            Trend analysis, or {"error": message} if anything raises
        """
        try:
            # Use NLP-enhanced query generation
            enhanced_data = await self.nlp_enhancer.enhance_query(feature_description, target_market)
            optimized_queries = self.nlp_enhancer.get_optimized_queries_for_platform(enhanced_data, 'news')

            # If no optimized queries found, fall back to simpler approach
            if not optimized_queries:
                optimized_queries = self._basic_queries(feature_description)

            logger.info(f"Using optimized search queries: {optimized_queries}")
            search_queries = optimized_queries

            # Collect news for each distinct query; results are keyed by query,
            # so repeating one would only spend an API request.
            query_results = {}
            successful_queries = []
            for query in dict.fromkeys(search_queries):
                result = await self.search_news(query, days_back)
                query_results[query] = result
                if result.get("total_results", 0) > 0:
                    successful_queries.append(query)

            # If no results from optimized queries, try broader fallback searches
            if not successful_queries:
                logger.info("No results from optimized queries, trying broader fallback searches")
                for query in self._FALLBACK_QUERIES:
                    if query in query_results:  # Avoid duplicates
                        continue
                    result = await self.search_news(query, days_back)
                    query_results[query] = result
                    if result.get("total_results", 0) > 0:
                        successful_queries.append(query)
                    # Try a few fallback queries, not just one
                    if len(successful_queries) >= 3:
                        break

            # Aggregate results
            trend_analysis = {
                "feature_description": feature_description,
                "original_queries": search_queries,
                "successful_queries": successful_queries,
                "total_queries_tried": len(query_results),
                "aggregate_stats": self._aggregate_news_results(query_results),
                "query_results": query_results,
                "analyzed_at": datetime.now().isoformat()
            }
            # Identify trends
            trend_analysis["trends"] = self._identify_trends(trend_analysis)
            return trend_analysis
        except Exception as e:
            logger.error(f"Error in trend analysis: {str(e)}")
            return {"error": str(e)}

    async def get_industry_trends(self, industry: str, timeframe: str = "30d") -> Dict[str, Any]:
        """Get general industry trends.

        Args:
            industry: Industry name, searched as "<industry> industry trends"
            timeframe: "7d", "30d" or "90d"; anything else means 30 days

        Returns:
            The search_news result augmented with industry, timeframe,
            trend_score and top_trends, or {"error": message} on failure
        """
        try:
            # Convert timeframe to days
            days_back = {"7d": 7, "30d": 30, "90d": 90}.get(timeframe, 30)
            # Search for industry news
            result = await self.search_news(f"{industry} industry trends", days_back)
            # Augment with industry-specific analysis
            result["industry"] = industry
            result["timeframe"] = timeframe
            result["trend_score"] = self._calculate_trend_score(result)
            # Extract top trends
            result["top_trends"] = self._extract_top_trends(result, 5)
            return result
        except Exception as e:
            logger.error(f"Error getting industry trends: {str(e)}")
            return {"error": str(e)}

    def _aggregate_news_results(self, query_results: Dict[str, Any]) -> Dict[str, Any]:
        """Aggregate results from multiple queries.

        Articles are de-duplicated by URL across queries. Articles without a
        URL cannot be compared, so each of them is counted.

        Args:
            query_results: Mapping of query -> search_news result
        Returns:
            Totals, per-source and per-keyword counts, sentiment distribution
            with average polarity, plus top_keywords (10) and top_sources (5)
        """
        aggregate = {
            "total_articles": 0,
            "total_sources": 0,
            "sources": {},
            "keywords": {},
            "sentiment": {
                "positive": 0,
                "negative": 0,
                "neutral": 0,
                "average_polarity": 0
            }
        }
        source_counts = aggregate["sources"]
        keyword_counts = aggregate["keywords"]
        sentiment_counts = aggregate["sentiment"]

        seen_urls = set()  # Track unique articles by URL
        total_polarity = 0
        articles_with_sentiment = 0

        for result in query_results.values():
            if "articles" not in result:
                continue
            for article in result["articles"]:
                # Skip duplicates; an empty URL is never treated as a match
                url = article.get("url", "")
                if url:
                    if url in seen_urls:
                        continue
                    seen_urls.add(url)
                aggregate["total_articles"] += 1

                # Update source statistics
                source = article.get("source", "Unknown")
                if source not in source_counts:
                    source_counts[source] = 0
                    aggregate["total_sources"] += 1
                source_counts[source] += 1

                # Update keyword statistics
                for keyword in article.get("keywords", []):
                    keyword_counts[keyword] = keyword_counts.get(keyword, 0) + 1

                # Update sentiment statistics; unknown labels count as neutral
                sentiment = article.get("sentiment", {})
                classification = sentiment.get("classification", "neutral")
                if classification not in ("positive", "negative"):
                    classification = "neutral"
                sentiment_counts[classification] += 1
                total_polarity += sentiment.get("polarity", 0)
                articles_with_sentiment += 1

        # Calculate average sentiment
        if articles_with_sentiment > 0:
            sentiment_counts["average_polarity"] = total_polarity / articles_with_sentiment

        # Get top keywords and sources
        aggregate["top_keywords"] = sorted(
            keyword_counts.items(),
            key=lambda x: x[1],
            reverse=True
        )[:10]
        aggregate["top_sources"] = sorted(
            source_counts.items(),
            key=lambda x: x[1],
            reverse=True
        )[:5]
        return aggregate

    def _identify_trends(self, analysis: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Identify trends from news analysis.

        Takes the ten most frequent keywords from ``aggregate_stats`` and keeps
        those seen at least twice. No historical comparison is performed.
        """
        keywords = analysis.get("aggregate_stats", {}).get("keywords", {})
        most_frequent = sorted(keywords.items(), key=lambda x: x[1], reverse=True)[:10]
        return [
            {"keyword": keyword, "count": count, "type": "topic"}
            for keyword, count in most_frequent
            if count >= 2  # Only include significant keywords
        ]

    def _calculate_trend_score(self, result: Dict[str, Any]) -> float:
        """Calculate trend score based on article volume and recency.

        Args:
            result: search_news result; articles carry an ISO-8601
                "published_at" string
        Returns:
            Score on a 0-10 scale, rounded to 2 decimals; 0.0 without articles
        """
        articles = result.get("articles")
        if not articles:
            return 0.0

        # Compare timezone-aware datetimes: timestamps ending in "Z" parse as
        # aware values, and comparing those with a naive "now" raises TypeError.
        one_week_ago = datetime.now(timezone.utc) - timedelta(days=7)

        recent_count = 0
        for article in articles:
            published_at = article.get("published_at") or ""
            try:
                published_date = datetime.fromisoformat(published_at.replace("Z", "+00:00"))
            except (ValueError, TypeError, AttributeError):
                # Missing or malformed timestamp: counts toward volume only
                continue
            if published_date.tzinfo is None:
                # Naive timestamps are interpreted as local time
                published_date = published_date.astimezone()
            if published_date >= one_week_ago:
                recent_count += 1

        # Calculate trend score (0-10 scale)
        total_count = len(articles)
        recency_ratio = recent_count / total_count
        trend_score = min((total_count / 10) * (1 + recency_ratio), 10.0)
        return round(trend_score, 2)

    def _extract_top_trends(self, result: Dict[str, Any], limit: int = 5) -> List[Dict[str, Any]]:
        """Extract top trends from news result.

        Args:
            result: search_news result
            limit: Maximum number of trends returned
        Returns:
            Per-keyword dicts (keyword, count, average sentiment polarity, up
            to 3 article URLs), most frequent first
        """
        keyword_stats: Dict[str, Dict[str, Any]] = {}
        for article in result.get("articles", []):
            polarity = article.get("sentiment", {}).get("polarity", 0)
            url = article.get("url", "")
            for keyword in article.get("keywords", []):
                entry = keyword_stats.setdefault(keyword, {
                    "keyword": keyword,
                    "count": 0,
                    "sentiment": 0,
                    "articles": []
                })
                entry["count"] += 1
                entry["sentiment"] += polarity
                entry["articles"].append(url)

        # Turn summed polarity into an average and trim the article lists
        for entry in keyword_stats.values():
            entry["sentiment"] = entry["sentiment"] / entry["count"]
            entry["articles"] = entry["articles"][:3]  # Limit to top 3 articles

        # Sort by count and return top trends
        return sorted(keyword_stats.values(), key=lambda x: x["count"], reverse=True)[:limit]
# Example usage and testing
async def test_news_collector():
    """Smoke-test NewsCollector end to end.

    Runs one feature trend analysis and one industry trend lookup, printing a
    one-line summary of each.

    Returns:
        Tuple of (trend analysis result, industry trends result)
    """
    news = NewsCollector()

    # Feature-level trend analysis over the last 15 days
    print("Testing trend analysis...")
    trend_result = await news.get_trend_analysis(
        "AI voice ordering",
        days_back=15,
    )
    article_total = trend_result.get('aggregate_stats', {}).get('total_articles', 0)
    print(f"Trend analysis: {article_total} articles analyzed")

    # Industry-level trends over the last 30 days
    print("Testing industry trends...")
    industry_result = await news.get_industry_trends("artificial intelligence", "30d")
    trend_score = industry_result.get('trend_score', 0)
    print(f"Industry trends: {trend_score} trend score")

    return trend_result, industry_result
if __name__ == "__main__":
    # Script entry point: run the smoke test to completion on a new event loop.
    asyncio.run(test_news_collector())