""" News API Collector Collects industry news and trends for feature validation """ import asyncio from typing import List, Dict, Any, Optional from datetime import datetime, timedelta import os from newsapi import NewsApiClient import aiohttp from textblob import TextBlob import re from dotenv import load_dotenv from utils.logger import get_logger from utils.nlp_query_enhancer import NLPQueryEnhancer # Load environment variables load_dotenv() logger = get_logger(__name__) class NewsCollector: """Collects and analyzes news for market trends""" def __init__(self): """Initialize News API client""" self.news_api = None self.session = None self.nlp_enhancer = NLPQueryEnhancer() self.setup_news_client() def setup_news_client(self): """Setup News API client with credentials""" logger.log_processing_step("NewsAPI client setup", "started") try: # Try both possible environment variable names api_key = os.getenv("NEWS_API_KEY") or os.getenv("NEWSAPI_KEY") if not api_key: error_msg = "No NewsAPI key found in environment variables (NEWS_API_KEY or NEWSAPI_KEY)" logger.log_api_failure("NewsAPI", error_msg) self.news_api = None return logger.log_api_attempt("NewsAPI", f"with key ending in ...{api_key[-4:]}") self.news_api = NewsApiClient(api_key=api_key) logger.log_api_success("NewsAPI", "client initialized") logger.log_processing_step("NewsAPI client setup", "completed") except Exception as e: error_msg = f"NewsAPI client setup failed: {str(e)}" logger.log_api_failure("NewsAPI", error_msg) logger.log_processing_step("NewsAPI client setup", "failed") self.news_api = None async def __aenter__(self): """Async context manager entry""" connector = aiohttp.TCPConnector( limit=10, limit_per_host=5, ttl_dns_cache=300, use_dns_cache=True, keepalive_timeout=30, enable_cleanup_closed=True ) timeout = aiohttp.ClientTimeout(total=30) self.session = aiohttp.ClientSession( connector=connector, timeout=timeout ) return self async def __aexit__(self, exc_type, exc_val, exc_tb): """Async context manager exit with proper cleanup""" await self.cleanup() async def cleanup(self): """Clean up session and connections properly""" try: if self.session and not self.session.closed: await self.session.close() logger.debug("News collector session closed properly") except Exception as e: logger.debug(f"Error during news collector cleanup: {str(e)}") finally: self.session = None # Give time for connections to close await asyncio.sleep(0.1) def clean_text(self, text: str) -> str: """Clean and preprocess text for analysis""" if not text: return "" # Remove URLs, special characters text = re.sub(r'http\\S+|www\\S+|https\\S+', '', text, flags=re.MULTILINE) text = re.sub(r'[^\\w\\s]', ' ', text) # Remove extra whitespace text = ' '.join(text.split()) return text.strip() def extract_keywords(self, text: str, min_length: int = 4, max_keywords: int = 10) -> List[str]: """Extract important keywords from text""" if not text: return [] try: # Use TextBlob for noun phrase extraction blob = TextBlob(text) # Extract noun phrases and filter for length keywords = [phrase for phrase in blob.noun_phrases if len(phrase) >= min_length] # type: ignore # Return top keywords return keywords[:max_keywords] except Exception as e: logger.error(f"Error extracting keywords: {str(e)}") return [] def optimize_search_query(self, feature_description: str) -> List[str]: """ Optimize search query by breaking down complex descriptions into searchable terms Args: feature_description: Original feature description Returns: List of optimized search queries """ queries = [] # Extract key concepts using common patterns key_terms = [] # Common technology terms tech_terms = ['ai', 'artificial intelligence', 'machine learning', 'ml', 'app', 'mobile', 'web', 'platform', 'software', 'tool', 'system'] # Common business/industry terms business_terms = ['fitness', 'health', 'nutrition', 'training', 'workout', 'exercise', 'diet', 'wellness', 'personal trainer', 'coaching'] # Convert to lowercase for matching description_lower = feature_description.lower() # Extract relevant tech terms for term in tech_terms: if term in description_lower: key_terms.append(term) # Extract relevant business terms for term in business_terms: if term in description_lower: key_terms.append(term) # Create different query strategies if key_terms: # Strategy 1: Core concept combinations if 'ai' in key_terms or 'artificial intelligence' in key_terms: if 'fitness' in key_terms: queries.extend([ 'AI fitness app', 'artificial intelligence fitness', 'AI personal trainer', 'smart fitness technology' ]) if 'nutrition' in key_terms: queries.extend([ 'AI nutrition app', 'artificial intelligence diet', 'smart meal planning' ]) # Strategy 2: Industry-specific terms if 'fitness' in key_terms: queries.extend([ 'fitness app market', 'digital fitness trends', 'fitness technology', 'workout app industry' ]) if 'nutrition' in key_terms: queries.extend([ 'nutrition app market', 'diet tracking technology', 'meal planning apps' ]) # Strategy 3: Broader market terms queries.extend([ 'health tech startup', 'wellness app market', 'fitness industry trends', 'health technology news' ]) # Fallback: Extract noun phrases from original description try: blob = TextBlob(feature_description) noun_phrases = [phrase for phrase in blob.noun_phrases if len(phrase.split()) <= 3] # type: ignore # Add top noun phrases as queries for phrase in noun_phrases[:5]: if len(phrase) > 3: # Skip very short phrases queries.append(phrase) except: pass # Remove duplicates and return unique queries unique_queries = list(dict.fromkeys(queries)) # Limit to reasonable number of queries return unique_queries[:8] def create_search_variations(self, base_query: str) -> List[str]: """ Create variations of a search query for better coverage Args: base_query: Base search query Returns: List of query variations """ variations = [base_query] # Add industry context variations.extend([ f"{base_query} industry", f"{base_query} market", f"{base_query} trends", f"{base_query} startup", f"{base_query} technology" ]) # Add temporal context variations.extend([ f"{base_query} 2024", f"{base_query} latest", f"{base_query} new" ]) return variations def _analyze_article_sentiment(self, text: str) -> Dict[str, Any]: """ Enhanced sentiment analysis specifically tuned for news articles Args: text: Article text (title + description) Returns: Detailed sentiment analysis with improved thresholds """ if not text: return { "polarity": 0.0, "subjectivity": 0.0, "classification": "neutral", "confidence": 0.0, "keyword_sentiment": 0.0, "debug_info": {"error": "Empty text"} } try: # Basic TextBlob analysis blob = TextBlob(text) textblob_polarity = blob.sentiment.polarity # type: ignore textblob_subjectivity = blob.sentiment.subjectivity # type: ignore # Keyword-based sentiment analysis for news content keyword_sentiment = self._analyze_keyword_sentiment(text.lower()) # Combine TextBlob and keyword-based analysis # Give more weight to keyword analysis for news content combined_polarity = (textblob_polarity * 0.6) + (keyword_sentiment * 0.4) # More sensitive thresholds for news articles NEWS_POSITIVE_THRESHOLD = 0.05 # Lowered from 0.1 NEWS_NEGATIVE_THRESHOLD = -0.05 # Lowered from -0.1 # Classify sentiment with new thresholds if combined_polarity >= NEWS_POSITIVE_THRESHOLD: classification = "positive" elif combined_polarity <= NEWS_NEGATIVE_THRESHOLD: classification = "negative" else: classification = "neutral" # Calculate confidence based on distance from neutral confidence = min(abs(combined_polarity) * 2, 1.0) # Scale to 0-1 return { "polarity": combined_polarity, "subjectivity": textblob_subjectivity, "classification": classification, "confidence": confidence, "keyword_sentiment": keyword_sentiment, "debug_info": { "textblob_polarity": textblob_polarity, "keyword_sentiment": keyword_sentiment, "thresholds": { "positive": NEWS_POSITIVE_THRESHOLD, "negative": NEWS_NEGATIVE_THRESHOLD } } } except Exception as e: logger.error(f"Error in enhanced sentiment analysis: {str(e)}") return { "polarity": 0.0, "subjectivity": 0.0, "classification": "neutral", "confidence": 0.0, "keyword_sentiment": 0.0, "debug_info": {"error": str(e)} } def _analyze_keyword_sentiment(self, text: str) -> float: """ Analyze sentiment based on business/news-specific keywords Args: text: Lowercase text to analyze Returns: Sentiment score from keyword analysis (-1.0 to 1.0) """ # Negative keywords with weights negative_keywords = { # Strong negative 'bankruptcy': -0.9, 'fraud': -0.9, 'scandal': -0.8, 'crisis': -0.8, 'collapse': -0.8, 'failure': -0.7, 'lawsuit': -0.7, 'breach': -0.7, 'hack': -0.7, 'cyberattack': -0.7, 'data breach': -0.8, # Moderate negative 'decline': -0.6, 'loss': -0.6, 'drop': -0.5, 'fall': -0.5, 'concern': -0.5, 'risk': -0.5, 'problem': -0.5, 'issue': -0.4, 'challenge': -0.4, 'difficulty': -0.4, 'struggle': -0.5, 'layoff': -0.6, 'shutdown': -0.7, 'closure': -0.6, # Business negative 'recession': -0.7, 'downturn': -0.6, 'crash': -0.8, 'volatile': -0.5, 'uncertainty': -0.4, 'unstable': -0.5, 'disappointing': -0.5, 'missed expectations': -0.6, 'below forecast': -0.5, 'underperform': -0.5 } # Positive keywords with weights positive_keywords = { # Strong positive 'breakthrough': 0.8, 'innovation': 0.7, 'revolutionary': 0.8, 'success': 0.7, 'achievement': 0.6, 'milestone': 0.6, 'record': 0.6, 'best': 0.5, 'excellent': 0.6, # Growth/business positive 'growth': 0.6, 'increase': 0.5, 'rise': 0.5, 'surge': 0.6, 'expansion': 0.6, 'profit': 0.5, 'revenue': 0.4, 'gain': 0.5, 'boost': 0.5, 'improve': 0.5, 'enhance': 0.4, 'opportunity': 0.4, 'potential': 0.4, 'promising': 0.5, # Market positive 'bullish': 0.6, 'optimistic': 0.5, 'confident': 0.5, 'strong': 0.4, 'robust': 0.5, 'solid': 0.4, 'outperform': 0.5, 'exceed expectations': 0.6, 'beat forecast': 0.6, 'above estimates': 0.5 } # Calculate weighted sentiment score sentiment_score = 0.0 total_weight = 0.0 # Check negative keywords for keyword, weight in negative_keywords.items(): if keyword in text: sentiment_score += weight total_weight += abs(weight) # Check positive keywords for keyword, weight in positive_keywords.items(): if keyword in text: sentiment_score += weight total_weight += abs(weight) # Normalize by total weight to prevent extreme scores if total_weight > 0: sentiment_score = sentiment_score / max(total_weight, 1.0) # Ensure score is within bounds return max(-1.0, min(1.0, sentiment_score)) async def search_news(self, query: str, days_back: int = 30, language: str = "en") -> Dict[str, Any]: """ Search for news articles related to a query with enhanced search strategies Args: query: Search query days_back: Number of days to look back language: Language code Returns: News search results or error details """ logger.log_processing_step(f"News search for '{query}'", "started") if not self.news_api: error_msg = "NewsAPI client not initialized - no valid API key available" logger.log_api_failure("NewsAPI", error_msg) logger.log_processing_step(f"News search for '{query}'", "failed") return { "query": query, "error": error_msg, "error_type": "no_api_client", "total_results": 0, "articles": [], "searched_at": datetime.now().isoformat() } try: logger.log_api_attempt("NewsAPI", f"searching for '{query}' ({days_back} days back)") # Calculate date range from_date = (datetime.now() - timedelta(days=days_back)).strftime("%Y-%m-%d") to_date = datetime.now().strftime("%Y-%m-%d") # Try multiple search strategies if the first one fails search_strategies = [ # Strategy 1: Exact query {"q": query, "sort_by": "relevancy"}, # Strategy 2: If query has multiple words, try OR search {"q": " OR ".join(query.split()) if len(query.split()) > 1 else query, "sort_by": "relevancy"}, # Strategy 3: Try with quotes for exact phrase (if not already quoted) {"q": f'"{query}"' if '"' not in query else query, "sort_by": "publishedAt"} ] response = None successful_strategy = None for i, strategy in enumerate(search_strategies): try: response = self.news_api.get_everything( from_param=from_date, to=to_date, language=language, **strategy ) # Check if we got results if response and response.get("totalResults", 0) > 0: successful_strategy = i + 1 logger.info(f"Search strategy {successful_strategy} successful for query: {strategy['q']}") break elif i == 0: # Log if first strategy failed logger.info(f"Search strategy 1 returned 0 results, trying alternative strategies") except Exception as strategy_error: logger.warning(f"Search strategy {i + 1} failed: {str(strategy_error)}") continue # If no strategy worked, use the last response (even if empty) if response is None: response = {"articles": [], "totalResults": 0} # Process and return results articles = response.get("articles", []) total_results = response.get("totalResults", 0) logger.log_api_success("NewsAPI", f"found {total_results} total results, processing {len(articles)} articles") processed_results = { "query": query, "total_results": total_results, "days_back": days_back, "articles": [], "sources": {}, "keywords": {}, "searched_at": datetime.now().isoformat() } # Process each article for article in articles: # Extract article data processed_article = { "title": article.get("title", ""), "source": article.get("source", {}).get("name", "Unknown"), "author": article.get("author", ""), "published_at": article.get("publishedAt", ""), "description": article.get("description", ""), "url": article.get("url", ""), "content": article.get("content", "") } # Extract keywords full_text = f"{processed_article['title']} {processed_article['description']}" keywords = self.extract_keywords(full_text) processed_article["keywords"] = keywords # Enhanced sentiment analysis sentiment_analysis = self._analyze_article_sentiment(full_text) processed_article["sentiment"] = sentiment_analysis # Add to results processed_results["articles"].append(processed_article) # Update source statistics source = processed_article["source"] if source not in processed_results["sources"]: processed_results["sources"][source] = 0 processed_results["sources"][source] += 1 # Update keyword statistics for keyword in keywords: if keyword not in processed_results["keywords"]: processed_results["keywords"][keyword] = 0 processed_results["keywords"][keyword] += 1 # Add sentiment distribution logging for debugging sentiment_debug = {"positive": 0, "negative": 0, "neutral": 0} for article in processed_results["articles"]: classification = article.get("sentiment", {}).get("classification", "neutral") sentiment_debug[classification] += 1 logger.info(f"Sentiment distribution for '{query}': {sentiment_debug}") logger.log_data_collection(f"NewsAPI articles for '{query}'", len(processed_results['articles'])) logger.log_processing_step(f"News search for '{query}'", "completed") return processed_results except Exception as e: error_msg = f"NewsAPI search failed for '{query}': {str(e)}" logger.log_api_failure("NewsAPI", error_msg) logger.log_processing_step(f"News search for '{query}'", "failed") # Return detailed error information return { "query": query, "error": error_msg, "error_type": "api_request_failed", "error_details": str(e), "total_results": 0, "articles": [], "searched_at": datetime.now().isoformat() } async def get_trend_analysis(self, feature_description: str, target_market: str = "", days_back: int = 30) -> Dict[str, Any]: """ Analyze trends for a specific feature using enhanced NLP Args: feature_description: Description of the feature target_market: Target market description days_back: Number of days to look back Returns: Trend analysis """ try: # Use NLP-enhanced query generation enhanced_data = await self.nlp_enhancer.enhance_query(feature_description, target_market) optimized_queries = self.nlp_enhancer.get_optimized_queries_for_platform(enhanced_data, 'news') # If no optimized queries found, fall back to simpler approach if not optimized_queries: # Extract key words from the description words = feature_description.lower().split() key_words = [word for word in words if len(word) > 3 and word not in ['and', 'the', 'for', 'with', 'that', 'this']] # Create basic queries from key words if len(key_words) >= 2: optimized_queries = [ ' '.join(key_words[:2]), ' '.join(key_words[:3]) if len(key_words) >= 3 else ' '.join(key_words), f"{key_words[0]} market" if key_words else "technology market", f"{key_words[0]} industry" if key_words else "technology industry" ] else: optimized_queries = [feature_description] logger.info(f"Using optimized search queries: {optimized_queries}") search_queries = optimized_queries # Collect news for each query query_results = {} successful_queries = [] for query in search_queries: result = await self.search_news(query, days_back) query_results[query] = result # Track queries that returned results if result.get("total_results", 0) > 0: successful_queries.append(query) # If no results from optimized queries, try broader fallback searches if not successful_queries: logger.info("No results from optimized queries, trying broader fallback searches") fallback_queries = [ "fitness app", "health technology", "mobile app market", "wellness industry", "AI technology", "personal training", "nutrition app" ] for query in fallback_queries: if query not in query_results: # Avoid duplicates result = await self.search_news(query, days_back) query_results[query] = result if result.get("total_results", 0) > 0: successful_queries.append(query) # Try a few fallback queries, not just one if len(successful_queries) >= 3: break # Aggregate results trend_analysis = { "feature_description": feature_description, "original_queries": search_queries, "successful_queries": successful_queries, "total_queries_tried": len(query_results), "aggregate_stats": self._aggregate_news_results(query_results), "query_results": query_results, "analyzed_at": datetime.now().isoformat() } # Identify trends trend_analysis["trends"] = self._identify_trends(trend_analysis) return trend_analysis except Exception as e: logger.error(f"Error in trend analysis: {str(e)}") return {"error": str(e)} async def get_industry_trends(self, industry: str, timeframe: str = "30d") -> Dict[str, Any]: """Get general industry trends""" try: # Convert timeframe to days days_mapping = {"7d": 7, "30d": 30, "90d": 90} days_back = days_mapping.get(timeframe, 30) # Search for industry news result = await self.search_news(f"{industry} industry trends", days_back) # Augment with industry-specific analysis result["industry"] = industry result["timeframe"] = timeframe result["trend_score"] = self._calculate_trend_score(result) # Extract top trends result["top_trends"] = self._extract_top_trends(result, 5) return result except Exception as e: logger.error(f"Error getting industry trends: {str(e)}") return {"error": str(e)} def _aggregate_news_results(self, query_results: Dict[str, Any]) -> Dict[str, Any]: """Aggregate results from multiple queries""" aggregate = { "total_articles": 0, "total_sources": 0, "sources": {}, "keywords": {}, "sentiment": { "positive": 0, "negative": 0, "neutral": 0, "average_polarity": 0 } } # Combine results from all queries articles_processed = set() # Track unique articles by URL total_polarity = 0 articles_with_sentiment = 0 for query, result in query_results.items(): if "articles" not in result: continue for article in result["articles"]: # Skip duplicates url = article.get("url", "") if url in articles_processed: continue articles_processed.add(url) aggregate["total_articles"] += 1 # Update source statistics source = article.get("source", "Unknown") if source not in aggregate["sources"]: aggregate["sources"][source] = 0 aggregate["total_sources"] += 1 aggregate["sources"][source] += 1 # Update keyword statistics for keyword in article.get("keywords", []): if keyword not in aggregate["keywords"]: aggregate["keywords"][keyword] = 0 aggregate["keywords"][keyword] += 1 # Update sentiment statistics using new classification sentiment = article.get("sentiment", {}) classification = sentiment.get("classification", "neutral") polarity = sentiment.get("polarity", 0) # Use the enhanced classification if classification == "positive": aggregate["sentiment"]["positive"] += 1 elif classification == "negative": aggregate["sentiment"]["negative"] += 1 else: aggregate["sentiment"]["neutral"] += 1 total_polarity += polarity articles_with_sentiment += 1 # Calculate average sentiment if articles_with_sentiment > 0: aggregate["sentiment"]["average_polarity"] = total_polarity / articles_with_sentiment # Get top keywords and sources aggregate["top_keywords"] = sorted( aggregate["keywords"].items(), key=lambda x: x[1], reverse=True )[:10] aggregate["top_sources"] = sorted( aggregate["sources"].items(), key=lambda x: x[1], reverse=True )[:5] return aggregate def _identify_trends(self, analysis: Dict[str, Any]) -> List[Dict[str, Any]]: """Identify trends from news analysis""" trends = [] # Extract top keywords as trends keywords = analysis.get("aggregate_stats", {}).get("keywords", {}) for keyword, count in sorted(keywords.items(), key=lambda x: x[1], reverse=True)[:10]: # Only include significant keywords if count >= 2: trends.append({ "keyword": keyword, "count": count, "type": "topic" }) # Look for growth/emerging trends # In a real implementation, this would compare against historical data return trends def _calculate_trend_score(self, result: Dict[str, Any]) -> float: """Calculate trend score based on article volume and recency""" if "articles" not in result or not result["articles"]: return 0.0 # Count recent articles (last 7 days) recent_count = 0 total_count = len(result["articles"]) one_week_ago = datetime.now() - timedelta(days=7) for article in result["articles"]: try: published_date = datetime.fromisoformat(article.get("published_at", "").replace("Z", "+00:00")) if published_date >= one_week_ago: recent_count += 1 except ValueError: pass # Calculate trend score (0-10 scale) recency_ratio = recent_count / total_count if total_count > 0 else 0 trend_score = min((total_count / 10) * (1 + recency_ratio), 10.0) return round(trend_score, 2) def _extract_top_trends(self, result: Dict[str, Any], limit: int = 5) -> List[Dict[str, Any]]: """Extract top trends from news result""" trends = [] # Extract keyword trends keywords = {} for article in result.get("articles", []): for keyword in article.get("keywords", []): if keyword not in keywords: keywords[keyword] = { "keyword": keyword, "count": 0, "sentiment": 0, "articles": [] } keywords[keyword]["count"] += 1 keywords[keyword]["sentiment"] += article.get("sentiment", {}).get("polarity", 0) keywords[keyword]["articles"].append(article.get("url", "")) # Calculate average sentiment and sort by count for keyword, data in keywords.items(): if data["count"] > 0: data["sentiment"] = data["sentiment"] / data["count"] data["articles"] = data["articles"][:3] # Limit to top 3 articles trends.append(data) # Sort by count and return top trends return sorted(trends, key=lambda x: x["count"], reverse=True)[:limit] # Example usage and testing async def test_news_collector(): """Test function for NewsCollector""" collector = NewsCollector() # Test trend analysis print("Testing trend analysis...") result = await collector.get_trend_analysis( "AI voice ordering", days_back=15 ) print(f"Trend analysis: {result.get('aggregate_stats', {}).get('total_articles', 0)} articles analyzed") # Test industry trends print("Testing industry trends...") industry_result = await collector.get_industry_trends("artificial intelligence", "30d") print(f"Industry trends: {industry_result.get('trend_score', 0)} trend score") return result, industry_result if __name__ == "__main__": # Run test asyncio.run(test_news_collector())