Spaces:

Agents-MCP-Hackathon
/

iykyk-product-validation-agent

Running

App Files Files Community

iykyk-product-validation-agent / data_collectors /news_collector.py

chuckiykyk

Upload 36 files

72f802a verified 3 months ago

raw

history blame contribute delete

34.8 kB

	"""
	News API Collector
	Collects industry news and trends for feature validation
	"""

	import asyncio
	from typing import List, Dict, Any, Optional
	from datetime import datetime, timedelta
	import os
	from newsapi import NewsApiClient
	import aiohttp
	from textblob import TextBlob
	import re
	from dotenv import load_dotenv
	from utils.logger import get_logger
	from utils.nlp_query_enhancer import NLPQueryEnhancer

	# Load environment variables (python-dotenv reads a local .env file when one
	# is present). This runs at import time, before NewsCollector looks up
	# NEWS_API_KEY / NEWSAPI_KEY via os.getenv.
	load_dotenv()

	# Module-level logger obtained from the project's utils.logger helper; the
	# code below relies on its custom log_* methods as well as the usual
	# debug/info/warning/error calls.
	logger = get_logger(__name__)

	class NewsCollector:
	    """Collect news articles through NewsAPI and derive market-trend signals.

	    The collector wraps a ``NewsApiClient`` (created from the ``NEWS_API_KEY``
	    or ``NEWSAPI_KEY`` environment variable), enriches every article with
	    noun-phrase keywords and a sentiment estimate, and aggregates the results
	    into per-query and cross-query statistics.

	    It can be used as an async context manager; entering opens an
	    ``aiohttp.ClientSession`` stored on ``self.session`` and leaving closes it.
	    """

	    # Keyword -> weight tables used by _analyze_keyword_sentiment. They are
	    # class-level constants so they are built once instead of on every call.
	    _NEGATIVE_KEYWORDS = {
	        # Strong negative
	        'bankruptcy': -0.9, 'fraud': -0.9, 'scandal': -0.8, 'crisis': -0.8,
	        'collapse': -0.8, 'failure': -0.7, 'lawsuit': -0.7, 'breach': -0.7,
	        'hack': -0.7, 'cyberattack': -0.7, 'data breach': -0.8,

	        # Moderate negative
	        'decline': -0.6, 'loss': -0.6, 'drop': -0.5, 'fall': -0.5,
	        'concern': -0.5, 'risk': -0.5, 'problem': -0.5, 'issue': -0.4,
	        'challenge': -0.4, 'difficulty': -0.4, 'struggle': -0.5,
	        'layoff': -0.6, 'shutdown': -0.7, 'closure': -0.6,

	        # Business negative
	        'recession': -0.7, 'downturn': -0.6, 'crash': -0.8,
	        'volatile': -0.5, 'uncertainty': -0.4, 'unstable': -0.5,
	        'disappointing': -0.5, 'missed expectations': -0.6,
	        'below forecast': -0.5, 'underperform': -0.5,
	    }

	    _POSITIVE_KEYWORDS = {
	        # Strong positive
	        'breakthrough': 0.8, 'innovation': 0.7, 'revolutionary': 0.8,
	        'success': 0.7, 'achievement': 0.6, 'milestone': 0.6,
	        'record': 0.6, 'best': 0.5, 'excellent': 0.6,

	        # Growth/business positive
	        'growth': 0.6, 'increase': 0.5, 'rise': 0.5, 'surge': 0.6,
	        'expansion': 0.6, 'profit': 0.5, 'revenue': 0.4, 'gain': 0.5,
	        'boost': 0.5, 'improve': 0.5, 'enhance': 0.4,
	        'opportunity': 0.4, 'potential': 0.4, 'promising': 0.5,

	        # Market positive
	        'bullish': 0.6, 'optimistic': 0.5, 'confident': 0.5,
	        'strong': 0.4, 'robust': 0.5, 'solid': 0.4,
	        'outperform': 0.5, 'exceed expectations': 0.6,
	        'beat forecast': 0.6, 'above estimates': 0.5,
	    }

	    def __init__(self):
	        """Initialize News API client"""
	        self.news_api = None
	        self.session = None
	        self.nlp_enhancer = NLPQueryEnhancer()
	        self.setup_news_client()

	    def setup_news_client(self):
	        """Create the NewsAPI client from environment credentials.

	        On success ``self.news_api`` holds a ``NewsApiClient``. When no key is
	        configured, or the client constructor raises, the failure is logged and
	        ``self.news_api`` is left as ``None`` so that ``search_news`` can report
	        a structured error instead of crashing.
	        """
	        logger.log_processing_step("NewsAPI client setup", "started")

	        try:
	            # Try both possible environment variable names
	            api_key = os.getenv("NEWS_API_KEY") or os.getenv("NEWSAPI_KEY")

	            if not api_key:
	                error_msg = "No NewsAPI key found in environment variables (NEWS_API_KEY or NEWSAPI_KEY)"
	                logger.log_api_failure("NewsAPI", error_msg)
	                self.news_api = None
	                return

	            logger.log_api_attempt("NewsAPI", f"with key ending in ...{api_key[-4:]}")
	            self.news_api = NewsApiClient(api_key=api_key)
	            logger.log_api_success("NewsAPI", "client initialized")
	            logger.log_processing_step("NewsAPI client setup", "completed")

	        except Exception as e:
	            error_msg = f"NewsAPI client setup failed: {str(e)}"
	            logger.log_api_failure("NewsAPI", error_msg)
	            logger.log_processing_step("NewsAPI client setup", "failed")
	            self.news_api = None

	    async def __aenter__(self):
	        """Async context manager entry: open the shared aiohttp session."""
	        connector = aiohttp.TCPConnector(
	            limit=10,
	            limit_per_host=5,
	            ttl_dns_cache=300,
	            use_dns_cache=True,
	            keepalive_timeout=30,
	            enable_cleanup_closed=True
	        )

	        timeout = aiohttp.ClientTimeout(total=30)

	        self.session = aiohttp.ClientSession(
	            connector=connector,
	            timeout=timeout
	        )
	        return self

	    async def __aexit__(self, exc_type, exc_val, exc_tb):
	        """Async context manager exit with proper cleanup"""
	        await self.cleanup()

	    async def cleanup(self):
	        """Close the aiohttp session, if one is open, and drop the reference.

	        Errors while closing are logged at debug level and otherwise ignored so
	        that cleanup never masks the exception that ended the ``async with``.
	        """
	        try:
	            if self.session and not self.session.closed:
	                await self.session.close()
	                logger.debug("News collector session closed properly")
	        except Exception as e:
	            logger.debug(f"Error during news collector cleanup: {str(e)}")
	        finally:
	            self.session = None
	            # Give time for connections to close
	            await asyncio.sleep(0.1)

	    @staticmethod
	    def _contains_term(text: str, term: str) -> bool:
	        """Return True when *term* occurs in *text* at the start of a word.

	        Plain substring tests produce false hits inside unrelated words
	        ('rise' in 'enterprise', 'ai' in 'training'). Anchoring the match at a
	        word boundary removes those while still accepting inflected forms such
	        as 'successful' or 'risks'. Terms of one or two characters ('ai',
	        'ml') must additionally end at a word boundary, because as prefixes
	        they would still match words like 'aim'.
	        """
	        pattern = r"\b" + re.escape(term)
	        if len(term) <= 2:
	            pattern += r"\b"
	        return re.search(pattern, text) is not None

	    def clean_text(self, text: str) -> str:
	        """Strip URLs and punctuation from *text* and collapse whitespace.

	        Args:
	            text: Raw text; ``None`` or an empty string yields ``""``.

	        Returns:
	            The cleaned text, with every non-word, non-space character
	            replaced by a space and runs of whitespace reduced to one space.
	        """
	        if not text:
	            return ""

	        # Remove URLs, then special characters. These are raw strings, so the
	        # regex escapes take a single backslash; a doubled one would match a
	        # literal backslash character instead.
	        text = re.sub(r'http\S+|www\S+', '', text, flags=re.MULTILINE)
	        text = re.sub(r'[^\w\s]', ' ', text)

	        # Remove extra whitespace (split/join also trims both ends)
	        return ' '.join(text.split())

	    def extract_keywords(self, text: str, min_length: int = 4, max_keywords: int = 10) -> List[str]:
	        """Extract noun-phrase keywords from *text* using TextBlob.

	        Args:
	            text: Text to analyze.
	            min_length: Minimum number of characters a phrase must have.
	            max_keywords: Maximum number of phrases returned.

	        Returns:
	            Up to ``max_keywords`` noun phrases in the order TextBlob reports
	            them; an empty list for empty input or when extraction fails.
	        """
	        if not text:
	            return []

	        try:
	            # Use TextBlob for noun phrase extraction
	            blob = TextBlob(text)

	            # Extract noun phrases and filter for length
	            keywords = [phrase for phrase in blob.noun_phrases if len(phrase) >= min_length]  # type: ignore

	            # Return top keywords
	            return keywords[:max_keywords]

	        except Exception as e:
	            logger.error(f"Error extracting keywords: {str(e)}")
	            return []

	    def optimize_search_query(self, feature_description: str) -> List[str]:
	        """
	        Optimize search query by breaking down complex descriptions into searchable terms

	        Known technology and business terms are detected as words (not as
	        arbitrary substrings) and mapped to canned queries; noun phrases of the
	        description are appended afterwards.

	        Args:
	            feature_description: Original feature description

	        Returns:
	            List of at most 8 unique search queries, in priority order
	        """
	        queries = []

	        # Common technology terms
	        tech_terms = ['ai', 'artificial intelligence', 'machine learning', 'ml', 'app', 'mobile', 'web', 'platform', 'software', 'tool', 'system']

	        # Common business/industry terms
	        business_terms = ['fitness', 'health', 'nutrition', 'training', 'workout', 'exercise', 'diet', 'wellness', 'personal trainer', 'coaching']

	        # Convert to lowercase for matching
	        description_lower = feature_description.lower()

	        # Extract the terms the description actually mentions
	        key_terms = [
	            term for term in tech_terms + business_terms
	            if self._contains_term(description_lower, term)
	        ]

	        # Create different query strategies
	        if key_terms:
	            # Strategy 1: Core concept combinations
	            if 'ai' in key_terms or 'artificial intelligence' in key_terms:
	                if 'fitness' in key_terms:
	                    queries.extend([
	                        'AI fitness app',
	                        'artificial intelligence fitness',
	                        'AI personal trainer',
	                        'smart fitness technology'
	                    ])
	                if 'nutrition' in key_terms:
	                    queries.extend([
	                        'AI nutrition app',
	                        'artificial intelligence diet',
	                        'smart meal planning'
	                    ])

	            # Strategy 2: Industry-specific terms
	            if 'fitness' in key_terms:
	                queries.extend([
	                    'fitness app market',
	                    'digital fitness trends',
	                    'fitness technology',
	                    'workout app industry'
	                ])

	            if 'nutrition' in key_terms:
	                queries.extend([
	                    'nutrition app market',
	                    'diet tracking technology',
	                    'meal planning apps'
	                ])

	            # Strategy 3: Broader market terms
	            queries.extend([
	                'health tech startup',
	                'wellness app market',
	                'fitness industry trends',
	                'health technology news'
	            ])

	        # Fallback: Extract noun phrases from original description
	        try:
	            blob = TextBlob(feature_description)
	            noun_phrases = [phrase for phrase in blob.noun_phrases if len(phrase.split()) <= 3]  # type: ignore

	            # Add top noun phrases as queries
	            for phrase in noun_phrases[:5]:
	                if len(phrase) > 3:  # Skip very short phrases
	                    queries.append(phrase)
	        except Exception as e:
	            # Best effort only: the canned queries above are still usable.
	            logger.debug(f"Noun phrase extraction failed: {str(e)}")

	        # Remove duplicates (keeping first occurrence) and cap the list
	        unique_queries = list(dict.fromkeys(queries))
	        return unique_queries[:8]

	    def create_search_variations(self, base_query: str) -> List[str]:
	        """
	        Create variations of a search query for better coverage

	        Args:
	            base_query: Base search query

	        Returns:
	            List of query variations: the base query followed by industry and
	            temporal variants (the year variant uses the current year)
	        """
	        industry_suffixes = ["industry", "market", "trends", "startup", "technology"]
	        # Use the current year rather than a fixed one so the variant does not
	        # go stale.
	        temporal_suffixes = [str(datetime.now().year), "latest", "new"]

	        variations = [base_query]
	        variations.extend(
	            f"{base_query} {suffix}"
	            for suffix in industry_suffixes + temporal_suffixes
	        )
	        return variations

	    def _analyze_article_sentiment(self, text: str) -> Dict[str, Any]:
	        """
	        Enhanced sentiment analysis specifically tuned for news articles

	        Combines TextBlob polarity (weight 0.6) with the keyword-based score
	        from ``_analyze_keyword_sentiment`` (weight 0.4).

	        Args:
	            text: Article text (title + description)

	        Returns:
	            Dict with ``polarity``, ``subjectivity``, ``classification``
	            ("positive" / "negative" / "neutral"), ``confidence`` (0-1),
	            ``keyword_sentiment`` and ``debug_info``. Empty input or an
	            internal error yields a neutral result whose ``debug_info``
	            carries an ``error`` entry.
	        """
	        if not text:
	            return {
	                "polarity": 0.0,
	                "subjectivity": 0.0,
	                "classification": "neutral",
	                "confidence": 0.0,
	                "keyword_sentiment": 0.0,
	                "debug_info": {"error": "Empty text"}
	            }

	        try:
	            # Basic TextBlob analysis
	            blob = TextBlob(text)
	            textblob_polarity = blob.sentiment.polarity  # type: ignore
	            textblob_subjectivity = blob.sentiment.subjectivity  # type: ignore

	            # Keyword-based sentiment analysis for news content
	            keyword_sentiment = self._analyze_keyword_sentiment(text.lower())

	            # Blend both signals; TextBlob carries the larger share (60/40).
	            combined_polarity = (textblob_polarity * 0.6) + (keyword_sentiment * 0.4)

	            # More sensitive thresholds for news articles
	            NEWS_POSITIVE_THRESHOLD = 0.05  # Lowered from 0.1
	            NEWS_NEGATIVE_THRESHOLD = -0.05  # Lowered from -0.1

	            # Classify sentiment with new thresholds
	            if combined_polarity >= NEWS_POSITIVE_THRESHOLD:
	                classification = "positive"
	            elif combined_polarity <= NEWS_NEGATIVE_THRESHOLD:
	                classification = "negative"
	            else:
	                classification = "neutral"

	            # Calculate confidence based on distance from neutral
	            confidence = min(abs(combined_polarity) * 2, 1.0)  # Scale to 0-1

	            return {
	                "polarity": combined_polarity,
	                "subjectivity": textblob_subjectivity,
	                "classification": classification,
	                "confidence": confidence,
	                "keyword_sentiment": keyword_sentiment,
	                "debug_info": {
	                    "textblob_polarity": textblob_polarity,
	                    "keyword_sentiment": keyword_sentiment,
	                    "thresholds": {
	                        "positive": NEWS_POSITIVE_THRESHOLD,
	                        "negative": NEWS_NEGATIVE_THRESHOLD
	                    }
	                }
	            }

	        except Exception as e:
	            logger.error(f"Error in enhanced sentiment analysis: {str(e)}")
	            return {
	                "polarity": 0.0,
	                "subjectivity": 0.0,
	                "classification": "neutral",
	                "confidence": 0.0,
	                "keyword_sentiment": 0.0,
	                "debug_info": {"error": str(e)}
	            }

	    def _analyze_keyword_sentiment(self, text: str) -> float:
	        """
	        Analyze sentiment based on business/news-specific keywords

	        Keywords are matched at word starts only, so 'rise' no longer fires on
	        'enterprise' nor 'gain' on 'against', while inflected forms such as
	        'risks' or 'successful' still count.

	        Args:
	            text: Lowercase text to analyze

	        Returns:
	            Sentiment score from keyword analysis (-1.0 to 1.0); 0.0 when no
	            keyword is present
	        """
	        sentiment_score = 0.0
	        total_weight = 0.0

	        # Negative table first, then positive: keeps the summation order stable.
	        for table in (self._NEGATIVE_KEYWORDS, self._POSITIVE_KEYWORDS):
	            for keyword, weight in table.items():
	                if NewsCollector._contains_term(text, keyword):
	                    sentiment_score += weight
	                    total_weight += abs(weight)

	        # Normalize by total weight to prevent extreme scores; the floor of 1.0
	        # keeps a single weak keyword from being scaled up to +/-1.
	        if total_weight > 0:
	            sentiment_score = sentiment_score / max(total_weight, 1.0)

	        # Ensure score is within bounds
	        return max(-1.0, min(1.0, sentiment_score))

	    def _process_article(self, article: Dict[str, Any]) -> Dict[str, Any]:
	        """Normalize one raw NewsAPI article and attach keywords and sentiment.

	        NewsAPI may send explicit nulls for any field; those are mapped to the
	        same defaults as missing fields so that the literal text "None" never
	        ends up in the analyzed text.
	        """
	        source_info = article.get("source") or {}
	        processed_article = {
	            "title": article.get("title") or "",
	            "source": source_info.get("name") or "Unknown",
	            "author": article.get("author") or "",
	            "published_at": article.get("publishedAt") or "",
	            "description": article.get("description") or "",
	            "url": article.get("url") or "",
	            "content": article.get("content") or ""
	        }

	        full_text = f"{processed_article['title']} {processed_article['description']}"
	        processed_article["keywords"] = self.extract_keywords(full_text)
	        processed_article["sentiment"] = self._analyze_article_sentiment(full_text)
	        return processed_article

	    async def search_news(self, query: str, days_back: int = 30, language: str = "en") -> Dict[str, Any]:
	        """
	        Search for news articles related to a query with enhanced search strategies

	        Up to three query formulations are tried in order (as given, OR-joined
	        words, quoted phrase) until one returns results.

	        NOTE(review): ``get_everything`` is called directly from this coroutine.
	        If the client performs blocking HTTP (not visible from this file), the
	        event loop stalls for the duration of each request - confirm.

	        Args:
	            query: Search query
	            days_back: Number of days to look back
	            language: Language code

	        Returns:
	            On success a dict with ``query``, ``total_results``, ``days_back``,
	            ``articles`` (processed articles), ``sources`` and ``keywords``
	            (occurrence counts) and ``searched_at``. On failure a dict with
	            ``error`` / ``error_type`` and empty ``articles``.
	        """
	        logger.log_processing_step(f"News search for '{query}'", "started")

	        if not self.news_api:
	            error_msg = "NewsAPI client not initialized - no valid API key available"
	            logger.log_api_failure("NewsAPI", error_msg)
	            logger.log_processing_step(f"News search for '{query}'", "failed")
	            return {
	                "query": query,
	                "error": error_msg,
	                "error_type": "no_api_client",
	                "total_results": 0,
	                "articles": [],
	                "searched_at": datetime.now().isoformat()
	            }

	        try:
	            logger.log_api_attempt("NewsAPI", f"searching for '{query}' ({days_back} days back)")

	            # Calculate date range
	            from_date = (datetime.now() - timedelta(days=days_back)).strftime("%Y-%m-%d")
	            to_date = datetime.now().strftime("%Y-%m-%d")

	            # Try multiple search strategies if the first one fails
	            query_words = query.split()
	            search_strategies = [
	                # Strategy 1: Exact query
	                {"q": query, "sort_by": "relevancy"},
	                # Strategy 2: If query has multiple words, try OR search
	                {"q": " OR ".join(query_words) if len(query_words) > 1 else query, "sort_by": "relevancy"},
	                # Strategy 3: Try with quotes for exact phrase (if not already quoted)
	                {"q": f'"{query}"' if '"' not in query else query, "sort_by": "publishedAt"}
	            ]

	            response = None
	            attempted = []

	            for i, strategy in enumerate(search_strategies):
	                # For single-word queries strategy 2 equals strategy 1;
	                # repeating the identical request would only burn API quota.
	                if strategy in attempted:
	                    continue
	                attempted.append(strategy)

	                try:
	                    response = self.news_api.get_everything(
	                        from_param=from_date,
	                        to=to_date,
	                        language=language,
	                        **strategy
	                    )

	                    # Check if we got results
	                    if response and response.get("totalResults", 0) > 0:
	                        successful_strategy = i + 1
	                        logger.info(f"Search strategy {successful_strategy} successful for query: {strategy['q']}")
	                        break
	                    elif i == 0:  # Log if first strategy failed
	                        logger.info("Search strategy 1 returned 0 results, trying alternative strategies")

	                except Exception as strategy_error:
	                    logger.warning(f"Search strategy {i + 1} failed: {str(strategy_error)}")
	                    continue

	            # If no strategy worked, use the last response (even if empty)
	            if response is None:
	                response = {"articles": [], "totalResults": 0}

	            # Process and return results
	            articles = response.get("articles", [])
	            total_results = response.get("totalResults", 0)

	            logger.log_api_success("NewsAPI", f"found {total_results} total results, processing {len(articles)} articles")

	            processed_results = {
	                "query": query,
	                "total_results": total_results,
	                "days_back": days_back,
	                "articles": [],
	                "sources": {},
	                "keywords": {},
	                "searched_at": datetime.now().isoformat()
	            }
	            source_counts = processed_results["sources"]
	            keyword_counts = processed_results["keywords"]

	            # Sentiment distribution is logged for debugging
	            sentiment_debug = {"positive": 0, "negative": 0, "neutral": 0}

	            for article in articles:
	                processed_article = self._process_article(article)
	                processed_results["articles"].append(processed_article)

	                # Update source statistics
	                source = processed_article["source"]
	                source_counts[source] = source_counts.get(source, 0) + 1

	                # Update keyword statistics
	                for keyword in processed_article["keywords"]:
	                    keyword_counts[keyword] = keyword_counts.get(keyword, 0) + 1

	                classification = processed_article.get("sentiment", {}).get("classification", "neutral")
	                sentiment_debug[classification] += 1

	            logger.info(f"Sentiment distribution for '{query}': {sentiment_debug}")
	            logger.log_data_collection(f"NewsAPI articles for '{query}'", len(processed_results['articles']))
	            logger.log_processing_step(f"News search for '{query}'", "completed")

	            return processed_results

	        except Exception as e:
	            error_msg = f"NewsAPI search failed for '{query}': {str(e)}"
	            logger.log_api_failure("NewsAPI", error_msg)
	            logger.log_processing_step(f"News search for '{query}'", "failed")

	            # Return detailed error information
	            return {
	                "query": query,
	                "error": error_msg,
	                "error_type": "api_request_failed",
	                "error_details": str(e),
	                "total_results": 0,
	                "articles": [],
	                "searched_at": datetime.now().isoformat()
	            }

	    async def get_trend_analysis(self, feature_description: str, target_market: str = "", days_back: int = 30) -> Dict[str, Any]:
	        """
	        Analyze trends for a specific feature using enhanced NLP

	        Queries come from the NLP enhancer; if it yields none, simple queries
	        are built from the description's longer words. If no query returns
	        articles, a fixed list of broad fallback queries is tried until three
	        of them succeed.

	        Args:
	            feature_description: Description of the feature
	            target_market: Target market description
	            days_back: Number of days to look back

	        Returns:
	            Trend analysis dict (queries tried, per-query results, aggregate
	            statistics, identified trends), or ``{"error": ...}`` on failure
	        """
	        try:
	            # Use NLP-enhanced query generation
	            enhanced_data = await self.nlp_enhancer.enhance_query(feature_description, target_market)
	            optimized_queries = self.nlp_enhancer.get_optimized_queries_for_platform(enhanced_data, 'news')

	            # If no optimized queries found, fall back to simpler approach
	            if not optimized_queries:
	                # Extract key words from the description
	                words = feature_description.lower().split()
	                key_words = [word for word in words if len(word) > 3 and word not in ['and', 'the', 'for', 'with', 'that', 'this']]

	                # Create basic queries from key words
	                if len(key_words) >= 2:
	                    optimized_queries = [
	                        ' '.join(key_words[:2]),
	                        ' '.join(key_words[:3]) if len(key_words) >= 3 else ' '.join(key_words),
	                        f"{key_words[0]} market" if key_words else "technology market",
	                        f"{key_words[0]} industry" if key_words else "technology industry"
	                    ]
	                else:
	                    optimized_queries = [feature_description]

	            logger.info(f"Using optimized search queries: {optimized_queries}")
	            search_queries = optimized_queries

	            # Collect news for each query
	            query_results = {}
	            successful_queries = []

	            for query in search_queries:
	                result = await self.search_news(query, days_back)
	                query_results[query] = result

	                # Track queries that returned results
	                if result.get("total_results", 0) > 0:
	                    successful_queries.append(query)

	            # If no results from optimized queries, try broader fallback searches
	            if not successful_queries:
	                logger.info("No results from optimized queries, trying broader fallback searches")
	                fallback_queries = [
	                    "fitness app",
	                    "health technology",
	                    "mobile app market",
	                    "wellness industry",
	                    "AI technology",
	                    "personal training",
	                    "nutrition app"
	                ]

	                for query in fallback_queries:
	                    if query in query_results:  # Avoid duplicates
	                        continue

	                    result = await self.search_news(query, days_back)
	                    query_results[query] = result

	                    if result.get("total_results", 0) > 0:
	                        successful_queries.append(query)
	                        # Try a few fallback queries, not just one
	                        if len(successful_queries) >= 3:
	                            break

	            # Aggregate results
	            trend_analysis = {
	                "feature_description": feature_description,
	                "original_queries": search_queries,
	                "successful_queries": successful_queries,
	                "total_queries_tried": len(query_results),
	                "aggregate_stats": self._aggregate_news_results(query_results),
	                "query_results": query_results,
	                "analyzed_at": datetime.now().isoformat()
	            }

	            # Identify trends
	            trend_analysis["trends"] = self._identify_trends(trend_analysis)

	            return trend_analysis

	        except Exception as e:
	            logger.error(f"Error in trend analysis: {str(e)}")
	            return {"error": str(e)}

	    async def get_industry_trends(self, industry: str, timeframe: str = "30d") -> Dict[str, Any]:
	        """Get general industry trends.

	        Args:
	            industry: Industry name; searched as "<industry> industry trends".
	            timeframe: "7d", "30d" or "90d"; any other value means 30 days.

	        Returns:
	            The ``search_news`` result extended with ``industry``,
	            ``timeframe``, ``trend_score`` and ``top_trends``, or
	            ``{"error": ...}`` on failure.
	        """
	        try:
	            # Convert timeframe to days
	            days_mapping = {"7d": 7, "30d": 30, "90d": 90}
	            days_back = days_mapping.get(timeframe, 30)

	            # Search for industry news
	            result = await self.search_news(f"{industry} industry trends", days_back)

	            # Augment with industry-specific analysis
	            result["industry"] = industry
	            result["timeframe"] = timeframe
	            result["trend_score"] = self._calculate_trend_score(result)

	            # Extract top trends
	            result["top_trends"] = self._extract_top_trends(result, 5)

	            return result

	        except Exception as e:
	            logger.error(f"Error getting industry trends: {str(e)}")
	            return {"error": str(e)}

	    def _aggregate_news_results(self, query_results: Dict[str, Any]) -> Dict[str, Any]:
	        """Aggregate results from multiple queries.

	        Articles are de-duplicated by URL across queries. Articles without a
	        URL cannot be identified as duplicates and are therefore all counted.

	        Args:
	            query_results: Mapping of query -> ``search_news`` result.

	        Returns:
	            Dict with article/source totals, per-source and per-keyword counts,
	            sentiment class counts with ``average_polarity``, plus
	            ``top_keywords`` (10) and ``top_sources`` (5) as (name, count)
	            pairs sorted by descending count.
	        """
	        aggregate = {
	            "total_articles": 0,
	            "total_sources": 0,
	            "sources": {},
	            "keywords": {},
	            "sentiment": {
	                "positive": 0,
	                "negative": 0,
	                "neutral": 0,
	                "average_polarity": 0
	            }
	        }

	        seen_urls = set()  # Track unique articles by URL
	        total_polarity = 0
	        articles_with_sentiment = 0

	        for result in query_results.values():
	            if "articles" not in result:
	                continue

	            for article in result["articles"]:
	                # Skip duplicates; an empty URL identifies nothing, so it must
	                # not make every URL-less article look like a repeat.
	                url = article.get("url", "")
	                if url:
	                    if url in seen_urls:
	                        continue
	                    seen_urls.add(url)

	                aggregate["total_articles"] += 1

	                # Update source statistics
	                source = article.get("source", "Unknown")
	                if source not in aggregate["sources"]:
	                    aggregate["sources"][source] = 0
	                    aggregate["total_sources"] += 1
	                aggregate["sources"][source] += 1

	                # Update keyword statistics
	                for keyword in article.get("keywords", []):
	                    aggregate["keywords"][keyword] = aggregate["keywords"].get(keyword, 0) + 1

	                # Update sentiment statistics using the stored classification;
	                # anything other than positive/negative counts as neutral.
	                sentiment = article.get("sentiment", {})
	                classification = sentiment.get("classification", "neutral")
	                if classification not in ("positive", "negative"):
	                    classification = "neutral"
	                aggregate["sentiment"][classification] += 1

	                total_polarity += sentiment.get("polarity", 0)
	                articles_with_sentiment += 1

	        # Calculate average sentiment
	        if articles_with_sentiment > 0:
	            aggregate["sentiment"]["average_polarity"] = total_polarity / articles_with_sentiment

	        # Get top keywords and sources
	        aggregate["top_keywords"] = sorted(
	            aggregate["keywords"].items(),
	            key=lambda x: x[1],
	            reverse=True
	        )[:10]

	        aggregate["top_sources"] = sorted(
	            aggregate["sources"].items(),
	            key=lambda x: x[1],
	            reverse=True
	        )[:5]

	        return aggregate

	    def _identify_trends(self, analysis: Dict[str, Any]) -> List[Dict[str, Any]]:
	        """Identify trends from news analysis.

	        Args:
	            analysis: Trend-analysis dict containing ``aggregate_stats``.

	        Returns:
	            One ``{"keyword", "count", "type": "topic"}`` entry for each of the
	            ten most frequent keywords that occurs at least twice.
	        """
	        keywords = analysis.get("aggregate_stats", {}).get("keywords", {})
	        ranked = sorted(keywords.items(), key=lambda x: x[1], reverse=True)[:10]

	        # Only include significant keywords (seen in at least two articles).
	        # Growth/emerging trends would need historical data to compare against
	        # and are not computed here.
	        return [
	            {"keyword": keyword, "count": count, "type": "topic"}
	            for keyword, count in ranked
	            if count >= 2
	        ]

	    def _calculate_trend_score(self, result: Dict[str, Any]) -> float:
	        """Calculate trend score based on article volume and recency.

	        Args:
	            result: ``search_news`` result; each article's ``published_at`` is
	                expected to be an ISO-8601 timestamp (a trailing "Z" is
	                accepted). Missing or unparsable timestamps count towards the
	                volume but not towards recency.

	        Returns:
	            Score on a 0-10 scale, rounded to two decimals:
	            ``min(total / 10 * (1 + recent_ratio), 10)`` where ``recent_ratio``
	            is the share of articles published within the last 7 days.
	        """
	        from datetime import timezone

	        if "articles" not in result or not result["articles"]:
	            return 0.0

	        # Count recent articles (last 7 days)
	        recent_count = 0
	        total_count = len(result["articles"])

	        # Timestamps ending in "Z" parse to timezone-aware datetimes, so the
	        # cutoff must be aware too: comparing aware with naive datetimes
	        # raises TypeError.
	        one_week_ago = datetime.now(timezone.utc) - timedelta(days=7)

	        for article in result["articles"]:
	            raw_timestamp = article.get("published_at") or ""
	            try:
	                published_date = datetime.fromisoformat(raw_timestamp.replace("Z", "+00:00"))
	            except (ValueError, AttributeError):
	                # Empty, malformed or non-string timestamp
	                continue

	            if published_date.tzinfo is None:
	                # Naive timestamps are interpreted as local time
	                published_date = published_date.astimezone()

	            if published_date >= one_week_ago:
	                recent_count += 1

	        # Calculate trend score (0-10 scale)
	        recency_ratio = recent_count / total_count if total_count > 0 else 0
	        trend_score = min((total_count / 10) * (1 + recency_ratio), 10.0)

	        return round(trend_score, 2)

	    def _extract_top_trends(self, result: Dict[str, Any], limit: int = 5) -> List[Dict[str, Any]]:
	        """Extract top trends from news result.

	        Args:
	            result: ``search_news`` result.
	            limit: Maximum number of trends returned.

	        Returns:
	            Up to ``limit`` dicts ``{"keyword", "count", "sentiment",
	            "articles"}`` sorted by descending count, where ``sentiment`` is
	            the mean polarity of the articles mentioning the keyword and
	            ``articles`` holds at most three of their URLs.
	        """
	        keywords = {}
	        for article in result.get("articles", []):
	            polarity = article.get("sentiment", {}).get("polarity", 0)
	            url = article.get("url", "")

	            for keyword in article.get("keywords", []):
	                entry = keywords.setdefault(keyword, {
	                    "keyword": keyword,
	                    "count": 0,
	                    "sentiment": 0,
	                    "articles": []
	                })
	                entry["count"] += 1
	                entry["sentiment"] += polarity
	                entry["articles"].append(url)

	        # Turn summed polarity into an average and trim the URL lists
	        trends = []
	        for data in keywords.values():
	            if data["count"] > 0:
	                data["sentiment"] = data["sentiment"] / data["count"]
	            data["articles"] = data["articles"][:3]  # Limit to top 3 articles
	            trends.append(data)

	        # Sort by count and return top trends
	        return sorted(trends, key=lambda x: x["count"], reverse=True)[:limit]


	# Example usage and testing
	async def test_news_collector():
	    """Manual smoke test: run both NewsCollector analyses and print summaries.

	    Returns:
	        Tuple ``(trend_analysis, industry_trends)`` with the raw result dicts.
	    """
	    news = NewsCollector()

	    # Trend analysis for a sample feature description
	    print("Testing trend analysis...")
	    trend_report = await news.get_trend_analysis("AI voice ordering", days_back=15)
	    articles_analyzed = trend_report.get('aggregate_stats', {}).get('total_articles', 0)
	    print(f"Trend analysis: {articles_analyzed} articles analyzed")

	    # Industry-level trends over the last 30 days
	    print("Testing industry trends...")
	    industry_report = await news.get_industry_trends("artificial intelligence", "30d")
	    trend_score = industry_report.get('trend_score', 0)
	    print(f"Industry trends: {trend_score} trend score")

	    return trend_report, industry_report

	if __name__ == "__main__":
	    # Executed as a script: build the smoke-test coroutine and drive it to
	    # completion on a fresh event loop.
	    smoke_test = test_news_collector()
	    asyncio.run(smoke_test)