Spaces:
Runtime error
Runtime error
Sigrid De los Santos
committed on
Commit
·
edbb216
1
Parent(s):
215f78a
fixing date issues
Browse files — src/news_analysis.py (+76 −7)
src/news_analysis.py
CHANGED
|
@@ -212,10 +212,12 @@ def tavily_search(query, days, max_results=10):
|
|
| 212 |
response = requests.post(url, json=payload, headers=headers)
|
| 213 |
return response.json()
|
| 214 |
|
|
|
|
| 215 |
|
| 216 |
def fetch_deep_news(topic, days):
|
| 217 |
all_results = []
|
| 218 |
seen_urls = set()
|
|
|
|
| 219 |
|
| 220 |
base_queries = [
|
| 221 |
topic,
|
|
@@ -251,13 +253,28 @@ def fetch_deep_news(topic, days):
|
|
| 251 |
for item in response.get("results", []):
|
| 252 |
url = item.get("url")
|
| 253 |
content = item.get("content", "") or item.get("summary", "") or item.get("title", "")
|
| 254 |
-
if url
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 261 |
|
| 262 |
except Exception as e:
|
| 263 |
print(f"⚠️ Tavily request failed for query '{query}': {e}")
|
|
@@ -265,6 +282,58 @@ def fetch_deep_news(topic, days):
|
|
| 265 |
print(f"📰 Total articles collected: {len(all_results)}")
|
| 266 |
return all_results
|
| 267 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 268 |
# === Generate Markdown Report ===
|
| 269 |
def generate_value_investor_report(topic, news_results, max_articles=20, max_chars_per_article=400):
|
| 270 |
news_results = news_results[:max_articles]
|
|
|
|
| 212 |
response = requests.post(url, json=payload, headers=headers)
|
| 213 |
return response.json()
|
| 214 |
|
| 215 |
+
from datetime import datetime, timedelta # Ensure this is at the top
|
| 216 |
|
| 217 |
def fetch_deep_news(topic, days):
|
| 218 |
all_results = []
|
| 219 |
seen_urls = set()
|
| 220 |
+
cutoff_date = datetime.now() - timedelta(days=days)
|
| 221 |
|
| 222 |
base_queries = [
|
| 223 |
topic,
|
|
|
|
| 253 |
for item in response.get("results", []):
|
| 254 |
url = item.get("url")
|
| 255 |
content = item.get("content", "") or item.get("summary", "") or item.get("title", "")
|
| 256 |
+
if not url or url in seen_urls or len(content) <= 150:
|
| 257 |
+
continue
|
| 258 |
+
|
| 259 |
+
pub_date_str = item.get("published_date")
|
| 260 |
+
if pub_date_str:
|
| 261 |
+
try:
|
| 262 |
+
pub_date = datetime.fromisoformat(pub_date_str.rstrip("Z"))
|
| 263 |
+
if pub_date < cutoff_date:
|
| 264 |
+
continue # Skip articles too old
|
| 265 |
+
date_str = pub_date.strftime("%Y-%m-%d")
|
| 266 |
+
except Exception:
|
| 267 |
+
date_str = "Unknown"
|
| 268 |
+
else:
|
| 269 |
+
date_str = "Unknown"
|
| 270 |
+
|
| 271 |
+
all_results.append({
|
| 272 |
+
"title": item.get("title"),
|
| 273 |
+
"url": url,
|
| 274 |
+
"content": content,
|
| 275 |
+
"date": date_str
|
| 276 |
+
})
|
| 277 |
+
seen_urls.add(url)
|
| 278 |
|
| 279 |
except Exception as e:
|
| 280 |
print(f"⚠️ Tavily request failed for query '{query}': {e}")
|
|
|
|
| 282 |
print(f"📰 Total articles collected: {len(all_results)}")
|
| 283 |
return all_results
|
| 284 |
|
| 285 |
+
# def fetch_deep_news(topic, days):
|
| 286 |
+
# all_results = []
|
| 287 |
+
# seen_urls = set()
|
| 288 |
+
|
| 289 |
+
# base_queries = [
|
| 290 |
+
# topic,
|
| 291 |
+
# f"{topic} AND startup",
|
| 292 |
+
# f"{topic} AND acquisition OR merger OR funding",
|
| 293 |
+
# f"{topic} AND CEO OR executive OR leadership",
|
| 294 |
+
# f"{topic} AND venture capital OR Series A OR Series B",
|
| 295 |
+
# f"{topic} AND government grant OR approval OR contract",
|
| 296 |
+
# f"{topic} AND underrated OR small-cap OR micro-cap"
|
| 297 |
+
# ]
|
| 298 |
+
|
| 299 |
+
# investor_queries = [
|
| 300 |
+
# f"{topic} AND BlackRock OR Vanguard OR SoftBank",
|
| 301 |
+
# f"{topic} AND Elon Musk OR Sam Altman OR Peter Thiel",
|
| 302 |
+
# f"{topic} AND Berkshire Hathaway OR Warren Buffett",
|
| 303 |
+
# f"{topic} AND institutional investor OR hedge fund",
|
| 304 |
+
# ]
|
| 305 |
+
|
| 306 |
+
# related_terms = get_related_terms(topic)
|
| 307 |
+
# synonym_queries = [f"{term} AND {kw}" for term in related_terms for kw in ["startup", "funding", "merger", "acquisition"]]
|
| 308 |
+
|
| 309 |
+
# all_queries = base_queries + investor_queries + synonym_queries
|
| 310 |
+
|
| 311 |
+
# for query in all_queries:
|
| 312 |
+
# try:
|
| 313 |
+
# print(f"🔍 Tavily query: {query}")
|
| 314 |
+
# response = tavily_search(query, days)
|
| 315 |
+
|
| 316 |
+
# if not isinstance(response, dict) or "results" not in response:
|
| 317 |
+
# print(f"⚠️ Tavily API response issue: {response}")
|
| 318 |
+
# continue
|
| 319 |
+
|
| 320 |
+
# for item in response.get("results", []):
|
| 321 |
+
# url = item.get("url")
|
| 322 |
+
# content = item.get("content", "") or item.get("summary", "") or item.get("title", "")
|
| 323 |
+
# if url and url not in seen_urls and len(content) > 150:
|
| 324 |
+
# all_results.append({
|
| 325 |
+
# "title": item.get("title"),
|
| 326 |
+
# "url": url,
|
| 327 |
+
# "content": content
|
| 328 |
+
# })
|
| 329 |
+
# seen_urls.add(url)
|
| 330 |
+
|
| 331 |
+
# except Exception as e:
|
| 332 |
+
# print(f"⚠️ Tavily request failed for query '{query}': {e}")
|
| 333 |
+
|
| 334 |
+
# print(f"📰 Total articles collected: {len(all_results)}")
|
| 335 |
+
# return all_results
|
| 336 |
+
|
| 337 |
# === Generate Markdown Report ===
|
| 338 |
def generate_value_investor_report(topic, news_results, max_articles=20, max_chars_per_article=400):
|
| 339 |
news_results = news_results[:max_articles]
|