# AI_Research_Buddy/src/data_ingestion.py
# (from GitHub: user "your-github-username", commit d9e62f5,
#  "Auto-update from GitHub Actions")
import requests
import xml.etree.ElementTree as ET
from src.logger import logger
class DataIngestion:
    """Fetch paper titles and abstracts from the arXiv Atom API."""

    # Atom XML namespace used by every element in the arXiv feed.
    _ATOM_NS = "{http://www.w3.org/2005/Atom}"

    def __init__(self, api_url="http://export.arxiv.org/api/query"):
        """
        Args:
            api_url: Base URL of the arXiv query endpoint.
        """
        self.api_url = api_url

    def fetch_papers(self, topic, max_results=5):
        """Fetch papers from arXiv matching *topic*.

        Args:
            topic: Free-text search term; URL-encoding is handled
                automatically.
            max_results: Maximum number of entries to request.

        Returns:
            Tuple ``(titles, abstracts)`` of parallel lists. Both are
            empty on any network or parse failure — this method is
            best-effort and never raises.
        """
        # Let requests build and percent-encode the query string, so a
        # topic containing spaces or special characters (e.g. "graph
        # neural networks") still forms a valid URL. The previous
        # f-string interpolation did not encode the topic.
        params = {
            "search_query": f"all:{topic}",
            "start": 0,
            "max_results": max_results,
        }
        logger.info(f"Fetching papers from {self.api_url} with params={params}")
        try:
            response = requests.get(self.api_url, params=params, timeout=10)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            logger.error(f"Error fetching papers: {e}")
            return [], []

        # The API can return malformed XML (e.g. HTML error pages);
        # treat that as "no results" instead of crashing the caller.
        try:
            root = ET.fromstring(response.text)
        except ET.ParseError as e:
            logger.error(f"Error parsing arXiv response: {e}")
            return [], []

        titles, abstracts = [], []
        for entry in root.findall(f"{self._ATOM_NS}entry"):
            # findtext with a default guards against entries missing a
            # <title> or <summary>: entry.find(...).text would raise
            # AttributeError when the tag is absent or empty.
            titles.append(entry.findtext(f"{self._ATOM_NS}title", default="").strip())
            abstracts.append(entry.findtext(f"{self._ATOM_NS}summary", default="").strip())
        logger.info(f"Fetched {len(abstracts)} papers.")
        return titles, abstracts