Spaces:
Running
Running
import requests | |
import xml.etree.ElementTree as ET | |
from src.logger import logger | |
class DataIngestion:
    """Fetch paper titles and abstracts from an arXiv-style Atom query API."""

    # Clark-notation namespace prefix carried by the Atom elements read below.
    _ATOM_NS = "{http://www.w3.org/2005/Atom}"

    def __init__(self, api_url="http://export.arxiv.org/api/query"):
        """Store the base query endpoint.

        Args:
            api_url: Base URL to which the search query string is appended.
        """
        self.api_url = api_url

    def fetch_papers(self, topic, max_results=5):
        """Fetch papers matching *topic* and return their titles and abstracts.

        Args:
            topic: Raw (not URL-encoded) search text; it is percent-encoded
                here before being placed in the query string.
            max_results: Maximum number of entries to request.

        Returns:
            A ``(titles, abstracts)`` pair of equal-length lists of stripped
            strings. ``([], [])`` is returned when the HTTP request fails or
            the response body is not well-formed XML; both cases are logged.
        """
        # Function-scope import so the block does not depend on the file's
        # top-level import list.
        from urllib.parse import quote_plus

        # Encode the topic so characters such as '&', '#', '=' or spaces
        # cannot truncate or corrupt the query string.
        encoded_topic = quote_plus(str(topic))
        url = (
            f"{self.api_url}?search_query=all:{encoded_topic}"
            f"&start=0&max_results={max_results}"
        )
        logger.info(f"Fetching papers from: {url}")

        try:
            # The timeout keeps a stalled connection from hanging the caller.
            response = requests.get(url, timeout=10)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            logger.error(f"Error fetching papers: {e}")
            return [], []

        try:
            # Parse the raw bytes so the XML parser honors the document's own
            # encoding declaration instead of a guessed text decoding.
            titles, abstracts = self._parse_feed(response.content)
        except ET.ParseError as e:
            # Same failure contract as a network error: log, return empty.
            logger.error(f"Error parsing response XML: {e}")
            return [], []

        logger.info(f"Fetched {len(abstracts)} papers.")
        return titles, abstracts

    @staticmethod
    def _parse_feed(xml_data):
        """Extract stripped titles and abstracts from Atom feed XML.

        Args:
            xml_data: The feed document as ``str`` or ``bytes``.

        Returns:
            A ``(titles, abstracts)`` pair of equal-length lists. Entries that
            lack a ``title`` or ``summary`` element are skipped so the two
            lists stay index-aligned.

        Raises:
            xml.etree.ElementTree.ParseError: If *xml_data* is not
                well-formed XML.
        """
        ns = DataIngestion._ATOM_NS
        root = ET.fromstring(xml_data)

        titles, abstracts = [], []
        for entry in root.findall(f"{ns}entry"):
            # findtext returns None when the element is absent and "" when it
            # is present but empty, so ``.strip()`` is only called on strings.
            title = entry.findtext(f"{ns}title")
            summary = entry.findtext(f"{ns}summary")
            if title is None or summary is None:
                continue
            titles.append(title.strip())
            abstracts.append(summary.strip())
        return titles, abstracts