File size: 1,277 Bytes
d9e62f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import xml.etree.ElementTree as ET
from urllib.parse import urlencode

import requests

from src.logger import logger

class DataIngestion:
    """Retrieve paper titles and abstracts from an Atom query endpoint.

    The default endpoint is the arXiv export API.
    """

    # Namespace prefix ElementTree requires to match Atom elements.
    _ATOM_NS = "{http://www.w3.org/2005/Atom}"

    def __init__(self, api_url="http://export.arxiv.org/api/query"):
        """Store the base query endpoint.

        Args:
            api_url: Base URL of the query endpoint. The query string built
                by ``fetch_papers`` is appended after a ``?``.
        """
        self.api_url = api_url

    def _build_url(self, topic, max_results):
        """Return the full query URL with ``topic`` percent-encoded."""
        # ':' and '+' are left unescaped so the ``all:`` field prefix and
        # '+'-joined terms produce the same URL as before; characters such
        # as '&', '#' and '=' are escaped so a topic cannot alter the query.
        query = urlencode(
            {
                "search_query": f"all:{topic}",
                "start": 0,
                "max_results": max_results,
            },
            safe=":+",
        )
        return f"{self.api_url}?{query}"

    @classmethod
    def _parse_feed(cls, xml_text):
        """Extract stripped titles and abstracts from an Atom feed document.

        Entries lacking a ``title`` or ``summary`` element are skipped so
        the two returned lists always stay index-aligned.

        Args:
            xml_text: The Atom feed as a string (or bytes).

        Returns:
            A ``(titles, abstracts)`` tuple of equally long lists.

        Raises:
            xml.etree.ElementTree.ParseError: If ``xml_text`` is not
                well-formed XML.
        """
        root = ET.fromstring(xml_text)
        titles, abstracts = [], []

        for entry in root.findall(f"{cls._ATOM_NS}entry"):
            # findtext yields None for a missing element and "" for an
            # empty one, so no attribute access on None can occur here.
            title = entry.findtext(f"{cls._ATOM_NS}title")
            abstract = entry.findtext(f"{cls._ATOM_NS}summary")
            if title is None or abstract is None:
                continue
            titles.append(title.strip())
            abstracts.append(abstract.strip())

        return titles, abstracts

    def fetch_papers(self, topic, max_results=5):
        """Fetch paper titles and abstracts matching ``topic``.

        Args:
            topic: Search term, queried across all fields (``all:`` prefix).
            max_results: Maximum number of entries to request.

        Returns:
            A ``(titles, abstracts)`` tuple of index-aligned lists of
            strings. Both lists are empty when the HTTP request fails or
            the response body is not well-formed XML; the error is logged
            rather than raised.
        """
        url = self._build_url(topic, max_results)
        logger.info(f"Fetching papers from: {url}")

        try:
            # Bounded timeout so a stalled server cannot hang the caller.
            response = requests.get(url, timeout=10)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            logger.error(f"Error fetching papers: {e}")
            return [], []

        try:
            titles, abstracts = self._parse_feed(response.text)
        except ET.ParseError as e:
            # A malformed body is reported the same way as a failed request.
            logger.error(f"Error parsing response XML: {e}")
            return [], []

        logger.info(f"Fetched {len(abstracts)} papers.")
        return titles, abstracts