import logging import pathlib import random import re import time import requests from tqdm import trange from src.engine import SearchAPI from src.interfaces import Paper from src.utils import dump_json, load_json logger = logging.getLogger("uvicorn.default") class DblpPaperList(SearchAPI): """DBLP paper list Inputs: cache_filepath: Filepath to save cached file use_cache: will use cached file if `True`, otherwise download again query: Query string, basically the title you wanna search in a search box. Special logical grammars refer to the reference. max_results: Maximal returned papers request_time_inteval: Seconds to sleep when calling DBLP API References: https://dblp.org/faq/How+to+use+the+dblp+search+API.html """ API_URL = "https://dblp.org/search/publ/api" def __init__( self, cache_filepath: pathlib.Path, use_cache: bool = False, query: str = "", max_results: int = 5000, request_time_inteval: float = 3, ) -> None: super().__init__() if isinstance(cache_filepath, str): cache_filepath = pathlib.Path(cache_filepath) if (not cache_filepath.exists()) or (not use_cache): query = query.strip() query = re.sub(r"\s+?\|\s+?", "|", query) query = re.sub(r"\s+", "+", query) searched_results = [] # max capacity is 1000 h = 1000 for f in trange(0, max_results, h, desc="DBLP Downloading"): url = f"{self.API_URL}?q={query}&format=json&c=0&f={f}&h={h}" try: response = requests.get(url) response.raise_for_status() page = response.json() page_data = page["result"]["hits"]["hit"] if page_data: searched_results.extend(page_data) else: break except KeyboardInterrupt: raise KeyboardInterrupt except Exception as err: logger.info(err) break time.sleep((random.random() + 0.5) * request_time_inteval) dump_json(searched_results, cache_filepath) data = load_json(cache_filepath) for d in data: # dblp does not provide abstract and month data authors = [] if "authors" in d["info"]: if isinstance(d["info"]["authors"]["author"], dict): authors.append(d["info"]["authors"]["author"]["text"]) else: authors = [a["text"] for a in d["info"]["authors"]["author"]] venues = [] if "venue" in d["info"]: if isinstance(d["info"]["venue"], str): venues.append(d["info"]["venue"]) else: for venue in d["info"]["venue"]: venues.append(venue) paper = Paper( d["info"]["title"], " , ".join(authors), "", d["info"].get("ee", d["info"].get("url", "")), d["info"].get("doi", ""), " , ".join(venues), d["info"].get("year", "9999"), "99", ) self.papers.append(paper) @classmethod def build_paper_list( cls, cache_filepath: str, query: dict, max_results: int = 1000 ): title = query.get("title", []) abstract = query.get("abstract", []) cls_q = "" for t in title: cls_q += " ".join(t) for a in abstract: cls_q += " ".join(a) return cls( cache_filepath, use_cache=False, query=cls_q, max_results=max_results, ) @classmethod def build_and_search( cls, cache_filepath: str, query: dict, max_results: int = 1000 ) -> list[Paper]: obj = cls.build_paper_list(cache_filepath, query, max_results=max_results) return obj.search(query)[:max_results]