# Source extracted from GitHub; original commit: "refactor exhausted search API" (7b40c73, Spico)
import logging
import pathlib
import random
import re
import time
import requests
from tqdm import trange
from src.engine import SearchAPI
from src.interfaces import Paper
from src.utils import dump_json, load_json
# Attach to uvicorn's logger so messages appear in the server log.
# NOTE(review): uvicorn's standard logger names are "uvicorn", "uvicorn.error",
# and "uvicorn.access" — confirm a handler is configured for "uvicorn.default".
logger = logging.getLogger("uvicorn.default")
class DblpPaperList(SearchAPI):
    """DBLP paper list.

    Pages through the DBLP publication search API, caches the raw JSON
    hits on disk, and converts every hit into a ``Paper``.

    Inputs:
        cache_filepath: Filepath to save cached file
        use_cache: will use cached file if `True`, otherwise download again
        query: Query string, basically the title
            you wanna search in a search box.
            Special logical grammars refer to the reference.
        max_results: Maximal returned papers
        request_time_inteval: Seconds to sleep when calling DBLP API
            (parameter name, typo included, kept for backward compatibility)

    References:
        https://dblp.org/faq/How+to+use+the+dblp+search+API.html
    """

    API_URL = "https://dblp.org/search/publ/api"
    # DBLP caps the number of hits returned per request at 1000.
    MAX_HITS_PER_REQUEST = 1000
    # Seconds before a stalled HTTP request is aborted.
    REQUEST_TIMEOUT = 30.0

    def __init__(
        self,
        cache_filepath: pathlib.Path,
        use_cache: bool = False,
        query: str = "",
        max_results: int = 5000,
        request_time_inteval: float = 3,
    ) -> None:
        super().__init__()
        if isinstance(cache_filepath, str):
            cache_filepath = pathlib.Path(cache_filepath)
        # Download (and overwrite the cache) unless a cached file may be reused.
        if (not cache_filepath.exists()) or (not use_cache):
            hits = self._download(
                self._normalize_query(query), max_results, request_time_inteval
            )
            dump_json(hits, cache_filepath)
        for d in load_json(cache_filepath):
            self.papers.append(self._hit_to_paper(d))

    @staticmethod
    def _normalize_query(query: str) -> str:
        """Turn a human-typed query into DBLP URL form.

        ``a | b`` becomes ``a|b`` and any remaining whitespace becomes ``+``.
        """
        query = query.strip()
        query = re.sub(r"\s+?\|\s+?", "|", query)
        return re.sub(r"\s+", "+", query)

    def _download(
        self, query: str, max_results: int, request_time_inteval: float
    ) -> list:
        """Page through the DBLP API and return the raw list of hit dicts.

        Best-effort: on any request/parse error the partial results
        gathered so far are returned.
        """
        searched_results: list = []
        h = self.MAX_HITS_PER_REQUEST
        for f in trange(0, max_results, h, desc="DBLP Downloading"):
            url = f"{self.API_URL}?q={query}&format=json&c=0&f={f}&h={h}"
            try:
                # FIX: a timeout keeps a stalled connection from hanging
                # the crawl forever.
                response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
                response.raise_for_status()
                page = response.json()
                # FIX: "hit" is absent when a page has no results; .get()
                # ends the loop cleanly instead of raising KeyError into
                # the broad except below.
                page_data = page["result"]["hits"].get("hit")
                if page_data:
                    searched_results.extend(page_data)
                else:
                    break
            except KeyboardInterrupt:
                # FIX: bare `raise` preserves the original traceback.
                raise
            except Exception as err:
                # Best-effort crawl: log and keep whatever was fetched.
                logger.info(err)
                break
            # Jittered politeness delay (0.5x–1.5x the interval).
            time.sleep((random.random() + 0.5) * request_time_inteval)
        return searched_results

    @staticmethod
    def _hit_to_paper(d: dict) -> Paper:
        """Convert one raw DBLP hit into a ``Paper``.

        DBLP provides neither abstract nor month data, so the abstract is
        empty and the month is the sentinel "99".
        """
        info = d["info"]
        # "author" is a single dict for one-author papers, a list otherwise.
        authors = []
        if "authors" in info:
            if isinstance(info["authors"]["author"], dict):
                authors.append(info["authors"]["author"]["text"])
            else:
                authors = [a["text"] for a in info["authors"]["author"]]
        # "venue" may be a single string or a list of strings.
        venues = []
        if "venue" in info:
            if isinstance(info["venue"], str):
                venues.append(info["venue"])
            else:
                venues.extend(info["venue"])
        return Paper(
            info["title"],
            " , ".join(authors),
            "",
            info.get("ee", info.get("url", "")),
            info.get("doi", ""),
            " , ".join(venues),
            info.get("year", "9999"),
            "99",
        )

    @classmethod
    def build_paper_list(
        cls, cache_filepath: str, query: dict, max_results: int = 1000
    ):
        """Build a list from a structured query dict (no cache reuse).

        NOTE(review): each entry of ``query["title"]`` / ``query["abstract"]``
        is space-joined and successive entries are concatenated with no
        separator — presumably each entry is a keyword list; confirm
        against callers.
        """
        title = query.get("title", [])
        abstract = query.get("abstract", [])
        cls_q = ""
        for t in title:
            cls_q += " ".join(t)
        for a in abstract:
            cls_q += " ".join(a)
        return cls(
            cache_filepath,
            use_cache=False,
            query=cls_q,
            max_results=max_results,
        )

    @classmethod
    def build_and_search(
        cls, cache_filepath: str, query: dict, max_results: int = 1000
    ) -> list[Paper]:
        """Download fresh results, run ``search``, and truncate to the cap."""
        obj = cls.build_paper_list(cache_filepath, query, max_results=max_results)
        return obj.search(query)[:max_results]