Spico's picture
- add `build_paper_list` and `build_and_search` methods to help build demo (direct API)
0841c28
import pathlib
import re
import feedparser
from src.engine import SearchAPI
from src.interfaces import Paper
from src.utils import download
class ArxivPaperList(SearchAPI):
    """Paper list populated from the arXiv export API.

    The constructor builds an arXiv ``search_query``, downloads the Atom feed
    to ``cache_filepath`` (unless a cached copy is requested and present),
    parses it with ``feedparser`` and appends one ``Paper`` per feed entry to
    ``self.papers``.

    Args:
        cache_filepath: Filepath to save the cached feed to.
        use_cache: Use the cached file if ``True`` and the file exists.
        raw: Raw API query, e.g. ``cat:cs.CL AND ti:event``. If set, the
            field arguments below are ignored.
        title: Title string to search for.
        author: Author string.
        abstract: Abstract string.
        comment: Comment string.
        category: arXiv category, e.g. ``"cs.CL"``.
        max_results: Maximal number of returned papers.
        sort_by: ``submittedDate`` (default) or ``lastUpdatedDate``.
        sort_order: ``descending`` (default) or ``ascending``.

    Query syntax:
        Field prefixes:
            - ti   Title
            - au   Author
            - abs  Abstract
            - co   Comment
            - jr   Journal Reference
            - cat  Subject Category
            - rn   Report Number
            - id   Id (use id_list instead)
            - all  All of the above
        Boolean operators:
            - AND
            - OR
            - ANDNOT
        Symbol encoding:
            - ``( )`` -> ``%28 %29``: group Boolean expressions for operator
              precedence.
            - double quotes -> ``%22 %22``: group multiple words into a
              phrase to search a particular field.
            - space -> ``+``: extend a search_query to include multiple
              fields.

        e.g. https://export.arxiv.org/api/query?search_query=cat:cs.CL+AND+ti:event&start=0&max_results=2000&sortBy=submittedDate&sortOrder=descending

    References:
        https://arxiv.org/help/api/user-manual#title_id_published_updated
    """

    API_URL = "https://export.arxiv.org/api/query?search_query="
    # Number of results requested when a caller passes a negative limit.
    DEFAULT_MAX_RESULTS = 5000
    # Collapses any run of whitespace (including newlines) into one space.
    _WHITESPACE_RE = re.compile(r"\s+")

    def __init__(
        self,
        cache_filepath: str | pathlib.Path,
        use_cache: bool = False,
        raw: str = "",
        title: str = "",
        author: str = "",
        abstract: str = "",
        comment: str = "",
        category: str = "cs.CL",
        max_results: int = 5000,
        sort_by: str = "submittedDate",
        sort_order: str = "descending",
    ) -> None:
        """Fetch (or load from cache) an arXiv feed and collect its papers.

        See the class docstring for the meaning of each argument.
        """
        super().__init__()

        cache_filepath = pathlib.Path(cache_filepath)

        # Re-download unless the caller asked for the cache AND it exists.
        if not (use_cache and cache_filepath.exists()):
            cache_filepath.parent.mkdir(parents=True, exist_ok=True)
            # A raw query takes precedence over the per-field arguments.
            query = raw or self._build_query(
                title, author, abstract, comment, category
            )
            url = (
                f"{self.API_URL}{self._encode_query(query)}"
                f"&start=0&max_results={max_results}"
                f"&sortBy={sort_by}&sortOrder={sort_order}"
            )
            # Presumably writes the HTTP response body to ``cache_filepath``;
            # the file is read back immediately below.
            download(url, cache_filepath)

        # ``read_text`` closes the file handle, unlike a bare ``open().read()``.
        feed = feedparser.parse(cache_filepath.read_text(encoding="utf8"))

        # The date reported for each paper follows the requested sort key.
        use_published = sort_by == "submittedDate"
        for entry in feed.entries:
            self.papers.append(self._entry_to_paper(entry, use_published))

    @staticmethod
    def _build_query(
        title: str, author: str, abstract: str, comment: str, category: str
    ) -> str:
        """Join every non-empty field into ``prefix:value`` terms with AND."""
        fields = (
            ("ti", title),
            ("au", author),
            ("abs", abstract),
            ("co", comment),
            ("cat", category),
        )
        return " AND ".join(
            f"{prefix}:{value.strip()}" for prefix, value in fields if value
        )

    @staticmethod
    def _encode_query(query: str) -> str:
        """Apply the URL escapes listed in the class docstring to a query."""
        encoded = query.strip().replace(" ", "+")
        return (
            encoded.replace("(", "%28").replace(")", "%29").replace('"', "%22")
        )

    @classmethod
    def _entry_to_paper(cls, entry, use_published: bool) -> Paper:
        """Convert one parsed feed entry into a ``Paper``.

        Args:
            entry: A feed entry as produced by ``feedparser.parse``.
            use_published: Take year/month from ``published_parsed`` when
                ``True``, otherwise from ``updated_parsed``.
        """
        # Entries without authors / tags / links yield empty strings instead
        # of raising.
        authors = " , ".join(
            person.name for person in getattr(entry, "authors", [])
        )
        tags = " , ".join(tag["term"] for tag in getattr(entry, "tags", []))
        links = getattr(entry, "links", [])

        page_url = ""
        doi = ""
        for link in links:
            if link.rel == "alternate":
                page_url = link.href
            if "doi" in link.href:
                doi = link.href
        # Fall back to the first link when no "alternate" link is present.
        if not page_url and links:
            page_url = links[0].href

        date = entry.published_parsed if use_published else entry.updated_parsed

        # Titles and summaries in the feed may be hard-wrapped; flatten them.
        clean_title = cls._WHITESPACE_RE.sub(" ", entry.title).strip()
        clean_abstract = cls._WHITESPACE_RE.sub(" ", entry.summary).strip()

        return Paper(
            clean_title,
            authors,
            clean_abstract,
            page_url,
            doi,
            tags,
            str(date.tm_year),
            str(date.tm_mon),
        )

    @staticmethod
    def build_logic_string(req: list[list[str]]) -> str:
        """Render a list of AND-groups as one OR-joined Boolean expression.

        ``[["a", "b"], ["c"]]`` becomes ``"(a AND b) OR (c)"``. An empty or
        falsy ``req`` yields an empty string.
        """
        if not req:
            return ""
        return " OR ".join(f"({' AND '.join(group)})" for group in req)

    @classmethod
    def build_paper_list(
        cls, cache_filepath: str, query: dict, max_results: int = 5000
    ):
        """Build a paper list from a query dict, always bypassing the cache.

        Args:
            cache_filepath: Where the downloaded feed is stored.
            query: Mapping that may contain ``"title"``, ``"author"`` and
                ``"abstract"`` (each a list of AND-groups, see
                ``build_logic_string``) and ``"venue"``.
            max_results: Maximal number of papers requested from the API.

        Returns:
            A new instance of this class populated from the API response.
        """
        ti_string = cls.build_logic_string(query.get("title", []))
        au_string = cls.build_logic_string(query.get("author", []))
        abs_string = cls.build_logic_string(query.get("abstract", []))

        # only subject category is used when caching
        # NOTE(review): the first venue is forwarded as ``category``, which
        # the constructor calls ``.strip()`` on, so it is expected to be a
        # string here - confirm against the query schema.
        venue = query.get("venue", [])
        cat_string = venue[0] if venue else ""

        return cls(
            cache_filepath,
            use_cache=False,
            title=ti_string,
            author=au_string,
            abstract=abs_string,
            category=cat_string,
            max_results=max_results,
        )

    @classmethod
    def build_and_search(
        cls, cache_filepath: str, query: dict, max_results: int = -1
    ) -> list[Paper]:
        """Download papers for ``query`` and return the matching ones.

        Args:
            cache_filepath: Where the downloaded feed is stored.
            query: Query dict, used both to build the API request and to
                filter the downloaded papers via ``search``.
            max_results: Maximal number of papers to return. A negative value
                (the default) means "no limit": ``DEFAULT_MAX_RESULTS`` papers
                are requested from the API and every match is returned.

        Returns:
            The papers returned by ``search``, truncated to ``max_results``
            when it is non-negative.
        """
        unlimited = max_results < 0
        # A negative value must reach neither the API URL nor the slice
        # below: ``[:-1]`` would silently drop the last match.
        fetch_limit = cls.DEFAULT_MAX_RESULTS if unlimited else max_results
        obj = cls.build_paper_list(cache_filepath, query, max_results=fetch_limit)
        results = obj.search(query)
        return results if unlimited else results[:max_results]