Spaces:

Spico
/

paper-hero

Running

App Files Files Community

paper-hero / src /interfaces /arxiv.py

Spico

- add `build_paper_list` and `build_and_search` methods to help build demo (direct API)

0841c28 about 2 years ago

raw

history blame contribute delete

6.32 kB

	import pathlib
	import re

	import feedparser

	from src.engine import SearchAPI
	from src.interfaces import Paper
	from src.utils import download


	class ArxivPaperList(SearchAPI):
	    """Paper list populated from the arXiv query API.

	    On construction the query feed is fetched into ``cache_filepath``
	    (unless an existing cached copy is reused), parsed with ``feedparser``
	    and every feed entry is appended to ``self.papers`` as a ``Paper``.

	    Args:
	        cache_filepath: Filepath to save the cached feed to.
	        use_cache: Reuse the cached file when it already exists if `True`.
	        raw: Raw api query, e.g. `cat:cs.CL AND ti:event`. If set, the
	            field arguments below are ignored.
	        title: Title string to search for.
	        author: Author string.
	        abstract: Abstract string.
	        comment: Comment string.
	        category: arXiv category, e.g. "cs.CL".
	        max_results: Maximal number of returned papers.
	        sort_by: `submittedDate` (default) or `lastUpdatedDate`.
	        sort_order: `descending` (default) or `ascending`.

	    Doc:
	        prefix explanation
	            - ti    Title
	            - au    Author
	            - abs   Abstract
	            - co    Comment
	            - jr    Journal Reference
	            - cat   Subject Category
	            - rn    Report Number
	            - id    Id (use id_list instead)
	            - all   All of the above

	        logics:
	            - AND
	            - OR
	            - ANDNOT

	        symbol encoding explanation
	            - ( )             %28 %29   Used to group Boolean expressions for Boolean operator precedence.
	            - double quotes   %22 %22   Used to group multiple words into phrases to search a particular field.
	            - space           +         Used to extend a search_query to include multiple fields.

	        e.g. https://export.arxiv.org/api/query?search_query=cat:cs.CL+AND+ti:event&start=0&max_results=2000&sortBy=submittedDate&sortOrder=descending

	    References:
	        https://arxiv.org/help/api/user-manual#title_id_published_updated
	    """

	    API_URL = "https://export.arxiv.org/api/query?search_query="

	    # Number of results requested when the caller does not give a usable limit.
	    DEFAULT_MAX_RESULTS = 5000

	    # Characters that must be encoded before the query is put into the URL
	    # (see "symbol encoding explanation" in the class docstring).
	    _QUERY_ENCODING = str.maketrans(
	        {" ": "+", "(": "%28", ")": "%29", '"': "%22"}
	    )

	    # Any run of whitespace (including newlines) inside titles / abstracts.
	    _WHITESPACE_RE = re.compile(r"\s+")

	    def __init__(
	        self,
	        cache_filepath: str | pathlib.Path,
	        use_cache: bool = False,
	        raw: str = "",
	        title: str = "",
	        author: str = "",
	        abstract: str = "",
	        comment: str = "",
	        category: str = "cs.CL",
	        max_results: int = DEFAULT_MAX_RESULTS,
	        sort_by: str = "submittedDate",
	        sort_order: str = "descending",
	    ) -> None:
	        # NOTE(review): `self.papers` is presumably created by
	        # `SearchAPI.__init__` - confirm against the base class.
	        super().__init__()

	        cache_filepath = pathlib.Path(cache_filepath)

	        # Only hit the API when there is no cache to reuse, or when the caller
	        # explicitly asked for fresh data.
	        if not (use_cache and cache_filepath.exists()):
	            cache_filepath.parent.mkdir(parents=True, exist_ok=True)

	            if raw:
	                query = raw
	            else:
	                query = self._build_field_query(
	                    title, author, abstract, comment, category
	                )
	            query = query.strip().translate(self._QUERY_ENCODING)

	            url = (
	                f"{self.API_URL}{query}&start=0&max_results={max_results}"
	                f"&sortBy={sort_by}&sortOrder={sort_order}"
	            )
	            download(url, cache_filepath)

	        # Context manager so the file handle is closed deterministically.
	        with cache_filepath.open("rt", encoding="utf8") as feed_file:
	            feed = feedparser.parse(feed_file.read())

	        for entry in feed.entries:
	            self.papers.append(self._entry_to_paper(entry, sort_by))

	    @staticmethod
	    def _build_field_query(
	        title: str, author: str, abstract: str, comment: str, category: str
	    ) -> str:
	        """Join the non-empty search fields into one `AND`-ed query string."""
	        fields = (
	            ("ti", title),
	            ("au", author),
	            ("abs", abstract),
	            ("co", comment),
	            ("cat", category),
	        )
	        return " AND ".join(
	            f"{prefix}:{value.strip()}" for prefix, value in fields if value
	        )

	    @classmethod
	    def _entry_to_paper(cls, entry, sort_by: str) -> Paper:
	        """Convert one parsed feed entry into a `Paper`."""
	        authors = ""
	        if hasattr(entry, "authors"):
	            authors = " , ".join(person.name for person in entry.authors)

	        url = ""
	        doi = ""
	        for link in entry.links:
	            if link.rel == "alternate":
	                url = link.href
	            if "doi" in link.href:
	                doi = link.href
	        if not url:
	            # No "alternate" link: fall back to the first link of the entry.
	            url = entry.links[0].href

	        # Report the date that matches the requested sort key.
	        if sort_by == "submittedDate":
	            date = entry.published_parsed
	        else:
	            date = entry.updated_parsed

	        title = cls._WHITESPACE_RE.sub(" ", entry.title).strip()
	        abstract = cls._WHITESPACE_RE.sub(" ", entry.summary).strip()
	        # Guarded like `authors` above so an entry without tags does not raise.
	        categories = " , ".join(tag["term"] for tag in getattr(entry, "tags", []))

	        return Paper(
	            title,
	            authors,
	            abstract,
	            url,
	            doi,
	            categories,
	            str(date.tm_year),
	            str(date.tm_mon),
	        )

	    @staticmethod
	    def build_logic_string(req: list[list[str]]) -> str:
	        """Render a list of AND-groups as `(a AND b) OR (c)`.

	        Args:
	            req: Outer list items are OR-ed together, the strings inside each
	                inner list are AND-ed together.

	        Returns:
	            The logic string, or an empty string when `req` is empty.
	        """
	        # NOTE(review): the result is not wrapped in outer parentheses, so
	        # prefixing it with a field (e.g. `ti:`) may only bind to the first
	        # group - confirm against the arXiv query grammar.
	        return " OR ".join(f"({' AND '.join(and_strs)})" for and_strs in req)

	    @classmethod
	    def build_paper_list(
	        cls,
	        cache_filepath: str | pathlib.Path,
	        query: dict,
	        max_results: int = DEFAULT_MAX_RESULTS,
	    ) -> "ArxivPaperList":
	        """Build a freshly downloaded paper list from a query dict.

	        Args:
	            cache_filepath: Filepath to save the downloaded feed to.
	            query: Dict read with the keys `title`, `author`, `abstract`
	                (each passed to `build_logic_string`) and `venue`.
	            max_results: Maximal number of papers requested from the API.

	        Returns:
	            A new instance built with `use_cache=False`.
	        """
	        ti_string = cls.build_logic_string(query.get("title", []))
	        au_string = cls.build_logic_string(query.get("author", []))
	        abs_string = cls.build_logic_string(query.get("abstract", []))

	        # only subject category is used when caching
	        # NOTE(review): the first `venue` item is forwarded as `category`,
	        # which is `.strip()`-ed, so it is expected to be a string.
	        venue = query.get("venue", [])
	        cat_string = venue[0] if venue else ""

	        return cls(
	            cache_filepath,
	            use_cache=False,
	            title=ti_string,
	            author=au_string,
	            abstract=abs_string,
	            category=cat_string,
	            max_results=max_results,
	        )

	    @classmethod
	    def build_and_search(
	        cls,
	        cache_filepath: str | pathlib.Path,
	        query: dict,
	        max_results: int = -1,
	    ) -> list[Paper]:
	        """Download papers for `query`, then filter them with `search`.

	        Args:
	            cache_filepath: Filepath to save the downloaded feed to.
	            query: Query dict, see `build_paper_list`.
	            max_results: Maximal number of returned papers. A negative value
	                (the default) means "do not truncate"; `DEFAULT_MAX_RESULTS`
	                papers are then requested from the API.

	        Returns:
	            The papers returned by `search(query)`, truncated to
	            `max_results` when it is non-negative.
	        """
	        unlimited = max_results < 0
	        # A negative value must neither be sent to the API nor used as a slice
	        # bound: `[:-1]` would silently drop the last result.
	        fetch_limit = cls.DEFAULT_MAX_RESULTS if unlimited else max_results
	        obj = cls.build_paper_list(cache_filepath, query, max_results=fetch_limit)
	        return obj.search(query)[: None if unlimited else max_results]