paper-hero / src /engine.py
Spico's picture
remove results files, make Paper hashable, add dump jsonl methods, make uniq search
796eb82
raw
history blame
2.66 kB
from src.interfaces import Paper
class SearchAPI:
    """Brute-force search over the ``Paper`` objects held in ``self.papers``.

    ``self.papers`` starts out empty; nothing in this class populates it, so
    filling it is left to code outside this class.
    """

    # Fields are applied as successive filters in this order; query keys that
    # are not listed here are ignored by `exhausted_search`.
    # fmt: off
    SEARCH_PRIORITY = ["doi", "url", "year", "month", "venue", "authors", "title", "abstract"]
    # fmt: on

    def __init__(self) -> None:
        self.papers: list[Paper] = []

    def exhausted_search(
        self, query: dict[str, tuple[tuple[str, ...], ...]]
    ) -> list[Paper]:
        """Filter ``self.papers`` by scanning every paper against ``query``.

        Fields are processed in ``SEARCH_PRIORITY`` order and each one narrows
        the candidate list produced by the previous field (fields are ANDed).

        Args:
            query: Mapping of field name to a tuple of OR groups, where each
                group is a tuple of strings that must ALL match. Matching is
                case-insensitive. A condition containing a space is matched
                as a substring of the field; any other condition must equal
                one whitespace-separated token of the field.

        Returns:
            The matching papers in their original relative order, each paper
            at most once per occurrence in ``self.papers``. If no searchable
            field is present in ``query``, ``self.papers`` itself is returned.
        """
        papers = self.papers
        for field in self.SEARCH_PRIORITY:
            if field not in query:
                continue
            or_groups = query[field]
            papers = [p for p in papers if self._matches_any_group(p[field], or_groups)]
        return papers

    def _matches_any_group(
        self, value: object, or_groups: tuple[tuple[str, ...], ...]
    ) -> bool:
        """Return True if ``value`` satisfies at least one AND group."""
        # Field values are stringified so non-string fields can be searched
        # too (presumably `year`/`month` are numeric -- confirm against
        # `Paper`); a missing value is treated as empty text.
        text = "" if value is None else str(value).lower()
        tokens: set[str] | None = None  # built lazily, at most once per value

        for and_conditions in or_groups:
            matched = True
            for condition in and_conditions:
                needle = condition.lower()
                if " " in needle:
                    # Multi-word condition: plain substring match.
                    found = needle in text
                else:
                    # Single-word condition: must equal a whole token.
                    if tokens is None:
                        tokens = set(self.tokenize(text))
                    found = needle in tokens
                if not found:
                    matched = False
                    break
            if matched:
                # Stop at the first satisfied group so a paper that satisfies
                # several alternatives is still reported only once.
                return True
        return False

    def search(
        self,
        query: dict[str, tuple[tuple[str, ...], ...]],
        method: str = "exhausted",
    ) -> list[Paper]:
        """Search papers

        Args:
            query: A dict of queries on different field.
                A query in a field is a tuple of strings, where strings are AND
                and tuple means OR. Strings are case-insensitive.
                e.g. {
                    "venue": (("EMNLP", ), ("ACL",)),
                    "title": (("parsing", "tree-crf"), ("event extraction",))
                }
                This query means we want to find papers in EMNLP or ACL,
                AND the title either contains ("parsing" AND "tree-crf") OR "event extraction"
            method: choice from:
                - `exhausted`: brute force matching

        Returns:
            a list of `Paper` without duplicates, sorted by (year, month) from
            newest to oldest; papers with equal (year, month) keep the order
            in which they were found.

        Raises:
            NotImplementedError: if `method` is not a supported choice.
        """
        if method != "exhausted":
            raise NotImplementedError(f"Unsupported search method: {method!r}")

        papers = self.exhausted_search(query)
        if papers:
            # dict.fromkeys de-duplicates while preserving order, so together
            # with the stable sort the result is deterministic.
            unique_papers = list(dict.fromkeys(papers))
            papers = sorted(
                unique_papers, key=lambda p: (p.year, p.month), reverse=True
            )
        return papers

    def tokenize(self, string: str) -> list[str]:
        """Lower-case ``string`` and split it on whitespace."""
        return string.lower().split()