Spaces:
Running
Running
from src.interfaces import Paper | |
class SearchAPI:
    """Brute-force search over an in-memory list of papers.

    Papers live in ``self.papers``. Each paper must support field access both
    by subscript (``paper["title"]``) and by attribute (``paper.year``,
    ``paper.month``), because both forms are used by the methods below.
    """

    # Order in which query fields are applied as successive filters.
    # Query keys that are not listed here are ignored by `exhausted_search`.
    # fmt: off
    SEARCH_PRIORITY = ["doi", "url", "year", "month", "venue", "authors", "title", "abstract"]
    # fmt: on

    def __init__(self) -> None:
        self.papers: list[Paper] = []

    def exhausted_search(self, query: dict[str, tuple[tuple[str]]]) -> list[Paper]:
        """Filter the stored papers by brute-force matching against `query`.

        Fields are applied one after another in `SEARCH_PRIORITY` order, so
        every queried field must match (AND across fields). Within a field,
        a paper matches when at least one inner tuple (OR group) has all of
        its strings matched (AND within a group).

        Args:
            query: Mapping of field name to a tuple of OR groups; see
                `search` for the full format.

        Returns:
            A new list of matching papers in their stored order. Each paper
            appears at most once, even if it satisfies several OR groups.
        """
        # Copy so that callers never receive (and mutate) the store itself.
        papers = list(self.papers)
        for field in self.SEARCH_PRIORITY:
            if field not in query:
                continue
            or_groups = query[field]
            papers = [p for p in papers if self._field_matches(p[field], or_groups)]
        return papers

    def _field_matches(self, value, or_groups) -> bool:
        """Return True if `value` satisfies at least one group in `or_groups`."""
        # Coerce to text so non-string fields do not crash on `.lower()`.
        # NOTE(review): assumes numeric fields such as year/month should be
        # matched by their decimal string form - confirm against `Paper`.
        text = "" if value is None else str(value).lower()
        # Tokenize once per paper/field instead of once per condition.
        tokens = set(self.tokenize(text))

        def condition_holds(cond: str) -> bool:
            needle = cond.lower()
            if " " in cond:
                # Multi-word condition: phrase (substring) match.
                return needle in text
            # Single-word condition: whole-token match.
            return needle in tokens

        # any() stops at the first satisfied group, so a paper is counted once.
        return any(all(condition_holds(c) for c in group) for group in or_groups)

    def search(
        self, query: dict[str, tuple[tuple[str]]], method: str = "exhausted"
    ) -> list[Paper]:
        """Search papers

        Args:
            query: A dict of queries on different field.
                A query in a field is a tuple of strings, where strings are AND
                and tuple means OR. Strings are case-insensitive.
                e.g. {
                    "venue": (("EMNLP", ), ("ACL",)),
                    "title": (("parsing", "tree-crf"), ("event extraction",))
                }
                This query means we want to find papers in EMNLP or ACL,
                AND the title either contains ("parsing" AND "tree-crf") OR "event extraction"
            method: choice from:
                - `exhausted`: brute force matching

        Returns:
            a list of `Paper`, de-duplicated and sorted by (year, month) from
            newest to oldest; papers with equal dates keep their stored order.

        Raises:
            NotImplementedError: if `method` is not a supported choice.
        """
        if method != "exhausted":
            raise NotImplementedError(f"Unsupported search method: {method!r}")

        papers = self.exhausted_search(query)
        if not papers:
            return papers
        # dict.fromkeys de-duplicates like set() but preserves first-seen
        # order; together with the stable sort this makes ties deterministic.
        unique_papers = dict.fromkeys(papers)
        return sorted(unique_papers, key=lambda p: (p.year, p.month), reverse=True)

    def tokenize(self, string: str) -> list[str]:
        """Lower-case `string` and split it on whitespace."""
        return string.lower().split()