File size: 3,402 Bytes
c149479
 
 
 
0841c28
c149479
 
 
 
 
 
0841c28
 
 
 
 
 
 
 
 
 
c149479
 
 
 
0841c28
 
 
 
 
 
 
c149479
 
0841c28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c149479
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
796eb82
c149479
796eb82
c149479
 
 
796eb82
 
 
 
c149479
 
0841c28
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
from src.interfaces import Paper


class SearchAPI:
    """Filter and rank an in-memory list of papers by a structured query.

    Papers are read by subscription (``paper[field]``) while filtering and by
    attribute (``paper.year``, ``paper.month``) while sorting, and must be
    hashable so that duplicates can be dropped.

    ``build_paper_list`` and ``build_and_search`` raise ``NotImplementedError``
    in this class; presumably subclasses override them to fill ``papers``.
    """

    # Fields are applied in this order; each one narrows the candidates left
    # by the previous field.
    SEARCH_PRIORITY = ["year", "month", "venue", "author", "title", "abstract"]

    def __init__(self) -> None:
        # Candidate papers to search over; starts empty.
        self.papers: list[Paper] = []

    @staticmethod
    def _parse_time_spans(
        field: str, req: tuple[tuple[str, ...], ...]
    ) -> list[tuple[int, int]]:
        """Convert ``(start, end)`` digit-string pairs into integer spans."""
        spans = []
        for span in req:
            assert len(span) == 2, f"{field} span must be (start, end): {span!r}"
            assert all(
                num.isdigit() for num in span
            ), f"{field} span must contain digit strings only: {span!r}"
            spans.append((int(span[0]), int(span[1])))
        return spans

    def _text_matches(self, text: str, req: tuple[tuple[str, ...], ...]) -> bool:
        """Return True if ``text`` satisfies any AND-group in ``req``."""
        # Lower-case and tokenize once per paper instead of once per statement.
        haystack = text.lower()
        tokens = self.tokenize(haystack)

        def _holds(statement: str) -> bool:
            needle = statement.lower()
            # Multi-word statements are matched as substrings; everything
            # else must equal a whole token.
            if " " in statement and needle in haystack:
                return True
            return needle in tokens

        return any(all(_holds(stmt) for stmt in group) for group in req)

    def exhausted_search(
        self, query: dict[str, tuple[tuple[str, ...], ...]]
    ) -> list[Paper]:
        """Brute-force filter of ``self.papers`` by every queried field.

        Args:
            query: Maps a field name to a tuple of groups. For ``year`` and
                ``month`` each group is a ``(start, end)`` pair of digit
                strings, and a paper matches when its value lies inside any
                span (bounds inclusive). For other fields each group is a
                tuple of statements that must all match, and a paper matches
                when any group does. Fields not listed in ``SEARCH_PRIORITY``
                are ignored.

        Returns:
            A new list holding the papers that satisfy every queried field,
            in their original relative order.

        Raises:
            AssertionError: If a ``year``/``month`` span is not a pair of
                digit strings.
        """
        # Work on a copy so callers never receive the internal list itself.
        papers = list(self.papers)
        for field in self.SEARCH_PRIORITY:
            if field not in query:
                continue
            req = query[field]
            if field in ("year", "month"):
                spans = self._parse_time_spans(field, req)
                papers = [
                    p
                    for p in papers
                    if any(start <= p[field] <= end for start, end in spans)
                ]
            else:
                papers = [p for p in papers if self._text_matches(p[field], req)]

        return papers

    def search(
        self,
        query: dict[str, tuple[tuple[str, ...], ...]],
        method: str = "exhausted",
    ) -> list[Paper]:
        """Search papers

        Args:
            query: A dict of queries on different field.
                A query in a field is a tuple of strings, where strings are AND
                and tuple means OR. Strings are case-insensitive.
                e.g. {
                    "venue": (("EMNLP", ), ("ACL",)),
                    "title": (("parsing", "tree-crf"), ("event extraction",))
                }
                This query means we want to find papers in EMNLP or ACL,
                AND the title either contains ("parsing" AND "tree-crf") OR "event extraction"
            method: choice from:
                - `exhausted`: brute force matching

        Returns:
            a list of `Paper` without duplicates, newest first by
            ``(year, month)``; papers with equal dates keep their match order.

        Raises:
            NotImplementedError: If ``method`` is not ``"exhausted"``.
        """
        if method != "exhausted":
            raise NotImplementedError(f"Unsupported search method: {method!r}")

        papers = self.exhausted_search(query)
        # dict.fromkeys drops duplicates while keeping first-seen order, so the
        # stable sort below yields a reproducible order for papers that share
        # the same (year, month); a set would make that order hash-dependent.
        unique = list(dict.fromkeys(papers))
        return sorted(unique, key=lambda p: (p.year, p.month), reverse=True)

    def tokenize(self, string: str) -> list[str]:
        """Lower-case ``string`` and split it on whitespace."""
        return string.lower().split()

    @classmethod
    def build_paper_list(cls, *args, **kwargs):
        """Not implemented in this class."""
        raise NotImplementedError

    @classmethod
    def build_and_search(cls, *args, **kwargs) -> list[Paper]:
        """Not implemented in this class."""
        raise NotImplementedError