import pathlib
import re

import feedparser

from src.engine import SearchAPI
from src.interfaces import Paper
from src.utils import download


class ArxivPaperList(SearchAPI):
    """Paper list backed by the arXiv query API.

    The constructor builds a search query, downloads the resulting Atom feed
    to ``cache_filepath`` (unless a cached copy is reused), parses the feed
    and appends one ``Paper`` per feed entry to ``self.papers``.

    Inputs:
        cache_filepath: Filepath to save cached file
        use_cache: reuse the cached file if `True` and the file exists;
            otherwise the feed is downloaded again
        raw: Raw api query, e.g. `cat:cs.CL AND ti:event`. If set, the
            field arguments below (title ... category) are ignored
        title: Title string to search for
        author: Author string
        abstract: Abstract string
        comment: Comment string
        category: arXiv category, e.g. "cs.CL"
        max_results: Maximal returned papers
        sort_by: `submittedDate` (default) or `lastUpdatedDate`
        sort_order: `descending` (default) or `ascending`

    Doc:
        prefix  explanation
        - ti    Title
        - au    Author
        - abs   Abstract
        - co    Comment
        - jr    Journal Reference
        - cat   Subject Category
        - rn    Report Number
        - id    Id (use id_list instead)
        - all   All of the above

        logics:
        - AND
        - OR
        - ANDNOT

        symbol  encoding  explanation
        - ( )   %28 %29   Used to group Boolean expressions for Boolean operator precedence.
        - double quotes   %22 %22   Used to group multiple words into phrases to search a particular field.
        - space +         Used to extend a search_query to include multiple fields.

        e.g. https://export.arxiv.org/api/query?search_query=cat:cs.CL+AND+ti:event&start=0&max_results=2000&sortBy=submittedDate&sortOrder=descending

    References:
        https://arxiv.org/help/api/user-manual#title_id_published_updated
    """

    API_URL = "https://export.arxiv.org/api/query?search_query="
    # Number of results requested when the caller does not give a usable limit.
    DEFAULT_MAX_RESULTS = 5000
    # Single-pass version of the symbol encodings listed in the class docstring.
    _QUERY_ENCODING = str.maketrans(
        {" ": "+", "(": "%28", ")": "%29", '"': "%22"}
    )

    def __init__(
        self,
        cache_filepath: str | pathlib.Path,
        use_cache: bool = False,
        raw: str = "",
        title: str = "",
        author: str = "",
        abstract: str = "",
        comment: str = "",
        category: str = "cs.CL",
        max_results: int = DEFAULT_MAX_RESULTS,
        sort_by: str = "submittedDate",
        sort_order: str = "descending",
    ) -> None:
        """Fetch (or reuse) the arXiv feed and fill ``self.papers``.

        See the class docstring for the meaning of each argument.
        """
        super().__init__()

        cache_filepath = pathlib.Path(cache_filepath)
        # Download unless the caller asked for the cache AND it is present.
        if not (use_cache and cache_filepath.exists()):
            cache_filepath.parent.mkdir(parents=True, exist_ok=True)

            if raw:
                query = raw
            else:
                # Non-empty fields are AND-ed together in this fixed order.
                fields = (
                    ("ti", title),
                    ("au", author),
                    ("abs", abstract),
                    ("co", comment),
                    ("cat", category),
                )
                query = " AND ".join(
                    f"{prefix}:{value.strip()}" for prefix, value in fields if value
                )

            query = query.strip().translate(self._QUERY_ENCODING)
            url = f"{self.API_URL}{query}&start=0&max_results={max_results}&sortBy={sort_by}&sortOrder={sort_order}"
            download(url, cache_filepath)

        # read_text closes the file handle; a bare open().read() would leak it.
        feed_string = cache_filepath.read_text(encoding="utf8")
        feed = feedparser.parse(feed_string)
        for entry in feed.entries:
            # Distinct local names so the constructor arguments are not shadowed.
            author_names = ""
            if hasattr(entry, "authors"):
                author_names = " , ".join(person.name for person in entry.authors)

            paper_url = ""
            doi = ""
            links = getattr(entry, "links", None) or []
            for link in links:
                if link.rel == "alternate":
                    paper_url = link.href
                if "doi" in link.href:
                    doi = link.href
            if not paper_url and links:
                # No "alternate" link: fall back to the first link of the entry.
                paper_url = links[0].href

            # Report the date that matches the requested sort key.
            if sort_by == "submittedDate":
                date = entry.published_parsed
            else:
                date = entry.updated_parsed

            # Collapse runs of whitespace (including line breaks) to one space.
            paper_title = re.sub(r"\s+", " ", entry.title).strip()
            paper_abstract = re.sub(r"\s+", " ", entry.summary).strip()
            # An entry without tags yields an empty category string.
            tags = getattr(entry, "tags", None) or []
            paper = Paper(
                paper_title,
                author_names,
                paper_abstract,
                paper_url,
                doi,
                " , ".join([t["term"] for t in tags]),
                str(date.tm_year),
                str(date.tm_mon),
            )
            self.papers.append(paper)

    @staticmethod
    def build_logic_string(req: list[list[str]]) -> str:
        """Turn a list of AND-groups into one OR-joined boolean expression.

        Each inner list is joined with `` AND `` and wrapped in parentheses;
        the groups are then joined with `` OR ``.
        E.g. ``[["a", "b"], ["c"]]`` becomes ``"(a AND b) OR (c)"``.

        Args:
            req: Groups of terms; terms inside one group are AND-ed.

        Returns:
            The combined expression, or ``""`` when ``req`` is empty.
        """
        if not req:
            return ""

        groups = [f"({' AND '.join(and_strs)})" for and_strs in req]
        return " OR ".join(groups)

    @classmethod
    def build_paper_list(
        cls,
        cache_filepath: str | pathlib.Path,
        query: dict,
        max_results: int = DEFAULT_MAX_RESULTS,
    ) -> "ArxivPaperList":
        """Create an instance from a query dict, always re-downloading the feed.

        Args:
            cache_filepath: Where the downloaded feed is written.
            query: Mapping read for the keys ``"title"``, ``"author"`` and
                ``"abstract"`` (each passed to ``build_logic_string``) and
                ``"venue"`` (only its first element is used, as the arXiv
                category). Missing keys are treated as empty.
            max_results: Maximal number of papers requested from the API.

        Returns:
            The populated paper list.
        """
        ti_string = cls.build_logic_string(query.get("title", []))
        au_string = cls.build_logic_string(query.get("author", []))
        abs_string = cls.build_logic_string(query.get("abstract", []))
        venue = query.get("venue", [])
        # only subject category is used when caching
        cat_string = venue[0] if venue else ""
        return cls(
            cache_filepath,
            use_cache=False,
            title=ti_string,
            author=au_string,
            abstract=abs_string,
            category=cat_string,
            max_results=max_results,
        )

    @classmethod
    def build_and_search(
        cls, cache_filepath: str | pathlib.Path, query: dict, max_results: int = -1
    ) -> list[Paper]:
        """Download papers for ``query`` and return the search results.

        Args:
            cache_filepath: Where the downloaded feed is written.
            query: Query dict, used both to build the API request (see
                ``build_paper_list``) and as the argument to ``search``.
            max_results: Maximal number of papers to return. A negative value
                (the default) means "no explicit limit": ``DEFAULT_MAX_RESULTS``
                papers are requested from the API and the search result is
                returned untruncated.

        Returns:
            At most ``max_results`` papers, or every match when it is negative.
        """
        # A negative limit must reach neither the request URL nor the slice:
        # ``[:-1]`` would silently drop the last paper.
        unlimited = max_results < 0
        fetch_size = cls.DEFAULT_MAX_RESULTS if unlimited else max_results
        obj = cls.build_paper_list(cache_filepath, query, max_results=fetch_size)
        limit = None if unlimited else max_results
        return obj.search(query)[:limit]