from langchain_community.document_loaders import PyMuPDFLoader from langchain_community.retrievers import ArxivRetriever def scrape_pdf_with_pymupdf(url) -> str: """Scrape a pdf with pymupdf Args: url (str): The url of the pdf to scrape Returns: str: The text scraped from the pdf """ loader = PyMuPDFLoader(url) doc = loader.load() return str(doc) def scrape_pdf_with_arxiv(query) -> str: """Scrape a pdf with arxiv default document length of 70000 about ~15 pages or None for no limit Args: query (str): The query to search for Returns: str: The text scraped from the pdf """ retriever = ArxivRetriever(load_max_docs=2, doc_content_chars_max=None) docs = retriever.get_relevant_documents(query=query) return docs[0].page_content