Spaces:
Running
Running
from langchain_community.document_loaders import PyMuPDFLoader | |
from langchain_community.retrievers import ArxivRetriever | |
def scrape_pdf_with_pymupdf(url) -> str:
    """Load a PDF through PyMuPDFLoader and return it as a string.

    Args:
        url (str): Location of the PDF handed to ``PyMuPDFLoader``.

    Returns:
        str: ``str()`` applied to whatever ``PyMuPDFLoader(url).load()``
            returns, i.e. the string form of the loaded result as a whole
            rather than text extracted field by field.
    """
    # Build the loader and load in a single expression; the loaded result
    # is stringified as-is so the output format stays unchanged for callers.
    return str(PyMuPDFLoader(url).load())
def scrape_pdf_with_arxiv(query) -> str:
    """Fetch the content of the first arXiv document matching a query.

    The retriever is created with ``doc_content_chars_max=None``; per the
    original note, ``None`` means no limit on document length (the stated
    default is 70000 characters, about ~15 pages).

    Args:
        query (str): The query to search for.

    Returns:
        str: ``page_content`` of the first document returned by the
            retriever, or an empty string when nothing is returned.
    """
    retriever = ArxivRetriever(load_max_docs=2, doc_content_chars_max=None)
    docs = retriever.get_relevant_documents(query=query)

    # A query with no results would otherwise fail with a bare IndexError on
    # docs[0]; report "no content" explicitly instead.
    if not docs:
        return ""

    # NOTE(review): load_max_docs=2 is requested but only the first document
    # is used -- confirm whether the second one is intentionally discarded.
    return docs[0].page_content