# Shreyas094's picture
# Upload 528 files
# 372531f verified
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.retrievers import ArxivRetriever
def scrape_pdf_with_pymupdf(url) -> str:
    """Load a PDF through PyMuPDFLoader and return the result as a string.

    Args:
        url (str): Location of the PDF; handed to PyMuPDFLoader unchanged.

    Returns:
        str: ``str()`` applied to whatever ``PyMuPDFLoader.load()`` returns.
            NOTE(review): presumably that is a list of LangChain documents,
            so this is the list's printable form rather than bare page
            text -- confirm against callers before changing it.
    """
    pages = PyMuPDFLoader(url).load()
    return str(pages)
def scrape_pdf_with_arxiv(query) -> str:
    """Fetch the text of the first arXiv document matching a query.

    Args:
        query (str): The query to search for; passed to ArxivRetriever.

    Returns:
        str: ``page_content`` of the first document the retriever returns,
            or an empty string when the retriever returns no documents.
    """
    # doc_content_chars_max=None asks for no limit on the document length,
    # so the returned text is not truncated.
    retriever = ArxivRetriever(load_max_docs=2, doc_content_chars_max=None)
    # invoke() is the supported retriever entry point;
    # get_relevant_documents() is deprecated in current LangChain releases.
    docs = retriever.invoke(query)
    if not docs:
        # No match: return empty text instead of failing on docs[0].
        return ""
    return docs[0].page_content