Spaces:

surfiniaburger
/

aura-mind-glow

Sleeping

App Files Files Community

aura-mind-glow / document_analyzer.py

surfiniaburger

ade

679d006 3 months ago

raw

history blame

3.2 kB

	import os
	from PyPDF2 import PdfReader
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	from langchain_community.vectorstores import FAISS
	from langchain_huggingface import HuggingFaceEmbeddings
	from langchain.chains.question_answering import load_qa_chain
	from langchain_community.llms import HuggingFaceHub
	import config

	def analyze_pdf(file_path):
	    """Build a question-answering chain and a vector index for a PDF.

	    The text of every page is extracted with ``PdfReader``, split into
	    overlapping chunks (1000 characters, 200 overlap), embedded with the
	    model named by ``config.EMBEDDING_MODEL_NAME`` and stored in a FAISS
	    vector store. A "stuff" QA chain backed by the ``google/flan-t5-xxl``
	    Hugging Face Hub model is created alongside it.

	    Args:
	        file_path: The path to the PDF file.

	    Returns:
	        A ``(chain, vector_store)`` tuple: the Langchain QA chain and the
	        FAISS vector store built from the PDF text. ``(None, None)`` is
	        returned when anything goes wrong (the error is printed), including
	        when no text can be extracted from the PDF.
	    """
	    try:
	        pdf_reader = PdfReader(file_path)
	        # Guard against pages that yield no text (empty or None, e.g. pages
	        # without a text layer). Pages are joined with a newline so the last
	        # word of one page is not fused with the first word of the next.
	        text = "\n".join(
	            page.extract_text() or "" for page in pdf_reader.pages
	        )

	        if not text.strip():
	            # Nothing to index: report that explicitly instead of trying to
	            # build an empty vector store.
	            print("Error analyzing PDF: no extractable text found in the document")
	            return None, None

	        text_splitter = RecursiveCharacterTextSplitter(
	            chunk_size=1000,
	            chunk_overlap=200,
	            length_function=len,
	        )
	        chunks = text_splitter.split_text(text=text)

	        embeddings = HuggingFaceEmbeddings(model_name=config.EMBEDDING_MODEL_NAME)
	        vector_store = FAISS.from_texts(chunks, embedding=embeddings)

	        llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature":0.5, "max_length":512})
	        chain = load_qa_chain(llm=llm, chain_type="stuff")

	        return chain, vector_store

	    except Exception as e:
	        # Best-effort boundary: callers rely on (None, None) rather than an
	        # exception to detect failure.
	        print(f"Error analyzing PDF: {e}")
	        return None, None

	def query_pdf(chain, vector_store, query):
	    """Answer a question about a PDF that has already been indexed.

	    The three chunks most similar to the question are fetched from the
	    vector store and handed to the QA chain together with the question.

	    Args:
	        chain: The Langchain QA chain object.
	        vector_store: The FAISS vector store holding the PDF chunks.
	        query: The question to ask the PDF.

	    Returns:
	        The chain's answer, or a fixed apology message if the lookup or
	        the chain raises (the error is printed).
	    """
	    fallback = "Sorry, I couldn't find an answer to your question in the PDF."
	    try:
	        matches = vector_store.similarity_search(query=query, k=3)
	        return chain.run(input_documents=matches, question=query)
	    except Exception as e:
	        print(f"Error querying PDF: {e}")
	    return fallback

	import pandas as pd
	from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
	from langchain_community.llms import HuggingFaceHub

	def analyze_spreadsheet(file_path):
	    """Load a spreadsheet and wrap it in a question-answering agent.

	    Files whose name ends in an Excel extension (``.xls``, ``.xlsx``,
	    ``.xlsm``) are read with ``pandas.read_excel``; everything else is
	    read with ``pandas.read_csv`` as before. The resulting DataFrame is
	    handed to a pandas-dataframe agent backed by the
	    ``google/flan-t5-xxl`` Hugging Face Hub model.

	    Args:
	        file_path: The path to the spreadsheet file.

	    Returns:
	        A Langchain agent object, or ``None`` if the file cannot be loaded
	        or the agent cannot be created (the error is printed).
	    """
	    try:
	        # str() keeps non-string inputs (e.g. pathlib.Path or file-like
	        # objects) working: anything without a recognizable Excel suffix
	        # falls through to read_csv exactly as it did originally.
	        extension = os.path.splitext(str(file_path))[1].lower()
	        if extension in (".xls", ".xlsx", ".xlsm"):
	            df = pd.read_excel(file_path)
	        else:
	            df = pd.read_csv(file_path)

	        llm = HuggingFaceHub(repo_id="google/flan-t5-xxl", model_kwargs={"temperature":0.5, "max_length":512})
	        # NOTE(review): some langchain_experimental releases appear to require
	        # an explicit allow_dangerous_code=True opt-in here - confirm against
	        # the pinned version before relying on this agent.
	        agent = create_pandas_dataframe_agent(llm, df, verbose=True)
	        return agent
	    except Exception as e:
	        # Best-effort boundary: callers check for None instead of catching.
	        print(f"Error analyzing spreadsheet: {e}")
	        return None

	def query_spreadsheet(agent, query):
	    """Ask the spreadsheet agent a question.

	    Args:
	        agent: The Langchain agent object.
	        query: The question to ask the spreadsheet.

	    Returns:
	        Whatever ``agent.run(query)`` returns, or a fixed apology message
	        if running the agent raises (the error is printed).
	    """
	    fallback = "Sorry, I couldn't find an answer to your question in the spreadsheet."
	    try:
	        return agent.run(query)
	    except Exception as e:
	        print(f"Error querying spreadsheet: {e}")
	    return fallback