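# llm-pdf-qa / pdftoqa_generator.py
#
# Header text captured from the repository's web file viewer, kept as
# comments so that the module remains valid Python:
#   Author avatar: Sawon2023
#   Last commit:   "Updated the file reading for the app" (38b91e9)
#   Viewer links:  raw / history / blame
#   File size:     2.71 kB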
import json
import os
import re
import statistics
import gradio as gr
import pandas as pd
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import (
CharacterTextSplitter,
RecursiveCharacterTextSplitter,
)
from tqdm import tqdm
from tempfile import NamedTemporaryFile
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
# Provide a placeholder OpenAI key only when the environment does not already
# define one, so a real key supplied by the deployment is never overwritten
# with the dummy "sk-" value.
os.environ.setdefault("OPENAI_API_KEY", "sk-")
def pdf_parser(file_path):
    """Load a PDF and split its page text into overlapping chunks.

    Args:
        file_path: The PDF to read. May be a filesystem path (``str`` or
            ``os.PathLike``), the raw PDF content as ``bytes``/``bytearray``,
            or a binary file-like object exposing ``read()`` (for example an
            uploaded file handed over by the UI).

    Returns:
        The list of documents produced by
        ``RecursiveCharacterTextSplitter.create_documents`` from the text of
        every page, split with ``chunk_size=600`` and ``chunk_overlap=200``
        (both measured with ``len``, i.e. in characters).

    Raises:
        TypeError: If ``file_path`` is none of the supported input kinds.
    """
    temp_path = None
    if isinstance(file_path, (str, os.PathLike)):
        # Already on disk: hand the path straight to the loader.
        pdf_path = os.fspath(file_path)
    else:
        if isinstance(file_path, (bytes, bytearray)):
            payload = file_path
        elif hasattr(file_path, "read"):
            payload = file_path.read()
        else:
            raise TypeError(
                "pdf_parser expects a path, bytes, or a file-like object, "
                f"got {type(file_path).__name__}"
            )
        # PyPDFLoader is given a path, so in-memory content is spooled to a
        # temporary file. The file is closed (and therefore flushed) before
        # the loader opens it by name.
        with NamedTemporaryFile(delete=False) as tmp:
            tmp.write(payload)
        temp_path = pdf_path = tmp.name

    try:
        documents = PyPDFLoader(pdf_path).load()
    finally:
        # Only delete files this function created, and do so even if the
        # loader raised.
        if temp_path is not None:
            os.remove(temp_path)

    documents_text = [d.page_content for d in documents]
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=600,
        chunk_overlap=200,
        length_function=len,
        is_separator_regex=False,
    )
    # Split the page texts into chunks.
    return text_splitter.create_documents(documents_text)
def qa_generator(texts):
    """Generate a question/answer pair for each text chunk.

    Each chunk's ``page_content`` is fed to the
    ``potsawee/t5-large-generation-squad-QuestionAnswer`` seq2seq model; the
    decoded output is stripped of pad/EOS tokens and split on the tokenizer's
    separator token into a question and an answer.

    Args:
        texts: Iterable of chunk objects exposing a ``page_content``
            attribute (such as the documents returned by ``pdf_parser``).

    Returns:
        A list of dicts with the keys ``"No"`` (1-based running number),
        ``"Question"`` and ``"Answer"``. Pairs are keyed by question text, so
        a question generated more than once keeps only its latest answer.
        Chunks that fail during generation or parsing are reported on stdout
        and skipped.
    """
    model_name = "potsawee/t5-large-generation-squad-QuestionAnswer"
    question_tokenizer = AutoTokenizer.from_pretrained(model_name)
    question_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

    question_answer_dic = {}
    for chunk in tqdm(texts):
        context = chunk.page_content
        try:
            inputs = question_tokenizer(context, return_tensors="pt")
            outputs = question_model.generate(**inputs, max_length=100)
            # Special tokens are kept on purpose: the separator token is what
            # delimits the question from the answer.
            question_answer = question_tokenizer.decode(
                outputs[0], skip_special_tokens=False
            )
            question_answer = question_answer.replace(
                question_tokenizer.pad_token, ""
            ).replace(question_tokenizer.eos_token, "")
            # Unpacking fails (ValueError) unless there is exactly one
            # separator in the decoded output; such chunks are skipped below.
            question, answer = question_answer.split(question_tokenizer.sep_token)
        except Exception as exc:
            # Best effort: one bad chunk must not abort the whole run, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            print(f"Skipping chunk ({exc!r}): {chunk}")
            continue
        question_answer_dic[question] = answer

    return [
        {"No": number, "Question": question, "Answer": answer}
        for number, (question, answer) in enumerate(
            question_answer_dic.items(), start=1
        )
    ]