Samarth991 committed on
Commit
f18883e
·
1 Parent(s): 8ac4871
Files changed (1) hide show
  1. PDF_Reader.py +3 -3
PDF_Reader.py CHANGED
@@ -30,10 +30,10 @@ def read_pdf_text(pdf_path):
30
  pdf_reader = PdfReader(pdf_path)
31
  for page in pdf_reader.pages:
32
  text += page.extract_text()
33
-
34
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
35
  text_chunks = text_splitter.split_text(text)
36
- return text_chunks
 
37
 
38
  def read_pdf(pdf_path):
39
  loader = PyPDFLoader(pdf_path)
@@ -51,5 +51,5 @@ def PDF_4_QA(file_path):
51
  #docs = read_pdf(file_path)
52
  #cleaned_docs = Chunks(docs)
53
  cleaned_docs = read_pdf_text(file_path)
54
- vectordb = Chroma.from_texts(cleaned_docs,embedding=embeddings,persist_directory="Chroma/docs")
55
  return vectordb,cleaned_docs
 
30
  pdf_reader = PdfReader(pdf_path)
31
  for page in pdf_reader.pages:
32
  text += page.extract_text()
 
33
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
34
  text_chunks = text_splitter.split_text(text)
35
+ text_docs = [Document(page_content=txt) for txt in text_chunks]
36
+ return text_docs
37
 
38
  def read_pdf(pdf_path):
39
  loader = PyPDFLoader(pdf_path)
 
51
  #docs = read_pdf(file_path)
52
  #cleaned_docs = Chunks(docs)
53
  cleaned_docs = read_pdf_text(file_path)
54
+ vectordb = Chroma.from_documents(cleaned_docs,embedding=embeddings,persist_directory="Chroma/docs")
55
  return vectordb,cleaned_docs