Spaces:

Samarth991
/

RAG-PDF_With_LLAMA-3B

Sleeping

Samarth991 committed on Dec 25, 2024

Commit

f18883e

1 Parent(s): 8ac4871

updated

Files changed (1) hide show

PDF_Reader.py CHANGED Viewed

@@ -30,10 +30,10 @@ def read_pdf_text(pdf_path):
     pdf_reader = PdfReader(pdf_path)
     for page in pdf_reader.pages:
         text += page.extract_text()
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
     text_chunks = text_splitter.split_text(text)
-    return text_chunks
 def read_pdf(pdf_path):
     loader = PyPDFLoader(pdf_path)
@@ -51,5 +51,5 @@ def PDF_4_QA(file_path):
     #docs = read_pdf(file_path)
     #cleaned_docs = Chunks(docs)
     cleaned_docs = read_pdf_text(file_path)
-    vectordb = Chroma.from_texts(cleaned_docs,embedding=embeddings,persist_directory="Chroma/docs")
     return vectordb,cleaned_docs

     pdf_reader = PdfReader(pdf_path)
     for page in pdf_reader.pages:
         text += page.extract_text()
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
     text_chunks = text_splitter.split_text(text)
+    text_docs = [Document(page_content=txt) for txt in text_chunks]
+    return text_docs
 def read_pdf(pdf_path):
     loader = PyPDFLoader(pdf_path)
     #docs = read_pdf(file_path)
     #cleaned_docs = Chunks(docs)
     cleaned_docs = read_pdf_text(file_path)
+    vectordb = Chroma.from_documents(cleaned_docs,embedding=embeddings,persist_directory="Chroma/docs")
     return vectordb,cleaned_docs