Spaces:
Sleeping
Sleeping
Commit
·
f18883e
1
Parent(s):
8ac4871
updated
Browse files
- PDF_Reader.py +3 -3
PDF_Reader.py
CHANGED
@@ -30,10 +30,10 @@ def read_pdf_text(pdf_path):
|
|
30 |
pdf_reader = PdfReader(pdf_path)
|
31 |
for page in pdf_reader.pages:
|
32 |
text += page.extract_text()
|
33 |
-
|
34 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
|
35 |
text_chunks = text_splitter.split_text(text)
|
36 |
-
|
|
|
37 |
|
38 |
def read_pdf(pdf_path):
|
39 |
loader = PyPDFLoader(pdf_path)
|
@@ -51,5 +51,5 @@ def PDF_4_QA(file_path):
|
|
51 |
#docs = read_pdf(file_path)
|
52 |
#cleaned_docs = Chunks(docs)
|
53 |
cleaned_docs = read_pdf_text(file_path)
|
54 |
-
vectordb = Chroma.
|
55 |
return vectordb,cleaned_docs
|
|
|
30 |
pdf_reader = PdfReader(pdf_path)
|
31 |
for page in pdf_reader.pages:
|
32 |
text += page.extract_text()
|
|
|
33 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=1000)
|
34 |
text_chunks = text_splitter.split_text(text)
|
35 |
+
text_docs = [Document(page_content=txt) for txt in text_chunks]
|
36 |
+
return text_docs
|
37 |
|
38 |
def read_pdf(pdf_path):
|
39 |
loader = PyPDFLoader(pdf_path)
|
|
|
51 |
#docs = read_pdf(file_path)
|
52 |
#cleaned_docs = Chunks(docs)
|
53 |
cleaned_docs = read_pdf_text(file_path)
|
54 |
+
vectordb = Chroma.from_documents(cleaned_docs,embedding=embeddings,persist_directory="Chroma/docs")
|
55 |
return vectordb,cleaned_docs
|