# Source: pdfchat / ingest_data.py (fakezeta, commit 6feb027 "first release", 1.68 kB)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import Chroma
from langchain.embeddings import TensorflowHubEmbeddings
import os
import time
import streamlit as st
def embed_doc(filename):
    """Load a PDF, split it into chunks, and index the chunks in Chroma.

    The duration of each stage is written to the Streamlit page via
    ``st.text``.

    Args:
        filename: Path of the PDF file, passed straight to ``PyPDFLoader``.

    Returns:
        The ``Chroma`` vector store built from the document chunks, or
        ``None`` when the current working directory contains no entries.
    """
    # NOTE(review): this guard inspects the working directory, not
    # ``filename`` -- presumably a stand-in for "a file has been uploaded".
    # Kept unchanged so callers relying on the ``None`` result still work.
    if not os.listdir("."):
        return None

    loader = PyPDFLoader(filename)

    # perf_counter() is monotonic, so the reported durations cannot be
    # skewed by system clock adjustments the way time.time() deltas can.
    start = time.perf_counter()
    raw_documents = loader.load()
    # Split text
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=0,
        length_function=len,
    )
    documents = text_splitter.split_documents(raw_documents)
    elapsed = time.perf_counter() - start
    st.text(f"Load and split text: {round(elapsed, 1)}")

    # Load Data to vectorstore
    # This stage only times the construction of the embeddings object; the
    # documents themselves are handed to it in the Chroma step below.
    # Backends previously tried here: LlamaCppEmbeddings (ggml-model.bin),
    # HuggingFaceEmbeddings (all-mpnet-base-v2, all-MiniLM-L6-v2) and the
    # universal-sentence-encoder/4 TF Hub model.
    start = time.perf_counter()
    embeddings = TensorflowHubEmbeddings(
        model_url="https://tfhub.dev/google/universal-sentence-encoder-multilingual-qa/3"
    )
    elapsed = time.perf_counter() - start
    st.text(f"Embedding time: {round(elapsed, 1)}")

    start = time.perf_counter()
    vectorstore = Chroma.from_documents(documents, embeddings)
    elapsed = time.perf_counter() - start
    st.text(f"Vectorizing time: {round(elapsed, 1)}")

    return vectorstore