Spaces:
Runtime error
Runtime error
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain.document_loaders import PyPDFLoader | |
| from langchain.vectorstores import Chroma | |
| from langchain.embeddings import TensorflowHubEmbeddings | |
| import os | |
| import time | |
| import streamlit as st | |
def embed_doc(filename):
    """Load a PDF, split it into chunks, and index the chunks in Chroma.

    The elapsed time of each stage (load + split, embedding model setup,
    vector store construction) is written to the Streamlit page with
    ``st.text``.

    Args:
        filename: Location of the PDF; passed unchanged to ``PyPDFLoader``.

    Returns:
        The ``Chroma`` vector store built from the document chunks, or
        ``None`` when the current working directory contains no entries.
    """
    # NOTE(review): this guard inspects the working directory, not
    # `filename` itself -- presumably a stand-in for "a file has been
    # uploaded". Confirm with the caller before tightening it.
    if not os.listdir("."):
        return None

    loader = PyPDFLoader(filename)

    # perf_counter() is monotonic, so the reported durations cannot be
    # skewed by system clock adjustments the way time.time() deltas can.
    started = time.perf_counter()
    raw_documents = loader.load()
    # Split text
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=0,
        length_function=len,
    )
    documents = text_splitter.split_documents(raw_documents)
    elapsed = time.perf_counter() - started
    st.text(f"Load and split text: {round(elapsed, 1)}")

    # Build the embedding model. Alternatives tried earlier in this file's
    # history: LlamaCppEmbeddings, HuggingFaceEmbeddings (all-mpnet-base-v2,
    # all-MiniLM-L6-v2) and the universal-sentence-encoder/4 TF-Hub model.
    started = time.perf_counter()
    embeddings = TensorflowHubEmbeddings(
        model_url="https://tfhub.dev/google/universal-sentence-encoder-multilingual-qa/3"
    )
    elapsed = time.perf_counter() - started
    st.text(f"Embedding time: {round(elapsed, 1)}")

    # Load data into the vector store
    started = time.perf_counter()
    vectorstore = Chroma.from_documents(documents, embeddings)
    elapsed = time.perf_counter() - started
    st.text(f"Vectorizing time: {round(elapsed, 1)}")

    return vectorstore