"""Literature Based Research (LBR): chat with local PDF documents via Groq.

Streamlit script. On the first run of a browser session it loads every PDF
found under ``PDF_ROOT``, splits the pages into overlapping chunks and obtains
a FAISS vector store: the index saved in ``FAISS_INDEX_DIR`` is loaded when it
exists, otherwise it is built from the chunks and saved there. Every question
typed into the text box is then answered by a retrieval chain that stuffs the
retrieved chunks into a prompt for a Groq-hosted chat model.

Configuration: ``GROQ_API_KEY`` must be set in the environment or in a
``.env`` file next to the script.
"""

import os
import time

# NOTE: importing embeddings from `langchain` is deprecated (unsupported as of
# langchain==0.2.0); everything below comes from `langchain_community`
# (`pip install -U langchain-community`).
import chardet
import streamlit as st
from dotenv import load_dotenv
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.embeddings import FastEmbedEmbeddings
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq

# References used while building this app:
# - PyPDF directory loader:
#   https://python.langchain.com/docs/modules/data_connection/document_loaders/pdf#pypdf-directory
# - Generic directory loader:
#   https://python.langchain.com/docs/modules/data_connection/document_loaders/file_directory
# - FAISS vector store (save_local / load_local):
#   https://python.langchain.com/docs/integrations/vectorstores/faiss
# - See also, for an equivalent app built on ChromaDB:
#   https://github.com/vndee/local-rag-example/blob/main/rag.py

APP_TITLE = (
    "Literature Based Research (LBR) - A. Unzicker and J. Bours - "
    "Chat with Docs - Groq Edition (Very Fast!) - VERSION 3 - March 8 2024"
)
PDF_ROOT = "../"          # directory searched for PDFs
PDF_GLOB = "**/*.pdf"     # recursive match below PDF_ROOT
FAISS_INDEX_DIR = "faiss_index"
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200
GROQ_MODEL = "mixtral-8x7b-32768"


def _load_or_build_vector_store(embeddings, documents):
    """Return the FAISS store for *documents*, reusing a saved index if any.

    Args:
        embeddings: Embedding model used both to read a saved index and to
            embed *documents* when a new index has to be built.
        documents: Chunked documents to index when no saved index exists.

    Returns:
        A FAISS vector store.

    If no saved index exists and *documents* is empty, an error is shown on
    the page and the script run is stopped.
    """
    # `save_local` writes "<index_name>.faiss" / "<index_name>.pkl" with the
    # default index name "index"; the .faiss file marks a usable saved index.
    index_file = os.path.join(FAISS_INDEX_DIR, "index.faiss")
    if os.path.isfile(index_file):
        # SECURITY: load_local unpickles the stored docstore. Only keep
        # `allow_dangerous_deserialization=True` while FAISS_INDEX_DIR is
        # written exclusively by this app / a trusted source.
        # Delete FAISS_INDEX_DIR to force a rebuild after the PDFs change.
        return FAISS.load_local(
            FAISS_INDEX_DIR,
            embeddings,
            allow_dangerous_deserialization=True,
        )

    if not documents:
        st.error(f"No PDF content found under '{PDF_ROOT}' - nothing to index.")
        st.stop()

    store = FAISS.from_documents(documents, embeddings)
    store.save_local(FAISS_INDEX_DIR)  # reuse on the next start
    return store


load_dotenv()

st.title(APP_TITLE)

# Never print or log the key itself: it is a secret.
groq_api_key = os.environ.get("GROQ_API_KEY")
if not groq_api_key:
    st.error("GROQ_API_KEY is not set. Add it to the environment or to a .env file.")
    st.stop()

# Streamlit re-runs the whole script on every interaction; the expensive
# loading/indexing work is cached in session_state and done once per session.
if "vector" not in st.session_state:
    st.write("Chunking, embedding, storing in FAISS vectorstore ...")

    st.session_state.embeddings = FastEmbedEmbeddings()
    st.session_state.docs = PyPDFDirectoryLoader(PDF_ROOT, glob=PDF_GLOB).load()
    st.session_state.text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
    )
    st.session_state.documents = st.session_state.text_splitter.split_documents(
        st.session_state.docs
    )
    # Assigned last so the "vector" guard above only passes after full setup.
    st.session_state.vector = _load_or_build_vector_store(
        st.session_state.embeddings,
        st.session_state.documents,
    )

llm = ChatGroq(
    temperature=0.2,
    groq_api_key=groq_api_key,
    model_name=GROQ_MODEL,
)

prompt = ChatPromptTemplate.from_template("""
Answer the following question based only on the provided context.
Think step by step before providing a detailed answer.
I will tip you $200 if the user finds the answer helpful.

{context}

Question: {input}""")

document_chain = create_stuff_documents_chain(llm, prompt)
retriever = st.session_state.vector.as_retriever()
retrieval_chain = create_retrieval_chain(retriever, document_chain)

user_prompt = st.text_input("Input your prompt here")

# Runs once the user has submitted a non-empty prompt.
if user_prompt:
    # perf_counter measures wall-clock time; process_time would ignore the
    # time spent waiting on the remote LLM call.
    start = time.perf_counter()
    response = retrieval_chain.invoke({"input": user_prompt})
    print(f"Response time: {time.perf_counter() - start}")

    st.write(response["answer"])

    # Show the retrieved chunks the answer was based on.
    with st.expander("Document Similarity Search"):
        for doc in response["context"]:
            st.write(doc.page_content)
            st.write("--------------------------------")