"""Gradio front-end for querying documents (pdf/docx/youtube/web/sitemap)
through an embedchain app backed by OpenAI embeddings + chat models."""

import gradio as gr
from embedchain import App, OpenSourceApp, CustomApp
from embedchain.config import CustomAppConfig
from embedchain.models import Providers, EmbeddingFunctions
import chromadb
import os
import shutil
import time

# HUGGINGFACEHUB_API_TOKEN = os.environ["HUGGINGFACEHUB_API_TOKEN"]

# The embedchain app is created lazily, once the user supplies an OpenAI key.
app = None


class ContextCreator:
    """Feeds documents and urls of several formats into an embedchain app."""

    def __init__(self, app):
        self.app = app

    def create_context(self, pdf_urls="", docx_urls="", youtube_urls="",
                       web_urls="", sitemap_url="", upload_files=None):
        """Add every non-empty source to the app's vector store.

        Url fields are comma-separated lists; each entry is added with the
        matching embedchain data_type. `upload_files` is a list of Gradio
        file objects (only .pdf / .docx are handled).
        """
        if pdf_urls != "":
            for url in pdf_urls.split(","):
                self.app.add(url, data_type='pdf_file')
        if docx_urls != "":
            for url in docx_urls.split(","):
                self.app.add(url, data_type='docx_file')
        if youtube_urls != "":
            for url in youtube_urls.split(","):
                self.app.add(url, data_type='youtube_video')
        if web_urls != "":
            for url in web_urls.split(","):
                self.app.add(url, data_type='web_page')
        if sitemap_url != "":
            # BUG FIX: the original passed the stale loop variable `x`
            # (left over from a previous loop) instead of the sitemap url,
            # so the sitemap was never actually indexed.
            self.app.add(sitemap_url, data_type='sitemap')
        if upload_files is not None:
            for file in upload_files:
                if file.name.endswith('.pdf'):
                    self.app.add(file.name, data_type='pdf_file')
                if file.name.endswith('.docx'):
                    self.app.add(file.name, data_type='docx_file')


def environ_api_key(api_key):
    """Store the OpenAI API key and (re)build the global embedchain app."""
    global app
    os.environ["OPENAI_API_KEY"] = api_key
    config = CustomAppConfig(
        embedding_fn=EmbeddingFunctions.OPENAI,
        provider=Providers.OPENAI,
        embedding_fn_model="text-embedding-ada-002",
    )
    app = CustomApp(config)
    return "OpenAI API key set !"


def build_context(pdf_urls, docx_urls, youtube_urls, web_urls, sitemap_url,
                  upload_files):
    """Load every supplied source into the current app's context."""
    context_creator = ContextCreator(app)
    context_creator.create_context(pdf_urls, docx_urls, youtube_urls,
                                   web_urls, sitemap_url, upload_files)
    return "loaded"


def llm_respond(query, chat_history):
    """Answer `query` from the loaded context and append to the chat log."""
    result = app.query(query)
    chat_history.append((query, result))
    time.sleep(2)
    # Clear the input box, return the updated history for the Chatbot widget.
    return "", chat_history


def loading():
    """Placeholder status shown while documents are being ingested."""
    return "Loading..."


def clear_chromadb():
    """Delete the on-disk chroma vector store so a fresh context can be built.

    Replaces `subprocess.call('rm -rf ./db', shell=True)`: portable and
    avoids spawning a shell.
    """
    shutil.rmtree('./db', ignore_errors=True)


with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.HTML("""
Query your documents
""")
    gr.HTML("""
Made with the embedchain Framework

The framework is builded to be able to use multiple free or paid LLM (OpenAI GPT, GPT4ALL, llama2...), however open models like GPT4all or llama are very slow on CPU, which is why OpenAI is prefered here (the default embeddings model is text-embedding-ada-002 and the chat model is gpt-3.5-turbo)
""")
    with gr.Row():
        openai_key = gr.Textbox(label="OpenAI API Key")
        out = gr.Textbox(interactive=False)
        openai_key.change(environ_api_key, openai_key, out)
    with gr.Row():
        with gr.Column(scale=1):
            gr.HTML("""
Create your context by combining multiple document formats
""")
            pdf_urls = gr.Textbox(label="Online pdf urls (comma separated if multiple)")
            docx_urls = gr.Textbox(label="Online docx urls (comma separated if multiple)")
            youtube_urls = gr.Textbox(label="Youtube video urls (comma separated if multiple)")
            web_urls = gr.Textbox(label="Webpage urls (comma separated if multiple)")
            sitemap_url = gr.Textbox(label="Sitemap url (generally ending with sitemap.xml)")
            upload_files = gr.Files(label="Load local pdf or docx files",
                                    file_types=['.pdf', '.docx'], type="file")
            load_docs = gr.Button("Load documents and urls", variant="primary")
            loading_status = gr.Textbox(label="Loading status", placeholder="",
                                        interactive=False, scale=0)
        with gr.Column(scale=2):
            gr.HTML("""
Query your context
""")
            msg = gr.Textbox(label="User message")
            chatbot = gr.Chatbot()
            clearchat = gr.ClearButton([msg, chatbot], value="New chat",)
            cleardb = gr.Button(
                value="Reset current documents context (for loading new documents)",
                variant="secondary",
            )
    # Show the "Loading..." placeholder immediately, then ingest the sources.
    load_docs.click(loading, None, loading_status, queue=False)
    load_docs.click(build_context,
                    inputs=[pdf_urls, docx_urls, youtube_urls, web_urls,
                            sitemap_url, upload_files],
                    outputs=[loading_status], queue=False)
    msg.submit(llm_respond, [msg, chatbot], [msg, chatbot])
    cleardb.click(clear_chromadb)

demo.queue(concurrency_count=3)
demo.launch()