Spaces:

whoami02
/

bot_manuals

Runtime error

App Files Community

whoami02 committed on Feb 14, 2024

Commit

35fa3f4

verified ·

1 Parent(s): c59f483

Rename bot.py to app.py

Browse files

Files changed (1) hide show

bot.py → app.py +5 -68

bot.py → app.py RENAMED Viewed

@@ -1,13 +1,8 @@
-import torch
 import os
 import gradio as gr
-from auto_gptq import AutoGPTQForCausalLM
-# from ctransformers import AutoModelForCausalLM, AutoConfig, Config
-from transformers import AutoTokenizer, pipeline, GenerationConfig
 from langchain_community.embeddings import HuggingFaceBgeEmbeddings
 from langchain_community.vectorstores import Chroma
 from langchain.retrievers import MultiQueryRetriever
-# from langchain.retrievers.document_compressors import LLMChainExtractor
 from langchain.chains import ConversationalRetrievalChain
 from langchain.memory import ConversationBufferWindowMemory
 from langchain_community.llms import llamacpp, huggingface_pipeline
@@ -16,9 +11,7 @@ from langchain.chains import LLMChain
 from langchain.chains.question_answering import load_qa_chain
 from huggingface_hub import hf_hub_download
 from dotenv import load_dotenv
-# import os
-# os.getenv('hf_token')
-# MODEL_ID, MODEL_BASENAME = "TheBloke/zephyr-7B-beta-GGUF","zephyr-7b-beta.Q5_K_S.gguf"
 _template = """Given the following conversation and a follow up question, rephrase the follow up question to be a
 standalone question without changing the content in given question.
 Chat History:
@@ -31,42 +24,9 @@ Do not use any other information for answering the user. Provide a detailed answ
 load_dotenv()
-def load_quantized_model_gptq(model_id, model_basename):
-    # if ".safetensors" in model_basename:
-    #     model_basename = model_basename.replace(".safetensors", "")
-    tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True, cache_dir = r"E:\AW\LLMs\models")
-    model = AutoGPTQForCausalLM.from_quantized(
-        model_id,
-        # model_basename=model_basename,
-        use_safetensors=True,
-        trust_remote_code=True,
-        device_map="auto",
-        use_triton=False,
-        cache_dir = r"E:\AW\LLMs\models"
-    )
-    generation_config = GenerationConfig.from_pretrained(model_id)
-    pipe = pipeline(
-        "text-generation",
-        model=model, #type: ignore
-        tokenizer=tokenizer,
-        max_length=20000,
-        temperature=0.7,
-        # top_p=0.95,
-        repetition_penalty=1.15,
-        generation_config=generation_config,
-    )
-    local_llm = huggingface_pipeline.HuggingFacePipeline(pipeline=pipe)
-    return local_llm
 def load_quantized_model(model_id=None):
     MODEL_ID, MODEL_BASENAME = "TheBloke/zephyr-7B-beta-GGUF","zephyr-7b-beta.Q5_K_S.gguf"
-    # if model_id == "Zephyr-7b-Beta":
-    #     MODEL_ID, MODEL_BASENAME = "TheBloke/zephyr-7B-beta-GGUF","zephyr-7b-beta.Q5_K_S.gguf"
-    # elif model_id == "Llama-2-7b-chat":
-    #     MODEL_ID, MODEL_BASENAME = "TheBloke/Llama-2-7b-Chat-GGUF","llama-2-7b-chat.Q4_K_M.gguf"
     try:
-        # logging.info("Using LlamaCPP for GGUF quantized model")
         model_path = hf_hub_download(
             repo_id=MODEL_ID,
             filename=MODEL_BASENAME,
@@ -80,7 +40,6 @@ def load_quantized_model(model_id=None):
             'n_batch': 512,
             # 'n_gpu_layers':6,
         }
-        # offloading 5 layers to gpu gave ans in 6-7 mins; 3270 mb of VRAM
         return llamacpp.LlamaCpp(**kwargs)
     except TypeError:
         print("Supported model architecture: Llama, Mistral")
@@ -97,33 +56,21 @@ with gr.Blocks() as demo:
     """)
     with gr.Row():
-        with gr.Column(scale=2): #type:ignore
-            # with gr.Column(scale=5):
-                # with gr.Row():
-                #     file_output = gr.File(label="Uploaded Documents",show_label=True)
-                # with gr.Row():
-                #     upload_button = gr.UploadButton("Click to upload files", file_types=[".pdf", ".csv", ".xlsx", ".txt"], file_count="multiple")
-                #     upload_button.upload(upload_files, upload_button, file_output)
             with gr.Row():
                 model_id = gr.Radio(["Zephyr-7b-Beta", "Llama-2-7b-chat"], value="Llama-2-7b-chat",label="LLM Model")
-                # Temp = gr.Slider(minimum=0, maximum=5, step=0.1, info="Adjust the [random parameter] of LLM from here")
             with gr.Row():
                 mode = gr.Radio(['Document', 'Data'], value='Document',label="QA mode")
-                # print(f"selected {model} model with {Temp} temperature")
         persist_directory = "db"
         embeddings = HuggingFaceBgeEmbeddings(
             model_name = "BAAI/bge-small-en-v1.5",
             model_kwargs={"device": "cpu"},
             encode_kwargs = {'normalize_embeddings':True},
-            cache_folder=r"E:\AW\LLMs\models",
         )
         db2 = Chroma(persist_directory = persist_directory,embedding_function = embeddings)
         # llm = load_quantized_model(model_id=model_id) #type:ignore
-        MODEL_ID = "TheBloke/Llama-2-7B-Chat-GPTQ"
-        # MODEL_I = "HuggingFaceH4/zephyr-7b-beta"
-        MODEL_BASENAME = "gptq-4bit-32g-actorder_True"
         # ---------------------------------------------------------------------------------------------------
-        # llm = load_quantized_model_gptq(model_id=MODEL_ID, model_basename=MODEL_BASENAME)
         llm = load_quantized_model()
         # ---------------------------------------------------------------------------------------------------
         condense_question_prompt_template = PromptTemplate.from_template(_template)
@@ -133,20 +80,11 @@ with gr.Blocks() as demo:
             Helpful Answer:"""
         qa_prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
         memory = ConversationBufferWindowMemory(memory_key='chat_history', k=1, return_messages=True)
-        # memory = ConversationKGMemory(llm=llm, memory_key='chat_history', return_messages=True)
-        # compressor = LLMChainExtractor.from_llm(llm=llm)
-        # compression_retriever = ContextualCompressionRetriever(
-        #     base_compressor=compressor,
-        #     base_retriever=db2.as_retriever(search_kwargs={'k':5})
-        # )
         retriever_from_llm = MultiQueryRetriever.from_llm(
                 retriever=db2.as_retriever(search_kwargs={'k':5}),
                 llm = llm,
-                # llm = load_quantized_model(model_id="TheBloke/Llama-2-7B-Chat-GPTQ")
         )
         qa2 = ConversationalRetrievalChain(
-            # retriever=db.as_retriever(),
             retriever=retriever_from_llm,
             question_generator= LLMChain(llm=llm, prompt=condense_question_prompt_template, memory=memory, verbose=True), #type:ignore
             combine_docs_chain=load_qa_chain(llm=llm, chain_type="stuff", prompt=qa_prompt, verbose=True), #type:ignore
@@ -168,7 +106,7 @@ with gr.Blocks() as demo:
             history[-1][1] = res['answer']
             torch.cuda.empty_cache()
             return history
-        with gr.Column(scale=8): # type: ignore
             with gr.Row():
                 chatbot = gr.Chatbot([], elem_id="chatbot",label="Chat", height=500, show_label=True, avatar_images=["user.jpeg","Bot.jpg"])
             with gr.Row():
@@ -198,5 +136,4 @@ with gr.Blocks() as demo:
 if __name__ == "__main__":
     demo.queue()
-    # demo.launch(share=True)
-    demo.launch(max_threads=40)

 import os
 import gradio as gr
 from langchain_community.embeddings import HuggingFaceBgeEmbeddings
 from langchain_community.vectorstores import Chroma
 from langchain.retrievers import MultiQueryRetriever
 from langchain.chains import ConversationalRetrievalChain
 from langchain.memory import ConversationBufferWindowMemory
 from langchain_community.llms import llamacpp, huggingface_pipeline
 from langchain.chains.question_answering import load_qa_chain
 from huggingface_hub import hf_hub_download
 from dotenv import load_dotenv
 _template = """Given the following conversation and a follow up question, rephrase the follow up question to be a
 standalone question without changing the content in given question.
 Chat History:
 load_dotenv()
 def load_quantized_model(model_id=None):
     MODEL_ID, MODEL_BASENAME = "TheBloke/zephyr-7B-beta-GGUF","zephyr-7b-beta.Q5_K_S.gguf"
     try:
         model_path = hf_hub_download(
             repo_id=MODEL_ID,
             filename=MODEL_BASENAME,
             'n_batch': 512,
             # 'n_gpu_layers':6,
         }
         return llamacpp.LlamaCpp(**kwargs)
     except TypeError:
         print("Supported model architecture: Llama, Mistral")
     """)
     with gr.Row():
+        with gr.Column(scale=1):
             with gr.Row():
                 model_id = gr.Radio(["Zephyr-7b-Beta", "Llama-2-7b-chat"], value="Llama-2-7b-chat",label="LLM Model")
             with gr.Row():
                 mode = gr.Radio(['Document', 'Data'], value='Document',label="QA mode")
         persist_directory = "db"
         embeddings = HuggingFaceBgeEmbeddings(
             model_name = "BAAI/bge-small-en-v1.5",
             model_kwargs={"device": "cpu"},
             encode_kwargs = {'normalize_embeddings':True},
+            cache_folder="models",
         )
         db2 = Chroma(persist_directory = persist_directory,embedding_function = embeddings)
         # llm = load_quantized_model(model_id=model_id) #type:ignore
         # ---------------------------------------------------------------------------------------------------
         llm = load_quantized_model()
         # ---------------------------------------------------------------------------------------------------
         condense_question_prompt_template = PromptTemplate.from_template(_template)
             Helpful Answer:"""
         qa_prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
         memory = ConversationBufferWindowMemory(memory_key='chat_history', k=1, return_messages=True)
         retriever_from_llm = MultiQueryRetriever.from_llm(
                 retriever=db2.as_retriever(search_kwargs={'k':5}),
                 llm = llm,
         )
         qa2 = ConversationalRetrievalChain(
             retriever=retriever_from_llm,
             question_generator= LLMChain(llm=llm, prompt=condense_question_prompt_template, memory=memory, verbose=True), #type:ignore
             combine_docs_chain=load_qa_chain(llm=llm, chain_type="stuff", prompt=qa_prompt, verbose=True), #type:ignore
             history[-1][1] = res['answer']
             torch.cuda.empty_cache()
             return history
+        with gr.Column(scale=9): # type: ignore
             with gr.Row():
                 chatbot = gr.Chatbot([], elem_id="chatbot",label="Chat", height=500, show_label=True, avatar_images=["user.jpeg","Bot.jpg"])
             with gr.Row():
 if __name__ == "__main__":
     demo.queue()
+    demo.launch(max_threads=40, debug=True)