WilliamGazeley committed
Commit · 9e2a95f · 1 Parent(s): 1b19641
Migrate to Ollama
Browse files
- src/app.py +27 -21
- src/config.py +8 -3
- src/functioncall.py +14 -42
src/app.py CHANGED
@@ -1,12 +1,11 @@
 import os
+from time import time
 import huggingface_hub
 import streamlit as st
 from config import config
 from utils import get_assistant_message
 from functioncall import ModelInference
-from prompter import PromptManager
 
-print("Why, hello there!", flush=True)
 
 @st.cache_resource(show_spinner="Loading model..")
 def init_llm():
@@ -14,40 +13,44 @@ def init_llm():
     llm = ModelInference(chat_template=config.chat_template)
     return llm
 
+
 def get_response(prompt):
     try:
         return llm.generate_function_call(
-            prompt,
-            config.chat_template,
-            config.num_fewshot,
-            config.max_depth
+            prompt, config.chat_template, config.num_fewshot, config.max_depth
        )
     except Exception as e:
         return f"An error occurred: {str(e)}"
-
+
+
 def get_output(context, user_input):
     try:
         config.status.update(label=":bulb: Preparing answer..")
-
-
-
+        script_dir = os.path.dirname(os.path.abspath(__file__))
+        prompt_path = os.path.join(script_dir, 'prompt_assets', 'output_sys_prompt.yml')
+        prompt_schema = llm.prompter.read_yaml_file(prompt_path)
+        sys_prompt = (
+            llm.prompter.format_yaml_prompt(prompt_schema, dict())
+            + f"Information:\n{context}"
+        )
         convo = [
             {"role": "system", "content": sys_prompt},
             {"role": "user", "content": user_input},
         ]
         response = llm.run_inference(convo)
-        return
+        return response
     except Exception as e:
         return f"An error occurred: {str(e)}"
 
+
 def main():
     st.title("LLM-ADE 9B Demo")
-
+
     input_text = st.text_area("Enter your text here:", value="", height=200)
-
+
     if st.button("Generate"):
         if input_text:
-            with st.status(
+            with st.status("Generating response...") as status:
                 config.status = status
                 agent_resp = get_response(input_text)
                 st.write(get_output(agent_resp, input_text))
@@ -55,17 +58,20 @@ def main():
         else:
             st.warning("Please enter some text to generate a response.")
 
+
 llm = init_llm()
 
-
-
-
-
-
+
+def main_headless(prompt: str):
+    start = time()
+    agent_resp = get_response(prompt)
+    print("\033[94m" + get_output(agent_resp, prompt) + "\033[0m")
+    print(f"Time taken: {time() - start:.2f}s\n" + "-" * 20)
+
 
 if __name__ == "__main__":
-    print(f"Test env vars: {os.getenv('TEST_SECRET')}")
     if config.headless:
-
+        import fire
+        fire.Fire(main_headless)
     else:
         main()
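Note on the new headless path: main_headless is exposed through python-fire, so the arguments it declares become CLI arguments when headless mode is enabled. A minimal, self-contained sketch of that pattern (hypothetical demo function, not part of this repo):

import fire

def demo(prompt: str) -> str:
    # Stand-in for get_response/get_output; fire.Fire maps CLI args onto the
    # function's parameters, e.g. `python demo.py --prompt "hello"` or
    # `python demo.py hello`.
    return f"You asked: {prompt}"

if __name__ == "__main__":
    fire.Fire(demo)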
src/config.py CHANGED
@@ -2,12 +2,18 @@ from pydantic import Field
 from pydantic_settings import BaseSettings
 from typing import Dict, Any
 
+class MockStatus():
+    # Required for headless mode
+    def update(self, *args, **kwargs):
+        print("MockStatus update called with args: ", args, " and kwargs: ", kwargs)
+
 class Config(BaseSettings):
     hf_token: str = Field(...)
-    hf_model: str = Field("InvestmentResearchAI/LLM-ADE-dev")
+    hf_model: str = Field("InvestmentResearchAI/LLM-ADE-dev") # We need this because I can't get the model template out of the ollama model
+    ollama_model: str = Field("llama3")
     headless: bool = Field(False, description="Run in headless mode.")
 
-    status: Any =
+    status: Any = MockStatus()
 
     az_search_endpoint: str = Field("https://analysis-bank.search.windows.net")
     az_search_api_key: str = Field(...)
@@ -17,7 +23,6 @@ class Config(BaseSettings):
 
     chat_template: str = Field("chatml", description="Chat template for prompt formatting")
     num_fewshot: int | None = Field(None, description="Option to use json mode examples")
-    load_in_4bit: str = Field("False", description="Option to load in 4bit with bitsandbytes")
     max_depth: int = Field(3, description="Maximum number of recursive iteration")
 
 config = Config(_env_file=".env")
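For context, the new MockStatus and ollama_model fields support the headless path: config.status defaults to a MockStatus that simply prints status updates until the Streamlit app swaps in a real st.status object. A small usage sketch (assumes required fields such as hf_token and az_search_api_key are supplied via .env or environment variables; pydantic-settings matches env vars to field names case-insensitively, so OLLAMA_MODEL=mistral would override the default):

from config import config

print(config.ollama_model)          # "llama3" unless overridden via env/.env
config.status.update(label="...")   # MockStatus just prints; the Streamlit app
                                    # later replaces config.status with st.status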
src/functioncall.py CHANGED
@@ -13,6 +13,7 @@ from transformers import (
 import functions
 from prompter import PromptManager
 from validator import validate_function_call_schema
+from langchain_community.chat_models import ChatOllama
 
 from utils import (
     inference_logger,
@@ -22,26 +23,12 @@ from utils import (
 )
 
 class ModelInference:
-    def __init__(self, chat_template: str
+    def __init__(self, chat_template: str):
         self.prompter = PromptManager()
-
-
-
-
-            load_in_4bit=True,
-            bnb_4bit_quant_type="nf4",
-            bnb_4bit_use_double_quant=True,
-        )
-        self.model = AutoModelForCausalLM.from_pretrained(
-            config.hf_model,
-            trust_remote_code=True,
-            return_dict=True,
-            quantization_config=self.bnb_config,
-            torch_dtype=torch.float16,
-            attn_implementation="flash_attention_2",
-            device_map="auto",
-        )
-
+
+        self.model = ChatOllama(model=config.ollama_model,
+                                temperature=0.0, format='json')
+
         self.tokenizer = AutoTokenizer.from_pretrained(config.hf_model, trust_remote_code=True)
         self.tokenizer.pad_token = self.tokenizer.eos_token
         self.tokenizer.padding_side = "left"
@@ -49,24 +36,18 @@ class ModelInference:
         if self.tokenizer.chat_template is None:
             print("No chat template defined, getting chat_template...")
             self.tokenizer.chat_template = get_chat_template(chat_template)
-
-        inference_logger.info(self.model.config)
-        inference_logger.info(self.model.generation_config)
-        inference_logger.info(self.tokenizer.special_tokens_map)
 
-    def process_completion_and_validate(self, completion, chat_template):
-
-        assistant_message = get_assistant_message(completion, chat_template, self.tokenizer.eos_token)
 
-
-
+    def process_completion_and_validate(self, completion, chat_template):
+        if completion:
+            validation, tool_calls, error_message = validate_and_extract_tool_calls(completion)
 
         if validation:
             inference_logger.info(f"parsed tool calls:\n{json.dumps(tool_calls, indent=2)}")
-            return tool_calls,
+            return tool_calls, completion, error_message
         else:
             tool_calls = None
-            return tool_calls,
+            return tool_calls, completion, error_message
         else:
             inference_logger.warning("Assistant message is None")
             raise ValueError("Assistant message is None")
@@ -86,19 +67,10 @@ class ModelInference:
         inputs = self.tokenizer.apply_chat_template(
             prompt,
             add_generation_prompt=True,
-
-        )
-
-        tokens = self.model.generate(
-            inputs.to(self.model.device),
-            max_new_tokens=1500,
-            temperature=0.8,
-            repetition_penalty=1.2,
-            do_sample=True,
-            eos_token_id=self.tokenizer.eos_token_id
+            tokenize=False,
         )
-        completion = self.
-        return completion
+        completion = self.model.invoke(inputs, format='json')
+        return completion.content
 
     def generate_function_call(self, query, chat_template, num_fewshot, max_depth=5):
         try:
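The core of the migration is visible in run_inference: the chat template is now rendered to a plain string (tokenize=False) and passed to ChatOllama.invoke, which returns a message whose .content carries the JSON-formatted completion. A standalone sketch of that call pattern (assumes a local Ollama server with the model already pulled, e.g. via `ollama pull llama3`):

from langchain_community.chat_models import ChatOllama

# format='json' asks Ollama to constrain the output to valid JSON.
llm = ChatOllama(model="llama3", temperature=0.0, format="json")
completion = llm.invoke("Return a JSON object with a single key 'ok' set to true.")
print(completion.content)  # e.g. '{"ok": true}'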