import spaces
import gradio as gr
import logging
import time

from llama_cpp import Llama
from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
from llama_cpp_agent.providers import LlamaCppPythonProvider
from huggingface_hub import hf_hub_download

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

repo_id = "QuantFactory/Meta-Llama-3-8B-Instruct-GGUF"
filename = "Meta-Llama-3-8B-Instruct.Q8_0.gguf"

try:
    start_time = time.time()
    logger.info("Downloading model...")
    hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        local_dir="./models",  # must match the model_path prefix used in respond()
    )
    end_time = time.time()
    logger.info(f"Download complete. Time taken: {end_time - start_time:.1f} seconds.")
except Exception as e:
    logger.error(f"Unable to download model: {e}")
    raise

# Cache the loaded model across requests so repeated calls with the same
# model file do not reload the weights.
llm = None
llm_model = None


@spaces.GPU(duration=120)
def respond(
    message,
    history,
    temperature,
    max_tokens,
    # No UI inputs exist for these two, so defaults are supplied here.
    # The system prompt below is an assumed placeholder.
    model=filename,
    system_message="You are a helpful assistant.",
):
    # Gradio's ChatInterface calls fn(message, history, *additional_inputs);
    # `history` is accepted to satisfy that contract but a fresh agent is
    # created per call, so prior turns are not replayed here.
    chat_template = MessagesFormatterType.LLAMA_3

    global llm
    global llm_model

    # Reload only when the requested model file changes.
    if llm is None or llm_model != model:
        llm = Llama(
            model_path=f"models/{model}",
            flash_attn=True,
            n_gpu_layers=-1,
            n_batch=1024,
            n_ctx=8192,
        )
        llm_model = model

    provider = LlamaCppPythonProvider(llm)
    agent = LlamaCppAgent(
        provider,
        system_prompt=system_message,
        predefined_messages_formatter_type=chat_template,
        debug_output=True,
    )

    settings = provider.get_provider_default_settings()
    settings.temperature = temperature
    settings.max_tokens = max_tokens
    settings.stream = True

    stream = agent.get_chat_response(
        message,
        llm_sampling_settings=settings,
        returns_streaming_generator=True,
        print_output=False,
    )

    # Yield the accumulated text so the UI shows a growing response.
    outputs = ""
    for output in stream:
        outputs += output
        yield outputs


DESCRIPTION = '''

# ContenteaseAI custom trained model

'''

LICENSE = """

---
For more information, visit our [website](https://contentease.ai).
"""

PLACEHOLDER = """

ContenteaseAI Custom AI trained model

Enter the text extracted from the PDF:

""" css = """ h1 { text-align: center; display: block; } """ # Gradio block chatbot = gr.Chatbot(height=450, placeholder=PLACEHOLDER, label='Gradio ChatInterface') with gr.Blocks(fill_height=True, css=css) as demo: gr.Markdown(DESCRIPTION) gr.ChatInterface( fn=respond, chatbot=chatbot, fill_height=True, additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False), additional_inputs=[ gr.Slider(minimum=0, maximum=1, step=0.1, value=0.95, label="Temperature", render=False), gr.Slider(minimum=128, maximum=2000, step=1, value=700, label="Max new tokens", render=False), ] ) gr.Markdown(LICENSE) if __name__ == "__main__": try: demo.launch(show_error=True, debug = True) except Exception as e: logger.error(f"Error launching Gradio demo: {e}")