import json
import logging
import subprocess
import time

import gradio as gr
import spaces
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
from llama_cpp_agent.providers import LlamaCppPythonProvider

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

repo_id = "QuantFactory/Meta-Llama-3-8B-Instruct-GGUF"
filename = "Meta-Llama-3-8B-Instruct.Q8_0.gguf"

# Download the GGUF weights into ./models, the directory the Llama
# constructor below loads from.
try:
    start_time = time.time()
    logger.info("Downloading model...")
    hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        local_dir="./models",
    )
    end_time = time.time()
    logger.info(f"Download complete. Time taken: {end_time - start_time} seconds.")
except Exception as e:
    logger.error(f"Unable to download model: {e}")
    raise

# Cache the loaded model across requests so it is only re-created when the
# selected model file changes.
llm = None
llm_model = None


@spaces.GPU(duration=120)
def respond(
    message,
    model,
    system_message,
    max_tokens,
    temperature,
):
    chat_template = MessagesFormatterType.LLAMA_3

    global llm
    global llm_model

    # (Re)load the model only if none is loaded yet or a different file was selected.
    if llm is None or llm_model != model:
        llm = Llama(
            model_path=f"models/{model}",
            flash_attn=True,
            n_gpu_layers=-1,  # offload all layers to the GPU
            n_batch=1024,
            n_ctx=8192,
        )
        llm_model = model

    provider = LlamaCppPythonProvider(llm)

    agent = LlamaCppAgent(
        provider,
        system_prompt=f"{system_message}",
        predefined_messages_formatter_type=chat_template,
        debug_output=True,
    )

    settings = provider.get_provider_default_settings()
    settings.temperature = temperature
    settings.max_tokens = max_tokens
    settings.stream = True

    stream = agent.get_chat_response(
        message,
        llm_sampling_settings=settings,
        returns_streaming_generator=True,
        print_output=False,
    )

    # Accumulate streamed tokens and yield the growing string so Gradio
    # renders the response incrementally.
    outputs = ""
    for output in stream:
        outputs += output
        yield outputs


DESCRIPTION = '''
Enter the text extracted from the PDF: