# GGUF_Model / app.py
import logging
import time

import gradio as gr
import spaces
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
from llama_cpp_agent.providers import LlamaCppPythonProvider
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# GGUF model weights to fetch from the Hugging Face Hub.
repo_id = "QuantFactory/Meta-Llama-3-8B-Instruct-GGUF"
filename = "Meta-Llama-3-8B-Instruct.Q8_0.gguf"
try:
    start_time = time.time()
    logger.info("Downloading model....")
    # Download into ./models so the path matches Llama(model_path="models/...") below.
    hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        local_dir="./models",
    )
    end_time = time.time()
    logger.info(f"Download complete. Time taken: {end_time - start_time:.1f} seconds.")
except Exception as e:
    logger.error(f"Unable to download model: {e}")
    raise
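
# hf_hub_download returns the local file path (<local_dir>/<filename>), so the
# weights land at ./models/Meta-Llama-3-8B-Instruct.Q8_0.gguf; an already-present
# file is typically reused rather than re-downloaded on later restarts.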
llm = None
llm_model = None
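
# respond() below loads the model lazily into these globals on the first
# request, so the expensive Llama() construction happens once per process.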
@spaces.GPU(duration=120)
def respond(
    message,
    history,
    temperature,
    max_tokens,
    system_message="You are a helpful assistant.",  # assumed fallback: no UI input supplies a system prompt
):
    """Stream a chat completion for the Gradio ChatInterface.

    ChatInterface calls fn(message, history, *additional_inputs); the two
    sliders below pass temperature and max_tokens, in that order.
    """
    chat_template = MessagesFormatterType.LLAMA_3

    global llm
    global llm_model

    # Load the GGUF weights on the first request (or if the target file changes).
    if llm is None or llm_model != filename:
        llm = Llama(
            model_path=f"models/{filename}",
            flash_attn=True,
            n_gpu_layers=-1,  # offload all layers to the GPU
            n_batch=1024,
            n_ctx=8192,
        )
        llm_model = filename

    provider = LlamaCppPythonProvider(llm)
    agent = LlamaCppAgent(
        provider,
        system_prompt=system_message,
        predefined_messages_formatter_type=chat_template,
        debug_output=True,
    )

    settings = provider.get_provider_default_settings()
    settings.temperature = temperature
    settings.max_tokens = max_tokens
    settings.stream = True

    stream = agent.get_chat_response(
        message,
        llm_sampling_settings=settings,
        returns_streaming_generator=True,
        print_output=False,
    )

    # Accumulate tokens and yield the growing string so Gradio renders a live stream.
    outputs = ""
    for output in stream:
        outputs += output
        yield outputs
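
# Hypothetical local smoke test (not part of the app; bypasses the Gradio UI):
#   for partial in respond("Hello", history=[], temperature=0.7, max_tokens=64):
#       print(partial)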
DESCRIPTION = '''
<div>
<h1 style="text-align: center;">ContenteaseAI custom-trained model</h1>
</div>
'''
LICENSE = """
<p/>
---
For more information, visit our [website](https://contentease.ai).
"""
PLACEHOLDER = """
<div style="padding: 30px; text-align: center; display: flex; flex-direction: column; align-items: center;">
<h1 style="font-size: 28px; margin-bottom: 2px; opacity: 0.55;">ContenteaseAI custom-trained model</h1>
<p style="font-size: 18px; margin-bottom: 2px; opacity: 0.65;">Enter the text extracted from the PDF:</p>
</div>
"""
css = """
h1 {
text-align: center;
display: block;
}
"""
# Gradio UI: a ChatInterface wired to respond(); the sliders sit in a
# collapsible "Parameters" accordion and are passed as additional inputs.
chatbot = gr.Chatbot(height=450, placeholder=PLACEHOLDER, label='Gradio ChatInterface')

with gr.Blocks(fill_height=True, css=css) as demo:
    gr.Markdown(DESCRIPTION)
    gr.ChatInterface(
        fn=respond,
        chatbot=chatbot,
        fill_height=True,
        additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
        additional_inputs=[
            gr.Slider(minimum=0, maximum=1, step=0.1, value=0.95, label="Temperature", render=False),
            gr.Slider(minimum=128, maximum=2000, step=1, value=700, label="Max new tokens", render=False),
        ],
    )
    gr.Markdown(LICENSE)
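
# Note: respond() is a generator, so streaming depends on Gradio's request queue.
# Recent Gradio versions enable queuing by default; older ones may need demo.queue().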
if __name__ == "__main__":
    try:
        demo.launch(show_error=True, debug=True)
    except Exception as e:
        logger.error(f"Error launching Gradio demo: {e}")