import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
# Load model from Hugging Face Hub
MODEL_ID = "large-traversaal/Alif-1.0-8B-Instruct"
MODEL_FILE = "model-Q8_0.gguf"
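# hf_hub_download fetches the GGUF file into the local Hugging Face cache
# (skipped if already present) and returns the local file path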
model_path_file = hf_hub_download(MODEL_ID, filename=MODEL_FILE)
# Initialize Llama model
llama = Llama(
    model_path=model_path_file,
    n_gpu_layers=40,  # Adjust based on VRAM
    n_threads=8,      # Match CPU cores
    n_batch=512,      # Optimize for better VRAM usage
    n_ctx=4096,       # Context window size
    verbose=True      # Enable debug logging
)
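# Note: n_gpu_layers only takes effect with a GPU-enabled llama-cpp-python build;
# set it to 0 to run entirely on the CPU.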
# Generate a streaming response for the Gradio ChatInterface
def generate_response(message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p):
    # chat_prompt = f"You are an Urdu Chatbot. Write an appropriate response for the given instruction: {message} Response:"
    chat_prompt = f"{system_prompt}\n ### Instruction: {message}\n ### Response:"
    response = llama(
        chat_prompt,
        temperature=temperature,
        max_tokens=max_new_tokens,
        top_k=top_k,
        repeat_penalty=repetition_penalty,
        top_p=top_p,
        stop=["Q:", "\n"],  # Stop at a new "Q:" turn or the first newline
        echo=False,
        stream=True,
    )
    # Accumulate streamed chunks and yield the growing text so the UI updates incrementally
    text = ""
    for chunk in response:
        content = chunk["choices"][0]["text"]
        if content:
            text += content
            yield text
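# A minimal sketch of consuming the generator outside Gradio (the argument values
# below are illustrative assumptions, not tuned defaults):
#
#   for partial in generate_response("پاکستان کے بارے میں بتائیں", [],
#                                    "You are an Urdu Chatbot.", 0.8, 256, 40, 1.1, 0.95):
#       print(partial)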
# Alternative implementation using the chat-completion API (kept for reference):
# def generate_response(message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p):
#     """Generates a streaming response from the Llama model."""
#     messages = [
#         {"role": "system", "content": "You are an Urdu Chatbot. Write an appropriate response for the given instruction."},
#     ]
#     # Add history and the current message
#     # for user, bot in history:
#     #     messages.append({"role": "user", "content": user})
#     #     messages.append({"role": "assistant", "content": bot})
#     messages.append({"role": "user", "content": message})
#     response = llama.create_chat_completion(
#         messages=messages,
#         stream=True,
#     )
#     partial_message = ""
#     for part in response:
#         content = part["choices"][0]["delta"].get("content", "")
#         partial_message += content
#         yield partial_message
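# The commented-out variant above would rely on create_chat_completion, which formats
# the messages with the model's chat template rather than the hand-built
# instruction/response prompt used by the active generate_response.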
# JavaScript function for `on_load`
on_load = """
async()=>{ alert("Welcome to the Traversaal Alif 1.0 Chatbot! This is an experimental AI model. Please use responsibly."); }
"""
# Unused placeholder, left over from a "10 Questions" demo; referenced only by the
# commented-out `chatbot=` argument below
placeholder = """
<center><h1>10 Questions</h1><br>Think of a person, place, or thing. I'll ask you 10 yes/no questions to try and guess it.
</center>
"""
# Create custom chat UI using `gr.Blocks`
with gr.Blocks(js=on_load, theme=gr.themes.Default()) as demo:
    with gr.Column(scale=1, elem_id="center-content"):
        gr.Markdown(
            """
            <div style="text-align: center;">
                <h1>Alif 1.0 Urdu & English Chatbot 🚀</h1>
                <p>Alif 1.0 8B Instruct is an open-source model with highly advanced multilingual reasoning capabilities. It utilizes human-refined multilingual synthetic data paired with reasoning to enhance cultural nuance and reasoning capabilities in English and Urdu.</p>
            </div>
            """,
        )
    chat = gr.ChatInterface(
        generate_response,
        # chatbot=gr.Chatbot(placeholder=placeholder),
        # title="🚀 Alif-1.0 Chatbot",
        # description="Urdu AI Chatbot powered by Llama.cpp",
        examples=[
            ["شہر کراچی کے بارے میں بتاؤ"],   # "Tell me about the city of Karachi"
            ["قابل تجدید توانائی کیا ہے؟"],    # "What is renewable energy?"
            ["پاکستان کے بارے میں بتائیں"]     # "Tell me about Pakistan"
        ],
        additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
        additional_inputs=[
            gr.Textbox(
                value="You are an Urdu Chatbot. Write an appropriate response for the given instruction in Urdu. Your response should be extremely comprehensive.",
                label="System prompt",
                render=False,
            ),
            gr.Slider(0, 1, 0.8, label="Temperature", render=False),
            gr.Slider(128, 4096, 2048, label="Max new tokens", render=False),
            gr.Slider(1, 80, 40, step=1, label="Top K sampling", render=False),
            gr.Slider(0, 2, 1.1, label="Repetition penalty", render=False),
            gr.Slider(0, 1, 0.95, label="Top P sampling", render=False),
        ],
    )
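# queue() bounds the number of pending requests; share=True exposes a temporary public URL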
demo.queue(max_size=10).launch(share=True)