import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
# Load model from Hugging Face Hub
MODEL_ID = "large-traversaal/Alif-1.0-8B-Instruct"
MODEL_FILE = "model-Q8_0.gguf"
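# hf_hub_download fetches the GGUF file into the local Hugging Face cache
# (skipped if already present) and returns the local file path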
model_path_file = hf_hub_download(MODEL_ID, filename=MODEL_FILE)
# Initialize Llama model
llama = Llama(
    model_path=model_path_file,
    n_gpu_layers=40,  # Adjust based on VRAM
    n_threads=8,      # Match CPU cores
    n_batch=512,      # Optimize for better VRAM usage
    n_ctx=4096,       # Context window size
    verbose=True      # Enable debug logging
)
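# Note: n_gpu_layers only takes effect with a GPU-enabled llama-cpp-python build;
# set it to 0 to run entirely on the CPU.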
# Generate a streaming response for the Gradio ChatInterface
def generate_response(message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p):
    # chat_prompt = f"You are an Urdu Chatbot. Write an appropriate response for the given instruction: {message} Response:"
    chat_prompt = f"{system_prompt}\n ### Instruction: {message}\n ### Response:"
    response = llama(
        chat_prompt,
        temperature=temperature,
        max_tokens=max_new_tokens,
        top_k=top_k,
        repeat_penalty=repetition_penalty,
        top_p=top_p,
        stop=["Q:", "\n"],  # Stop at a new "Q:" turn or the first newline
        echo=False,
        stream=True,
    )
    # Accumulate streamed chunks and yield the growing text so the UI updates incrementally
    text = ""
    for chunk in response:
        content = chunk["choices"][0]["text"]
        if content:
            text += content
            yield text
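# A minimal sketch of consuming the generator outside Gradio (the argument values
# below are illustrative assumptions, not tuned defaults):
#
#   for partial in generate_response("پاکستان کے بارے میں بتائیں", [],
#                                    "You are an Urdu Chatbot.", 0.8, 256, 40, 1.1, 0.95):
#       print(partial)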
# Alternative implementation using the chat-completion API (kept for reference):
# def generate_response(message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p):
#     """Generates a streaming response from the Llama model."""
#     messages = [
#         {"role": "system", "content": "You are an Urdu Chatbot. Write an appropriate response for the given instruction."},
#     ]
#     # Add history and the current message
#     # for user, bot in history:
#     #     messages.append({"role": "user", "content": user})
#     #     messages.append({"role": "assistant", "content": bot})
#     messages.append({"role": "user", "content": message})
#     response = llama.create_chat_completion(
#         messages=messages,
#         stream=True,
#     )
#     partial_message = ""
#     for part in response:
#         content = part["choices"][0]["delta"].get("content", "")
#         partial_message += content
#         yield partial_message
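# The commented-out variant above would rely on create_chat_completion, which formats
# the messages with the model's chat template rather than the hand-built
# instruction/response prompt used by the active generate_response.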
# JavaScript function for `on_load`
on_load = """
async()=>{ alert("Welcome to the Traversaal Alif 1.0 Chatbot! This is an experimental AI model. Please use responsibly."); }
"""
# Unused placeholder, left over from a "10 Questions" demo; referenced only by the
# commented-out `chatbot=` argument below
placeholder = """
<center><h1>10 Questions</h1><br>Think of a person, place, or thing. I'll ask you 10 yes/no questions to try and guess it.
</center>
"""
# Create custom chat UI using `gr.Blocks`
with gr.Blocks(js=on_load, theme=gr.themes.Default()) as demo:
    with gr.Column(scale=1, elem_id="center-content"):
        gr.Markdown(
            """
            <div style="text-align: center;">
                <h1>Alif 1.0 Urdu & English Chatbot 🚀</h1>
                <p>Alif 1.0 8B Instruct is an open-source model with highly advanced multilingual reasoning capabilities. It utilizes human-refined multilingual synthetic data paired with reasoning to enhance cultural nuance and reasoning capabilities in English and Urdu.</p>
            </div>
            """,
        )
    chat = gr.ChatInterface(
        generate_response,
        # chatbot=gr.Chatbot(placeholder=placeholder),
        # title="🚀 Alif-1.0 Chatbot",
        # description="Urdu AI Chatbot powered by Llama.cpp",
        examples=[
            ["شہر کراچی کے بارے میں بتاؤ"],   # "Tell me about the city of Karachi"
            ["قابل تجدید توانائی کیا ہے؟"],    # "What is renewable energy?"
            ["پاکستان کے بارے میں بتائیں"]     # "Tell me about Pakistan"
        ],
        additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
        additional_inputs=[
            gr.Textbox(
                value="You are an Urdu Chatbot. Write an appropriate response for the given instruction in Urdu. Your response should be extremely comprehensive.",
                label="System prompt",
                render=False,
            ),
            gr.Slider(0, 1, 0.8, label="Temperature", render=False),
            gr.Slider(128, 4096, 2048, label="Max new tokens", render=False),
            gr.Slider(1, 80, 40, step=1, label="Top K sampling", render=False),
            gr.Slider(0, 2, 1.1, label="Repetition penalty", render=False),
            gr.Slider(0, 1, 0.95, label="Top P sampling", render=False),
        ],
    )
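# queue() bounds the number of pending requests; share=True exposes a temporary public URL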
demo.queue(max_size=10).launch(share=True)