import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Download the quantized GGUF model from the Hugging Face Hub
MODEL_ID = "large-traversaal/Alif-1.0-8B-Instruct"
MODEL_FILE = "model-Q8_0.gguf"

model_path_file = hf_hub_download(MODEL_ID, filename=MODEL_FILE)

# Initialize the Llama model
llama = Llama(
    model_path=model_path_file,
    n_gpu_layers=40,  # Number of layers to offload to the GPU; adjust based on VRAM
    n_threads=8,      # Match the number of CPU cores
    n_batch=512,      # Prompt batch size; balances speed against VRAM usage
    n_ctx=4096,       # Context window size
    verbose=True,     # Enable debug logging
)


# Stream a completion for the given instruction and sampling parameters
def generate_response(message, history, system_prompt, temperature,
                      max_new_tokens, top_k, repetition_penalty, top_p):
    chat_prompt = f"{system_prompt}\n### Instruction: {message}\n### Response:"
    response = llama(
        chat_prompt,
        temperature=temperature,
        max_tokens=max_new_tokens,
        top_k=top_k,
        repeat_penalty=repetition_penalty,
        top_p=top_p,
        stop=["Q:", "\n"],  # Caution: "\n" ends generation at the first newline
        echo=False,
        stream=True,
    )
    text = ""
    for chunk in response:
        content = chunk["choices"][0]["text"]
        if content:
            text += content
            yield text


# Alternative implementation using the chat-completion API, kept for reference:
# def generate_response(message, history, system_prompt, temperature,
#                       max_new_tokens, top_k, repetition_penalty, top_p):
#     """Generates a streaming response from the Llama model."""
#     messages = [
#         {"role": "system", "content": "You are an Urdu Chatbot. Write an appropriate response for the given instruction."},
#     ]
#     # Add history and the current message
#     # for user, bot in history:
#     #     messages.append({"role": "user", "content": user})
#     #     messages.append({"role": "assistant", "content": bot})
#     messages.append({"role": "user", "content": message})
#     response = llama.create_chat_completion(messages=messages, stream=True)
#     partial_message = ""
#     for part in response:
#         content = part["choices"][0]["delta"].get("content", "")
#         partial_message += content
#         yield partial_message


# JavaScript run by `gr.Blocks` when the page loads
on_load = """
async()=>{
    alert("Welcome to the Traversaal Alif 1.0 Chatbot! This is an experimental AI model. Please use responsibly.");
}
"""
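
# Note: the active generate_response ignores `history`, so each chat turn is
# stateless. Below is a sketch (an assumption, not wired into the UI) of how
# prior turns could be folded into the prompt, reusing the same
# Instruction/Response template and assuming the tuple-style (user, bot)
# history pairs that gr.ChatInterface passes by default:
# def build_prompt(message, history, system_prompt):
#     turns = "".join(
#         f"### Instruction: {user}\n### Response: {bot}\n" for user, bot in history
#     )
#     return f"{system_prompt}\n{turns}### Instruction: {message}\n### Response:"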

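# Optional smoke test (a sketch): drives the streaming generator directly,
# without the UI, using the same defaults as the sliders below. Uncomment to
# run it once at startup; the final yielded string is the complete response.
# final = ""
# for partial in generate_response(
#     message="پاکستان کے بارے میں بتائیں",  # "Tell me about Pakistan"
#     history=[],
#     system_prompt="You are an Urdu Chatbot. Write an appropriate response for the given instruction in Urdu.",
#     temperature=0.8,
#     max_new_tokens=2048,
#     top_k=40,
#     repetition_penalty=1.1,
#     top_p=0.95,
# ):
#     final = partial
# print(final)
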
placeholder = """
10 Questions

Think of a person, place, or thing. I'll ask you 10 yes/no questions to try and guess it.
"""

# Create the custom chat UI using `gr.Blocks`
with gr.Blocks(js=on_load, theme=gr.themes.Default()) as demo:
    with gr.Column(scale=1, elem_id="center-content"):
        gr.Markdown(
            """

            # Alif 1.0 Urdu & English Chatbot 🚀

            Alif 1.0 8B Instruct is an open-source model with highly advanced
            multilingual reasoning capabilities. It utilizes human-refined
            multilingual synthetic data paired with reasoning to enhance
            cultural nuance and reasoning capabilities in English and Urdu.
            """,
        )

    chat = gr.ChatInterface(
        generate_response,
        # chatbot=gr.Chatbot(placeholder=placeholder),
        # title="🚀 Alif-1.0 Chatbot",
        # description="Urdu AI Chatbot powered by Llama.cpp",
        examples=[
            ["شہر کراچی کے بارے میں بتاؤ"],  # "Tell me about the city of Karachi"
            ["قابل تجدید توانائی کیا ہے؟"],  # "What is renewable energy?"
            ["پاکستان کے بارے میں بتائیں"],  # "Tell me about Pakistan"
        ],
        additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
        additional_inputs=[
            gr.Textbox(
                value="You are an Urdu Chatbot. Write an appropriate response for the given instruction in Urdu. Your response should be extremely comprehensive",
                label="System prompt",
                render=False,
            ),
            gr.Slider(0, 1, 0.8, label="Temperature", render=False),
            gr.Slider(128, 4096, 2048, label="Max new tokens", render=False),
            gr.Slider(1, 80, 40, step=1, label="Top K sampling", render=False),
            gr.Slider(0, 2, 1.1, label="Repetition penalty", render=False),
            gr.Slider(0, 1, 0.95, label="Top P sampling", render=False),
        ],
    )

demo.queue(max_size=10).launch(share=True)