import json
import os
import subprocess
from datetime import datetime
from threading import Thread

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Load model from Hugging Face Hub: repository id and the GGUF file inside it.
MODEL_ID = "large-traversaal/Alif-1.0-8B-Instruct"
MODEL_FILE = "model-Q8_0.gguf"
model_path_file = hf_hub_download(MODEL_ID, filename=MODEL_FILE)

# Initialize the single module-level Llama model used by generate_response.
llama = Llama(
    model_path=model_path_file,
    n_gpu_layers=40,  # Adjust based on VRAM
    n_threads=8,  # Match CPU cores
    n_batch=512,  # Optimize for better VRAM usage
    n_ctx=4096,  # Context window size
    verbose=True,  # Enable debug logging
)


def generate_response(message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p):
    """Stream a completion for ``message`` from the module-level ``llama`` model.

    The prompt is ``system_prompt`` followed by ``### Instruction:`` and
    ``### Response:`` sections. ``history`` is accepted but never read, so
    every call only sees the current message.

    Args:
        message: Text placed in the instruction section of the prompt.
        history: Unused.
        system_prompt: Text placed at the start of the prompt.
        temperature: Forwarded to the model call as ``temperature``.
        max_new_tokens: Forwarded to the model call as ``max_tokens``.
        top_k: Forwarded to the model call as ``top_k``.
        repetition_penalty: Forwarded to the model call as ``repeat_penalty``.
        top_p: Forwarded to the model call as ``top_p``.

    Yields:
        The response text accumulated so far, once for every streamed chunk
        whose text is non-empty.
    """
    # Earlier fixed-prompt variant, kept for reference:
    # chat_prompt = f"You are an Urdu Chatbot. Write an appropriate response for the given instruction: {message} Response:"
    prompt = f"{system_prompt}\n ### Instruction: {message}\n ### Response:"

    completion_options = {
        "temperature": temperature,
        "max_tokens": max_new_tokens,
        "top_k": top_k,
        "repeat_penalty": repetition_penalty,
        "top_p": top_p,
        "stop": ["Q:", "\n"],  # stop sequences handed to the model call
        "echo": False,
        "stream": True,
    }
    stream = llama(prompt, **completion_options)

    accumulated = ""
    for piece in stream:
        fragment = piece["choices"][0]["text"]
        if not fragment:
            # Chunks with empty text produce no update.
            continue
        accumulated += fragment
        yield accumulated


# Disabled chat-completion variant of generate_response, kept for reference:
# def generate_response(message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p):
#     """Generates a streaming response from the Llama model."""
#     messages = [
#         {"role": "system", "content": "You are an Urdu Chatbot.
Write an appropriate response for the given instruction."}, # ] # # Add history and the current message # #for user, bot in history: # #messages.append({"role": "user", "content": user}) # #messages.append({"role": "assistant", "content": bot}) # messages.append({"role": "user", "content": message}) # response = llama.create_chat_completion( # messages=messages, # stream=True, # ) # partial_message = "" # for part in response: # content = part["choices"][0]["delta"].get("content", "") # partial_message += content # yield partial_message # JavaScript function for `on_load` on_load = """ async()=>{ alert("Welcome to the Traversaal Alif 1.0 Chatbot! This is an experimental AI model. Please use responsibly."); } """ placeholder = """
Alif 1.0 8B Instruct is an open-source model with highly advanced multilingual reasoning capabilities. It utilizes human refined multilingual synthetic data paired with reasoning to enhance cultural nuance and reasoning capabilities in english and urdu languages.