import torch
import gradio as gr
from transformers import AutoTokenizer, pipeline

# Fine-tuned chat model (based on meta-llama/Llama-2-7b-chat-hf)
model = "Harshithacj123/CCU_Llama_7b_chat_test"

tokenizer = AutoTokenizer.from_pretrained(model)

llama_pipeline = pipeline(
    "text-generation",  # LLM task
    model=model,
    torch_dtype=torch.float16,
    device_map="auto",
)

# System prompt wrapped in the Llama 2 chat tags
SYSTEM_PROMPT = """[INST] <<SYS>>
You are a helpful bot. Your answers are clear and concise.
<</SYS>>
"""


def format_message(message: str, history: list, memory_limit: int = 3) -> str:
    """
    Formats the current message and the conversation history into a single
    prompt string following the Llama 2 chat template.

    Parameters:
        message (str): User's current input message.
        history (list): Past (user, model) message pairs.
        memory_limit (int): Maximum number of past exchanges to include.

    Returns:
        str: Prompt string for the Llama model.
    """
    # Keep only the most recent exchanges.
    if len(history) > memory_limit:
        history = history[-memory_limit:]

    # First turn: system prompt followed by the user message.
    if len(history) == 0:
        return SYSTEM_PROMPT + f"{message} [/INST]"

    # Replay past exchanges, each wrapped per the Llama 2 chat template.
    formatted_message = SYSTEM_PROMPT + f"{history[0][0]} [/INST] {history[0][1]} </s>"
    for user_msg, model_answer in history[1:]:
        formatted_message += f"<s>[INST] {user_msg} [/INST] {model_answer} </s>"

    # Append the current message.
    formatted_message += f"<s>[INST] {message} [/INST]"
    return formatted_message


# Generate a response from the Llama model
def get_llama_response(message: str, history: list) -> str:
    """
    Generates a conversational response from the Llama model.

    Parameters:
        message (str): User's input message.
        history (list): Past conversation history.

    Returns:
        str: Generated response from the Llama model.
    """
    query = format_message(message, history)

    sequences = llama_pipeline(
        query,
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        max_length=512,
    )

    generated_text = sequences[0]["generated_text"]
    response = generated_text[len(query):]  # Remove the prompt from the output

    print("Chatbot:", response.strip())
    return response.strip()


gr.ChatInterface(get_llama_response).launch()
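
# Illustration only (assumed example, not part of the app): with an empty
# history, format_message builds a first-turn prompt in the Llama 2 chat
# format, e.g. format_message("Hi there!", []) returns:
#
#   [INST] <<SYS>>
#   You are a helpful bot. Your answers are clear and concise.
#   <</SYS>>
#   Hi there! [/INST]
#
# Subsequent turns are replayed as <s>[INST] user [/INST] answer </s> blocks
# before the current message, so the model sees up to `memory_limit` past
# exchanges as context.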