import gradio as gr
from llama_cpp import Llama
import torch
import os
from accelerate import Accelerator
import tensorflow as tf # Import TensorFlow
import numpy as np # For handling input data
# Set device for PyTorch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device set to:", device)
# Initialize the accelerator
accelerator = Accelerator()
class LocalInferenceClient:
    def __init__(self, model_name: str, model_path: str):
        """
        Initialize the inference client with the model.

        Args:
            model_name (str): The name of the model.
            model_path (str): The path to the model file or directory.
        """
        self.model_name = model_name
        self.model_path = model_path
        # Initialize the Llama model specifically for gguf
        self.model = Llama(model_path=model_path, n_ctx=2048, n_threads=8, n_gpu_layers=5)
        # Hand the model to the accelerator
        self.model = accelerator.prepare(self.model)
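        # Note: accelerator.prepare() only wraps object types it recognizes (PyTorch
        # modules, optimizers, dataloaders); a llama_cpp.Llama instance is returned
        # unchanged, so this call is effectively a pass-through here.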
        # Load the TensorFlow Lite model
        self.tflite_interpreter = tf.lite.Interpreter(model_path='model.tflite')
        self.tflite_interpreter.allocate_tensors()
        # Get input and output tensors
        self.input_details = self.tflite_interpreter.get_input_details()
        self.output_details = self.tflite_interpreter.get_output_details()
    def text_generation(self, prompt: str, max_new_tokens: int, temperature: float, top_p: float) -> str:
        """
        Generate text based on the provided prompt.

        Args:
            prompt (str): The input prompt.
            max_new_tokens (int): The maximum number of tokens to generate.
            temperature (float): Sampling temperature.
            top_p (float): Nucleus sampling probability.

        Returns:
            str: The generated text.
        """
        # Use the Llama model for text generation
        response = self.model.create_chat_completion(
            messages=[{"role": "user", "content": prompt}],
            max_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p
        )
        # Print the response to understand its structure
        print("Response from model:", response)
        # Access the content correctly based on the response structure
        if 'choices' in response and len(response['choices']) > 0:
            return response['choices'][0]['message']['content']  # Access the content key
        else:
            return "⚠️ Error: Unexpected response format."
    def run_tflite_model(self, input_data: np.ndarray) -> np.ndarray:
        """
        Run inference using the TensorFlow Lite model.

        Args:
            input_data (np.ndarray): Input data for the model.

        Returns:
            np.ndarray: Output data from the model.
        """
        # Set the input tensor
        self.tflite_interpreter.set_tensor(self.input_details[0]['index'], input_data)
        # Run the model
        self.tflite_interpreter.invoke()
        # Get the output tensor
        output_data = self.tflite_interpreter.get_tensor(self.output_details[0]['index'])
        return output_data
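# Usage sketch for run_tflite_model (illustrative only; the expected shape and dtype
# come from whatever model.tflite actually contains, so nothing here is guaranteed):
#
#     client = clients["Test"]
#     dummy = np.zeros(client.input_details[0]['shape'],
#                      dtype=client.input_details[0]['dtype'])
#     output = client.run_tflite_model(dummy)
#     print(output.shape)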
# Specify the model paths for gguf models
model_configs = {
"Test": {
"path": r"./test-model.gguf",
"specs": """
## Lake 1 Chat Specifications
- **Architecture**: Test
- **Parameters**: IDK
- **Capabilities**: test
- **Intended Use**: test
"""
}
}
# Set up a dictionary mapping model names to their clients
clients = {name: LocalInferenceClient(name, config['path']) for name, config in model_configs.items()}
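# Note: building `clients` eagerly loads every configured model at import time, so
# ./test-model.gguf (and the model.tflite referenced in __init__) must exist before
# the app starts; a missing file will raise an exception during startup.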
# Presets for performance/quality tradeoffs
presets = {
"Test": {
"Fast": {"max_new_tokens": 100, "temperature": 1.0, "top_p": 0.9},
"Normal": {"max_new_tokens": 200, "temperature": 0.7, "top_p": 0.95},
"Quality": {"max_new_tokens": 300, "temperature": 0.5, "top_p": 0.90},
}
}
# A system prompt for the model
system_messages = {
"Test": "You are Lake 1 Chat, a powerful open-source reasoning model. Think carefully and answer step by step.",
}
def generate_response(message: str, model_name: str, preset: str) -> str:
"""
Generate a response based on the user's message.
Args:
message (str): The user's message.
model_name (str): The name of the model to use.
preset (str): The performance preset to apply.
Returns:
str: The generated response.
"""
client = clients[model_name]
params = presets[model_name][preset]
system_msg = system_messages[model_name]
prompt = f"{system_msg}\n\n:User {message}\nAssistant:"
return client.text_generation(
prompt,
max_new_tokens=params["max_new_tokens"],
temperature=params["temperature"],
top_p=params["top_p"]
)
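# With the "Test" model and the message "Hi", the prompt handed to text_generation
# looks roughly like:
#   "You are Lake 1 Chat, ... answer step by step.\n\nUser: Hi\nAssistant:"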
def handle_chat(message: str, history: list, model: str, preset: str) -> str:
"""
Handle the chat interaction.
Args:
message (str): The user's message.
history (list): The conversation history.
model (str): The model to use.
preset (str): The performance preset.
Returns:
str: The generated response.
"""
try:
return generate_response(message, model, preset)
except Exception as e:
return f"⚠️ Error: {str(e)}"
with gr.Blocks(title="BI CORP AI Assistant", theme="soft") as demo:
    gr.Markdown("# <center>Lake AI Assistant</center>")
    gr.Markdown("### <center>Powered by Lake 1 Chat</center>")
    with gr.Row():
        with gr.Column(scale=1):
            model_dropdown = gr.Dropdown(
                label="🤖 Model Selection",
                choices=list(clients.keys()),
                value="Test",  # default must be one of the configured model names
                interactive=True
            )
            preset_dropdown = gr.Dropdown(
                label="⚙️ Performance Preset",
                choices=["Fast", "Normal", "Quality"],
                value="Normal",
                interactive=True
            )
            model_info_md = gr.Markdown(
                value=model_configs["Test"]["specs"],
                label="📝 Model Specifications"
            )
        with gr.Column(scale=3):
            chat_interface = gr.ChatInterface(
                fn=handle_chat,
                additional_inputs=[model_dropdown, preset_dropdown],
                examples=[["Explain quantum computing", "Test", "Normal"]],
                chatbot=gr.Chatbot(height=600, label="💬 Conversation", show_copy_button=True),
                textbox=gr.Textbox(placeholder="Type your message...", container=False, scale=7, autofocus=True),
                submit_btn=gr.Button("🚀 Send", variant="primary")
            )
            clear_button = gr.Button("🧹 Clear History")
            clear_button.click(
                fn=lambda: None,
                inputs=[],
                outputs=chat_interface.chatbot,
                queue=False
            )
    model_dropdown.change(
        fn=lambda model: model_configs[model]["specs"],
        inputs=model_dropdown,
        outputs=model_info_md,
        queue=False
    )
if __name__ == "__main__":
    demo.launch(server_port=7865)
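# Run with `python app.py`; the Gradio UI is then served locally on port 7865
# (http://localhost:7865). Pass share=True to demo.launch() if a temporary public
# link is needed.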