import torch
import gradio as gr
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM
import os
import psutil

os.environ["TOKENIZERS_PARALLELISM"] = "false"  # Avoid extra CPU load from tokenizer parallelism
torch.set_num_threads(2)  # Match the free Spaces CPU limit
# Runtime diagnostics: confirm the CPU-only environment and available resources
print("PyTorch version:", torch.__version__)
print("Is GPU available:", torch.cuda.is_available())
print("CPU cores:", psutil.cpu_count())
print("RAM (GB):", psutil.virtual_memory().total / (1024**3))
# Load model & tokenizer
model_name_or_path = "TheBloke/Llama-2-7B-Chat-GPTQ"
# Alternative checkpoints:
# model_name_or_path = "meta-llama/Llama-2-7b-chat-hf"
# model_name_or_path = "TheBloke/Llama-2-7B-Chat-GGUF"
# model_name_or_path = "TheBloke/Mistral-7B-v0.1-GPTQ"
# model_name_or_path = "unsloth/DeepSeek-R1-0528-GGUF"  # 3x faster than Mistral-7B

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
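# Note (added comment): the Llama-2 tokenizer ships without a dedicated pad token;
# generate() below passes pad_token_id=tokenizer.eos_token_id, so no extra tokenizer
# setup is needed here.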
model = AutoGPTQForCausalLM.from_quantized(
    model_name_or_path,
    model_basename="model",
    # device_map="auto",  # auto-detects GPU/CPU
    # device_map="cpu",
    torch_dtype=torch.float32,       # avoid float16 on CPU
    use_safetensors=True,
    trust_remote_code=True,
    use_triton=False,
    inject_fused_attention=False,    # required for CPU
    inject_fused_mlp=False,
    disable_exllama=True,            # required for CPU
    disable_exllamav2=True,
)
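
# Optional sanity check (a sketch, not part of the original app): report the process
# resident memory after loading, to confirm the quantized model fits in free-tier RAM.
# Uses psutil, which is already imported above.
_proc = psutil.Process(os.getpid())
print("RSS after model load (GB):", _proc.memory_info().rss / (1024**3))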
# Prompt template (Llama-2 chat format)
SYSTEM_PROMPT = "<<SYS>>\nYou are a laid-back AI assistant who loves coding.\n<</SYS>>\n\n"

# Earlier single-turn version:
# def build_prompt(history):
#     prompt = f"<s>[INST] {SYSTEM_PROMPT}{history[-1][0]} [/INST]"
#     return prompt
def build_prompt(chat_history):
    """Build a Llama-2 chat prompt from a list of {"role", "content"} messages."""
    prompt = f"<s>[INST] {SYSTEM_PROMPT}"
    for msg in chat_history:
        if msg["role"] == "user":
            prompt += f"{msg['content']} [/INST] "
        else:
            prompt += f"{msg['content']} </s><s>[INST] "
    return prompt
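
# Example (illustrative, not in the original): for
#   chat_history = [{"role": "user", "content": "Hi"}]
# build_prompt() returns
#   "<s>[INST] <<SYS>>\n...system text...\n<</SYS>>\n\nHi [/INST] "
# i.e. the standard Llama-2 chat layout, with [INST] ... [/INST] around each user turn
# and </s><s>[INST] separating completed assistant turns.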
def chat(user_input, chat_history):
    print("🛠️ DEBUG - Input:", user_input)
    print("🛠️ DEBUG - History:", chat_history)
    if not chat_history:
        chat_history = []
    chat_history.append({"role": "user", "content": user_input})
    prompt = build_prompt(chat_history)
    print("🛠️ DEBUG - Prompt:", prompt)
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cpu")
    print("🛠️ DEBUG - input_ids:", input_ids)
    with torch.inference_mode():
        output_ids = model.generate(
            input_ids=input_ids,
            max_new_tokens=24,    # keep generations short on the CPU-only tier
            use_cache=False,
            num_beams=1,          # single beam (greedy)
            do_sample=False,      # sampling off to speed things up; temperature/top_p are then ignored
            temperature=0.7,
            top_p=0.95,
            pad_token_id=tokenizer.eos_token_id
        )
        # Earlier, longer-generation settings:
        # output_ids = model.generate(
        #     input_ids=input_ids,
        #     max_new_tokens=128,
        #     use_cache=True,
        #     do_sample=True,
        #     temperature=0.7,
        #     top_p=0.95,
        # )
    print("🛠️ DEBUG - output_ids:", output_ids)
    response = tokenizer.decode(output_ids[0], skip_special_tokens=True).split("[/INST]")[-1].strip()
    print("🛠️ DEBUG - Response:", response)
    chat_history.append({"role": "assistant", "content": response})
    return chat_history, chat_history
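
# Possible extension (a sketch, not in the original app): cap how much history goes into
# the prompt so it stays within the context window and CPU latency stays manageable, e.g.
#   MAX_TURNS = 4  # hypothetical limit
#   prompt = build_prompt(chat_history[-MAX_TURNS:])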
# Earlier tuple-based version of chat():
# def chat(user_input, chat_history):
#     if not chat_history:
#         chat_history = []
#     prompt = build_prompt(chat_history + [[user_input, ""]])
#     input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
#     output_ids = model.generate(
#         input_ids=input_ids,
#         max_new_tokens=256,
#         do_sample=True,
#         temperature=0.7,
#         top_p=0.95,
#     )
#     generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
#     response = generated_text.split("[/INST]")[-1].strip()
#     chat_history.append([user_input, response])
#     return chat_history, chat_history
# Gradio UI
with gr.Blocks(title="Ujang v3 Chatbot") as demo:
    gr.Markdown("### 🤖 Ujang v3 - LLaMA 2 Chatbot GPTQ")
    # chatbot = gr.Chatbot()
    chatbot = gr.Chatbot(type="messages")
    msg = gr.Textbox(label="Type a message:")
    clear = gr.Button("🧹 Clear")
    state = gr.State([])

    msg.submit(chat, [msg, state], [chatbot, state])
    clear.click(lambda: ([], []), None, [chatbot, state])

demo.launch()
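
# Note (a judgment call, not in the original): generations on the free CPU tier can take
# a while, so enabling Gradio's request queue before launching may help, e.g.
#   demo.queue().launch()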