import torch
import gradio as gr
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM
import os
import psutil
os.environ["TOKENIZERS_PARALLELISM"] = "false"  # Avoid extra CPU load from tokenizer parallelism
torch.set_num_threads(2)  # Match the CPU limit of the free Spaces tier
# Runtime diagnostics
print("PyTorch Version:", torch.__version__)
print("Is GPU Available:", torch.cuda.is_available())
print("CPU cores:", psutil.cpu_count())
print("RAM (GB):", psutil.virtual_memory().total / (1024**3))
# Load model & tokenizer
model_name_or_path = "TheBloke/Llama-2-7B-Chat-GPTQ"
# model_name_or_path = "meta-llama/Llama-2-7b-chat-hf"
# model_name_or_path = "TheBloke/Llama-2-7B-Chat-GGUF"
# model_name_or_path = "TheBloke/Mistral-7B-v0.1-GPTQ"
# model_name_or_path = "unsloth/DeepSeek-R1-0528-GGUF"  # 3x faster than Mistral-7B
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
model = AutoGPTQForCausalLM.from_quantized(
    model_name_or_path,
    model_basename="model",
    # device_map="auto",  # Auto-detects GPU/CPU
    # device_map="cpu",
    torch_dtype=torch.float32,      # Avoid float16 on CPU
    use_safetensors=True,
    trust_remote_code=True,
    use_triton=False,
    inject_fused_attention=False,   # Required for CPU
    inject_fused_mlp=False,
    disable_exllama=True,           # Required for CPU
    disable_exllamav2=True,
)
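
# Note (assumption about auto_gptq internals): the exllama kernels and the fused
# attention/MLP modules are CUDA-only, so with them disabled the quantized
# layers should fall back to plain PyTorch ops and run on the CPU.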
# Prompt template
SYSTEM_PROMPT = "<<SYS>>\nKamu adalah asisten AI yang santuy dan suka ngoding.\n<</SYS>>\n\n"

# def build_prompt(history):
#     prompt = f"<s>[INST] {SYSTEM_PROMPT}{history[-1][0]} [/INST]"
#     return prompt

def build_prompt(chat_history):
    # Assemble a Llama-2 chat prompt from the running message history
    prompt = f"<s>[INST] {SYSTEM_PROMPT}"
    for msg in chat_history:
        if msg["role"] == "user":
            prompt += f"{msg['content']} [/INST] "
        else:
            prompt += f"{msg['content']} </s><s>[INST] "
    return prompt
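
# Illustrative only: for a single (hypothetical) user message "Halo",
# build_prompt() produces:
# <s>[INST] <<SYS>>
# Kamu adalah asisten AI yang santuy dan suka ngoding.
# <</SYS>>
#
# Halo [/INST]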
def chat(user_input, chat_history):
    print("🛠️ DEBUG - Input:", user_input)
    print("🛠️ DEBUG - History:", chat_history)
    if not chat_history:
        chat_history = []
    chat_history.append({"role": "user", "content": user_input})
    prompt = build_prompt(chat_history)
    print("🛠️ DEBUG - Prompt:", prompt)
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cpu")
    print("🛠️ DEBUG - input_ids:", input_ids)
    with torch.inference_mode():
        output_ids = model.generate(
            input_ids=input_ids,
            max_new_tokens=24,      # Keep generation short to fit the CPU budget
            use_cache=False,
            # do_sample=True,
            num_beams=1,            # Single beam (greedy search)
            do_sample=False,        # Disable sampling to speed things up
            temperature=0.7,
            top_p=0.95,
            pad_token_id=tokenizer.eos_token_id
        )
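        # Note: with do_sample=False generation is greedy, so the temperature
        # and top_p values above are effectively ignored by model.generate().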
    # output_ids = model.generate(
    #     input_ids=input_ids,
    #     max_new_tokens=128,
    #     use_cache=True,
    #     do_sample=True,
    #     temperature=0.7,
    #     top_p=0.95
    # )
    print("🛠️ DEBUG - output_ids:", output_ids)
    response = tokenizer.decode(output_ids[0], skip_special_tokens=True).split("[/INST]")[-1].strip()
    print("🛠️ DEBUG - Response:", response)
    chat_history.append({"role": "assistant", "content": response})
    return chat_history, chat_history
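
# chat() returns (chatbot_messages, state): the first value feeds the Chatbot
# component and the second updates gr.State in the msg.submit() wiring below.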
# def chat(user_input, chat_history):
#     if not chat_history:
#         chat_history = []
#     prompt = build_prompt(chat_history + [[user_input, ""]])
#     input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
#     output_ids = model.generate(
#         input_ids=input_ids,
#         max_new_tokens=256,
#         do_sample=True,
#         temperature=0.7,
#         top_p=0.95,
#     )
#     generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
#     response = generated_text.split("[/INST]")[-1].strip()
#     chat_history.append([user_input, response])
#     return chat_history, chat_history
# Gradio UI
with gr.Blocks(title="Ujang v3 Chatbot") as demo:
    gr.Markdown("### 🤖 Ujang v3 - LLaMA 2 Chatbot GPTQ")
    # chatbot = gr.Chatbot()
    chatbot = gr.Chatbot(type="messages")
    msg = gr.Textbox(label="Ketik pesan:")
    clear = gr.Button("🧹 Bersihkan")
    state = gr.State([])
    msg.submit(chat, [msg, state], [chatbot, state])
    clear.click(lambda: ([], []), None, [chatbot, state])
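
    # Optional (not in the original): register a second submit listener to clear
    # the textbox after each message; Gradio allows multiple listeners per event.
    # msg.submit(lambda: "", None, msg)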
demo.launch()