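"""Gradio chat demo: TheBloke/Llama-2-7B-Chat-GPTQ served on CPU via AutoGPTQ.

Written for a free Hugging Face Spaces CPU instance, so thread count, token
budget, and decoding options are kept deliberately small.
"""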
import os

import gradio as gr
import psutil
import torch
from auto_gptq import AutoGPTQForCausalLM
from transformers import AutoTokenizer

os.environ["TOKENIZERS_PARALLELISM"] = "false"  # Avoid extra CPU load from tokenizer parallelism
torch.set_num_threads(2)  # Match the free Spaces CPU limit

# Environment info
print("PyTorch version:", torch.__version__)
print("GPU available:", torch.cuda.is_available())
print("CPU cores:", psutil.cpu_count())
print("RAM (GB):", psutil.virtual_memory().total / (1024**3))

# Load model & tokenizer
model_name_or_path = "TheBloke/Llama-2-7B-Chat-GPTQ"
# model_name_or_path = "meta-llama/Llama-2-7b-chat-hf"
# model_name_or_path = "TheBloke/Llama-2-7B-Chat-GGUF"
# model_name_or_path = "TheBloke/Mistral-7B-v0.1-GPTQ"
# model_name_or_path = "unsloth/DeepSeek-R1-0528-GGUF"  # 3x faster than Mistral-7B
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

model = AutoGPTQForCausalLM.from_quantized(
    model_name_or_path,
    model_basename="model",
    # device_map="auto",  # Auto-detects GPU/CPU
    # device_map="cpu",
    torch_dtype=torch.float32,  # Avoid float16 on CPU
    use_safetensors=True,
    trust_remote_code=True,
    use_triton=False,
    inject_fused_attention=False,  # Required for CPU inference
    inject_fused_mlp=False,
    disable_exllama=True,  # ExLlama kernels require a GPU
    disable_exllamav2=True,
)
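# No device_map is passed, so the quantized model stays on CPU; this matches the
# explicit .to("cpu") applied to the input ids in chat() below.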

# Prompt template (Llama-2 chat format)
SYSTEM_PROMPT = "<<SYS>>\nYou are a laid-back AI assistant who loves coding.\n<</SYS>>\n\n"

# def build_prompt(history):
#     prompt = f"<s>[INST] {SYSTEM_PROMPT}{history[-1][0]} [/INST]"
#     return prompt
def build_prompt(chat_history):
    # Build a Llama-2 chat prompt from the messages-style history.
    prompt = f"<s>[INST] {SYSTEM_PROMPT}"
    for msg in chat_history:
        if msg["role"] == "user":
            prompt += f"{msg['content']} [/INST] "
        else:
            prompt += f"{msg['content']} </s><s>[INST] "
    return prompt
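# Illustrative example (hypothetical two-turn history) of what build_prompt() returns:
#   <s>[INST] <<SYS>>
#   You are a laid-back AI assistant who loves coding.
#   <</SYS>>
#
#   Hi! [/INST] Hey! What do you want to build? </s><s>[INST] Write a Python loop [/INST]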

def chat(user_input, chat_history):
    print("🛠️ DEBUG - Input:", user_input)
    print("🛠️ DEBUG - History:", chat_history)
    if not chat_history:
        chat_history = []

    chat_history.append({"role": "user", "content": user_input})
    prompt = build_prompt(chat_history)
    print("🛠️ DEBUG - prompt:", prompt)
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cpu")
    print("🛠️ DEBUG - input_ids:", input_ids)
    with torch.inference_mode():
        output_ids = model.generate(
            input_ids=input_ids,
            max_new_tokens=24,  # Keep responses short to fit the CPU time budget
            use_cache=False,
            num_beams=1,  # Greedy decoding, no beam search
            do_sample=False,  # Sampling disabled for speed; temperature/top_p below are ignored
            temperature=0.7,
            top_p=0.95,
            pad_token_id=tokenizer.eos_token_id,
        )
    
    # output_ids = model.generate(
    #     input_ids=input_ids, 
    #     max_new_tokens=128,
    #     use_cache=True,
    #     do_sample=True,
    #     temperature=0.7,
    #     top_p=0.95
    # )
    print("πŸ› οΈ DEBUG - output_ids:", output_ids)
    response = tokenizer.decode(output_ids[0], skip_special_tokens=True).split("[/INST]")[-1].strip()
    print("πŸ› οΈ DEBUG - Response:", response)
    chat_history.append({"role": "assistant", "content": response})
    return chat_history, chat_history

# def chat(user_input, chat_history):
#     if not chat_history:
#         chat_history = []

#     prompt = build_prompt(chat_history + [[user_input, ""]])
#     input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)

#     output_ids = model.generate(
#         input_ids=input_ids,
#         max_new_tokens=256,
#         do_sample=True,
#         temperature=0.7,
#         top_p=0.95,
#     )

#     generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
#     response = generated_text.split("[/INST]")[-1].strip()
#     chat_history.append([user_input, response])
#     return chat_history, chat_history

# Gradio UI
with gr.Blocks(title="Ujang v3 Chatbot") as demo:
    gr.Markdown("### πŸ€– Ujang v3 - LLaMA 2 Chatbot GPTQ")
    # chatbot = gr.Chatbot()
    chatbot = gr.Chatbot(type="messages")
    msg = gr.Textbox(label="Type a message:")
    clear = gr.Button("🧹 Clear")
    state = gr.State([])

    msg.submit(chat, [msg, state], [chatbot, state])
    clear.click(lambda: ([], []), None, [chatbot, state])

demo.launch()
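
# To run locally (assumed dependency set; nothing in this file pins versions):
#   pip install torch transformers auto-gptq gradio psutil
#   python app.py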