import os

import gradio as gr
import psutil
import torch
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM

os.environ["TOKENIZERS_PARALLELISM"] = "false"  # avoid extra CPU load from tokenizer threads
torch.set_num_threads(2)  # match the free Spaces CPU limit

# Environment diagnostics
print("PyTorch version:", torch.__version__)
print("GPU available:", torch.cuda.is_available())
print("CPU cores:", psutil.cpu_count())
print("RAM (GB):", psutil.virtual_memory().total / (1024**3))

# Load model & tokenizer.
# AutoGPTQForCausalLM.from_quantized() expects a GPTQ-quantized checkpoint;
# the full-precision "meta-llama/Llama-2-7b-chat-hf" repo is not one, so use
# the GPTQ build instead.
model_name_or_path = "TheBloke/Llama-2-7B-Chat-GPTQ"
# model_name_or_path = "meta-llama/Llama-2-7b-chat-hf"    # full-precision, not GPTQ
# model_name_or_path = "TheBloke/Llama-2-7B-Chat-GGUF"    # GGUF format, needs llama.cpp rather than auto_gptq
# model_name_or_path = "TheBloke/Mistral-7B-v0.1-GPTQ"
# model_name_or_path = "unsloth/DeepSeek-R1-0528-GGUF"    # 3x faster than Mistral-7B

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

model = AutoGPTQForCausalLM.from_quantized(
    model_name_or_path,
    model_basename="model",
    # device_map="auto",             # auto-detect GPU/CPU
    # device_map="cpu",
    torch_dtype=torch.float32,       # avoid float16 on CPU
    use_safetensors=True,
    trust_remote_code=True,
    use_triton=False,
    inject_fused_attention=False,    # required on CPU
    inject_fused_mlp=False,
    disable_exllama=True,            # required on CPU
    disable_exllamav2=True,
)
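# Note: on a CPU-only free Space, auto_gptq may still try to set up CUDA/exllama
# kernels. A hedged alternative (assuming your auto_gptq version accepts the
# `device` argument on from_quantized) is to pin the model to CPU explicitly:
#
#   model = AutoGPTQForCausalLM.from_quantized(
#       model_name_or_path,
#       model_basename="model",
#       device="cpu",
#       use_safetensors=True,
#       disable_exllama=True,
#       disable_exllamav2=True,
#   )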
# Prompt template (Llama-2 chat format)
SYSTEM_PROMPT = "<<SYS>>\nKamu adalah asisten AI yang santuy dan suka ngoding.\n<</SYS>>\n\n"  # "You are a laid-back AI assistant who likes coding."

# def build_prompt(history):
#     prompt = f"<s>[INST] {SYSTEM_PROMPT}{history[-1][0]} [/INST]"
#     return prompt

def build_prompt(chat_history):
    # Fold the messages-style history into a single [INST] ... [/INST] prompt.
    prompt = f"<s>[INST] {SYSTEM_PROMPT}"
    for msg in chat_history:
        if msg["role"] == "user":
            prompt += f"{msg['content']} [/INST] "
        else:
            prompt += f"{msg['content']} </s><s>[INST] "
    return prompt
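# For reference, with a single (hypothetical) user turn such as "Halo!",
# build_prompt() produces the standard Llama-2 chat prompt:
#
#   <s>[INST] <<SYS>>
#   Kamu adalah asisten AI yang santuy dan suka ngoding.
#   <</SYS>>
#
#   Halo! [/INST]
#
# The model's reply is expected after the final "[/INST]".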
def chat(user_input, chat_history):
    print("🛠️ DEBUG - Input:", user_input)
    print("🛠️ DEBUG - History:", chat_history)
    if not chat_history:
        chat_history = []
    chat_history.append({"role": "user", "content": user_input})
    prompt = build_prompt(chat_history)
    print("🛠️ DEBUG - prompt:", prompt)
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cpu")
    print("🛠️ DEBUG - input_ids:", input_ids)
    with torch.inference_mode():
        output_ids = model.generate(
            input_ids=input_ids,
            max_new_tokens=24,        # keep replies short to fit the CPU budget
            use_cache=False,          # saves memory but slows generation
            num_beams=1,              # greedy decoding, no beam search
            do_sample=False,          # sampling off to speed things up
            # temperature/top_p dropped: they are ignored when do_sample=False
            pad_token_id=tokenizer.eos_token_id,
        )
        # output_ids = model.generate(
        #     input_ids=input_ids,
        #     max_new_tokens=128,
        #     use_cache=True,
        #     do_sample=True,
        #     temperature=0.7,
        #     top_p=0.95,
        # )
    print("🛠️ DEBUG - output_ids:", output_ids)
    response = tokenizer.decode(output_ids[0], skip_special_tokens=True).split("[/INST]")[-1].strip()
    print("🛠️ DEBUG - Response:", response)
    chat_history.append({"role": "assistant", "content": response})
    return chat_history, chat_history
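# Quick sanity check of the handler outside Gradio (hypothetical input):
#
#   history, _ = chat("Halo, bisa bantu ngoding?", [])
#   # history == [
#   #     {"role": "user", "content": "Halo, bisa bantu ngoding?"},
#   #     {"role": "assistant", "content": "..."},   # model output
#   # ]
#
# Both return values are the same list, which feeds gr.Chatbot(type="messages")
# and gr.State below.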
# Previous version of chat() using the old tuple-style (user, bot) history:
# def chat(user_input, chat_history):
#     if not chat_history:
#         chat_history = []
#     prompt = build_prompt(chat_history + [[user_input, ""]])
#     input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
#     output_ids = model.generate(
#         input_ids=input_ids,
#         max_new_tokens=256,
#         do_sample=True,
#         temperature=0.7,
#         top_p=0.95,
#     )
#     generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
#     response = generated_text.split("[/INST]")[-1].strip()
#     chat_history.append([user_input, response])
#     return chat_history, chat_history
# Gradio UI
with gr.Blocks(title="Ujang v3 Chatbot") as demo:
    gr.Markdown("### 🤖 Ujang v3 - LLaMA 2 Chatbot GPTQ")
    chatbot = gr.Chatbot(type="messages")   # messages format matches the dict-based history
    msg = gr.Textbox(label="Ketik pesan:")  # "Type a message:"
    clear = gr.Button("🧹 Bersihkan")        # "Clear"
    state = gr.State([])

    msg.submit(chat, [msg, state], [chatbot, state])
    clear.click(lambda: ([], []), None, [chatbot, state])

demo.launch()
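# If requests pile up on the free CPU tier, Gradio's built-in request queue can
# serialize them; a minimal sketch, assuming a Gradio version where
# Blocks.queue() is available:
#
#   demo.queue(max_size=8).launch()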