# Previous version (kept for reference): remote inference via the Hugging Face
# Inference API instead of loading the model locally.
#
# import gradio as gr
# from huggingface_hub import InferenceClient
# import os
#
# client = InferenceClient(
#     model="mistralai/Mistral-Small-24B-Instruct-2501",
#     token=os.getenv('HF_TOKEN')
# )
#
# def chat_fn(message, system_message, history_str, max_tokens, temperature, top_p):
#     # Convert history string (optional) to message list
#     messages = [{"role": "system", "content": system_message}]
#     if history_str:
#         # Format: user1||assistant1\nuser2||assistant2
#         for pair in history_str.split("\n"):
#             if "||" in pair:
#                 user_msg, assistant_msg = pair.split("||", 1)
#                 messages.append({"role": "user", "content": user_msg})
#                 messages.append({"role": "assistant", "content": assistant_msg})
#     messages.append({"role": "user", "content": message})
#
#     # Stream the response from the Inference API
#     response = ""
#     for chunk in client.chat_completion(
#         messages=messages,
#         stream=True,
#         max_tokens=max_tokens,
#         temperature=temperature,
#         top_p=top_p,
#     ):
#         response += chunk.choices[0].delta.content or ""
#     return response
#
# demo = gr.Interface(
#     fn=chat_fn,
#     inputs=[
#         gr.Textbox(lines=2, label="User Message"),
#         gr.Textbox(value="You are a friendly Chatbot.", label="System Prompt"),
#         gr.Textbox(lines=4, placeholder="user||bot\nuser2||bot2", label="Conversation History (optional)"),
#         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max Tokens"),
#         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
#         gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
#     ],
#     outputs="text",
#     allow_flagging="never",
#     title="LLM Budaya",
#     description="Chatbot using the Mistral-Small-24B-Instruct-2501 model via the Hugging Face Inference API"
# )
#
# if __name__ == "__main__":
#     demo.launch()

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load model & tokenizer
model_id = "mistralai/Mistral-Small-24B-Instruct-2501"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load model on CPU in float32 (roughly 4 bytes per parameter, so a 24B model
# needs on the order of 90+ GB of RAM)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float32,
    device_map={"": "cpu"}
)

# Inference function
def chat_fn(message, system_prompt, max_tokens, temperature, top_p):
    # Simplified Mistral instruct prompt: system prompt and user message in one [INST] block
    prompt = f"[INST] {system_prompt.strip()}\n{message.strip()} [/INST]"
    inputs = tokenizer(prompt, return_tensors="pt").to("cpu")
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=int(max_tokens),  # slider values may arrive as floats
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )
    decoded = tokenizer.decode(output[0], skip_special_tokens=True)
    # Keep only the text generated after the prompt
    return decoded.split("[/INST]")[-1].strip()

# Gradio UI
demo = gr.Interface(
    fn=chat_fn,
    inputs=[
        gr.Textbox(lines=2, label="User Message"),
        gr.Textbox(value="You are a helpful and concise assistant.", label="System Prompt"),
        gr.Slider(minimum=1, maximum=1024, value=256, step=1, label="Max Tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p"),
    ],
    outputs="text",
    title="Mistral-Small-24B CPU Chat",
    description="Chatbot using the Mistral-Small-24B-Instruct-2501 model run locally on CPU. This will be slow.",
    flagging_mode="never",
)

if __name__ == "__main__":
    demo.launch()
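
# Optional sanity check (a sketch, not wired into the app): once the model has
# loaded, chat_fn can be called directly from a Python shell, bypassing the
# Gradio UI. The example arguments below are arbitrary illustrations.
#
# print(chat_fn(
#     message="Hello, who are you?",
#     system_prompt="You are a helpful and concise assistant.",
#     max_tokens=64,
#     temperature=0.7,
#     top_p=0.95,
# ))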