Spaces:
Runtime error
Gumelar Teja Sukma
committed on
Commit · 6294fdc
1 Parent(s): 0a521b4
bug fix
app.py CHANGED
@@ -5,6 +5,7 @@ from auto_gptq import AutoGPTQForCausalLM
 import os
 import psutil
 os.environ["TOKENIZERS_PARALLELISM"] = "false"  # Avoid extra CPU load
+torch.set_num_threads(2)  # Match the free Spaces CPU limit
 
 # Load model & tokenizer
 # model_name_or_path = "TheBloke/Llama-2-7B-Chat-GPTQ"
@@ -14,7 +15,8 @@ print("CPU cores:", psutil.cpu_count())
 print("RAM (GB):", psutil.virtual_memory().total / (1024**3))
 
 # model_name_or_path = "TheBloke/Llama-2-7B-Chat-GGUF"
-model_name_or_path = "TheBloke/Mistral-7B-v0.1-GPTQ"
+# model_name_or_path = "TheBloke/Mistral-7B-v0.1-GPTQ"
+model_name_or_path = "TheBloke/TinyLlama-1.1B-Chat-GPTQ"
 # tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
 tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
 
@@ -62,12 +64,14 @@ def chat(user_input, chat_history):
     with torch.inference_mode():
         output_ids = model.generate(
             input_ids=input_ids,
-            max_new_tokens=
-            use_cache=
+            max_new_tokens=24,  # Shorter output
+            use_cache=False,
             # do_sample=True,
+            num_beams=1,  # Beam search = 1
             do_sample=False,  # Disable sampling to speed things up
             temperature=0.7,
-            top_p=0.95
+            top_p=0.95,
+            pad_token_id=tokenizer.eos_token_id
         )
 
         # output_ids = model.generate(
@@ -80,7 +84,7 @@ def chat(user_input, chat_history):
         # )
         print("🛠️ DEBUG - output_ids:", output_ids)
         response = tokenizer.decode(output_ids[0], skip_special_tokens=True).split("[/INST]")[-1].strip()
-        print("🛠️ DEBUG - Response:", response)
+        print("🛠️ DEBUG - Response:", response)
         chat_history.append({"role": "assistant", "content": response})
         return chat_history, chat_history
 
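For context, after this commit the decoding path of app.py comes out roughly as in the sketch below. Only the model name and the generate() arguments are taken from the diff; the imports, the AutoGPTQForCausalLM.from_quantized() loading call, the device choice, and the example prompt are assumptions about the parts of app.py the diff does not show (it does import AutoGPTQForCausalLM and already splits the decoded text on "[/INST]").

import os

import torch
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM

os.environ["TOKENIZERS_PARALLELISM"] = "false"  # avoid tokenizer thread overhead
torch.set_num_threads(2)                        # stay within the free Spaces CPU quota

model_name_or_path = "TheBloke/TinyLlama-1.1B-Chat-GPTQ"
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
# Assumption: the quantized weights are loaded on CPU; the diff only shows the import.
model = AutoGPTQForCausalLM.from_quantized(model_name_or_path, device="cpu", use_safetensors=True)

# Hypothetical prompt, formatted to match the "[/INST]" split used later in app.py.
prompt = "[INST] Hello, who are you? [/INST]"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids

with torch.inference_mode():
    output_ids = model.generate(
        input_ids=input_ids,
        max_new_tokens=24,                # short replies to keep CPU latency down
        use_cache=False,
        num_beams=1,                      # single-beam decoding
        do_sample=False,                  # greedy decoding; temperature/top_p are unused
        temperature=0.7,
        top_p=0.95,
        pad_token_id=tokenizer.eos_token_id,
    )

response = tokenizer.decode(output_ids[0], skip_special_tokens=True).split("[/INST]")[-1].strip()
print("🛠️ DEBUG - Response:", response)

Note that with do_sample=False the call is deterministic, so the temperature and top_p values carried over from the old call have no effect on the output.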