Gumelar Teja Sukma committed
Commit 6294fdc · 1 Parent(s): 0a521b4
Files changed (1)
  1. app.py +9 -5
app.py CHANGED
@@ -5,6 +5,7 @@ from auto_gptq import AutoGPTQForCausalLM
 import os
 import psutil
 os.environ["TOKENIZERS_PARALLELISM"] = "false"  # Avoid extra CPU load
+torch.set_num_threads(2)  # Match the free Spaces CPU limit
 
 # Load model & tokenizer
 # model_name_or_path = "TheBloke/Llama-2-7B-Chat-GPTQ"
@@ -14,7 +15,8 @@ print("CPU cores:", psutil.cpu_count())
 print("RAM (GB):", psutil.virtual_memory().total / (1024**3))
 
 # model_name_or_path = "TheBloke/Llama-2-7B-Chat-GGUF"
-model_name_or_path = "TheBloke/Mistral-7B-v0.1-GPTQ"
+# model_name_or_path = "TheBloke/Mistral-7B-v0.1-GPTQ"
+model_name_or_path = "TheBloke/TinyLlama-1.1B-Chat-GPTQ"
 # tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
 tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
 
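For context, here is a minimal sketch of how the newly selected TinyLlama GPTQ checkpoint is typically loaded on a CPU-only Space with auto_gptq. The loading call itself is not part of this hunk, so the `from_quantized` arguments below (`device`, `use_safetensors`, `use_triton`) are assumptions, not the app's actual configuration.

```python
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM

model_name_or_path = "TheBloke/TinyLlama-1.1B-Chat-GPTQ"

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

# Assumed loading pattern; the commit only shows the repo-id switch,
# not how the quantized weights are loaded.
model = AutoGPTQForCausalLM.from_quantized(
    model_name_or_path,
    device="cpu",          # free Spaces run on CPU only
    use_safetensors=True,  # assumption: the repo ships safetensors weights
    use_triton=False,      # Triton kernels require a GPU
)
```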
 
@@ -62,12 +64,14 @@ def chat(user_input, chat_history):
     with torch.inference_mode():
         output_ids = model.generate(
             input_ids=input_ids,
-            max_new_tokens=32,
-            use_cache=True,
+            max_new_tokens=24,  # Shorter replies
+            use_cache=False,
             # do_sample=True,
+            num_beams=1,  # Beam search = 1
             do_sample=False,  # Turn off sampling to speed things up
             temperature=0.7,
-            top_p=0.95
+            top_p=0.95,
+            pad_token_id=tokenizer.eos_token_id
         )
 
         # output_ids = model.generate(
@@ -80,7 +84,7 @@ def chat(user_input, chat_history):
         # )
     print("🛠️ DEBUG - output_ids:", output_ids)
     response = tokenizer.decode(output_ids[0], skip_special_tokens=True).split("[/INST]")[-1].strip()
-    print("🛠️ DEBUG - Response:", response)  # Will show up in the terminal/logs
+    print("🛠️ DEBUG - Response:", response)
     chat_history.append({"role": "assistant", "content": response})
     return chat_history, chat_history
 
 
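Putting the pieces together, here is a minimal sketch of the chat step as it looks after this commit. Only the `generate` arguments come directly from the diff; the `[INST] ... [/INST]` prompt formatting is inferred from the `split("[/INST]")` in the decode line, and the role/content structure of `chat_history` is assumed from the append call.

```python
import torch

# tokenizer and model as loaded in the sketch above (assumed names).
def chat(user_input, chat_history):
    # Assumed Llama-chat prompt format, inferred from the "[/INST]" split below.
    prompt = f"[INST] {user_input} [/INST]"
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids

    with torch.inference_mode():
        output_ids = model.generate(
            input_ids=input_ids,
            max_new_tokens=24,                    # shorter replies to fit the CPU budget
            use_cache=False,                      # skip the KV cache to save memory
            num_beams=1,                          # single-beam (greedy) decoding
            do_sample=False,                      # sampling disabled for speed
            temperature=0.7,                      # ignored while do_sample=False
            top_p=0.95,                           # likewise ignored without sampling
            pad_token_id=tokenizer.eos_token_id,  # silence the missing-pad warning
        )

    response = tokenizer.decode(output_ids[0], skip_special_tokens=True).split("[/INST]")[-1].strip()
    chat_history.append({"role": "assistant", "content": response})
    return chat_history, chat_history
```

Note that with `do_sample=False` the decoding is greedy, so `temperature` and `top_p` have no effect and are kept only because the original code passes them; `use_cache=False` trades extra recomputation for lower memory use, which fits the constrained free tier.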