Update app.py
app.py CHANGED

@@ -5,7 +5,7 @@ from threading import Thread
 
 #Load the model
 model_id = 'mobiuslabsgmbh/Llama-2-7b-chat-hf_1bitgs8_hqq'
-model = HQQModelForCausalLM.from_quantized(model_id, adapter='adapter_v0.1.lora')
+model = HQQModelForCausalLM.from_quantized(model_id, adapter='adapter_v0.1.lora', device='cpu')
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 
 #Setup Inference Mode
@@ -16,7 +16,7 @@ model.config.use_cache = True
 model.eval();
 
 # Optional: torch compile for faster inference
-model = torch.compile(model)
+# model = torch.compile(model)
 
 def chat_processor(chat, max_new_tokens=100, do_sample=True, device='cpu'):
     tokenizer.use_default_system_prompt = False
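For context, here is the load path as it stands after this commit: a minimal, self-contained sketch, not the full app.py. The import line is an assumption based on the HQQ library's usual entry point (hqq.engine.hf exposes HQQModelForCausalLM and re-exports AutoTokenizer); the hunks themselves only show `from threading import Thread` and a use of torch.

# Minimal sketch of the post-commit load path.
# Assumed import; app.py's real import block is not shown in the hunks.
from hqq.engine.hf import HQQModelForCausalLM, AutoTokenizer

# Load the 1-bit HQQ-quantized Llama-2-7b-chat with its LoRA adapter,
# pinned to CPU as in the new line 8 of the first hunk.
model_id = 'mobiuslabsgmbh/Llama-2-7b-chat-hf_1bitgs8_hqq'
model = HQQModelForCausalLM.from_quantized(model_id, adapter='adapter_v0.1.lora', device='cpu')
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Inference-mode setup, per the context lines around the second hunk.
model.config.use_cache = True
model.eval()

# torch.compile stays disabled, matching the new line 19.
# model = torch.compile(model)

The net effect of the commit is confined to the two lines the hunks touch: the quantized checkpoint is now loaded explicitly on CPU, and the optional torch.compile step is commented out.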