radussad committed on
Commit be43e8b · verified · 1 Parent(s): a3d72f2

use 4-bit quantization with bitsandbytes

Files changed (1): app.py +7 -1
app.py CHANGED
@@ -11,7 +11,13 @@ from retriever import retrieve_documents
 # Load Mistral 7B model
 MODEL_NAME = "mistralai/Mistral-7B-v0.1"
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_auth_token=os.getenv("HUGGING_FACE_HUB_TOKEN"), cache_dir="/tmp/huggingface")
-model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, use_auth_token=os.getenv("HUGGING_FACE_HUB_TOKEN"), cache_dir="/tmp/huggingface") #, device_map="auto", torch_dtype=torch.float16)
+model = AutoModelForCausalLM.from_pretrained(MODEL_NAME,
+    use_auth_token=os.getenv("HUGGING_FACE_HUB_TOKEN"),
+    cache_dir="/tmp/huggingface",
+    device_map="auto",
+    torch_dtype=torch.float16,
+    load_in_4bit=True
+)
 
 # Create inference pipeline
 generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
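
Note: newer transformers releases deprecate passing load_in_4bit=True directly to from_pretrained (and use_auth_token in favor of token), preferring an explicit BitsAndBytesConfig. A minimal sketch of the equivalent load under those assumptions, with bitsandbytes installed and a CUDA GPU available:

import os
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

MODEL_NAME = "mistralai/Mistral-7B-v0.1"

# 4-bit weight quantization; compute still runs in fp16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    cache_dir="/tmp/huggingface",
    token=os.getenv("HUGGING_FACE_HUB_TOKEN"),  # replaces deprecated use_auth_token
)

The text-generation pipeline above consumes the quantized model unchanged.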