use 4-bit quantization with bitsandbytes
app.py
CHANGED
@@ -11,7 +11,13 @@ from retriever import retrieve_documents
 # Load Mistral 7B model
 MODEL_NAME = "mistralai/Mistral-7B-v0.1"
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_auth_token=os.getenv("HUGGING_FACE_HUB_TOKEN"), cache_dir="/tmp/huggingface")
-model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, use_auth_token=os.getenv("HUGGING_FACE_HUB_TOKEN"), cache_dir="/tmp/huggingface")
+model = AutoModelForCausalLM.from_pretrained(MODEL_NAME,
+    use_auth_token=os.getenv("HUGGING_FACE_HUB_TOKEN"),
+    cache_dir="/tmp/huggingface",
+    device_map="auto",
+    torch_dtype=torch.float16,
+    load_in_4bit=True
+)
 
 # Create inference pipeline
 generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
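For reference: recent transformers releases deprecate passing load_in_4bit=True directly to from_pretrained in favor of an explicit BitsAndBytesConfig. A minimal sketch of the equivalent load follows; the NF4 quant type and fp16 compute dtype are tuning choices assumed here, not taken from this commit, and the sketch presumes bitsandbytes is installed on a CUDA machine.

import os
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

MODEL_NAME = "mistralai/Mistral-7B-v0.1"

# Explicit quantization config, equivalent to load_in_4bit=True in the diff above.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # same 4-bit weight quantization as the commit
    bnb_4bit_quant_type="nf4",             # assumption: NF4 generally outperforms plain FP4
    bnb_4bit_compute_dtype=torch.float16,  # matches the commit's torch_dtype
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    use_auth_token=os.getenv("HUGGING_FACE_HUB_TOKEN"),
    cache_dir="/tmp/huggingface",
    device_map="auto",
    quantization_config=bnb_config,
)

Two caveats on the committed code itself: torch.float16 assumes import torch appears earlier in app.py (not visible in this hunk), and use_auth_token is deprecated in favor of token= in newer transformers versions. A quick smoke test of the quantized pipeline could look like the following; the prompt and generation settings are illustrative only:

# Illustrative check that the 4-bit model generates text end to end.
output = generator("Question: What is retrieval-augmented generation?\nAnswer:",
                   max_new_tokens=64, do_sample=False)
print(output[0]["generated_text"])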