use 4-bit quantization with bitsandbytes
app.py
CHANGED
@@ -11,7 +11,13 @@ from retriever import retrieve_documents
 # Load Mistral 7B model
 MODEL_NAME = "mistralai/Mistral-7B-v0.1"
 tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_auth_token=os.getenv("HUGGING_FACE_HUB_TOKEN"), cache_dir="/tmp/huggingface")
-model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, use_auth_token=os.getenv("HUGGING_FACE_HUB_TOKEN"), cache_dir="/tmp/huggingface")
+model = AutoModelForCausalLM.from_pretrained(MODEL_NAME,
+    use_auth_token=os.getenv("HUGGING_FACE_HUB_TOKEN"),
+    cache_dir="/tmp/huggingface",
+    device_map="auto",
+    torch_dtype=torch.float16,
+    load_in_4bit=True
+)
 
 # Create inference pipeline
 generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
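For reference: recent transformers releases deprecate passing load_in_4bit=True directly to from_pretrained in favor of an explicit BitsAndBytesConfig. A minimal sketch of the equivalent load follows; the NF4 quant type and fp16 compute dtype are tuning choices assumed here, not taken from this commit, and the sketch presumes bitsandbytes is installed on a CUDA machine.

import os
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

MODEL_NAME = "mistralai/Mistral-7B-v0.1"

# Explicit quantization config, equivalent to load_in_4bit=True in the diff above.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # same 4-bit weight quantization as the commit
    bnb_4bit_quant_type="nf4",             # assumption: NF4 generally outperforms plain FP4
    bnb_4bit_compute_dtype=torch.float16,  # matches the commit's torch_dtype
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    use_auth_token=os.getenv("HUGGING_FACE_HUB_TOKEN"),
    cache_dir="/tmp/huggingface",
    device_map="auto",
    quantization_config=bnb_config,
)

Two caveats on the committed code itself: torch.float16 assumes import torch appears earlier in app.py (not visible in this hunk), and use_auth_token is deprecated in favor of token= in newer transformers versions. A quick smoke test of the quantized pipeline could look like the following; the prompt and generation settings are illustrative only:

# Illustrative check that the 4-bit model generates text end to end.
output = generator("Question: What is retrieval-augmented generation?\nAnswer:",
                   max_new_tokens=64, do_sample=False)
print(output[0]["generated_text"])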