Update app.py
app.py
CHANGED
@@ -82,28 +82,28 @@ def clean_up(cache, origin_len):
         new_cache.key_cache[i] = new_cache.key_cache[i][:, :, :origin_len, :]
         new_cache.value_cache[i] = new_cache.value_cache[i][:, :, :origin_len, :]
     return new_cache
-os.environ["TRANSFORMERS_OFFLINE"] = "1"
-os.environ["HF_HUB_OFFLINE"] = "1"
+#os.environ["TRANSFORMERS_OFFLINE"] = "1"
+#os.environ["HF_HUB_OFFLINE"] = "1"
 
 # Path to your local model
 
 # Initialize model and tokenizer
 def load_model_and_tokenizer():
-
+    model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
 
     # Load tokenizer and model from disk (without trust_remote_code)
-    tokenizer = AutoTokenizer.from_pretrained(
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
     if torch.cuda.is_available():
         # Load model on GPU if CUDA is available
         model = AutoModelForCausalLM.from_pretrained(
-
+            model_name,
             torch_dtype=torch.float16,
             device_map="auto"  # Automatically map model layers to GPU
         )
     else:
         # Load model on CPU if no GPU is available
         model = AutoModelForCausalLM.from_pretrained(
-
+            model_name,
            torch_dtype=torch.float32,  # Use float32 for compatibility with CPU
            low_cpu_mem_usage=True  # Reduce memory usage on CPU
        )
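For reference, here is a minimal sketch of how load_model_and_tokenizer() reads after this change, assembled from the new side of the hunk. The imports, the commented-out offline flags, and the final return statement fall outside the diff shown above, so they are assumptions here rather than part of the committed hunk.

    import os
    import torch
    from transformers import AutoTokenizer, AutoModelForCausalLM

    # Offline mode is disabled in this commit so the model can be fetched from the Hub
    #os.environ["TRANSFORMERS_OFFLINE"] = "1"
    #os.environ["HF_HUB_OFFLINE"] = "1"

    def load_model_and_tokenizer():
        model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

        # Load tokenizer and model from the Hub (without trust_remote_code)
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        if torch.cuda.is_available():
            # Load model on GPU if CUDA is available
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                torch_dtype=torch.float16,
                device_map="auto",  # Automatically map model layers to GPU
            )
        else:
            # Load model on CPU if no GPU is available
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                torch_dtype=torch.float32,   # Use float32 for compatibility with CPU
                low_cpu_mem_usage=True,      # Reduce memory usage on CPU
            )
        return model, tokenizer  # assumed return value; not visible in the hunk

Switching the hard-coded local path to the Hub model id means the first call will download the weights; with the offline environment variables commented out, subsequent calls reuse the local Hugging Face cache.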