Spaces:
Running
on
T4
Running
on
T4
sparkleman
committed on
Commit
·
aeaf225
1
Parent(s):
89db2d9
UPDATE: Change gpu state display
Browse files
app.py
CHANGED
@@ -1,4 +1,10 @@
|
|
1 |
from config import CONFIG, ModelConfig
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
|
3 |
import os, copy, types, gc, sys, re, time, collections, asyncio
|
4 |
from huggingface_hub import hf_hub_download
|
@@ -28,6 +34,15 @@ if "cuda" in CONFIG.STRATEGY.lower():
|
|
28 |
nvmlInit()
|
29 |
gpu_h = nvmlDeviceGetHandleByIndex(0)
|
30 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
torch.backends.cudnn.benchmark = True
|
32 |
torch.backends.cudnn.allow_tf32 = True
|
33 |
torch.backends.cuda.matmul.allow_tf32 = True
|
@@ -56,7 +71,6 @@ from api_types import (
|
|
56 |
ChatCompletionChoice,
|
57 |
ChatCompletionMessage,
|
58 |
)
|
59 |
-
from utils import cleanMessages, parse_think_response, remove_nested_think_tags_stack
|
60 |
|
61 |
|
62 |
class ModelStorage:
|
@@ -72,6 +86,8 @@ DEFAULT_REASONING_MODEL_NAME = None
|
|
72 |
|
73 |
logger.info(f"STRATEGY - {CONFIG.STRATEGY}")
|
74 |
|
|
|
|
|
75 |
for model_config in CONFIG.MODELS:
|
76 |
logger.info(f"Load Model - {model_config.SERVICE_NAME}")
|
77 |
|
@@ -109,6 +125,7 @@ for model_config in CONFIG.MODELS:
|
|
109 |
MODEL_STORAGE[model_config.SERVICE_NAME].MODEL_CONFIG = model_config
|
110 |
MODEL_STORAGE[model_config.SERVICE_NAME].model = tmp_model
|
111 |
MODEL_STORAGE[model_config.SERVICE_NAME].pipeline = tmp_pipeline
|
|
|
112 |
|
113 |
|
114 |
logger.info(f"Load Model - DEFALUT_MODEL_NAME is `{DEFALUT_MODEL_NAME}`")
|
@@ -643,11 +660,7 @@ async def chat_completions(request: ChatCompletionRequest):
|
|
643 |
raise f"Can not find `{modelName}`"
|
644 |
|
645 |
async def chatResponseStreamDisconnect():
|
646 |
-
|
647 |
-
gpu_info = nvmlDeviceGetMemoryInfo(gpu_h)
|
648 |
-
logger.info(
|
649 |
-
f"[STATUS] vram {gpu_info.total} used {gpu_info.used} free {gpu_info.free}"
|
650 |
-
)
|
651 |
|
652 |
model_state = None
|
653 |
request_dict = request.model_dump()
|
|
|
1 |
from config import CONFIG, ModelConfig
|
2 |
+
from utils import (
|
3 |
+
cleanMessages,
|
4 |
+
parse_think_response,
|
5 |
+
remove_nested_think_tags_stack,
|
6 |
+
format_bytes,
|
7 |
+
)
|
8 |
|
9 |
import os, copy, types, gc, sys, re, time, collections, asyncio
|
10 |
from huggingface_hub import hf_hub_download
|
|
|
34 |
nvmlInit()
|
35 |
gpu_h = nvmlDeviceGetHandleByIndex(0)
|
36 |
|
37 |
+
|
38 |
+
def logGPUState():
    """Log current GPU VRAM usage (total / used / free) via NVML.

    No-op for non-CUDA strategies. Relies on the module-level `gpu_h`
    NVML device handle, which is only initialized when the strategy
    contains "cuda".
    """
    # Match the case-insensitive check used at NVML init time
    # (`if "cuda" in CONFIG.STRATEGY.lower():`); without .lower() an
    # upper-cased strategy string would silently skip logging even
    # though NVML was initialized.
    if "cuda" in CONFIG.STRATEGY.lower():
        gpu_info = nvmlDeviceGetMemoryInfo(gpu_h)
        logger.info(
            f"[STATUS] vram {format_bytes(gpu_info.total)} used {format_bytes(gpu_info.used)} free {format_bytes(gpu_info.free)}"
        )
|
44 |
+
|
45 |
+
|
46 |
torch.backends.cudnn.benchmark = True
|
47 |
torch.backends.cudnn.allow_tf32 = True
|
48 |
torch.backends.cuda.matmul.allow_tf32 = True
|
|
|
71 |
ChatCompletionChoice,
|
72 |
ChatCompletionMessage,
|
73 |
)
|
|
|
74 |
|
75 |
|
76 |
class ModelStorage:
|
|
|
86 |
|
87 |
logger.info(f"STRATEGY - {CONFIG.STRATEGY}")
|
88 |
|
89 |
+
logGPUState()
|
90 |
+
|
91 |
for model_config in CONFIG.MODELS:
|
92 |
logger.info(f"Load Model - {model_config.SERVICE_NAME}")
|
93 |
|
|
|
125 |
MODEL_STORAGE[model_config.SERVICE_NAME].MODEL_CONFIG = model_config
|
126 |
MODEL_STORAGE[model_config.SERVICE_NAME].model = tmp_model
|
127 |
MODEL_STORAGE[model_config.SERVICE_NAME].pipeline = tmp_pipeline
|
128 |
+
logGPUState()
|
129 |
|
130 |
|
131 |
logger.info(f"Load Model - DEFALUT_MODEL_NAME is `{DEFALUT_MODEL_NAME}`")
|
|
|
660 |
raise f"Can not find `{modelName}`"
|
661 |
|
662 |
async def chatResponseStreamDisconnect():
|
663 |
+
logGPUState()
|
|
|
|
|
|
|
|
|
664 |
|
665 |
model_state = None
|
666 |
request_dict = request.model_dump()
|
utils.py
CHANGED
@@ -58,3 +58,13 @@ def remove_nested_think_tags_stack(text):
|
|
58 |
else:
|
59 |
i += 1
|
60 |
return result
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
else:
|
59 |
i += 1
|
60 |
return result
|
61 |
+
|
62 |
+
|
63 |
+
def format_bytes(size):
    """Format a byte count as a human-readable string, e.g. 1536 -> '1.5000KB'.

    Uses binary units (powers of 1024). Output is capped at 'TB': anything
    larger is still expressed in terabytes rather than raising.
    """
    power = 2**10  # 1024
    power_labels = {0: "", 1: "K", 2: "M", 3: "G", 4: "T"}
    n = 0
    # `>=` so exact multiples of 1024 roll over to the next unit
    # (the original `>` left 1024 displayed as '1024.0000B').
    # The n bound stops at the largest known label, avoiding a KeyError
    # for sizes >= 1024**5.
    while size >= power and n < len(power_labels) - 1:
        size /= power
        n += 1
    return f"{size:.4f}{power_labels[n]}B"
|