sparkleman committed
Commit aeaf225 · 1 Parent(s): 89db2d9

UPDATE: Change gpu state display

Files changed (2):
  1. app.py +19 -6
  2. utils.py +10 -0
app.py CHANGED
@@ -1,4 +1,10 @@
 from config import CONFIG, ModelConfig
+from utils import (
+    cleanMessages,
+    parse_think_response,
+    remove_nested_think_tags_stack,
+    format_bytes,
+)
 
 import os, copy, types, gc, sys, re, time, collections, asyncio
 from huggingface_hub import hf_hub_download
@@ -28,6 +34,15 @@ if "cuda" in CONFIG.STRATEGY.lower():
     nvmlInit()
     gpu_h = nvmlDeviceGetHandleByIndex(0)
 
+
+def logGPUState():
+    if "cuda" in CONFIG.STRATEGY:
+        gpu_info = nvmlDeviceGetMemoryInfo(gpu_h)
+        logger.info(
+            f"[STATUS] vram {format_bytes(gpu_info.total)} used {format_bytes(gpu_info.used)} free {format_bytes(gpu_info.free)}"
+        )
+
+
 torch.backends.cudnn.benchmark = True
 torch.backends.cudnn.allow_tf32 = True
 torch.backends.cuda.matmul.allow_tf32 = True
@@ -56,7 +71,6 @@ from api_types import (
     ChatCompletionChoice,
     ChatCompletionMessage,
 )
-from utils import cleanMessages, parse_think_response, remove_nested_think_tags_stack
 
 
 class ModelStorage:
@@ -72,6 +86,8 @@ DEFAULT_REASONING_MODEL_NAME = None
 
 logger.info(f"STRATEGY - {CONFIG.STRATEGY}")
 
+logGPUState()
+
 for model_config in CONFIG.MODELS:
     logger.info(f"Load Model - {model_config.SERVICE_NAME}")
 
@@ -109,6 +125,7 @@ for model_config in CONFIG.MODELS:
     MODEL_STORAGE[model_config.SERVICE_NAME].MODEL_CONFIG = model_config
     MODEL_STORAGE[model_config.SERVICE_NAME].model = tmp_model
     MODEL_STORAGE[model_config.SERVICE_NAME].pipeline = tmp_pipeline
+    logGPUState()
 
 
 logger.info(f"Load Model - DEFALUT_MODEL_NAME is `{DEFALUT_MODEL_NAME}`")
@@ -643,11 +660,7 @@ async def chat_completions(request: ChatCompletionRequest):
         raise f"Can not find `{modelName}`"
 
     async def chatResponseStreamDisconnect():
-        if "cuda" in CONFIG.STRATEGY:
-            gpu_info = nvmlDeviceGetMemoryInfo(gpu_h)
-            logger.info(
-                f"[STATUS] vram {gpu_info.total} used {gpu_info.used} free {gpu_info.free}"
-            )
+        logGPUState()
 
     model_state = None
     request_dict = request.model_dump()
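The net effect on the [STATUS] line is that it now reports human-readable sizes instead of raw byte counts. For reference, a minimal standalone sketch of the same NVML logging pattern (assuming the nvml* calls come from the pynvml package, as the names suggest; the logging setup and function name here are illustrative, not taken from app.py):

    # Minimal sketch of the GPU-state logging pattern introduced above.
    # Assumes the pynvml package and at least one NVIDIA GPU.
    import logging

    from pynvml import nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo, nvmlInit

    from utils import format_bytes  # the helper added to utils.py in this commit

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    nvmlInit()
    gpu_h = nvmlDeviceGetHandleByIndex(0)  # handle for GPU index 0

    def log_gpu_state():
        # nvmlDeviceGetMemoryInfo reports total/used/free in bytes
        gpu_info = nvmlDeviceGetMemoryInfo(gpu_h)
        logger.info(
            f"[STATUS] vram {format_bytes(gpu_info.total)} "
            f"used {format_bytes(gpu_info.used)} free {format_bytes(gpu_info.free)}"
        )

    log_gpu_state()
    # e.g. "[STATUS] vram 24.0000GB used 3.2154GB free 20.7846GB"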
utils.py CHANGED
@@ -58,3 +58,13 @@ def remove_nested_think_tags_stack(text):
         else:
             i += 1
     return result
+
+
+def format_bytes(size):
+    power = 2**10
+    n = 0
+    power_labels = {0: "", 1: "K", 2: "M", 3: "G", 4: "T"}
+    while size > power:
+        size /= power
+        n += 1
+    return f"{size:.4f}{power_labels[n]+'B'}"
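The helper divides by 1024 at each step, so the labels denote binary-prefixed sizes (KB meaning KiB here, and so on). A quick hypothetical demo of its output:

    # Demo of format_bytes from utils.py (values chosen for illustration).
    from utils import format_bytes

    print(format_bytes(512))          # 512.0000B  (below 1024: no prefix)
    print(format_bytes(16 * 2**20))   # 16.0000MB
    print(format_bytes(25769803776))  # 24.0000GB  (total bytes of a 24 GiB card)

Note that the comparison is strict (size > power), so an exact power of 1024 is not divided down: format_bytes(1024) returns "1024.0000B" rather than "1.0000KB".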