Spaces:
Runtime error
Runtime error
Add system monitor
Browse files- Dockerfile +2 -0
- app_system_monitor.py +87 -0
- app_training.py +15 -4
- requirements-monitor.txt +4 -0
- trainer.py +13 -8
Dockerfile
CHANGED
|
@@ -44,6 +44,8 @@ RUN pyenv install ${PYTHON_VERSION} && \
|
|
| 44 |
RUN pip install --no-cache-dir -U torch==1.13.1 torchvision==0.14.1
|
| 45 |
COPY --chown=1000 requirements.txt /tmp/requirements.txt
|
| 46 |
RUN pip install --no-cache-dir -U -r /tmp/requirements.txt
|
|
|
|
|
|
|
| 47 |
|
| 48 |
COPY --chown=1000 . ${HOME}/app
|
| 49 |
RUN cd Tune-A-Video && patch -p1 < ../patch
|
|
|
|
| 44 |
RUN pip install --no-cache-dir -U torch==1.13.1 torchvision==0.14.1
|
| 45 |
COPY --chown=1000 requirements.txt /tmp/requirements.txt
|
| 46 |
RUN pip install --no-cache-dir -U -r /tmp/requirements.txt
|
| 47 |
+
COPY --chown=1000 requirements-monitor.txt /tmp/requirements-monitor.txt
|
| 48 |
+
RUN pip install --no-cache-dir -U -r /tmp/requirements-monitor.txt
|
| 49 |
|
| 50 |
COPY --chown=1000 . ${HOME}/app
|
| 51 |
RUN cd Tune-A-Video && patch -p1 < ../patch
|
app_system_monitor.py
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import collections
|
| 6 |
+
|
| 7 |
+
import gradio as gr
|
| 8 |
+
import nvitop
|
| 9 |
+
import pandas as pd
|
| 10 |
+
import plotly.express as px
|
| 11 |
+
import psutil
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class SystemMonitor:
    """Samples CPU/GPU usage and exposes it as JSON strings and a rolling plot.

    One sample is recorded per call to :meth:`update`; the deques keep the
    last ``MAX_SIZE`` samples for the line chart.
    """

    # Number of samples retained for the rolling graph (one per update tick).
    MAX_SIZE = 61

    def __init__(self):
        # NOTE(review): assumes nvitop returns an empty list on CPU-only
        # hosts — update_gpu() guards for that case.
        self.devices = nvitop.Device.all()
        self.cpu_memory_usage = collections.deque([0] * self.MAX_SIZE,
                                                  maxlen=self.MAX_SIZE)
        self.cpu_memory_usage_str = ''
        self.gpu_memory_usage = collections.deque([0] * self.MAX_SIZE,
                                                  maxlen=self.MAX_SIZE)
        self.gpu_util = collections.deque([0] * self.MAX_SIZE,
                                          maxlen=self.MAX_SIZE)
        self.gpu_memory_usage_str = ''
        self.gpu_util_str = ''

    def update(self) -> None:
        """Record one CPU sample and one GPU sample."""
        self.update_cpu()
        self.update_gpu()

    def update_cpu(self) -> None:
        """Append the current CPU memory usage and refresh its display string."""
        memory = psutil.virtual_memory()
        self.cpu_memory_usage.append(memory.percent)
        self.cpu_memory_usage_str = f'{memory.used / 1024**3:0.2f}GiB / {memory.total / 1024**3:0.2f}GiB ({memory.percent}%)'

    def update_gpu(self) -> None:
        """Append usage of the first GPU and refresh its display strings.

        No-op when no GPU is visible.
        """
        if not self.devices:
            return
        device = self.devices[0]
        # Query each metric exactly once so the value stored in the history
        # and the value shown in the string are guaranteed to agree (the
        # original queried the device twice per metric).
        memory_percent = device.memory_percent()
        gpu_utilization = device.gpu_utilization()
        self.gpu_memory_usage.append(memory_percent)
        self.gpu_util.append(gpu_utilization)
        self.gpu_memory_usage_str = f'{device.memory_usage()} ({memory_percent}%)'
        self.gpu_util_str = f'{gpu_utilization}%'

    def get_json(self) -> dict[str, str]:
        """Return the latest human-readable usage strings."""
        return {
            'CPU memory usage': self.cpu_memory_usage_str,
            'GPU memory usage': self.gpu_memory_usage_str,
            'GPU Util': self.gpu_util_str,
        }

    def get_graph_data(self) -> dict[str, list[int | float]]:
        """Return the rolling history keyed by trace name for the line chart.

        ``index`` runs from ``-(MAX_SIZE - 1)`` to 0 so the newest sample sits
        at x == 0.
        """
        return {
            'index': list(range(-self.MAX_SIZE + 1, 1)),
            'CPU memory usage': self.cpu_memory_usage,
            'GPU memory usage': self.gpu_memory_usage,
            'GPU Util': self.gpu_util,
        }

    def get_graph(self):
        """Build the plotly line figure of the recorded usage history."""
        df = pd.DataFrame(self.get_graph_data())
        return px.line(df,
                       x='index',
                       y=[
                           'CPU memory usage',
                           'GPU memory usage',
                           'GPU Util',
                       ],
                       range_y=[-5,
                                105]).update_layout(xaxis_title='Time',
                                                    yaxis_title='Percentage')
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def create_monitor_demo() -> gr.Blocks:
    """Build a Gradio UI that shows live system usage, refreshed every second."""
    system_monitor = SystemMonitor()
    with gr.Blocks() as demo:
        # Hidden component whose only role is to tick update() once per
        # second; the visible JSON and Plot then read the fresh samples.
        gr.JSON(value=system_monitor.update, every=1, visible=False)
        gr.JSON(value=system_monitor.get_json, show_label=False, every=1)
        gr.Plot(value=system_monitor.get_graph, show_label=False, every=1)
    return demo
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
if __name__ == '__main__':
    # Run the monitor as a standalone app when executed directly.
    create_monitor_demo().queue(api_open=False).launch()
|
app_training.py
CHANGED
|
@@ -6,6 +6,7 @@ import os
|
|
| 6 |
|
| 7 |
import gradio as gr
|
| 8 |
|
|
|
|
| 9 |
from constants import UploadTarget
|
| 10 |
from inference import InferencePipeline
|
| 11 |
from trainer import Trainer
|
|
@@ -13,6 +14,11 @@ from trainer import Trainer
|
|
| 13 |
|
| 14 |
def create_training_demo(trainer: Trainer,
|
| 15 |
pipe: InferencePipeline | None = None) -> gr.Blocks:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
hf_token = os.getenv('HF_TOKEN')
|
| 17 |
with gr.Blocks() as demo:
|
| 18 |
with gr.Row():
|
|
@@ -108,8 +114,14 @@ def create_training_demo(trainer: Trainer,
|
|
| 108 |
run_button = gr.Button('Start Training')
|
| 109 |
|
| 110 |
with gr.Box():
|
| 111 |
-
gr.
|
| 112 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
|
| 114 |
if pipe is not None:
|
| 115 |
run_button.click(fn=pipe.clear)
|
|
@@ -136,8 +148,7 @@ def create_training_demo(trainer: Trainer,
|
|
| 136 |
upload_to,
|
| 137 |
remove_gpu_after_training,
|
| 138 |
input_token,
|
| 139 |
-
]
|
| 140 |
-
outputs=output_message)
|
| 141 |
return demo
|
| 142 |
|
| 143 |
|
|
|
|
| 6 |
|
| 7 |
import gradio as gr
|
| 8 |
|
| 9 |
+
from app_system_monitor import create_monitor_demo
|
| 10 |
from constants import UploadTarget
|
| 11 |
from inference import InferencePipeline
|
| 12 |
from trainer import Trainer
|
|
|
|
| 14 |
|
| 15 |
def create_training_demo(trainer: Trainer,
|
| 16 |
pipe: InferencePipeline | None = None) -> gr.Blocks:
|
| 17 |
+
def read_log() -> str:
|
| 18 |
+
with open(trainer.log_file) as f:
|
| 19 |
+
lines = f.readlines()
|
| 20 |
+
return ''.join(lines[-10:])
|
| 21 |
+
|
| 22 |
hf_token = os.getenv('HF_TOKEN')
|
| 23 |
with gr.Blocks() as demo:
|
| 24 |
with gr.Row():
|
|
|
|
| 114 |
run_button = gr.Button('Start Training')
|
| 115 |
|
| 116 |
with gr.Box():
|
| 117 |
+
gr.Text(label='Log',
|
| 118 |
+
value=read_log,
|
| 119 |
+
lines=10,
|
| 120 |
+
max_lines=10,
|
| 121 |
+
every=1)
|
| 122 |
+
if not os.getenv('DISABLE_SYSTEM_MONITOR'):
|
| 123 |
+
with gr.Accordion(label='System info', open=False):
|
| 124 |
+
create_monitor_demo()
|
| 125 |
|
| 126 |
if pipe is not None:
|
| 127 |
run_button.click(fn=pipe.clear)
|
|
|
|
| 148 |
upload_to,
|
| 149 |
remove_gpu_after_training,
|
| 150 |
input_token,
|
| 151 |
+
])
|
|
|
|
| 152 |
return demo
|
| 153 |
|
| 154 |
|
requirements-monitor.txt
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
nvitop==1.1.1
|
| 2 |
+
pandas==2.0.0
|
| 3 |
+
plotly==5.14.1
|
| 4 |
+
psutil==5.9.4
|
trainer.py
CHANGED
|
@@ -32,6 +32,9 @@ class Trainer:
|
|
| 32 |
self.checkpoint_dir = pathlib.Path('checkpoints')
|
| 33 |
self.checkpoint_dir.mkdir(exist_ok=True)
|
| 34 |
|
|
|
|
|
|
|
|
|
|
| 35 |
def download_base_model(self, base_model_id: str) -> str:
|
| 36 |
model_dir = self.checkpoint_dir / base_model_id
|
| 37 |
if not model_dir.exists():
|
|
@@ -72,7 +75,7 @@ class Trainer:
|
|
| 72 |
upload_to: str,
|
| 73 |
remove_gpu_after_training: bool,
|
| 74 |
input_token: str,
|
| 75 |
-
) ->
|
| 76 |
if SPACE_ID == ORIGINAL_SPACE_ID:
|
| 77 |
raise gr.Error(
|
| 78 |
'This Space does not work on this Shared UI. Duplicate the Space and attribute a GPU'
|
|
@@ -134,15 +137,19 @@ class Trainer:
|
|
| 134 |
OmegaConf.save(config, f)
|
| 135 |
|
| 136 |
command = f'accelerate launch Tune-A-Video/train_tuneavideo.py --config {config_path}'
|
| 137 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
save_model_card(save_dir=output_dir,
|
| 139 |
base_model=base_model,
|
| 140 |
training_prompt=training_prompt,
|
| 141 |
test_prompt=validation_prompt,
|
| 142 |
test_image_dir='samples')
|
| 143 |
|
| 144 |
-
|
| 145 |
-
|
| 146 |
|
| 147 |
if upload_to_hub:
|
| 148 |
upload_message = self.model_uploader.upload_model(
|
|
@@ -152,8 +159,8 @@ class Trainer:
|
|
| 152 |
private=use_private_repo,
|
| 153 |
delete_existing_repo=delete_existing_repo,
|
| 154 |
input_token=input_token)
|
| 155 |
-
|
| 156 |
-
|
| 157 |
|
| 158 |
if remove_gpu_after_training:
|
| 159 |
space_id = os.getenv('SPACE_ID')
|
|
@@ -162,5 +169,3 @@ class Trainer:
|
|
| 162 |
token=self.hf_token if self.hf_token else input_token)
|
| 163 |
api.request_space_hardware(repo_id=space_id,
|
| 164 |
hardware='cpu-basic')
|
| 165 |
-
|
| 166 |
-
return message
|
|
|
|
| 32 |
self.checkpoint_dir = pathlib.Path('checkpoints')
|
| 33 |
self.checkpoint_dir.mkdir(exist_ok=True)
|
| 34 |
|
| 35 |
+
self.log_file = pathlib.Path('log.txt')
|
| 36 |
+
self.log_file.touch(exist_ok=True)
|
| 37 |
+
|
| 38 |
def download_base_model(self, base_model_id: str) -> str:
|
| 39 |
model_dir = self.checkpoint_dir / base_model_id
|
| 40 |
if not model_dir.exists():
|
|
|
|
| 75 |
upload_to: str,
|
| 76 |
remove_gpu_after_training: bool,
|
| 77 |
input_token: str,
|
| 78 |
+
) -> None:
|
| 79 |
if SPACE_ID == ORIGINAL_SPACE_ID:
|
| 80 |
raise gr.Error(
|
| 81 |
'This Space does not work on this Shared UI. Duplicate the Space and attribute a GPU'
|
|
|
|
| 137 |
OmegaConf.save(config, f)
|
| 138 |
|
| 139 |
command = f'accelerate launch Tune-A-Video/train_tuneavideo.py --config {config_path}'
|
| 140 |
+
with open(self.log_file, 'w') as f:
|
| 141 |
+
subprocess.run(shlex.split(command),
|
| 142 |
+
stdout=f,
|
| 143 |
+
stderr=subprocess.STDOUT,
|
| 144 |
+
text=True)
|
| 145 |
save_model_card(save_dir=output_dir,
|
| 146 |
base_model=base_model,
|
| 147 |
training_prompt=training_prompt,
|
| 148 |
test_prompt=validation_prompt,
|
| 149 |
test_image_dir='samples')
|
| 150 |
|
| 151 |
+
with open(self.log_file, 'a') as f:
|
| 152 |
+
f.write('Training completed!\n')
|
| 153 |
|
| 154 |
if upload_to_hub:
|
| 155 |
upload_message = self.model_uploader.upload_model(
|
|
|
|
| 159 |
private=use_private_repo,
|
| 160 |
delete_existing_repo=delete_existing_repo,
|
| 161 |
input_token=input_token)
|
| 162 |
+
with open(self.log_file, 'a') as f:
|
| 163 |
+
f.write(upload_message)
|
| 164 |
|
| 165 |
if remove_gpu_after_training:
|
| 166 |
space_id = os.getenv('SPACE_ID')
|
|
|
|
| 169 |
token=self.hf_token if self.hf_token else input_token)
|
| 170 |
api.request_space_hardware(repo_id=space_id,
|
| 171 |
hardware='cpu-basic')
|
|
|
|
|
|