Upload 4 files
- app/services/github_service.py +44 -0
- app/services/hf_service.py +18 -0
- app/services/logger.py +15 -0
- app/services/resource_manager.py +74 -0
app/services/github_service.py
ADDED
@@ -0,0 +1,44 @@
import json
import os
from tenacity import retry, stop_after_attempt, wait_exponential
from github import Github
from app.services.logger import get_logger

log = get_logger(__name__)

class GitHubClient:
    """Pushes experiment result JSON files to a configured GitHub repository."""

    def __init__(self, token_env="GITHUB_TOKEN", repo_env="GITHUB_RESULTS_REPO", results_dir="experiment_logs"):
        self.token = os.getenv(token_env)
        self.repo_name = os.getenv(repo_env)
        self.results_dir = results_dir
        self.repo = None
        if self.token and self.repo_name:
            try:
                g = Github(self.token)
                self.repo = g.get_repo(self.repo_name)
            except Exception as e:
                log.error("GitHub init failed: %s", e)

    def ready(self) -> bool:
        return self.repo is not None

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(min=1, max=10))
    def push_json(self, experiment_id: str, data: dict) -> bool:
        if not self.ready():
            log.warning("GitHub not configured; skipping push")
            return False
        path = f"{self.results_dir}/{experiment_id}.json"
        content = json.dumps(data, indent=2)
        msg = f"Add/Update results for {experiment_id}"
        try:
            # First attempt assumes the file does not exist yet.
            self.repo.create_file(path, msg, content)
            log.info("Created %s", path)
        except Exception:
            # Fall back to updating the existing file; re-raise so @retry can kick in.
            try:
                current = self.repo.get_contents(path)
                self.repo.update_file(current.path, msg, content, current.sha)
                log.info("Updated %s", path)
            except Exception as e:
                log.error("Failed to push results: %s", e)
                raise
        return True
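A minimal usage sketch, assuming GITHUB_TOKEN and GITHUB_RESULTS_REPO are set as Space secrets; the experiment id and payload below are made up for illustration:

# Hypothetical caller; the id and payload are illustrative.
from app.services.github_service import GitHubClient

client = GitHubClient()  # reads GITHUB_TOKEN and GITHUB_RESULTS_REPO from the environment
if client.ready():
    client.push_json("exp_001", {"accuracy": 0.91, "runtime_sec": 42})
else:
    # Without credentials the client degrades to a no-op; push_json would return False.
    print("GitHub results repo not configured")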
app/services/hf_service.py
ADDED
@@ -0,0 +1,18 @@
from huggingface_hub import HfApi
from huggingface_hub.errors import HfHubHTTPError, RepositoryNotFoundError
from tenacity import retry, stop_after_attempt, wait_exponential
from app.utils.validation import safe_dataset_id
from app.services.logger import get_logger

log = get_logger(__name__)
api = HfApi()

@retry(stop=stop_after_attempt(3), wait=wait_exponential(min=1, max=8))
def dataset_exists(dataset_name: str) -> bool:
    safe_dataset_id(dataset_name)  # raises if the dataset id is malformed
    try:
        api.dataset_info(dataset_name)
    except RepositoryNotFoundError:
        # A missing dataset is an expected outcome, not an error.
        return False
    except HfHubHTTPError as e:
        log.error("Hugging Face dataset lookup failed: %s", e)
        raise
    return True
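A short sketch of a caller, assuming app.utils.validation.safe_dataset_id exists as imported above; the dataset id is just an example. Transient Hub errors are retried up to three times and then re-raised.

# Hypothetical caller; "squad" is only an example dataset id.
from app.services.hf_service import dataset_exists

if dataset_exists("squad"):
    print("dataset found on the Hub")
else:
    print("dataset not found on the Hub")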
app/services/logger.py
ADDED
@@ -0,0 +1,15 @@
import logging
import os

_LEVEL = os.getenv("LOG_LEVEL", "INFO").upper()

def get_logger(name: str) -> logging.Logger:
    logger = logging.getLogger(name)
    if logger.handlers:
        return logger
    logger.setLevel(_LEVEL)
    h = logging.StreamHandler()
    fmt = logging.Formatter('%(asctime)s %(levelname)s %(name)s: %(message)s')
    h.setFormatter(fmt)
    logger.addHandler(h)
    return logger
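A small sketch of how the level is controlled; LOG_LEVEL is read once at import time, and the DEBUG value below is just an example:

# Hypothetical usage: set LOG_LEVEL=DEBUG in the Space's environment before startup.
from app.services.logger import get_logger

log = get_logger(__name__)
log.debug("only emitted when LOG_LEVEL=DEBUG")
log.info("emitted at the default INFO level")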
app/services/resource_manager.py
ADDED
@@ -0,0 +1,74 @@
import os, shutil, time
import tempfile
import torch
import threading
from contextlib import contextmanager
from app.services.logger import get_logger

log = get_logger(__name__)

# Concurrency
_MAX_CONCURRENT = int(os.getenv("MAX_CONCURRENT_JOBS", "1"))
_sema = threading.Semaphore(_MAX_CONCURRENT)

# Retention policy (seconds)
_RESULTS_RETENTION = int(os.getenv("RESULTS_RETENTION_SEC", str(60*60*24*7)))  # 7 days

@contextmanager
def job_slot(timeout_sec: int = 60*45):
    # Wait up to timeout_sec for a free slot before giving up.
    acquired = _sema.acquire(timeout=timeout_sec)
    if not acquired:
        raise RuntimeError("All runners busy; try later")
    try:
        yield
    finally:
        _sema.release()

@contextmanager
def temp_workdir(prefix: str = "job_"):
    d = tempfile.mkdtemp(prefix=prefix)
    try:
        yield d
    finally:
        try:
            shutil.rmtree(d, ignore_errors=True)
        except Exception:
            log.warning("Failed to rmtree %s", d)

def check_gpu(mem_required_gb: float = 4.0) -> None:
    if not torch.cuda.is_available():
        log.info("CUDA not available; running on CPU")
        return
    try:
        props = torch.cuda.get_device_properties(0)
        total_gb = props.total_memory / (1024**3)
    except Exception as e:
        # The device query itself failed; log and continue rather than block the job.
        log.warning("GPU check failed: %s", e)
        return
    log.info("Detected GPU with %.1f GB", total_gb)
    if total_gb < mem_required_gb:
        raise RuntimeError(f"GPU has {total_gb:.1f}GB < required {mem_required_gb}GB")

def prune_old_results(local_results_dir: str = "local_results"):
    now = time.time()
    if not os.path.isdir(local_results_dir):
        return
    for fname in os.listdir(local_results_dir):
        path = os.path.join(local_results_dir, fname)
        try:
            if now - os.path.getmtime(path) > _RESULTS_RETENTION:
                os.remove(path)
                log.info("Pruned old result %s", path)
        except Exception as e:
            log.warning("Prune failed for %s: %s", path, e)

# Heuristic OOM estimate (very rough)
def suggest_batch_limit(model_size_mb: float = 400):
    try:
        if torch.cuda.is_available():
            props = torch.cuda.get_device_properties(0)
            total_gb = props.total_memory / (1024**3)
            # Rough upper bound: how many model-sized chunks fit in total device memory.
            suggested = max(1, int((total_gb * 1024) / model_size_mb))
            return suggested
    except Exception:
        pass
    return 1
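A sketch of how these helpers might be combined around a single job; run_job and run_training are hypothetical placeholders, not part of this upload:

# Hypothetical wrapper; run_training stands in for the real workload.
from app.services.resource_manager import job_slot, temp_workdir, check_gpu, prune_old_results

def run_job(experiment_id: str, run_training):
    check_gpu(mem_required_gb=4.0)              # raises only if a GPU is present but too small
    with job_slot():                            # waits for one of MAX_CONCURRENT_JOBS slots
        with temp_workdir(prefix=f"{experiment_id}_") as workdir:
            run_training(workdir)               # write outputs under the throwaway workdir
    prune_old_results("local_results")          # apply the retention policy after each run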