rider-provider-777 committed
Commit 62aa251 · verified · 1 Parent(s): 4c78db1

Upload 4 files

app/services/github_service.py ADDED
@@ -0,0 +1,45 @@
+ import json
+ import os
+ from tenacity import retry, stop_after_attempt, wait_exponential
+ from github import Github, GithubException
+ from app.services.logger import get_logger
+
+ log = get_logger(__name__)
+
+ class GitHubClient:
+     def __init__(self, token_env="GITHUB_TOKEN", repo_env="GITHUB_RESULTS_REPO", results_dir="experiment_logs"):
+         self.token = os.getenv(token_env)
+         self.repo_name = os.getenv(repo_env)
+         self.results_dir = results_dir
+         self.repo = None
+         if self.token and self.repo_name:
+             try:
+                 g = Github(self.token)
+                 self.repo = g.get_repo(self.repo_name)
+             except Exception as e:
+                 log.error("GitHub init failed: %s", e)
+
+     def ready(self) -> bool:
+         return self.repo is not None
+
+     @retry(stop=stop_after_attempt(3), wait=wait_exponential(min=1, max=10))
+     def push_json(self, experiment_id: str, data: dict) -> bool:
+         if not self.ready():
+             log.warning("GitHub not configured; skipping push")
+             return False
+         path = f"{self.results_dir}/{experiment_id}.json"
+         content = json.dumps(data, indent=2)
+         msg = f"Add/Update results for {experiment_id}"
+         try:
+             # Upsert: create first, fall back to update if the file already exists.
+             self.repo.create_file(path, msg, content)
+             log.info("Created %s", path)
+         except GithubException:
+             try:
+                 current = self.repo.get_contents(path)
+                 self.repo.update_file(current.path, msg, content, current.sha)
+                 log.info("Updated %s", path)
+             except Exception as e:
+                 log.error("Failed to push results: %s", e)
+                 raise
+         return True
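
A minimal usage sketch for the client (the experiment id and payload below are illustrative, not part of this commit):

    from app.services.github_service import GitHubClient

    gh = GitHubClient()  # reads GITHUB_TOKEN and GITHUB_RESULTS_REPO from the environment
    if gh.ready():
        # Illustrative experiment id and result payload.
        gh.push_json("exp-001", {"accuracy": 0.93, "seed": 42})

Because push_json is wrapped in tenacity's @retry without reraise=True, a push that keeps failing surfaces as tenacity.RetryError after the third attempt.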
app/services/hf_service.py ADDED
@@ -0,0 +1,21 @@
+ from huggingface_hub import HfApi
+ from huggingface_hub.errors import HfHubHTTPError, RepositoryNotFoundError
+ from tenacity import retry, stop_after_attempt, wait_exponential
+ from app.utils.validation import safe_dataset_id
+ from app.services.logger import get_logger
+
+ log = get_logger(__name__)
+ api = HfApi()
+
+ @retry(stop=stop_after_attempt(3), wait=wait_exponential(min=1, max=8))
+ def dataset_exists(dataset_name: str) -> bool:
+     safe_dataset_id(dataset_name)
+     try:
+         api.dataset_info(dataset_name)
+     except RepositoryNotFoundError:
+         # A missing dataset is a normal outcome, not an error to retry.
+         return False
+     except HfHubHTTPError as e:
+         log.error("Hugging Face dataset lookup failed: %s", e)
+         raise
+     return True
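
A hedged usage sketch (the dataset id is illustrative):

    from app.services.hf_service import dataset_exists

    if dataset_exists("squad"):  # illustrative dataset id
        print("dataset found")

With this split, a 404 returns False immediately, while transient HTTP failures are retried and, if they persist, surface as tenacity.RetryError.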
app/services/logger.py ADDED
@@ -0,0 +1,16 @@
+ import logging
+ import os
+
+ _LEVEL = os.getenv("LOG_LEVEL", "INFO").upper()
+
+ def get_logger(name: str) -> logging.Logger:
+     logger = logging.getLogger(name)
+     if logger.handlers:
+         return logger
+     logger.setLevel(_LEVEL)
+     h = logging.StreamHandler()
+     fmt = logging.Formatter('%(asctime)s %(levelname)s %(name)s: %(message)s')
+     h.setFormatter(fmt)
+     logger.addHandler(h)
+     logger.propagate = False  # avoid duplicate records via the root logger
+     return logger
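
Usage follows the standard per-module logger pattern:

    from app.services.logger import get_logger

    log = get_logger(__name__)
    log.info("worker started")  # level comes from LOG_LEVEL (default INFO)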
app/services/resource_manager.py ADDED
@@ -0,0 +1,81 @@
+ import os
+ import shutil
+ import tempfile
+ import threading
+ import time
+ from contextlib import contextmanager
+
+ import torch
+
+ from app.services.logger import get_logger
+
+ log = get_logger(__name__)
+
+ # Concurrency
+ _MAX_CONCURRENT = int(os.getenv("MAX_CONCURRENT_JOBS", "1"))
+ _sema = threading.Semaphore(_MAX_CONCURRENT)
+
+ # Retention policy (seconds)
+ _RESULTS_RETENTION = int(os.getenv("RESULTS_RETENTION_SEC", str(60*60*24*7)))  # 7 days
+
+ @contextmanager
+ def job_slot(timeout_sec: int = 60*45):
+     # Block up to timeout_sec waiting for a free runner slot.
+     acquired = _sema.acquire(timeout=timeout_sec)
+     if not acquired:
+         raise RuntimeError("All runners busy; try later")
+     try:
+         yield
+     finally:
+         _sema.release()
+
+ @contextmanager
+ def temp_workdir(prefix: str = "job_"):
+     d = tempfile.mkdtemp(prefix=prefix)
+     try:
+         yield d
+     finally:
+         try:
+             shutil.rmtree(d)
+         except Exception:
+             log.warning("Failed to rmtree %s", d)
+
+ def check_gpu(mem_required_gb: float = 4.0) -> None:
+     if not torch.cuda.is_available():
+         log.info("CUDA not available; running on CPU")
+         return
+     try:
+         props = torch.cuda.get_device_properties(0)
+         total_gb = props.total_memory / (1024**3)
+     except Exception as e:
+         log.warning("GPU check failed: %s", e)
+         return
+     log.info("Detected GPU with %.1f GB", total_gb)
+     # Raise outside the try block so the capacity error is not swallowed.
+     if total_gb < mem_required_gb:
+         raise RuntimeError(f"GPU has {total_gb:.1f}GB < required {mem_required_gb}GB")
+
+ def prune_old_results(local_results_dir: str = "local_results"):
+     now = time.time()
+     if not os.path.isdir(local_results_dir):
+         return
+     for fname in os.listdir(local_results_dir):
+         path = os.path.join(local_results_dir, fname)
+         try:
+             if now - os.path.getmtime(path) > _RESULTS_RETENTION:
+                 os.remove(path)
+                 log.info("Pruned old result %s", path)
+         except Exception as e:
+             log.warning("Prune failed for %s: %s", path, e)
+
+ # Heuristic OOM estimate (very rough)
+ def suggest_batch_limit(model_size_mb: float = 400) -> int:
+     try:
+         if torch.cuda.is_available():
+             # mem_get_info() reports currently free memory, not device total.
+             free_bytes, _total = torch.cuda.mem_get_info()
+             free_mb = free_bytes / (1024**2)
+             return max(1, int(free_mb / model_size_mb))
+     except Exception:
+         pass
+     return 1
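
A sketch of how these pieces might compose in a job runner (run_experiment and its body are hypothetical; only the imported helpers come from this module):

    from app.services.resource_manager import (
        job_slot, temp_workdir, check_gpu, prune_old_results,
    )

    def run_experiment(experiment_id: str):
        # Hypothetical driver: take a runner slot, verify GPU capacity,
        # then work inside a temp directory that is removed on exit.
        with job_slot(timeout_sec=60 * 10):
            check_gpu(mem_required_gb=4.0)
            with temp_workdir(prefix=f"{experiment_id}_") as workdir:
                ...  # train/evaluate inside workdir
        prune_old_results()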