Commit
·
22ef2c1
1
Parent(s):
cc36ad4
Create path_config.py
Browse files- path_config.py +246 -0
path_config.py
ADDED
|
@@ -0,0 +1,246 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import sys
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from typing import Dict, Optional
|
| 5 |
+
import logging
|
| 6 |
+
|
| 7 |
+
logger = logging.getLogger(__name__)
|
| 8 |
+
|
| 9 |
+
class EnvironmentPathManager:
    """Dynamic path management for different deployment environments.

    On construction the manager detects the runtime environment
    (``'huggingface_spaces'``, ``'docker'``, ``'container'`` or ``'local'``),
    builds the matching directory layout and tries to create every directory
    in it.

    ``environment`` holds the detected name; ``base_paths`` maps the keys
    ``'base'``, ``'data'``, ``'model'``, ``'logs'``, ``'cache'`` and ``'temp'``
    to ``Path`` objects.
    """

    # Root directory shared by every container-style layout.
    _CONTAINER_ROOT = Path('/app')
    # File whose presence marks the project root during local development.
    _PROJECT_MARKER = 'requirements.txt'
    # Directories that may be relocated under the user's home directory when
    # they cannot be created at their configured location.
    _RELOCATABLE = ('cache', 'temp')

    def __init__(self):
        self.environment = self._detect_environment()
        self.base_paths = self._configure_paths()
        self._ensure_directories()

    @staticmethod
    def _is_within(path: Path, root: Path) -> bool:
        """Return True when *path* is *root* itself or located below it.

        The comparison is made on path components, so ``/application`` is
        not considered to be inside ``/app``.
        """
        return path == root or root in path.parents

    @staticmethod
    def _layout(root: Path) -> Dict[str, Path]:
        """Return the default directory layout rooted at *root*."""
        return {
            'base': root,
            'data': root / 'data',
            'model': root / 'model',
            'logs': root / 'logs',
            'cache': root / 'cache',
            'temp': root / 'temp',
        }

    @classmethod
    def _find_project_root(cls, start: Path, marker: str = '') -> Path:
        """Return the nearest directory at or above *start* containing *marker*.

        *marker* defaults to ``requirements.txt``. When no ancestor contains
        the marker, *start* is returned instead of walking all the way up to
        the filesystem root.
        """
        marker = marker or cls._PROJECT_MARKER
        for candidate in (start, *start.parents):
            if (candidate / marker).exists():
                return candidate
        return start

    def _detect_environment(self) -> str:
        """Detect the current deployment environment.

        Checks are ordered from most to least specific; the first match wins.
        """
        # HuggingFace Spaces: SPACE_ID variable or a known entry point in /app.
        if (
            os.environ.get('SPACE_ID')
            or os.path.exists('/app/app.py')
            or os.path.exists('/app/streamlit_app.py')
        ):
            return 'huggingface_spaces'

        # Docker container: marker file or explicit environment variable.
        if os.path.exists('/.dockerenv') or os.environ.get('DOCKER_CONTAINER'):
            return 'docker'

        # Running from /app or below (likely some other container runtime).
        if self._is_within(Path.cwd(), self._CONTAINER_ROOT):
            return 'container'

        # Default to local development.
        return 'local'

    def _configure_paths(self) -> Dict[str, Path]:
        """Build the directory mapping for ``self.environment``."""
        if self.environment == 'huggingface_spaces':
            # HuggingFace Spaces: everything lives under /app.
            return self._layout(self._CONTAINER_ROOT)

        if self.environment in ('docker', 'container'):
            # Docker/Container: /app structure with /tmp for temporary files.
            layout = self._layout(self._CONTAINER_ROOT)
            layout['cache'] = Path('/tmp/cache')
            layout['temp'] = Path('/tmp/temp')
            return layout

        # Local development: search upwards from this file's directory.
        module_dir = Path(__file__).resolve().parent
        return self._layout(self._find_project_root(module_dir))

    def _relocate_to_home(self, path_name: str, path: Path) -> None:
        """Move the *path_name* directory under ``~/.fake_news_detector``.

        ``base_paths`` is only updated when the fallback directory could be
        created; a failure is logged and the original entry is kept.
        """
        logger.warning(f"Cannot create directory {path}, using fallback")
        try:
            fallback_path = Path.home() / f'.fake_news_detector/{path_name}'
            fallback_path.mkdir(parents=True, exist_ok=True)
        except (OSError, RuntimeError) as e:
            # Path.home() raises RuntimeError when the home directory cannot
            # be resolved; mkdir raises OSError subclasses.
            logger.error(f"Failed to create fallback directory for {path_name}: {e}")
            return
        self.base_paths[path_name] = fallback_path

    def _ensure_directories(self):
        """Ensure all necessary directories exist.

        Creation is best-effort: failures are logged and never raised. Only
        the cache and temp directories are relocated on a permission error.
        """
        # Iterate over a snapshot because a fallback replaces dict entries.
        for path_name, path in list(self.base_paths.items()):
            try:
                path.mkdir(parents=True, exist_ok=True)
            except PermissionError:
                if path_name in self._RELOCATABLE:
                    self._relocate_to_home(path_name, path)
                else:
                    logger.warning(f"Cannot create directory {path}: permission denied")
            except Exception as e:
                logger.error(f"Failed to create directory {path}: {e}")
            else:
                logger.debug(f"Ensured directory exists: {path}")

    def _resolve(self, kind: str, filename: str = '') -> Path:
        """Return the *kind* directory, or *filename* inside it when given."""
        directory = self.base_paths[kind]
        return directory / filename if filename else directory

    def get_data_path(self, filename: str = '') -> Path:
        """Get data directory path"""
        return self._resolve('data', filename)

    def get_model_path(self, filename: str = '') -> Path:
        """Get model directory path"""
        return self._resolve('model', filename)

    def get_logs_path(self, filename: str = '') -> Path:
        """Get logs directory path"""
        return self._resolve('logs', filename)

    def get_cache_path(self, filename: str = '') -> Path:
        """Get cache directory path"""
        return self._resolve('cache', filename)

    def get_temp_path(self, filename: str = '') -> Path:
        """Get temporary directory path"""
        return self._resolve('temp', filename)

    def get_activity_log_path(self) -> Path:
        """Get activity log file path"""
        return self.get_logs_path('activity_log.json')

    def get_metadata_path(self) -> Path:
        """Get model metadata file path"""
        return self.get_model_path('metadata.json')

    def get_combined_dataset_path(self) -> Path:
        """Get combined dataset path"""
        return self.get_data_path('combined_dataset.csv')

    def get_scraped_data_path(self) -> Path:
        """Get scraped data path"""
        return self.get_data_path('scraped_real.csv')

    def get_generated_data_path(self) -> Path:
        """Get generated fake data path"""
        return self.get_data_path('generated_fake.csv')

    def get_model_file_path(self) -> Path:
        """Get main model file path"""
        return self.get_model_path('model.pkl')

    def get_vectorizer_path(self) -> Path:
        """Get vectorizer file path"""
        return self.get_model_path('vectorizer.pkl')

    def get_pipeline_path(self) -> Path:
        """Get pipeline file path"""
        return self.get_model_path('pipeline.pkl')

    def get_candidate_model_path(self) -> Path:
        """Get candidate model file path"""
        return self.get_model_path('model_candidate.pkl')

    def get_candidate_vectorizer_path(self) -> Path:
        """Get candidate vectorizer file path"""
        return self.get_model_path('vectorizer_candidate.pkl')

    def get_candidate_pipeline_path(self) -> Path:
        """Get candidate pipeline file path"""
        return self.get_model_path('pipeline_candidate.pkl')

    def list_available_datasets(self) -> Dict[str, bool]:
        """List available datasets and their existence status"""
        kaggle_dir = self.get_data_path() / 'kaggle'
        return {
            'combined_dataset.csv': self.get_combined_dataset_path().exists(),
            'scraped_real.csv': self.get_scraped_data_path().exists(),
            'generated_fake.csv': self.get_generated_data_path().exists(),
            'kaggle/Fake.csv': (kaggle_dir / 'Fake.csv').exists(),
            'kaggle/True.csv': (kaggle_dir / 'True.csv').exists(),
        }

    def list_available_models(self) -> Dict[str, bool]:
        """List available models and their existence status"""
        return {
            'model.pkl': self.get_model_file_path().exists(),
            'vectorizer.pkl': self.get_vectorizer_path().exists(),
            'pipeline.pkl': self.get_pipeline_path().exists(),
            'model_candidate.pkl': self.get_candidate_model_path().exists(),
            'vectorizer_candidate.pkl': self.get_candidate_vectorizer_path().exists(),
            'pipeline_candidate.pkl': self.get_candidate_pipeline_path().exists(),
            'metadata.json': self.get_metadata_path().exists(),
        }

    def get_environment_info(self) -> Dict:
        """Get comprehensive environment information.

        Returns a dict with the detected environment, the main directories as
        strings, the dataset/model availability maps, the current working
        directory, the first ``sys.path`` entry, the ``SPACE_ID`` variable
        (or ``'Not HF Spaces'``) and whether ``/.dockerenv`` exists.
        """
        return {
            'environment': self.environment,
            'base_dir': str(self.base_paths['base']),
            'data_dir': str(self.base_paths['data']),
            'model_dir': str(self.base_paths['model']),
            'logs_dir': str(self.base_paths['logs']),
            'available_datasets': self.list_available_datasets(),
            'available_models': self.list_available_models(),
            'current_working_directory': str(Path.cwd()),
            # sys.path can be emptied by embedding hosts; avoid an IndexError.
            'python_path': sys.path[0] if sys.path else '',
            'space_id': os.environ.get('SPACE_ID', 'Not HF Spaces'),
            'docker_env': os.path.exists('/.dockerenv'),
        }

    def log_environment_info(self):
        """Log detailed environment information at INFO level."""
        info = self.get_environment_info()
        logger.info(f"🌍 Environment: {info['environment']}")
        logger.info(f"📁 Base directory: {info['base_dir']}")
        logger.info(f"📊 Data directory: {info['data_dir']}")
        logger.info(f"🤖 Model directory: {info['model_dir']}")
        logger.info(f"📝 Logs directory: {info['logs_dir']}")

        # Log available files: a found/total summary, then one line per file.
        for heading, inventory in (
            ("📋 Available datasets", info['available_datasets']),
            ("🎯 Available models", info['available_models']),
        ):
            logger.info(f"{heading}: {sum(inventory.values())}/{len(inventory)}")
            for name, exists in inventory.items():
                status = "✅" if exists else "❌"
                logger.info(f" {status} {name}")
|
| 223 |
+
|
| 224 |
+
# Global instance shared by the module-level convenience functions.
# NOTE: it is constructed at import time, so importing this module detects the
# environment and attempts to create the configured directories.
path_manager = EnvironmentPathManager()
|
| 226 |
+
|
| 227 |
+
# Convenience functions for backward compatibility
|
| 228 |
+
def get_data_path(filename: str = '') -> Path:
    """Return the data directory, or the path of *filename* inside it.

    Module-level shortcut that forwards to the shared ``path_manager``.
    """
    resolved = path_manager.get_data_path(filename)
    return resolved
|
| 230 |
+
|
| 231 |
+
def get_model_path(filename: str = '') -> Path:
    """Return the model directory, or the path of *filename* inside it.

    Module-level shortcut that forwards to the shared ``path_manager``.
    """
    resolved = path_manager.get_model_path(filename)
    return resolved
|
| 233 |
+
|
| 234 |
+
def get_logs_path(filename: str = '') -> Path:
    """Return the logs directory, or the path of *filename* inside it.

    Module-level shortcut that forwards to the shared ``path_manager``.
    """
    resolved = path_manager.get_logs_path(filename)
    return resolved
|
| 236 |
+
|
| 237 |
+
def get_environment_info() -> Dict:
    """Return the environment summary produced by the shared ``path_manager``."""
    summary = path_manager.get_environment_info()
    return summary
|
| 239 |
+
|
| 240 |
+
def log_environment_info():
    """Log the environment summary through the shared ``path_manager``."""
    path_manager.log_environment_info()
    return None
|
| 242 |
+
|
| 243 |
+
# For debugging
|
| 244 |
+
if __name__ == "__main__":
    # Debug entry point: enable INFO output, then dump the detected layout
    # straight from the shared manager instance.
    logging.basicConfig(level=logging.INFO)
    path_manager.log_environment_info()
|