Spaces:
Running
on
Zero
Running
on
Zero
#!/usr/bin/env python3 | |
""" | |
Utilitaires pour Step 03 - Lecture de la configuration Step 02 | |
""" | |
import json | |
from pathlib import Path | |
from typing import Dict, Optional | |
class Step03Config: | |
"""Gestionnaire de configuration Step 03 basé sur la sortie Step 02.""" | |
def __init__(self, config_file: str = "step03_config.json"): | |
self.config_file = Path(config_file) | |
self.config = self.load_config() | |
def load_config(self) -> Dict: | |
"""Charge la configuration Step 03.""" | |
if not self.config_file.exists(): | |
raise FileNotFoundError( | |
f"❌ Configuration Step 03 non trouvée: {self.config_file}\n" | |
f"💡 Lancez d'abord: python step02_upload_embeddings.py" | |
) | |
try: | |
with open(self.config_file, 'r', encoding='utf-8') as f: | |
config = json.load(f) | |
# Vérification de la structure | |
if not config.get("step02_completed"): | |
raise ValueError("❌ Step 02 non complété selon la configuration") | |
required_keys = ["huggingface", "embeddings_info"] | |
for key in required_keys: | |
if key not in config: | |
raise ValueError(f"❌ Clé manquante dans configuration: {key}") | |
return config | |
except json.JSONDecodeError as e: | |
raise ValueError(f"❌ Configuration Step 03 malformée: {e}") | |
def repo_id(self) -> str: | |
"""Repository Hugging Face ID.""" | |
return self.config["huggingface"]["repo_id"] | |
def dataset_name(self) -> str: | |
"""Nom du dataset.""" | |
return self.config["huggingface"]["dataset_name"] | |
def embeddings_file(self) -> str: | |
"""Nom du fichier SafeTensors.""" | |
return self.config["huggingface"]["files"]["embeddings"] | |
def metadata_file(self) -> str: | |
"""Nom du fichier métadonnées.""" | |
return self.config["huggingface"]["files"]["metadata"] | |
def total_vectors(self) -> int: | |
"""Nombre total de vecteurs.""" | |
return self.config["embeddings_info"]["total_vectors"] | |
def vector_dimension(self) -> int: | |
"""Dimension des vecteurs.""" | |
return self.config["embeddings_info"]["vector_dimension"] | |
def embedding_model(self) -> str: | |
"""Modèle d'embedding utilisé.""" | |
return self.config["embeddings_info"]["embedding_model"] | |
def download_command(self) -> str: | |
"""Commande de téléchargement HF Hub.""" | |
return self.config["usage_examples"]["download_command"] | |
def load_command(self) -> str: | |
"""Commande de chargement SafeTensors.""" | |
return self.config["usage_examples"]["load_command"] | |
def print_summary(self): | |
"""Affiche un résumé de la configuration.""" | |
print("📋 Configuration Step 03 - Résumé") | |
print("=" * 40) | |
print(f"📦 Repository HF: {self.repo_id}") | |
print(f"📊 Embeddings: {self.total_vectors:,} vecteurs") | |
print(f"📏 Dimension: {self.vector_dimension}") | |
print(f"🧠 Modèle: {self.embedding_model}") | |
print(f"📁 Fichier: {self.embeddings_file}") | |
print(f"⏰ Complété: {self.config.get('completion_timestamp', 'N/A')}") | |
print() | |
print("🚀 Prêt pour la recherche sémantique !") | |
def get_download_instructions(self) -> Dict[str, str]: | |
"""Retourne les instructions de téléchargement.""" | |
return { | |
"python_code": f''' | |
from huggingface_hub import hf_hub_download | |
from safetensors.torch import load_file | |
import json | |
# Télécharger les fichiers | |
embeddings_file = hf_hub_download( | |
repo_id="{self.repo_id}", | |
filename="{self.embeddings_file}" | |
) | |
metadata_file = hf_hub_download( | |
repo_id="{self.repo_id}", | |
filename="{self.metadata_file}" | |
) | |
# Charger les embeddings | |
tensors = load_file(embeddings_file) | |
embeddings = tensors["embeddings"] # Shape: [{self.total_vectors}, {self.vector_dimension}] | |
# Charger les métadonnées | |
with open(metadata_file, 'r') as f: | |
metadata = json.load(f) | |
print(f"✅ Embeddings chargés: {{embeddings.shape}}") | |
'''.strip(), | |
"cli_download": f"huggingface-cli download {self.repo_id} --repo-type dataset", | |
"repo_url": f"https://huggingface.co/datasets/{self.repo_id}" | |
} | |
def load_step03_config(config_file: str = "step03_config.json") -> Step03Config: | |
""" | |
Fonction utilitaire pour charger la configuration Step 03. | |
Args: | |
config_file: Chemin vers le fichier de configuration | |
Returns: | |
Instance de Step03Config | |
Raises: | |
FileNotFoundError: Si le fichier n'existe pas | |
ValueError: Si la configuration est invalide | |
""" | |
return Step03Config(config_file) | |
def check_step03_ready() -> bool: | |
""" | |
Vérifie si Step 03 peut être lancé (configuration Step 02 disponible). | |
Returns: | |
True si prêt, False sinon | |
""" | |
try: | |
config = load_step03_config() | |
return config.config.get("step02_completed", False) | |
except (FileNotFoundError, ValueError): | |
return False | |
if __name__ == "__main__": | |
"""Test de la configuration Step 03.""" | |
try: | |
print("🧪 Test de configuration Step 03") | |
print("=" * 40) | |
if check_step03_ready(): | |
config = load_step03_config() | |
config.print_summary() | |
print("\n📖 Instructions de téléchargement:") | |
instructions = config.get_download_instructions() | |
print(instructions["python_code"]) | |
else: | |
print("❌ Step 03 non prêt") | |
print("💡 Lancez d'abord: python step02_upload_embeddings.py") | |
except Exception as e: | |
print(f"❌ Erreur: {e}") |