swift-mlx-qwen3-chatbot / step03_utils.py
VincentGOURBIN's picture
Upload step03_utils.py with huggingface_hub
1c0abf0 verified
#!/usr/bin/env python3
"""
Utilitaires pour Step 03 - Lecture de la configuration Step 02
"""
import json
from pathlib import Path
from typing import Dict, Optional
class Step03Config:
"""Gestionnaire de configuration Step 03 basé sur la sortie Step 02."""
def __init__(self, config_file: str = "step03_config.json"):
self.config_file = Path(config_file)
self.config = self.load_config()
def load_config(self) -> Dict:
"""Charge la configuration Step 03."""
if not self.config_file.exists():
raise FileNotFoundError(
f"❌ Configuration Step 03 non trouvée: {self.config_file}\n"
f"💡 Lancez d'abord: python step02_upload_embeddings.py"
)
try:
with open(self.config_file, 'r', encoding='utf-8') as f:
config = json.load(f)
# Vérification de la structure
if not config.get("step02_completed"):
raise ValueError("❌ Step 02 non complété selon la configuration")
required_keys = ["huggingface", "embeddings_info"]
for key in required_keys:
if key not in config:
raise ValueError(f"❌ Clé manquante dans configuration: {key}")
return config
except json.JSONDecodeError as e:
raise ValueError(f"❌ Configuration Step 03 malformée: {e}")
@property
def repo_id(self) -> str:
"""Repository Hugging Face ID."""
return self.config["huggingface"]["repo_id"]
@property
def dataset_name(self) -> str:
"""Nom du dataset."""
return self.config["huggingface"]["dataset_name"]
@property
def embeddings_file(self) -> str:
"""Nom du fichier SafeTensors."""
return self.config["huggingface"]["files"]["embeddings"]
@property
def metadata_file(self) -> str:
"""Nom du fichier métadonnées."""
return self.config["huggingface"]["files"]["metadata"]
@property
def total_vectors(self) -> int:
"""Nombre total de vecteurs."""
return self.config["embeddings_info"]["total_vectors"]
@property
def vector_dimension(self) -> int:
"""Dimension des vecteurs."""
return self.config["embeddings_info"]["vector_dimension"]
@property
def embedding_model(self) -> str:
"""Modèle d'embedding utilisé."""
return self.config["embeddings_info"]["embedding_model"]
@property
def download_command(self) -> str:
"""Commande de téléchargement HF Hub."""
return self.config["usage_examples"]["download_command"]
@property
def load_command(self) -> str:
"""Commande de chargement SafeTensors."""
return self.config["usage_examples"]["load_command"]
def print_summary(self):
"""Affiche un résumé de la configuration."""
print("📋 Configuration Step 03 - Résumé")
print("=" * 40)
print(f"📦 Repository HF: {self.repo_id}")
print(f"📊 Embeddings: {self.total_vectors:,} vecteurs")
print(f"📏 Dimension: {self.vector_dimension}")
print(f"🧠 Modèle: {self.embedding_model}")
print(f"📁 Fichier: {self.embeddings_file}")
print(f"⏰ Complété: {self.config.get('completion_timestamp', 'N/A')}")
print()
print("🚀 Prêt pour la recherche sémantique !")
def get_download_instructions(self) -> Dict[str, str]:
"""Retourne les instructions de téléchargement."""
return {
"python_code": f'''
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file
import json
# Télécharger les fichiers
embeddings_file = hf_hub_download(
repo_id="{self.repo_id}",
filename="{self.embeddings_file}"
)
metadata_file = hf_hub_download(
repo_id="{self.repo_id}",
filename="{self.metadata_file}"
)
# Charger les embeddings
tensors = load_file(embeddings_file)
embeddings = tensors["embeddings"] # Shape: [{self.total_vectors}, {self.vector_dimension}]
# Charger les métadonnées
with open(metadata_file, 'r') as f:
metadata = json.load(f)
print(f"✅ Embeddings chargés: {{embeddings.shape}}")
'''.strip(),
"cli_download": f"huggingface-cli download {self.repo_id} --repo-type dataset",
"repo_url": f"https://huggingface.co/datasets/{self.repo_id}"
}
def load_step03_config(config_file: str = "step03_config.json") -> Step03Config:
"""
Fonction utilitaire pour charger la configuration Step 03.
Args:
config_file: Chemin vers le fichier de configuration
Returns:
Instance de Step03Config
Raises:
FileNotFoundError: Si le fichier n'existe pas
ValueError: Si la configuration est invalide
"""
return Step03Config(config_file)
def check_step03_ready() -> bool:
"""
Vérifie si Step 03 peut être lancé (configuration Step 02 disponible).
Returns:
True si prêt, False sinon
"""
try:
config = load_step03_config()
return config.config.get("step02_completed", False)
except (FileNotFoundError, ValueError):
return False
if __name__ == "__main__":
"""Test de la configuration Step 03."""
try:
print("🧪 Test de configuration Step 03")
print("=" * 40)
if check_step03_ready():
config = load_step03_config()
config.print_summary()
print("\n📖 Instructions de téléchargement:")
instructions = config.get_download_instructions()
print(instructions["python_code"])
else:
print("❌ Step 03 non prêt")
print("💡 Lancez d'abord: python step02_upload_embeddings.py")
except Exception as e:
print(f"❌ Erreur: {e}")