jbilcke-hf (HF Staff) committed
Commit 608088a · 1 Parent(s): 3f5a4ba

adding some logs to investigate

Files changed (1): vms/ui/project/services/training.py (+11 -8)
vms/ui/project/services/training.py CHANGED
@@ -1664,25 +1664,25 @@ class TrainingService:
         # Check in lora_weights directory
         lora_weights_dir = self.app.output_path / "lora_weights"
         if lora_weights_dir.exists():
-            #logger.info(f"Found lora_weights directory: {lora_weights_dir}")
+            logger.info(f"Found lora_weights directory: {lora_weights_dir}")
 
             # Look for the latest checkpoint directory in lora_weights
             lora_checkpoints = [d for d in lora_weights_dir.glob("*") if d.is_dir() and d.name.isdigit()]
             if lora_checkpoints:
                 latest_lora_checkpoint = max(lora_checkpoints, key=lambda x: int(x.name))
-                #logger.info(f"Found latest LoRA checkpoint: {latest_lora_checkpoint}")
+                logger.info(f"Found latest LoRA checkpoint: {latest_lora_checkpoint}")
 
                 # Extract step count from directory name
                 result["steps"] = int(latest_lora_checkpoint.name)
 
                 # List contents of the latest checkpoint directory
                 checkpoint_contents = list(latest_lora_checkpoint.glob("*"))
-                #logger.info(f"Contents of LoRA checkpoint {latest_lora_checkpoint.name}: {checkpoint_contents}")
+                logger.info(f"Contents of LoRA checkpoint {latest_lora_checkpoint.name}: {checkpoint_contents}")
 
                 # Check for weights in the latest LoRA checkpoint
                 lora_safetensors = latest_lora_checkpoint / "pytorch_lora_weights.safetensors"
                 if lora_safetensors.exists():
-                    #logger.info(f"Found weights in latest LoRA checkpoint: {lora_safetensors}")
+                    logger.info(f"Found weights in latest LoRA checkpoint: {lora_safetensors}")
                     result["path"] = str(lora_safetensors)
                     return result
 
@@ -1697,14 +1697,14 @@
                 for weight_file in possible_weight_files:
                     weight_path = latest_lora_checkpoint / weight_file
                     if weight_path.exists():
-                        #logger.info(f"Found weights file {weight_file} in latest LoRA checkpoint: {weight_path}")
+                        logger.info(f"Found weights file {weight_file} in latest LoRA checkpoint: {weight_path}")
                         result["path"] = str(weight_path)
                         return result
 
                 # Check if any .safetensors files exist
                 safetensors_files = list(latest_lora_checkpoint.glob("*.safetensors"))
                 if safetensors_files:
-                    #logger.info(f"Found .safetensors files in LoRA checkpoint: {safetensors_files}")
+                    logger.info(f"Found .safetensors files in LoRA checkpoint: {safetensors_files}")
                     # Return the first .safetensors file found
                     result["path"] = str(safetensors_files[0])
                     return result
@@ -1712,7 +1712,7 @@
             # Fallback: check for direct safetensors file in lora_weights root
             lora_safetensors = lora_weights_dir / "pytorch_lora_weights.safetensors"
             if lora_safetensors.exists():
-                #logger.info(f"Found weights in lora_weights directory: {lora_safetensors}")
+                logger.info(f"Found weights in lora_weights directory: {lora_safetensors}")
                 result["path"] = str(lora_safetensors)
                 return result
         else:
@@ -1753,7 +1753,10 @@
         Returns:
             Path to safetensors file or None if not found
         """
-        return self.get_model_output_info()["path"]
+        path = self.get_model_output_info()["path"]
+        if not path:
+            raise gr.Error("No model weights found. Please train a model first.")
+        return path
 
     def create_training_dataset_zip(self) -> str:
         """Create a ZIP file containing all training data
 