Commit 608088a
Parent(s): 3f5a4ba
adding some logs to investigate
vms/ui/project/services/training.py (CHANGED)
@@ -1664,25 +1664,25 @@ class TrainingService:
         # Check in lora_weights directory
         lora_weights_dir = self.app.output_path / "lora_weights"
         if lora_weights_dir.exists():
-
+            logger.info(f"Found lora_weights directory: {lora_weights_dir}")
 
             # Look for the latest checkpoint directory in lora_weights
             lora_checkpoints = [d for d in lora_weights_dir.glob("*") if d.is_dir() and d.name.isdigit()]
             if lora_checkpoints:
                 latest_lora_checkpoint = max(lora_checkpoints, key=lambda x: int(x.name))
-
+                logger.info(f"Found latest LoRA checkpoint: {latest_lora_checkpoint}")
 
                 # Extract step count from directory name
                 result["steps"] = int(latest_lora_checkpoint.name)
 
                 # List contents of the latest checkpoint directory
                 checkpoint_contents = list(latest_lora_checkpoint.glob("*"))
-
+                logger.info(f"Contents of LoRA checkpoint {latest_lora_checkpoint.name}: {checkpoint_contents}")
 
                 # Check for weights in the latest LoRA checkpoint
                 lora_safetensors = latest_lora_checkpoint / "pytorch_lora_weights.safetensors"
                 if lora_safetensors.exists():
-
+                    logger.info(f"Found weights in latest LoRA checkpoint: {lora_safetensors}")
                     result["path"] = str(lora_safetensors)
                     return result
 
@@ -1697,14 +1697,14 @@ class TrainingService:
                 for weight_file in possible_weight_files:
                     weight_path = latest_lora_checkpoint / weight_file
                     if weight_path.exists():
-
+                        logger.info(f"Found weights file {weight_file} in latest LoRA checkpoint: {weight_path}")
                         result["path"] = str(weight_path)
                         return result
 
                 # Check if any .safetensors files exist
                 safetensors_files = list(latest_lora_checkpoint.glob("*.safetensors"))
                 if safetensors_files:
-
+                    logger.info(f"Found .safetensors files in LoRA checkpoint: {safetensors_files}")
                     # Return the first .safetensors file found
                     result["path"] = str(safetensors_files[0])
                     return result
@@ -1712,7 +1712,7 @@ class TrainingService:
             # Fallback: check for direct safetensors file in lora_weights root
             lora_safetensors = lora_weights_dir / "pytorch_lora_weights.safetensors"
            if lora_safetensors.exists():
-
+                logger.info(f"Found weights in lora_weights directory: {lora_safetensors}")
                 result["path"] = str(lora_safetensors)
                 return result
             else:
@@ -1753,7 +1753,10 @@ class TrainingService:
         Returns:
             Path to safetensors file or None if not found
         """
-
+        path = self.get_model_output_info()["path"]
+        if not path:
+            raise gr.Error("No model weights found. Please train a model first.")
+        return path
 
     def create_training_dataset_zip(self) -> str:
         """Create a ZIP file containing all training data
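
A note on the checkpoint selection above: anything under lora_weights whose directory name is all digits is treated as a step checkpoint, and the numerically largest name wins. A minimal standalone sketch of that selection against a throwaway directory tree (the helper name and step numbers below are illustrative, not from this repo):

from pathlib import Path
from typing import Optional
import tempfile

def find_latest_checkpoint(lora_weights_dir: Path) -> Optional[Path]:
    # Same filter as the diff: only all-digit directory names ("500", "1000")
    # count as checkpoints; names like "final" or "500-backup" are skipped.
    checkpoints = [d for d in lora_weights_dir.glob("*") if d.is_dir() and d.name.isdigit()]
    if not checkpoints:
        return None
    # Compare as integers, not strings: "1000" sorts before "500"
    # lexicographically, but int("1000") > int("500").
    return max(checkpoints, key=lambda d: int(d.name))

# Usage sketch against a temporary tree:
with tempfile.TemporaryDirectory() as tmp:
    root = Path(tmp)
    for step in ("500", "1000"):
        (root / step).mkdir()
    latest = find_latest_checkpoint(root)
    assert latest is not None and latest.name == "1000"
    steps = int(latest.name)  # what the method stores in result["steps"]

The int() key is what keeps the "latest" choice correct once step counts cross a digit boundary; a plain max() over the names would pick "500" over "1000".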
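On the final hunk: the new body raises gr.Error when no weights path is available, which Gradio reports to the user as an error message in the UI instead of silently returning None. A minimal sketch of that behavior, assuming the method is wired to a Gradio event handler; the handler and layout below are hypothetical, since the diff shows only the method body:

import gradio as gr

def get_weights_path() -> str:
    # Hypothetical stand-in for self.get_model_output_info()["path"];
    # pretend the lookup found nothing so the error branch fires.
    path = None
    if not path:
        # Raised inside an event handler, gr.Error surfaces as an error
        # message in the UI rather than crashing the app.
        raise gr.Error("No model weights found. Please train a model first.")
    return path

with gr.Blocks() as demo:
    btn = gr.Button("Get weights path")
    out = gr.Textbox(label="Weights path")
    btn.click(fn=get_weights_path, outputs=out)

# demo.launch()  # uncomment to try it locally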