make mlflow optional (#1317)
* make mlflow optional
* fix xformers
  * don't patch SwiGLU if xformers isn't working
  * fix the check for xformers SwiGLU
* fix install of xformers with an extra-index-url for docker builds
* fix docker build arg quoting
- .github/workflows/main.yml +2 -0
- .github/workflows/tests.yml +3 -0
- docker/Dockerfile +3 -2
- docker/Dockerfile-tests +3 -2
- requirements.txt +0 -1
- setup.py +3 -0
- src/axolotl/core/trainer_builder.py +10 -2
- src/axolotl/monkeypatch/llama_attn_hijack_flash.py +12 -0
- src/axolotl/utils/{callbacks.py → callbacks/__init__.py} +1 -30
- src/axolotl/utils/callbacks/mlflow_.py +44 -0
- src/axolotl/utils/models.py +2 -1
- tests/e2e/patched/test_fused_llama.py +3 -3
.github/workflows/main.yml (CHANGED)

```diff
@@ -18,6 +18,7 @@ jobs:
             python_version: "3.10"
             pytorch: 2.1.2
             axolotl_extras:
+            axolotl_args: "--extra-index-url https://download.pytorch.org/whl/cu118"
             is_latest: true
           - cuda: 121
             cuda_version: 12.1.0
@@ -54,6 +55,7 @@ jobs:
             BASE_TAG=${{ github.ref_name }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}
             CUDA=${{ matrix.cuda }}
             PYTORCH_VERSION=${{ matrix.pytorch }}
+            AXOLOTL_ARGS=${{ matrix.axolotl_args }}
           file: ./docker/Dockerfile
           push: ${{ github.event_name != 'pull_request' }}
           tags: |
```
.github/workflows/tests.yml (CHANGED)

```diff
@@ -70,6 +70,7 @@ jobs:
             cuda_version: 11.8.0
             python_version: "3.10"
             pytorch: 2.1.2
+            axolotl_args: "--extra-index-url https://download.pytorch.org/whl/cu118"
           - cuda: 121
             cuda_version: 12.1.0
             python_version: "3.10"
@@ -87,11 +88,13 @@ jobs:
           # Set up build arguments
           BASE_TAG="main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}"
           CUDA="${{ matrix.cuda }}"
+          AXOLOTL_ARGS="${{ matrix.axolotl_args }}"
           PYTORCH_VERSION="${{ matrix.pytorch }}"
           # Build the Docker image
           docker build . \
             --file ./docker/Dockerfile-tests \
             --build-arg BASE_TAG=$BASE_TAG \
+            --build-arg AXOLOTL_ARGS="$AXOLOTL_ARGS" \
             --build-arg CUDA=$CUDA \
             --build-arg GITHUB_REF=$GITHUB_REF \
             --build-arg PYTORCH_VERSION=$PYTORCH_VERSION \
```
docker/Dockerfile (CHANGED)

```diff
@@ -3,6 +3,7 @@ FROM winglian/axolotl-base:$BASE_TAG
 
 ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
 ARG AXOLOTL_EXTRAS=""
+ARG AXOLOTL_ARGS=""
 ARG CUDA="118"
 ENV BNB_CUDA_VERSION=$CUDA
 ARG PYTORCH_VERSION="2.0.1"
@@ -20,9 +21,9 @@ WORKDIR /workspace/axolotl
 
 # If AXOLOTL_EXTRAS is set, append it in brackets
 RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
-        pip install -e .[deepspeed,flash-attn,mamba-ssm,$AXOLOTL_EXTRAS]; \
+        pip install -e .[deepspeed,flash-attn,mamba-ssm,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
     else \
-        pip install -e .[deepspeed,flash-attn,mamba-ssm]; \
+        pip install -e .[deepspeed,flash-attn,mamba-ssm] $AXOLOTL_ARGS; \
     fi
 
 # So we can test the Docker image
```
docker/Dockerfile-tests (CHANGED)

```diff
@@ -3,6 +3,7 @@ FROM winglian/axolotl-base:$BASE_TAG
 
 ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
 ARG AXOLOTL_EXTRAS=""
+ARG AXOLOTL_ARGS=""
 ARG CUDA="118"
 ENV BNB_CUDA_VERSION=$CUDA
 ARG PYTORCH_VERSION="2.0.1"
@@ -24,9 +25,9 @@ RUN git fetch origin +$GITHUB_REF && \
 
 # If AXOLOTL_EXTRAS is set, append it in brackets
 RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
-        pip install -e .[deepspeed,flash-attn,mamba-ssm,$AXOLOTL_EXTRAS]; \
+        pip install -e .[deepspeed,flash-attn,mamba-ssm,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
     else \
-        pip install -e .[deepspeed,flash-attn,mamba-ssm]; \
+        pip install -e .[deepspeed,flash-attn,mamba-ssm] $AXOLOTL_ARGS; \
     fi
 
 # So we can test the Docker image
```
requirements.txt (CHANGED)

```diff
@@ -21,7 +21,6 @@ hf_transfer
 colorama
 numba
 numpy>=1.24.4
-mlflow
 # qlora things
 evaluate==0.4.1
 scipy
```
setup.py (CHANGED)

```diff
@@ -82,5 +82,8 @@ setup(
         "auto-gptq": [
             "auto-gptq==0.5.1",
         ],
+        "mlflow": [
+            "mlflow",
+        ],
     },
 )
```
src/axolotl/core/trainer_builder.py (CHANGED)

```diff
@@ -5,6 +5,7 @@ Builder for the training args and trainer
 
 import abc
 import importlib
+import importlib.util
 import logging
 import math
 import sys
@@ -34,7 +35,6 @@ from axolotl.utils.callbacks import (
     EvalFirstStepCallback,
     GPUStatsCallback,
     LossWatchDogCallback,
-    SaveAxolotlConfigtoMlflowCallback,
     SaveAxolotlConfigtoWandBCallback,
     SaveBetterTransformerModelCallback,
     bench_eval_callback_factory,
@@ -62,6 +62,10 @@ except ImportError:
 LOG = logging.getLogger("axolotl.core.trainer_builder")
 
 
+def is_mlflow_available():
+    return importlib.util.find_spec("mlflow") is not None
+
+
 def _sanitize_kwargs_for_tagging(tag_names, kwargs=None):
     if isinstance(tag_names, str):
         tag_names = [tag_names]
@@ -648,7 +652,11 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
             callbacks.append(
                 SaveAxolotlConfigtoWandBCallback(self.cfg.axolotl_config_path)
             )
-        if self.cfg.use_mlflow:
+        if self.cfg.use_mlflow and is_mlflow_available():
+            from axolotl.utils.callbacks.mlflow_ import (
+                SaveAxolotlConfigtoMlflowCallback,
+            )
+
             callbacks.append(
                 SaveAxolotlConfigtoMlflowCallback(self.cfg.axolotl_config_path)
             )
```
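The pattern used here is worth noting: `importlib.util.find_spec` checks whether a module is importable without actually importing it, and the `mlflow` import itself is deferred into the branch that needs it, so the rest of the trainer builder never touches the package. A minimal self-contained sketch of the same pattern, with `report_metric` as a hypothetical helper (not part of axolotl):

```python
import importlib.util


def is_mlflow_available() -> bool:
    # find_spec only consults the import machinery; it does not import
    # mlflow, so this is safe even when the package is not installed.
    return importlib.util.find_spec("mlflow") is not None


def report_metric(key: str, value: float) -> None:
    # Hypothetical helper: only touch mlflow when it is importable,
    # mirroring the guard added in trainer_builder.py.
    if is_mlflow_available():
        import mlflow  # deferred import, resolved only on this path

        mlflow.log_metric(key, value)
    else:
        print(f"mlflow not installed, skipping metric {key}={value}")
```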
src/axolotl/monkeypatch/llama_attn_hijack_flash.py (CHANGED)

```diff
@@ -44,6 +44,18 @@ except ImportError:
 LOG = logging.getLogger("axolotl")
 
 
+def is_xformers_swiglu_available() -> bool:
+    from xformers.ops.common import get_xformers_operator
+
+    try:
+        get_xformers_operator("swiglu_packedw")()
+        return True
+    except RuntimeError as exc:
+        if "No such operator xformers::swiglu_packedw " in str(exc):
+            return False
+        return True
+
+
 def replace_llama_mlp_with_swiglu(model):
     for name, module in model.named_modules():
         if isinstance(module, LlamaMLP):
```
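Background on the check: xformers registers its fused SwiGLU kernel as the custom torch operator `xformers::swiglu_packedw`, and calling the handle returned by `get_xformers_operator` raises a `RuntimeError` naming the missing operator when the installed build lacks that kernel; any other `RuntimeError` means the operator exists but was invoked without arguments, hence the `return True` fallback. A rough sketch of the same probe written against torch's operator registry directly (`has_custom_op` is our illustration, not an API of either library):

```python
import torch


def has_custom_op(qualified_name: str) -> bool:
    """Return True if a custom operator such as
    'xformers::swiglu_packedw' is registered with torch. Sketch only."""
    namespace, op_name = qualified_name.split("::")
    try:
        # Attribute access on torch.ops resolves operators lazily and
        # fails (RuntimeError or AttributeError, depending on the torch
        # version) when the operator was never registered.
        getattr(getattr(torch.ops, namespace), op_name)
        return True
    except (RuntimeError, AttributeError):
        return False


if __name__ == "__main__":
    print(has_custom_op("xformers::swiglu_packedw"))
```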
src/axolotl/utils/{callbacks.py → callbacks/__init__.py} (RENAMED)

```diff
@@ -9,7 +9,6 @@ from tempfile import NamedTemporaryFile
 from typing import TYPE_CHECKING, Dict, List
 
 import evaluate
-import mlflow
 import numpy as np
 import pandas as pd
 import torch
@@ -42,8 +41,8 @@ from axolotl.utils.distributed import (
 if TYPE_CHECKING:
     from axolotl.core.trainer_builder import AxolotlTrainingArguments
 
-LOG = logging.getLogger("axolotl.callbacks")
 IGNORE_INDEX = -100
+LOG = logging.getLogger("axolotl.callbacks")
 
 
 class EvalFirstStepCallback(
@@ -756,31 +755,3 @@ class SaveAxolotlConfigtoWandBCallback(TrainerCallback):
         except (FileNotFoundError, ConnectionError) as err:
             LOG.warning(f"Error while saving Axolotl config to WandB: {err}")
         return control
-
-
-class SaveAxolotlConfigtoMlflowCallback(TrainerCallback):
-    """Callback to save axolotl config to mlflow"""
-
-    def __init__(self, axolotl_config_path):
-        self.axolotl_config_path = axolotl_config_path
-
-    def on_train_begin(
-        self,
-        args: AxolotlTrainingArguments,  # pylint: disable=unused-argument
-        state: TrainerState,  # pylint: disable=unused-argument
-        control: TrainerControl,
-        **kwargs,  # pylint: disable=unused-argument
-    ):
-        if is_main_process():
-            try:
-                with NamedTemporaryFile(
-                    mode="w", delete=False, suffix=".yml", prefix="axolotl_config_"
-                ) as temp_file:
-                    copyfile(self.axolotl_config_path, temp_file.name)
-                    mlflow.log_artifact(temp_file.name, artifact_path="")
-                    LOG.info(
-                        "The Axolotl config has been saved to the MLflow artifacts."
-                    )
-            except (FileNotFoundError, ConnectionError) as err:
-                LOG.warning(f"Error while saving Axolotl config to MLflow: {err}")
-        return control
```
src/axolotl/utils/callbacks/mlflow_.py (ADDED)

```diff
@@ -0,0 +1,44 @@
+"""MLFlow module for trainer callbacks"""
+import logging
+from shutil import copyfile
+from tempfile import NamedTemporaryFile
+from typing import TYPE_CHECKING
+
+import mlflow
+from transformers import TrainerCallback, TrainerControl, TrainerState
+
+from axolotl.utils.distributed import is_main_process
+
+if TYPE_CHECKING:
+    from axolotl.core.trainer_builder import AxolotlTrainingArguments
+
+LOG = logging.getLogger("axolotl.callbacks")
+
+
+class SaveAxolotlConfigtoMlflowCallback(TrainerCallback):
+    # pylint: disable=duplicate-code
+    """Callback to save axolotl config to mlflow"""
+
+    def __init__(self, axolotl_config_path):
+        self.axolotl_config_path = axolotl_config_path
+
+    def on_train_begin(
+        self,
+        args: "AxolotlTrainingArguments",  # pylint: disable=unused-argument
+        state: TrainerState,  # pylint: disable=unused-argument
+        control: TrainerControl,
+        **kwargs,  # pylint: disable=unused-argument
+    ):
+        if is_main_process():
+            try:
+                with NamedTemporaryFile(
+                    mode="w", delete=False, suffix=".yml", prefix="axolotl_config_"
+                ) as temp_file:
+                    copyfile(self.axolotl_config_path, temp_file.name)
+                    mlflow.log_artifact(temp_file.name, artifact_path="")
+                    LOG.info(
+                        "The Axolotl config has been saved to the MLflow artifacts."
+                    )
+            except (FileNotFoundError, ConnectionError) as err:
+                LOG.warning(f"Error while saving Axolotl config to MLflow: {err}")
+        return control
```
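Because the callback only uses the standard `TrainerCallback` hook signature, it can be exercised outside a full training run. A minimal sketch, assuming mlflow is installed and an axolotl config exists at `config.yml` (the path and tracking URI here are illustrative, not part of the PR):

```python
import mlflow

from axolotl.utils.callbacks.mlflow_ import SaveAxolotlConfigtoMlflowCallback

# Illustrative only: point mlflow at a local file store and start a run,
# then fire the hook the HF Trainer would call at the start of training.
mlflow.set_tracking_uri("file:./mlruns")
callback = SaveAxolotlConfigtoMlflowCallback("config.yml")

with mlflow.start_run():
    # args and state are unused by this callback, so None is acceptable
    # here even though a real Trainer would pass populated objects.
    callback.on_train_begin(args=None, state=None, control=None)
```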
src/axolotl/utils/models.py (CHANGED)

```diff
@@ -512,11 +512,12 @@ def load_model(
 
     if cfg.flash_attention and not inference:
         from axolotl.monkeypatch.llama_attn_hijack_flash import (
+            is_xformers_swiglu_available,
             replace_llama_mlp_with_swiglu,
             replace_llama_qkv_with_fused,
         )
 
-        if cfg.flash_attn_fuse_mlp:
+        if cfg.flash_attn_fuse_mlp and is_xformers_swiglu_available():
             LOG.info("patching with SwiGLU")
             replace_llama_mlp_with_swiglu(model)
 
```
tests/e2e/patched/test_fused_llama.py (CHANGED)

```diff
@@ -57,9 +57,9 @@ class TestFusedLlama(unittest.TestCase):
                 "learning_rate": 0.00001,
                 "optimizer": "adamw_torch",
                 "lr_scheduler": "cosine",
-                "max_steps":
-                "save_steps":
-                "eval_steps":
+                "max_steps": 10,
+                "save_steps": 5,
+                "eval_steps": 5,
             }
         )
         if is_torch_bf16_gpu_available():
```

(The removed values were truncated in the rendered diff and are left as shown.)