WilliamGazeley committed · Commit 7067b68 · Parent(s): 341985c

Move flash attention install to runtime
Files changed:
- app.py +3 -0
- requirements.txt +0 -1
app.py CHANGED

@@ -6,6 +6,9 @@ from utils import get_assistant_message
 from functioncall import ModelInference
 from prompter import PromptManager
 
+# HACK
+import subprocess
+subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 
 @st.cache_resource(show_spinner="Loading model..")
 def init_llm():
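A note on the added lines: passing env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"} replaces the child process environment entirely with that single variable. A minimal sketch of the same runtime-install idea, not part of this commit, that merges in the parent environment and invokes the running interpreter's pip explicitly:

# Sketch only, not part of this commit: same runtime install of flash-attn,
# but with the parent environment merged in (so PATH, CUDA settings, etc. survive)
# and pip invoked through the current interpreter.
import os
import subprocess
import sys

env = dict(os.environ, FLASH_ATTENTION_SKIP_CUDA_BUILD="TRUE")
subprocess.run(
    [sys.executable, "-m", "pip", "install", "flash-attn", "--no-build-isolation"],
    env=env,
    check=True,  # fail loudly if the install does not succeed
)

Whether the merged environment or check=True is wanted here depends on the deployment target; the commit as recorded uses the shell=True one-liner shown in the diff above.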
requirements.txt CHANGED

@@ -1,4 +1,3 @@
-flash-attn==2.5.5
 ninja==1.11.1.1
 numpy==1.26.4
 orjson==3.10.3