Spaces:
Running
Running
update
Browse files
eagle_vl/serve/inference.py
CHANGED
|
@@ -44,6 +44,8 @@ def load_model_from_nv(model_path: str = "nvidia/Eagle-2.5-8B"):
|
|
| 44 |
def load_model_from_eagle(model_path: str = "NVEagle/Eagle2.5-VL-8B-Preview"):
|
| 45 |
|
| 46 |
token = os.environ.get("HF_TOKEN")
|
|
|
|
|
|
|
| 47 |
# hotfix the model to use flash attention 2
|
| 48 |
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True, token=token)
|
| 49 |
config._attn_implementation = "flash_attention_2"
|
|
@@ -51,7 +53,6 @@ def load_model_from_eagle(model_path: str = "NVEagle/Eagle2.5-VL-8B-Preview"):
|
|
| 51 |
config.text_config._attn_implementation = "flash_attention_2"
|
| 52 |
print("Successfully set the attn_implementation to flash_attention_2")
|
| 53 |
|
| 54 |
-
logger.info(f"token = {token[:4]}***{token[-2:]}")
|
| 55 |
model = AutoModel.from_pretrained(
|
| 56 |
model_path,
|
| 57 |
trust_remote_code=True,
|
|
|
|
| 44 |
def load_model_from_eagle(model_path: str = "NVEagle/Eagle2.5-VL-8B-Preview"):
|
| 45 |
|
| 46 |
token = os.environ.get("HF_TOKEN")
|
| 47 |
+
logger.info(f"token = {token[:4]}***{token[-2:]}")
|
| 48 |
+
|
| 49 |
# hotfix the model to use flash attention 2
|
| 50 |
config = AutoConfig.from_pretrained(model_path, trust_remote_code=True, token=token)
|
| 51 |
config._attn_implementation = "flash_attention_2"
|
|
|
|
| 53 |
config.text_config._attn_implementation = "flash_attention_2"
|
| 54 |
print("Successfully set the attn_implementation to flash_attention_2")
|
| 55 |
|
|
|
|
| 56 |
model = AutoModel.from_pretrained(
|
| 57 |
model_path,
|
| 58 |
trust_remote_code=True,
|