Update inference/infer.py
inference/infer.py (CHANGED, +2 -7)
@@ -68,10 +68,8 @@ os.makedirs(stage1_output_dir, exist_ok=True)
 os.makedirs(stage2_output_dir, exist_ok=True)
 
 # load tokenizer and model
-
+device = torch.device(f"cuda:{cuda_idx}" if torch.cuda.is_available() else "cpu")
 
-# Check if CUDA is available
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 # Now you can use `device` to move your tensors or models to the GPU (if available)
 print(f"Using device: {device}")
 
@@ -80,10 +78,7 @@ model = AutoModelForCausalLM.from_pretrained(
     stage1_model,
     torch_dtype=torch.bfloat16,
     attn_implementation="flash_attention_2", # To enable flashattn, you have to install flash-attn
-)
-# to device, if gpu is available
-model.to(device)
-model.eval()
+).to(device).eval()
 
 codectool = CodecManipulator("xcodec", 0, 1)
 codectool_stage2 = CodecManipulator("xcodec", 0, 8)
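
For readers who want the change in context rather than as a diff, here is a minimal, self-contained sketch of the resulting pattern. The argparse plumbing is hypothetical: the real infer.py defines its own CLI, so --cuda_idx and --stage1_model here only stand in for wherever the script actually gets those values.

import argparse

import torch
from transformers import AutoModelForCausalLM

# Hypothetical CLI plumbing; infer.py has its own argument definitions.
parser = argparse.ArgumentParser()
parser.add_argument("--cuda_idx", type=int, default=0)
parser.add_argument("--stage1_model", type=str, required=True)
args = parser.parse_args()

# Pin a specific GPU instead of the bare "cuda" default device.
device = torch.device(f"cuda:{args.cuda_idx}" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# from_pretrained(), .to(), and .eval() each return the model object,
# so the three calls collapse into one chained expression, as in the diff.
model = AutoModelForCausalLM.from_pretrained(
    args.stage1_model,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",  # requires the flash-attn package
).to(device).eval()

The chained form behaves the same as the removed three-statement version, since .to() and .eval() both return the model. The substantive fix in this commit is the device string: a bare "cuda" always resolves to the current default GPU, while f"cuda:{cuda_idx}" honors the index the caller passed in, which matters on multi-GPU machines.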