Update app.py
app.py CHANGED

@@ -27,7 +27,16 @@ MAX_MAX_NEW_TOKENS = 2048
 DEFAULT_MAX_NEW_TOKENS = 1024
 MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 
-device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+#device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+# Pick attention backend based on device availability
+if torch.cuda.is_available():
+    device = "cuda"
+    attn_impl = "flash_attention_2"  # or "flash", depending on the library version
+    torch_dtype = torch.bfloat16  # or torch.float16
+else:
+    device = "cpu"
+    attn_impl = "eager"
+    torch_dtype = torch.bfloat16  # or float32; bfloat16 is supported on CPUs with AVX512-BF16 or AMX (e.g., Intel Ice Lake / Sapphire Rapids, some newer AMD), but many ops may still fall back to float32
 
 # model_id = "google/gemma-3-270m-it"
 model_id = "unsloth/gemma-3-270m-it"
@@ -36,7 +45,7 @@ model = AutoModelForCausalLM.from_pretrained(
     model_id,
     device_map="auto",
     torch_dtype=torch.bfloat16,
-    attn_implementation=
+    attn_implementation=attn_impl,
     trust_remote_code=True,
 )
 model.config.sliding_window = 4096
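For context, the touched region of app.py reads roughly as follows after this commit. This is a minimal sketch, not the full file, and it adds two things the commit does not have: a guard on flash-attn availability (transformers raises at load time if "flash_attention_2" is requested without the flash-attn package installed), and passing the computed torch_dtype through to from_pretrained.

# Minimal sketch of the post-commit loading path. The flash-attn
# availability guard and the "sdpa" fallback are additions for
# illustration, not part of the commit itself.
import importlib.util
import os

import torch
from transformers import AutoModelForCausalLM

DEFAULT_MAX_NEW_TOKENS = 1024
MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))

if torch.cuda.is_available():
    device = "cuda"  # retained; app.py presumably uses it elsewhere (e.g., moving inputs)
    torch_dtype = torch.bfloat16
    # flash_attention_2 requires the flash-attn package; transformers
    # raises at load time if it is requested but missing, so fall back
    # to the built-in SDPA kernels in that case.
    if importlib.util.find_spec("flash_attn") is not None:
        attn_impl = "flash_attention_2"
    else:
        attn_impl = "sdpa"
else:
    device = "cpu"
    attn_impl = "eager"
    torch_dtype = torch.float32  # safest default on CPUs without AVX512-BF16/AMX

model_id = "unsloth/gemma-3-270m-it"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch_dtype,  # the commit computes this variable but still hardcodes bfloat16
    attn_implementation=attn_impl,
    trust_remote_code=True,
)
model.config.sliding_window = 4096

Note that in the committed version the torch_dtype variable assigned in the new branch is effectively dead: new line 47 still passes torch.bfloat16 literally to from_pretrained, so the CPU path loads bfloat16 weights regardless of the comment about a float32 fallback. The sketch above wires the variable through instead.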