Spaces:

Tousifahamed
/

smol-lm2-demo

Running

App Files Files Community

Tousifahamed committed on Jan 22

Commit

8c5b923

verified ·

1 Parent(s): a222095

Upload app.py

Browse files

Files changed (1) hide show

app.py +29 -67

app.py CHANGED Viewed

@@ -1,78 +1,40 @@
 import torch
-torch.backends.quantized.engine = 'fbgemm'
-print("PyTorch version:", torch.__version__)
-print("Supported quantized engines:", torch.backends.quantized.supported_engines)
 import torch.nn as nn
 from transformers import AutoTokenizer
-from model import TransformerModel
 import gradio as gr
-# Load the tokenizer
-tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/cosmo2-tokenizer")
-def load_quantized_model(checkpoint_path):
-    # 1. Create the float model
-    model = TransformerModel(
-        vocab_size=49152,
-        hidden_size=576,
-        num_hidden_layers=30,
-        num_attention_heads=9,
-        intermediate_size=1536,
-        num_key_value_heads=3,
-        max_position_embeddings=2048,
-        rms_norm_eps=1e-5,
-        hidden_act="silu",
-        tie_word_embeddings=True,
-    )
-    # 2. Load the actual checkpoint weights
-    #    If "quantized_model.pt" is a state_dict, do:
-    checkpoint = torch.load(checkpoint_path, map_location="cpu")
-    model.load_state_dict(checkpoint)  # or checkpoint["model_state_dict"] if saved that way
-    model.eval()
-    # 3. Dynamically quantize relevant layers
-    #    For embeddings, we typically use torch.quint8
-    #    so we don't run into any embedding dtype errors
-    quantized_model = torch.quantization.quantize_dynamic(
-        model,
-        {nn.Linear, nn.Embedding},
-        dtype=torch.quint8
-    )
-    return quantized_model
-# 4. Load the quantized model
-model = load_quantized_model("quantized_model.pt")
-# 5. Inference function
-def generate_text(prompt, max_length=50, temperature=1.0, top_k=50):
-    input_ids = tokenizer.encode(prompt, return_tensors="pt")
     with torch.no_grad():
-        output_ids = model.generate(
-            input_ids,
-            max_length=max_length,
-            temperature=temperature,
-            top_k=top_k,
-            do_sample=True,
-        )
-    generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
-    return generated_text
-# 6. Gradio interface
-interface = gr.Interface(
-    fn=generate_text,
-    inputs=[
-        gr.Textbox(label="Prompt", placeholder="Enter your prompt here..."),
-        gr.Slider(minimum=10, maximum=200, value=50, label="Max Length"),
-        gr.Slider(minimum=0.1, maximum=2.0, value=1.0, label="Temperature"),
-        gr.Slider(minimum=1, maximum=100, value=50, label="Top-k Sampling"),
-    ],
-    outputs=gr.Textbox(label="Generated Text"),
-    title="Text Generation with Quantized SMOL-LM2",
-    description="Generate text using a dynamically quantized SMOL-LM2 model.",
-)
-interface.launch()

 import torch
 import torch.nn as nn
+from model import TransformerModel  # or however you define your model classes
 from transformers import AutoTokenizer
 import gradio as gr
+# Load the checkpoint onto CPU; the weights are stored under its "model_state_dict" key
+checkpoint = torch.load("model_weights_fp16.pt", map_location="cpu")
+state_dict_fp16 = checkpoint["model_state_dict"]
+# Instantiate the model; its parameters are cast to FP16 by model.half() below
+model = TransformerModel(
+    vocab_size=49152,
+    hidden_size=576,
+    num_hidden_layers=30,
+    num_attention_heads=9,
+    intermediate_size=1536,
+    num_key_value_heads=3,
+    max_position_embeddings=2048,
+    rms_norm_eps=1e-5,
+    hidden_act="silu",
+    tie_word_embeddings=True,
+)
+# Convert model to half precision before loading so parameter dtypes match the FP16 checkpoint
+model.half()
+# NOTE(review): strict=False tolerates missing/unexpected keys instead of raising (assumes nn.Module semantics) - confirm this is intended
+model.load_state_dict(state_dict_fp16, strict=False)
+model.eval()
+tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/cosmo2-tokenizer")
+def generate_text(prompt, max_length=50):  # returns the decoded text of the first sequence from model.generate
+    input_ids = tokenizer.encode(prompt, return_tensors="pt")  # keep integer token ids: they are indices, and FP16 cannot represent ids above 2048 exactly
     with torch.no_grad():
+        output_ids = model.generate(input_ids, max_length=max_length, do_sample=True)
+    return tokenizer.decode(output_ids[0], skip_special_tokens=True)
+gr.Interface(fn=generate_text, inputs="text", outputs="text").launch()