Tousifahamed committed on
Commit
b789c6c
·
verified ·
1 Parent(s): 4d798b4

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +22 -17
  2. model.py +1 -5
app.py CHANGED
@@ -6,9 +6,8 @@ import gradio as gr
6
  # Load the tokenizer
7
  tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/cosmo2-tokenizer")
8
 
9
- # Load the model
10
- def load_model(checkpoint_path):
11
- # Initialize the model (replace with your model's configuration)
12
  model = TransformerModel(
13
  vocab_size=49152,
14
  hidden_size=576,
@@ -20,24 +19,35 @@ def load_model(checkpoint_path):
20
  rms_norm_eps=1e-5,
21
  hidden_act="silu",
22
  tie_word_embeddings=True,
23
- pad_token_id=tokenizer.pad_token_id,
24
  )
25
 
26
- # Load the checkpoint
 
 
 
 
 
 
 
 
 
 
27
  checkpoint = torch.load(checkpoint_path, map_location="cpu")
28
  model.load_state_dict(checkpoint["model_state_dict"])
 
29
  model.eval()
30
  return model
31
 
32
- # Load the model
33
- model = load_model("checkpoint_quantized.pt")
 
 
 
34
 
35
  # Function to generate text
36
  def generate_text(prompt, max_length=50, temperature=1.0, top_k=50):
37
- # Encode the prompt
38
  input_ids = tokenizer.encode(prompt, return_tensors="pt")
39
 
40
- # Generate text
41
  with torch.no_grad():
42
  output_ids = model.generate(
43
  input_ids,
@@ -47,17 +57,12 @@ def generate_text(prompt, max_length=50, temperature=1.0, top_k=50):
47
  do_sample=True,
48
  )
49
 
50
- # Decode the generated text
51
  generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
52
  return generated_text
53
 
54
  # Gradio Interface
55
- def gradio_generate_text(prompt, max_length, temperature, top_k):
56
- return generate_text(prompt, max_length, temperature, top_k)
57
-
58
- # Create the Gradio app
59
  interface = gr.Interface(
60
- fn=gradio_generate_text,
61
  inputs=[
62
  gr.Textbox(label="Prompt", placeholder="Enter your prompt here..."),
63
  gr.Slider(minimum=10, maximum=200, value=50, label="Max Length"),
@@ -65,8 +70,8 @@ interface = gr.Interface(
65
  gr.Slider(minimum=1, maximum=100, value=50, label="Top-k Sampling"),
66
  ],
67
  outputs=gr.Textbox(label="Generated Text"),
68
- title="Text Generation with SMOL-LM2",
69
- description="Generate text using the SMOL-LM2 model.",
70
  )
71
 
72
  # Launch the app
 
6
  # Load the tokenizer
7
  tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/cosmo2-tokenizer")
8
 
9
+ def load_quantized_model(checkpoint_path):
10
+ # Define the model architecture
 
11
  model = TransformerModel(
12
  vocab_size=49152,
13
  hidden_size=576,
 
19
  rms_norm_eps=1e-5,
20
  hidden_act="silu",
21
  tie_word_embeddings=True,
 
22
  )
23
 
24
+ # Apply dynamic quantization to the embedding layer
25
+ model.embed_tokens = torch.quantization.quantize_dynamic(
26
+ model.embed_tokens, {torch.nn.Embedding}, dtype=torch.qint8
27
+ )
28
+
29
+ # Apply static quantization to the rest of the model
30
+ model.qconfig = torch.quantization.default_qconfig
31
+ model = torch.quantization.prepare(model, inplace=False)
32
+ model = torch.quantization.convert(model, inplace=False)
33
+
34
+ # Load the quantized checkpoint
35
  checkpoint = torch.load(checkpoint_path, map_location="cpu")
36
  model.load_state_dict(checkpoint["model_state_dict"])
37
+
38
  model.eval()
39
  return model
40
 
41
+
42
+ import gradio as gr
43
+
44
+ # Load the quantized model
45
+ model = load_quantized_model("checkpoint_quantized.pt")
46
 
47
  # Function to generate text
48
  def generate_text(prompt, max_length=50, temperature=1.0, top_k=50):
 
49
  input_ids = tokenizer.encode(prompt, return_tensors="pt")
50
 
 
51
  with torch.no_grad():
52
  output_ids = model.generate(
53
  input_ids,
 
57
  do_sample=True,
58
  )
59
 
 
60
  generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
61
  return generated_text
62
 
63
  # Gradio Interface
 
 
 
 
64
  interface = gr.Interface(
65
+ fn=generate_text,
66
  inputs=[
67
  gr.Textbox(label="Prompt", placeholder="Enter your prompt here..."),
68
  gr.Slider(minimum=10, maximum=200, value=50, label="Max Length"),
 
70
  gr.Slider(minimum=1, maximum=100, value=50, label="Top-k Sampling"),
71
  ],
72
  outputs=gr.Textbox(label="Generated Text"),
73
+ title="Text Generation with Quantized SMOL-LM2",
74
+ description="Generate text using a quantized version of the SMOL-LM2 model.",
75
  )
76
 
77
  # Launch the app
model.py CHANGED
@@ -160,9 +160,6 @@ class TransformerBlock(nn.Module):
160
  return x
161
 
162
  class TransformerModel(nn.Module):
163
- """
164
- The full transformer model with multiple layers.
165
- """
166
  def __init__(
167
  self,
168
  vocab_size: int,
@@ -175,7 +172,6 @@ class TransformerModel(nn.Module):
175
  rms_norm_eps: float,
176
  hidden_act: str = "silu",
177
  tie_word_embeddings: bool = True,
178
- pad_token_id: Optional[int] = None,
179
  ):
180
  super().__init__()
181
  self.vocab_size = vocab_size
@@ -183,7 +179,7 @@ class TransformerModel(nn.Module):
183
  self.num_hidden_layers = num_hidden_layers
184
  self.max_position_embeddings = max_position_embeddings
185
 
186
- # Embedding layers
187
  self.embed_tokens = nn.Embedding(vocab_size, hidden_size)
188
  self.embed_positions = nn.Embedding(max_position_embeddings, hidden_size)
189
 
 
160
  return x
161
 
162
  class TransformerModel(nn.Module):
 
 
 
163
  def __init__(
164
  self,
165
  vocab_size: int,
 
172
  rms_norm_eps: float,
173
  hidden_act: str = "silu",
174
  tie_word_embeddings: bool = True,
 
175
  ):
176
  super().__init__()
177
  self.vocab_size = vocab_size
 
179
  self.num_hidden_layers = num_hidden_layers
180
  self.max_position_embeddings = max_position_embeddings
181
 
182
+ # Embedding layers (skip quantization for these)
183
  self.embed_tokens = nn.Embedding(vocab_size, hidden_size)
184
  self.embed_positions = nn.Embedding(max_position_embeddings, hidden_size)
185