Spaces:

Tousifahamed
/

smol-lm2-demo

Sleeping

App Files Community

Tousifahamed committed on Jan 22

Commit

b789c6c

verified ·

1 Parent(s): 4d798b4

Upload 2 files

Browse files

Files changed (2) hide show

app.py +22 -17
model.py +1 -5

app.py CHANGED Viewed

@@ -6,9 +6,8 @@ import gradio as gr
 # Load the tokenizer
 tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/cosmo2-tokenizer")
-# Load the model
-def load_model(checkpoint_path):
-    # Initialize the model (replace with your model's configuration)
     model = TransformerModel(
         vocab_size=49152,
         hidden_size=576,
@@ -20,24 +19,35 @@ def load_model(checkpoint_path):
         rms_norm_eps=1e-5,
         hidden_act="silu",
         tie_word_embeddings=True,
-        pad_token_id=tokenizer.pad_token_id,
     )
-    # Load the checkpoint
     checkpoint = torch.load(checkpoint_path, map_location="cpu")
     model.load_state_dict(checkpoint["model_state_dict"])
     model.eval()
     return model
-# Load the model
-model = load_model("checkpoint_quantized.pt")
 # Function to generate text
 def generate_text(prompt, max_length=50, temperature=1.0, top_k=50):
-    # Encode the prompt
     input_ids = tokenizer.encode(prompt, return_tensors="pt")
-    # Generate text
     with torch.no_grad():
         output_ids = model.generate(
             input_ids,
@@ -47,17 +57,12 @@ def generate_text(prompt, max_length=50, temperature=1.0, top_k=50):
             do_sample=True,
         )
-    # Decode the generated text
     generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
     return generated_text
 # Gradio Interface
-def gradio_generate_text(prompt, max_length, temperature, top_k):
-    return generate_text(prompt, max_length, temperature, top_k)
-# Create the Gradio app
 interface = gr.Interface(
-    fn=gradio_generate_text,
     inputs=[
         gr.Textbox(label="Prompt", placeholder="Enter your prompt here..."),
         gr.Slider(minimum=10, maximum=200, value=50, label="Max Length"),
@@ -65,8 +70,8 @@ interface = gr.Interface(
         gr.Slider(minimum=1, maximum=100, value=50, label="Top-k Sampling"),
     ],
     outputs=gr.Textbox(label="Generated Text"),
-    title="Text Generation with SMOL-LM2",
-    description="Generate text using the SMOL-LM2 model.",
 )
 # Launch the app

 # Load the tokenizer
 tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/cosmo2-tokenizer")
+def load_quantized_model(checkpoint_path):
+    # Define the model architecture
     model = TransformerModel(
         vocab_size=49152,
         hidden_size=576,
         rms_norm_eps=1e-5,
         hidden_act="silu",
         tie_word_embeddings=True,
     )
+    # Apply dynamic quantization to the embedding layer
+    model.embed_tokens = torch.quantization.quantize_dynamic(
+        model.embed_tokens, {torch.nn.Embedding}, dtype=torch.qint8
+    )
+    # Apply static quantization to the rest of the model
+    model.qconfig = torch.quantization.default_qconfig
+    model = torch.quantization.prepare(model, inplace=False)
+    model = torch.quantization.convert(model, inplace=False)
+    # Load the quantized checkpoint
     checkpoint = torch.load(checkpoint_path, map_location="cpu")
     model.load_state_dict(checkpoint["model_state_dict"])
     model.eval()
     return model
+import gradio as gr
+# Load the quantized model
+model = load_quantized_model("checkpoint_quantized.pt")
 # Function to generate text
 def generate_text(prompt, max_length=50, temperature=1.0, top_k=50):
     input_ids = tokenizer.encode(prompt, return_tensors="pt")
     with torch.no_grad():
         output_ids = model.generate(
             input_ids,
             do_sample=True,
         )
     generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
     return generated_text
 # Gradio Interface
 interface = gr.Interface(
+    fn=generate_text,
     inputs=[
         gr.Textbox(label="Prompt", placeholder="Enter your prompt here..."),
         gr.Slider(minimum=10, maximum=200, value=50, label="Max Length"),
         gr.Slider(minimum=1, maximum=100, value=50, label="Top-k Sampling"),
     ],
     outputs=gr.Textbox(label="Generated Text"),
+    title="Text Generation with Quantized SMOL-LM2",
+    description="Generate text using a quantized version of the SMOL-LM2 model.",
 )
 # Launch the app

model.py CHANGED Viewed

@@ -160,9 +160,6 @@ class TransformerBlock(nn.Module):
         return x
 class TransformerModel(nn.Module):
-    """
-    The full transformer model with multiple layers.
-    """
     def __init__(
         self,
         vocab_size: int,
@@ -175,7 +172,6 @@ class TransformerModel(nn.Module):
         rms_norm_eps: float,
         hidden_act: str = "silu",
         tie_word_embeddings: bool = True,
-        pad_token_id: Optional[int] = None,
     ):
         super().__init__()
         self.vocab_size = vocab_size
@@ -183,7 +179,7 @@ class TransformerModel(nn.Module):
         self.num_hidden_layers = num_hidden_layers
         self.max_position_embeddings = max_position_embeddings
-        # Embedding layers
         self.embed_tokens = nn.Embedding(vocab_size, hidden_size)
         self.embed_positions = nn.Embedding(max_position_embeddings, hidden_size)

         return x
 class TransformerModel(nn.Module):
     def __init__(
         self,
         vocab_size: int,
         rms_norm_eps: float,
         hidden_act: str = "silu",
         tie_word_embeddings: bool = True,
     ):
         super().__init__()
         self.vocab_size = vocab_size
         self.num_hidden_layers = num_hidden_layers
         self.max_position_embeddings = max_position_embeddings
+        # Embedding layers (skip quantization for these)
         self.embed_tokens = nn.Embedding(vocab_size, hidden_size)
         self.embed_positions = nn.Embedding(max_position_embeddings, hidden_size)