Spaces:

Tousifahamed
/

smol-lm2-demo

Sleeping

Tousifahamed committed on Jan 22

Commit

ad95929

verified ·

1 Parent(s): b789c6c

Upload 2 files

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import torch
 from transformers import AutoTokenizer
 from model import TransformerModel  # Replace with your model class
 import gradio as gr
@@ -21,15 +22,13 @@ def load_quantized_model(checkpoint_path):
         tie_word_embeddings=True,
     )
-    # Apply dynamic quantization to the embedding layer
-    model.embed_tokens = torch.quantization.quantize_dynamic(
-        model.embed_tokens, {torch.nn.Embedding}, dtype=torch.qint8
-    )
     # Apply static quantization to the rest of the model
-    model.qconfig = torch.quantization.default_qconfig
-    model = torch.quantization.prepare(model, inplace=False)
-    model = torch.quantization.convert(model, inplace=False)
     # Load the quantized checkpoint
     checkpoint = torch.load(checkpoint_path, map_location="cpu")
@@ -38,12 +37,19 @@ def load_quantized_model(checkpoint_path):
     model.eval()
     return model
 import gradio as gr
 # Load the quantized model
 model = load_quantized_model("checkpoint_quantized.pt")
 # Function to generate text
 def generate_text(prompt, max_length=50, temperature=1.0, top_k=50):
     input_ids = tokenizer.encode(prompt, return_tensors="pt")

 import torch
+import torch.ao.quantization as quantization
 from transformers import AutoTokenizer
 from model import TransformerModel  # Replace with your model class
 import gradio as gr
         tie_word_embeddings=True,
     )
+    # Set the quantization configuration for the embedding layer
+    model.embed_tokens.qconfig = quantization.float_qparams_weight_only_qconfig
     # Apply static quantization to the rest of the model
+    model.qconfig = quantization.default_qconfig
+    model = quantization.prepare(model, inplace=False)
+    model = quantization.convert(model, inplace=False)
     # Load the quantized checkpoint
     checkpoint = torch.load(checkpoint_path, map_location="cpu")
     model.eval()
     return model
 import gradio as gr
 # Load the quantized model
 model = load_quantized_model("checkpoint_quantized.pt")
+# Set the quantization configuration for the embedding layer
+model.embed_tokens.qconfig = quantization.float_qparams_weight_only_qconfig
+# Apply static quantization to the rest of the model
+model.qconfig = quantization.default_qconfig
+model = quantization.prepare(model, inplace=False)
+model = quantization.convert(model, inplace=False)
 # Function to generate text
 def generate_text(prompt, max_length=50, temperature=1.0, top_k=50):
     input_ids = tokenizer.encode(prompt, return_tensors="pt")