Spaces:

Tousifahamed
/

smol-lm2-demo

Sleeping

App Files Community

Tousifahamed committed on Jan 22

Commit

57a4ca3

verified ·

1 Parent(s): a222a4b

Upload app.py

Browse files

Files changed (1) hide show

app.py +21 -26

app.py CHANGED Viewed

@@ -1,21 +1,19 @@
 import torch
-torch.backends.quantized.engine = 'fbgemm'  # ensure we use fbgemm
 print("PyTorch version:", torch.__version__)
 print("Supported quantized engines:", torch.backends.quantized.supported_engines)
 import torch.nn as nn
-import torch.quantization  # <--- Use the older namespace for default_qconfig
 from transformers import AutoTokenizer
 from model import TransformerModel
 import gradio as gr
-from torch.ao.quantization.qconfig import float_qparams_weight_only_qconfig
 # Load the tokenizer
 tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/cosmo2-tokenizer")
 def load_quantized_model(checkpoint_path):
     model = TransformerModel(
         vocab_size=49152,
         hidden_size=576,
@@ -29,30 +27,29 @@ def load_quantized_model(checkpoint_path):
         tie_word_embeddings=True,
     )
-    # This qconfig is typically for your other layers
-    default_qconfig = torch.quantization.get_default_qconfig("fbgemm")
-    model.qconfig = default_qconfig
-    # For embeddings, force the specialized config:
-    model.embed_tokens.qconfig = float_qparams_weight_only_qconfig
-    model.embed_positions.qconfig = float_qparams_weight_only_qconfig
-    # Then prepare, calibrate, and convert
-    model = torch.quantization.prepare(model, inplace=False)
-    # Calibration pass here...
-    model = torch.quantization.convert(model, inplace=False)
-    return model
-# Load the quantized model
 model = load_quantized_model("quantized_model.pt")
-# Function to generate text
 def generate_text(prompt, max_length=50, temperature=1.0, top_k=50):
     input_ids = tokenizer.encode(prompt, return_tensors="pt")
     with torch.no_grad():
         output_ids = model.generate(
             input_ids,
@@ -61,11 +58,10 @@ def generate_text(prompt, max_length=50, temperature=1.0, top_k=50):
             top_k=top_k,
             do_sample=True,
         )
     generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
     return generated_text
-# Gradio Interface
 interface = gr.Interface(
     fn=generate_text,
     inputs=[
@@ -76,8 +72,7 @@ interface = gr.Interface(
     ],
     outputs=gr.Textbox(label="Generated Text"),
     title="Text Generation with Quantized SMOL-LM2",
-    description="Generate text using a quantized version of the SMOL-LM2 model.",
 )
-# Launch the app
-interface.launch()

 import torch
+torch.backends.quantized.engine = 'fbgemm'
 print("PyTorch version:", torch.__version__)
 print("Supported quantized engines:", torch.backends.quantized.supported_engines)
 import torch.nn as nn
 from transformers import AutoTokenizer
 from model import TransformerModel
 import gradio as gr
 # Load the tokenizer
 tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/cosmo2-tokenizer")
 def load_quantized_model(checkpoint_path):
+    # 1. Create the float model
     model = TransformerModel(
         vocab_size=49152,
         hidden_size=576,
         tie_word_embeddings=True,
     )
+    # 2. Load the actual checkpoint weights
+    #    If "quantized_model.pt" is a state_dict, do:
+    checkpoint = torch.load(checkpoint_path, map_location="cpu")
+    model.load_state_dict(checkpoint)  # or checkpoint["model_state_dict"] if saved that way
+    model.eval()
+    # 3. Dynamically quantize relevant layers
+    #    For embeddings, we typically use torch.quint8
+    #    so we don't run into any embedding dtype errors
+    quantized_model = torch.quantization.quantize_dynamic(
+        model,
+        {nn.Linear, nn.Embedding},
+        dtype=torch.quint8
+    )
+    return quantized_model
+# 4. Load the quantized model
 model = load_quantized_model("quantized_model.pt")
+# 5. Inference function
 def generate_text(prompt, max_length=50, temperature=1.0, top_k=50):
     input_ids = tokenizer.encode(prompt, return_tensors="pt")
     with torch.no_grad():
         output_ids = model.generate(
             input_ids,
             top_k=top_k,
             do_sample=True,
         )
     generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
     return generated_text
+# 6. Gradio interface
 interface = gr.Interface(
     fn=generate_text,
     inputs=[
     ],
     outputs=gr.Textbox(label="Generated Text"),
     title="Text Generation with Quantized SMOL-LM2",
+    description="Generate text using a dynamically quantized SMOL-LM2 model.",
 )
+interface.launch()