Spaces:

martynattakit
/

CodeSentinel-CWE_Classification

Sleeping

App Files Files Community

martynattakit committed on Jun 10

Commit

8a621d3

verified ·

1 Parent(s): a0046fe

Update app.py

Browse files

Files changed (1) hide show

app.py +18 -60

app.py CHANGED Viewed

@@ -1,17 +1,16 @@
 import torch
 from transformers import RobertaTokenizer, RobertaModel
-from huggingface_hub import hf_hub_download # <--- NEW IMPORT
 import numpy as np
 from scipy.special import softmax
 import gradio as gr
 import re
-import os # <--- NEW IMPORT
 # Define the model class with dimension reduction
 class CodeClassifier(torch.nn.Module):
     def __init__(self, base_model, num_labels=6):
         super(CodeClassifier, self).__init__()
-        self.base = base_model # This will be the microsoft/codebert-base model
         self.reduction = torch.nn.Linear(768, 512)
         self.classifier = torch.nn.Linear(512, num_labels)
@@ -20,62 +19,21 @@ class CodeClassifier(torch.nn.Module):
         reduced = self.reduction(outputs.pooler_output)
         return self.classifier(reduced)
-# --- START OF MODIFIED LOADING LOGIC ---
-# Hugging Face Model ID where your .pt file is located
-HF_MODEL_REPO_ID = 'martynattakit/CodeSentinel-Model'
-# The exact filename of your .pt file in that repository
-HF_MODEL_FILENAME = 'best_model.pt' # <--- CONFIRM THIS IS THE EXACT FILENAME YOU UPLOADED
-# Load the base tokenizer (from Hugging Face Hub as before)
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 tokenizer = RobertaTokenizer.from_pretrained('microsoft/codebert-base')
-# Initialize the base CodeBERT model from Hugging Face (standard download)
-base_codebert_model = RobertaModel.from_pretrained('microsoft/codebert-base')
-# Instantiate your custom CodeClassifier model
-model = CodeClassifier(base_codebert_model, num_labels=6)
-# Download the .pt file from Hugging Face Hub
-print(f"Attempting to download {HF_MODEL_FILENAME} from {HF_MODEL_REPO_ID}...")
-try:
-    # hf_hub_download will download the file and return its local path (which might be in cache)
-    downloaded_model_path = hf_hub_download(
-        repo_id=HF_MODEL_REPO_ID,
-        filename=HF_MODEL_FILENAME,
-        # If your model repo is private, you might need to ensure
-        # that huggingface-cli login has been run in your environment.
-    )
-    print(f"Model downloaded to: {downloaded_model_path}")
-    # Load the state dictionary from the downloaded .pt file
-    state_dict = torch.load(downloaded_model_path, map_location=device)
-    # Handle 'module.' prefix if the model was saved with DataParallel
-    new_state_dict = {}
-    for k, v in state_dict.items():
-        if k.startswith('module.'):
-            new_state_dict[k[7:]] = v # remove 'module.' prefix
-        else:
-            new_state_dict[k] = v
-    model.load_state_dict(new_state_dict)
-    print(f"Successfully loaded model state into CodeClassifier.")
-except Exception as e:
-    print(f"Error during model download or loading: {e}")
-    print("Please ensure:")
-    print(f"1. The repository '{HF_MODEL_REPO_ID}' exists and is public (or you're logged in with `huggingface-cli login`).")
-    print(f"2. The file '{HF_MODEL_FILENAME}' exists within that repository on Hugging Face and is spelled exactly correctly.")
-    # Exiting here is good for deployment environments like Hugging Face Spaces,
-    # as it makes the error clear early on.
-    exit()
-# --- END OF MODIFIED LOADING LOGIC ---
-print("Loaded state dict keys (after loading .pt):", model.state_dict().keys())
-print("Classifier weight shape (after loading .pt):", model.classifier.weight.shape)
 model.eval()
 model.to(device)
@@ -109,11 +67,11 @@ def evaluate_code(code):
     try:
         if len(code) >= 1500000:
             return "Code too large"
         cleaned_code = clean_code(code)
         inputs = tokenizer(cleaned_code, return_tensors="pt", truncation=True, padding=True, max_length=256).to(device)
         print("Input shape:", inputs['input_ids'].shape)
         with torch.no_grad():
             outputs = model(**inputs)
             print("Raw logits:", outputs.cpu().numpy())
@@ -121,7 +79,7 @@ def evaluate_code(code):
             pred = np.argmax(probs, axis=1)[0]
             cwe, description = label_map[pred]
             return f"{cwe} {description}"
     except Exception as e:
         return f"Error during prediction: {str(e)}"

 import torch
 from transformers import RobertaTokenizer, RobertaModel
 import numpy as np
 from scipy.special import softmax
 import gradio as gr
 import re
+from huggingface_hub import hf_hub_download
 # Define the model class with dimension reduction
 class CodeClassifier(torch.nn.Module):
     def __init__(self, base_model, num_labels=6):
         super(CodeClassifier, self).__init__()
+        self.base = base_model
         self.reduction = torch.nn.Linear(768, 512)
         self.classifier = torch.nn.Linear(512, num_labels)
         reduced = self.reduction(outputs.pooler_output)
         return self.classifier(reduced)
+# Load base model and tokenizer from Hugging Face Model Hub
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 tokenizer = RobertaTokenizer.from_pretrained('microsoft/codebert-base')
+base_model = RobertaModel.from_pretrained('microsoft/codebert-base')
+# Initialize the CodeClassifier with the base model
+model = CodeClassifier(base_model)
+# Load the checkpoint from Hugging Face Model Hub
+checkpoint_path = hf_hub_download(repo_id="martynattakit/CodeSentinel-Model", filename="best_model.pt")
+checkpoint = torch.load(checkpoint_path, map_location=device)
+# Load the state dict (unwrapping 'model_state_dict' if present); strict=False tolerates missing or unexpected keys
+model_state = checkpoint.get('model_state_dict', checkpoint)
+model.load_state_dict(model_state, strict=False)
+print("Loaded state dict keys:", model.state_dict().keys())
+print("Classifier weight shape:", model.classifier.weight.shape)
 model.eval()
 model.to(device)
     try:
         if len(code) >= 1500000:
             return "Code too large"
         cleaned_code = clean_code(code)
         inputs = tokenizer(cleaned_code, return_tensors="pt", truncation=True, padding=True, max_length=256).to(device)
         print("Input shape:", inputs['input_ids'].shape)
         with torch.no_grad():
             outputs = model(**inputs)
             print("Raw logits:", outputs.cpu().numpy())
             pred = np.argmax(probs, axis=1)[0]
             cwe, description = label_map[pred]
             return f"{cwe} {description}"
     except Exception as e:
         return f"Error during prediction: {str(e)}"