Spaces:

frankjosh
/

repo_recommender

Sleeping

App Files Files Community

frankjosh committed on Jan 15

Commit

ca5a024

verified ·

1 Parent(s): 573ba9d

Update app.py

Browse files

Files changed (1) hide show

app.py +15 -33

app.py CHANGED Viewed

@@ -24,6 +24,7 @@ from datetime import datetime
 import json
 import torch.cuda
 import os
 # Configure GPU if available
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
@@ -35,34 +36,12 @@ if 'feedback' not in st.session_state:
     st.session_state.feedback = {}
-# Configuration
-DATASET_GDRIVE_ID = "1pPYlUEtIA3bi8iLVKqzF-37sHoaOhTZz"  # Replace with your actual file ID
-LOCAL_DATA_DIR = "data"
-DATASET_FILENAME = "filtered_dataset.parquet"
-def download_from_gdrive():
-    """
-    Return the local path of the dataset, fetching it from Google Drive first
-    when no local copy exists.
-
-    The data directory is created if needed. A failed or errored download is
-    reported through Streamlit and the script run is stopped.
-    """
-    os.makedirs(LOCAL_DATA_DIR, exist_ok=True)
-    target = os.path.join(LOCAL_DATA_DIR, DATASET_FILENAME)
-    # Guard clause: an existing local copy means there is nothing to download.
-    if os.path.exists(target):
-        return target
-    try:
-        with st.spinner('Downloading dataset from Google Drive... This might take a few minutes...'):
-            # Direct-download URL built from the configured file ID
-            source_url = f'https://drive.google.com/uc?id={DATASET_GDRIVE_ID}'
-            gdown.download(source_url, target, quiet=False)
-            # The file's presence on disk is what decides success or failure.
-            if not os.path.exists(target):
-                st.error("Failed to download dataset")
-                st.stop()
-            else:
-                st.success("Dataset downloaded successfully!")
-    except Exception as err:
-        st.error(f"Error downloading dataset: {str(err)}")
-        st.stop()
-    return target
 # Step 1: Load Dataset and Precompute Embeddings
 @st.cache_resource
@@ -72,17 +51,20 @@ def load_data_and_model():
     """
     try:
         # Download and load dataset
-        dataset_path = download_from_gdrive()
-        data = pd.read_parquet(dataset_path)
     except Exception as e:
         st.error(f"Error loading dataset: {str(e)}")
         st.stop()
-    # Combine text fields for embedding generation
-    data['text'] = data['docstring'].fillna('') + ' ' + data['summary'].fillna('')
     # Load CodeT5-small model and tokenizer
     model_name = "Salesforce/codet5-small"
     @st.cache_resource
     def load_model_and_tokenizer():

 import json
 import torch.cuda
 import os
+from datasets import load_dataset
 # Configure GPU if available
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
     st.session_state.feedback = {}
+@st.cache_data
+def generate_embedding(model, tokenizer, text):
+    """Embed ``text`` as the mean of the model's last hidden states.
+
+    Args:
+        model: Callable model whose output exposes ``last_hidden_state``.
+        tokenizer: Callable tokenizer that can return PyTorch tensors.
+        text: Input to tokenize; truncated to at most 512 tokens.
+
+    Returns:
+        NumPy array holding the hidden states averaged over dimension 1,
+        with size-1 dimensions squeezed out.
+    """
+    # NOTE(review): st.cache_data hashes every argument; confirm that `model`
+    # and `tokenizer` are hashable by Streamlit, or exclude them from hashing.
+    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
+    # Feed the model tensors on its own device; a model without a `device`
+    # attribute receives the tokenizer output unchanged.
+    model_device = getattr(model, "device", None)
+    if model_device is not None:
+        inputs = {name: tensor.to(model_device) for name, tensor in inputs.items()}
+    with torch.no_grad():
+        outputs = model(**inputs)
+    # .numpy() only works on CPU tensors, so bring the pooled result back first
+    # (a no-op when it is already on the CPU).
+    return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
 # Step 1: Load Dataset and Precompute Embeddings
 @st.cache_resource
     """
     try:
         # Download and load dataset
+        dataset = load_dataset("frankjosh/filtered_dataset")
+        data = pd.DataFrame(dataset['train'])
     except Exception as e:
         st.error(f"Error loading dataset: {str(e)}")
         st.stop()
     # Load CodeT5-small model and tokenizer
     model_name = "Salesforce/codet5-small"
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    model = AutoTokenizer.from_pretrained(model_name)
+    # Combine text fields for embedding generation
+    data['text'] = data['docstring'].fillna('') + ' ' + data['summary'].fillna('')
+    return data, tokenizer, model
     @st.cache_resource
     def load_model_and_tokenizer():