Spaces:

frankjosh
/

repo_recommender

Sleeping

App Files Files Community

frankjosh committed on Jan 15

Commit

7160c8d

verified ·

1 Parent(s): 2145d76

Update app.py

Browse files

Files changed (1) hide show

app.py +80 -53

app.py CHANGED Viewed

@@ -23,6 +23,7 @@ from pathlib import Path
 from datetime import datetime
 import json
 import torch.cuda
 # Configure GPU if available
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
@@ -33,67 +34,93 @@ if 'history' not in st.session_state:
 if 'feedback' not in st.session_state:
     st.session_state.feedback = {}
-# Step 1: Optimized Model Loading
-@st.cache_resource
-def load_model_and_tokenizer():
-    """
-    Optimized model loading with GPU support and model quantization
-    """
-    model_name = "Salesforce/codet5-small"
-    # Load tokenizer
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    # Load model with optimizations
-    model = AutoModel.from_pretrained(
-        model_name,
-        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-        low_cpu_mem_usage=True
-    )
-    # Move model to GPU if available
-    model = model.to(device)
-    # Set to evaluation mode
-    model.eval()
-    return tokenizer, model
-# Step 2: Optimized Dataset Loading
 @st.cache_resource
-def load_data():
     """
-    Load and prepare dataset with progress tracking
     """
-    Path("data").mkdir(exist_ok=True)
-    dataset_path = "/content/drive/MyDrive/practice_ml/filtered_dataset.parquet"
-    if not Path(dataset_path).exists():
-        with st.spinner('Downloading dataset... This might take a few minutes...'):
-            url = "https://drive.google.com/drive/folders/1dphd3vDKV46GwWKW5uo-MBl0GWGyCWUs?usp=drive_link"
-            gdown.download(url, dataset_path, quiet=False)
-    data = pd.read_parquet(dataset_path)
-    data['text'] = data['docstring'].fillna('') + ' ' + data['summary'].fillna('')
-    return data
-# Step 3: Optimized Embedding Generation
-@st.cache_data
-def generate_embedding(_model, tokenizer, text):
-    """
-    Generate embeddings with optimized batch processing
-    """
-    inputs = tokenizer(
-        text,
-        return_tensors="pt",
-        padding=True,
-        truncation=True,
-        max_length=512
-    ).to(device)
-    with torch.no_grad():
-        outputs = _model.encoder(**inputs)
-    return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
 def generate_case_study(repo_data):
     """

 from datetime import datetime
 import json
 import torch.cuda
+import os
 # Configure GPU if available
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 if 'feedback' not in st.session_state:
     st.session_state.feedback = {}
+# Configuration
+DATASET_GDRIVE_ID = "1pPYlUEtIA3bi8iLVKqzF-37sHoaOhTZz"  # Replace with your actual file ID
+LOCAL_DATA_DIR = "data"
+DATASET_FILENAME = "filtered_dataset.parquet"
+def download_from_gdrive():
+    """
+    Download dataset from Google Drive with proper error handling
+    """
+    os.makedirs(LOCAL_DATA_DIR, exist_ok=True)
+    local_path = os.path.join(LOCAL_DATA_DIR, DATASET_FILENAME)
+    if not os.path.exists(local_path):
+        try:
+            with st.spinner('Downloading dataset from Google Drive... This might take a few minutes...'):
+                # Create direct download URL
+                url = f'https://drive.google.com/uc?id={DATASET_GDRIVE_ID}'
+                # Download file
+                gdown.download(url, local_path, quiet=False)
+                if os.path.exists(local_path):
+                    st.success("Dataset downloaded successfully!")
+                else:
+                    st.error("Failed to download dataset")
+                    st.stop()
+        except Exception as e:
+            st.error(f"Error downloading dataset: {str(e)}")
+            st.stop()
+    return local_path
+# Step 1: Load Dataset and Precompute Embeddings
 @st.cache_resource
+def load_data_and_model():
     """
+    Load the dataset and precompute embeddings. Load the CodeT5-small model and tokenizer.
     """
+    try:
+        # Download and load dataset
+        dataset_path = download_from_gdrive()
+        data = pd.read_parquet(dataset_path)
+    except Exception as e:
+        st.error(f"Error loading dataset: {str(e)}")
+        st.stop()
+    # Combine text fields for embedding generation
+    data['text'] = data['docstring'].fillna('') + ' ' + data['summary'].fillna('')
+    # Load CodeT5-small model and tokenizer
+    model_name = "Salesforce/codet5-small"
+    @st.cache_resource
+    def load_model_and_tokenizer():
+        try:
+            tokenizer = AutoTokenizer.from_pretrained(model_name)
+            model = AutoModel.from_pretrained(model_name)
+            # Move model to GPU if available
+            if torch.cuda.is_available():
+                model = model.to('cuda')
+            model.eval()  # Set to evaluation mode
+            return tokenizer, model
+        except Exception as e:
+            st.error(f"Error loading model: {str(e)}")
+            st.stop()
+    tokenizer, model = load_model_and_tokenizer()
+    # Precompute embeddings with GPU support
+    @st.cache_data
+    def generate_embedding(text):
+        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
+        # Move inputs to GPU if available
+        if torch.cuda.is_available():
+            inputs = {k: v.to('cuda') for k, v in inputs.items()}
+        with torch.no_grad():
+            outputs = model.encoder(**inputs)
+        # Move output back to CPU if needed
+        embedding = outputs.last_hidden_state.mean(dim=1).squeeze()
+        if torch.cuda.is_available():
+            embedding = embedding.cpu()
+        return embedding.numpy()
+    # Generate embeddings with progress bar
+    with st.spinner('Generating embeddings... This might take a few minutes on first run...'):
+        data['embedding'] = data['text'].apply(lambda x: generate_embedding(x))
+    return data, tokenizer, model
 def generate_case_study(repo_data):
     """