frankjosh committed on
Commit
7160c8d
·
verified ·
1 Parent(s): 2145d76

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +80 -53
app.py CHANGED
@@ -23,6 +23,7 @@ from pathlib import Path
23
  from datetime import datetime
24
  import json
25
  import torch.cuda
 
26
 
27
  # Configure GPU if available
28
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
@@ -33,67 +34,93 @@ if 'history' not in st.session_state:
33
  if 'feedback' not in st.session_state:
34
  st.session_state.feedback = {}
35
 
36
- # Step 1: Optimized Model Loading
37
- @st.cache_resource
38
- def load_model_and_tokenizer():
39
- """
40
- Optimized model loading with GPU support and model quantization
41
- """
42
- model_name = "Salesforce/codet5-small"
43
-
44
- # Load tokenizer
45
- tokenizer = AutoTokenizer.from_pretrained(model_name)
46
 
47
- # Load model with optimizations
48
- model = AutoModel.from_pretrained(
49
- model_name,
50
- torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
51
- low_cpu_mem_usage=True
52
- )
53
 
54
- # Move model to GPU if available
55
- model = model.to(device)
56
-
57
- # Set to evaluation mode
58
- model.eval()
59
-
60
- return tokenizer, model
61
-
62
- # Step 2: Optimized Dataset Loading
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  @st.cache_resource
64
- def load_data():
65
  """
66
- Load and prepare dataset with progress tracking
67
  """
68
- Path("data").mkdir(exist_ok=True)
69
- dataset_path = "/content/drive/MyDrive/practice_ml/filtered_dataset.parquet"
 
 
 
 
 
 
 
 
70
 
71
- if not Path(dataset_path).exists():
72
- with st.spinner('Downloading dataset... This might take a few minutes...'):
73
- url = "https://drive.google.com/drive/folders/1dphd3vDKV46GwWKW5uo-MBl0GWGyCWUs?usp=drive_link"
74
- gdown.download(url, dataset_path, quiet=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
- data = pd.read_parquet(dataset_path)
77
- data['text'] = data['docstring'].fillna('') + ' ' + data['summary'].fillna('')
78
- return data
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
 
80
- # Step 3: Optimized Embedding Generation
81
- @st.cache_data
82
- def generate_embedding(_model, tokenizer, text):
83
- """
84
- Generate embeddings with optimized batch processing
85
- """
86
- inputs = tokenizer(
87
- text,
88
- return_tensors="pt",
89
- padding=True,
90
- truncation=True,
91
- max_length=512
92
- ).to(device)
93
-
94
- with torch.no_grad():
95
- outputs = _model.encoder(**inputs)
96
- return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
97
 
98
  def generate_case_study(repo_data):
99
  """
 
23
  from datetime import datetime
24
  import json
25
  import torch.cuda
26
+ import os
27
 
28
  # Configure GPU if available
29
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
34
  if 'feedback' not in st.session_state:
35
  st.session_state.feedback = {}
36
 
 
 
 
 
 
 
 
 
 
 
37
 
38
+ # Configuration
39
+ DATASET_GDRIVE_ID = "1pPYlUEtIA3bi8iLVKqzF-37sHoaOhTZz" # Replace with your actual file ID
40
+ LOCAL_DATA_DIR = "data"
41
+ DATASET_FILENAME = "filtered_dataset.parquet"
 
 
42
 
43
def download_from_gdrive():
    """
    Ensure the dataset file exists locally, downloading it from Google Drive if needed.

    The file lives at ``LOCAL_DATA_DIR/DATASET_FILENAME``. When it is already
    present it is reused and nothing is downloaded. Otherwise it is fetched
    with ``gdown`` using ``DATASET_GDRIVE_ID``.

    On any failure an error message is shown in the Streamlit UI, any
    incomplete output file is removed (so the next run retries instead of
    reusing a broken file), and the script run is halted via ``st.stop()``.

    Returns:
        str: Path to the local dataset file.
    """
    os.makedirs(LOCAL_DATA_DIR, exist_ok=True)
    local_path = os.path.join(LOCAL_DATA_DIR, DATASET_FILENAME)

    # Reuse a previously downloaded copy.
    if os.path.exists(local_path):
        return local_path

    def _discard_partial_file():
        # Remove whatever a failed download left behind; the existence check
        # above would otherwise treat it as a valid dataset on the next run.
        try:
            if os.path.exists(local_path):
                os.remove(local_path)
        except OSError:
            pass  # best effort: the download error is what gets reported

    downloaded = None
    try:
        with st.spinner('Downloading dataset from Google Drive... This might take a few minutes...'):
            # Create direct download URL
            url = f'https://drive.google.com/uc?id={DATASET_GDRIVE_ID}'
            # gdown returns the output path on success; some versions return
            # None (instead of raising) when the file cannot be retrieved.
            downloaded = gdown.download(url, local_path, quiet=False)
    except Exception as e:
        _discard_partial_file()
        st.error(f"Error downloading dataset: {e}")
        st.stop()

    download_ok = (
        downloaded is not None
        and os.path.isfile(local_path)
        and os.path.getsize(local_path) > 0
    )
    if not download_ok:
        _discard_partial_file()
        st.error("Failed to download dataset")
        st.stop()

    st.success("Dataset downloaded successfully!")
    return local_path
66
+
67
+ # Step 1: Load Dataset and Precompute Embeddings
68
  @st.cache_resource
69
def load_data_and_model():
    """
    Load the dataset, load the CodeT5-small model and tokenizer, and precompute embeddings.

    Steps:
      1. Obtain the parquet dataset through ``download_from_gdrive()`` and read it.
      2. Build a ``text`` column by joining ``docstring`` and ``summary`` with a
         space (missing values are treated as empty strings).
      3. Load ``Salesforce/codet5-small``, move it to the module-level
         ``device`` and switch it to evaluation mode.
      4. Add an ``embedding`` column holding, for every row, the mean of the
         encoder's last hidden state over the token dimension for that row's
         ``text``, as a NumPy array.

    Any failure while loading the dataset or the model is reported with
    ``st.error`` and the script run is halted with ``st.stop()``.

    Returns:
        tuple: ``(data, tokenizer, model)`` where ``data`` is the DataFrame
        including the ``text`` and ``embedding`` columns.
    """
    try:
        # Download and load dataset
        dataset_path = download_from_gdrive()
        data = pd.read_parquet(dataset_path)
    except Exception as e:
        st.error(f"Error loading dataset: {e}")
        st.stop()

    # Fail with the same user-facing error style as above rather than a raw
    # KeyError when the dataset does not have the expected schema.
    missing_columns = [col for col in ('docstring', 'summary') if col not in data.columns]
    if missing_columns:
        st.error(f"Error loading dataset: missing column(s): {', '.join(missing_columns)}")
        st.stop()

    # Combine text fields for embedding generation
    data['text'] = data['docstring'].fillna('') + ' ' + data['summary'].fillna('')

    # Load CodeT5-small model and tokenizer
    model_name = "Salesforce/codet5-small"

    @st.cache_resource
    def load_model_and_tokenizer():
        try:
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            model = AutoModel.from_pretrained(model_name)
        except Exception as e:
            st.error(f"Error loading model: {e}")
            st.stop()
        # Use the module-level device (GPU if available, otherwise CPU)
        model = model.to(device)
        model.eval()  # Set to evaluation mode
        return tokenizer, model

    tokenizer, model = load_model_and_tokenizer()

    # Precompute embeddings on the configured device
    @st.cache_data
    def generate_embedding(text):
        inputs = tokenizer(
            text,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        ).to(device)
        with torch.no_grad():
            outputs = model.encoder(**inputs)
        # Mean-pool over the token dimension, drop the batch dimension, and
        # bring the result back to the CPU (a no-op when already there) so it
        # can be converted to NumPy.
        embedding = outputs.last_hidden_state.mean(dim=1).squeeze()
        return embedding.cpu().numpy()

    # Generate embeddings while showing a spinner
    with st.spinner('Generating embeddings... This might take a few minutes on first run...'):
        data['embedding'] = data['text'].apply(generate_embedding)

    return data, tokenizer, model
123
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
 
125
  def generate_case_study(repo_data):
126
  """