frankjosh committed on
Commit
a2df113
·
verified ·
1 Parent(s): 5e9f512

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +43 -78
app.py CHANGED
@@ -1,14 +1,8 @@
1
  # -*- coding: utf-8 -*-
2
- """app.ipynb
3
-
4
- Automatically generated by Colab.
5
-
6
- Original file is located at
7
- https://colab.research.google.com/drive/1deINvEblsMkv9h0gJzuGB4uSamW0DMX5
8
  """
9
 
10
- #pip install streamlit transformers gdown torch pandas numpy
11
-
12
  import warnings
13
  warnings.filterwarnings('ignore')
14
 
@@ -18,13 +12,9 @@ import numpy as np
18
  from sklearn.metrics.pairwise import cosine_similarity
19
  from transformers import AutoTokenizer, AutoModel
20
  import torch
21
- import gdown
22
- from pathlib import Path
23
- from datetime import datetime
24
- import json
25
- import torch.cuda
26
- import os
27
  from datasets import load_dataset
 
28
 
29
  # Configure GPU if available
30
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
@@ -35,10 +25,6 @@ if 'history' not in st.session_state:
35
  if 'feedback' not in st.session_state:
36
  st.session_state.feedback = {}
37
 
38
-
39
-
40
-
41
-
42
  # Step 1: Load Dataset and Precompute Embeddings
43
  @st.cache_resource
44
  def load_data_and_model():
@@ -49,35 +35,35 @@ def load_data_and_model():
49
  # Download and load dataset
50
  dataset = load_dataset("frankjosh/filtered_dataset")
51
  data = pd.DataFrame(dataset['train'])
 
 
 
 
 
 
 
 
 
 
52
  except Exception as e:
53
  st.error(f"Error loading dataset: {str(e)}")
54
  st.stop()
55
 
56
  # Load CodeT5-small model and tokenizer
57
  model_name = "Salesforce/codet5-small"
58
- tokenizer = AutoTokenizer.from_pretrained(model_name)
59
- model = AutoTokenizer.from_pretrained(model_name)
 
60
 
61
- # Combine text fields for embedding generation
62
- data['text'] = data['docstring'].fillna('') + ' ' + data['summary'].fillna('')
63
- return data, tokenizer, model
64
-
65
- @st.cache_resource
66
- def load_model_and_tokenizer():
67
- try:
68
- tokenizer = AutoTokenizer.from_pretrained(model_name)
69
- model = AutoModel.from_pretrained(model_name)
70
- # Move model to GPU if available
71
- if torch.cuda.is_available():
72
- model = model.to('cuda')
73
- model.eval() # Set to evaluation mode
74
- return tokenizer, model
75
- except Exception as e:
76
- st.error(f"Error loading model: {str(e)}")
77
- st.stop()
78
-
79
- tokenizer, model = load_model_and_tokenizer()
80
 
 
81
 
82
  # Define the embedding generation function
83
  @st.cache_data
@@ -92,31 +78,17 @@ def generate_embedding(_model, _tokenizer, text):
92
  embedding = embedding.cpu()
93
  return embedding.numpy()
94
 
95
- # Error handling for generating query embeddings
96
- try:
97
- query_embedding = generate_embedding(model, tokenizer, user_query)
98
- except Exception as e:
99
- st.error(f"Error generating embedding: {str(e)}")
100
- st.stop()
101
-
102
  # Precompute embeddings for dataset
103
  def precompute_embeddings(data, model, tokenizer):
104
- @st.cache_data
105
- def generate_cached_embedding(text):
106
- return generate_embedding(model, tokenizer, text)
107
-
108
- # Apply embedding generation with progress bar
109
- with st.spinner('Generating embeddings... This might take a few minutes on first run...'):
110
- data['embedding'] = data['text'].apply(lambda x: generate_cached_embedding(x))
111
  return data
112
 
113
- # Example usage:
114
- # data = precompute_embeddings(data, model, tokenizer)
115
-
116
  def generate_case_study(repo_data):
117
- """
118
- Generate a concise case study brief from repository data
119
- """
120
  template = f"""
121
  **Project Overview**: {repo_data['summary'][:50]}...
122
 
@@ -124,7 +96,7 @@ def generate_case_study(repo_data):
124
  - Repository contains production-ready {repo_data['path'].split('/')[-1]} implementation
125
  - {repo_data['docstring'][:50]}...
126
 
127
- **Potential Applications**: This repository can be utilized for projects requiring {repo_data['summary'].split()[0:5]}...
128
 
129
  **Implementation Complexity**: {'Medium' if len(repo_data['docstring']) > 500 else 'Low'}
130
 
@@ -132,15 +104,17 @@ def generate_case_study(repo_data):
132
  """
133
  return template[:150] + "..."
134
 
 
135
  def save_feedback(repo_id, feedback_type):
136
- """
137
- Save user feedback for a repository
138
- """
139
  if repo_id not in st.session_state.feedback:
140
  st.session_state.feedback[repo_id] = {'likes': 0, 'dislikes': 0}
141
  st.session_state.feedback[repo_id][feedback_type] += 1
142
 
143
- # Main App
 
 
 
 
144
  st.title("Enhanced Repository Recommender System 🚀")
145
 
146
  # Sidebar for History and Stats
@@ -159,18 +133,9 @@ with st.sidebar:
159
  st.header("📈 Usage Statistics")
160
  st.write(f"Total Searches: {len(st.session_state.history)}")
161
  if st.session_state.feedback:
162
- total_likes = sum(f['likes'] for f in st.session_state.feedback.values())
163
- total_dislikes = sum(f['dislikes'] for f in st.session_state.feedback.values())
164
- st.write(f"Total Likes: {total_likes}")
165
- st.write(f"Total Dislikes: {total_dislikes}")
166
-
167
- # Load resources
168
- @st.cache_resource
169
- def initialize_resources():
170
- data, tokenizer, model = load_data_and_model()
171
- return data, tokenizer, model
172
-
173
- data, tokenizer, model = initialize_resources()
174
 
175
  # Main interface
176
  user_query = st.text_area(
@@ -186,7 +151,7 @@ with col1:
186
  with col2:
187
  top_n = st.selectbox("Number of results:", [3, 5, 10], index=1)
188
 
189
- if search_button and user_query:
190
  with st.spinner("Finding relevant repositories..."):
191
  # Generate query embedding and get recommendations
192
  query_embedding = generate_embedding(model, tokenizer, user_query)
@@ -242,4 +207,4 @@ st.markdown(
242
  GPU Status: {'🟢 Enabled' if torch.cuda.is_available() else '🔴 Disabled'} |
243
  Model: CodeT5-Small
244
  """
245
- )
 
1
  # -*- coding: utf-8 -*-
2
+ """app.py
3
+ Enhanced Repository Recommender System using Streamlit and CodeT5-small
 
 
 
 
4
  """
5
 
 
 
6
  import warnings
7
  warnings.filterwarnings('ignore')
8
 
 
12
  from sklearn.metrics.pairwise import cosine_similarity
13
  from transformers import AutoTokenizer, AutoModel
14
  import torch
15
+ from tqdm import tqdm
 
 
 
 
 
16
  from datasets import load_dataset
17
+ from datetime import datetime
18
 
19
  # Configure GPU if available
20
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
25
  if 'feedback' not in st.session_state:
26
  st.session_state.feedback = {}
27
 
 
 
 
 
28
  # Step 1: Load Dataset and Precompute Embeddings
29
  @st.cache_resource
30
  def load_data_and_model():
 
35
  # Download and load dataset
36
  dataset = load_dataset("frankjosh/filtered_dataset")
37
  data = pd.DataFrame(dataset['train'])
38
+
39
+ # Ensure required columns exist
40
+ required_columns = ['docstring', 'summary']
41
+ for col in required_columns:
42
+ if col not in data.columns:
43
+ st.error(f"Missing required column: {col}")
44
+ st.stop()
45
+
46
+ # Combine text fields for embedding generation
47
+ data['text'] = data['docstring'].fillna('') + ' ' + data['summary'].fillna('')
48
  except Exception as e:
49
  st.error(f"Error loading dataset: {str(e)}")
50
  st.stop()
51
 
52
  # Load CodeT5-small model and tokenizer
53
  model_name = "Salesforce/codet5-small"
54
+ try:
55
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
56
+ model = AutoModel.from_pretrained(model_name)
57
 
58
+ # Move model to GPU if available
59
+ if torch.cuda.is_available():
60
+ model = model.to('cuda')
61
+ model.eval() # Set to evaluation mode
62
+ except Exception as e:
63
+ st.error(f"Error loading model: {str(e)}")
64
+ st.stop()
 
 
 
 
 
 
 
 
 
 
 
 
65
 
66
+ return data, tokenizer, model
67
 
68
  # Define the embedding generation function
69
  @st.cache_data
 
78
  embedding = embedding.cpu()
79
  return embedding.numpy()
80
 
 
 
 
 
 
 
 
81
  # Precompute embeddings for dataset
82
  def precompute_embeddings(data, model, tokenizer):
83
+ embeddings = []
84
+ for text in tqdm(data['text'], desc="Generating embeddings"):
85
+ embedding = generate_embedding(model, tokenizer, text)
86
+ embeddings.append(embedding)
87
+ data['embedding'] = embeddings
 
 
88
  return data
89
 
90
+ # Generate a concise case study brief from repository data
 
 
91
  def generate_case_study(repo_data):
 
 
 
92
  template = f"""
93
  **Project Overview**: {repo_data['summary'][:50]}...
94
 
 
96
  - Repository contains production-ready {repo_data['path'].split('/')[-1]} implementation
97
  - {repo_data['docstring'][:50]}...
98
 
99
+ **Potential Applications**: This repository can be utilized for projects requiring {' '.join(repo_data['summary'].split()[:5])}...
100
 
101
  **Implementation Complexity**: {'Medium' if len(repo_data['docstring']) > 500 else 'Low'}
102
 
 
104
  """
105
  return template[:150] + "..."
106
 
107
+ # Save user feedback for a repository
108
  def save_feedback(repo_id, feedback_type):
 
 
 
109
  if repo_id not in st.session_state.feedback:
110
  st.session_state.feedback[repo_id] = {'likes': 0, 'dislikes': 0}
111
  st.session_state.feedback[repo_id][feedback_type] += 1
112
 
113
+ # Load resources
114
+ data, tokenizer, model = load_data_and_model()
115
+ data = precompute_embeddings(data, model, tokenizer)
116
+
117
+ # Main App Interface
118
  st.title("Enhanced Repository Recommender System 🚀")
119
 
120
  # Sidebar for History and Stats
 
133
  st.header("📈 Usage Statistics")
134
  st.write(f"Total Searches: {len(st.session_state.history)}")
135
  if st.session_state.feedback:
136
+ feedback_df = pd.DataFrame(st.session_state.feedback).T
137
+ feedback_df['Total'] = feedback_df['likes'] + feedback_df['dislikes']
138
+ st.bar_chart(feedback_df[['likes', 'dislikes']])
 
 
 
 
 
 
 
 
 
139
 
140
  # Main interface
141
  user_query = st.text_area(
 
151
  with col2:
152
  top_n = st.selectbox("Number of results:", [3, 5, 10], index=1)
153
 
154
+ if search_button and user_query.strip():
155
  with st.spinner("Finding relevant repositories..."):
156
  # Generate query embedding and get recommendations
157
  query_embedding = generate_embedding(model, tokenizer, user_query)
 
207
  GPU Status: {'🟢 Enabled' if torch.cuda.is_available() else '🔴 Disabled'} |
208
  Model: CodeT5-Small
209
  """
210
+ )