frankjosh committed on
Commit
73463ed
·
verified ·
1 Parent(s): 62db363

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -1
app.py CHANGED
@@ -44,13 +44,19 @@ def load_model_and_tokenizer():
44
  def load_data():
45
  """
46
  Load and sample the dataset from Hugging Face.
47
- Returns a DataFrame with a fixed subset of repositories.
48
  """
49
  dataset = load_dataset("frankjosh/filtered_dataset")
50
  data = pd.DataFrame(dataset['train'])
 
 
51
  data = data.sample(n=min(SUBSET_SIZE, len(data)), random_state=42).reset_index(drop=True)
 
 
 
52
  return data
53
 
 
54
  @st.cache_resource
55
  def precompute_embeddings(data: pd.DataFrame, _tokenizer, _model, batch_size=BATCH_SIZE):
56
  """
 
44
  def load_data():
45
  """
46
  Load and sample the dataset from Hugging Face.
47
+ Ensures the 'text' column is created for embedding precomputation.
48
  """
49
  dataset = load_dataset("frankjosh/filtered_dataset")
50
  data = pd.DataFrame(dataset['train'])
51
+
52
+ # Take a random subset of data
53
  data = data.sample(n=min(SUBSET_SIZE, len(data)), random_state=42).reset_index(drop=True)
54
+
55
+ # Create a 'text' column by combining relevant fields
56
+ data['text'] = data['docstring'].fillna('') + ' ' + data['summary'].fillna('')
57
  return data
58
 
59
+
60
  @st.cache_resource
61
  def precompute_embeddings(data: pd.DataFrame, _tokenizer, _model, batch_size=BATCH_SIZE):
62
  """