Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -44,13 +44,19 @@ def load_model_and_tokenizer():
|
|
44 |
def load_data():
|
45 |
"""
|
46 |
Load and sample the dataset from Hugging Face.
|
47 |
-
|
48 |
"""
|
49 |
dataset = load_dataset("frankjosh/filtered_dataset")
|
50 |
data = pd.DataFrame(dataset['train'])
|
|
|
|
|
51 |
data = data.sample(n=min(SUBSET_SIZE, len(data)), random_state=42).reset_index(drop=True)
|
|
|
|
|
|
|
52 |
return data
|
53 |
|
|
|
54 |
@st.cache_resource
|
55 |
def precompute_embeddings(data: pd.DataFrame, _tokenizer, _model, batch_size=BATCH_SIZE):
|
56 |
"""
|
|
|
44 |
def load_data():
|
45 |
"""
|
46 |
Load and sample the dataset from Hugging Face.
|
47 |
+
Ensures the 'text' column is created for embedding precomputation.
|
48 |
"""
|
49 |
dataset = load_dataset("frankjosh/filtered_dataset")
|
50 |
data = pd.DataFrame(dataset['train'])
|
51 |
+
|
52 |
+
# Take a random subset of data
|
53 |
data = data.sample(n=min(SUBSET_SIZE, len(data)), random_state=42).reset_index(drop=True)
|
54 |
+
|
55 |
+
# Create a 'text' column by combining relevant fields
|
56 |
+
data['text'] = data['docstring'].fillna('') + ' ' + data['summary'].fillna('')
|
57 |
return data
|
58 |
|
59 |
+
|
60 |
@st.cache_resource
|
61 |
def precompute_embeddings(data: pd.DataFrame, _tokenizer, _model, batch_size=BATCH_SIZE):
|
62 |
"""
|