Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -52,10 +52,10 @@ def load_data():
|
|
52 |
return data
|
53 |
|
54 |
@st.cache_resource
|
55 |
-
def precompute_embeddings(data: pd.DataFrame, _tokenizer, model, batch_size=BATCH_SIZE):
|
56 |
"""
|
57 |
Precompute embeddings for repository metadata to optimize query performance.
|
58 |
-
The tokenizer is excluded from caching as it is unhashable.
|
59 |
"""
|
60 |
class TextDataset(Dataset):
|
61 |
def __init__(self, texts: List[str], tokenizer, max_length=512):
|
@@ -108,7 +108,7 @@ def precompute_embeddings(data: pd.DataFrame, _tokenizer, model, batch_size=BATC
|
|
108 |
|
109 |
embeddings = []
|
110 |
for batch in dataloader:
|
111 |
-
batch_embeddings = generate_embeddings_batch(model, batch, device)
|
112 |
embeddings.extend(batch_embeddings)
|
113 |
|
114 |
data['embedding'] = embeddings
|
|
|
52 |
return data
|
53 |
|
54 |
@st.cache_resource
|
55 |
+
def precompute_embeddings(data: pd.DataFrame, _tokenizer, _model, batch_size=BATCH_SIZE):
|
56 |
"""
|
57 |
Precompute embeddings for repository metadata to optimize query performance.
|
58 |
+
The tokenizer and model are excluded from caching as they are unhashable.
|
59 |
"""
|
60 |
class TextDataset(Dataset):
|
61 |
def __init__(self, texts: List[str], tokenizer, max_length=512):
|
|
|
108 |
|
109 |
embeddings = []
|
110 |
for batch in dataloader:
|
111 |
+
batch_embeddings = generate_embeddings_batch(_model, batch, device)
|
112 |
embeddings.extend(batch_embeddings)
|
113 |
|
114 |
data['embedding'] = embeddings
|