Updated embedding model: replaced the SPECTER2 tokenizer + adapter pipeline (allenai/specter2_aug2023refresh) with the sentence-transformers model m7n/discipline-tuned_specter_2_024.
Browse files
app.py
CHANGED
@@ -21,6 +21,8 @@ import gradio as gr
|
|
21 |
from datetime import datetime
|
22 |
import sys
|
23 |
|
|
|
|
|
24 |
|
25 |
|
26 |
gr.set_static_paths(paths=["static/"])
|
@@ -183,54 +185,16 @@ print(f"Setting up language model: {time.strftime('%Y-%m-%d %H:%M:%S')}")
|
|
183 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
184 |
print(f"Using device: {device}")
|
185 |
|
186 |
-
tokenizer = AutoTokenizer.from_pretrained('allenai/specter2_aug2023refresh_base')
|
187 |
-
model = AutoAdapterModel.from_pretrained('allenai/specter2_aug2023refresh_base')
|
|
|
|
|
188 |
|
189 |
|
190 |
@spaces.GPU(duration=60)
|
191 |
def create_embeddings(texts_to_embedd):
|
192 |
-
# Set up the device
|
193 |
-
|
194 |
-
|
195 |
-
print(len(texts_to_embedd))
|
196 |
-
|
197 |
-
# Load the proximity adapter and activate it
|
198 |
-
model.load_adapter("allenai/specter2_aug2023refresh", source="hf", load_as="proximity", set_active=True)
|
199 |
-
model.set_active_adapters("proximity")
|
200 |
|
201 |
-
model.
|
202 |
-
|
203 |
-
def batch_generator(data, batch_size):
|
204 |
-
"""Yield consecutive batches of data."""
|
205 |
-
for i in range(0, len(data), batch_size):
|
206 |
-
yield data[i:i + batch_size]
|
207 |
-
|
208 |
-
|
209 |
-
def encode_texts(texts, device, batch_size=16):
|
210 |
-
"""Process texts in batches and return their embeddings."""
|
211 |
-
model.eval()
|
212 |
-
with torch.no_grad():
|
213 |
-
all_embeddings = []
|
214 |
-
count = 0
|
215 |
-
for batch in tqdm(batch_generator(texts, batch_size)):
|
216 |
-
inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt", max_length=512).to(device)
|
217 |
-
outputs = model(**inputs)
|
218 |
-
embeddings = outputs.last_hidden_state[:, 0, :] # Taking the [CLS] token representation
|
219 |
-
|
220 |
-
all_embeddings.append(embeddings.cpu()) # Move to CPU to free GPU memory
|
221 |
-
#torch.mps.empty_cache() # Clear cache to free up memory
|
222 |
-
if count == 100:
|
223 |
-
#torch.mps.empty_cache()
|
224 |
-
torch.cuda.empty_cache()
|
225 |
-
count = 0
|
226 |
-
|
227 |
-
count +=1
|
228 |
-
|
229 |
-
all_embeddings = torch.cat(all_embeddings, dim=0)
|
230 |
-
return all_embeddings
|
231 |
-
|
232 |
-
# Concatenate title and abstract
|
233 |
-
embeddings = encode_texts(texts_to_embedd, device, batch_size=32).cpu().numpy() # Process texts in batches of 10
|
234 |
|
235 |
return embeddings
|
236 |
|
|
|
21 |
from datetime import datetime
|
22 |
import sys
|
23 |
|
24 |
+
from sentence_transformers import SentenceTransformer
|
25 |
+
|
26 |
|
27 |
|
28 |
gr.set_static_paths(paths=["static/"])
|
|
|
185 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
186 |
print(f"Using device: {device}")
|
187 |
|
188 |
+
#tokenizer = AutoTokenizer.from_pretrained('allenai/specter2_aug2023refresh_base')
|
189 |
+
#model = AutoAdapterModel.from_pretrained('allenai/specter2_aug2023refresh_base')
|
190 |
+
|
191 |
+
model = SentenceTransformer("m7n/discipline-tuned_specter_2_024")
|
192 |
|
193 |
|
194 |
@spaces.GPU(duration=60)
def create_embeddings(texts_to_embedd):
    """Embed a list of texts with the module-level SentenceTransformer model.

    Runs on a ZeroGPU-allocated device for at most 60 seconds (see the
    `spaces.GPU` decorator).

    Args:
        texts_to_embedd: iterable of strings to encode.

    Returns:
        The embeddings as produced by ``model.encode`` (one vector per input
        text; a numpy array by sentence-transformers' default settings).
    """
    # Batch size 32 matches the previous pipeline; the progress bar is
    # useful when embedding long result lists interactively.
    return model.encode(texts_to_embedd, show_progress_bar=True, batch_size=32)