m7n committed on
Commit
a2bb48a
·
verified ·
1 Parent(s): 217bb89

updated embedding model

Browse files
Files changed (1) hide show
  1. app.py +7 -43
app.py CHANGED
@@ -21,6 +21,8 @@ import gradio as gr
21
  from datetime import datetime
22
  import sys
23
 
 
 
24
 
25
 
26
  gr.set_static_paths(paths=["static/"])
@@ -183,54 +185,16 @@ print(f"Setting up language model: {time.strftime('%Y-%m-%d %H:%M:%S')}")
183
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
184
  print(f"Using device: {device}")
185
 
186
- tokenizer = AutoTokenizer.from_pretrained('allenai/specter2_aug2023refresh_base')
187
- model = AutoAdapterModel.from_pretrained('allenai/specter2_aug2023refresh_base')
 
 
188
 
189
 
190
  @spaces.GPU(duration=60)
191
  def create_embeddings(texts_to_embedd):
192
- # Set up the device
193
-
194
-
195
- print(len(texts_to_embedd))
196
-
197
- # Load the proximity adapter and activate it
198
- model.load_adapter("allenai/specter2_aug2023refresh", source="hf", load_as="proximity", set_active=True)
199
- model.set_active_adapters("proximity")
200
 
201
- model.to(device)
202
-
203
- def batch_generator(data, batch_size):
204
- """Yield consecutive batches of data."""
205
- for i in range(0, len(data), batch_size):
206
- yield data[i:i + batch_size]
207
-
208
-
209
- def encode_texts(texts, device, batch_size=16):
210
- """Process texts in batches and return their embeddings."""
211
- model.eval()
212
- with torch.no_grad():
213
- all_embeddings = []
214
- count = 0
215
- for batch in tqdm(batch_generator(texts, batch_size)):
216
- inputs = tokenizer(batch, padding=True, truncation=True, return_tensors="pt", max_length=512).to(device)
217
- outputs = model(**inputs)
218
- embeddings = outputs.last_hidden_state[:, 0, :] # Taking the [CLS] token representation
219
-
220
- all_embeddings.append(embeddings.cpu()) # Move to CPU to free GPU memory
221
- #torch.mps.empty_cache() # Clear cache to free up memory
222
- if count == 100:
223
- #torch.mps.empty_cache()
224
- torch.cuda.empty_cache()
225
- count = 0
226
-
227
- count +=1
228
-
229
- all_embeddings = torch.cat(all_embeddings, dim=0)
230
- return all_embeddings
231
-
232
- # Concatenate title and abstract
233
- embeddings = encode_texts(texts_to_embedd, device, batch_size=32).cpu().numpy() # Process texts in batches of 10
234
 
235
  return embeddings
236
 
 
21
  from datetime import datetime
22
  import sys
23
 
24
+ from sentence_transformers import SentenceTransformer
25
+
26
 
27
 
28
  gr.set_static_paths(paths=["static/"])
 
185
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
186
  print(f"Using device: {device}")
187
 
188
+ #tokenizer = AutoTokenizer.from_pretrained('allenai/specter2_aug2023refresh_base')
189
+ #model = AutoAdapterModel.from_pretrained('allenai/specter2_aug2023refresh_base')
190
+
191
+ model = SentenceTransformer("m7n/discipline-tuned_specter_2_024")
192
 
193
 
194
  @spaces.GPU(duration=60)
195
  def create_embeddings(texts_to_embedd):
 
 
 
 
 
 
 
 
196
 
197
+ embeddings = model.encode(texts_to_embedd,show_progress_bar=True,batch_size=32)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
198
 
199
  return embeddings
200