from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
import faiss
import torch
import numpy as np
import time
def log(message):
    """Lightweight console logger for progress messages."""
    print(message)

# Load datasets dynamically
datasets = {
    "sales": load_dataset("goendalf666/sales-conversations", trust_remote_code=True),
    "blended": load_dataset("blended_skill_talk", trust_remote_code=True),
    "dialog": load_dataset("daily_dialog", trust_remote_code=True),
    "multiwoz": load_dataset("multi_woz_v22", trust_remote_code=True),
}
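# Illustrative sanity check (commented out; assumes each load_dataset() call above
# returns a DatasetDict with a "train" split): peek at one raw record to confirm
# the field names used in create_embeddings() below.
# print(datasets["dialog"]["train"][0])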
# Load MiniLM model for embeddings
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
def embed_text(texts):
    """Generate embeddings for a batch of texts."""
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
    with torch.no_grad():
        embeddings = model(**inputs).last_hidden_state.mean(dim=1).cpu().numpy()
    return embeddings
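# Illustrative usage (commented out; not part of the pipeline itself): all-MiniLM-L6-v2
# produces 384-dimensional vectors, so a two-text batch yields an array of shape (2, 384).
# example_vectors = embed_text(["Hello there!", "How can I help you today?"])
# print(example_vectors.shape)  # expected: (2, 384)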
# Batch processing function
def create_embeddings(dataset_name, dataset, batch_size=100):
    log(f"Creating embeddings for {dataset_name}...")

    # Extract text based on dataset structure
    if dataset_name == "sales":
        # Skip empty/None columns so the join does not fail on short conversations
        texts = [" ".join(str(v) for v in row.values() if v) for row in dataset["train"]]
    elif dataset_name == "blended":
        texts = [" ".join(row["free_messages"] + row["guided_messages"]) for row in dataset["train"]]
    elif dataset_name == "dialog":
        texts = [" ".join(row["dialog"]) for row in dataset["train"]]
    elif dataset_name == "multiwoz":
        texts = [" ".join(row["turns"]["utterance"]) for row in dataset["train"]]
    else:
        log(f"Unknown dataset structure for {dataset_name}!")
        texts = []

    log(f"Extracted {len(texts)} texts from {dataset_name}.")

    # Process in batches
    all_embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i : i + batch_size]
        batch_embeddings = embed_text(batch)
        all_embeddings.append(batch_embeddings)

        # Log progress
        log(f"Processed {i + len(batch)}/{len(texts)} embeddings for {dataset_name}...")

        # Simulate delay so progress is easy to monitor
        time.sleep(1)

    # Convert list of numpy arrays to a single numpy array
    all_embeddings = np.vstack(all_embeddings)
    return all_embeddings
# Save embeddings to FAISS with unique filename
def save_embeddings_to_faiss(embeddings, index_name="my_embeddings"):
    index_file = f"{index_name}.faiss"

    # Create new FAISS index (flat L2, sized to the embedding dimension)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(np.array(embeddings).astype(np.float32))

    # Save FAISS index to disk
    faiss.write_index(index, index_file)
    log(f"Saved FAISS index: {index_file}")
# Run embedding process for all datasets
for name, dataset in datasets.items():
    embeddings = create_embeddings(name, dataset, batch_size=100)
    save_embeddings_to_faiss(embeddings, index_name=name)
    log(f"Embeddings for {name} saved to FAISS.")
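# Minimal sketch (commented out; an assumption, not part of the original script) of how
# a saved index could be queried afterwards: embed a query with the same model, then ask
# FAISS for its nearest neighbours by L2 distance.
# query_index = faiss.read_index("dialog.faiss")
# query_vector = embed_text(["What time does the restaurant open?"]).astype(np.float32)
# distances, neighbours = query_index.search(query_vector, 5)
# print(distances, neighbours)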