sms_agent / app.py
abrah926's picture
saved
43ded21 verified
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
import faiss
import torch
import numpy as np
import time
def log(message):
    """Print a progress message to stdout behind a check-mark prefix.

    Args:
        message: Text to report; it is interpolated into the output line as-is.
    """
    # The prefix had been stored as mis-decoded bytes; emit the intended
    # U+2705 check-mark emoji instead.
    print(f"✅ {message}")
# Load every conversation corpus up front, keyed by the short name that
# create_embeddings() later uses to pick its text-extraction rule.
# NOTE(review): trust_remote_code=True presumably allows the Hub repository's
# own loading script to execute locally -- confirm each source is trusted.
datasets = {
    short_name: load_dataset(hub_id, trust_remote_code=True)
    for short_name, hub_id in (
        ("sales", "goendalf666/sales-conversations"),
        ("blended", "blended_skill_talk"),
        ("dialog", "daily_dialog"),
        ("multiwoz", "multi_woz_v22"),
    )
}

# Tokenizer and encoder are both pulled from the same MiniLM checkpoint and
# shared by every embed_text() call.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
def embed_text(texts):
    """Generate one embedding vector per input text.

    The texts are tokenized together (padded to the longest item, truncated
    to 512 tokens), run through the module-level ``model`` without gradient
    tracking, and reduced to one vector each by averaging token states.

    Args:
        texts: Batch of strings handed directly to the module-level
            ``tokenizer``.

    Returns:
        A NumPy array with one row per input text.
    """
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
    with torch.no_grad():
        token_states = model(**inputs).last_hidden_state
        # Weight each position by the attention mask so padding tokens do not
        # take part in the average; a plain mean over dim=1 makes a text's
        # embedding depend on how much padding its batch happened to need.
        mask = inputs["attention_mask"].unsqueeze(-1).to(token_states.dtype)
        summed = (token_states * mask).sum(dim=1)
        # Clamp guards against division by zero for an all-padding row.
        token_counts = mask.sum(dim=1).clamp(min=1e-9)
        embeddings = (summed / token_counts).cpu().numpy()
    return embeddings
# ✅ Batch processing function
def create_embeddings(dataset_name, dataset, batch_size=100, progress_delay=1):
    """Embed every training example of one dataset, batch by batch.

    Each row of ``dataset["train"]`` is flattened into a single string using
    a rule selected by ``dataset_name``; the strings are then passed to
    ``embed_text`` in slices of ``batch_size`` and the results are stacked.

    Args:
        dataset_name: One of ``"sales"``, ``"blended"``, ``"dialog"`` or
            ``"multiwoz"``. Any other name is reported and yields no texts.
        dataset: Mapping exposing a ``"train"`` split of dict-like rows. It is
            not read when ``dataset_name`` is unknown.
        batch_size: Number of texts embedded per ``embed_text`` call.
        progress_delay: Seconds to pause after each batch so the progress log
            can be followed; pass ``0`` to disable the pause.

    Returns:
        A 2-D array holding the per-batch embeddings stacked in input order,
        or an empty ``(0, 0)`` float32 array when there were no texts.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    log(f"📥 Creating embeddings for {dataset_name}...")

    # Extract text based on dataset structure.
    if dataset_name == "sales":
        # Non-string cells would make str.join raise, so they are skipped.
        # Presumably shorter conversations leave some columns as None --
        # verify against the dataset schema.
        texts = [
            " ".join(value for value in row.values() if isinstance(value, str))
            for row in dataset["train"]
        ]
    elif dataset_name == "blended":
        texts = [" ".join(row["free_messages"] + row["guided_messages"]) for row in dataset["train"]]
    elif dataset_name == "dialog":
        texts = [" ".join(row["dialog"]) for row in dataset["train"]]
    elif dataset_name == "multiwoz":
        texts = [" ".join(row["turns"]["utterance"]) for row in dataset["train"]]
    else:
        log(f"⚠️ Unknown dataset structure for {dataset_name}!")
        texts = []

    total = len(texts)
    log(f"✅ Extracted {total} texts from {dataset_name}.")

    # Process in batches.
    all_embeddings = []
    for start in range(0, total, batch_size):
        batch = texts[start : start + batch_size]
        all_embeddings.append(embed_text(batch))

        log(f"🚀 Processed {start + len(batch)}/{total} embeddings for {dataset_name}...")

        # Optional pause that makes the progress output easier to monitor.
        if progress_delay > 0:
            time.sleep(progress_delay)

    # np.vstack raises on an empty list, so the "no texts" case (unknown
    # dataset or empty split) returns an explicit empty 2-D array instead.
    if not all_embeddings:
        return np.empty((0, 0), dtype=np.float32)

    return np.vstack(all_embeddings)
# ✅ Save embeddings to a FAISS index file named after the dataset
def save_embeddings_to_faiss(embeddings, index_name="my_embeddings"):
    """Build a flat L2 FAISS index from the embeddings and write it to disk.

    Args:
        embeddings: 2-D array-like of vectors, one vector per row.
        index_name: Base name of the output file; ``".faiss"`` is appended.
    """
    index_file = f"{index_name}.faiss"

    # Convert before reading the shape so array-like input (not only
    # ndarrays) works, and do it in a single step rather than copying twice.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)

    # Create a new FAISS index sized to the vector dimensionality.
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)

    # Save the FAISS index.
    faiss.write_index(index, index_file)
    log(f"✅ Saved FAISS index: {index_file}")
# Run the embedding process for all datasets: embed each corpus, then write
# its vectors to a FAISS index file named after the dataset key.
for name, dataset in datasets.items():
    embeddings = create_embeddings(name, dataset, batch_size=100)
    save_embeddings_to_faiss(embeddings, index_name=name)
    log(f"✅ Embeddings for {name} saved to FAISS.")