from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
import faiss
import torch
import numpy as np
import time

def log(message):
    """Print a progress message (each message carries its own status emoji)."""
    print(message)

# βœ… Load the conversation datasets (script-based datasets need trust_remote_code=True)
datasets = {
    "sales": load_dataset("goendalf666/sales-conversations", trust_remote_code=True),
    "blended": load_dataset("blended_skill_talk", trust_remote_code=True),
    "dialog": load_dataset("daily_dialog", trust_remote_code=True),
    "multiwoz": load_dataset("multi_woz_v22", trust_remote_code=True),
}

# βœ… Load MiniLM model for embeddings
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def embed_text(texts):
    """Generate mean-pooled embeddings for a batch of texts."""
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
    with torch.no_grad():
        hidden = model(**inputs).last_hidden_state
    # Mean-pool over real tokens only; padding positions would otherwise skew the average
    mask = inputs["attention_mask"].unsqueeze(-1).float()
    embeddings = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)
    return embeddings.cpu().numpy()
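
# Quick sanity check (illustrative, not part of the pipeline): all-MiniLM-L6-v2
# produces 384-dimensional vectors, so a two-sentence batch should come back
# as shape (2, 384).
# vecs = embed_text(["hello world", "goodbye world"])
# assert vecs.shape == (2, 384)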

# βœ… Batch processing function
def create_embeddings(dataset_name, dataset, batch_size=100):
    log(f"πŸ“₯ Creating embeddings for {dataset_name}...")
    
    # βœ… Extract text based on dataset structure
    if dataset_name == "sales":
        # Join all turn columns, skipping None/empty values from padded rows
        texts = [" ".join(v for v in row.values() if v) for row in dataset["train"]]
    elif dataset_name == "blended":
        texts = [" ".join(row["free_messages"] + row["guided_messages"]) for row in dataset["train"]]
    elif dataset_name == "dialog":
        texts = [" ".join(row["dialog"]) for row in dataset["train"]]
    elif dataset_name == "multiwoz":
        texts = [" ".join(row["turns"]["utterance"]) for row in dataset["train"]]
    else:
        log(f"⚠️ Unknown dataset structure for {dataset_name}!")
        texts = []

    log(f"βœ… Extracted {len(texts)} texts from {dataset_name}.")

    # βœ… Process in batches
    all_embeddings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i : i + batch_size]
        batch_embeddings = embed_text(batch)
        all_embeddings.append(batch_embeddings)

        # βœ… Log progress
        log(f"πŸš€ Processed {i + len(batch)}/{len(texts)} embeddings for {dataset_name}...")

        # βœ… Optional throttle so progress logs stay readable; remove for full speed
        time.sleep(1)

    # βœ… Convert list of numpy arrays to a single numpy array
    all_embeddings = np.vstack(all_embeddings)
    return all_embeddings

# βœ… Save embeddings to FAISS with unique filename
def save_embeddings_to_faiss(embeddings, index_name="my_embeddings"):
    index_file = f"{index_name}.faiss"

    # βœ… Create new FAISS index
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings.astype(np.float32))  # FAISS requires float32 input

    # βœ… Save FAISS index
    faiss.write_index(index, index_file)
    log(f"βœ… Saved FAISS index: {index_file}")

# βœ… Run embedding process for all datasets
for name, dataset in datasets.items():
    embeddings = create_embeddings(name, dataset, batch_size=100)
    save_embeddings_to_faiss(embeddings, index_name=name)
    log(f"βœ… Embeddings for {name} saved to FAISS.")