File size: 1,802 Bytes
3f5fd34
 
 
 
 
 
dfbcc66
 
 
 
3f5fd34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
import faiss
import torch
import numpy as np

def log(message):
    """Print ``message`` to stdout prefixed with a check-mark emoji.

    Args:
        message: Value to display; it is interpolated into an f-string, so
            any object with a string representation is accepted.
    """
    # The prefix had been stored as mis-decoded UTF-8 bytes. Writing the
    # check mark (U+2705) as an escape keeps this line pure ASCII, so it
    # cannot be mangled by another encoding round trip.
    print(f"\u2705 {message}")


# Conversational corpora, each loaded by name with `load_dataset` and keyed
# by a short alias used throughout the rest of the script.
datasets = dict(
    sales=load_dataset("goendalf666/sales-conversations"),
    blended=load_dataset("blended_skill_talk"),
    dialog=load_dataset("daily_dialog"),
    multiwoz=load_dataset("multi_woz_v22"),
)

# MiniLM sentence-embedding checkpoint; the tokenizer and the encoder are
# both loaded from the same checkpoint name so they stay in sync.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_name),
    AutoModel.from_pretrained(model_name),
)

def embed_text(texts, *, batch_size=32):
    """Embed text with the module-level tokenizer/model via masked mean pooling.

    Args:
        texts: A single string or an iterable of strings.
        batch_size: Number of texts tokenized and passed through the model
            per forward pass. Must be at least 1.

    Returns:
        A 2-D ``numpy.ndarray`` with one row per input text. Empty input
        yields an array with zero rows and ``model.config.hidden_size``
        columns.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # A bare string is one document; slicing it below would otherwise cut it
    # into character chunks.
    texts = [texts] if isinstance(texts, str) else list(texts)
    if not texts:
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    pooled_batches = []
    # Process in fixed-size batches so memory use does not grow with the
    # size of the corpus.
    for start in range(0, len(texts), batch_size):
        inputs = tokenizer(
            texts[start:start + batch_size],
            padding=True,
            truncation=True,
            return_tensors="pt",
            max_length=512,
        )
        with torch.no_grad():
            hidden = model(**inputs).last_hidden_state

        # Weight every token by its attention mask so padding positions do
        # not contribute to the average; a plain mean over dim=1 makes each
        # embedding depend on how much padding its batch needed.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1e-9)  # guard against /0
        pooled = (hidden * mask).sum(dim=1) / token_counts
        pooled_batches.append(pooled.cpu().numpy())

    return np.concatenate(pooled_batches, axis=0)

# Extract the text column of a dataset split and embed it
def create_embeddings(dataset_name, dataset, *, split="train", text_field="text"):
    """Embed one text column of ``dataset`` with ``embed_text``.

    Args:
        dataset_name: Label used only in the progress message.
        dataset: Mapping of split name to a split object that can be indexed
            by column name (as returned by ``load_dataset``).
        split: Name of the split to read. Defaults to ``"train"``.
        text_field: Name of the column holding the texts. Defaults to
            ``"text"``; pass the actual column name for datasets whose
            schema differs.

    Returns:
        Whatever ``embed_text`` returns for the extracted texts.
    """
    print(f"Creating embeddings for {dataset_name}...")
    # Materialize the column once so the embedder receives a plain list.
    texts = list(dataset[split][text_field])
    return embed_text(texts)

# Save embeddings to a FAISS index file
def save_embeddings_to_faiss(embeddings, index_name="my_embeddings"):
    """Build a flat L2 FAISS index from ``embeddings`` and write it to disk.

    Args:
        embeddings: 2-D array-like of shape ``(num_vectors, dim)``. It is
            converted to a C-contiguous float32 array before indexing.
        index_name: Path of the file the index is written to.

    Returns:
        The populated ``faiss.IndexFlatL2`` instance.

    Raises:
        ValueError: If ``embeddings`` is not two-dimensional.
    """
    # Convert once, up front: this accepts plain lists as well as arrays and
    # yields the contiguous float32 buffer that is handed to FAISS.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2:
        raise ValueError(
            f"embeddings must be 2-D (num_vectors, dim), got shape {vectors.shape}"
        )

    print("Saving embeddings to FAISS...")
    # The index dimensionality is taken from the data, not assumed.
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    faiss.write_index(index, index_name)  # Persist the index to a file
    return index

# Create embeddings for all datasets and persist each to its own index file
for name, dataset in datasets.items():
    embeddings = create_embeddings(name, dataset)
    # Use a per-dataset file name: with the default name every iteration
    # would overwrite the index written by the previous one.
    index = save_embeddings_to_faiss(embeddings, index_name=f"{name}_embeddings")
    print(f"Embeddings for {name} saved to FAISS.")