File size: 9,161 Bytes
ccb8edf
 
 
 
 
 
e9d5e9c
6b52825
84cb849
 
 
2618588
 
 
84cb849
 
 
 
 
8d0bee3
b35adb8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84cb849
 
 
 
9efbb97
 
 
 
 
 
 
 
 
84cb849
 
9efbb97
84cb849
 
 
 
 
 
 
 
 
9efbb97
84cb849
 
 
 
 
 
 
 
9efbb97
84cb849
9efbb97
84cb849
9efbb97
 
 
84cb849
9efbb97
 
1754322
9efbb97
fed8ef0
9efbb97
 
fed8ef0
9efbb97
 
7fad639
9efbb97
 
 
 
 
fed8ef0
ccb8edf
7fad639
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
import os
import time
import pdfplumber
import docx
import nltk
import gradio as gr
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.embeddings import (
    OpenAIEmbeddings,
    CohereEmbeddings,
)
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS, Chroma
from langchain_text_splitters import (
    RecursiveCharacterTextSplitter,
    TokenTextSplitter,
)
from typing import List, Dict, Any
import pandas as pd


# Fetch the NLTK 'punkt' data at import time; quiet=True suppresses the
# download progress output.
# NOTE(review): nothing else in the visible code uses nltk — confirm this
# download is still needed.
nltk.download('punkt', quiet=True)

# Directory scanned by process_files() when no explicit file path is supplied.
FILES_DIR = './files'

# Registry of selectable embedding models:
#   provider name -> {short display name -> identifier handed to the
#   provider's embedding class in get_embedding_model()}.
# The provider names are also the choices of the "Embedding Model Types"
# checkbox group, and the short names are the choices of "Embedding Models".
# NOTE(review): "gte-large" and "gbert-base" carry no organisation prefix,
# unlike "danielheinz/e5-base-sts-en-de" — verify they resolve to the intended
# Hugging Face repositories.
MODELS = {
    'HuggingFace': {
        'e5-base-de': "danielheinz/e5-base-sts-en-de",
        'paraphrase-miniLM': "paraphrase-multilingual-MiniLM-L12-v2",
        'paraphrase-mpnet': "paraphrase-multilingual-mpnet-base-v2",
        'gte-large': "gte-large",
        'gbert-base': "gbert-base"
    },
    'OpenAI': {
        'text-embedding-ada-002': "text-embedding-ada-002"
    },
    'Cohere': {
        'embed-multilingual-v2.0': "embed-multilingual-v2.0"
    }
}

class FileHandler:
    """Extract plain text from PDF, DOCX and TXT files.

    Every method is static; the class only acts as a namespace.
    """

    @staticmethod
    def extract_text(file_path):
        """Return the text of ``file_path``, dispatching on its extension.

        Args:
            file_path: Path to a ``.pdf``, ``.docx`` or ``.txt`` file. The
                extension is compared case-insensitively.

        Returns:
            The extracted text as one string.

        Raises:
            ValueError: If the extension is not one of the supported types.
        """
        ext = os.path.splitext(file_path)[-1].lower()
        extractors = {
            '.pdf': FileHandler._extract_from_pdf,
            '.docx': FileHandler._extract_from_docx,
            '.txt': FileHandler._extract_from_txt,
        }
        try:
            extractor = extractors[ext]
        except KeyError:
            raise ValueError(f"Unsupported file type: {ext}") from None
        return extractor(file_path)

    @staticmethod
    def _extract_from_pdf(file_path):
        """Return the text of all PDF pages joined by single spaces."""
        with pdfplumber.open(file_path) as pdf:
            # A page without extractable text (e.g. a scanned image) may give
            # None instead of a string; coerce it so str.join does not raise.
            return ' '.join(page.extract_text() or '' for page in pdf.pages)

    @staticmethod
    def _extract_from_docx(file_path):
        """Return the text of all DOCX paragraphs joined by single spaces."""
        doc = docx.Document(file_path)
        return ' '.join(para.text for para in doc.paragraphs)

    @staticmethod
    def _extract_from_txt(file_path):
        """Return the full content of a UTF-8 encoded text file."""
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()

def get_embedding_model(model_type, model_name):
    """Instantiate the embedding class for a model registered in ``MODELS``.

    Args:
        model_type: Provider key: 'HuggingFace', 'OpenAI' or 'Cohere'.
        model_name: Short model name, looked up in ``MODELS[model_type]``.

    Returns:
        The embedding object built for the resolved model identifier.

    Raises:
        ValueError: If ``model_type`` is none of the supported providers.
        KeyError: If ``model_name`` is not registered under ``model_type``.
    """
    # Constructors are wrapped in lambdas so that nothing is resolved or built
    # until the provider has matched.
    builders = (
        ('HuggingFace', lambda identifier: HuggingFaceEmbeddings(model_name=identifier)),
        ('OpenAI', lambda identifier: OpenAIEmbeddings(model=identifier)),
        ('Cohere', lambda identifier: CohereEmbeddings(model=identifier)),
    )
    for provider, build in builders:
        if model_type == provider:
            return build(MODELS[model_type][model_name])
    raise ValueError(f"Unsupported model type: {model_type}")

def get_text_splitter(split_strategy, chunk_size, overlap_size, custom_separators=None):
    """Create the text splitter selected by ``split_strategy``.

    Args:
        split_strategy: 'token' for ``TokenTextSplitter`` or 'recursive' for
            ``RecursiveCharacterTextSplitter``.
        chunk_size: Passed to the splitter as ``chunk_size``.
        overlap_size: Passed to the splitter as ``chunk_overlap``.
        custom_separators: Separators for the recursive strategy; when falsy,
            ``["\\n\\n", "\\n", " ", ""]`` is used. Ignored by 'token'.

    Returns:
        The configured splitter instance.

    Raises:
        ValueError: If ``split_strategy`` is neither 'token' nor 'recursive'.
    """
    if split_strategy not in ('token', 'recursive'):
        raise ValueError(f"Unsupported split strategy: {split_strategy}")

    if split_strategy == 'token':
        return TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap_size)

    separators = custom_separators if custom_separators else ["\n\n", "\n", " ", ""]
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap_size,
        separators=separators,
    )

def get_vector_store(store_type, texts, embedding_model):
    """Build a vector store of the requested type from raw text chunks.

    Args:
        store_type: 'FAISS' or 'Chroma'.
        texts: Text chunks handed to the store's ``from_texts`` constructor.
        embedding_model: Embedding object handed to ``from_texts``.

    Returns:
        The store returned by ``FAISS.from_texts`` or ``Chroma.from_texts``.

    Raises:
        ValueError: If ``store_type`` is neither 'FAISS' nor 'Chroma'.
    """
    if store_type == 'FAISS':
        store_cls = FAISS
    elif store_type == 'Chroma':
        store_cls = Chroma
    else:
        raise ValueError(f"Unsupported vector store type: {store_type}")
    return store_cls.from_texts(texts, embedding_model)

def get_retriever(vector_store, search_type, search_kwargs=None):
    """Wrap ``vector_store`` in a retriever using the given search type.

    Args:
        vector_store: Object exposing ``as_retriever(search_type=...,
            search_kwargs=...)``.
        search_type: 'similarity' or 'mmr'.
        search_kwargs: Optional dict of search options (e.g. ``{"k": 5}``).
            ``None`` is replaced by an empty dict.

    Returns:
        Whatever ``vector_store.as_retriever`` returns.

    Raises:
        ValueError: If ``search_type`` is neither 'similarity' nor 'mmr'.
    """
    if search_type not in ('similarity', 'mmr'):
        raise ValueError(f"Unsupported search type: {search_type}")
    # The retriever expects a dict here; forwarding the None default verbatim
    # would hand it an invalid value, so normalise it to "no extra options".
    return vector_store.as_retriever(
        search_type=search_type,
        search_kwargs={} if search_kwargs is None else search_kwargs,
    )

def process_files(file_path, model_type, model_name, split_strategy, chunk_size, overlap_size, custom_separators):
    """Load text, split it into chunks and build the embedding model.

    Args:
        file_path: Path of a single file to read. When falsy, every supported
            regular file directly inside ``FILES_DIR`` is read instead.
        model_type: Provider key forwarded to ``get_embedding_model``.
        model_name: Short model name forwarded to ``get_embedding_model``.
        split_strategy: Strategy forwarded to ``get_text_splitter``.
        chunk_size: Chunk size forwarded to ``get_text_splitter``.
        overlap_size: Chunk overlap forwarded to ``get_text_splitter``.
        custom_separators: Separator list (or ``None``) forwarded to
            ``get_text_splitter``.

    Returns:
        A tuple ``(chunks, embedding_model, word_count)`` where ``word_count``
        is the number of whitespace-separated words in the loaded text.

    Raises:
        ValueError: Propagated from the helpers for an unsupported file type
            (explicit ``file_path`` only), model type or split strategy.
    """
    if file_path:
        text = FileHandler.extract_text(file_path)
    else:
        # Mirrors the extensions FileHandler.extract_text accepts; keep in sync.
        supported_exts = ('.pdf', '.docx', '.txt')
        parts = []
        # Sorted so the concatenation order (and thus the chunks) is stable.
        for entry in sorted(os.listdir(FILES_DIR)):
            candidate = os.path.join(FILES_DIR, entry)
            # Skip sub-directories and stray files (e.g. .DS_Store) instead of
            # aborting the whole run on the first one.
            if not os.path.isfile(candidate):
                continue
            if os.path.splitext(entry)[-1].lower() not in supported_exts:
                continue
            parts.append(FileHandler.extract_text(candidate))
        # A blank line between files keeps the last word of one file from
        # fusing with the first word of the next.
        text = '\n\n'.join(parts)

    text_splitter = get_text_splitter(split_strategy, chunk_size, overlap_size, custom_separators)
    chunks = text_splitter.split_text(text)

    embedding_model = get_embedding_model(model_type, model_name)

    return chunks, embedding_model, len(text.split())

def search_embeddings(chunks, embedding_model, vector_store_type, search_type, query, top_k):
    """Index ``chunks`` and run one timed retrieval for ``query``.

    Args:
        chunks: Text chunks to index.
        embedding_model: Embedding object used to build the vector store.
        vector_store_type: Store type forwarded to ``get_vector_store``.
        search_type: Search type forwarded to ``get_retriever``.
        query: Query string to retrieve documents for.
        top_k: Number of results requested, passed as ``{"k": top_k}``.

    Returns:
        A tuple ``(results, elapsed_seconds, vector_store)``. Only the
        retrieval call is timed; building the vector store is not.
    """
    vector_store = get_vector_store(vector_store_type, chunks, embedding_model)
    retriever = get_retriever(vector_store, search_type, {"k": top_k})

    # perf_counter is monotonic and high-resolution, so the measured interval
    # cannot be distorted by wall-clock adjustments.
    started = time.perf_counter()
    # invoke() is the current retriever entry point; get_relevant_documents()
    # is deprecated.
    results = retriever.invoke(query)
    elapsed = time.perf_counter() - started

    return results, elapsed, vector_store

def calculate_statistics(results, search_time, vector_store, num_tokens, embedding_model):
    """Summarise one retrieval run as a flat dict of statistics.

    Args:
        results: Retrieved documents; each must expose ``page_content``.
        search_time: Retrieval duration in seconds, reported unchanged.
        vector_store: Store that was searched. FAISS-style stores (``index``
            / ``docstore``) and Chroma-style stores (``_collection``) are
            inspected; anything else yields "N/A" for the store figures.
        num_tokens: Size of the source text, reported unchanged.
        embedding_model: Embedding object; its ``client.get_vocab_size()`` is
            used when available.

    Returns:
        Dict with the keys ``num_results``, ``avg_content_length``,
        ``search_time``, ``vector_store_size``, ``num_documents``,
        ``num_tokens`` and ``embedding_vocab_size``.
    """
    # The FAISS wrapper exposes its index as `index`; `_index` is still
    # probed first for backward compatibility.
    index = getattr(vector_store, '_index', None)
    if index is None:
        index = getattr(vector_store, 'index', None)
    ntotal = getattr(index, 'ntotal', None)

    # Chroma-style stores have no docstore; their entries are counted through
    # the underlying collection.
    collection = getattr(vector_store, '_collection', None)
    collection_count = collection.count() if hasattr(collection, 'count') else None

    if ntotal is not None:
        vector_store_size = ntotal
    elif collection_count is not None:
        vector_store_size = collection_count
    else:
        vector_store_size = "N/A"

    docstore_dict = getattr(getattr(vector_store, 'docstore', None), '_dict', None)
    if docstore_dict is not None:
        num_documents = len(docstore_dict)
    elif collection_count is not None:
        num_documents = collection_count
    else:
        num_documents = "N/A"

    get_vocab_size = getattr(getattr(embedding_model, 'client', None), 'get_vocab_size', None)
    vocab_size = get_vocab_size() if callable(get_vocab_size) else "N/A"

    if results:
        avg_content_length = sum(len(doc.page_content) for doc in results) / len(results)
    else:
        avg_content_length = 0

    return {
        "num_results": len(results),
        "avg_content_length": avg_content_length,
        "search_time": search_time,
        "vector_store_size": vector_store_size,
        "num_documents": num_documents,
        "num_tokens": num_tokens,
        "embedding_vocab_size": vocab_size,
    }

def compare_embeddings(file, query, model_types, model_names, split_strategy, chunk_size, overlap_size, custom_separators, vector_store_type, search_type, top_k):
    """Run the same query against every selected embedding model.

    Args:
        file: Uploaded file (an object with a ``name`` path attribute or a
            plain path string), or a falsy value to read ``FILES_DIR``.
        query: Search query.
        model_types: Selected provider names (keys of ``MODELS``).
        model_names: Selected short model names. Each name is paired with the
            selected provider(s) it is registered under in ``MODELS``.
        split_strategy: 'token' or 'recursive'.
        chunk_size: Chunk size; coerced to ``int``.
        overlap_size: Chunk overlap; coerced to ``int``.
        custom_separators: Comma-separated separators, or empty for defaults.
        vector_store_type: 'FAISS' or 'Chroma'.
        search_type: 'similarity' or 'mmr'.
        top_k: Number of results per model; coerced to ``int``.

    Returns:
        A tuple ``(results_df, stats_df)``: one row per retrieved document,
        and one row of statistics per model.

    Raises:
        ValueError: If a selected provider is unknown, or if selections were
            made but no selected model belongs to a selected provider.
    """
    # UI sliders may deliver floats; the splitters and the retriever's "k"
    # need integers.
    chunk_size = int(chunk_size)
    overlap_size = int(overlap_size)
    top_k = int(top_k)

    # Accept both an upload object exposing `.name` and a plain path string.
    file_path = getattr(file, 'name', file) if file else None
    # Loop-invariant, so split once.
    # NOTE(review): separators are taken literally — "\n" typed into the
    # textbox is a backslash followed by "n", not a newline.
    separators = custom_separators.split(',') if custom_separators else None

    settings = {
        "split_strategy": split_strategy,
        "chunk_size": chunk_size,
        "overlap_size": overlap_size,
        "custom_separators": custom_separators,
        "vector_store_type": vector_store_type,
        "search_type": search_type,
        "top_k": top_k
    }

    model_types = model_types or []
    model_names = model_names or []

    # The two selections are independent lists, so pairing them positionally
    # would combine a provider with a model it does not offer. Match every
    # selected name with the selected provider(s) that register it instead.
    model_pairs = []
    for model_type in model_types:
        if model_type not in MODELS:
            raise ValueError(f"Unsupported model type: {model_type}")
        for model_name in model_names:
            pair = (model_type, model_name)
            if model_name in MODELS[model_type] and pair not in model_pairs:
                model_pairs.append(pair)

    if model_types and model_names and not model_pairs:
        raise ValueError(
            "None of the selected models belongs to the selected model types"
        )

    all_results = []
    all_stats = []

    for model_type, model_name in model_pairs:
        chunks, embedding_model, num_tokens = process_files(
            file_path,
            model_type,
            model_name,
            split_strategy,
            chunk_size,
            overlap_size,
            separators
        )

        results, search_time, vector_store = search_embeddings(
            chunks,
            embedding_model,
            vector_store_type,
            search_type,
            query,
            top_k
        )

        stats = calculate_statistics(results, search_time, vector_store, num_tokens, embedding_model)
        stats["model"] = f"{model_type} - {model_name}"
        stats.update(settings)

        all_results.extend(format_results(results, stats))
        all_stats.append(stats)

    results_df = pd.DataFrame(all_results)
    stats_df = pd.DataFrame(all_stats)

    return results_df, stats_df

def format_results(results, stats):
    """Flatten retrieved documents into row dicts for the results table.

    Each row holds, in this order: "Model" (``stats["model"]``), "Content"
    (the document text), the document's metadata entries, then every entry of
    ``stats`` except "model". A later key overwrites an earlier one of the
    same name.

    Args:
        results: Iterable of documents exposing ``page_content`` and
            ``metadata``.
        stats: Statistics dict for the run; must contain "model" whenever
            ``results`` is non-empty.

    Returns:
        A list with one row dict per document.
    """
    def _row(doc):
        row = {"Model": stats["model"], "Content": doc.page_content}
        row.update(doc.metadata)
        row.update((key, value) for key, value in stats.items() if key != "model")
        return row

    return [_row(doc) for doc in results]

# Gradio interface
def launch_interface(share=True):
    """Build the two-tab Gradio app and start serving it.

    The first tab wraps ``compare_embeddings`` in a ``gr.Interface``; the
    second tab shows the tutorial Markdown.

    Args:
        share: Forwarded to ``launch(share=...)``.
    """
    provider_choices = list(MODELS.keys())
    model_choices = [name for group in MODELS.values() for name in group]

    # Order must match the positional parameters of compare_embeddings.
    input_components = [
        gr.File(label="Upload File (Optional)"),
        gr.Textbox(label="Search Query"),
        gr.CheckboxGroup(choices=provider_choices, label="Embedding Model Types", value=["HuggingFace"]),
        gr.CheckboxGroup(choices=model_choices, label="Embedding Models", value=["e5-base-de"]),
        gr.Radio(choices=["token", "recursive"], label="Split Strategy", value="recursive"),
        gr.Slider(100, 1000, step=100, value=500, label="Chunk Size"),
        gr.Slider(0, 100, step=10, value=50, label="Overlap Size"),
        gr.Textbox(label="Custom Split Separators (comma-separated, optional)"),
        gr.Radio(choices=["FAISS", "Chroma"], label="Vector Store Type", value="FAISS"),
        gr.Radio(choices=["similarity", "mmr"], label="Search Type", value="similarity"),
        gr.Slider(1, 10, step=1, value=5, label="Top K"),
    ]
    # One table per element of the tuple returned by compare_embeddings.
    output_components = [
        gr.Dataframe(label="Results", interactive=False),
        gr.Dataframe(label="Statistics", interactive=False),
    ]
    example_rows = [
        ["example.pdf", "What is machine learning?", ["HuggingFace"], ["e5-base-de"], "recursive", 500, 50, "", "FAISS", "similarity", 5]
    ]

    comparison_tab = gr.Interface(
        fn=compare_embeddings,
        inputs=input_components,
        outputs=output_components,
        title="Embedding Comparison Tool",
        description="Compare different embedding models and retrieval strategies",
        examples=example_rows,
        allow_flagging="never",
    )

    tutorial_md = """
    # Embedding Comparison Tool Tutorial

    ... (tutorial content remains the same) ...
    """

    tab_contents = [comparison_tab, gr.Markdown(tutorial_md)]
    tab_titles = ["Embedding Comparison", "Tutorial"]
    app = gr.TabbedInterface(tab_contents, tab_titles)

    app.launch(share=share)

# Start the Gradio app only when this file is executed as a script, not when
# it is imported. launch_interface() is called with its default share=True.
if __name__ == "__main__":
    launch_interface()