File size: 4,322 Bytes
ccb8edf
 
 
 
 
 
e9d5e9c
6b52825
84cb849
 
 
2618588
 
 
84cb849
 
 
 
 
8d0bee3
7fad639
84cb849
 
 
 
9efbb97
 
 
 
 
 
 
 
 
84cb849
 
9efbb97
84cb849
 
 
 
 
 
 
 
 
9efbb97
84cb849
 
 
 
 
 
 
 
9efbb97
84cb849
9efbb97
84cb849
9efbb97
 
 
84cb849
9efbb97
 
1754322
9efbb97
fed8ef0
9efbb97
 
fed8ef0
9efbb97
 
7fad639
9efbb97
 
 
 
 
fed8ef0
ccb8edf
7fad639
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import os
import time
import pdfplumber
import docx
import nltk
import gradio as gr
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.embeddings import (
    OpenAIEmbeddings,
    CohereEmbeddings,
)
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS, Chroma
from langchain_text_splitters import (
    RecursiveCharacterTextSplitter,
    TokenTextSplitter,
)
from typing import List, Dict, Any
import pandas as pd

# ... (previous code remains the same) ...

def compare_embeddings(file, query, model_types, model_names, split_strategy, chunk_size, overlap_size, custom_separators, vector_store_type, search_type, top_k):
    """Run the same query against several embedding models and tabulate the outcome.

    ``model_types`` and ``model_names`` are walked in lockstep with ``zip``, so
    only as many models are evaluated as the shorter of the two sequences.
    For each pair the file is processed, the query is searched, statistics are
    computed, and the formatted rows are collected.

    Args:
        file: Uploaded file object (its ``.name`` is passed on) or a falsy
            value, in which case ``None`` is passed to ``process_files``.
        query: Search query forwarded to ``search_embeddings``.
        model_types: Sequence of model type labels.
        model_names: Sequence of model names, paired with ``model_types``.
        split_strategy: Forwarded to ``process_files`` and recorded in the stats.
        chunk_size: Forwarded to ``process_files`` and recorded in the stats.
        overlap_size: Forwarded to ``process_files`` and recorded in the stats.
        custom_separators: Comma-separated string; split on ``,`` before being
            passed on, or ``None`` when empty/falsy.
        vector_store_type: Forwarded to ``search_embeddings``.
        search_type: Forwarded to ``search_embeddings``.
        top_k: Forwarded to ``search_embeddings``.

    Returns:
        A ``(results_df, stats_df)`` tuple of pandas DataFrames: one row per
        formatted result, and one row per evaluated model respectively.
    """
    # Settings that are copied verbatim into every per-model stats record.
    run_settings = dict(
        split_strategy=split_strategy,
        chunk_size=chunk_size,
        overlap_size=overlap_size,
        custom_separators=custom_separators,
        vector_store_type=vector_store_type,
        search_type=search_type,
        top_k=top_k,
    )

    result_rows = []
    stat_rows = []

    for model_type, model_name in zip(model_types, model_names):
        source_path = file.name if file else None
        separators = custom_separators.split(',') if custom_separators else None

        chunks, embedding_model, num_tokens = process_files(
            source_path,
            model_type,
            model_name,
            split_strategy,
            chunk_size,
            overlap_size,
            separators,
        )

        results, search_time, vector_store = search_embeddings(
            chunks, embedding_model, vector_store_type, search_type, query, top_k
        )

        stats = calculate_statistics(
            results, search_time, vector_store, num_tokens, embedding_model
        )
        stats["model"] = f"{model_type} - {model_name}"
        stats.update(run_settings)

        result_rows.extend(format_results(results, stats))
        stat_rows.append(stats)

    return pd.DataFrame(result_rows), pd.DataFrame(stat_rows)

def format_results(results, stats):
    """Turn retrieved documents into flat row dicts annotated with run statistics.

    Each row starts with ``"Model"`` (from ``stats["model"]``) and ``"Content"``
    (the document's ``page_content``), followed by the document's metadata and
    then every ``stats`` entry except ``"model"``. On key collisions later
    sources win: metadata overrides the two leading keys, stats override both.

    Args:
        results: Iterable of objects exposing ``page_content`` and ``metadata``.
        stats: Mapping of statistics; must contain ``"model"`` when ``results``
            is non-empty.

    Returns:
        A list with one dict per document, in input order.
    """

    def as_row(document):
        row = {
            "Model": stats["model"],
            "Content": document.page_content,
            **document.metadata,
        }
        # "model" is already exposed as the "Model" column, so skip it here.
        row.update(
            (key, value) for key, value in stats.items() if key != "model"
        )
        return row

    return [as_row(document) for document in results]

# Gradio interface
def launch_interface(share=True):
    """Assemble the two-tab Gradio app and launch it.

    The first tab is a ``gr.Interface`` around ``compare_embeddings``; the
    second tab displays the tutorial Markdown. Model choices are read from the
    module-level ``MODELS`` mapping.

    Args:
        share: Passed through to ``launch(share=...)``.
    """
    # Input widgets, listed in the same order as compare_embeddings' parameters.
    input_components = [
        gr.File(label="Upload File (Optional)"),
        gr.Textbox(label="Search Query"),
        gr.CheckboxGroup(
            choices=list(MODELS.keys()),
            label="Embedding Model Types",
            value=["HuggingFace"],
        ),
        gr.CheckboxGroup(
            choices=[name for group in MODELS.values() for name in group],
            label="Embedding Models",
            value=["e5-base-de"],
        ),
        gr.Radio(choices=["token", "recursive"], label="Split Strategy", value="recursive"),
        gr.Slider(100, 1000, step=100, value=500, label="Chunk Size"),
        gr.Slider(0, 100, step=10, value=50, label="Overlap Size"),
        gr.Textbox(label="Custom Split Separators (comma-separated, optional)"),
        gr.Radio(choices=["FAISS", "Chroma"], label="Vector Store Type", value="FAISS"),
        gr.Radio(choices=["similarity", "mmr"], label="Search Type", value="similarity"),
        gr.Slider(1, 10, step=1, value=5, label="Top K"),
    ]

    # Two read-only tables matching compare_embeddings' (results, stats) return.
    output_components = [
        gr.Dataframe(label="Results", interactive=False),
        gr.Dataframe(label="Statistics", interactive=False),
    ]

    example_row = [
        "example.pdf",
        "What is machine learning?",
        ["HuggingFace"],
        ["e5-base-de"],
        "recursive",
        500,
        50,
        "",
        "FAISS",
        "similarity",
        5,
    ]

    comparison_tab = gr.Interface(
        fn=compare_embeddings,
        inputs=input_components,
        outputs=output_components,
        title="Embedding Comparison Tool",
        description="Compare different embedding models and retrieval strategies",
        examples=[example_row],
        allow_flagging="never",
    )

    tutorial_md = """
    # Embedding Comparison Tool Tutorial

    ... (tutorial content remains the same) ...
    """

    tab_contents = [comparison_tab, gr.Markdown(tutorial_md)]
    tab_titles = ["Embedding Comparison", "Tutorial"]
    app = gr.TabbedInterface(tab_contents, tab_titles)

    app.launch(share=share)

# Script entry point: when this file is executed directly (not imported),
# start the Gradio app using launch_interface's default arguments.
if __name__ == "__main__":
    launch_interface()