# File size: 4,322 Bytes
# Revision hashes (viewer blame column): ccb8edf e9d5e9c 6b52825 84cb849 2618588 84cb849 8d0bee3 7fad639 84cb849 9efbb97 84cb849 9efbb97 84cb849 9efbb97 84cb849 9efbb97 84cb849 9efbb97 84cb849 9efbb97 84cb849 9efbb97 1754322 9efbb97 fed8ef0 9efbb97 fed8ef0 9efbb97 7fad639 9efbb97 fed8ef0 ccb8edf 7fad639
# (The viewer's line-number gutter, 1-124, followed here; it is not part of the program.)
import os
import time
import pdfplumber
import docx
import nltk
import gradio as gr
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.embeddings import (
OpenAIEmbeddings,
CohereEmbeddings,
)
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS, Chroma
from langchain_text_splitters import (
RecursiveCharacterTextSplitter,
TokenTextSplitter,
)
from typing import List, Dict, Any
import pandas as pd
# ... (previous code remains the same) ...
def compare_embeddings(file, query, model_types, model_names, split_strategy, chunk_size, overlap_size, custom_separators, vector_store_type, search_type, top_k):
    """Run one query against several embedding models and tabulate the outcome.

    ``model_types`` and ``model_names`` are paired positionally with ``zip``,
    so any surplus entries in the longer sequence are ignored. For every pair
    the file is processed, the query is searched, and statistics are
    collected through the module's ``process_files``, ``search_embeddings``
    and ``calculate_statistics`` helpers.

    Args:
        file: Uploaded file object exposing ``.name``, or a falsy value when
            nothing was uploaded (``None`` is then passed to ``process_files``).
        query: Search query forwarded to ``search_embeddings``.
        model_types: Sequence of embedding model type labels.
        model_names: Sequence of embedding model names.
        split_strategy: Splitting strategy forwarded to ``process_files``.
        chunk_size: Chunk size forwarded to ``process_files``.
        overlap_size: Overlap size forwarded to ``process_files``.
        custom_separators: Comma-separated separator string; an empty value
            results in ``None`` being passed instead of a list.
        vector_store_type: Vector store selector forwarded to
            ``search_embeddings``.
        search_type: Search mode forwarded to ``search_embeddings``.
        top_k: Result count forwarded to ``search_embeddings``.

    Returns:
        A ``(results_df, stats_df)`` tuple of pandas DataFrames: one row per
        retrieved document, and one row of statistics per model.
    """
    # Run-wide configuration that gets merged into every model's stats row.
    run_settings = dict(
        split_strategy=split_strategy,
        chunk_size=chunk_size,
        overlap_size=overlap_size,
        custom_separators=custom_separators,
        vector_store_type=vector_store_type,
        search_type=search_type,
        top_k=top_k,
    )

    result_rows = []
    stats_rows = []

    for model_type, model_name in zip(model_types, model_names):
        source_path = file.name if file else None
        # A fresh list is built for each model, matching the original call.
        if custom_separators:
            separators = custom_separators.split(',')
        else:
            separators = None

        chunks, embedding_model, num_tokens = process_files(
            source_path,
            model_type,
            model_name,
            split_strategy,
            chunk_size,
            overlap_size,
            separators,
        )

        results, search_time, vector_store = search_embeddings(
            chunks, embedding_model, vector_store_type, search_type, query, top_k
        )

        model_stats = calculate_statistics(
            results, search_time, vector_store, num_tokens, embedding_model
        )
        model_stats["model"] = f"{model_type} - {model_name}"
        model_stats.update(run_settings)

        result_rows.extend(format_results(results, model_stats))
        stats_rows.append(model_stats)

    return pd.DataFrame(result_rows), pd.DataFrame(stats_rows)
def format_results(results, stats):
    """Flatten retrieved documents into dict rows annotated with run statistics.

    Each row starts with ``"Model"`` (from ``stats["model"]``) and
    ``"Content"`` (the document's ``page_content``), then takes the
    document's metadata entries, then every ``stats`` entry except
    ``"model"``. Later sources overwrite earlier ones on key collisions.

    Args:
        results: Iterable of objects exposing ``page_content`` and a
            ``metadata`` mapping.
        stats: Mapping of statistics; must contain ``"model"`` whenever
            ``results`` is non-empty.

    Returns:
        A list with one dict per document, in the order of ``results``.
    """

    def _build_row(document):
        row = {"Model": stats["model"], "Content": document.page_content}
        row.update(document.metadata)
        # "model" is already exposed under the "Model" column.
        row.update((key, value) for key, value in stats.items() if key != "model")
        return row

    return [_build_row(document) for document in results]
# Gradio interface
def launch_interface(share=True):
    """Build the Gradio UI for ``compare_embeddings`` and launch it.

    The comparison form is a ``gr.Interface`` whose eleven inputs follow the
    parameter order of ``compare_embeddings`` and whose two outputs are the
    results and statistics tables. It is placed in a ``gr.TabbedInterface``
    next to a Markdown tutorial tab, and the tabbed app is then launched.

    Args:
        share: Passed through to ``launch(share=...)``.
    """
    # Input widgets, listed in the positional order of compare_embeddings.
    input_components = [
        gr.File(label="Upload File (Optional)"),
        gr.Textbox(label="Search Query"),
        gr.CheckboxGroup(
            choices=list(MODELS.keys()),
            label="Embedding Model Types",
            value=["HuggingFace"],
        ),
        gr.CheckboxGroup(
            choices=[model for models in MODELS.values() for model in models],
            label="Embedding Models",
            value=["e5-base-de"],
        ),
        gr.Radio(choices=["token", "recursive"], label="Split Strategy", value="recursive"),
        gr.Slider(100, 1000, step=100, value=500, label="Chunk Size"),
        gr.Slider(0, 100, step=10, value=50, label="Overlap Size"),
        gr.Textbox(label="Custom Split Separators (comma-separated, optional)"),
        gr.Radio(choices=["FAISS", "Chroma"], label="Vector Store Type", value="FAISS"),
        gr.Radio(choices=["similarity", "mmr"], label="Search Type", value="similarity"),
        gr.Slider(1, 10, step=1, value=5, label="Top K"),
    ]

    # Read-only tables for the two DataFrames compare_embeddings returns.
    output_components = [
        gr.Dataframe(label="Results", interactive=False),
        gr.Dataframe(label="Statistics", interactive=False),
    ]

    # One pre-filled example row, in the same order as the inputs.
    example_rows = [
        ["example.pdf", "What is machine learning?", ["HuggingFace"], ["e5-base-de"], "recursive", 500, 50, "", "FAISS", "similarity", 5]
    ]

    comparison_tab = gr.Interface(
        fn=compare_embeddings,
        inputs=input_components,
        outputs=output_components,
        title="Embedding Comparison Tool",
        description="Compare different embedding models and retrieval strategies",
        examples=example_rows,
        allow_flagging="never",
    )

    tutorial_md = """
# Embedding Comparison Tool Tutorial
... (tutorial content remains the same) ...
"""
    tutorial_tab = gr.Markdown(tutorial_md)

    app = gr.TabbedInterface(
        [comparison_tab, tutorial_tab],
        ["Embedding Comparison", "Tutorial"],
    )
    app.launch(share=share)
# Start the Gradio app only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    launch_interface()