File size: 3,908 Bytes
f3d315e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b3942f2
f3d315e
 
b3942f2
f3d315e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b3942f2
f3d315e
 
b3942f2
f3d315e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c19c0c9
e3f298a
f3d315e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
beaf3b0
 
f3d315e
 
 
 
 
 
 
 
 
 
 
beaf3b0
f3d315e
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import gradio as gr
import tempfile
import os
import fitz  # PyMuPDF
import uuid


from middleware import Middleware

def generate_uuid(state):
    """Return the UUID stored in *state*, creating one on first use.

    *state* is a mutable mapping with a ``"user_uuid"`` entry. When that
    entry is ``None`` a fresh random UUID4 string is generated, written
    back into *state*, and returned; otherwise the stored value is returned
    unchanged.
    """
    existing = state["user_uuid"]
    if existing is not None:
        # Already assigned earlier in this session: reuse it.
        return existing

    fresh = str(uuid.uuid4())
    state["user_uuid"] = fresh
    return fresh


class PDFSearchApp:
    """Gradio callbacks for indexing an uploaded PDF and searching it.

    ``create_ui`` builds one instance and registers its methods as event
    handlers, so instance attributes are shared by every session; per-user
    bookkeeping is keyed by the UUID kept in the Gradio session state.
    """

    def __init__(self):
        # Maps a session UUID to True once that session has indexed a PDF.
        self.indexed_docs = {}
        # Path of the most recently uploaded PDF.
        self.current_pdf = None

    def upload_and_convert(self, state, file, max_pages):
        """Index the uploaded PDF for the current session.

        Args:
            state: Session state dict holding the ``"user_uuid"`` entry.
            file: Uploaded file object exposing ``.name`` (its path), or
                ``None`` when nothing is uploaded.
            max_pages: Page limit forwarded to ``Middleware.index``.

        Returns:
            A status string for the UI: a success message with the number
            of pages returned by the middleware, or an error description.
        """
        user_id = generate_uuid(state)

        if file is None:
            return "No file uploaded"

        print(f"Uploading file: {file.name}, id: {user_id}")

        try:
            self.current_pdf = file.name

            middleware = Middleware(user_id, create_collection=True)
            pages = middleware.index(
                pdf_path=file.name, id=user_id, max_pages=max_pages
            )

            # Only mark the session as searchable once indexing succeeded.
            self.indexed_docs[user_id] = True

            return f"Uploaded and extracted {len(pages)} pages"
        except Exception as e:
            # UI boundary: report the failure as status text, don't crash.
            return f"Error processing PDF: {str(e)}"

    def search_documents(self, state, query, num_results=5):
        """Find the page that best matches *query* for the current session.

        Args:
            state: Session state dict holding the ``"user_uuid"`` entry.
            query: Free-text search query.
            num_results: Accepted for the UI slider; currently unused.

        Returns:
            An ``(image_path, message)`` tuple. ``create_ui`` routes the
            first element to the image output and the second to the text
            output, so ``image_path`` is ``None`` whenever there is no page
            to show and the explanation always travels in ``message``.
        """
        print(f"Searching for query: {query}")
        user_id = generate_uuid(state)

        # .get(): a session that never uploaded has no entry at all, and a
        # plain subscript would raise KeyError instead of showing the hint.
        if not self.indexed_docs.get(user_id):
            print("Please index documents first")
            return None, "Please index documents first"
        if not query:
            print("Please enter a search query")
            return None, "Please enter a search query"

        try:
            middleware = Middleware(user_id, create_collection=False)

            # One query is submitted, so take the first result list.
            search_results = middleware.search([query])[0]

            # NOTE(review): presumably field 1 of the top hit is a
            # zero-based page index -- confirm against Middleware.search.
            page_num = search_results[0][1] + 1

            print(f"Retrieved page number: {page_num}")

            img_path = f"pages/{user_id}/page_{page_num}.png"

            print(f"Retrieved image path: {img_path}")

            return img_path, f"Retrieved page number: {page_num}"

        except Exception as e:
            # UI boundary: surface the error as text, leave the image empty.
            return None, f"Error during search: {str(e)}"

def create_ui():
    """Assemble the Gradio Blocks interface for the PDF search demo.

    Creates one ``PDFSearchApp`` and wires its callbacks to an upload tab
    and a search tab. Returns the ``gr.Blocks`` object without launching it.
    """
    search_app = PDFSearchApp()

    title_md = "# Colpali Milvus Search Demo"
    intro_md = "This demo showcases how to use [Colpali](https://github.com/illuin-tech/colpali) embeddings with [Milvus](https://milvus.io/) for pdf search."

    with gr.Blocks() as ui:
        # Session state; the callbacks fill in "user_uuid" on first use.
        session_state = gr.State(value={"user_uuid": None})

        gr.Markdown(title_md)
        gr.Markdown(intro_md)

        # --- Upload tab ---
        with gr.Tab("Upload PDFs"), gr.Column():
            pdf_upload = gr.File(label="Upload PDFs")
            page_limit = gr.Slider(
                label="Max Pages", minimum=1, maximum=50, value=20, step=10
            )
            upload_status = gr.Textbox(label="Status", interactive=False)

        # --- Search tab ---
        with gr.Tab("Search"), gr.Column():
            query_box = gr.Textbox(label="Query")
            result_count = gr.Slider(
                label="Number of results", minimum=1, maximum=10, value=5, step=1
            )
            run_search = gr.Button("Search")
            answer_box = gr.Textbox(label="Answer", interactive=False)
            page_image = gr.Image(label="Pages used for RAG")

        # Indexing starts as soon as the file input changes.
        pdf_upload.change(
            fn=search_app.upload_and_convert,
            inputs=[session_state, pdf_upload, page_limit],
            outputs=[upload_status],
        )

        # Output order: image first, then the text message.
        run_search.click(
            fn=search_app.search_documents,
            inputs=[session_state, query_box, result_count],
            outputs=[page_image, answer_box],
        )

    return ui

# Script entry point: build the interface and launch it only when this file
# is executed directly, not when it is imported.
if __name__ == "__main__":
    # Keep the Blocks object bound to the module-level name `demo`.
    demo = create_ui()
    demo.launch()