Update app.py
Browse files
app.py
CHANGED
@@ -1,65 +1,177 @@
|
|
1 |
-
import gradio as gr
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import os
|
3 |
+
import sqlite3
|
4 |
+
import numpy as np
|
5 |
+
from huggingface_hub import InferenceClient
|
6 |
+
from sentence_transformers import SentenceTransformer, util
|
7 |
+
|
8 |
+
|
9 |
+
# Initialize the sentence-embedding model and the Hugging Face InferenceClient.
model = SentenceTransformer("xmanii/maux-gte-persian", trust_remote_code=True)

# Read the API token from the environment.
# BUG FIX: the original printed the raw token, leaking the secret into logs;
# only report whether it is present.
huggingface_token = os.getenv('RAG')
if not huggingface_token:
    print("Warning: 'RAG' environment variable is not set; inference calls will fail.")

client = InferenceClient(api_key=huggingface_token)
|
16 |
+
|
17 |
+
|
18 |
+
# --- SQLite persistence layer --------------------------------------------
# `files` holds one row per uploaded document (its chunks joined with '|');
# `embeddings` holds one row per chunk with the serialized embedding vector.
DB_NAME = 'files.db'

_FILES_DDL = '''
CREATE TABLE IF NOT EXISTS files (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    filename TEXT NOT NULL,
    path TEXT NOT NULL,
    chunks TEXT NOT NULL
)
'''

_EMBEDDINGS_DDL = '''
CREATE TABLE IF NOT EXISTS embeddings (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    file_id INTEGER NOT NULL,
    chunk_index INTEGER NOT NULL,
    embedding BLOB NOT NULL,
    FOREIGN KEY (file_id) REFERENCES files(id)
)
'''

# check_same_thread=False lets Gradio's worker threads share this connection.
conn = sqlite3.connect(DB_NAME, check_same_thread=False)
cursor = conn.cursor()
cursor.execute(_FILES_DDL)
cursor.execute(_EMBEDDINGS_DDL)
conn.commit()

# Directory where raw uploads are persisted on disk.
UPLOAD_FOLDER = 'uploads'
os.makedirs(UPLOAD_FOLDER, exist_ok=True)
|
47 |
+
|
48 |
+
def read_file(file_path):
    """Return the UTF-8 text content of *file_path*, or None if it cannot be read.

    Returning None (rather than raising) matches how callers in this file
    turn an unreadable upload into a user-facing error message.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()
    except (OSError, UnicodeDecodeError):
        # Missing file, permission problem, or non-UTF-8 content.
        # Narrowed from a blanket `except Exception` that hid real bugs.
        return None
|
56 |
+
|
57 |
+
def chunk_text(text, chunk_size=1000, overlap_size=100):
    """Split *text* into overlapping chunks.

    Each chunk is at most chunk_size characters long and consecutive chunks
    share overlap_size characters, so content at a boundary is not lost.

    Args:
        text: the string to split.
        chunk_size: maximum characters per chunk (must be > 0).
        overlap_size: characters shared between consecutive chunks
            (must satisfy 0 <= overlap_size < chunk_size).

    Returns:
        A list of chunk strings; empty list for empty text.

    Raises:
        ValueError: if the sizes would make the loop step non-positive
            (the original code spun forever in that case).
    """
    if chunk_size <= 0 or overlap_size < 0 or overlap_size >= chunk_size:
        raise ValueError("require chunk_size > 0 and 0 <= overlap_size < chunk_size")
    step = chunk_size - overlap_size
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]
|
65 |
+
|
66 |
+
def calculate_similarity(embedding1, embedding2):
    """Return the cosine similarity of two embeddings as a plain Python float."""
    return util.cos_sim(embedding1, embedding2).item()
|
69 |
+
|
70 |
+
def store_embedding(file_id, chunk_index, embedding):
    """Persist one chunk's serialized embedding, keyed by its parent file row."""
    sql = "INSERT INTO embeddings (file_id, chunk_index, embedding) VALUES (?, ?, ?)"
    params = (file_id, chunk_index, embedding)
    cursor.execute(sql, params)
    conn.commit()
|
74 |
+
|
75 |
+
def get_embeddings(file_id):
    """Return [(chunk_index, embedding_blob), ...] for every chunk of *file_id*."""
    query = "SELECT chunk_index, embedding FROM embeddings WHERE file_id = ?"
    result = cursor.execute(query, (file_id,))
    return result.fetchall()
|
78 |
+
|
79 |
+
def upload_file(file):
    """Persist an uploaded file, chunk it, embed each chunk, and index it.

    Args:
        file: the object Gradio passes for a gr.File input; assumed to expose
            .name (a path) and .read() (raw bytes) — TODO confirm against the
            installed Gradio version's file API.

    Returns:
        (status_message, file_id) on success, or (error_message, None).
    """
    if file is None:
        return "No file selected.", None

    # BUG FIX: file.name can be an absolute temp path, in which case
    # os.path.join(UPLOAD_FOLDER, file.name) discards UPLOAD_FOLDER entirely
    # and writes outside the uploads directory. Keep only the base name.
    filename = os.path.basename(file.name)
    file_path = os.path.join(UPLOAD_FOLDER, filename)
    with open(file_path, 'wb') as f:
        f.write(file.read())

    text = read_file(file_path)
    if text is None:
        return f"Could not read file at path: {file_path}", None

    chunks = chunk_text(text)
    # NOTE(review): '|' is also the separator `chat` uses when re-splitting the
    # stored chunks, so a literal '|' in the document corrupts chunk boundaries
    # — consider a dedicated chunks table or JSON encoding instead.
    chunks_str = '|'.join(chunks)

    # Insert file metadata and remember the new row id for the embeddings.
    cursor.execute("INSERT INTO files (filename, path, chunks) VALUES (?, ?, ?)",
                   (filename, file_path, chunks_str))
    file_id = cursor.lastrowid  # id of the row just inserted
    conn.commit()

    print("Calculate and store embeddings")
    for index, chunk in enumerate(chunks):
        # Serialize the float vector so it can live in the BLOB column.
        embedding = model.encode(chunk).tobytes()
        store_embedding(file_id, index, embedding)

    return f"File '{filename}' uploaded and processed successfully.", file_id
|
106 |
+
|
107 |
+
def chat(input_sentence, file_id):
    """Answer *input_sentence* using the most relevant chunks of file *file_id*.

    Ranks the file's stored chunk embeddings by cosine similarity to the
    question, builds a Persian RAG prompt from the top 5 chunks, and queries
    the inference endpoint.

    Returns:
        A user-facing string: either the model's answer or an error message.
    """
    if not input_sentence:
        return "User input is required."

    # Make sure the referenced file actually exists in the database.
    cursor.execute("SELECT id FROM files WHERE id = ?", (file_id,))
    if cursor.fetchone() is None:
        return "No uploaded files found. Please upload a file."

    embeddings = get_embeddings(file_id)
    if not embeddings:
        return "No embeddings found for the uploaded file."

    # Encode the user input for comparison against the stored chunks.
    input_embedding = model.encode(input_sentence)

    # Score every stored chunk against the question.
    similarities = []
    for index, embedding in embeddings:
        # Embeddings were serialized with .tobytes(); decode as float32,
        # the dtype SentenceTransformer.encode produces by default.
        stored_embedding = np.frombuffer(embedding, dtype=np.float32)
        similarity = calculate_similarity(input_embedding, stored_embedding)
        similarities.append(similarity)

    sorted_indices = sorted(range(len(similarities)), key=lambda i: similarities[i], reverse=True)

    # Rebuild the chunk list and keep the 5 most relevant chunks as context.
    cursor.execute("SELECT chunks FROM files WHERE id = ?", (file_id,))
    chunks = cursor.fetchone()[0].split('|')
    answercontext = "".join(chunks[i] for i in sorted_indices[:5])

    system_prompt = (
        "فقط بر اساس متن زیر به پرسش کاربر پاسخ بده و اگر پاسخ در متن نبود بنویس نمیدانم.\n\n"
        "متن:\n" + answercontext)

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": input_sentence}
    ]

    # BUG FIX: with stream=False chat_completion returns a single
    # ChatCompletionOutput, not an iterable of streaming chunks, so the
    # original loop over it reading .delta.content was broken. Read the
    # complete message content directly instead.
    # NOTE(review): no `model=` is passed here, so the InferenceClient must
    # have a default model configured — confirm against deployment config.
    completion = client.chat_completion(
        messages,
        temperature=0.5,
        max_tokens=4096,
        top_p=0.7,
        stream=False
    )
    response = completion.choices[0].message.content

    return response
|
157 |
+
|
158 |
+
# Gradio Interface: an upload row on top, a question/answer area below.
with gr.Blocks() as demo:
    gr.Markdown("## File Upload and Chat Interface")

    with gr.Row():
        file_picker = gr.File(label="Upload a file")
        upload_btn = gr.Button("Upload File")

    upload_status = gr.Textbox(label="Upload Status", interactive=False)
    # Carries the database id of the last uploaded file between callbacks.
    active_file_id = gr.State()

    upload_btn.click(upload_file, inputs=file_picker, outputs=[upload_status, active_file_id])

    question_box = gr.Textbox(label="Your Question")
    ask_btn = gr.Button("Ask")
    answer_box = gr.Textbox(label="Response", interactive=False)

    ask_btn.click(chat, inputs=[question_box, active_file_id], outputs=answer_box)

demo.launch()
|