Update app.py
Browse files
app.py
CHANGED
@@ -3,12 +3,23 @@ import gradio as gr
|
|
3 |
from langchain.vectorstores import FAISS
|
4 |
from langchain.embeddings import HuggingFaceEmbeddings
|
5 |
from langchain.schema import Document
|
|
|
|
|
6 |
|
7 |
-
|
|
|
|
|
|
|
|
|
8 |
def initialize_system():
|
9 |
-
|
10 |
-
|
11 |
-
|
|
|
|
|
|
|
|
|
|
|
12 |
# Create documents
|
13 |
documents = [
|
14 |
Document(
|
@@ -16,30 +27,35 @@ def initialize_system():
|
|
16 |
metadata={"question": row['Question'], "answer": row['Answer']}
|
17 |
) for _, row in data.iterrows()
|
18 |
]
|
19 |
-
|
20 |
-
#
|
21 |
-
embeddings = HuggingFaceEmbeddings(
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
|
|
|
|
|
|
|
|
|
|
26 |
|
27 |
vector_store = initialize_system()
|
28 |
|
29 |
def classify_question(query: str, k: int = 3):
|
30 |
-
# Retrieve similar Q&A pairs
|
31 |
results = vector_store.similarity_search(query, k=k)
|
32 |
-
|
33 |
-
|
|
|
|
|
34 |
answers = " ".join([doc.metadata['answer'] for doc in results])
|
35 |
-
keywords = list(dict.fromkeys(answers.split()))[:5]
|
36 |
category = " ".join(keywords)
|
37 |
-
|
38 |
-
# Format output
|
39 |
return {
|
40 |
"Category": category,
|
41 |
"Top Matches": "\n\n".join([f"Q: {doc.metadata['question']}\nA: {doc.metadata['answer']}"
|
42 |
-
|
43 |
"Confidence": f"{len(results)/k:.0%}"
|
44 |
}
|
45 |
|
@@ -53,8 +69,8 @@ interface = gr.Interface(
|
|
53 |
gr.Textbox(label="Confidence")
|
54 |
],
|
55 |
title="Question Classification System",
|
56 |
-
description="Classify questions based on existing Q&A pairs using
|
57 |
)
|
58 |
|
59 |
if __name__ == "__main__":
|
60 |
-
interface.launch()
|
|
|
3 |
from langchain.vectorstores import FAISS
|
4 |
from langchain.embeddings import HuggingFaceEmbeddings
|
5 |
from langchain.schema import Document
|
6 |
+
import os
|
7 |
+
import pickle
|
8 |
|
9 |
+
# Sentence-transformers model tuned for question/answer retrieval.
EMBEDDING_MODEL = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
# Source CSV; rows must provide 'Question' and 'Answer' columns (see initialize_system).
DATASET_PATH = "qa_dataset.csv"
# On-disk cache location for the built FAISS index.
FAISS_INDEX_PATH = "faiss_index"
|
12 |
+
|
13 |
+
# Initialize system (load dataset and build or restore the FAISS index)
def initialize_system():
    """Return a FAISS vector store over the Q&A dataset.

    Restores a previously saved index from ``FAISS_INDEX_PATH`` when one
    exists; otherwise builds the index from ``DATASET_PATH`` and saves it
    for the next run.

    Returns:
        FAISS: a LangChain FAISS vector store whose documents carry the
        original question and answer in their metadata.
    """
    # The embedding model is needed both to build a fresh index and to
    # restore a saved one, so load it once up front.
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

    if os.path.exists(FAISS_INDEX_PATH):
        print("Loading FAISS index from cache...")
        # Use FAISS's native (de)serialization rather than pickling the
        # whole vector-store object: pickle is fragile across
        # faiss/langchain versions and executes arbitrary code if the
        # cache file has been tampered with.
        return FAISS.load_local(FAISS_INDEX_PATH, embeddings)

    print("Initializing FAISS from scratch...")
    data = pd.read_csv(DATASET_PATH).dropna().head(500)  # Limit rows for speed

    # One Document per Q&A row; the answer rides along in metadata so it
    # can be shown next to retrieved questions.
    documents = [
        Document(
            page_content=row['Question'],
            metadata={"question": row['Question'], "answer": row['Answer']}
        ) for _, row in data.iterrows()
    ]

    # Create FAISS vector store
    vector_store = FAISS.from_documents(documents, embeddings)

    # Cache the index in FAISS's own on-disk format for fast restarts.
    vector_store.save_local(FAISS_INDEX_PATH)

    return vector_store
|
42 |
|
43 |
# Build (or restore) the shared vector store once at import time so every
# request served by the Gradio interface reuses the same index.
vector_store = initialize_system()
|
44 |
|
45 |
def classify_question(query: str, k: int = 3):
|
|
|
46 |
results = vector_store.similarity_search(query, k=k)
|
47 |
+
|
48 |
+
if not results:
|
49 |
+
return {"Category": "Unknown", "Top Matches": "No matches found", "Confidence": "0%"}
|
50 |
+
|
51 |
answers = " ".join([doc.metadata['answer'] for doc in results])
|
52 |
+
keywords = list(dict.fromkeys(answers.split()))[:5] # Extract first 5 unique words
|
53 |
category = " ".join(keywords)
|
54 |
+
|
|
|
55 |
return {
|
56 |
"Category": category,
|
57 |
"Top Matches": "\n\n".join([f"Q: {doc.metadata['question']}\nA: {doc.metadata['answer']}"
|
58 |
+
for doc in results]),
|
59 |
"Confidence": f"{len(results)/k:.0%}"
|
60 |
}
|
61 |
|
|
|
69 |
gr.Textbox(label="Confidence")
|
70 |
],
|
71 |
title="Question Classification System",
|
72 |
+
description="Classify questions based on existing Q&A pairs using FAISS"
|
73 |
)
|
74 |
|
75 |
# Script entry point: start the Gradio server. share=True also requests a
# temporary public gradio.live URL in addition to the local one.
if __name__ == "__main__":
    interface.launch(share=True)
|