Update app.py
app.py
CHANGED
@@ -3,14 +3,11 @@ import os
 import PyPDF2
 import pandas as pd
 import openai
-import zipfile
-from io import BytesIO
 from langchain_community.embeddings import OpenAIEmbeddings
 from langchain_community.vectorstores import FAISS
 from langchain_community.llms import OpenAI
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 
-
 def detect_language(text):
     """Detects the language of the input text using OpenAI."""
     response = openai.ChatCompletion.create(
@@ -25,52 +22,42 @@ def detect_language(text):
 # Set up OpenAI API key (replace with your key)
 openai.api_key = "YOUR_OPENAI_API_KEY"
 
-def
-    """
+def extract_files_from_folder(folder_path):
+    """Scans a folder and its subfolders for PDF, TXT, and CSV files."""
     extracted_files = {"pdf": [], "txt": [], "csv": []}
 
-    for file_name in
-        elif file_name.endswith(".csv"):
-            extracted_files["csv"].append(BytesIO(content))
+    for root, _, files in os.walk(folder_path):
+        for file_name in files:
+            file_path = os.path.join(root, file_name)
+            if file_name.endswith(".pdf"):
+                extracted_files["pdf"].append(file_path)
+            elif file_name.endswith(".txt"):
+                extracted_files["txt"].append(file_path)
+            elif file_name.endswith(".csv"):
+                extracted_files["csv"].append(file_path)
     return extracted_files
 
-def
-    """
-    response = openai.ChatCompletion.create(
-        model="gpt-3.5-turbo",
-        messages=[
-            {"role": "system", "content": "Analyze this document and extract key points, links, and complementary information."},
-            {"role": "user", "content": text}
-        ]
-    )
-    return response["choices"][0]["message"]["content"].strip()
-
-def get_text_from_pdf(pdf_files):
+def read_text_from_files(file_paths):
+    """Reads text content from a list of files."""
     text = ""
-    for
-            text += page.extract_text() + "\n"
+    for file_path in file_paths:
+        with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
+            text += file.read() + "\n"
     return text
 
-def
+def get_text_from_pdf(pdf_files):
     text = ""
-    for
+    for pdf_path in pdf_files:
+        with open(pdf_path, "rb") as pdf_file:
+            reader = PyPDF2.PdfReader(pdf_file)
+            for page in reader.pages:
+                text += page.extract_text() + "\n"
     return text
 
 def get_text_from_csv(csv_files):
     text = ""
-    for
-        df = pd.read_csv(
+    for csv_path in csv_files:
+        df = pd.read_csv(csv_path)
         text += df.to_string() + "\n"
     return text
 
@@ -81,7 +68,18 @@ def create_vector_database(text):
     vector_db = FAISS.from_texts(texts, embeddings)
     return vector_db
 
-def get_answer(question, vector_db, analysis):
+def correct_exercises(text):
+    """Uses OpenAI to correct and complete exercises found in the documents."""
+    response = openai.ChatCompletion.create(
+        model="gpt-3.5-turbo",
+        messages=[
+            {"role": "system", "content": "Analyze the text and complete or correct any incomplete exercises."},
+            {"role": "user", "content": text}
+        ]
+    )
+    return response["choices"][0]["message"]["content"].strip()
+
+def get_answer(question, vector_db, corrected_exercises):
     retriever = vector_db.as_retriever()
     docs = retriever.get_relevant_documents(question)
 
@@ -93,33 +91,33 @@ def get_answer(question, vector_db, analysis):
     response = openai.ChatCompletion.create(
         model="gpt-3.5-turbo",
         messages=[
-            {"role": "system", "content": f"You are a Data Analytics assistant. Answer in {language}. Use the documents
-            {"role": "user", "content": question + "\n\nBased on the following document content:\n" + context + "\n\
+            {"role": "system", "content": f"You are a Data Analytics assistant. Answer in {language}. Use the documents to answer questions. Also, use the corrected exercises if relevant."},
+            {"role": "user", "content": question + "\n\nBased on the following document content:\n" + context + "\n\nCorrected Exercises:\n" + corrected_exercises}
         ]
     )
     return response["choices"][0]["message"]["content"]
 
-def chatbot_interface(
-    if not
-        return "Please
+def chatbot_interface(folder_path, question):
+    if not folder_path:
+        return "Please provide a folder path before asking a question."
 
-    extracted_files =
+    extracted_files = extract_files_from_folder(folder_path)
+
+    text = get_text_from_pdf(extracted_files["pdf"]) + read_text_from_files(extracted_files["txt"]) + get_text_from_csv(extracted_files["csv"])
 
     if not text:
-        return "The
+        return "The folder does not contain valid PDF, TXT, or CSV files. Please upload supported file types."
 
+    corrected_exercises = correct_exercises(text)
     vector_db = create_vector_database(text)
-    return get_answer(question, vector_db,
+    return get_answer(question, vector_db, corrected_exercises)
 
 # Gradio interface
 demo = gr.Interface(
     fn=chatbot_interface,
-    inputs=[gr.
+    inputs=[gr.Textbox(label="Folder Path", placeholder="Enter the path to the folder containing the documents"),
             gr.Textbox(label="Ask a question", placeholder="Type your question here...")],
     outputs=gr.Textbox(label="Answer")
 )
 
 demo.launch()
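
For orientation, a minimal usage sketch (not part of the commit) of the new folder-based flow, assuming the helper functions defined in app.py above are already in scope; the folder path "./sample_docs" is a hypothetical example:

    # Gather PDF/TXT/CSV paths from a folder tree and build the combined text,
    # mirroring the first half of chatbot_interface() before any OpenAI calls.
    folder = "./sample_docs"  # hypothetical example folder
    files = extract_files_from_folder(folder)
    print({kind: len(paths) for kind, paths in files.items()})

    text = (
        get_text_from_pdf(files["pdf"])
        + read_text_from_files(files["txt"])
        + get_text_from_csv(files["csv"])
    )
    print(f"Extracted {len(text)} characters of text")

The remaining steps in chatbot_interface (correct_exercises, create_vector_database, get_answer) additionally require a valid openai.api_key, since they call the OpenAI API.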