Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -50,65 +50,8 @@ def extract_files_from_folder(folder_path):
|
|
50 |
print("Files found:", extracted_files) # Debugging log
|
51 |
return extracted_files
|
52 |
|
53 |
-
def
|
54 |
-
"""
|
55 |
-
text = ""
|
56 |
-
for file_path in file_paths:
|
57 |
-
print(f"Reading text file: {file_path}") # Debugging log
|
58 |
-
with open(file_path, "r", encoding="utf-8", errors="ignore") as file:
|
59 |
-
file_text = file.read()
|
60 |
-
text += file_text + "\n"
|
61 |
-
print("Extracted text from TXT files:", text[:500]) # Debugging log (First 500 chars)
|
62 |
-
return text
|
63 |
-
|
64 |
-
def get_text_from_pdf(pdf_files):
    """Concatenate the extractable text of every page in the given PDF files.

    Args:
        pdf_files: Iterable of paths to PDF files; they are read in order.

    Returns:
        One string in which each page's text (when ``extract_text()`` returned
        something truthy) is followed by a newline. Empty string when no
        files are given.
    """
    page_chunks = []
    for pdf_path in pdf_files:
        print(f"Reading PDF file: {pdf_path}")  # Debugging log
        with open(pdf_path, "rb") as pdf_file:
            for page in PyPDF2.PdfReader(pdf_file).pages:
                page_text = page.extract_text()
                # Skip pages for which extract_text() returned nothing.
                if page_text:
                    page_chunks.append(page_text + "\n")
    text = "".join(page_chunks)
    print("Extracted text from PDF files:", text[:500])  # Debugging log (First 500 chars)
    return text
|
76 |
-
|
77 |
-
def get_text_from_csv(csv_files):
    """Render every given CSV file as text and concatenate the results.

    Each file is loaded with ``pd.read_csv`` and rendered with
    ``DataFrame.to_string()``, followed by a newline.

    Args:
        csv_files: Iterable of paths to CSV files; they are read in order.

    Returns:
        The concatenated text of all readable CSV files. Empty string when no
        files are given or every file is empty.
    """
    tables = []
    for csv_path in csv_files:
        print(f"Reading CSV file: {csv_path}")  # Debugging log
        try:
            df = pd.read_csv(csv_path)
        except pd.errors.EmptyDataError:
            # A CSV with no columns to parse has nothing to contribute; skip
            # it rather than aborting extraction for all the other files.
            print(f"Skipping empty CSV file: {csv_path}")  # Debugging log
            continue
        tables.append(df.to_string() + "\n")
    # Join once instead of growing a string with repeated "+=".
    text = "".join(tables)
    print("Extracted text from CSV files:", text[:500])  # Debugging log (First 500 chars)
    return text
|
85 |
-
|
86 |
-
def get_text_from_docx(docx_files):
    """Concatenate the paragraph text of every given DOCX file.

    Args:
        docx_files: Iterable of paths to DOCX files; they are read in order.

    Returns:
        One string in which each paragraph's text is followed by a newline.
        Empty string when no files are given.
    """
    paragraph_lines = []
    for docx_path in docx_files:
        print(f"Reading DOCX file: {docx_path}")  # Debugging log
        paragraph_lines.extend(
            para.text + "\n" for para in Document(docx_path).paragraphs
        )
    text = "".join(paragraph_lines)
    print("Extracted text from DOCX files:", text[:500])  # Debugging log (First 500 chars)
    return text
|
95 |
-
|
96 |
-
def get_text_from_ipynb(ipynb_files):
    """Concatenate the markdown-cell text of every given Jupyter notebook.

    Only cells whose ``cell_type`` is ``"markdown"`` are used; other cells
    are ignored.

    Args:
        ipynb_files: Iterable of paths to ``.ipynb`` files (JSON, UTF-8);
            they are read in order.

    Returns:
        One string in which each markdown cell's source is followed by a
        newline. Empty string when no files are given.
    """
    markdown_chunks = []
    for ipynb_path in ipynb_files:
        print(f"Reading IPYNB file: {ipynb_path}")  # Debugging log
        with open(ipynb_path, "r", encoding="utf-8") as file:
            notebook = json.load(file)
        for cell in notebook.get("cells", []):
            if cell.get("cell_type") != "markdown":
                continue
            source = cell.get("source", [])
            # A cell's source may be a single string or a list of lines that
            # already carry their own trailing "\n". Joining with "\n" would
            # double the line breaks for a list and insert a newline between
            # every character of a plain string, so join with "" instead.
            if not isinstance(source, str):
                source = "".join(source)
            markdown_chunks.append(source + "\n")
    text = "".join(markdown_chunks)
    print("Extracted text from IPYNB files:", text[:500])  # Debugging log (First 500 chars)
    return text
|
107 |
-
|
108 |
-
def chatbot_interface(question):
|
109 |
-
folder_path = "New_Data_Analytics/"
|
110 |
-
extracted_files = extract_files_from_folder(folder_path)
|
111 |
-
|
112 |
text = (
|
113 |
get_text_from_pdf(extracted_files["pdf"]) +
|
114 |
read_text_from_files(extracted_files["txt"]) +
|
@@ -116,13 +59,31 @@ def chatbot_interface(question):
|
|
116 |
get_text_from_docx(extracted_files["docx"]) +
|
117 |
get_text_from_ipynb(extracted_files["ipynb"])
|
118 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
119 |
|
120 |
print("Final extracted text for chatbot processing:", text[:500]) # Debugging log (First 500 chars)
|
121 |
|
122 |
if not text.strip():
|
123 |
return "The folder does not contain valid PDF, TXT, CSV, DOCX, or IPYNB files. Please upload supported file types."
|
124 |
|
125 |
-
return
|
126 |
|
127 |
# Gradio interface
|
128 |
demo = gr.Interface(
|
@@ -131,4 +92,4 @@ demo = gr.Interface(
|
|
131 |
outputs=gr.Textbox(label="Answer")
|
132 |
)
|
133 |
|
134 |
-
demo.launch()
|
|
|
50 |
print("Files found:", extracted_files) # Debugging log
|
51 |
return extracted_files
|
52 |
|
53 |
+
def combine_text_from_files(extracted_files):
|
54 |
+
"""Combines text from all extracted files."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
text = (
|
56 |
get_text_from_pdf(extracted_files["pdf"]) +
|
57 |
read_text_from_files(extracted_files["txt"]) +
|
|
|
59 |
get_text_from_docx(extracted_files["docx"]) +
|
60 |
get_text_from_ipynb(extracted_files["ipynb"])
|
61 |
)
|
62 |
+
return text
|
63 |
+
|
64 |
+
def generate_response(question, text):
    """Uses OpenAI to answer a question based on extracted text.

    Args:
        question: The user's question; placed at the start of the user message.
        text: Extracted document content; only its first 3000 characters are
            sent to the model.

    Returns:
        The content of the first choice in the model's response, stripped of
        leading and trailing whitespace.
    """
    system_message = {
        "role": "system",
        "content": "You are a data analytics assistant. Answer the question based on the provided document content.",
    }
    # Limit to 3000 characters to avoid excessive token usage
    user_message = {
        "role": "user",
        "content": f"{question}\n\nBased on the following document content:\n{text[:3000]}",
    }
    # NOTE(review): `openai.ChatCompletion` looks like the pre-1.0 SDK
    # interface; confirm the openai version this app is pinned to.
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[system_message, user_message],
    )
    answer = response["choices"][0]["message"]["content"]
    return answer.strip()
|
74 |
+
|
75 |
+
def chatbot_interface(question):
    """Answer a question from the documents in the ``New_Data_Analytics/`` folder.

    Args:
        question: The question to pass on to ``generate_response``.

    Returns:
        The generated answer, or a fixed explanatory message when the combined
        extracted text is empty or whitespace-only.
    """
    extracted_files = extract_files_from_folder("New_Data_Analytics/")
    text = combine_text_from_files(extracted_files)

    print("Final extracted text for chatbot processing:", text[:500])  # Debugging log (First 500 chars)

    if text.strip():
        return generate_response(question, text)

    # Nothing usable was extracted, so tell the user which file types are supported.
    return "The folder does not contain valid PDF, TXT, CSV, DOCX, or IPYNB files. Please upload supported file types."
|
87 |
|
88 |
# Gradio interface
|
89 |
demo = gr.Interface(
|
|
|
92 |
outputs=gr.Textbox(label="Answer")
|
93 |
)
|
94 |
|
95 |
+
demo.launch()
|