Update app.py
app.py CHANGED
@@ -3,6 +3,7 @@ import os
 import PyPDF2
 import pandas as pd
 import openai
+import docx
 from langchain_community.embeddings import OpenAIEmbeddings
 from langchain_community.vectorstores import FAISS
 from langchain_community.llms import OpenAI
@@ -59,7 +60,11 @@ def get_text_from_pdf(pdf_files):
         with open(pdf_path, "rb") as pdf_file:
             reader = PyPDF2.PdfReader(pdf_file)
             for page in reader.pages:
-
+                page_text = page.extract_text()
+                if page_text:
+                    text += page_text + "\n"
+                else:
+                    text += "[Could not extract text from this page]\n"
     return text

 def get_text_from_csv(csv_files):
@@ -70,6 +75,15 @@ def get_text_from_csv(csv_files):
         text += df.to_string() + "\n"
     return text

+def get_text_from_docx(docx_files):
+    text = ""
+    for docx_path in docx_files:
+        print(f"Reading DOCX file: {docx_path}")  # Debugging log
+        doc = docx.Document(docx_path)
+        for para in doc.paragraphs:
+            text += para.text + "\n"
+    return text
+
 def create_vector_database(text):
     splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
     texts = splitter.split_text(text)
@@ -110,7 +124,7 @@ def chatbot_interface(question):
     folder_path = "/mnt/data/Data Analitics/"
     extracted_files = extract_files_from_folder(folder_path)

-    text = get_text_from_pdf(extracted_files["pdf"]) + read_text_from_files(extracted_files["txt"]) + get_text_from_csv(extracted_files["csv"])
+    text = get_text_from_pdf(extracted_files["pdf"]) + read_text_from_files(extracted_files["txt"]) + get_text_from_csv(extracted_files["csv"]) + get_text_from_docx(extracted_files["docx"])

     if not text:
         return "The folder does not contain valid PDF, TXT, CSV, or DOCX files. Please upload supported file types."
@@ -126,4 +140,4 @@ demo = gr.Interface(
     outputs=gr.Textbox(label="Answer")
 )

-demo.launch()
+demo.launch()
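For reference, below is a minimal, self-contained sketch of how the two extraction paths touched by this commit can be exercised outside the Gradio app: the None-guarded PyPDF2 page loop and the new python-docx paragraph reader. The function names (pdf_to_text, docx_to_text) and the file paths sample.pdf / sample.docx are placeholders for local testing, not part of the app; the sketch assumes PyPDF2 and python-docx are installed and mirrors, rather than imports, the code added above.

# Standalone sketch (assumed file names) of the extraction logic added in this commit.
import PyPDF2
import docx


def pdf_to_text(pdf_paths):
    text = ""
    for pdf_path in pdf_paths:
        with open(pdf_path, "rb") as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            for page in reader.pages:
                # extract_text() can yield "" (or None in older PyPDF2 releases)
                # for scanned/image-only pages, hence the guard.
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n"
                else:
                    text += "[Could not extract text from this page]\n"
    return text


def docx_to_text(docx_paths):
    text = ""
    for docx_path in docx_paths:
        doc = docx.Document(docx_path)  # python-docx Document object
        for para in doc.paragraphs:
            text += para.text + "\n"    # empty paragraphs contribute bare newlines
    return text


if __name__ == "__main__":
    # Placeholder paths; swap in real files when testing locally.
    print(pdf_to_text(["sample.pdf"])[:500])
    print(docx_to_text(["sample.docx"])[:500])

The guard on page.extract_text() matters because concatenating a None return value into a string raises a TypeError, which is presumably what the replaced line in get_text_from_pdf could hit on image-only pages.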