Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
-
|
2 |
# app.py
|
3 |
import streamlit as st
|
4 |
import os
|
5 |
import shutil
|
|
|
6 |
from io import BytesIO
|
7 |
from PyPDF2 import PdfReader
|
8 |
import pandas as pd
|
@@ -57,9 +57,12 @@ with st.sidebar:
|
|
57 |
input_data = st.file_uploader("Upload a PDF, TXT, XLS/XLSX, or DOC/DOCX file", type=["pdf", "txt", "xls", "xlsx", "doc", "docx"])
|
58 |
|
59 |
if st.button("Process File") and input_data is not None:
|
60 |
-
|
61 |
-
|
62 |
-
|
|
|
|
|
|
|
63 |
|
64 |
# Display chat history
|
65 |
st.subheader("Chat History")
|
@@ -133,8 +136,13 @@ def main():
|
|
133 |
st.write("**Answer:**", answer)
|
134 |
|
135 |
def process_input(input_data):
|
136 |
-
# Create uploads directory
|
137 |
-
|
|
|
|
|
|
|
|
|
|
|
138 |
|
139 |
# Initialize progress bar and status
|
140 |
progress_bar = st.progress(0)
|
@@ -143,41 +151,53 @@ def process_input(input_data):
|
|
143 |
documents = ""
|
144 |
file_name = input_data.name.lower()
|
145 |
|
146 |
-
# Step 1:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
147 |
status.update(label="Reading file...")
|
148 |
-
progress_bar.progress(0.
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
|
|
|
|
|
|
|
|
165 |
status.update(label="Splitting text into chunks...")
|
166 |
-
progress_bar.progress(0.
|
167 |
|
168 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
|
169 |
texts = text_splitter.split_text(documents)
|
170 |
|
171 |
-
# Step
|
172 |
status.update(label="Creating embeddings...")
|
173 |
-
progress_bar.progress(0.
|
174 |
|
175 |
hf_embeddings = HuggingFaceEmbeddings(
|
176 |
model_name="sentence-transformers/all-mpnet-base-v2",
|
177 |
model_kwargs={'device': 'cpu'}
|
178 |
)
|
179 |
|
180 |
-
# Step
|
181 |
status.update(label="Building vector store...")
|
182 |
progress_bar.progress(0.90)
|
183 |
|
|
|
|
|
1 |
# app.py
|
2 |
import streamlit as st
|
3 |
import os
|
4 |
import shutil
|
5 |
+
import tempfile
|
6 |
from io import BytesIO
|
7 |
from PyPDF2 import PdfReader
|
8 |
import pandas as pd
|
|
|
57 |
input_data = st.file_uploader("Upload a PDF, TXT, XLS/XLSX, or DOC/DOCX file", type=["pdf", "txt", "xls", "xlsx", "doc", "docx"])
|
58 |
|
59 |
if st.button("Process File") and input_data is not None:
|
60 |
+
try:
|
61 |
+
vector_store = process_input(input_data)
|
62 |
+
st.session_state.vectorstore = vector_store
|
63 |
+
st.success("File processed successfully. You can now ask questions.")
|
64 |
+
except (PermissionError, OSError) as e:
|
65 |
+
st.error(f"Error processing file: {str(e)}. Check file permissions or server configuration.")
|
66 |
|
67 |
# Display chat history
|
68 |
st.subheader("Chat History")
|
|
|
136 |
st.write("**Answer:**", answer)
|
137 |
|
138 |
def process_input(input_data):
|
139 |
+
# Create uploads directory with proper permissions
|
140 |
+
try:
|
141 |
+
os.makedirs("uploads", exist_ok=True)
|
142 |
+
os.chmod("uploads", 0o777) # Ensure write permissions
|
143 |
+
except PermissionError as e:
|
144 |
+
st.error(f"Failed to create uploads directory: {str(e)}")
|
145 |
+
raise
|
146 |
|
147 |
# Initialize progress bar and status
|
148 |
progress_bar = st.progress(0)
|
|
|
151 |
documents = ""
|
152 |
file_name = input_data.name.lower()
|
153 |
|
154 |
+
# Step 1: Save file temporarily
|
155 |
+
status.update(label="Saving file...")
|
156 |
+
progress_bar.progress(0.20)
|
157 |
+
|
158 |
+
with tempfile.NamedTemporaryFile(delete=False, dir="uploads", suffix=file_name) as tmp_file:
|
159 |
+
tmp_file.write(input_data.read())
|
160 |
+
tmp_file_path = tmp_file.name
|
161 |
+
|
162 |
+
# Step 2: Read file
|
163 |
status.update(label="Reading file...")
|
164 |
+
progress_bar.progress(0.40)
|
165 |
+
|
166 |
+
try:
|
167 |
+
if file_name.endswith(".pdf"):
|
168 |
+
pdf_reader = PdfReader(tmp_file_path)
|
169 |
+
for page in pdf_reader.pages:
|
170 |
+
documents += page.extract_text() or ""
|
171 |
+
elif file_name.endswith(".txt"):
|
172 |
+
with open(tmp_file_path, "r", encoding="utf-8") as f:
|
173 |
+
documents = f.read()
|
174 |
+
elif file_name.endswith((".xls", ".xlsx")):
|
175 |
+
df = pd.read_excel(tmp_file_path)
|
176 |
+
documents = " ".join(df.astype(str).values.flatten())
|
177 |
+
elif file_name.endswith((".doc", ".docx")):
|
178 |
+
doc = Document(tmp_file_path)
|
179 |
+
for para in doc.paragraphs:
|
180 |
+
documents += para.text + "\n"
|
181 |
+
finally:
|
182 |
+
os.remove(tmp_file_path) # Clean up temporary file
|
183 |
+
|
184 |
+
# Step 3: Split text
|
185 |
status.update(label="Splitting text into chunks...")
|
186 |
+
progress_bar.progress(0.60)
|
187 |
|
188 |
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
|
189 |
texts = text_splitter.split_text(documents)
|
190 |
|
191 |
+
# Step 4: Create embeddings
|
192 |
status.update(label="Creating embeddings...")
|
193 |
+
progress_bar.progress(0.80)
|
194 |
|
195 |
hf_embeddings = HuggingFaceEmbeddings(
|
196 |
model_name="sentence-transformers/all-mpnet-base-v2",
|
197 |
model_kwargs={'device': 'cpu'}
|
198 |
)
|
199 |
|
200 |
+
# Step 5: Initialize FAISS vector store
|
201 |
status.update(label="Building vector store...")
|
202 |
progress_bar.progress(0.90)
|
203 |
|