samim2024 committed
Commit 875ad97 · verified · Parent: 1676c9d

Update app.py

Files changed (1)
app.py +48 -28
app.py CHANGED
@@ -1,8 +1,8 @@
-
 # app.py
 import streamlit as st
 import os
 import shutil
+import tempfile
 from io import BytesIO
 from PyPDF2 import PdfReader
 import pandas as pd
@@ -57,9 +57,12 @@ with st.sidebar:
     input_data = st.file_uploader("Upload a PDF, TXT, XLS/XLSX, or DOC/DOCX file", type=["pdf", "txt", "xls", "xlsx", "doc", "docx"])
 
     if st.button("Process File") and input_data is not None:
-        vector_store = process_input(input_data)
-        st.session_state.vectorstore = vector_store
-        st.success("File processed successfully. You can now ask questions.")
+        try:
+            vector_store = process_input(input_data)
+            st.session_state.vectorstore = vector_store
+            st.success("File processed successfully. You can now ask questions.")
+        except (PermissionError, OSError) as e:
+            st.error(f"Error processing file: {str(e)}. Check file permissions or server configuration.")
 
 # Display chat history
 st.subheader("Chat History")
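
Note on the hunk above: saving the index in st.session_state.vectorstore is what lets later Streamlit reruns answer questions without reprocessing the file. A minimal sketch of that downstream use, assuming the store is a LangChain FAISS index (the query string and k value here are illustrative, not part of this commit):

    # Sketch only: query the index saved by the sidebar handler on a later rerun.
    if "vectorstore" in st.session_state:
        docs = st.session_state.vectorstore.similarity_search("example question", k=4)
        context = "\n".join(d.page_content for d in docs)
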
@@ -133,8 +136,13 @@ def main():
         st.write("**Answer:**", answer)
 
 def process_input(input_data):
-    # Create uploads directory
-    os.makedirs("uploads", exist_ok=True)
+    # Create uploads directory with proper permissions
+    try:
+        os.makedirs("uploads", exist_ok=True)
+        os.chmod("uploads", 0o777)  # Ensure write permissions
+    except PermissionError as e:
+        st.error(f"Failed to create uploads directory: {str(e)}")
+        raise
 
     # Initialize progress bar and status
     progress_bar = st.progress(0)
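
The chmod("uploads", 0o777) call above makes the directory world-writable, which sidesteps permission errors on a hosted Space but is broader than the app strictly needs. An alternative sketch (not what this commit does) that avoids changing modes by letting the standard library create a per-session scratch directory:

    # Alternative sketch: mkdtemp returns a directory already writable by this process.
    import tempfile
    import streamlit as st

    if "upload_dir" not in st.session_state:
        st.session_state.upload_dir = tempfile.mkdtemp(prefix="uploads_")
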
@@ -143,41 +151,53 @@ def process_input(input_data):
     documents = ""
     file_name = input_data.name.lower()
 
-    # Step 1: Read file
+    # Step 1: Save file temporarily
+    status.update(label="Saving file...")
+    progress_bar.progress(0.20)
+
+    with tempfile.NamedTemporaryFile(delete=False, dir="uploads", suffix=file_name) as tmp_file:
+        tmp_file.write(input_data.read())
+        tmp_file_path = tmp_file.name
+
+    # Step 2: Read file
     status.update(label="Reading file...")
-    progress_bar.progress(0.25)
-
-    if file_name.endswith(".pdf"):
-        pdf_reader = PdfReader(input_data)
-        for page in pdf_reader.pages:
-            documents += page.extract_text() or ""
-    elif file_name.endswith(".txt"):
-        documents = input_data.read().decode("utf-8")
-    elif file_name.endswith((".xls", ".xlsx")):
-        df = pd.read_excel(input_data)
-        documents = " ".join(df.astype(str).values.flatten())
-    elif file_name.endswith((".doc", ".docx")):
-        doc = Document(input_data)
-        for para in doc.paragraphs:
-            documents += para.text + "\n"
-
-    # Step 2: Split text
+    progress_bar.progress(0.40)
+
+    try:
+        if file_name.endswith(".pdf"):
+            pdf_reader = PdfReader(tmp_file_path)
+            for page in pdf_reader.pages:
+                documents += page.extract_text() or ""
+        elif file_name.endswith(".txt"):
+            with open(tmp_file_path, "r", encoding="utf-8") as f:
+                documents = f.read()
+        elif file_name.endswith((".xls", ".xlsx")):
+            df = pd.read_excel(tmp_file_path)
+            documents = " ".join(df.astype(str).values.flatten())
+        elif file_name.endswith((".doc", ".docx")):
+            doc = Document(tmp_file_path)
+            for para in doc.paragraphs:
+                documents += para.text + "\n"
+    finally:
+        os.remove(tmp_file_path)  # Clean up temporary file
+
+    # Step 3: Split text
     status.update(label="Splitting text into chunks...")
-    progress_bar.progress(0.50)
+    progress_bar.progress(0.60)
 
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    texts = text_splitter.split_text(documents)
 
-    # Step 3: Create embeddings
+    # Step 4: Create embeddings
     status.update(label="Creating embeddings...")
-    progress_bar.progress(0.75)
+    progress_bar.progress(0.80)
 
     hf_embeddings = HuggingFaceEmbeddings(
         model_name="sentence-transformers/all-mpnet-base-v2",
         model_kwargs={'device': 'cpu'}
     )
 
-    # Step 4: Initialize FAISS vector store
+    # Step 5: Initialize FAISS vector store
     status.update(label="Building vector store...")
     progress_bar.progress(0.90)
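
Both sides of the last hunk stop at the 0.90 progress mark, just before the index itself is built. A minimal sketch of the presumed tail of process_input, assuming the LangChain FAISS wrapper (the import path and the final status/progress calls are assumptions, since that part of the file is outside this diff); the resulting vector_store is then returned and stored in st.session_state by the sidebar handler:

    from langchain_community.vectorstores import FAISS  # path assumed; older versions use langchain.vectorstores

    # Presumed continuation inside process_input: build the index from the chunks
    # using the embeddings created above, then finish the progress indicators.
    vector_store = FAISS.from_texts(texts, hf_embeddings)
    progress_bar.progress(1.0)
    status.update(label="Done.", state="complete")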