Tesneem committed on
Commit
dc82c6a
·
verified ·
1 Parent(s): 13693fd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -39
app.py CHANGED
@@ -1,19 +1,17 @@
1
  import os
2
  import streamlit as st
3
  import tempfile
 
4
  from datetime import datetime
5
  from pathlib import Path
6
- from pymongo import MongoClient
7
- from urllib.parse import quote_plus
8
  from document_chunker import DocumentChunker
9
- import time
10
 
11
  # === MongoDB connection via Hugging Face secrets ===
12
  user = quote_plus(os.getenv("MONGO_USER"))
13
  password = quote_plus(os.getenv("MONGO_PASS"))
14
  cluster = os.getenv("MONGO_CLUSTER")
15
  db_name = os.environ.get("MONGO_DB", "grant_docs")
16
-
17
  mongo_uri = f"mongodb+srv://{user}:{password}@{cluster}/{db_name}?retryWrites=true&w=majority&tls=true&tlsAllowInvalidCertificates=true"
18
  client = MongoClient(mongo_uri, tls=True, tlsAllowInvalidCertificates=True, serverSelectionTimeoutMS=20000)
19
  db = client[db_name]
@@ -24,14 +22,15 @@ st.title("📄 Document Chunker & Uploader")
24
 
25
  with st.sidebar:
26
  st.header("Settings")
 
 
27
  try:
28
  existing_collections = db.list_collection_names()
29
  existing_collections.append("Create New Collection")
30
- default_index = existing_collections.index("doc_chunks_cat") if "doc_chunks_cat" in existing_collections else 0
31
- selected_collection = st.selectbox("Choose MongoDB Collection", existing_collections, index=default_index)
32
  except Exception as e:
33
  st.error(f"Failed to list collections: {e}")
34
- selected_collection = "doc_chunks_cat"
35
 
36
  if selected_collection == "Create New Collection":
37
  selected_collection = st.sidebar.text_input("Enter Collection Name:")
@@ -43,47 +42,35 @@ with st.sidebar:
43
 
44
  uploaded_file = st.file_uploader("Upload a DOCX, TXT, or PDF file", type=["docx", "txt", "pdf"])
45
 
46
- # === Store session state after upload ===
47
- if uploaded_file and "ready_to_process" not in st.session_state:
48
  temp_path = Path(tempfile.gettempdir()) / uploaded_file.name
49
  with open(temp_path, "wb") as f:
50
  f.write(uploaded_file.getbuffer())
51
 
52
- st.session_state["uploaded_file_name"] = uploaded_file.name
53
- st.session_state["collection_name"] = selected_collection
54
- st.session_state["is_grant_app"] = is_grant_app
55
- st.session_state["temp_path"] = str(temp_path)
56
- st.session_state["ready_to_process"] = True
57
- st.rerun()
58
-
59
- # === Process document ===
60
- if st.session_state.get("ready_to_process"):
61
- file_name = st.session_state["uploaded_file_name"]
62
- collection_name = st.session_state["collection_name"]
63
- is_grant_app = st.session_state["is_grant_app"]
64
- temp_path = st.session_state["temp_path"]
65
 
66
- st.success(f"Uploaded `{file_name}`")
67
- collection = db[collection_name]
68
 
69
- if collection.find_one({"metadata.title": file_name}):
70
  st.warning("⚠️ This file already exists in the collection. Skipping...")
71
  else:
72
  st.write("⏳ Processing with DocumentChunker...")
73
  chunker = DocumentChunker()
74
- chunks = chunker.process_document(temp_path)
75
 
76
  if chunks:
77
  for chunk in chunks:
78
  chunk['metadata'].update({
79
- "title": file_name,
80
- "uploaded_at": datetime.now().isoformat(),
81
  "is_grant_app": is_grant_app,
82
  })
83
  collection.insert_one(chunk)
84
 
85
- st.success(f"✅ {len(chunks)} chunks inserted into `{collection_name}`")
86
 
 
87
  for i, c in enumerate(chunks[:3]):
88
  st.subheader(f"Chunk {i+1}: {c['metadata'].get('header') or 'No Header'}")
89
  st.markdown(c['text'][:400] + "...")
@@ -92,16 +79,11 @@ if st.session_state.get("ready_to_process"):
92
 
93
  if len(chunks) > 3:
94
  st.info(f"... and {len(chunks)-3} more chunks processed.")
 
95
  else:
96
  st.warning("⚠️ No chunks were generated.")
97
 
98
- # Clean up
99
- try:
100
- os.remove(temp_path)
101
- except Exception as e:
102
- st.warning(f"⚠️ Could not delete temp file: {e}")
103
- time.sleep(2)
104
- # Reset session
105
- for key in ["uploaded_file_name", "collection_name", "is_grant_app", "temp_path", "ready_to_process"]:
106
- st.session_state.pop(key, None)
107
- st.rerun()
 
1
  import os
2
  import streamlit as st
3
  import tempfile
4
+ from pymongo import MongoClient
5
  from datetime import datetime
6
  from pathlib import Path
 
 
7
  from document_chunker import DocumentChunker
8
+ from urllib.parse import quote_plus
9
 
10
  # === MongoDB connection via Hugging Face secrets ===
11
  user = quote_plus(os.getenv("MONGO_USER"))
12
  password = quote_plus(os.getenv("MONGO_PASS"))
13
  cluster = os.getenv("MONGO_CLUSTER")
14
  db_name = os.environ.get("MONGO_DB", "grant_docs")
 
15
  mongo_uri = f"mongodb+srv://{user}:{password}@{cluster}/{db_name}?retryWrites=true&w=majority&tls=true&tlsAllowInvalidCertificates=true"
16
  client = MongoClient(mongo_uri, tls=True, tlsAllowInvalidCertificates=True, serverSelectionTimeoutMS=20000)
17
  db = client[db_name]
 
22
 
23
  with st.sidebar:
24
  st.header("Settings")
25
+
26
+ # Fetch collection names for dropdown
27
  try:
28
  existing_collections = db.list_collection_names()
29
  existing_collections.append("Create New Collection")
30
+ selected_collection = st.selectbox("Choose MongoDB Collection", existing_collections, index=existing_collections.index("default_collection") if "default_collection" in existing_collections else 0)
 
31
  except Exception as e:
32
  st.error(f"Failed to list collections: {e}")
33
+ selected_collection = "default_collection"
34
 
35
  if selected_collection == "Create New Collection":
36
  selected_collection = st.sidebar.text_input("Enter Collection Name:")
 
42
 
43
  uploaded_file = st.file_uploader("Upload a DOCX, TXT, or PDF file", type=["docx", "txt", "pdf"])
44
 
45
+ if uploaded_file:
 
46
  temp_path = Path(tempfile.gettempdir()) / uploaded_file.name
47
  with open(temp_path, "wb") as f:
48
  f.write(uploaded_file.getbuffer())
49
 
50
+ st.success(f"Uploaded `{uploaded_file.name}`")
 
 
 
 
 
 
 
 
 
 
 
 
51
 
52
+ modified_time = datetime.now().isoformat()
53
+ collection = db[selected_collection]
54
 
55
+ if collection.find_one({"metadata.title": uploaded_file.name}):
56
  st.warning("⚠️ This file already exists in the collection. Skipping...")
57
  else:
58
  st.write("⏳ Processing with DocumentChunker...")
59
  chunker = DocumentChunker()
60
+ chunks = chunker.process_document(str(temp_path))
61
 
62
  if chunks:
63
  for chunk in chunks:
64
  chunk['metadata'].update({
65
+ "title": uploaded_file.name,
66
+ "uploaded_at": modified_time,
67
  "is_grant_app": is_grant_app,
68
  })
69
  collection.insert_one(chunk)
70
 
71
+ st.success(f"✅ {len(chunks)} chunks inserted into `{selected_collection}`")
72
 
73
+ # Show a few previews
74
  for i, c in enumerate(chunks[:3]):
75
  st.subheader(f"Chunk {i+1}: {c['metadata'].get('header') or 'No Header'}")
76
  st.markdown(c['text'][:400] + "...")
 
79
 
80
  if len(chunks) > 3:
81
  st.info(f"... and {len(chunks)-3} more chunks processed.")
82
+
83
  else:
84
  st.warning("⚠️ No chunks were generated.")
85
 
86
+ # try:
87
+ # os.remove(temp_path)
88
+ # except Exception as e:
89
+ # st.warning(f"⚠️ Could not delete temp file: {e}")