Tesneem committed on
Commit
23a7785
·
verified ·
1 Parent(s): 5ece6b7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -17
app.py CHANGED
@@ -1,61 +1,65 @@
1
- import streamlit as st
2
  import os
 
3
  import tempfile
4
  from pymongo import MongoClient
5
  from datetime import datetime
6
  from pathlib import Path
7
  from document_chunker import DocumentChunker
8
- from dotenv import load_dotenv
9
-
10
- load_dotenv()
11
 
12
- # MongoDB connection
13
- mongo_uri = os.getenv("MONGO_URI")
14
- db_name = os.getenv("MONGO_DB", "grant_docs")
15
  client = MongoClient(mongo_uri)
 
16
 
 
17
  st.set_page_config(page_title="Doc Chunker", layout="wide")
18
  st.title("📄 Document Chunker & Uploader")
19
 
20
  with st.sidebar:
21
  st.header("Settings")
22
- selected_collection = st.text_input("MongoDB Collection Name", "doc_chunks_cat")
 
 
 
 
 
 
 
 
23
  is_grant_app = st.toggle("Is this a Grant Application?", value=True)
24
- if st.button("Connect to Collection"):
25
- collection = client[db_name][selected_collection]
26
- st.success(f"Connected to `{selected_collection}` in `{db_name}`")
27
 
28
  uploaded_file = st.file_uploader("Upload a DOCX or TXT file", type=["docx", "txt"])
29
 
30
  if uploaded_file:
31
- # Save file to temp path
32
  temp_path = Path(tempfile.gettempdir()) / uploaded_file.name
33
  with open(temp_path, "wb") as f:
34
  f.write(uploaded_file.getbuffer())
35
 
36
  st.success(f"Uploaded `{uploaded_file.name}`")
37
 
38
- # Check if file already exists in collection
39
  modified_time = datetime.now().isoformat()
40
- collection = client[db_name][selected_collection]
 
41
  if collection.find_one({"metadata.title": uploaded_file.name}):
42
  st.warning("⚠️ This file already exists in the collection. Skipping...")
43
  else:
44
- st.write("⏳ Processing...")
45
  chunker = DocumentChunker()
46
- chunks = chunker.process_document(str(temp_path)) if is_grant_app else chunker.process_document(str(temp_path))
47
 
48
  if chunks:
49
  for chunk in chunks:
50
  chunk['metadata'].update({
51
  "title": uploaded_file.name,
52
  "uploaded_at": modified_time,
 
53
  })
54
  collection.insert_one(chunk)
55
 
56
  st.success(f"✅ {len(chunks)} chunks inserted into `{selected_collection}`")
57
 
58
- # Show preview
59
  for i, c in enumerate(chunks[:3]):
60
  st.subheader(f"Chunk {i+1}: {c['metadata'].get('header') or 'No Header'}")
61
  st.markdown(c['text'][:400] + "...")
 
 
1
  import os
2
+ import streamlit as st
3
  import tempfile
4
  from pymongo import MongoClient
5
  from datetime import datetime
6
  from pathlib import Path
7
  from document_chunker import DocumentChunker
 
 
 
8
 
9
+ # === MongoDB connection via Hugging Face secrets ===
10
+ mongo_uri = os.environ["MONGO_URI"]
11
+ db_name = os.environ.get("MONGO_DB", "grant_docs")
12
  client = MongoClient(mongo_uri)
13
+ db = client[db_name]
14
 
15
+ # === Streamlit UI ===
16
  st.set_page_config(page_title="Doc Chunker", layout="wide")
17
  st.title("📄 Document Chunker & Uploader")
18
 
19
  with st.sidebar:
20
  st.header("Settings")
21
+
22
+ # Fetch collection names for dropdown
23
+ try:
24
+ existing_collections = db.list_collection_names()
25
+ selected_collection = st.selectbox("Choose MongoDB Collection", existing_collections, index=existing_collections.index("doc_chunks_cat") if "doc_chunks_cat" in existing_collections else 0)
26
+ except Exception as e:
27
+ st.error(f"Failed to list collections: {e}")
28
+ selected_collection = "doc_chunks_cat"
29
+
30
  is_grant_app = st.toggle("Is this a Grant Application?", value=True)
 
 
 
31
 
32
  uploaded_file = st.file_uploader("Upload a DOCX or TXT file", type=["docx", "txt"])
33
 
34
  if uploaded_file:
 
35
  temp_path = Path(tempfile.gettempdir()) / uploaded_file.name
36
  with open(temp_path, "wb") as f:
37
  f.write(uploaded_file.getbuffer())
38
 
39
  st.success(f"Uploaded `{uploaded_file.name}`")
40
 
 
41
  modified_time = datetime.now().isoformat()
42
+ collection = db[selected_collection]
43
+
44
  if collection.find_one({"metadata.title": uploaded_file.name}):
45
  st.warning("⚠️ This file already exists in the collection. Skipping...")
46
  else:
47
+ st.write("⏳ Processing with DocumentChunker...")
48
  chunker = DocumentChunker()
49
+ chunks = chunker.process_document(str(temp_path))
50
 
51
  if chunks:
52
  for chunk in chunks:
53
  chunk['metadata'].update({
54
  "title": uploaded_file.name,
55
  "uploaded_at": modified_time,
56
+ "is_grant_app": is_grant_app,
57
  })
58
  collection.insert_one(chunk)
59
 
60
  st.success(f"✅ {len(chunks)} chunks inserted into `{selected_collection}`")
61
 
62
+ # Show a few previews
63
  for i, c in enumerate(chunks[:3]):
64
  st.subheader(f"Chunk {i+1}: {c['metadata'].get('header') or 'No Header'}")
65
  st.markdown(c['text'][:400] + "...")