Tesneem committed (verified)
Commit b9ae50a · Parent(s): 501d87c

Create app.py

Files changed (1):
  1. app.py +68 -0
app.py ADDED
@@ -0,0 +1,68 @@
import streamlit as st
import os
import tempfile
from pymongo import MongoClient
from datetime import datetime
from pathlib import Path
from document_chunker import DocumentChunker
from dotenv import load_dotenv

load_dotenv()

# MongoDB connection
mongo_uri = os.getenv("MONGO_URI")
db_name = os.getenv("MONGO_DB", "grant_docs")
client = MongoClient(mongo_uri)

st.set_page_config(page_title="Doc Chunker", layout="wide")
st.title("📄 Document Chunker & Uploader")

with st.sidebar:
    st.header("Settings")
    selected_collection = st.text_input("MongoDB Collection Name", "doc_chunks_cat")
    is_grant_app = st.toggle("Is this a Grant Application?", value=True)
    if st.button("Connect to Collection"):
        collection = client[db_name][selected_collection]
        st.success(f"Connected to `{selected_collection}` in `{db_name}`")

uploaded_file = st.file_uploader("Upload a DOCX or TXT file", type=["docx", "txt"])

if uploaded_file:
    # Save the upload to a temp path on disk
    temp_path = Path(tempfile.gettempdir()) / uploaded_file.name
    with open(temp_path, "wb") as f:
        f.write(uploaded_file.getbuffer())

    st.success(f"Uploaded `{uploaded_file.name}`")

    # Check if the file already exists in the collection
    modified_time = datetime.now().isoformat()
    collection = client[db_name][selected_collection]
    if collection.find_one({"metadata.title": uploaded_file.name}):
        st.warning("⚠️ This file already exists in the collection. Skipping...")
    else:
        st.write("⏳ Processing...")
        chunker = DocumentChunker()
        # Both branches of the original conditional called process_document the
        # same way, so is_grant_app currently has no effect on chunking.
        chunks = chunker.process_document(str(temp_path))

        if chunks:
            for chunk in chunks:
                chunk['metadata'].update({
                    "title": uploaded_file.name,
                    "uploaded_at": modified_time,
                })
                collection.insert_one(chunk)

            st.success(f"✅ {len(chunks)} chunks inserted into `{selected_collection}`")

            # Show a preview of the first few chunks
            for i, c in enumerate(chunks[:3]):
                st.subheader(f"Chunk {i+1}: {c['metadata'].get('header') or 'No Header'}")
                st.markdown(c['text'][:400] + "...")
                st.caption(f"Topics: {', '.join(c['metadata']['topics'])} | Category: {c['metadata']['category']}")
                st.progress(c['metadata']['confidence_score'])

            if len(chunks) > 3:
                st.info(f"... and {len(chunks)-3} more chunks processed.")
        else:
            st.warning("⚠️ No chunks were generated.")
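For reference, the preview code above implies a particular shape for the dicts returned by DocumentChunker.process_document: each chunk needs a text field and a metadata dict with at least header, topics, category, and confidence_score. The sketch below is a hypothetical example of a compatible chunk; the field values are illustrative only and not taken from the actual chunker.

    # Hypothetical chunk shape assumed by app.py; values are made up for illustration.
    example_chunk = {
        "text": "Sample chunk text extracted from the uploaded document...",
        "metadata": {
            "header": "Project Summary",          # shown via st.subheader in the preview
            "topics": ["education", "literacy"],  # joined with ", " in the caption
            "category": "program_description",    # shown in the caption
            "confidence_score": 0.87,             # passed to st.progress, so assumed 0.0-1.0
            # app.py adds "title" and "uploaded_at" to metadata before insert_one()
        },
    }

To run the app locally, set MONGO_URI (and optionally MONGO_DB, which defaults to grant_docs) in a .env file next to app.py, then start it with `streamlit run app.py`.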