Spaces:
Running
Running
File size: 2,750 Bytes
b9ae50a 23a7785 b9ae50a 23a7785 b9ae50a 23a7785 b9ae50a 23a7785 b9ae50a 23a7785 b9ae50a 23a7785 b9ae50a 23a7785 b9ae50a 23a7785 b9ae50a 23a7785 b9ae50a 23a7785 b9ae50a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 |
import os
import streamlit as st
import tempfile
from pymongo import MongoClient
from datetime import datetime
from pathlib import Path
from document_chunker import DocumentChunker
# === MongoDB connection via Hugging Face secrets ===
# MONGO_URI is required (KeyError if the secret is missing); MONGO_DB is
# optional and falls back to "grant_docs".
mongo_uri = os.environ["MONGO_URI"]
db_name = os.environ.get("MONGO_DB", "grant_docs")


# Streamlit re-executes this script on every widget interaction, so a bare
# MongoClient(...) call here would open a new client (and connection pool) on
# each rerun. Caching the client as a resource reuses one instance instead.
# show_spinner=False keeps this call from rendering anything before
# st.set_page_config() runs further down.
@st.cache_resource(show_spinner=False)
def _get_mongo_client(uri: str) -> MongoClient:
    """Return the MongoClient for *uri*, created once and then reused."""
    return MongoClient(uri)


client = _get_mongo_client(mongo_uri)
db = client[db_name]
# === Streamlit UI ===
# Page-level configuration (browser tab title, wide layout), then the
# on-page heading.
st.set_page_config(
    layout="wide",
    page_title="Doc Chunker",
)
st.title("📄 Document Chunker & Uploader")
with st.sidebar:
    st.header("Settings")

    # Collection used when the listing fails, and pre-selected in the
    # dropdown when it exists.
    default_collection = "doc_chunks_cat"

    # Fetch collection names for dropdown
    try:
        existing_collections = db.list_collection_names()
    except Exception as e:
        # Best-effort: report the failure and fall back to the default
        # collection so the rest of the page still works.
        st.error(f"Failed to list collections: {e}")
        selected_collection = default_collection
    else:
        # An empty database would give the selectbox no options, making it
        # return None and breaking the later db[selected_collection] lookup.
        # Offer the default name instead.
        collection_options = existing_collections or [default_collection]
        if default_collection in collection_options:
            default_index = collection_options.index(default_collection)
        else:
            default_index = 0
        selected_collection = st.selectbox(
            "Choose MongoDB Collection",
            collection_options,
            index=default_index,
        )

    # Stored on every inserted chunk as metadata["is_grant_app"].
    is_grant_app = st.toggle("Is this a Grant Application?", value=True)
uploaded_file = st.file_uploader("Upload a DOCX or TXT file", type=["docx", "txt"])

if uploaded_file:
    st.success(f"Uploaded `{uploaded_file.name}`")
    modified_time = datetime.now().isoformat()
    collection = db[selected_collection]

    # A file counts as already ingested when any chunk in the collection
    # carries its name as metadata.title.
    if collection.find_one({"metadata.title": uploaded_file.name}):
        st.warning("⚠️ This file already exists in the collection. Skipping...")
    else:
        st.write("⏳ Processing with DocumentChunker...")

        # The chunker takes a file path, so the upload is spooled to disk.
        # A private temporary directory avoids collisions between sessions
        # uploading the same file name and is removed on exit, so no temp
        # file is left behind. Only the final path component of the
        # client-supplied name is used, so it cannot point outside that
        # directory.
        with tempfile.TemporaryDirectory() as tmp_dir:
            temp_path = Path(tmp_dir) / Path(uploaded_file.name).name
            temp_path.write_bytes(uploaded_file.getbuffer())
            chunker = DocumentChunker()
            chunks = chunker.process_document(str(temp_path))

        if chunks:
            # Stamp every chunk with upload-level metadata before storing.
            for chunk in chunks:
                chunk['metadata'].update({
                    "title": uploaded_file.name,
                    "uploaded_at": modified_time,
                    "is_grant_app": is_grant_app,
                })
            # One bulk insert instead of one round trip per chunk.
            collection.insert_many(chunks)
            st.success(f"✅ {len(chunks)} chunks inserted into `{selected_collection}`")

            # Show a few previews
            preview_limit = 400
            for i, c in enumerate(chunks[:3]):
                st.subheader(f"Chunk {i+1}: {c['metadata'].get('header') or 'No Header'}")
                preview = c['text'][:preview_limit]
                # Add the ellipsis only when the text was actually cut.
                if len(c['text']) > preview_limit:
                    preview += "..."
                st.markdown(preview)
                st.caption(f"Topics: {', '.join(c['metadata']['topics'])} | Category: {c['metadata']['category']}")
                st.progress(c['metadata']['confidence_score'])
            if len(chunks) > 3:
                st.info(f"... and {len(chunks)-3} more chunks processed.")
        else:
            st.warning("⚠️ No chunks were generated.")
|