Spaces:

Tesneem
/

document_chunker

Running

App Files Files Community

document_chunker / app.py

Tesneem

Update app.py

3246e10 verified 7 days ago

raw

history blame

3.15 kB

	import os
	import streamlit as st
	import tempfile
	from pymongo import MongoClient
	from datetime import datetime
	from pathlib import Path
	from document_chunker import DocumentChunker
	from urllib.parse import quote_plus

	# === MongoDB connection via Hugging Face secrets ===
	# Reads MONGO_USER, MONGO_PASS and MONGO_CLUSTER from the environment
	# (all required) plus the optional MONGO_DB (defaults to "grant_docs"),
	# then exposes `client` and `db` for the rest of the script.
	_required_env = ("MONGO_USER", "MONGO_PASS", "MONGO_CLUSTER")
	_missing_env = [name for name in _required_env if not os.getenv(name)]
	if _missing_env:
	    # Fail early with an actionable message: quote_plus(None) would
	    # otherwise raise an opaque TypeError, and an unset cluster would be
	    # interpolated into the URI as the literal host "None".
	    raise RuntimeError(
	        "Missing required environment variable(s): " + ", ".join(_missing_env)
	    )

	# Percent-encode the credentials so reserved characters in them cannot
	# corrupt the connection string.
	user = quote_plus(os.environ["MONGO_USER"])
	password = quote_plus(os.environ["MONGO_PASS"])
	cluster = os.environ["MONGO_CLUSTER"]
	db_name = os.environ.get("MONGO_DB", "grant_docs")

	# NOTE(review): tlsAllowInvalidCertificates=true turns off TLS certificate
	# validation (set both in the URI and as a keyword argument below). It is
	# kept to preserve current behaviour; confirm it is still needed and drop
	# it if the deployment can validate the cluster certificate.
	mongo_uri = f"mongodb+srv://{user}:{password}@{cluster}/{db_name}?retryWrites=true&w=majority&tls=true&tlsAllowInvalidCertificates=true"
	client = MongoClient(mongo_uri, tls=True, tlsAllowInvalidCertificates=True, serverSelectionTimeoutMS=20000)
	db = client[db_name]

	# === Streamlit UI ===
	# Page chrome, sidebar settings (target collection, grant-application flag)
	# and the file uploader. Produces `selected_collection`, `is_grant_app`
	# and `uploaded_file` for the processing step that follows.
	st.set_page_config(page_title="Doc Chunker", layout="wide")
	st.title("📄 Document Chunker & Uploader")

	# Collection used when the list cannot be fetched or offers nothing to pick.
	default_collection = "doc_chunks_cat"

	with st.sidebar:
	    st.header("Settings")

	    # Fetch collection names for dropdown
	    try:
	        existing_collections = db.list_collection_names()
	        # Pre-select the default collection when it exists, else the first.
	        default_index = (
	            existing_collections.index(default_collection)
	            if default_collection in existing_collections
	            else 0
	        )
	        selected_collection = st.selectbox(
	            "Choose MongoDB Collection",
	            existing_collections,
	            index=default_index,
	        )
	    except Exception as e:
	        # UI boundary: surface the failure to the user and keep the app
	        # usable with the default collection.
	        st.error(f"Failed to list collections: {e}")
	        selected_collection = default_collection

	    # With no collections to offer, the selectbox has no option to return
	    # (None per the Streamlit docs); fall back so that the later
	    # db[selected_collection] lookup always receives a collection name.
	    if not selected_collection:
	        selected_collection = default_collection

	    is_grant_app = st.toggle("Is this a Grant Application?", value=True)

	uploaded_file = st.file_uploader("Upload a DOCX or TXT file", type=["docx", "txt"])

	# === Upload processing ===
	# For an uploaded file: skip it when a chunk with the same title is already
	# stored; otherwise chunk it with DocumentChunker, tag every chunk with
	# upload metadata, insert the chunks and preview the first three.
	if uploaded_file:
	    st.success(f"Uploaded `{uploaded_file.name}`")

	    # datetime.now() without a tz argument: naive local-time ISO-8601 string.
	    modified_time = datetime.now().isoformat()
	    collection = db[selected_collection]

	    # Duplicate detection is by file name, stored as metadata.title below.
	    if collection.find_one({"metadata.title": uploaded_file.name}):
	        st.warning("⚠️ This file already exists in the collection. Skipping...")
	    else:
	        st.write("⏳ Processing with DocumentChunker...")
	        chunker = DocumentChunker()

	        # Write the upload into a private scratch directory that is removed
	        # once chunking finishes. This avoids name collisions in the shared
	        # temp dir and leftover files; taking only the basename keeps any
	        # path components in the client-supplied name out of the target path.
	        with tempfile.TemporaryDirectory() as work_dir:
	            temp_path = Path(work_dir) / Path(uploaded_file.name).name
	            temp_path.write_bytes(uploaded_file.getbuffer())
	            chunks = chunker.process_document(str(temp_path))

	        if chunks:
	            for chunk in chunks:
	                chunk['metadata'].update({
	                    "title": uploaded_file.name,
	                    "uploaded_at": modified_time,
	                    "is_grant_app": is_grant_app,
	                })
	            # One bulk insert instead of one round trip per chunk.
	            collection.insert_many(chunks)

	            st.success(f"✅ {len(chunks)} chunks inserted into `{selected_collection}`")

	            # Show a few previews
	            for i, c in enumerate(chunks[:3], start=1):
	                meta = c['metadata']
	                st.subheader(f"Chunk {i}: {meta.get('header') or 'No Header'}")
	                st.markdown(c['text'][:400] + "...")
	                # Plain "|" here: "\|" is an invalid escape sequence in a
	                # non-raw string literal and triggers a warning.
	                st.caption(f"Topics: {', '.join(meta['topics'])} | Category: {meta['category']}")
	                # NOTE(review): assumes confidence_score is within the range
	                # st.progress accepts — confirm against DocumentChunker.
	                st.progress(meta['confidence_score'])

	            if len(chunks) > 3:
	                st.info(f"... and {len(chunks)-3} more chunks processed.")
	        else:
	            st.warning("⚠️ No chunks were generated.")