Spaces:
Running
Running
File size: 4,417 Bytes
b9ae50a 23a7785 b9ae50a dc82c6a b9ae50a f8d2230 dc82c6a b9ae50a 23a7785 3246e10 23a7785 3246e10 23a7785 bb1806c b9ae50a bb1806c 23a7785 bb1806c b9ae50a dc82c6a 23a7785 7242cc4 2cdf48b c7c0d2c d75a194 b9ae50a 9564b65 dc82c6a b9ae50a dc82c6a b9ae50a dc82c6a 7242cc4 6904829 7242cc4 b9ae50a 23a7785 b9ae50a dc82c6a b9ae50a 7242cc4 b9ae50a dc82c6a 23a7785 b9ae50a c7c0d2c 7242cc4 b9ae50a dc82c6a b9ae50a dc82c6a b9ae50a c7c0d2c dc82c6a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 |
import os
import streamlit as st
import tempfile
from pymongo import MongoClient
from datetime import datetime
from pathlib import Path
from document_chunker import DocumentChunker
from urllib.parse import quote_plus
# === MongoDB connection via Hugging Face secrets ===
# Fail fast with a readable message: quote_plus(None) would otherwise raise an
# opaque TypeError when one of the credential variables is not set.
_missing_env = [
    name
    for name in ("MONGO_USER", "MONGO_PASS", "MONGO_CLUSTER")
    if not os.getenv(name)
]
if _missing_env:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_env)
    )

# Credentials are percent-encoded so reserved characters survive inside the URI.
user = quote_plus(os.getenv("MONGO_USER"))
password = quote_plus(os.getenv("MONGO_PASS"))
cluster = os.getenv("MONGO_CLUSTER")
db_name = os.environ.get("MONGO_DB", "grant_docs")

# NOTE(review): tlsAllowInvalidCertificates=true turns off server certificate
# validation. It is kept here to avoid changing connection behaviour, but it
# should be removed once the deployment can validate the cluster certificate.
mongo_uri = (
    f"mongodb+srv://{user}:{password}@{cluster}/{db_name}"
    "?retryWrites=true&w=majority&tls=true&tlsAllowInvalidCertificates=true"
)


@st.cache_resource(show_spinner=False)
def _get_mongo_client(uri):
    """Return a MongoClient for *uri*, cached by Streamlit per distinct URI.

    Streamlit re-executes the script on every interaction; caching the client
    avoids opening a new connection pool on each rerun. The spinner is disabled
    so nothing is rendered before the page is configured.
    """
    return MongoClient(
        uri,
        tls=True,
        tlsAllowInvalidCertificates=True,
        serverSelectionTimeoutMS=20000,
    )


client = _get_mongo_client(mongo_uri)
db = client[db_name]
def gate_ui() -> bool:
    """Render an optional password gate and report whether access is granted.

    The expected password is read from the ``APP_PASSWORD`` Streamlit secret,
    falling back to the ``APP_PASSWORD`` environment variable. When neither is
    set, or the value is blank, the gate is disabled and the session is marked
    as authenticated.

    Returns:
        True when the session is authenticated or no password is configured;
        False after the login form has been rendered for a session that is not
        yet authenticated.
    """
    import hmac  # local import: used only for the constant-time comparison

    try:
        configured = st.secrets.get("APP_PASSWORD")
    except FileNotFoundError:
        # NOTE(review): Streamlit appears to raise a FileNotFoundError (or a
        # subclass) when no secrets file exists - confirm for the deployed
        # version. In that case only the environment variable is consulted.
        configured = None
    if configured is None:
        configured = os.getenv("APP_PASSWORD")
    # Normalise "not configured" to an empty string; calling .strip() directly
    # on None used to crash before the "gate disabled" branch could run.
    app_password = str(configured or "").strip()

    if "authed" not in st.session_state:
        st.session_state.authed = False

    # No password configured: the gate is disabled.
    if not app_password:
        st.session_state.authed = True
        return True
    if st.session_state.authed:
        return True

    st.title("🔒 Grant Buddy Login")
    # The widget type must be the literal "password"; the bare name `password`
    # resolved to the module-level MongoDB credential instead.
    pwd = st.text_input("Enter password", type="password")
    if st.button("Login"):
        # Compare as bytes so non-ASCII passwords are accepted by compare_digest.
        if hmac.compare_digest(pwd.encode("utf-8"), app_password.encode("utf-8")):
            st.session_state.authed = True
            st.rerun()
        else:
            st.error("Incorrect password.")
    return False
# === Streamlit UI ===
# Configure the page before anything is rendered: gate_ui() draws widgets, and
# Streamlit documents set_page_config as the first command of a page.
st.set_page_config(page_title="Doc Chunker", layout="wide")

# A bare `return` is a SyntaxError at module level; st.stop() ends this script
# run while leaving the login form from gate_ui() on screen.
if not gate_ui():
    st.stop()

st.title("📄 Document Chunker & Uploader")
# Sentinel entry of the category dropdown that switches to free-text input.
_NEW_CATEGORY_OPTION = "Create New Category"

with st.sidebar:
    st.header("Settings")

    # Fetch collection names for dropdown
    try:
        existing_categories = db["final_chunks"].distinct("collection_category") or []
    except Exception as exc:
        # Best effort: the app stays usable with an empty list, but the failure
        # is surfaced instead of being swallowed silently. Only the exception
        # type is shown so connection details are not echoed into the UI.
        st.warning(f"⚠️ Could not load existing categories ({type(exc).__name__}).")
        existing_categories = []

    # Drop empty values, sort, and put the "create new" sentinel last.
    existing_categories = sorted(c for c in existing_categories if c) + [_NEW_CATEGORY_OPTION]
    selected_category = st.selectbox(
        "Choose Category (collection_category)",
        existing_categories,
        # The sentinel was just appended, so it is always the last entry.
        index=len(existing_categories) - 1,
    )

    if selected_category == _NEW_CATEGORY_OPTION:
        # Strip so a whitespace-only name counts as "nothing entered".
        selected_category = st.sidebar.text_input("Enter Category Name:").strip()

    if not selected_category:
        st.warning("⚠️ Enter a category name to proceed.")
        st.stop()

    is_grant_app = st.toggle("Is this a Grant Application?", value=False)
uploaded_file = st.file_uploader("Upload a DOCX, TXT, or PDF file", type=["docx", "txt", "pdf"])

if uploaded_file:
    st.success(f"Uploaded `{uploaded_file.name}`")
    modified_time = datetime.now().isoformat()
    collection = db['final_chunks']

    # A document counts as a duplicate when the same file name already exists
    # in the selected category.
    already = collection.find_one({
        "metadata.title": uploaded_file.name,
        "collection_category": selected_category
    })

    if already:
        st.warning(f"⚠️ `{uploaded_file.name}` already exists in category `{selected_category}`. Skipping…")
    else:
        st.write("⏳ Processing with DocumentChunker...")

        # The file name comes from the client: keep only its final component so
        # it cannot point outside the working directory. A private temporary
        # directory also avoids collisions between concurrent uploads of the
        # same name and is removed automatically once processing is done.
        safe_name = Path(uploaded_file.name).name
        with tempfile.TemporaryDirectory() as work_dir:
            temp_path = Path(work_dir) / safe_name
            temp_path.write_bytes(uploaded_file.getbuffer())
            # NOTE(review): assumes process_document() has finished reading the
            # file by the time it returns - confirm against DocumentChunker.
            chunks = DocumentChunker().process_document(str(temp_path))

        if chunks:
            for chunk in chunks:
                chunk['collection_category'] = selected_category
                chunk['metadata'].update({
                    "title": uploaded_file.name,
                    "uploaded_at": modified_time,
                    "is_grant_app": is_grant_app,
                })
            # One bulk write instead of one round-trip per chunk.
            collection.insert_many(chunks)
            st.success(f"✅ {len(chunks)} chunks inserted into `final_chunks` (category: `{selected_category}`)")

            # Show a few previews
            for i, c in enumerate(chunks[:3]):
                meta = c['metadata']
                st.subheader(f"Chunk {i+1}: {meta.get('header') or 'No Header'}")
                text = c['text']
                # Only add the ellipsis when the preview is actually truncated.
                st.markdown(text[:400] + ("..." if len(text) > 400 else ""))
                st.caption(f"Topics: {', '.join(meta['topics'])} | Category: {meta['category']}")
                st.progress(meta['confidence_score'])

            if len(chunks) > 3:
                st.info(f"... and {len(chunks)-3} more chunks processed.")
        else:
            st.warning("⚠️ No chunks were generated.")
|