File size: 3,575 Bytes
b9ae50a
23a7785
b9ae50a
dc82c6a
b9ae50a
 
f8d2230
dc82c6a
b9ae50a
23a7785
3246e10
 
 
23a7785
3246e10
 
23a7785
b9ae50a
23a7785
b9ae50a
 
 
 
 
dc82c6a
 
23a7785
 
2cdf48b
dc82c6a
23a7785
 
dc82c6a
c7c0d2c
2cdf48b
c7c0d2c
2cdf48b
 
 
c7c0d2c
d75a194
b9ae50a
9564b65
 
dc82c6a
b9ae50a
 
 
 
dc82c6a
b9ae50a
dc82c6a
 
23a7785
dc82c6a
b9ae50a
 
23a7785
b9ae50a
dc82c6a
b9ae50a
 
 
 
dc82c6a
 
23a7785
b9ae50a
 
c7c0d2c
dc82c6a
b9ae50a
dc82c6a
b9ae50a
 
 
 
 
 
 
 
dc82c6a
b9ae50a
 
c7c0d2c
dc82c6a
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import os
import streamlit as st
import tempfile
from pymongo import MongoClient
from datetime import datetime
from pathlib import Path
from document_chunker import DocumentChunker
from urllib.parse import quote_plus

# === MongoDB connection via Hugging Face secrets ===
# Fail fast with a readable message when a required secret is absent:
# quote_plus(None) raises an opaque TypeError, and a missing cluster would
# otherwise be formatted into the URI as the literal host "None".
_required_secrets = ("MONGO_USER", "MONGO_PASS", "MONGO_CLUSTER")
_missing_secrets = [name for name in _required_secrets if not os.getenv(name)]
if _missing_secrets:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_secrets)
    )

# Credentials are percent-encoded so reserved characters (e.g. '@', ':', '/')
# in the username or password cannot corrupt the connection URI.
user = quote_plus(os.environ["MONGO_USER"])
password = quote_plus(os.environ["MONGO_PASS"])
cluster = os.environ["MONGO_CLUSTER"]
# Fall back to the default database name when MONGO_DB is unset *or* empty.
db_name = os.getenv("MONGO_DB") or "grant_docs"

# NOTE(review): tlsAllowInvalidCertificates=True turns off server certificate
# validation for this connection. It is kept here to preserve existing
# behaviour, but should be removed once the deployment environment has a
# working CA bundle -- confirm with whoever owns the deployment.
mongo_uri = f"mongodb+srv://{user}:{password}@{cluster}/{db_name}?retryWrites=true&w=majority&tls=true&tlsAllowInvalidCertificates=true"
client = MongoClient(mongo_uri, tls=True, tlsAllowInvalidCertificates=True, serverSelectionTimeoutMS=20000)
db = client[db_name]

# === Streamlit UI ===
st.set_page_config(page_title="Doc Chunker", layout="wide")
st.title("📄 Document Chunker & Uploader")

with st.sidebar:
    st.header("Settings")

    # Fetch collection names for the dropdown. Only the database call is
    # guarded, so a failure there falls back to "default_collection" while
    # unrelated errors from widget rendering are not masked.
    try:
        existing_collections = db.list_collection_names()
    except Exception as e:
        st.error(f"Failed to list collections: {e}")
        selected_collection = "default_collection"
    else:
        # Sentinel entry that lets the user type a new collection name.
        existing_collections.append("Create New Collection")
        # Pre-select "default_collection" when it exists, else the first entry.
        if "default_collection" in existing_collections:
            default_index = existing_collections.index("default_collection")
        else:
            default_index = 0
        selected_collection = st.selectbox(
            "Choose MongoDB Collection",
            existing_collections,
            index=default_index,
        )

    if selected_collection == "Create New Collection":
        # Already inside the sidebar context, so a plain st.text_input renders
        # in the sidebar. Strip whitespace so a blank-looking name such as
        # "   " is rejected instead of being used as a collection name.
        selected_collection = st.text_input("Enter Collection Name:").strip()
        if not selected_collection:
            st.warning("⚠️ Enter a collection name to proceed.")
            st.stop()

    # Flag stored in each chunk's metadata as "is_grant_app" on upload.
    is_grant_app = st.toggle("Is this a Grant Application?", value=False)

uploaded_file = st.file_uploader("Upload a DOCX, TXT, or PDF file", type=["docx", "txt", "pdf"])

if uploaded_file:
    st.success(f"Uploaded `{uploaded_file.name}`")

    modified_time = datetime.now().isoformat()
    collection = db[selected_collection]

    # Check for a duplicate before touching the disk: a file is considered
    # already ingested when any document carries its name as metadata.title.
    if collection.find_one({"metadata.title": uploaded_file.name}):
        st.warning("⚠️ This file already exists in the collection. Skipping...")
    else:
        st.write("⏳ Processing with DocumentChunker...")
        chunker = DocumentChunker()

        # Write the upload into a private temporary directory that is removed
        # when the block exits (even if chunking raises). Using only the base
        # name keeps the original file name/extension for the chunker while
        # preventing path components in the client-supplied name from
        # escaping the directory, and avoids collisions between sessions
        # uploading files with the same name.
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir) / Path(uploaded_file.name).name
            temp_path.write_bytes(uploaded_file.getbuffer())
            chunks = chunker.process_document(str(temp_path))

        if chunks:
            # Stamp every chunk with upload metadata, then insert them in a
            # single round-trip instead of one insert per chunk.
            for chunk in chunks:
                chunk['metadata'].update({
                    "title": uploaded_file.name,
                    "uploaded_at": modified_time,
                    "is_grant_app": is_grant_app,
                })
            collection.insert_many(chunks)

            st.success(f"✅ {len(chunks)} chunks inserted into `{selected_collection}`")

            # Show a few previews
            for i, c in enumerate(chunks[:3]):
                st.subheader(f"Chunk {i+1}: {c['metadata'].get('header') or 'No Header'}")
                # Only add an ellipsis when the text was actually truncated.
                preview = c['text'][:400]
                if len(c['text']) > 400:
                    preview += "..."
                st.markdown(preview)
                st.caption(f"Topics: {', '.join(c['metadata']['topics'])} | Category: {c['metadata']['category']}")
                # NOTE(review): st.progress only accepts values within its
                # documented range; presumably confidence_score is a float in
                # [0.0, 1.0] -- confirm against DocumentChunker.
                st.progress(c['metadata']['confidence_score'])

            if len(chunks) > 3:
                st.info(f"... and {len(chunks)-3} more chunks processed.")

        else:
            st.warning("⚠️ No chunks were generated.")