Files changed (1)
  1. app.py +87 -34
app.py CHANGED
@@ -1,10 +1,14 @@
 
+import os, io, json, gc
 import streamlit as st
 import pandas as pd
 import psycopg2
-import os
+import boto3, torch
+from sentence_transformers import SentenceTransformer, util
 
-# Load DB credentials from Hugging Face secrets or environment variables
+# ────────────────────────────────────────────────────────────────────────
+# 1) DB credentials (from HF secrets or env) – original
+# ────────────────────────────────────────────────────────────────────────
 DB_HOST = os.getenv("DB_HOST")
 DB_PORT = os.getenv("DB_PORT", "5432")
 DB_NAME = os.getenv("DB_NAME")
@@ -12,7 +16,7 @@ DB_USER = os.getenv("DB_USER")
 DB_PASSWORD = os.getenv("DB_PASSWORD")
 
 @st.cache_data(ttl=600)
-def get_data():
+def get_data() -> pd.DataFrame:
     try:
         conn = psycopg2.connect(
             host=DB_HOST,
@@ -20,85 +24,134 @@ def get_data():
             dbname=DB_NAME,
             user=DB_USER,
             password=DB_PASSWORD,
-            sslmode="require"
-
+            sslmode="require",
         )
-        query = "SELECT country, year, section, question_code, question_text, answer_code, answer_text FROM survey_info;"
-        df = pd.read_sql_query(query, conn)
+        query = """
+            SELECT id, country, year, section,
+                   question_code, question_text,
+                   answer_code, answer_text
+            FROM survey_info;
+        """
+        df_ = pd.read_sql_query(query, conn)
         conn.close()
-        return df
+        return df_
     except Exception as e:
         st.error(f"Failed to connect to the database: {e}")
         st.stop()
 
-# Load data
-df = get_data()
+df = get_data() # ← original DataFrame
+
+# Build a quick lookup row-index → DataFrame row for later
+row_lookup = {row.id: i for i, row in df.iterrows()}
+
+# ────────────────────────────────────────────────────────────────────────
+# 2) Load embeddings + ids once per session (S3) – new, cached
+# ────────────────────────────────────────────────────────────────────────
+@st.cache_resource
+def load_embeddings():
+    # credentials already in env (HF secrets) – boto3 will pick them up
+    BUCKET = "cgd-embeddings-bucket"
+    KEY = "survey_info_embeddings.pt" # dict {'ids', 'embeddings'}
+    buf = io.BytesIO()
+    boto3.client("s3").download_fileobj(BUCKET, KEY, buf)
+    buf.seek(0)
+    ckpt = torch.load(buf, map_location="cpu")
+    buf.close(); gc.collect()
+
+    if not (isinstance(ckpt, dict) and {"ids", "embeddings"} <= ckpt.keys()):
+        st.error("Bad checkpoint format in survey_info_embeddings.pt"); st.stop()
 
-# Streamlit UI
+    return ckpt["ids"], ckpt["embeddings"]
+
+ids_list, emb_tensor = load_embeddings()
+
+# ────────────────────────────────────────────────────────────────────────
+# 3) Streamlit UI – original filters + new semantic search
+# ────────────────────────────────────────────────────────────────────────
 st.title("🌍 CGD Survey Explorer (Live DB)")
 
 st.sidebar.header("🔎 Filter Questions")
 
-# Multiselect filters with default = show all
 country_options = sorted(df["country"].dropna().unique())
-year_options = sorted(df["year"].dropna().unique())
+year_options = sorted(df["year"].dropna().unique())
 
 selected_countries = st.sidebar.multiselect("Select Country/Countries", country_options)
-selected_years = st.sidebar.multiselect("Select Year(s)", year_options)
+selected_years = st.sidebar.multiselect("Select Year(s)", year_options)
 keyword = st.sidebar.text_input(
     "Keyword Search (Question text / Answer text / Question code)", ""
-) #NEW
+)
 group_by_question = st.sidebar.checkbox("Group by Question Text")
 
-# Apply filters
+# ── new semantic search panel ───────────────────────────────────────────
+st.sidebar.markdown("---")
+st.sidebar.subheader("🧠 Semantic Search")
+sem_query = st.sidebar.text_input("Enter a natural-language query")
+if st.sidebar.button("Search", disabled=not sem_query.strip()):
+    with st.spinner("Embedding & searching…"):
+        model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
+        q_vec = model.encode(sem_query.strip(), convert_to_tensor=True).cpu()
+        scores = util.cos_sim(q_vec, emb_tensor)[0]
+        top_vals, top_idx = torch.topk(scores, k=10) # grab extra
+
+        results = []
+        for score, emb_row in zip(top_vals.tolist(), top_idx.tolist()):
+            db_id = ids_list[emb_row]
+            if db_id in row_lookup:
+                row = df.iloc[row_lookup[db_id]]
+                if row["question_text"] and row["answer_text"]:
+                    results.append({
+                        "Score": f"{score:.3f}",
+                        "Country": row["country"],
+                        "Year": row["year"],
+                        "Question": row["question_text"],
+                        "Answer": row["answer_text"],
+                    })
+        if results:
+            st.subheader(f"🔍 Semantic Results ({len(results)} found)")
+            st.dataframe(pd.DataFrame(results).head(5))
+        else:
+            st.info("No semantic matches found.")
+
+st.markdown("---")
+
+# ── apply original filters ──────────────────────────────────────────────
 filtered = df[
     (df["country"].isin(selected_countries) if selected_countries else True) &
-    (df["year"].isin(selected_years) if selected_years else True) &
+    (df["year"].isin(selected_years) if selected_years else True) &
     (
         df["question_text"].str.contains(keyword, case=False, na=False) |
         df["answer_text"].str.contains(keyword, case=False, na=False) |
-        df["question_code"].astype(str).str.contains(keyword, case=False, na=False) # NEW
+        df["question_code"].astype(str).str.contains(keyword, case=False, na=False)
     )
 ]
 
-# Output
+# ── original output logic ───────────────────────
 if group_by_question:
     st.subheader("📊 Grouped by Question Text")
-
     grouped = (
         filtered.groupby("question_text")
         .agg({
             "country": lambda x: sorted(set(x)),
-            "year": lambda x: sorted(set(x)),
-            "answer_text": lambda x: list(x)[:3] # preview up to 3 answers
+            "year": lambda x: sorted(set(x)),
+            "answer_text": lambda x: list(x)[:3]
         })
         .reset_index()
         .rename(columns={
             "country": "Countries",
-            "year": "Years",
+            "year": "Years",
             "answer_text": "Sample Answers"
         })
     )
-
     st.dataframe(grouped)
-
     if grouped.empty:
         st.info("No questions found with current filters.")
-
 else:
-    # Context-aware heading
     heading_parts = []
     if selected_countries:
         heading_parts.append("Countries: " + ", ".join(selected_countries))
     if selected_years:
         heading_parts.append("Years: " + ", ".join(map(str, selected_years)))
-    if heading_parts:
-        st.markdown("### Results for " + " | ".join(heading_parts))
-    else:
-        st.markdown("### Results for All Countries and Years")
-
+    st.markdown("### Results for " + (" | ".join(heading_parts) if heading_parts else "All Countries and Years"))
     st.dataframe(filtered[["country", "year", "question_text", "answer_text"]])
-
     if filtered.empty:
         st.info("No matching questions found.")
-
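
Note: the new load_embeddings() helper assumes a survey_info_embeddings.pt object already exists in the cgd-embeddings-bucket S3 bucket, saved as a dict with matching 'ids' and 'embeddings' entries whose ids correspond to survey_info.id. This commit does not show how that checkpoint is produced; the sketch below is a hypothetical offline helper, assuming the embeddings come from question_text encoded with the same all-MiniLM-L6-v2 model the app queries with (the exact source text and script name are assumptions, not part of the commit).

# build_embeddings.py: hypothetical offline helper, not part of this commit.
# Reuses the same DB env vars and AWS credentials that app.py relies on.
import os

import boto3
import pandas as pd
import psycopg2
import torch
from sentence_transformers import SentenceTransformer

conn = psycopg2.connect(
    host=os.getenv("DB_HOST"),
    port=os.getenv("DB_PORT", "5432"),
    dbname=os.getenv("DB_NAME"),
    user=os.getenv("DB_USER"),
    password=os.getenv("DB_PASSWORD"),
    sslmode="require",
)
df = pd.read_sql_query("SELECT id, question_text FROM survey_info;", conn)
conn.close()

# Encode with the same model the app uses so cosine scores are comparable.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
emb = model.encode(df["question_text"].fillna("").tolist(), convert_to_tensor=True).cpu()

# Save in the {'ids', 'embeddings'} layout load_embeddings() checks for,
# with ids[i] aligned to embeddings[i].
torch.save({"ids": df["id"].tolist(), "embeddings": emb}, "survey_info_embeddings.pt")
boto3.client("s3").upload_file(
    "survey_info_embeddings.pt", "cgd-embeddings-bucket", "survey_info_embeddings.pt"
)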