cgd-ui-TEST

Sleeping

App Files Files Community

gigiliu12 committed on 9 days ago

Commit

feb2540

verified ·

1 Parent(s): ecd8944

Update app.py

Browse files

Files changed (1) hide show

app.py +83 -98

app.py CHANGED Viewed

@@ -1,3 +1,6 @@
 import os, io, json, gc
 import streamlit as st
 import pandas as pd
@@ -5,106 +8,106 @@ import psycopg2
 import boto3, torch
 from sentence_transformers import SentenceTransformer, util
-# ────────────────────────────────────────────────────────────────────────
-# 1)  DB credentials (from HF secrets or env)  – original
-# ────────────────────────────────────────────────────────────────────────
 DB_HOST = os.getenv("DB_HOST")
 DB_PORT = os.getenv("DB_PORT", "5432")
 DB_NAME = os.getenv("DB_NAME")
 DB_USER = os.getenv("DB_USER")
 DB_PASSWORD = os.getenv("DB_PASSWORD")
 @st.cache_data(ttl=600)
 def get_data() -> pd.DataFrame:
-    try:
-        conn = psycopg2.connect(
-            host=DB_HOST,
-            dbname=DB_NAME,
-            user=DB_USER,
-            password=DB_PASSWORD,
-            sslmode="require",
-        )
-        query = """
-            SELECT id, country, year, section,
-                   question_code, question_text,
-                   answer_code, answer_text
-              FROM survey_info;
-        """
-        df_ = pd.read_sql_query(query, conn)
-        conn.close()
-        return df_
-    except Exception as e:
-        st.error(f"Failed to connect to the database: {e}")
-        st.stop()
-df = get_data()              # ← original DataFrame
-# Build a quick lookup row-index → DataFrame row for later
 row_lookup = {row.id: i for i, row in df.iterrows()}
-# ────────────────────────────────────────────────────────────────────────
-# 2)  Load embeddings + ids once per session  (S3) – new, cached
-# ────────────────────────────────────────────────────────────────────────
 @st.cache_resource
-def get_st_model():
-    return SentenceTransformer(
-        "sentence-transformers/all-MiniLM-L6-v2",
-        device="cpu",
-    )
 def load_embeddings():
-    # credentials already in env (HF secrets) – boto3 will pick them up
-    BUCKET = "cgd-embeddings-bucket"
-    KEY    = "survey_info_embeddings.pt"   # dict {'ids', 'embeddings'}
     buf = io.BytesIO()
     boto3.client("s3").download_fileobj(BUCKET, KEY, buf)
     buf.seek(0)
     ckpt = torch.load(buf, map_location="cpu")
     buf.close(); gc.collect()
-    if not (isinstance(ckpt, dict) and {"ids","embeddings"} <= ckpt.keys()):
         st.error("Bad checkpoint format in survey_info_embeddings.pt"); st.stop()
     return ckpt["ids"], ckpt["embeddings"]
 ids_list, emb_tensor = load_embeddings()
-# ────────────────────────────────────────────────────────────────────────
-# 3)  Streamlit UI – original filters + new semantic search
-# ────────────────────────────────────────────────────────────────────────
 st.title("🌍 CGD Survey Explorer (Live DB)")
 st.sidebar.header("🔎 Filter Questions")
-country_options = sorted(df["country"].dropna().unique())
-year_options    = sorted(df["year"].dropna().unique())
-selected_countries = st.sidebar.multiselect("Select Country/Countries", country_options)
-selected_years     = st.sidebar.multiselect("Select Year(s)", year_options)
-keyword = st.sidebar.text_input(
-    "Keyword Search (Question text / Answer text / Question code)", ""
-)
-group_by_question = st.sidebar.checkbox("Group by Question Text")
-# ── new semantic search panel ───────────────────────────────────────────
 st.sidebar.markdown("---")
 st.sidebar.subheader("🧠 Semantic Search")
-sem_query = st.sidebar.text_input("Enter a natural-language query")
-if st.sidebar.button("Search", disabled=not sem_query.strip()):
     with st.spinner("Embedding & searching…"):
-        # 1) embed query
-        model = get_st_model()                      # cached CPU model
         q_vec = model.encode(
             sem_query.strip(),
             convert_to_tensor=True,
             device="cpu"
         ).cpu()
-        # 2) semantic similarity
         sims = util.cos_sim(q_vec, emb_tensor)[0]
-        top_vals, top_idx = torch.topk(sims, k=50)
         sem_ids   = [ids_list[i] for i in top_idx.tolist()]
         sem_rows  = df.loc[df["id"].isin(sem_ids)].copy()
@@ -112,9 +115,9 @@ if st.sidebar.button("Search", disabled=not sem_query.strip()):
         sem_rows["Score"] = sem_rows["id"].map(score_map)
         sem_rows = sem_rows.sort_values("Score", ascending=False)
-        # 3) keyword / dropdown remainder
         remainder = filtered.loc[~filtered["id"].isin(sem_ids)].copy()
-        remainder["Score"] = ""         # blank for keyword-only rows
         combined = pd.concat([sem_rows, remainder], ignore_index=True)
@@ -123,23 +126,13 @@ if st.sidebar.button("Search", disabled=not sem_query.strip()):
         combined[["Score", "country", "year", "question_text", "answer_text"]],
         use_container_width=True,
     )
-    st.stop()   # skip the old display logic below when semantic search ran
-# ── apply original filters ──────────────────────────────────────────────
-filtered = df[
-    (df["country"].isin(selected_countries) if selected_countries else True) &
-    (df["year"].isin(selected_years)        if selected_years else True)       &
-    (
-        df["question_text"].str.contains(keyword, case=False, na=False) |
-        df["answer_text"].str.contains(keyword, case=False, na=False)   |
-        df["question_code"].astype(str).str.contains(keyword, case=False, na=False)
-    )
-]
-# ── original output logic ───────────────────────
-if group_by_question:
     st.subheader("📊 Grouped by Question Text")
     grouped = (
         filtered.groupby("question_text")
         .agg({
@@ -154,25 +147,17 @@ if group_by_question:
             "answer_text": "Sample Answers"
         })
     )
-    st.dataframe(grouped)
     if grouped.empty:
         st.info("No questions found with current filters.")
 else:
-    heading_parts = []
-    if selected_countries:
-        heading_parts.append("Countries: " + ", ".join(selected_countries))
-    if selected_years:
-        heading_parts.append("Years: " + ", ".join(map(str, selected_years)))
-    st.markdown("### Results for " + (" | ".join(heading_parts) if heading_parts else "All Countries and Years"))
-    st.dataframe(filtered[["country", "year", "question_text", "answer_text"]])
     if filtered.empty:
-        st.info("No matching questions found.")

+#!/usr/bin/env python3
+# app.py  – CGD Survey Explorer (keyword + semantic in one table)
 import os, io, json, gc
 import streamlit as st
 import pandas as pd
 import boto3, torch
 from sentence_transformers import SentenceTransformer, util
+# ─────────────────────────────────────────────────────────────
+# 1)  Database credentials (HF Secrets or env vars)
+# ─────────────────────────────────────────────────────────────
 DB_HOST = os.getenv("DB_HOST")
 DB_PORT = os.getenv("DB_PORT", "5432")
 DB_NAME = os.getenv("DB_NAME")
 DB_USER = os.getenv("DB_USER")
 DB_PASSWORD = os.getenv("DB_PASSWORD")
 @st.cache_data(ttl=600)
 def get_data() -> pd.DataFrame:
+    """Read survey_info once every 10 min."""
+    conn = psycopg2.connect(
+        host=DB_HOST, port=DB_PORT,
+        dbname=DB_NAME, user=DB_USER, password=DB_PASSWORD,
+        sslmode="require",
+    )
+    df_ = pd.read_sql_query("""
+        SELECT id, country, year, section,
+               question_code, question_text,
+               answer_code,  answer_text
+          FROM survey_info;
+    """, conn)
+    conn.close()
+    return df_
+df = get_data()
 row_lookup = {row.id: i for i, row in df.iterrows()}
+# ─────────────────────────────────────────────────────────────
+# 2)  Cached resources
+# ─────────────────────────────────────────────────────────────
 @st.cache_resource
 def load_embeddings():
+    """Download ids + embedding tensor from S3 once per session."""
+    BUCKET, KEY = "cgd-embeddings-bucket", "survey_info_embeddings.pt"
     buf = io.BytesIO()
     boto3.client("s3").download_fileobj(BUCKET, KEY, buf)
     buf.seek(0)
     ckpt = torch.load(buf, map_location="cpu")
     buf.close(); gc.collect()
+    if not (isinstance(ckpt, dict) and {"ids", "embeddings"} <= ckpt.keys()):
         st.error("Bad checkpoint format in survey_info_embeddings.pt"); st.stop()
     return ckpt["ids"], ckpt["embeddings"]
 ids_list, emb_tensor = load_embeddings()
+@st.cache_resource
+def get_st_model():
+    """Mini-LM sentence-transformer pinned to CPU (avoids meta-tensor bug)."""
+    return SentenceTransformer(
+        "sentence-transformers/all-MiniLM-L6-v2",
+        device="cpu",
+    )
+# ─────────────────────────────────────────────────────────────
+# 3)  Streamlit UI
+# ─────────────────────────────────────────────────────────────
 st.title("🌍 CGD Survey Explorer (Live DB)")
 st.sidebar.header("🔎 Filter Questions")
+country_opts = sorted(df["country"].dropna().unique())
+year_opts    = sorted(df["year"].dropna().unique())
+sel_countries = st.sidebar.multiselect("Select Country/Countries", country_opts)
+sel_years     = st.sidebar.multiselect("Select Year(s)", year_opts)
+keyword       = st.sidebar.text_input("Keyword Search (Question / Answer / Code)")
+group_by_q    = st.sidebar.checkbox("Group by Question Text")
+# ── Semantic search panel
 st.sidebar.markdown("---")
 st.sidebar.subheader("🧠 Semantic Search")
+sem_query      = st.sidebar.text_input("Enter a natural-language query")
+search_clicked = st.sidebar.button("Search", disabled=not sem_query.strip())
+# ── Always build the keyword/dropdown subset
+filtered = df[
+    (df["country"].isin(sel_countries) if sel_countries else True) &
+    (df["year"].isin(sel_years)        if sel_years     else True) &
+    (
+        df["question_text"].str.contains(keyword, case=False, na=False) |
+        df["answer_text"].str.contains(keyword, case=False, na=False)   |
+        df["question_code"].astype(str).str.contains(keyword, case=False, na=False)
+    )
+]
+# ─────────────────────────────────────────────────────────────
+# 4)  Semantic Search → merged table
+# ─────────────────────────────────────────────────────────────
+if search_clicked:
     with st.spinner("Embedding & searching…"):
+        model = get_st_model()
         q_vec = model.encode(
             sem_query.strip(),
             convert_to_tensor=True,
             device="cpu"
         ).cpu()
         sims = util.cos_sim(q_vec, emb_tensor)[0]
+        top_vals, top_idx = torch.topk(sims, k=50)        # 50 candidates
         sem_ids   = [ids_list[i] for i in top_idx.tolist()]
         sem_rows  = df.loc[df["id"].isin(sem_ids)].copy()
         sem_rows["Score"] = sem_rows["id"].map(score_map)
         sem_rows = sem_rows.sort_values("Score", ascending=False)
+        # rows that matched keyword/dropdown but not semantic
         remainder = filtered.loc[~filtered["id"].isin(sem_ids)].copy()
+        remainder["Score"] = ""    # blank score
         combined = pd.concat([sem_rows, remainder], ignore_index=True)
         combined[["Score", "country", "year", "question_text", "answer_text"]],
         use_container_width=True,
     )
+    st.stop()   # skip original display logic below when semantic ran
+# ─────────────────────────────────────────────────────────────
+# 5)  Original display (keyword / filters only)
+# ─────────────────────────────────────────────────────────────
+if group_by_q:
     st.subheader("📊 Grouped by Question Text")
     grouped = (
         filtered.groupby("question_text")
         .agg({
             "answer_text": "Sample Answers"
         })
     )
+    st.dataframe(grouped, use_container_width=True)
     if grouped.empty:
         st.info("No questions found with current filters.")
 else:
+    hdr = []
+    if sel_countries: hdr.append("Countries: " + ", ".join(sel_countries))
+    if sel_years:     hdr.append("Years: " + ", ".join(map(str, sel_years)))
+    st.markdown("### Results for " + (" | ".join(hdr) if hdr else "All Countries and Years"))
+    st.dataframe(
+        filtered[["country", "year", "question_text", "answer_text"]],
+        use_container_width=True,
+    )
     if filtered.empty:
+        st.info("No matching questions found.")