|
import streamlit as st |
|
import pandas as pd |
|
from pymongo import MongoClient |
|
from dotenv import load_dotenv |
|
import os |
|
import json |
|
import re |
|
|
|
|
|
# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# MongoDB connection string.
# The lookup previously used the misspelled key "MONGODB_UR", so a correctly
# named MONGODB_URI setting was silently ignored. The correct name is now
# preferred; the misspelled key is still honoured as a legacy fallback.
# NOTE(review): the literal fallback below embeds credentials in source code.
# It is kept unchanged only for backward compatibility -- supply MONGODB_URI
# through the environment and remove this literal.
MONGODB_URI = (
    os.getenv("MONGODB_URI")
    or os.getenv("MONGODB_UR")
    or "mongodb+srv://milind:[email protected]/?retryWrites=true&w=majority&appName=Cluster0"
)

# Module-level handles shared by the rest of the script.
client = MongoClient(MONGODB_URI)
db = client["novascholar_db"]
collection = db["research_papers"]
|
|
|
|
|
def convert_mixed_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Flatten list-valued columns into comma-separated text for CSV export.

    A column is rewritten only when at least one of its non-null cells is a
    list. Within such a column, lists become ", "-joined strings, missing
    values become empty strings, and every other value is passed through
    ``str``. The frame is modified in place and also returned.
    """

    def _cell_to_text(cell):
        # Lists are joined element-wise; everything else is stringified,
        # with missing values collapsing to an empty string.
        if isinstance(cell, list):
            return ", ".join(str(item) for item in cell)
        if pd.notna(cell):
            return str(cell)
        return ""

    for column_name in df.columns:
        non_null_cells = df[column_name].dropna()
        if not any(isinstance(cell, list) for cell in non_null_cells):
            # Column holds no lists: leave its values and dtype untouched.
            continue
        df[column_name] = df[column_name].apply(_cell_to_text)

    return df
|
|
|
|
|
def filter_and_export_collection_to_csv(keywords_list, doc_collection):
    """
    Export documents whose 'Keywords' field contains any of the given keywords.

    Each keyword is matched literally (regex metacharacters are escaped) and
    case-insensitively anywhere inside the 'Keywords' field. Matching
    documents are loaded into a DataFrame, list-valued columns are flattened
    with ``convert_mixed_columns``, and the result is written to
    ``filtered_papers_export.csv`` (a relative path).

    Args:
        keywords_list: Iterable of keywords; blank entries are ignored.
        doc_collection: Collection-like object exposing ``find(query)``,
            called here with a ``$regex`` filter on 'Keywords'.

    Returns:
        ``(df, csv_filename)`` when at least one document matched; otherwise
        ``(empty DataFrame, None)`` and no file is written.
    """
    # Escape every keyword so characters such as '(', '+' or '.' are treated
    # as literal text rather than regex syntax. Blank keywords are dropped
    # because an empty alternative would match every document.
    escaped_keywords = [
        re.escape(text) for text in map(str, keywords_list) if text
    ]
    if not escaped_keywords:
        return pd.DataFrame(), None

    # $regex is an unanchored search, so no surrounding ".*" is required.
    query = {
        "Keywords": {"$regex": "|".join(escaped_keywords), "$options": "i"}
    }
    docs = list(doc_collection.find(query))

    df = pd.DataFrame(docs)
    if df.empty:
        return pd.DataFrame(), None

    # Lists cannot be written consistently to CSV; flatten them to strings.
    df = convert_mixed_columns(df)

    csv_filename = "filtered_papers_export.csv"
    df.to_csv(csv_filename, index=False)
    return df, csv_filename
|
|
|
|
|
def main():
    """
    Render the Streamlit page for filtering papers by keyword and exporting
    the matches to CSV.

    The selected paper type determines which collection of ``db`` is queried;
    the text input is split on commas into individual keywords. Any exception
    raised during the export is reported on the page via ``st.error``.
    """
    st.title("Filter and Export Papers by Keyword")

    paper_type_options = [
        "Review Based Paper",
        "Opinion/Perspective Based Paper",
        "Empirical Research Paper",
        "Research Paper (Other)",
    ]
    paper_type = st.selectbox(
        "Select type of research paper:", paper_type_options
    )

    keyword_input = st.text_input(
        "Enter the exact keyword to filter papers by 'Keywords' field:"
    )

    # Nothing else to do until the user presses the export button.
    if not st.button("Export Filtered Papers to CSV"):
        return

    with st.spinner("Exporting filtered documents..."):
        try:
            # Collection name is the paper type lower-cased with spaces
            # replaced by underscores.
            collection_name = paper_type.lower().replace(" ", "_")
            doc_collection = db[collection_name]

            # Split on commas and discard blank pieces.
            stripped_pieces = (
                piece.strip() for piece in keyword_input.split(",")
            )
            keywords_list = [piece for piece in stripped_pieces if piece]

            if not keywords_list:
                st.warning("Please enter at least one keyword.")
                return

            df, csv_filename = filter_and_export_collection_to_csv(
                keywords_list, doc_collection
            )

            if df.empty or not csv_filename:
                st.warning(
                    "No matching documents found for the provided keyword(s)."
                )
                return

            st.success(
                f"Successfully exported filtered papers to {csv_filename}!"
            )
            csv_bytes = df.to_csv(index=False).encode("utf-8")
            st.download_button(
                label="Download CSV",
                data=csv_bytes,
                file_name=csv_filename,
                mime="text/csv",
            )
            st.write("Preview of the filtered DataFrame:")
            st.dataframe(df)
        except Exception as exc:
            # Top-level UI boundary: show the failure instead of crashing.
            st.error(f"Error exporting filtered papers: {str(exc)}")
|
|
|
|
|
# Script entry point: build the page only when this file is run directly.
if __name__ == "__main__":
    main()
|
|