YashJD's picture
Initial Commit
e107ee4
raw
history blame
4.45 kB
import streamlit as st
import pandas as pd
from pymongo import MongoClient
from dotenv import load_dotenv
import os
import json
import re
# 1. Load environment variables
load_dotenv()
MONGODB_URI = os.getenv(
"MONGODB_UR",
"mongodb+srv://milind:[email protected]/?retryWrites=true&w=majority&appName=Cluster0",
)
# 2. Create MongoDB connection
client = MongoClient(MONGODB_URI)
db = client["novascholar_db"]
collection = db["research_papers"]
def convert_mixed_columns(df: pd.DataFrame) -> pd.DataFrame:
"""
Convert any columns that contain lists into comma-separated strings
to ensure consistent data types for CSV export.
"""
for col in df.columns:
if any(isinstance(val, list) for val in df[col].dropna()):
df[col] = df[col].apply(
lambda x: (
", ".join(map(str, x))
if isinstance(x, list)
else (str(x) if pd.notna(x) else "")
)
)
return df
def filter_and_export_collection_to_csv(keywords_list, doc_collection):
"""
Fetch documents from the specified collection where the 'Keywords' field
matches ANY of the keywords in 'keywords_list'. Convert to DataFrame,
ensure consistent column types, save to CSV, and return the DataFrame
and CSV filename.
"""
# 3. Retrieve filtered documents from the collection based on 'Keywords' using $in with regex for substring matching
regex_keywords = [f".*{keyword}.*" for keyword in keywords_list]
docs = list(
doc_collection.find(
{"Keywords": {"$regex": "|".join(regex_keywords), "$options": "i"}}
)
)
# Convert documents to DataFrame
df = pd.DataFrame(docs)
if not df.empty:
# 4. Convert mixed columns
df = convert_mixed_columns(df)
# 5. Export to CSV
csv_filename = "filtered_papers_export.csv"
df.to_csv(csv_filename, index=False)
return df, csv_filename
else:
# Return an empty DataFrame and None if no documents found
return pd.DataFrame(), None
def main():
st.title("Filter and Export Papers by Keyword")
# Let user select the paper type
paper_type = st.selectbox(
"Select type of research paper:",
[
"Review Based Paper",
"Opinion/Perspective Based Paper",
"Empirical Research Paper",
"Research Paper (Other)",
],
)
# Let user enter the keyword to filter
keyword_input = st.text_input(
"Enter the exact keyword to filter papers by 'Keywords' field:"
)
# When user clicks button, use the collection for the selected paper type
if st.button("Export Filtered Papers to CSV"):
with st.spinner("Exporting filtered documents..."):
try:
# Determine dynamic collection based on paper type
collection_name = paper_type.replace(" ", "_").lower()
doc_collection = db[collection_name]
# Split keywords by commas and strip whitespace
keywords_list = [
kw.strip() for kw in keyword_input.split(",") if kw.strip()
]
if not keywords_list:
st.warning("Please enter at least one keyword.")
else:
df, csv_filename = filter_and_export_collection_to_csv(
keywords_list, doc_collection
)
if not df.empty and csv_filename:
st.success(
f"Successfully exported filtered papers to {csv_filename}!"
)
st.download_button(
label="Download CSV",
data=df.to_csv(index=False).encode("utf-8"),
file_name=csv_filename,
mime="text/csv",
)
st.write("Preview of the filtered DataFrame:")
st.dataframe(df)
else:
st.warning(
"No matching documents found for the provided keyword(s)."
)
except Exception as e:
st.error(f"Error exporting filtered papers: {str(e)}")
if __name__ == "__main__":
main()