Spaces:

SPJIMR-Internship
/

SPJIMR_FlipClassroom_RCopilot_ResearchInternship

Running

App Files Files Community

SPJIMR_FlipClassroom_RCopilot_ResearchInternship / new_keywords.py

YashJD

Initial Commit

e107ee4 6 months ago

raw

history blame

4.45 kB

	import streamlit as st
	import pandas as pd
	from pymongo import MongoClient
	from dotenv import load_dotenv
	import os
	import json
	import re

	# 1. Load environment variables
	load_dotenv()

	# Resolve the connection string. The correctly spelled MONGODB_URI takes
	# precedence; the misspelled "MONGODB_UR" key this file originally read is
	# still honoured so that existing deployments keep working.
	# SECURITY NOTE(review): the fallback literal embeds a username/password in
	# source control -- rotate those credentials and supply the URI through the
	# environment instead. The host part of the literal ("[email protected]")
	# also looks redacted/garbled in this copy -- confirm the real value.
	MONGODB_URI = os.getenv("MONGODB_URI") or os.getenv(
	    "MONGODB_UR",
	    "mongodb+srv://milind:[email protected]/?retryWrites=true&w=majority&appName=Cluster0",
	)

	# 2. Create MongoDB connection
	client = MongoClient(MONGODB_URI)
	db = client["novascholar_db"]
	# Handle on the "research_papers" collection of the database selected above.
	collection = db["research_papers"]


	def convert_mixed_columns(df: pd.DataFrame) -> pd.DataFrame:
	    """
	    Flatten list-valued columns into plain strings for CSV export.

	    A column is rewritten only if at least one of its non-null cells is a
	    list. In such a column every cell becomes a string: lists are joined
	    with ", ", other non-null values go through ``str()``, and missing
	    values become "". Columns without any list are left untouched.

	    The frame is modified in place and the same object is returned.
	    """

	    def _as_text(cell):
	        # Lists are joined element-wise; everything else is stringified,
	        # with missing values collapsing to the empty string.
	        if isinstance(cell, list):
	            return ", ".join(str(item) for item in cell)
	        if pd.notna(cell):
	            return str(cell)
	        return ""

	    for name in df.columns:
	        # Scan the non-null cells and stop at the first list encountered.
	        holds_list = False
	        for cell in df[name].dropna():
	            if isinstance(cell, list):
	                holds_list = True
	                break

	        if holds_list:
	            df[name] = df[name].apply(_as_text)

	    return df


	def filter_and_export_collection_to_csv(keywords_list, doc_collection):
	    """
	    Export documents whose 'Keywords' field contains ANY of the given keywords.

	    Each keyword is matched as a case-insensitive, literal substring of the
	    'Keywords' field. Matching documents are loaded into a DataFrame,
	    list-valued columns are flattened with ``convert_mixed_columns``, and the
	    result is written to ``filtered_papers_export.csv`` in the current
	    working directory.

	    Args:
	        keywords_list: Keywords to search for. Blank entries are ignored.
	        doc_collection: Object exposing a PyMongo-style ``find(query)`` method.

	    Returns:
	        ``(df, csv_filename)`` when at least one document matched, otherwise
	        ``(empty DataFrame, None)``. When ``keywords_list`` holds no usable
	        keyword, no query is issued and no file is written.
	    """
	    csv_filename = "filtered_papers_export.csv"

	    # Escape every keyword so user input such as "C++" or "(AI)" is matched
	    # literally instead of being interpreted as regex syntax.
	    escaped_keywords = [
	        re.escape(str(keyword))
	        for keyword in keywords_list
	        if str(keyword).strip()
	    ]
	    if not escaped_keywords:
	        # An empty pattern would match every document, which contradicts
	        # "matches ANY of the keywords" when there are none.
	        return pd.DataFrame(), None

	    # 3. Retrieve matching documents with a single $regex query. A regex
	    # search is already a substring match, so no surrounding wildcards are
	    # needed; an unescaped "|" makes the keywords true alternatives.
	    pattern = "|".join(escaped_keywords)
	    docs = list(
	        doc_collection.find({"Keywords": {"$regex": pattern, "$options": "i"}})
	    )

	    # Convert documents to DataFrame
	    df = pd.DataFrame(docs)
	    if df.empty:
	        # Return an empty DataFrame and None if no documents found
	        return pd.DataFrame(), None

	    # 4. Convert mixed columns so list cells serialise consistently
	    df = convert_mixed_columns(df)
	    # 5. Export to CSV
	    df.to_csv(csv_filename, index=False)
	    return df, csv_filename


	def main():
	    """
	    Render the Streamlit page for filtering papers by keyword and exporting them.

	    The user picks a paper type and enters one or more comma-separated
	    keywords. On button click, the collection whose name is the paper type
	    lower-cased with spaces replaced by underscores is queried through
	    ``filter_and_export_collection_to_csv``; matches are offered as a CSV
	    download and previewed in a table. Any exception raised while exporting
	    is shown as an error message on the page.
	    """
	    st.title("Filter and Export Papers by Keyword")

	    # Let user select the paper type
	    paper_type_options = [
	        "Review Based Paper",
	        "Opinion/Perspective Based Paper",
	        "Empirical Research Paper",
	        "Research Paper (Other)",
	    ]
	    paper_type = st.selectbox("Select type of research paper:", paper_type_options)

	    # Let user enter the keyword(s) to filter
	    keyword_input = st.text_input(
	        "Enter the exact keyword to filter papers by 'Keywords' field:"
	    )

	    # Nothing further to do until the export button is pressed.
	    if not st.button("Export Filtered Papers to CSV"):
	        return

	    with st.spinner("Exporting filtered documents..."):
	        try:
	            # The collection is chosen dynamically from the selected paper type.
	            doc_collection = db[paper_type.replace(" ", "_").lower()]

	            # Split on commas, trim each piece, and drop the empty ones.
	            keywords_list = []
	            for piece in keyword_input.split(","):
	                trimmed = piece.strip()
	                if trimmed:
	                    keywords_list.append(trimmed)

	            if not keywords_list:
	                st.warning("Please enter at least one keyword.")
	                return

	            df, csv_filename = filter_and_export_collection_to_csv(
	                keywords_list, doc_collection
	            )
	            if df.empty or not csv_filename:
	                st.warning(
	                    "No matching documents found for the provided keyword(s)."
	                )
	                return

	            st.success(f"Successfully exported filtered papers to {csv_filename}!")
	            # The download payload is serialised from the in-memory frame.
	            csv_bytes = df.to_csv(index=False).encode("utf-8")
	            st.download_button(
	                label="Download CSV",
	                data=csv_bytes,
	                file_name=csv_filename,
	                mime="text/csv",
	            )
	            st.write("Preview of the filtered DataFrame:")
	            st.dataframe(df)
	        except Exception as exc:
	            st.error(f"Error exporting filtered papers: {exc}")


	# Run the page only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()