Spaces:
Sleeping
Sleeping
import streamlit as st | |
from langchain.chains import LLMChain | |
from langchain.prompts import PromptTemplate | |
from langchain.llms import HuggingFaceHub | |
import fitz | |
from PIL import Image | |
import os | |
import pytesseract | |
import re | |
# App Configuration
# st.set_page_config has to be the first Streamlit command of a script run,
# so it is issued before the secrets lookup and any other st.* call.
st.set_page_config(page_title="DocuMentorAI", layout="wide", page_icon="📄")
st.title("📄 DocuMentorAI")

# Set Hugging Face API Key
# A missing secret is reported in the UI and the run is stopped, instead of
# crashing with a raw traceback.
# NOTE(review): assumes a missing key raises KeyError and a missing secrets
# file raises (a subclass of) FileNotFoundError -- confirm against the
# Streamlit version that is deployed.
try:
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = st.secrets["HF_TOKEN"]
except (KeyError, FileNotFoundError):
    st.error("Hugging Face token not configured: add HF_TOKEN to the app secrets.")
    st.stop()

# Initialize LLM
llm = HuggingFaceHub(
    repo_id="mistralai/Mistral-7B-Instruct-v0.3",
    model_kwargs={"temperature": 0.5},
)

# Improved CSS: styles the generated-output container, text areas and buttons.
st.markdown("""
<style>
.output-container {
background-color: #f0f2f6;
padding: 20px;
border-radius: 10px;
margin-top: 20px;
white-space: pre-wrap;
}
.stTextArea textarea {
font-size: 16px !important;
}
.stButton button {
width: 100%;
}
</style>
""", unsafe_allow_html=True)
# Helper Functions | |
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of ``pdf_file`` joined by single spaces.

    ``pdf_file`` must expose ``read()`` returning the PDF bytes. Any failure
    is reported with ``st.error`` and an empty string is returned instead of
    raising.
    """
    try:
        raw_bytes = pdf_file.read()
        page_texts = []
        # The document is opened from memory and closed by the context manager.
        with fitz.open(stream=raw_bytes, filetype="pdf") as document:
            for page in document:
                page_texts.append(page.get_text())
        return " ".join(page_texts)
    except Exception as e:
        st.error(f"Error extracting text from PDF: {e}")
        return ""
def extract_text_from_image(image_file):
    """Run OCR on ``image_file`` and return the recognised text.

    The image is opened with PIL and passed to ``pytesseract``. Any failure
    is reported with ``st.error`` and an empty string is returned instead of
    raising.
    """
    try:
        picture = Image.open(image_file)
        recognised = pytesseract.image_to_string(picture)
    except Exception as e:
        st.error(f"Error extracting text from image: {e}")
        return ""
    return recognised
def extract_text(uploaded_file):
    """Extract text from an uploaded file, choosing the extractor by MIME type.

    Returns an empty string when nothing was uploaded. Files whose ``type`` is
    ``application/pdf`` go through the PDF extractor; everything else is
    treated as an image and sent through OCR.
    """
    if not uploaded_file:
        return ""
    if uploaded_file.type == "application/pdf":
        return extract_text_from_pdf(uploaded_file)
    return extract_text_from_image(uploaded_file)
def parse_resume(resume_text):
    """Extract key information from resume text.

    The text is split into sections by looking for well-known, case-sensitive
    heading markers (for example ``Education:`` or ``Technical Skills``).

    Args:
        resume_text: Plain text of the resume; ``None`` is treated as empty.

    Returns:
        A dict with the keys ``education``, ``skills``, ``experience``,
        ``projects`` and ``publications``. Every value is a string, empty
        when the section's heading was not found.
    """
    text = resume_text or ""
    section_markers = {
        'education': ['Education:', 'EDUCATION', 'Academic Background'],
        'skills': ['Skills:', 'SKILLS', 'Technical Skills', 'Technologies'],
        'experience': ['Experience:', 'EXPERIENCE', 'Work History', 'Employment'],
        'projects': ['Projects:', 'PROJECTS', 'Key Projects'],
        'publications': ['Publications:', 'PUBLICATIONS', 'Research Papers'],
    }

    # Helper function to extract section content
    def extract_section(start_markers, end_markers):
        hits = [(text.find(marker), marker) for marker in start_markers]
        hits = [(idx, marker) for idx, marker in hits if idx != -1]
        if not hits:
            return ''
        # Use only the earliest heading (longest marker on a tie). Collecting
        # every matching marker duplicated the section whenever one heading
        # satisfied two markers, e.g. "Technical Skills:" matches both
        # 'Skills:' and 'Technical Skills'.
        start_idx, marker = min(hits, key=lambda hit: (hit[0], -len(hit[1])))
        section_start = start_idx + len(marker)
        # The section ends at the next heading of any *other* section.
        ends = [text.find(end, section_start) for end in end_markers]
        section_end = min([idx for idx in ends if idx != -1], default=len(text))
        # A marker without its colon leaves the colon at the front; drop it.
        return text[section_start:section_end].strip().lstrip(':').strip()

    parsed_info = {}
    for name, own_markers in section_markers.items():
        # A section's own aliases must not terminate it (for example the word
        # "Employment" appearing inside the experience section).
        other_markers = [
            marker
            for other, markers in section_markers.items()
            if other != name
            for marker in markers
        ]
        parsed_info[name] = extract_section(own_markers, other_markers)
    return parsed_info
# Title followed by a two-word capitalised name, e.g. "Dr. Jane Smith".
_PROFESSOR_RE = re.compile(r"(Dr\.|Professor|Prof\.?)\s+([A-Z][a-z]+\s[A-Z][a-z]+)")

# Institution name: up to three capitalised words before the keyword plus an
# optional "of <Name ...>" tail, or the "School of ..." form.
_INSTITUTION_RE = re.compile(
    r"(?:[A-Z][A-Za-z.&'-]*[ \t]+){0,3}(?:University|Institute|College)"
    r"(?:[ \t]+of(?:[ \t]+(?:and[ \t]+)?[A-Z][A-Za-z]+)+)?"
    r"|School of [A-Z][A-Za-z\s]+"
)


def extract_professor_details(text):
    """Heuristically pull a professor name and an institution name from text.

    The previous institution pattern matched only the bare keyword (for
    example just "University"); the pattern above also captures the
    capitalised words around it, so "Stanford University" or
    "Massachusetts Institute of Technology" are returned in full.

    Args:
        text: Free-form text of the job or research opening.

    Returns:
        A ``(professor, university)`` tuple of strings. Each element is
        ``"Not Found"`` when no match exists.
    """
    professor_match = _PROFESSOR_RE.search(text)
    university_match = _INSTITUTION_RE.search(text)
    professor = professor_match.group(0) if professor_match else "Not Found"
    # The "School of ..." branch can swallow trailing whitespace; trim it.
    university = university_match.group(0).strip() if university_match else "Not Found"
    return (professor, university)
def clean_email_output(email_text):
    """Clean and format email content.

    Keeps the text from the first "Dear" up to the end of the paragraph that
    holds the sign-off, then appends any later "Phone:" / "Email:" lines.

    Args:
        email_text: Raw model output.

    Returns:
        The trimmed email. When no greeting is found the text is kept from
        its beginning; when no sign-off is found it is kept to its end.
    """
    greeting_idx = email_text.find("Dear")
    start_idx = greeting_idx if greeting_idx != -1 else 0

    closings = ("Best regards,", "Sincerely,", "Yours sincerely,", "Kind regards,")
    # Search from the greeting onwards and use the earliest sign-off in the
    # text. Searching from index 0 could pick a match located before "Dear",
    # which made the slice below empty.
    closing_hits = [
        pos
        for pos in (email_text.find(closing, start_idx) for closing in closings)
        if pos != -1
    ]
    end_idx = len(email_text)
    if closing_hits:
        # The signature paragraph ends at the first blank line after the sign-off.
        paragraph_end = email_text.find("\n\n", min(closing_hits))
        if paragraph_end != -1:
            end_idx = paragraph_end

    email_content = email_text[start_idx:end_idx].strip()

    contact_lines = [
        line
        for line in email_text[end_idx:].split("\n")
        if "Phone:" in line or "Email:" in line
    ]
    # Append only when there really are contact lines, so no dangling blank
    # lines are added to the result.
    if contact_lines:
        email_content += "\n\n" + "\n".join(contact_lines).strip()
    return email_content
def clean_cover_letter_output(letter_text):
    """Clean and format cover letter content.

    Keeps the text from the earliest greeting marker ("Dear", "To Whom",
    "Hiring") up to the end of the paragraph that holds the sign-off.

    Args:
        letter_text: Raw model output.

    Returns:
        The trimmed letter. When no greeting marker is found the text is kept
        from its beginning (previously the whole letter was discarded); when
        no sign-off is found it is kept to its end.
    """
    greeting_hits = [
        pos
        for pos in (letter_text.find(marker) for marker in ("Dear", "To Whom", "Hiring"))
        if pos != -1
    ]
    # Default to the start of the text; defaulting to len(text) produced an
    # empty letter whenever the model skipped the greeting.
    start_idx = min(greeting_hits, default=0)

    closings = ("Sincerely,", "Best regards,", "Yours truly,", "Regards,")
    # Earliest sign-off located after the greeting.
    closing_hits = [
        pos
        for pos in (letter_text.find(closing, start_idx) for closing in closings)
        if pos != -1
    ]
    end_idx = len(letter_text)
    if closing_hits:
        # The signature paragraph ends at the first blank line after the sign-off.
        paragraph_end = letter_text.find("\n\n", min(closing_hits))
        if paragraph_end != -1:
            end_idx = paragraph_end
    return letter_text[start_idx:end_idx].strip()
def clean_research_statement_output(statement_text):
    """Clean and format research statement content.

    Removes a leading title line ("Research Statement", "Statement of
    Research", "Research Interests") and everything from the first
    "References", "Bibliography" or "Citations" marker onwards.

    Args:
        statement_text: Raw model output.

    Returns:
        The cleaned statement with surrounding whitespace removed.
    """
    headers = ("Research Statement", "Statement of Research", "Research Interests")
    # Strip first: a leading newline or space in the model output otherwise
    # makes startswith() miss the header and the title is left in place.
    cleaned_text = statement_text.strip()
    for header in headers:
        if cleaned_text.startswith(header):
            cleaned_text = cleaned_text[len(header):].lstrip(": \t\r\n")

    # Remove any trailing references or bibliography sections.
    # NOTE: the marker is matched anywhere in the text, not only as a heading.
    for marker in ("References", "Bibliography", "Citations"):
        idx = cleaned_text.find(marker)
        if idx != -1:
            cleaned_text = cleaned_text[:idx].strip()
    return cleaned_text.strip()
def clean_sop_output(sop_text):
    """Clean and format Statement of Purpose content.

    Removes a leading title line ("Statement of Purpose", "Personal
    Statement", "Academic Statement") and everything from the first
    "Thank you", "References" or "Additional Information" marker onwards.

    Args:
        sop_text: Raw model output.

    Returns:
        The cleaned statement with surrounding whitespace removed.
    """
    headers = ("Statement of Purpose", "Personal Statement", "Academic Statement")
    # Strip first: a leading newline or space in the model output otherwise
    # makes startswith() miss the header and the title is left in place.
    cleaned_text = sop_text.strip()
    for header in headers:
        if cleaned_text.startswith(header):
            cleaned_text = cleaned_text[len(header):].lstrip(": \t\r\n")

    # Remove any trailing sections.
    # NOTE: the marker is matched anywhere in the text, not only as a heading.
    for marker in ("Thank you", "References", "Additional Information"):
        idx = cleaned_text.find(marker)
        if idx != -1:
            cleaned_text = cleaned_text[:idx].strip()
    return cleaned_text.strip()
# Initialize session state
# One slot per document type; each starts as None until something is generated.
if 'generated_content' not in st.session_state:
    st.session_state.generated_content = dict.fromkeys(
        ('email', 'cover_letter', 'research_statement', 'sop')
    )
# Template Definitions
# One prompt per document type. The placeholders in braces are the input keys
# that the matching chain expects when it is run.
templates = {
    'email': PromptTemplate.from_template("""
Write ONLY a formal cold email for a research position.
Start with 'Dear Professor' and end with a signature.
Use these specific details from the CV:
Education: {education}
Relevant Experience: {experience}
Key Skills: {skills}
Notable Projects: {projects}
Publications: {publications}
Additional Context:
Professor: {professor_name}
University: {university_name}
Research Interests: {research_interests}
Why This Lab: {reason}
Guidelines:
1. Keep the email concise (max 400 words)
2. Focus on the most relevant experience and skills that match the lab's research
3. Mention 1-2 specific projects or publications that align with the lab's work
4. Include a clear statement of interest and why you're a good fit
5. End with your contact information
"""),
    'cover_letter': PromptTemplate.from_template("""
Write ONLY a professional cover letter.
Use these specific details from the CV:
Education: {education}
Relevant Experience: {experience}
Technical Skills: {skills}
Notable Projects: {projects}
Publications: {publications}
Position Details:
Job Title: {job_title}
Company: {company}
Required Skills: {key_skills}
Guidelines:
1. Start with a formal greeting
2. Focus on experiences and skills that directly match the job requirements
3. Provide specific examples from your projects and work history
4. Demonstrate how your background makes you an ideal candidate
5. End with a professional closing
"""),
    'research_statement': PromptTemplate.from_template("""
Write ONLY a research statement focused on your academic journey and research goals.
Use these specific details from your background:
Education: {education}
Research Experience: {experience}
Technical Skills: {skills}
Research Projects: {projects}
Publications: {publications}
Additional Context:
Research Background: {research_background}
Key Projects: {key_projects}
Future Goals: {future_goals}
Guidelines:
1. Describe your research journey and motivation
2. Highlight key research achievements and findings
3. Connect past work to future research goals
4. Demonstrate technical expertise and methodological knowledge
5. End with your vision for future contributions to the field
"""),
    # The labels for {academic_background} and {research_experiences} used to
    # read "Academic Goals" and "Research Interests", which mislabelled the
    # data supplied for those fields; they now match the placeholders.
    'sop': PromptTemplate.from_template("""
Write ONLY a Statement of Purpose (SOP).
Use these specific details from your background:
Education: {education}
Research Experience: {experience}
Technical Skills: {skills}
Notable Projects: {projects}
Publications: {publications}
Additional Context:
Motivation: {motivation}
Academic Background: {academic_background}
Research Experiences: {research_experiences}
Career Objectives: {career_goals}
Program Interest: {why_this_program}
Guidelines:
1. Tell a coherent story about your academic journey
2. Connect your background to your future goals
3. Demonstrate why you're prepared for graduate study
4. Show alignment between your interests and the program
5. Make a compelling case for why you should be admitted
""")
}

# Create LangChain instances: one LLMChain per template, keyed like `templates`.
chains = {key: LLMChain(llm=llm, prompt=template) for key, template in templates.items()}
# Sidebar for Input Collection
# These inputs are shared by every tab below.
with st.sidebar:
    st.subheader("📝 Input Details")
    job_opening_text = st.text_area("Job/Research Opening Details", height=150)
    accepted_types = ["pdf", "png", "jpg", "jpeg"]
    cv_resume_file = st.file_uploader("Upload CV/Resume", type=accepted_types)
    # Text is extracted on every script run; it is "" when nothing is uploaded.
    cv_resume_text = extract_text(cv_resume_file)

# Tab Layout
tab_labels = ["Cold Email", "Cover Letter", "Research Statement", "SOP"]
tab1, tab2, tab3, tab4 = st.tabs(tab_labels)
# Cold Email Tab
with tab1:
    # Professor and university are guessed from the opening text; the
    # university name is reused by the Cover Letter tab.
    professor_name, university_name = extract_professor_details(job_opening_text)
    research_interests = st.text_input("Research Interests")
    reason = st.text_input("Why this professor/lab?")
    if st.button("Generate Email", key="email_btn"):
        if job_opening_text and cv_resume_text:
            with st.spinner("Generating..."):
                try:
                    # Parse resume information
                    resume_info = parse_resume(cv_resume_text)
                    # Generate email with parsed information
                    generated_email = chains['email'].run({
                        "professor_name": professor_name,
                        "university_name": university_name,
                        "research_interests": research_interests,
                        "reason": reason,
                        "education": resume_info['education'],
                        "experience": resume_info['experience'],
                        "skills": resume_info['skills'],
                        "projects": resume_info['projects'],
                        "publications": resume_info['publications']
                    })
                    st.session_state.generated_content['email'] = clean_email_output(generated_email)
                except Exception as e:
                    st.error(f"Generation error: {e}")
        else:
            st.error("Please provide all required inputs")
    # Show the stored email. Previously it was generated and saved in session
    # state but never rendered, so the user could not see or download it.
    if st.session_state.generated_content['email']:
        st.markdown('<div class="output-container">', unsafe_allow_html=True)
        st.markdown(st.session_state.generated_content['email'])
        st.download_button("Download Email", st.session_state.generated_content['email'],
                           file_name="email.txt", key="email_download")
        st.markdown('</div>', unsafe_allow_html=True)
# Cover Letter Tab
with tab2:
    job_title = st.text_input("Job Title")
    # Ask for the organisation only when it was not detected in the opening text.
    company_name = university_name if university_name != "Not Found" else st.text_input("Company/University")
    key_skills = st.text_input("Key Skills")
    if st.button("Generate Cover Letter", key="cover_letter_btn"):
        if job_opening_text and cv_resume_text:
            with st.spinner("Generating..."):
                try:
                    resume_info = parse_resume(cv_resume_text)
                    # Every placeholder of the cover-letter template must be
                    # supplied. 'projects' and 'publications' were missing,
                    # so the chain rejected the inputs on every run.
                    generated_letter = chains['cover_letter'].run({
                        "job_title": job_title,
                        "company": company_name,
                        "key_skills": key_skills,
                        "education": resume_info['education'],
                        "experience": resume_info['experience'],
                        "skills": resume_info['skills'],
                        "projects": resume_info['projects'],
                        "publications": resume_info['publications']
                    })
                    st.session_state.generated_content['cover_letter'] = clean_cover_letter_output(generated_letter)
                except Exception as e:
                    st.error(f"Generation error: {e}")
        else:
            st.error("Please provide all required inputs")
    if st.session_state.generated_content['cover_letter']:
        st.markdown('<div class="output-container">', unsafe_allow_html=True)
        st.markdown(st.session_state.generated_content['cover_letter'])
        st.download_button("Download Cover Letter", st.session_state.generated_content['cover_letter'],
                           file_name="cover_letter.txt", key="cover_letter_download")
        st.markdown('</div>', unsafe_allow_html=True)
# Research Statement Tab
with tab3:
    key_projects = st.text_input("Key Research Projects")
    future_goals = st.text_input("Future Research Goals")
    if st.button("Generate Research Statement", key="research_stmt_btn"):
        # Require a CV, like the email and cover-letter tabs do; without it
        # the model would be prompted with empty resume sections.
        if cv_resume_text:
            with st.spinner("Generating..."):
                try:
                    resume_info = parse_resume(cv_resume_text)
                    # Only the template's own placeholders are passed; the
                    # unrelated 'reason' value from the email tab is dropped.
                    generated_statement = chains['research_statement'].run({
                        "education": resume_info['education'],
                        "experience": resume_info['experience'],
                        "skills": resume_info['skills'],
                        "projects": resume_info['projects'],
                        "publications": resume_info['publications'],
                        "research_background": resume_info['publications'],
                        "key_projects": key_projects,
                        "future_goals": future_goals
                    })
                    st.session_state.generated_content['research_statement'] = clean_research_statement_output(generated_statement)
                except Exception as e:
                    st.error(f"Generation error: {e}")
        else:
            st.error("Please provide all required inputs")
    if st.session_state.generated_content['research_statement']:
        st.markdown('<div class="output-container">', unsafe_allow_html=True)
        st.markdown(st.session_state.generated_content['research_statement'])
        st.download_button("Download Research Statement", st.session_state.generated_content['research_statement'],
                           file_name="research_statement.txt", key="research_stmt_download")
        st.markdown('</div>', unsafe_allow_html=True)
# SOP Tab
with tab4:
    motivation = st.text_input("Motivation for Graduate Studies")
    career_goals = st.text_input("Career Goals")
    why_this_program = st.text_input("Why This Program")
    if st.button("Generate SOP", key="sop_btn"):
        # Require a CV, like the email and cover-letter tabs do; without it
        # the model would be prompted with empty resume sections.
        if cv_resume_text:
            with st.spinner("Generating..."):
                try:
                    resume_info = parse_resume(cv_resume_text)
                    # Every placeholder of the SOP template must be supplied.
                    # 'education' and 'publications' were missing, so the
                    # chain rejected the inputs on every run.
                    generated_sop = chains['sop'].run({
                        "motivation": motivation,
                        "academic_background": resume_info['education'],
                        "research_experiences": resume_info['publications'],
                        "career_goals": career_goals,
                        "why_this_program": why_this_program,
                        "education": resume_info['education'],
                        "experience": resume_info['experience'],
                        "skills": resume_info['skills'],
                        "projects": resume_info['projects'],
                        "publications": resume_info['publications']
                    })
                    st.session_state.generated_content['sop'] = clean_sop_output(generated_sop)
                except Exception as e:
                    st.error(f"Generation error: {e}")
        else:
            st.error("Please provide all required inputs")
    if st.session_state.generated_content['sop']:
        st.markdown('<div class="output-container">', unsafe_allow_html=True)
        st.markdown(st.session_state.generated_content['sop'])
        st.download_button("Download SOP", st.session_state.generated_content['sop'],
                           file_name="sop.txt", key="sop_download")
        st.markdown('</div>', unsafe_allow_html=True)
# Reset Button
# Clears every generated document and reruns the script so the tabs refresh.
if st.sidebar.button("🔄 Reset All"):
    st.session_state.generated_content = {key: None for key in st.session_state.generated_content}
    # st.experimental_rerun is deprecated in favour of st.rerun; prefer the
    # new name and fall back to the old one only on releases that lack it.
    rerun = getattr(st, "rerun", None) or st.experimental_rerun
    rerun()