"""DocuMentorAI — a Streamlit app that generates cold emails, cover letters,
research statements, and SOPs from an uploaded CV/resume via a HuggingFace LLM.

NOTE(review): the source file's newlines were destroyed (whole script collapsed
onto a few physical lines); this is a faithful reconstruction of the original
logic with formatting restored.
"""

import os
import re

import fitz  # PyMuPDF
import pytesseract
import streamlit as st
from langchain.chains import LLMChain
from langchain.llms import HuggingFaceHub
from langchain.prompts import PromptTemplate
from PIL import Image

# Set Hugging Face API Key
os.environ["HUGGINGFACEHUB_API_TOKEN"] = st.secrets["HF_TOKEN"]

# Initialize LLM
llm = HuggingFaceHub(
    repo_id="mistralai/Mistral-7B-Instruct-v0.3",
    model_kwargs={"temperature": 0.5},
)

# App Configuration
st.set_page_config(page_title="DocuMentorAI", layout="wide", page_icon="📄")
st.title("📄 DocuMentorAI")

# Improved CSS (empty in the visible source — the style block carried no rules)
st.markdown(""" """, unsafe_allow_html=True)


# ---------------------------------------------------------------------------
# Helper Functions
# ---------------------------------------------------------------------------

def extract_text_from_pdf(pdf_file):
    """Return all page text from an uploaded PDF, or "" on failure.

    Errors are surfaced to the UI via ``st.error`` rather than raised, so a
    bad upload never crashes the app.
    """
    try:
        pdf_bytes = pdf_file.read()
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            return " ".join([page.get_text() for page in doc])
    except Exception as e:
        st.error(f"Error extracting text from PDF: {e}")
        return ""


def extract_text_from_image(image_file):
    """Return OCR'd text from an uploaded image, or "" on failure."""
    try:
        image = Image.open(image_file)
        return pytesseract.image_to_string(image)
    except Exception as e:
        st.error(f"Error extracting text from image: {e}")
        return ""


def extract_text(uploaded_file):
    """Dispatch extraction by MIME type: PDF -> PyMuPDF, anything else -> OCR."""
    if not uploaded_file:
        return ""
    if uploaded_file.type == "application/pdf":
        return extract_text_from_pdf(uploaded_file)
    return extract_text_from_image(uploaded_file)


def parse_resume(resume_text):
    """Extract key information from resume text using improved parsing.

    Scans the resume line by line; a line containing any known section header
    (case-insensitive substring match) starts a new section, and subsequent
    non-blank lines are accumulated under it. Returns a dict with one entry
    per known section (empty string when the section was not found).
    """
    sections = {
        'education': ['Education:', 'EDUCATION', 'Academic Background'],
        'experience': ['Experience:', 'EXPERIENCE', 'Work History', 'Employment'],
        'skills': ['Skills:', 'SKILLS', 'Technical Skills', 'Technologies'],
        'projects': ['Projects:', 'PROJECTS', 'Key Projects'],
        'publications': ['Publications:', 'PUBLICATIONS', 'Research Papers']
    }
    parsed_info = {key: '' for key in sections}

    # Convert text to lines for better parsing
    lines = resume_text.split('\n')
    current_section = None
    section_content = []

    for line in lines:
        line = line.strip()
        if not line:
            continue
        # Check if this line is a section header
        for section, headers in sections.items():
            if any(header.lower() in line.lower() for header in headers):
                # Flush the section collected so far before switching.
                if current_section:
                    parsed_info[current_section] = '\n'.join(section_content)
                current_section = section
                section_content = []
                break
        else:
            # No header matched (for/else): this is body text for the
            # currently open section, if any.
            if current_section:
                section_content.append(line)

    # Add the last section
    if current_section and section_content:
        parsed_info[current_section] = '\n'.join(section_content)

    return parsed_info


def extract_professor_details(text):
    """Best-effort regex extraction of (professor name, university) from text.

    Returns "Not Found" for either field when its pattern does not match.
    """
    professor_pattern = r"(Dr\.|Professor|Prof\.?)\s+([A-Z][a-z]+(?:\s[A-Z][a-z]+)*)"
    university_pattern = r"(University|Institute|College|School) of [A-Z][A-Za-z\s]+"
    professor_match = re.search(professor_pattern, text)
    university_match = re.search(university_pattern, text)
    return (professor_match.group(0) if professor_match else "Not Found",
            university_match.group(0) if university_match else "Not Found")


def clean_output(text, type_="general"):
    """Unified cleaning function for all document types.

    Trims LLM output to the span between a type-specific start marker (e.g.
    "Dear") and a common closing marker (e.g. "Sincerely,"), then — for emails
    only — re-appends any "Phone:"/"Email:" contact lines found after the cut.
    """
    if not text:
        return ""

    # Common start markers
    start_markers = {
        "email": ["Dear"],
        "cover_letter": ["Dear", "To Whom", "Hiring"],
        "research_statement": ["Research Statement", "Statement of Research"],
        "sop": ["Statement of Purpose", "Personal Statement"]
    }
    # Common end markers
    end_markers = ["Best regards,", "Sincerely,", "Yours sincerely,",
                   "Kind regards,", "Thank you"]

    # Find start of content
    start_idx = 0
    relevant_starts = start_markers.get(type_, start_markers["email"])
    for marker in relevant_starts:
        idx = text.find(marker)
        if idx != -1:
            start_idx = idx
            break

    # Find end of content: extend to the first blank line after the closing
    # marker so the signature line itself is kept.
    end_idx = len(text)
    for marker in end_markers:
        idx = text.find(marker)
        if idx != -1:
            # Single lookup instead of the original duplicated find() call.
            blank = text.find("\n\n", idx)
            end_idx = blank if blank != -1 else len(text)
            break

    cleaned_text = text[start_idx:end_idx].strip()

    # Add contact information for emails
    if type_ == "email" and ("Phone:" in text or "Email:" in text):
        contact_info = "\n\n" + "\n".join([
            line for line in text[end_idx:].split("\n")
            if any(info in line for info in ["Phone:", "Email:"])
        ]).strip()
        cleaned_text += contact_info

    return cleaned_text


# Initialize session state
if 'generated_content' not in st.session_state:
    st.session_state.generated_content = {
        'email': None,
        'cover_letter': None,
        'research_statement': None,
        'sop': None
    }

# Template Definitions (simplified and standardized)
templates = {
    'email': """
Write ONLY a formal cold email for a research position.
Start with 'Dear Professor' and end with a signature.

Use these specific details from the CV:
{education}
{experience}
{skills}
{projects}
{publications}

Additional Context:
Professor: {professor_name}
University: {university_name}
Research Interests: {research_interests}
Why This Lab: {reason}

Guidelines:
1. Keep the email concise (max 400 words)
2. Focus on the most relevant experience and skills
3. Mention 1-2 specific projects that align with the lab's work
4. Include a clear statement of interest
5. End with your contact information
""",
    'cover_letter': """
Write ONLY a professional cover letter for {job_title} at {company}.

Use these specific details:
{education}
{experience}
{skills}
{projects}

Required Skills: {key_skills}

Guidelines:
1. Start with a formal greeting
2. Focus on experiences matching job requirements
3. Provide specific examples
4. Show why you're an ideal candidate
5. End professionally
""",
    'research_statement': """
Write ONLY a research statement focused on your academic journey and future goals.

Background:
{education}
{experience}
{skills}
{projects}
{publications}

Research Focus: {key_projects}
Future Goals: {future_goals}

Guidelines:
1. Describe your research journey
2. Highlight key achievements
3. Connect past work to future goals
4. Show technical expertise
5. Present your research vision
""",
    'sop': """
Write ONLY a Statement of Purpose (SOP) for graduate studies.

Background:
{education}
{experience}
{skills}
{projects}
{publications}

Context:
Motivation: {motivation}
Career Goals: {career_goals}
Program Interest: {why_this_program}

Guidelines:
1. Tell your academic journey
2. Connect background to goals
3. Show preparation for graduate study
4. Demonstrate program alignment
5. Make a compelling case
"""
}

# Convert templates to PromptTemplate objects and build one chain per type.
templates = {k: PromptTemplate.from_template(v) for k, v in templates.items()}
chains = {key: LLMChain(llm=llm, prompt=template) for key, template in templates.items()}

# Sidebar for Input Collection
with st.sidebar:
    st.subheader("📝 Input Details")
    job_opening_text = st.text_area("Job/Research Opening Details", height=150)
    cv_resume_file = st.file_uploader("Upload CV/Resume", type=["pdf", "png", "jpg", "jpeg"])
    cv_resume_text = extract_text(cv_resume_file) if cv_resume_file else ""

# Parse resume once for all tabs
resume_info = parse_resume(cv_resume_text) if cv_resume_text else {
    'education': '',
    'experience': '',
    'skills': '',
    'projects': '',
    'publications': ''
}

# Tab Layout
tab1, tab2, tab3, tab4 = st.tabs(["Cold Email", "Cover Letter", "Research Statement", "SOP"])

# Cold Email Tab
with tab1:
    professor_name, university_name = extract_professor_details(job_opening_text)
    research_interests = st.text_input("Research Interests")
    reason = st.text_input("Why this professor/lab?")

    if st.button("Generate Email", key="email_btn"):
        if job_opening_text and cv_resume_text:
            with st.spinner("Generating..."):
                try:
                    generated_email = chains['email'].run({
                        **resume_info,
                        "professor_name": professor_name,
                        "university_name": university_name,
                        "research_interests": research_interests,
                        "reason": reason
                    })
                    st.session_state.generated_content['email'] = clean_output(generated_email, "email")
                except Exception as e:
                    st.error(f"Generation error: {e}")
        else:
            st.error("Please provide all required inputs")

    if st.session_state.generated_content['email']:
        # NOTE(review): the visible source is truncated mid-call here
        # (`st.markdown('` with the argument cut off); restored minimally by
        # rendering the generated email — confirm against the original file.
        st.markdown(st.session_state.generated_content['email'])