# Spaces: Sleeping  (status text captured along with the file; appears to be page-header residue, not part of the program)
import streamlit as st | |
from langchain.chains import LLMChain | |
from langchain.prompts import PromptTemplate | |
from langchain.llms import HuggingFaceHub | |
import fitz | |
from PIL import Image | |
import os | |
import pytesseract | |
import re | |
# App configuration.
# st.set_page_config is documented as the first Streamlit command to run on
# a page, so it is issued before the secrets lookup and the model setup.
st.set_page_config(page_title="DocuMentorAI", layout="wide", page_icon="📄")

# Copy the Hugging Face token from Streamlit secrets into the environment
# (presumably where the HuggingFaceHub client looks for it).
# A missing "HF_TOKEN" secret makes this line raise.
os.environ["HUGGINGFACEHUB_API_TOKEN"] = st.secrets["HF_TOKEN"]

# Single LLM handle shared by every generation chain in this script.
llm = HuggingFaceHub(
    repo_id="mistralai/Mistral-7B-Instruct-v0.3",
    model_kwargs={"temperature": 0.5},
)

st.title("📄 DocuMentorAI")

# Page-wide CSS: styles the ".output-container" class used around generated
# documents, enlarges text-area fonts and makes buttons full width.
st.markdown("""
<style>
.output-container {
background-color: #f0f2f6;
padding: 20px;
border-radius: 10px;
margin-top: 20px;
white-space: pre-wrap;
}
.stTextArea textarea {
font-size: 16px !important;
}
.stButton button {
width: 100%;
}
</style>
""", unsafe_allow_html=True)
# Helper Functions | |
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF, joined by single spaces.

    Args:
        pdf_file: File-like object whose ``read()`` returns the PDF bytes.

    Returns:
        The concatenated page text, or ``""`` if reading or parsing fails
        (the failure is shown to the user through ``st.error``).
    """
    try:
        raw_pdf = pdf_file.read()
        with fitz.open(stream=raw_pdf, filetype="pdf") as document:
            page_texts = [page.get_text() for page in document]
        return " ".join(page_texts)
    except Exception as e:
        # Best-effort extraction: report the problem in the UI and carry on.
        st.error(f"Error extracting text from PDF: {e}")
        return ""
def extract_text_from_image(image_file):
    """Run OCR on an uploaded image and return the recognised text.

    Args:
        image_file: Anything ``PIL.Image.open`` accepts.

    Returns:
        The string produced by ``pytesseract.image_to_string``, or ``""`` if
        opening the image or running OCR fails (the failure is shown to the
        user through ``st.error``).
    """
    try:
        recognised = pytesseract.image_to_string(Image.open(image_file))
    except Exception as e:
        # Best-effort extraction: report the problem in the UI and carry on.
        st.error(f"Error extracting text from image: {e}")
        return ""
    return recognised
def extract_text(uploaded_file):
    """Extract text from an uploaded file, dispatching on its MIME type.

    Args:
        uploaded_file: Upload object exposing a ``type`` attribute, or a
            falsy value when nothing was uploaded.

    Returns:
        ``""`` for a falsy upload; otherwise the PDF extractor's result when
        ``type`` is ``"application/pdf"`` and the image (OCR) extractor's
        result for every other type.
    """
    if not uploaded_file:
        return ""
    if uploaded_file.type == "application/pdf":
        return extract_text_from_pdf(uploaded_file)
    return extract_text_from_image(uploaded_file)
# A heading candidate longer than this many words is treated as ordinary
# content; it keeps full sentences that merely mention "skills",
# "employment", etc. from being mistaken for section headings.
_MAX_HEADING_WORDS = 5


def _match_section_heading(line, section_keywords):
    """Return ``(section, inline_text)`` if *line* looks like a heading, else ``None``."""
    # For "Skills: Python, SQL" only the part before the first colon is the
    # heading candidate; the rest is content that belongs to the section.
    heading, _, remainder = line.partition(':')
    if len(heading.split()) > _MAX_HEADING_WORDS:
        return None
    lowered = heading.lower()
    for section, keywords in section_keywords.items():
        if any(keyword in lowered for keyword in keywords):
            return section, remainder.strip()
    return None


def parse_resume(resume_text):
    """Split resume text into education/experience/skills/projects/publications.

    The text is scanned line by line. A short line (or the part of a line
    before its first colon) containing one of the known heading keywords
    starts a section; following non-empty lines are collected under that
    section until the next heading. Lines before the first heading are
    ignored. Heading detection is a heuristic and is case-insensitive.

    Compared with a plain "keyword anywhere in the line" test, this version:
      * ignores keywords in long sentences, so content is not lost,
      * keeps text that follows a heading on the same line,
      * merges a section that appears more than once instead of overwriting it.

    Args:
        resume_text: Raw resume text; ``None`` or ``""`` yields empty sections.

    Returns:
        Dict with keys ``education``, ``experience``, ``skills``,
        ``projects`` and ``publications``; each value is the section's lines
        joined with ``"\\n"`` (``""`` when the section was not found).
    """
    sections = {
        'education': ['Education:', 'EDUCATION', 'Academic Background'],
        'experience': ['Experience:', 'EXPERIENCE', 'Work History', 'Employment'],
        'skills': ['Skills:', 'SKILLS', 'Technical Skills', 'Technologies'],
        'projects': ['Projects:', 'PROJECTS', 'Key Projects'],
        'publications': ['Publications:', 'PUBLICATIONS', 'Research Papers']
    }
    # Normalise once: matching is case-insensitive and colon-agnostic.
    section_keywords = {
        section: [header.rstrip(':').lower() for header in headers]
        for section, headers in sections.items()
    }

    collected = {section: [] for section in sections}
    current_section = None
    for raw_line in (resume_text or '').splitlines():
        line = raw_line.strip()
        if not line:
            continue
        match = _match_section_heading(line, section_keywords)
        if match is not None:
            current_section, inline_text = match
            if inline_text:
                collected[current_section].append(inline_text)
        elif current_section:
            collected[current_section].append(line)

    return {section: '\n'.join(lines) for section, lines in collected.items()}
# One capitalised name word: "Smith", "McDonald", "O'Brien", "Smith-Jones".
_NAME_WORD = r"(?:[A-Z]')?[A-Z][a-z]+(?:-?[A-Z][a-z]+)*"
# Zero or more initials such as "J." or "J. R. ".
_INITIALS = r"(?:[A-Z]\.[ \t]*)*"
# Title followed by name words on the SAME line ([ \t], not \s, so a match
# cannot run across a line break into the next line's first word).
_PROFESSOR_RE = re.compile(
    r"\b(?:Dr\.|Professor|Prof\.?)[ \t]+"
    + _INITIALS
    + _NAME_WORD
    + r"(?:[ \t]+" + _INITIALS + _NAME_WORD + r")*"
)
# One capitalised word of an institution name, optionally hyphenated.
_CAP_WORD = r"[A-Z][A-Za-z]*(?:-[A-Za-z]+)*"
# "<Kind> of <Name ...>": later words must be capitalised, optionally joined
# by a lowercase connective, so trailing prose ("... for details") is not
# captured as part of the name.
_UNIVERSITY_RE = re.compile(
    r"\b(?:University|Institute|College|School) of[ \t]+"
    + _CAP_WORD
    + r"(?:[ \t]+(?:(?:of|and|at|for|in|the)[ \t]+)*" + _CAP_WORD + r")*"
)


def extract_professor_details(text):
    """Find the first professor name and institution mentioned in *text*.

    Args:
        text: Free text (e.g. a job/research opening); ``None`` is treated
            as empty.

    Returns:
        ``(professor, university)``. ``professor`` is the matched title plus
        name (e.g. ``"Dr. Jane Smith"``) and ``university`` a phrase such as
        ``"University of California"``. Either item is the string
        ``"Not Found"`` when nothing matches.
    """
    text = text or ""
    professor_match = _PROFESSOR_RE.search(text)
    university_match = _UNIVERSITY_RE.search(text)
    return (professor_match.group(0) if professor_match else "Not Found",
            university_match.group(0) if university_match else "Not Found")
def clean_output(text, type_="general"):
    """Trim raw LLM output down to the document itself.

    The result starts at the first opening marker found for *type_* (tried
    in list order; start of text if none is present). It ends at the first
    blank line after the first closing phrase found (also tried in list
    order; end of text if there is no such phrase or no blank line after
    it). For emails, any ``Phone:`` / ``Email:`` lines that were cut off
    are appended after a blank line.

    Args:
        text: Raw generated text; falsy input returns ``""``.
        type_: One of ``"email"``, ``"cover_letter"``,
            ``"research_statement"``, ``"sop"``; any other value uses the
            email opening markers.

    Returns:
        The cleaned, stripped document text.
    """
    if not text:
        return ""

    # Phrases that mark where the real document begins, per document type.
    start_markers = {
        "email": ["Dear"],
        "cover_letter": ["Dear", "To Whom", "Hiring"],
        "research_statement": ["Research Statement", "Statement of Research"],
        "sop": ["Statement of Purpose", "Personal Statement"]
    }
    # Closing phrases, in priority order.
    end_markers = ["Best regards,", "Sincerely,", "Yours sincerely,", "Kind regards,", "Thank you"]

    start_idx = 0
    for marker in start_markers.get(type_, start_markers["email"]):
        idx = text.find(marker)
        if idx != -1:
            start_idx = idx
            break

    end_idx = len(text)
    for marker in end_markers:
        # Search only from the document start: a closing phrase in the
        # preamble (e.g. "Thank you for the request") would otherwise put
        # end_idx before start_idx and produce an empty result.
        idx = text.find(marker, start_idx)
        if idx != -1:
            paragraph_break = text.find("\n\n", idx)
            if paragraph_break != -1:
                end_idx = paragraph_break
            break

    cleaned_text = text[start_idx:end_idx].strip()

    if type_ == "email":
        # Recover contact lines that fell after the cut; append nothing when
        # there are none, so the result never ends in a dangling blank line.
        contact_block = "\n".join(
            line for line in text[end_idx:].split("\n")
            if "Phone:" in line or "Email:" in line
        ).strip()
        if contact_block:
            cleaned_text += "\n\n" + contact_block

    return cleaned_text
# Seed the per-session store of generated documents on the first run only;
# every document type starts out as None (nothing generated yet).
if 'generated_content' not in st.session_state:
    st.session_state.generated_content = dict.fromkeys(
        ('email', 'cover_letter', 'research_statement', 'sop')
    )
# Prompt texts, one per document type. Placeholders in braces are filled
# from the parsed resume sections plus the tab-specific inputs.
_prompt_texts = {
    'email': """
Write ONLY a formal cold email for a research position.
Start with 'Dear Professor' and end with a signature.
Use these specific details from the CV:
{education}
{experience}
{skills}
{projects}
{publications}
Additional Context:
Professor: {professor_name}
University: {university_name}
Research Interests: {research_interests}
Why This Lab: {reason}
Guidelines:
1. Keep the email concise (max 400 words)
2. Focus on the most relevant experience and skills
3. Mention 1-2 specific projects that align with the lab's work
4. Include a clear statement of interest
5. End with your contact information
""",
    'cover_letter': """
Write ONLY a professional cover letter for {job_title} at {company}.
Use these specific details:
{education}
{experience}
{skills}
{projects}
Required Skills: {key_skills}
Guidelines:
1. Start with a formal greeting
2. Focus on experiences matching job requirements
3. Provide specific examples
4. Show why you're an ideal candidate
5. End professionally
""",
    'research_statement': """
Write ONLY a research statement focused on your academic journey and future goals.
Background:
{education}
{experience}
{skills}
{projects}
{publications}
Research Focus:
{key_projects}
Future Goals: {future_goals}
Guidelines:
1. Describe your research journey
2. Highlight key achievements
3. Connect past work to future goals
4. Show technical expertise
5. Present your research vision
""",
    'sop': """
Write ONLY a Statement of Purpose (SOP) for graduate studies.
Background:
{education}
{experience}
{skills}
{projects}
{publications}
Context:
Motivation: {motivation}
Career Goals: {career_goals}
Program Interest: {why_this_program}
Guidelines:
1. Tell your academic journey
2. Connect background to goals
3. Show preparation for graduate study
4. Demonstrate program alignment
5. Make a compelling case
""",
}

# Build one PromptTemplate and one LLMChain (bound to the shared ``llm``)
# per document type, keyed by the same names as the prompt texts.
templates = {}
chains = {}
for doc_type, prompt_text in _prompt_texts.items():
    templates[doc_type] = PromptTemplate.from_template(prompt_text)
    chains[doc_type] = LLMChain(llm=llm, prompt=templates[doc_type])
# Sidebar: collect the opening description and the CV once; every tab below
# reads the values computed here.
with st.sidebar:
    st.subheader("📝 Input Details")
    job_opening_text = st.text_area("Job/Research Opening Details", height=150)
    cv_resume_file = st.file_uploader("Upload CV/Resume", type=["pdf", "png", "jpg", "jpeg"])

    # No upload means no text.
    if cv_resume_file:
        cv_resume_text = extract_text(cv_resume_file)
    else:
        cv_resume_text = ""

    # Parse the resume a single time for all tabs; with no text, every
    # section is an empty string so the prompt placeholders still resolve.
    if cv_resume_text:
        resume_info = parse_resume(cv_resume_text)
    else:
        resume_info = dict.fromkeys(
            ('education', 'experience', 'skills', 'projects', 'publications'), ''
        )
# One tab per document type.
tab1, tab2, tab3, tab4 = st.tabs(["Cold Email", "Cover Letter", "Research Statement", "SOP"])

# Cold Email tab: needs both the opening text and the CV.
with tab1:
    # Professor and university are pulled from the opening text; each is the
    # string "Not Found" when the text contains no match.
    professor_name, university_name = extract_professor_details(job_opening_text)
    research_interests = st.text_input("Research Interests")
    reason = st.text_input("Why this professor/lab?")

    if st.button("Generate Email", key="email_btn"):
        if not (job_opening_text and cv_resume_text):
            st.error("Please provide all required inputs")
        else:
            with st.spinner("Generating..."):
                try:
                    email_inputs = dict(resume_info)
                    email_inputs.update({
                        "professor_name": professor_name,
                        "university_name": university_name,
                        "research_interests": research_interests,
                        "reason": reason,
                    })
                    generated_email = chains['email'].run(email_inputs)
                    # Keep the result in session state so it survives reruns.
                    st.session_state.generated_content['email'] = clean_output(generated_email, "email")
                except Exception as e:
                    st.error(f"Generation error: {e}")

    email_text = st.session_state.generated_content['email']
    if email_text:
        # NOTE(review): each st.markdown call is rendered as its own element,
        # so this opening <div> probably does not wrap the content below - confirm.
        st.markdown('<div class="output-container">', unsafe_allow_html=True)
        st.markdown(email_text)
        st.download_button("Download Email", email_text,
                           file_name="email.txt", key="email_download")
        st.markdown('</div>', unsafe_allow_html=True)
# Cover Letter tab: needs both the opening text and the CV.
with tab2:
    job_title = st.text_input("Job Title")
    # Ask for the organisation only when none was detected in the opening text.
    if university_name == "Not Found":
        company_name = st.text_input("Company/University")
    else:
        company_name = university_name
    key_skills = st.text_input("Key Skills Required")

    if st.button("Generate Cover Letter", key="cover_letter_btn"):
        if not (job_opening_text and cv_resume_text):
            st.error("Please provide all required inputs")
        else:
            with st.spinner("Generating..."):
                try:
                    letter_inputs = dict(resume_info)
                    letter_inputs.update({
                        "job_title": job_title,
                        "company": company_name,
                        "key_skills": key_skills,
                    })
                    generated_letter = chains['cover_letter'].run(letter_inputs)
                    # Keep the result in session state so it survives reruns.
                    st.session_state.generated_content['cover_letter'] = clean_output(generated_letter, "cover_letter")
                except Exception as e:
                    st.error(f"Generation error: {e}")

    cover_letter_text = st.session_state.generated_content['cover_letter']
    if cover_letter_text:
        # NOTE(review): each st.markdown call is rendered as its own element,
        # so this opening <div> probably does not wrap the content below - confirm.
        st.markdown('<div class="output-container">', unsafe_allow_html=True)
        st.markdown(cover_letter_text)
        st.download_button("Download Cover Letter", cover_letter_text,
                           file_name="cover_letter.txt", key="cover_letter_download")
        st.markdown('</div>', unsafe_allow_html=True)
# Research Statement tab: only the CV is required.
with tab3:
    key_projects = st.text_input("Key Research Projects")
    future_goals = st.text_input("Future Research Goals")

    if st.button("Generate Research Statement", key="research_stmt_btn"):
        if not cv_resume_text:
            st.error("Please upload your CV/Resume")
        else:
            with st.spinner("Generating..."):
                try:
                    statement_inputs = dict(resume_info)
                    statement_inputs.update({
                        "key_projects": key_projects,
                        "future_goals": future_goals,
                    })
                    generated_statement = chains['research_statement'].run(statement_inputs)
                    # Keep the result in session state so it survives reruns.
                    st.session_state.generated_content['research_statement'] = clean_output(generated_statement, "research_statement")
                except Exception as e:
                    st.error(f"Generation error: {e}")

    research_statement_text = st.session_state.generated_content['research_statement']
    if research_statement_text:
        # NOTE(review): each st.markdown call is rendered as its own element,
        # so this opening <div> probably does not wrap the content below - confirm.
        st.markdown('<div class="output-container">', unsafe_allow_html=True)
        st.markdown(research_statement_text)
        st.download_button("Download Research Statement", research_statement_text,
                           file_name="research_statement.txt", key="research_stmt_download")
        st.markdown('</div>', unsafe_allow_html=True)
# SOP tab: only the CV is required.
with tab4:
    motivation = st.text_input("Motivation for Graduate Studies")
    career_goals = st.text_input("Career Goals")
    why_this_program = st.text_input("Why This Program")

    if st.button("Generate SOP", key="sop_btn"):
        if not cv_resume_text:
            st.error("Please upload your CV/Resume")
        else:
            with st.spinner("Generating..."):
                try:
                    sop_inputs = dict(resume_info)
                    sop_inputs.update({
                        "motivation": motivation,
                        "career_goals": career_goals,
                        "why_this_program": why_this_program,
                    })
                    generated_sop = chains['sop'].run(sop_inputs)
                    # Keep the result in session state so it survives reruns.
                    st.session_state.generated_content['sop'] = clean_output(generated_sop, "sop")
                except Exception as e:
                    st.error(f"Generation error: {e}")

    sop_text = st.session_state.generated_content['sop']
    if sop_text:
        # NOTE(review): each st.markdown call is rendered as its own element,
        # so this opening <div> probably does not wrap the content below - confirm.
        st.markdown('<div class="output-container">', unsafe_allow_html=True)
        st.markdown(sop_text)
        st.download_button("Download SOP", sop_text,
                           file_name="sop.txt", key="sop_download")
        st.markdown('</div>', unsafe_allow_html=True)
# Reset button: forget every generated document, then rerun the script so
# the tabs redraw without the old output.
if st.sidebar.button("🔄 Reset All"):
    st.session_state.generated_content = dict.fromkeys(st.session_state.generated_content)
    # st.experimental_rerun was deprecated in favour of st.rerun and is not
    # available in newer Streamlit releases; prefer st.rerun and fall back
    # to the old name only on versions that predate it.
    _rerun = getattr(st, "rerun", None) or getattr(st, "experimental_rerun")
    _rerun()