Spaces:

Sobit
/

DocuMentorAI

Sleeping

App Files Files Community

Sobit committed 19 days ago

Commit

642f31f

verified ·

1 Parent(s): c16bb21

Update app.py

Browse files

Files changed (1) hide show

app.py +147 -378

app.py CHANGED Viewed

@@ -1,390 +1,159 @@
-import streamlit as st
-from langchain.chains import LLMChain
-from langchain.prompts import PromptTemplate
-from langchain.llms import HuggingFaceHub
-import fitz
-from PIL import Image
 import os
-import pytesseract
-import re
 # Set Hugging Face API Key
 os.environ["HUGGINGFACEHUB_API_TOKEN"] = st.secrets["HF_TOKEN"]
 # Initialize LLM
-llm = HuggingFaceHub(repo_id="mistralai/Mistral-7B-Instruct-v0.3", model_kwargs={"temperature": 0.5})
-# App Configuration
-st.set_page_config(page_title="DocuMentorAI", layout="wide", page_icon="📄")
-st.title("📄 DocuMentorAI")
-# Improved CSS
-st.markdown("""
-<style>
-    .output-container {
-        background-color: #f0f2f6;
-        padding: 20px;
-        border-radius: 10px;
-        margin-top: 20px;
-        white-space: pre-wrap;
-    }
-    .stTextArea textarea {
-        font-size: 16px !important;
-    }
-    .stButton button {
-        width: 100%;
-    }
-</style>
-""", unsafe_allow_html=True)
-# Helper Functions
-def extract_text_from_pdf(pdf_file):
-    """Return the text of every page of a PDF, joined by single spaces.
-
-    Args:
-        pdf_file: File-like object whose ``read()`` yields the PDF bytes.
-
-    Returns:
-        str: Concatenated page text, or "" if reading or parsing fails
-        (the failure is reported through ``st.error``).
-    """
-    try:
-        raw_bytes = pdf_file.read()
-        with fitz.open(stream=raw_bytes, filetype="pdf") as document:
-            page_texts = []
-            for page in document:
-                page_texts.append(page.get_text())
-            return " ".join(page_texts)
-    except Exception as e:
-        st.error(f"Error extracting text from PDF: {e}")
-        return ""
-def extract_text_from_image(image_file):
-    """Run OCR on an uploaded image and return the recognised text.
-
-    Args:
-        image_file: File-like object or path accepted by ``Image.open``.
-
-    Returns:
-        str: Text produced by ``pytesseract.image_to_string``, or "" if the
-        image cannot be opened or OCR fails (reported through ``st.error``).
-    """
     try:
-        # Context manager releases the image's resources once OCR is done.
-        with Image.open(image_file) as image:
-            return pytesseract.image_to_string(image)
     except Exception as e:
-        st.error(f"Error extracting text from image: {e}")
-        return ""
-def extract_text(uploaded_file):
-    """Extract text from an upload, dispatching on its MIME type.
-
-    PDFs (``type == "application/pdf"``) go to ``extract_text_from_pdf``;
-    every other upload is treated as an image and sent through OCR.
-    A missing upload yields "".
-    """
-    if not uploaded_file:
         return ""
-    if uploaded_file.type == "application/pdf":
-        return extract_text_from_pdf(uploaded_file)
-    return extract_text_from_image(uploaded_file)
-def parse_resume(resume_text):
-    """Split resume text into named sections keyed by recognised headers.
-
-    A line counts as a section header when it contains (case-insensitively)
-    any of that section's header variants; the first matching section wins.
-    Non-blank lines after a header are collected, stripped, until the next
-    header. Lines before the first header are ignored.
-
-    Args:
-        resume_text (str): Raw resume text, one entry per line.
-
-    Returns:
-        dict: Keys 'education', 'experience', 'skills', 'projects' and
-        'publications', each mapped to its newline-joined content ('' when
-        the section is absent or empty).
-    """
-    header_variants = {
-        'education': ['Education:', 'EDUCATION', 'Academic Background'],
-        'experience': ['Experience:', 'EXPERIENCE', 'Work History', 'Employment'],
-        'skills': ['Skills:', 'SKILLS', 'Technical Skills', 'Technologies'],
-        'projects': ['Projects:', 'PROJECTS', 'Key Projects'],
-        'publications': ['Publications:', 'PUBLICATIONS', 'Research Papers']
-    }
-    # Lower-case the header variants once instead of once per resume line.
-    folded_variants = {
-        name: [variant.lower() for variant in variants]
-        for name, variants in header_variants.items()
-    }
-    parsed_info = dict.fromkeys(header_variants, '')
-    active_section = None
-    collected = []
-    for raw_line in resume_text.split('\n'):
-        stripped = raw_line.strip()
-        if not stripped:
-            continue
-        folded_line = stripped.lower()
-        matched_section = next(
-            (name for name, variants in folded_variants.items()
-             if any(variant in folded_line for variant in variants)),
-            None,
-        )
-        if matched_section is None:
-            # Ordinary content line: keep it only once a section has started.
-            if active_section:
-                collected.append(stripped)
-            continue
-        # Header line: flush what was gathered for the previous section.
-        if active_section:
-            parsed_info[active_section] = '\n'.join(collected)
-        active_section = matched_section
-        collected = []
-    # Flush the trailing section, if it gathered anything.
-    if active_section and collected:
-        parsed_info[active_section] = '\n'.join(collected)
-    return parsed_info
-def extract_professor_details(text):
-    """Find the first professor name and institution mentioned in ``text``.
-
-    Args:
-        text (str): Free text such as a job or research opening description.
-
-    Returns:
-        tuple[str, str]: ``(professor, university)``. The professor is a
-        title ("Dr.", "Professor", "Prof"/"Prof.") followed by capitalised
-        name words; the university is "<University|Institute|College|School>
-        of" followed by capitalised words. Either element is "Not Found"
-        when its pattern does not match.
-    """
-    # [ \t] instead of \s keeps each match on one line, and requiring every
-    # word to be capitalised stops the match before trailing lowercase words
-    # (e.g. "University of Toronto for details").
-    professor_pattern = r"(Dr\.|Professor|Prof\.?)[ \t]+([A-Z][a-z]+(?:[ \t][A-Z][a-z]+)*)"
-    university_pattern = r"(University|Institute|College|School) of [A-Z][A-Za-z]*(?:[ \t]+[A-Z][A-Za-z]*)*"
-    professor_match = re.search(professor_pattern, text)
-    university_match = re.search(university_pattern, text)
-    return (professor_match.group(0) if professor_match else "Not Found",
-            university_match.group(0) if university_match else "Not Found")
-def clean_output(text, type_="general"):
-    """Trim generated text down to the document itself.
-
-    The result starts at the first start marker found for ``type_`` (unknown
-    types fall back to the email markers) and ends at the first blank line
-    following the first end marker found (closing phrases such as
-    "Sincerely,"). For emails, "Phone:"/"Email:" lines that follow the cut
-    are appended as contact details.
-
-    Args:
-        text (str): Raw generated text; may be empty or None.
-        type_ (str): "email", "cover_letter", "research_statement" or "sop".
-
-    Returns:
-        str: The cleaned document, or "" when ``text`` is empty.
-    """
-    if not text:
-        return ""
-    # Common start markers
-    start_markers = {
-        "email": ["Dear"],
-        "cover_letter": ["Dear", "To Whom", "Hiring"],
-        "research_statement": ["Research Statement", "Statement of Research"],
-        "sop": ["Statement of Purpose", "Personal Statement"]
-    }
-    # Common end markers
-    end_markers = ["Best regards,", "Sincerely,", "Yours sincerely,", "Kind regards,", "Thank you"]
-    # Find start of content
-    start_idx = 0
-    for marker in start_markers.get(type_, start_markers["email"]):
-        idx = text.find(marker)
-        if idx != -1:
-            start_idx = idx
-            break
-    # Find end of content. Search only from start_idx onward: a closing
-    # phrase before the start marker would otherwise put the end ahead of
-    # the start and produce an empty result.
-    end_idx = len(text)
-    for marker in end_markers:
-        idx = text.find(marker, start_idx)
-        if idx != -1:
-            paragraph_end = text.find("\n\n", idx)
-            if paragraph_end != -1:
-                end_idx = paragraph_end
-            break
-    cleaned_text = text[start_idx:end_idx].strip()
-    # Add contact information for emails, but only when such lines really
-    # follow the cut; otherwise a bare "\n\n" would be appended.
-    if type_ == "email":
-        contact_lines = [
-            line for line in text[end_idx:].split("\n")
-            if any(info in line for info in ["Phone:", "Email:"])
-        ]
-        if contact_lines:
-            cleaned_text += "\n\n" + "\n".join(contact_lines).strip()
-    return cleaned_text
-# Create the per-document result cache on first run; every slot starts empty (None).
-if 'generated_content' not in st.session_state:
-    st.session_state.generated_content = dict.fromkeys(
-        ('email', 'cover_letter', 'research_statement', 'sop')
-    )
-# Template Definitions (simplified and standardized)
-templates = {
-    'email': """
-Write ONLY a formal cold email for a research position.
-Start with 'Dear Professor' and end with a signature.
-Use these specific details from the CV:
-{education}
-{experience}
-{skills}
-{projects}
-{publications}
-Additional Context:
-Professor: {professor_name}
-University: {university_name}
-Research Interests: {research_interests}
-Why This Lab: {reason}
-Guidelines:
-1. Keep the email concise (max 400 words)
-2. Focus on the most relevant experience and skills
-3. Mention 1-2 specific projects that align with the lab's work
-4. Include a clear statement of interest
-5. End with your contact information
-""",
-    'cover_letter': """
-Write ONLY a professional cover letter for {job_title} at {company}.
-Use these specific details:
-{education}
-{experience}
-{skills}
-{projects}
-Required Skills: {key_skills}
-Guidelines:
-1. Start with a formal greeting
-2. Focus on experiences matching job requirements
-3. Provide specific examples
-4. Show why you're an ideal candidate
-5. End professionally
-""",
-    'research_statement': """
-Write ONLY a research statement focused on your academic journey and future goals.
-Background:
-{education}
-{experience}
-{skills}
-{projects}
-{publications}
-Research Focus:
-{key_projects}
-Future Goals: {future_goals}
-Guidelines:
-1. Describe your research journey
-2. Highlight key achievements
-3. Connect past work to future goals
-4. Show technical expertise
-5. Present your research vision
-""",
-    'sop': """
-Write ONLY a Statement of Purpose (SOP) for graduate studies.
-Background:
-{education}
-{experience}
-{skills}
-{projects}
-{publications}
-Context:
-Motivation: {motivation}
-Career Goals: {career_goals}
-Program Interest: {why_this_program}
-Guidelines:
-1. Tell your academic journey
-2. Connect background to goals
-3. Show preparation for graduate study
-4. Demonstrate program alignment
-5. Make a compelling case
-"""
-}
-# Replace each raw template string with a PromptTemplate, then build one
-# LLMChain per document type on the shared llm.
-templates = {
-    doc_type: PromptTemplate.from_template(raw_text)
-    for doc_type, raw_text in templates.items()
-}
-chains = {
-    doc_type: LLMChain(llm=llm, prompt=prompt)
-    for doc_type, prompt in templates.items()
-}
-# Sidebar: collect the opening description and the CV upload.
-with st.sidebar:
-    st.subheader("📝 Input Details")
-    job_opening_text = st.text_area("Job/Research Opening Details", height=150)
-    cv_resume_file = st.file_uploader("Upload CV/Resume", type=["pdf", "png", "jpg", "jpeg"])
-    if cv_resume_file:
-        cv_resume_text = extract_text(cv_resume_file)
-    else:
-        cv_resume_text = ""
-# Parse the resume once; every tab reuses the result. Without any CV text,
-# fall back to empty sections so the prompt placeholders still resolve.
-if cv_resume_text:
-    resume_info = parse_resume(cv_resume_text)
-else:
-    resume_info = dict.fromkeys(
-        ('education', 'experience', 'skills', 'projects', 'publications'), ''
-    )
-# Tab Layout
-tab1, tab2, tab3, tab4 = st.tabs(["Cold Email", "Cover Letter", "Research Statement", "SOP"])
-# Cold Email Tab
-# Generates a cold email from the parsed resume, the professor/university
-# detected in the opening text, and two free-text inputs.
-with tab1:
-    # Either value is the literal "Not Found" when its pattern does not match.
-    professor_name, university_name = extract_professor_details(job_opening_text)
-    research_interests = st.text_input("Research Interests")
-    reason = st.text_input("Why this professor/lab?")
-    if st.button("Generate Email", key="email_btn"):
-        # Both the opening description and the CV text are required here.
-        if job_opening_text and cv_resume_text:
-            with st.spinner("Generating..."):
-                try:
-                    # resume_info supplies the resume-section placeholders of the template.
-                    generated_email = chains['email'].run({
-                        **resume_info,
-                        "professor_name": professor_name,
-                        "university_name": university_name,
-                        "research_interests": research_interests,
-                        "reason": reason
-                    })
-                    # Store the cleaned result in session state so the display branch below can read it.
-                    st.session_state.generated_content['email'] = clean_output(generated_email, "email")
-                except Exception as e:
-                    st.error(f"Generation error: {e}")
-        else:
-            st.error("Please provide all required inputs")
-    # Show the stored email (if any) together with a download button.
-    if st.session_state.generated_content['email']:
-        # NOTE(review): the opening and closing div tags are emitted by separate
-        # st.markdown calls; confirm they actually wrap the content in between.
-        st.markdown('<div class="output-container">', unsafe_allow_html=True)
-        st.markdown(st.session_state.generated_content['email'])
-        st.download_button("Download Email", st.session_state.generated_content['email'],
-                         file_name="email.txt", key="email_download")
-        st.markdown('</div>', unsafe_allow_html=True)
-# Cover Letter Tab
-# Generates a cover letter from the parsed resume plus job title, company
-# and required skills.
-with tab2:
-    job_title = st.text_input("Job Title")
-    # Reuse the university detected in the Cold Email tab; the manual input
-    # is only created when detection returned "Not Found".
-    company_name = university_name if university_name != "Not Found" else st.text_input("Company/University")
-    key_skills = st.text_input("Key Skills Required")
-    if st.button("Generate Cover Letter", key="cover_letter_btn"):
-        # Both the opening description and the CV text are required here.
-        if job_opening_text and cv_resume_text:
-            with st.spinner("Generating..."):
-                try:
-                    generated_letter = chains['cover_letter'].run({
-                        **resume_info,
-                        "job_title": job_title,
-                        "company": company_name,
-                        "key_skills": key_skills
-                    })
-                    # Store the cleaned result in session state so the display branch below can read it.
-                    st.session_state.generated_content['cover_letter'] = clean_output(generated_letter, "cover_letter")
-                except Exception as e:
-                    st.error(f"Generation error: {e}")
-        else:
-            st.error("Please provide all required inputs")
-    # Show the stored cover letter (if any) together with a download button.
-    if st.session_state.generated_content['cover_letter']:
-        # NOTE(review): the opening and closing div tags are emitted by separate
-        # st.markdown calls; confirm they actually wrap the content in between.
-        st.markdown('<div class="output-container">', unsafe_allow_html=True)
-        st.markdown(st.session_state.generated_content['cover_letter'])
-        st.download_button("Download Cover Letter", st.session_state.generated_content['cover_letter'],
-                         file_name="cover_letter.txt", key="cover_letter_download")
-        st.markdown('</div>', unsafe_allow_html=True)
-# Research Statement Tab
-# Generates a research statement from the parsed resume plus the user's key
-# projects and future goals.
-with tab3:
-    key_projects = st.text_input("Key Research Projects")
-    future_goals = st.text_input("Future Research Goals")
-    if st.button("Generate Research Statement", key="research_stmt_btn"):
-        # Only the CV text is required for this document type.
-        if cv_resume_text:
-            with st.spinner("Generating..."):
-                try:
-                    generated_statement = chains['research_statement'].run({
-                        **resume_info,
-                        "key_projects": key_projects,
-                        "future_goals": future_goals
-                    })
-                    # Store the cleaned result in session state so the display branch below can read it.
-                    st.session_state.generated_content['research_statement'] = clean_output(generated_statement, "research_statement")
-                except Exception as e:
-                    st.error(f"Generation error: {e}")
-        else:
-            st.error("Please upload your CV/Resume")
-    # Show the stored statement (if any) together with a download button.
-    if st.session_state.generated_content['research_statement']:
-        # NOTE(review): the opening and closing div tags are emitted by separate
-        # st.markdown calls; confirm they actually wrap the content in between.
-        st.markdown('<div class="output-container">', unsafe_allow_html=True)
-        st.markdown(st.session_state.generated_content['research_statement'])
-        st.download_button("Download Research Statement", st.session_state.generated_content['research_statement'],
-                         file_name="research_statement.txt", key="research_stmt_download")
-        st.markdown('</div>', unsafe_allow_html=True)
-# SOP Tab
-# Generates a Statement of Purpose from the parsed resume plus motivation,
-# career goals and program interest.
-with tab4:
-    motivation = st.text_input("Motivation for Graduate Studies")
-    career_goals = st.text_input("Career Goals")
-    why_this_program = st.text_input("Why This Program")
-    if st.button("Generate SOP", key="sop_btn"):
-        # Only the CV text is required for this document type.
-        if cv_resume_text:
-            with st.spinner("Generating..."):
-                try:
-                    generated_sop = chains['sop'].run({
-                        **resume_info,
-                        "motivation": motivation,
-                        "career_goals": career_goals,
-                        "why_this_program": why_this_program
-                    })
-                    # Store the cleaned result in session state so the display branch below can read it.
-                    st.session_state.generated_content['sop'] = clean_output(generated_sop, "sop")
-                except Exception as e:
-                    st.error(f"Generation error: {e}")
-        else:
-            st.error("Please upload your CV/Resume")
-    # Show the stored SOP (if any) together with a download button.
-    if st.session_state.generated_content['sop']:
-        # NOTE(review): the opening and closing div tags are emitted by separate
-        # st.markdown calls; confirm they actually wrap the content in between.
-        st.markdown('<div class="output-container">', unsafe_allow_html=True)
-        st.markdown(st.session_state.generated_content['sop'])
-        st.download_button("Download SOP", st.session_state.generated_content['sop'],
-                         file_name="sop.txt", key="sop_download")
-        st.markdown('</div>', unsafe_allow_html=True)
-# Reset Button: clear every generated document, then rerun the script so
-# the tabs redraw without the old output.
-if st.sidebar.button("🔄 Reset All"):
-    st.session_state.generated_content = {key: None for key in st.session_state.generated_content}
-    # st.experimental_rerun is deprecated in favour of st.rerun; prefer the
-    # new name when this Streamlit version has it, else fall back.
-    rerun = getattr(st, "rerun", None) or st.experimental_rerun
-    rerun()

 import os
+import streamlit as st
+import PyPDF2
+from langchain_community.llms import HuggingFaceHub
+# Streamlit page configuration
+st.set_page_config(page_title="Research Position Application Generator", page_icon="🔬")
 # Set Hugging Face API Key
 os.environ["HUGGINGFACEHUB_API_TOKEN"] = st.secrets["HF_TOKEN"]
 # Initialize LLM
+llm = HuggingFaceHub(
+    repo_id="mistralai/Mistral-7B-Instruct-v0.3",
+    model_kwargs={"temperature": 0.5}
+)
+def extract_text_from_pdf(uploaded_file):
+    """
+    Extract text from an uploaded PDF file.
+    Args:
+        uploaded_file (UploadedFile): PDF file uploaded by the user
+    Returns:
+        str: Text of all pages concatenated in page order, or "" if the
+        PDF cannot be read (the failure is reported via st.error)
+    """
     try:
+        pdf_reader = PyPDF2.PdfReader(uploaded_file)
+        # A page with no extractable text contributes "" instead of breaking
+        # the concatenation; join avoids repeated string re-allocation.
+        return "".join(page.extract_text() or "" for page in pdf_reader.pages)
     except Exception as e:
+        st.error(f"Error extracting PDF text: {e}")
         return ""
+def generate_cold_email(position_details, cv_text):
+    """
+    Ask the LLM for a short cold email addressed to the professor.
+    Args:
+        position_details (dict): Must contain 'professor_name', 'university'
+            and 'research_focus'
+        cv_text (str): Text extracted from the CV/resume
+    Returns:
+        str: Whatever llm.invoke returns for the assembled prompt
+    """
+    professor = position_details['professor_name']
+    university = position_details['university']
+    focus = position_details['research_focus']
+    prompt = f"""Write a professional and concise cold email to Professor {professor}
+    at {university} about the research position in {focus}.
+    The email should:
+    1. Demonstrate knowledge of the professor's research
+    2. Highlight relevant experience from the CV
+    3. Express genuine interest in the position
+    4. Be no more than 250 words
+    CV Details:
+    {cv_text}
+    Research Position Details:
+    Research Focus: {focus}
+    Professor: {professor}
+    University: {university}
+    """
+    email_text = llm.invoke(prompt)
+    return email_text
+def generate_cover_letter(position_details, cv_text):
+    """
+    Ask the LLM for a formal cover letter for the research position.
+    Args:
+        position_details (dict): Must contain 'research_focus' and
+            'university'
+        cv_text (str): Text extracted from the CV/resume
+    Returns:
+        str: Whatever llm.invoke returns for the assembled prompt
+    """
+    focus = position_details['research_focus']
+    university = position_details['university']
+    prompt = f"""Write a professional and formal cover letter for a research position with the following details:
+    Research Focus: {focus}
+    University: {university}
+    The cover letter should:
+    1. Follow a standard business letter format
+    2. Clearly state the purpose of the letter
+    3. Highlight relevant skills and experiences from the CV
+    4. Demonstrate alignment with the research position
+    5. Be 300-400 words long
+    6. Include a strong closing paragraph
+    CV Details:
+    {cv_text}
+    """
+    letter_text = llm.invoke(prompt)
+    return letter_text
+def main():
+    """
+    Main Streamlit app function.
+    Collects the position details and the CV in the sidebar, generates a
+    cold email and a cover letter on request, and shows both with download
+    buttons. Generated documents are kept in st.session_state so they are
+    still shown after a later rerun of the script (for example one
+    triggered by clicking a download button).
+    """
+    st.title("🔬 Research Position Application Generator")
+    # Sidebar for position details
+    st.sidebar.header("Research Position Details")
+    professor_name = st.sidebar.text_input("Professor's Name")
+    university = st.sidebar.text_input("University")
+    research_focus = st.sidebar.text_input("Research Focus")
+    # CV Upload
+    st.sidebar.header("Upload CV/Resume")
+    uploaded_cv = st.sidebar.file_uploader("Choose a PDF file", type="pdf")
+    # Generate button
+    if st.sidebar.button("Generate Documents"):
+        # Validate inputs
+        if not (professor_name and university and research_focus and uploaded_cv):
+            st.error("Please fill in all details and upload a CV")
+            return
+        # Extract CV text; stop here rather than prompt the LLM with no CV
+        cv_text = extract_text_from_pdf(uploaded_cv)
+        if not cv_text.strip():
+            st.error("No text could be extracted from the uploaded CV")
+            return
+        # Prepare position details
+        position_details = {
+            'professor_name': professor_name,
+            'university': university,
+            'research_focus': research_focus
+        }
+        # Generate documents; report an LLM failure instead of crashing
+        try:
+            with st.spinner('Generating documents...'):
+                cold_email = generate_cold_email(position_details, cv_text)
+                cover_letter = generate_cover_letter(position_details, cv_text)
+        except Exception as e:
+            st.error(f"Error generating documents: {e}")
+            return
+        st.session_state["generated_documents"] = {
+            'cold_email': cold_email,
+            'cover_letter': cover_letter
+        }
+    # Display results from session state so they outlive the button click
+    documents = st.session_state.get("generated_documents")
+    if not documents:
+        return
+    st.header("Generated Documents")
+    # Cold Email
+    st.subheader("Cold Email")
+    st.write(documents['cold_email'])
+    st.download_button(
+        label="Download Cold Email",
+        data=documents['cold_email'],
+        file_name="cold_email.txt",
+        mime="text/plain"
+    )
+    # Cover Letter
+    st.subheader("Cover Letter")
+    st.write(documents['cover_letter'])
+    st.download_button(
+        label="Download Cover Letter",
+        data=documents['cover_letter'],
+        file_name="cover_letter.txt",
+        mime="text/plain"
+    )
+if __name__ == "__main__":
+    main()