Sobit committed on
Commit
642f31f
·
verified ·
1 Parent(s): c16bb21

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +147 -378
app.py CHANGED
@@ -1,390 +1,159 @@
1
- import streamlit as st
2
- from langchain.chains import LLMChain
3
- from langchain.prompts import PromptTemplate
4
- from langchain.llms import HuggingFaceHub
5
- import fitz
6
- from PIL import Image
7
  import os
8
- import pytesseract
9
- import re
 
 
 
 
10
 
11
  # Set Hugging Face API Key
12
  os.environ["HUGGINGFACEHUB_API_TOKEN"] = st.secrets["HF_TOKEN"]
13
 
14
  # Initialize LLM
15
- llm = HuggingFaceHub(repo_id="mistralai/Mistral-7B-Instruct-v0.3", model_kwargs={"temperature": 0.5})
16
-
17
- # App Configuration
18
- st.set_page_config(page_title="DocuMentorAI", layout="wide", page_icon="📄")
19
- st.title("📄 DocuMentorAI")
20
-
21
- # Improved CSS
22
- st.markdown("""
23
- <style>
24
- .output-container {
25
- background-color: #f0f2f6;
26
- padding: 20px;
27
- border-radius: 10px;
28
- margin-top: 20px;
29
- white-space: pre-wrap;
30
- }
31
- .stTextArea textarea {
32
- font-size: 16px !important;
33
- }
34
- .stButton button {
35
- width: 100%;
36
- }
37
- </style>
38
- """, unsafe_allow_html=True)
39
-
40
- # Helper Functions
41
- def extract_text_from_pdf(pdf_file):
42
- try:
43
- pdf_bytes = pdf_file.read()
44
- with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
45
- return " ".join([page.get_text() for page in doc])
46
- except Exception as e:
47
- st.error(f"Error extracting text from PDF: {e}")
48
- return ""
49
 
50
- def extract_text_from_image(image_file):
 
 
 
 
 
 
 
 
 
51
  try:
52
- image = Image.open(image_file)
53
- return pytesseract.image_to_string(image)
 
 
 
54
  except Exception as e:
55
- st.error(f"Error extracting text from image: {e}")
56
- return ""
57
-
58
- def extract_text(uploaded_file):
59
- if not uploaded_file:
60
  return ""
61
- return extract_text_from_pdf(uploaded_file) if uploaded_file.type == "application/pdf" else extract_text_from_image(uploaded_file)
62
 
63
- def parse_resume(resume_text):
64
- """Extract key information from resume text using improved parsing"""
65
- sections = {
66
- 'education': ['Education:', 'EDUCATION', 'Academic Background'],
67
- 'experience': ['Experience:', 'EXPERIENCE', 'Work History', 'Employment'],
68
- 'skills': ['Skills:', 'SKILLS', 'Technical Skills', 'Technologies'],
69
- 'projects': ['Projects:', 'PROJECTS', 'Key Projects'],
70
- 'publications': ['Publications:', 'PUBLICATIONS', 'Research Papers']
71
- }
72
-
73
- parsed_info = {key: '' for key in sections}
74
-
75
- # Convert text to lines for better parsing
76
- lines = resume_text.split('\n')
77
- current_section = None
78
- section_content = []
79
-
80
- for line in lines:
81
- line = line.strip()
82
- if not line:
83
- continue
84
-
85
- # Check if this line is a section header
86
- for section, headers in sections.items():
87
- if any(header.lower() in line.lower() for header in headers):
88
- if current_section:
89
- parsed_info[current_section] = '\n'.join(section_content)
90
- current_section = section
91
- section_content = []
92
- break
93
- else:
94
- if current_section:
95
- section_content.append(line)
96
-
97
- # Add the last section
98
- if current_section and section_content:
99
- parsed_info[current_section] = '\n'.join(section_content)
100
-
101
- return parsed_info
102
-
103
- def extract_professor_details(text):
104
- professor_pattern = r"(Dr\.|Professor|Prof\.?)\s+([A-Z][a-z]+(?:\s[A-Z][a-z]+)*)"
105
- university_pattern = r"(University|Institute|College|School) of [A-Z][A-Za-z\s]+"
106
-
107
- professor_match = re.search(professor_pattern, text)
108
- university_match = re.search(university_pattern, text)
109
-
110
- return (professor_match.group(0) if professor_match else "Not Found",
111
- university_match.group(0) if university_match else "Not Found")
112
-
113
- def clean_output(text, type_="general"):
114
- """Unified cleaning function for all document types"""
115
- if not text:
116
- return ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
 
118
- # Common start markers
119
- start_markers = {
120
- "email": ["Dear"],
121
- "cover_letter": ["Dear", "To Whom", "Hiring"],
122
- "research_statement": ["Research Statement", "Statement of Research"],
123
- "sop": ["Statement of Purpose", "Personal Statement"]
124
- }
125
-
126
- # Common end markers
127
- end_markers = ["Best regards,", "Sincerely,", "Yours sincerely,", "Kind regards,", "Thank you"]
128
-
129
- # Find start of content
130
- start_idx = 0
131
- relevant_starts = start_markers.get(type_, start_markers["email"])
132
- for marker in relevant_starts:
133
- idx = text.find(marker)
134
- if idx != -1:
135
- start_idx = idx
136
- break
137
-
138
- # Find end of content
139
- end_idx = len(text)
140
- for marker in end_markers:
141
- idx = text.find(marker)
142
- if idx != -1:
143
- end_idx = text.find("\n\n", idx) if text.find("\n\n", idx) != -1 else len(text)
144
- break
145
-
146
- cleaned_text = text[start_idx:end_idx].strip()
147
-
148
- # Add contact information for emails
149
- if type_ == "email" and ("Phone:" in text or "Email:" in text):
150
- contact_info = "\n\n" + "\n".join([
151
- line for line in text[end_idx:].split("\n")
152
- if any(info in line for info in ["Phone:", "Email:"])
153
- ]).strip()
154
- cleaned_text += contact_info
155
-
156
- return cleaned_text
157
-
158
- # Initialize session state
159
- if 'generated_content' not in st.session_state:
160
- st.session_state.generated_content = {
161
- 'email': None,
162
- 'cover_letter': None,
163
- 'research_statement': None,
164
- 'sop': None
165
- }
166
-
167
- # Template Definitions (simplified and standardized)
168
- templates = {
169
- 'email': """
170
- Write ONLY a formal cold email for a research position.
171
- Start with 'Dear Professor' and end with a signature.
172
-
173
- Use these specific details from the CV:
174
- {education}
175
- {experience}
176
- {skills}
177
- {projects}
178
- {publications}
179
-
180
- Additional Context:
181
- Professor: {professor_name}
182
- University: {university_name}
183
- Research Interests: {research_interests}
184
- Why This Lab: {reason}
185
-
186
- Guidelines:
187
- 1. Keep the email concise (max 400 words)
188
- 2. Focus on the most relevant experience and skills
189
- 3. Mention 1-2 specific projects that align with the lab's work
190
- 4. Include a clear statement of interest
191
- 5. End with your contact information
192
- """,
193
- 'cover_letter': """
194
- Write ONLY a professional cover letter for {job_title} at {company}.
195
- Use these specific details:
196
- {education}
197
- {experience}
198
- {skills}
199
- {projects}
200
-
201
- Required Skills: {key_skills}
202
-
203
- Guidelines:
204
- 1. Start with a formal greeting
205
- 2. Focus on experiences matching job requirements
206
- 3. Provide specific examples
207
- 4. Show why you're an ideal candidate
208
- 5. End professionally
209
- """,
210
- 'research_statement': """
211
- Write ONLY a research statement focused on your academic journey and future goals.
212
- Background:
213
- {education}
214
- {experience}
215
- {skills}
216
- {projects}
217
- {publications}
218
-
219
- Research Focus:
220
- {key_projects}
221
- Future Goals: {future_goals}
222
-
223
- Guidelines:
224
- 1. Describe your research journey
225
- 2. Highlight key achievements
226
- 3. Connect past work to future goals
227
- 4. Show technical expertise
228
- 5. Present your research vision
229
- """,
230
- 'sop': """
231
- Write ONLY a Statement of Purpose (SOP) for graduate studies.
232
- Background:
233
- {education}
234
- {experience}
235
- {skills}
236
- {projects}
237
- {publications}
238
-
239
- Context:
240
- Motivation: {motivation}
241
- Career Goals: {career_goals}
242
- Program Interest: {why_this_program}
243
-
244
- Guidelines:
245
- 1. Tell your academic journey
246
- 2. Connect background to goals
247
- 3. Show preparation for graduate study
248
- 4. Demonstrate program alignment
249
- 5. Make a compelling case
250
- """
251
- }
252
-
253
- # Convert templates to PromptTemplate objects
254
- templates = {k: PromptTemplate.from_template(v) for k, v in templates.items()}
255
- chains = {key: LLMChain(llm=llm, prompt=template) for key, template in templates.items()}
256
-
257
- # Sidebar for Input Collection
258
- with st.sidebar:
259
- st.subheader("📝 Input Details")
260
- job_opening_text = st.text_area("Job/Research Opening Details", height=150)
261
- cv_resume_file = st.file_uploader("Upload CV/Resume", type=["pdf", "png", "jpg", "jpeg"])
262
- cv_resume_text = extract_text(cv_resume_file) if cv_resume_file else ""
263
-
264
- # Parse resume once for all tabs
265
- resume_info = parse_resume(cv_resume_text) if cv_resume_text else {
266
- 'education': '', 'experience': '', 'skills': '', 'projects': '', 'publications': ''
267
- }
268
-
269
- # Tab Layout
270
- tab1, tab2, tab3, tab4 = st.tabs(["Cold Email", "Cover Letter", "Research Statement", "SOP"])
271
-
272
- # Cold Email Tab
273
- with tab1:
274
- professor_name, university_name = extract_professor_details(job_opening_text)
275
- research_interests = st.text_input("Research Interests")
276
- reason = st.text_input("Why this professor/lab?")
277
-
278
- if st.button("Generate Email", key="email_btn"):
279
- if job_opening_text and cv_resume_text:
280
- with st.spinner("Generating..."):
281
- try:
282
- generated_email = chains['email'].run({
283
- **resume_info,
284
- "professor_name": professor_name,
285
- "university_name": university_name,
286
- "research_interests": research_interests,
287
- "reason": reason
288
- })
289
- st.session_state.generated_content['email'] = clean_output(generated_email, "email")
290
- except Exception as e:
291
- st.error(f"Generation error: {e}")
292
- else:
293
- st.error("Please provide all required inputs")
294
-
295
- if st.session_state.generated_content['email']:
296
- st.markdown('<div class="output-container">', unsafe_allow_html=True)
297
- st.markdown(st.session_state.generated_content['email'])
298
- st.download_button("Download Email", st.session_state.generated_content['email'],
299
- file_name="email.txt", key="email_download")
300
- st.markdown('</div>', unsafe_allow_html=True)
301
-
302
- # Cover Letter Tab
303
- with tab2:
304
- job_title = st.text_input("Job Title")
305
- company_name = university_name if university_name != "Not Found" else st.text_input("Company/University")
306
- key_skills = st.text_input("Key Skills Required")
307
-
308
- if st.button("Generate Cover Letter", key="cover_letter_btn"):
309
- if job_opening_text and cv_resume_text:
310
- with st.spinner("Generating..."):
311
- try:
312
- generated_letter = chains['cover_letter'].run({
313
- **resume_info,
314
- "job_title": job_title,
315
- "company": company_name,
316
- "key_skills": key_skills
317
- })
318
- st.session_state.generated_content['cover_letter'] = clean_output(generated_letter, "cover_letter")
319
- except Exception as e:
320
- st.error(f"Generation error: {e}")
321
- else:
322
- st.error("Please provide all required inputs")
323
-
324
- if st.session_state.generated_content['cover_letter']:
325
- st.markdown('<div class="output-container">', unsafe_allow_html=True)
326
- st.markdown(st.session_state.generated_content['cover_letter'])
327
- st.download_button("Download Cover Letter", st.session_state.generated_content['cover_letter'],
328
- file_name="cover_letter.txt", key="cover_letter_download")
329
- st.markdown('</div>', unsafe_allow_html=True)
330
-
331
- # Research Statement Tab
332
- with tab3:
333
- key_projects = st.text_input("Key Research Projects")
334
- future_goals = st.text_input("Future Research Goals")
335
-
336
- if st.button("Generate Research Statement", key="research_stmt_btn"):
337
- if cv_resume_text:
338
- with st.spinner("Generating..."):
339
- try:
340
- generated_statement = chains['research_statement'].run({
341
- **resume_info,
342
- "key_projects": key_projects,
343
- "future_goals": future_goals
344
- })
345
- st.session_state.generated_content['research_statement'] = clean_output(generated_statement, "research_statement")
346
- except Exception as e:
347
- st.error(f"Generation error: {e}")
348
- else:
349
- st.error("Please upload your CV/Resume")
350
-
351
- if st.session_state.generated_content['research_statement']:
352
- st.markdown('<div class="output-container">', unsafe_allow_html=True)
353
- st.markdown(st.session_state.generated_content['research_statement'])
354
- st.download_button("Download Research Statement", st.session_state.generated_content['research_statement'],
355
- file_name="research_statement.txt", key="research_stmt_download")
356
- st.markdown('</div>', unsafe_allow_html=True)
357
-
358
- # SOP Tab
359
- with tab4:
360
- motivation = st.text_input("Motivation for Graduate Studies")
361
- career_goals = st.text_input("Career Goals")
362
- why_this_program = st.text_input("Why This Program")
363
-
364
- if st.button("Generate SOP", key="sop_btn"):
365
- if cv_resume_text:
366
- with st.spinner("Generating..."):
367
- try:
368
- generated_sop = chains['sop'].run({
369
- **resume_info,
370
- "motivation": motivation,
371
- "career_goals": career_goals,
372
- "why_this_program": why_this_program
373
- })
374
- st.session_state.generated_content['sop'] = clean_output(generated_sop, "sop")
375
- except Exception as e:
376
- st.error(f"Generation error: {e}")
377
- else:
378
- st.error("Please upload your CV/Resume")
379
-
380
- if st.session_state.generated_content['sop']:
381
- st.markdown('<div class="output-container">', unsafe_allow_html=True)
382
- st.markdown(st.session_state.generated_content['sop'])
383
- st.download_button("Download SOP", st.session_state.generated_content['sop'],
384
- file_name="sop.txt", key="sop_download")
385
- st.markdown('</div>', unsafe_allow_html=True)
386
-
387
- # Reset Button
388
- if st.sidebar.button("🔄 Reset All"):
389
- st.session_state.generated_content = {key: None for key in st.session_state.generated_content}
390
- st.experimental_rerun()
 
 
 
 
 
 
 
1
  import os
2
+ import streamlit as st
3
+ import PyPDF2
4
+ from langchain_community.llms import HuggingFaceHub
5
+
6
# Streamlit page configuration
st.set_page_config(
    page_title="Research Position Application Generator",
    page_icon="🔬",
)

# Set Hugging Face API key from the Streamlit secrets store
os.environ["HUGGINGFACEHUB_API_TOKEN"] = st.secrets["HF_TOKEN"]

# Initialize the shared LLM client (model and sampling settings named once)
_LLM_REPO_ID = "mistralai/Mistral-7B-Instruct-v0.3"
_LLM_MODEL_KWARGS = {"temperature": 0.5}
llm = HuggingFaceHub(repo_id=_LLM_REPO_ID, model_kwargs=_LLM_MODEL_KWARGS)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
def extract_text_from_pdf(uploaded_file):
    """
    Extract text from an uploaded PDF file.

    Args:
        uploaded_file (UploadedFile): PDF file uploaded by the user

    Returns:
        str: Text of every page joined with newlines, or "" when the PDF
        could not be read (the error is reported through st.error).
    """
    try:
        pdf_reader = PyPDF2.PdfReader(uploaded_file)
        # Join pages with a newline so the last word of one page is not fused
        # with the first word of the next; `or ""` keeps a page that yields no
        # text from breaking the join.
        return "\n".join(page.extract_text() or "" for page in pdf_reader.pages)
    except Exception as e:
        # Best-effort: surface the problem in the UI and let the caller see "".
        st.error(f"Error extracting PDF text: {e}")
        return ""
 
37
 
38
def generate_cold_email(position_details, cv_text):
    """
    Build the cold-email prompt and return the LLM's response to it.

    Args:
        position_details (dict): Must contain the keys 'professor_name',
            'university' and 'research_focus'
        cv_text (str): Text extracted from the CV/resume

    Returns:
        str: Whatever llm.invoke returns for the assembled prompt
    """
    professor = position_details['professor_name']
    campus = position_details['university']
    focus = position_details['research_focus']

    # One entry per prompt line; joined with newlines below.
    prompt_lines = [
        f"Write a professional and concise cold email to Professor {professor}",
        f"at {campus} about the research position in {focus}.",
        "The email should:",
        "1. Demonstrate knowledge of the professor's research",
        "2. Highlight relevant experience from the CV",
        "3. Express genuine interest in the position",
        "4. Be no more than 250 words",
        "",
        "CV Details:",
        f"{cv_text}",
        "",
        "Research Position Details:",
        f"Research Focus: {focus}",
        f"Professor: {professor}",
        f"University: {campus}",
    ]
    prompt = "\n".join(prompt_lines) + "\n"

    return llm.invoke(prompt)
67
+
68
def generate_cover_letter(position_details, cv_text):
    """
    Build the cover-letter prompt and return the LLM's response to it.

    Args:
        position_details (dict): Must contain the keys 'research_focus'
            and 'university'
        cv_text (str): Text extracted from the CV/resume

    Returns:
        str: Whatever llm.invoke returns for the assembled prompt
    """
    focus = position_details['research_focus']
    campus = position_details['university']

    # One entry per prompt line; joined with newlines below.
    prompt_lines = [
        "Write a professional and formal cover letter for a research position with the following details:",
        f"Research Focus: {focus}",
        f"University: {campus}",
        "",
        "The cover letter should:",
        "1. Follow a standard business letter format",
        "2. Clearly state the purpose of the letter",
        "3. Highlight relevant skills and experiences from the CV",
        "4. Demonstrate alignment with the research position",
        "5. Be 300-400 words long",
        "6. Include a strong closing paragraph",
        "",
        "CV Details:",
        f"{cv_text}",
    ]
    prompt = "\n".join(prompt_lines) + "\n"

    return llm.invoke(prompt)
96
+
97
def main():
    """
    Main Streamlit app function.

    Collects the position details and a PDF CV from the sidebar, generates a
    cold email and a cover letter when "Generate Documents" is pressed, and
    renders both with download buttons. The generated texts are kept in
    st.session_state so they are still shown on the rerun that Streamlit
    performs when a download button is clicked.
    """
    st.title("🔬 Research Position Application Generator")

    # Sidebar for position details
    st.sidebar.header("Research Position Details")
    professor_name = st.sidebar.text_input("Professor's Name")
    university = st.sidebar.text_input("University")
    research_focus = st.sidebar.text_input("Research Focus")

    # CV Upload
    st.sidebar.header("Upload CV/Resume")
    uploaded_cv = st.sidebar.file_uploader("Choose a PDF file", type="pdf")

    # Generate button
    if st.sidebar.button("Generate Documents"):
        # Validate inputs
        if not (professor_name and university and research_focus and uploaded_cv):
            st.error("Please fill in all details and upload a CV")
            return

        # Extract CV text; stop if nothing usable came back so the LLM is not
        # asked to write about an empty CV (e.g. scanned/image-only PDFs).
        cv_text = extract_text_from_pdf(uploaded_cv)
        if not cv_text.strip():
            st.error("No text could be extracted from the uploaded CV")
            return

        # Prepare position details
        position_details = {
            'professor_name': professor_name,
            'university': university,
            'research_focus': research_focus,
        }

        # Generate documents; report API/network failures instead of crashing.
        with st.spinner('Generating documents...'):
            try:
                cold_email = generate_cold_email(position_details, cv_text)
                cover_letter = generate_cover_letter(position_details, cv_text)
            except Exception as e:
                st.error(f"Generation error: {e}")
                return

        # Persist the results: the button is only True on the run right after
        # the click, so anything not stored here is lost on the next rerun.
        st.session_state["generated_documents"] = {
            "cold_email": cold_email,
            "cover_letter": cover_letter,
        }

    if "generated_documents" not in st.session_state:
        return
    documents = st.session_state["generated_documents"]

    # Display results
    st.header("Generated Documents")

    # Cold Email
    st.subheader("Cold Email")
    st.write(documents["cold_email"])
    st.download_button(
        label="Download Cold Email",
        data=documents["cold_email"],
        file_name="cold_email.txt",
        mime="text/plain",
    )

    # Cover Letter
    st.subheader("Cover Letter")
    st.write(documents["cover_letter"])
    st.download_button(
        label="Download Cover Letter",
        data=documents["cover_letter"],
        file_name="cover_letter.txt",
        mime="text/plain",
    )


if __name__ == "__main__":
    main()