|
import re |
|
from utils import extract_text_from_pdf |
|
import streamlit as st |
|
|
|
def clean_text(text):
    """Return *text* with typographic punctuation swapped for plain ASCII.

    En and em dashes become ``-``, curly double quotes become ``"``, the
    right single quotation mark becomes ``'``, and bullet characters are
    removed. All other characters are left untouched.
    """
    # One translation table handles every substitution in a single pass.
    ascii_table = str.maketrans({
        '\u2013': '-',
        '\u2014': '-',
        '\u201c': '"',
        '\u201d': '"',
        '\u2022': '',
        '\u2019': "'",
    })
    return text.translate(ascii_table)
|
|
|
def extract_contact_info(line, resume_json):
    """Pull e-mail, LinkedIn, GitHub and phone details out of one text line.

    Values are stored in ``resume_json["Contact Information"]`` under the keys
    ``"Email"``, ``"LinkedIn"``, ``"GitHub"`` and ``"Phone"``. A key that is
    already present is never overwritten, so the first value found wins.

    Args:
        line: A single line of resume text.
        resume_json: Dict holding a ``"Contact Information"`` dict; it is
            mutated in place.

    Returns:
        None.
    """
    email_pattern = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
    linkedin_pattern = re.compile(r'linkedin\.com/in/[A-Za-z0-9_-]+', re.IGNORECASE)
    github_pattern = re.compile(r'github\.com/[A-Za-z0-9_-]+', re.IGNORECASE)
    phone_number_pattern = re.compile(r'\+?\d[\d\s\-]+\d')
    # Shorter digit runs (house numbers, years, ...) are not treated as phones.
    min_phone_digits = 7

    contact = resume_json["Contact Information"]

    # The whole line is searched rather than pieces split on "-" or ",":
    # splitting truncated hyphenated phone numbers, e-mails and profile URLs.
    remainder = line
    for field, pattern in (
        ("Email", email_pattern),
        ("LinkedIn", linkedin_pattern),
        ("GitHub", github_pattern),
    ):
        match = pattern.search(remainder)
        if match is None:
            continue
        contact.setdefault(field, match.group())
        # Blank out the matches so digits inside an address or URL cannot be
        # mistaken for a phone number below.
        remainder = pattern.sub(" ", remainder)

    if "Phone" in contact:
        return

    for match in phone_number_pattern.finditer(remainder):
        candidate = match.group()
        if sum(char.isdigit() for char in candidate) >= min_phone_digits:
            contact["Phone"] = candidate
            break
|
|
|
def parse_resume(resume_text, max_header_words=5):
    """Split plain resume text into contact details and named sections.

    The text is processed line by line. A line is treated as a section header
    when it is short (at most ``max_header_words`` words), is not a list item,
    carries no contact marker, and contains one of the known section keywords
    (case-insensitive). Header lines select the current section and are not
    stored. Every other line is either mined for contact details or appended
    to the current section's list.

    Args:
        resume_text: Full resume text with lines separated by ``"\\n"``.
            ``None`` or an empty string yields empty sections.
        max_header_words: Longest line, in whitespace-separated words, that
            may still be considered a section header.

    Returns:
        A dict with a ``"Contact Information"`` dict (keys ``"Phone"``,
        ``"Email"``, ``"LinkedIn"``, ``"GitHub"``; ``"Not Provided"`` when
        nothing was found) and one list of text lines per section.
    """
    resume_json = {
        "Contact Information": {},
        "Professional Experience": [],
        "Projects": [],
        "Skills": [],
        "Education": [],
        "Achievements": [],
        "Extra-Curricular Activities": [],
    }

    section_keywords = {
        "Contact Information": ["Contact Information", "Contact Info", "Contact"],
        "Professional Experience": ["Professional Experience", "Work Experience", "Experience"],
        "Projects": ["Projects", "Project"],
        "Skills": ["Skills", "Technical Skills"],
        "Education": ["Education", "Academic Background"],
        "Achievements": ["Achievements", "Awards", "Honors"],
        "Extra-Curricular Activities": ["Extra-Curricular Activities", "Extracurricular Activities", "Activities", "Volunteer Experience"],
    }

    contact_info_patterns = ["@", "linkedin", "github", "phone", "+91"]
    bullet_markers = ("\u2022", "- ", "* ")

    # Longest keyword first, so a specific heading such as
    # "Volunteer Experience" is not captured by the shorter "Experience".
    keyword_index = []
    for key, keywords in section_keywords.items():
        for keyword in keywords:
            keyword_index.append((keyword.lower(), key))
    keyword_index.sort(key=lambda pair: len(pair[0]), reverse=True)

    section = None

    for raw_line in (resume_text or "").split('\n'):
        stripped = raw_line.strip()
        # Strip again after cleaning: removing a leading bullet leaves a space.
        line = clean_text(stripped).strip()
        if not line:
            continue

        lowered = line.lower()
        has_contact_marker = any(pattern in lowered for pattern in contact_info_patterns)

        # Only short, non-bullet lines without contact data can be headers;
        # otherwise body text that merely mentions "experience" or "skills"
        # would be dropped and would switch the section.
        looks_like_header = (
            not has_contact_marker
            and not stripped.startswith(bullet_markers)
            and len(line.split()) <= max_header_words
        )
        if looks_like_header:
            header_section = next(
                (key for keyword, key in keyword_index if keyword in lowered),
                None,
            )
            if header_section is not None:
                section = header_section
                continue

        if section == "Contact Information" or (section is None and has_contact_marker):
            # Contact details that appear before any heading open an implicit
            # contact block that lasts until the next section header.
            section = "Contact Information"
            extract_contact_info(line, resume_json)
        elif section is not None:
            if has_contact_marker:
                # Still harvest contact details, but stay in the current
                # section so the following lines are not swallowed.
                extract_contact_info(line, resume_json)
            resume_json[section].append(line)

    for field in ("Phone", "Email", "LinkedIn", "GitHub"):
        resume_json["Contact Information"].setdefault(field, "Not Provided")

    return resume_json
|
|
|
# Streamlit page: parse an uploaded resume PDF (or the default file path when
# nothing is uploaded) and display the result as JSON.
st.title("Resume Parser to JSON")

uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
default_file_path = "Kushagra_Sharma_Resume.pdf"

if uploaded_file is not None:
    resume_text = extract_text_from_pdf(uploaded_file)
    file_name = uploaded_file.name
else:
    try:
        with open(default_file_path, "rb") as file:
            resume_text = extract_text_from_pdf(file)
    except FileNotFoundError:
        # Without the default file there is nothing to parse; ask for an
        # upload instead of surfacing a traceback, and end this script run.
        st.warning(
            f"Default file '{default_file_path}' was not found. "
            "Please upload a PDF file."
        )
        st.stop()
    file_name = default_file_path

parsed_resume = parse_resume(resume_text)
st.text(f"Currently using file: {file_name}")
st.json(parsed_resume)
|
|