# Source: kushagrasharma-13, commit d5cc663 ("First")
import re
from utils import extract_text_from_pdf
import streamlit as st
# Typographic characters that PDF extraction commonly yields, mapped to the
# plain-ASCII form the rest of the parser expects. Built once at import time
# so clean_text() is a single C-level pass over the string.
_ASCII_TRANSLATION = str.maketrans({
    '\u2013': '-',   # en dash
    '\u2014': '-',   # em dash
    '\u201c': '"',   # left double quotation mark
    '\u201d': '"',   # right double quotation mark
    '\u2018': "'",   # left single quotation mark
    '\u2019': "'",   # right single quotation mark
    '\u2022': '',    # bullet: removed outright
})


def clean_text(text):
    """Return *text* with typographic punctuation normalized to ASCII.

    Dashes become ``-``, curly double quotes become ``"``, curly single
    quotes (both left and right) become ``'``, and bullet characters are
    removed. Whitespace is left untouched, so callers that need trimmed
    text should strip the result themselves.

    Args:
        text: The string to normalize.

    Returns:
        A new string with the replacements applied.
    """
    return text.translate(_ASCII_TRANSLATION)
# Compiled once at import time; extract_contact_info() runs once per line.
_EMAIL_RE = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
_LINKEDIN_RE = re.compile(r'linkedin\.com/in/[A-Za-z0-9_-]+', re.IGNORECASE)
_GITHUB_RE = re.compile(r'github\.com/[A-Za-z0-9_-]+', re.IGNORECASE)
# Optional "+", optional "(", then digits mixed with spaces, dots, hyphens
# and parentheses, ending on a digit.
_PHONE_RE = re.compile(r'\+?\(?\d[\d\s().\-]*\d')
# Candidates with fewer digits (house numbers, short codes) are not phones.
_MIN_PHONE_DIGITS = 7


def extract_contact_info(line, resume_json):
    """Fill missing contact fields in *resume_json* from one line of text.

    The whole line is searched for an e-mail address, a LinkedIn profile
    URL, a GitHub URL and a phone number. The line is deliberately NOT split
    on hyphens or commas first: those characters occur inside phone numbers
    (``+91-98765-43210``), e-mail addresses and profile slugs, and splitting
    on them truncates the extracted value.

    Args:
        line: A single line of resume text.
        resume_json: The resume dict being built. Its
            ``"Contact Information"`` entry must be a dict; it is updated
            in place. Fields that are already present are never overwritten.

    Returns:
        None. The result is written into ``resume_json``.
    """
    contact = resume_json["Contact Information"]
    remainder = line

    for field, pattern in (
        ("Email", _EMAIL_RE),
        ("LinkedIn", _LINKEDIN_RE),
        ("GitHub", _GITHUB_RE),
    ):
        found = pattern.search(remainder)
        if found and field not in contact:
            contact[field] = found.group()
        # Blank out every e-mail/URL so digits embedded in them cannot be
        # mistaken for a phone number below.
        remainder = pattern.sub(" ", remainder)

    if "Phone" not in contact:
        for candidate in _PHONE_RE.finditer(remainder):
            number = candidate.group()
            if sum(ch.isdigit() for ch in number) >= _MIN_PHONE_DIGITS:
                contact["Phone"] = number
                break
# Heading phrases that introduce each output section.
_SECTION_KEYWORDS = {
    "Contact Information": ["Contact Information", "Contact Info", "Contact"],
    "Professional Experience": ["Professional Experience", "Work Experience", "Experience"],
    "Projects": ["Projects", "Project"],
    "Skills": ["Skills", "Technical Skills"],
    "Education": ["Education", "Academic Background"],
    "Achievements": ["Achievements", "Awards", "Honors"],
    "Extra-Curricular Activities": ["Extra-Curricular Activities", "Extracurricular Activities", "Activities", "Volunteer Experience"]
}

# (lower-cased keyword, section) pairs, longest keyword first, so that a
# specific phrase such as "Volunteer Experience" wins over the generic
# "Experience" regardless of dictionary order.
_HEADER_KEYWORDS = sorted(
    (
        (keyword.lower(), section_name)
        for section_name, keywords in _SECTION_KEYWORDS.items()
        for keyword in keywords
    ),
    key=lambda pair: len(pair[0]),
    reverse=True,
)

# Headings are short; a longer line that merely mentions "experience" or
# "projects" is body text and must not be swallowed as a heading.
_MAX_HEADER_WORDS = 4

# Substrings (compared lower-case) suggesting a line carries contact details.
_CONTACT_HINTS = ["@", "linkedin", "github", "phone", "+91"]

_CONTACT_SECTION = "Contact Information"


def _detect_section(line):
    """Return the section that heading *line* introduces, or None for body text."""
    if len(line.split()) > _MAX_HEADER_WORDS:
        return None
    heading = line.lower()
    for keyword, section_name in _HEADER_KEYWORDS:
        # Whole-word match: "project" must not fire inside e.g. "projection".
        if re.search(r'(?<!\w)' + re.escape(keyword) + r'(?!\w)', heading):
            return section_name
    return None


def parse_resume(resume_text):
    """Split plain resume text into a dict of named sections.

    The text is processed line by line. A short line containing a known
    heading phrase switches the current section and is itself discarded.
    Every other non-empty line is either mined for contact details or
    appended to the current section's list. Lines that appear before any
    heading and do not look like contact details are ignored.

    Args:
        resume_text: The resume as a single newline-separated string.

    Returns:
        A dict with the keys "Contact Information" (a dict that always has
        "Phone", "Email", "LinkedIn" and "GitHub", each either the extracted
        value or "Not Provided") and "Professional Experience", "Projects",
        "Skills", "Education", "Achievements" and
        "Extra-Curricular Activities" (each a list of text lines).
    """
    resume_json = {
        "Contact Information": {},
        "Professional Experience": [],
        "Projects": [],
        "Skills": [],
        "Education": [],
        "Achievements": [],
        "Extra-Curricular Activities": []
    }

    section = None
    for raw_line in resume_text.split('\n'):
        # Strip AFTER cleaning so the space left behind by a removed bullet
        # does not end up at the start of the stored line.
        line = clean_text(raw_line).strip()
        if not line:
            continue

        header = _detect_section(line)
        if header is not None:
            section = header
            continue

        lowered = line.lower()
        looks_like_contact = any(hint in lowered for hint in _CONTACT_HINTS)

        if section == _CONTACT_SECTION or (section is None and looks_like_contact):
            # A resume usually opens with an un-headed contact block; once
            # it starts, stay in it until the next real heading.
            section = _CONTACT_SECTION
            extract_contact_info(line, resume_json)
        elif section is not None:
            # Keep the line in its own section. A stray "@" or "github" in a
            # project or experience entry must not hijack the section and
            # swallow every line that follows.
            resume_json[section].append(line)
            if looks_like_contact:
                # Still pick up contact details that were not found yet.
                extract_contact_info(line, resume_json)

    contact = resume_json[_CONTACT_SECTION]
    for field in ("Phone", "Email", "LinkedIn", "GitHub"):
        contact.setdefault(field, "Not Provided")
    return resume_json
# --- Streamlit page: upload a PDF resume and show the parsed JSON ----------
st.title("Resume Parser to JSON")
uploaded_file = st.file_uploader("Choose a PDF file", type="pdf")
# Bundled sample resume used until the user uploads a file of their own.
default_file_path = "Kushagra_Sharma_Resume.pdf"

if uploaded_file is not None:
    resume_text = extract_text_from_pdf(uploaded_file)
    file_name = uploaded_file.name
else:
    try:
        with open(default_file_path, "rb") as file:
            resume_text = extract_text_from_pdf(file)
    except FileNotFoundError:
        # The sample file is optional: without it, ask for an upload
        # instead of crashing the page with a traceback.
        st.warning(
            f"Default file '{default_file_path}' was not found. "
            "Please upload a PDF resume."
        )
        st.stop()
    file_name = default_file_path

parsed_resume = parse_resume(resume_text)
st.text(f"Currently using file: {file_name}")
st.json(parsed_resume)