Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,7 +1,9 @@
|
|
| 1 |
import streamlit as st
|
| 2 |
import pandas as pd
|
|
|
|
| 3 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 4 |
-
from sklearn.metrics.pairwise import cosine_similarity
|
|
|
|
| 5 |
import re
|
| 6 |
from PyPDF2 import PdfReader
|
| 7 |
|
|
@@ -22,35 +24,44 @@ def clean_text(text):
|
|
| 22 |
text = re.sub(r'\W', ' ', text)
|
| 23 |
return text.lower()
|
| 24 |
|
| 25 |
-
def
|
| 26 |
tfidf_vectorizer = TfidfVectorizer()
|
| 27 |
tfidf_matrix = tfidf_vectorizer.fit_transform(resumes + [keywords])
|
| 28 |
-
|
| 29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
st.title("Resume Analyzer")
|
| 32 |
|
| 33 |
st.sidebar.subheader("Enter Keywords and Priority")
|
| 34 |
-
|
| 35 |
data = pd.DataFrame({
|
| 36 |
'Keyword': ['']*10,
|
| 37 |
'Priority': ['']*10
|
| 38 |
})
|
| 39 |
-
|
| 40 |
keywords_df = st.sidebar.data_editor(data, num_rows="dynamic", key="keyword_table")
|
| 41 |
|
| 42 |
if not keywords_df['Keyword'].isnull().all():
|
| 43 |
keywords_combined = " ".join(keywords_df.apply(lambda row: f"{row['Keyword']} " * int(row['Priority']) if row['Priority'].isdigit() else row['Keyword'], axis=1))
|
| 44 |
-
|
| 45 |
st.subheader("Upload up to 5 resumes (PDF or Text files)")
|
| 46 |
uploaded_files = st.file_uploader("Choose Resume Files", accept_multiple_files=True, type=["txt", "pdf"])
|
| 47 |
-
|
| 48 |
if len(uploaded_files) > 0 and keywords_combined:
|
| 49 |
with st.spinner("Analyzing Resumes..."):
|
| 50 |
resumes = []
|
| 51 |
for file in uploaded_files:
|
| 52 |
try:
|
| 53 |
-
|
| 54 |
resume_text = extract_text_from_file(file)
|
| 55 |
clean_resume = clean_text(resume_text)
|
| 56 |
resumes.append(clean_resume)
|
|
@@ -59,13 +70,26 @@ if not keywords_df['Keyword'].isnull().all():
|
|
| 59 |
|
| 60 |
clean_keywords = clean_text(keywords_combined)
|
| 61 |
|
| 62 |
-
|
| 63 |
|
| 64 |
st.subheader("Resume Analysis Results")
|
| 65 |
results_df = pd.DataFrame({
|
| 66 |
'Resume': [file.name for file in uploaded_files],
|
| 67 |
-
'Similarity
|
|
|
|
|
|
|
| 68 |
})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
st.dataframe(results_df)
|
| 70 |
else:
|
| 71 |
-
st.info("Please upload resumes and enter keywords with priority.")
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
import pandas as pd
|
| 3 |
+
import numpy as np
|
| 4 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 5 |
+
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
|
| 6 |
+
from sklearn.preprocessing import MinMaxScaler
|
| 7 |
import re
|
| 8 |
from PyPDF2 import PdfReader
|
| 9 |
|
|
|
|
| 24 |
text = re.sub(r'\W', ' ', text)
|
| 25 |
return text.lower()
|
| 26 |
|
| 27 |
+
def calculate_similarity_metrics(resumes, keywords):
    """Score each resume against the keyword string with three similarity measures.

    Parameters
    ----------
    resumes : list[str]
        Cleaned resume texts (one string per uploaded resume).
    keywords : str
        Cleaned, priority-weighted keyword string.

    Returns
    -------
    tuple
        (cosine_sim, jaccard_sim, euclidean_sim) — one score per resume,
        in the same order as `resumes`. Cosine and Euclidean scores are
        numpy arrays; Jaccard scores are a plain list of floats.
    """
    # Fit TF-IDF on resumes plus the keyword string together so every
    # document shares one vocabulary; the keyword doc is the LAST row.
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(resumes + [keywords])

    # Cosine similarity of the keyword row against every resume row.
    cosine_sim = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1]).flatten()

    def jaccard_similarity(doc1, doc2):
        # |intersection| / |union| of the word sets. Guard the empty-union
        # case (both docs tokenize to nothing, e.g. a blank/scanned PDF):
        # the original raised ZeroDivisionError here; return 0.0 instead.
        set1 = set(doc1.split())
        set2 = set(doc2.split())
        union = set1 | set2
        if not union:
            return 0.0
        return len(set1 & set2) / len(union)

    jaccard_sim = [jaccard_similarity(keywords, resume) for resume in resumes]

    # Map Euclidean distance to a (0, 1] similarity: identical vectors -> 1,
    # larger distances decay toward 0.
    euclidean_dist = euclidean_distances(tfidf_matrix[-1], tfidf_matrix[:-1]).flatten()
    euclidean_sim = 1 / (1 + euclidean_dist)

    return cosine_sim, jaccard_sim, euclidean_sim
|
| 44 |
|
| 45 |
st.title("Resume Analyzer")
|
| 46 |
|
| 47 |
st.sidebar.subheader("Enter Keywords and Priority")
|
|
|
|
| 48 |
data = pd.DataFrame({
|
| 49 |
'Keyword': ['']*10,
|
| 50 |
'Priority': ['']*10
|
| 51 |
})
|
|
|
|
| 52 |
keywords_df = st.sidebar.data_editor(data, num_rows="dynamic", key="keyword_table")
|
| 53 |
|
| 54 |
if not keywords_df['Keyword'].isnull().all():
|
| 55 |
keywords_combined = " ".join(keywords_df.apply(lambda row: f"{row['Keyword']} " * int(row['Priority']) if row['Priority'].isdigit() else row['Keyword'], axis=1))
|
| 56 |
+
|
| 57 |
st.subheader("Upload up to 5 resumes (PDF or Text files)")
|
| 58 |
uploaded_files = st.file_uploader("Choose Resume Files", accept_multiple_files=True, type=["txt", "pdf"])
|
| 59 |
+
|
| 60 |
if len(uploaded_files) > 0 and keywords_combined:
|
| 61 |
with st.spinner("Analyzing Resumes..."):
|
| 62 |
resumes = []
|
| 63 |
for file in uploaded_files:
|
| 64 |
try:
|
|
|
|
| 65 |
resume_text = extract_text_from_file(file)
|
| 66 |
clean_resume = clean_text(resume_text)
|
| 67 |
resumes.append(clean_resume)
|
|
|
|
| 70 |
|
| 71 |
clean_keywords = clean_text(keywords_combined)
|
| 72 |
|
| 73 |
+
cosine_scores, jaccard_scores, euclidean_scores = calculate_similarity_metrics(resumes, clean_keywords)
|
| 74 |
|
| 75 |
st.subheader("Resume Analysis Results")
|
| 76 |
results_df = pd.DataFrame({
|
| 77 |
'Resume': [file.name for file in uploaded_files],
|
| 78 |
+
'Cosine Similarity': cosine_scores,
|
| 79 |
+
'Jaccard Index': jaccard_scores,
|
| 80 |
+
'Euclidean Similarity': euclidean_scores
|
| 81 |
})
|
| 82 |
+
|
| 83 |
+
scaler = MinMaxScaler()
|
| 84 |
+
normalized_scores = scaler.fit_transform(results_df[['Cosine Similarity', 'Jaccard Index', 'Euclidean Similarity']])
|
| 85 |
+
|
| 86 |
+
overall_scores = np.mean(normalized_scores, axis=1)
|
| 87 |
+
results_df['Overall Score'] = overall_scores
|
| 88 |
+
|
| 89 |
+
results_df['Rank'] = results_df['Overall Score'].rank(ascending=False, method='min').astype(int)
|
| 90 |
+
|
| 91 |
+
results_df = results_df.sort_values('Rank')
|
| 92 |
+
|
| 93 |
st.dataframe(results_df)
|
| 94 |
else:
|
| 95 |
+
st.info("Please upload resumes and enter keywords with priority.")
|