Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -14,7 +14,7 @@ from sklearn.preprocessing import LabelEncoder
|
|
14 |
|
15 |
|
16 |
def cleanResume(resumeText):
|
17 |
-
|
18 |
resumeText = re.sub('http\S+\s*', ' ', resumeText)
|
19 |
resumeText = re.sub('RT|cc', ' ', resumeText)
|
20 |
resumeText = re.sub('#\S+', '', resumeText)
|
@@ -25,35 +25,35 @@ def cleanResume(resumeText):
|
|
25 |
return resumeText
|
26 |
|
27 |
def pdf_to_text(file):
|
28 |
-
|
29 |
text = extract_text(file)
|
30 |
return text
|
31 |
|
32 |
def predict_category(resumes_data, selected_category,max_sequence_length):
|
33 |
-
|
34 |
model = load_deeprank_model(max_sequence_length)
|
35 |
|
36 |
-
|
37 |
resumes_df = pd.DataFrame(resumes_data)
|
38 |
resumes_text = resumes_df['ResumeText'].values
|
39 |
|
40 |
-
|
41 |
tokenized_text = tokenizer.texts_to_sequences(resumes_text)
|
42 |
|
43 |
-
|
44 |
max_sequence_length = 500 # Assuming maximum sequence length of 500 words
|
45 |
padded_text = pad_sequences(tokenized_text, maxlen=max_sequence_length)
|
46 |
|
47 |
-
|
48 |
predicted_probs = model.predict(padded_text)
|
49 |
|
50 |
-
|
51 |
for i, category in enumerate(label.classes_):
|
52 |
resumes_df[category] = predicted_probs[:, i]
|
53 |
|
54 |
resumes_df_sorted = resumes_df.sort_values(by=selected_category, ascending=False)
|
55 |
|
56 |
-
|
57 |
ranks = []
|
58 |
for rank, (idx, row) in enumerate(resumes_df_sorted.iterrows()):
|
59 |
rank = rank + 1
|
@@ -63,9 +63,9 @@ def predict_category(resumes_data, selected_category,max_sequence_length):
|
|
63 |
return ranks
|
64 |
|
65 |
def load_deeprank_model(max_sequence_length):
|
66 |
-
|
67 |
model = Sequential()
|
68 |
-
|
69 |
model.add(Embedding(input_dim=vocab_size, output_dim=128, input_length=max_sequence_length))
|
70 |
model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
|
71 |
model.add(MaxPooling1D(pool_size=2))
|
@@ -82,7 +82,7 @@ def main():
|
|
82 |
resumes_data = []
|
83 |
selected_category = ""
|
84 |
|
85 |
-
|
86 |
files = st.file_uploader("Upload resumes", type=["pdf"], accept_multiple_files=True)
|
87 |
if files:
|
88 |
for file in files:
|
@@ -98,13 +98,13 @@ def main():
|
|
98 |
st.write(pd.DataFrame(ranks))
|
99 |
|
100 |
if __name__ == '__main__':
|
101 |
-
|
102 |
df = pd.read_csv('UpdatedResumeDataSet.csv')
|
103 |
df['cleaned'] = df['Resume'].apply(lambda x: cleanResume(x))
|
104 |
label = LabelEncoder()
|
105 |
df['Category'] = label.fit_transform(df['Category'])
|
106 |
|
107 |
-
|
108 |
text = df['cleaned'].values
|
109 |
#text=df['Resume'].values
|
110 |
tokenizer = Tokenizer()
|
|
|
14 |
|
15 |
|
16 |
def cleanResume(resumeText):
|
17 |
+
|
18 |
resumeText = re.sub('http\S+\s*', ' ', resumeText)
|
19 |
resumeText = re.sub('RT|cc', ' ', resumeText)
|
20 |
resumeText = re.sub('#\S+', '', resumeText)
|
|
|
25 |
return resumeText
|
26 |
|
27 |
def pdf_to_text(file):
|
28 |
+
|
29 |
text = extract_text(file)
|
30 |
return text
|
31 |
|
32 |
def predict_category(resumes_data, selected_category,max_sequence_length):
|
33 |
+
|
34 |
model = load_deeprank_model(max_sequence_length)
|
35 |
|
36 |
+
|
37 |
resumes_df = pd.DataFrame(resumes_data)
|
38 |
resumes_text = resumes_df['ResumeText'].values
|
39 |
|
40 |
+
|
41 |
tokenized_text = tokenizer.texts_to_sequences(resumes_text)
|
42 |
|
43 |
+
|
44 |
max_sequence_length = 500 # Assuming maximum sequence length of 500 words
|
45 |
padded_text = pad_sequences(tokenized_text, maxlen=max_sequence_length)
|
46 |
|
47 |
+
|
48 |
predicted_probs = model.predict(padded_text)
|
49 |
|
50 |
+
|
51 |
for i, category in enumerate(label.classes_):
|
52 |
resumes_df[category] = predicted_probs[:, i]
|
53 |
|
54 |
resumes_df_sorted = resumes_df.sort_values(by=selected_category, ascending=False)
|
55 |
|
56 |
+
|
57 |
ranks = []
|
58 |
for rank, (idx, row) in enumerate(resumes_df_sorted.iterrows()):
|
59 |
rank = rank + 1
|
|
|
63 |
return ranks
|
64 |
|
65 |
def load_deeprank_model(max_sequence_length):
|
66 |
+
|
67 |
model = Sequential()
|
68 |
+
|
69 |
model.add(Embedding(input_dim=vocab_size, output_dim=128, input_length=max_sequence_length))
|
70 |
model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
|
71 |
model.add(MaxPooling1D(pool_size=2))
|
|
|
82 |
resumes_data = []
|
83 |
selected_category = ""
|
84 |
|
85 |
+
|
86 |
files = st.file_uploader("Upload resumes", type=["pdf"], accept_multiple_files=True)
|
87 |
if files:
|
88 |
for file in files:
|
|
|
98 |
st.write(pd.DataFrame(ranks))
|
99 |
|
100 |
if __name__ == '__main__':
|
101 |
+
|
102 |
df = pd.read_csv('UpdatedResumeDataSet.csv')
|
103 |
df['cleaned'] = df['Resume'].apply(lambda x: cleanResume(x))
|
104 |
label = LabelEncoder()
|
105 |
df['Category'] = label.fit_transform(df['Category'])
|
106 |
|
107 |
+
|
108 |
text = df['cleaned'].values
|
109 |
#text=df['Resume'].values
|
110 |
tokenizer = Tokenizer()
|