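# Resume Ranking App (Streamlit)
# Run with:  streamlit run app.py   (assuming this file is saved as app.py)
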
import streamlit as st
import pandas as pd
import re
from pdfminer.high_level import extract_text
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder


def cleanResume(resumeText):
    """Strip URLs, mentions, hashtags, punctuation and non-ASCII noise from resume text."""
    resumeText = re.sub(r'http\S+\s*', ' ', resumeText)   # remove URLs
    resumeText = re.sub(r'RT|cc', ' ', resumeText)         # remove RT and cc tokens
    resumeText = re.sub(r'#\S+', '', resumeText)           # remove hashtags
    resumeText = re.sub(r'@\S+', ' ', resumeText)          # remove mentions
    resumeText = re.sub('[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', resumeText)  # remove punctuation
    resumeText = re.sub(r'[^\x00-\x7f]', ' ', resumeText)  # remove non-ASCII characters
    resumeText = re.sub(r'\s+', ' ', resumeText)           # collapse repeated whitespace
    return resumeText
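
# Illustrative example of the cleaning above (approximate, not produced by the app):
#   cleanResume("Visit http://x.y for RT info; email me@x.com")
#   -> "Visit for info email me "
# URLs, @-mentions, #hashtags, punctuation and repeated whitespace are stripped.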

def pdf_to_text(file):
    """Extract raw text from an uploaded PDF using pdfminer.six."""
    text = extract_text(file)
    return text

def predict_category(resumes_data, selected_category, max_sequence_length):
    # Rebuild the model architecture and load the trained weights
    model = load_deeprank_model(max_sequence_length)

    # Collect the cleaned resume texts
    resumes_df = pd.DataFrame(resumes_data)
    resumes_text = resumes_df['ResumeText'].values

    # Convert the texts to padded integer sequences
    tokenized_text = tokenizer.texts_to_sequences(resumes_text)
    padded_text = pad_sequences(tokenized_text, maxlen=max_sequence_length)

    # Predict a probability per category for every resume
    predicted_probs = model.predict(padded_text)

    # Attach one probability column per category
    for i, category in enumerate(label.classes_):
        resumes_df[category] = predicted_probs[:, i]

    # Rank resumes by their probability for the selected category
    resumes_df_sorted = resumes_df.sort_values(by=selected_category, ascending=False)

    ranks = []
    for rank, (idx, row) in enumerate(resumes_df_sorted.iterrows(), start=1):
        ranks.append({'Rank': rank, 'FileName': row['FileName']})

    return ranks
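
# Example of the ranking structure returned above (file names are illustrative only):
#   [{'Rank': 1, 'FileName': 'alice_resume.pdf'},
#    {'Rank': 2, 'FileName': 'bob_resume.pdf'}]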

def load_deeprank_model(max_sequence_length):
    # Rebuild the architecture used at training time, then load the saved weights
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=128, input_length=max_sequence_length))
    model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(LSTM(64))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.load_weights('deeprank_model_v2.h5')
    return model
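
# Assumption: 'deeprank_model_v2.h5' was saved from exactly the architecture built in
# load_deeprank_model (same vocab_size, num_classes and max_sequence_length) via
# model.save_weights(); any mismatch in those values will make load_weights() fail.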

def main():
    st.title("Resume Ranking App")
    st.text("Upload resumes and select a category to rank them.")

    resumes_data = []
    selected_category = ""

    # Upload one or more PDF resumes and pick the category to rank by
    files = st.file_uploader("Upload resumes", type=["pdf"], accept_multiple_files=True)
    if files:
        for file in files:
            text = cleanResume(pdf_to_text(file))
            resumes_data.append({'ResumeText': text, 'FileName': file.name})
        selected_category = st.selectbox("Select a category to rank by", label.classes_)

    if st.button("Rank Resumes"):
        if not resumes_data or not selected_category:
            st.warning("Please upload resumes and select a category to continue.")
        else:
            ranks = predict_category(resumes_data, selected_category, max_sequence_length)
            st.write(pd.DataFrame(ranks))

if __name__ == '__main__':
    # Fit the label encoder and tokenizer on the same dataset the model was trained on,
    # so that label.classes_, vocab_size and num_classes match the saved weights
    df = pd.read_csv('UpdatedResumeDataSet.csv')
    df['cleaned'] = df['Resume'].apply(cleanResume)
    label = LabelEncoder()
    df['Category'] = label.fit_transform(df['Category'])

    # Build the vocabulary used to tokenize uploaded resumes
    text = df['cleaned'].values
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(text)
    vocab_size = len(tokenizer.word_index) + 1
    num_classes = len(label.classes_)
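
    # Note: refitting the Tokenizer at every start-up assumes UpdatedResumeDataSet.csv
    # still yields exactly the vocabulary used when deeprank_model_v2.h5 was trained.
    # A common alternative (a sketch, not what this script does) is to persist the
    # fitted tokenizer once and reload it here, e.g.:
    #   import pickle
    #   with open('tokenizer.pkl', 'wb') as f:   # 'tokenizer.pkl' is a hypothetical file name
    #       pickle.dump(tokenizer, f)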

    max_sequence_length = 500
    
    main()