File size: 4,459 Bytes
67d85c4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
efa6192
67d85c4
efa6192
67d85c4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bbbf59d
67d85c4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4fc594e
67d85c4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bbbf59d
 
67d85c4
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import streamlit as st
import pandas as pd
import numpy as np
import re
import pickle
import pdfminer
from pdfminer.high_level import extract_text
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, LSTM, Dense, GlobalMaxPooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder


def cleanResume(resumeText):
    """Normalize raw resume text into a plain, single-spaced ASCII string.

    The substitutions run in this order: URLs, the literal tokens ``RT`` and
    ``cc``, hashtags, @-mentions, ASCII punctuation, non-ASCII characters,
    and finally runs of whitespace.

    Args:
        resumeText: The raw resume text.

    Returns:
        The cleaned text. Whitespace runs are collapsed to one space; a
        leading or trailing space is NOT stripped.
    """
    # All patterns are raw strings: in a normal literal, "\S", "\s" and "\]"
    # are invalid escape sequences (SyntaxWarning on Python 3.12+).
    resumeText = re.sub(r'http\S+\s*', ' ', resumeText)
    # NOTE: this matches anywhere, including inside words
    # ("account" -> "a ount"). Left unchanged on purpose so that the cleaned
    # output stays identical to what this function has always produced.
    resumeText = re.sub(r'RT|cc', ' ', resumeText)
    resumeText = re.sub(r'#\S+', '', resumeText)
    resumeText = re.sub(r'@\S+', '  ', resumeText)
    # Character class built from the escaped ASCII punctuation set.
    resumeText = re.sub('[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', resumeText)
    resumeText = re.sub(r'[^\x00-\x7f]', r' ', resumeText)
    resumeText = re.sub(r'\s+', ' ', resumeText)
    return resumeText

def pdf_to_text(file):
    """Return the text that pdfminer's ``extract_text`` extracts from ``file``."""
    return extract_text(file)

def predict_category(resumes_data, selected_category, max_sequence_length):
    """Rank resumes by the model's predicted probability for one category.

    Reads the module-level ``tokenizer`` and ``label`` objects and builds the
    model through ``load_deeprank_model``.

    Args:
        resumes_data: Sequence of dicts, each with the keys ``'ResumeText'``
            and ``'FileName'``.
        selected_category: Name of the category to rank by; it must be one of
            ``label.classes_`` so that a matching score column exists.
        max_sequence_length: Length every token sequence is padded/truncated
            to. The same value is passed to ``load_deeprank_model``.

    Returns:
        A list of ``{'Rank': int, 'FileName': str}`` dicts, best match first,
        with ranks starting at 1. An empty list when ``resumes_data`` is empty.
    """
    # Nothing to rank: return early instead of failing on the missing
    # 'ResumeText' column of an empty DataFrame (and skip the model load).
    if not resumes_data:
        return []

    model = load_deeprank_model(max_sequence_length)

    resumes_df = pd.DataFrame(resumes_data)
    tokenized_text = tokenizer.texts_to_sequences(resumes_df['ResumeText'].values)

    # Pad with the caller-supplied length. It was previously overwritten by a
    # hard-coded 500 here, which could disagree with the length the model was
    # built with just above.
    padded_text = pad_sequences(tokenized_text, maxlen=max_sequence_length)

    predicted_probs = model.predict(padded_text)

    # One score column per category, in the label encoder's class order.
    for i, category in enumerate(label.classes_):
        resumes_df[category] = predicted_probs[:, i]

    resumes_df_sorted = resumes_df.sort_values(by=selected_category, ascending=False)

    return [
        {'Rank': rank, 'FileName': file_name}
        for rank, file_name in enumerate(resumes_df_sorted['FileName'], start=1)
    ]

def load_deeprank_model(max_sequence_length):
    """Build the DeepRank network and load its weights from disk.

    The architecture is declared here (Embedding -> Conv1D -> MaxPooling1D ->
    LSTM -> softmax Dense) and sized from the module-level ``vocab_size`` and
    ``num_classes``; only the weights come from ``deeprank_model.h5``.

    Args:
        max_sequence_length: Passed to the Embedding layer as ``input_length``.

    Returns:
        The compiled ``Sequential`` model with the saved weights loaded.
    """
    # Example architecture, adjust as needed; it has to match the saved weights.
    layer_stack = [
        Embedding(input_dim=vocab_size, output_dim=128, input_length=max_sequence_length),
        Conv1D(filters=128, kernel_size=5, activation='relu'),
        MaxPooling1D(pool_size=2),
        LSTM(64),
        Dense(num_classes, activation='softmax'),
    ]
    deeprank = Sequential(layer_stack)
    deeprank.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    # Replace 'deeprank_model.h5' with your saved model file.
    deeprank.load_weights('deeprank_model.h5')
    return deeprank

def main():
    """Render the Streamlit page: upload PDFs, pick a category, show the ranking."""
    st.title("Resume Ranking App")
    st.text("Upload resumes and select a category to rank them.")

    # Multiple PDF uploads are accepted in one go.
    uploads = st.file_uploader("Upload resumes", type=["pdf"], accept_multiple_files=True)

    resumes_data = []
    selected_category = ""
    if uploads:
        resumes_data = [
            {'ResumeText': cleanResume(pdf_to_text(upload)), 'FileName': upload.name}
            for upload in uploads
        ]
        # The category picker is only shown once at least one file is uploaded.
        selected_category = st.selectbox("Select a category to rank by", label.classes_)

    if not st.button("Rank Resumes"):
        return

    if resumes_data and selected_category:
        ranks = predict_category(resumes_data, selected_category, max_sequence_length)
        st.write(pd.DataFrame(ranks))
    else:
        st.warning("Please upload resumes and select a category to continue.")

if __name__ == '__main__':
    # Sequence length handed to main(), which forwards it to the model code.
    max_sequence_length = 500

    # Training data: clean the resume text with the same function the app
    # applies to uploaded resumes.
    df = pd.read_csv('UpdatedResumeDataSet.csv')
    df['cleaned'] = df['Resume'].apply(cleanResume)

    # Fit the label encoder; its classes_ are the selectable categories.
    label = LabelEncoder()
    df['Category'] = label.fit_transform(df['Category'])
    num_classes = len(label.classes_)

    # Fit the tokenizer on the cleaned text; +1 on top of the word-index size
    # gives the vocabulary size used for the Embedding layer.
    text = df['cleaned'].values
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(text)
    vocab_size = len(tokenizer.word_index) + 1

    main()