billusanda007 committed on
Commit
d8a8115
·
verified ·
1 Parent(s): 86733d1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -14
app.py CHANGED
@@ -14,7 +14,7 @@ from sklearn.preprocessing import LabelEncoder
14
 
15
 
16
  def cleanResume(resumeText):
17
- # Your existing cleanResume function remains unchanged
18
  resumeText = re.sub('http\S+\s*', ' ', resumeText)
19
  resumeText = re.sub('RT|cc', ' ', resumeText)
20
  resumeText = re.sub('#\S+', '', resumeText)
@@ -25,35 +25,35 @@ def cleanResume(resumeText):
25
  return resumeText
26
 
27
  def pdf_to_text(file):
28
- # Use pdfminer.six to extract text from the PDF file
29
  text = extract_text(file)
30
  return text
31
 
32
  def predict_category(resumes_data, selected_category,max_sequence_length):
33
- # Load the trained DeepRank model
34
  model = load_deeprank_model(max_sequence_length)
35
 
36
- # Process the resumes data
37
  resumes_df = pd.DataFrame(resumes_data)
38
  resumes_text = resumes_df['ResumeText'].values
39
 
40
- # Tokenize the text and convert to sequences
41
  tokenized_text = tokenizer.texts_to_sequences(resumes_text)
42
 
43
- # Pad sequences to have the same length
44
  max_sequence_length = 500 # Assuming maximum sequence length of 500 words
45
  padded_text = pad_sequences(tokenized_text, maxlen=max_sequence_length)
46
 
47
- # Make predictions
48
  predicted_probs = model.predict(padded_text)
49
 
50
- # Assign probabilities to respective job categories
51
  for i, category in enumerate(label.classes_):
52
  resumes_df[category] = predicted_probs[:, i]
53
 
54
  resumes_df_sorted = resumes_df.sort_values(by=selected_category, ascending=False)
55
 
56
- # Get the ranks for the selected category
57
  ranks = []
58
  for rank, (idx, row) in enumerate(resumes_df_sorted.iterrows()):
59
  rank = rank + 1
@@ -63,9 +63,9 @@ def predict_category(resumes_data, selected_category,max_sequence_length):
63
  return ranks
64
 
65
  def load_deeprank_model(max_sequence_length):
66
- # Load the saved DeepRank model
67
  model = Sequential()
68
- # Add layers to the model (example architecture, adjust as needed)
69
  model.add(Embedding(input_dim=vocab_size, output_dim=128, input_length=max_sequence_length))
70
  model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
71
  model.add(MaxPooling1D(pool_size=2))
@@ -82,7 +82,7 @@ def main():
82
  resumes_data = []
83
  selected_category = ""
84
 
85
- # Handle multiple file uploads
86
  files = st.file_uploader("Upload resumes", type=["pdf"], accept_multiple_files=True)
87
  if files:
88
  for file in files:
@@ -98,13 +98,13 @@ def main():
98
  st.write(pd.DataFrame(ranks))
99
 
100
  if __name__ == '__main__':
101
- # Load label encoder and tokenizer
102
  df = pd.read_csv('UpdatedResumeDataSet.csv')
103
  df['cleaned'] = df['Resume'].apply(lambda x: cleanResume(x))
104
  label = LabelEncoder()
105
  df['Category'] = label.fit_transform(df['Category'])
106
 
107
- # Tokenize text and get vocabulary size and number of classes
108
  text = df['cleaned'].values
109
  #text=df['Resume'].values
110
  tokenizer = Tokenizer()
 
14
 
15
 
16
  def cleanResume(resumeText):
17
+
18
  resumeText = re.sub('http\S+\s*', ' ', resumeText)
19
  resumeText = re.sub('RT|cc', ' ', resumeText)
20
  resumeText = re.sub('#\S+', '', resumeText)
 
25
  return resumeText
26
 
27
  def pdf_to_text(file):
28
+
29
  text = extract_text(file)
30
  return text
31
 
32
  def predict_category(resumes_data, selected_category,max_sequence_length):
33
+
34
  model = load_deeprank_model(max_sequence_length)
35
 
36
+
37
  resumes_df = pd.DataFrame(resumes_data)
38
  resumes_text = resumes_df['ResumeText'].values
39
 
40
+
41
  tokenized_text = tokenizer.texts_to_sequences(resumes_text)
42
 
43
+
44
  max_sequence_length = 500 # Assuming maximum sequence length of 500 words
45
  padded_text = pad_sequences(tokenized_text, maxlen=max_sequence_length)
46
 
47
+
48
  predicted_probs = model.predict(padded_text)
49
 
50
+
51
  for i, category in enumerate(label.classes_):
52
  resumes_df[category] = predicted_probs[:, i]
53
 
54
  resumes_df_sorted = resumes_df.sort_values(by=selected_category, ascending=False)
55
 
56
+
57
  ranks = []
58
  for rank, (idx, row) in enumerate(resumes_df_sorted.iterrows()):
59
  rank = rank + 1
 
63
  return ranks
64
 
65
  def load_deeprank_model(max_sequence_length):
66
+
67
  model = Sequential()
68
+
69
  model.add(Embedding(input_dim=vocab_size, output_dim=128, input_length=max_sequence_length))
70
  model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
71
  model.add(MaxPooling1D(pool_size=2))
 
82
  resumes_data = []
83
  selected_category = ""
84
 
85
+
86
  files = st.file_uploader("Upload resumes", type=["pdf"], accept_multiple_files=True)
87
  if files:
88
  for file in files:
 
98
  st.write(pd.DataFrame(ranks))
99
 
100
  if __name__ == '__main__':
101
+
102
  df = pd.read_csv('UpdatedResumeDataSet.csv')
103
  df['cleaned'] = df['Resume'].apply(lambda x: cleanResume(x))
104
  label = LabelEncoder()
105
  df['Category'] = label.fit_transform(df['Category'])
106
 
107
+
108
  text = df['cleaned'].values
109
  #text=df['Resume'].values
110
  tokenizer = Tokenizer()