Sahil Borhade committed on
Commit
5b8c994
·
verified ·
1 Parent(s): d5dd643

Upload 6 files

Browse files
clustered_job_titles.csv ADDED
The diff for this file is too large to render. See raw diff
 
kmeans_model-1.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f50b7873e093fac469d3148a723772123d733a0e838d4d91bc1581512fd6d28a
3
+ size 1918483
main.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import joblib
3
+ import numpy as np
4
+ import logging
5
+ from processing import JobTitlePreprocessor # Import your preprocessor class
6
+
7
# Configure logging for errors
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Model artifacts expected next to this script.
VECTORIZER_PATH = 'vectorizer_model.pkl'
KMEANS_PATH = 'kmeans_model-1.pkl'

# How many of the highest-weighted terms to show per cluster.
TOP_WORD_COUNT = 5


@st.cache_resource
def load_models():
    """Load the fitted vectorizer and k-means model from disk once.

    Streamlit re-executes this script on every widget interaction; caching
    the loaded objects avoids re-reading both pickle files on each rerun.

    Returns:
        A ``(vectorizer, kmeans_model)`` tuple.
    """
    # NOTE: joblib.load unpickles arbitrary objects. Only load artifacts
    # that ship with the app; never point these paths at untrusted files.
    return joblib.load(VECTORIZER_PATH), joblib.load(KMEANS_PATH)


def get_top_words(cluster, vectorizer, kmeans_model, top_n=TOP_WORD_COUNT):
    """Return the ``top_n`` feature names with the largest centroid weights.

    Args:
        cluster: Index of the cluster in ``kmeans_model.cluster_centers_``.
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()``.
        kmeans_model: Fitted model exposing ``cluster_centers_``.
        top_n: Number of terms to return (defaults to ``TOP_WORD_COUNT``).

    Returns:
        A list of feature names ordered from highest to lowest weight.
    """
    feature_names = vectorizer.get_feature_names_out()
    # argsort is ascending, so reverse it before taking the leading entries.
    ranked = np.argsort(kmeans_model.cluster_centers_[cluster])[::-1][:top_n]
    return [feature_names[i] for i in ranked]


# Load the pre-trained models (cached across reruns)
vectorizer, kmeans_model = load_models()

# Initialize the preprocessor
preprocessor = JobTitlePreprocessor()

# Streamlit app title
st.title("Job Title Clustering App")

# Input fields for job titles; strip so whitespace-only input counts as empty
job_title_1 = st.text_input("Enter the first job title:").strip()
job_title_2 = st.text_input("Enter the second job title:").strip()

# Button to process the inputs
if st.button("Submit"):
    if not job_title_1 or not job_title_2:
        st.error("Please enter both job titles.")
    else:
        try:
            # Preprocess the input job titles
            clean_title_1 = preprocessor.preprocess(job_title_1)
            clean_title_2 = preprocessor.preprocess(job_title_2)

            # Log the preprocessed titles
            logger.info("Preprocessed Title 1: %s", clean_title_1)
            logger.info("Preprocessed Title 2: %s", clean_title_2)

            # Vectorize and predict both titles in a single call each
            title_vectors = vectorizer.transform([clean_title_1, clean_title_2])
            cluster_1, cluster_2 = kmeans_model.predict(title_vectors)

            # Display results
            st.write(f"Cluster for '{job_title_1}': {cluster_1}")
            st.write(f"Cluster for '{job_title_2}': {cluster_2}")

            if cluster_1 == cluster_2:
                st.success(f"The job titles '{job_title_1}' and '{job_title_2}' belong to the same cluster!")
            else:
                st.warning(f"The job titles '{job_title_1}' and '{job_title_2}' do not belong to the same cluster.")

            # Display top words for the predicted clusters
            top_words_1 = get_top_words(cluster_1, vectorizer, kmeans_model)
            top_words_2 = get_top_words(cluster_2, vectorizer, kmeans_model)

            st.write(f"Top words in Cluster {cluster_1}: {', '.join(top_words_1)}")
            st.write(f"Top words in Cluster {cluster_2}: {', '.join(top_words_2)}")

        except Exception as e:
            # Top-level UI boundary: record the traceback and show the
            # message to the user instead of crashing the app.
            logger.exception("Error occurred: %s", e)
            st.error(f"An error occurred: {e}")
processing.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import re
3
+ import logging
4
+
5
# Configure logging.
# NOTE: this runs at import time, so any module importing this file gets the
# root logger configured with this format unless handlers already exist.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
8
+
9
class JobTitlePreprocessor:
    """Clean and standardize raw job-title strings.

    Two independent passes are provided:

    * ``remove_location_unwanted_words_brackets`` takes a dataset row and
      strips location text, filler words, bracketed content, punctuation and
      numbers greater than 10 from the title.
    * ``preprocess`` takes a single string and normalizes abbreviations,
      Roman numerals and data-scientist role variants.

    The ``*_removed_count`` attributes are running tallies that the row-level
    pass updates; they accumulate over the lifetime of the instance.
    """

    # Any bracketed span: [...], (...) or {...} (non-greedy, single line).
    _BRACKETS_RE = re.compile(r'\[.*?\]|\(.*?\)|\{.*?\}')
    # Everything except ASCII letters, digits and whitespace.
    _NON_ALNUM_RE = re.compile(r'[^a-zA-Z0-9\s]')
    # Standalone integers 11 and above. 10 itself is kept, matching the
    # documented "greater than 10" contract.
    _LARGE_NUMBER_RE = re.compile(r'\b(?:1[1-9]|[2-9][0-9]|[1-9][0-9]{2,})\b')
    _WHITESPACE_RE = re.compile(r'\s+')

    # Abbreviation and Roman-numeral normalization, applied in order.
    _REPLACEMENTS = tuple(
        (re.compile(pattern, re.IGNORECASE), replacement)
        for pattern, replacement in [
            (r'\b(?:SR|sr|Sr\.?|SR\.?|Senior|senior)\b', 'Senior'),
            (r'\b(?:JR|jr|Jr\.?|JR\.?|Junior|junior)\b', 'Junior'),
            (r'\b(?:AIML|aiml|ML|ml|MachineLearning|machinelearning|machine[_\-]learning)\b', 'Machine Learning'),
            (r'\b(?:GenAI|genai|Genai|generative[_\-]ai|GenerativeAI|generativeai)\b', 'Generative AI'),
            (r'\b(?:NLP|nlp|natural[_\-]language[_\-]processing|natural language processing)\b', 'NLP'),
            (r'\b(?:i|I)\b', '1'),
            (r'\b(?:ii|II)\b', '2'),
            (r'\b(?:iii|III)\b', '3'),
            (r'\b(?:iv|IV)\b', '4'),
            (r'\b(?:v|V)\b', '5'),
        ]
    )

    # "<role keyword> ... data scientist/science" collapses to a fixed label.
    _ROLE_RULES = tuple(
        (re.compile(pattern, re.IGNORECASE), label)
        for pattern, label in [
            (r'\b(director|dir\.?|dir)\b.*?(data\s*scientist|data\s*science)', 'Director Data Scientist'),
            (r'\b(manager|mgr)\b.*?(data\s*scientist|data\s*science)', 'Manager Data Scientist'),
            (r'\b(lead)\b.*?(data\s*scientist|data\s*science)', 'Lead Data Scientist'),
            (r'\b(associate|associates?)\b.*?(data\s*scientist|data\s*science)', 'Associate Data Scientist'),
            (r'\b(applied)\b.*?(data\s*scientist|data\s*science)', 'Applied Data Scientist'),
            (r'\b(intern|internship|trainee)\b.*?(data\s*scientist|data\s*science)', 'Intern Data Scientist'),
        ]
    )

    _DATA_SCIENTIST_RE = re.compile(r'\bdata\s*scientist\b', re.IGNORECASE)
    _ML_RE = re.compile(
        r'\b(?:ai|artificial intelligence|ml|machine learning|deep learning|dl)\b',
        re.IGNORECASE,
    )
    _NLP_RE = re.compile(r'\b(?:nlp|natural language processing)\b', re.IGNORECASE)

    def __init__(self):
        # Filler words/phrases stripped from titles (matched case-insensitively
        # on word boundaries). Kept as a per-instance list so callers can
        # extend it.
        self.unwanted_words = ['remote', 'hybrid', 'flexible location', 'location', 'open to work',
                               'role', 'job', 'level', 'remot']
        # Running tallies of how many titles each cleanup step changed.
        self.location_removed_count = 0
        self.unwanted_words_removed_count = 0
        self.brackets_removed_count = 0
        self.state_region_removed_count = 0
        self.numbers_removed_count = 0

    @staticmethod
    def _remove_phrase(title, phrase):
        """Delete whole-word, case-insensitive occurrences of ``phrase``.

        Non-string and blank phrases are skipped: an empty phrase would build
        the pattern ``\\b\\b``, which matches at every word boundary and would
        be counted as a removal even though nothing changes.

        Returns:
            ``(new_title, removed)`` where ``removed`` is True when at least
            one occurrence was deleted.
        """
        if not isinstance(phrase, str) or not phrase.strip():
            return title, False
        pattern = r'\b{}\b'.format(re.escape(phrase))
        title, hits = re.subn(pattern, '', title, flags=re.IGNORECASE)
        return title, hits > 0

    def remove_location_unwanted_words_brackets(self, row):
        """Strip location text, filler words, brackets, symbols and big numbers.

        Args:
            row: Mapping-like record (for example a DataFrame row or a dict)
                with required keys ``'titles_title'`` and ``'LOCATION'`` and
                optional keys ``'STATES'``, ``'REGION_STATE'``, ``'COUNTY'``
                and ``'city'`` read through ``row.get``.

        Returns:
            The cleaned title. A non-string title is returned unchanged.

        Raises:
            KeyError: If ``'titles_title'`` or ``'LOCATION'`` is missing.
        """
        title = row['titles_title']
        location = row['LOCATION']
        regions = [row.get(key, '') for key in ('STATES', 'REGION_STATE', 'COUNTY', 'city')]

        if not isinstance(title, str):
            return title

        # Remove the location if it appears in the title.
        title, removed = self._remove_phrase(title, location)
        if removed:
            self.location_removed_count += 1

        # Remove unwanted words; each word that matched counts once.
        for word in self.unwanted_words:
            title, removed = self._remove_phrase(title, word)
            if removed:
                self.unwanted_words_removed_count += 1

        # Remove state / region / county / city values found in the title.
        for region in regions:
            title, removed = self._remove_phrase(title, region)
            if removed:
                self.state_region_removed_count += 1

        # Remove bracketed content.
        title, hits = self._BRACKETS_RE.subn('', title)
        if hits:
            self.brackets_removed_count += 1

        # Drop every character that is not alphanumeric or whitespace.
        title = self._NON_ALNUM_RE.sub('', title)

        # Remove standalone numbers greater than 10.
        title, hits = self._LARGE_NUMBER_RE.subn('', title)
        if hits:
            self.numbers_removed_count += 1

        # Collapse runs of whitespace left behind by the removals.
        return self._WHITESPACE_RE.sub(' ', title).strip()

    def preprocess(self, title: str) -> str:
        """Lowercase a title and standardize abbreviations and role variants.

        The title is lowercased first, then abbreviations are expanded to
        capitalized labels, so the result can mix lowercase words with
        capitalized labels (for example ``'Senior data scientist'``).

        Args:
            title: Raw or partially cleaned job title.

        Returns:
            ``'ML Data Scientist'`` or ``'NLP Data Scientist'`` when a
            data-scientist title also mentions an ML/AI or NLP term;
            otherwise the normalized title with whitespace collapsed.
            A non-string input is returned unchanged.
        """
        if not isinstance(title, str):
            return title

        title = title.lower()

        for pattern, replacement in self._REPLACEMENTS:
            title = pattern.sub(replacement, title)

        for pattern, label in self._ROLE_RULES:
            title = pattern.sub(label, title)

        # Data-scientist titles that mention ML/AI or NLP collapse to one
        # canonical label; the ML check takes precedence over NLP.
        if self._DATA_SCIENTIST_RE.search(title):
            if self._ML_RE.search(title):
                return 'ML Data Scientist'
            if self._NLP_RE.search(title):
                return 'NLP Data Scientist'

        # Clean up extra spaces on every remaining path.
        return self._WHITESPACE_RE.sub(' ', title).strip()
117
+
118
def is_title_empty(row) -> bool:
    """Report whether a row's ``'titles_title'`` value is effectively empty.

    A title counts as empty when it is missing according to ``pd.isna`` or
    when it is a string that is empty or contains only whitespace.

    Args:
        row: Mapping-like record (for example a DataFrame row or a dict)
            that has a ``'titles_title'`` key.

    Returns:
        True if the title is missing or blank, False otherwise.

    Raises:
        KeyError: If ``'titles_title'`` is not present in ``row``.
    """
    title = row['titles_title']
    if isinstance(title, str):
        return title.strip() == ''
    # Non-string values are empty only when pandas treats them as missing.
    return bool(pd.isna(title))
125
+
126
def main_preprocessing(input_path="Struct Data_Data Science 100K.csv",
                       output_path='preprocessed_job_titles.csv'):
    """Clean the job titles in a CSV file and write the result to a new CSV.

    Both cleanup passes of ``JobTitlePreprocessor`` are applied to each row.
    Rows whose original title is empty, or whose cleaned title is missing,
    are dropped. The ``titles_title`` and ``clean_title`` columns are saved
    and summary counters are logged.

    Any exception is logged with its traceback and suppressed, so this
    function never raises on a failed run.

    Args:
        input_path: CSV file to read; defaults to the original dataset name.
        output_path: CSV file to write; defaults to the original output name.
    """
    try:
        # Load the dataset
        df = pd.read_csv(input_path, low_memory=False)
        # Capture the shape before any column is added or row is dropped so
        # that the "original" log line really describes the input.
        original_shape = df.shape

        # Initialize preprocessor
        preprocessor = JobTitlePreprocessor()

        # Apply both the removal and standard preprocessing steps
        df['clean_title'] = df.apply(preprocessor.remove_location_unwanted_words_brackets, axis=1)
        df['clean_title'] = df['clean_title'].apply(preprocessor.preprocess)

        # Remove rows where 'titles_title' is empty or contains only whitespace
        df = df[~df.apply(is_title_empty, axis=1)]

        # Drop rows where 'clean_title' is NaN
        df = df.dropna(subset=['clean_title'])

        # Log some information about the dataset
        logger.info("Original dataset shape: %s", original_shape)
        logger.info("Number of non-empty titles: %s", df['clean_title'].notna().sum())

        # Save the preprocessed data
        output_df = df[['titles_title', 'clean_title']]
        output_df.to_csv(output_path, index=False)

        logger.info("Preprocessed dataset shape: %s", output_df.shape)
        logger.info("Job title preprocessing completed successfully.")
        logger.info("Total rows with part of location removed from titles: %s",
                    preprocessor.location_removed_count)
        logger.info("Total unwanted words removed: %s", preprocessor.unwanted_words_removed_count)
        logger.info("Total brackets removed: %s", preprocessor.brackets_removed_count)
        logger.info("Total states/regions removed: %s", preprocessor.state_region_removed_count)
        logger.info("Total numbers removed: %s", preprocessor.numbers_removed_count)

    except Exception as e:
        # Best-effort script entry point: keep the original behavior of not
        # propagating the error, but record the full traceback.
        logger.exception("An error occurred during preprocessing: %s", e)


if __name__ == "__main__":
    main_preprocessing()
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ streamlit
2
+ pandas
3
+ numpy
4
+ scikit-learn
vectorizer_model.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c2dd076934094a674858b413185262bdf916f4157f7096af89f3064930ae692
3
+ size 105867