Spaces:
Sleeping
Sleeping
File size: 8,257 Bytes
5b8c994 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 |
import pandas as pd
import re
import logging
# Logging setup: INFO-and-above records, each rendered as
# "<timestamp> - <level> - <message>", plus a logger named after this module.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
class JobTitlePreprocessor:
    """Clean and standardize raw job-title strings.

    Two passes are provided:

    * ``remove_location_unwanted_words_brackets`` works on a whole row and
      strips location/region values, filler words, bracketed text,
      non-alphanumeric characters and numbers greater than 10.
    * ``preprocess`` works on a single title: it lowercases it, expands
      abbreviations (sr, jr, ml, genai, ...), maps the Roman numerals i-v to
      digits and collapses common Data Scientist variants.

    The ``*_removed_count`` attributes record how often each kind of removal
    happened: once per row for location, brackets and numbers; once per
    matching word / region column for unwanted words and states/regions.
    """

    # Text enclosed in [], () or {} (shortest match, brackets included).
    _BRACKETED_RE = re.compile(r'\[.*?\]|\(.*?\)|\{.*?\}')
    # Anything that is not an ASCII letter, a digit or whitespace.
    _NON_ALNUM_RE = re.compile(r'[^a-zA-Z0-9\s]')
    # 11-19, 20-99, or any number with three or more digits. The value 10
    # itself is kept: only numbers strictly greater than 10 are removed.
    _NUMBER_GT_10_RE = re.compile(r'\b(?:1[1-9]|[2-9][0-9]|[1-9][0-9]{2,})\b')
    _SPACES_RE = re.compile(r'\s+')

    # Abbreviation and Roman-numeral rewrites, applied in this order.
    _REPLACEMENTS = tuple(
        (re.compile(pattern, flags=re.IGNORECASE), replacement)
        for pattern, replacement in (
            (r'\b(?:SR|sr|Sr\.?|SR\.?|Senior|senior)\b', 'Senior'),
            (r'\b(?:JR|jr|Jr\.?|JR\.?|Junior|junior)\b', 'Junior'),
            (r'\b(?:AIML|aiml|ML|ml|MachineLearning|machinelearning|machine[_\-]learning)\b', 'Machine Learning'),
            (r'\b(?:GenAI|genai|Genai|generative[_\-]ai|GenerativeAI|generativeai)\b', 'Generative AI'),
            (r'\b(?:NLP|nlp|natural[_\-]language[_\-]processing|natural language processing)\b', 'NLP'),
            (r'\b(?:i|I)\b', '1'),
            (r'\b(?:ii|II)\b', '2'),
            (r'\b(?:iii|III)\b', '3'),
            (r'\b(?:iv|IV)\b', '4'),
            (r'\b(?:v|V)\b', '5'),
        )
    )

    # "<rank> ... data scientist/science" is collapsed to "<Rank> Data Scientist".
    _DATA_SCIENTIST_RULES = tuple(
        (re.compile(pattern, flags=re.IGNORECASE), replacement)
        for pattern, replacement in (
            (r'\b(director|dir\.?|dir)\b.*?(data\s*scientist|data\s*science)', 'Director Data Scientist'),
            (r'\b(manager|mgr)\b.*?(data\s*scientist|data\s*science)', 'Manager Data Scientist'),
            (r'\b(lead)\b.*?(data\s*scientist|data\s*science)', 'Lead Data Scientist'),
            (r'\b(associate|associates?)\b.*?(data\s*scientist|data\s*science)', 'Associate Data Scientist'),
            (r'\b(applied)\b.*?(data\s*scientist|data\s*science)', 'Applied Data Scientist'),
            (r'\b(intern|internship|trainee)\b.*?(data\s*scientist|data\s*science)', 'Intern Data Scientist'),
        )
    )

    _DATA_SCIENTIST_RE = re.compile(r'\bdata\s*scientist\b', flags=re.IGNORECASE)
    _ML_RE = re.compile(
        r'\b(?:ai|artificial intelligence|ml|machine learning|deep learning|dl)\b',
        flags=re.IGNORECASE,
    )
    _NLP_RE = re.compile(r'\b(?:nlp|natural language processing)\b', flags=re.IGNORECASE)

    def __init__(self):
        # Filler words stripped from titles (whole-word, case-insensitive).
        self.unwanted_words = ['remote', 'hybrid', 'flexible location', 'location', 'open to work',
                               'role', 'job', 'level', 'remot']
        # Removal statistics, accumulated across every processed row.
        self.location_removed_count = 0
        self.unwanted_words_removed_count = 0
        self.brackets_removed_count = 0
        self.state_region_removed_count = 0
        self.numbers_removed_count = 0

    @staticmethod
    def _strip_term(title, term):
        """Remove whole-word, case-insensitive occurrences of ``term``.

        Returns ``(new_title, removed)`` where ``removed`` tells whether at
        least one occurrence was deleted. Non-string, empty and
        whitespace-only terms are ignored: they would otherwise compile to a
        pattern such as ``\\b\\b`` that "matches" every title, inflating the
        counters (and, for a single space, gluing words together).
        """
        if not isinstance(term, str) or not term.strip():
            return title, False
        pattern = r'\b{}\b'.format(re.escape(term))
        stripped, hits = re.subn(pattern, '', title, flags=re.IGNORECASE)
        return stripped, hits > 0

    def remove_location_unwanted_words_brackets(self, row):
        """Strip location data, filler words, brackets, symbols and big numbers.

        Args:
            row: Mapping-like record (e.g. a DataFrame row or a dict) with the
                keys ``titles_title`` and ``LOCATION``; ``STATES``,
                ``REGION_STATE``, ``COUNTY`` and ``city`` are used if present.

        Returns:
            The cleaned title with whitespace collapsed, or the original
            value unchanged when the title is not a string.

        Raises:
            KeyError: If ``titles_title`` or ``LOCATION`` is missing.
        """
        title = row['titles_title']
        location = row['LOCATION']
        regions = [row.get(column, '') for column in ('STATES', 'REGION_STATE', 'COUNTY', 'city')]

        if not isinstance(title, str):
            return title

        title, removed = self._strip_term(title, location)
        if removed:
            self.location_removed_count += 1

        for word in self.unwanted_words:
            title, removed = self._strip_term(title, word)
            if removed:
                self.unwanted_words_removed_count += 1

        for region in regions:
            title, removed = self._strip_term(title, region)
            if removed:
                self.state_region_removed_count += 1

        title, hits = self._BRACKETED_RE.subn('', title)
        if hits:
            self.brackets_removed_count += 1

        # Symbols go before the number pass so that e.g. "#2023" is seen as "2023".
        title = self._NON_ALNUM_RE.sub('', title)

        title, hits = self._NUMBER_GT_10_RE.subn('', title)
        if hits:
            self.numbers_removed_count += 1

        return self._SPACES_RE.sub(' ', title).strip()

    def preprocess(self, title: str) -> str:
        """Lowercase a title and rewrite it into a standardized form.

        Args:
            title: Raw or partially cleaned job title.

        Returns:
            ``'ML Data Scientist'`` or ``'NLP Data Scientist'`` when the title
            names a data scientist together with an ML/AI or NLP keyword;
            otherwise the rewritten title with whitespace collapsed.
            Non-string input is returned unchanged.
        """
        if not isinstance(title, str):
            return title

        title = title.lower()
        for pattern, replacement in self._REPLACEMENTS:
            title = pattern.sub(replacement, title)
        for pattern, replacement in self._DATA_SCIENTIST_RULES:
            title = pattern.sub(replacement, title)

        # Normalize whitespace before any return so every path yields a clean
        # title and multi-word keywords match regardless of the input spacing.
        title = self._SPACES_RE.sub(' ', title).strip()

        if self._DATA_SCIENTIST_RE.search(title):
            # ML/AI keywords take precedence over NLP when both are present.
            if self._ML_RE.search(title):
                return 'ML Data Scientist'
            if self._NLP_RE.search(title):
                return 'NLP Data Scientist'
        return title
def is_title_empty(row):
    """
    Tell whether a row's 'titles_title' value carries no usable text.

    A title counts as empty when it is a missing value (as reported by
    ``pd.isna``) or a string that is blank once surrounding whitespace
    is stripped. Any other value is treated as non-empty.
    """
    value = row['titles_title']
    if pd.isna(value):
        return True
    return isinstance(value, str) and not value.strip()
def main_preprocessing():
    """Run the full job-title cleaning pipeline from CSV to CSV.

    Reads "Struct Data_Data Science 100K.csv", derives a ``clean_title``
    column from ``titles_title`` using ``JobTitlePreprocessor``, drops rows
    whose title is empty or whose cleaned title is missing, and writes the
    ``titles_title`` / ``clean_title`` pair to 'preprocessed_job_titles.csv'.

    Any exception is logged (with traceback) rather than propagated, so the
    function always returns ``None``.
    """
    try:
        # Load the dataset
        df = pd.read_csv(r"Struct Data_Data Science 100K.csv", low_memory=False)
        # Log the shape before any row is filtered out; logging it later
        # would report the already-reduced frame as the "original".
        logger.info(f"Original dataset shape: {df.shape}")

        preprocessor = JobTitlePreprocessor()

        # Row-level removal first (needs the location columns), then the
        # per-title standardization on the result.
        df['clean_title'] = df.apply(preprocessor.remove_location_unwanted_words_brackets, axis=1)
        df['clean_title'] = df['clean_title'].apply(preprocessor.preprocess)

        # Remove rows where 'titles_title' is empty or contains only whitespace
        df = df[~df.apply(is_title_empty, axis=1)]
        # Drop rows where 'clean_title' is NaN
        df = df.dropna(subset=['clean_title'])

        logger.info(f"Number of non-empty titles: {df['clean_title'].notna().sum()}")

        # Save the preprocessed data
        output_df = df[['titles_title', 'clean_title']]
        output_df.to_csv('preprocessed_job_titles.csv', index=False)

        logger.info(f"Preprocessed dataset shape: {output_df.shape}")
        logger.info("Job title preprocessing completed successfully.")
        logger.info(f"Total rows with part of location removed from titles: {preprocessor.location_removed_count}")
        logger.info(f"Total unwanted words removed: {preprocessor.unwanted_words_removed_count}")
        logger.info(f"Total brackets removed: {preprocessor.brackets_removed_count}")
        logger.info(f"Total states/regions removed: {preprocessor.state_region_removed_count}")
        logger.info(f"Total numbers removed: {preprocessor.numbers_removed_count}")
    except Exception as e:
        # Top-level boundary: keep the best-effort behavior, but record the
        # traceback so failures (missing file, missing column) are diagnosable.
        logger.exception(f"An error occurred during preprocessing: {e}")


# Run the pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main_preprocessing()
|