File size: 8,257 Bytes
5b8c994
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
import pandas as pd
import re
import logging

# Root logging is configured at import time: INFO threshold, and each record
# is rendered as "<timestamp> - <level name> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Logger named after this module; used by the functions below.
logger = logging.getLogger(__name__)

class JobTitlePreprocessor:
    """Clean and standardize raw job-title strings.

    Two passes are offered:

    * ``remove_location_unwanted_words_brackets`` works on a whole row and
      strips location/region names, a fixed list of filler words, bracketed
      content, non-alphanumeric symbols and integers greater than 10.
    * ``preprocess`` works on a single title string: it lowercases it,
      expands common abbreviations, maps Roman numerals I-V to digits and
      collapses data-scientist variants to canonical labels.

    The ``*_count`` attributes record how many times each kind of removal
    fired (at most once per row per rule, or once per unwanted word per row).
    """

    # Whole integers strictly greater than 10: 11-19, 20-99, or 3+ digits.
    # The value 10 itself is deliberately not matched.
    _NUMBERS_OVER_TEN = re.compile(r'\b(?:1[1-9]|[2-9][0-9]|[1-9][0-9]{2,})\b')
    # Non-greedy so that "(a) b (c)" loses "(a)" and "(c)" but keeps "b".
    _BRACKETED = re.compile(r'\[.*?\]|\(.*?\)|\{.*?\}')
    _NON_ALPHANUMERIC = re.compile(r'[^a-zA-Z0-9\s]')
    _WHITESPACE = re.compile(r'\s+')

    # Abbreviation / Roman-numeral expansions, applied in this order.
    _TERM_REPLACEMENTS = tuple(
        (re.compile(pattern, flags=re.IGNORECASE), replacement)
        for pattern, replacement in (
            (r'\b(?:SR|sr|Sr\.?|SR\.?|Senior|senior)\b', 'Senior'),
            (r'\b(?:JR|jr|Jr\.?|JR\.?|Junior|junior)\b', 'Junior'),
            (r'\b(?:AIML|aiml|ML|ml|MachineLearning|machinelearning|machine[_\-]learning)\b', 'Machine Learning'),
            (r'\b(?:GenAI|genai|Genai|generative[_\-]ai|GenerativeAI|generativeai)\b', 'Generative AI'),
            (r'\b(?:NLP|nlp|natural[_\-]language[_\-]processing|natural language processing)\b', 'NLP'),
            (r'\b(?:i|I)\b', '1'),
            (r'\b(?:ii|II)\b', '2'),
            (r'\b(?:iii|III)\b', '3'),
            (r'\b(?:iv|IV)\b', '4'),
            (r'\b(?:v|V)\b', '5'),
        )
    )

    # "<seniority word> ... data scientist/science" -> canonical label.
    _DATA_SCIENTIST_RULES = tuple(
        (re.compile(pattern, flags=re.IGNORECASE), replacement)
        for pattern, replacement in (
            (r'\b(director|dir\.?|dir)\b.*?(data\s*scientist|data\s*science)', 'Director Data Scientist'),
            (r'\b(manager|mgr)\b.*?(data\s*scientist|data\s*science)', 'Manager Data Scientist'),
            (r'\b(lead)\b.*?(data\s*scientist|data\s*science)', 'Lead Data Scientist'),
            (r'\b(associate|associates?)\b.*?(data\s*scientist|data\s*science)', 'Associate Data Scientist'),
            (r'\b(applied)\b.*?(data\s*scientist|data\s*science)', 'Applied Data Scientist'),
            (r'\b(intern|internship|trainee)\b.*?(data\s*scientist|data\s*science)', 'Intern Data Scientist'),
        )
    )

    _DATA_SCIENTIST = re.compile(r'\bdata\s*scientist\b', flags=re.IGNORECASE)
    _ML_TERMS = re.compile(
        r'\b(?:ai|artificial intelligence|ml|machine learning|deep learning|dl)\b',
        flags=re.IGNORECASE,
    )
    _NLP_TERMS = re.compile(r'\b(?:nlp|natural language processing)\b', flags=re.IGNORECASE)

    def __init__(self):
        # Filler words/phrases stripped from titles (whole-word, case-insensitive).
        self.unwanted_words = ['remote', 'hybrid', 'flexible location', 'location', 'open to work',
                               'role', 'job', 'level', 'remot']
        # Per-rule counters, incremented by remove_location_unwanted_words_brackets.
        self.location_removed_count = 0
        self.unwanted_words_removed_count = 0
        self.brackets_removed_count = 0
        self.state_region_removed_count = 0
        self.numbers_removed_count = 0

    @staticmethod
    def _remove_phrase(text, phrase):
        """Delete whole-word, case-insensitive occurrences of *phrase* from *text*.

        Returns a ``(new_text, removed)`` pair where ``removed`` is True when at
        least one occurrence was deleted. Non-string, empty or whitespace-only
        phrases are ignored: an empty phrase would build the pattern ``\\b\\b``,
        which matches at every word boundary and would report a removal
        without removing anything.
        """
        if not isinstance(phrase, str):
            return text, False
        phrase = phrase.strip()
        if not phrase:
            return text, False
        pattern = r'\b{}\b'.format(re.escape(phrase))
        new_text, hits = re.subn(pattern, '', text, flags=re.IGNORECASE)
        return new_text, hits > 0

    def remove_location_unwanted_words_brackets(self, row):
        """Strip location names, filler words, brackets, symbols and large numbers.

        Args:
            row: Mapping-like object (e.g. a pandas Series or dict) that must
                provide ``'titles_title'`` and ``'LOCATION'``; ``'STATES'``,
                ``'REGION_STATE'``, ``'COUNTY'`` and ``'city'`` are read with
                ``row.get`` and may be absent.

        Returns:
            The cleaned title with whitespace collapsed, or the original value
            unchanged when the title is not a string (e.g. NaN).

        Raises:
            KeyError: If ``'titles_title'`` or ``'LOCATION'`` is missing.
        """
        title = row['titles_title']
        location = row['LOCATION']
        regions = [
            row.get('STATES', ''),
            row.get('REGION_STATE', ''),
            row.get('COUNTY', ''),
            row.get('city', ''),
        ]

        if not isinstance(title, str):
            return title

        title, removed = self._remove_phrase(title, location)
        if removed:
            self.location_removed_count += 1

        for word in self.unwanted_words:
            title, removed = self._remove_phrase(title, word)
            if removed:
                self.unwanted_words_removed_count += 1

        for region in regions:
            title, removed = self._remove_phrase(title, region)
            if removed:
                self.state_region_removed_count += 1

        title, hits = self._BRACKETED.subn('', title)
        if hits:
            self.brackets_removed_count += 1

        # Symbols are dropped before the number check so that e.g. "#25" is
        # seen as the bare number 25.
        title = self._NON_ALPHANUMERIC.sub('', title)

        title, hits = self._NUMBERS_OVER_TEN.subn('', title)
        if hits:
            self.numbers_removed_count += 1

        return self._WHITESPACE.sub(' ', title).strip()

    def preprocess(self, title: str) -> str:
        """Lowercase *title*, expand abbreviations and canonicalize data-scientist titles.

        The title is lowercased first; the replacement strings (``'Senior'``,
        ``'Machine Learning'``, ...) are then inserted with their own
        capitalization, so the result is mixed-case. Any title that mentions
        "data scientist" together with an AI/ML term collapses to
        ``'ML Data Scientist'``; with an NLP term, to ``'NLP Data Scientist'``.

        Args:
            title: Raw or partially cleaned job title.

        Returns:
            The standardized title with whitespace collapsed, or *title*
            unchanged when it is not a string.
        """
        if not isinstance(title, str):
            return title

        title = title.lower()

        for pattern, replacement in self._TERM_REPLACEMENTS:
            title = pattern.sub(replacement, title)

        for pattern, replacement in self._DATA_SCIENTIST_RULES:
            title = pattern.sub(replacement, title)

        # Normalize whitespace before any return so every path yields a
        # trimmed, single-spaced title.
        title = self._WHITESPACE.sub(' ', title).strip()

        if self._DATA_SCIENTIST.search(title):
            if self._ML_TERMS.search(title):
                return 'ML Data Scientist'
            if self._NLP_TERMS.search(title):
                return 'NLP Data Scientist'

        return title
        
def is_title_empty(row):
    """
    Report whether the row's 'titles_title' value carries no usable text.

    A value counts as empty when pandas considers it missing (NaN/None) or
    when it is a string made up solely of whitespace (including '').
    Any other value is treated as non-empty.
    """
    value = row['titles_title']
    if pd.isna(value):
        return True
    return isinstance(value, str) and not value.strip()

def main_preprocessing():
    """Run the job-title cleaning pipeline from input CSV to output CSV.

    Reads ``Struct Data_Data Science 100K.csv`` from the working directory,
    derives a ``clean_title`` column with ``JobTitlePreprocessor``, drops rows
    whose original title is empty or whose cleaned title is NaN, and writes
    the ``titles_title`` / ``clean_title`` pair to
    ``preprocessed_job_titles.csv``. Removal counters are logged at the end.

    Any exception is caught and logged with its traceback; the function does
    not re-raise.
    """
    try:
        # Load the dataset
        df = pd.read_csv(r"Struct Data_Data Science 100K.csv", low_memory=False)

        # Captured before any column is added or row dropped, so the
        # "Original dataset shape" log line reports the data as loaded.
        original_shape = df.shape

        # Initialize preprocessor
        preprocessor = JobTitlePreprocessor()

        # Row-level removal first (needs the location columns), then
        # title-level standardization on the result.
        df['clean_title'] = df.apply(preprocessor.remove_location_unwanted_words_brackets, axis=1)
        df['clean_title'] = df['clean_title'].apply(preprocessor.preprocess)

        # Remove rows where 'titles_title' is empty or contains only whitespace
        df = df[~df.apply(is_title_empty, axis=1)]

        # Drop rows where 'clean_title' is NaN
        df = df.dropna(subset=['clean_title'])

        # Log some information about the dataset
        logger.info(f"Original dataset shape: {original_shape}")
        logger.info(f"Number of non-empty titles: {df['clean_title'].notna().sum()}")

        # Save the preprocessed data
        output_df = df[['titles_title', 'clean_title']]
        output_df.to_csv('preprocessed_job_titles.csv', index=False)

        logger.info(f"Preprocessed dataset shape: {output_df.shape}")
        logger.info("Job title preprocessing completed successfully.")
        logger.info(f"Total rows with part of location removed from titles: {preprocessor.location_removed_count}")
        logger.info(f"Total unwanted words removed: {preprocessor.unwanted_words_removed_count}")
        logger.info(f"Total brackets removed: {preprocessor.brackets_removed_count}")
        logger.info(f"Total states/regions removed: {preprocessor.state_region_removed_count}")
        logger.info(f"Total numbers removed: {preprocessor.numbers_removed_count}")

    except Exception as e:
        # logger.exception logs at ERROR level and appends the traceback,
        # which a plain logger.error call would discard.
        logger.exception(f"An error occurred during preprocessing: {e}")

# Run the preprocessing pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main_preprocessing()