Spaces:

nonstopiodemo
/

TalentEdge-2

Sleeping

App Files Files Community

Sahil Borhade committed on Sep 24, 2024

Commit

5b8c994

verified ·

1 Parent(s): d5dd643

Upload 6 files

Browse files

Files changed (6) hide show

clustered_job_titles.csv +0 -0
kmeans_model-1.pkl +3 -0
main.py +71 -0
processing.py +164 -0
requirements.txt +4 -0
vectorizer_model.pkl +3 -0

clustered_job_titles.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

kmeans_model-1.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f50b7873e093fac469d3148a723772123d733a0e838d4d91bc1581512fd6d28a
+size 1918483

main.py ADDED Viewed

	@@ -0,0 +1,71 @@

+import streamlit as st
+import joblib
+import numpy as np
+import logging
+from processing import JobTitlePreprocessor  # Import your preprocessor class
+# Configure logging for errors
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+# Load the pre-trained models
+vectorizer = joblib.load('vectorizer_model.pkl')
+kmeans_model = joblib.load('kmeans_model-1.pkl')
+# Initialize the preprocessor
+preprocessor = JobTitlePreprocessor()
+# Streamlit app title
+st.title("Job Title Clustering App")
+# Input fields for job titles
+job_title_1 = st.text_input("Enter the first job title:")
+job_title_2 = st.text_input("Enter the second job title:")
+# Button to process the inputs
+if st.button("Submit"):
+    if not job_title_1 or not job_title_2:
+        st.error("Please enter both job titles.")
+    else:
+        try:
+            # Preprocess the input job titles
+            clean_title_1 = preprocessor.preprocess(job_title_1)
+            clean_title_2 = preprocessor.preprocess(job_title_2)
+            # Log the preprocessed titles
+            logger.info(f"Preprocessed Title 1: {clean_title_1}")
+            logger.info(f"Preprocessed Title 2: {clean_title_2}")
+            # Vectorize the preprocessed job titles
+            title_vector_1 = vectorizer.transform([clean_title_1])
+            title_vector_2 = vectorizer.transform([clean_title_2])
+            # Predict clusters for each job title
+            cluster_1 = kmeans_model.predict(title_vector_1)[0]
+            cluster_2 = kmeans_model.predict(title_vector_2)[0]
+            # Display results
+            st.write(f"Cluster for '{job_title_1}': {cluster_1}")
+            st.write(f"Cluster for '{job_title_2}': {cluster_2}")
+            if cluster_1 == cluster_2:
+                st.success(f"The job titles '{job_title_1}' and '{job_title_2}' belong to the same cluster!")
+            else:
+                st.warning(f"The job titles '{job_title_1}' and '{job_title_2}' do not belong to the same cluster.")
+            # Display top words for the predicted clusters
+            def get_top_words(cluster, vectorizer, kmeans_model):
+                feature_names = vectorizer.get_feature_names_out()
+                top_word_indices = np.argsort(kmeans_model.cluster_centers_[cluster])[::-1][:5]
+                top_words = [feature_names[i] for i in top_word_indices]
+                return top_words
+            top_words_1 = get_top_words(cluster_1, vectorizer, kmeans_model)
+            top_words_2 = get_top_words(cluster_2, vectorizer, kmeans_model)
+            st.write(f"Top words in Cluster {cluster_1}: {', '.join(top_words_1)}")
+            st.write(f"Top words in Cluster {cluster_2}: {', '.join(top_words_2)}")
+        except Exception as e:
+            logger.error(f"Error occurred: {e}", exc_info=True)
+            st.error(f"An error occurred: {e}")

processing.py ADDED Viewed

	@@ -0,0 +1,164 @@

+import pandas as pd
+import re
+import logging
+# Configure logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+class JobTitlePreprocessor:
+    """Preprocesses job titles by converting to lowercase, removing unwanted words,
+    special characters, numbers greater than 10, and content from location, states, regions, etc."""
+    def __init__(self):
+        # Define unwanted words and initialize counters
+        self.unwanted_words = ['remote', 'hybrid', 'flexible location', 'location', 'open to work',
+                               'role', 'job', 'level', 'remot']
+        self.location_removed_count = 0
+        self.unwanted_words_removed_count = 0
+        self.brackets_removed_count = 0
+        self.state_region_removed_count = 0
+        self.numbers_removed_count = 0
+    def remove_location_unwanted_words_brackets(self, row):
+        """Removes parts of the title based on location, unwanted words, bracketed content,
+        numbers greater than 10, and also removes symbols other than alphanumeric."""
+        title = row['titles_title']
+        location = row['LOCATION']
+        states = row.get('STATES', '')  # Get values from 'STATES' column if present
+        region_state = row.get('REGION_STATE', '')  # Get values from 'REGION_STATE' column if present
+        county = row.get('COUNTY', '')  # Get values from 'COUNTY' column if present
+        city = row.get('city', '')  # Get values from 'city' column if present
+        # Ensure title is a string
+        if isinstance(title, str):
+            # Remove location if present in the title
+            if isinstance(location, str) and re.search(r'\b{}\b'.format(re.escape(location)), title, flags=re.IGNORECASE):
+                title = re.sub(r'\b{}\b'.format(re.escape(location)), '', title, flags=re.IGNORECASE)
+                self.location_removed_count += 1
+            # Remove unwanted words
+            for word in self.unwanted_words:
+                pattern = r'\b{}\b'.format(re.escape(word))
+                if re.search(pattern, title, flags=re.IGNORECASE):
+                    title = re.sub(pattern, '', title, flags=re.IGNORECASE)
+                    self.unwanted_words_removed_count += 1
+            # Remove content from STATES, REGION_STATE, COUNTY, and city
+            for region in [states, region_state, county, city]:
+                if isinstance(region, str) and re.search(r'\b{}\b'.format(re.escape(region)), title, flags=re.IGNORECASE):
+                    title = re.sub(r'\b{}\b'.format(re.escape(region)), '', title, flags=re.IGNORECASE)
+                    self.state_region_removed_count += 1
+            # Remove content within brackets
+            if re.search(r'\[.*?\]|\(.*?\)|\{.*?\}', title):
+                title = re.sub(r'\[.*?\]|\(.*?\)|\{.*?\}', '', title)
+                self.brackets_removed_count += 1
+            # Remove any non-alphanumeric characters (keeping spaces)
+            title = re.sub(r'[^a-zA-Z0-9\s]', '', title)
+            # Remove numbers greater than 10
+            if re.search(r'\b(?:[1-9][0-9]+|1[1-9]|[2-9][0-9])\b', title):
+                title = re.sub(r'\b(?:[1-9][0-9]+|1[1-9]|[2-9][0-9])\b', '', title)
+                self.numbers_removed_count += 1
+            # Clean up extra spaces
+            title = re.sub(r'\s+', ' ', title).strip()
+        return title
+    def preprocess(self, title: str) -> str:
+        """Converts title to lowercase, removes unwanted words, replaces specific terms,
+        and standardizes job titles."""
+        if not isinstance(title, str):
+            return title
+        # Convert to lowercase
+        title = title.lower()
+        # Replace specific terms and Roman numerals
+        replacements = [
+            (r'\b(?:SR|sr|Sr\.?|SR\.?|Senior|senior)\b', 'Senior'),
+            (r'\b(?:JR|jr|Jr\.?|JR\.?|Junior|junior)\b', 'Junior'),
+            (r'\b(?:AIML|aiml|ML|ml|MachineLearning|machinelearning|machine[_\-]learning)\b', 'Machine Learning'),
+            (r'\b(?:GenAI|genai|Genai|generative[_\-]ai|GenerativeAI|generativeai)\b', 'Generative AI'),
+            (r'\b(?:NLP|nlp|natural[_\-]language[_\-]processing|natural language processing)\b', 'NLP'),
+            (r'\b(?:i|I)\b', '1'),
+            (r'\b(?:ii|II)\b', '2'),
+            (r'\b(?:iii|III)\b', '3'),
+            (r'\b(?:iv|IV)\b', '4'),
+            (r'\b(?:v|V)\b', '5')
+        ]
+        for pattern, replacement in replacements:
+            title = re.sub(pattern, replacement, title, flags=re.IGNORECASE)
+        # Handle specific Data Scientist cases
+        title = re.sub(r'\b(director|dir\.?|dir)\b.*?(data\s*scientist|data\s*science)', 'Director Data Scientist', title, flags=re.IGNORECASE)
+        title = re.sub(r'\b(manager|mgr)\b.*?(data\s*scientist|data\s*science)', 'Manager Data Scientist', title, flags=re.IGNORECASE)
+        title = re.sub(r'\b(lead)\b.*?(data\s*scientist|data\s*science)', 'Lead Data Scientist', title, flags=re.IGNORECASE)
+        title = re.sub(r'\b(associate|associates?)\b.*?(data\s*scientist|data\s*science)', 'Associate Data Scientist', title, flags=re.IGNORECASE)
+        title = re.sub(r'\b(applied)\b.*?(data\s*scientist|data\s*science)', 'Applied Data Scientist', title, flags=re.IGNORECASE)
+        title = re.sub(r'\b(intern|internship|trainee)\b.*?(data\s*scientist|data\s*science)', 'Intern Data Scientist', title, flags=re.IGNORECASE)
+        # Ensure "ML" or "NLP" is retained if found in the title
+        if re.search(r'\bdata\s*scientist\b', title, flags=re.IGNORECASE):
+            if re.search(r'\b(?:ai|artificial intelligence|ml|machine learning|deep learning|dl)\b', title, flags=re.IGNORECASE):
+                return 'ML Data Scientist'
+            elif re.search(r'\b(?:nlp|natural language processing)\b', title, flags=re.IGNORECASE):
+                return 'NLP Data Scientist'
+            return title
+        # Clean up extra spaces
+        title = re.sub(r'\s+', ' ', title).strip()
+        return title
+def is_title_empty(row):
+    """
+    Check if the 'titles_title' is effectively empty, which includes
+    strings that are either empty or contain only whitespace.
+    """
+    title = row['titles_title']
+    return pd.isna(title) or (isinstance(title, str) and title.strip() == '')
+def main_preprocessing():
+    try:
+        # Load the dataset
+        df = pd.read_csv(r"Struct Data_Data Science 100K.csv", low_memory=False)
+        # Initialize preprocessor
+        preprocessor = JobTitlePreprocessor()
+        # Apply both the removal and standard preprocessing steps
+        df['clean_title'] = df.apply(preprocessor.remove_location_unwanted_words_brackets, axis=1)
+        df['clean_title'] = df['clean_title'].apply(preprocessor.preprocess)
+        # Remove rows where 'titles_title' is empty or contains only whitespace
+        df = df[~df.apply(is_title_empty, axis=1)]
+        # Drop rows where 'clean_title' is NaN
+        df = df.dropna(subset=['clean_title'])
+        # Log some information about the dataset
+        logger.info(f"Original dataset shape: {df.shape}")
+        logger.info(f"Number of non-empty titles: {df['clean_title'].notna().sum()}")
+        # Save the preprocessed data
+        output_df = df[['titles_title', 'clean_title']]
+        output_df.to_csv('preprocessed_job_titles.csv', index=False)
+        logger.info(f"Preprocessed dataset shape: {output_df.shape}")
+        logger.info("Job title preprocessing completed successfully.")
+        logger.info(f"Total rows with part of location removed from titles: {preprocessor.location_removed_count}")
+        logger.info(f"Total unwanted words removed: {preprocessor.unwanted_words_removed_count}")
+        logger.info(f"Total brackets removed: {preprocessor.brackets_removed_count}")
+        logger.info(f"Total states/regions removed: {preprocessor.state_region_removed_count}")
+        logger.info(f"Total numbers removed: {preprocessor.numbers_removed_count}")
+    except Exception as e:
+        logger.error(f"An error occurred during preprocessing: {e}")
+if __name__ == "__main__":
+    main_preprocessing()

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+streamlit
+pandas
+numpy
+scikit-learn

vectorizer_model.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0c2dd076934094a674858b413185262bdf916f4157f7096af89f3064930ae692
+size 105867