File size: 3,119 Bytes
5b8c994
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cc89cb7
 
 
0d1cf3d
 
 
cc89cb7
5b8c994
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import streamlit as st
import joblib
import numpy as np
import logging
from processing import JobTitlePreprocessor  # Import your preprocessor class

# Configure logging for errors
# NOTE: basicConfig only takes effect if no handlers are configured yet;
# level=INFO lets the preprocessing traces below reach the console.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load the pre-trained models from the working directory.
# vectorizer: fitted text vectorizer (must expose .transform and
#   .get_feature_names_out, e.g. a fitted TfidfVectorizer).
# kmeans_model: fitted clustering model (must expose .predict and
#   .cluster_centers_). Both are loaded once at app start-up; a missing
#   .pkl file raises here and the app fails fast — TODO confirm that is
#   the desired behavior vs. a st.error message.
vectorizer = joblib.load('vectorizer_model.pkl')
kmeans_model = joblib.load('kmeans_model-1.pkl')

# Initialize the preprocessor (text cleaning for raw job titles)
preprocessor = JobTitlePreprocessor()

# Streamlit app title
st.title("Job Title Clustering App")

# Display model insights in the sidebar.
# Fixed grammar/capitalization of the stat labels so they read
# consistently with "Silhouette Score".
st.sidebar.header("Insights")  # This creates a big heading in the sidebar
st.sidebar.write("Silhouette Score: 0.5840")
st.sidebar.write("Number of unique titles: 6000")
st.sidebar.write("Number of clusters: 40")


# Input fields for the two job titles to compare
job_title_1 = st.text_input("Enter the first job title:")
job_title_2 = st.text_input("Enter the second job title:")

def get_top_words(cluster, vectorizer, kmeans_model, n_words=5):
    """Return the highest-weighted feature names for a cluster.

    Hoisted to module level (it was re-defined inside the click handler
    on every submit) and generalized with an ``n_words`` parameter that
    defaults to the original hard-coded 5.

    Args:
        cluster: Integer cluster index into ``cluster_centers_``.
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out``.
        kmeans_model: Fitted model exposing ``cluster_centers_``.
        n_words: How many top-weighted terms to return.

    Returns:
        List of the ``n_words`` feature names with the largest weights
        in the cluster's centroid, in descending weight order.
    """
    feature_names = vectorizer.get_feature_names_out()
    # argsort ascending, reverse, then take the first n_words indices
    top_word_indices = np.argsort(kmeans_model.cluster_centers_[cluster])[::-1][:n_words]
    return [feature_names[i] for i in top_word_indices]


# Button to process the inputs
if st.button("Submit"):
    if not job_title_1 or not job_title_2:
        st.error("Please enter both job titles.")
    else:
        try:
            # Preprocess the input job titles
            clean_title_1 = preprocessor.preprocess(job_title_1)
            clean_title_2 = preprocessor.preprocess(job_title_2)

            # Log the preprocessed titles (lazy %-args: formatted only
            # when the INFO level is actually emitted)
            logger.info("Preprocessed Title 1: %s", clean_title_1)
            logger.info("Preprocessed Title 2: %s", clean_title_2)

            # Vectorize both titles and predict their clusters in one
            # batched call instead of two round trips; transform/predict
            # are stateless on fitted models so results are identical.
            title_vectors = vectorizer.transform([clean_title_1, clean_title_2])
            cluster_1, cluster_2 = kmeans_model.predict(title_vectors)

            # Display results
            st.write(f"Cluster for '{job_title_1}': {cluster_1}")
            st.write(f"Cluster for '{job_title_2}': {cluster_2}")

            if cluster_1 == cluster_2:
                st.success(f"The job titles '{job_title_1}' and '{job_title_2}' belong to the same cluster!")
            else:
                st.warning(f"The job titles '{job_title_1}' and '{job_title_2}' do not belong to the same cluster.")

            # Display the most representative words for each predicted cluster
            top_words_1 = get_top_words(cluster_1, vectorizer, kmeans_model)
            top_words_2 = get_top_words(cluster_2, vectorizer, kmeans_model)

            st.write(f"Top words in Cluster {cluster_1}: {', '.join(top_words_1)}")
            st.write(f"Top words in Cluster {cluster_2}: {', '.join(top_words_2)}")

        except Exception as e:
            # logger.exception is the idiomatic form of
            # logger.error(..., exc_info=True) inside an except block
            logger.exception("Error occurred: %s", e)
            st.error(f"An error occurred: {e}")