Spaces:
Sleeping
Sleeping
File size: 3,119 Bytes
5b8c994 cc89cb7 0d1cf3d cc89cb7 5b8c994 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 |
import streamlit as st
import joblib
import numpy as np
import logging
from processing import JobTitlePreprocessor # Import your preprocessor class
# Configure logging so preprocessing details and errors reach the server log.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


@st.cache_resource
def _load_models():
    """Load the fitted vectorizer and KMeans model from disk exactly once.

    Streamlit re-executes this script from top to bottom on every widget
    interaction. Caching the loaded objects avoids re-reading and
    re-unpickling both ``.pkl`` files on each rerun.

    NOTE(review): ``joblib.load`` unpickles arbitrary Python objects, so these
    files must only ever come from a trusted source.

    Returns:
        tuple: ``(vectorizer, kmeans_model)`` as stored in the two files.
    """
    return (
        joblib.load('vectorizer_model.pkl'),
        joblib.load('kmeans_model-1.pkl'),
    )


# The module-level names are kept because the rest of the script uses them.
vectorizer, kmeans_model = _load_models()

# The preprocessor is built on every run, as before; whether it is safe to
# share across sessions cannot be told from this file, so it is not cached.
preprocessor = JobTitlePreprocessor()
# Page heading.
st.title("Job Title Clustering App")

# Sidebar: a heading followed by hard-coded summary figures.
with st.sidebar:
    st.header("Insights")
    for insight_line in (
        "Silhouette Score: 0.5840",
        "number of unique title: 6000",
        "number of cluster: 40",
    ):
        st.write(insight_line)

# Two free-text fields, created in order, one per job title to compare.
job_title_1, job_title_2 = (
    st.text_input(prompt)
    for prompt in (
        "Enter the first job title:",
        "Enter the second job title:",
    )
)
def get_top_words(cluster, vectorizer, kmeans_model, n=5):
    """Return the ``n`` highest-weighted vocabulary terms of a cluster centre.

    Args:
        cluster: Index into ``kmeans_model.cluster_centers_``.
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()``.
        kmeans_model: Fitted model exposing ``cluster_centers_``.
        n: Number of terms to return (defaults to 5, the previous fixed value).

    Returns:
        list: Terms ordered from highest to lowest centroid weight.
    """
    feature_names = vectorizer.get_feature_names_out()
    # argsort is ascending, so reverse it before taking the first n indices.
    top_word_indices = np.argsort(kmeans_model.cluster_centers_[cluster])[::-1][:n]
    return [feature_names[i] for i in top_word_indices]


# Button to process the inputs
if st.button("Submit"):
    # Whitespace-only input is as good as no input, so trim before checking.
    title_1 = job_title_1.strip()
    title_2 = job_title_2.strip()

    if not title_1 or not title_2:
        st.error("Please enter both job titles.")
    else:
        try:
            # Preprocess the input job titles
            clean_title_1 = preprocessor.preprocess(title_1)
            clean_title_2 = preprocessor.preprocess(title_2)

            # Log the preprocessed titles (lazy %-formatting)
            logger.info("Preprocessed Title 1: %s", clean_title_1)
            logger.info("Preprocessed Title 2: %s", clean_title_2)

            # Vectorize each title and predict its cluster
            cluster_1 = kmeans_model.predict(vectorizer.transform([clean_title_1]))[0]
            cluster_2 = kmeans_model.predict(vectorizer.transform([clean_title_2]))[0]

            # Display results
            st.write(f"Cluster for '{title_1}': {cluster_1}")
            st.write(f"Cluster for '{title_2}': {cluster_2}")

            same_cluster = cluster_1 == cluster_2
            if same_cluster:
                st.success(f"The job titles '{title_1}' and '{title_2}' belong to the same cluster!")
            else:
                st.warning(f"The job titles '{title_1}' and '{title_2}' do not belong to the same cluster.")

            # Display top words for the predicted clusters; when both titles
            # share a cluster, show its top words once instead of twice.
            clusters_to_show = (cluster_1,) if same_cluster else (cluster_1, cluster_2)
            for cluster in clusters_to_show:
                top_words = get_top_words(cluster, vectorizer, kmeans_model)
                st.write(f"Top words in Cluster {cluster}: {', '.join(top_words)}")
        except Exception as e:
            # Top-level UI boundary: record the traceback in the log and show
            # the message to the user rather than crashing the app.
            logger.exception("Error occurred: %s", e)
            st.error(f"An error occurred: {e}")
|