Spaces:
Sleeping
Sleeping
import streamlit as st | |
import joblib | |
import numpy as np | |
import logging | |
from processing import JobTitlePreprocessor # Import your preprocessor class | |
# Configure logging so preprocessing details and errors reach the app logs.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


# Streamlit re-executes this script on every widget interaction; without a
# cache the pickles would be re-read from disk on each rerun.
# NOTE(review): st.cache_resource requires Streamlit >= 1.18 — confirm the
# pinned version before deploying.
@st.cache_resource(show_spinner=False)
def _load_artifact(path):
    """Load a joblib-serialised artifact, reusing it across script reruns.

    Args:
        path: Filesystem path of the ``.pkl`` file to load.

    Returns:
        The object stored in the file. The same instance is handed back on
        later reruns, so a changed file is only picked up after the cache is
        cleared or the server restarts.
    """
    # joblib.load unpickles arbitrary objects: only point this at trusted
    # files shipped with the app, never at user-supplied uploads.
    return joblib.load(path)


# Load the pre-trained models
vectorizer = _load_artifact('vectorizer_model.pkl')
kmeans_model = _load_artifact('kmeans_model-1.pkl')

# Initialize the preprocessor
preprocessor = JobTitlePreprocessor()
# Page heading shown at the top of the main area.
st.title("Job Title Clustering App")

# Sidebar "Insights" panel: a header followed by one line per figure.
# The figures are static text; nothing here is computed from the loaded model.
st.sidebar.header("Insights")
for insight_line in (
    "Silhouette Score: 0.5840",
    "number of unique title: 6000",
    "number of cluster: 40",
):
    st.sidebar.write(insight_line)
def get_top_words(cluster, vectorizer, kmeans_model, top_n=5):
    """Return the highest-weighted vocabulary terms of one cluster centroid.

    Args:
        cluster: Index of the cluster in ``kmeans_model.cluster_centers_``.
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()``.
        kmeans_model: Fitted model exposing ``cluster_centers_``.
        top_n: How many terms to return (defaults to 5).

    Returns:
        A list of up to ``top_n`` feature names, ordered from the largest
        centroid weight to the smallest.
    """
    feature_names = vectorizer.get_feature_names_out()
    # argsort is ascending, so reverse it to put the largest weights first.
    ranked_indices = np.argsort(kmeans_model.cluster_centers_[cluster])[::-1]
    return [feature_names[i] for i in ranked_indices[:top_n]]


def main():
    """Render the two-title form and report whether the titles share a cluster.

    Reads the module-level ``preprocessor``, ``vectorizer``, ``kmeans_model``
    and ``logger``. Any failure while processing is logged with its traceback
    and shown to the user as an error message.
    """
    # Input fields for job titles. Strip surrounding whitespace so that an
    # input made only of spaces is treated as missing instead of clustered.
    job_title_1 = st.text_input("Enter the first job title:").strip()
    job_title_2 = st.text_input("Enter the second job title:").strip()

    # Nothing to process until the button is pressed on this rerun.
    if not st.button("Submit"):
        return

    if not job_title_1 or not job_title_2:
        st.error("Please enter both job titles.")
        return

    try:
        # Preprocess the input job titles
        clean_title_1 = preprocessor.preprocess(job_title_1)
        clean_title_2 = preprocessor.preprocess(job_title_2)

        # Log the preprocessed titles (lazy %-args: formatted only if emitted)
        logger.info("Preprocessed Title 1: %s", clean_title_1)
        logger.info("Preprocessed Title 2: %s", clean_title_2)

        # Vectorize the preprocessed job titles
        title_vector_1 = vectorizer.transform([clean_title_1])
        title_vector_2 = vectorizer.transform([clean_title_2])

        # Predict clusters for each job title
        cluster_1 = kmeans_model.predict(title_vector_1)[0]
        cluster_2 = kmeans_model.predict(title_vector_2)[0]

        # Display results
        st.write(f"Cluster for '{job_title_1}': {cluster_1}")
        st.write(f"Cluster for '{job_title_2}': {cluster_2}")
        if cluster_1 == cluster_2:
            st.success(f"The job titles '{job_title_1}' and '{job_title_2}' belong to the same cluster!")
        else:
            st.warning(f"The job titles '{job_title_1}' and '{job_title_2}' do not belong to the same cluster.")

        # Display top words for the predicted clusters
        top_words_1 = get_top_words(cluster_1, vectorizer, kmeans_model)
        top_words_2 = get_top_words(cluster_2, vectorizer, kmeans_model)
        st.write(f"Top words in Cluster {cluster_1}: {', '.join(top_words_1)}")
        st.write(f"Top words in Cluster {cluster_2}: {', '.join(top_words_2)}")
    except Exception as e:
        # Top-level UI boundary: record the full traceback, then surface the
        # message in the page instead of crashing the app.
        logger.exception("Error occurred: %s", e)
        st.error(f"An error occurred: {e}")


# Streamlit executes the script as ``__main__``, so the UI still renders on
# every run, while a plain import of this module does not touch the UI.
if __name__ == "__main__":
    main()