"""Streamlit app that compares the clusters predicted for two job titles.

The script loads a fitted vectorizer and a fitted clustering model from disk,
preprocesses the two titles typed by the user, predicts a cluster for each,
reports whether both fall in the same cluster, and lists the highest-weighted
vocabulary terms of each predicted cluster's centroid.
"""
import logging

import joblib
import numpy as np
import streamlit as st

from processing import JobTitlePreprocessor  # Import your preprocessor class

# Configure logging for errors
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load the pre-trained models.
# NOTE(review): joblib.load unpickles its input, so these files must come from
# a trusted source. Streamlit re-executes this script on every interaction, so
# both loads run again on each rerun; consider wrapping them in a cached loader
# (e.g. st.cache_resource) if the installed Streamlit version provides it.
vectorizer = joblib.load('vectorizer_model.pkl')
kmeans_model = joblib.load('kmeans_model-1.pkl')

# Initialize the preprocessor
preprocessor = JobTitlePreprocessor()


def get_top_words(cluster, vectorizer, kmeans_model, top_n=5):
    """Return the feature names with the largest centroid weights of a cluster.

    Args:
        cluster: Index of the cluster whose centroid is inspected.
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()``.
        kmeans_model: Fitted model exposing ``cluster_centers_``.
        top_n: Non-negative number of feature names to return (default 5).

    Returns:
        A list of at most ``top_n`` feature names, ordered from the largest
        centroid weight to the smallest.
    """
    feature_names = vectorizer.get_feature_names_out()
    # argsort is ascending, so reverse it to put the heaviest features first.
    ranked_indices = np.argsort(kmeans_model.cluster_centers_[cluster])[::-1]
    return [feature_names[i] for i in ranked_indices[:top_n]]


# Streamlit app title
st.title("Job Title Clustering App")

# Display model insights in the sidebar (values are hard-coded, not computed)
st.sidebar.header("Insights")  # This creates a big heading in the sidebar
st.sidebar.write("Silhouette Score: 0.5840")
st.sidebar.write("number of unique title: 6000")
st.sidebar.write("number of cluster: 40")

# Input fields for job titles
job_title_1 = st.text_input("Enter the first job title:")
job_title_2 = st.text_input("Enter the second job title:")

# Button to process the inputs
if st.button("Submit"):
    # Treat whitespace-only input the same as an empty field so that blank
    # titles never reach the preprocessor or the model.
    if not job_title_1.strip() or not job_title_2.strip():
        st.error("Please enter both job titles.")
    else:
        try:
            # Preprocess the input job titles
            clean_title_1 = preprocessor.preprocess(job_title_1)
            clean_title_2 = preprocessor.preprocess(job_title_2)

            # Log the preprocessed titles
            logger.info("Preprocessed Title 1: %s", clean_title_1)
            logger.info("Preprocessed Title 2: %s", clean_title_2)

            # Vectorize the preprocessed job titles
            title_vector_1 = vectorizer.transform([clean_title_1])
            title_vector_2 = vectorizer.transform([clean_title_2])

            # Predict clusters for each job title
            cluster_1 = kmeans_model.predict(title_vector_1)[0]
            cluster_2 = kmeans_model.predict(title_vector_2)[0]

            # Display results
            st.write(f"Cluster for '{job_title_1}': {cluster_1}")
            st.write(f"Cluster for '{job_title_2}': {cluster_2}")

            if cluster_1 == cluster_2:
                st.success(f"The job titles '{job_title_1}' and '{job_title_2}' belong to the same cluster!")
            else:
                st.warning(f"The job titles '{job_title_1}' and '{job_title_2}' do not belong to the same cluster.")

            # Display top words for the predicted clusters
            top_words_1 = get_top_words(cluster_1, vectorizer, kmeans_model)
            top_words_2 = get_top_words(cluster_2, vectorizer, kmeans_model)

            st.write(f"Top words in Cluster {cluster_1}: {', '.join(top_words_1)}")
            st.write(f"Top words in Cluster {cluster_2}: {', '.join(top_words_2)}")

        except Exception as e:
            # UI boundary: record the full traceback in the log and show the
            # user the error message instead of letting the script crash.
            logger.exception("Error occurred: %s", e)
            st.error(f"An error occurred: {e}")