import streamlit as st                                                       # Web app framework
import numpy as np                                                           # Numerical operations
import matplotlib.pyplot as plt                                              # Plotting graphs
import pandas as pd                                                          # Data handling (currently unused; kept for compatibility)
from sklearn.calibration import calibration_curve, CalibratedClassifierCV   # Model calibration
from sklearn.linear_model import LogisticRegression                         # Logistic regression model
from sklearn.model_selection import train_test_split                        # Splitting dataset
from sklearn.metrics import accuracy_score                                  # Evaluating model accuracy
from sklearn.feature_extraction.text import CountVectorizer                 # Converting text to numerical format
from sklearn.preprocessing import normalize                                 # Normalizing numerical data
from Levenshtein import distance as levenshtein_distance                    # Levenshtein distance metric
from textdistance import jaro_winkler, damerau_levenshtein, cosine          # Other similarity metrics


def qgram_similarity(s1, s2, q=2):
    """Return the q-gram similarity between two strings.

    Each string is turned into a vector of character q-gram counts, the
    vectors are L1-normalized (each row sums to 1), and the similarity is
    the dot product of the two normalized vectors — a value in [0, 1].

    Args:
        s1: First string to compare.
        s2: Second string to compare.
        q:  Size of the character n-grams (default 2, i.e. bigrams).

    Returns:
        float: similarity score between ``s1`` and ``s2``.
    """
    vectorizer = CountVectorizer(analyzer='char', ngram_range=(q, q))
    grams = vectorizer.fit_transform([s1, s2])   # 2 x n_grams sparse count matrix
    grams = normalize(grams, norm='l1')          # each row now sums to 1
    # Use explicit matrix multiplication (`@`) rather than `*`: for scipy
    # sparse *matrices* `*` is matmul, but for the newer sparse *arrays* it
    # is elementwise — `@` is unambiguous in both cases.
    return (grams @ grams.T).toarray()[0, 1]     # off-diagonal entry = cross-similarity


# -----------------------
# 🎨 STREAMLIT APP LAYOUT
# -----------------------
st.title("🔍 String Similarity & Model Calibration App")   # Main title
st.sidebar.header("📌 Select an Option")                   # Sidebar header
option = st.sidebar.radio("Choose a Task:", ["String Similarity", "Model Calibration"])  # User selection

# -----------------------
# ℹ️ INFORMATION SECTION (For non-technical users)
# -----------------------
st.sidebar.subheader("ℹ️ About This App")
st.sidebar.write(
    """
    This app provides two key functionalities:

    **1️⃣ String Similarity** 📝
    - Used in **spell checking, data matching, and fuzzy search**.

    **2️⃣ Model Calibration** 📊
    - Helps improve the **reliability of probability predictions** from ML models.

    📌 **Project Repository:** 👉 [RAHULJUNEJA33/String_Similarity_Calibration-Models](https://github.com/RAHULJUNEJA33/String_Similarity_Calibration-Models)
    """
)

# -----------------------
# 1️⃣ STRING SIMILARITY MODELS
# -----------------------
if option == "String Similarity":
    st.header("📝 String Similarity Calculator")  # Section header

    # User inputs: two words to compare (defaults are the classic
    # Jaro-Winkler example pair MARTHA / MARHTA).
    word1 = st.text_input("Enter First Word:", "MARTHA")
    word2 = st.text_input("Enter Second Word:", "MARHTA")

    if st.button("Compute Similarity"):  # Compute similarity when button is clicked
        # Edit-distance metrics (lower = more similar) ...
        lev_dist = levenshtein_distance(word1, word2)     # Levenshtein distance
        damerau_lev = damerau_levenshtein(word1, word2)   # Damerau-Levenshtein distance
        # ... and similarity metrics (higher = more similar).
        jaro_wink = jaro_winkler(word1, word2)            # Jaro-Winkler similarity
        cosine_sim = cosine(word1, word2)                 # Cosine similarity
        qgram_sim = qgram_similarity(word1, word2)        # Q-gram similarity (see helper above)

        # Display results
        st.subheader("🔹 Similarity Scores")
        st.write(f"**Levenshtein Distance:** {lev_dist}")
        st.write(f"**Jaro-Winkler Similarity:** {jaro_wink:.4f}")
        st.write(f"**Damerau-Levenshtein Distance:** {damerau_lev}")
        st.write(f"**Cosine Similarity:** {cosine_sim:.4f}")
        st.write(f"**Q-Gram Similarity:** {qgram_sim:.4f}")

# -----------------------
# 2️⃣ MODEL CALIBRATION (RELIABILITY DIAGRAM)
# -----------------------
elif option == "Model Calibration":
    st.header("📊 Model Calibration & Reliability Diagram")  # Section header

    # Generate a synthetic dataset: 1000 samples with 5 random features;
    # the label is 1 when the sum of the first two features exceeds 1.
    np.random.seed(42)  # Seed for reproducibility
    X = np.random.rand(1000, 5)
    y = (X[:, 0] + X[:, 1] > 1).astype(int)

    # Split data into training and testing sets (70%-30%)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Train a logistic regression model and get uncalibrated probabilities.
    clf = LogisticRegression()
    clf.fit(X_train, y_train)
    y_prob = clf.predict_proba(X_test)[:, 1]  # Probability scores for class 1

    # Apply model calibration (Platt scaling & isotonic regression).
    # NOTE(review): with the default cv, CalibratedClassifierCV refits clones
    # of `clf` via internal cross-validation on the same training data used
    # above — acceptable for a demo, but real projects should calibrate on a
    # held-out set (or pass an already-fitted estimator with cv="prefit").
    platt_scaling = CalibratedClassifierCV(clf, method='sigmoid')   # Platt scaling
    iso_regression = CalibratedClassifierCV(clf, method='isotonic') # Isotonic regression
    platt_scaling.fit(X_train, y_train)
    iso_regression.fit(X_train, y_train)
    y_prob_platt = platt_scaling.predict_proba(X_test)[:, 1]
    y_prob_iso = iso_regression.predict_proba(X_test)[:, 1]

    # Compute calibration curves (actual vs. predicted probabilities).
    prob_true, prob_pred = calibration_curve(y_test, y_prob, n_bins=10)
    prob_true_platt, prob_pred_platt = calibration_curve(y_test, y_prob_platt, n_bins=10)
    prob_true_iso, prob_pred_iso = calibration_curve(y_test, y_prob_iso, n_bins=10)

    # Plot calibration curves; the diagonal dashed line is perfect calibration.
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(prob_pred, prob_true, "s-", label="Uncalibrated Model")
    ax.plot(prob_pred_platt, prob_true_platt, "o-", label="Platt Scaling")
    ax.plot(prob_pred_iso, prob_true_iso, "d-", label="Isotonic Regression")
    ax.plot([0, 1], [0, 1], "k--", label="Perfect Calibration")  # Ideal case
    ax.set_xlabel("Predicted Probability")
    ax.set_ylabel("True Probability")
    ax.legend()
    ax.set_title("Calibration Curve (Reliability Diagram)")

    # Display plot in Streamlit
    st.pyplot(fig)

    # -----------------------
    # 3️⃣ EVALUATE MODEL PERFORMANCE
    # -----------------------
    # Calibration reshapes probabilities; hard-label accuracy may be
    # nearly unchanged — the three scores below make that visible.
    y_pred = clf.predict(X_test)                  # Predictions (uncalibrated)
    y_pred_platt = platt_scaling.predict(X_test)  # Predictions (Platt scaling)
    y_pred_iso = iso_regression.predict(X_test)   # Predictions (isotonic regression)

    # Display accuracy scores
    st.subheader("🔹 Model Accuracy Scores:")
    st.write(f"**Uncalibrated Model Accuracy:** {accuracy_score(y_test, y_pred):.4f}")
    st.write(f"**Platt Scaled Model Accuracy:** {accuracy_score(y_test, y_pred_platt):.4f}")
    st.write(f"**Isotonic Regression Model Accuracy:** {accuracy_score(y_test, y_pred_iso):.4f}")