|
import streamlit as st |
|
import numpy as np |
|
import matplotlib.pyplot as plt |
|
from sklearn.calibration import calibration_curve, CalibratedClassifierCV |
|
from sklearn.linear_model import LogisticRegression |
|
from sklearn.model_selection import train_test_split |
|
from sklearn.metrics import accuracy_score |
|
from Levenshtein import distance as levenshtein_distance |
|
from textdistance import jaro_winkler, damerau_levenshtein, cosine |
|
from sklearn.feature_extraction.text import CountVectorizer |
|
from sklearn.preprocessing import normalize |
|
|
|
|
|
|
|
|
|
|
|
|
|
# --- Page title and sidebar -------------------------------------------------
# Everything here runs at import time, top to bottom; the widget call order
# below is the order in which the elements appear in the app.

# Markdown shown under the "About This App" sidebar heading.
_ABOUT_TEXT = """
This app explores two key concepts:

**1️⃣ String Similarity Models** 📝
- Compare words using different similarity algorithms.
- Helps with **spell checking, record linkage, and fuzzy matching**.

**2️⃣ Model Calibration** 📊
- Evaluate how well a model’s probability predictions match reality.
- Uses **Platt Scaling & Isotonic Regression** to improve predictions.
"""

# Markdown shown under the "How It Works?" sidebar heading.
_HOW_IT_WORKS_TEXT = """
- **Levenshtein Distance**: Counts how many edits are needed to turn one word into another.
- **Jaro-Winkler**: Focuses on shared characters, especially at the start of words.
- **Damerau-Levenshtein**: Similar to Levenshtein but also considers transpositions (changing letter order).
- **Cosine Similarity**: Treats words as vectors (arrays of numbers) and compares their angle.
- **Q-Gram Similarity**: Breaks words into small parts (n-grams) and compares them.

**Model Calibration**
- Checks how accurate a model’s probability predictions are.
- **Platt Scaling** applies logistic regression for adjustment.
- **Isotonic Regression** fine-tunes predictions using a flexible non-linear approach.
"""

# The two tasks the user can switch between; the selected label is compared
# against these exact strings further down the script.
_TASK_CHOICES = ["String Similarity", "Model Calibration"]

st.title("🔍 String Similarity & Model Calibration App")

sidebar = st.sidebar

# Task selector: `option` holds the label of the chosen task.
sidebar.header("📌 Select an Option")
option = sidebar.radio("Choose a Task:", _TASK_CHOICES)

# Static explanatory copy.
sidebar.subheader("ℹ️ About This App")
sidebar.write(_ABOUT_TEXT)

sidebar.subheader("🧠 How It Works?")
sidebar.write(_HOW_IT_WORKS_TEXT)
|
|
|
|
|
|
|
|
|
def qgram_similarity(s1, s2, q=2):
    """Return the cosine similarity of two strings' character q-gram profiles.

    Each string is turned into a vector of character q-gram counts with
    ``CountVectorizer(analyzer="char", ngram_range=(q, q))`` (its other
    settings are left at the scikit-learn defaults), the vectors are scaled
    to unit L2 length, and their dot product is returned.

    Args:
        s1: First string.
        s2: Second string.
        q: Length of the character n-grams to compare. Must be >= 1.

    Returns:
        A float in [0, 1]. Identical strings that contain at least one
        q-gram score 1.0. If neither string yields any q-gram (for example
        both are shorter than ``q``), 1.0 is returned when the strings are
        equal ignoring case and 0.0 otherwise.

    Raises:
        ValueError: If ``q`` is smaller than 1.
    """
    if q < 1:
        raise ValueError("q must be a positive integer")

    vectorizer = CountVectorizer(analyzer="char", ngram_range=(q, q))
    try:
        profiles = vectorizer.fit_transform([s1, s2])
    except ValueError:
        # fit_transform raises when the vocabulary is empty, i.e. neither
        # string produced a single q-gram. Fall back to a plain comparison
        # instead of crashing the app on very short input.
        return 1.0 if s1.lower() == s2.lower() else 0.0

    # L2 (not L1) scaling is what makes the dot product a cosine similarity;
    # with L1 scaling a word compared with itself scored below 1.
    profiles = normalize(profiles, norm="l2")

    # `@` is an explicit matrix product; entry [0, 1] pairs s1 with s2.
    return float((profiles @ profiles.T).toarray()[0, 1])


def _render_string_similarity():
    """Render the word inputs and, once the button is pressed, the scores."""
    st.header("📝 String Similarity Calculator")

    word1 = st.text_input("Enter First Word:", "AARUSH")
    word2 = st.text_input("Enter Second Word:", "AASHVI")

    # Nothing is computed until the user asks for it.
    if not st.button("Compute Similarity"):
        return

    lev_dist = levenshtein_distance(word1, word2)
    jaro_wink = jaro_winkler(word1, word2)
    damerau_lev = damerau_levenshtein(word1, word2)
    cosine_sim = cosine(word1, word2)
    qgram_sim = qgram_similarity(word1, word2)

    st.subheader("🔹 Similarity Scores")
    st.write(f"**Levenshtein Distance:** {lev_dist}")
    st.write(f"**Jaro-Winkler Similarity:** {jaro_wink:.4f}")
    st.write(f"**Damerau-Levenshtein Distance:** {damerau_lev}")
    st.write(f"**Cosine Similarity:** {cosine_sim:.4f}")
    st.write(f"**Q-Gram Similarity:** {qgram_sim:.4f}")


def _render_model_calibration():
    """Train a demo classifier, calibrate it two ways, and show the results.

    Draws a reliability diagram comparing the uncalibrated model with its
    sigmoid- and isotonic-calibrated versions, followed by the test-set
    accuracy of each.
    """
    st.header("📊 Model Calibration & Reliability Diagram")

    # Synthetic data: 1000 samples, 5 uniform features; the label is 1 when
    # the first two features sum to more than 1. Seeded for repeatability.
    np.random.seed(42)
    X = np.random.rand(1000, 5)
    y = (X[:, 0] + X[:, 1] > 1).astype(int)

    # Hold out 30% of the samples for evaluation.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )

    # Baseline (uncalibrated) model.
    clf = LogisticRegression()
    clf.fit(X_train, y_train)
    y_prob = clf.predict_proba(X_test)[:, 1]

    # Calibrated variants: method="sigmoid" is Platt scaling.
    platt_scaling = CalibratedClassifierCV(clf, method="sigmoid")
    iso_regression = CalibratedClassifierCV(clf, method="isotonic")
    platt_scaling.fit(X_train, y_train)
    iso_regression.fit(X_train, y_train)

    y_prob_platt = platt_scaling.predict_proba(X_test)[:, 1]
    y_prob_iso = iso_regression.predict_proba(X_test)[:, 1]

    # Observed positive rate vs. mean predicted probability, in 10 bins.
    prob_true, prob_pred = calibration_curve(y_test, y_prob, n_bins=10)
    prob_true_platt, prob_pred_platt = calibration_curve(
        y_test, y_prob_platt, n_bins=10
    )
    prob_true_iso, prob_pred_iso = calibration_curve(y_test, y_prob_iso, n_bins=10)

    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        ax.plot(prob_pred, prob_true, "s-", label="Uncalibrated Model")
        ax.plot(prob_pred_platt, prob_true_platt, "o-", label="Platt Scaling")
        ax.plot(prob_pred_iso, prob_true_iso, "d-", label="Isotonic Regression")
        ax.plot([0, 1], [0, 1], "k--", label="Perfect Calibration")
        ax.set_xlabel("Predicted Probability")
        ax.set_ylabel("True Probability")
        ax.legend()
        ax.set_title("Calibration Curve (Reliability Diagram)")

        st.pyplot(fig)
    finally:
        # pyplot keeps every figure it creates until it is closed, and this
        # code runs again on each script rerun; close it to avoid piling up
        # figures in memory.
        plt.close(fig)

    y_pred = clf.predict(X_test)
    y_pred_platt = platt_scaling.predict(X_test)
    y_pred_iso = iso_regression.predict(X_test)

    st.subheader("🔹 Model Accuracy Scores:")
    st.write(
        f"**Uncalibrated Model Accuracy:** {accuracy_score(y_test, y_pred):.4f}"
    )
    st.write(
        f"**Platt Scaled Model Accuracy:** {accuracy_score(y_test, y_pred_platt):.4f}"
    )
    st.write(
        f"**Isotonic Regression Model Accuracy:** {accuracy_score(y_test, y_pred_iso):.4f}"
    )


# Show the page for whichever task is selected in the sidebar.
if option == "String Similarity":
    _render_string_similarity()
elif option == "Model Calibration":
    _render_model_calibration()