Spaces:

RAHULJUNEJA33
/

String_Similarity_Calibration-Models

Sleeping

App Files Files Community

String_Similarity_Calibration-Models / app.py

RAHULJUNEJA33

Update app.py

a729e45 verified 19 days ago

raw

history blame contribute delete

7.81 kB

	import streamlit as st
	import numpy as np
	import matplotlib.pyplot as plt
	from sklearn.calibration import calibration_curve, CalibratedClassifierCV
	from sklearn.linear_model import LogisticRegression
	from sklearn.model_selection import train_test_split
	from sklearn.metrics import accuracy_score
	from Levenshtein import distance as levenshtein_distance
	from textdistance import jaro_winkler, damerau_levenshtein, cosine
	from sklearn.feature_extraction.text import CountVectorizer
	from sklearn.preprocessing import normalize

	# -----------------------
	# 🎨 Streamlit App Layout
	# -----------------------

	# Main-panel heading for the whole app
	st.title("🔍 String Similarity & Model Calibration App")

	# Sidebar copy lives in named constants so the widget calls below stay short.
	ABOUT_TEXT = """
	This app explores two key concepts:

	1️⃣ String Similarity Models 📝
	- Compare words using different similarity algorithms.
	- Helps with spell checking, record linkage, and fuzzy matching.

	2️⃣ Model Calibration 📊
	- Evaluate how well a model’s probability predictions match reality.
	- Uses Platt Scaling & Isotonic Regression to improve predictions.
	"""

	HOW_IT_WORKS_TEXT = """
	- Levenshtein Distance: Counts how many edits are needed to turn one word into another.
	- Jaro-Winkler: Focuses on shared characters, especially at the start of words.
	- Damerau-Levenshtein: Similar to Levenshtein but also considers transpositions (changing letter order).
	- Cosine Similarity: Treats words as vectors (arrays of numbers) and compares their angle.
	- Q-Gram Similarity: Breaks words into small parts (n-grams) and compares them.

	Model Calibration
	- Checks how accurate a model’s probability predictions are.
	- Platt Scaling applies logistic regression for adjustment.
	- Isotonic Regression fine-tunes predictions using a flexible non-linear approach.
	"""

	# Labels offered by the task picker; the dispatch further down compares
	# `option` against these exact strings.
	TASKS = ["String Similarity", "Model Calibration"]

	sidebar = st.sidebar

	# Task picker
	sidebar.header("📌 Select an Option")
	option = sidebar.radio("Choose a Task:", TASKS)

	# Project overview
	sidebar.subheader("ℹ️ About This App")
	sidebar.write(ABOUT_TEXT)

	# Plain-language summary of each algorithm
	sidebar.subheader("🧠 How It Works?")
	sidebar.write(HOW_IT_WORKS_TEXT)

	# -----------------------
	# 1️⃣ STRING SIMILARITY MODELS
	# -----------------------
	def qgram_similarity(s1, s2, q=2):
	    """Return the cosine similarity of the character q-gram profiles of two strings.

	    Each string is broken into overlapping character n-grams of length ``q``
	    and the two resulting count vectors are compared.

	    Args:
	        s1: First string.
	        s2: Second string.
	        q: Length of the character n-grams (2 = bigrams).

	    Returns:
	        A float in [0, 1]. Identical q-gram profiles score 1.0; profiles
	        with no q-gram in common score 0.0.
	    """
	    vectorizer = CountVectorizer(analyzer='char', ngram_range=(q, q))
	    try:
	        profiles = vectorizer.fit_transform([s1, s2])
	    except ValueError:
	        # fit_transform raises when no q-gram can be extracted from either
	        # string (e.g. both are shorter than q). Fall back to a plain
	        # case-insensitive equality check instead of crashing the app.
	        return 1.0 if s1.lower() == s2.lower() else 0.0

	    # L2-normalise so the dot product of the two rows is a cosine similarity.
	    # With L1 normalisation a string compared with itself would not score 1.
	    profiles = normalize(profiles, norm='l2')
	    # `@` is an explicit matrix product regardless of the sparse container type.
	    return float((profiles @ profiles.T).toarray()[0, 1])


	if option == "String Similarity":
	    st.header("📝 String Similarity Calculator")

	    # The two words to compare
	    word1 = st.text_input("Enter First Word:", "AARUSH")
	    word2 = st.text_input("Enter Second Word:", "AASHVI")

	    # Scores are computed only when the button is clicked
	    if st.button("Compute Similarity"):
	        lev_dist = levenshtein_distance(word1, word2)
	        jaro_wink = jaro_winkler(word1, word2)
	        damerau_lev = damerau_levenshtein(word1, word2)
	        cosine_sim = cosine(word1, word2)
	        qgram_sim = qgram_similarity(word1, word2)

	        st.subheader("🔹 Similarity Scores")
	        st.write(f"Levenshtein Distance: {lev_dist}")
	        st.write(f"Jaro-Winkler Similarity: {jaro_wink:.4f}")
	        st.write(f"Damerau-Levenshtein Distance: {damerau_lev}")
	        st.write(f"Cosine Similarity: {cosine_sim:.4f}")
	        st.write(f"Q-Gram Similarity: {qgram_sim:.4f}")

	# -----------------------
	# 2️⃣ MODEL CALIBRATION (RELIABILITY DIAGRAM)
	# -----------------------
	elif option == "Model Calibration":
	    st.header("📊 Model Calibration & Reliability Diagram")

	    # Synthetic data: 1000 samples x 5 uniform features; the label depends
	    # only on the first two features. The fixed seed keeps the diagram
	    # identical across script re-runs.
	    np.random.seed(42)
	    X = np.random.rand(1000, 5)
	    y = (X[:, 0] + X[:, 1] > 1).astype(int)

	    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

	    # Baseline (uncalibrated) logistic regression
	    clf = LogisticRegression()
	    clf.fit(X_train, y_train)
	    y_prob = clf.predict_proba(X_test)[:, 1]

	    # Calibrated variants: sigmoid = Platt scaling, isotonic = isotonic regression
	    platt_scaling = CalibratedClassifierCV(clf, method='sigmoid')
	    iso_regression = CalibratedClassifierCV(clf, method='isotonic')
	    platt_scaling.fit(X_train, y_train)
	    iso_regression.fit(X_train, y_train)

	    y_prob_platt = platt_scaling.predict_proba(X_test)[:, 1]
	    y_prob_iso = iso_regression.predict_proba(X_test)[:, 1]

	    # Reliability curves: observed positive rate vs. mean predicted probability per bin
	    prob_true, prob_pred = calibration_curve(y_test, y_prob, n_bins=10)
	    prob_true_platt, prob_pred_platt = calibration_curve(y_test, y_prob_platt, n_bins=10)
	    prob_true_iso, prob_pred_iso = calibration_curve(y_test, y_prob_iso, n_bins=10)

	    fig, ax = plt.subplots(figsize=(8, 6))
	    ax.plot(prob_pred, prob_true, "s-", label="Uncalibrated Model")
	    ax.plot(prob_pred_platt, prob_true_platt, "o-", label="Platt Scaling")
	    ax.plot(prob_pred_iso, prob_true_iso, "d-", label="Isotonic Regression")
	    ax.plot([0, 1], [0, 1], "k--", label="Perfect Calibration")
	    ax.set_xlabel("Predicted Probability")
	    ax.set_ylabel("True Probability")
	    ax.legend()
	    ax.set_title("Calibration Curve (Reliability Diagram)")

	    st.pyplot(fig)
	    # The script is re-executed on every interaction; close the figure so
	    # pyplot's global registry does not accumulate one figure per re-run.
	    plt.close(fig)

	    # -----------------------
	    # 3️⃣ EVALUATE MODEL PERFORMANCE
	    # -----------------------

	    # Hard label predictions on the held-out split for each model
	    y_pred = clf.predict(X_test)
	    y_pred_platt = platt_scaling.predict(X_test)
	    y_pred_iso = iso_regression.predict(X_test)

	    st.subheader("🔹 Model Accuracy Scores:")
	    st.write(f"Uncalibrated Model Accuracy: {accuracy_score(y_test, y_pred):.4f}")
	    st.write(f"Platt Scaled Model Accuracy: {accuracy_score(y_test, y_pred_platt):.4f}")
	    st.write(f"Isotonic Regression Model Accuracy: {accuracy_score(y_test, y_pred_iso):.4f}")