|
import networkx as nx |
|
from sklearn.cluster import HDBSCAN |
|
import matplotlib.pyplot as plt |
|
import numpy as np |
|
from sklearn.manifold import TSNE |
|
import umap |
|
from sklearn.cluster import KMeans |
|
from adjustText import adjust_text |
|
from constants import high_level_families, primary_families_branches |
|
|
|
|
|
def filter_languages_by_families(matrix, languages, families):
    """
    Restricts a pairwise matrix and its language list to the given families.

    Parameters:
    - matrix: 2D NumPy array whose rows and columns follow the order of `languages`.
    - languages: list of languages to filter; each one is looked up in
      `high_level_families`.
    - families: collection of family names to keep.

    Returns:
    - filtered_matrix: sub-matrix with only the rows and columns of the kept languages.
    - filtered_languages: list of languages that belong to the specified families,
      in their original order.
    """
    kept_indices = []
    kept_languages = []
    for index, language in enumerate(languages):
        if high_level_families[language] in families:
            kept_indices.append(index)
            kept_languages.append(language)

    # np.ix_ builds an open mesh so the same positions are selected on both axes.
    submatrix = matrix[np.ix_(kept_indices, kept_indices)]
    return submatrix, kept_languages
|
|
|
|
|
def get_dynamic_color_map(n_colors):
    """
    Builds a list of `n_colors` colors sampled from a matplotlib colormap.

    Parameters:
    - n_colors: int, the number of colors required.

    Returns:
    - color_map: list of `n_colors` color values, each obtained by calling the
      colormap at an evenly spaced position in [0, 1).
    """
    # "tab20" is used for up to 20 colors, "hsv" beyond that.
    palette_name = "tab20" if n_colors <= 20 else "hsv"
    palette = plt.get_cmap(palette_name)
    return [palette(step / n_colors) for step in range(n_colors)]
|
|
|
|
|
def cluster_languages_by_families(languages):
    """
    Assigns each language a cluster ID derived from its high-level family.

    Parameters:
    - languages: list of languages; each one is looked up in `high_level_families`.

    Returns:
    - clusters: list with, for each language, the position of its family in `legend`.
    - legend: sorted list of the distinct families present.
    """
    family_of = [high_level_families[language] for language in languages]
    legend = sorted(set(family_of))
    # Position lookup table: family -> index in the sorted legend.
    position = {family: idx for idx, family in enumerate(legend)}
    clusters = [position[family] for family in family_of]
    return clusters, legend
|
|
|
|
|
def cluster_languages_by_subfamilies(languages):
    """
    Assigns each language a cluster ID derived from its family and branch.

    The label of a language is its entry in `high_level_families` followed by
    its entry in `primary_families_branches` in parentheses.

    Parameters:
    - languages: list of languages; each one is looked up in both mappings.

    Returns:
    - clusters: list with, for each language, the position of its label in `legend`.
    - legend: sorted list of the distinct "family (branch)" labels present.
    """
    labels = []
    for language in languages:
        family = high_level_families[language]
        labels.append(family + f" ({primary_families_branches[language]})")

    legend = sorted(set(labels))
    # Position lookup table: label -> index in the sorted legend.
    position = {label: idx for idx, label in enumerate(legend)}
    clusters = [position[label] for label in labels]
    return clusters, legend
|
|
|
|
|
def plot_mst(
    matrix,
    languages,
    clusters,
    legend=None,
    fig_size=(20, 20),
):
    """
    Plots the Minimum Spanning Tree (MST) of a distance matrix, with nodes colored by cluster.

    Parameters:
    - matrix: 2D NumPy array (N x N) representing the pairwise distances between languages.
    - languages: list of length N containing the label for each node.
    - clusters: list of length N containing the cluster assignment (or ID) for each node.
    - legend: optional lookup from cluster ID to the label shown in the legend
      (a dict, or a list when cluster IDs are list positions); defaults to
      str(cluster ID).
    - fig_size: (width, height) of the figure, passed to plt.subplots.

    Returns:
    - fig: the matplotlib Figure containing the plot.
    """
    n_nodes = len(languages)

    graph = nx.Graph()
    # Register every node up front. Nodes that only enter the graph through an
    # edge are missing when there is a single language (no pairs exist), which
    # would make the position lookup below fail.
    graph.add_nodes_from(range(n_nodes))
    # Complete graph over the upper triangle of the distance matrix.
    graph.add_weighted_edges_from(
        (i, j, matrix[i, j])
        for i in range(n_nodes)
        for j in range(i + 1, n_nodes)
    )

    mst = nx.minimum_spanning_tree(graph)

    # Lay out the tree so that drawn distances follow the edge weights.
    pos = nx.kamada_kawai_layout(mst, weight="weight")

    # One color per distinct cluster, in sorted cluster order.
    unique_clusters = sorted(set(clusters))
    cmap = get_dynamic_color_map(len(unique_clusters))
    cluster_colors = dict(zip(unique_clusters, cmap))
    node_colors = [cluster_colors[cluster] for cluster in clusters]

    fig, ax = plt.subplots(figsize=fig_size)

    nx.draw_networkx_edges(mst, pos, edge_color="gray", ax=ax)

    # The explicit nodelist ties node_colors[i] to node i instead of relying on
    # the graph's internal node order.
    nx.draw_networkx_nodes(
        mst,
        pos,
        nodelist=list(range(n_nodes)),
        node_color=node_colors,
        node_size=100,
        ax=ax,
        alpha=0.7,
    )

    texts = []
    for i, label in enumerate(languages):
        x, y = pos[i]
        texts.append(ax.text(x, y, label, fontsize=10))

    # Nudge the labels apart so they do not overlap.
    adjust_text(texts, expand_text=(1.05, 1.2))

    if legend is None:
        legend = {cluster: str(cluster) for cluster in unique_clusters}
    legend_handles = [
        plt.Line2D(
            [0],
            [0],
            marker="o",
            color="w",
            markerfacecolor=cluster_colors[cluster],
            markersize=10,
            alpha=0.7,
            label=legend[cluster],
        )
        for cluster in unique_clusters
    ]
    ax.legend(handles=legend_handles, title="Clusters", loc="best")

    ax.axis("off")

    return fig
|
|
|
|
|
def cluster_languages_kmeans(dist_matrix, languages, n_clusters=5):
    """
    Clusters languages by running KMeans on the distance matrix.

    The matrix is handed to KMeans as-is, so each row is treated as the
    feature vector of one language.

    Parameters:
    - dist_matrix: 2D NumPy array (N x N) representing the pairwise distances between languages.
    - languages: list of languages; returned unchanged.
    - n_clusters: int, the number of clusters to form.

    Returns:
    - dist_matrix: the input distance matrix, unchanged.
    - languages: the input languages, unchanged.
    - clusters: cluster assignment for each row of `dist_matrix`.
    """
    # Fixed seed keeps the assignments reproducible between runs.
    assignments = KMeans(n_clusters=n_clusters, random_state=23).fit_predict(
        dist_matrix
    )
    return dist_matrix, languages, assignments
|
|
|
|
|
def cluster_languages_hdbscan(dist_matrix, languages, min_cluster_size=2):
    """
    Clusters languages with HDBSCAN on a precomputed distance matrix and
    drops the languages labelled as noise.

    Parameters:
    - dist_matrix: 2D NumPy array (N x N) representing the pairwise distances between languages.
    - languages: list of length N with the language for each row of `dist_matrix`.
    - min_cluster_size: int, the minimum size of clusters.

    Returns:
    - filtered_matrix: 2D NumPy array restricted to the languages that were clustered.
    - filtered_languages: NumPy array of the languages that were clustered.
    - filtered_clusters: NumPy array with the cluster ID of each kept language.
    """
    model = HDBSCAN(metric="precomputed", min_cluster_size=min_cluster_size)
    assignments = model.fit_predict(dist_matrix)

    # Keep only the positions whose label is not -1 (HDBSCAN's noise label).
    kept = np.flatnonzero(assignments != -1)

    kept_matrix = dist_matrix[np.ix_(kept, kept)]
    kept_languages = np.array(languages)[kept]
    kept_clusters = np.array(assignments)[kept]
    return kept_matrix, kept_languages, kept_clusters
|
|
|
|
|
def plot_distances_tsne(
    matrix,
    languages,
    clusters,
    legend=None,
    fig_size=(16, 12),
):
    """
    Plots all languages from the distances matrix using t-SNE and colors them by clusters.

    Parameters:
    - matrix: 2D NumPy array (N x N) of precomputed pairwise distances between languages.
    - languages: list of length N containing the label for each point.
    - clusters: list of length N containing the cluster assignment (or ID) for each point.
    - legend: optional lookup from cluster ID to the label shown in the legend
      (a dict, or a list when cluster IDs are list positions); defaults to
      str(cluster ID).
    - fig_size: (width, height) of the figure, passed to plt.subplots.

    Returns:
    - fig: the matplotlib Figure containing the plot.
    """
    n_samples = np.shape(matrix)[0]
    # scikit-learn rejects perplexity >= n_samples. 30 is TSNE's default, so
    # larger inputs behave exactly as before, while inputs with 30 or fewer
    # languages no longer raise.
    perplexity = min(30.0, max(n_samples - 1, 1))

    # A precomputed metric requires init="random" (PCA init needs raw features).
    tsne = TSNE(
        n_components=2,
        random_state=23,
        metric="precomputed",
        init="random",
        perplexity=perplexity,
    )
    tsne_results = tsne.fit_transform(matrix)

    # One color per distinct cluster, in sorted cluster order.
    unique_clusters = sorted(set(clusters))
    cmap = get_dynamic_color_map(len(unique_clusters))
    cluster_colors = {cluster: cmap[i] for i, cluster in enumerate(unique_clusters)}

    fig, ax = plt.subplots(figsize=fig_size)
    ax.scatter(
        tsne_results[:, 0],
        tsne_results[:, 1],
        c=[cluster_colors[cluster] for cluster in clusters],
        alpha=0.7,
    )

    texts = []
    for i, label in enumerate(languages):
        x, y = tsne_results[i, 0], tsne_results[i, 1]
        texts.append(ax.text(x, y, label, fontsize=10))

    # Nudge the labels apart so they do not overlap.
    adjust_text(texts, expand_text=(1.05, 1.2))

    if legend is None:
        legend = {cluster: str(cluster) for cluster in unique_clusters}
    legend_handles = [
        plt.Line2D(
            [0],
            [0],
            marker="o",
            color="w",
            markerfacecolor=cluster_colors[cluster],
            markersize=10,
            label=legend[cluster],
        )
        for cluster in unique_clusters
    ]
    ax.legend(handles=legend_handles, title="Clusters", loc="best")

    ax.axis("off")
    return fig
|
|
|
|
|
def plot_distances_umap(
    matrix,
    languages,
    clusters,
    legend=None,
    fig_size=(16, 12),
):
    """
    Plots all languages from the distances matrix using UMAP and colors them by clusters.

    Parameters:
    - matrix: 2D NumPy array (N x N) of precomputed pairwise distances between languages.
    - languages: list of length N containing the label for each point.
    - clusters: list of length N containing the cluster assignment (or ID) for each point.
    - legend: optional lookup from cluster ID to the label shown in the legend
      (a dict, or a list when cluster IDs are list positions); defaults to
      str(cluster ID).
    - fig_size: (width, height) of the figure, passed to plt.subplots.

    Returns:
    - fig: the matplotlib Figure containing the plot.
    """
    # Fixed seed keeps the embedding reproducible between runs.
    embedding = umap.UMAP(metric="precomputed", random_state=23).fit_transform(matrix)

    # One color per distinct cluster, in sorted cluster order.
    cluster_ids = sorted(set(clusters))
    palette = get_dynamic_color_map(len(cluster_ids))
    color_of = dict(zip(cluster_ids, palette))
    point_colors = [color_of[cluster] for cluster in clusters]

    fig, ax = plt.subplots(figsize=fig_size)
    ax.scatter(embedding[:, 0], embedding[:, 1], c=point_colors, alpha=0.7)

    annotations = [
        ax.text(embedding[idx, 0], embedding[idx, 1], name, fontsize=10)
        for idx, name in enumerate(languages)
    ]

    # Nudge the labels apart so they do not overlap.
    adjust_text(annotations, expand_text=(1.05, 1.2))

    if legend is None:
        legend = {cluster: str(cluster) for cluster in cluster_ids}

    handles = []
    for cluster in cluster_ids:
        marker = plt.Line2D(
            [0],
            [0],
            marker="o",
            color="w",
            markerfacecolor=color_of[cluster],
            markersize=10,
            label=legend[cluster],
        )
        handles.append(marker)
    ax.legend(handles=handles, title="Clusters", loc="best")

    ax.axis("off")
    return fig
|
|