|
"""Plot pandas.DataFrame with DBSCAN clustering.""" |
|
|
|
|
|
import pandas as pd |
|
import matplotlib.pyplot as plt |
|
import seaborn as sns |
|
from sklearn.cluster import DBSCAN |
|
|
|
from logzero import logger |
|
|
|
|
|
# Switch matplotlib to interactive mode when the name ``get_ipython`` is
# present in this module's global namespace (typically the case when the
# code is executed directly inside an IPython/Jupyter session).
if "get_ipython" in globals().keys():
    plt.ion()
|
|
|
|
|
|
|
def plot_df(
    df_: pd.DataFrame,
    min_samples: int = 6,
    eps: float = 10,
    ylim: int = None,
    xlabel: str = "en",
    ylabel: str = "zh",
) -> plt:
    """Plot a three-column frame and mark DBSCAN outliers.

    The input is coerced with ``pd.DataFrame``; its first three columns are
    kept and renamed to ``"x"``, ``"y"`` and ``"cos"``. DBSCAN is fitted on
    those three columns and two stacked scatter plots are drawn:

    * top axis: every point, coloured by ``cos``, with the points DBSCAN
      labelled as noise overlaid as red ``x`` markers;
    * bottom axis: only the points DBSCAN assigned to a cluster; the title
      reports their share of all points.

    Args:
        df_: data with at least three columns, interpreted positionally as
            x, y and cos. Anything accepted by ``pd.DataFrame`` works.
        min_samples: ``min_samples`` passed to ``DBSCAN``.
        eps: ``eps`` passed to ``DBSCAN``.
        ylim: upper y limit applied to both axes; skipped when falsy
            (``None`` or ``0``).
        xlabel: x-axis label of the bottom axis.
        ylabel: y-axis label of both axes.

    Returns:
        The ``matplotlib.pyplot`` module (for possible use in gradio), or
        ``None`` when the input has fewer than three columns.

    Example:
        plot_df(pd.DataFrame(cmat2tset(smat), columns=["x", "y", "cos"]))

    Note:
        The original author observed that sorting the rows by ``x``
        (``df_.sort_values("x", axis=0, ignore_index=True)``) does not seem
        to change the clustering result, so no sorting is done here.
    """
    df_ = pd.DataFrame(df_)
    if len(df_.columns) < 3:
        logger.error(
            "expected 3 columns DataFram, got: %s, cant proceed, returninng None",
            df_.columns.tolist(),
        )
        return None

    # Select the first three columns by position (robust to duplicate column
    # names) and copy so the rename below never touches the caller's frame.
    df_ = df_.iloc[:, :3].copy()
    df_.columns = ["x", "y", "cos"]

    sns.set()
    sns.set_style("darkgrid")
    fig, (ax0, ax1) = plt.subplots(2, figsize=(11.69, 8.27))
    fig.suptitle("alignment projection")

    # Fit once; DBSCAN labels noise samples -1 and clustered samples >= 0.
    labels = DBSCAN(min_samples=min_samples, eps=eps).fit(df_).labels_
    inliers = labels > -1
    outliers = ~inliers

    # Top axis: all points, then outliers overlaid as red crosses.
    df_.plot.scatter("x", "y", c="cos", cmap="viridis_r", ax=ax0)
    # Bottom axis: only the points that belong to some cluster.
    df_[inliers].plot.scatter("x", "y", c="cos", cmap="viridis_r", ax=ax1)
    df_[outliers].plot.scatter("x", "y", c="r", marker="x", alpha=0.6, ax=ax0)

    # Both axes share the same x range: 0 .. number of rows.
    xlim = len(df_)

    ax0.set_xlabel("")
    ax0.set_ylabel(ylabel)
    ax0.set_xlim(0, xlim)
    if ylim:
        ax0.set_ylim(0, ylim)
    ax0.set_title("max similarity along columns (outliers denoted by 'x')")

    ax1.set_xlabel(xlabel)
    ax1.set_ylabel(ylabel)
    ax1.set_xlim(0, xlim)
    if ylim:
        ax1.set_ylim(0, ylim)
    ax1.set_title(
        f"potential aligned pairs ({round(inliers.sum() / len(df_), 2):.0%})"
    )

    return plt
|
|