Map-Data / utilities /cluster_label.py
akhil-vaidya's picture
feat: auto-label-embed-cluster
81ebdf3
raw
history blame contribute delete
975 Bytes
import numpy as numpy
import pandas as pd
import numpy as np
import tensorflow_hub as hub
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
# TF-Hub handle of the Universal Sentence Encoder used for all embeddings.
_USE_MODULE_URL = "https://tfhub.dev/google/universal-sentence-encoder/4"

# Lazily-populated, process-wide model instance; see embed().
_use_model = None


def embed(input):
    """Embed *input* with the Universal Sentence Encoder (v4) from TF-Hub.

    The model is loaded on the first call and reused afterwards, so
    repeated calls do not pay the model-loading cost again.

    Args:
        input: Passed straight to the loaded model. Presumably a batch
            (list / array) of strings -- TODO confirm against callers.

    Returns:
        Whatever the loaded model returns for *input* (unchanged).
    """
    global _use_model
    if _use_model is None:
        # Loading is expensive; do it once per process.
        _use_model = hub.load(_USE_MODULE_URL)
    return _use_model(input)
def generate_use_embeddings(data):
    """Return the sentence embeddings of *data* as plain Python lists.

    The result of ``embed(data)`` is converted to a NumPy array and then
    to nested lists via ``ndarray.tolist()``.
    """
    vectors = np.array(embed(data))
    return vectors.tolist()
def autogenerate_labels(df, *, n_clusters=4, random_state=42):
    """Cluster the ``'Map Data'`` column with k-means and attach labels.

    Each entry of ``df['Map Data']`` is embedded, the embeddings are
    standardized, and k-means is fitted on the standardized vectors.
    The resulting cluster index, shifted by one so labels start at 1,
    is written to ``df['label']``.

    Note that *df* is modified in place (the ``'label'`` column is added
    or overwritten) and the same object is returned.

    Args:
        df: DataFrame containing a ``'Map Data'`` column.
        n_clusters: Number of k-means clusters. Defaults to 4, the value
            that was previously hard-coded.
        random_state: Seed forwarded to ``KMeans`` for reproducible
            clustering. Defaults to 42, the previously hard-coded value.

    Returns:
        A ``(df, df_embeddings)`` tuple, where ``df_embeddings`` is a
        DataFrame of the raw (unscaled) embeddings.

    Raises:
        KeyError: If *df* has no ``'Map Data'`` column.
    """
    map_data = df['Map Data'].to_numpy()
    np_embeddings = np.array(generate_use_embeddings(map_data))

    # Built before scaling: callers receive the raw embedding vectors.
    df_embeddings = pd.DataFrame(np_embeddings)

    # Standardize each embedding dimension before clustering.
    scaled_embeddings = StandardScaler().fit_transform(np_embeddings)

    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    kmeans.fit(scaled_embeddings)

    # Shift the cluster indices by one so the stored labels start at 1.
    df['label'] = kmeans.labels_ + 1
    return df, df_embeddings