"""
The implementation of Log Clustering model for anomaly detection.

Authors:
    LogPAI Team

Reference:
    [1] Qingwei Lin, Hongyu Zhang, Jian-Guang Lou, Yu Zhang, Xuewei Chen.
        Log Clustering based Problem Identification for Online Service
        Systems. International Conference on Software Engineering (ICSE),
        2016.
"""

import pprint

import numpy as np
from numpy import linalg as LA
from scipy.cluster.hierarchy import fcluster, linkage
from scipy.spatial.distance import pdist, squareform
from scipy.special import expit

try:
    from ..utils import metrics
except ImportError as _import_error:
    # The relative import only resolves when this module is loaded as part of
    # its package. Clustering and prediction do not need it, so the failure
    # is deferred to evaluate(), the only place metrics() is called.
    metrics = None
    _METRICS_IMPORT_ERROR = _import_error
else:
    _METRICS_IMPORT_ERROR = None


class LogClustering(object):
    """Cluster sample vectors and flag samples far from every cluster.

    Distances are one minus the cosine similarity (see `_distance_metric`).
    """

    def __init__(self, max_dist=0.3, anomaly_threshold=0.3, mode='online',
                 num_bootstrap_samples=1000):
        """
        Parameters
        ----------
            max_dist: float, the threshold to stop the clustering process
            anomaly_threshold: float, the threshold for anomaly detection
            mode: str, 'offline' or 'online' mode for clustering
            num_bootstrap_samples: int, online clustering starts with a
                bootstrapping process, which determines the initial cluster
                representatives offline using a subset of samples

        Attributes
        ----------
            representatives: list of 1-D ndarray, one representative vector
                per cluster
            cluster_size_dict: dict, maps the index of a cluster in
                `representatives` to its size, used to update
                representatives online
        """
        self.max_dist = max_dist
        self.anomaly_threshold = anomaly_threshold
        self.mode = mode
        self.num_bootstrap_samples = num_bootstrap_samples
        self.representatives = []
        self.cluster_size_dict = {}

    def fit(self, X):
        """Build the clusters from X, discarding any previously fitted state.

        Parameters
        ----------
            X: 2-D ndarray, one row per sample

        Raises
        ------
            ValueError: if `mode` is neither 'offline' nor 'online'
        """
        if self.mode not in ('offline', 'online'):
            # Without this check an unknown mode trains nothing and every
            # later prediction is reported as an anomaly.
            raise ValueError(
                "mode must be 'offline' or 'online', got {!r}".format(self.mode))

        # Start from a clean slate so that refitting does not stack new
        # clusters on top of the ones learned by an earlier call.
        self.representatives = []
        self.cluster_size_dict = {}

        print('====== Model summary ======')
        if self.mode == 'offline':
            # The offline mode can process about 10K samples only due to huge
            # memory consumption.
            self._offline_clustering(X)
            return

        # Bootstrapping phase
        if self.num_bootstrap_samples > 0:
            self._offline_clustering(X[0:self.num_bootstrap_samples, :])
        # Online learning phase
        if X.shape[0] > self.num_bootstrap_samples:
            self._online_clustering(X)

    def predict(self, X):
        """Label each row of X as normal (0) or anomalous (1).

        A row is anomalous when its distance to the closest representative
        exceeds `anomaly_threshold`. With no representatives that distance is
        infinite, so every row is labelled 1.

        Parameters
        ----------
            X: 2-D ndarray, one row per sample

        Returns
        -------
            y_pred: ndarray of float, shape (X.shape[0],), values 0 or 1
        """
        y_pred = np.zeros(X.shape[0])
        for i in range(X.shape[0]):
            min_dist, _ = self._get_min_cluster_dist(X[i, :])
            if min_dist > self.anomaly_threshold:
                y_pred[i] = 1
        return y_pred

    def evaluate(self, X, y_true):
        """Predict on X, then print and return the scores from `metrics`.

        Parameters
        ----------
            X: 2-D ndarray, one row per sample
            y_true: ground-truth labels, passed unchanged to `metrics`

        Returns
        -------
            precision, recall, f1: the three values returned by `metrics`

        Raises
        ------
            ImportError: if `metrics` could not be imported from the package
        """
        if metrics is None:
            raise _METRICS_IMPORT_ERROR
        print('====== Evaluation summary ======')
        y_pred = self.predict(X)
        precision, recall, f1 = metrics(y_pred, y_true)
        print('Precision: {:.3f}, recall: {:.3f}, F1-measure: {:.3f}\n'
              .format(precision, recall, f1))
        return precision, recall, f1

    def _offline_clustering(self, X):
        """Cluster all rows of X at once with complete-linkage clustering."""
        print('Starting offline clustering...')
        num_samples = X.shape[0]
        if num_samples >= 2:
            p_dist = pdist(X, metric=self._distance_metric)
            Z = linkage(p_dist, 'complete')
            cluster_index = fcluster(Z, self.max_dist, criterion='distance')
        else:
            # linkage() rejects an empty distance matrix, which is what
            # pdist() yields for fewer than two rows. A lone sample forms its
            # own cluster; no samples form no cluster.
            cluster_index = np.ones(num_samples, dtype=int)
        self._extract_representatives(X, cluster_index)
        print('Processed {} instances.'.format(X.shape[0]))
        print('Found {} clusters offline.\n'.format(len(self.representatives)))

    def _extract_representatives(self, X, cluster_index):
        """Append the mean vector of each labelled cluster as a representative.

        Parameters
        ----------
            X: 2-D ndarray, one row per sample
            cluster_index: 1-D array of cluster labels, one per row of X
        """
        cluster_index = np.asarray(cluster_index)
        for label in np.unique(cluster_index):
            member_idx = np.flatnonzero(cluster_index == label)
            # Key the size by the slot the representative is about to occupy
            # so both containers stay aligned even if the list is non-empty.
            self.cluster_size_dict[len(self.representatives)] = member_idx.shape[0]
            self.representatives.append(np.average(X[member_idx, :], axis=0))

    def _online_clustering(self, X):
        """Assign the rows after the bootstrap prefix to clusters one by one.

        A row within `max_dist` of its closest representative joins that
        cluster and shifts the representative towards it (running mean);
        otherwise the row starts a new cluster.
        """
        print("Starting online clustering...")
        # A negative bootstrap size would otherwise make the loop index rows
        # from the end of X.
        start = max(self.num_bootstrap_samples, 0)
        for i in range(start, X.shape[0]):
            if (i + 1) % 2000 == 0:
                print('Processed {} instances.'.format(i + 1))
            instance_vec = X[i, :]
            if len(self.representatives) > 0:
                min_dist, clu_id = self._get_min_cluster_dist(instance_vec)
                if min_dist <= self.max_dist:
                    self.cluster_size_dict[clu_id] += 1
                    current = self.representatives[clu_id]
                    self.representatives[clu_id] = current \
                        + (instance_vec - current) / self.cluster_size_dict[clu_id]
                    continue
            self.cluster_size_dict[len(self.representatives)] = 1
            # Store a copy: X[i, :] is a view, so keeping it would tie the
            # model to later changes of X and keep the whole array alive.
            self.representatives.append(np.array(instance_vec))
        print('Processed {} instances.'.format(X.shape[0]))
        print('Found {} clusters online.\n'.format(len(self.representatives)))

    def _distance_metric(self, x1, x2):
        """Return one minus the cosine similarity of x1 and x2.

        The small constant in the denominator avoids division by zero, and
        distances below 1e-8 are snapped to exactly 0.
        """
        norm = LA.norm(x1) * LA.norm(x2)
        distance = 1 - np.dot(x1, x2) / (norm + 1e-8)
        if distance < 1e-8:
            distance = 0
        return distance

    def _get_min_cluster_dist(self, instance_vec):
        """Find the representative closest to instance_vec.

        Returns
        -------
            min_dist: distance to the closest representative, or infinity
                when there are no representatives
            min_index: index of that representative, or -1 when there are
                none
        """
        min_index = -1
        min_dist = float('inf')
        for i, cluster_rep in enumerate(self.representatives):
            dist = self._distance_metric(instance_vec, cluster_rep)
            if dist < 1e-8:
                # Nothing can be closer than an exact match; stop early.
                min_dist = 0
                min_index = i
                break
            elif dist < min_dist:
                min_dist = dist
                min_index = i
        return min_dist, min_index