Spaces:
Runtime error
Runtime error
""" | |
The implementation of Log Clustering model for anomaly detection.
Authors:
    LogPAI Team
Reference:
    [1] Qingwei Lin, Hongyu Zhang, Jian-Guang Lou, Yu Zhang, Xuewei Chen. Log Clustering
        based Problem Identification for Online Service Systems. International Conference
        on Software Engineering (ICSE), 2016.
""" | |
import numpy as np | |
import pprint | |
from scipy.special import expit | |
from numpy import linalg as LA | |
from scipy.cluster.hierarchy import linkage, fcluster | |
from scipy.spatial.distance import pdist, squareform | |
from ..utils import metrics | |
class LogClustering(object):
    """Cluster event-count vectors and flag instances far from every cluster.

    Clusters are summarised by a representative vector (the running mean of
    the members). An instance is reported as anomalous when its cosine
    distance to the closest representative exceeds ``anomaly_threshold``.
    """

    def __init__(self, max_dist=0.3, anomaly_threshold=0.3, mode='online', num_bootstrap_samples=1000):
        """
        Attributes
        ----------
            max_dist: float, the threshold to stop the clustering process
            anomaly_threshold: float, the threshold for anomaly detection
            mode: str, 'offline' or 'online' mode for clustering
            num_bootstrap_samples: int, online clustering starts with a bootstrapping process,
                which determines the initial cluster representatives offline using the first
                num_bootstrap_samples rows
            representatives: list of ndarray, one representative vector per cluster; the
                position in the list is the cluster id
            cluster_size_dict: dict, maps cluster id to the number of members, used to update
                representatives online
        """
        self.max_dist = max_dist
        self.anomaly_threshold = anomaly_threshold
        self.mode = mode
        self.num_bootstrap_samples = num_bootstrap_samples
        self.representatives = list()
        self.cluster_size_dict = dict()

    def fit(self, X):
        """Build the cluster representatives from X.

        Arguments
        ---------
            X: 2-D array indexed as X[row, :], one instance per row

        Raises
        ------
            ValueError: if ``self.mode`` is neither 'offline' nor 'online'

        Clusters found here are appended to any clusters already held by the
        model; existing representatives are not discarded.
        """
        if self.mode not in ('offline', 'online'):
            # An unknown mode used to be a silent no-op that left the model empty.
            raise ValueError(
                "mode must be 'offline' or 'online', got {!r}".format(self.mode))

        print('====== Model summary ======')
        if self.mode == 'offline':
            # The offline mode can process about 10K samples only due to huge memory consumption.
            self._offline_clustering(X)
            return

        # Bootstrapping phase
        if self.num_bootstrap_samples > 0:
            X_bootstrap = X[0:self.num_bootstrap_samples, :]
            self._offline_clustering(X_bootstrap)
        # Online learning phase
        if X.shape[0] > self.num_bootstrap_samples:
            self._online_clustering(X)

    def predict(self, X):
        """Return an array with 1.0 for anomalous rows of X and 0.0 otherwise.

        A row is anomalous when its distance to the nearest representative is
        strictly greater than ``anomaly_threshold``. With no representatives
        the nearest distance is infinite, so every row is flagged.
        """
        y_pred = np.zeros(X.shape[0])
        for i in range(X.shape[0]):
            min_dist, _ = self._get_min_cluster_dist(X[i, :])
            if min_dist > self.anomaly_threshold:
                y_pred[i] = 1
        return y_pred

    def evaluate(self, X, y_true):
        """Predict on X, print and return (precision, recall, f1) against y_true.

        The scores come from the module-level ``metrics`` helper.
        """
        print('====== Evaluation summary ======')
        y_pred = self.predict(X)
        precision, recall, f1 = metrics(y_pred, y_true)
        print('Precision: {:.3f}, recall: {:.3f}, F1-measure: {:.3f}\n'
              .format(precision, recall, f1))
        return precision, recall, f1

    def _offline_clustering(self, X):
        """Cluster the rows of X with complete-linkage hierarchical clustering.

        The dendrogram is cut at ``max_dist`` and one representative per
        resulting cluster is appended to the model.
        """
        print('Starting offline clustering...')
        num_samples = X.shape[0]
        if num_samples < 2:
            # linkage() rejects an empty condensed distance matrix, so zero
            # or one sample is handled directly: a lone sample is its own
            # cluster, and no samples means no clusters.
            cluster_index = np.ones(num_samples, dtype=int)
        else:
            p_dist = pdist(X, metric=self._distance_metric)
            Z = linkage(p_dist, 'complete')
            cluster_index = fcluster(Z, self.max_dist, criterion='distance')
        self._extract_representatives(X, cluster_index)
        print('Processed {} instances.'.format(X.shape[0]))
        print('Found {} clusters offline.\n'.format(len(self.representatives)))

    def _extract_representatives(self, X, cluster_index):
        """Append the mean vector of every cluster in cluster_index.

        Arguments
        ---------
            X: 2-D array of instances
            cluster_index: 1-D sequence giving the cluster label of each row of X
        """
        cluster_index = np.asarray(cluster_index)
        for label in np.unique(cluster_index):
            member_idx = np.argwhere(cluster_index == label)[:, 0]
            # Key the size by the position the representative is about to
            # take in the list, so ids stay aligned even when the model
            # already holds clusters.
            cluster_id = len(self.representatives)
            self.cluster_size_dict[cluster_id] = member_idx.shape[0]
            self.representatives.append(np.average(X[member_idx, :], axis=0))

    def _online_clustering(self, X):
        """Assign the rows after the bootstrap prefix to clusters one by one.

        A row within ``max_dist`` of its nearest representative joins that
        cluster and shifts the representative towards it (incremental mean);
        otherwise the row starts a new cluster.
        """
        print("Starting online clustering...")
        # Guard against a negative bootstrap size, which would otherwise be
        # interpreted as negative (from-the-end) row indices.
        start = max(self.num_bootstrap_samples, 0)
        for i in range(start, X.shape[0]):
            if (i + 1) % 2000 == 0:
                print('Processed {} instances.'.format(i + 1))
            instance_vec = X[i, :]
            if len(self.representatives) > 0:
                min_dist, clu_id = self._get_min_cluster_dist(instance_vec)
                if min_dist <= self.max_dist:
                    self.cluster_size_dict[clu_id] += 1
                    current = self.representatives[clu_id]
                    self.representatives[clu_id] = current \
                        + (instance_vec - current) / self.cluster_size_dict[clu_id]
                    continue
            self.cluster_size_dict[len(self.representatives)] = 1
            # Store a copy: a view into X would change whenever the caller
            # later modifies X.
            self.representatives.append(np.array(instance_vec))
        print('Processed {} instances.'.format(X.shape[0]))
        print('Found {} clusters online.\n'.format(len(self.representatives)))

    def _distance_metric(self, x1, x2):
        """Return the cosine distance 1 - cos(x1, x2), snapped to 0 below 1e-8.

        The 1e-8 added to the denominator avoids division by zero; a zero
        vector therefore has distance 1 to everything.
        """
        norm = LA.norm(x1) * LA.norm(x2)
        distance = 1 - np.dot(x1, x2) / (norm + 1e-8)
        if distance < 1e-8:
            distance = 0
        return distance

    def _get_min_cluster_dist(self, instance_vec):
        """Return (distance, cluster id) of the representative nearest to instance_vec.

        Returns (inf, -1) when the model has no representatives. The search
        stops at the first representative whose distance is (numerically) zero.
        """
        min_index = -1
        min_dist = float('inf')
        for index, cluster_rep in enumerate(self.representatives):
            dist = self._distance_metric(instance_vec, cluster_rep)
            if dist < 1e-8:
                # Exact match: nothing can be closer.
                return 0, index
            if dist < min_dist:
                min_dist = dist
                min_index = index
        return min_dist, min_index