"""
The implementation of Log Clustering model for anomaly detection.

Authors: 
    LogPAI Team

Reference: 
    [1] Qingwei Lin, Hongyu Zhang, Jian-Guang Lou, Yu Zhang, Xuewei Chen. Log Clustering 
        based Problem Identification for Online Service Systems. International Conference
        on Software Engineering (ICSE), 2016.

"""

import numpy as np
import pprint
from numpy import linalg as LA
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import pdist
from ..utils import metrics


class LogClustering(object):

    def __init__(self, max_dist=0.3, anomaly_threshold=0.3, mode='online', num_bootstrap_samples=1000):
        """
        Attributes
        ----------
            max_dist: float, the threshold to stop the clustering process
            anomaly_threshold: float, the threshold for anomaly detection
            mode: str, 'offline' or 'online' mode for clustering
            num_bootstrap_samples: int, online clustering starts with a bootstraping process, which
                determines the initial cluster representatives offline using a subset of samples 
            representatives: ndarray, the representative samples of clusters, of shape 
                num_clusters-by-num_events
            cluster_size_dict: dict, the size of each cluster, used to update representatives online 
        """
        self.max_dist = max_dist
        self.anomaly_threshold = anomaly_threshold
        self.mode = mode
        self.num_bootstrap_samples = num_bootstrap_samples
        self.representatives = list()
        self.cluster_size_dict = dict()
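        # Typical call sequence (a minimal sketch; `x_train` and `x_test` are
        # illustrative names for num_instances-by-num_events matrices produced
        # by an earlier feature-extraction step):
        #     model = LogClustering(mode='online', num_bootstrap_samples=1000)
        #     model.fit(x_train)
        #     y_pred = model.predict(x_test)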

    def fit(self, X):
        """Cluster the training instances in X (num_instances-by-num_events) and
        store one representative vector per cluster.
        """
        print('====== Model summary ======')
        if self.mode == 'offline':
            # Offline mode only scales to roughly 10K instances, because it builds
            # the full pairwise distance matrix in memory.
            self._offline_clustering(X)
        elif self.mode == 'online':
            # Bootstrapping phase: cluster an initial subset offline to seed the representatives.
            if self.num_bootstrap_samples > 0:
                X_bootstrap = X[0:self.num_bootstrap_samples, :]
                self._offline_clustering(X_bootstrap)
            # Online learning phase: assign or create clusters for the remaining instances.
            if X.shape[0] > self.num_bootstrap_samples:
                self._online_clustering(X)

    def predict(self, X):
        """Flag an instance as anomalous (1) if its distance to the nearest cluster
        representative exceeds anomaly_threshold, otherwise mark it normal (0).
        """
        y_pred = np.zeros(X.shape[0])
        for i in range(X.shape[0]):
            min_dist, _ = self._get_min_cluster_dist(X[i, :])
            if min_dist > self.anomaly_threshold:
                y_pred[i] = 1
        return y_pred

    def evaluate(self, X, y_true):
        print('====== Evaluation summary ======')
        y_pred = self.predict(X)
        precision, recall, f1 = metrics(y_pred, y_true)
        print('Precision: {:.3f}, recall: {:.3f}, F1-measure: {:.3f}\n' \
              .format(precision, recall, f1))
        return precision, recall, f1

    def _offline_clustering(self, X):
        print('Starting offline clustering...')
        # Complete-linkage hierarchical clustering, cut at distance max_dist.
        p_dist = pdist(X, metric=self._distance_metric)
        Z = linkage(p_dist, 'complete')
        cluster_index = fcluster(Z, self.max_dist, criterion='distance')
        self._extract_representatives(X, cluster_index)
        print('Processed {} instances.'.format(X.shape[0]))
        print('Found {} clusters offline.\n'.format(len(self.representatives)))
        # print('The representative vectors are:')
        # pprint.pprint(self.representatives)

    def _extract_representatives(self, X, cluster_index):
        # fcluster labels clusters 1..num_clusters; store them under 0-based ids and
        # use each cluster's mean vector as its representative.
        num_clusters = len(set(cluster_index))
        for clu in range(num_clusters):
            clu_idx = np.argwhere(cluster_index == clu + 1)[:, 0]
            self.cluster_size_dict[clu] = clu_idx.shape[0]
            repre_center = np.average(X[clu_idx, :], axis=0)
            self.representatives.append(repre_center)

    def _online_clustering(self, X):
        print("Starting online clustering...")
        for i in range(self.num_bootstrap_samples, X.shape[0]):
            if (i + 1) % 2000 == 0:
                print('Processed {} instances.'.format(i + 1))
            instance_vec = X[i, :]
            if len(self.representatives) > 0:
                min_dist, clu_id = self._get_min_cluster_dist(instance_vec)
                if min_dist <= self.max_dist:
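                    # Merge into the nearest cluster: bump its size and shift its
                    # representative by the running-mean update
                    # new_mean = old_mean + (x - old_mean) / n.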
                    self.cluster_size_dict[clu_id] += 1
                    self.representatives[clu_id] = self.representatives[clu_id] \
                                                 + (instance_vec - self.representatives[clu_id]) \
                                                 / self.cluster_size_dict[clu_id]
                    continue
            self.cluster_size_dict[len(self.representatives)] = 1
            self.representatives.append(instance_vec)
        print('Processed {} instances.'.format(X.shape[0]))
        print('Found {} clusters online.\n'.format(len(self.representatives)))
        # print('The representative vectors are:')
        # pprint.pprint(self.representatives)

    def _distance_metric(self, x1, x2):
        # Cosine distance with a small epsilon guarding against zero-norm vectors;
        # tiny values caused by floating-point error are clamped to exactly 0.
        norm = LA.norm(x1) * LA.norm(x2)
        distance = 1 - np.dot(x1, x2) / (norm + 1e-8)
        if distance < 1e-8:
            distance = 0
        return distance

    def _get_min_cluster_dist(self, instance_vec):
        # Return (distance, index) of the nearest cluster representative; an
        # effectively zero distance short-circuits the search.
        min_index = -1
        min_dist = float('inf')
        for i in range(len(self.representatives)):
            cluster_rep = self.representatives[i]
            dist = self._distance_metric(instance_vec, cluster_rep)
            if dist < 1e-8:
                min_dist = 0
                min_index = i
                break
            elif dist < min_dist:
                min_dist = dist
                min_index = i
        return min_dist, min_index
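

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the original module):
# it fits the model on synthetic event-count vectors and flags a few sparse
# outlier rows. Because `from ..utils import metrics` is a relative import,
# this block only runs when the module is executed as part of its package.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    rng = np.random.RandomState(42)
    # Two synthetic "normal" event-count patterns for training.
    x_train = np.vstack([rng.poisson(5, size=(100, 20)),
                         rng.poisson(2, size=(100, 20))]).astype(float)
    # Test set: a slice of the normal data plus a few sparse outlier vectors.
    outliers = rng.poisson(0.2, size=(5, 20)).astype(float)
    x_test = np.vstack([x_train[:10], outliers])

    model = LogClustering(max_dist=0.3, anomaly_threshold=0.3,
                          mode='online', num_bootstrap_samples=50)
    model.fit(x_train)
    y_pred = model.predict(x_test)
    print('Predicted anomaly flags:', y_pred)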