"""
The implementation of PCA model for anomaly detection.

Authors:
    LogPAI Team

Reference:
    [1] Wei Xu, Ling Huang, Armando Fox, David Patterson, Michael I. Jordan.
        Large-Scale System Problems Detection by Mining Console Logs. ACM
        Symposium on Operating Systems Principles (SOSP), 2009.

"""

import numpy as np
from ..utils import metrics


class PCA(object):
    """PCA-based anomaly detector using the squared prediction error (SPE).

    ``fit`` derives a principal subspace from the event count matrix and
    builds the projection onto its orthogonal complement. ``predict`` flags
    every row whose squared residual length exceeds the threshold.
    """

    def __init__(self, n_components=0.95, threshold=None, c_alpha=3.2905):
        """ The PCA model for anomaly detection

        Attributes
        ----------
            proj_C: The projection matrix for projecting feature vector to
                abnormal space. None until fit() has been called.
            components: The retained principal directions, stored as the
                columns of a num_events-by-n_components matrix. None until
                fit() has been called.
            n_components: float/int, number of principal components or the
                variance ratio they cover. Values below 1 are interpreted as
                a ratio, anything else as a component count.
            threshold: float, the anomaly detection threshold. When setting
                to None, the threshold is automatically calculated using
                Q-statistics during fit() and stored back on this attribute.
            c_alpha: float, the c_alpha parameter for calculating anomaly
                detection threshold using Q-statistics. The following is
                lookup table for c_alpha:
                c_alpha = 1.7507; # alpha = 0.08
                c_alpha = 1.9600; # alpha = 0.05
                c_alpha = 2.5758; # alpha = 0.01
                c_alpha = 2.807; # alpha = 0.005
                c_alpha = 2.9677; # alpha = 0.003
                c_alpha = 3.2905; # alpha = 0.001
                c_alpha = 3.4808; # alpha = 0.0005
                c_alpha = 3.8906; # alpha = 0.0001
                c_alpha = 4.4172; # alpha = 0.00001
        """
        self.proj_C = None
        self.components = None
        self.n_components = n_components
        self.threshold = threshold
        self.c_alpha = c_alpha

    def fit(self, X):
        """ Fit the projection matrix and, if unset, the SPE threshold.

        Arguments
        ---------
            X: ndarray, the event count matrix of shape
                num_instances-by-num_events

        Notes
        -----
            The decomposed matrix is ``X.T X / num_instances``; no mean is
            subtracted here, so any centering must be done by the caller.

            A threshold is only computed while ``self.threshold`` is None.
            Once computed it stays on the instance, so a later fit() reuses
            it; reset ``threshold`` to None to have it recomputed.
        """
        print('====== Model summary ======')
        num_instances, num_events = X.shape
        X_cov = np.dot(X.T, X) / float(num_instances)
        # Only the left singular vectors and the singular values are needed.
        U, sigma, _ = np.linalg.svd(X_cov)

        n_components = self.n_components
        if n_components < 1:
            # A value below 1 is a variance ratio, not a component count.
            n_components = self._components_for_ratio(sigma, n_components)

        P = U[:, :n_components]
        self.components = P
        # Projector onto the complement of the span of the columns of P.
        self.proj_C = np.identity(num_events) - np.dot(P, P.T)

        print('n_components: {}'.format(n_components))
        print('Project matrix shape: {}-by-{}'.format(self.proj_C.shape[0], self.proj_C.shape[1]))

        # Compare against None explicitly so that an explicit threshold of 0
        # is honoured instead of being silently replaced.
        if self.threshold is None:
            self.threshold = self._q_statistic_threshold(sigma[n_components:], self.c_alpha)
        print('SPE threshold: {}\n'.format(self.threshold))

    @staticmethod
    def _components_for_ratio(sigma, ratio):
        """ Return the smallest component count whose variance share reaches ratio.

        Falls back to all components when the cumulative share never reaches
        ``ratio`` (for example because of rounding, or an all-zero spectrum).
        """
        cumulative_share = np.cumsum(sigma) / np.sum(sigma)
        reached = np.flatnonzero(cumulative_share >= ratio)
        if reached.size == 0:
            return len(sigma)
        return int(reached[0]) + 1

    @staticmethod
    def _q_statistic_threshold(residual_sigma, c_alpha):
        """ Return the Q-statistic SPE threshold for the discarded singular values.

        Information can be found at:
        http://conferences.sigcomm.org/sigcomm/2004/papers/p405-lakhina111.pdf

        ``residual_sigma`` holds the singular values that were not retained.
        If it is empty, every phi term is zero and the formula evaluates 0/0.
        """
        # phi_k is the sum of the k-th powers of the discarded singular values.
        phi1, phi2, phi3 = [np.sum(np.power(residual_sigma, k)) for k in (1, 2, 3)]
        h0 = 1.0 - 2 * phi1 * phi3 / (3.0 * phi2 * phi2)
        base = (c_alpha * np.sqrt(2 * phi2 * h0 * h0) / phi1 +
                1.0 + phi2 * h0 * (h0 - 1) / (phi1 * phi1))
        return phi1 * np.power(base, 1.0 / h0)

    def predict(self, X):
        """ Label each row of X as anomalous (1) or normal (0).

        Arguments
        ---------
            X: ndarray, the event count matrix of shape
                num_instances-by-num_events

        Returns
        -------
            y_pred: ndarray of floats with one entry per row of X; 1 where
                the squared prediction error exceeds the threshold, else 0.

        Raises
        ------
            AssertionError: if the model has not been fitted yet.
        """
        # Raised explicitly (rather than via ``assert``) so the check is not
        # stripped under ``python -O``; the exception type is kept unchanged.
        if self.proj_C is None:
            raise AssertionError('PCA model needs to be trained before prediction.')
        # Row i of the product equals proj_C applied to X[i, :].
        residual = np.dot(X, self.proj_C.T)
        spe = np.sum(residual * residual, axis=1)
        return (spe > self.threshold).astype(float)

    def evaluate(self, X, y_true):
        """ Predict on X, print and return precision, recall and F1.

        Arguments
        ---------
            X: ndarray, the event count matrix of shape
                num_instances-by-num_events
            y_true: the ground-truth labels, passed through unchanged to the
                project's ``metrics`` helper together with the predictions

        Returns
        -------
            precision, recall, f1: the three values returned by ``metrics``
        """
        print('====== Evaluation summary ======')
        y_pred = self.predict(X)
        precision, recall, f1 = metrics(y_pred, y_true)
        print('Precision: {:.3f}, recall: {:.3f}, F1-measure: {:.3f}\n'.format(precision, recall, f1))
        return precision, recall, f1