""" | |
The implementation of PCA model for anomaly detection. | |
Authors: | |
LogPAI Team | |
Reference: | |
[1] Wei Xu, Ling Huang, Armando Fox, David Patterson, Michael I. Jordan. | |
Large-Scale System Problems Detection by Mining Console Logs. ACM | |
Symposium on Operating Systems Principles (SOSP), 2009. | |
""" | |
import numpy as np

from ..utils import metrics

class PCA(object):

    def __init__(self, n_components=0.95, threshold=None, c_alpha=3.2905):
        """ The PCA model for anomaly detection

        Attributes
        ----------
            proj_C: The projection matrix for projecting feature vectors onto the abnormal (residual) space
            n_components: float/int, number of principal components or the variance ratio they cover
            threshold: float, the anomaly detection threshold. When set to None, the threshold
                is automatically calculated using the Q-statistic
            c_alpha: float, the c_alpha parameter for calculating the anomaly detection threshold using
                the Q-statistic. The following is a lookup table for c_alpha:
                c_alpha = 1.7507; # alpha = 0.08
                c_alpha = 1.9600; # alpha = 0.05
                c_alpha = 2.5758; # alpha = 0.01
                c_alpha = 2.807;  # alpha = 0.005
                c_alpha = 2.9677; # alpha = 0.003
                c_alpha = 3.2905; # alpha = 0.001
                c_alpha = 3.4808; # alpha = 0.0005
                c_alpha = 3.8906; # alpha = 0.0001
                c_alpha = 4.4172; # alpha = 0.00001

        See the usage sketch at the bottom of this module for an end-to-end example.
        """
        self.proj_C = None
        self.components = None
        self.n_components = n_components
        self.threshold = threshold
        self.c_alpha = c_alpha

    def fit(self, X):
        """
        Arguments
        ---------
            X: ndarray, the event count matrix of shape num_instances-by-num_events
        """
        print('====== Model summary ======')
        num_instances, num_events = X.shape
        X_cov = np.dot(X.T, X) / float(num_instances)
        U, sigma, V = np.linalg.svd(X_cov)
        n_components = self.n_components
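        # A fractional n_components (e.g. the default 0.95) is treated as a target variance
        # ratio: keep the smallest number of leading components whose cumulative variance
        # covers that fraction; an integer value is used as the component count directly.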
        if n_components < 1:
            total_variance = np.sum(sigma)
            variance = 0
            for i in range(num_events):
                variance += sigma[i]
                if variance / total_variance >= n_components:
                    break
            n_components = i + 1

        P = U[:, :n_components]
        I = np.identity(num_events, int)
        self.components = P
        self.proj_C = I - np.dot(P, P.T)
        print('n_components: {}'.format(n_components))
        print('Projection matrix shape: {}-by-{}'.format(self.proj_C.shape[0], self.proj_C.shape[1]))

        if not self.threshold:
            # Calculate threshold using Q-statistic. Information can be found at:
            # http://conferences.sigcomm.org/sigcomm/2004/papers/p405-lakhina111.pdf
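            # The expression below is the Q-statistic confidence limit used in the paper
            # referenced above (due to Jackson & Mudholkar), computed from the residual
            # eigenvalues:
            #   phi_i = sum of sigma_j^i over residual components (j > n_components), i = 1, 2, 3
            #   h0    = 1 - 2 * phi_1 * phi_3 / (3 * phi_2^2)
            #   Q_a   = phi_1 * [c_alpha * sqrt(2 * phi_2 * h0^2) / phi_1
            #                    + 1 + phi_2 * h0 * (h0 - 1) / phi_1^2]^(1 / h0)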
            phi = np.zeros(3)
            for i in range(3):
                for j in range(n_components, num_events):
                    phi[i] += np.power(sigma[j], i + 1)
            h0 = 1.0 - 2 * phi[0] * phi[2] / (3.0 * phi[1] * phi[1])
            self.threshold = phi[0] * np.power(self.c_alpha * np.sqrt(2 * phi[1] * h0 * h0) / phi[0]
                                               + 1.0 + phi[1] * h0 * (h0 - 1) / (phi[0] * phi[0]),
                                               1.0 / h0)
        print('SPE threshold: {}\n'.format(self.threshold))

    def predict(self, X):
        assert self.proj_C is not None, 'PCA model needs to be trained before prediction.'
        y_pred = np.zeros(X.shape[0])
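        # Project each instance onto the residual (abnormal) subspace; an instance is flagged
        # as anomalous when its squared prediction error (SPE) exceeds the threshold.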
        for i in range(X.shape[0]):
            y_a = np.dot(self.proj_C, X[i, :])
            SPE = np.dot(y_a, y_a)
            if SPE > self.threshold:
                y_pred[i] = 1
        return y_pred

    def evaluate(self, X, y_true):
        print('====== Evaluation summary ======')
        y_pred = self.predict(X)
        precision, recall, f1 = metrics(y_pred, y_true)
        print('Precision: {:.3f}, recall: {:.3f}, F1-measure: {:.3f}\n'.format(precision, recall, f1))
        return precision, recall, f1
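

if __name__ == '__main__':
    # Minimal usage sketch on a synthetic event count matrix (illustrative only): the data
    # below is made up, and real inputs would come from log parsing / feature extraction.
    # Because of the relative import above, run this as part of the package (e.g. with
    # `python -m`), not as a standalone script.
    rng = np.random.RandomState(0)
    X_train = rng.poisson(lam=3.0, size=(1000, 20)).astype(float)  # synthetic event counts
    X_test = rng.poisson(lam=3.0, size=(200, 20)).astype(float)
    X_test[:10] += rng.poisson(lam=15.0, size=(10, 20))  # inject a few obvious anomalies

    model = PCA(n_components=0.95, threshold=None, c_alpha=3.2905)
    model.fit(X_train)              # learns proj_C and the SPE threshold
    y_pred = model.predict(X_test)  # 1 = anomaly, 0 = normal
    print('Flagged {} of {} test instances as anomalous'.format(int(y_pred.sum()), len(y_pred)))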