"""
The interface for data preprocessing.

File: Log-Decoder / loglizer / preprocessing.py

Authors:
    LogPAI Team
"""
import pandas as pd
import os
import numpy as np
import re
from collections import Counter
from scipy.special import expit
from itertools import compress
class FeatureExtractor(object):
    """Convert log event sequences into an event-count feature matrix.

    ``fit_transform`` learns the event vocabulary (and, depending on the
    options, IDF weights and column means) from training sequences;
    ``transform`` re-applies those learned parameters to new sequences.
    """

    def __init__(self):
        # IDF weight per feature column; set by fit_transform when
        # term_weighting == 'tf-idf'.
        self.idf_vec = None
        # Column means with shape (1, num_event); set by fit_transform when
        # normalization == 'zero-mean'.
        self.mean_vec = None
        # Event ids kept as feature columns: a pandas Index, or a plain list
        # when min_count filtering was applied.
        self.events = None
        self.term_weighting = None
        self.normalization = None
        self.oov = None

    @staticmethod
    def _count_events(X_seq):
        """Return a DataFrame of per-sequence event counts (0 where absent)."""
        # One Counter per sequence; pandas aligns the keys into columns and
        # leaves NaN where a sequence lacks an event, which becomes 0 below.
        counts = [Counter(seq) for seq in X_seq]
        return pd.DataFrame(counts).fillna(0)

    @staticmethod
    def _sigmoid_nonzero(X):
        """Return a float copy of X with expit applied to its non-zero entries."""
        # Cast first: the count matrix is integer-typed when no sequence is
        # missing any event, and writing expit() output into an integer
        # array would truncate every value to 0.
        X = X.astype(float)
        nonzero = X != 0
        X[nonzero] = expit(X[nonzero])
        return X

    def fit_transform(self, X_seq, term_weighting=None, normalization=None, oov=False, min_count=1):
        """ Fit and transform the data matrix

        Arguments
        ---------
            X_seq: ndarray, log sequences matrix (one event sequence per row)
            term_weighting: None or `tf-idf`
            normalization: None, `zero-mean` or `sigmoid`
            oov: bool, whether to append an OOV (out-of-vocabulary) column
            min_count: int, the minimal number of sequences an event must
                occur in to keep its own column (default 1). Only takes
                effect when oov=True and min_count > 1; dropped events are
                counted in the OOV column instead.

        Returns
        -------
            X_new: The transformed data matrix
        """
        print('====== Transformed train data summary ======')
        self.term_weighting = term_weighting
        self.normalization = normalization
        self.oov = oov

        X_df = self._count_events(X_seq)
        self.events = X_df.columns
        X = X_df.values
        if self.oov:
            oov_vec = np.zeros(X.shape[0])
            if min_count > 1:
                # Keep events that appear in at least min_count sequences.
                idx = np.sum(X > 0, axis=0) >= min_count
                # Per sequence: number of distinct dropped events it contains.
                oov_vec = np.sum(X[:, ~idx] > 0, axis=1)
                X = X[:, idx]
                self.events = np.array(X_df.columns)[idx].tolist()
            X = np.hstack([X, oov_vec.reshape(X.shape[0], 1)])

        num_instance, num_event = X.shape
        if self.term_weighting == 'tf-idf':
            # Document frequency: number of sequences containing each column.
            df_vec = np.sum(X > 0, axis=0)
            # The small constant avoids division by zero for empty columns.
            self.idf_vec = np.log(num_instance / (df_vec + 1e-8))
            X = X * self.idf_vec
        if self.normalization == 'zero-mean':
            self.mean_vec = X.mean(axis=0).reshape(1, num_event)
            X = X - self.mean_vec
        elif self.normalization == 'sigmoid':
            X = self._sigmoid_nonzero(X)
        X_new = X

        print('Train data shape: {}-by-{}\n'.format(X_new.shape[0], X_new.shape[1]))
        return X_new

    def transform(self, X_seq):
        """ Transform the data matrix with trained parameters

        Uses the events, term weighting, normalization and OOV settings
        stored by a previous call to fit_transform.

        Arguments
        ---------
            X_seq: log sequences matrix (one event sequence per row)

        Returns
        -------
            X_new: The transformed data matrix
            events: the event ids stored by fit_transform (self.events)
        """
        print('====== Transformed test data summary ======')
        X_df = self._count_events(X_seq)
        # Align columns to the training events: events absent from this data
        # become all-zero columns, events unseen in training are left out.
        X = X_df.reindex(columns=list(self.events), fill_value=0).values
        if self.oov:
            # Per sequence: number of distinct events not in the vocabulary.
            unseen = X_df.columns.difference(self.events)
            oov_vec = np.sum(X_df[unseen].values > 0, axis=1)
            X = np.hstack([X, oov_vec.reshape(X.shape[0], 1)])

        if self.term_weighting == 'tf-idf':
            X = X * self.idf_vec
        if self.normalization == 'zero-mean':
            X = X - self.mean_vec
        elif self.normalization == 'sigmoid':
            X = self._sigmoid_nonzero(X)
        X_new = X

        print('Test data shape: {}-by-{}\n'.format(X_new.shape[0], X_new.shape[1]))
        return X_new, self.events