Spaces:
Runtime error
Runtime error
"""
The interface for data preprocessing.

Authors:
    LogPAI Team
"""
import pandas as pd | |
import os | |
import numpy as np | |
import re | |
from collections import Counter | |
from scipy.special import expit | |
from itertools import compress | |
class FeatureExtractor(object):
    """Convert log event sequences into event-count feature matrices.

    Each input sequence is turned into a row of per-event occurrence
    counts. Optional TF-IDF weighting and normalization are fitted on the
    training data in ``fit_transform`` and re-applied in ``transform``.
    """

    def __init__(self):
        # All attributes are populated by fit_transform().
        self.idf_vec = None          # per-column IDF weights (tf-idf only)
        self.mean_vec = None         # per-column training mean (zero-mean only)
        self.events = None           # event ids kept as feature columns
        self.term_weighting = None
        self.normalization = None
        self.oov = None

    @staticmethod
    def _count_events(X_seq):
        """Return a DataFrame of per-sequence event counts (one row per sequence)."""
        counts = [Counter(X_seq[i]) for i in range(X_seq.shape[0])]
        # Events absent from a sequence show up as NaN; they mean "zero times".
        return pd.DataFrame(counts).fillna(0)

    @staticmethod
    def _apply_sigmoid(X):
        """Squash the non-zero entries of the float matrix X in place with expit."""
        nonzero = X != 0
        X[nonzero] = expit(X[nonzero])
        return X

    def fit_transform(self, X_seq, term_weighting=None, normalization=None, oov=False, min_count=1):
        """ Fit and transform the data matrix

        Arguments
        ---------
            X_seq: ndarray, log sequences matrix
            term_weighting: None or `tf-idf`
            normalization: None, `zero-mean` or `sigmoid`
            oov: bool, whether to use OOV event
            min_count: int, the minimal occurrence of events (default 1), only
                valid when oov=True. Events seen in fewer than `min_count`
                sequences are folded into the OOV column.

        Returns
        -------
            X_new: The transformed data matrix (float). When oov=True the last
                column is the OOV column.
        """
        print('====== Transformed train data summary ======')
        self.term_weighting = term_weighting
        self.normalization = normalization
        self.oov = oov

        X_df = self._count_events(X_seq)
        self.events = X_df.columns
        # Force a float matrix: when no count is missing pandas yields an
        # integer array, and writing sigmoid outputs into it would silently
        # truncate every value to 0.
        X = X_df.values.astype(float)

        if self.oov:
            oov_vec = np.zeros(X.shape[0])
            if min_count > 1:
                # Keep events that occur in at least `min_count` sequences.
                idx = np.sum(X > 0, axis=0) >= min_count
                # OOV feature = number of distinct rare events per sequence.
                oov_vec = np.sum(X[:, ~idx] > 0, axis=1)
                X = X[:, idx]
                self.events = np.array(X_df.columns)[idx].tolist()
            X = np.hstack([X, oov_vec.reshape(X.shape[0], 1)])

        num_instance, num_event = X.shape
        if self.term_weighting == 'tf-idf':
            df_vec = np.sum(X > 0, axis=0)
            # The epsilon avoids division by zero for all-zero columns
            # (e.g. an empty OOV column).
            self.idf_vec = np.log(num_instance / (df_vec + 1e-8))
            X = X * self.idf_vec

        if self.normalization == 'zero-mean':
            self.mean_vec = X.mean(axis=0).reshape(1, num_event)
            X = X - self.mean_vec
        elif self.normalization == 'sigmoid':
            X = self._apply_sigmoid(X)

        X_new = X
        print('Train data shape: {}-by-{}\n'.format(X_new.shape[0], X_new.shape[1]))
        return X_new

    def transform(self, X_seq):
        """ Transform the data matrix with trained parameters

        Arguments
        ---------
            X_seq: ndarray, log sequences matrix

        Returns
        -------
            X_new: The transformed data matrix (float), with the same column
                layout as the matrix returned by fit_transform
            events: the event ids kept as feature columns (self.events)

        Raises
        ------
            RuntimeError: if called before fit_transform
        """
        if self.events is None:
            raise RuntimeError('FeatureExtractor.transform() called before fit_transform()')

        print('====== Transformed test data summary ======')
        X_df = self._count_events(X_seq)

        # Align to the training columns in one step: events unseen in this
        # data become zero columns, events unknown to training are dropped.
        X = X_df.reindex(columns=self.events, fill_value=0).values.astype(float)

        if self.oov:
            # OOV feature = number of distinct events per sequence that were
            # not kept as feature columns during fitting.
            unseen = X_df.columns.difference(self.events)
            oov_vec = np.sum(X_df[unseen].values > 0, axis=1)
            X = np.hstack([X, oov_vec.reshape(X.shape[0], 1)])

        if self.term_weighting == 'tf-idf':
            X = X * self.idf_vec

        if self.normalization == 'zero-mean':
            X = X - self.mean_vec
        elif self.normalization == 'sigmoid':
            X = self._apply_sigmoid(X)

        X_new = X
        print('Test data shape: {}-by-{}\n'.format(X_new.shape[0], X_new.shape[1]))
        return X_new, self.events