Spaces: jpcabangon (Runtime error)
Commit 9c323ee · committed by jpcabangon
init logdecoder app files
1 Parent(s): 76ec49e
This view is limited to 50 files because it contains too many changes.
- 46_Heatmap.png +0 -0
- app.py +314 -0
- data/46_Heatmap.png +0 -0
- data/HDFS/HDFS_100k.log_structured.csv +3 -0
- data/HDFS/anomaly_label.csv +3 -0
- data/app.py +314 -0
- data/data_instances.csv +3 -0
- data/loglizer/__init__.py +0 -0
- data/loglizer/__pycache__/__init__.cpython-38.pyc +0 -0
- data/loglizer/__pycache__/dataloader.cpython-38.pyc +0 -0
- data/loglizer/__pycache__/preprocessing.cpython-38.pyc +0 -0
- data/loglizer/__pycache__/utils.cpython-38.pyc +0 -0
- data/loglizer/dataloader.py +268 -0
- data/loglizer/models/DecisionTree.py +78 -0
- data/loglizer/models/InvariantsMiner.py +296 -0
- data/loglizer/models/IsolationForest.py +96 -0
- data/loglizer/models/LR.py +73 -0
- data/loglizer/models/LogClustering.py +137 -0
- data/loglizer/models/PCA.py +105 -0
- data/loglizer/models/SVM.py +64 -0
- data/loglizer/models/__init__.py +8 -0
- data/loglizer/models/__pycache__/DecisionTree.cpython-38.pyc +0 -0
- data/loglizer/models/__pycache__/InvariantsMiner.cpython-38.pyc +0 -0
- data/loglizer/models/__pycache__/IsolationForest.cpython-38.pyc +0 -0
- data/loglizer/models/__pycache__/LR.cpython-38.pyc +0 -0
- data/loglizer/models/__pycache__/LogClustering.cpython-38.pyc +0 -0
- data/loglizer/models/__pycache__/PCA.cpython-38.pyc +0 -0
- data/loglizer/models/__pycache__/SVM.cpython-38.pyc +0 -0
- data/loglizer/models/__pycache__/__init__.cpython-38.pyc +0 -0
- data/loglizer/preprocessing.py +123 -0
- data/loglizer/utils.py +29 -0
- data/requirements.txt +4 -0
- data/temp/HDFS_100k.log_structured.csv +3 -0
- data_instances.csv +3 -0
- loglizer/__init__.py +0 -0
- loglizer/__pycache__/__init__.cpython-38.pyc +0 -0
- loglizer/__pycache__/dataloader.cpython-38.pyc +0 -0
- loglizer/__pycache__/preprocessing.cpython-38.pyc +0 -0
- loglizer/__pycache__/utils.cpython-38.pyc +0 -0
- loglizer/dataloader.py +268 -0
- loglizer/models/DecisionTree.py +78 -0
- loglizer/models/InvariantsMiner.py +296 -0
- loglizer/models/IsolationForest.py +96 -0
- loglizer/models/LR.py +73 -0
- loglizer/models/LogClustering.py +137 -0
- loglizer/models/PCA.py +105 -0
- loglizer/models/SVM.py +64 -0
- loglizer/models/__init__.py +8 -0
- loglizer/models/__pycache__/DecisionTree.cpython-38.pyc +0 -0
- loglizer/models/__pycache__/InvariantsMiner.cpython-38.pyc +0 -0
46_Heatmap.png
ADDED
app.py
ADDED
@@ -0,0 +1,314 @@
# To run streamlit, go to terminal and type: 'streamlit run app.py'
# Core Packages ###########################
import streamlit as st
import os

import time
import numpy as np
import pandas as pd

from loglizer.models import PCA
from loglizer import dataloader, preprocessing
import openai

openai.api_key = os.environ["FREEEDU_OPENAI_API_KEY"]

project_title = "ChatGPT Log Decoder"
project_desc = """
Intrusion Detection Systems (IDS) are powerful security tools that monitor network traffic for suspicious activity and issue alerts when such activities are discovered.
While these systems are commonly used by cybersecurity professionals and IT experts,
the technical jargon and analysis that an IDS provides can be incomprehensible to the average user.
This project aims to bridge this gap by utilizing ChatGPT's advanced natural language processing to
articulate the technical information received from an IDS into human-readable form, allowing common everyday users
to grasp the extent of their network's security status without the need for a technical background.
"""

project_link = "https://github.com/logpai/loglizer"
project_icon = "46_Heatmap.png"
st.set_page_config(page_title=project_title, initial_sidebar_state='collapsed', page_icon=project_icon)

# additional info from the readme
add_info_md = """

# loglizer

**Loglizer is a machine learning-based log analysis toolkit for automated anomaly detection.**

Logs are imperative in the development and maintenance process of many software systems. They record detailed
runtime information during system operation that allows developers and support engineers to monitor their systems and track abnormal behaviors and errors. Loglizer provides a toolkit that implements a number of machine-learning based log analysis techniques for automated anomaly detection.

:telescope: If you use loglizer in your research for publication, please kindly cite the following paper.
+ Shilin He, Jieming Zhu, Pinjia He, Michael R. Lyu. [Experience Report: System Log Analysis for Anomaly Detection](https://jiemingzhu.github.io/pub/slhe_issre2016.pdf), *IEEE International Symposium on Software Reliability Engineering (ISSRE)*, 2016. [[Bibtex](https://dblp.org/rec/bibtex/conf/issre/HeZHL16)][[中文版本](https://github.com/AmateurEvents/article/issues/2)]
**(ISSRE Most Influential Paper)**

## Framework



The log analysis framework for anomaly detection usually comprises the following components:

1. **Log collection:** Logs are generated at runtime and aggregated into a centralized place with a data streaming pipeline, such as Flume and Kafka.
2. **Log parsing:** The goal of log parsing is to convert unstructured log messages into a map of structured events, based on which sophisticated machine learning models can be applied. The details of log parsing can be found at [our logparser project](https://github.com/logpai/logparser).
3. **Feature extraction:** Structured logs can be sliced into short log sequences through interval window, sliding window, or session window. Then, feature extraction is performed to vectorize each log sequence, for example, using an event counting vector.
4. **Anomaly detection:** Anomaly detection models are trained to check whether a given feature vector is an anomaly or not.


## Models

Anomaly detection models currently available:

| Model | Paper reference |
| :--- | :--- |
| **Supervised models** |
| LR | [**EuroSys'10**] [Fingerprinting the Datacenter: Automated Classification of Performance Crises](https://www.microsoft.com/en-us/research/wp-content/uploads/2009/07/hiLighter.pdf), by Peter Bodík, Moises Goldszmidt, Armando Fox, Hans Andersen. [**Microsoft**] |
| Decision Tree | [**ICAC'04**] [Failure Diagnosis Using Decision Trees](http://www.cs.berkeley.edu/~brewer/papers/icac2004_chen_diagnosis.pdf), by Mike Chen, Alice X. Zheng, Jim Lloyd, Michael I. Jordan, Eric Brewer. [**eBay**] |
| SVM | [**ICDM'07**] [Failure Prediction in IBM BlueGene/L Event Logs](https://www.researchgate.net/publication/4324148_Failure_Prediction_in_IBM_BlueGeneL_Event_Logs), by Yinglung Liang, Yanyong Zhang, Hui Xiong, Ramendra Sahoo. [**IBM**]|
| **Unsupervised models** |
| LOF | [**SIGMOD'00**] [LOF: Identifying Density-Based Local Outliers](), by Markus M. Breunig, Hans-Peter Kriegel, Raymond T. Ng, Jörg Sander. |
| One-Class SVM | [**Neural Computation'01**] [Estimating the Support of a High-Dimensional Distribution](), by John Platt, Bernhard Schölkopf, John Shawe-Taylor, Alex J. Smola, Robert C. Williamson. |
| Isolation Forest | [**ICDM'08**] [Isolation Forest](https://cs.nju.edu.cn/zhouzh/zhouzh.files/publication/icdm08b.pdf), by Fei Tony Liu, Kai Ming Ting, Zhi-Hua Zhou. |
| PCA | [**SOSP'09**] [Large-Scale System Problems Detection by Mining Console Logs](http://iiis.tsinghua.edu.cn/~weixu/files/sosp09.pdf), by Wei Xu, Ling Huang, Armando Fox, David Patterson, Michael I. Jordan. [**Intel**] |
| Invariants Mining | [**ATC'10**] [Mining Invariants from Console Logs for System Problem Detection](https://www.usenix.org/legacy/event/atc10/tech/full_papers/Lou.pdf), by Jian-Guang Lou, Qiang Fu, Shengqi Yang, Ye Xu, Jiang Li. [**Microsoft**]|
| Clustering | [**ICSE'16**] [Log Clustering based Problem Identification for Online Service Systems](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/07/ICSE-2016-2-Log-Clustering-based-Problem-Identification-for-Online-Service-Systems.pdf), by Qingwei Lin, Hongyu Zhang, Jian-Guang Lou, Yu Zhang, Xuewei Chen. [**Microsoft**]|
| DeepLog (coming)| [**CCS'17**] [DeepLog: Anomaly Detection and Diagnosis from System Logs through Deep Learning](https://www.cs.utah.edu/~lifeifei/papers/deeplog.pdf), by Min Du, Feifei Li, Guineng Zheng, Vivek Srikumar. |
| AutoEncoder (coming)| [**Arxiv'18**] [Anomaly Detection using Autoencoders in High Performance Computing Systems](https://arxiv.org/abs/1811.05269), by Andrea Borghesi, Andrea Bartolini, Michele Lombardi, Michela Milano, Luca Benini. |


## Log data
We have collected a set of labeled log datasets in [loghub](https://github.com/logpai/loghub) for research purposes. If you are interested in the datasets, please follow the link to submit your access request.

## Install
```bash
git clone https://github.com/logpai/loglizer.git
cd loglizer
pip install -r requirements.txt
```

## API usage

```python
# Load HDFS dataset. If you would like to try your own log, you need to rewrite the load function.
(x_train, y_train), (x_test, y_test) = dataloader.load_HDFS(...)

# Feature extraction and transformation
feature_extractor = preprocessing.FeatureExtractor()
feature_extractor.fit_transform(...)

# Model training
model = PCA()
model.fit(...)

# Feature transform after fitting
x_test = feature_extractor.transform(...)
# Model evaluation with labeled data
model.evaluate(...)

# Anomaly prediction
x_test = feature_extractor.transform(...)
model.predict(...) # predict anomalies on given data
```

For more details, please follow [the demo](./docs/demo.md) in the docs to get started. Please note that all ML models are not magic, you need to figure out how to tune the parameters in order to make them work on your own data.

## Benchmarking results

If you would like to reproduce the following results, please run [benchmarks/HDFS_bechmark.py](./benchmarks/HDFS_bechmark.py) on the full HDFS dataset (HDFS100k is for demo only).

| | | HDFS | |
| :----:|:----:|:----:|:----:|
| **Model** | **Precision** | **Recall** | **F1** |
| LR| 0.955 | 0.911 | 0.933 |
| Decision Tree | 0.998 | 0.998 | 0.998 |
| SVM| 0.959 | 0.970 | 0.965 |
| LOF | 0.967 | 0.561 | 0.710 |
| One-Class SVM | 0.995 | 0.222| 0.363 |
| Isolation Forest | 0.830 | 0.776 | 0.802 |
| PCA | 0.975 | 0.635 | 0.769|
| Invariants Mining | 0.888 | 0.945 | 0.915|
| Clustering | 1.000 | 0.720 | 0.837 |

## Contributors
+ [Shilin He](https://shilinhe.github.io), The Chinese University of Hong Kong
+ [Jieming Zhu](https://jiemingzhu.github.io), The Chinese University of Hong Kong, currently at Huawei Noah's Ark Lab
+ [Pinjia He](https://pinjiahe.github.io/), The Chinese University of Hong Kong, currently at ETH Zurich


## Feedback
For any questions or feedback, please post to [the issue page](https://github.com/logpai/loglizer/issues/new).

"""
#######################################################################################################################
default_log_path = os.path.join("data", "HDFS", "HDFS_100k.log_structured.csv")

def load_loglizer(train_log_path=default_log_path):
    print("Loading Loglizer PCA model:")
    start = time.time()
    (x_train, _), (_, _), _ = dataloader.load_HDFS(train_log_path, window='session',
                                                   split_type='sequential', save_csv=True)
    feature_extractor = preprocessing.FeatureExtractor()
    x_train = feature_extractor.fit_transform(x_train, term_weighting='tf-idf',
                                              normalization='zero-mean')
    model = PCA()
    model.fit(x_train)

    stop = time.time()
    print("Loading complete. Time elapsed: " + str(stop - start))

    # Extract the event labels for each unique EventId in the source structured log file
    # Load the CSV file
    df = pd.read_csv(train_log_path)

    # Extract unique 'EventId' and 'EventTemplate' pairs
    event_names = df[['EventId', 'EventTemplate']].drop_duplicates()
    event_names_dict = dict(zip(event_names['EventId'], event_names['EventTemplate']))

    return model, feature_extractor, event_names_dict

def analyze_with_chatgpt(anomalous_packets):
    context = {"role": "system",
               "content": """
You will receive 5 rows of transformed data logs, where the middlemost log is flagged as anomalous.
Your job is to analyze and compare this log with its surrounding context logs, and find out why it was flagged as anomalous.
Your response should be 50 to 150 words only; be as concise as possible and address the user directly.
Only mention the key issues and cause of the anomaly. Never mention the Column Names, the middlemost log, or the surrounding context logs.
You can mention what the irregular values in a column represent, and base your report on that.
Assume that the user has no knowledge of networks or cybersecurity. Your response should start with
"Based on the logs,".
Try to avoid telling the user 'if the issue persists'.
Give simple step-by-step advice on what the user can do on their own.
If the advice is beyond the scope of the everyday user, notify them that the assistance of a professional
is necessary for the level of intrusion you found.
"""}
    message_prompt = []
    message_prompt.append(context)
    anomalies = f"*Column Names*: {anomalous_packets.columns} --- "
    # Build a single user message describing each packet row, including the one flagged as anomalous
    for i, p in anomalous_packets.iterrows():
        anomalies += f"*Row {i+1}*: "
        anomalies += str(p.values)
        anomalies += " --- "

    message_format = {"role": "user",
                      "content": anomalies}
    message_prompt.append(message_format)

    # Generate the response
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=message_prompt,
        max_tokens=1008,
        temperature=0.7,
        n=1,
        stop=None,
    )

    # Extract the response text from the API response
    response_text = response['choices'][0]['message']['content']
    # Return the response text
    return response_text

def main():
    head_col = st.columns([1, 8])
    with head_col[0]:
        st.image(project_icon)
    with head_col[1]:
        st.title(project_title)

    st.write(project_desc)
    st.write(f"Source Project: {project_link}")
    expander = st.expander("Additional Information on the Source Project (Loglizer)")
    expander.markdown(add_info_md)
    st.markdown("***")
    st.subheader("")
    #########################################

    # instructions and file upload button
    st.subheader("""
How to use:
1. Upload your log file (must be .csv format)
2. Click the 'Run Loglizer' button
""")
    uploaded_file = st.file_uploader("Upload a log file in csv format:",
                                     type=["csv"],
                                     accept_multiple_files=False)

    #########################################

    run_button = st.button("Run Loglizer")

    if "anomalies" not in st.session_state:
        st.session_state.anomalies = []
    if "col_names" not in st.session_state:
        st.session_state.col_names = []

    # button is clicked
    if run_button:
        if uploaded_file is None:  # if no files were uploaded
            st.error("Please upload a .csv log file")

        else:
            # Ensure temp directory exists
            if not os.path.exists('temp'):
                os.makedirs('temp')

            filename = os.path.basename(uploaded_file.name)
            file_path = os.path.join("temp", filename)
            # Write out the uploaded file to temp directory
            with open(file_path, 'wb') as f:
                f.write(uploaded_file.getbuffer())

            with st.spinner('Loading Loglizer...'):
                model, feature_extractor, event_names_dict = load_loglizer()

            # Simulate the 'live' log feed
            (x_live, _), (_, _), _ = dataloader.load_HDFS(file_path, window='session', split_type='sequential')
            x_live, st.session_state.col_names = feature_extractor.transform(x_live)
            # st.write(f"x_live shape: {x_live.shape}")

            # Map event IDs to event templates
            st.session_state.col_names = [event_names_dict[id] if id in event_names_dict else id for id in
                                          st.session_state.col_names]

            for i in range(len(x_live)):
                log = x_live[i]
                log = np.expand_dims(log, axis=0)
                prediction = model.predict(log)
                print(f"prediction for x_live[{i}]: {prediction}")
                if prediction == 1:
                    # Anomaly detected
                    context = x_live[max(0, i - 2):min(i + 3, len(x_live))]  # Get the two logs before and after
                    st.session_state.anomalies.append(context)
            # st.write("predict on x_live:")
            # st.write(model.predict(x_live))

            st.write("Anomalous events")
            st.write(len(st.session_state.anomalies))
            #
            # st.write("Column names")
            # st.write(st.session_state.col_names)

    if len(st.session_state.anomalies) > 0:
        # Displaying the anomalous packets and sending to ChatGPT
        selected_index = st.selectbox('Select an anomaly', list(range(len(st.session_state.anomalies))), 0)
        selected_anomaly = st.session_state.anomalies[selected_index]
        st.write(f"Loglizer found and compiled {selected_anomaly.shape[1]} events from the logs.\n The middle entry has an anomaly:")
        # st.write(f"Shape: {selected_anomaly.shape} Type: {type(selected_anomaly)}")

        # Convert the numpy array to a pandas DataFrame
        selected_anomaly_df = pd.DataFrame(selected_anomaly)
        # Set the column names
        selected_anomaly_df.columns = st.session_state.col_names

        # st.write(selected_anomaly)
        st.write(selected_anomaly_df)

        if st.button('Send to ChatGPT'):
            result = analyze_with_chatgpt(selected_anomaly_df)
            st.write(result)


if __name__ == '__main__':
    main()

# To run streamlit, go to terminal and type: 'streamlit run app-source.py'
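For reference, here is a minimal sketch of the detection flow that app.py wires into the Streamlit UI, runnable outside the app. It assumes the bundled loglizer package is importable from the working directory, uses the demo CSV added in this commit as both training and "live" input, and follows app.py's call pattern, where FeatureExtractor.transform also returns the column names (unlike upstream loglizer):

```python
# Minimal sketch of app.py's detection flow, without the Streamlit UI.
import numpy as np
from loglizer import dataloader, preprocessing
from loglizer.models import PCA

demo_csv = "data/HDFS/HDFS_100k.log_structured.csv"   # demo file added in this commit

# Train: group log lines into block sessions and build TF-IDF event-count features
(x_train, _), (_, _), _ = dataloader.load_HDFS(demo_csv, window='session', split_type='sequential')
fe = preprocessing.FeatureExtractor()
x_train = fe.fit_transform(x_train, term_weighting='tf-idf', normalization='zero-mean')

model = PCA()
model.fit(x_train)

# "Live" feed: transform new sessions with the fitted extractor and flag anomalies
(x_live, _), (_, _), _ = dataloader.load_HDFS(demo_csv, window='session', split_type='sequential')
x_live, col_names = fe.transform(x_live)              # app.py unpacks column names too
for i in range(len(x_live)):
    if model.predict(np.expand_dims(x_live[i], axis=0)) == 1:
        print(f"session {i} flagged as anomalous")
```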
data/46_Heatmap.png
ADDED
data/HDFS/HDFS_100k.log_structured.csv
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:cbb13cc937bb35ad683dfcd71929242d7b1e0c01e2a94f79c637847c30a42190
size 20892087
data/HDFS/anomaly_label.csv
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1c711ed6c8848fc3243fb4d092f172f31d128c8a6ec7f26ebba72ab931885ed8
size 18637554
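The two CSVs above are stored as Git LFS pointers, so only the version, oid, and size metadata live in the repository; the actual data must be fetched with Git LFS. A small illustrative parser for that three-line pointer format, assuming the file on disk is still an un-fetched pointer (hypothetical helper, not part of this commit):

```python
# Illustrative only: read the metadata of a Git LFS pointer file such as
# data/HDFS/anomaly_label.csv before the real data has been pulled.
def read_lfs_pointer(path):
    fields = {}
    with open(path) as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields  # e.g. {'version': '...', 'oid': 'sha256:...', 'size': '18637554'}

print(read_lfs_pointer("data/HDFS/anomaly_label.csv"))
```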
data/app.py
ADDED
@@ -0,0 +1,314 @@
(Contents identical to app.py above.)
data/data_instances.csv
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5ba4d353dddd4e7ef0de168def9502f38d17dbe95ca5d966c5a6d68aa05b4955
size 909112
data/loglizer/__init__.py
ADDED
File without changes
data/loglizer/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (138 Bytes)
data/loglizer/__pycache__/dataloader.cpython-38.pyc
ADDED
Binary file (7.77 kB)
data/loglizer/__pycache__/preprocessing.cpython-38.pyc
ADDED
Binary file (3.54 kB)
data/loglizer/__pycache__/utils.cpython-38.pyc
ADDED
Binary file (836 Bytes)
data/loglizer/dataloader.py
ADDED
@@ -0,0 +1,268 @@
"""
The interface to load log datasets. The datasets currently supported include
HDFS and BGL.

Authors:
    LogPAI Team

"""

import pandas as pd
import os
import numpy as np
import re
from sklearn.utils import shuffle
from collections import OrderedDict

def _split_data(x_data, y_data=None, train_ratio=0, split_type='uniform'):
    if split_type == 'uniform' and y_data is not None:
        pos_idx = y_data > 0
        x_pos = x_data[pos_idx]
        y_pos = y_data[pos_idx]
        x_neg = x_data[~pos_idx]
        y_neg = y_data[~pos_idx]
        train_pos = int(train_ratio * x_pos.shape[0])
        train_neg = int(train_ratio * x_neg.shape[0])
        x_train = np.hstack([x_pos[0:train_pos], x_neg[0:train_neg]])
        y_train = np.hstack([y_pos[0:train_pos], y_neg[0:train_neg]])
        x_test = np.hstack([x_pos[train_pos:], x_neg[train_neg:]])
        y_test = np.hstack([y_pos[train_pos:], y_neg[train_neg:]])
    elif split_type == 'sequential':
        num_train = int(train_ratio * x_data.shape[0])
        x_train = x_data[0:num_train]
        x_test = x_data[num_train:]
        if y_data is None:
            y_train = None
            y_test = None
        else:
            y_train = y_data[0:num_train]
            y_test = y_data[num_train:]
    # Random shuffle
    indexes = shuffle(np.arange(x_train.shape[0]))
    x_train = x_train[indexes]
    if y_train is not None:
        y_train = y_train[indexes]
    return (x_train, y_train), (x_test, y_test)

def load_HDFS(log_file, label_file=None, window='session', train_ratio=0.5, split_type='sequential', save_csv=False, window_size=0):
    """ Load HDFS structured log into train and test data

    Arguments
    ---------
        log_file: str, the file path of structured log.
        label_file: str, the file path of anomaly labels, None for unlabeled data
        window: str, the window options including `session` (default).
        train_ratio: float, the ratio of training data for train/test split.
        split_type: `uniform` or `sequential`, which determines how to split dataset. `uniform` means
            to split positive samples and negative samples equally when setting label_file. `sequential`
            means to split the data sequentially without label_file. That is, the first part is for training,
            while the second part is for testing.

    Returns
    -------
        (x_train, y_train): the training data
        (x_test, y_test): the testing data
    """

    print('====== Input data summary ======')

    if log_file.endswith('.npz'):
        # Split training and validation set in a class-uniform way
        data = np.load(log_file)
        x_data = data['x_data']
        y_data = data['y_data']
        (x_train, y_train), (x_test, y_test) = _split_data(x_data, y_data, train_ratio, split_type)

    elif log_file.endswith('.csv'):
        assert window == 'session', "Only window=session is supported for HDFS dataset."
        print("Loading", log_file)
        struct_log = pd.read_csv(log_file, engine='c',
                                 na_filter=False, memory_map=True)
        data_dict = OrderedDict()
        for idx, row in struct_log.iterrows():
            blkId_list = re.findall(r'(blk_-?\d+)', row['Content'])
            blkId_set = set(blkId_list)
            for blk_Id in blkId_set:
                if not blk_Id in data_dict:
                    data_dict[blk_Id] = []
                data_dict[blk_Id].append(row['EventId'])
        data_df = pd.DataFrame(list(data_dict.items()), columns=['BlockId', 'EventSequence'])

        if label_file:
            # Split training and validation set in a class-uniform way
            label_data = pd.read_csv(label_file, engine='c', na_filter=False, memory_map=True)
            label_data = label_data.set_index('BlockId')
            label_dict = label_data['Label'].to_dict()
            data_df['Label'] = data_df['BlockId'].apply(lambda x: 1 if label_dict[x] == 'Anomaly' else 0)

            # Split train and test data
            (x_train, y_train), (x_test, y_test) = _split_data(data_df['EventSequence'].values,
                                                               data_df['Label'].values, train_ratio, split_type)

            print(y_train.sum(), y_test.sum())

        if save_csv:
            data_df.to_csv('data_instances.csv', index=False)

        if window_size > 0:
            x_train, window_y_train, y_train = slice_hdfs(x_train, y_train, window_size)
            x_test, window_y_test, y_test = slice_hdfs(x_test, y_test, window_size)
            log = "{} {} windows ({}/{} anomaly), {}/{} normal"
            print(log.format("Train:", x_train.shape[0], y_train.sum(), y_train.shape[0], (1-y_train).sum(), y_train.shape[0]))
            print(log.format("Test:", x_test.shape[0], y_test.sum(), y_test.shape[0], (1-y_test).sum(), y_test.shape[0]))
            return (x_train, window_y_train, y_train), (x_test, window_y_test, y_test)

        if label_file is None:
            if split_type == 'uniform':
                split_type = 'sequential'
                print('Warning: Only split_type=sequential is supported \
                    if label_file=None.'.format(split_type))
            # Split training and validation set sequentially
            x_data = data_df['EventSequence'].values
            (x_train, _), (x_test, _) = _split_data(x_data, train_ratio=train_ratio, split_type=split_type)
            print('Total: {} instances, train: {} instances, test: {} instances'.format(
                  x_data.shape[0], x_train.shape[0], x_test.shape[0]))
            return (x_train, None), (x_test, None), data_df
    else:
        raise NotImplementedError('load_HDFS() only support csv and npz files!')

    num_train = x_train.shape[0]
    num_test = x_test.shape[0]
    num_total = num_train + num_test
    num_train_pos = sum(y_train)
    num_test_pos = sum(y_test)
    num_pos = num_train_pos + num_test_pos

    print('Total: {} instances, {} anomaly, {} normal' \
          .format(num_total, num_pos, num_total - num_pos))
    print('Train: {} instances, {} anomaly, {} normal' \
          .format(num_train, num_train_pos, num_train - num_train_pos))
    print('Test: {} instances, {} anomaly, {} normal\n' \
          .format(num_test, num_test_pos, num_test - num_test_pos))

    return (x_train, y_train), (x_test, y_test)

def slice_hdfs(x, y, window_size):
    results_data = []
    print("Slicing {} sessions, with window {}".format(x.shape[0], window_size))
    for idx, sequence in enumerate(x):
        seqlen = len(sequence)
        i = 0
        while (i + window_size) < seqlen:
            slice = sequence[i: i + window_size]
            results_data.append([idx, slice, sequence[i + window_size], y[idx]])
            i += 1
        else:
            slice = sequence[i: i + window_size]
            slice += ["#Pad"] * (window_size - len(slice))
            results_data.append([idx, slice, "#Pad", y[idx]])
    results_df = pd.DataFrame(results_data, columns=["SessionId", "EventSequence", "Label", "SessionLabel"])
    print("Slicing done, {} windows generated".format(results_df.shape[0]))
    return results_df[["SessionId", "EventSequence"]], results_df["Label"], results_df["SessionLabel"]



def load_BGL(log_file, label_file=None, window='sliding', time_interval=60, stepping_size=60,
             train_ratio=0.8):
    """ TODO

    """


def bgl_preprocess_data(para, raw_data, event_mapping_data):
    """ split logs into sliding windows, built an event count matrix and get the corresponding label

    Args:
    --------
    para: the parameters dictionary
    raw_data: list of (label, time)
    event_mapping_data: a list of event index, where each row index indicates a corresponding log

    Returns:
    --------
    event_count_matrix: event count matrix, where each row is an instance (log sequence vector)
    labels: a list of labels, 1 represents anomaly
    """

    # create the directory for saving the sliding windows (start_index, end_index), which can be directly loaded in future running
    if not os.path.exists(para['save_path']):
        os.mkdir(para['save_path'])
    log_size = raw_data.shape[0]
    sliding_file_path = para['save_path']+'sliding_'+str(para['window_size'])+'h_'+str(para['step_size'])+'h.csv'

    #=============divide into sliding windows=========#
    start_end_index_list = []  # list of tuples, tuple contains two number, which represent the start and end of sliding time window
    label_data, time_data = raw_data[:, 0], raw_data[:, 1]
    if not os.path.exists(sliding_file_path):
        # split into sliding window
        start_time = time_data[0]
        start_index = 0
        end_index = 0

        # get the first start, end index, end time
        for cur_time in time_data:
            if cur_time < start_time + para['window_size']*3600:
                end_index += 1
                end_time = cur_time
            else:
                start_end_pair = tuple((start_index, end_index))
                start_end_index_list.append(start_end_pair)
                break
        # move the start and end index until next sliding window
        while end_index < log_size:
            start_time = start_time + para['step_size']*3600
            end_time = end_time + para['step_size']*3600
            for i in range(start_index, end_index):
                if time_data[i] < start_time:
                    i += 1
                else:
                    break
            for j in range(end_index, log_size):
                if time_data[j] < end_time:
                    j += 1
                else:
                    break
            start_index = i
            end_index = j
            start_end_pair = tuple((start_index, end_index))
            start_end_index_list.append(start_end_pair)
        inst_number = len(start_end_index_list)
        print('there are %d instances (sliding windows) in this dataset\n' % inst_number)
        np.savetxt(sliding_file_path, start_end_index_list, delimiter=',', fmt='%d')
    else:
        print('Loading start_end_index_list from file')
        start_end_index_list = pd.read_csv(sliding_file_path, header=None).values
        inst_number = len(start_end_index_list)
        print('there are %d instances (sliding windows) in this dataset' % inst_number)

    # get all the log indexes in each time window by ranging from start_index to end_index
    expanded_indexes_list = []
    for t in range(inst_number):
        index_list = []
        expanded_indexes_list.append(index_list)
    for i in range(inst_number):
        start_index = start_end_index_list[i][0]
        end_index = start_end_index_list[i][1]
        for l in range(start_index, end_index):
            expanded_indexes_list[i].append(l)

    event_mapping_data = [row[0] for row in event_mapping_data]
    event_num = len(list(set(event_mapping_data)))
    print('There are %d log events' % event_num)

    #=============get labels and event count of each sliding window =========#
    labels = []
    event_count_matrix = np.zeros((inst_number, event_num))
    for j in range(inst_number):
        label = 0  # 0 represent success, 1 represent failure
        for k in expanded_indexes_list[j]:
            event_index = event_mapping_data[k]
            event_count_matrix[j, event_index] += 1
            if label_data[k]:
                label = 1
                continue
        labels.append(label)
    assert inst_number == len(labels)
    print("Among all instances, %d are anomalies" % sum(labels))
    assert event_count_matrix.shape[0] == len(labels)
    return event_count_matrix, labels
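As a usage note, here is a short sketch of the labeled loading path in load_HDFS above, using the demo files added under data/HDFS/ in this commit; with label_file set and split_type='uniform', sessions are split class-uniformly into train and test:

```python
# Sketch: labeled load of the demo HDFS data shipped under data/HDFS/.
from loglizer import dataloader

(x_train, y_train), (x_test, y_test) = dataloader.load_HDFS(
    "data/HDFS/HDFS_100k.log_structured.csv",
    label_file="data/HDFS/anomaly_label.csv",
    window='session',
    train_ratio=0.5,
    split_type='uniform',   # class-uniform train/test split when labels are given
)
print(len(x_train), "training sessions,", int(y_train.sum()), "labeled anomalies")
```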
data/loglizer/models/DecisionTree.py
ADDED
@@ -0,0 +1,78 @@
"""
The implementation of the decision tree model for anomaly detection.

Authors:
    LogPAI Team

Reference:
    [1] Mike Chen, Alice X. Zheng, Jim Lloyd, Michael I. Jordan, Eric Brewer.
        Failure Diagnosis Using Decision Trees. IEEE International Conference
        on Autonomic Computing (ICAC), 2004.

"""

import numpy as np
from sklearn import tree
from ..utils import metrics

class DecisionTree(object):

    def __init__(self, criterion='gini', max_depth=None, max_features=None, class_weight=None):
        """ The Decision Tree model for anomaly detection
        Arguments
        ---------
        See DecisionTreeClassifier API: https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html

        Attributes
        ----------
            classifier: object, the classifier for anomaly detection

        """
        self.classifier = tree.DecisionTreeClassifier(criterion=criterion, max_depth=max_depth,
                                                      max_features=max_features, class_weight=class_weight)

    def fit(self, X, y):
        """
        Arguments
        ---------
            X: ndarray, the event count matrix of shape num_instances-by-num_events
        """
        print('====== Model summary ======')
        self.classifier.fit(X, y)

    def predict(self, X):
        """ Predict anomalies with the trained decision tree

        Arguments
        ---------
            X: the input event count matrix

        Returns
        -------
            y_pred: ndarray, the predicted label vector of shape (num_instances,)
        """

        y_pred = self.classifier.predict(X)
        return y_pred

    def predict_proba(self, X):
        """ Predict anomaly probabilities with the trained decision tree

        Arguments
        ---------
            X: the input event count matrix

        Returns
        -------
            y_pred: ndarray, the predicted label vector of shape (num_instances,)
        """

        y_pred = self.classifier.predict_proba(X)
        return y_pred

    def evaluate(self, X, y_true):
        print('====== Evaluation summary ======')
        y_pred = self.predict(X)
        precision, recall, f1 = metrics(y_pred, y_true)
        print('Precision: {:.3f}, recall: {:.3f}, F1-measure: {:.3f}\n'.format(precision, recall, f1))
        return precision, recall, f1
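A minimal fit/predict sketch for the DecisionTree wrapper above, on a small synthetic event count matrix; the values are illustrative only, and the import assumes the repository root is on the Python path:

import numpy as np
from loglizer.models import DecisionTree

# Toy event count matrix: 6 instances x 3 event types, with binary anomaly labels.
X_train = np.array([[2, 0, 1], [3, 0, 0], [2, 1, 0], [0, 4, 0], [1, 3, 1], [0, 5, 2]])
y_train = np.array([0, 0, 0, 1, 1, 1])

model = DecisionTree()
model.fit(X_train, y_train)
y_pred = model.predict(X_train)
model.evaluate(X_train, y_train)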
data/loglizer/models/InvariantsMiner.py
ADDED
@@ -0,0 +1,296 @@
"""
The implementation of Invariants Mining model for anomaly detection.

Authors:
    LogPAI Team

Reference:
    [1] Jian-Guang Lou, Qiang Fu, Shengqi Yang, Ye Xu, Jiang Li. Mining Invariants
        from Console Logs for System Problem Detection. USENIX Annual Technical
        Conference (ATC), 2010.

"""

import numpy as np
from itertools import combinations
from ..utils import metrics

class InvariantsMiner(object):

    def __init__(self, percentage=0.98, epsilon=0.5, longest_invarant=None, scale_list=[1,2,3]):
        """ The Invariants Mining model for anomaly detection

        Attributes
        ----------
            percentage: float, percentage of samples satisfying the condition that |X_j * V_i| < epsilon
            epsilon: float, the threshold for estimating the invariant space
            longest_invarant: int, the specified maximal length of invariant, default to None. Stop
                searching when the invariant length is larger than longest_invarant.
            scale_list: list, the list used to scale the theta of float into integer
            invariants_dict: dict, dictionary of invariants where key is the selected columns
                and value is the weights of the invariant
        """
        self.percentage = percentage
        self.epsilon = epsilon
        self.longest_invarant = longest_invarant
        self.scale_list = scale_list
        self.invariants_dict = None

    def fit(self, X):
        """
        Arguments
        ---------
            X: ndarray, the event count matrix of shape num_instances-by-num_events
        """
        print('====== Model summary ======')
        invar_dim = self._estimate_invarant_space(X)
        self._invariants_search(X, invar_dim)

    def predict(self, X):
        """ Predict anomalies with mined invariants

        Arguments
        ---------
            X: the input event count matrix

        Returns
        -------
            y_pred: ndarray, the predicted label vector of shape (num_instances,)
        """

        y_sum = np.zeros(X.shape[0])
        for cols, theta in self.invariants_dict.items():
            y_sum += np.fabs(np.dot(X[:, cols], np.array(theta)))
        y_pred = (y_sum > 1e-6).astype(int)
        return y_pred

    def evaluate(self, X, y_true):
        print('====== Evaluation summary ======')
        y_pred = self.predict(X)
        precision, recall, f1 = metrics(y_pred, y_true)
        print('Precision: {:.3f}, recall: {:.3f}, F1-measure: {:.3f}\n'.format(precision, recall, f1))
        return precision, recall, f1

    def _estimate_invarant_space(self, X):
        """ Estimate the dimension of invariant space using SVD decomposition

        Arguments
        ---------
            X: ndarray, the event count matrix of shape num_instances-by-num_events
            percentage: float, percentage of samples satisfying the condition that |X_j * V_i| < epsilon
            epsilon: float, the threshold for estimating the invariant space

        Returns
        -------
            r: the dimension of invariant space
        """
        covariance_matrix = np.dot(X.T, X)
        U, sigma, V = np.linalg.svd(covariance_matrix)  # SVD decomposition
        # Start from the right most column of matrix V, singular values are in ascending order
        num_instances, num_events = X.shape
        r = 0
        for i in range(num_events - 1, -1, -1):
            zero_count = sum(abs(np.dot(X, U[:, i])) < self.epsilon)
            if zero_count / float(num_instances) < self.percentage:
                break
            r += 1
        print('Invariant space dimension: {}'.format(r))

        return r

    def _invariants_search(self, X, r):
        """ Mine invariant relationships from X

        Arguments
        ---------
            X: ndarray, the event count matrix of shape num_instances-by-num_events
            r: the dimension of invariant space
        """

        num_instances, num_events = X.shape
        invariants_dict = dict()  # save the mined Invariants(value) and its corresponding columns(key)
        search_space = []  # only invariant candidates in this list are valid

        # invariant of only one column (all zero columns)
        init_cols = sorted([[item] for item in range(num_events)])
        for col in init_cols:
            search_space.append(col)
        init_col_list = init_cols[:]
        for col in init_cols:
            if np.count_nonzero(X[:, col]) == 0:
                invariants_dict[tuple(col)] = [1]
                search_space.remove(col)
                init_col_list.remove(col)

        item_list = init_col_list
        length = 2
        FLAG_break_loop = False
        # check invariant of more columns
        while len(item_list) != 0:
            if self.longest_invarant and len(item_list[0]) >= self.longest_invarant:
                break
            joined_item_list = self._join_set(item_list, length)  # generate new invariant candidates
            for items in joined_item_list:
                if self._check_candi_valid(items, length, search_space):
                    search_space.append(items)
            item_list = []
            for item in joined_item_list:
                if tuple(item) in invariants_dict:
                    continue
                if item not in search_space:
                    continue
                if not self._check_candi_valid(tuple(item), length, search_space) and length > 2:
                    search_space.remove(item)
                    continue  # an item must be superset of all other subitems in searchSpace, else skip
                validity, scaled_theta = self._check_invar_validity(X, item)
                if validity:
                    self._prune(invariants_dict.keys(), set(item), search_space)
                    invariants_dict[tuple(item)] = scaled_theta.tolist()
                    search_space.remove(item)
                else:
                    item_list.append(item)
                if len(invariants_dict) >= r:
                    FLAG_break_loop = True
                    break
            if FLAG_break_loop:
                break
            length += 1
        print('Mined {} invariants: {}\n'.format(len(invariants_dict), invariants_dict))
        self.invariants_dict = invariants_dict

    def _compute_eigenvector(self, X):
        """ calculate the smallest eigenvalue and corresponding eigenvector (theta in the paper)
            for a given sub_matrix

        Arguments
        ---------
            X: the event count matrix (each row is a log sequence vector, each column represents an event)

        Returns
        -------
            min_vec: the eigenvector of corresponding minimum eigen value
            FLAG_contain_zero: whether the min_vec contains zero (very small value)
        """

        FLAG_contain_zero = False
        count_zero = 0
        dot_result = np.dot(X.T, X)
        U, S, V = np.linalg.svd(dot_result)
        min_vec = U[:, -1]
        count_zero = sum(np.fabs(min_vec) < 1e-6)
        if count_zero != 0:
            FLAG_contain_zero = True
        return min_vec, FLAG_contain_zero


    def _check_invar_validity(self, X, selected_columns):
        """ scale the eigenvector of float number into integer, and check whether the scaled number is valid

        Arguments
        ---------
            X: the event count matrix (each row is a log sequence vector, each column represents an event)
            selected_columns: select columns from all column list

        Returns
        -------
            validity: whether the selected columns is valid
            scaled_theta: the scaled theta vector
        """

        sub_matrix = X[:, selected_columns]
        inst_num = X.shape[0]
        validity = False
        min_theta, FLAG_contain_zero = self._compute_eigenvector(sub_matrix)
        abs_min_theta = [np.fabs(it) for it in min_theta]
        if FLAG_contain_zero:
            return validity, []
        else:
            for i in self.scale_list:
                min_index = np.argmin(abs_min_theta)
                scale = float(i) / min_theta[min_index]
                scaled_theta = np.array([round(item * scale) for item in min_theta])
                scaled_theta[min_index] = i
                scaled_theta = scaled_theta.T
                if 0 in np.fabs(scaled_theta):
                    continue
                dot_submat_theta = np.dot(sub_matrix, scaled_theta)
                count_zero = 0
                for j in dot_submat_theta:
                    if np.fabs(j) < 1e-8:
                        count_zero += 1
                if count_zero >= self.percentage * inst_num:
                    validity = True
                    # print('A valid invariant is found: ',scaled_theta, selected_columns)
                    break
            return validity, scaled_theta

    def _prune(self, valid_cols, new_item_set, search_space):
        """ prune invalid combination of columns

        Arguments
        ---------
            valid_cols: existing valid column list
            new_item_set: item set to be merged
            search_space: the search space that stores possible candidates

        """

        if len(valid_cols) == 0:
            return
        for se in valid_cols:
            intersection = set(se) & new_item_set
            if len(intersection) == 0:
                continue
            union = set(se) | new_item_set
            for item in list(intersection):
                diff = sorted(list(union - set([item])))
                if diff in search_space:
                    search_space.remove(diff)


    def _join_set(self, item_list, length):
        """ Join a set with itself and returns the n-element (length) itemsets

        Arguments
        ---------
            item_list: current list of columns
            length: generate new items of length

        Returns
        -------
            return_list: list of items of length-element
        """

        set_len = len(item_list)
        return_list = []
        for i in range(set_len):
            for j in range(i + 1, set_len):
                i_set = set(item_list[i])
                j_set = set(item_list[j])
                if len(i_set.union(j_set)) == length:
                    joined = sorted(list(i_set.union(j_set)))
                    if joined not in return_list:
                        return_list.append(joined)
        return_list = sorted(return_list)
        return return_list


    def _check_candi_valid(self, item, length, search_space):
        """ check whether an item's subitems are in searchspace

        Arguments
        ---------
            item: item to be checked
            length: the length of item
            search_space: the search space that stores possible candidates

        Returns
        -------
            True or False
        """

        for subItem in combinations(item, length - 1):
            if sorted(list(subItem)) not in search_space:
                return False
        return True
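A minimal sketch of the unsupervised workflow for InvariantsMiner above: fit() only needs an event count matrix, and predict() flags instances that violate a mined invariant. The toy matrix below is constructed so that events 0 and 1 always co-occur equally (so x0 - x1 = 0 is an invariant); all values are illustrative:

import numpy as np
from loglizer.models import InvariantsMiner

X_train = np.array([[1, 1, 0], [2, 2, 1], [3, 3, 0], [4, 4, 2]], dtype=float)
X_test = np.array([[5, 5, 1], [3, 1, 0]], dtype=float)

model = InvariantsMiner(epsilon=0.5)
model.fit(X_train)            # unsupervised: no labels required
y_pred = model.predict(X_test)
print(y_pred)                 # instances violating a mined invariant are flagged as 1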
data/loglizer/models/IsolationForest.py
ADDED
@@ -0,0 +1,96 @@
"""
The implementation of IsolationForest model for anomaly detection.

Authors:
    LogPAI Team

Reference:
    [1] Fei Tony Liu, Kai Ming Ting, Zhi-Hua Zhou. Isolation Forest. International
        Conference on Data Mining (ICDM), 2008.

"""


import numpy as np
from sklearn.ensemble import IsolationForest as iForest
from ..utils import metrics

class IsolationForest(iForest):

    def __init__(self, n_estimators=100, max_samples='auto', contamination=0.03, **kwargs):
        """ The IsolationForest model for anomaly detection

        Arguments
        ---------
            n_estimators : int, optional (default=100). The number of base estimators in the ensemble.
            max_samples : int or float, optional (default="auto")
                The number of samples to draw from X to train each base estimator.
                - If int, then draw max_samples samples.
                - If float, then draw max_samples * X.shape[0] samples.
                - If "auto", then max_samples=min(256, n_samples).
                If max_samples is larger than the number of samples provided, all samples will be used
                for all trees (no sampling).
            contamination : float in (0., 0.5), optional (default='auto')
                The amount of contamination of the data set, i.e. the proportion of outliers in the data
                set. Used when fitting to define the threshold on the decision function. If 'auto', the
                decision function threshold is determined as in the original paper.
            max_features : int or float, optional (default=1.0)
                The number of features to draw from X to train each base estimator.
                - If int, then draw max_features features.
                - If float, then draw max_features * X.shape[1] features.
            bootstrap : boolean, optional (default=False)
                If True, individual trees are fit on random subsets of the training data sampled with replacement.
                If False, sampling without replacement is performed.
            n_jobs : int or None, optional (default=None)
                The number of jobs to run in parallel for both fit and predict. None means 1 unless in a
                joblib.parallel_backend context. -1 means using all processors.
            random_state : int, RandomState instance or None, optional (default=None)
                If int, random_state is the seed used by the random number generator;
                If RandomState instance, random_state is the random number generator;
                If None, the random number generator is the RandomState instance used by np.random.

        Reference
        ---------
            For more information, please visit https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.IsolationForest.html
        """

        super(IsolationForest, self).__init__(n_estimators=n_estimators, max_samples=max_samples,
                                              contamination=contamination, **kwargs)


    def fit(self, X):
        """
        Arguments
        ---------
            X: ndarray, the event count matrix of shape num_instances-by-num_events
        """

        print('====== Model summary ======')
        super(IsolationForest, self).fit(X)

    def predict(self, X):
        """ Predict anomalies with the trained isolation forest

        Arguments
        ---------
            X: the input event count matrix

        Returns
        -------
            y_pred: ndarray, the predicted label vector of shape (num_instances,)
        """

        y_pred = super(IsolationForest, self).predict(X)
        y_pred = np.where(y_pred > 0, 0, 1)
        return y_pred

    def evaluate(self, X, y_true):
        print('====== Evaluation summary ======')
        y_pred = self.predict(X)
        precision, recall, f1 = metrics(y_pred, y_true)
        print('Precision: {:.3f}, recall: {:.3f}, F1-measure: {:.3f}\n'.format(precision, recall, f1))
        return precision, recall, f1
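A minimal sketch for the IsolationForest wrapper above; the contamination value and the synthetic Poisson counts are assumptions for illustration:

import numpy as np
from loglizer.models import IsolationForest

# Toy event count matrix; contamination is the expected anomaly ratio.
X_train = np.random.RandomState(0).poisson(3, size=(100, 5)).astype(float)
model = IsolationForest(contamination=0.03, random_state=2019)
model.fit(X_train)
y_pred = model.predict(X_train)   # 1 = anomaly, 0 = normal after the remapping above
print(y_pred.sum(), 'instances flagged as anomalies')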
data/loglizer/models/LR.py
ADDED
@@ -0,0 +1,73 @@
# -*- coding: utf-8 -*-
"""
The implementation of the logistic regression model for anomaly detection.

Authors:
    LogPAI Team

Reference:
    [1] Peter Bodík, Moises Goldszmidt, Armando Fox, Hans Andersen. Fingerprinting
        the Datacenter: Automated Classification of Performance Crises. The European
        Conference on Computer Systems (EuroSys), 2010.

"""

import numpy as np
from sklearn.linear_model import LogisticRegression
from ..utils import metrics

class LR(object):

    def __init__(self, penalty='l2', C=100, tol=0.01, class_weight=None, max_iter=100):
        """ The Logistic Regression model for anomaly detection

        Attributes
        ----------
            classifier: object, the classifier for anomaly detection
        """
        self.classifier = LogisticRegression(penalty=penalty, C=C, tol=tol, class_weight=class_weight,
                                             max_iter=max_iter)

    def fit(self, X, y):
        """
        Arguments
        ---------
            X: ndarray, the event count matrix of shape num_instances-by-num_events
        """
        print('====== Model summary ======')
        self.classifier.fit(X, y)

    def predict(self, X):
        """ Predict anomalies with the trained classifier

        Arguments
        ---------
            X: the input event count matrix

        Returns
        -------
            y_pred: ndarray, the predicted label vector of shape (num_instances,)
        """
        y_pred = self.classifier.predict(X)
        return y_pred

    def predict_proba(self, X):
        """ Predict anomaly probabilities with the trained classifier

        Arguments
        ---------
            X: the input event count matrix

        Returns
        -------
            y_pred: ndarray, the predicted probability matrix
        """
        y_pred = self.classifier.predict_proba(X)
        return y_pred

    def evaluate(self, X, y_true):
        print('====== Evaluation summary ======')
        y_pred = self.predict(X)
        precision, recall, f1 = metrics(y_pred, y_true)
        print('Precision: {:.3f}, recall: {:.3f}, F1-measure: {:.3f}\n'.format(precision, recall, f1))
        return precision, recall, f1
data/loglizer/models/LogClustering.py
ADDED
@@ -0,0 +1,137 @@
"""
The implementation of Log Clustering model for anomaly detection.

Authors:
    LogPAI Team

Reference:
    [1] Qingwei Lin, Hongyu Zhang, Jian-Guang Lou, Yu Zhang, Xuewei Chen. Log Clustering
        based Problem Identification for Online Service Systems. International Conference
        on Software Engineering (ICSE), 2016.

"""

import numpy as np
import pprint
from scipy.special import expit
from numpy import linalg as LA
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import pdist, squareform
from ..utils import metrics


class LogClustering(object):

    def __init__(self, max_dist=0.3, anomaly_threshold=0.3, mode='online', num_bootstrap_samples=1000):
        """
        Attributes
        ----------
            max_dist: float, the threshold to stop the clustering process
            anomaly_threshold: float, the threshold for anomaly detection
            mode: str, 'offline' or 'online' mode for clustering
            num_bootstrap_samples: int, online clustering starts with a bootstraping process, which
                determines the initial cluster representatives offline using a subset of samples
            representatives: ndarray, the representative samples of clusters, of shape
                num_clusters-by-num_events
            cluster_size_dict: dict, the size of each cluster, used to update representatives online
        """
        self.max_dist = max_dist
        self.anomaly_threshold = anomaly_threshold
        self.mode = mode
        self.num_bootstrap_samples = num_bootstrap_samples
        self.representatives = list()
        self.cluster_size_dict = dict()

    def fit(self, X):
        print('====== Model summary ======')
        if self.mode == 'offline':
            # The offline mode can process about 10K samples only due to huge memory consumption.
            self._offline_clustering(X)
        elif self.mode == 'online':
            # Bootstrapping phase
            if self.num_bootstrap_samples > 0:
                X_bootstrap = X[0:self.num_bootstrap_samples, :]
                self._offline_clustering(X_bootstrap)
            # Online learning phase
            if X.shape[0] > self.num_bootstrap_samples:
                self._online_clustering(X)

    def predict(self, X):
        y_pred = np.zeros(X.shape[0])
        for i in range(X.shape[0]):
            min_dist, min_index = self._get_min_cluster_dist(X[i, :])
            if min_dist > self.anomaly_threshold:
                y_pred[i] = 1
        return y_pred

    def evaluate(self, X, y_true):
        print('====== Evaluation summary ======')
        y_pred = self.predict(X)
        precision, recall, f1 = metrics(y_pred, y_true)
        print('Precision: {:.3f}, recall: {:.3f}, F1-measure: {:.3f}\n' \
              .format(precision, recall, f1))
        return precision, recall, f1

    def _offline_clustering(self, X):
        print('Starting offline clustering...')
        p_dist = pdist(X, metric=self._distance_metric)
        Z = linkage(p_dist, 'complete')
        cluster_index = fcluster(Z, self.max_dist, criterion='distance')
        self._extract_representatives(X, cluster_index)
        print('Processed {} instances.'.format(X.shape[0]))
        print('Found {} clusters offline.\n'.format(len(self.representatives)))
        # print('The representive vectors are:')
        # pprint.pprint(self.representatives.tolist())

    def _extract_representatives(self, X, cluster_index):
        num_clusters = len(set(cluster_index))
        for clu in range(num_clusters):
            clu_idx = np.argwhere(cluster_index == clu + 1)[:, 0]
            self.cluster_size_dict[clu] = clu_idx.shape[0]
            repre_center = np.average(X[clu_idx, :], axis=0)
            self.representatives.append(repre_center)

    def _online_clustering(self, X):
        print("Starting online clustering...")
        for i in range(self.num_bootstrap_samples, X.shape[0]):
            if (i + 1) % 2000 == 0:
                print('Processed {} instances.'.format(i + 1))
            instance_vec = X[i, :]
            if len(self.representatives) > 0:
                min_dist, clu_id = self._get_min_cluster_dist(instance_vec)
                if min_dist <= self.max_dist:
                    self.cluster_size_dict[clu_id] += 1
                    self.representatives[clu_id] = self.representatives[clu_id] \
                                                   + (instance_vec - self.representatives[clu_id]) \
                                                   / self.cluster_size_dict[clu_id]
                    continue
            self.cluster_size_dict[len(self.representatives)] = 1
            self.representatives.append(instance_vec)
        print('Processed {} instances.'.format(X.shape[0]))
        print('Found {} clusters online.\n'.format(len(self.representatives)))
        # print('The representive vectors are:')
        # pprint.pprint(self.representatives.tolist())

    def _distance_metric(self, x1, x2):
        norm = LA.norm(x1) * LA.norm(x2)
        distance = 1 - np.dot(x1, x2) / (norm + 1e-8)
        if distance < 1e-8:
            distance = 0
        return distance

    def _get_min_cluster_dist(self, instance_vec):
        min_index = -1
        min_dist = float('inf')
        for i in range(len(self.representatives)):
            cluster_rep = self.representatives[i]
            dist = self._distance_metric(instance_vec, cluster_rep)
            if dist < 1e-8:
                min_dist = 0
                min_index = i
                break
            elif dist < min_dist:
                min_dist = dist
                min_index = i
        return min_dist, min_index
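A minimal sketch for LogClustering above in offline mode (the online mode expects at least num_bootstrap_samples instances before switching to incremental updates); the thresholds and toy vectors are illustrative only:

import numpy as np
from loglizer.models import LogClustering

X_train = np.array([[5., 1., 0.], [4., 1., 0.], [6., 2., 0.], [0., 0., 7.], [0., 1., 6.]])
model = LogClustering(max_dist=0.3, anomaly_threshold=0.3, mode='offline')
model.fit(X_train)
# The second test vector is far (in cosine distance) from both cluster representatives.
y_pred = model.predict(np.array([[5., 1., 0.], [0., 5., 0.]]))
print(y_pred)   # instances far from every cluster representative are flagged as 1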
data/loglizer/models/PCA.py
ADDED
@@ -0,0 +1,105 @@
"""
The implementation of PCA model for anomaly detection.

Authors:
    LogPAI Team

Reference:
    [1] Wei Xu, Ling Huang, Armando Fox, David Patterson, Michael I. Jordan.
        Large-Scale System Problems Detection by Mining Console Logs. ACM
        Symposium on Operating Systems Principles (SOSP), 2009.

"""

import numpy as np
from ..utils import metrics

class PCA(object):

    def __init__(self, n_components=0.95, threshold=None, c_alpha=3.2905):
        """ The PCA model for anomaly detection

        Attributes
        ----------
            proj_C: The projection matrix for projecting feature vector to abnormal space
            n_components: float/int, number of principal components or the variance ratio they cover
            threshold: float, the anomaly detection threshold. When setting to None, the threshold
                is automatically calculated using Q-statistics
            c_alpha: float, the c_alpha parameter for calculating anomaly detection threshold using
                Q-statistics. The following is the lookup table for c_alpha:
                c_alpha = 1.7507; # alpha = 0.08
                c_alpha = 1.9600; # alpha = 0.05
                c_alpha = 2.5758; # alpha = 0.01
                c_alpha = 2.807;  # alpha = 0.005
                c_alpha = 2.9677; # alpha = 0.003
                c_alpha = 3.2905; # alpha = 0.001
                c_alpha = 3.4808; # alpha = 0.0005
                c_alpha = 3.8906; # alpha = 0.0001
                c_alpha = 4.4172; # alpha = 0.00001
        """

        self.proj_C = None
        self.components = None
        self.n_components = n_components
        self.threshold = threshold
        self.c_alpha = c_alpha


    def fit(self, X):
        """
        Arguments
        ---------
            X: ndarray, the event count matrix of shape num_instances-by-num_events
        """

        print('====== Model summary ======')
        num_instances, num_events = X.shape
        X_cov = np.dot(X.T, X) / float(num_instances)
        U, sigma, V = np.linalg.svd(X_cov)
        n_components = self.n_components
        if n_components < 1:
            total_variance = np.sum(sigma)
            variance = 0
            for i in range(num_events):
                variance += sigma[i]
                if variance / total_variance >= n_components:
                    break
            n_components = i + 1

        P = U[:, :n_components]
        I = np.identity(num_events, int)
        self.components = P
        self.proj_C = I - np.dot(P, P.T)
        print('n_components: {}'.format(n_components))
        print('Project matrix shape: {}-by-{}'.format(self.proj_C.shape[0], self.proj_C.shape[1]))

        if not self.threshold:
            # Calculate threshold using Q-statistic. Information can be found at:
            # http://conferences.sigcomm.org/sigcomm/2004/papers/p405-lakhina111.pdf
            phi = np.zeros(3)
            for i in range(3):
                for j in range(n_components, num_events):
                    phi[i] += np.power(sigma[j], i + 1)
            h0 = 1.0 - 2 * phi[0] * phi[2] / (3.0 * phi[1] * phi[1])
            self.threshold = phi[0] * np.power(self.c_alpha * np.sqrt(2 * phi[1] * h0 * h0) / phi[0]
                                               + 1.0 + phi[1] * h0 * (h0 - 1) / (phi[0] * phi[0]),
                                               1.0 / h0)
        print('SPE threshold: {}\n'.format(self.threshold))

    def predict(self, X):
        assert self.proj_C is not None, 'PCA model needs to be trained before prediction.'
        y_pred = np.zeros(X.shape[0])
        for i in range(X.shape[0]):
            y_a = np.dot(self.proj_C, X[i, :])
            SPE = np.dot(y_a, y_a)
            if SPE > self.threshold:
                y_pred[i] = 1
        return y_pred

    def evaluate(self, X, y_true):
        print('====== Evaluation summary ======')
        y_pred = self.predict(X)
        precision, recall, f1 = metrics(y_pred, y_true)
        print('Precision: {:.3f}, recall: {:.3f}, F1-measure: {:.3f}\n'.format(precision, recall, f1))
        return precision, recall, f1
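A minimal sketch for the PCA detector above: fit() also derives the SPE threshold from the Q-statistic when threshold=None. The synthetic counts below are illustrative only:

import numpy as np
from loglizer.models import PCA

rng = np.random.RandomState(1)
X_train = rng.poisson(5, size=(200, 6)).astype(float)
model = PCA(n_components=0.95, c_alpha=3.2905)
model.fit(X_train)                 # also prints the derived SPE threshold
y_pred = model.predict(X_train)    # SPE above the threshold => anomaly
print(int(y_pred.sum()), 'instances above the SPE threshold')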
data/loglizer/models/SVM.py
ADDED
@@ -0,0 +1,64 @@
"""
The implementation of the SVM model for anomaly detection.

Authors:
    LogPAI Team

Reference:
    [1] Yinglung Liang, Yanyong Zhang, Hui Xiong, Ramendra Sahoo. Failure Prediction
        in IBM BlueGene/L Event Logs. IEEE International Conference on Data Mining
        (ICDM), 2007.

"""

import numpy as np
from sklearn import svm
from ..utils import metrics

class SVM(object):

    def __init__(self, penalty='l1', tol=0.1, C=1, dual=False, class_weight=None,
                 max_iter=100):
        """ The SVM model for anomaly detection
        Arguments
        ---------
        See SVM API: https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html

        Attributes
        ----------
            classifier: object, the classifier for anomaly detection

        """
        self.classifier = svm.LinearSVC(penalty=penalty, tol=tol, C=C, dual=dual,
                                        class_weight=class_weight, max_iter=max_iter)

    def fit(self, X, y):
        """
        Arguments
        ---------
            X: ndarray, the event count matrix of shape num_instances-by-num_events
        """
        print('====== Model summary ======')
        self.classifier.fit(X, y)

    def predict(self, X):
        """ Predict anomalies with the trained SVM classifier

        Arguments
        ---------
            X: the input event count matrix

        Returns
        -------
            y_pred: ndarray, the predicted label vector of shape (num_instances,)
        """

        y_pred = self.classifier.predict(X)
        return y_pred

    def evaluate(self, X, y_true):
        print('====== Evaluation summary ======')
        y_pred = self.predict(X)
        precision, recall, f1 = metrics(y_pred, y_true)
        print('Precision: {:.3f}, recall: {:.3f}, F1-measure: {:.3f}\n'.format(precision, recall, f1))
        return precision, recall, f1
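A minimal fit/predict sketch for the SVM wrapper above (values illustrative; LinearSVC may warn about convergence on such tiny data):

import numpy as np
from loglizer.models import SVM

X_train = np.array([[2., 0., 1.], [3., 0., 0.], [0., 4., 0.], [1., 3., 1.]])
y_train = np.array([0, 0, 1, 1])

model = SVM(penalty='l1', tol=0.1, C=1, dual=False)
model.fit(X_train, y_train)
print(model.predict(X_train))
model.evaluate(X_train, y_train)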
data/loglizer/models/__init__.py
ADDED
@@ -0,0 +1,8 @@
from .PCA import PCA
from .InvariantsMiner import InvariantsMiner
from .LogClustering import LogClustering
from .LR import LR
from .SVM import SVM
from .DecisionTree import DecisionTree
from .IsolationForest import IsolationForest
data/loglizer/models/__pycache__/DecisionTree.cpython-38.pyc
ADDED
Binary file (2.57 kB)
data/loglizer/models/__pycache__/InvariantsMiner.cpython-38.pyc
ADDED
Binary file (9.52 kB)
data/loglizer/models/__pycache__/IsolationForest.cpython-38.pyc
ADDED
Binary file (4.52 kB)
data/loglizer/models/__pycache__/LR.cpython-38.pyc
ADDED
Binary file (2.39 kB)
data/loglizer/models/__pycache__/LogClustering.cpython-38.pyc
ADDED
Binary file (5.08 kB)
data/loglizer/models/__pycache__/PCA.cpython-38.pyc
ADDED
Binary file (3.86 kB)
data/loglizer/models/__pycache__/SVM.cpython-38.pyc
ADDED
Binary file (2.36 kB)
data/loglizer/models/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (369 Bytes)
data/loglizer/preprocessing.py
ADDED
@@ -0,0 +1,123 @@
"""
The interface for data preprocessing.

Authors:
    LogPAI Team

"""

import pandas as pd
import os
import numpy as np
import re
from collections import Counter
from scipy.special import expit
from itertools import compress


class FeatureExtractor(object):

    def __init__(self):
        self.idf_vec = None
        self.mean_vec = None
        self.events = None
        self.term_weighting = None
        self.normalization = None
        self.oov = None

    def fit_transform(self, X_seq, term_weighting=None, normalization=None, oov=False, min_count=1):
        """ Fit and transform the data matrix

        Arguments
        ---------
            X_seq: ndarray, log sequences matrix
            term_weighting: None or `tf-idf`
            normalization: None or `zero-mean`
            oov: bool, whether to use OOV event
            min_count: int, the minimal occurrence of events (default 0), only valid when oov=True.

        Returns
        -------
            X_new: The transformed data matrix
        """
        print('====== Transformed train data summary ======')
        self.term_weighting = term_weighting
        self.normalization = normalization
        self.oov = oov

        X_counts = []
        for i in range(X_seq.shape[0]):
            event_counts = Counter(X_seq[i])
            X_counts.append(event_counts)
        X_df = pd.DataFrame(X_counts)
        X_df = X_df.fillna(0)
        self.events = X_df.columns
        X = X_df.values
        if self.oov:
            oov_vec = np.zeros(X.shape[0])
            if min_count > 1:
                idx = np.sum(X > 0, axis=0) >= min_count
                oov_vec = np.sum(X[:, ~idx] > 0, axis=1)
                X = X[:, idx]
                self.events = np.array(X_df.columns)[idx].tolist()
            X = np.hstack([X, oov_vec.reshape(X.shape[0], 1)])

        num_instance, num_event = X.shape
        if self.term_weighting == 'tf-idf':
            df_vec = np.sum(X > 0, axis=0)
            self.idf_vec = np.log(num_instance / (df_vec + 1e-8))
            idf_matrix = X * np.tile(self.idf_vec, (num_instance, 1))
            X = idf_matrix
        if self.normalization == 'zero-mean':
            mean_vec = X.mean(axis=0)
            self.mean_vec = mean_vec.reshape(1, num_event)
            X = X - np.tile(self.mean_vec, (num_instance, 1))
        elif self.normalization == 'sigmoid':
            X[X != 0] = expit(X[X != 0])
        X_new = X

        print('Train data shape: {}-by-{}\n'.format(X_new.shape[0], X_new.shape[1]))
        return X_new

    def transform(self, X_seq):
        """ Transform the data matrix with trained parameters

        Arguments
        ---------
            X: log sequences matrix
            term_weighting: None or `tf-idf`

        Returns
        -------
            X_new: The transformed data matrix
        """
        print('====== Transformed test data summary ======')
        X_counts = []
        for i in range(X_seq.shape[0]):
            event_counts = Counter(X_seq[i])
            X_counts.append(event_counts)
        X_df = pd.DataFrame(X_counts)
        X_df = X_df.fillna(0)
        empty_events = set(self.events) - set(X_df.columns)
        for event in empty_events:
            X_df[event] = [0] * len(X_df)
        X = X_df[self.events].values
        if self.oov:
            oov_vec = np.sum(X_df[X_df.columns.difference(self.events)].values > 0, axis=1)
            X = np.hstack([X, oov_vec.reshape(X.shape[0], 1)])

        num_instance, num_event = X.shape
        if self.term_weighting == 'tf-idf':
            idf_matrix = X * np.tile(self.idf_vec, (num_instance, 1))
            X = idf_matrix
        if self.normalization == 'zero-mean':
            X = X - np.tile(self.mean_vec, (num_instance, 1))
        elif self.normalization == 'sigmoid':
            X[X != 0] = expit(X[X != 0])
        X_new = X

        print('Test data shape: {}-by-{}\n'.format(X_new.shape[0], X_new.shape[1]))

        return X_new, self.events
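A minimal sketch of FeatureExtractor above: fit_transform() builds the event count matrix with optional tf-idf weighting and zero-mean normalization, and transform() reuses the fitted event vocabulary. The event IDs below are made up:

import numpy as np
from loglizer.preprocessing import FeatureExtractor

# Event ID sequences per session (dtype=object keeps ragged lists in one ndarray).
x_train = np.array([['E1', 'E2', 'E1'], ['E1', 'E3'], ['E2', 'E2', 'E3']], dtype=object)
x_test = np.array([['E1', 'E1'], ['E3', 'E4']], dtype=object)

fe = FeatureExtractor()
X_train = fe.fit_transform(x_train, term_weighting='tf-idf', normalization='zero-mean')
X_test, events = fe.transform(x_test)   # unseen events like 'E4' are dropped when oov=False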
data/loglizer/utils.py
ADDED
@@ -0,0 +1,29 @@
"""
The utility functions of loglizer

Authors:
    LogPAI Team

"""

from sklearn.metrics import precision_recall_fscore_support
import numpy as np


def metrics(y_pred, y_true):
    """ Calculate evaluation metrics for precision, recall, and f1.

    Arguments
    ---------
        y_pred: ndarray, the predicted result list
        y_true: ndarray, the ground truth label list

    Returns
    -------
        precision: float, precision value
        recall: float, recall value
        f1: float, f1 measure value
    """
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')
    return precision, recall, f1
data/requirements.txt
ADDED
@@ -0,0 +1,4 @@
scikit-learn
pandas
streamlit
openai
data/temp/HDFS_100k.log_structured.csv
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:cbb13cc937bb35ad683dfcd71929242d7b1e0c01e2a94f79c637847c30a42190
size 20892087
data_instances.csv
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5ba4d353dddd4e7ef0de168def9502f38d17dbe95ca5d966c5a6d68aa05b4955
size 909112
loglizer/__init__.py
ADDED
File without changes
loglizer/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (138 Bytes)
loglizer/__pycache__/dataloader.cpython-38.pyc
ADDED
Binary file (7.77 kB)
loglizer/__pycache__/preprocessing.cpython-38.pyc
ADDED
Binary file (3.54 kB)
loglizer/__pycache__/utils.cpython-38.pyc
ADDED
Binary file (836 Bytes)
loglizer/dataloader.py
ADDED
@@ -0,0 +1,268 @@
"""
The interface to load log datasets. The datasets currently supported include
HDFS and BGL.

Authors:
    LogPAI Team

"""

import pandas as pd
import os
import numpy as np
import re
from sklearn.utils import shuffle
from collections import OrderedDict

def _split_data(x_data, y_data=None, train_ratio=0, split_type='uniform'):
    if split_type == 'uniform' and y_data is not None:
        pos_idx = y_data > 0
        x_pos = x_data[pos_idx]
        y_pos = y_data[pos_idx]
        x_neg = x_data[~pos_idx]
        y_neg = y_data[~pos_idx]
        train_pos = int(train_ratio * x_pos.shape[0])
        train_neg = int(train_ratio * x_neg.shape[0])
        x_train = np.hstack([x_pos[0:train_pos], x_neg[0:train_neg]])
        y_train = np.hstack([y_pos[0:train_pos], y_neg[0:train_neg]])
        x_test = np.hstack([x_pos[train_pos:], x_neg[train_neg:]])
        y_test = np.hstack([y_pos[train_pos:], y_neg[train_neg:]])
    elif split_type == 'sequential':
        num_train = int(train_ratio * x_data.shape[0])
        x_train = x_data[0:num_train]
        x_test = x_data[num_train:]
        if y_data is None:
            y_train = None
            y_test = None
        else:
            y_train = y_data[0:num_train]
            y_test = y_data[num_train:]
    # Random shuffle
    indexes = shuffle(np.arange(x_train.shape[0]))
    x_train = x_train[indexes]
    if y_train is not None:
        y_train = y_train[indexes]
    return (x_train, y_train), (x_test, y_test)

def load_HDFS(log_file, label_file=None, window='session', train_ratio=0.5, split_type='sequential', save_csv=False, window_size=0):
    """ Load HDFS structured log into train and test data

    Arguments
    ---------
        log_file: str, the file path of structured log.
        label_file: str, the file path of anomaly labels, None for unlabeled data
        window: str, the window options including `session` (default).
        train_ratio: float, the ratio of training data for train/test split.
        split_type: `uniform` or `sequential`, which determines how to split dataset. `uniform` means
            to split positive samples and negative samples equally when setting label_file. `sequential`
            means to split the data sequentially without label_file. That is, the first part is for training,
            while the second part is for testing.

    Returns
    -------
        (x_train, y_train): the training data
        (x_test, y_test): the testing data
    """

    print('====== Input data summary ======')

    if log_file.endswith('.npz'):
        # Split training and validation set in a class-uniform way
        data = np.load(log_file)
        x_data = data['x_data']
        y_data = data['y_data']
        (x_train, y_train), (x_test, y_test) = _split_data(x_data, y_data, train_ratio, split_type)

    elif log_file.endswith('.csv'):
        assert window == 'session', "Only window=session is supported for HDFS dataset."
        print("Loading", log_file)
        struct_log = pd.read_csv(log_file, engine='c',
                                 na_filter=False, memory_map=True)
        data_dict = OrderedDict()
        for idx, row in struct_log.iterrows():
            blkId_list = re.findall(r'(blk_-?\d+)', row['Content'])
            blkId_set = set(blkId_list)
            for blk_Id in blkId_set:
                if not blk_Id in data_dict:
                    data_dict[blk_Id] = []
                data_dict[blk_Id].append(row['EventId'])
        data_df = pd.DataFrame(list(data_dict.items()), columns=['BlockId', 'EventSequence'])

        if label_file:
            # Split training and validation set in a class-uniform way
            label_data = pd.read_csv(label_file, engine='c', na_filter=False, memory_map=True)
            label_data = label_data.set_index('BlockId')
            label_dict = label_data['Label'].to_dict()
            data_df['Label'] = data_df['BlockId'].apply(lambda x: 1 if label_dict[x] == 'Anomaly' else 0)

            # Split train and test data
            (x_train, y_train), (x_test, y_test) = _split_data(data_df['EventSequence'].values,
                                                               data_df['Label'].values, train_ratio, split_type)

            print(y_train.sum(), y_test.sum())

        if save_csv:
            data_df.to_csv('data_instances.csv', index=False)

        if window_size > 0:
            x_train, window_y_train, y_train = slice_hdfs(x_train, y_train, window_size)
            x_test, window_y_test, y_test = slice_hdfs(x_test, y_test, window_size)
            log = "{} {} windows ({}/{} anomaly), {}/{} normal"
            print(log.format("Train:", x_train.shape[0], y_train.sum(), y_train.shape[0], (1-y_train).sum(), y_train.shape[0]))
            print(log.format("Test:", x_test.shape[0], y_test.sum(), y_test.shape[0], (1-y_test).sum(), y_test.shape[0]))
            return (x_train, window_y_train, y_train), (x_test, window_y_test, y_test)

        if label_file is None:
            if split_type == 'uniform':
                split_type = 'sequential'
                print('Warning: Only split_type=sequential is supported \
                    if label_file=None.'.format(split_type))
            # Split training and validation set sequentially
            x_data = data_df['EventSequence'].values
            (x_train, _), (x_test, _) = _split_data(x_data, train_ratio=train_ratio, split_type=split_type)
            print('Total: {} instances, train: {} instances, test: {} instances'.format(
                x_data.shape[0], x_train.shape[0], x_test.shape[0]))
            return (x_train, None), (x_test, None), data_df
    else:
        raise NotImplementedError('load_HDFS() only support csv and npz files!')

    num_train = x_train.shape[0]
    num_test = x_test.shape[0]
    num_total = num_train + num_test
    num_train_pos = sum(y_train)
    num_test_pos = sum(y_test)
    num_pos = num_train_pos + num_test_pos

    print('Total: {} instances, {} anomaly, {} normal' \
          .format(num_total, num_pos, num_total - num_pos))
    print('Train: {} instances, {} anomaly, {} normal' \
          .format(num_train, num_train_pos, num_train - num_train_pos))
    print('Test: {} instances, {} anomaly, {} normal\n' \
          .format(num_test, num_test_pos, num_test - num_test_pos))

    return (x_train, y_train), (x_test, y_test)

def slice_hdfs(x, y, window_size):
    results_data = []
    print("Slicing {} sessions, with window {}".format(x.shape[0], window_size))
    for idx, sequence in enumerate(x):
        seqlen = len(sequence)
        i = 0
        while (i + window_size) < seqlen:
            slice = sequence[i: i + window_size]
            results_data.append([idx, slice, sequence[i + window_size], y[idx]])
            i += 1
        else:
            slice = sequence[i: i + window_size]
            slice += ["#Pad"] * (window_size - len(slice))
            results_data.append([idx, slice, "#Pad", y[idx]])
    results_df = pd.DataFrame(results_data, columns=["SessionId", "EventSequence", "Label", "SessionLabel"])
    print("Slicing done, {} windows generated".format(results_df.shape[0]))
    return results_df[["SessionId", "EventSequence"]], results_df["Label"], results_df["SessionLabel"]



def load_BGL(log_file, label_file=None, window='sliding', time_interval=60, stepping_size=60,
             train_ratio=0.8):
    """ TODO

    """


def bgl_preprocess_data(para, raw_data, event_mapping_data):
    """ split logs into sliding windows, built an event count matrix and get the corresponding label

    Args:
    --------
    para: the parameters dictionary
    raw_data: list of (label, time)
    event_mapping_data: a list of event index, where each row index indicates a corresponding log

    Returns:
    --------
    event_count_matrix: event count matrix, where each row is an instance (log sequence vector)
    labels: a list of labels, 1 represents anomaly
    """

    # create the directory for saving the sliding windows (start_index, end_index), which can be directly loaded in future running
    if not os.path.exists(para['save_path']):
        os.mkdir(para['save_path'])
    log_size = raw_data.shape[0]
    sliding_file_path = para['save_path']+'sliding_'+str(para['window_size'])+'h_'+str(para['step_size'])+'h.csv'

    #=============divide into sliding windows=========#
    start_end_index_list = [] # list of tuples, tuple contains two number, which represent the start and end of sliding time window
    label_data, time_data = raw_data[:,0], raw_data[:, 1]
    if not os.path.exists(sliding_file_path):
        # split into sliding window
        start_time = time_data[0]
        start_index = 0
        end_index = 0

        # get the first start, end index, end time
        for cur_time in time_data:
            if cur_time < start_time + para['window_size']*3600:
                end_index += 1
                end_time = cur_time
            else:
                start_end_pair = tuple((start_index, end_index))
                start_end_index_list.append(start_end_pair)
                break
        # move the start and end index until next sliding window
        while end_index < log_size:
            start_time = start_time + para['step_size']*3600
            end_time = end_time + para['step_size']*3600
            for i in range(start_index, end_index):
                if time_data[i] < start_time:
                    i += 1
                else:
                    break
            for j in range(end_index, log_size):
                if time_data[j] < end_time:
                    j += 1
                else:
                    break
            start_index = i
            end_index = j
            start_end_pair = tuple((start_index, end_index))
            start_end_index_list.append(start_end_pair)
        inst_number = len(start_end_index_list)
        print('there are %d instances (sliding windows) in this dataset\n'%inst_number)
        np.savetxt(sliding_file_path, start_end_index_list, delimiter=',', fmt='%d')
    else:
        print('Loading start_end_index_list from file')
        start_end_index_list = pd.read_csv(sliding_file_path, header=None).values
        inst_number = len(start_end_index_list)
        print('there are %d instances (sliding windows) in this dataset' % inst_number)

    # get all the log indexes in each time window by ranging from start_index to end_index
    expanded_indexes_list = []
    for t in range(inst_number):
        index_list = []
        expanded_indexes_list.append(index_list)
    for i in range(inst_number):
        start_index = start_end_index_list[i][0]
        end_index = start_end_index_list[i][1]
        for l in range(start_index, end_index):
            expanded_indexes_list[i].append(l)

    event_mapping_data = [row[0] for row in event_mapping_data]
    event_num = len(list(set(event_mapping_data)))
    print('There are %d log events'%event_num)

    #=============get labels and event count of each sliding window =========#
    labels = []
    event_count_matrix = np.zeros((inst_number, event_num))
    for j in range(inst_number):
        label = 0   #0 represent success, 1 represent failure
        for k in expanded_indexes_list[j]:
            event_index = event_mapping_data[k]
            event_count_matrix[j, event_index] += 1
            if label_data[k]:
                label = 1
                continue
        labels.append(label)
    assert inst_number == len(labels)
    print("Among all instances, %d are anomalies"%sum(labels))
    assert event_count_matrix.shape[0] == len(labels)
    return event_count_matrix, labels
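A minimal end-to-end sketch combining load_HDFS, FeatureExtractor, and one of the detectors, assuming it is run from the repository root and that the HDFS CSVs shipped in this commit are available at the paths below; train_ratio and split_type are illustrative choices:

from loglizer import dataloader, preprocessing
from loglizer.models import PCA

struct_log = 'data/HDFS/HDFS_100k.log_structured.csv'
label_file = 'data/HDFS/anomaly_label.csv'

(x_train, y_train), (x_test, y_test) = dataloader.load_HDFS(
    struct_log, label_file=label_file, window='session',
    train_ratio=0.5, split_type='uniform')

fe = preprocessing.FeatureExtractor()
X_train = fe.fit_transform(x_train, term_weighting='tf-idf', normalization='zero-mean')
X_test, _ = fe.transform(x_test)

model = PCA()
model.fit(X_train)
model.evaluate(X_test, y_test)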
loglizer/models/DecisionTree.py
ADDED
@@ -0,0 +1,78 @@
1 |
+
"""
|
2 |
+
The implementation of the decision tree model for anomaly detection.
|
3 |
+
|
4 |
+
Authors:
|
5 |
+
LogPAI Team
|
6 |
+
|
7 |
+
Reference:
|
8 |
+
[1] Mike Chen, Alice X. Zheng, Jim Lloyd, Michael I. Jordan, Eric Brewer.
|
9 |
+
Failure Diagnosis Using Decision Trees. IEEE International Conference
|
10 |
+
on Autonomic Computing (ICAC), 2004.
|
11 |
+
|
12 |
+
"""
|
13 |
+
|
14 |
+
import numpy as np
|
15 |
+
from sklearn import tree
|
16 |
+
from ..utils import metrics
|
17 |
+
|
18 |
+
class DecisionTree(object):
|
19 |
+
|
20 |
+
def __init__(self, criterion='gini', max_depth=None, max_features=None, class_weight=None):
|
21 |
+
""" The Invariants Mining model for anomaly detection
|
22 |
+
Arguments
|
23 |
+
---------
|
24 |
+
See DecisionTreeClassifier API: https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html
|
25 |
+
|
26 |
+
Attributes
|
27 |
+
----------
|
28 |
+
classifier: object, the classifier for anomaly detection
|
29 |
+
|
30 |
+
"""
|
31 |
+
self.classifier = tree.DecisionTreeClassifier(criterion=criterion, max_depth=max_depth,
|
32 |
+
max_features=max_features, class_weight=class_weight)
|
33 |
+
|
34 |
+
def fit(self, X, y):
|
35 |
+
"""
|
36 |
+
Arguments
|
37 |
+
---------
|
38 |
+
X: ndarray, the event count matrix of shape num_instances-by-num_events
|
39 |
+
"""
|
40 |
+
print('====== Model summary ======')
|
41 |
+
self.classifier.fit(X, y)
|
42 |
+
|
43 |
+
def predict(self, X):
|
44 |
+
""" Predict anomalies with mined invariants
|
45 |
+
|
46 |
+
Arguments
|
47 |
+
---------
|
48 |
+
X: the input event count matrix
|
49 |
+
|
50 |
+
Returns
|
51 |
+
-------
|
52 |
+
y_pred: ndarray, the predicted label vector of shape (num_instances,)
|
53 |
+
"""
|
54 |
+
|
55 |
+
y_pred = self.classifier.predict(X)
|
56 |
+
return y_pred
|
57 |
+
|
58 |
+
def predict_proba(self, X):
|
59 |
+
""" Predict anomalies with mined invariants
|
60 |
+
|
61 |
+
Arguments
|
62 |
+
---------
|
63 |
+
X: the input event count matrix
|
64 |
+
|
65 |
+
Returns
|
66 |
+
-------
|
67 |
+
y_pred: ndarray, the predicted label vector of shape (num_instances,)
|
68 |
+
"""
|
69 |
+
|
70 |
+
y_pred = self.classifier.predict_proba(X)
|
71 |
+
return y_pred
|
72 |
+
|
73 |
+
def evaluate(self, X, y_true):
|
74 |
+
print('====== Evaluation summary ======')
|
75 |
+
y_pred = self.predict(X)
|
76 |
+
precision, recall, f1 = metrics(y_pred, y_true)
|
77 |
+
print('Precision: {:.3f}, recall: {:.3f}, F1-measure: {:.3f}\n'.format(precision, recall, f1))
|
78 |
+
return precision, recall, f1
|
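A minimal usage sketch for this wrapper; the arrays below are made up for illustration, while in the app the matrices are expected to come from the dataloader and preprocessing modules added in this commit. The LR and SVM wrappers later in the commit expose the same fit/predict/evaluate interface.

import numpy as np
from loglizer.models import DecisionTree

# Toy event count matrix (6 instances x 3 event types) with binary anomaly labels.
X_train = np.array([[2, 0, 1],
                    [1, 1, 0],
                    [0, 3, 1],
                    [2, 0, 1],
                    [0, 0, 4],
                    [1, 1, 0]], dtype=float)
y_train = np.array([0, 0, 1, 0, 1, 0])

model = DecisionTree(criterion='gini', max_depth=5)
model.fit(X_train, y_train)
y_pred = model.predict(X_train)
precision, recall, f1 = model.evaluate(X_train, y_train)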
loglizer/models/InvariantsMiner.py
ADDED
@@ -0,0 +1,296 @@
"""
The implementation of Invariants Mining model for anomaly detection.

Authors:
    LogPAI Team

Reference:
    [1] Jian-Guang Lou, Qiang Fu, Shengqi Yang, Ye Xu, Jiang Li. Mining Invariants
        from Console Logs for System Problem Detection. USENIX Annual Technical
        Conference (ATC), 2010.

"""

import numpy as np
from itertools import combinations
from ..utils import metrics


class InvariantsMiner(object):

    def __init__(self, percentage=0.98, epsilon=0.5, longest_invarant=None, scale_list=[1, 2, 3]):
        """ The Invariants Mining model for anomaly detection

        Attributes
        ----------
            percentage: float, percentage of samples satisfying the condition that |X_j * V_i| < epsilon
            epsilon: float, the threshold for estimating the invariant space
            longest_invarant: int, the specified maximal length of invariant, default to None. Stop
                searching when the invariant length is larger than longest_invarant.
            scale_list: list, the list used to scale the float theta into integers
            invariants_dict: dict, dictionary of invariants where the key is the selected columns
                and the value is the weights of the invariant
        """
        self.percentage = percentage
        self.epsilon = epsilon
        self.longest_invarant = longest_invarant
        self.scale_list = scale_list
        self.invariants_dict = None

    def fit(self, X):
        """
        Arguments
        ---------
            X: ndarray, the event count matrix of shape num_instances-by-num_events
        """
        print('====== Model summary ======')
        invar_dim = self._estimate_invarant_space(X)
        self._invariants_search(X, invar_dim)

    def predict(self, X):
        """ Predict anomalies with mined invariants

        Arguments
        ---------
            X: the input event count matrix

        Returns
        -------
            y_pred: ndarray, the predicted label vector of shape (num_instances,)
        """
        y_sum = np.zeros(X.shape[0])
        for cols, theta in self.invariants_dict.items():
            y_sum += np.fabs(np.dot(X[:, cols], np.array(theta)))
        y_pred = (y_sum > 1e-6).astype(int)
        return y_pred

    def evaluate(self, X, y_true):
        print('====== Evaluation summary ======')
        y_pred = self.predict(X)
        precision, recall, f1 = metrics(y_pred, y_true)
        print('Precision: {:.3f}, recall: {:.3f}, F1-measure: {:.3f}\n'.format(precision, recall, f1))
        return precision, recall, f1

    def _estimate_invarant_space(self, X):
        """ Estimate the dimension of the invariant space using SVD decomposition

        Arguments
        ---------
            X: ndarray, the event count matrix of shape num_instances-by-num_events
            percentage: float, percentage of samples satisfying the condition that |X_j * V_i| < epsilon
            epsilon: float, the threshold for estimating the invariant space

        Returns
        -------
            r: the dimension of the invariant space
        """
        covariance_matrix = np.dot(X.T, X)
        U, sigma, V = np.linalg.svd(covariance_matrix)  # SVD decomposition
        # Traverse the columns of U from right to left, i.e. starting from the smallest
        # singular values (np.linalg.svd returns singular values in descending order).
        num_instances, num_events = X.shape
        r = 0
        for i in range(num_events - 1, -1, -1):
            zero_count = sum(abs(np.dot(X, U[:, i])) < self.epsilon)
            if zero_count / float(num_instances) < self.percentage:
                break
            r += 1
        print('Invariant space dimension: {}'.format(r))

        return r

    def _invariants_search(self, X, r):
        """ Mine invariant relationships from X

        Arguments
        ---------
            X: ndarray, the event count matrix of shape num_instances-by-num_events
            r: the dimension of the invariant space
        """
        num_instances, num_events = X.shape
        invariants_dict = dict()  # save the mined invariants (value) and their corresponding columns (key)
        search_space = []  # only invariant candidates in this list are valid

        # invariant of only one column (all zero columns)
        init_cols = sorted([[item] for item in range(num_events)])
        for col in init_cols:
            search_space.append(col)
        init_col_list = init_cols[:]
        for col in init_cols:
            if np.count_nonzero(X[:, col]) == 0:
                invariants_dict[tuple(col)] = [1]
                search_space.remove(col)
                init_col_list.remove(col)

        item_list = init_col_list
        length = 2
        FLAG_break_loop = False
        # check invariants of more columns
        while len(item_list) != 0:
            if self.longest_invarant and len(item_list[0]) >= self.longest_invarant:
                break
            joined_item_list = self._join_set(item_list, length)  # generate new invariant candidates
            for items in joined_item_list:
                if self._check_candi_valid(items, length, search_space):
                    search_space.append(items)
            item_list = []
            for item in joined_item_list:
                if tuple(item) in invariants_dict:
                    continue
                if item not in search_space:
                    continue
                if not self._check_candi_valid(tuple(item), length, search_space) and length > 2:
                    search_space.remove(item)
                    continue  # an item must be a superset of all other subitems in search_space, else skip
                validity, scaled_theta = self._check_invar_validity(X, item)
                if validity:
                    self._prune(invariants_dict.keys(), set(item), search_space)
                    invariants_dict[tuple(item)] = scaled_theta.tolist()
                    search_space.remove(item)
                else:
                    item_list.append(item)
                if len(invariants_dict) >= r:
                    FLAG_break_loop = True
                    break
            if FLAG_break_loop:
                break
            length += 1
        print('Mined {} invariants: {}\n'.format(len(invariants_dict), invariants_dict))
        self.invariants_dict = invariants_dict

    def _compute_eigenvector(self, X):
        """ Calculate the smallest eigenvalue and corresponding eigenvector (theta in the paper)
            for a given sub_matrix

        Arguments
        ---------
            X: the event count matrix (each row is a log sequence vector, each column represents an event)

        Returns
        -------
            min_vec: the eigenvector of the corresponding minimum eigenvalue
            FLAG_contain_zero: whether min_vec contains zero (very small value)
        """
        FLAG_contain_zero = False
        count_zero = 0
        dot_result = np.dot(X.T, X)
        U, S, V = np.linalg.svd(dot_result)
        min_vec = U[:, -1]
        count_zero = sum(np.fabs(min_vec) < 1e-6)
        if count_zero != 0:
            FLAG_contain_zero = True
        return min_vec, FLAG_contain_zero

    def _check_invar_validity(self, X, selected_columns):
        """ Scale the float eigenvector into integers, and check whether the scaled vector is valid

        Arguments
        ---------
            X: the event count matrix (each row is a log sequence vector, each column represents an event)
            selected_columns: selected columns from the full column list

        Returns
        -------
            validity: whether the selected columns form a valid invariant
            scaled_theta: the scaled theta vector
        """
        sub_matrix = X[:, selected_columns]
        inst_num = X.shape[0]
        validity = False
        min_theta, FLAG_contain_zero = self._compute_eigenvector(sub_matrix)
        abs_min_theta = [np.fabs(it) for it in min_theta]
        if FLAG_contain_zero:
            return validity, []
        else:
            for i in self.scale_list:
                min_index = np.argmin(abs_min_theta)
                scale = float(i) / min_theta[min_index]
                scaled_theta = np.array([round(item * scale) for item in min_theta])
                scaled_theta[min_index] = i
                scaled_theta = scaled_theta.T
                if 0 in np.fabs(scaled_theta):
                    continue
                dot_submat_theta = np.dot(sub_matrix, scaled_theta)
                count_zero = 0
                for j in dot_submat_theta:
                    if np.fabs(j) < 1e-8:
                        count_zero += 1
                if count_zero >= self.percentage * inst_num:
                    validity = True
                    # print('A valid invariant is found: ', scaled_theta, selected_columns)
                    break
            return validity, scaled_theta

    def _prune(self, valid_cols, new_item_set, search_space):
        """ Prune invalid combinations of columns

        Arguments
        ---------
            valid_cols: existing valid column list
            new_item_set: item set to be merged
            search_space: the search space that stores possible candidates
        """
        if len(valid_cols) == 0:
            return
        for se in valid_cols:
            intersection = set(se) & new_item_set
            if len(intersection) == 0:
                continue
            union = set(se) | new_item_set
            for item in list(intersection):
                diff = sorted(list(union - set([item])))
                if diff in search_space:
                    search_space.remove(diff)

    def _join_set(self, item_list, length):
        """ Join a set with itself and return the n-element (length) itemsets

        Arguments
        ---------
            item_list: current list of columns
            length: generate new items of this length

        Returns
        -------
            return_list: list of items with length elements
        """
        set_len = len(item_list)
        return_list = []
        for i in range(set_len):
            for j in range(i + 1, set_len):
                i_set = set(item_list[i])
                j_set = set(item_list[j])
                if len(i_set.union(j_set)) == length:
                    joined = sorted(list(i_set.union(j_set)))
                    if joined not in return_list:
                        return_list.append(joined)
        return_list = sorted(return_list)
        return return_list

    def _check_candi_valid(self, item, length, search_space):
        """ Check whether an item's subitems are in the search space

        Arguments
        ---------
            item: item to be checked
            length: the length of the item
            search_space: the search space that stores possible candidates

        Returns
        -------
            True or False
        """
        for subItem in combinations(item, length - 1):
            if sorted(list(subItem)) not in search_space:
                return False
        return True
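Unlike the supervised wrappers, InvariantsMiner is unsupervised: fit mines linear invariants from (mostly normal) event count vectors, and predict flags any instance whose counts break a mined invariant. A toy sketch with a made-up matrix in which events 0 and 1 always occur equally often:

import numpy as np
from loglizer.models import InvariantsMiner

X_train = np.array([[1, 1, 0],
                    [2, 2, 1],
                    [3, 3, 0],
                    [1, 1, 2]], dtype=float)    # events 0 and 1 always balance out
X_test = np.array([[2, 2, 0],
                   [4, 1, 0]], dtype=float)     # the second row violates that invariant

miner = InvariantsMiner(percentage=0.98, epsilon=0.5)   # defaults shown in the file above
miner.fit(X_train)
print(miner.invariants_dict)   # mined invariants: {column tuple: integer weights}
print(miner.predict(X_test))   # the invariant-breaking row should be labeled 1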
loglizer/models/IsolationForest.py
ADDED
@@ -0,0 +1,96 @@
"""
The implementation of IsolationForest model for anomaly detection.

Authors:
    LogPAI Team

Reference:
    [1] Fei Tony Liu, Kai Ming Ting, Zhi-Hua Zhou. Isolation Forest. International
        Conference on Data Mining (ICDM), 2008.

"""

import numpy as np
from sklearn.ensemble import IsolationForest as iForest
from ..utils import metrics


class IsolationForest(iForest):

    def __init__(self, n_estimators=100, max_samples='auto', contamination=0.03, **kwargs):
        """ The IsolationForest model for anomaly detection

        Arguments
        ---------
            n_estimators : int, optional (default=100). The number of base estimators in the ensemble.
            max_samples : int or float, optional (default="auto")
                The number of samples to draw from X to train each base estimator.
                - If int, then draw max_samples samples.
                - If float, then draw max_samples * X.shape[0] samples.
                - If "auto", then max_samples=min(256, n_samples).
                If max_samples is larger than the number of samples provided, all samples will be used
                for all trees (no sampling).
            contamination : float in (0., 0.5), optional (default='auto')
                The amount of contamination of the data set, i.e. the proportion of outliers in the data
                set. Used when fitting to define the threshold on the decision function. If 'auto', the
                decision function threshold is determined as in the original paper.
            max_features : int or float, optional (default=1.0)
                The number of features to draw from X to train each base estimator.
                - If int, then draw max_features features.
                - If float, then draw max_features * X.shape[1] features.
            bootstrap : boolean, optional (default=False)
                If True, individual trees are fit on random subsets of the training data sampled with
                replacement. If False, sampling without replacement is performed.
            n_jobs : int or None, optional (default=None)
                The number of jobs to run in parallel for both fit and predict. None means 1 unless in a
                joblib.parallel_backend context. -1 means using all processors.
            random_state : int, RandomState instance or None, optional (default=None)
                If int, random_state is the seed used by the random number generator;
                If RandomState instance, random_state is the random number generator;
                If None, the random number generator is the RandomState instance used by np.random.

        Reference
        ---------
            For more information, please visit
            https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.IsolationForest.html
        """
        super(IsolationForest, self).__init__(n_estimators=n_estimators, max_samples=max_samples,
                                              contamination=contamination, **kwargs)

    def fit(self, X):
        """
        Arguments
        ---------
            X: ndarray, the event count matrix of shape num_instances-by-num_events
        """
        print('====== Model summary ======')
        super(IsolationForest, self).fit(X)

    def predict(self, X):
        """ Predict anomalies with the trained isolation forest

        Arguments
        ---------
            X: the input event count matrix

        Returns
        -------
            y_pred: ndarray, the predicted label vector of shape (num_instances,)
        """
        y_pred = super(IsolationForest, self).predict(X)
        y_pred = np.where(y_pred > 0, 0, 1)
        return y_pred

    def evaluate(self, X, y_true):
        print('====== Evaluation summary ======')
        y_pred = self.predict(X)
        precision, recall, f1 = metrics(y_pred, y_true)
        print('Precision: {:.3f}, recall: {:.3f}, F1-measure: {:.3f}\n'.format(precision, recall, f1))
        return precision, recall, f1
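Note the label convention here: scikit-learn's IsolationForest.predict returns +1 for inliers and -1 for outliers, which the np.where call above remaps to the 0/1 scheme used by the other models. A small sketch with synthetic counts (contamination is a guess at the anomaly ratio, not learned from labels):

import numpy as np
from loglizer.models import IsolationForest

rng = np.random.RandomState(0)
X_train = rng.poisson(lam=2.0, size=(200, 5)).astype(float)    # "normal" windows
X_test = np.vstack([X_train[:5], [[50, 0, 0, 0, 50]]])         # last row is an injected outlier

model = IsolationForest(n_estimators=100, contamination=0.03, random_state=0)
model.fit(X_train)
print(model.predict(X_test))   # 0 = normal, 1 = anomaly; the injected row should come out as 1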
loglizer/models/LR.py
ADDED
@@ -0,0 +1,73 @@
# -*- coding: utf-8 -*-
"""
The implementation of the logistic regression model for anomaly detection.

Authors:
    LogPAI Team

Reference:
    [1] Peter Bodík, Moises Goldszmidt, Armando Fox, Hans Andersen. Fingerprinting
        the Datacenter: Automated Classification of Performance Crises. The European
        Conference on Computer Systems (EuroSys), 2010.

"""

import numpy as np
from sklearn.linear_model import LogisticRegression
from ..utils import metrics


class LR(object):

    def __init__(self, penalty='l2', C=100, tol=0.01, class_weight=None, max_iter=100):
        """ The logistic regression model for anomaly detection

        Attributes
        ----------
            classifier: object, the classifier for anomaly detection
        """
        self.classifier = LogisticRegression(penalty=penalty, C=C, tol=tol, class_weight=class_weight,
                                             max_iter=max_iter)

    def fit(self, X, y):
        """
        Arguments
        ---------
            X: ndarray, the event count matrix of shape num_instances-by-num_events
            y: ndarray, the label vector of shape (num_instances,)
        """
        print('====== Model summary ======')
        self.classifier.fit(X, y)

    def predict(self, X):
        """ Predict anomalies with the trained classifier

        Arguments
        ---------
            X: the input event count matrix

        Returns
        -------
            y_pred: ndarray, the predicted label vector of shape (num_instances,)
        """
        y_pred = self.classifier.predict(X)
        return y_pred

    def predict_proba(self, X):
        """ Predict anomaly probabilities with the trained classifier

        Arguments
        ---------
            X: the input event count matrix

        Returns
        -------
            y_pred: ndarray, the predicted class probabilities of shape (num_instances, num_classes)
        """
        y_pred = self.classifier.predict_proba(X)
        return y_pred

    def evaluate(self, X, y_true):
        print('====== Evaluation summary ======')
        y_pred = self.predict(X)
        precision, recall, f1 = metrics(y_pred, y_true)
        print('Precision: {:.3f}, recall: {:.3f}, F1-measure: {:.3f}\n'.format(precision, recall, f1))
        return precision, recall, f1
loglizer/models/LogClustering.py
ADDED
@@ -0,0 +1,137 @@
"""
The implementation of Log Clustering model for anomaly detection.

Authors:
    LogPAI Team

Reference:
    [1] Qingwei Lin, Hongyu Zhang, Jian-Guang Lou, Yu Zhang, Xuewei Chen. Log Clustering
        based Problem Identification for Online Service Systems. International Conference
        on Software Engineering (ICSE), 2016.

"""

import numpy as np
import pprint
from scipy.special import expit
from numpy import linalg as LA
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import pdist, squareform
from ..utils import metrics


class LogClustering(object):

    def __init__(self, max_dist=0.3, anomaly_threshold=0.3, mode='online', num_bootstrap_samples=1000):
        """
        Attributes
        ----------
            max_dist: float, the threshold to stop the clustering process
            anomaly_threshold: float, the threshold for anomaly detection
            mode: str, 'offline' or 'online' mode for clustering
            num_bootstrap_samples: int, online clustering starts with a bootstrapping process, which
                determines the initial cluster representatives offline using a subset of samples
            representatives: ndarray, the representative samples of clusters, of shape
                num_clusters-by-num_events
            cluster_size_dict: dict, the size of each cluster, used to update representatives online
        """
        self.max_dist = max_dist
        self.anomaly_threshold = anomaly_threshold
        self.mode = mode
        self.num_bootstrap_samples = num_bootstrap_samples
        self.representatives = list()
        self.cluster_size_dict = dict()

    def fit(self, X):
        print('====== Model summary ======')
        if self.mode == 'offline':
            # The offline mode can process only about 10K samples due to huge memory consumption.
            self._offline_clustering(X)
        elif self.mode == 'online':
            # Bootstrapping phase
            if self.num_bootstrap_samples > 0:
                X_bootstrap = X[0:self.num_bootstrap_samples, :]
                self._offline_clustering(X_bootstrap)
            # Online learning phase
            if X.shape[0] > self.num_bootstrap_samples:
                self._online_clustering(X)

    def predict(self, X):
        y_pred = np.zeros(X.shape[0])
        for i in range(X.shape[0]):
            min_dist, min_index = self._get_min_cluster_dist(X[i, :])
            if min_dist > self.anomaly_threshold:
                y_pred[i] = 1
        return y_pred

    def evaluate(self, X, y_true):
        print('====== Evaluation summary ======')
        y_pred = self.predict(X)
        precision, recall, f1 = metrics(y_pred, y_true)
        print('Precision: {:.3f}, recall: {:.3f}, F1-measure: {:.3f}\n'.format(precision, recall, f1))
        return precision, recall, f1

    def _offline_clustering(self, X):
        print('Starting offline clustering...')
        p_dist = pdist(X, metric=self._distance_metric)
        Z = linkage(p_dist, 'complete')
        cluster_index = fcluster(Z, self.max_dist, criterion='distance')
        self._extract_representatives(X, cluster_index)
        print('Processed {} instances.'.format(X.shape[0]))
        print('Found {} clusters offline.\n'.format(len(self.representatives)))
        # print('The representative vectors are:')
        # pprint.pprint(self.representatives.tolist())

    def _extract_representatives(self, X, cluster_index):
        num_clusters = len(set(cluster_index))
        for clu in range(num_clusters):
            clu_idx = np.argwhere(cluster_index == clu + 1)[:, 0]
            self.cluster_size_dict[clu] = clu_idx.shape[0]
            repre_center = np.average(X[clu_idx, :], axis=0)
            self.representatives.append(repre_center)

    def _online_clustering(self, X):
        print("Starting online clustering...")
        for i in range(self.num_bootstrap_samples, X.shape[0]):
            if (i + 1) % 2000 == 0:
                print('Processed {} instances.'.format(i + 1))
            instance_vec = X[i, :]
            if len(self.representatives) > 0:
                min_dist, clu_id = self._get_min_cluster_dist(instance_vec)
                if min_dist <= self.max_dist:
                    self.cluster_size_dict[clu_id] += 1
                    self.representatives[clu_id] = self.representatives[clu_id] \
                                                   + (instance_vec - self.representatives[clu_id]) \
                                                   / self.cluster_size_dict[clu_id]
                    continue
            self.cluster_size_dict[len(self.representatives)] = 1
            self.representatives.append(instance_vec)
        print('Processed {} instances.'.format(X.shape[0]))
        print('Found {} clusters online.\n'.format(len(self.representatives)))
        # print('The representative vectors are:')
        # pprint.pprint(self.representatives.tolist())

    def _distance_metric(self, x1, x2):
        norm = LA.norm(x1) * LA.norm(x2)
        distance = 1 - np.dot(x1, x2) / (norm + 1e-8)
        if distance < 1e-8:
            distance = 0
        return distance

    def _get_min_cluster_dist(self, instance_vec):
        min_index = -1
        min_dist = float('inf')
        for i in range(len(self.representatives)):
            cluster_rep = self.representatives[i]
            dist = self._distance_metric(instance_vec, cluster_rep)
            if dist < 1e-8:
                min_dist = 0
                min_index = i
                break
            elif dist < min_dist:
                min_dist = dist
                min_index = i
        return min_dist, min_index
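In 'online' mode the model first clusters the initial num_bootstrap_samples instances offline (hierarchical clustering under the cosine-style distance above), then assigns or creates clusters incrementally; predict flags instances whose distance to every cluster representative exceeds anomaly_threshold. A synthetic sketch with made-up data:

import numpy as np
from loglizer.models import LogClustering

rng = np.random.RandomState(1)
# Training windows only ever use the first three event types.
X_train = np.hstack([rng.randint(1, 5, size=(300, 3)),
                     np.zeros((300, 3), dtype=int)]).astype(float)

model = LogClustering(max_dist=0.3, anomaly_threshold=0.3,
                      mode='online', num_bootstrap_samples=100)
model.fit(X_train)

X_test = np.array([[2.0, 1.0, 3.0, 0.0, 0.0, 0.0],    # resembles the training windows
                   [0.0, 0.0, 0.0, 5.0, 5.0, 5.0]])   # uses only unseen event types
print(model.predict(X_test))   # the second row is far from every cluster and should be flagged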
loglizer/models/PCA.py
ADDED
@@ -0,0 +1,105 @@
"""
The implementation of PCA model for anomaly detection.

Authors:
    LogPAI Team

Reference:
    [1] Wei Xu, Ling Huang, Armando Fox, David Patterson, Michael I. Jordan.
        Large-Scale System Problems Detection by Mining Console Logs. ACM
        Symposium on Operating Systems Principles (SOSP), 2009.

"""

import numpy as np
from ..utils import metrics


class PCA(object):

    def __init__(self, n_components=0.95, threshold=None, c_alpha=3.2905):
        """ The PCA model for anomaly detection

        Attributes
        ----------
            proj_C: the projection matrix for projecting a feature vector to the abnormal space
            n_components: float/int, number of principal components or the variance ratio they cover
            threshold: float, the anomaly detection threshold. When set to None, the threshold
                is automatically calculated using Q-statistics
            c_alpha: float, the c_alpha parameter for calculating the anomaly detection threshold using
                Q-statistics. The following is a lookup table for c_alpha:
                c_alpha = 1.7507  # alpha = 0.08
                c_alpha = 1.9600  # alpha = 0.05
                c_alpha = 2.5758  # alpha = 0.01
                c_alpha = 2.807   # alpha = 0.005
                c_alpha = 2.9677  # alpha = 0.003
                c_alpha = 3.2905  # alpha = 0.001
                c_alpha = 3.4808  # alpha = 0.0005
                c_alpha = 3.8906  # alpha = 0.0001
                c_alpha = 4.4172  # alpha = 0.00001
        """
        self.proj_C = None
        self.components = None
        self.n_components = n_components
        self.threshold = threshold
        self.c_alpha = c_alpha

    def fit(self, X):
        """
        Arguments
        ---------
            X: ndarray, the event count matrix of shape num_instances-by-num_events
        """
        print('====== Model summary ======')
        num_instances, num_events = X.shape
        X_cov = np.dot(X.T, X) / float(num_instances)
        U, sigma, V = np.linalg.svd(X_cov)
        n_components = self.n_components
        if n_components < 1:
            total_variance = np.sum(sigma)
            variance = 0
            for i in range(num_events):
                variance += sigma[i]
                if variance / total_variance >= n_components:
                    break
            n_components = i + 1

        P = U[:, :n_components]
        I = np.identity(num_events, int)
        self.components = P
        self.proj_C = I - np.dot(P, P.T)
        print('n_components: {}'.format(n_components))
        print('Project matrix shape: {}-by-{}'.format(self.proj_C.shape[0], self.proj_C.shape[1]))

        if not self.threshold:
            # Calculate threshold using the Q-statistic. More information can be found at:
            # http://conferences.sigcomm.org/sigcomm/2004/papers/p405-lakhina111.pdf
            phi = np.zeros(3)
            for i in range(3):
                for j in range(n_components, num_events):
                    phi[i] += np.power(sigma[j], i + 1)
            h0 = 1.0 - 2 * phi[0] * phi[2] / (3.0 * phi[1] * phi[1])
            self.threshold = phi[0] * np.power(self.c_alpha * np.sqrt(2 * phi[1] * h0 * h0) / phi[0]
                                               + 1.0 + phi[1] * h0 * (h0 - 1) / (phi[0] * phi[0]),
                                               1.0 / h0)
        print('SPE threshold: {}\n'.format(self.threshold))

    def predict(self, X):
        assert self.proj_C is not None, 'PCA model needs to be trained before prediction.'
        y_pred = np.zeros(X.shape[0])
        for i in range(X.shape[0]):
            y_a = np.dot(self.proj_C, X[i, :])
            SPE = np.dot(y_a, y_a)
            if SPE > self.threshold:
                y_pred[i] = 1
        return y_pred

    def evaluate(self, X, y_true):
        print('====== Evaluation summary ======')
        y_pred = self.predict(X)
        precision, recall, f1 = metrics(y_pred, y_true)
        print('Precision: {:.3f}, recall: {:.3f}, F1-measure: {:.3f}\n'.format(precision, recall, f1))
        return precision, recall, f1
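The fit above keeps the top principal components of the count matrix and builds proj_C, the projector onto the residual ("abnormal") subspace; predict computes the squared prediction error SPE = ||proj_C x||^2 and compares it with the Q-statistic threshold. A synthetic sketch with made-up, correlated event counts:

import numpy as np
from loglizer.models import PCA

rng = np.random.RandomState(2)
# Correlated counts: event 1 roughly tracks event 0; event 2 is independent noise.
base = rng.poisson(lam=3.0, size=(500, 1))
X_train = np.hstack([base, base + rng.binomial(1, 0.2, size=(500, 1)),
                     rng.poisson(lam=1.0, size=(500, 1))]).astype(float)

model = PCA(n_components=0.95)   # keep enough components to cover 95% of the variance
model.fit(X_train)               # also derives the SPE threshold via the Q-statistic

X_test = np.array([[3.0, 3.0, 1.0],    # follows the learned correlation
                   [9.0, 0.0, 1.0]])   # breaks it, so its residual (SPE) should be large
print(model.predict(X_test))           # typically [0. 1.] on this synthetic data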
loglizer/models/SVM.py
ADDED
@@ -0,0 +1,64 @@
"""
The implementation of the SVM model for anomaly detection.

Authors:
    LogPAI Team

Reference:
    [1] Yinglung Liang, Yanyong Zhang, Hui Xiong, Ramendra Sahoo. Failure Prediction
        in IBM BlueGene/L Event Logs. IEEE International Conference on Data Mining
        (ICDM), 2007.

"""

import numpy as np
from sklearn import svm
from ..utils import metrics


class SVM(object):

    def __init__(self, penalty='l1', tol=0.1, C=1, dual=False, class_weight=None,
                 max_iter=100):
        """ The SVM model for anomaly detection

        Arguments
        ---------
            See the LinearSVC API: https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html

        Attributes
        ----------
            classifier: object, the classifier for anomaly detection
        """
        self.classifier = svm.LinearSVC(penalty=penalty, tol=tol, C=C, dual=dual,
                                        class_weight=class_weight, max_iter=max_iter)

    def fit(self, X, y):
        """
        Arguments
        ---------
            X: ndarray, the event count matrix of shape num_instances-by-num_events
            y: ndarray, the label vector of shape (num_instances,)
        """
        print('====== Model summary ======')
        self.classifier.fit(X, y)

    def predict(self, X):
        """ Predict anomalies with the trained SVM

        Arguments
        ---------
            X: the input event count matrix

        Returns
        -------
            y_pred: ndarray, the predicted label vector of shape (num_instances,)
        """
        y_pred = self.classifier.predict(X)
        return y_pred

    def evaluate(self, X, y_true):
        print('====== Evaluation summary ======')
        y_pred = self.predict(X)
        precision, recall, f1 = metrics(y_pred, y_true)
        print('Precision: {:.3f}, recall: {:.3f}, F1-measure: {:.3f}\n'.format(precision, recall, f1))
        return precision, recall, f1
loglizer/models/__init__.py
ADDED
@@ -0,0 +1,8 @@
from .PCA import PCA
from .InvariantsMiner import InvariantsMiner
from .LogClustering import LogClustering
from .LR import LR
from .SVM import SVM
from .DecisionTree import DecisionTree
from .IsolationForest import IsolationForest
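With these package-level exports, the app (and the sketches above) can import every detector directly from loglizer.models. A quick sanity check, assuming the repository root is on the Python path:

from loglizer.models import (PCA, InvariantsMiner, LogClustering, LR, SVM,
                             DecisionTree, IsolationForest)

for cls in (PCA, InvariantsMiner, LogClustering, LR, SVM, DecisionTree, IsolationForest):
    # every wrapper is expected to expose fit and predict
    print(cls.__name__, hasattr(cls, 'fit') and hasattr(cls, 'predict'))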
loglizer/models/__pycache__/DecisionTree.cpython-38.pyc
ADDED
Binary file (2.57 kB)
loglizer/models/__pycache__/InvariantsMiner.cpython-38.pyc
ADDED
Binary file (9.52 kB)