jpcabangon committed
Commit 9c323ee · 1 Parent(s): 76ec49e

init logdecoder app files
This view is limited to 50 files because the commit contains too many changes.
Files changed (50):
  1. 46_Heatmap.png +0 -0
  2. app.py +314 -0
  3. data/46_Heatmap.png +0 -0
  4. data/HDFS/HDFS_100k.log_structured.csv +3 -0
  5. data/HDFS/anomaly_label.csv +3 -0
  6. data/app.py +314 -0
  7. data/data_instances.csv +3 -0
  8. data/loglizer/__init__.py +0 -0
  9. data/loglizer/__pycache__/__init__.cpython-38.pyc +0 -0
  10. data/loglizer/__pycache__/dataloader.cpython-38.pyc +0 -0
  11. data/loglizer/__pycache__/preprocessing.cpython-38.pyc +0 -0
  12. data/loglizer/__pycache__/utils.cpython-38.pyc +0 -0
  13. data/loglizer/dataloader.py +268 -0
  14. data/loglizer/models/DecisionTree.py +78 -0
  15. data/loglizer/models/InvariantsMiner.py +296 -0
  16. data/loglizer/models/IsolationForest.py +96 -0
  17. data/loglizer/models/LR.py +73 -0
  18. data/loglizer/models/LogClustering.py +137 -0
  19. data/loglizer/models/PCA.py +105 -0
  20. data/loglizer/models/SVM.py +64 -0
  21. data/loglizer/models/__init__.py +8 -0
  22. data/loglizer/models/__pycache__/DecisionTree.cpython-38.pyc +0 -0
  23. data/loglizer/models/__pycache__/InvariantsMiner.cpython-38.pyc +0 -0
  24. data/loglizer/models/__pycache__/IsolationForest.cpython-38.pyc +0 -0
  25. data/loglizer/models/__pycache__/LR.cpython-38.pyc +0 -0
  26. data/loglizer/models/__pycache__/LogClustering.cpython-38.pyc +0 -0
  27. data/loglizer/models/__pycache__/PCA.cpython-38.pyc +0 -0
  28. data/loglizer/models/__pycache__/SVM.cpython-38.pyc +0 -0
  29. data/loglizer/models/__pycache__/__init__.cpython-38.pyc +0 -0
  30. data/loglizer/preprocessing.py +123 -0
  31. data/loglizer/utils.py +29 -0
  32. data/requirements.txt +4 -0
  33. data/temp/HDFS_100k.log_structured.csv +3 -0
  34. data_instances.csv +3 -0
  35. loglizer/__init__.py +0 -0
  36. loglizer/__pycache__/__init__.cpython-38.pyc +0 -0
  37. loglizer/__pycache__/dataloader.cpython-38.pyc +0 -0
  38. loglizer/__pycache__/preprocessing.cpython-38.pyc +0 -0
  39. loglizer/__pycache__/utils.cpython-38.pyc +0 -0
  40. loglizer/dataloader.py +268 -0
  41. loglizer/models/DecisionTree.py +78 -0
  42. loglizer/models/InvariantsMiner.py +296 -0
  43. loglizer/models/IsolationForest.py +96 -0
  44. loglizer/models/LR.py +73 -0
  45. loglizer/models/LogClustering.py +137 -0
  46. loglizer/models/PCA.py +105 -0
  47. loglizer/models/SVM.py +64 -0
  48. loglizer/models/__init__.py +8 -0
  49. loglizer/models/__pycache__/DecisionTree.cpython-38.pyc +0 -0
  50. loglizer/models/__pycache__/InvariantsMiner.cpython-38.pyc +0 -0
46_Heatmap.png ADDED
app.py ADDED
@@ -0,0 +1,314 @@
+ # To run Streamlit, go to the terminal and type: 'streamlit run app.py'
+ # Core Packages ###########################
+ import streamlit as st
+ import os
+
+ import time
+ import numpy as np
+ import pandas as pd
+
+ from loglizer.models import PCA
+ from loglizer import dataloader, preprocessing
+ import openai
+
+ openai.api_key = os.environ["FREEEDU_OPENAI_API_KEY"]
+
+ project_title = "ChatGPT Log Decoder"
+ project_desc = """
+ Intrusion Detection Systems (IDS) are powerful security tools that monitor network traffic for suspicious activity and issue alerts when such activity is discovered.
+ While these systems are commonly used by cybersecurity professionals and IT experts,
+ the technical jargon and analysis that an IDS provides can be incomprehensible to the average user.
+ This project aims to bridge that gap by using ChatGPT's natural language processing to
+ translate the technical information produced by an IDS into human-readable form, allowing everyday users
+ to grasp their network's security status without needing a technical background.
+ """
+
+ project_link = "https://github.com/logpai/loglizer"
+ project_icon = "46_Heatmap.png"
+ st.set_page_config(page_title=project_title, initial_sidebar_state='collapsed', page_icon=project_icon)
+
+ # Additional info from the source project's README
+ add_info_md = """
+
+ # loglizer
+
+ **Loglizer is a machine learning-based log analysis toolkit for automated anomaly detection.**
+
+ Logs are indispensable in the development and maintenance of many software systems. They record detailed
+ runtime information during system operation, allowing developers and support engineers to monitor their systems and track abnormal behaviors and errors. Loglizer provides a toolkit that implements a number of machine learning-based log analysis techniques for automated anomaly detection.
+
+ :telescope: If you use loglizer in your research for publication, please kindly cite the following paper.
+ + Shilin He, Jieming Zhu, Pinjia He, Michael R. Lyu. [Experience Report: System Log Analysis for Anomaly Detection](https://jiemingzhu.github.io/pub/slhe_issre2016.pdf), *IEEE International Symposium on Software Reliability Engineering (ISSRE)*, 2016. [[Bibtex](https://dblp.org/rec/bibtex/conf/issre/HeZHL16)][[Chinese version](https://github.com/AmateurEvents/article/issues/2)]
+ **(ISSRE Most Influential Paper)**
+
+ ## Framework
+
+ ![Framework of Anomaly Detection](/docs/img/framework.png)
+
+ The log analysis framework for anomaly detection usually comprises the following components:
+
+ 1. **Log collection:** Logs are generated at runtime and aggregated into a centralized place with a data streaming pipeline, such as Flume or Kafka.
+ 2. **Log parsing:** The goal of log parsing is to convert unstructured log messages into a map of structured events, on which sophisticated machine learning models can then be applied. The details of log parsing can be found at [our logparser project](https://github.com/logpai/logparser).
+ 3. **Feature extraction:** Structured logs can be sliced into short log sequences through an interval window, sliding window, or session window. Feature extraction is then performed to vectorize each log sequence, for example as an event count vector (see the sketch after this list).
+ 4. **Anomaly detection:** Anomaly detection models are trained to check whether a given feature vector is an anomaly or not.
+
57
+
58
+ ## Models
59
+
60
+ Anomaly detection models currently available:
61
+
62
+ | Model | Paper reference |
63
+ | :--- | :--- |
64
+ | **Supervised models** |
65
+ | LR | [**EuroSys'10**] [Fingerprinting the Datacenter: Automated Classification of Performance Crises](https://www.microsoft.com/en-us/research/wp-content/uploads/2009/07/hiLighter.pdf), by Peter Bodík, Moises Goldszmidt, Armando Fox, Hans Andersen. [**Microsoft**] |
66
+ | Decision Tree | [**ICAC'04**] [Failure Diagnosis Using Decision Trees](http://www.cs.berkeley.edu/~brewer/papers/icac2004_chen_diagnosis.pdf), by Mike Chen, Alice X. Zheng, Jim Lloyd, Michael I. Jordan, Eric Brewer. [**eBay**] |
67
+ | SVM | [**ICDM'07**] [Failure Prediction in IBM BlueGene/L Event Logs](https://www.researchgate.net/publication/4324148_Failure_Prediction_in_IBM_BlueGeneL_Event_Logs), by Yinglung Liang, Yanyong Zhang, Hui Xiong, Ramendra Sahoo. [**IBM**]|
68
+ | **Unsupervised models** |
69
+ | LOF | [**SIGMOD'00**] [LOF: Identifying Density-Based Local Outliers](), by Markus M. Breunig, Hans-Peter Kriegel, Raymond T. Ng, Jörg Sander. |
70
+ | One-Class SVM | [**Neural Computation'01**] [Estimating the Support of a High-Dimensional Distribution](), by John Platt, Bernhard Schölkopf, John Shawe-Taylor, Alex J. Smola, Robert C. Williamson. |
71
+ | Isolation Forest | [**ICDM'08**] [Isolation Forest](https://cs.nju.edu.cn/zhouzh/zhouzh.files/publication/icdm08b.pdf), by Fei Tony Liu, Kai Ming Ting, Zhi-Hua Zhou. |
72
+ | PCA | [**SOSP'09**] [Large-Scale System Problems Detection by Mining Console Logs](http://iiis.tsinghua.edu.cn/~weixu/files/sosp09.pdf), by Wei Xu, Ling Huang, Armando Fox, David Patterson, Michael I. Jordan. [**Intel**] |
73
+ | Invariants Mining | [**ATC'10**] [Mining Invariants from Console Logs for System Problem Detection](https://www.usenix.org/legacy/event/atc10/tech/full_papers/Lou.pdf), by Jian-Guang Lou, Qiang Fu, Shengqi Yang, Ye Xu, Jiang Li. [**Microsoft**]|
74
+ | Clustering | [**ICSE'16**] [Log Clustering based Problem Identification for Online Service Systems](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/07/ICSE-2016-2-Log-Clustering-based-Problem-Identification-for-Online-Service-Systems.pdf), by Qingwei Lin, Hongyu Zhang, Jian-Guang Lou, Yu Zhang, Xuewei Chen. [**Microsoft**]|
75
+ | DeepLog (coming)| [**CCS'17**] [DeepLog: Anomaly Detection and Diagnosis from System Logs through Deep Learning](https://www.cs.utah.edu/~lifeifei/papers/deeplog.pdf), by Min Du, Feifei Li, Guineng Zheng, Vivek Srikumar. |
76
+ | AutoEncoder (coming)| [**Arxiv'18**] [Anomaly Detection using Autoencoders in High Performance Computing Systems](https://arxiv.org/abs/1811.05269), by Andrea Borghesi, Andrea Bartolini, Michele Lombardi, Michela Milano, Luca Benini. |
77
+
78
+
79
+ ## Log data
80
+ We have collected a set of labeled log datasets in [loghub](https://github.com/logpai/loghub) for research purposes. If you are interested in the datasets, please follow the link to submit your access request.
81
+
82
+ ## Install
83
+ ```bash
84
+ git clone https://github.com/logpai/loglizer.git
85
+ cd loglizer
86
+ pip install -r requirements.txt
87
+ ```
88
+
89
+ ## API usage
90
+
91
+ ```python
92
+ # Load HDFS dataset. If you would like to try your own log, you need to rewrite the load function.
93
+ (x_train, y_train), (x_test, y_test) = dataloader.load_HDFS(...)
94
+
95
+ # Feature extraction and transformation
96
+ feature_extractor = preprocessing.FeatureExtractor()
97
+ feature_extractor.fit_transform(...)
98
+
99
+ # Model training
100
+ model = PCA()
101
+ model.fit(...)
102
+
103
+ # Feature transform after fitting
104
+ x_test = feature_extractor.transform(...)
105
+ # Model evaluation with labeled data
106
+ model.evaluate(...)
107
+
108
+ # Anomaly prediction
109
+ x_test = feature_extractor.transform(...)
110
+ model.predict(...) # predict anomalies on given data
111
+ ```
112
+
113
+ For more details, please follow [the demo](./docs/demo.md) in the docs to get started. Please note that all ML models are not magic, you need to figure out how to tune the parameters in order to make them work on your own data.
114
+
115
+ ## Benchmarking results
116
+
117
+ If you would like to reproduce the following results, please run [benchmarks/HDFS_bechmark.py](./benchmarks/HDFS_bechmark.py) on the full HDFS dataset (HDFS100k is for demo only).
118
+
119
+ | | | HDFS | |
120
+ | :----:|:----:|:----:|:----:|
121
+ | **Model** | **Precision** | **Recall** | **F1** |
122
+ | LR| 0.955 | 0.911 | 0.933 |
123
+ | Decision Tree | 0.998 | 0.998 | 0.998 |
124
+ | SVM| 0.959 | 0.970 | 0.965 |
125
+ | LOF | 0.967 | 0.561 | 0.710 |
126
+ | One-Class SVM | 0.995 | 0.222| 0.363 |
127
+ | Isolation Forest | 0.830 | 0.776 | 0.802 |
128
+ | PCA | 0.975 | 0.635 | 0.769|
129
+ | Invariants Mining | 0.888 | 0.945 | 0.915|
130
+ | Clustering | 1.000 | 0.720 | 0.837 |
131
+
132
+ ## Contributors
133
+ + [Shilin He](https://shilinhe.github.io), The Chinese University of Hong Kong
134
+ + [Jieming Zhu](https://jiemingzhu.github.io), The Chinese University of Hong Kong, currently at Huawei Noah's Ark Lab
135
+ + [Pinjia He](https://pinjiahe.github.io/), The Chinese University of Hong Kong, currently at ETH Zurich
136
+
137
+
138
+ ## Feedback
139
+ For any questions or feedback, please post to [the issue page](https://github.com/logpai/loglizer/issues/new).
140
+
141
+ """
+ #######################################################################################################################
+ default_log_path = os.path.join("data", "HDFS", "HDFS_100k.log_structured.csv")
+
+ def load_loglizer(train_log_path=default_log_path):
+     print("Loading Loglizer PCA model:")
+     start = time.time()
+     (x_train, _), (_, _), _ = dataloader.load_HDFS(train_log_path, window='session',
+                                                    split_type='sequential', save_csv=True)
+     feature_extractor = preprocessing.FeatureExtractor()
+     x_train = feature_extractor.fit_transform(x_train, term_weighting='tf-idf',
+                                               normalization='zero-mean')
+     model = PCA()
+     model.fit(x_train)
+
+     stop = time.time()
+     print("Loading complete. Time elapsed: " + str(stop - start))
+
+     # Extract the event template for each unique EventId in the source structured log file
+     # Load the CSV file
+     df = pd.read_csv(train_log_path)
+
+     # Extract unique 'EventId' and 'EventTemplate' pairs
+     event_names = df[['EventId', 'EventTemplate']].drop_duplicates()
+     event_names_dict = dict(zip(event_names['EventId'], event_names['EventTemplate']))
+
+     return model, feature_extractor, event_names_dict
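+
+ # Minimal usage sketch (hypothetical variable names; assumes the default structured
+ # HDFS log shipped with this app is present):
+ #   model, feature_extractor, event_names_dict = load_loglizer()
+ #   x_new, col_names = feature_extractor.transform(x_new_sequences)
+ #   predictions = model.predict(x_new)  # 1 = anomaly, 0 = normal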
+
+ def analyze_with_chatgpt(anomalous_packets):
+     context = {"role": "system",
+                "content": """
+     You will receive 5 rows of transformed data logs, where the middlemost log is flagged as anomalous.
+     Your job is to analyze and compare this log with its surrounding context logs, and find out why it was flagged as anomalous.
+     Your response should be 50 to 150 words only; be as concise as possible and address the user directly.
+     Only mention the key issues and the cause of the anomaly. Never mention the column names, the middlemost log, or the surrounding context logs.
+     You can mention what the irregular values in a column represent, and base your report on that.
+     Assume that the user has no knowledge of networks or cybersecurity. Your response should start with
+     "Based on the logs,".
+     Try to avoid telling the user 'if the issue persists'.
+     Give simple step-by-step advice on what the user can do on their own.
+     If the advice is beyond the scope of the everyday user, notify them that the assistance of a professional
+     is necessary for the level of intrusion you found.
+     """}
+     message_prompt = []
+     message_prompt.append(context)
+     anomalies = f"*Column Names*: {anomalous_packets.columns} --- "
+     # Concatenate every row of the context window into a single user message
+     for i, p in anomalous_packets.iterrows():
+         anomalies += f"*Row {i + 1}*: "
+         anomalies += str(p.values)
+         anomalies += " --- "
+
+     message_format = {"role": "user",
+                       "content": anomalies}
+     message_prompt.append(message_format)
+
+     # Generate the response
+     response = openai.ChatCompletion.create(
+         model="gpt-3.5-turbo",
+         messages=message_prompt,
+         max_tokens=1008,
+         temperature=0.7,
+         n=1,
+         stop=None,
+     )
+
+     # Extract the response text from the API response
+     response_text = response['choices'][0]['message']['content']
+     # Return the response text
+     return response_text
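+
+ # Example call (hypothetical data; the app itself passes a window of rows around each anomaly):
+ #   window_df = pd.DataFrame([[0, 1], [0, 9], [0, 1]], columns=["Event A", "Event B"])
+ #   print(analyze_with_chatgpt(window_df))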
+
+ def main():
+     head_col = st.columns([1, 8])
+     with head_col[0]:
+         st.image(project_icon)
+     with head_col[1]:
+         st.title(project_title)
+
+     st.write(project_desc)
+     st.write(f"Source Project: {project_link}")
+     expander = st.expander("Additional Information on the Source Project (Loglizer)")
+     expander.markdown(add_info_md)
+     st.markdown("***")
+     st.subheader("")
+     #########################################
+
+     # Instructions and file upload button
+     st.subheader("""
+     How to use:
+     1. Upload your log file (must be .csv format)
+     2. Click the 'Run Loglizer' button
+     """)
+     uploaded_file = st.file_uploader("Upload a log file in csv format:",
+                                      type=["csv"],
+                                      accept_multiple_files=False)
+
+     #########################################
+
+     run_button = st.button("Run Loglizer")
+
+     if "anomalies" not in st.session_state:
+         st.session_state.anomalies = []
+     if "col_names" not in st.session_state:
+         st.session_state.col_names = []
+
+     # Button is clicked
+     if run_button:
+         if uploaded_file is None:  # no file was uploaded
+             st.error("Please upload a .csv log file")
+
+         else:
+             # Ensure the temp directory exists
+             if not os.path.exists('temp'):
+                 os.makedirs('temp')
+
+             filename = os.path.basename(uploaded_file.name)
+             file_path = os.path.join("temp", filename)
+             # Write the uploaded file out to the temp directory
+             with open(file_path, 'wb') as f:
+                 f.write(uploaded_file.getbuffer())
+
+             with st.spinner('Loading Loglizer...'):
+                 model, feature_extractor, event_names_dict = load_loglizer()
+
+             # Simulate the 'live' log feed
+             (x_live, _), (_, _), _ = dataloader.load_HDFS(file_path, window='session', split_type='sequential')
+             x_live, st.session_state.col_names = feature_extractor.transform(x_live)
+
+             # Map event IDs to event templates
+             st.session_state.col_names = [event_names_dict[id] if id in event_names_dict else id for id in
+                                           st.session_state.col_names]
+
+             for i in range(len(x_live)):
+                 log = x_live[i]
+                 log = np.expand_dims(log, axis=0)
+                 prediction = model.predict(log)
+                 print(f"prediction for x_live[{i}]: {prediction}")
+                 if prediction == 1:
+                     # Anomaly detected: keep the two logs before and after as context
+                     context = x_live[max(0, i - 2):min(i + 3, len(x_live))]
+                     st.session_state.anomalies.append(context)
+
+     st.write("Anomalous events")
+     st.write(len(st.session_state.anomalies))
+
+     if len(st.session_state.anomalies) > 0:
+         # Display the anomalous packets and send them to ChatGPT
+         selected_index = st.selectbox('Select an anomaly', list(range(len(st.session_state.anomalies))), 0)
+         selected_anomaly = st.session_state.anomalies[selected_index]
+         st.write(f"Loglizer found and compiled {selected_anomaly.shape[1]} events from the logs.\n The middle entry has an anomaly:")
+
+         # Convert the numpy array to a pandas DataFrame
+         selected_anomaly_df = pd.DataFrame(selected_anomaly)
+         # Set the column names
+         selected_anomaly_df.columns = st.session_state.col_names
+
+         st.write(selected_anomaly_df)
+
+         if st.button('Send to ChatGPT'):
+             result = analyze_with_chatgpt(selected_anomaly_df)
+             st.write(result)
+
+
+ if __name__ == '__main__':
+     main()
+
+ # To run Streamlit, go to the terminal and type: 'streamlit run app.py'
data/46_Heatmap.png ADDED
data/HDFS/HDFS_100k.log_structured.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cbb13cc937bb35ad683dfcd71929242d7b1e0c01e2a94f79c637847c30a42190
+ size 20892087
data/HDFS/anomaly_label.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1c711ed6c8848fc3243fb4d092f172f31d128c8a6ec7f26ebba72ab931885ed8
+ size 18637554
data/app.py ADDED
@@ -0,0 +1,314 @@
(contents identical to app.py above; verbatim duplicate omitted)
data/data_instances.csv ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5ba4d353dddd4e7ef0de168def9502f38d17dbe95ca5d966c5a6d68aa05b4955
+ size 909112
data/loglizer/__init__.py ADDED
File without changes
data/loglizer/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (138 Bytes)
data/loglizer/__pycache__/dataloader.cpython-38.pyc ADDED
Binary file (7.77 kB)
data/loglizer/__pycache__/preprocessing.cpython-38.pyc ADDED
Binary file (3.54 kB)
data/loglizer/__pycache__/utils.cpython-38.pyc ADDED
Binary file (836 Bytes)
data/loglizer/dataloader.py ADDED
@@ -0,0 +1,268 @@
+ """
+ The interface to load log datasets. The datasets currently supported include
+ HDFS and BGL.
+
+ Authors:
+     LogPAI Team
+
+ """
+
+ import pandas as pd
+ import os
+ import numpy as np
+ import re
+ from sklearn.utils import shuffle
+ from collections import OrderedDict
+
+
+ def _split_data(x_data, y_data=None, train_ratio=0, split_type='uniform'):
+     if split_type == 'uniform' and y_data is not None:
+         pos_idx = y_data > 0
+         x_pos = x_data[pos_idx]
+         y_pos = y_data[pos_idx]
+         x_neg = x_data[~pos_idx]
+         y_neg = y_data[~pos_idx]
+         train_pos = int(train_ratio * x_pos.shape[0])
+         train_neg = int(train_ratio * x_neg.shape[0])
+         x_train = np.hstack([x_pos[0:train_pos], x_neg[0:train_neg]])
+         y_train = np.hstack([y_pos[0:train_pos], y_neg[0:train_neg]])
+         x_test = np.hstack([x_pos[train_pos:], x_neg[train_neg:]])
+         y_test = np.hstack([y_pos[train_pos:], y_neg[train_neg:]])
+     elif split_type == 'sequential':
+         num_train = int(train_ratio * x_data.shape[0])
+         x_train = x_data[0:num_train]
+         x_test = x_data[num_train:]
+         if y_data is None:
+             y_train = None
+             y_test = None
+         else:
+             y_train = y_data[0:num_train]
+             y_test = y_data[num_train:]
+     # Random shuffle
+     indexes = shuffle(np.arange(x_train.shape[0]))
+     x_train = x_train[indexes]
+     if y_train is not None:
+         y_train = y_train[indexes]
+     return (x_train, y_train), (x_test, y_test)
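+ # Example (hypothetical input): _split_data(np.array([1, 2, 3, 4]), train_ratio=0.5,
+ # split_type='sequential') returns a shuffled [1, 2] as x_train and [3, 4] as x_test;
+ # split_type='uniform' instead splits anomalous and normal instances in the same
+ # ratio on both sides (it requires y_data).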
+
+
+ def load_HDFS(log_file, label_file=None, window='session', train_ratio=0.5, split_type='sequential',
+               save_csv=False, window_size=0):
+     """ Load HDFS structured log into train and test data
+
+     Arguments
+     ---------
+     log_file: str, the file path of the structured log.
+     label_file: str, the file path of anomaly labels, None for unlabeled data.
+     window: str, the window options, including `session` (default).
+     train_ratio: float, the ratio of training data for the train/test split.
+     split_type: `uniform` or `sequential`, which determines how to split the dataset. `uniform` means
+         to split positive samples and negative samples equally when setting label_file. `sequential`
+         means to split the data sequentially without label_file, i.e., the first part is for training
+         while the second part is for testing.
+     save_csv: bool, whether to save the session instances to data_instances.csv.
+     window_size: int, if larger than 0, the sessions are further sliced into windows of this size.
+
+     Returns
+     -------
+     (x_train, y_train): the training data
+     (x_test, y_test): the testing data
+     """
+
+     print('====== Input data summary ======')
+
+     if log_file.endswith('.npz'):
+         # Split the training and validation set in a class-uniform way
+         data = np.load(log_file)
+         x_data = data['x_data']
+         y_data = data['y_data']
+         (x_train, y_train), (x_test, y_test) = _split_data(x_data, y_data, train_ratio, split_type)
+
+     elif log_file.endswith('.csv'):
+         assert window == 'session', "Only window=session is supported for the HDFS dataset."
+         print("Loading", log_file)
+         struct_log = pd.read_csv(log_file, engine='c', na_filter=False, memory_map=True)
+         data_dict = OrderedDict()
+         for idx, row in struct_log.iterrows():
+             blkId_list = re.findall(r'(blk_-?\d+)', row['Content'])
+             blkId_set = set(blkId_list)
+             for blk_Id in blkId_set:
+                 if blk_Id not in data_dict:
+                     data_dict[blk_Id] = []
+                 data_dict[blk_Id].append(row['EventId'])
+         data_df = pd.DataFrame(list(data_dict.items()), columns=['BlockId', 'EventSequence'])
+
+         if label_file:
+             # Split the training and validation set in a class-uniform way
+             label_data = pd.read_csv(label_file, engine='c', na_filter=False, memory_map=True)
+             label_data = label_data.set_index('BlockId')
+             label_dict = label_data['Label'].to_dict()
+             data_df['Label'] = data_df['BlockId'].apply(lambda x: 1 if label_dict[x] == 'Anomaly' else 0)
+
+             # Split train and test data
+             (x_train, y_train), (x_test, y_test) = _split_data(data_df['EventSequence'].values,
+                                                                data_df['Label'].values, train_ratio, split_type)
+             print(y_train.sum(), y_test.sum())
+
+         if save_csv:
+             data_df.to_csv('data_instances.csv', index=False)
+
+         if window_size > 0:
+             x_train, window_y_train, y_train = slice_hdfs(x_train, y_train, window_size)
+             x_test, window_y_test, y_test = slice_hdfs(x_test, y_test, window_size)
+             log = "{} {} windows ({}/{} anomaly), {}/{} normal"
+             print(log.format("Train:", x_train.shape[0], y_train.sum(), y_train.shape[0], (1 - y_train).sum(), y_train.shape[0]))
+             print(log.format("Test:", x_test.shape[0], y_test.sum(), y_test.shape[0], (1 - y_test).sum(), y_test.shape[0]))
+             return (x_train, window_y_train, y_train), (x_test, window_y_test, y_test)
+
+         if label_file is None:
+             if split_type == 'uniform':
+                 split_type = 'sequential'
+                 print('Warning: Only split_type=sequential is supported if label_file=None.')
+             # Split the training and validation set sequentially
+             x_data = data_df['EventSequence'].values
+             (x_train, _), (x_test, _) = _split_data(x_data, train_ratio=train_ratio, split_type=split_type)
+             print('Total: {} instances, train: {} instances, test: {} instances'.format(
+                 x_data.shape[0], x_train.shape[0], x_test.shape[0]))
+             return (x_train, None), (x_test, None), data_df
+     else:
+         raise NotImplementedError('load_HDFS() only supports csv and npz files!')
+
+     num_train = x_train.shape[0]
+     num_test = x_test.shape[0]
+     num_total = num_train + num_test
+     num_train_pos = sum(y_train)
+     num_test_pos = sum(y_test)
+     num_pos = num_train_pos + num_test_pos
+
+     print('Total: {} instances, {} anomaly, {} normal'
+           .format(num_total, num_pos, num_total - num_pos))
+     print('Train: {} instances, {} anomaly, {} normal'
+           .format(num_train, num_train_pos, num_train - num_train_pos))
+     print('Test: {} instances, {} anomaly, {} normal\n'
+           .format(num_test, num_test_pos, num_test - num_test_pos))
+
+     return (x_train, y_train), (x_test, y_test)
+
+
+ def slice_hdfs(x, y, window_size):
+     results_data = []
+     print("Slicing {} sessions, with window {}".format(x.shape[0], window_size))
+     for idx, sequence in enumerate(x):
+         seqlen = len(sequence)
+         i = 0
+         while (i + window_size) < seqlen:
+             slice = sequence[i: i + window_size]
+             results_data.append([idx, slice, sequence[i + window_size], y[idx]])
+             i += 1
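+         # Note: this 'else' belongs to the 'while' loop above; it runs when the loop
+         # exits normally and pads the final, shorter-than-window slice with "#Pad".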
+         else:
+             slice = sequence[i: i + window_size]
+             slice += ["#Pad"] * (window_size - len(slice))
+             results_data.append([idx, slice, "#Pad", y[idx]])
+     results_df = pd.DataFrame(results_data, columns=["SessionId", "EventSequence", "Label", "SessionLabel"])
+     print("Slicing done, {} windows generated".format(results_df.shape[0]))
+     return results_df[["SessionId", "EventSequence"]], results_df["Label"], results_df["SessionLabel"]
+
+
+ def load_BGL(log_file, label_file=None, window='sliding', time_interval=60, stepping_size=60,
+              train_ratio=0.8):
+     """ TODO
+
+     """
+
+
+ def bgl_preprocess_data(para, raw_data, event_mapping_data):
+     """ Split logs into sliding windows, build an event count matrix and get the corresponding labels
+
+     Args:
+     --------
+     para: the parameters dictionary
+     raw_data: list of (label, time)
+     event_mapping_data: a list of event indices, where each row index indicates a corresponding log
+
+     Returns:
+     --------
+     event_count_matrix: event count matrix, where each row is an instance (log sequence vector)
+     labels: a list of labels, 1 represents anomaly
+     """
+
+     # Create the directory for saving the sliding windows (start_index, end_index),
+     # which can be directly loaded in future runs
+     if not os.path.exists(para['save_path']):
+         os.mkdir(para['save_path'])
+     log_size = raw_data.shape[0]
+     sliding_file_path = para['save_path'] + 'sliding_' + str(para['window_size']) + 'h_' + str(para['step_size']) + 'h.csv'
+
+     # ============= Divide into sliding windows ============= #
+     start_end_index_list = []  # list of (start, end) tuples marking each sliding time window
+     label_data, time_data = raw_data[:, 0], raw_data[:, 1]
+     if not os.path.exists(sliding_file_path):
+         # Split into sliding windows
+         start_time = time_data[0]
+         start_index = 0
+         end_index = 0
+
+         # Get the first start index, end index, and end time
+         for cur_time in time_data:
+             if cur_time < start_time + para['window_size'] * 3600:
+                 end_index += 1
+                 end_time = cur_time
+             else:
+                 start_end_pair = tuple((start_index, end_index))
+                 start_end_index_list.append(start_end_pair)
+                 break
+         # Move the start and end indices window by window
+         while end_index < log_size:
+             start_time = start_time + para['step_size'] * 3600
+             end_time = end_time + para['step_size'] * 3600
+             for i in range(start_index, end_index):
+                 if time_data[i] < start_time:
+                     i += 1
+                 else:
+                     break
+             for j in range(end_index, log_size):
+                 if time_data[j] < end_time:
+                     j += 1
+                 else:
+                     break
+             start_index = i
+             end_index = j
+             start_end_pair = tuple((start_index, end_index))
+             start_end_index_list.append(start_end_pair)
+         inst_number = len(start_end_index_list)
+         print('There are %d instances (sliding windows) in this dataset\n' % inst_number)
+         np.savetxt(sliding_file_path, start_end_index_list, delimiter=',', fmt='%d')
+     else:
+         print('Loading start_end_index_list from file')
+         start_end_index_list = pd.read_csv(sliding_file_path, header=None).values
+         inst_number = len(start_end_index_list)
+         print('There are %d instances (sliding windows) in this dataset' % inst_number)
+
+     # Get all the log indices in each time window by ranging from start_index to end_index
+     expanded_indexes_list = []
+     for t in range(inst_number):
+         index_list = []
+         expanded_indexes_list.append(index_list)
+     for i in range(inst_number):
+         start_index = start_end_index_list[i][0]
+         end_index = start_end_index_list[i][1]
+         for l in range(start_index, end_index):
+             expanded_indexes_list[i].append(l)
+
+     event_mapping_data = [row[0] for row in event_mapping_data]
+     event_num = len(list(set(event_mapping_data)))
+     print('There are %d log events' % event_num)
+
+     # ============= Get the label and event count of each sliding window ============= #
+     labels = []
+     event_count_matrix = np.zeros((inst_number, event_num))
+     for j in range(inst_number):
+         label = 0  # 0 represents success, 1 represents failure
+         for k in expanded_indexes_list[j]:
+             event_index = event_mapping_data[k]
+             event_count_matrix[j, event_index] += 1
+             if label_data[k]:
+                 label = 1
+                 continue
+         labels.append(label)
+     assert inst_number == len(labels)
+     print("Among all instances, %d are anomalies" % sum(labels))
+     assert event_count_matrix.shape[0] == len(labels)
+     return event_count_matrix, labels
data/loglizer/models/DecisionTree.py ADDED
@@ -0,0 +1,78 @@
+ """
+ The implementation of the decision tree model for anomaly detection.
+
+ Authors:
+     LogPAI Team
+
+ Reference:
+     [1] Mike Chen, Alice X. Zheng, Jim Lloyd, Michael I. Jordan, Eric Brewer.
+         Failure Diagnosis Using Decision Trees. IEEE International Conference
+         on Autonomic Computing (ICAC), 2004.
+
+ """
+
+ import numpy as np
+ from sklearn import tree
+ from ..utils import metrics
+
+
+ class DecisionTree(object):
+
+     def __init__(self, criterion='gini', max_depth=None, max_features=None, class_weight=None):
+         """ The decision tree model for anomaly detection
+
+         Arguments
+         ---------
+         See the DecisionTreeClassifier API: https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
+
+         Attributes
+         ----------
+         classifier: object, the classifier for anomaly detection
+         """
+         self.classifier = tree.DecisionTreeClassifier(criterion=criterion, max_depth=max_depth,
+                                                       max_features=max_features, class_weight=class_weight)
+
+     def fit(self, X, y):
+         """
+         Arguments
+         ---------
+         X: ndarray, the event count matrix of shape num_instances-by-num_events
+         y: ndarray, the label vector of shape (num_instances,)
+         """
+         print('====== Model summary ======')
+         self.classifier.fit(X, y)
+
+     def predict(self, X):
+         """ Predict anomalies with the trained decision tree
+
+         Arguments
+         ---------
+         X: the input event count matrix
+
+         Returns
+         -------
+         y_pred: ndarray, the predicted label vector of shape (num_instances,)
+         """
+         y_pred = self.classifier.predict(X)
+         return y_pred
+
+     def predict_proba(self, X):
+         """ Predict anomaly probabilities with the trained decision tree
+
+         Arguments
+         ---------
+         X: the input event count matrix
+
+         Returns
+         -------
+         y_pred: ndarray, the predicted class probabilities of shape (num_instances, num_classes)
+         """
+         y_pred = self.classifier.predict_proba(X)
+         return y_pred
+
+     def evaluate(self, X, y_true):
+         print('====== Evaluation summary ======')
+         y_pred = self.predict(X)
+         precision, recall, f1 = metrics(y_pred, y_true)
+         print('Precision: {:.3f}, recall: {:.3f}, F1-measure: {:.3f}\n'.format(precision, recall, f1))
+         return precision, recall, f1
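+
+ # Minimal usage sketch (assumes X_train/X_test are event count matrices and
+ # y_train/y_test are 0/1 label vectors, as produced by the loaders above):
+ #   model = DecisionTree()
+ #   model.fit(X_train, y_train)
+ #   precision, recall, f1 = model.evaluate(X_test, y_test)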
data/loglizer/models/InvariantsMiner.py ADDED
@@ -0,0 +1,296 @@
+ """
+ The implementation of the Invariants Mining model for anomaly detection.
+
+ Authors:
+     LogPAI Team
+
+ Reference:
+     [1] Jian-Guang Lou, Qiang Fu, Shengqi Yang, Ye Xu, Jiang Li. Mining Invariants
+         from Console Logs for System Problem Detection. USENIX Annual Technical
+         Conference (ATC), 2010.
+
+ """
+
+ import numpy as np
+ from itertools import combinations
+ from ..utils import metrics
+
+
+ class InvariantsMiner(object):
+
+     def __init__(self, percentage=0.98, epsilon=0.5, longest_invarant=None, scale_list=[1, 2, 3]):
+         """ The Invariants Mining model for anomaly detection
+
+         Attributes
+         ----------
+         percentage: float, the percentage of samples that must satisfy the condition |X_j * V_i| < epsilon
+         epsilon: float, the threshold for estimating the invariant space
+         longest_invarant: int, the specified maximal length of an invariant, default None. Searching stops
+             when the invariant length exceeds longest_invarant.
+         scale_list: list, the list used to scale the float theta into integers
+         invariants_dict: dict, dictionary of invariants, where a key is the selected columns
+             and the value is the weights of the invariant
+         """
+         self.percentage = percentage
+         self.epsilon = epsilon
+         self.longest_invarant = longest_invarant
+         self.scale_list = scale_list
+         self.invariants_dict = None
+
+     def fit(self, X):
+         """
+         Arguments
+         ---------
+         X: ndarray, the event count matrix of shape num_instances-by-num_events
+         """
+         print('====== Model summary ======')
+         invar_dim = self._estimate_invarant_space(X)
+         self._invariants_search(X, invar_dim)
+
+     def predict(self, X):
+         """ Predict anomalies with the mined invariants
+
+         Arguments
+         ---------
+         X: the input event count matrix
+
+         Returns
+         -------
+         y_pred: ndarray, the predicted label vector of shape (num_instances,)
+         """
+         y_sum = np.zeros(X.shape[0])
+         for cols, theta in self.invariants_dict.items():
+             y_sum += np.fabs(np.dot(X[:, cols], np.array(theta)))
+         y_pred = (y_sum > 1e-6).astype(int)
+         return y_pred
67
+ def evaluate(self, X, y_true):
68
+ print('====== Evaluation summary ======')
69
+ y_pred = self.predict(X)
70
+ precision, recall, f1 = metrics(y_pred, y_true)
71
+ print('Precision: {:.3f}, recall: {:.3f}, F1-measure: {:.3f}\n'.format(precision, recall, f1))
72
+ return precision, recall, f1
73
+
74
+ def _estimate_invarant_space(self, X):
75
+ """ Estimate the dimension of invariant space using SVD decomposition
76
+
77
+ Arguments
78
+ ---------
79
+ X: ndarray, the event count matrix of shape num_instances-by-num_events
80
+ percentage: float, percentage of samples satisfying the condition that |X_j * V_i| < epsilon
81
+ epsilon: float, the threshold for estimating the invariant space
82
+
83
+ Returns
84
+ -------
85
+ r: the dimension of invariant space
86
+ """
87
+ covariance_matrix = np.dot(X.T, X)
88
+ U, sigma, V = np.linalg.svd(covariance_matrix) # SVD decomposition
89
+ # Start from the right most column of matrix V, sigular values are in ascending order
90
+ num_instances, num_events = X.shape
91
+ r = 0
92
+ for i in range(num_events - 1, -1, -1):
93
+ zero_count = sum(abs(np.dot(X, U[:, i])) < self.epsilon)
94
+ if zero_count / float(num_instances) < self.percentage:
95
+ break
96
+ r += 1
97
+ print('Invariant space dimension: {}'.format(r))
98
+
99
+ return r
100
+
101
+ def _invariants_search(self, X, r):
102
+ """ Mine invariant relationships from X
103
+
104
+ Arguments
105
+ ---------
106
+ X: ndarray, the event count matrix of shape num_instances-by-num_events
107
+ r: the dimension of invariant space
108
+ """
109
+
110
+ num_instances, num_events = X.shape
111
+ invariants_dict = dict() # save the mined Invariants(value) and its corresponding columns(key)
112
+ search_space = [] # only invariant candidates in this list are valid
113
+
114
+ # invariant of only one column (all zero columns)
115
+ init_cols = sorted([[item] for item in range(num_events)])
116
+ for col in init_cols:
117
+ search_space.append(col)
118
+ init_col_list = init_cols[:]
119
+ for col in init_cols:
120
+ if np.count_nonzero(X[:, col]) == 0:
121
+ invariants_dict[tuple(col)] = [1]
122
+ search_space.remove(col)
123
+ init_col_list.remove(col)
124
+
125
+ item_list = init_col_list
126
+ length = 2
127
+ FLAG_break_loop = False
128
+ # check invariant of more columns
129
+ while len(item_list) != 0:
130
+ if self.longest_invarant and len(item_list[0]) >= self.longest_invarant:
131
+ break
132
+ joined_item_list = self._join_set(item_list, length) # generate new invariant candidates
133
+ for items in joined_item_list:
134
+ if self._check_candi_valid(items, length, search_space):
135
+ search_space.append(items)
136
+ item_list = []
137
+ for item in joined_item_list:
138
+ if tuple(item) in invariants_dict:
139
+ continue
140
+ if item not in search_space:
141
+ continue
142
+ if not self._check_candi_valid(tuple(item), length, search_space) and length > 2:
143
+ search_space.remove(item)
144
+ continue # an item must be a superset of all its subitems in the search space; otherwise skip
145
+ validity, scaled_theta = self._check_invar_validity(X, item)
146
+ if validity:
147
+ self._prune(invariants_dict.keys(), set(item), search_space)
148
+ invariants_dict[tuple(item)] = scaled_theta.tolist()
149
+ search_space.remove(item)
150
+ else:
151
+ item_list.append(item)
152
+ if len(invariants_dict) >= r:
153
+ FLAG_break_loop = True
154
+ break
155
+ if FLAG_break_loop:
156
+ break
157
+ length += 1
158
+ print('Mined {} invariants: {}\n'.format(len(invariants_dict), invariants_dict))
159
+ self.invariants_dict = invariants_dict
160
+
161
+ def _compute_eigenvector(self, X):
162
+ """ calculate the smallest eigenvalue and corresponding eigenvector (theta in the paper)
163
+ for a given sub_matrix
164
+
165
+ Arguments
166
+ ---------
167
+ X: the event count matrix (each row is a log sequence vector, each column represents an event)
168
+
169
+ Returns
170
+ -------
171
+ min_vec: the eigenvector of corresponding minimum eigen value
172
+ FLAG_contain_zero: whether the min_vec contains zero (very small value)
173
+ """
174
+
175
+ FLAG_contain_zero = False
176
+ count_zero = 0
177
+ dot_result = np.dot(X.T, X)
178
+ U, S, V = np.linalg.svd(dot_result)
179
+ min_vec = U[:, -1]
180
+ count_zero = sum(np.fabs(min_vec) < 1e-6)
181
+ if count_zero != 0:
182
+ FLAG_contain_zero = True
183
+ return min_vec, FLAG_contain_zero
184
+
185
+
186
+ def _check_invar_validity(self, X, selected_columns):
187
+ """ scale the eigenvector of float number into integer, and check whether the scaled number is valid
188
+
189
+ Arguments
190
+ ---------
191
+ X: the event count matrix (each row is a log sequence vector, each column represents an event)
192
+ selected_columns: the columns selected from the full column list
193
+
194
+ Returns
195
+ -------
196
+ validity: whether the selected columns form a valid invariant
197
+ scaled_theta: the scaled theta vector
198
+ """
199
+
200
+ sub_matrix = X[:, selected_columns]
201
+ inst_num = X.shape[0]
202
+ validity = False
203
+ min_theta, FLAG_contain_zero = self._compute_eigenvector(sub_matrix)
204
+ abs_min_theta = [np.fabs(it) for it in min_theta]
205
+ if FLAG_contain_zero:
206
+ return validity, []
207
+ else:
208
+ for i in self.scale_list:
209
+ min_index = np.argmin(abs_min_theta)
210
+ scale = float(i) / min_theta[min_index]
211
+ scaled_theta = np.array([round(item * scale) for item in min_theta])
212
+ scaled_theta[min_index] = i
213
+ scaled_theta = scaled_theta.T
214
+ if 0 in np.fabs(scaled_theta):
215
+ continue
216
+ dot_submat_theta = np.dot(sub_matrix, scaled_theta)
217
+ count_zero = 0
218
+ for j in dot_submat_theta:
219
+ if np.fabs(j) < 1e-8:
220
+ count_zero += 1
221
+ if count_zero >= self.percentage * inst_num:
222
+ validity = True
223
+ # print('A valid invariant is found: ',scaled_theta, selected_columns)
224
+ break
225
+ return validity, scaled_theta
226
+
227
+ def _prune(self, valid_cols, new_item_set, search_space):
228
+ """ prune invalid combination of columns
229
+
230
+ Arguments
231
+ ---------
232
+ valid_cols: existing valid column list
233
+ new_item_set: item set to be merged
234
+ search_space: the search space that stores possible candidates
235
+
236
+ """
237
+
238
+ if len(valid_cols) == 0:
239
+ return
240
+ for se in valid_cols:
241
+ intersection = set(se) & new_item_set
242
+ if len(intersection) == 0:
243
+ continue
244
+ union = set(se) | new_item_set
245
+ for item in list(intersection):
246
+ diff = sorted(list(union - set([item])))
247
+ if diff in search_space:
248
+ search_space.remove(diff)
249
+
250
+
251
+ def _join_set(self, item_list, length):
252
+ """ Join a set with itself and returns the n-element (length) itemsets
253
+
254
+ Arguments
255
+ ---------
256
+ item_list: current list of columns
257
+ length: generate new items of length
258
+
259
+ Returns
260
+ -------
261
+ return_list: list of items of length-element
262
+ """
263
+
264
+ set_len = len(item_list)
265
+ return_list = []
266
+ for i in range(set_len):
267
+ for j in range(i + 1, set_len):
268
+ i_set = set(item_list[i])
269
+ j_set = set(item_list[j])
270
+ if len(i_set.union(j_set)) == length:
271
+ joined = sorted(list(i_set.union(j_set)))
272
+ if joined not in return_list:
273
+ return_list.append(joined)
274
+ return_list = sorted(return_list)
275
+ return return_list
276
+
277
+
278
+ def _check_candi_valid(self, item, length, search_space):
279
+ """ check whether an item's subitems are in searchspace
280
+
281
+ Arguments
282
+ ---------
283
+ item: item to be checked
284
+ length: the length of item
285
+ search_space: the search space that stores possible candidates
286
+
287
+ Returns
288
+ -------
289
+ True or False
290
+ """
291
+
292
+ for subItem in combinations(item, length - 1):
293
+ if sorted(list(subItem)) not in search_space:
294
+ return False
295
+ return True
296
+
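A minimal usage sketch for InvariantsMiner (illustrative only, not part of this commit; x_train, x_test, and y_test are assumed placeholders for event count matrices and a 0/1 label vector):

    from loglizer.models import InvariantsMiner

    model = InvariantsMiner(percentage=0.98, epsilon=0.5)
    model.fit(x_train)                      # unsupervised: labels are not needed for fitting
    y_pred = model.predict(x_test)          # 1 marks an instance that violates a mined invariant
    precision, recall, f1 = model.evaluate(x_test, y_test)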
data/loglizer/models/IsolationForest.py ADDED
@@ -0,0 +1,96 @@
1
+ """
2
+ The implementation of IsolationForest model for anomaly detection.
3
+
4
+ Authors:
5
+ LogPAI Team
6
+
7
+ Reference:
8
+ [1] Fei Tony Liu, Kai Ming Ting, Zhi-Hua Zhou. Isolation Forest. International
9
+ Conference on Data Mining (ICDM), 2008.
10
+
11
+ """
12
+
13
+
14
+
15
+
16
+
17
+ import numpy as np
18
+ from sklearn.ensemble import IsolationForest as iForest
19
+ from ..utils import metrics
20
+
21
+ class IsolationForest(iForest):
22
+
23
+ def __init__(self, n_estimators=100, max_samples='auto', contamination=0.03, **kwargs):
24
+ """ The IsolationForest model for anomaly detection
25
+
26
+ Arguments
27
+ ---------
28
+ n_estimators : int, optional (default=100). The number of base estimators in the ensemble.
29
+ max_samples : int or float, optional (default="auto")
30
+ The number of samples to draw from X to train each base estimator.
31
+ - If int, then draw max_samples samples.
32
+ - If float, then draw max_samples * X.shape[0] samples.
33
+ - If "auto", then max_samples=min(256, n_samples).
34
+ If max_samples is larger than the number of samples provided, all samples will be used
35
+ for all trees (no sampling).
36
+ contamination : float in (0., 0.5), optional (default=0.03 in this wrapper)
37
+ The amount of contamination of the data set, i.e. the proportion of outliers in the data
38
+ set. Used when fitting to define the threshold on the decision function. If 'auto', the
39
+ decision function threshold is determined as in the original paper.
40
+ max_features : int or float, optional (default=1.0)
41
+ The number of features to draw from X to train each base estimator.
42
+ - If int, then draw max_features features.
43
+ - If float, then draw max_features * X.shape[1] features.
44
+ bootstrap : boolean, optional (default=False)
45
+ If True, individual trees are fit on random subsets of the training data sampled with replacement.
46
+ If False, sampling without replacement is performed.
47
+ n_jobs : int or None, optional (default=None)
48
+ The number of jobs to run in parallel for both fit and predict. None means 1 unless in a
49
+ joblib.parallel_backend context. -1 means using all processors.
50
+ random_state : int, RandomState instance or None, optional (default=None)
51
+ If int, random_state is the seed used by the random number generator;
52
+ If RandomState instance, random_state is the random number generator;
53
+ If None, the random number generator is the RandomState instance used by np.random.
54
+
55
+ Reference
56
+ ---------
57
+ For more information, please visit https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.IsolationForest.html
58
+ """
59
+
60
+ super(IsolationForest, self).__init__(n_estimators=n_estimators, max_samples=max_samples,
61
+ contamination=contamination, **kwargs)
62
+
63
+
64
+ def fit(self, X):
65
+ """
66
+ Arguments
67
+ ---------
68
+ X: ndarray, the event count matrix of shape num_instances-by-num_events
69
+ """
70
+
71
+ print('====== Model summary ======')
72
+ super(IsolationForest, self).fit(X)
73
+
74
+ def predict(self, X):
75
+ """ Predict anomalies with mined invariants
76
+
77
+ Arguments
78
+ ---------
79
+ X: the input event count matrix
80
+
81
+ Returns
82
+ -------
83
+ y_pred: ndarray, the predicted label vector of shape (num_instances,)
84
+ """
85
+
86
+ y_pred = super(IsolationForest, self).predict(X)
87
+ y_pred = np.where(y_pred > 0, 0, 1)
88
+ return y_pred
89
+
90
+ def evaluate(self, X, y_true):
91
+ print('====== Evaluation summary ======')
92
+ y_pred = self.predict(X)
93
+ precision, recall, f1 = metrics(y_pred, y_true)
94
+ print('Precision: {:.3f}, recall: {:.3f}, F1-measure: {:.3f}\n'.format(precision, recall, f1))
95
+ return precision, recall, f1
96
+
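A hedged usage sketch for the IsolationForest wrapper (x_train, x_test, and y_test are assumed placeholders, as above):

    from loglizer.models import IsolationForest

    model = IsolationForest(n_estimators=100, contamination=0.03)
    model.fit(x_train)                      # unsupervised fit on the event count matrix
    y_pred = model.predict(x_test)          # remapped so that 1 = anomaly, 0 = normal
    precision, recall, f1 = model.evaluate(x_test, y_test)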
data/loglizer/models/LR.py ADDED
@@ -0,0 +1,73 @@
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ The implementation of the logistic regression model for anomaly detection.
4
+
5
+ Authors:
6
+ LogPAI Team
7
+
8
+ Reference:
9
+ [1] Peter Bodík, Moises Goldszmidt, Armando Fox, Hans Andersen. Fingerprinting
10
+ the Datacenter: Automated Classification of Performance Crises. The European
11
+ Conference on Computer Systems (EuroSys), 2010.
12
+
13
+ """
14
+
15
+ import numpy as np
16
+ from sklearn.linear_model import LogisticRegression
17
+ from ..utils import metrics
18
+
19
+ class LR(object):
20
+
21
+ def __init__(self, penalty='l2', C=100, tol=0.01, class_weight=None, max_iter=100):
22
+ """ The Invariants Mining model for anomaly detection
23
+
24
+ Attributes
25
+ ----------
26
+ classifier: object, the classifier for anomaly detection
27
+ """
28
+ self.classifier = LogisticRegression(penalty=penalty, C=C, tol=tol, class_weight=class_weight,
29
+ max_iter=max_iter)
30
+
31
+ def fit(self, X, y):
32
+ """
33
+ Arguments
34
+ ---------
35
+ X: ndarray, the event count matrix of shape num_instances-by-num_events
36
+ """
37
+ print('====== Model summary ======')
38
+ self.classifier.fit(X, y)
39
+
40
+ def predict(self, X):
41
+ """ Predict anomalies with mined invariants
42
+
43
+ Arguments
44
+ ---------
45
+ X: the input event count matrix
46
+
47
+ Returns
48
+ -------
49
+ y_pred: ndarray, the predicted label vector of shape (num_instances,)
50
+ """
51
+ y_pred = self.classifier.predict(X)
52
+ return y_pred
53
+
54
+ def predict_proba(self, X):
55
+ """ Predict anomalies with mined invariants
56
+
57
+ Arguments
58
+ ---------
59
+ X: the input event count matrix
60
+
61
+ Returns
62
+ -------
63
+ y_pred: ndarray, the predicted class probabilities of shape (num_instances, 2)
64
+ """
65
+ y_pred = self.classifier.predict_proba(X)
66
+ return y_pred
67
+
68
+ def evaluate(self, X, y_true):
69
+ print('====== Evaluation summary ======')
70
+ y_pred = self.predict(X)
71
+ precision, recall, f1 = metrics(y_pred, y_true)
72
+ print('Precision: {:.3f}, recall: {:.3f}, F1-measure: {:.3f}\n'.format(precision, recall, f1))
73
+ return precision, recall, f1
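A hedged usage sketch for LR (supervised, so a labeled training set is assumed):

    from loglizer.models import LR

    model = LR(penalty='l2', C=100, max_iter=100)
    model.fit(x_train, y_train)             # supervised: 0/1 labels are required
    y_pred = model.predict(x_test)
    y_proba = model.predict_proba(x_test)   # per-class probabilities, shape (num_instances, 2)
    precision, recall, f1 = model.evaluate(x_test, y_test)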
data/loglizer/models/LogClustering.py ADDED
@@ -0,0 +1,137 @@
1
+ """
2
+ The implementation of Log Clustering model for anomaly detection.
3
+
4
+ Authors:
5
+ LogPAI Team
6
+
7
+ Reference:
8
+ [1] Qingwei Lin, Hongyu Zhang, Jian-Guang Lou, Yu Zhang, Xuewei Chen. Log Clustering
9
+ based Problem Identification for Online Service Systems. International Conference
10
+ on Software Engineering (ICSE), 2016.
11
+
12
+ """
13
+
14
+ import numpy as np
15
+ import pprint
16
+ from scipy.special import expit
17
+ from numpy import linalg as LA
18
+ from scipy.cluster.hierarchy import linkage, fcluster
19
+ from scipy.spatial.distance import pdist, squareform
20
+ from ..utils import metrics
21
+
22
+
23
+ class LogClustering(object):
24
+
25
+ def __init__(self, max_dist=0.3, anomaly_threshold=0.3, mode='online', num_bootstrap_samples=1000):
26
+ """
27
+ Attributes
28
+ ----------
29
+ max_dist: float, the threshold to stop the clustering process
30
+ anomaly_threshold: float, the threshold for anomaly detection
31
+ mode: str, 'offline' or 'online' mode for clustering
32
+ num_bootstrap_samples: int, online clustering starts with a bootstrapping process, which
33
+ determines the initial cluster representatives offline using a subset of samples
34
+ representatives: ndarray, the representative samples of clusters, of shape
35
+ num_clusters-by-num_events
36
+ cluster_size_dict: dict, the size of each cluster, used to update representatives online
37
+ """
38
+ self.max_dist = max_dist
39
+ self.anomaly_threshold = anomaly_threshold
40
+ self.mode = mode
41
+ self.num_bootstrap_samples = num_bootstrap_samples
42
+ self.representatives = list()
43
+ self.cluster_size_dict = dict()
44
+
45
+ def fit(self, X):
46
+ print('====== Model summary ======')
47
+ if self.mode == 'offline':
48
+ # The offline mode can process about 10K samples only due to huge memory consumption.
49
+ self._offline_clustering(X)
50
+ elif self.mode == 'online':
51
+ # Bootstrapping phase
52
+ if self.num_bootstrap_samples > 0:
53
+ X_bootstrap = X[0:self.num_bootstrap_samples, :]
54
+ self._offline_clustering(X_bootstrap)
55
+ # Online learning phase
56
+ if X.shape[0] > self.num_bootstrap_samples:
57
+ self._online_clustering(X)
58
+
59
+ def predict(self, X):
60
+ y_pred = np.zeros(X.shape[0])
61
+ for i in range(X.shape[0]):
62
+ min_dist, min_index = self._get_min_cluster_dist(X[i, :])
63
+ if min_dist > self.anomaly_threshold:
64
+ y_pred[i] = 1
65
+ return y_pred
66
+
67
+ def evaluate(self, X, y_true):
68
+ print('====== Evaluation summary ======')
69
+ y_pred = self.predict(X)
70
+ precision, recall, f1 = metrics(y_pred, y_true)
71
+ print('Precision: {:.3f}, recall: {:.3f}, F1-measure: {:.3f}\n' \
72
+ .format(precision, recall, f1))
73
+ return precision, recall, f1
74
+
75
+ def _offline_clustering(self, X):
76
+ print('Starting offline clustering...')
77
+ p_dist = pdist(X, metric=self._distance_metric)
78
+ Z = linkage(p_dist, 'complete')
79
+ cluster_index = fcluster(Z, self.max_dist, criterion='distance')
80
+ self._extract_representatives(X, cluster_index)
81
+ print('Processed {} instances.'.format(X.shape[0]))
82
+ print('Found {} clusters offline.\n'.format(len(self.representatives)))
83
+ # print('The representative vectors are:')
84
+ # pprint.pprint(self.representatives.tolist())
85
+
86
+ def _extract_representatives(self, X, cluster_index):
87
+ num_clusters = len(set(cluster_index))
88
+ for clu in range(num_clusters):
89
+ clu_idx = np.argwhere(cluster_index == clu + 1)[:, 0]
90
+ self.cluster_size_dict[clu] = clu_idx.shape[0]
91
+ repre_center = np.average(X[clu_idx, :], axis=0)
92
+ self.representatives.append(repre_center)
93
+
94
+ def _online_clustering(self, X):
95
+ print("Starting online clustering...")
96
+ for i in range(self.num_bootstrap_samples, X.shape[0]):
97
+ if (i + 1) % 2000 == 0:
98
+ print('Processed {} instances.'.format(i + 1))
99
+ instance_vec = X[i, :]
100
+ if len(self.representatives) > 0:
101
+ min_dist, clu_id = self._get_min_cluster_dist(instance_vec)
102
+ if min_dist <= self.max_dist:
103
+ self.cluster_size_dict[clu_id] += 1
104
+ self.representatives[clu_id] = self.representatives[clu_id] \
105
+ + (instance_vec - self.representatives[clu_id]) \
106
+ / self.cluster_size_dict[clu_id]
107
+ continue
108
+ self.cluster_size_dict[len(self.representatives)] = 1
109
+ self.representatives.append(instance_vec)
110
+ print('Processed {} instances.'.format(X.shape[0]))
111
+ print('Found {} clusters online.\n'.format(len(self.representatives)))
112
+ # print('The representative vectors are:')
113
+ # pprint.pprint(self.representatives.tolist())
114
+
115
+ def _distance_metric(self, x1, x2):
116
+ norm = LA.norm(x1) * LA.norm(x2)
117
+ distance = 1 - np.dot(x1, x2) / (norm + 1e-8)
118
+ if distance < 1e-8:
119
+ distance = 0
120
+ return distance
121
+
122
+ def _get_min_cluster_dist(self, instance_vec):
123
+ min_index = -1
124
+ min_dist = float('inf')
125
+ for i in range(len(self.representatives)):
126
+ cluster_rep = self.representatives[i]
127
+ dist = self._distance_metric(instance_vec, cluster_rep)
128
+ if dist < 1e-8:
129
+ min_dist = 0
130
+ min_index = i
131
+ break
132
+ elif dist < min_dist:
133
+ min_dist = dist
134
+ min_index = i
135
+ return min_dist, min_index
136
+
137
+
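A hedged usage sketch for LogClustering; fitting on normal instances only is a common loglizer convention and an assumption here, not something this file enforces:

    from loglizer.models import LogClustering

    model = LogClustering(max_dist=0.3, anomaly_threshold=0.3, mode='online')
    model.fit(x_train[y_train == 0, :])     # assumed: fit on normal sessions only
    y_pred = model.predict(x_test)          # 1 when no cluster representative is close enough
    precision, recall, f1 = model.evaluate(x_test, y_test)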
data/loglizer/models/PCA.py ADDED
@@ -0,0 +1,105 @@
1
+ """
2
+ The implementation of PCA model for anomaly detection.
3
+
4
+ Authors:
5
+ LogPAI Team
6
+
7
+ Reference:
8
+ [1] Wei Xu, Ling Huang, Armando Fox, David Patterson, Michael I. Jordan.
9
+ Large-Scale System Problems Detection by Mining Console Logs. ACM
10
+ Symposium on Operating Systems Principles (SOSP), 2009.
11
+
12
+ """
13
+
14
+ import numpy as np
15
+ from ..utils import metrics
16
+
17
+ class PCA(object):
18
+
19
+ def __init__(self, n_components=0.95, threshold=None, c_alpha=3.2905):
20
+ """ The PCA model for anomaly detection
21
+
22
+ Attributes
23
+ ----------
24
+ proj_C: The projection matrix for projecting a feature vector onto the abnormal space
26
+ n_components: float/int, the number of principal components or the variance ratio they cover
27
+ threshold: float, the anomaly detection threshold. When set to None, the threshold
28
+ is automatically calculated using the Q-statistic
29
+ c_alpha: float, the c_alpha parameter for calculating the anomaly detection threshold using
30
+ the Q-statistic. The following is a lookup table for c_alpha:
30
+ c_alpha = 1.7507; # alpha = 0.08
31
+ c_alpha = 1.9600; # alpha = 0.05
32
+ c_alpha = 2.5758; # alpha = 0.01
33
+ c_alpha = 2.807; # alpha = 0.005
34
+ c_alpha = 2.9677; # alpha = 0.003
35
+ c_alpha = 3.2905; # alpha = 0.001
36
+ c_alpha = 3.4808; # alpha = 0.0005
37
+ c_alpha = 3.8906; # alpha = 0.0001
38
+ c_alpha = 4.4172; # alpha = 0.00001
39
+ """
40
+
41
+ self.proj_C = None
42
+ self.components = None
43
+ self.n_components = n_components
44
+ self.threshold = threshold
45
+ self.c_alpha = c_alpha
46
+
47
+
48
+ def fit(self, X):
49
+ """
50
+ Arguments
51
+ ---------
52
+ X: ndarray, the event count matrix of shape num_instances-by-num_events
53
+ """
54
+
55
+ print('====== Model summary ======')
56
+ num_instances, num_events = X.shape
57
+ X_cov = np.dot(X.T, X) / float(num_instances)
58
+ U, sigma, V = np.linalg.svd(X_cov)
59
+ n_components = self.n_components
60
+ if n_components < 1:
61
+ total_variance = np.sum(sigma)
62
+ variance = 0
63
+ for i in range(num_events):
64
+ variance += sigma[i]
65
+ if variance / total_variance >= n_components:
66
+ break
67
+ n_components = i + 1
68
+
69
+ P = U[:, :n_components]
70
+ I = np.identity(num_events, int)
71
+ self.components = P
72
+ self.proj_C = I - np.dot(P, P.T)
73
+ print('n_components: {}'.format(n_components))
74
+ print('Project matrix shape: {}-by-{}'.format(self.proj_C.shape[0], self.proj_C.shape[1]))
75
+
76
+ if not self.threshold:
77
+ # Calculate threshold using Q-statistic. Information can be found at:
78
+ # http://conferences.sigcomm.org/sigcomm/2004/papers/p405-lakhina111.pdf
79
+ phi = np.zeros(3)
80
+ for i in range(3):
81
+ for j in range(n_components, num_events):
82
+ phi[i] += np.power(sigma[j], i + 1)
83
+ h0 = 1.0 - 2 * phi[0] * phi[2] / (3.0 * phi[1] * phi[1])
84
+ self.threshold = phi[0] * np.power(self.c_alpha * np.sqrt(2 * phi[1] * h0 * h0) / phi[0]
85
+ + 1.0 + phi[1] * h0 * (h0 - 1) / (phi[0] * phi[0]),
86
+ 1.0 / h0)
87
+ print('SPE threshold: {}\n'.format(self.threshold))
88
+
89
+ def predict(self, X):
90
+ assert self.proj_C is not None, 'PCA model needs to be trained before prediction.'
91
+ y_pred = np.zeros(X.shape[0])
92
+ for i in range(X.shape[0]):
93
+ y_a = np.dot(self.proj_C, X[i, :])
94
+ SPE = np.dot(y_a, y_a)
95
+ if SPE > self.threshold:
96
+ y_pred[i] = 1
97
+ return y_pred
98
+
99
+ def evaluate(self, X, y_true):
100
+ print('====== Evaluation summary ======')
101
+ y_pred = self.predict(X)
102
+ precision, recall, f1 = metrics(y_pred, y_true)
103
+ print('Precision: {:.3f}, recall: {:.3f}, F1-measure: {:.3f}\n'.format(precision, recall, f1))
104
+ return precision, recall, f1
105
+
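A hedged usage sketch for PCA (x_train/x_test are assumed event count matrices, typically tf-idf weighted and zero-mean normalized by FeatureExtractor):

    from loglizer.models import PCA

    model = PCA(n_components=0.95)          # keep components covering 95% of the variance
    model.fit(x_train)                      # with threshold=None, the SPE threshold comes from the Q-statistic
    y_pred = model.predict(x_test)          # 1 when the residual SPE exceeds the threshold
    precision, recall, f1 = model.evaluate(x_test, y_test)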
data/loglizer/models/SVM.py ADDED
@@ -0,0 +1,64 @@
1
+ """
2
+ The implementation of the SVM model for anomaly detection.
3
+
4
+ Authors:
5
+ LogPAI Team
6
+
7
+ Reference:
8
+ [1] Yinglung Liang, Yanyong Zhang, Hui Xiong, Ramendra Sahoo. Failure Prediction
9
+ in IBM BlueGene/L Event Logs. IEEE International Conference on Data Mining
10
+ (ICDM), 2007.
11
+
12
+ """
13
+
14
+ import numpy as np
15
+ from sklearn import svm
16
+ from ..utils import metrics
17
+
18
+ class SVM(object):
19
+
20
+ def __init__(self, penalty='l1', tol=0.1, C=1, dual=False, class_weight=None,
21
+ max_iter=100):
22
+ """ The Invariants Mining model for anomaly detection
23
+ Arguments
24
+ ---------
25
+ See SVM API: https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html
26
+
27
+ Attributes
28
+ ----------
29
+ classifier: object, the classifier for anomaly detection
30
+
31
+ """
32
+ self.classifier = svm.LinearSVC(penalty=penalty, tol=tol, C=C, dual=dual,
33
+ class_weight=class_weight, max_iter=max_iter)
34
+
35
+ def fit(self, X, y):
36
+ """
37
+ Arguments
38
+ ---------
39
+ X: ndarray, the event count matrix of shape num_instances-by-num_events
40
+ """
41
+ print('====== Model summary ======')
42
+ self.classifier.fit(X, y)
43
+
44
+ def predict(self, X):
45
+ """ Predict anomalies with mined invariants
46
+
47
+ Arguments
48
+ ---------
49
+ X: the input event count matrix
50
+
51
+ Returns
52
+ -------
53
+ y_pred: ndarray, the predicted label vector of shape (num_instances,)
54
+ """
55
+
56
+ y_pred = self.classifier.predict(X)
57
+ return y_pred
58
+
59
+ def evaluate(self, X, y_true):
60
+ print('====== Evaluation summary ======')
61
+ y_pred = self.predict(X)
62
+ precision, recall, f1 = metrics(y_pred, y_true)
63
+ print('Precision: {:.3f}, recall: {:.3f}, F1-measure: {:.3f}\n'.format(precision, recall, f1))
64
+ return precision, recall, f1
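A hedged usage sketch for SVM (supervised, labeled data assumed):

    from loglizer.models import SVM

    model = SVM(penalty='l1', C=1, dual=False)
    model.fit(x_train, y_train)             # supervised: 0/1 labels are required
    y_pred = model.predict(x_test)
    precision, recall, f1 = model.evaluate(x_test, y_test)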
data/loglizer/models/__init__.py ADDED
@@ -0,0 +1,8 @@
1
+ from .PCA import PCA
2
+ from .InvariantsMiner import InvariantsMiner
3
+ from .LogClustering import LogClustering
4
+ from .LR import LR
5
+ from .SVM import SVM
6
+ from .DecisionTree import DecisionTree
7
+ from .IsolationForest import IsolationForest
8
+
data/loglizer/models/__pycache__/DecisionTree.cpython-38.pyc ADDED
Binary file (2.57 kB).
 
data/loglizer/models/__pycache__/InvariantsMiner.cpython-38.pyc ADDED
Binary file (9.52 kB).
 
data/loglizer/models/__pycache__/IsolationForest.cpython-38.pyc ADDED
Binary file (4.52 kB).
 
data/loglizer/models/__pycache__/LR.cpython-38.pyc ADDED
Binary file (2.39 kB).
 
data/loglizer/models/__pycache__/LogClustering.cpython-38.pyc ADDED
Binary file (5.08 kB).
 
data/loglizer/models/__pycache__/PCA.cpython-38.pyc ADDED
Binary file (3.86 kB).
 
data/loglizer/models/__pycache__/SVM.cpython-38.pyc ADDED
Binary file (2.36 kB).
 
data/loglizer/models/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (369 Bytes).
 
data/loglizer/preprocessing.py ADDED
@@ -0,0 +1,123 @@
1
+ """
2
+ The interface for data preprocessing.
3
+
4
+ Authors:
5
+ LogPAI Team
6
+
7
+ """
8
+
9
+
10
+ import pandas as pd
11
+ import os
12
+ import numpy as np
13
+ import re
14
+ from collections import Counter
15
+ from scipy.special import expit
16
+ from itertools import compress
17
+
18
+
19
+
20
+ class FeatureExtractor(object):
21
+
22
+ def __init__(self):
23
+ self.idf_vec = None
24
+ self.mean_vec = None
25
+ self.events = None
26
+ self.term_weighting = None
27
+ self.normalization = None
28
+ self.oov = None
29
+
30
+ def fit_transform(self, X_seq, term_weighting=None, normalization=None, oov=False, min_count=1):
31
+ """ Fit and transform the data matrix
32
+
33
+ Arguments
34
+ ---------
35
+ X_seq: ndarray, log sequences matrix
36
+ term_weighting: None or `tf-idf`
37
+ normalization: None or `zero-mean`
38
+ oov: bool, whether to use OOV event
39
+ min_count: int, the minimal occurrence of events (default 1), only valid when oov=True.
40
+
41
+ Returns
42
+ -------
43
+ X_new: The transformed data matrix
44
+ """
45
+ print('====== Transformed train data summary ======')
46
+ self.term_weighting = term_weighting
47
+ self.normalization = normalization
48
+ self.oov = oov
49
+
50
+ X_counts = []
51
+ for i in range(X_seq.shape[0]):
52
+ event_counts = Counter(X_seq[i])
53
+ X_counts.append(event_counts)
54
+ X_df = pd.DataFrame(X_counts)
55
+ X_df = X_df.fillna(0)
56
+ self.events = X_df.columns
57
+ X = X_df.values
58
+ if self.oov:
59
+ oov_vec = np.zeros(X.shape[0])
60
+ if min_count > 1:
61
+ idx = np.sum(X > 0, axis=0) >= min_count
62
+ oov_vec = np.sum(X[:, ~idx] > 0, axis=1)
63
+ X = X[:, idx]
64
+ self.events = np.array(X_df.columns)[idx].tolist()
65
+ X = np.hstack([X, oov_vec.reshape(X.shape[0], 1)])
66
+
67
+ num_instance, num_event = X.shape
68
+ if self.term_weighting == 'tf-idf':
69
+ df_vec = np.sum(X > 0, axis=0)
70
+ self.idf_vec = np.log(num_instance / (df_vec + 1e-8))
71
+ idf_matrix = X * np.tile(self.idf_vec, (num_instance, 1))
72
+ X = idf_matrix
73
+ if self.normalization == 'zero-mean':
74
+ mean_vec = X.mean(axis=0)
75
+ self.mean_vec = mean_vec.reshape(1, num_event)
76
+ X = X - np.tile(self.mean_vec, (num_instance, 1))
77
+ elif self.normalization == 'sigmoid':
78
+ X[X != 0] = expit(X[X != 0])
79
+ X_new = X
80
+
81
+ print('Train data shape: {}-by-{}\n'.format(X_new.shape[0], X_new.shape[1]))
82
+ return X_new
83
+
84
+ def transform(self, X_seq):
85
+ """ Transform the data matrix with trained parameters
86
+
87
+ Arguments
88
+ ---------
89
+ X_seq: log sequences matrix
90
+ (the term weighting and normalization fitted in fit_transform are reused)
91
+
92
+ Returns
93
+ -------
94
+ X_new: The transformed data matrix
95
+ """
96
+ print('====== Transformed test data summary ======')
97
+ X_counts = []
98
+ for i in range(X_seq.shape[0]):
99
+ event_counts = Counter(X_seq[i])
100
+ X_counts.append(event_counts)
101
+ X_df = pd.DataFrame(X_counts)
102
+ X_df = X_df.fillna(0)
103
+ empty_events = set(self.events) - set(X_df.columns)
104
+ for event in empty_events:
105
+ X_df[event] = [0] * len(X_df)
106
+ X = X_df[self.events].values
107
+ if self.oov:
108
+ oov_vec = np.sum(X_df[X_df.columns.difference(self.events)].values > 0, axis=1)
109
+ X = np.hstack([X, oov_vec.reshape(X.shape[0], 1)])
110
+
111
+ num_instance, num_event = X.shape
112
+ if self.term_weighting == 'tf-idf':
113
+ idf_matrix = X * np.tile(self.idf_vec, (num_instance, 1))
114
+ X = idf_matrix
115
+ if self.normalization == 'zero-mean':
116
+ X = X - np.tile(self.mean_vec, (num_instance, 1))
117
+ elif self.normalization == 'sigmoid':
118
+ X[X != 0] = expit(X[X != 0])
119
+ X_new = X
120
+
121
+ print('Test data shape: {}-by-{}\n'.format(X_new.shape[0], X_new.shape[1]))
122
+
123
+ return X_new, self.events
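A hedged usage sketch for FeatureExtractor; x_train_seq/x_test_seq are assumed to be the event sequence arrays produced by the dataloader. Note that in this variant transform() returns a (matrix, events) tuple:

    from loglizer.preprocessing import FeatureExtractor

    fe = FeatureExtractor()
    x_train_mat = fe.fit_transform(x_train_seq, term_weighting='tf-idf',
                                   normalization='zero-mean')
    x_test_mat, events = fe.transform(x_test_seq)   # reuses the fitted idf and mean vectors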
data/loglizer/utils.py ADDED
@@ -0,0 +1,29 @@
1
+ """
2
+ The utility functions of loglizer
3
+
4
+ Authors:
5
+ LogPAI Team
6
+
7
+ """
8
+
9
+ from sklearn.metrics import precision_recall_fscore_support
10
+ import numpy as np
11
+
12
+
13
+ def metrics(y_pred, y_true):
14
+ """ Calucate evaluation metrics for precision, recall, and f1.
15
+
16
+ Arguments
17
+ ---------
18
+ y_pred: ndarray, the predicted result list
19
+ y_true: ndarray, the ground truth label list
20
+
21
+ Returns
22
+ -------
23
+ precision: float, precision value
24
+ recall: float, recall value
25
+ f1: float, f1 measure value
26
+ """
27
+ precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary')
28
+ return precision, recall, f1
29
+
data/requirements.txt ADDED
@@ -0,0 +1,4 @@
1
+ scikit-learn
2
+ pandas
3
+ streamlit
4
+ openai
data/temp/HDFS_100k.log_structured.csv ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cbb13cc937bb35ad683dfcd71929242d7b1e0c01e2a94f79c637847c30a42190
3
+ size 20892087
data_instances.csv ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5ba4d353dddd4e7ef0de168def9502f38d17dbe95ca5d966c5a6d68aa05b4955
3
+ size 909112
loglizer/__init__.py ADDED
File without changes
loglizer/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (138 Bytes).
 
loglizer/__pycache__/dataloader.cpython-38.pyc ADDED
Binary file (7.77 kB).
 
loglizer/__pycache__/preprocessing.cpython-38.pyc ADDED
Binary file (3.54 kB).
 
loglizer/__pycache__/utils.cpython-38.pyc ADDED
Binary file (836 Bytes).
 
loglizer/dataloader.py ADDED
@@ -0,0 +1,268 @@
1
+ """
2
+ The interface to load log datasets. The datasets currently supported include
3
+ HDFS and BGL.
4
+
5
+ Authors:
6
+ LogPAI Team
7
+
8
+ """
9
+
10
+ import pandas as pd
11
+ import os
12
+ import numpy as np
13
+ import re
14
+ from sklearn.utils import shuffle
15
+ from collections import OrderedDict
16
+
17
+ def _split_data(x_data, y_data=None, train_ratio=0, split_type='uniform'):
18
+ if split_type == 'uniform' and y_data is not None:
19
+ pos_idx = y_data > 0
20
+ x_pos = x_data[pos_idx]
21
+ y_pos = y_data[pos_idx]
22
+ x_neg = x_data[~pos_idx]
23
+ y_neg = y_data[~pos_idx]
24
+ train_pos = int(train_ratio * x_pos.shape[0])
25
+ train_neg = int(train_ratio * x_neg.shape[0])
26
+ x_train = np.hstack([x_pos[0:train_pos], x_neg[0:train_neg]])
27
+ y_train = np.hstack([y_pos[0:train_pos], y_neg[0:train_neg]])
28
+ x_test = np.hstack([x_pos[train_pos:], x_neg[train_neg:]])
29
+ y_test = np.hstack([y_pos[train_pos:], y_neg[train_neg:]])
30
+ elif split_type == 'sequential':
31
+ num_train = int(train_ratio * x_data.shape[0])
32
+ x_train = x_data[0:num_train]
33
+ x_test = x_data[num_train:]
34
+ if y_data is None:
35
+ y_train = None
36
+ y_test = None
37
+ else:
38
+ y_train = y_data[0:num_train]
39
+ y_test = y_data[num_train:]
40
+ # Random shuffle
41
+ indexes = shuffle(np.arange(x_train.shape[0]))
42
+ x_train = x_train[indexes]
43
+ if y_train is not None:
44
+ y_train = y_train[indexes]
45
+ return (x_train, y_train), (x_test, y_test)
46
+
47
+ def load_HDFS(log_file, label_file=None, window='session', train_ratio=0.5, split_type='sequential', save_csv=False, window_size=0):
48
+ """ Load HDFS structured log into train and test data
49
+
50
+ Arguments
51
+ ---------
52
+ log_file: str, the file path of structured log.
53
+ label_file: str, the file path of anomaly labels, None for unlabeled data
54
+ window: str, the window options including `session` (default).
55
+ train_ratio: float, the ratio of training data for train/test split.
56
+ split_type: `uniform` or `sequential`, which determines how to split the dataset. `uniform` means
57
+ positive and negative samples are each split by train_ratio (requires label_file). `sequential`
58
+ means the data is split in order without label_file: the first part is for training,
59
+ while the second part is for testing.
60
+
61
+ Returns
62
+ -------
63
+ (x_train, y_train): the training data
64
+ (x_test, y_test): the testing data
65
+ """
66
+
67
+ print('====== Input data summary ======')
68
+
69
+ if log_file.endswith('.npz'):
70
+ # Split training and validation set in a class-uniform way
71
+ data = np.load(log_file)
72
+ x_data = data['x_data']
73
+ y_data = data['y_data']
74
+ (x_train, y_train), (x_test, y_test) = _split_data(x_data, y_data, train_ratio, split_type)
75
+
76
+ elif log_file.endswith('.csv'):
77
+ assert window == 'session', "Only window=session is supported for HDFS dataset."
78
+ print("Loading", log_file)
79
+ struct_log = pd.read_csv(log_file, engine='c',
80
+ na_filter=False, memory_map=True)
81
+ data_dict = OrderedDict()
82
+ for idx, row in struct_log.iterrows():
83
+ blkId_list = re.findall(r'(blk_-?\d+)', row['Content'])
84
+ blkId_set = set(blkId_list)
85
+ for blk_Id in blkId_set:
86
+ if blk_Id not in data_dict:
87
+ data_dict[blk_Id] = []
88
+ data_dict[blk_Id].append(row['EventId'])
89
+ data_df = pd.DataFrame(list(data_dict.items()), columns=['BlockId', 'EventSequence'])
90
+
91
+ if label_file:
92
+ # Split training and validation set in a class-uniform way
93
+ label_data = pd.read_csv(label_file, engine='c', na_filter=False, memory_map=True)
94
+ label_data = label_data.set_index('BlockId')
95
+ label_dict = label_data['Label'].to_dict()
96
+ data_df['Label'] = data_df['BlockId'].apply(lambda x: 1 if label_dict[x] == 'Anomaly' else 0)
97
+
98
+ # Split train and test data
99
+ (x_train, y_train), (x_test, y_test) = _split_data(data_df['EventSequence'].values,
100
+ data_df['Label'].values, train_ratio, split_type)
101
+
102
+ print(y_train.sum(), y_test.sum())
103
+
104
+ if save_csv:
105
+ data_df.to_csv('data_instances.csv', index=False)
106
+
107
+ if window_size > 0:
108
+ x_train, window_y_train, y_train = slice_hdfs(x_train, y_train, window_size)
109
+ x_test, window_y_test, y_test = slice_hdfs(x_test, y_test, window_size)
110
+ log = "{} {} windows ({}/{} anomaly), {}/{} normal"
111
+ print(log.format("Train:", x_train.shape[0], y_train.sum(), y_train.shape[0], (1-y_train).sum(), y_train.shape[0]))
112
+ print(log.format("Test:", x_test.shape[0], y_test.sum(), y_test.shape[0], (1-y_test).sum(), y_test.shape[0]))
113
+ return (x_train, window_y_train, y_train), (x_test, window_y_test, y_test)
114
+
115
+ if label_file is None:
116
+ if split_type == 'uniform':
117
+ split_type = 'sequential'
118
+ print('Warning: Only split_type=sequential is supported '
119
+ 'if label_file=None.')
120
+ # Split training and validation set sequentially
121
+ x_data = data_df['EventSequence'].values
122
+ (x_train, _), (x_test, _) = _split_data(x_data, train_ratio=train_ratio, split_type=split_type)
123
+ print('Total: {} instances, train: {} instances, test: {} instances'.format(
124
+ x_data.shape[0], x_train.shape[0], x_test.shape[0]))
125
+ return (x_train, None), (x_test, None), data_df
126
+ else:
127
+ raise NotImplementedError('load_HDFS() only support csv and npz files!')
128
+
129
+ num_train = x_train.shape[0]
130
+ num_test = x_test.shape[0]
131
+ num_total = num_train + num_test
132
+ num_train_pos = sum(y_train)
133
+ num_test_pos = sum(y_test)
134
+ num_pos = num_train_pos + num_test_pos
135
+
136
+ print('Total: {} instances, {} anomaly, {} normal' \
137
+ .format(num_total, num_pos, num_total - num_pos))
138
+ print('Train: {} instances, {} anomaly, {} normal' \
139
+ .format(num_train, num_train_pos, num_train - num_train_pos))
140
+ print('Test: {} instances, {} anomaly, {} normal\n' \
141
+ .format(num_test, num_test_pos, num_test - num_test_pos))
142
+
143
+ return (x_train, y_train), (x_test, y_test)
144
+
145
+ def slice_hdfs(x, y, window_size):
146
+ results_data = []
147
+ print("Slicing {} sessions, with window {}".format(x.shape[0], window_size))
148
+ for idx, sequence in enumerate(x):
149
+ seqlen = len(sequence)
150
+ i = 0
151
+ while (i + window_size) < seqlen:
152
+ slice = sequence[i: i + window_size]
153
+ results_data.append([idx, slice, sequence[i + window_size], y[idx]])
154
+ i += 1
155
+ else:
156
+ slice = sequence[i: i + window_size]
157
+ slice += ["#Pad"] * (window_size - len(slice))
158
+ results_data.append([idx, slice, "#Pad", y[idx]])
159
+ results_df = pd.DataFrame(results_data, columns=["SessionId", "EventSequence", "Label", "SessionLabel"])
160
+ print("Slicing done, {} windows generated".format(results_df.shape[0]))
161
+ return results_df[["SessionId", "EventSequence"]], results_df["Label"], results_df["SessionLabel"]
162
+
163
+
164
+
165
+ def load_BGL(log_file, label_file=None, window='sliding', time_interval=60, stepping_size=60,
166
+ train_ratio=0.8):
167
+ """ TODO
168
+
169
+ """
170
+
171
+
172
+ def bgl_preprocess_data(para, raw_data, event_mapping_data):
173
+ """ split logs into sliding windows, built an event count matrix and get the corresponding label
174
+
175
+ Args:
176
+ --------
177
+ para: the parameters dictionary
178
+ raw_data: list of (label, time)
179
+ event_mapping_data: a list of event indexes, where each row index corresponds to a log line
180
+
181
+ Returns:
182
+ --------
183
+ event_count_matrix: event count matrix, where each row is an instance (log sequence vector)
184
+ labels: a list of labels, 1 represents anomaly
185
+ """
186
+
187
+ # create the directory for saving the sliding windows (start_index, end_index), which can be directly loaded in future running
188
+ if not os.path.exists(para['save_path']):
189
+ os.mkdir(para['save_path'])
190
+ log_size = raw_data.shape[0]
191
+ sliding_file_path = para['save_path']+'sliding_'+str(para['window_size'])+'h_'+str(para['step_size'])+'h.csv'
192
+
193
+ #=============divide into sliding windows=========#
194
+ start_end_index_list = [] # list of (start_index, end_index) tuples, one per sliding time window
195
+ label_data, time_data = raw_data[:,0], raw_data[:, 1]
196
+ if not os.path.exists(sliding_file_path):
197
+ # split into sliding window
198
+ start_time = time_data[0]
199
+ start_index = 0
200
+ end_index = 0
201
+
202
+ # get the first start, end index, end time
203
+ for cur_time in time_data:
204
+ if cur_time < start_time + para['window_size']*3600:
205
+ end_index += 1
206
+ end_time = cur_time
207
+ else:
208
+ start_end_pair=tuple((start_index,end_index))
209
+ start_end_index_list.append(start_end_pair)
210
+ break
211
+ # move the start and end index until next sliding window
212
+ while end_index < log_size:
213
+ start_time = start_time + para['step_size']*3600
214
+ end_time = end_time + para['step_size']*3600
215
+ for i in range(start_index,end_index):
216
+ if time_data[i] < start_time:
217
+ i+=1
218
+ else:
219
+ break
220
+ for j in range(end_index, log_size):
221
+ if time_data[j] < end_time:
222
+ j+=1
223
+ else:
224
+ break
225
+ start_index = i
226
+ end_index = j
227
+ start_end_pair = tuple((start_index, end_index))
228
+ start_end_index_list.append(start_end_pair)
229
+ inst_number = len(start_end_index_list)
230
+ print('there are %d instances (sliding windows) in this dataset\n'%inst_number)
231
+ np.savetxt(sliding_file_path,start_end_index_list,delimiter=',',fmt='%d')
232
+ else:
233
+ print('Loading start_end_index_list from file')
234
+ start_end_index_list = pd.read_csv(sliding_file_path, header=None).values
235
+ inst_number = len(start_end_index_list)
236
+ print('there are %d instances (sliding windows) in this dataset' % inst_number)
237
+
238
+ # get all the log indexes in each time window by ranging from start_index to end_index
239
+ expanded_indexes_list=[]
240
+ for t in range(inst_number):
241
+ index_list = []
242
+ expanded_indexes_list.append(index_list)
243
+ for i in range(inst_number):
244
+ start_index = start_end_index_list[i][0]
245
+ end_index = start_end_index_list[i][1]
246
+ for l in range(start_index, end_index):
247
+ expanded_indexes_list[i].append(l)
248
+
249
+ event_mapping_data = [row[0] for row in event_mapping_data]
250
+ event_num = len(list(set(event_mapping_data)))
251
+ print('There are %d log events'%event_num)
252
+
253
+ #=============get labels and event count of each sliding window =========#
254
+ labels = []
255
+ event_count_matrix = np.zeros((inst_number,event_num))
256
+ for j in range(inst_number):
257
+ label = 0 #0 represent success, 1 represent failure
258
+ for k in expanded_indexes_list[j]:
259
+ event_index = event_mapping_data[k]
260
+ event_count_matrix[j, event_index] += 1
261
+ if label_data[k]:
262
+ label = 1
263
+ continue
264
+ labels.append(label)
265
+ assert inst_number == len(labels)
266
+ print("Among all instances, %d are anomalies"%sum(labels))
267
+ assert event_count_matrix.shape[0] == len(labels)
268
+ return event_count_matrix, labels
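A hedged usage sketch for load_HDFS, using the structured log and label files shipped in this commit; the returned x_* values are per-block event ID sequences that FeatureExtractor then turns into count matrices:

    from loglizer.dataloader import load_HDFS

    (x_train, y_train), (x_test, y_test) = load_HDFS(
        'data/HDFS/HDFS_100k.log_structured.csv',
        label_file='data/HDFS/anomaly_label.csv',
        window='session', train_ratio=0.5, split_type='uniform')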
loglizer/models/DecisionTree.py ADDED
@@ -0,0 +1,78 @@
1
+ """
2
+ The implementation of the decision tree model for anomaly detection.
3
+
4
+ Authors:
5
+ LogPAI Team
6
+
7
+ Reference:
8
+ [1] Mike Chen, Alice X. Zheng, Jim Lloyd, Michael I. Jordan, Eric Brewer.
9
+ Failure Diagnosis Using Decision Trees. IEEE International Conference
10
+ on Autonomic Computing (ICAC), 2004.
11
+
12
+ """
13
+
14
+ import numpy as np
15
+ from sklearn import tree
16
+ from ..utils import metrics
17
+
18
+ class DecisionTree(object):
19
+
20
+ def __init__(self, criterion='gini', max_depth=None, max_features=None, class_weight=None):
21
+ """ The Invariants Mining model for anomaly detection
22
+ Arguments
23
+ ---------
24
+ See DecisionTreeClassifier API: https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
25
+
26
+ Attributes
27
+ ----------
28
+ classifier: object, the classifier for anomaly detection
29
+
30
+ """
31
+ self.classifier = tree.DecisionTreeClassifier(criterion=criterion, max_depth=max_depth,
32
+ max_features=max_features, class_weight=class_weight)
33
+
34
+ def fit(self, X, y):
35
+ """
36
+ Arguments
37
+ ---------
38
+ X: ndarray, the event count matrix of shape num_instances-by-num_events
39
+ """
40
+ print('====== Model summary ======')
41
+ self.classifier.fit(X, y)
42
+
43
+ def predict(self, X):
44
+ """ Predict anomalies with mined invariants
45
+
46
+ Arguments
47
+ ---------
48
+ X: the input event count matrix
49
+
50
+ Returns
51
+ -------
52
+ y_pred: ndarray, the predicted label vector of shape (num_instances,)
53
+ """
54
+
55
+ y_pred = self.classifier.predict(X)
56
+ return y_pred
57
+
58
+ def predict_proba(self, X):
59
+ """ Predict anomalies with mined invariants
60
+
61
+ Arguments
62
+ ---------
63
+ X: the input event count matrix
64
+
65
+ Returns
66
+ -------
67
+ y_pred: ndarray, the predicted class probabilities of shape (num_instances, 2)
68
+ """
69
+
70
+ y_pred = self.classifier.predict_proba(X)
71
+ return y_pred
72
+
73
+ def evaluate(self, X, y_true):
74
+ print('====== Evaluation summary ======')
75
+ y_pred = self.predict(X)
76
+ precision, recall, f1 = metrics(y_pred, y_true)
77
+ print('Precision: {:.3f}, recall: {:.3f}, F1-measure: {:.3f}\n'.format(precision, recall, f1))
78
+ return precision, recall, f1
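A hedged usage sketch for DecisionTree (supervised, labeled data assumed):

    from loglizer.models import DecisionTree

    model = DecisionTree(criterion='gini', max_depth=None)
    model.fit(x_train, y_train)             # supervised: 0/1 labels are required
    y_pred = model.predict(x_test)
    precision, recall, f1 = model.evaluate(x_test, y_test)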
loglizer/models/InvariantsMiner.py ADDED
@@ -0,0 +1,296 @@
1
+ """
2
+ The implementation of Invariants Mining model for anomaly detection.
3
+
4
+ Authors:
5
+ LogPAI Team
6
+
7
+ Reference:
8
+ [1] Jian-Guang Lou, Qiang Fu, Shengqi Yang, Ye Xu, Jiang Li. Mining Invariants
9
+ from Console Logs for System Problem Detection. USENIX Annual Technical
10
+ Conference (ATC), 2010.
11
+
12
+ """
13
+
14
+ import numpy as np
15
+ from itertools import combinations
16
+ from ..utils import metrics
17
+
18
+ class InvariantsMiner(object):
19
+
20
+ def __init__(self, percentage=0.98, epsilon=0.5, longest_invarant=None, scale_list=[1,2,3]):
21
+ """ The Invariants Mining model for anomaly detection
22
+
23
+ Attributes
24
+ ----------
25
+ percentage: float, percentage of samples satisfying the condition that |X_j * V_i| < epsilon
26
+ epsilon: float, the threshold for estimating the invariant space
27
+ longest_invarant: int, the maximal invariant length to consider; defaults to None. The
28
+ search stops once the candidate length exceeds longest_invarant.
29
+ scale_list: list, the list used to scale the theta of float into integer
30
+ invariants_dict: dict, dictionary of invariants where the key is the selected columns
31
+ and the value is the weight vector of the invariant
32
+ """
33
+ self.percentage = percentage
34
+ self.epsilon = epsilon
35
+ self.longest_invarant = longest_invarant
36
+ self.scale_list = scale_list
37
+ self.invariants_dict = None
38
+
39
+ def fit(self, X):
40
+ """
41
+ Arguments
42
+ ---------
43
+ X: ndarray, the event count matrix of shape num_instances-by-num_events
44
+ """
45
+ print('====== Model summary ======')
46
+ invar_dim = self._estimate_invarant_space(X)
47
+ self._invariants_search(X, invar_dim)
48
+
49
+ def predict(self, X):
50
+ """ Predict anomalies with mined invariants
51
+
52
+ Arguments
53
+ ---------
54
+ X: the input event count matrix
55
+
56
+ Returns
57
+ -------
58
+ y_pred: ndarray, the predicted label vector of shape (num_instances,)
59
+ """
60
+
61
+ y_sum = np.zeros(X.shape[0])
62
+ for cols, theta in self.invariants_dict.items():
63
+ y_sum += np.fabs(np.dot(X[:, cols], np.array(theta)))
64
+ y_pred = (y_sum > 1e-6).astype(int)
65
+ return y_pred
66
+
67
+ def evaluate(self, X, y_true):
68
+ print('====== Evaluation summary ======')
69
+ y_pred = self.predict(X)
70
+ precision, recall, f1 = metrics(y_pred, y_true)
71
+ print('Precision: {:.3f}, recall: {:.3f}, F1-measure: {:.3f}\n'.format(precision, recall, f1))
72
+ return precision, recall, f1
73
+
74
+ def _estimate_invarant_space(self, X):
75
+ """ Estimate the dimension of invariant space using SVD decomposition
76
+
77
+ Arguments
78
+ ---------
79
+ X: ndarray, the event count matrix of shape num_instances-by-num_events
80
+ percentage: float, percentage of samples satisfying the condition that |X_j * V_i| < epsilon
81
+ epsilon: float, the threshold for estimating the invariant space
82
+
83
+ Returns
84
+ -------
85
+ r: the dimension of invariant space
86
+ """
87
+ covariance_matrix = np.dot(X.T, X)
88
+ U, sigma, V = np.linalg.svd(covariance_matrix) # SVD decomposition
89
+ # Iterate from the rightmost column of U: numpy returns singular values in descending order, so the smallest come last
90
+ num_instances, num_events = X.shape
91
+ r = 0
92
+ for i in range(num_events - 1, -1, -1):
93
+ zero_count = sum(abs(np.dot(X, U[:, i])) < self.epsilon)
94
+ if zero_count / float(num_instances) < self.percentage:
95
+ break
96
+ r += 1
97
+ print('Invariant space dimension: {}'.format(r))
98
+
99
+ return r
100
+
101
+ def _invariants_search(self, X, r):
102
+ """ Mine invariant relationships from X
103
+
104
+ Arguments
105
+ ---------
106
+ X: ndarray, the event count matrix of shape num_instances-by-num_events
107
+ r: the dimension of invariant space
108
+ """
109
+
110
+ num_instances, num_events = X.shape
111
+ invariants_dict = dict() # maps the selected columns (key) to the mined invariant weights (value)
112
+ search_space = [] # only invariant candidates in this list are valid
113
+
114
+ # invariants of a single column (all-zero columns)
115
+ init_cols = sorted([[item] for item in range(num_events)])
116
+ for col in init_cols:
117
+ search_space.append(col)
118
+ init_col_list = init_cols[:]
119
+ for col in init_cols:
120
+ if np.count_nonzero(X[:, col]) == 0:
121
+ invariants_dict[tuple(col)] = [1]
122
+ search_space.remove(col)
123
+ init_col_list.remove(col)
124
+
125
+ item_list = init_col_list
126
+ length = 2
127
+ FLAG_break_loop = False
128
+ # check invariant of more columns
129
+ while len(item_list) != 0:
130
+ if self.longest_invarant and len(item_list[0]) >= self.longest_invarant:
131
+ break
132
+ joined_item_list = self._join_set(item_list, length) # generate new invariant candidates
133
+ for items in joined_item_list:
134
+ if self._check_candi_valid(items, length, search_space):
135
+ search_space.append(items)
136
+ item_list = []
137
+ for item in joined_item_list:
138
+ if tuple(item) in invariants_dict:
139
+ continue
140
+ if item not in search_space:
141
+ continue
142
+ if not self._check_candi_valid(tuple(item), length, search_space) and length > 2:
143
+ search_space.remove(item)
144
+ continue # an item must be a superset of all its subitems in the search space; otherwise skip
145
+ validity, scaled_theta = self._check_invar_validity(X, item)
146
+ if validity:
147
+ self._prune(invariants_dict.keys(), set(item), search_space)
148
+ invariants_dict[tuple(item)] = scaled_theta.tolist()
149
+ search_space.remove(item)
150
+ else:
151
+ item_list.append(item)
152
+ if len(invariants_dict) >= r:
153
+ FLAG_break_loop = True
154
+ break
155
+ if FLAG_break_loop:
156
+ break
157
+ length += 1
158
+ print('Mined {} invariants: {}\n'.format(len(invariants_dict), invariants_dict))
159
+ self.invariants_dict = invariants_dict
160
+
161
+ def _compute_eigenvector(self, X):
162
+ """ calculate the smallest eigenvalue and corresponding eigenvector (theta in the paper)
163
+ for a given sub_matrix
164
+
165
+ Arguments
166
+ ---------
167
+ X: the event count matrix (each row is a log sequence vector, each column represents an event)
168
+
169
+ Returns
170
+ -------
171
+ min_vec: the eigenvector of corresponding minimum eigen value
172
+ FLAG_contain_zero: whether the min_vec contains zero (very small value)
173
+ """
174
+
175
+ FLAG_contain_zero = False
176
+ count_zero = 0
177
+ dot_result = np.dot(X.T, X)
178
+ U, S, V = np.linalg.svd(dot_result)
179
+ min_vec = U[:, -1]
180
+ count_zero = sum(np.fabs(min_vec) < 1e-6)
181
+ if count_zero != 0:
182
+ FLAG_contain_zero = True
183
+ return min_vec, FLAG_contain_zero
184
+
185
+
186
+ def _check_invar_validity(self, X, selected_columns):
187
+ """ scale the eigenvector of float number into integer, and check whether the scaled number is valid
188
+
189
+ Arguments
190
+ ---------
191
+ X: the event count matrix (each row is a log sequence vector, each column represents an event)
192
+ selected_columns: the columns selected from the full column list
193
+
194
+ Returns
195
+ -------
196
+ validity: whether the selected columns form a valid invariant
197
+ scaled_theta: the scaled theta vector
198
+ """
199
+
200
+ sub_matrix = X[:, selected_columns]
201
+ inst_num = X.shape[0]
202
+ validity = False
203
+ min_theta, FLAG_contain_zero = self._compute_eigenvector(sub_matrix)
204
+ abs_min_theta = [np.fabs(it) for it in min_theta]
205
+ if FLAG_contain_zero:
206
+ return validity, []
207
+ else:
208
+ for i in self.scale_list:
209
+ min_index = np.argmin(abs_min_theta)
210
+ scale = float(i) / min_theta[min_index]
211
+ scaled_theta = np.array([round(item * scale) for item in min_theta])
212
+ scaled_theta[min_index] = i
213
+ scaled_theta = scaled_theta.T
214
+ if 0 in np.fabs(scaled_theta):
215
+ continue
216
+ dot_submat_theta = np.dot(sub_matrix, scaled_theta)
217
+ count_zero = 0
218
+ for j in dot_submat_theta:
219
+ if np.fabs(j) < 1e-8:
220
+ count_zero += 1
221
+ if count_zero >= self.percentage * inst_num:
222
+ validity = True
223
+ # print('A valid invariant is found: ',scaled_theta, selected_columns)
224
+ break
225
+ return validity, scaled_theta
226
+
227
+ def _prune(self, valid_cols, new_item_set, search_space):
228
+ """ prune invalid combination of columns
229
+
230
+ Arguments
231
+ ---------
232
+ valid_cols: existing valid column list
233
+ new_item_set: item set to be merged
234
+ search_space: the search space that stores possible candidates
235
+
236
+ """
237
+
238
+ if len(valid_cols) == 0:
239
+ return
240
+ for se in valid_cols:
241
+ intersection = set(se) & new_item_set
242
+ if len(intersection) == 0:
243
+ continue
244
+ union = set(se) | new_item_set
245
+ for item in list(intersection):
246
+ diff = sorted(list(union - set([item])))
247
+ if diff in search_space:
248
+ search_space.remove(diff)
249
+
250
+
251
+ def _join_set(self, item_list, length):
252
+ """ Join a set with itself and returns the n-element (length) itemsets
253
+
254
+ Arguments
255
+ ---------
256
+ item_list: current list of columns
257
+ length: generate new items of length
258
+
259
+ Returns
260
+ -------
261
+ return_list: list of items of length-element
262
+ """
263
+
264
+ set_len = len(item_list)
265
+ return_list = []
266
+ for i in range(set_len):
267
+ for j in range(i + 1, set_len):
268
+ i_set = set(item_list[i])
269
+ j_set = set(item_list[j])
270
+ if len(i_set.union(j_set)) == length:
271
+ joined = sorted(list(i_set.union(j_set)))
272
+ if joined not in return_list:
273
+ return_list.append(joined)
274
+ return_list = sorted(return_list)
275
+ return return_list
276
+
277
+
278
+ def _check_candi_valid(self, item, length, search_space):
279
+ """ check whether an item's subitems are in searchspace
280
+
281
+ Arguments
282
+ ---------
283
+ item: item to be checked
284
+ length: the length of item
285
+ search_space: the search space that stores possible candidates
286
+
287
+ Returns
288
+ -------
289
+ True or False
290
+ """
291
+
292
+ for subItem in combinations(item, length - 1):
293
+ if sorted(list(subItem)) not in search_space:
294
+ return False
295
+ return True
296
+
loglizer/models/IsolationForest.py ADDED
@@ -0,0 +1,96 @@
1
+ """
2
+ The implementation of IsolationForest model for anomaly detection.
3
+
4
+ Authors:
5
+ LogPAI Team
6
+
7
+ Reference:
8
+ [1] Fei Tony Liu, Kai Ming Ting, Zhi-Hua Zhou. Isolation Forest. International
9
+ Conference on Data Mining (ICDM), 2008.
10
+
11
+ """
12
+
13
+
14
+
15
+
16
+
17
+ import numpy as np
18
+ from sklearn.ensemble import IsolationForest as iForest
19
+ from ..utils import metrics
20
+
21
+ class IsolationForest(iForest):
22
+
23
+ def __init__(self, n_estimators=100, max_samples='auto', contamination=0.03, **kwargs):
24
+ """ The IsolationForest model for anomaly detection
25
+
26
+ Arguments
27
+ ---------
28
+ n_estimators : int, optional (default=100). The number of base estimators in the ensemble.
29
+ max_samples : int or float, optional (default="auto")
30
+ The number of samples to draw from X to train each base estimator.
31
+ - If int, then draw max_samples samples.
32
+ - If float, then draw max_samples * X.shape[0] samples.
33
+ - If "auto", then max_samples=min(256, n_samples).
34
+ If max_samples is larger than the number of samples provided, all samples will be used
35
+ for all trees (no sampling).
36
+ contamination : float in (0., 0.5), optional (default=0.03 in this wrapper)
37
+ The amount of contamination of the data set, i.e. the proportion of outliers in the data
38
+ set. Used when fitting to define the threshold on the decision function. If 'auto', the
39
+ decision function threshold is determined as in the original paper.
40
+ max_features : int or float, optional (default=1.0)
41
+ The number of features to draw from X to train each base estimator.
42
+ - If int, then draw max_features features.
43
+ - If float, then draw max_features * X.shape[1] features.
44
+ bootstrap : boolean, optional (default=False)
45
+ If True, individual trees are fit on random subsets of the training data sampled with replacement.
46
+ If False, sampling without replacement is performed.
47
+ n_jobs : int or None, optional (default=None)
48
+ The number of jobs to run in parallel for both fit and predict. None means 1 unless in a
49
+ joblib.parallel_backend context. -1 means using all processors.
50
+ random_state : int, RandomState instance or None, optional (default=None)
51
+ If int, random_state is the seed used by the random number generator;
52
+ If RandomState instance, random_state is the random number generator;
53
+ If None, the random number generator is the RandomState instance used by np.random.
54
+
55
+ Reference
56
+ ---------
57
+ For more information, please visit https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.IsolationForest.html
58
+ """
59
+
60
+ super(IsolationForest, self).__init__(n_estimators=n_estimators, max_samples=max_samples,
61
+ contamination=contamination, **kwargs)
62
+
63
+
64
+ def fit(self, X):
65
+ """
66
+ Auguments
67
+ ---------
68
+ X: ndarray, the event count matrix of shape num_instances-by-num_events
69
+ """
70
+
71
+ print('====== Model summary ======')
72
+ super(IsolationForest, self).fit(X)
73
+
74
+ def predict(self, X):
75
+ """ Predict anomalies with mined invariants
76
+
77
+ Arguments
78
+ ---------
79
+ X: the input event count matrix
80
+
81
+ Returns
82
+ -------
83
+ y_pred: ndarray, the predicted label vector of shape (num_instances,)
84
+ """
85
+
86
+ y_pred = super(IsolationForest, self).predict(X)
87
+ y_pred = np.where(y_pred > 0, 0, 1)
88
+ return y_pred
89
+
90
+ def evaluate(self, X, y_true):
91
+ print('====== Evaluation summary ======')
92
+ y_pred = self.predict(X)
93
+ precision, recall, f1 = metrics(y_pred, y_true)
94
+ print('Precision: {:.3f}, recall: {:.3f}, F1-measure: {:.3f}\n'.format(precision, recall, f1))
95
+ return precision, recall, f1
96
+
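For context, a minimal usage sketch of this wrapper; the Poisson matrix below is synthetic stand-in data, since in the app the event count matrix would come from the loglizer feature pipeline:

    import numpy as np
    from loglizer.models import IsolationForest

    # Synthetic event count matrix: 1000 log sequences x 20 event templates.
    rng = np.random.RandomState(42)
    X = rng.poisson(lam=2.0, size=(1000, 20))

    model = IsolationForest(n_estimators=100, contamination=0.03, random_state=42)
    model.fit(X)               # unsupervised: no labels required
    y_pred = model.predict(X)  # 1 = anomaly, 0 = normal (remapped from sklearn's -1/+1)
    print('Flagged {} of {} instances'.format(int(y_pred.sum()), len(y_pred)))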
loglizer/models/LR.py ADDED
@@ -0,0 +1,73 @@
+ # -*- coding: utf-8 -*-
+ """
+ The implementation of the logistic regression model for anomaly detection.
+
+ Authors:
+     LogPAI Team
+
+ Reference:
+     [1] Peter Bodík, Moises Goldszmidt, Armando Fox, Hans Andersen. Fingerprinting
+         the Datacenter: Automated Classification of Performance Crises. The European
+         Conference on Computer Systems (EuroSys), 2010.
+
+ """
+
+ import numpy as np
+ from sklearn.linear_model import LogisticRegression
+ from ..utils import metrics
+
+ class LR(object):
+
+     def __init__(self, penalty='l2', C=100, tol=0.01, class_weight=None, max_iter=100):
+         """ The logistic regression model for anomaly detection
+
+         Attributes
+         ----------
+             classifier: object, the classifier for anomaly detection
+         """
+         self.classifier = LogisticRegression(penalty=penalty, C=C, tol=tol, class_weight=class_weight,
+                                              max_iter=max_iter)
+
+     def fit(self, X, y):
+         """
+         Arguments
+         ---------
+             X: ndarray, the event count matrix of shape num_instances-by-num_events
+         """
+         print('====== Model summary ======')
+         self.classifier.fit(X, y)
+
+     def predict(self, X):
+         """ Predict anomaly labels with the trained classifier
+
+         Arguments
+         ---------
+             X: the input event count matrix
+
+         Returns
+         -------
+             y_pred: ndarray, the predicted label vector of shape (num_instances,)
+         """
+         y_pred = self.classifier.predict(X)
+         return y_pred
+
+     def predict_proba(self, X):
+         """ Predict anomaly probabilities with the trained classifier
+
+         Arguments
+         ---------
+             X: the input event count matrix
+
+         Returns
+         -------
+             y_pred: ndarray, the predicted probability matrix of shape (num_instances, 2)
+         """
+         y_pred = self.classifier.predict_proba(X)
+         return y_pred
+
+     def evaluate(self, X, y_true):
+         print('====== Evaluation summary ======')
+         y_pred = self.predict(X)
+         precision, recall, f1 = metrics(y_pred, y_true)
+         print('Precision: {:.3f}, recall: {:.3f}, F1-measure: {:.3f}\n'.format(precision, recall, f1))
+         return precision, recall, f1
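A hedged usage sketch of the supervised flow; the labels below come from an arbitrary toy rule (flag unusually busy sequences) so the classifier has something learnable, purely for illustration:

    import numpy as np
    from loglizer.models import LR

    rng = np.random.RandomState(0)
    X_train = rng.poisson(lam=2.0, size=(500, 20))
    y_train = (X_train.sum(axis=1) > 50).astype(int)  # toy rule: ~5% 'anomalies'

    model = LR(penalty='l2', C=100, max_iter=100)
    model.fit(X_train, y_train)
    proba = model.predict_proba(X_train)  # shape (500, 2): P(normal), P(anomaly)
    precision, recall, f1 = model.evaluate(X_train, y_train)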
loglizer/models/LogClustering.py ADDED
@@ -0,0 +1,137 @@
+ """
+ The implementation of Log Clustering model for anomaly detection.
+
+ Authors:
+     LogPAI Team
+
+ Reference:
+     [1] Qingwei Lin, Hongyu Zhang, Jian-Guang Lou, Yu Zhang, Xuewei Chen. Log Clustering
+         based Problem Identification for Online Service Systems. International Conference
+         on Software Engineering (ICSE), 2016.
+
+ """
+
+ import numpy as np
+ import pprint
+ from scipy.special import expit
+ from numpy import linalg as LA
+ from scipy.cluster.hierarchy import linkage, fcluster
+ from scipy.spatial.distance import pdist, squareform
+ from ..utils import metrics
+
+
+ class LogClustering(object):
+
+     def __init__(self, max_dist=0.3, anomaly_threshold=0.3, mode='online', num_bootstrap_samples=1000):
+         """
+         Attributes
+         ----------
+             max_dist: float, the threshold to stop the clustering process
+             anomaly_threshold: float, the threshold for anomaly detection
+             mode: str, 'offline' or 'online' mode for clustering
+             num_bootstrap_samples: int, online clustering starts with a bootstrapping process, which
+                 determines the initial cluster representatives offline using a subset of samples
+             representatives: ndarray, the representative samples of clusters, of shape
+                 num_clusters-by-num_events
+             cluster_size_dict: dict, the size of each cluster, used to update representatives online
+         """
+         self.max_dist = max_dist
+         self.anomaly_threshold = anomaly_threshold
+         self.mode = mode
+         self.num_bootstrap_samples = num_bootstrap_samples
+         self.representatives = list()
+         self.cluster_size_dict = dict()
+
+     def fit(self, X):
+         print('====== Model summary ======')
+         if self.mode == 'offline':
+             # The offline mode can only process about 10K samples due to huge memory consumption.
+             self._offline_clustering(X)
+         elif self.mode == 'online':
+             # Bootstrapping phase
+             if self.num_bootstrap_samples > 0:
+                 X_bootstrap = X[0:self.num_bootstrap_samples, :]
+                 self._offline_clustering(X_bootstrap)
+             # Online learning phase
+             if X.shape[0] > self.num_bootstrap_samples:
+                 self._online_clustering(X)
+
+     def predict(self, X):
+         y_pred = np.zeros(X.shape[0])
+         for i in range(X.shape[0]):
+             min_dist, min_index = self._get_min_cluster_dist(X[i, :])
+             if min_dist > self.anomaly_threshold:
+                 y_pred[i] = 1
+         return y_pred
+
+     def evaluate(self, X, y_true):
+         print('====== Evaluation summary ======')
+         y_pred = self.predict(X)
+         precision, recall, f1 = metrics(y_pred, y_true)
+         print('Precision: {:.3f}, recall: {:.3f}, F1-measure: {:.3f}\n' \
+               .format(precision, recall, f1))
+         return precision, recall, f1
+
+     def _offline_clustering(self, X):
+         print('Starting offline clustering...')
+         p_dist = pdist(X, metric=self._distance_metric)
+         Z = linkage(p_dist, 'complete')
+         cluster_index = fcluster(Z, self.max_dist, criterion='distance')
+         self._extract_representatives(X, cluster_index)
+         print('Processed {} instances.'.format(X.shape[0]))
+         print('Found {} clusters offline.\n'.format(len(self.representatives)))
+         # print('The representative vectors are:')
+         # pprint.pprint(self.representatives.tolist())
+
+     def _extract_representatives(self, X, cluster_index):
+         num_clusters = len(set(cluster_index))
+         for clu in range(num_clusters):
+             clu_idx = np.argwhere(cluster_index == clu + 1)[:, 0]
+             self.cluster_size_dict[clu] = clu_idx.shape[0]
+             repre_center = np.average(X[clu_idx, :], axis=0)
+             self.representatives.append(repre_center)
+
+     def _online_clustering(self, X):
+         print("Starting online clustering...")
+         for i in range(self.num_bootstrap_samples, X.shape[0]):
+             if (i + 1) % 2000 == 0:
+                 print('Processed {} instances.'.format(i + 1))
+             instance_vec = X[i, :]
+             if len(self.representatives) > 0:
+                 min_dist, clu_id = self._get_min_cluster_dist(instance_vec)
+                 if min_dist <= self.max_dist:
+                     self.cluster_size_dict[clu_id] += 1
+                     self.representatives[clu_id] = self.representatives[clu_id] \
+                                                    + (instance_vec - self.representatives[clu_id]) \
+                                                    / self.cluster_size_dict[clu_id]
+                     continue
+             self.cluster_size_dict[len(self.representatives)] = 1
+             self.representatives.append(instance_vec)
+         print('Processed {} instances.'.format(X.shape[0]))
+         print('Found {} clusters online.\n'.format(len(self.representatives)))
+         # print('The representative vectors are:')
+         # pprint.pprint(self.representatives.tolist())
+
+     def _distance_metric(self, x1, x2):
+         norm = LA.norm(x1) * LA.norm(x2)
+         distance = 1 - np.dot(x1, x2) / (norm + 1e-8)
+         if distance < 1e-8:
+             distance = 0
+         return distance
+
+     def _get_min_cluster_dist(self, instance_vec):
+         min_index = -1
+         min_dist = float('inf')
+         for i in range(len(self.representatives)):
+             cluster_rep = self.representatives[i]
+             dist = self._distance_metric(instance_vec, cluster_rep)
+             if dist < 1e-8:
+                 min_dist = 0
+                 min_index = i
+                 break
+             elif dist < min_dist:
+                 min_dist = dist
+                 min_index = i
+         return min_dist, min_index
+
+
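A usage sketch of the online mode on synthetic counts; sizes are kept small because the offline bootstrap computes a full pairwise distance matrix with a Python-level cosine metric:

    import numpy as np
    from loglizer.models import LogClustering

    rng = np.random.RandomState(1)
    X = rng.poisson(lam=2.0, size=(1500, 20)).astype(float)

    # Cluster the first 300 rows offline, then absorb the rest one row at a time.
    model = LogClustering(max_dist=0.3, anomaly_threshold=0.3,
                          mode='online', num_bootstrap_samples=300)
    model.fit(X)
    y_pred = model.predict(X)  # 1 where the nearest representative is farther than anomaly_threshold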
loglizer/models/PCA.py ADDED
@@ -0,0 +1,105 @@
+ """
+ The implementation of PCA model for anomaly detection.
+
+ Authors:
+     LogPAI Team
+
+ Reference:
+     [1] Wei Xu, Ling Huang, Armando Fox, David Patterson, Michael I. Jordan.
+         Large-Scale System Problems Detection by Mining Console Logs. ACM
+         Symposium on Operating Systems Principles (SOSP), 2009.
+
+ """
+
+ import numpy as np
+ from ..utils import metrics
+
+ class PCA(object):
+
+     def __init__(self, n_components=0.95, threshold=None, c_alpha=3.2905):
+         """ The PCA model for anomaly detection
+
+         Attributes
+         ----------
+             proj_C: the projection matrix for projecting a feature vector onto the abnormal space
+             n_components: float/int, the number of principal components or the variance ratio they cover
+             threshold: float, the anomaly detection threshold. When set to None, the threshold
+                 is automatically calculated using the Q-statistic
+             c_alpha: float, the c_alpha parameter for calculating the anomaly detection threshold with
+                 the Q-statistic. The following is a lookup table for c_alpha:
+                 c_alpha = 1.7507; # alpha = 0.08
+                 c_alpha = 1.9600; # alpha = 0.05
+                 c_alpha = 2.5758; # alpha = 0.01
+                 c_alpha = 2.807;  # alpha = 0.005
+                 c_alpha = 2.9677; # alpha = 0.003
+                 c_alpha = 3.2905; # alpha = 0.001
+                 c_alpha = 3.4808; # alpha = 0.0005
+                 c_alpha = 3.8906; # alpha = 0.0001
+                 c_alpha = 4.4172; # alpha = 0.00001
+         """
+
+         self.proj_C = None
+         self.components = None
+         self.n_components = n_components
+         self.threshold = threshold
+         self.c_alpha = c_alpha
+
+
+     def fit(self, X):
+         """
+         Arguments
+         ---------
+             X: ndarray, the event count matrix of shape num_instances-by-num_events
+         """
+
+         print('====== Model summary ======')
+         num_instances, num_events = X.shape
+         X_cov = np.dot(X.T, X) / float(num_instances)
+         U, sigma, V = np.linalg.svd(X_cov)
+         n_components = self.n_components
+         if n_components < 1:
+             total_variance = np.sum(sigma)
+             variance = 0
+             for i in range(num_events):
+                 variance += sigma[i]
+                 if variance / total_variance >= n_components:
+                     break
+             n_components = i + 1
+
+         P = U[:, :n_components]
+         I = np.identity(num_events, int)
+         self.components = P
+         self.proj_C = I - np.dot(P, P.T)
+         print('n_components: {}'.format(n_components))
+         print('Projection matrix shape: {}-by-{}'.format(self.proj_C.shape[0], self.proj_C.shape[1]))
+
+         if not self.threshold:
+             # Calculate the threshold using the Q-statistic. More information can be found at:
+             # http://conferences.sigcomm.org/sigcomm/2004/papers/p405-lakhina111.pdf
+             phi = np.zeros(3)
+             for i in range(3):
+                 for j in range(n_components, num_events):
+                     phi[i] += np.power(sigma[j], i + 1)
+             h0 = 1.0 - 2 * phi[0] * phi[2] / (3.0 * phi[1] * phi[1])
+             self.threshold = phi[0] * np.power(self.c_alpha * np.sqrt(2 * phi[1] * h0 * h0) / phi[0]
+                                                + 1.0 + phi[1] * h0 * (h0 - 1) / (phi[0] * phi[0]),
+                                                1.0 / h0)
+             print('SPE threshold: {}\n'.format(self.threshold))
+
+     def predict(self, X):
+         assert self.proj_C is not None, 'PCA model needs to be trained before prediction.'
+         y_pred = np.zeros(X.shape[0])
+         for i in range(X.shape[0]):
+             y_a = np.dot(self.proj_C, X[i, :])
+             SPE = np.dot(y_a, y_a)
+             if SPE > self.threshold:
+                 y_pred[i] = 1
+         return y_pred
+
+     def evaluate(self, X, y_true):
+         print('====== Evaluation summary ======')
+         y_pred = self.predict(X)
+         precision, recall, f1 = metrics(y_pred, y_true)
+         print('Precision: {:.3f}, recall: {:.3f}, F1-measure: {:.3f}\n'.format(precision, recall, f1))
+         return precision, recall, f1
+
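To make the detection rule concrete, here is a sketch that reproduces the SPE test by hand after fitting (synthetic data; fit() also derives the Q-statistic threshold when none is given):

    import numpy as np
    from loglizer.models import PCA

    rng = np.random.RandomState(7)
    X = rng.poisson(lam=2.0, size=(1000, 20)).astype(float)

    model = PCA(n_components=0.95)  # keep components covering 95% of the variance
    model.fit(X)

    # predict() computes, per instance x: SPE = ||(I - P P^T) x||^2
    x = X[0]
    y_a = np.dot(model.proj_C, x)   # projection onto the abnormal subspace
    spe = np.dot(y_a, y_a)
    print(spe > model.threshold)    # the same test predict() applies to every row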
loglizer/models/SVM.py ADDED
@@ -0,0 +1,64 @@
+ """
+ The implementation of the SVM model for anomaly detection.
+
+ Authors:
+     LogPAI Team
+
+ Reference:
+     [1] Yinglung Liang, Yanyong Zhang, Hui Xiong, Ramendra Sahoo. Failure Prediction
+         in IBM BlueGene/L Event Logs. IEEE International Conference on Data Mining
+         (ICDM), 2007.
+
+ """
+
+ import numpy as np
+ from sklearn import svm
+ from ..utils import metrics
+
+ class SVM(object):
+
+     def __init__(self, penalty='l1', tol=0.1, C=1, dual=False, class_weight=None,
+                  max_iter=100):
+         """ The SVM model for anomaly detection
+         Arguments
+         ---------
+             See the LinearSVC API: https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html
+
+         Attributes
+         ----------
+             classifier: object, the classifier for anomaly detection
+
+         """
+         self.classifier = svm.LinearSVC(penalty=penalty, tol=tol, C=C, dual=dual,
+                                         class_weight=class_weight, max_iter=max_iter)
+
+     def fit(self, X, y):
+         """
+         Arguments
+         ---------
+             X: ndarray, the event count matrix of shape num_instances-by-num_events
+         """
+         print('====== Model summary ======')
+         self.classifier.fit(X, y)
+
+     def predict(self, X):
+         """ Predict anomaly labels with the trained classifier
+
+         Arguments
+         ---------
+             X: the input event count matrix
+
+         Returns
+         -------
+             y_pred: ndarray, the predicted label vector of shape (num_instances,)
+         """
+
+         y_pred = self.classifier.predict(X)
+         return y_pred
+
+     def evaluate(self, X, y_true):
+         print('====== Evaluation summary ======')
+         y_pred = self.predict(X)
+         precision, recall, f1 = metrics(y_pred, y_true)
+         print('Precision: {:.3f}, recall: {:.3f}, F1-measure: {:.3f}\n'.format(precision, recall, f1))
+         return precision, recall, f1
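A matching supervised sketch for the linear SVM, with the same illustrative toy labels as above; class_weight='balanced' is shown as one common option for rare anomalies, not a repo default:

    import numpy as np
    from loglizer.models import SVM

    rng = np.random.RandomState(3)
    X_train = rng.poisson(lam=2.0, size=(500, 20))
    y_train = (X_train.sum(axis=1) > 50).astype(int)  # toy rule, illustration only

    model = SVM(penalty='l1', C=1, dual=False, class_weight='balanced')
    model.fit(X_train, y_train)
    precision, recall, f1 = model.evaluate(X_train, y_train)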
loglizer/models/__init__.py ADDED
@@ -0,0 +1,8 @@
+ from .PCA import PCA
+ from .InvariantsMiner import InvariantsMiner
+ from .LogClustering import LogClustering
+ from .LR import LR
+ from .SVM import SVM
+ from .DecisionTree import DecisionTree
+ from .IsolationForest import IsolationForest
+
loglizer/models/__pycache__/DecisionTree.cpython-38.pyc ADDED
Binary file (2.57 kB).
 
loglizer/models/__pycache__/InvariantsMiner.cpython-38.pyc ADDED
Binary file (9.52 kB).