# NOTE: extracted from a web file viewer; the line-number gutter was removed.
# Viewer header: file size 14,947 bytes; revision ids shown beside the listing: e826e46, f328c2e, caac8b8.
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import LSTM, Dropout, Dense, Bidirectional, Attention
from keras.callbacks import EarlyStopping, Callback
import matplotlib.pyplot as plt
import time
import os
import random
from datasets import load_dataset
import os
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"

# Custom callback to log epoch times
class EpochTimeCallback(Callback):
    """Keras callback that reports how long each training epoch took.

    The duration is written to the Streamlit page after every epoch and, when
    a ``logs`` dict is supplied, also stored in it under ``'epoch_time'``.
    """

    def on_epoch_begin(self, epoch, logs=None):
        """Record the wall-clock time at which the epoch starts."""
        self.epoch_start_time = time.time()

    def on_epoch_end(self, epoch, logs=None):
        """Compute the epoch duration, log it and display it.

        Args:
            epoch: Zero-based epoch index (displayed one-based).
            logs: Optional metrics dict; updated in place when provided.
        """
        epoch_time = time.time() - self.epoch_start_time
        # ``logs`` defaults to None, so only record the value when a dict
        # was actually passed in.
        if logs is not None:
            logs['epoch_time'] = epoch_time
        st.write(f"Epoch {epoch + 1} time: {epoch_time:.2f} seconds")


# Function to create sequences
def create_sequences(data, sequence_length, output_sequence_length, target_column):
    """Slice a time-ordered table into sliding input/target windows.

    Args:
        data: 2-D table of shape ``(n_rows, n_features)`` with rows in time
            order; a pandas DataFrame or anything ``np.asarray`` accepts.
        sequence_length: Number of consecutive rows in each input window.
        output_sequence_length: Number of rows directly after each window
            whose target values form the label.
        target_column: Integer position of the target feature among the
            columns of ``data``.

    Returns:
        Tuple ``(X, y)`` where ``X`` has shape
        ``(n_windows, sequence_length, n_features)`` and ``y`` has shape
        ``(n_windows, output_sequence_length)``. When ``data`` has fewer than
        ``sequence_length + output_sequence_length`` rows, both arrays are
        empty but keep those trailing dimensions.
    """
    data_array = np.asarray(data)
    total_length = sequence_length + output_sequence_length
    n_windows = len(data_array) - total_length + 1

    if n_windows <= 0:
        # Keep the trailing dimensions so empty results can still be
        # concatenated with non-empty ones.
        empty_X = np.empty((0, sequence_length) + data_array.shape[1:], dtype=data_array.dtype)
        empty_y = np.empty((0, output_sequence_length), dtype=data_array.dtype)
        return empty_X, empty_y

    X = [data_array[start:start + sequence_length] for start in range(n_windows)]
    y = [
        data_array[start + sequence_length:start + total_length, target_column].reshape(-1)
        for start in range(n_windows)
    ]
    return np.array(X), np.array(y)


# Function to preprocess data
def _stack_subscriber_windows(data, misisdn_column, event_column_date, columns_to_scale,
                              target_index, sequence_length, output_sequence_length, split_name):
    """Build sliding windows per subscriber and stack them into one X/y pair."""
    min_rows = sequence_length + output_sequence_length
    X_parts, y_parts = [], []
    for _, group in data.groupby(misisdn_column):
        # Subscribers with too little history cannot yield a single window.
        if len(group) < min_rows:
            continue
        features = group.sort_values(event_column_date)[columns_to_scale]
        X_part, y_part = create_sequences(features, sequence_length, output_sequence_length,
                                          target_index)
        X_parts.append(X_part)
        y_parts.append(y_part)

    if not X_parts:
        raise ValueError(
            f"No subscriber in the {split_name} split has at least {min_rows} rows "
            f"(sequence_length + output_sequence_length)."
        )
    return np.concatenate(X_parts), np.concatenate(y_parts)


def preprocess_data(df, misisdn_column, event_column_date, columns_to_scale, target_column, sequence_length,
                    output_sequence_length):
    """Split subscribers into train/test, scale the features and build LSTM windows.

    Subscribers (unique values of ``misisdn_column``) are split 80/20 with a
    fixed random state, so every subscriber belongs to exactly one split. A
    ``RobustScaler`` is fitted on the training rows only and applied to both
    splits. Each subscriber's rows are then sorted by ``event_column_date``
    and cut into sliding windows with ``create_sequences``.

    Args:
        df: DataFrame containing the id, date and feature columns.
        misisdn_column: Name of the subscriber id column.
        event_column_date: Name of the column used to order each subscriber's rows.
        columns_to_scale: Names of the feature columns fed to the model.
        target_column: Name of the column to predict. It is located by its
            position within ``columns_to_scale``.
        sequence_length: Rows per input window.
        output_sequence_length: Rows per target window.

    Returns:
        ``(trainX, trainY, testX, testY, scaler)``.

    Raises:
        ValueError: If the target cannot be mapped to a scaled feature, or a
            split contains no subscriber with enough rows.
    """
    scaled_columns = list(columns_to_scale)
    if target_column in scaled_columns:
        # The windows are built from ``group[columns_to_scale]``, so the
        # target position must be relative to that list, not to ``df``.
        target_index = scaled_columns.index(target_column)
    else:
        # Legacy fallback: earlier callers relied on the column's position in
        # ``df``. Kept so a target outside ``columns_to_scale`` behaves as before.
        target_index = df.columns.get_loc(target_column)
        if not isinstance(target_index, (int, np.integer)) or target_index >= len(scaled_columns):
            raise ValueError(
                f"Target column {target_column!r} must be one of the scaled columns: {scaled_columns}"
            )

    msisdns = df[misisdn_column].unique()
    train_msisdns, test_msisdns = train_test_split(msisdns, test_size=0.2, random_state=42)
    # Copy the filtered frames so assigning the scaled values below does not
    # write into a slice of ``df``.
    train_data = df[df[misisdn_column].isin(train_msisdns)].copy()
    test_data = df[df[misisdn_column].isin(test_msisdns)].copy()

    scaler = RobustScaler()
    train_data[scaled_columns] = scaler.fit_transform(train_data[scaled_columns])
    test_data[scaled_columns] = scaler.transform(test_data[scaled_columns])

    trainX, trainY = _stack_subscriber_windows(train_data, misisdn_column, event_column_date,
                                               scaled_columns, target_index, sequence_length,
                                               output_sequence_length, "training")
    testX, testY = _stack_subscriber_windows(test_data, misisdn_column, event_column_date,
                                             scaled_columns, target_index, sequence_length,
                                             output_sequence_length, "test")

    return trainX, trainY, testX, testY, scaler


# Page 1: Load dataset and preprocess data
def page_load_and_preprocess():
    """Render page 1: load the dataset, choose columns and build the sequences.

    Loads the ``azizshaw/events`` dataset, lets the user pick the id, date,
    feature and target columns plus the window sizes, and on "Preprocess Data"
    runs ``preprocess_data``. The resulting arrays, the scaler and all chosen
    settings are stored in ``st.session_state`` for the following pages.
    """
    st.title("LSTM Time Series Forecasting")
    st.header("Step 1: Load Dataset")
    # NOTE(review): the entered path is currently not used; the data always
    # comes from the hosted dataset loaded below. Confirm whether local CSV
    # loading should be supported before wiring this value in.
    data_path = st.text_input("Enter the path to the CSV file",placeholder="/Users/sachinpb/Documents/6d/lstm/15_count.csv")

    dataset = load_dataset("azizshaw/events")
    df = dataset['train'].to_pandas()
    print("data loaded sucessfully")
    st.write("Data Preview:")
    st.write(df.head())

    st.header("Step 2: Preprocess Data")
    available_columns = df.columns.tolist()
    misisdn_column = st.text_input("MSISDN Column", "Cust_Sub_Id")
    event_column_date = st.text_input("Date Column", "Event_Date")
    # Only pre-select defaults that exist; a default missing from the options
    # makes the multiselect raise.
    default_features = [name for name in ('Total_Revenue', 'Data_Volume', 'OG_Call_Count')
                        if name in available_columns]
    columns_to_scale = st.multiselect("Columns to Scale", available_columns, default=default_features)

    missing = [name for name in (misisdn_column, event_column_date) if name not in available_columns]
    if missing:
        st.error(f"Column(s) not found in the dataset: {', '.join(missing)}")
        return
    if not columns_to_scale:
        st.warning("Select at least one column to scale.")
        return
    overlapping = [name for name in columns_to_scale if name in (misisdn_column, event_column_date)]
    if overlapping:
        st.error(f"The MSISDN/date columns cannot also be scaled: {', '.join(overlapping)}")
        return

    # Scaled columns go first so that a column's position in ``df`` equals its
    # position among the scaled features; code that looks the target up via
    # ``df.columns.get_loc`` then indexes the right feature.
    df = df[columns_to_scale + [misisdn_column, event_column_date]]
    # The target has to be one of the model's (scaled) features.
    target_column = st.selectbox("Target Column", columns_to_scale)
    sequence_length = st.number_input("Sequence Length", min_value=1, max_value=100, value=30)
    output_sequence_length = st.number_input("Output Sequence Length", min_value=1, max_value=10, value=1)

    if not st.button("Preprocess Data"):
        return

    st.write("Starting data preprocessing...")  # Log message for preprocessing
    try:
        trainX, trainY, testX, testY, scaler = preprocess_data(df, misisdn_column, event_column_date,
                                                               columns_to_scale, target_column, sequence_length,
                                                               output_sequence_length)
    except ValueError as exc:
        # E.g. too few subscribers to split, or nobody has enough rows.
        st.error(f"Data preprocessing failed: {exc}")
        return

    results = {
        'trainX': trainX,
        'trainY': trainY,
        'testX': testX,
        'testY': testY,
        'scaler': scaler,
        'df': df,
        'columns_to_scale': columns_to_scale,
        'target_column': target_column,
        'sequence_length': sequence_length,
        'output_sequence_length': output_sequence_length,
        'misisdn_column': misisdn_column,
        'event_column_date': event_column_date,
    }
    for key, value in results.items():
        st.session_state[key] = value

    st.write("Data Preprocessing Completed")
    st.write(f"Training Data Shape: {trainX.shape}")
    st.write(f"Test Data Shape: {testX.shape}")

    st.write("Go to the next page to build and train the model.")


# Page 2: Build and train model
def _build_lstm_model(input_shape, lstm_units, output_units, activation_function,
                      dropout_rate, bidirectional, attention):
    """Assemble the stacked LSTM network, optionally bidirectional and with self-attention."""
    # Imported locally: these names are not among the file's top-level keras imports.
    from keras.layers import GlobalAveragePooling1D, Input
    from keras.models import Model

    inputs = Input(shape=input_shape)
    x = inputs
    last = len(lstm_units) - 1
    for i, units in enumerate(lstm_units):
        # Every layer but the last must emit the full sequence for the next
        # LSTM; with attention the last one must too, since attention works
        # over time steps.
        recurrent = LSTM(units, activation=activation_function,
                         return_sequences=(i != last) or attention)
        if bidirectional:
            recurrent = Bidirectional(recurrent)
        x = recurrent(x)
        if i != last:
            x = Dropout(dropout_rate)(x)

    if attention:
        # ``Attention`` must be called on a [query, value] list; using the
        # LSTM output for both gives self-attention. The result is averaged
        # over time to get one vector per sample for the Dense head.
        x = Attention()([x, x])
        x = GlobalAveragePooling1D()(x)

    outputs = Dense(output_units)(x)
    return Model(inputs, outputs)


def page_build_and_train():
    """Render page 2: configure, train and save the LSTM model.

    Requires the arrays produced by the preprocessing page in
    ``st.session_state``. On "Train Model" the network is built, trained with
    early stopping on ``val_loss`` (10% validation split), saved to
    ``lstm_model.h5`` and stored in ``st.session_state['model']``.
    """
    if 'trainX' not in st.session_state:
        st.warning("Please complete the data preprocessing step first.")
        return

    st.header("Step 3: Build and Train LSTM Model")
    num_layers = st.number_input("Number of LSTM Layers", min_value=1, max_value=5, value=2)
    lstm_units = [st.slider(f"LSTM Units for Layer {i + 1}", min_value=10, max_value=200, value=64) for i in
                  range(num_layers)]
    dropout_rate = st.slider("Dropout Rate", min_value=0.0, max_value=0.5, value=0.2)
    activation_function = st.selectbox("Activation Function", ["relu", "tanh"])
    loss_function = st.selectbox("Loss Function", ["mse", "mae"])
    optimizer = st.selectbox("Optimizer", ["adam", "sgd"])
    epochs = st.number_input("Epochs", min_value=1, max_value=100, value=15)
    batch_size = st.number_input("Batch Size", min_value=1, max_value=1000, value=600)
    bidirectional = st.checkbox("Bidirectional LSTM")
    attention = st.checkbox("Add Attention Layer")

    if not st.button("Train Model"):
        return

    st.write("Starting model training...")  # Log message to indicate training has started
    try:
        input_shape = (st.session_state['sequence_length'], len(st.session_state['columns_to_scale']))
        model = _build_lstm_model(input_shape, lstm_units, st.session_state['output_sequence_length'],
                                  activation_function, dropout_rate, bidirectional, attention)
        model.compile(optimizer=optimizer, loss=loss_function)
        early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
        epoch_time_callback = EpochTimeCallback()

        history = model.fit(st.session_state['trainX'], st.session_state['trainY'], epochs=epochs,
                            batch_size=batch_size, validation_split=0.1, verbose=1,
                            callbacks=[early_stopping, epoch_time_callback])
        st.write("Model Training Completed")

        st.write("Training and Validation Loss")
        history_df = pd.DataFrame(history.history)
        # Plot only the loss curves; the history may also carry other
        # per-epoch values (e.g. the epoch time added by the callback).
        loss_columns = [name for name in history_df.columns if name.endswith('loss')]
        st.line_chart(history_df[loss_columns])

        # Save the model to disk and keep it in the session for the prediction page.
        model.save("lstm_model.h5")
        st.session_state['model'] = model
        st.write("Model Saved")

    except Exception as e:
        # UI boundary: show the failure on the page instead of crashing the app.
        st.error(f"An error occurred during model training: {e}")


# Page 3: Generate predictions
def _inverse_scale_target(scaler, values, target_index, n_features):
    """Undo the scaling of a 1-D series of target values."""
    # The scaler expects all features, so pad with zeros and read back only
    # the target column.
    padded = np.zeros((len(values), n_features))
    padded[:, target_index] = values
    return scaler.inverse_transform(padded)[:, target_index]


def _rolling_forecast(model, first_window, n_steps, target_index):
    """Forecast ``n_steps`` values, feeding each prediction back into the input window."""
    window = np.array(first_window, dtype=float)
    forecasts = []
    for _ in range(n_steps):
        next_value = model.predict(window[np.newaxis, ...])[0, 0]
        forecasts.append(next_value)
        # The model only predicts the target. The other features of the new
        # row are carried forward from the latest row (a simple assumption;
        # their future values are not known here).
        next_row = window[-1].copy()
        next_row[target_index] = next_value
        window = np.vstack([window[1:], next_row])
    return np.array(forecasts)


def page_generate_predictions():
    """Render page 3: plot predicted vs. actual target values per subscriber.

    For every selected MSISDN the subscriber's rows are sorted by date, scaled
    with the fitted scaler and cut into windows. The model's first predicted
    step for each window (or a rolling forecast started from the first
    window) is converted back to original units and plotted against the
    actual values.
    """
    if 'model' not in st.session_state:
        st.warning("Please complete the model training step first.")
        return

    state = st.session_state
    df = state['df']
    id_column = state['misisdn_column']
    date_column = state['event_column_date']
    scaled_columns = list(state['columns_to_scale'])
    target_column = state['target_column']
    sequence_length = state['sequence_length']
    output_sequence_length = state['output_sequence_length']
    n_features = len(scaled_columns)

    st.header("Step 4: Make Predictions")

    all_msisdns = list(df[id_column].unique())
    # Draw the random default once per session: a new sample on every rerun
    # would reset the widget and discard the user's selection. Never ask for
    # more ids than exist.
    if 'prediction_default_msisdns' not in st.session_state:
        st.session_state['prediction_default_msisdns'] = random.sample(all_msisdns, min(5, len(all_msisdns)))
    default_msisdns = [m for m in st.session_state['prediction_default_msisdns'] if m in all_msisdns]
    random_msisdns = st.multiselect("Select MSISDNs for Prediction", all_msisdns, default=default_msisdns)
    rolling_prediction = st.checkbox("Use Rolling Prediction")

    if not st.button("Generate Predictions"):
        return

    # The windows are built from the scaled feature columns, so the target
    # position is taken relative to that list.
    if target_column in scaled_columns:
        target_index = scaled_columns.index(target_column)
    else:
        # Legacy fallback: position of the column in ``df``.
        target_index = df.columns.get_loc(target_column)
        if not isinstance(target_index, (int, np.integer)) or target_index >= n_features:
            st.error(f"Target column {target_column!r} is not one of the scaled columns.")
            return
    target_name = scaled_columns[target_index]

    min_rows = sequence_length + output_sequence_length
    for msisdn in random_msisdns:
        group = df[df[id_column] == msisdn].sort_values(date_column)
        if len(group) < min_rows:
            st.warning(f"Skipping MSISDN {msisdn}: fewer than {min_rows} rows available.")
            continue

        # ``df`` holds raw values; scale them with the fitted scaler before
        # building the model input.
        scaled_features = pd.DataFrame(state['scaler'].transform(group[scaled_columns]),
                                       columns=scaled_columns, index=group.index)
        X_test, _ = create_sequences(scaled_features, sequence_length, output_sequence_length, target_index)
        n_points = len(X_test)

        if rolling_prediction:
            predicted_scaled = _rolling_forecast(state['model'], X_test[0], n_points, target_index)
            title = f"Rolling Predictions for MSISDN: {msisdn}"
        else:
            # Use the first predicted step of every window.
            predicted_scaled = state['model'].predict(X_test)[:, 0]
            title = f"Predictions for MSISDN: {msisdn}"

        predicted = _inverse_scale_target(state['scaler'], predicted_scaled, target_index, n_features)
        # Window i predicts the row right after its input, i.e. row
        # ``sequence_length + i``; read actuals and dates from there.
        actual = group[target_name].to_numpy()[sequence_length:sequence_length + n_points]
        dates = group[date_column].iloc[sequence_length:sequence_length + n_points]

        fig, ax = plt.subplots(figsize=(10, 5))
        ax.plot(dates, predicted, label='Predicted')
        ax.plot(dates, actual, label='Actual')
        ax.set_title(title)
        ax.set_xlabel("Date")
        ax.set_ylabel(target_name)
        ax.legend()
        st.pyplot(fig)
        # Close the figure so repeated runs do not accumulate open figures.
        plt.close(fig)


# Sidebar for navigation
# Map each sidebar entry to the function that renders that page.
_PAGE_RENDERERS = {
    "Load and Preprocess Data": page_load_and_preprocess,
    "Build and Train Model": page_build_and_train,
    "Generate Predictions": page_generate_predictions,
}

st.sidebar.title("Navigation")
page = st.sidebar.radio("Go to", list(_PAGE_RENDERERS))

# Render the selected page; nothing is drawn if the selection matches no entry.
_render_page = _PAGE_RENDERERS.get(page)
if _render_page is not None:
    _render_page()