File size: 5,494 Bytes
70dc413
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3229447
268e4c6
70dc413
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ea07529
5bfa92d
0342253
ea07529
9310b2e
ad87114
0342253
8baa7c2
6b53e3b
 
 
0463d1c
6b53e3b
 
 
6ba2cf2
6b53e3b
 
 
 
 
 
 
 
9f12d92
1d8e459
8f44094
9f12d92
3865968
6b53e3b
 
34e3471
6b53e3b
890b5d6
 
9eb8dd4
890b5d6
3229447
e39233a
 
 
ad44c07
4523cdc
890b5d6
6b53e3b
 
 
 
70dc413
6e8355d
5bfa92d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
### ----------------------------- ###
###           libraries           ###
### ----------------------------- ###

import gradio as gr
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics


### ------------------------------ ###
###       data transformation      ###
### ------------------------------ ###

# load dataset
uncleaned_data = pd.read_csv('data.csv')

# remove timestamp from dataset (always first column)
uncleaned_data = uncleaned_data.iloc[:, 1:]
data = pd.DataFrame()

# keep track of which columns are categorical and what
# those columns' value mappings are
# structure: {colname1: {...}, colname2: {...} }
cat_value_dicts = {}

# the last column is treated as the prediction target
final_colname = uncleaned_data.columns[-1]

# for each column...
# NOTE: DataFrame.iteritems() was removed in pandas 2.0; items() yields the
# same (column name, Series) pairs and exists in older versions as well
for colname, colval in uncleaned_data.items():

  # check if col is already a number (judged by its first value); if so,
  # add col directly to new dataframe and skip to next column
  if isinstance(colval.values[0], (np.integer, float)):
    data[colname] = uncleaned_data[colname].copy()
    continue

  # structure: {"lilac": 0, "blue": 1, ...}
  # codes are handed out in order of first appearance within the column
  new_dict = {}
  transformed_col_vals = []  # new numeric datapoints

  # if not, for each item in that column...
  for item in colval.values:
    # setdefault assigns the next free code the first time an item is seen
    # and returns the existing code otherwise
    transformed_col_vals.append(new_dict.setdefault(item, len(new_dict)))

  # reverse dictionary only for final col (0, 1) => (vals), so that a
  # predicted code can be turned back into its original label
  if colname == final_colname:
    new_dict = {value: key for (key, value) in new_dict.items()}

  cat_value_dicts[colname] = new_dict
  data[colname] = transformed_col_vals


### -------------------------------- ###
###           model training         ###
### -------------------------------- ###

# select features and prediction; the last column is automatically used as
# the prediction target and every column before it as a feature
cols = data.shape[1]
num_features = cols - 1
x = data.iloc[:, :num_features]
y = data.iloc[:, num_features:]

# hold out a quarter of the rows for testing
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

# fit a logistic regression with default parameters; the target is a
# one-column frame, so flatten it to 1-D before fitting
model = LogisticRegression()
train_labels = y_train.values.ravel()
model.fit(x_train, train_labels)

# predictions on the held-out rows
y_pred = model.predict(x_test)


### -------------------------------- ###
###        article generation        ###
### -------------------------------- ###
# borrow file reading function from reader.py

def get_feat():
  """Return the name of the data column with the largest-magnitude coefficient.

  Only the first row of ``model.coef_`` is inspected. When several
  coefficients share the largest absolute value, the earliest column wins.
  """
  magnitudes = [abs(weight) for weight in model.coef_[0]]
  # max() keeps the first of equal candidates, matching a first-occurrence lookup
  strongest_idx, _ = max(enumerate(magnitudes), key=lambda pair: pair[1])
  return data.columns[strongest_idx]
  
# test-set accuracy as a percentage string with one decimal place, e.g. "87.5%"
accuracy_pct = round(metrics.accuracy_score(y_test, y_pred) * 100, 1)
acc = f"{accuracy_pct}%"

# column name with the largest-magnitude model coefficient
most_imp_feat = get_feat()
# info = get_article(acc, most_imp_feat)



### ------------------------------- ###
###        interface creation       ###
### ------------------------------- ###


# predictor for generic number of features
def general_predictor(*args):
  """Predict the label for one datapoint given its feature values in column order.

  Each positional argument is paired with the column of ``data`` at the same
  position. Values for columns listed in ``cat_value_dicts`` are translated
  to their numeric codes; all other values are passed through unchanged.

  Returns the entry of ``cat_value_dicts[final_colname]`` keyed by the
  model's predicted code.

  Raises KeyError if a categorical value has no code in its column's mapping.
  """
  # encode categorical inputs, pass everything else through as-is
  encoded = [
    cat_value_dicts[name][value] if name in cat_value_dicts else value
    for name, value in zip(data.columns, args)
  ]

  # the model expects a 2-D input, so wrap the single datapoint in a list
  prediction = model.predict([encoded])

  # map the predicted code back through the final column's dictionary
  label_lookup = cat_value_dicts[final_colname]
  return label_lookup[prediction[0]]

# add data labels to replace those lost via star-args


block = gr.Blocks()

# read info.md once, up front: the first line is shown as the page heading
# and the remainder as the article panel. The file is closed before the UI is
# built (previously it was opened twice and the outer handle stayed open for
# the whole layout, with the inner handle shadowing it).
with open('info.md') as f:
  title_line = f.readline()
  article_body = f.read()

with block:
  gr.Markdown(title_line)
  gr.Markdown('Take the quiz to get a personalized recommendation using AI.')

  with gr.Row():
    # left panel: quiz inputs, result and model statistics
    with gr.Box():
      inputls = []
      for colname in data.columns:
        # skip last column (it is the prediction target, not an input)
        if colname == final_colname:
          continue

        # access categories dict if data is categorical
        # otherwise, just use a number input
        if colname in cat_value_dicts:
          radio_options = list(cat_value_dicts[colname].keys())
          inputls.append(gr.inputs.Dropdown(choices=radio_options, type="value", label=colname))
        else:
          # add numerical input
          inputls.append(gr.inputs.Number(label=colname))
        gr.Markdown("<br />")

      submit = gr.Button("Click to see your personalized result!", variant="primary")
      gr.Markdown("<br />")
      output = gr.Textbox(label="Your recommendation:", placeholder="your recommendation will appear here")

      # inputs are passed positionally, in the same order as the feature columns
      submit.click(fn=general_predictor, inputs=inputls, outputs=output)
      gr.Markdown("<br />")

      with gr.Row():
        with gr.Box():
          gr.Markdown(f"<h3>Accuracy: </h3>{acc}")
        with gr.Box():
          gr.Markdown(f"<h3>Most important feature: </h3>{most_imp_feat}")

      gr.Markdown("<br />")

      with gr.Box():
        gr.Markdown('''⭐ Note that model accuracy is based on the uploaded data.csv and reflects how well the AI model can give correct recommendations for <em>that dataset</em>. Model accuracy and most important feature can be helpful for understanding how the model works, but <em>should not be considered absolute facts about the real world</em>.''')

    # right panel: everything in info.md after its first line
    with gr.Box():
      gr.Markdown(article_body)

# show the interface
block.launch()