sagawa committed on
Commit
21bf012
·
verified ·
1 Parent(s): ea3027a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +185 -163
app.py CHANGED
@@ -1,31 +1,23 @@
1
  import os
2
- import gc
3
- import random
4
- import itertools
5
  import warnings
6
  import logging
7
- warnings.filterwarnings('ignore')
8
- logging.disable(logging.WARNING)
9
  import numpy as np
 
 
 
10
  import pandas as pd
11
- from tqdm.auto import tqdm
12
- import tokenizers
13
- import transformers
14
- from transformers import AutoTokenizer, AutoConfig, AutoModel, T5EncoderModel, get_linear_schedule_with_warmup, AutoModelForSeq2SeqLM, T5ForConditionalGeneration
15
- import datasets
16
- from datasets import load_dataset, load_metric
17
- import argparse
18
  import torch
19
- import sentencepiece
20
  from torch.utils.data import Dataset, DataLoader
21
- import torch.nn.functional as F
22
- import torch.nn as nn
23
- import pickle
24
- import time
25
- from sklearn.preprocessing import MinMaxScaler
26
  from datasets.utils.logging import disable_progress_bar
27
- from sklearn.metrics import mean_squared_error, r2_score
 
 
 
28
  disable_progress_bar()
 
 
29
  import streamlit as st
30
 
31
  st.title('predictyield-t5')
@@ -52,161 +44,191 @@ class CFG():
52
  fc_dropout = 0.1
53
  seed = 42
54
  num_workers=1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
 
56
  if st.button('predict'):
57
  with st.spinner('Now processing. This process takes about 4 seconds per reaction.'):
58
 
59
 
60
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
61
 
62
-
63
- def seed_everything(seed=42):
64
- random.seed(seed)
65
- os.environ['PYTHONHASHSEED'] = str(seed)
66
- np.random.seed(seed)
67
- torch.manual_seed(seed)
68
- torch.cuda.manual_seed(seed)
69
- torch.backends.cudnn.deterministic = True
70
  seed_everything(seed=CFG.seed)
71
 
72
  CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.model_name_or_path, return_tensors='pt')
73
 
74
- def prepare_input(cfg, text):
75
- inputs = cfg.tokenizer(text, add_special_tokens=True, max_length=CFG.max_len, padding='max_length', return_offsets_mapping=False, truncation=True, return_attention_mask=True)
76
- for k, v in inputs.items():
77
- inputs[k] = torch.tensor(v, dtype=torch.long)
78
-
79
- return inputs
80
-
81
- class TestDataset(Dataset):
82
- def __init__(self, cfg, df):
83
- self.cfg = cfg
84
- self.inputs = df['input'].values
85
-
86
- def __len__(self):
87
- return len(self.inputs)
88
-
89
- def __getitem__(self, item):
90
- inputs = prepare_input(self.cfg, self.inputs[item])
91
-
92
- return inputs
93
-
94
-
95
- class RegressionModel(nn.Module):
96
- def __init__(self, cfg, config_path=None, pretrained=False):
97
- super().__init__()
98
- self.cfg = cfg
99
- if config_path is None:
100
- self.config = AutoConfig.from_pretrained(cfg.pretrained_model_name_or_path, output_hidden_states=True)
101
- else:
102
- self.config = torch.load(config_path)
103
- if pretrained:
104
- if 't5' in cfg.model:
105
- self.model = T5ForConditionalGeneration.from_pretrained(CFG.pretrained_model_name_or_path)
106
- else:
107
- self.model = AutoModel.from_pretrained(CFG.pretrained_model_name_or_path)
108
- else:
109
- if 't5' in cfg.model:
110
- self.model = T5ForConditionalGeneration.from_pretrained('sagawa/ZINC-t5')
111
- else:
112
- self.model = AutoModel.from_config(self.config)
113
- self.model.resize_token_embeddings(len(cfg.tokenizer))
114
- self.fc_dropout1 = nn.Dropout(cfg.fc_dropout)
115
- self.fc1 = nn.Linear(self.config.hidden_size, self.config.hidden_size//2)
116
- self.fc_dropout2 = nn.Dropout(cfg.fc_dropout)
117
-
118
- self.fc2 = nn.Linear(self.config.hidden_size, self.config.hidden_size//2)
119
- self.fc3 = nn.Linear(self.config.hidden_size//2*2, self.config.hidden_size)
120
- self.fc4 = nn.Linear(self.config.hidden_size, self.config.hidden_size)
121
- self.fc5 = nn.Linear(self.config.hidden_size, 1)
122
-
123
- self._init_weights(self.fc1)
124
- self._init_weights(self.fc2)
125
- self._init_weights(self.fc3)
126
- self._init_weights(self.fc4)
127
-
128
- def _init_weights(self, module):
129
- if isinstance(module, nn.Linear):
130
- module.weight.data.normal_(mean=0.0, std=0.01)
131
- if module.bias is not None:
132
- module.bias.data.zero_()
133
- elif isinstance(module, nn.Embedding):
134
- module.weight.data.normal_(mean=0.0, std=0.01)
135
- if module.padding_idx is not None:
136
- module.weight.data[module.padding_idx].zero_()
137
- elif isinstance(module, nn.LayerNorm):
138
- module.bias.data.zero_()
139
- module.weight.data.fill_(1.0)
140
-
141
- def forward(self, inputs):
142
- encoder_outputs = self.model.encoder(**inputs)
143
- encoder_hidden_states = encoder_outputs[0]
144
- outputs = self.model.decoder(input_ids=torch.full((inputs['input_ids'].size(0),1),
145
- self.config.decoder_start_token_id,
146
- dtype=torch.long,
147
- device=device), encoder_hidden_states=encoder_hidden_states)
148
- last_hidden_states = outputs[0]
149
- output1 = self.fc1(self.fc_dropout1(last_hidden_states).view(-1, self.config.hidden_size))
150
- output2 = self.fc2(encoder_hidden_states[:, 0, :].view(-1, self.config.hidden_size))
151
- output = self.fc3(self.fc_dropout2(torch.hstack((output1, output2))))
152
- output = self.fc4(output)
153
- output = self.fc5(output)
154
- return output
155
-
156
-
157
-
158
- def inference_fn(test_loader, model, device):
159
- preds = []
160
- model.eval()
161
- model.to(device)
162
- tk0 = enumerate(test_loader)
163
- for i, inputs in tk0:
164
- for k, v in inputs.items():
165
- inputs[k] = v.to(device)
166
- with torch.no_grad():
167
- y_preds = model(inputs)
168
- preds.append(y_preds.to('cpu').numpy())
169
- predictions = np.concatenate(preds)
170
- return predictions
171
-
172
- model = RegressionModel(CFG, config_path=CFG.model_name_or_path + '/config.pth', pretrained=False)
173
- state = torch.load(CFG.model_name_or_path + '/ZINC-t5_best.pth', map_location=torch.device('cpu'))
174
- model.load_state_dict(state)
175
-
176
 
 
 
177
  if CFG.uploaded_file is not None:
178
- test_ds = pd.read_csv(CFG.uploaded_file)
179
-
180
- test_dataset = TestDataset(CFG, test_ds)
181
- test_loader = DataLoader(test_dataset,
182
- batch_size=CFG.batch_size,
183
- shuffle=False,
184
- num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
185
-
186
-
187
- prediction = inference_fn(test_loader, model, device)
188
-
189
- test_ds['prediction'] = prediction*100
190
- test_ds['prediction'] = test_ds['prediction'].clip(0, 100)
191
- csv = test_ds.to_csv(index=False)
192
- st.download_button(
193
- label="Download data as CSV",
194
- data=csv,
195
- file_name='output.csv',
196
- mime='text/csv'
197
- )
198
-
199
  else:
200
- CFG.batch_size=1
201
- test_ds = pd.DataFrame.from_dict({'input': CFG.data}, orient='index').T
202
- test_dataset = TestDataset(CFG, test_ds)
203
- test_loader = DataLoader(test_dataset,
204
- batch_size=CFG.batch_size,
205
- shuffle=False,
206
- num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
207
-
208
-
209
- prediction = inference_fn(test_loader, model, device)
210
- prediction = max(min(prediction[0][0]*100, 100), 0)
211
- st.text('yiled: '+ str(prediction))
212
-
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
 
 
 
2
  import warnings
3
  import logging
4
+ import random
 
5
  import numpy as np
6
+ import torch.nn as nn
7
+ from transformers import AutoConfig, PreTrainedModel, T5ForConditionalGeneration
8
+
9
  import pandas as pd
 
 
 
 
 
 
 
10
  import torch
 
11
  from torch.utils.data import Dataset, DataLoader
12
+ from transformers import AutoTokenizer
 
 
 
 
13
  from datasets.utils.logging import disable_progress_bar
14
+
15
+ # Suppress warnings and logging
16
+ warnings.filterwarnings("ignore")
17
+ logging.disable(logging.WARNING)
18
  disable_progress_bar()
19
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
20
+
21
  import streamlit as st
22
 
23
  st.title('predictyield-t5')
 
44
  fc_dropout = 0.1
45
  seed = 42
46
  num_workers=1
47
+
48
+
49
def seed_everything(seed=42):
    """
    Seed every random number generator used by the app.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices), exports ``PYTHONHASHSEED`` and puts cuDNN in deterministic mode.

    Args:
        seed (int): Seed value applied to all generators.
    """
    random.seed(seed)
    # Only affects interpreters started after this point (e.g. worker processes).
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one; ignored without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run.
    torch.backends.cudnn.benchmark = False
56
+
57
+
58
def prepare_input(cfg, text):
    """
    Tokenize one input string and convert every tokenizer field to a tensor.

    Args:
        cfg: Configuration object; only ``cfg.tokenizer`` (a callable
            tokenizer) and ``cfg.max_len`` are read.
        text (str): Input text.

    Returns:
        dict: Maps each key returned by the tokenizer to a ``torch.long``
            tensor. The tokenizer is asked to pad/truncate to ``cfg.max_len``.
    """
    inputs = cfg.tokenizer(
        text,
        add_special_tokens=True,
        max_length=cfg.max_len,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
    )
    return {k: torch.tensor(v, dtype=torch.long) for k, v in inputs.items()}
78
+
79
+
80
def inference_fn(test_loader, model, cfg):
    """
    Run the model over every batch of a loader and collect the predictions.

    Args:
        test_loader (Iterable[dict]): Yields batches as dicts of tensors
            (e.g. a ``DataLoader`` over ``TestDataset``).
        model (nn.Module): Model called as ``model(inputs)`` with the batch dict.
        cfg: Configuration object; only ``cfg.device`` is read.

    Returns:
        np.ndarray: Per-batch outputs concatenated along the first axis.
            An empty loader yields an empty array of shape ``(0, 1)``.
    """
    model.eval()
    model.to(cfg.device)
    preds = []

    # Gradients are never needed here, so disable them once for the whole loop.
    with torch.no_grad():
        for inputs in test_loader:
            inputs = {k: v.to(cfg.device) for k, v in inputs.items()}
            y_preds = model(inputs)
            preds.append(y_preds.to("cpu").numpy())

    if not preds:
        # np.concatenate raises ValueError on an empty list.
        return np.empty((0, 1), dtype=np.float32)
    return np.concatenate(preds)
103
+
104
def preprocess(df, cfg=None):
    """
    Build the model input column from the reaction component columns.

    The ``input`` column is written into ``df`` in place as
    ``"REACTANT:" + REACTANT + "REAGENT:" + REAGENT + "PRODUCT:" + PRODUCT``.

    Args:
        df (pd.DataFrame): Must contain string columns ``REACTANT``,
            ``REAGENT`` and ``PRODUCT``.
        cfg: Unused. Accepted so that both ``preprocess(df)`` and
            ``preprocess(df, cfg)`` call styles work.

    Returns:
        pd.DataFrame: The same DataFrame object, with the ``input`` column added.

    Raises:
        KeyError: If one of the three component columns is missing.
    """
    df["input"] = (
        "REACTANT:"
        + df["REACTANT"]
        + "REAGENT:"
        + df["REAGENT"]
        + "PRODUCT:"
        + df["PRODUCT"]
    )

    return df
125
+
126
+
127
class TestDataset(Dataset):
    """
    Inference-time dataset: tokenizes the ``input`` column of a DataFrame.

    No labels are stored; each item is the dict of tensors produced by
    ``prepare_input`` for one row.
    """

    def __init__(self, cfg, df):
        """
        Args:
            cfg: Configuration object forwarded to ``prepare_input``.
            df (pd.DataFrame): Must contain an ``input`` column of strings.
        """
        self.cfg = cfg
        self.inputs = df["input"].values

    def __len__(self):
        """Return the number of input rows."""
        return len(self.inputs)

    def __getitem__(self, item):
        """Return the tokenized tensors for the row at position ``item``."""
        return prepare_input(self.cfg, self.inputs[item])
143
+
144
+
145
class ReactionT5Yield(PreTrainedModel):
    """
    T5 encoder-decoder with a fully connected head that outputs one value per input.

    The decoder is run for a single step on the start token. Its hidden state
    and the first encoder position are each projected to half the hidden size,
    concatenated, and passed through three more linear layers down to a scalar.
    """

    config_class = AutoConfig

    def __init__(self, config):
        """
        Args:
            config: Model configuration. ``_name_or_path``, ``vocab_size``,
                ``hidden_size`` and ``decoder_start_token_id`` are read.
        """
        super().__init__(config)
        self.config = config
        # The T5 backbone is loaded from the path/name recorded in the config.
        self.model = T5ForConditionalGeneration.from_pretrained(self.config._name_or_path)
        self.model.resize_token_embeddings(self.config.vocab_size)
        self.fc1 = nn.Linear(self.config.hidden_size, self.config.hidden_size//2)
        self.fc2 = nn.Linear(self.config.hidden_size, self.config.hidden_size//2)
        self.fc3 = nn.Linear(self.config.hidden_size//2*2, self.config.hidden_size)
        self.fc4 = nn.Linear(self.config.hidden_size, self.config.hidden_size)
        self.fc5 = nn.Linear(self.config.hidden_size, 1)

        self._init_weights(self.fc1)
        self._init_weights(self.fc2)
        self._init_weights(self.fc3)
        self._init_weights(self.fc4)
        self._init_weights(self.fc5)

    def _init_weights(self, module):
        """Initialize Linear/Embedding weights with N(0, 0.01) and reset LayerNorm."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=0.01)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=0.01)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, inputs):
        """
        Args:
            inputs (dict): Keyword arguments for the T5 encoder; must contain
                ``input_ids`` (batch on the first axis).

        Returns:
            torch.Tensor: Head output multiplied by 100, one row per input.
                NOTE(review): presumably a percentage yield, since the caller
                clips it to [0, 100] -- confirm.
        """
        input_ids = inputs['input_ids']
        encoder_outputs = self.model.encoder(**inputs)
        encoder_hidden_states = encoder_outputs[0]
        # One decoder step on the start token. The tensor must live on the same
        # device as the inputs, otherwise CUDA inference fails with a device
        # mismatch.
        decoder_input_ids = torch.full(
            (input_ids.size(0), 1),
            self.config.decoder_start_token_id,
            dtype=torch.long,
            device=input_ids.device,
        )
        outputs = self.model.decoder(
            input_ids=decoder_input_ids,
            encoder_hidden_states=encoder_hidden_states,
        )
        last_hidden_states = outputs[0]
        output1 = self.fc1(last_hidden_states.view(-1, self.config.hidden_size))
        output2 = self.fc2(encoder_hidden_states[:, 0, :].view(-1, self.config.hidden_size))
        output = self.fc3(torch.hstack((output1, output2)))
        output = self.fc4(output)
        output = self.fc5(output)
        return output*100
190
+
191
 
192
  if st.button('predict'):
193
  with st.spinner('Now processing. This process takes about 4 seconds per reaction.'):
194
 
195
 
196
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
197
+ CFG.device = device
198
 
 
 
 
 
 
 
 
 
199
  seed_everything(seed=CFG.seed)
200
 
201
  CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.model_name_or_path, return_tensors='pt')
202
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
203
 
204
+ model = ReactionT5Yield.from_pretrained(CFG.model_name_or_path)
205
+
206
  if CFG.uploaded_file is not None:
207
+ test_ds = pd.read_csv(CFG.data)
208
+ if "input" not in test_ds.columns:
209
+ test_ds = preprocess(test_ds, CFG)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
210
  else:
211
+ test_ds = pd.DataFrame.from_dict({"input": [CFG.data]}, orient="index").T
212
+
213
+ test_dataset = TestDataset(CFG, test_ds)
214
+ test_loader = DataLoader(
215
+ test_dataset,
216
+ batch_size=CFG.batch_size,
217
+ shuffle=False,
218
+ num_workers=CFG.num_workers,
219
+ pin_memory=True,
220
+ drop_last=False,
221
+ )
222
+
223
+
224
+ prediction = inference_fn(test_loader, model, CFG)
225
+
226
+ test_ds["prediction"] = prediction
227
+ test_ds["prediction"] = test_ds["prediction"].clip(0, 100)
228
+ csv = test_ds.to_csv(index=False)
229
+ st.download_button(
230
+ label="Download data as CSV",
231
+ data=csv,
232
+ file_name='output.csv',
233
+ mime='text/csv'
234
+ )