Spaces:

sagawa
/

ReactionT5_task_yield

Sleeping

App Files Files Community

sagawa committed on Aug 3, 2024

Commit

21bf012

verified ·

1 Parent(s): ea3027a

Update app.py

Browse files

Files changed (1) hide show

app.py +185 -163

app.py CHANGED Viewed

@@ -1,31 +1,23 @@
 import os
-import gc
-import random
-import itertools
 import warnings
 import logging
-warnings.filterwarnings('ignore')
-logging.disable(logging.WARNING)
 import numpy as np
 import pandas as pd
-from tqdm.auto import tqdm
-import tokenizers
-import transformers
-from transformers import AutoTokenizer, AutoConfig, AutoModel, T5EncoderModel, get_linear_schedule_with_warmup, AutoModelForSeq2SeqLM, T5ForConditionalGeneration
-import datasets
-from datasets import load_dataset, load_metric
-import argparse
 import torch
-import sentencepiece
 from torch.utils.data import Dataset, DataLoader
-import torch.nn.functional as F
-import torch.nn as nn
-import pickle
-import time
-from sklearn.preprocessing import MinMaxScaler
 from datasets.utils.logging import disable_progress_bar
-from sklearn.metrics import mean_squared_error, r2_score
 disable_progress_bar()
 import streamlit as st
 st.title('predictyield-t5')
@@ -52,161 +44,191 @@ class CFG():
     fc_dropout = 0.1
     seed = 42
     num_workers=1
 if st.button('predict'):
     with st.spinner('Now processing. This process takes about 4 seconds per reaction.'):
         device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-        def seed_everything(seed=42):
-            random.seed(seed)
-            os.environ['PYTHONHASHSEED'] = str(seed)
-            np.random.seed(seed)
-            torch.manual_seed(seed)
-            torch.cuda.manual_seed(seed)
-            torch.backends.cudnn.deterministic = True
         seed_everything(seed=CFG.seed)
         CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.model_name_or_path, return_tensors='pt')
-        def prepare_input(cfg, text):
-            inputs = cfg.tokenizer(text, add_special_tokens=True, max_length=CFG.max_len, padding='max_length', return_offsets_mapping=False, truncation=True, return_attention_mask=True)
-            for k, v in inputs.items():
-                inputs[k] = torch.tensor(v, dtype=torch.long)
-            return inputs
-        class TestDataset(Dataset):
-            def __init__(self, cfg, df):
-                self.cfg = cfg
-                self.inputs = df['input'].values
-            def __len__(self):
-                return len(self.inputs)
-            def __getitem__(self, item):
-                inputs = prepare_input(self.cfg, self.inputs[item])
-                return inputs
-        class RegressionModel(nn.Module):
-            def __init__(self, cfg, config_path=None, pretrained=False):
-                super().__init__()
-                self.cfg = cfg
-                if config_path is None:
-                    self.config = AutoConfig.from_pretrained(cfg.pretrained_model_name_or_path, output_hidden_states=True)
-                else:
-                    self.config = torch.load(config_path)
-                if pretrained:
-                    if 't5' in cfg.model:
-                        self.model = T5ForConditionalGeneration.from_pretrained(CFG.pretrained_model_name_or_path)
-                    else:
-                        self.model = AutoModel.from_pretrained(CFG.pretrained_model_name_or_path)
-                else:
-                    if 't5' in cfg.model:
-                        self.model = T5ForConditionalGeneration.from_pretrained('sagawa/ZINC-t5')
-                    else:
-                        self.model = AutoModel.from_config(self.config)
-                self.model.resize_token_embeddings(len(cfg.tokenizer))
-                self.fc_dropout1 = nn.Dropout(cfg.fc_dropout)
-                self.fc1 = nn.Linear(self.config.hidden_size, self.config.hidden_size//2)
-                self.fc_dropout2 = nn.Dropout(cfg.fc_dropout)
-                self.fc2 = nn.Linear(self.config.hidden_size, self.config.hidden_size//2)
-                self.fc3 = nn.Linear(self.config.hidden_size//2*2, self.config.hidden_size)
-                self.fc4 = nn.Linear(self.config.hidden_size, self.config.hidden_size)
-                self.fc5 = nn.Linear(self.config.hidden_size, 1)
-                self._init_weights(self.fc1)
-                self._init_weights(self.fc2)
-                self._init_weights(self.fc3)
-                self._init_weights(self.fc4)
-            def _init_weights(self, module):
-                if isinstance(module, nn.Linear):
-                    module.weight.data.normal_(mean=0.0, std=0.01)
-                    if module.bias is not None:
-                        module.bias.data.zero_()
-                elif isinstance(module, nn.Embedding):
-                    module.weight.data.normal_(mean=0.0, std=0.01)
-                    if module.padding_idx is not None:
-                        module.weight.data[module.padding_idx].zero_()
-                elif isinstance(module, nn.LayerNorm):
-                    module.bias.data.zero_()
-                    module.weight.data.fill_(1.0)
-            def forward(self, inputs):
-                encoder_outputs = self.model.encoder(**inputs)
-                encoder_hidden_states = encoder_outputs[0]
-                outputs = self.model.decoder(input_ids=torch.full((inputs['input_ids'].size(0),1),
-                                                    self.config.decoder_start_token_id,
-                                                    dtype=torch.long,
-                                                    device=device), encoder_hidden_states=encoder_hidden_states)
-                last_hidden_states = outputs[0]
-                output1 = self.fc1(self.fc_dropout1(last_hidden_states).view(-1, self.config.hidden_size))
-                output2 = self.fc2(encoder_hidden_states[:, 0, :].view(-1, self.config.hidden_size))
-                output = self.fc3(self.fc_dropout2(torch.hstack((output1, output2))))
-                output = self.fc4(output)
-                output = self.fc5(output)
-                return output
-        def inference_fn(test_loader, model, device):
-            preds = []
-            model.eval()
-            model.to(device)
-            tk0 = enumerate(test_loader)
-            for i, inputs in tk0:
-                for k, v in inputs.items():
-                    inputs[k] = v.to(device)
-                with torch.no_grad():
-                    y_preds = model(inputs)
-                preds.append(y_preds.to('cpu').numpy())
-            predictions = np.concatenate(preds)
-            return predictions
-        model = RegressionModel(CFG, config_path=CFG.model_name_or_path + '/config.pth', pretrained=False)
-        state = torch.load(CFG.model_name_or_path + '/ZINC-t5_best.pth', map_location=torch.device('cpu'))
-        model.load_state_dict(state)
         if CFG.uploaded_file is not None:
-            test_ds = pd.read_csv(CFG.uploaded_file)
-            test_dataset = TestDataset(CFG, test_ds)
-            test_loader = DataLoader(test_dataset,
-                                     batch_size=CFG.batch_size,
-                                     shuffle=False,
-                                     num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
-            prediction = inference_fn(test_loader, model, device)
-            test_ds['prediction'] = prediction*100
-            test_ds['prediction'] = test_ds['prediction'].clip(0, 100)
-            csv = test_ds.to_csv(index=False)
-            st.download_button(
-                label="Download data as CSV",
-                data=csv,
-                file_name='output.csv',
-                mime='text/csv'
-            )
         else:
-            CFG.batch_size=1
-            test_ds = pd.DataFrame.from_dict({'input': CFG.data}, orient='index').T
-            test_dataset = TestDataset(CFG, test_ds)
-            test_loader = DataLoader(test_dataset,
-                                     batch_size=CFG.batch_size,
-                                     shuffle=False,
-                                     num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
-            prediction = inference_fn(test_loader, model, device)
-            prediction = max(min(prediction[0][0]*100, 100), 0)
-            st.text('yiled: '+ str(prediction))

 import os
 import warnings
 import logging
+import random
 import numpy as np
+import torch.nn as nn
+from transformers import AutoConfig, PreTrainedModel, T5ForConditionalGeneration
 import pandas as pd
 import torch
 from torch.utils.data import Dataset, DataLoader
+from transformers import AutoTokenizer
 from datasets.utils.logging import disable_progress_bar
+# Suppress warnings and logging
+warnings.filterwarnings("ignore")
+logging.disable(logging.WARNING)
 disable_progress_bar()
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
 import streamlit as st
 st.title('predictyield-t5')
     fc_dropout = 0.1
     seed = 42
     num_workers=1
def seed_everything(seed=42):
    """
    Seed every random number generator used by the app.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and every
    visible CUDA device), exports ``PYTHONHASHSEED`` and configures cuDNN
    for deterministic behaviour.

    Args:
        seed (int): Seed value shared by all generators. Defaults to 42.
    """
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # manual_seed_all covers every GPU, not only the current one; it is
    # silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode may pick a different convolution algorithm per run,
    # which would defeat the deterministic flag above.
    torch.backends.cudnn.benchmark = False
def prepare_input(cfg, text):
    """
    Tokenize one input string into fixed-length tensors.

    Args:
        cfg: Configuration object exposing ``tokenizer`` (a callable
            tokenizer) and ``max_len`` (the padded/truncated length).
        text (str): Input text.

    Returns:
        dict: Every key returned by the tokenizer, mapped to a
        ``torch.long`` tensor of its values.
    """
    encoded = cfg.tokenizer(
        text,
        add_special_tokens=True,
        max_length=cfg.max_len,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
    )
    return {
        key: torch.tensor(values, dtype=torch.long)
        for key, values in encoded.items()
    }
def inference_fn(test_loader, model, cfg):
    """
    Run the model over every batch of a loader and collect the outputs.

    Args:
        test_loader (Iterable[dict]): Yields batches as dicts of tensors
            (e.g. a ``DataLoader`` over ``TestDataset``).
        model (nn.Module): Model called as ``model(batch_dict)``.
        cfg: Configuration object exposing ``device``.

    Returns:
        np.ndarray: Per-batch outputs concatenated along the first axis.
        An empty loader yields an empty ``float32`` array of shape ``(0,)``
        instead of raising.
    """
    model.eval()
    model.to(cfg.device)
    preds = []
    # A single no_grad context for the whole loop; gradients are never needed.
    with torch.no_grad():
        for inputs in test_loader:
            inputs = {k: v.to(cfg.device) for k, v in inputs.items()}
            preds.append(model(inputs).to("cpu").numpy())
    if not preds:
        # np.concatenate raises ValueError on an empty list.
        return np.empty((0,), dtype=np.float32)
    return np.concatenate(preds)
def preprocess(df, cfg=None):
    """
    Build the model's ``input`` column from the reaction component columns.

    The result has the form ``REACTANT:<r>REAGENT:<g>PRODUCT:<p>``. The
    DataFrame is modified in place and also returned.

    Args:
        df (pd.DataFrame): Must contain ``REACTANT``, ``REAGENT`` and
            ``PRODUCT`` columns.
        cfg: Unused. Accepted (and optional) so that both
            ``preprocess(df)`` and ``preprocess(df, cfg)`` call styles work.

    Returns:
        pd.DataFrame: ``df`` with the added ``input`` column.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    def _as_text(column):
        # A missing cell would otherwise turn the whole concatenation into
        # NaN and crash the tokenizer later.
        # NOTE(review): missing values are mapped to "" -- confirm this
        # matches the convention used when the model was trained.
        return pd.Series(
            ["" if pd.isna(value) else str(value) for value in df[column]],
            index=df.index,
            dtype=object,
        )

    df["input"] = (
        "REACTANT:"
        + _as_text("REACTANT")
        + "REAGENT:"
        + _as_text("REAGENT")
        + "PRODUCT:"
        + _as_text("PRODUCT")
    )
    return df
class TestDataset(Dataset):
    """
    Inference-time dataset over the ``input`` column of a DataFrame.

    Each item is tokenized lazily via ``prepare_input`` when accessed.

    Args:
        cfg: Configuration object forwarded to ``prepare_input``.
        df (pd.DataFrame): Must contain an ``input`` column of strings.
    """

    def __init__(self, cfg, df):
        self.cfg = cfg
        self.inputs = df["input"].to_numpy()

    def __len__(self):
        """Return the number of input rows."""
        return len(self.inputs)

    def __getitem__(self, item):
        """Return the tokenized tensors for the row at position ``item``."""
        return prepare_input(self.cfg, self.inputs[item])
class ReactionT5Yield(PreTrainedModel):
    """
    T5 encoder/decoder with a small regression head producing one scalar.

    The head combines the decoder's hidden state for a single start token
    with the encoder's hidden state at the first position, and the final
    value is multiplied by 100 (presumably a yield percentage; the caller
    clips it to [0, 100]).
    """

    config_class = AutoConfig

    def __init__(self, config):
        super().__init__(config)
        self.config = config
        self.model = T5ForConditionalGeneration.from_pretrained(self.config._name_or_path)
        self.model.resize_token_embeddings(self.config.vocab_size)
        hidden = self.config.hidden_size
        self.fc1 = nn.Linear(hidden, hidden // 2)
        self.fc2 = nn.Linear(hidden, hidden // 2)
        # fc3 consumes the concatenation of the fc1 and fc2 outputs.
        self.fc3 = nn.Linear(hidden // 2 * 2, hidden)
        self.fc4 = nn.Linear(hidden, hidden)
        self.fc5 = nn.Linear(hidden, 1)

        for layer in (self.fc1, self.fc2, self.fc3, self.fc4, self.fc5):
            self._init_weights(layer)

    def _init_weights(self, module):
        """Initialise Linear/Embedding weights ~ N(0, 0.01) and reset LayerNorm."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=0.01)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=0.01)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, inputs):
        """
        Compute one regression value per input sequence.

        Args:
            inputs (dict): Keyword arguments for the T5 encoder; must
                include ``input_ids``.

        Returns:
            torch.Tensor: Output of the regression head multiplied by 100,
            with one row per input sequence.
        """
        input_ids = inputs["input_ids"]
        encoder_outputs = self.model.encoder(**inputs)
        encoder_hidden_states = encoder_outputs[0]
        # Create the start tokens on the same device as the inputs; a
        # CPU tensor here fails once the model has been moved to CUDA.
        decoder_input_ids = torch.full(
            (input_ids.size(0), 1),
            self.config.decoder_start_token_id,
            dtype=torch.long,
            device=input_ids.device,
        )
        outputs = self.model.decoder(
            input_ids=decoder_input_ids,
            encoder_hidden_states=encoder_hidden_states,
        )
        last_hidden_states = outputs[0]
        output1 = self.fc1(last_hidden_states.view(-1, self.config.hidden_size))
        output2 = self.fc2(encoder_hidden_states[:, 0, :].view(-1, self.config.hidden_size))
        output = self.fc3(torch.hstack((output1, output2)))
        output = self.fc4(output)
        output = self.fc5(output)
        return output * 100
 if st.button('predict'):
     with st.spinner('Now processing. This process takes about 4 seconds per reaction.'):
         device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        CFG.device = device
         seed_everything(seed=CFG.seed)
         CFG.tokenizer = AutoTokenizer.from_pretrained(CFG.model_name_or_path, return_tensors='pt')
+        model = ReactionT5Yield.from_pretrained(CFG.model_name_or_path)
         if CFG.uploaded_file is not None:
+            test_ds = pd.read_csv(CFG.data)
+            if "input" not in test_ds.columns:
+                test_ds = preprocess(test_ds, CFG)
         else:
+            test_ds = pd.DataFrame.from_dict({"input": [CFG.data]}, orient="index").T
+        test_dataset = TestDataset(CFG, test_ds)
+        test_loader = DataLoader(
+            test_dataset,
+            batch_size=CFG.batch_size,
+            shuffle=False,
+            num_workers=CFG.num_workers,
+            pin_memory=True,
+            drop_last=False,
+        )
+        prediction = inference_fn(test_loader, model, CFG)
+        test_ds["prediction"] = prediction
+        test_ds["prediction"] = test_ds["prediction"].clip(0, 100)
+        csv = test_ds.to_csv(index=False)
+        st.download_button(
+            label="Download data as CSV",
+            data=csv,
+            file_name='output.csv',
+            mime='text/csv'
+        )