Add(last version: new angles + add TB-MCQ)
Browse files
README.md
CHANGED
@@ -6,23 +6,21 @@ tags:
|
|
6 |
- Torsional
|
7 |
- Angles
|
8 |
pipeline_tag: token-classification
|
|
|
|
|
9 |
---
|
10 |
# `RNA-TorsionBERT`
|
11 |
|
12 |
## Model Description
|
13 |
|
14 |
-
`RNA-TorsionBERT` is a
|
15 |
|
16 |
-
`RNA-TorsionBERT` is a DNABERT model that was pre-trained on ~4200 RNA structures
|
17 |
|
18 |
-
It provides
|
|
|
19 |
|
20 |
|
21 |
-
| Model | alpha | beta | gamma | delta | epsilon | zeta | chi | eta | theta |
|
22 |
-
|------------------|----------|-------|-------|-------|---------|-------|-------|-------|-------|
|
23 |
-
| **RNA-TorsionBERT** | 37.3 | 19.6 | 29.4 | 13.6 | 16.6 | 26.6 | 14.7 | 20.1 | 25.4 |
|
24 |
-
| SPOT-RNA-1D | 45.7 | 23 | 33.6 | 19 | 21.1 | 34.4 | 19.3 | 28.9 | 33.9 |
|
25 |
-
|
26 |
**Key Features**
|
27 |
* Torsional and Pseudo-torsional angles prediction
|
28 |
* Predicts angles for sequences of up to 512 nucleotides
|
@@ -49,38 +47,135 @@ output = model(inputs)["logits"]
|
|
49 |
```
|
50 |
|
51 |
- Please note that it was fine-tuned from a DNABERT-3 model and therefore the tokenizer is the same as the one used for DNABERT. Nucleotide `U` should therefore be replaced by `T` in the input sequence.
|
52 |
-
- The output is the sinus and the cosine for each angle. The angles are in the following order: `alpha`, `beta
|
53 |
|
54 |
To convert the predictions into angles, you can use the following code snippet:
|
55 |
|
56 |
```python
|
57 |
-
|
58 |
-
|
59 |
import numpy as np
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
""
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
- Torsional
|
7 |
- Angles
|
8 |
pipeline_tag: token-classification
|
9 |
+
base_model:
|
10 |
+
- zhihan1996/DNA_bert_3
|
11 |
---
|
12 |
# `RNA-TorsionBERT`
|
13 |
|
14 |
## Model Description
|
15 |
|
16 |
+
`RNA-TorsionBERT` is a BERT-based language model (86.9 MB of parameters) that predicts RNA torsional and pseudo-torsional angles from the sequence.
|
17 |
|
18 |
+
`RNA-TorsionBERT` is a DNABERT model that was pre-trained on ~4200 RNA structures.
|
19 |
|
20 |
+
It improves [MCQ](https://github.com/tzok/mcq4structures) over previous state-of-the-art models such as
|
21 |
+
[SPOT-RNA-1D](https://github.com/jaswindersingh2/SPOT-RNA-1D), and over angles inferred from existing methods, on the test set (composed of RNA-Puzzles and CASP-RNA).
|
22 |
|
23 |
|
|
|
|
|
|
|
|
|
|
|
24 |
**Key Features**
|
25 |
* Torsional and Pseudo-torsional angles prediction
|
26 |
* Predicts angles for sequences of up to 512 nucleotides
|
|
|
47 |
```
|
48 |
|
49 |
- Please note that it was fine-tuned from a DNABERT-3 model and therefore the tokenizer is the same as the one used for DNABERT. Nucleotide `U` should therefore be replaced by `T` in the input sequence.
|
50 |
+
- The output is the sine and the cosine for each angle. The angles are in the following order: `alpha`, `beta`, `gamma`, `delta`, `epsilon`, `zeta`, `chi`, `eta`, `theta`, `eta'`, `theta'`, `v0`, `v1`, `v2`, `v3`, `v4`.
|
51 |
|
52 |
To convert the predictions into angles, you can use the following code snippet:
|
53 |
|
54 |
```python
|
55 |
+
import transformers
|
56 |
+
from transformers import AutoModel, AutoTokenizer
|
57 |
import numpy as np
|
58 |
+
import pandas as pd
|
59 |
+
from typing import Optional, Dict
|
60 |
+
import os
|
61 |
+
|
62 |
+
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
63 |
+
|
64 |
+
transformers.logging.set_verbosity_error()
|
65 |
+
|
66 |
+
|
67 |
+
# Names of the predicted angles, in the column order used when the
# predictions are turned into a table (column i <-> BACKBONE[i]).
BACKBONE = [
    # torsional angles
    "alpha", "beta", "gamma", "delta", "epsilon", "zeta", "chi",
    # pseudo-torsional angles
    "eta", "theta", "eta'", "theta'",
    # v0..v4 (presumably the ribose ring torsions -- confirm)
    "v0", "v1", "v2", "v3", "v4",
]
|
85 |
+
|
86 |
+
|
87 |
+
class RNATorsionBERTHelper:
    """Convenience wrapper around the ``sayby/rna_torsionbert`` checkpoint.

    The tokenizer and the model are loaded once in ``__init__``; ``predict``
    then turns a raw RNA sequence into a table of angles (in degrees) with
    one row per 3-mer token and one column per name in ``BACKBONE``.
    """

    def __init__(self):
        self.model_name = "sayby/rna_torsionbert"
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.model_name, trust_remote_code=True
        )
        # Every input is padded / truncated to a fixed 512-token window.
        self.params_tokenizer = {
            "return_tensors": "pt",
            "padding": "max_length",
            "max_length": 512,
            "truncation": True,
        }
        self.model = AutoModel.from_pretrained(self.model_name, trust_remote_code=True)

    def predict(self, sequence: str) -> pd.DataFrame:
        """
        Predict the angles for a raw RNA sequence.
        :param sequence: RNA sequence (``U`` is converted to ``T`` internally)
        :return: a DataFrame with one column per angle of ``BACKBONE`` and one
            row per 3-mer, indexed by the nucleotide that starts the 3-mer
        """
        sequence_tok = self.convert_raw_sequence_to_k_mers(sequence)
        inputs = self.tokenizer(sequence_tok, **self.params_tokenizer)
        input_ids = inputs["input_ids"].cpu().detach().numpy()
        outputs = self.model(inputs)["logits"]
        angles = self.convert_sin_cos_to_angles(
            outputs.cpu().detach().numpy(), input_ids
        )
        output_angles = self.convert_logits_to_dict(angles[0, :], input_ids[0, :])
        # Row i is the 3-mer starting at nucleotide i, so the last two
        # nucleotides start no row. Slicing by the actual row count (instead of
        # a fixed [:-2]) keeps the labels aligned when the tokenizer truncated
        # the input, where a fixed slice makes pandas raise a length mismatch.
        output_angles.index = list(sequence)[: len(output_angles)]
        return output_angles

    def convert_raw_sequence_to_k_mers(self, sequence: str, k_mers: int = 3):
        """
        Convert a raw RNA sequence into a string readable by the tokenizer.
        It upper-cases the sequence, replaces U by T and splits it into
        overlapping k-mers separated by single spaces.
        :param sequence: raw RNA (or DNA) sequence
        :param k_mers: size of each k-mer
        :return: input readable by the tokenizer (empty string when the
            sequence is shorter than ``k_mers``)
        :raises ValueError: if ``k_mers`` is smaller than 1
        """
        if k_mers < 1:
            raise ValueError(f"k_mers must be a positive integer, got {k_mers}")
        sequence = sequence.upper().replace("U", "T")
        # Only start positions that leave room for a full k-mer are visited.
        return " ".join(
            sequence[start : start + k_mers]
            for start in range(len(sequence) - k_mers + 1)
        )

    def convert_sin_cos_to_angles(
        self, output: np.ndarray, input_ids: Optional[np.ndarray] = None
    ):
        """
        Convert the raw predictions of the RNA-TorsionBERT into angles.
        The last axis interleaves the two components of every angle: even
        positions are read as the cosine and odd positions as the sine. Each
        angle is recovered in degrees with:
            alpha = arctan2(sin(alpha), cos(alpha))
        :param output: array of raw predictions, indexed as
            (batch, token, 2 * number of angles)
        :param input_ids: the input_ids of the RNA-TorsionBERT. When given,
            positions whose id is 0, 2, 3 or 4 (treated as special tokens --
            presumably PAD/CLS/SEP/MASK, confirm against the tokenizer
            vocabulary) are set to NaN so that only the tokens of the sequence
            keep a value.
        :return: a np.ndarray with the angles for the sequence. The input
            array is left untouched.
        """
        if input_ids is not None:
            # Accept anything array-like (e.g. a CPU tensor) for the ids.
            input_ids = np.asarray(input_ids)
            # Mask on a copy so the caller's predictions are not overwritten.
            output = output.copy()
            output[np.isin(input_ids, (0, 2, 3, 4))] = np.nan
        sin, cos = output[:, :, 1::2], output[:, :, 0::2]
        return np.degrees(np.arctan2(sin, cos))

    def convert_logits_to_dict(
        self, output: np.ndarray, input_ids: np.ndarray
    ) -> pd.DataFrame:
        """
        Convert the angle predictions of one sequence into a table.
        It removes the special tokens and only keeps the predictions for the sequence.
        :param output: predictions from the model in angles, indexed as
            (token, angle)
        :param input_ids: input ids from the tokenizer for the same sequence
        :return: a DataFrame with one column per angle of ``BACKBONE``
        """
        # The sequence tokens sit strictly between the first id 2 and the
        # first id 3 (presumably the [CLS] / [SEP] markers -- confirm).
        index_start, index_end = (
            np.where(input_ids == 2)[0][0],
            np.where(input_ids == 3)[0][0],
        )
        output_non_pad = output[index_start + 1 : index_end, :]
        output_angles = {
            angle: output_non_pad[:, angle_index]
            for angle_index, angle in enumerate(BACKBONE)
        }
        return pd.DataFrame(output_angles)
|
174 |
+
|
175 |
+
|
176 |
+
if __name__ == "__main__":
    # Demo: predict the angles of a short example sequence and print the table.
    print(RNATorsionBERTHelper().predict("AGGGCUUUAGUCUUUGGAG"))
|
181 |
+
```
|