Add ruff, run ruff and black
- hexviz/attention.py +20 -37
- hexviz/ec_number.py +1 -3
- hexviz/models.py +1 -3
- hexviz/pages/1_🗺️Identify_Interesting_Heads.py +7 -10
- hexviz/pages/2_📄Documentation.py +42 -19
- hexviz/plot.py +4 -12
- hexviz/view.py +9 -8
- hexviz/🧬Attention_Visualization.py +19 -40
- poetry.lock +10 -1
- pyproject.toml +7 -0
- tests/test_attention.py +22 -15
- tests/test_models.py +1 -2
hexviz/attention.py
CHANGED
@@ -68,18 +68,14 @@ def res_to_1letter(residues: list[Residue]) -> str:
     Residues not in the standard 20 amino acids are replaced with X
     """
     res_names = [residue.get_resname() for residue in residues]
-    residues_single_letter = map(
-        lambda x: Polypeptide.protein_letters_3to1.get(x, "X"), res_names
-    )
+    residues_single_letter = map(lambda x: Polypeptide.protein_letters_3to1.get(x, "X"), res_names)
 
     return "".join(list(residues_single_letter))
 
 
 def clean_and_validate_sequence(sequence: str) -> tuple[str, str | None]:
     lines = sequence.split("\n")
-    cleaned_sequence = "".join(
-        line.upper() for line in lines if not line.startswith(">")
-    )
+    cleaned_sequence = "".join(line.upper() for line in lines if not line.startswith(">"))
     cleaned_sequence = cleaned_sequence.replace(" ", "")
     valid_residues = set(Polypeptide.protein_letters_3to1.values())
     residues_in_sequence = set(cleaned_sequence)

@@ -87,7 +83,9 @@ def clean_and_validate_sequence(sequence: str) -> tuple[str, str | None]:
     # Check if the sequence exceeds the max allowed length
     max_sequence_length = 400
     if len(cleaned_sequence) > max_sequence_length:
-        error_message = f"Sequence exceeds the max allowed length of {max_sequence_length} characters"
+        error_message = (
+            f"Sequence exceeds the max allowed length of {max_sequence_length} characters"
+        )
         return cleaned_sequence, error_message
 
     illegal_residues = residues_in_sequence - valid_residues

@@ -103,9 +101,7 @@ def remove_special_tokens_and_periods(attentions_tuple, sequence, tokenizer):
     tokens = tokenizer.tokenize(sequence)
 
     indices_to_remove = [
-        i
-        for i, token in enumerate(tokens)
-        if token in {".", "<sep>", "<start>", "<end>", "<pad>"}
+        i for i, token in enumerate(tokens) if token in {".", "<sep>", "<start>", "<end>", "<pad>"}
     ]
 
     new_attentions = []

@@ -113,9 +109,7 @@ def remove_special_tokens_and_periods(attentions_tuple, sequence, tokenizer):
     for attentions in attentions_tuple:
         # Remove rows and columns corresponding to special tokens and periods
         for idx in sorted(indices_to_remove, reverse=True):
-            attentions = torch.cat(
-                (attentions[:, :, :idx], attentions[:, :, idx + 1 :]), dim=2
-            )
+            attentions = torch.cat((attentions[:, :, :idx], attentions[:, :, idx + 1 :]), dim=2)
             attentions = torch.cat(
                 (attentions[:, :, :, :idx], attentions[:, :, :, idx + 1 :]), dim=3
             )

@@ -131,7 +125,7 @@ def get_attention(
     sequence: str,
     model_type: ModelType = ModelType.TAPE_BERT,
     remove_special_tokens: bool = True,
-    ec_number:
+    ec_number: str = None,
 ):
     """
     Returns a tensor of shape [n_layers, n_heads, n_res, n_res] with attention weights

@@ -153,24 +147,18 @@ def get_attention(
         tokenizer, model = get_zymctrl()
 
         if ec_number:
-            sequence = f"{
+            sequence = f"{ec_number}<sep><start>{sequence}<end><pad>"
 
         inputs = tokenizer(sequence, return_tensors="pt").input_ids.to(device)
-        attention_mask = tokenizer(sequence, return_tensors="pt").attention_mask.to(
-            device
-        )
+        attention_mask = tokenizer(sequence, return_tensors="pt").attention_mask.to(device)
 
         with torch.no_grad():
-            outputs = model(
-                inputs, attention_mask=attention_mask, output_attentions=True
-            )
+            outputs = model(inputs, attention_mask=attention_mask, output_attentions=True)
         attentions = outputs.attentions
 
         if ec_number:
             # Remove attention to special tokens and periods separating EC number components
-            attentions = remove_special_tokens_and_periods(
-                attentions, sequence, tokenizer
-            )
+            attentions = remove_special_tokens_and_periods(attentions, sequence, tokenizer)
 
         # torch.Size([1, n_heads, n_res, n_res]) -> torch.Size([n_heads, n_res, n_res])
         attention_squeezed = [torch.squeeze(attention) for attention in attentions]

@@ -196,9 +184,7 @@ def get_attention(
         token_idxs = tokenizer.encode(sequence_separated)
         inputs = torch.tensor(token_idxs).unsqueeze(0).to(device)
         with torch.no_grad():
-            attentions = model(inputs, output_attentions=True)[
-                -1
-            ]  # Do you need an attention mask?
+            attentions = model(inputs, output_attentions=True)[-1]  # Do you need an attention mask?
 
         if remove_special_tokens:
             # Remove attention to </s> (last) token

@@ -262,17 +248,16 @@ def get_attention_pairs(
     top_residues = []
 
     ec_tag_length = 4
+
+    def is_tag(x):
+        return x < ec_tag_length
 
     for i, chain in enumerate(chains):
         ec_number = ec_numbers[i] if ec_numbers else None
+        ec_string = ".".join([ec.number for ec in ec_number]) if ec_number else ""
         sequence = res_to_1letter(chain)
-        attention = get_attention(
-        )
-        attention_unidirectional = unidirectional_avg_filtered(
-            attention, layer, head, threshold
-        )
+        attention = get_attention(sequence=sequence, model_type=model_type, ec_number=ec_string)
+        attention_unidirectional = unidirectional_avg_filtered(attention, layer, head, threshold)
 
         # Store sum of attention in to a resiue (from the unidirectional attention)
         residue_attention = {}

@@ -305,9 +290,7 @@ def get_attention_pairs(
                 residue_attention.get(res - ec_tag_length, 0) + attn_value
             )
 
-        top_n_residues = sorted(
-            residue_attention.items(), key=lambda x: x[1], reverse=True
-        )[:top_n]
+        top_n_residues = sorted(residue_attention.items(), key=lambda x: x[1], reverse=True)[:top_n]
 
         for res, attn_sum in top_n_residues:
             coord = chain[res]["CA"].coord.tolist()
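The reflowed pruning logic in remove_special_tokens_and_periods is easier to see in isolation. Below is a minimal, self-contained sketch (not part of the commit) of that step: for every token index flagged as a period or one of the ZymCTRL special tokens, the matching row (dim 2) and column (dim 3) are cut out of each [1, n_heads, n_tokens, n_tokens] attention tensor. The helper name prune_token_indices and the toy tensor are illustrative only; in the repository the indices come from tokenizer.tokenize and the set {".", "<sep>", "<start>", "<end>", "<pad>"}.

import torch


def prune_token_indices(attentions_tuple, indices_to_remove):
    """Drop the rows and columns for the given token indices from each attention tensor."""
    new_attentions = []
    for attentions in attentions_tuple:
        for idx in sorted(indices_to_remove, reverse=True):
            # Remove the row (dim=2), then the column (dim=3), for this token index
            attentions = torch.cat((attentions[:, :, :idx], attentions[:, :, idx + 1 :]), dim=2)
            attentions = torch.cat((attentions[:, :, :, :idx], attentions[:, :, :, idx + 1 :]), dim=3)
        new_attentions.append(attentions)
    return tuple(new_attentions)


# Toy check: one layer, 2 heads, 6 tokens; dropping indices 0 and 5 leaves a 4x4 attention map.
dummy = (torch.rand(1, 2, 6, 6),)
pruned = prune_token_indices(dummy, [0, 5])
assert pruned[0].shape == torch.Size([1, 2, 4, 4])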
hexviz/ec_number.py
CHANGED
@@ -6,6 +6,4 @@ class ECNumber:
         self.radius = radius
 
     def __str__(self):
-        return (
-            f"(EC: {self.number}, Coordinate: {self.coordinate}, Color: {self.color})"
-        )
+        return f"(EC: {self.number}, Coordinate: {self.coordinate}, Color: {self.color})"
hexviz/models.py
CHANGED
@@ -60,7 +60,5 @@ def get_prot_t5():
     tokenizer = T5Tokenizer.from_pretrained(
         "Rostlab/prot_t5_xl_half_uniref50-enc", do_lower_case=False
     )
-    model = T5EncoderModel.from_pretrained("Rostlab/prot_t5_xl_half_uniref50-enc").to(
-        device
-    )
+    model = T5EncoderModel.from_pretrained("Rostlab/prot_t5_xl_half_uniref50-enc").to(device)
     return tokenizer, model
hexviz/pages/1_🗺️Identify_Interesting_Heads.py
CHANGED
@@ -27,14 +27,10 @@ models = [
     Model(name=ModelType.PROT_T5, layers=24, heads=32),
 ]
 
-with st.expander(
-    "Input a PDB id, upload a PDB file or input a sequence", expanded=True
-):
+with st.expander("Input a PDB id, upload a PDB file or input a sequence", expanded=True):
     pdb_id = select_pdb()
     uploaded_file = st.file_uploader("2.Upload PDB", type=["pdb"])
-    input_sequence = st.text_area(
-        "3.Input sequence", "", key="input_sequence", max_chars=400
-    )
+    input_sequence = st.text_area("3.Input sequence", "", key="input_sequence", max_chars=400)
     sequence, error = clean_and_validate_sequence(input_sequence)
     if error:
         st.error(error)

@@ -65,7 +61,9 @@ truncated_sequence = sequence[slice_start - 1 : slice_end]
 layer_sequence, head_sequence = select_heads_and_layers(st.sidebar, selected_model)
 
 st.markdown(
-    f"Each tile is a heatmap of attention for a section of the {source} chain
+    f"""Each tile is a heatmap of attention for a section of the {source} chain
+    ({chain_selection}) from residue {slice_start} to {slice_end}. Adjust the
+    section length and starting point in the sidebar."""
 )
 
 # TODO: Decide if you should get attention for the full sequence or just the truncated sequence

@@ -74,11 +72,10 @@ attention = get_attention(
     sequence=truncated_sequence,
     model_type=selected_model.name,
     remove_special_tokens=True,
+    ec_number=ec_number,
 )
 
-fig = plot_tiled_heatmap(
-    attention, layer_sequence=layer_sequence, head_sequence=head_sequence
-)
+fig = plot_tiled_heatmap(attention, layer_sequence=layer_sequence, head_sequence=head_sequence)
 
 
 st.pyplot(fig)
hexviz/pages/2_📄Documentation.py
CHANGED
@@ -5,42 +5,65 @@ from hexviz.config import URL
 st.markdown(
     f"""
 ## Protein language models
+There has been an explosion of capabilities in natural language processing
+models in the last few years. These architectural advances from NLP have proven
+to work very well for protein sequences, and we now have protein language models
+(pLMs) that can generate novel functional proteins sequences
+[ProtGPT2](https://www.nature.com/articles/s42256-022-00499-z) and auto-encoding
+models that excel at capturing biophysical features of protein sequences
+[ProtTrans](https://www.biorxiv.org/content/10.1101/2020.07.12.199554v3).
 
+For an introduction to protein language models for protein design check out
+[Controllable protein design with language
+models](https://www.nature.com/articles/s42256-022-00499-z).
 
 ## Interpreting protein language models by visualizing attention patterns
+With these impressive capabilities it is natural to ask what protein language
+models are learning and how they work -- we want to **interpret** the models.
+In natural language processing **attention analysis** has proven to be a useful
+tool for interpreting transformer model internals see fex ([Abnar et al.
+2020](https://arxiv.org/abs/2005.00928v2)). [BERTology meets
+biology](https://arxiv.org/abs/2006.15222) provides a thorough introduction to
+how we can analyze Transformer protein models through the lens of attention,
+they show exciting findings such as: > Attention: (1) captures the folding
+structure of proteins, connecting amino acids that are far apart in the
+underlying sequence, but spatially close in the three-dimensional structure, (2)
+targets binding sites, a key functional component of proteins, and (3) focuses
+on progressively more complex biophysical properties with increasing layer depth
 
+Most existing tools for analyzing and visualizing attention patterns focus on
+models trained on text. It can be hard to analyze protein sequences using these
+tools as sequences can be long and we lack intuition about how the language of
+proteins work. BERTology meets biology shows visualizing attention patterns in
+the context of protein structure can facilitate novel discoveries about what
+models learn. [**Hexviz**](https://huggingface.co/spaces/aksell/hexviz) is a
+tool to simplify analyzing attention patterns in the context of protein
+structure. We hope this can enable domain experts to explore and interpret the
+knowledge contained in pLMs.
 
 ## How to use Hexviz
 There are two views:
 1. <a href="{URL}Attention_Visualization" target="_self">🧬Attention Visualization</a> Shows attention weights from a single head as red bars between residues on a protein structure.
 2. <a href="{URL}Identify_Interesting_Heads" target="_self">🗺️Identify Interesting Heads</a> Plots attention weights between residues as a heatmap for each head in the model.
 
+The first view is the meat of the application and is where you can investigate
+how attention patterns map onto the structure of a protein you're interested in.
+Use the second view to narrow down to a few heads that you want to investigate
+attention patterns from in detail. pLM are large and can have many heads, as an
+example ProtBERT with it's 30 layers and 16 heads has 480 heads, so we need a
+way to identify heads with patterns we're interested in.
 
+The second view is a customizable heatmap plot of attention between residue for
+all heads and layers in a model. From here it is possible to identify heads that
+specialize in a particular attention pattern, such as:
 1. Vertical lines: Paying attention so a single or a few residues
 2. Diagonal: Attention to the same residue or residues in front or behind the current residue.
 3. Block attention: Attention is segmented so parts of the sequence are attended to by one part of the sequence.
 4. Heterogeneous: More complex attention patterns that are not easily categorized.
 TODO: Add examples of attention patterns
 
+Read more about attention patterns in fex [Revealing the dark secrets of
+BERT](https://arxiv.org/abs/1908.08593).
 
 ## Protein Language models in Hexviz
 Hexviz currently supports the following models:
hexviz/plot.py
CHANGED
@@ -15,30 +15,22 @@ def plot_tiled_heatmap(tensor, layer_sequence: List[int], head_sequence: List[int]):
 
     x_size = num_heads * 2
     y_size = num_layers * 2
-    fig, axes = plt.subplots(
-        num_layers, num_heads, figsize=(x_size, y_size), squeeze=False
-    )
+    fig, axes = plt.subplots(num_layers, num_heads, figsize=(x_size, y_size), squeeze=False)
     for i in range(num_layers):
         for j in range(num_heads):
-            axes[i, j].imshow(
-                tensor[i, j].detach().numpy(), cmap="viridis", aspect="equal"
-            )
+            axes[i, j].imshow(tensor[i, j].detach().numpy(), cmap="viridis", aspect="equal")
             axes[i, j].axis("off")
 
             # Enumerate the axes
             if i == 0:
-                axes[i, j].set_title(
-                    f"Head {head_sequence[j] + 1}", fontsize=10, y=1.05
-                )
+                axes[i, j].set_title(f"Head {head_sequence[j] + 1}", fontsize=10, y=1.05)
 
     # Calculate the row label offset based on the number of columns
     offset = 0.02 + (12 - num_heads) * 0.0015
     for i, ax_row in enumerate(axes):
         row_label = f"{layer_sequence[i]+1}"
         row_pos = ax_row[num_heads - 1].get_position()
-        fig.text(
-            row_pos.x1 + offset, (row_pos.y1 + row_pos.y0) / 2, row_label, va="center"
-        )
+        fig.text(row_pos.x1 + offset, (row_pos.y1 + row_pos.y0) / 2, row_label, va="center")
 
     plt.subplots_adjust(wspace=0.1, hspace=0.1)
     return fig
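For reference, a hypothetical call that mirrors how the Identify Interesting Heads page drives the reformatted plot_tiled_heatmap: as on that page, the full attention tensor is passed together with zero-based layer and head index lists (the torch.rand tensor below is a stand-in for a real get_attention result of shape [n_layers, n_heads, n_res, n_res], and the output filename is made up).

import torch
from hexviz.plot import plot_tiled_heatmap

# Stand-in for get_attention(...) output: 12 layers, 12 heads, 50x50 residue attention maps
attention = torch.rand(12, 12, 50, 50)

# Tiles for two layers and three heads; titles show "Head 1", "Head 2", ... because of the +1 offset
fig = plot_tiled_heatmap(attention, layer_sequence=[0, 5], head_sequence=[0, 1, 2])
fig.savefig("tiled_heads.png")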
hexviz/view.py
CHANGED
@@ -18,11 +18,7 @@ def get_selecte_model_index(models):
         return 0
     else:
         return next(
-            (
-                i
-                for i, model in enumerate(models)
-                if model.name.value == selected_model_name
-            ),
+            (i for i, model in enumerate(models) if model.name.value == selected_model_name),
             None,
         )
 

@@ -89,10 +85,10 @@ def select_protein(pdb_code, uploaded_file, input_sequence):
         pdb_str = get_pdb_from_seq(str(input_sequence))
         if "selected_chains" in st.session_state:
             del st.session_state.selected_chains
-        source =
+        source = "Input sequence + ESM-fold"
     elif "uploaded_pdb_str" in st.session_state:
         pdb_str = st.session_state.uploaded_pdb_str
-        source =
+        source = "Uploaded file stored in cache"
     else:
         file = get_pdb_file(pdb_code)
         pdb_str = file.read()

@@ -135,7 +131,12 @@ def select_heads_and_layers(sidebar, model):
 
 
 def select_sequence_slice(sequence_length):
-    st.sidebar.markdown(
+    st.sidebar.markdown(
+        """
+        Sequence segment to plot
+        ---
+        """
+    )
     if "sequence_slice" not in st.session_state:
         st.session_state.sequence_slice = (1, min(50, sequence_length))
     slice = st.sidebar.slider(
hexviz/🧬Attention_Visualization.py
CHANGED
@@ -31,14 +31,10 @@ models = [
     Model(name=ModelType.PROT_T5, layers=24, heads=32),
 ]
 
-with st.expander(
-):
-    pdb_id = select_pdb()
+with st.expander("Input a PDB id, upload a PDB file or input a sequence", expanded=True):
+    pdb_id = select_pdb() or "2WK4"
     uploaded_file = st.file_uploader("2.Upload PDB", type=["pdb"])
-    input_sequence = st.text_area(
-        "3.Input sequence", "", key="input_sequence", max_chars=400
-    )
+    input_sequence = st.text_area("3.Input sequence", "", key="input_sequence", max_chars=400)
     sequence, error = clean_and_validate_sequence(input_sequence)
     if error:
         st.error(error)

@@ -59,9 +55,7 @@ selected_chains = st.sidebar.multiselect(
     label="Select Chain(s)", options=chains, key="selected_chains"
 )
 
-show_ligands = st.sidebar.checkbox(
-    "Show ligands", value=st.session_state.get("show_ligands", True)
-)
+show_ligands = st.sidebar.checkbox("Show ligands", value=st.session_state.get("show_ligands", True))
 st.session_state.show_ligands = show_ligands
 
 

@@ -71,9 +65,7 @@ st.sidebar.markdown(
     ---
     """
 )
-min_attn = st.sidebar.slider(
-    "Minimum attention", min_value=0.0, max_value=0.4, value=0.1
-)
+min_attn = st.sidebar.slider("Minimum attention", min_value=0.0, max_value=0.4, value=0.1)
 n_highest_resis = st.sidebar.number_input(
     "Num highest attention resis to label", value=2, min_value=1, max_value=100
 )

@@ -84,9 +76,7 @@ sidechain_highest = st.sidebar.checkbox("Show sidechains", value=True)
 
 with st.sidebar.expander("Label residues manually"):
     hl_chain = st.selectbox(label="Chain to label", options=selected_chains, index=0)
-    hl_resi_list = st.multiselect(
-        label="Selected Residues", options=list(range(1, 5000))
-    )
+    hl_resi_list = st.multiselect(label="Selected Residues", options=list(range(1, 5000)))
 
     label_resi = st.checkbox(label="Label Residues", value=True)
 

@@ -97,10 +87,13 @@ with left:
 with mid:
     if "selected_layer" not in st.session_state:
         st.session_state["selected_layer"] = 5
-    layer_one =
+    layer_one = (
+        st.selectbox(
+            "Layer",
+            options=[i for i in range(1, selected_model.layers + 1)],
+            key="selected_layer",
+        )
+        or 5
     )
     layer = layer_one - 1
 with right:

@@ -135,9 +128,7 @@ if selected_model.name == ModelType.ZymCTRL:
 
     if ec_number:
         if selected_chains:
-            shown_chains = [
-                ch for ch in structure.get_chains() if ch.id in selected_chains
-            ]
+            shown_chains = [ch for ch in structure.get_chains() if ch.id in selected_chains]
         else:
             shown_chains = list(structure.get_chains())
 

@@ -163,14 +154,9 @@ if selected_model.name == ModelType.ZymCTRL:
         reverse_vector = [-v for v in vector]
 
         # Normalize the reverse vector
-        reverse_vector_normalized = np.array(reverse_vector) / np.linalg.norm(
-            reverse_vector
-        )
+        reverse_vector_normalized = np.array(reverse_vector) / np.linalg.norm(reverse_vector)
         coordinates = [
-            [
-                res_1[j] + i * 2 * radius * reverse_vector_normalized[j]
-                for j in range(3)
-            ]
+            [res_1[j] + i * 2 * radius * reverse_vector_normalized[j] for j in range(3)]
             for i in range(4)
         ]
         EC_tag = [

@@ -213,9 +199,7 @@ def get_3dview(pdb):
     for chain in hidden_chains:
         xyzview.setStyle({"chain": chain}, {"cross": {"hidden": "true"}})
         # Hide ligands for chain too
-        xyzview.addStyle(
-            {"chain": chain, "hetflag": True}, {"cross": {"hidden": "true"}}
-        )
+        xyzview.addStyle({"chain": chain, "hetflag": True}, {"cross": {"hidden": "true"}})
 
     if len(selected_chains) == 1:
         xyzview.zoomTo({"chain": f"{selected_chains[0]}"})

@@ -257,7 +241,6 @@ def get_3dview(pdb):
     for _, _, chain, res in top_residues:
         one_indexed_res = res + 1
         xyzview.addResLabels(
-
             {"chain": chain, "resi": one_indexed_res},
             {
                 "backgroundColor": "lightgray",

@@ -266,9 +249,7 @@ def get_3dview(pdb):
             },
         )
         if sidechain_highest:
-            xyzview.addStyle(
-                {"chain": chain, "resi": res}, {"stick": {"radius": 0.2}}
-            )
+            xyzview.addStyle({"chain": chain, "resi": res}, {"stick": {"radius": 0.2}})
     return xyzview
 
 

@@ -282,9 +263,7 @@ Pick a PDB ID, layer and head to visualize attention from the selected protein language model
     unsafe_allow_html=True,
 )
 
-chain_dict = {
-    f"{chain.id}": list(chain.get_residues()) for chain in list(structure.get_chains())
-}
+chain_dict = {f"{chain.id}": list(chain.get_residues()) for chain in list(structure.get_chains())}
 data = []
 for att_weight, _, chain, resi in top_residues:
     try:
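The EC-tag placement math in this file is compact after the reflow, so here is a small stand-alone sketch of what the reconstructed lines compute (the values for res_1, vector, and radius are made up for illustration): the four EC-number markers are laid out from the first residue's position along the reversed, normalized direction vector, spaced one sphere diameter (2 * radius) apart.

import numpy as np

res_1 = np.array([10.0, 4.0, -2.0])  # e.g. CA coordinate of the first residue
vector = [1.0, 0.0, 0.0]             # direction pointing from the tag anchor towards the chain
radius = 1.0                         # sphere radius used for the EC-tag markers

reverse_vector = [-v for v in vector]
reverse_vector_normalized = np.array(reverse_vector) / np.linalg.norm(reverse_vector)
coordinates = [
    [res_1[j] + i * 2 * radius * reverse_vector_normalized[j] for j in range(3)]
    for i in range(4)
]
print(coordinates)  # x decreases 10 -> 8 -> 6 -> 4: four markers spaced by 2 * radius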
poetry.lock
CHANGED
@@ -1609,6 +1609,14 @@ pygments = ">=2.13.0,<3.0.0"
 [package.extras]
 jupyter = ["ipywidgets (>=7.5.1,<9)"]
 
+[[package]]
+name = "ruff"
+version = "0.0.264"
+description = "An extremely fast Python linter, written in Rust."
+category = "main"
+optional = false
+python-versions = ">=3.7"
+
 [[package]]
 name = "s3transfer"
 version = "0.6.0"

@@ -2196,7 +2204,7 @@ testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "flake8 (<5)", "pytest-co
 [metadata]
 lock-version = "1.1"
 python-versions = "^3.10"
-content-hash = "
+content-hash = "502949174f23054a4b450dfc0bb16df64c43d7d6c3e60d1adaf2835962223c32"
 
 [metadata.files]
 altair = []

@@ -2428,6 +2436,7 @@ requests = []
 rfc3339-validator = []
 rfc3986-validator = []
 rich = []
+ruff = []
 s3transfer = []
 scipy = []
 semver = []
pyproject.toml
CHANGED
@@ -14,6 +14,7 @@ torch = "^2.0.0"
 sentencepiece = "^0.1.97"
 tape-proteins = "^0.5"
 matplotlib = "^3.7.1"
+ruff = "^0.0.264"
 
 [tool.poetry.dev-dependencies]
 pytest = "^7.2.2"

@@ -21,3 +22,9 @@ pytest = "^7.2.2"
 [build-system]
 requires = ["poetry-core>=1.0.0"]
 build-backend = "poetry.core.masonry.api"
+
+[tool.ruff]
+line-length = 100
+
+[tool.black]
+line-length = 100
tests/test_attention.py
CHANGED
@@ -1,8 +1,13 @@
 import torch
 from Bio.PDB.Structure import Structure
 
-from hexviz.attention import (
+from hexviz.attention import (
+    ModelType,
+    get_attention,
+    get_sequences,
+    get_structure,
+    unidirectional_avg_filtered,
+)
 
 
 def test_get_structure():

@@ -12,10 +17,11 @@ def test_get_structure():
     assert structure is not None
     assert isinstance(structure, Structure)
 
+
 def test_get_sequences():
     pdb_id = "1AKE"
     structure = get_structure(pdb_id)
+
     sequences = get_sequences(structure)
 
     assert sequences is not None

@@ -30,26 +36,29 @@ def test_get_attention_zymctrl():
     result = get_attention("GGG", model_type=ModelType.ZymCTRL)
 
     assert result is not None
-    assert result.shape == torch.Size([36,16,3,3])
+    assert result.shape == torch.Size([36, 16, 3, 3])
+
 
 def test_get_attention_zymctrl_long_chain():
-    structure = get_structure(pdb_code="6A5J")
+    structure = get_structure(pdb_code="6A5J")  # 13 residues long
 
     sequences = get_sequences(structure)
 
     result = get_attention(sequences[0], model_type=ModelType.ZymCTRL)
 
     assert result is not None
-    assert result.shape == torch.Size([36,16,13,13])
+    assert result.shape == torch.Size([36, 16, 13, 13])
+
 
 def test_get_attention_tape():
-    structure = get_structure(pdb_code="6A5J")
+    structure = get_structure(pdb_code="6A5J")  # 13 residues long
     sequences = get_sequences(structure)
 
     result = get_attention(sequences[0], model_type=ModelType.TAPE_BERT)
 
     assert result is not None
-    assert result.shape == torch.Size([12,12,13,13])
+    assert result.shape == torch.Size([12, 12, 13, 13])
+
 
 def test_get_attention_prot_bert():
 

@@ -58,21 +67,19 @@ def test_get_attention_prot_bert():
     assert result is not None
     assert result.shape == torch.Size([30, 16, 3, 3])
 
+
 def test_get_unidirection_avg_filtered():
     # 1 head, 1 layer, 4 residues long attention tensor
-    attention= torch.tensor(
-                            [4, 7, 9, 11]]]], dtype=torch.float32)
+    attention = torch.tensor(
+        [[[[1, 2, 3, 4], [2, 5, 6, 7], [3, 6, 8, 9], [4, 7, 9, 11]]]], dtype=torch.float32
+    )
 
     result = unidirectional_avg_filtered(attention, 0, 0, 0)
 
     assert result is not None
     assert len(result) == 10
 
-    attention = torch.tensor([[[[1, 2, 3],
-                                [2, 5, 6],
-                                [4, 7, 91]]]], dtype=torch.float32)
+    attention = torch.tensor([[[[1, 2, 3], [2, 5, 6], [4, 7, 91]]]], dtype=torch.float32)
 
     result = unidirectional_avg_filtered(attention, 0, 0, 0)
 
tests/test_models.py
CHANGED
@@ -1,4 +1,3 @@
-
 from transformers import GPT2LMHeadModel, GPT2TokenizerFast
 
 from hexviz.models import get_zymctrl

@@ -13,4 +12,4 @@ def test_get_zymctrl():
     tokenizer, model = result
 
     assert isinstance(tokenizer, GPT2TokenizerFast)
-    assert isinstance(model, GPT2LMHeadModel)
+    assert isinstance(model, GPT2LMHeadModel)