hu-lab committed · Commit d84c46c · verified · 1 Parent(s): 8b1cf03

Update README.md

Files changed (1): README.md (+25 -11)
README.md CHANGED
@@ -21,27 +21,41 @@ pip install transformers
To calculate the embedding of a DNA sequence:

```python
import torch
from transformers import PreTrainedTokenizerFast
- from plantgfm.modeling_plantgfm import PlantGFMForCausalLM
from plantgfm.configuration_plantgfm import PlantGFMConfig

- config = PlantGFMConfig.from_pretrained("hu-lab/PlantGFM")
- tokenizer = PreTrainedTokenizerFast.from_pretrained("hu-lab/PlantGFM")
- model = PlantGFMForCausalLM.from_pretrained("hu-lab/PlantGFM", config=config)

- sequences = ["CCCTAAACCCTAAACCCTAAA", "ATGGCGTGGCTG"]

- # get single-nucleotide sequences with a space between each base
- single_nucleotide_sequences = list(map(lambda seq: " ".join(list(seq)), sequences))

- tokenized_sequences = tokenizer(single_nucleotide_sequences, padding="longest")["input_ids"]
- input_ids = torch.LongTensor(tokenized_sequences)

- embd = model(input_ids=input_ids, output_hidden_states=True)["hidden_states"][0]
- print(embd)
```
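The removed snippet prints per-token hidden states of shape `(batch, seq_len, hidden_size)`, so a fixed-size per-sequence embedding still needs a pooling step. A minimal sketch (editorial, not part of the commit), assuming the padded batch from the snippet above and plain mean pooling over non-padding positions:

```python
# Hypothetical follow-up to the removed snippet: mean-pool the token
# embeddings into one vector per sequence, masking out padding tokens.
attention_mask = torch.LongTensor(
    tokenizer(single_nucleotide_sequences, padding="longest")["attention_mask"]
)
mask = attention_mask.unsqueeze(-1)                  # (batch, seq_len, 1)
pooled = (embd * mask).sum(dim=1) / mask.sum(dim=1)  # average over real tokens only
print(pooled.shape)                                  # (batch, hidden_size)
```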
 
 
To generate DNA sequences:

```python
+
import torch
from transformers import PreTrainedTokenizerFast
+ from torch.cuda.amp import autocast
from plantgfm.configuration_plantgfm import PlantGFMConfig
+ from plantgfm.modeling_plantgfm import PlantGFMForCausalLM
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ config = PlantGFMConfig.from_pretrained("hu-lab/PlantGFM-Gene-generation")
+ tokenizer = PreTrainedTokenizerFast.from_pretrained("hu-lab/PlantGFM-Gene-generation")
+ model = PlantGFMForCausalLM.from_pretrained("hu-lab/PlantGFM-Gene-generation", config=config).to(device)
+ model = model.to(dtype=torch.bfloat16)
+
+ num_texts = 1
+ batch_size = 1
+ generated_texts = []
+
+ # start generation from an empty prompt, one row per batch element
+ input_ids = tokenizer.encode("", return_tensors="pt").to(device, dtype=torch.long)
+ input_ids = input_ids.expand(batch_size, -1)
+
+ for i in range(0, num_texts, batch_size):
+     with autocast(dtype=torch.bfloat16):
+         output = model.generate(
+             input_ids=input_ids,
+             max_length=4000,
+             do_sample=True,
+         )
+     for output_sequence in output:
+         generated_text = tokenizer.decode(output_sequence, skip_special_tokens=True)
+         generated_texts.append(generated_text)
+         print(generated_text)
```
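Since the embedding snippet tokenizes space-separated single nucleotides, the decoded generations presumably come back in the same spaced format. A small post-processing sketch (an assumption about the output format, not part of the commit) that collapses each decoded string into a plain DNA sequence:

```python
# Assumption: decoded text is space-separated single bases, e.g. "A T G C".
# Strip the spaces to recover a plain nucleotide string for downstream use.
plain_sequences = [text.replace(" ", "") for text in generated_texts]
for seq in plain_sequences:
    print(len(seq), seq[:60])
```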