## brief
A GPT-2 model trained on both DNA sequences and English text, based on GPT-2 small, with a maximum input length of 128 tokens.
## basic use
```python
import torch
from transformers import GPT2Tokenizer, GPT2Model, GPT2LMHeadModel

model = GPT2LMHeadModel.from_pretrained("dnagpt/gpt_dna_eng")
tokenizer = GPT2Tokenizer.from_pretrained("dnagpt/dna_eng_bpe")
```
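Because the BPE vocabulary is shared between English and DNA text, it can help to inspect how the tokenizer segments a mixed input before encoding it. A minimal check (the token splits in the comment are illustrative; the actual splits depend on the learned merges):

```python
# Inspect how the shared English/DNA BPE vocabulary segments mixed input.
# Exact splits depend on the learned merges; the comment below is illustrative.
print(tokenizer.tokenize("hello TCTTTCTCTTCTGTAT"))
# e.g. ['hello', 'ĠTCTTTC', 'TCTTCT', 'GTAT']  (illustrative)
```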
```python
input_text = "hello TCTTTCTCTTCTGTAT"  # mixed English + DNA prompt
input_ids = tokenizer.encode(
    input_text,
    return_tensors="pt",
    truncation=True,
    max_length=128,  # the model's maximum input length is 128 tokens
)

# Generate a continuation
device = model.device
generated_tokens = model.generate(
    input_ids=input_ids.to(device),
    max_new_tokens=10,
)
generated_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
print(generated_text)
# hello TCTTTCTCTTCTGTATATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATCTATC
```
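The greedy decoding above tends to lock into the repetitive "ATCT" continuation shown. For more varied output, `generate` accepts the standard sampling parameters; a sketch with illustrative (untuned) values:

```python
# Sampling-based generation: parameter values here are illustrative, not tuned.
sampled = model.generate(
    input_ids=input_ids.to(device),
    max_new_tokens=10,
    do_sample=True,   # sample instead of greedy decoding
    top_k=50,         # restrict sampling to the 50 most likely tokens
    temperature=0.8,  # slightly sharpen the distribution
)
print(tokenizer.decode(sampled[0], skip_special_tokens=True))
```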
```python
# Load the base model (no LM head) to get hidden states instead of logits
base_model = GPT2Model.from_pretrained("dnagpt/gpt_dna_eng")
output = base_model(input_ids)
print(output)  # BaseModelOutputWithPastAndCrossAttentions(last_hidden_state=..., ...)
```
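A common way to use the base model's hidden states is to pool them into a fixed-size embedding per sequence for downstream tasks such as DNA classification. A minimal sketch; mean pooling is an assumed choice here, not something prescribed by this model:

```python
import torch

# Mean-pool the final hidden states into one embedding per sequence.
# Mean pooling is an assumed choice, not prescribed by the model card.
with torch.no_grad():
    hidden = base_model(input_ids).last_hidden_state  # (batch, seq_len, 768) for GPT-2 small
embedding = hidden.mean(dim=1)                        # (batch, 768)
print(embedding.shape)  # torch.Size([1, 768])
```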
## github