LofiAmazon/BarcodeBERT-Finetuned-Amazon

This model adds a classification head on top of LofiAmazon/BarcodeBERT-Entire-BOLD, The classification head is a linear layer which concatenates the DNA embeddings with environmental layer data. This model has been trained with BOLD-Embeddings-Ecolayers-Amazon to predict taxonomic genuses. BOLD-Embeddings-Ecolayers-Amazon includes DNA embeddings and ecological raster layer data regarding sample location.

Example Usage

First you need to download the StandardScaler used to normalise environmental layer values during training. You can find it here.

You will also need to download all ecological layers from here.

The model will output a probability distribution over genera that were present in the training dataset. You can find the mapping between the index in the vector and the genus name in this file.

import pickle

from transformers import PreTrainedTokenizerFast, BertForMaskedLM, BertConfig
import rasterio
from rasterio.sample import sample_gen


class DNASeqClassifier(nn.Module, PyTorchModelHubMixin):
    def __init__(self, bert_model, env_dim, num_classes):
        super(DNASeqClassifier, self).__init__()
        self.bert = bert_model
        self.env_dim = env_dim
        self.num_classes = num_classes
        self.fc = nn.Linear(768 + env_dim, num_classes)

    def forward(self, bert_inputs, env_data):
        outputs = self.bert(**bert_inputs)
        dna_embeddings = outputs.hidden_states[-1].mean(1)
        combined = torch.cat((dna_embeddings, env_data), dim=1)
        logits = self.fc(combined)

        return logits

classification_model = DNASeqClassifier.from_pretrained(
    "LofiAmazon/BarcodeBERT-Finetuned-Amazon",
    bert_model=BertForMaskedLM(
        BertConfig(vocab_size=259, output_hidden_states=True),
    ),
)

ecolayers = [
    "median_elevation_1km.tiff",
    "human_footprint.tiff",
    "population_density_1km.tif",
    "annual_precipitation.tif",
    "precipitation_seasonality.tif",
    "annual_mean_air_temp.tif",
    "temp_seasonality.tif",
]

with open("scaler.pkl", "rb") as f:
    scaler = pickle.load(f)

tokenizer = PreTrainedTokenizerFast.from_pretrained("LofiAmazon/BarcodeBERT-Entire-BOLD")

# The DNA sequence you want to predict.
# There should be a space after every 4 characters.
# The sequence may also have unknown characters which are not A,C,T,G.
# The maximum DNA sequence length (not counting spaces) should be 660 characters
dna_sequence = "AACA ATGT ATTT A-T- TTCG CCCT TGTG AATT TATT ..."
# Location where DNA was sampled
coords = (-3.009083, -58.68281)

# Tokenize the DNA sequence
dna_tokenized = tokenizer(dna_sequence, return_tensors="pt")

# Obtain the environmental data from the coordinates
env_data = []
for layer in ecolayers:
    with rasterio.open(layer) as dataset:
        # Get the corresponding ecological values for the samples
        results = sample_gen(dataset, [coords])
        results = [r for r in results]
    layer_data = np.mean(results[0])
    env_data.append(layer_data)
env_data = scaler.transform([env_data])
env_data = torch.from_numpy(env_data).to(torch.float32)

# Obtain genus prediction
logits = classification_model(dna_tokenized, env_data)
temperature = 0.2

# Obtain the final genus probabilities
probs = torch.softmax(logits / temperature, dim=1).squeeze()