gf_v1: filled in isp single gene code

#237
This view is limited to 50 files because it contains too many changes. See the raw diff for the full changeset.
Files changed (50)
  1. .gitattributes +2 -2
  2. .pre-commit-config.yaml +0 -26
  3. .readthedocs.yaml +0 -19
  4. MANIFEST.in +3 -4
  5. README.md +11 -38
  6. config.json +8 -9
  7. docs/Makefile +0 -20
  8. docs/make.bat +0 -35
  9. docs/requirements.txt +0 -3
  10. docs/source/_static/css/custom.css +0 -40
  11. docs/source/_static/gf_logo.png +0 -0
  12. docs/source/about.rst +0 -49
  13. docs/source/api.rst +0 -51
  14. docs/source/conf.py +0 -80
  15. docs/source/geneformer.classifier.rst +0 -10
  16. docs/source/geneformer.emb_extractor.rst +0 -26
  17. docs/source/geneformer.in_silico_perturber.rst +0 -8
  18. docs/source/geneformer.in_silico_perturber_stats.rst +0 -25
  19. docs/source/geneformer.mtl_classifier.rst +0 -11
  20. docs/source/geneformer.tokenizer.rst +0 -15
  21. docs/source/getstarted.rst +0 -36
  22. docs/source/index.rst +0 -16
  23. examples/cell_classification.ipynb +0 -0
  24. examples/extract_and_plot_cell_embeddings.ipynb +4 -8
  25. examples/gene_classification.ipynb +0 -0
  26. examples/hyperparam_optimiz_for_disease_classifier.py +226 -0
  27. examples/in_silico_perturbation.ipynb +17 -66
  28. examples/multitask_cell_classification.ipynb +0 -420
  29. examples/pretraining_new_model/pretrain_geneformer_w_deepspeed.py +1 -3
  30. examples/tokenizing_scRNAseq_data.ipynb +8 -27
  31. fine_tuned_models/{gf-6L-30M-i2048_CellClassifier_cardiomyopathies_220224 → geneformer-6L-30M_CellClassifier_cardiomyopathies_220224}/config.json +0 -0
  32. fine_tuned_models/{gf-6L-30M-i2048_CellClassifier_cardiomyopathies_220224 → geneformer-6L-30M_CellClassifier_cardiomyopathies_220224}/optimizer.pt +0 -0
  33. fine_tuned_models/{gf-6L-30M-i2048_CellClassifier_cardiomyopathies_220224 → geneformer-6L-30M_CellClassifier_cardiomyopathies_220224}/pytorch_model.bin +0 -0
  34. fine_tuned_models/{gf-6L-30M-i2048_CellClassifier_cardiomyopathies_220224 → geneformer-6L-30M_CellClassifier_cardiomyopathies_220224}/rng_state.pth +0 -0
  35. fine_tuned_models/{gf-6L-30M-i2048_CellClassifier_cardiomyopathies_220224 → geneformer-6L-30M_CellClassifier_cardiomyopathies_220224}/scheduler.pt +0 -0
  36. fine_tuned_models/{gf-6L-30M-i2048_CellClassifier_cardiomyopathies_220224 → geneformer-6L-30M_CellClassifier_cardiomyopathies_220224}/trainer_state.json +0 -0
  37. fine_tuned_models/{gf-6L-30M-i2048_CellClassifier_cardiomyopathies_220224 → geneformer-6L-30M_CellClassifier_cardiomyopathies_220224}/training_args.bin +0 -0
  38. fine_tuned_models/gf-12L-95M-i4096_MTLCellClassifier_CELLxGENE_240522/config.json +0 -24
  39. fine_tuned_models/gf-12L-95M-i4096_MTLCellClassifier_CELLxGENE_240522/pytorch_model.bin +0 -3
  40. {gf-12L-30M-i2048 → geneformer-12L-30M}/config.json +0 -0
  41. {gf-12L-30M-i2048 → geneformer-12L-30M}/pytorch_model.bin +0 -0
  42. {gf-12L-30M-i2048 → geneformer-12L-30M}/training_args.bin +0 -0
  43. geneformer/__init__.py +11 -33
  44. geneformer/classifier.py +0 -1563
  45. geneformer/classifier_utils.py +0 -648
  46. geneformer/collator_for_classification.py +74 -139
  47. geneformer/emb_extractor.py +279 -649
  48. geneformer/ensembl_mapping_dict_gc95M.pkl +0 -3
  49. geneformer/evaluation_utils.py +0 -287
  50. geneformer/gene_dictionaries_30m/ensembl_mapping_dict_gc30M.pkl +0 -3
.gitattributes CHANGED
@@ -14,11 +14,10 @@
 *.ot filter=lfs diff=lfs merge=lfs -text
 *.parquet filter=lfs diff=lfs merge=lfs -text
 *.pb filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
 *.rar filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.tar.* filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text
 *.tgz filter=lfs diff=lfs merge=lfs -text
@@ -26,4 +25,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zstandard filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+geneformer/gene_name_id_dict.pkl filter=lfs diff=lfs merge=lfs -text
 model.safetensors filter=lfs diff=lfs merge=lfs -text
.pre-commit-config.yaml DELETED
@@ -1,26 +0,0 @@
-# See https://pre-commit.com for more information
-# See https://pre-commit.com/hooks.html for more hooks
-repos:
-  - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v3.2.0
-    hooks:
-      - id: trailing-whitespace
-      - id: end-of-file-fixer
-      - id: check-yaml
-      - id: check-added-large-files
-      - id: check-merge-conflict
-      - id: mixed-line-ending
-      - id: check-docstring-first
-  - repo: https://github.com/pycqa/isort
-    rev: 5.12.0
-    hooks:
-      - id: isort
-        args: ["--profile", "black"]
-  - repo: https://github.com/astral-sh/ruff-pre-commit
-    # Ruff version.
-    rev: v0.1.4
-    hooks:
-      # Run the Ruff linter.
-      - id: ruff
-      # Run the Ruff formatter.
-      - id: ruff-format
.readthedocs.yaml DELETED
@@ -1,19 +0,0 @@
-# Read the Docs configuration file
-
-# Required
-version: 2
-
-# Set the OS, Python version and other tools you might need
-build:
-  os: ubuntu-22.04
-  tools:
-    python: "3.10"
-
-# Build documentation in the "docs/" directory with Sphinx
-sphinx:
-  configuration: docs/source/conf.py
-
-# Python requirements required build your documentation
-python:
-  install:
-    - requirements: docs/requirements.txt
MANIFEST.in CHANGED
@@ -1,4 +1,3 @@
-include geneformer/gene_median_dictionary_gc95M.pkl
-include geneformer/gene_name_id_dict_gc95M.pkl
-include geneformer/ensembl_mapping_dict_gc95M.pkl
-include geneformer/token_dictionary_gc95M.pkl
+include geneformer/gene_median_dictionary.pkl
+include geneformer/token_dictionary.pkl
+include geneformer/gene_name_id_dict.pkl
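
Note: the MANIFEST.in entries above determine which non-Python data files ship with the pip-installed package. As a minimal sketch (an assumed illustration, not the package's actual loading logic), a bundled dictionary such as token_dictionary.pkl could be read at runtime like this:

```python
# Sketch only: load a packaged .pkl resource; the real loading code
# lives inside the geneformer package itself.
import pickle
from importlib.resources import files  # Python 3.9+

token_dictionary = pickle.loads(
    files("geneformer").joinpath("token_dictionary.pkl").read_bytes()
)
```
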
 
README.md CHANGED
@@ -1,43 +1,22 @@
 ---
 datasets: ctheodoris/Genecorpus-30M
 license: apache-2.0
-tags:
-- single-cell
-- genomics
 ---
 # Geneformer
-Geneformer is a foundational transformer model pretrained on a large-scale corpus of single cell transcriptomes to enable context-aware predictions in settings with limited data in network biology.
+Geneformer is a foundation transformer model pretrained on a large-scale corpus of ~30 million single cell transcriptomes to enable context-aware predictions in settings with limited data in network biology.
 
-- See [our manuscript](https://rdcu.be/ddrx0) for details of the original model trained on ~30 million transcriptomes in June 2021 and the initial report of our in silico perturbation and cell and gene classification strategies.
-- See [our manuscript](https://www.biorxiv.org/content/10.1101/2024.08.16.608180v1.full.pdf) for details of the expanded model trained on ~95 million transcriptomes in April 2024 and our continual learning, multitask learning, and quantization strategies.
-- See [geneformer.readthedocs.io](https://geneformer.readthedocs.io) for documentation.
+See [our manuscript](https://rdcu.be/ddrx0) for details.
 
 # Model Description
-Geneformer is a foundational transformer model pretrained on a large-scale corpus of single cell transcriptomes representing a broad range of human tissues. Geneformer was originally pretrained in June 2021 on [Genecorpus-30M](https://huggingface.co/datasets/ctheodoris/Genecorpus-30M), a corpus comprised of ~30 million single cell transcriptomes. We excluded cells with high mutational burdens (e.g. malignant cells and immortalized cell lines) that could lead to substantial network rewiring without companion genome sequencing to facilitate interpretation. Then, in April 2024, Geneformer was pretrained on ~95 million non-cancer transcriptomes, followed by continual learning on ~14 million cancer transcriptomes to yield a cancer domain-tuned model.
+Geneformer is a foundation transformer model pretrained on [Genecorpus-30M](https://huggingface.co/datasets/ctheodoris/Genecorpus-30M), a pretraining corpus comprised of ~30 million single cell transcriptomes from a broad range of human tissues. We excluded cells with high mutational burdens (e.g. malignant cells and immortalized cell lines) that could lead to substantial network rewiring without companion genome sequencing to facilitate interpretation. Each single cell’s transcriptome is presented to the model as a rank value encoding where genes are ranked by their expression in that cell normalized by their expression across the entire Genecorpus-30M. The rank value encoding provides a nonparametric representation of that cell’s transcriptome and takes advantage of the many observations of each gene’s expression across Genecorpus-30M to prioritize genes that distinguish cell state. Specifically, this method will deprioritize ubiquitously highly-expressed housekeeping genes by normalizing them to a lower rank. Conversely, genes such as transcription factors that may be lowly expressed when they are expressed but highly distinguish cell state will move to a higher rank within the encoding. Furthermore, this rank-based approach may be more robust against technical artifacts that may systematically bias the absolute transcript counts value while the overall relative ranking of genes within each cell remains more stable.
 
-Each single cell’s transcriptome is presented to the model as a rank value encoding where genes are ranked by their expression in that cell scaled by their expression across the entire Genecorpus-30M. The rank value encoding provides a nonparametric representation of that cell’s transcriptome and takes advantage of the many observations of each gene’s expression across the pretraining corpus to prioritize genes that distinguish cell state. Specifically, this method will deprioritize ubiquitously highly-expressed housekeeping genes by scaling them to a lower rank. Conversely, genes such as transcription factors that may be lowly expressed when they are expressed but highly distinguish cell state will move to a higher rank within the encoding. Furthermore, this rank-based approach may be more robust against technical artifacts that may systematically bias the absolute transcript counts value while the overall relative ranking of genes within each cell remains more stable.
-
-The rank value encoding of each single cell’s transcriptome then proceeds through N layers of transformer encoder units, where N varies dependent on the model size. Pretraining was accomplished using a masked learning objective where 15% of the genes within each transcriptome were masked and the model was trained to predict which gene should be within each masked position in that specific cell state using the context of the remaining unmasked genes. A major strength of this approach is that it is entirely self-supervised and can be accomplished on completely unlabeled data, which allows the inclusion of large amounts of training data without being restricted to samples with accompanying labels.
+The rank value encoding of each single cell’s transcriptome then proceeds through six transformer encoder units. Pretraining was accomplished using a masked learning objective where 15% of the genes within each transcriptome were masked and the model was trained to predict which gene should be within each masked position in that specific cell state using the context of the remaining unmasked genes. A major strength of this approach is that it is entirely self-supervised and can be accomplished on completely unlabeled data, which allows the inclusion of large amounts of training data without being restricted to samples with accompanying labels.
 
 We detail applications and results in [our manuscript](https://rdcu.be/ddrx0).
 
-During pretraining, Geneformer gained a fundamental understanding of network dynamics, encoding network hierarchy in the model’s attention weights in a completely self-supervised manner. With both zero-shot learning and fine-tuning with limited task-specific data, Geneformer consistently boosted predictive accuracy in a diverse panel of downstream tasks relevant to chromatin and network dynamics. In silico perturbation with zero-shot learning identified a novel transcription factor in cardiomyocytes that we experimentally validated to be critical to their ability to generate contractile force. In silico treatment with limited patient data revealed candidate therapeutic targets for cardiomyopathy that we experimentally validated to significantly improve the ability of cardiomyocytes to generate contractile force in an induced pluripotent stem cell (iPSC) model of the disease. Overall, Geneformer represents a foundational deep learning model pretrained on a large-scale corpus human single cell transcriptomes to gain a fundamental understanding of gene network dynamics that can now be democratized to a vast array of downstream tasks to accelerate discovery of key network regulators and candidate therapeutic targets.
-
-The repository includes the following pretrained models:
-
-L=layers\
-M=millions of cells used for pretraining\
-i=input size\
-(pretraining date)
-
-- GF-6L-30M-i2048 (June 2021)
-- GF-12L-30M-i2048 (June 2021)
-- GF-12L-95M-i4096 (April 2024)
-- GF-20L-95M-i4096 (April 2024)
-
-The current default model in the main directory of the repository is GF-12L-95M-i4096.
-
-The repository also contains fined tuned models in the fine_tuned_models directory and the cancer-tuned model following continual learning on ~14 million cancer cells, GF-12L-95M-i4096_CLcancer.
+During pretraining, Geneformer gained a fundamental understanding of network dynamics, encoding network hierarchy in the model’s attention weights in a completely self-supervised manner. Fine-tuning Geneformer towards a diverse panel of downstream tasks relevant to chromatin and network dynamics using limited task-specific data demonstrated that Geneformer consistently boosted predictive accuracy. Applied to disease modeling with limited patient data, Geneformer identified candidate therapeutic targets. Overall, Geneformer represents a pretrained deep learning model from which fine-tuning towards a broad range of downstream applications can be pursued to accelerate discovery of key network regulators and candidate therapeutic targets.
+
+In [our manuscript](https://rdcu.be/ddrx0), we report results for the 6 layer Geneformer model pretrained on Genecorpus-30M. We additionally provide within this repository a 12 layer Geneformer model, scaled up with retained width:depth aspect ratio, also pretrained on Genecorpus-30M.
 
 # Application
 The pretrained Geneformer model can be used directly for zero-shot learning, for example for in silico perturbation analysis, or by fine-tuning towards the relevant downstream task, such as gene or cell state classification.
@@ -45,7 +24,7 @@ The pretrained Geneformer model can be used directly for zero-shot learning, for
 Example applications demonstrated in [our manuscript](https://rdcu.be/ddrx0) include:
 
 *Fine-tuning*:
-- transcription factor dosage sensitivity
+- transcription factor dosage sensitivity
 - chromatin dynamics (bivalently marked promoters)
 - transcription factor regulatory range
 - gene network centrality
@@ -67,11 +46,9 @@ Example applications demonstrated in [our manuscript](https://rdcu.be/ddrx0) inc
 - in silico perturbation to determine transcription factor cooperativity
 
 # Installation
-In addition to the pretrained model, contained herein are functions for tokenizing and collating data specific to single cell transcriptomics, pretraining the model, fine-tuning the model, extracting and plotting cell embeddings, and performing in silico pertrubation with either the pretrained or fine-tuned models. To install (~20s):
+In addition to the pretrained model, contained herein are functions for tokenizing and collating data specific to single cell transcriptomics, pretraining the model, fine-tuning the model, extracting and plotting cell embeddings, and performing in silico pertrubation with either the pretrained or fine-tuned models. To install:
 
 ```bash
-# Make sure you have git-lfs installed (https://git-lfs.com)
-git lfs install
 git clone https://huggingface.co/ctheodoris/Geneformer
 cd Geneformer
 pip install .
@@ -85,10 +62,6 @@ For usage, see [examples](https://huggingface.co/ctheodoris/Geneformer/tree/main
 - extracting and plotting cell embeddings
 - in silico perturbation
 
-Please note that the fine-tuning examples are meant to be generally applicable and the input datasets and labels will vary dependent on the downstream task. Example input files for a few of the downstream tasks demonstrated in the manuscript are located within the [example_input_files directory](https://huggingface.co/datasets/ctheodoris/Genecorpus-30M/tree/main/example_input_files) in the dataset repository, but these only represent a few example fine-tuning applications.
-
-Please note that GPU resources are required for efficient usage of Geneformer. Additionally, we strongly recommend tuning hyperparameters for each downstream fine-tuning application as this can significantly boost predictive potential in the downstream task (e.g. max learning rate, learning schedule, number of layers to freeze, etc.).
+Please note that the fine-tuning examples are meant to be generally applicable and the input datasets and labels will vary dependent on the downstream task. Example input files for a few of the downstream tasks demonstrated in the manuscript are located within the [example_input_files directory](https://huggingface.co/datasets/ctheodoris/Genecorpus-30M/tree/main/example_input_files) in the dataset repository, but these only represent a few example fine-tuning applications.
 
-# Citations
-- C V Theodoris#, L Xiao, A Chopra, M D Chaffin, Z R Al Sayed, M C Hill, H Mantineo, E Brydon, Z Zeng, X S Liu, P T Ellinor#. Transfer learning enables predictions in network biology. _**Nature**_, 31 May 2023. (#co-corresponding authors)
-- H Chen*, M S Venkatesh*, J Gomez Ortega, S V Mahesh, T Nandi, R Madduri, K Pelka†, C V Theodoris†#. Quantized multi-task learning for context-specific representations of gene network dynamics. _**bioRxiv**_, 19 Aug 2024. (*co-first authors, †co-senior authors, #corresponding author)
+Please note that GPU resources are required for efficient usage of Geneformer. Additionally, we strongly recommend tuning hyperparameters for each downstream fine-tuning application as this can significantly boost predictive potential in the downstream task (e.g. max learning rate, learning schedule, number of layers to freeze, etc.).
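
To make the rank value encoding described in the README diff above concrete, here is a minimal illustrative sketch. The function name, inputs, and toy values are hypothetical and are not the repository's tokenizer API; see geneformer/tokenizer.py for the actual implementation.

```python
# Hypothetical sketch of rank value encoding: rank genes by expression in
# a cell after normalizing by each gene's expression across the corpus.
def rank_value_encode(cell_counts: dict, gene_norms: dict, max_len: int = 2048) -> list:
    """Return gene IDs ordered by corpus-normalized expression, highest first."""
    normed = {
        gene: count / gene_norms[gene]
        for gene, count in cell_counts.items()
        if gene in gene_norms and count > 0
    }
    ranked = sorted(normed, key=normed.get, reverse=True)
    return ranked[:max_len]  # truncate to the model input size

# A ubiquitous housekeeping gene with a high corpus-wide value drops in rank,
# while a lowly but specifically expressed transcription factor rises.
cell = {"ACTB": 500.0, "GATA4": 3.0}
norms = {"ACTB": 480.0, "GATA4": 0.5}
print(rank_value_encode(cell, norms))  # ['GATA4', 'ACTB']
```
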
 
 
config.json CHANGED
@@ -3,22 +3,21 @@
     "BertForMaskedLM"
   ],
   "attention_probs_dropout_prob": 0.02,
-  "classifier_dropout": null,
+  "gradient_checkpointing": false,
   "hidden_act": "relu",
   "hidden_dropout_prob": 0.02,
-  "hidden_size": 512,
+  "hidden_size": 256,
   "initializer_range": 0.02,
-  "intermediate_size": 1024,
+  "intermediate_size": 512,
   "layer_norm_eps": 1e-12,
-  "max_position_embeddings": 4096,
+  "max_position_embeddings": 2048,
   "model_type": "bert",
-  "num_attention_heads": 8,
-  "num_hidden_layers": 12,
+  "num_attention_heads": 4,
+  "num_hidden_layers": 6,
   "pad_token_id": 0,
   "position_embedding_type": "absolute",
-  "torch_dtype": "float32",
-  "transformers_version": "4.37.1",
+  "transformers_version": "4.6.0",
   "type_vocab_size": 2,
   "use_cache": true,
-  "vocab_size": 20275
+  "vocab_size": 25426
 }
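
The updated config above describes the 6-layer, 256-hidden-dimension model with a 2048-token input size. As a quick sanity check (a sketch using the Hugging Face transformers API, not code from this repository), the architecture can be instantiated directly from these values:

```python
# Sketch: build the BERT architecture described by the new config.json
# and count its parameters (roughly 10M for these dimensions).
from transformers import BertConfig, BertForMaskedLM

config = BertConfig(
    vocab_size=25426,
    hidden_size=256,
    num_hidden_layers=6,
    num_attention_heads=4,
    intermediate_size=512,
    max_position_embeddings=2048,
    hidden_act="relu",
    hidden_dropout_prob=0.02,
    attention_probs_dropout_prob=0.02,
    pad_token_id=0,
)
model = BertForMaskedLM(config)
print(sum(p.numel() for p in model.parameters()))
```
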
docs/Makefile DELETED
@@ -1,20 +0,0 @@
-# Minimal makefile for Sphinx documentation
-#
-
-# You can set these variables from the command line, and also
-# from the environment for the first two.
-SPHINXOPTS    ?=
-SPHINXBUILD   ?= sphinx-build
-SOURCEDIR     = source
-BUILDDIR      = build
-
-# Put it first so that "make" without argument is like "make help".
-help:
-	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
-
-.PHONY: help Makefile
-
-# Catch-all target: route all unknown targets to Sphinx using the new
-# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
-%: Makefile
-	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
docs/make.bat DELETED
@@ -1,35 +0,0 @@
-@ECHO OFF
-
-pushd %~dp0
-
-REM Command file for Sphinx documentation
-
-if "%SPHINXBUILD%" == "" (
-	set SPHINXBUILD=sphinx-build
-)
-set SOURCEDIR=source
-set BUILDDIR=build
-
-%SPHINXBUILD% >NUL 2>NUL
-if errorlevel 9009 (
-	echo.
-	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
-	echo.installed, then set the SPHINXBUILD environment variable to point
-	echo.to the full path of the 'sphinx-build' executable. Alternatively you
-	echo.may add the Sphinx directory to PATH.
-	echo.
-	echo.If you don't have Sphinx installed, grab it from
-	echo.https://www.sphinx-doc.org/
-	exit /b 1
-)
-
-if "%1" == "" goto help
-
-%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
-goto end
-
-:help
-%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
-
-:end
-popd
docs/requirements.txt DELETED
@@ -1,3 +0,0 @@
-.
-sphinx_rtd_theme==2.0.0
-nbsphinx==0.9.3
docs/source/_static/css/custom.css DELETED
@@ -1,40 +0,0 @@
-/* top left logo */
-.wy-side-nav-search, .wy-nav-top {
-  background: linear-gradient(15deg, #13547a 0%, #80d0c7 100%);
-}
-
-
-/* unvisited link */
-.wy-nav-content a:link {
-  color: #067abd;
-}
-
-/* visited link */
-.wy-nav-content a:visited {
-  color: #4b827c;
-}
-
-/* mouse over link */
-.wy-nav-content a:hover {
-  color: #80d0c7;
-}
-
-/* selected link */
-.wy-nav-content a:active {
-  color: #4b827c;
-}
-
-/* class object */
-.sig.sig-object {
-  padding: 5px 5px 5px 5px;
-  background-color: #ececec;
-  border-style: solid;
-  border-color: black;
-  border-width: 1px 0;
-}
-
-/* parameter object */
-dt {
-  padding: 5px 5px 5px 5px;
-  background-color: #ececec;
-}
docs/source/_static/gf_logo.png DELETED
Binary file (48.2 kB)
 
docs/source/about.rst DELETED
@@ -1,49 +0,0 @@
-About
-=====
-
-Model Description
------------------
-
-**Geneformer** is a context-aware, attention-based deep learning model pretrained on a large-scale corpus of single-cell transcriptomes to enable context-specific predictions in settings with limited data in network biology. During pretraining, Geneformer gained a fundamental understanding of network dynamics, encoding network hierarchy in the attention weights of the model in a completely self-supervised manner. With both zero-shot learning and fine-tuning with limited task-specific data, Geneformer consistently boosted predictive accuracy in a diverse panel of downstream tasks relevant to chromatin and network dynamics. In silico perturbation with zero-shot learning identified a novel transcription factor in cardiomyocytes that we experimentally validated to be critical to their ability to generate contractile force. In silico treatment with limited patient data revealed candidate therapeutic targets for cardiomyopathy that we experimentally validated to significantly improve the ability of cardiomyocytes to generate contractile force in an iPSC model of the disease. Overall, Geneformer represents a foundational deep learning model pretrained on a large-scale corpus of human single cell transcriptomes to gain a fundamental understanding of gene network dynamics that can now be democratized to a vast array of downstream tasks to accelerate discovery of key network regulators and candidate therapeutic targets.
-
-In `our manuscript <https://rdcu.be/ddrx0>`_, we report results for the original 6 layer Geneformer model pretrained on Genecorpus-30M. We additionally provide within the repository a 12 layer Geneformer model, scaled up with retained width:depth aspect ratio, also pretrained on Genecorpus-30M.
-
-Both the `6 <https://huggingface.co/ctheodoris/Geneformer/blob/main/gf-6L-30M-i2048/model.safetensors>`_ and `12 <https://huggingface.co/ctheodoris/Geneformer/blob/main/gf-12L-30M-i2048/pytorch_model.bin>`_ layer Geneformer models were pretrained in June 2021.
-
-Also see `our 2024 manuscript <https://www.biorxiv.org/content/10.1101/2024.08.16.608180v1.full.pdf>`_, for details of the `expanded model <https://huggingface.co/ctheodoris/Geneformer/blob/main/model.safetensors>`_ trained on ~95 million transcriptomes in April 2024 and our continual learning, multitask learning, and quantization strategies.
-
-Application
------------
-
-The pretrained Geneformer model can be used directly for zero-shot learning, for example for in silico perturbation analysis, or by fine-tuning towards the relevant downstream task, such as gene or cell state classification.
-
-Example applications demonstrated in `our manuscript <https://rdcu.be/ddrx0>`_ include:
-
-| *Fine-tuning*:
-| - transcription factor dosage sensitivity
-| - chromatin dynamics (bivalently marked promoters)
-| - transcription factor regulatory range
-| - gene network centrality
-| - transcription factor targets
-| - cell type annotation
-| - batch integration
-| - cell state classification across differentiation
-| - disease classification
-| - in silico perturbation to determine disease-driving genes
-| - in silico treatment to determine candidate therapeutic targets
-
-| *Zero-shot learning*:
-| - batch integration
-| - gene context specificity
-| - in silico reprogramming
-| - in silico differentiation
-| - in silico perturbation to determine impact on cell state
-| - in silico perturbation to determine transcription factor targets
-| - in silico perturbation to determine transcription factor cooperativity
-
-Citations
----------
-
-| C V Theodoris #, L Xiao, A Chopra, M D Chaffin, Z R Al Sayed, M C Hill, H Mantineo, E Brydon, Z Zeng, X S Liu, P T Ellinor #. `Transfer learning enables predictions in network biology. <https://rdcu.be/ddrx0>`_ *Nature*, 31 May 2023. (# co-corresponding authors)
-
-| H Chen \*, M S Venkatesh \*, J Gomez Ortega, S V Mahesh, T Nandi, R Madduri, K Pelka †, C V Theodoris † #. `Quantized multi-task learning for context-specific representations of gene network dynamics. <https://www.biorxiv.org/content/10.1101/2024.08.16.608180v1.full.pdf>`_ *bioRxiv*, 19 Aug 2024. (\* co-first authors, † co-senior authors, # corresponding author)
docs/source/api.rst DELETED
@@ -1,51 +0,0 @@
-API
-===
-
-Tokenizer
----------
-
-.. toctree::
-   :maxdepth: 1
-
-   geneformer.tokenizer
-
-Classifier
-----------
-
-.. toctree::
-   :maxdepth: 1
-
-   geneformer.classifier
-
-Multitask Classifier
---------------------
-
-.. toctree::
-   :maxdepth: 1
-
-   geneformer.mtl_classifier
-
-Embedding Extractor
--------------------
-
-.. toctree::
-   :maxdepth: 1
-
-   geneformer.emb_extractor
-
-In Silico Perturber
--------------------
-
-.. toctree::
-   :maxdepth: 1
-
-   geneformer.in_silico_perturber
-
-
-In Silico Perturber Stats
--------------------------
-
-.. toctree::
-   :maxdepth: 1
-
-   geneformer.in_silico_perturber_stats
docs/source/conf.py DELETED
@@ -1,80 +0,0 @@
-# Configuration file for the Sphinx documentation builder.
-#
-# For the full list of built-in configuration values, see the documentation:
-# https://www.sphinx-doc.org/en/master/usage/configuration.html
-
-import pathlib
-import re
-import sys
-
-from sphinx.ext import autodoc
-
-sys.path.insert(0, pathlib.Path(__file__).parents[2].resolve().as_posix())
-
-
-# -- Project information -----------------------------------------------------
-# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
-
-project = "geneformer"
-copyright = "2024, Christina Theodoris"
-author = "Christina Theodoris"
-release = "0.1.0"
-repository_url = "https://huggingface.co/ctheodoris/Geneformer"
-
-# -- General configuration ---------------------------------------------------
-# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
-
-extensions = [
-    "sphinx.ext.autodoc",
-    "sphinx.ext.autosummary",
-    "nbsphinx",
-    "sphinx.ext.viewcode",
-    "sphinx.ext.doctest",
-]
-
-templates_path = ["_templates"]
-exclude_patterns = [
-    "**.ipynb_checkpoints",
-]
-autoclass_content = "both"
-
-
-class MockedClassDocumenter(autodoc.ClassDocumenter):
-    def add_line(self, line: str, source: str, *lineno: int) -> None:
-        if line == "   Bases: :py:class:`object`":
-            return
-        super().add_line(line, source, *lineno)
-
-
-autodoc.ClassDocumenter = MockedClassDocumenter
-add_module_names = False
-
-
-def process_signature(app, what, name, obj, options, signature, return_annotation):
-    # loop through each line in the docstring and replace path with
-    # the generic path text
-    signature = re.sub(r"PosixPath\(.*?\)", "FILEPATH", signature)
-    return (signature, None)
-
-
-def setup(app):
-    app.connect("autodoc-process-signature", process_signature)
-
-
-# -- Options for HTML output -------------------------------------------------
-# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
-
-html_theme = "sphinx_rtd_theme"
-html_show_sphinx = False
-html_static_path = ["_static"]
-html_logo = "_static/gf_logo.png"
-html_theme_options = {
-    "collapse_navigation": False,
-    "sticky_navigation": True,
-    "navigation_depth": 3,
-    "logo_only": True,
-}
-html_css_files = [
-    "css/custom.css",
-]
-html_show_sourcelink = False
docs/source/geneformer.classifier.rst DELETED
@@ -1,10 +0,0 @@
-geneformer.classifier
-=====================
-
-.. automodule:: geneformer.classifier
-    :members:
-    :undoc-members:
-    :show-inheritance:
-    :exclude-members:
-        valid_option_dict,
-        validate_options
docs/source/geneformer.emb_extractor.rst DELETED
@@ -1,26 +0,0 @@
-geneformer.emb\_extractor
-=========================
-
-.. automodule:: geneformer.emb_extractor
-    :members:
-    :undoc-members:
-    :show-inheritance:
-    :exclude-members:
-        accumulate_tdigests,
-        gen_heatmap_class_colors,
-        gen_heatmap_class_dict,
-        get_embs,
-        label_cell_embs,
-        label_gene_embs,
-        make_colorbar,
-        plot_heatmap,
-        plot_umap,
-        summarize_gene_embs,
-        tdigest_mean,
-        tdigest_median,
-        test_emb,
-        update_tdigest_dict,
-        update_tdigest_dict_mean,
-        update_tdigest_dict_median,
-        valid_option_dict,
-        validate_options
docs/source/geneformer.in_silico_perturber.rst DELETED
@@ -1,8 +0,0 @@
-geneformer.in\_silico\_perturber
-=======================================
-
-.. automodule:: geneformer.in_silico_perturber
-    :members:
-    :undoc-members:
-    :show-inheritance:
-    :exclude-members: valid_option_dict, validate_options, apply_additional_filters, isp_perturb_all, isp_perturb_set, update_perturbation_dictionary
docs/source/geneformer.in_silico_perturber_stats.rst DELETED
@@ -1,25 +0,0 @@
-geneformer.in\_silico\_perturber\_stats
-==============================================
-
-.. automodule:: geneformer.in_silico_perturber_stats
-    :members:
-    :undoc-members:
-    :show-inheritance:
-    :exclude-members:
-        find,
-        get_fdr,
-        get_gene_list,
-        get_impact_component,
-        invert_dict,
-        isp_aggregate_gene_shifts,
-        isp_aggregate_grouped_perturb,
-        isp_stats_mixture_model,
-        isp_stats_to_goal_state,
-        isp_stats_vs_null,
-        n_detections,
-        read_dict,
-        read_dictionaries,
-        token_to_gene_name,
-        token_tuple_to_ensembl_ids,
-        valid_option_dict,
-        validate_options
docs/source/geneformer.mtl_classifier.rst DELETED
@@ -1,11 +0,0 @@
-geneformer.mtl\_classifier
-==========================
-
-.. automodule:: geneformer.mtl_classifier
-    :members:
-    :undoc-members:
-    :show-inheritance:
-    :exclude-members:
-        valid_option_dict,
-        validate_options,
-        validate_additional_options
docs/source/geneformer.tokenizer.rst DELETED
@@ -1,15 +0,0 @@
-geneformer.tokenizer
-====================
-
-.. automodule:: geneformer.tokenizer
-    :members:
-    :undoc-members:
-    :show-inheritance:
-    :exclude-members:
-        create_dataset,
-        tokenize_anndata,
-        tokenize_files,
-        tokenize_loom,
-        rank_genes,
-        tokenize_cell,
-        sum_ensembl_ids
docs/source/getstarted.rst DELETED
@@ -1,36 +0,0 @@
-Getting Started
-===============
-
-Installation
-------------
-
-Geneformer installation instructions.
-
-Make sure you have git-lfs installed (https://git-lfs.com).
-
-.. code-block:: bash
-
-    git lfs install
-    git clone https://huggingface.co/ctheodoris/Geneformer
-    cd Geneformer
-    pip install .
-
-
-Tutorials
----------
-
-| See `examples <https://huggingface.co/ctheodoris/Geneformer/tree/main/examples>`_ for:
-| - tokenizing transcriptomes
-| - pretraining
-| - hyperparameter tuning
-| - fine-tuning
-| - extracting and plotting cell embeddings
-| - in silico perturbation
-
-Please note that the fine-tuning examples are meant to be generally applicable and the input datasets and labels will vary dependent on the downstream task. Example input files for a few of the downstream tasks demonstrated in the manuscript are located within the `example_input_files directory <https://huggingface.co/datasets/ctheodoris/Genecorpus-30M/tree/main/example_input_files>`_ in the dataset repository, but these only represent a few example fine-tuning applications.
-
-
-Tips
-----
-
-Please note that GPU resources are required for efficient usage of Geneformer. Additionally, we strongly recommend tuning hyperparameters for each downstream fine-tuning application as this can significantly boost predictive potential in the downstream task (e.g. max learning rate, learning schedule, number of layers to freeze, etc.).
docs/source/index.rst DELETED
@@ -1,16 +0,0 @@
-Geneformer
-==========
-
-Geneformer is a foundation transformer model pretrained on a large-scale corpus of single cell transcriptomes to enable context-aware predictions in network biology.
-
-See `our manuscript <https://rdcu.be/ddrx0>`_ for details.
-
-Table of Contents
------------------
-
-.. toctree::
-   :maxdepth: 2
-
-   about
-   getstarted
-   api
examples/cell_classification.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
examples/extract_and_plot_cell_embeddings.ipynb CHANGED
@@ -18,8 +18,6 @@
    "outputs": [],
    "source": [
     "# initiate EmbExtractor\n",
-    "# OF NOTE: token_dictionary_file must be set to the gc-30M token dictionary if using a 30M series model\n",
-    "# (otherwise the EmbExtractor will use the current default model dictionary)\n",
     "embex = EmbExtractor(model_type=\"CellClassifier\",\n",
     "                     num_classes=3,\n",
     "                     filter_data={\"cell_type\":[\"Cardiomyocyte1\",\"Cardiomyocyte2\",\"Cardiomyocyte3\"]},\n",
@@ -28,13 +26,11 @@
     "                     emb_label=[\"disease\",\"cell_type\"],\n",
     "                     labels_to_plot=[\"disease\"],\n",
     "                     forward_batch_size=200,\n",
-    "                     nproc=16,\n",
-    "                     token_dictionary_file=\"./gene_dictionaries_30m/token_dictionary_gc30M.pkl\") # change from current default dictionary for 30M model series\n",
+    "                     nproc=16)\n",
     "\n",
     "# extracts embedding from input data\n",
-    "# input data is tokenized rank value encodings generated by Geneformer tokenizer (see tokenizing_scRNAseq_data.ipynb)\n",
-    "# example dataset for 30M model series: https://huggingface.co/datasets/ctheodoris/Genecorpus-30M/tree/main/example_input_files/cell_classification/disease_classification/human_dcm_hcm_nf.dataset\n",
-    "embs = embex.extract_embs(\"../fine_tuned_models/gf-6L-30M-i2048_CellClassifier_cardiomyopathies_220224\", # example 30M fine-tuned model\n",
+    "# example dataset: https://huggingface.co/datasets/ctheodoris/Genecorpus-30M/tree/main/example_input_files/cell_classification/disease_classification/human_dcm_hcm_nf.dataset\n",
+    "embs = embex.extract_embs(\"../fine_tuned_models/geneformer-6L-30M_CellClassifier_cardiomyopathies_220224\",\n",
     "                          \"path/to/input_data/\",\n",
     "                          \"path/to/output_directory/\",\n",
     "                          \"output_prefix\")\n"
@@ -132,7 +128,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.15"
+   "version": "3.10.11"
   }
  },
 "nbformat": 4,
examples/gene_classification.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
examples/hyperparam_optimiz_for_disease_classifier.py ADDED
@@ -0,0 +1,226 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+# hyperparameter optimization with raytune for disease classification
+
+# imports
+import os
+import subprocess
+GPU_NUMBER = [0,1,2,3]
+os.environ["CUDA_VISIBLE_DEVICES"] = ",".join([str(s) for s in GPU_NUMBER])
+os.environ["NCCL_DEBUG"] = "INFO"
+os.environ["CONDA_OVERRIDE_GLIBC"] = "2.56"
+os.environ["LD_LIBRARY_PATH"] = "/path/to/miniconda3/lib:/path/to/sw/lib:/path/to/sw/lib"
+
+# initiate runtime environment for raytune
+import pyarrow # must occur prior to ray import
+import ray
+from ray import tune
+from ray.tune import ExperimentAnalysis
+from ray.tune.suggest.hyperopt import HyperOptSearch
+ray.shutdown() #engage new ray session
+runtime_env = {"conda": "base",
+               "env_vars": {"LD_LIBRARY_PATH": "/path/to/miniconda3/lib:/path/to/sw/lib:/path/to/sw/lib"}}
+ray.init(runtime_env=runtime_env)
+
+def initialize_ray_with_check(ip_address):
+    """
+    Initialize Ray with a specified IP address and check its status and accessibility.
+
+    Args:
+    - ip_address (str): The IP address (with port) to initialize Ray.
+
+    Returns:
+    - bool: True if initialization was successful and dashboard is accessible, False otherwise.
+    """
+    try:
+        ray.init(address=ip_address)
+        print(ray.nodes())
+
+        services = ray.get_webui_url()
+        if not services:
+            raise RuntimeError("Ray dashboard is not accessible.")
+        else:
+            print(f"Ray dashboard is accessible at: {services}")
+            return True
+    except Exception as e:
+        print(f"Error initializing Ray: {e}")
+        return False
+
+# Usage:
+ip = 'your_ip:xxxx' # Replace with your actual IP address and port
+if initialize_ray_with_check(ip):
+    print("Ray initialized successfully.")
+else:
+    print("Error during Ray initialization.")
+
+import datetime
+import numpy as np
+import pandas as pd
+import random
+import seaborn as sns; sns.set()
+from collections import Counter
+from datasets import load_from_disk
+from scipy.stats import ranksums
+from sklearn.metrics import accuracy_score
+from transformers import BertForSequenceClassification
+from transformers import Trainer
+from transformers.training_args import TrainingArguments
+
+from geneformer import DataCollatorForCellClassification
+
+# number of CPU cores
+num_proc=30
+
+# load train dataset with columns:
+# cell_type (annotation of each cell's type)
+# disease (healthy or disease state)
+# individual (unique ID for each patient)
+# length (length of that cell's rank value encoding)
+train_dataset=load_from_disk("/path/to/disease_train_data.dataset")
+
+# filter dataset for given cell_type
+def if_cell_type(example):
+    return example["cell_type"].startswith("Cardiomyocyte")
+
+trainset_v2 = train_dataset.filter(if_cell_type, num_proc=num_proc)
+
+# create dictionary of disease states : label ids
+target_names = ["healthy", "disease1", "disease2"]
+target_name_id_dict = dict(zip(target_names,[i for i in range(len(target_names))]))
+
+trainset_v3 = trainset_v2.rename_column("disease","label")
+
+# change labels to numerical ids
+def classes_to_ids(example):
+    example["label"] = target_name_id_dict[example["label"]]
+    return example
+
+trainset_v4 = trainset_v3.map(classes_to_ids, num_proc=num_proc)
+
+# separate into train, validation, test sets
+indiv_set = set(trainset_v4["individual"])
+random.seed(42)
+train_indiv = random.sample(indiv_set,round(0.7*len(indiv_set)))
+eval_indiv = [indiv for indiv in indiv_set if indiv not in train_indiv]
+valid_indiv = random.sample(eval_indiv,round(0.5*len(eval_indiv)))
+test_indiv = [indiv for indiv in eval_indiv if indiv not in valid_indiv]
+
+def if_train(example):
+    return example["individual"] in train_indiv
+
+classifier_trainset = trainset_v4.filter(if_train,num_proc=num_proc).shuffle(seed=42)
+
+def if_valid(example):
+    return example["individual"] in valid_indiv
+
+classifier_validset = trainset_v4.filter(if_valid,num_proc=num_proc).shuffle(seed=42)
+
+# define output directory path
+current_date = datetime.datetime.now()
+datestamp = f"{str(current_date.year)[-2:]}{current_date.month:02d}{current_date.day:02d}"
+output_dir = f"/path/to/models/{datestamp}_geneformer_DiseaseClassifier/"
+
+# ensure not overwriting previously saved model
+saved_model_test = os.path.join(output_dir, f"pytorch_model.bin")
+if os.path.isfile(saved_model_test) == True:
+    raise Exception("Model already saved to this directory.")
+
+# make output directory
+subprocess.call(f'mkdir {output_dir}', shell=True)
+
+# set training parameters
+# how many pretrained layers to freeze
+freeze_layers = 2
+# batch size for training and eval
+geneformer_batch_size = 12
+# number of epochs
+epochs = 1
+# logging steps
+logging_steps = round(len(classifier_trainset)/geneformer_batch_size/10)
+
+# define function to initiate model
+def model_init():
+    model = BertForSequenceClassification.from_pretrained("/path/to/pretrained_model/",
+                                                          num_labels=len(target_names),
+                                                          output_attentions = False,
+                                                          output_hidden_states = False)
+    if freeze_layers is not None:
+        modules_to_freeze = model.bert.encoder.layer[:freeze_layers]
+        for module in modules_to_freeze:
+            for param in module.parameters():
+                param.requires_grad = False
+
+    model = model.to("cuda:0")
+    return model
+
+# define metrics
+# note: macro f1 score recommended for imbalanced multiclass classifiers
+def compute_metrics(pred):
+    labels = pred.label_ids
+    preds = pred.predictions.argmax(-1)
+    # calculate accuracy using sklearn's function
+    acc = accuracy_score(labels, preds)
+    return {
+      'accuracy': acc,
+    }
+
+# set training arguments
+training_args = {
+    "do_train": True,
+    "do_eval": True,
+    "evaluation_strategy": "steps",
+    "eval_steps": logging_steps,
+    "logging_steps": logging_steps,
+    "group_by_length": True,
+    "length_column_name": "length",
+    "disable_tqdm": True,
+    "skip_memory_metrics": True, # memory tracker causes errors in raytune
+    "per_device_train_batch_size": geneformer_batch_size,
+    "per_device_eval_batch_size": geneformer_batch_size,
+    "num_train_epochs": epochs,
+    "load_best_model_at_end": True,
+    "output_dir": output_dir,
+}
+
+training_args_init = TrainingArguments(**training_args)
+
+# create the trainer
+trainer = Trainer(
+    model_init=model_init,
+    args=training_args_init,
+    data_collator=DataCollatorForCellClassification(),
+    train_dataset=classifier_trainset,
+    eval_dataset=classifier_validset,
+    compute_metrics=compute_metrics,
+)
+
+# specify raytune hyperparameter search space
+ray_config = {
+    "num_train_epochs": tune.choice([epochs]),
+    "learning_rate": tune.loguniform(1e-6, 1e-3),
+    "weight_decay": tune.uniform(0.0, 0.3),
+    "lr_scheduler_type": tune.choice(["linear","cosine","polynomial"]),
+    "warmup_steps": tune.uniform(100, 2000),
+    "seed": tune.uniform(0,100),
+    "per_device_train_batch_size": tune.choice([geneformer_batch_size])
+}
+
+hyperopt_search = HyperOptSearch(
+    metric="eval_accuracy", mode="max")
+
+# optimize hyperparameters
+trainer.hyperparameter_search(
+    direction="maximize",
+    backend="ray",
+    resources_per_trial={"cpu":8,"gpu":1},
+    hp_space=lambda _: ray_config,
+    search_alg=hyperopt_search,
+    n_trials=100, # number of trials
+    progress_reporter=tune.CLIReporter(max_report_frequency=600,
+                                       sort_by_metric=True,
+                                       max_progress_rows=100,
+                                       mode="max",
+                                       metric="eval_accuracy",
+                                       metric_columns=["loss", "eval_loss", "eval_accuracy"])
+)
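
One portability note on the script above: random.sample() on a set (as in the train/validation split) was deprecated in Python 3.9 and removed in 3.11. On newer interpreters, an equivalent deterministic split requires converting to a sequence first, as in this sketch:

```python
# Sketch: set-to-sequence conversion needed for random.sample on Python 3.11+.
import random

indiv_set = {"patient1", "patient2", "patient3", "patient4"}  # placeholder IDs
random.seed(42)
train_indiv = random.sample(sorted(indiv_set), round(0.7 * len(indiv_set)))
```
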
examples/in_silico_perturbation.ipynb CHANGED
@@ -8,80 +8,35 @@
8
  "outputs": [],
9
  "source": [
10
  "from geneformer import InSilicoPerturber\n",
11
- "from geneformer import InSilicoPerturberStats\n",
12
- "from geneformer import EmbExtractor"
13
- ]
14
- },
15
- {
16
- "cell_type": "markdown",
17
- "id": "cbd6851c-060e-4967-b816-e605ffe58b23",
18
- "metadata": {
19
- "tags": []
20
- },
21
- "source": [
22
- "### in silico perturbation in deletion mode to determine genes whose deletion in the dilated cardiomyopathy (dcm) state significantly shifts the embedding towards non-failing (nf) state"
23
- ]
24
- },
25
- {
26
- "cell_type": "code",
27
- "execution_count": null,
28
- "id": "c53e98cd-c603-4878-82ba-db471181bb55",
29
- "metadata": {},
30
- "outputs": [],
31
- "source": [
32
- "# first obtain start, goal, and alt embedding positions\n",
33
- "# this function was changed to be separate from perturb_data\n",
34
- "# to avoid repeating calcuations when parallelizing perturb_data\n",
35
- "cell_states_to_model={\"state_key\": \"disease\", \n",
36
- " \"start_state\": \"dcm\", \n",
37
- " \"goal_state\": \"nf\", \n",
38
- " \"alt_states\": [\"hcm\"]}\n",
39
- "\n",
40
- "filter_data_dict={\"cell_type\":[\"Cardiomyocyte1\",\"Cardiomyocyte2\",\"Cardiomyocyte3\"]}\n",
41
- "\n",
42
- "# OF NOTE: token_dictionary_file must be set to the gc-30M token dictionary if using a 30M series model\n",
43
- "# (otherwise the EmbExtractor will use the current default model dictionary)\n",
44
- "# 30M token dictionary: https://huggingface.co/ctheodoris/Geneformer/blob/main/geneformer/gene_dictionaries_30m/token_dictionary_gc30M.pkl\n",
45
- "embex = EmbExtractor(model_type=\"CellClassifier\", # if using previously fine-tuned cell classifier model\n",
46
- " num_classes=3,\n",
47
- " filter_data=filter_data_dict,\n",
48
- " max_ncells=1000,\n",
49
- " emb_layer=0,\n",
50
- " summary_stat=\"exact_mean\",\n",
51
- " forward_batch_size=256,\n",
52
- " nproc=16)\n",
53
- "\n",
54
- "state_embs_dict = embex.get_state_embs(cell_states_to_model,\n",
55
- " \"../fine_tuned_models/gf-6L-30M-i2048_CellClassifier_cardiomyopathies_220224\", # example 30M fine-tuned model\n",
56
- " \"path/to/input_data\",\n",
57
- " \"path/to/output_directory\",\n",
58
- " \"output_prefix\")"
59
  ]
60
  },
61
  {
62
  "cell_type": "code",
63
  "execution_count": null,
64
- "id": "981e1190-62da-4543-b7d3-6e2a2d6a6d56",
65
  "metadata": {
66
  "tags": []
67
  },
68
  "outputs": [],
69
  "source": [
70
- "# OF NOTE: token_dictionary_file must be set to the gc-30M token dictionary if using a 30M series model\n",
71
- "# (otherwise the InSilicoPerturber will use the current default model dictionary)\n",
72
- "# 30M token dictionary: https://huggingface.co/ctheodoris/Geneformer/blob/main/geneformer/gene_dictionaries_30m/token_dictionary_gc30M.pkl\n",
73
  "isp = InSilicoPerturber(perturb_type=\"delete\",\n",
74
  " perturb_rank_shift=None,\n",
75
  " genes_to_perturb=\"all\",\n",
76
  " combos=0,\n",
77
  " anchor_gene=None,\n",
78
- " model_type=\"CellClassifier\", # if using previously fine-tuned cell classifier model\n",
79
  " num_classes=3,\n",
80
  " emb_mode=\"cell\",\n",
81
  " cell_emb_style=\"mean_pool\",\n",
82
- " filter_data=filter_data_dict,\n",
83
- " cell_states_to_model=cell_states_to_model,\n",
84
- " state_embs_dict=state_embs_dict,\n",
 
 
85
  " max_ncells=2000,\n",
86
  " emb_layer=0,\n",
87
  " forward_batch_size=400,\n",
@@ -96,10 +51,9 @@
96
  "outputs": [],
97
  "source": [
98
  "# outputs intermediate files from in silico perturbation\n",
99
- "\n",
100
- "isp.perturb_data(\"../fine_tuned_models/gf-6L-30M-i2048_CellClassifier_cardiomyopathies_220224\", # example 30M fine-tuned model\n",
101
  " \"path/to/input_data\",\n",
102
- " \"path/to/isp_output_directory\",\n",
103
  " \"output_prefix\")"
104
  ]
105
  },
@@ -110,14 +64,11 @@
110
  "metadata": {},
111
  "outputs": [],
112
  "source": [
113
- "# OF NOTE: token_dictionary_file must be set to the gc-30M token dictionary if using a 30M series model\n",
114
- "# (otherwise the InSilicoPerturberStats will use the current default model dictionary)\n",
115
- "# 30M token dictionary: https://huggingface.co/ctheodoris/Geneformer/blob/main/geneformer/gene_dictionaries_30m/token_dictionary_gc30M.pkl\n",
116
  "ispstats = InSilicoPerturberStats(mode=\"goal_state_shift\",\n",
117
  " genes_perturbed=\"all\",\n",
118
  " combos=0,\n",
119
  " anchor_gene=None,\n",
120
- " cell_states_to_model=cell_states_to_model)"
121
  ]
122
  },
123
  {
@@ -128,9 +79,9 @@
128
  "outputs": [],
129
  "source": [
130
  "# extracts data from intermediate files and processes stats to output in final .csv\n",
131
- "ispstats.get_stats(\"path/to/isp_output_directory\", # this should be the directory \n",
132
  " None,\n",
133
- " \"path/to/isp_stats_output_directory\",\n",
134
  " \"output_prefix\")"
135
  ]
136
  }
@@ -151,7 +102,7 @@
151
  "name": "python",
152
  "nbconvert_exporter": "python",
153
  "pygments_lexer": "ipython3",
154
- "version": "3.10.15"
155
  }
156
  },
157
  "nbformat": 4,
 
8
  "outputs": [],
9
  "source": [
10
  "from geneformer import InSilicoPerturber\n",
11
+ "from geneformer import InSilicoPerturberStats"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  ]
13
  },
14
  {
15
  "cell_type": "code",
16
  "execution_count": null,
17
+ "id": "67b44366-f255-4415-a865-6a27a8ffcce7",
18
  "metadata": {
19
  "tags": []
20
  },
21
  "outputs": [],
22
  "source": [
23
+ "# in silico perturbation in deletion mode to determine genes whose \n",
24
+ "# deletion in the dilated cardiomyopathy (dcm) state significantly shifts\n",
25
+ "# the embedding towards non-failing (nf) state\n",
26
  "isp = InSilicoPerturber(perturb_type=\"delete\",\n",
27
  " perturb_rank_shift=None,\n",
28
  " genes_to_perturb=\"all\",\n",
29
  " combos=0,\n",
30
  " anchor_gene=None,\n",
31
+ " model_type=\"CellClassifier\",\n",
32
  " num_classes=3,\n",
33
  " emb_mode=\"cell\",\n",
34
  " cell_emb_style=\"mean_pool\",\n",
35
+ " filter_data={\"cell_type\":[\"Cardiomyocyte1\",\"Cardiomyocyte2\",\"Cardiomyocyte3\"]},\n",
36
+ " cell_states_to_model={'state_key': 'disease', \n",
37
+ " 'start_state': 'dcm', \n",
38
+ " 'goal_state': 'nf', \n",
39
+ " 'alt_states': ['hcm']},\n",
40
  " max_ncells=2000,\n",
41
  " emb_layer=0,\n",
42
  " forward_batch_size=400,\n",
 
51
  "outputs": [],
52
  "source": [
53
  "# outputs intermediate files from in silico perturbation\n",
54
+ "isp.perturb_data(\"path/to/model\",\n",
 
55
  " \"path/to/input_data\",\n",
56
+ " \"path/to/output_directory\",\n",
57
  " \"output_prefix\")"
58
  ]
59
  },
 
64
  "metadata": {},
65
  "outputs": [],
66
  "source": [
67
  "ispstats = InSilicoPerturberStats(mode=\"goal_state_shift\",\n",
68
  " genes_perturbed=\"all\",\n",
69
  " combos=0,\n",
70
  " anchor_gene=None,\n",
71
+ " cell_states_to_model={\"disease\":([\"dcm\"],[\"nf\"],[\"hcm\"])})"
72
  ]
73
  },
74
  {
 
79
  "outputs": [],
80
  "source": [
81
  "# extracts data from intermediate files and processes stats to output in final .csv\n",
82
+ "ispstats.get_stats(\"path/to/input_data\",\n",
83
  " None,\n",
84
+ " \"path/to/output_directory\",\n",
85
  " \"output_prefix\")"
86
  ]
87
  }
 
102
  "name": "python",
103
  "nbconvert_exporter": "python",
104
  "pygments_lexer": "ipython3",
105
+ "version": "3.10.11"
106
  }
107
  },
108
  "nbformat": 4,
examples/multitask_cell_classification.ipynb DELETED
@@ -1,420 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "markdown",
5
- "id": "866f100c-e11a-4e7b-a37c-831775d845a7",
6
- "metadata": {},
7
- "source": [
8
- "# Geneformer Multi-Task Cell Classifier Tutorial\n",
9
- "\n",
10
- "This tutorial demonstrates how to use the Geneformer Multi-Task Cell Classifier and optimizatize hyperparameter for fine-tuning"
11
- ]
12
- },
13
- {
14
- "cell_type": "markdown",
15
- "id": "311ba456-b44d-40c7-941d-3fc03bcda85a",
16
- "metadata": {},
17
- "source": [
18
- "## 1. Installation and Imports\n",
19
- "\n",
20
- "First import the necessary modules."
21
- ]
22
- },
23
- {
24
- "cell_type": "code",
25
- "execution_count": 3,
26
- "id": "cd9defdc-0524-4c3b-a741-27117ed3a5be",
27
- "metadata": {},
28
- "outputs": [],
29
- "source": [
30
- "from geneformer import MTLClassifier"
31
- ]
32
- },
33
- {
34
- "cell_type": "markdown",
35
- "id": "790e9c3c-f6d9-44b3-b9a5-05725760f4fd",
36
- "metadata": {},
37
- "source": [
38
- "## 2. Set up Paths and Parameters\n",
39
- "\n",
40
- "Now, let's set up the necessary paths and parameters for our classifier. We'll also define our task columns, which are specific columns from our dataset that represent the classification tasks we want to train the model on."
41
- ]
42
- },
43
- {
44
- "cell_type": "code",
45
- "execution_count": null,
46
- "id": "04a04197-8e45-47f8-a86f-202209ea10ae",
47
- "metadata": {},
48
- "outputs": [],
49
- "source": [
50
- "# Define paths\n",
51
- "pretrained_path = \"/path/to/pretrained/Geneformer/model\" \n",
52
- "# input data is tokenized rank value encodings generated by Geneformer tokenizer (see tokenizing_scRNAseq_data.ipynb)\n",
53
- "train_path = \"/path/to/train/data.dataset\"\n",
54
- "val_path = \"/path/to/val/data.dataset\"\n",
55
- "test_path = \"/path/to/test/data.dataset\"\n",
56
- "results_dir = \"/path/to/results/directory\"\n",
57
- "model_save_path = \"/path/to/model/save/path\"\n",
58
- "tensorboard_log_dir = \"/path/to/tensorboard/log/dir\"\n",
59
- "\n",
60
- "# Define tasks and hyperparameters\n",
61
- "# task_columns should be a list of column names from your dataset\n",
62
- "# Each column represents a specific classification task (e.g. cell type, disease state)\n",
63
- "task_columns = [\"cell_type\", \"disease_state\"] # Example task columns\n",
64
- "\n",
65
- "hyperparameters = {\n",
66
- " \"learning_rate\": {\"type\": \"float\", \"low\": 1e-5, \"high\": 1e-3, \"log\": True},\n",
67
- " \"warmup_ratio\": {\"type\": \"float\", \"low\": 0.005, \"high\": 0.01},\n",
68
- " \"weight_decay\": {\"type\": \"float\", \"low\": 0.01, \"high\": 0.1},\n",
69
- " \"dropout_rate\": {\"type\": \"float\", \"low\": 0.0, \"high\": 0.7},\n",
70
- " \"lr_scheduler_type\": {\"type\": \"categorical\", \"choices\": [\"cosine\"]},\n",
71
- " \"task_weights\": {\"type\": \"float\", \"low\": 0.1, \"high\": 2.0}\n",
72
- "}"
73
- ]
74
- },
75
- {
76
- "cell_type": "markdown",
77
- "id": "31857690-a739-435a-aefd-f171fafc1b78",
78
- "metadata": {},
79
- "source": [
80
- "In the code above, we've defined `task_columns` as `[\"cell_type\", \"disease_state\"]`. This means our model will be trained to classify cells based on two tasks:\n",
81
- "1. Identifying the cell type\n",
82
- "2. Determining the disease state\n",
83
- "3. Note: \"unique_cell_id\" is a required column in the dataset for logging and inference purposes\n",
84
- "\n",
85
- "These column names should correspond to actual columns in your dataset. Each column should contain the labels for that specific classification task.\n",
86
- "\n",
87
- "For example, your dataset might look something like this:\n",
88
- "\n",
89
- " | unique_cell_id | input_ids | ... | cell_type | disease_state |\n",
90
- " |----------------|-----------|-----|-----------|---------------|\n",
91
- " | cell1 | ... | ... | neuron | healthy |\n",
92
- " | cell2 | ... | ... | astrocyte | diseased |\n",
93
- " | ... | ... | ... | ... | ... |\n",
94
- "The model will learn to predict classes within 'cell_type' and 'disease_state' "
95
- ]
96
- },
97
- {
98
- "cell_type": "markdown",
99
- "id": "b9e3050a-6162-4c01-b6fd-8784bf4ab1e4",
100
- "metadata": {},
101
- "source": [
102
- "## 3. Initialize the MTLClassifier\n",
103
- "\n",
104
- "Now, let's create an instance of the MTLClassifier with our defined parameters and task columns."
105
- ]
106
- },
107
- {
108
- "cell_type": "code",
109
- "execution_count": null,
110
- "id": "e27caac9-670c-409d-9313-50201c665cb9",
111
- "metadata": {},
112
- "outputs": [],
113
- "source": [
114
- "mc = MTLClassifier(\n",
115
- " task_columns=task_columns, # Our defined classification tasks\n",
116
- " study_name=\"MTLClassifier_example\",\n",
117
- " pretrained_path=pretrained_path,\n",
118
- " train_path=train_path,\n",
119
- " val_path=val_path,\n",
120
- " test_path=test_path,\n",
121
- " model_save_path=model_save_path,\n",
122
- " results_dir=results_dir,\n",
123
- " tensorboard_log_dir=tensorboard_log_dir,\n",
124
- " hyperparameters=hyperparameters,\n",
125
- " n_trials=15, # Number of trials for hyperparameter optimization (at least 50 suggested)\n",
126
- " epochs=1, # Number of training epochs (1 suggested to prevent overfitting)\n",
127
- " batch_size=8, # Adjust based on available GPU memory\n",
128
- " seed=42\n",
129
- ")"
130
- ]
131
- },
132
- {
133
- "cell_type": "markdown",
134
- "id": "0d729444-e3ad-4584-9659-0c464ac97462",
135
- "metadata": {},
136
- "source": [
137
- "## 4. Run Hyperparameter Optimization\n",
138
- "\n",
139
- "Now, let's run the Optuna study to optimize our hyperparameters for both classification tasks."
140
- ]
141
- },
142
- {
143
- "cell_type": "code",
144
- "execution_count": null,
145
- "id": "9298aa3e-6a52-4aa8-b9ff-b63d97beac93",
146
- "metadata": {},
147
- "outputs": [],
148
- "source": [
149
- "mc.run_optuna_study()"
150
- ]
151
- },
152
- {
153
- "cell_type": "markdown",
154
- "id": "af23075d-d07b-43d3-bc5d-4df4d5d7199b",
155
- "metadata": {},
156
- "source": [
157
- "## 5. Evaluate the Model on Test Data\n",
158
- "\n",
159
- "After optimization, we can evaluate our model on the test dataset. This will provide performance metrics for both classification tasks. CSV containing following keys will be generated in specified results directiory \"Cell ID, task(1...n) True,task(1.,.n) Pred,task(1...n) Probabilities\""
160
- ]
161
- },
162
- {
163
- "cell_type": "code",
164
- "execution_count": null,
165
- "id": "461bf8d3-b964-4ff4-994f-9f3d313d4614",
166
- "metadata": {},
167
- "outputs": [],
168
- "source": [
169
- "mc.load_and_evaluate_test_model()"
170
- ]
171
- },
172
- {
173
- "cell_type": "markdown",
174
- "id": "31cfeb2d-6673-4b02-a79c-2533cc5e4d28",
175
- "metadata": {},
176
- "source": [
177
- "## 6. (Optional) Manual Hyperparameter Tuning\n",
178
- "\n",
179
- "If you prefer to set hyperparameters manually, you can use the following approach:"
180
- ]
181
- },
182
- {
183
- "cell_type": "code",
184
- "execution_count": null,
185
- "id": "8ee6b99f-42e9-4abf-a292-aa9047735e0e",
186
- "metadata": {},
187
- "outputs": [],
188
- "source": [
189
- "manual_hyperparameters = {\n",
190
- " \"learning_rate\": 0.001,\n",
191
- " \"warmup_ratio\": 0.01,\n",
192
- " \"weight_decay\": 0.1,\n",
193
- " \"dropout_rate\": 0.1,\n",
194
- " \"lr_scheduler_type\": \"cosine\",\n",
195
- " \"task_weights\": [1, 1], # Weights for each task (cell_type, disease_state)\n",
196
- " \"max_layers_to_freeze\": 2\n",
197
- "}\n",
198
- "\n",
199
- "mc_manual = MTLClassifier(\n",
200
- " task_columns=task_columns,\n",
201
- " study_name=\"mtl_manual\",\n",
202
- " pretrained_path=pretrained_path,\n",
203
- " train_path=train_path,\n",
204
- " val_path=val_path,\n",
205
- " test_path=test_path,\n",
206
- " model_save_path=model_save_path,\n",
207
- " results_dir=results_dir,\n",
208
- " tensorboard_log_dir=tensorboard_log_dir,\n",
209
- " manual_hyperparameters=manual_hyperparameters,\n",
210
- " use_manual_hyperparameters=True,\n",
211
- " epochs=10,\n",
212
- " batch_size=32,\n",
213
- " seed=42\n",
214
- ")\n",
215
- "\n",
216
- "mc_manual.run_manual_tuning()"
217
- ]
218
- },
219
- {
220
- "cell_type": "markdown",
221
- "id": "dbaac008-fc00-4b71-8e78-89b2d922d9d8",
222
- "metadata": {},
223
- "source": [
224
- "# Geneformer In Silico Perturber Tutorial (MTL Quantized)\n",
225
- "This demonstrates how to use the Geneformer In Silico Perturber with a Multi-Task Learning (MTL) model in a quantized configuration to optimize runtime and memory."
226
- ]
227
- },
228
- {
229
- "cell_type": "code",
230
- "execution_count": null,
231
- "id": "2e15ad57-736c-48f0-be87-39cf5015bc5c",
232
- "metadata": {},
233
- "outputs": [],
234
- "source": [
235
- "from geneformer import InSilicoPerturber, EmbExtractor, InSilicoPerturberStats"
236
- ]
237
- },
238
- {
239
- "cell_type": "code",
240
- "execution_count": null,
241
- "id": "43c18140-151e-4d44-95b4-a9b3a47172cf",
242
- "metadata": {},
243
- "outputs": [],
244
- "source": [
245
- "# Define paths\n",
246
- "model_directory = \"/path/to/model/save/path\"\n",
247
- "input_data_file = \"/path/to/input/data.dataset\"\n",
248
- "output_directory = \"/path/to/output/directory\"\n",
249
- "output_prefix = \"mtl_quantized_perturbation\"\n",
250
- "\n",
251
- "# Define parameters\n",
252
- "perturb_type = \"delete\" # or \"overexpress\"\n",
253
- "\n",
254
- "# Define cell states to model\n",
255
- "cell_states_to_model = {\n",
256
- " \"state_key\": \"disease_state\", \n",
257
- " \"start_state\": \"disease\", \n",
258
- " \"goal_state\": \"control\"\n",
259
- "}\n",
260
- "\n",
261
- "# Define filter data\n",
262
- "filter_data_dict = {\n",
263
- " \"cell_type\": [\"Fibroblast\"]\n",
264
- "}"
265
- ]
266
- },
267
- {
268
- "cell_type": "markdown",
269
- "id": "3010d0bf-b23c-45c1-ac12-8c472dc8b7a1",
270
- "metadata": {},
271
- "source": [
272
- "## 3. Extract State Embeddings\n",
273
- "\n",
274
- "Before we initialize the InSilicoPerturber, we need to extract the state embeddings using the EmbExtractor."
275
- ]
276
- },
277
- {
278
- "cell_type": "code",
279
- "execution_count": null,
280
- "id": "215f0a90-8041-417d-a5d3-b2483626c3b2",
281
- "metadata": {},
282
- "outputs": [],
283
- "source": [
284
- "# Initialize EmbExtractor\n",
285
- "embex = EmbExtractor(\n",
286
- " filter_data_dict=filter_data_dict,\n",
287
- " max_ncells=1000, # Number of cells to extract embeddings for\n",
288
- " emb_layer=0, # Use the second to last layer\n",
289
- " emb_mode = \"cls\",\n",
290
- " summary_stat=\"exact_mean\",\n",
291
- " forward_batch_size=8, # Adjust based on available GPU memory\n",
292
- " nproc=4\n",
293
- ")\n",
294
- "\n",
295
- "# Extract state embeddings\n",
296
- "state_embs_dict = embex.get_state_embs(\n",
297
- " cell_states_to_model,\n",
298
- " model_directory=model_directory,\n",
299
- " input_data_file=input_data_file,\n",
300
- " output_directory=output_directory,\n",
301
- " output_prefix=output_prefix\n",
302
- ")"
303
- ]
304
- },
305
- {
306
- "cell_type": "markdown",
307
- "id": "23f14e36-4529-4fb2-8af9-7f4875cf81e3",
308
- "metadata": {},
309
- "source": [
310
- "## 4. Initialize the InSilicoPerturber\n",
311
- "\n",
312
- "Now that we have our state embeddings, let's create an instance of the InSilicoPerturber with MTL and quantized configurations."
313
- ]
314
- },
315
- {
316
- "cell_type": "code",
317
- "execution_count": null,
318
- "id": "09f985a1-91bc-4e8d-8001-a3663531b570",
319
- "metadata": {},
320
- "outputs": [],
321
- "source": [
322
- "# Initialize InSilicoPerturber\n",
323
- "isp = InSilicoPerturber(\n",
324
- " perturb_type=perturb_type,\n",
325
- " genes_to_perturb=\"all\", # Perturb all genes\n",
326
- " model_type=\"MTLCellClassifier-Quantized\", # Use quantized MTL model\n",
327
- " emb_mode=\"cls\", # Use CLS token embedding\n",
328
- " cell_states_to_model=cell_states_to_model,\n",
329
- " state_embs_dict=state_embs_dict,\n",
330
- " max_ncells=1000, # Number of cells to perturb (larger number increases power)\n",
331
- " emb_layer=0, \n",
332
- " forward_batch_size=8, # Adjust based on available GPU memory\n",
333
- " nproc=1\n",
334
- ")"
335
- ]
336
- },
337
- {
338
- "cell_type": "markdown",
339
- "id": "cfcc2c1e-fd7f-4a36-99fc-ac7f43e5be6b",
340
- "metadata": {},
341
- "source": [
342
- "## 5. Run In Silico Perturbation\n",
343
- "\n",
344
- "Run the in silico perturbation on the dataset."
345
- ]
346
- },
347
- {
348
- "cell_type": "code",
349
- "execution_count": null,
350
- "id": "cf030c09-8ae4-45a7-aaf7-3fc2af4fe296",
351
- "metadata": {},
352
- "outputs": [],
353
- "source": [
354
- "# Run perturbation and output intermediate files\n",
355
- "isp.perturb_data(\n",
356
- " model_directory=model_directory,\n",
357
- " input_data_file=input_data_file,\n",
358
- " output_directory=output_directory,\n",
359
- " output_prefix=output_prefix\n",
360
- ")"
361
- ]
362
- },
363
- {
364
- "cell_type": "markdown",
365
- "id": "bb8ec074-6f2f-422b-a973-37ed32a15c38",
366
- "metadata": {},
367
- "source": [
368
- "## 6. Process Results with InSilicoPerturberStats\n",
369
- "\n",
370
- "After running the perturbation, we'll use InSilicoPerturberStats to process the intermediate files and generate the final statistics."
371
- ]
372
- },
373
- {
374
- "cell_type": "code",
375
- "execution_count": null,
376
- "id": "0a748043-43fc-47ad-ace5-f0ae3dd34674",
377
- "metadata": {},
378
- "outputs": [],
379
- "source": [
380
- "# Initialize InSilicoPerturberStats\n",
381
- "ispstats = InSilicoPerturberStats(\n",
382
- " mode=\"goal_state_shift\",\n",
383
- " genes_perturbed=\"all\",\n",
384
- " combos=0,\n",
385
- " anchor_gene=None,\n",
386
- " cell_states_to_model=cell_states_to_model\n",
387
- ")\n",
388
- "\n",
389
- "# Process stats and output final .csv\n",
390
- "ispstats.get_stats(\n",
391
- " input_data_file,\n",
392
- " None,\n",
393
- " output_directory,\n",
394
- " output_prefix\n",
395
- ")"
396
- ]
397
- }
398
- ],
399
- "metadata": {
400
- "kernelspec": {
401
- "display_name": "Python 3 (ipykernel)",
402
- "language": "python",
403
- "name": "python3"
404
- },
405
- "language_info": {
406
- "codemirror_mode": {
407
- "name": "ipython",
408
- "version": 3
409
- },
410
- "file_extension": ".py",
411
- "mimetype": "text/x-python",
412
- "name": "python",
413
- "nbconvert_exporter": "python",
414
- "pygments_lexer": "ipython3",
415
- "version": "3.11.5"
416
- }
417
- },
418
- "nbformat": 4,
419
- "nbformat_minor": 5
420
- }
 
examples/pretraining_new_model/pretrain_geneformer_w_deepspeed.py CHANGED
@@ -138,9 +138,7 @@ training_args = {
138
  "per_device_train_batch_size": geneformer_batch_size,
139
  "num_train_epochs": epochs,
140
  "save_strategy": "steps",
141
- "save_steps": np.floor(
142
- num_examples / geneformer_batch_size / 8
143
- ), # 8 saves per epoch
144
  "logging_steps": 1000,
145
  "output_dir": training_output_dir,
146
  "logging_dir": logging_dir,
 
138
  "per_device_train_batch_size": geneformer_batch_size,
139
  "num_train_epochs": epochs,
140
  "save_strategy": "steps",
141
+ "save_steps": np.floor(num_examples / geneformer_batch_size / 8), # 8 saves per epoch
142
  "logging_steps": 1000,
143
  "output_dir": training_output_dir,
144
  "logging_dir": logging_dir,
examples/tokenizing_scRNAseq_data.ipynb CHANGED
@@ -7,39 +7,23 @@
7
  "tags": []
8
  },
9
  "source": [
10
- "## Tokenizing .loom or .h5ad single cell RNA-seq data to rank value encoding .dataset format"
11
  ]
12
  },
13
  {
14
  "cell_type": "markdown",
15
- "id": "1fe86f48-5578-47df-b373-58c21ec170ab",
16
  "metadata": {},
17
  "source": [
18
- "#### Input data is a directory with .loom or .h5ad files containing raw counts from single cell RNAseq data, including all genes detected in the transcriptome without feature selection. The input file type is specified by the argument file_format in the tokenize_data function.\n",
19
  "\n",
20
- "#### The discussion below references the .loom file format, but the analagous labels are required for .h5ad files, just that they will be column instead of row attributes and vice versa due to the transposed format of the two file types.\n",
21
- "\n",
22
- "#### Genes should be labeled with Ensembl IDs (loom row attribute \"ensembl_id\"), which provide a unique identifer for conversion to tokens. Other forms of gene annotations (e.g. gene names) can be converted to Ensembl IDs via Ensembl Biomart. Cells should be labeled with the total read count in the cell (loom column attribute \"n_counts\") to be used for normalization.\n",
23
  "\n",
24
  "#### No cell metadata is required, but custom cell attributes may be passed onto the tokenized dataset by providing a dictionary of custom attributes to be added, which is formatted as loom_col_attr_name : desired_dataset_col_attr_name. For example, if the original .loom dataset has column attributes \"cell_type\" and \"organ_major\" and one would like to retain these attributes as labels in the tokenized dataset with the new names \"cell_type\" and \"organ\", respectively, the following custom attribute dictionary should be provided: {\"cell_type\": \"cell_type\", \"organ_major\": \"organ\"}. \n",
25
  "\n",
26
  "#### Additionally, if the original .loom file contains a cell column attribute called \"filter_pass\", this column will be used as a binary indicator of whether to include these cells in the tokenized data. All cells with \"1\" in this attribute will be tokenized, whereas the others will be excluded. One may use this column to indicate QC filtering or other criteria for selection for inclusion in the final tokenized dataset.\n",
27
  "\n",
28
- "#### If one's data is in other formats besides .loom or .h5ad, one can use the relevant tools (such as Anndata tools) to convert the file to a .loom or .h5ad format prior to running the transcriptome tokenizer."
29
- ]
30
- },
31
- {
32
- "cell_type": "markdown",
33
- "id": "32c69493-4e5a-4b07-8dc1-958ff2ee7d0b",
34
- "metadata": {},
35
- "source": [
36
- "**********************************************************************************************************\n",
37
- "#### OF NOTE: PLEASE ENSURE THE CORRECT TOKEN DICTIONARY AND GENE MEDIAN FILE IS USED FOR THE CORRECT MODEL.\n",
38
- "#### 95M: current defaults; 30M: https://huggingface.co/ctheodoris/Geneformer/tree/main/geneformer/gene_dictionaries_30m\n",
39
- "\n",
40
- "#### ADDITIONALLY:\n",
41
- "#### The 95M model series require the special_token argument to be set to True and model_input_size to be 4096. (current defaults)\n",
42
- "#### The 30M model series require the special_token argument to be set to False and the model_input_size to be 2048."
43
  ]
44
  },
45
  {
@@ -59,11 +43,8 @@
59
  "metadata": {},
60
  "outputs": [],
61
  "source": [
62
- "tk = TranscriptomeTokenizer({\"cell_type\": \"cell_type\", \"organ_major\": \"organ\"}, nproc=16)\n",
63
- "tk.tokenize_data(\"loom_data_directory\", \n",
64
- " \"output_directory\", \n",
65
- " \"output_prefix\", \n",
66
- " file_format=\"loom\")"
67
  ]
68
  }
69
  ],
@@ -83,7 +64,7 @@
83
  "name": "python",
84
  "nbconvert_exporter": "python",
85
  "pygments_lexer": "ipython3",
86
- "version": "3.10.15"
87
  }
88
  },
89
  "nbformat": 4,
 
7
  "tags": []
8
  },
9
  "source": [
10
+ "## Tokenizing .loom single cell RNA-seq data to rank value encoding .dataset format"
11
  ]
12
  },
13
  {
14
  "cell_type": "markdown",
15
+ "id": "350e6252-b783-494b-9767-f087eb868a15",
16
  "metadata": {},
17
  "source": [
18
+ "#### Input data is a directory with .loom files containing raw counts from single cell RNAseq data, including all genes detected in the transcriptome without feature selection. \n",
19
  "\n",
20
+ "#### Genes should be labeled with Ensembl IDs (row attribute \"ensembl_id\"), which provide a unique identifer for conversion to tokens. Other forms of gene annotations (e.g. gene names) can be converted to Ensembl IDs via Ensembl Biomart. Cells should be labeled with the total read count in the cell (column attribute \"n_counts\") to be used for normalization.\n",
21
  "\n",
22
  "#### No cell metadata is required, but custom cell attributes may be passed onto the tokenized dataset by providing a dictionary of custom attributes to be added, which is formatted as loom_col_attr_name : desired_dataset_col_attr_name. For example, if the original .loom dataset has column attributes \"cell_type\" and \"organ_major\" and one would like to retain these attributes as labels in the tokenized dataset with the new names \"cell_type\" and \"organ\", respectively, the following custom attribute dictionary should be provided: {\"cell_type\": \"cell_type\", \"organ_major\": \"organ\"}. \n",
23
  "\n",
24
  "#### Additionally, if the original .loom file contains a cell column attribute called \"filter_pass\", this column will be used as a binary indicator of whether to include these cells in the tokenized data. All cells with \"1\" in this attribute will be tokenized, whereas the others will be excluded. One may use this column to indicate QC filtering or other criteria for selection for inclusion in the final tokenized dataset.\n",
25
  "\n",
26
+ "#### If one's data is in other formats besides .loom, one can use the relevant tools (such as Anndata tools) to convert the file to a .loom format prior to running the transcriptome tokenizer."
27
  ]
28
  },
29
  {
 
43
  "metadata": {},
44
  "outputs": [],
45
  "source": [
46
+ "tk = TranscriptomeTokenizer({\"cell_type\": \"cell_type\", \"organ_major\": \"organ_major\"}, nproc=4)\n",
47
+ "tk.tokenize_data(\"loom_data_directory\", \"output_directory\", \"output_prefix\")"
48
  ]
49
  }
50
  ],
 
64
  "name": "python",
65
  "nbconvert_exporter": "python",
66
  "pygments_lexer": "ipython3",
67
+ "version": "3.10.11"
68
  }
69
  },
70
  "nbformat": 4,
fine_tuned_models/{gf-6L-30M-i2048_CellClassifier_cardiomyopathies_220224 → geneformer-6L-30M_CellClassifier_cardiomyopathies_220224}/config.json RENAMED
File without changes
fine_tuned_models/{gf-6L-30M-i2048_CellClassifier_cardiomyopathies_220224 → geneformer-6L-30M_CellClassifier_cardiomyopathies_220224}/optimizer.pt RENAMED
File without changes
fine_tuned_models/{gf-6L-30M-i2048_CellClassifier_cardiomyopathies_220224 → geneformer-6L-30M_CellClassifier_cardiomyopathies_220224}/pytorch_model.bin RENAMED
File without changes
fine_tuned_models/{gf-6L-30M-i2048_CellClassifier_cardiomyopathies_220224 → geneformer-6L-30M_CellClassifier_cardiomyopathies_220224}/rng_state.pth RENAMED
File without changes
fine_tuned_models/{gf-6L-30M-i2048_CellClassifier_cardiomyopathies_220224 → geneformer-6L-30M_CellClassifier_cardiomyopathies_220224}/scheduler.pt RENAMED
File without changes
fine_tuned_models/{gf-6L-30M-i2048_CellClassifier_cardiomyopathies_220224 → geneformer-6L-30M_CellClassifier_cardiomyopathies_220224}/trainer_state.json RENAMED
File without changes
fine_tuned_models/{gf-6L-30M-i2048_CellClassifier_cardiomyopathies_220224 → geneformer-6L-30M_CellClassifier_cardiomyopathies_220224}/training_args.bin RENAMED
File without changes
fine_tuned_models/gf-12L-95M-i4096_MTLCellClassifier_CELLxGENE_240522/config.json DELETED
@@ -1,24 +0,0 @@
1
- {
2
- "architectures": [
3
- "BertForMaskedLM"
4
- ],
5
- "attention_probs_dropout_prob": 0.02,
6
- "classifier_dropout": null,
7
- "hidden_act": "relu",
8
- "hidden_dropout_prob": 0.02,
9
- "hidden_size": 512,
10
- "initializer_range": 0.02,
11
- "intermediate_size": 1024,
12
- "layer_norm_eps": 1e-12,
13
- "max_position_embeddings": 4096,
14
- "model_type": "bert",
15
- "num_attention_heads": 8,
16
- "num_hidden_layers": 12,
17
- "pad_token_id": 0,
18
- "position_embedding_type": "absolute",
19
- "torch_dtype": "float32",
20
- "transformers_version": "4.37.2",
21
- "type_vocab_size": 2,
22
- "use_cache": true,
23
- "vocab_size": 20275
24
- }
fine_tuned_models/gf-12L-95M-i4096_MTLCellClassifier_CELLxGENE_240522/pytorch_model.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:07b28d8c7bb789d59755c42d32f6182cc04d2cf34aafaa6397aa50e4fdf1a9b4
3
- size 152363342
{gf-12L-30M-i2048 → geneformer-12L-30M}/config.json RENAMED
File without changes
{gf-12L-30M-i2048 → geneformer-12L-30M}/pytorch_model.bin RENAMED
File without changes
{gf-12L-30M-i2048 → geneformer-12L-30M}/training_args.bin RENAMED
File without changes
geneformer/__init__.py CHANGED
@@ -1,34 +1,12 @@
1
- # ruff: noqa: F401
2
- import warnings
3
- from pathlib import Path
4
-
5
- warnings.filterwarnings("ignore", message=".*The 'nopython' keyword.*") # noqa # isort:skip
6
-
7
- GENE_MEDIAN_FILE = Path(__file__).parent / "gene_median_dictionary_gc95M.pkl"
8
- TOKEN_DICTIONARY_FILE = Path(__file__).parent / "token_dictionary_gc95M.pkl"
9
- ENSEMBL_DICTIONARY_FILE = Path(__file__).parent / "gene_name_id_dict_gc95M.pkl"
10
- ENSEMBL_MAPPING_FILE = Path(__file__).parent / "ensembl_mapping_dict_gc95M.pkl"
11
-
12
- from . import (
13
- collator_for_classification,
14
- emb_extractor,
15
- in_silico_perturber,
16
- in_silico_perturber_stats,
17
- pretrainer,
18
- tokenizer,
19
- )
20
- from .collator_for_classification import (
21
- DataCollatorForCellClassification,
22
- DataCollatorForGeneClassification,
23
- )
24
- from .emb_extractor import EmbExtractor, get_embs
25
- from .in_silico_perturber import InSilicoPerturber
26
- from .in_silico_perturber_stats import InSilicoPerturberStats
27
- from .pretrainer import GeneformerPretrainer
28
  from .tokenizer import TranscriptomeTokenizer
29
-
30
- from . import classifier # noqa # isort:skip
31
- from .classifier import Classifier # noqa # isort:skip
32
-
33
- from . import mtl_classifier # noqa # isort:skip
34
- from .mtl_classifier import MTLClassifier # noqa # isort:skip
 
1
+ from . import tokenizer
2
+ from . import pretrainer
3
+ from . import collator_for_classification
4
+ from . import in_silico_perturber
5
+ from . import in_silico_perturber_stats
6
  from .tokenizer import TranscriptomeTokenizer
7
+ from .pretrainer import GeneformerPretrainer
8
+ from .collator_for_classification import DataCollatorForGeneClassification
9
+ from .collator_for_classification import DataCollatorForCellClassification
10
+ from .emb_extractor import EmbExtractor
11
+ from .in_silico_perturber import InSilicoPerturber
12
+ from .in_silico_perturber_stats import InSilicoPerturberStats
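One practical difference between the two layouts above: the removed __init__.py exposed package-level dictionary paths (TOKEN_DICTIONARY_FILE and friends) plus the Classifier and MTLClassifier entry points, while the replacement exports only the core classes. A sketch of loading the token dictionary under the removed layout, mirroring the pickle-loading code in classifier.py below:

    import pickle

    from geneformer import TOKEN_DICTIONARY_FILE  # only importable under the removed layout

    with open(TOKEN_DICTIONARY_FILE, "rb") as f:
        gene_token_dict = pickle.load(f)  # Ensembl ID -> token
    token_gene_dict = {v: k for k, v in gene_token_dict.items()}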
geneformer/classifier.py DELETED
@@ -1,1563 +0,0 @@
1
- """
2
- Geneformer classifier.
3
-
4
- **Input data:**
5
-
6
- | Cell state classifier:
7
- | Single-cell transcriptomes as Geneformer rank value encodings with cell state labels in Geneformer .dataset format (generated from single-cell RNAseq data by tokenizer.py)
8
-
9
- | Gene classifier:
10
- | Dictionary in format {Gene_label: list(genes)} for gene labels and single-cell transcriptomes as Geneformer rank value encodings in Geneformer .dataset format (generated from single-cell RNAseq data by tokenizer.py)
11
-
12
- **Usage:**
13
-
14
- .. code-block :: python
15
-
16
- >>> from geneformer import Classifier
17
- >>> cc = Classifier(classifier="cell", # example of cell state classifier
18
- ... cell_state_dict={"state_key": "disease", "states": "all"},
19
- ... filter_data={"cell_type":["Cardiomyocyte1","Cardiomyocyte2","Cardiomyocyte3"]},
20
- ... training_args=training_args,
21
- ... freeze_layers = 2,
22
- ... num_crossval_splits = 1,
23
- ... forward_batch_size=200,
24
- ... nproc=16)
25
- >>> cc.prepare_data(input_data_file="path/to/input_data",
26
- ... output_directory="path/to/output_directory",
27
- ... output_prefix="output_prefix")
28
- >>> all_metrics = cc.validate(model_directory="path/to/model",
29
- ... prepared_input_data_file=f"path/to/output_directory/{output_prefix}_labeled.dataset",
30
- ... id_class_dict_file=f"path/to/output_directory/{output_prefix}_id_class_dict.pkl",
31
- ... output_directory="path/to/output_directory",
32
- ... output_prefix="output_prefix",
33
- ... predict_eval=True)
34
- >>> cc.plot_conf_mat(conf_mat_dict={"Geneformer": all_metrics["conf_matrix"]},
35
- ... output_directory="path/to/output_directory",
36
- ... output_prefix="output_prefix",
37
- ... custom_class_order=["healthy","disease1","disease2"])
38
- >>> cc.plot_predictions(predictions_file=f"path/to/output_directory/datestamp_geneformer_cellClassifier_{output_prefix}/ksplit1/predictions.pkl",
39
- ... id_class_dict_file=f"path/to/output_directory/{output_prefix}_id_class_dict.pkl",
40
- ... title="disease",
41
- ... output_directory="path/to/output_directory",
42
- ... output_prefix="output_prefix",
43
- ... custom_class_order=["healthy","disease1","disease2"])
44
- """
45
-
46
- import datetime
47
- import logging
48
- import os
49
- import pickle
50
- import subprocess
51
- from pathlib import Path
52
-
53
- import numpy as np
54
- import pandas as pd
55
- import seaborn as sns
56
- from tqdm.auto import tqdm, trange
57
- from transformers import Trainer
58
- from transformers.training_args import TrainingArguments
59
-
60
- from . import (
61
- TOKEN_DICTIONARY_FILE,
62
- DataCollatorForCellClassification,
63
- DataCollatorForGeneClassification,
64
- )
65
- from . import classifier_utils as cu
66
- from . import evaluation_utils as eu
67
- from . import perturber_utils as pu
68
-
69
- sns.set()
70
-
71
-
72
- logger = logging.getLogger(__name__)
73
-
74
-
75
- class Classifier:
76
- valid_option_dict = {
77
- "classifier": {"cell", "gene"},
78
- "quantize": {bool, dict},
79
- "cell_state_dict": {None, dict},
80
- "gene_class_dict": {None, dict},
81
- "filter_data": {None, dict},
82
- "rare_threshold": {int, float},
83
- "max_ncells": {None, int},
84
- "max_ncells_per_class": {None, int},
85
- "training_args": {None, dict},
86
- "freeze_layers": {int},
87
- "num_crossval_splits": {0, 1, 5},
88
- "split_sizes": {None, dict},
89
- "no_eval": {bool},
90
- "stratify_splits_col": {None, str},
91
- "forward_batch_size": {int},
92
- "token_dictionary_file": {None, str},
93
- "nproc": {int},
94
- "ngpu": {int},
95
- }
96
-
97
- def __init__(
98
- self,
99
- classifier=None,
100
- quantize=False,
101
- cell_state_dict=None,
102
- gene_class_dict=None,
103
- filter_data=None,
104
- rare_threshold=0,
105
- max_ncells=None,
106
- max_ncells_per_class=None,
107
- training_args=None,
108
- ray_config=None,
109
- freeze_layers=0,
110
- num_crossval_splits=1,
111
- split_sizes={"train": 0.8, "valid": 0.1, "test": 0.1},
112
- stratify_splits_col=None,
113
- no_eval=False,
114
- forward_batch_size=100,
115
- token_dictionary_file=None,
116
- nproc=4,
117
- ngpu=1,
118
- ):
119
- """
120
- Initialize Geneformer classifier.
121
-
122
- **Parameters:**
123
-
124
- classifier : {"cell", "gene"}
125
- | Whether to fine-tune a cell state or gene classifier.
126
- quantize : bool, dict
127
- | Whether to fine-tune a quantized model.
128
- | If True and no config provided, will use default.
129
- | Will use custom config if provided.
130
- | Configs should be provided as dictionary of BitsAndBytesConfig (transformers) and LoraConfig (peft).
131
- | For example: {"bnb_config": BitsAndBytesConfig(...),
132
- | "peft_config": LoraConfig(...)}
133
- cell_state_dict : None, dict
134
- | Cell states to fine-tune model to distinguish.
135
- | Two-item dictionary with keys: state_key and states
136
- | state_key: key specifying name of column in .dataset that defines the states to model
137
- | states: list of values in the state_key column that specifies the states to model
138
- | Alternatively, instead of a list of states, can specify "all" to use all states in that state key from input data.
139
- | Of note, if using "all", states will be defined after data is filtered.
140
- | Must have at least 2 states to model.
141
- | For example: {"state_key": "disease",
142
- | "states": ["nf", "hcm", "dcm"]}
143
- | or
144
- | {"state_key": "disease",
145
- | "states": "all"}
146
- gene_class_dict : None, dict
147
- | Gene classes to fine-tune model to distinguish.
148
- | Dictionary in format: {Gene_label_A: list(geneA1, geneA2, ...),
149
- | Gene_label_B: list(geneB1, geneB2, ...)}
150
- | Gene values should be Ensembl IDs.
151
- filter_data : None, dict
152
- | Default is to fine-tune with all input data.
153
- | Otherwise, dictionary specifying .dataset column name and list of values to filter by.
154
- rare_threshold : float
155
- | Threshold below which rare cell states should be removed.
156
- | For example, setting to 0.05 will remove cell states representing
157
- | < 5% of the total cells from the cell state classifier's possible classes.
158
- max_ncells : None, int
159
- | Maximum number of cells to use for fine-tuning.
160
- | Default is to fine-tune with all input data.
161
- max_ncells_per_class : None, int
162
- | Maximum number of cells per cell class to use for fine-tuning.
163
- | Of note, will be applied after max_ncells above.
164
- | (Only valid for cell classification.)
165
- training_args : None, dict
166
- | Training arguments for fine-tuning.
167
- | If None, defaults will be inferred for 6 layer Geneformer.
168
- | Otherwise, will use the Hugging Face defaults:
169
- | https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments
170
- | Note: Hyperparameter tuning is highly recommended, rather than using defaults.
171
- ray_config : None, dict
172
- | Training argument ranges for tuning hyperparameters with Ray.
173
- freeze_layers : int
174
- | Number of layers to freeze from fine-tuning.
175
- | 0: no layers will be frozen; 2: first two layers will be frozen; etc.
176
- num_crossval_splits : {0, 1, 5}
177
- | 0: train on all data without splitting
178
- | 1: split data into train and eval sets by designated split_sizes["valid"]
179
- | 5: split data into 5 folds of train and eval sets by designated split_sizes["valid"]
180
- split_sizes : None, dict
181
- | Dictionary of proportion of data to hold out for train, validation, and test sets
182
- | {"train": 0.8, "valid": 0.1, "test": 0.1} if intending 80/10/10 train/valid/test split
183
- stratify_splits_col : None, str
184
- | Name of column in .dataset to be used for stratified splitting.
185
- | Proportion of each class in this column will be the same in the splits as in the original dataset.
186
- no_eval : bool
187
- | If True, will skip eval step and use all data for training.
188
- | Otherwise, will perform eval during training.
189
- forward_batch_size : int
190
- | Batch size for forward pass (for evaluation, not training).
191
- token_dictionary_file : None, str
192
- | Default is to use token dictionary file from Geneformer
193
- | Otherwise, will load custom gene token dictionary.
194
- nproc : int
195
- | Number of CPU processes to use.
196
- ngpu : int
197
- | Number of GPUs available.
198
-
199
- """
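As an aside on the split arithmetic assigned a few lines below, a worked example with the default split_sizes (illustration only, not part of the original file):

    split_sizes = {"train": 0.8, "valid": 0.1, "test": 0.1}
    # after holding out the test fraction, eval_size is the share of the
    # remaining (train + valid) data used for validation
    eval_size = split_sizes["valid"] / (split_sizes["train"] + split_sizes["valid"])
    # 0.1 / 0.9 ≈ 0.111, i.e. ~11.1% of the non-test cells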
200
-
201
- self.classifier = classifier
202
- if self.classifier == "cell":
203
- self.model_type = "CellClassifier"
204
- elif self.classifier == "gene":
205
- self.model_type = "GeneClassifier"
206
- self.quantize = quantize
207
- self.cell_state_dict = cell_state_dict
208
- self.gene_class_dict = gene_class_dict
209
- self.filter_data = filter_data
210
- self.rare_threshold = rare_threshold
211
- self.max_ncells = max_ncells
212
- self.max_ncells_per_class = max_ncells_per_class
213
- self.training_args = training_args
214
- self.ray_config = ray_config
215
- self.freeze_layers = freeze_layers
216
- self.num_crossval_splits = num_crossval_splits
217
- self.split_sizes = split_sizes
218
- self.train_size = self.split_sizes["train"]
219
- self.valid_size = self.split_sizes["valid"]
220
- self.oos_test_size = self.split_sizes["test"]
221
- self.eval_size = self.valid_size / (self.train_size + self.valid_size)
222
- self.stratify_splits_col = stratify_splits_col
223
- self.no_eval = no_eval
224
- self.forward_batch_size = forward_batch_size
225
- self.token_dictionary_file = token_dictionary_file
226
- self.nproc = nproc
227
- self.ngpu = ngpu
228
-
229
- if self.training_args is None:
230
- logger.warning(
231
- "Hyperparameter tuning is highly recommended for optimal results. "
232
- "No training_args provided; using default hyperparameters."
233
- )
234
-
235
- self.validate_options()
236
-
237
- if self.filter_data is None:
238
- self.filter_data = dict()
239
-
240
- if self.classifier == "cell":
241
- if self.cell_state_dict["states"] != "all":
242
- self.filter_data[
243
- self.cell_state_dict["state_key"]
244
- ] = self.cell_state_dict["states"]
245
-
246
- # load token dictionary (Ensembl IDs:token)
247
- if self.token_dictionary_file is None:
248
- self.token_dictionary_file = TOKEN_DICTIONARY_FILE
249
- with open(self.token_dictionary_file, "rb") as f:
250
- self.gene_token_dict = pickle.load(f)
251
-
252
- self.token_gene_dict = {v: k for k, v in self.gene_token_dict.items()}
253
-
254
- # filter genes for gene classification for those in token dictionary
255
- if self.classifier == "gene":
256
- all_gene_class_values = set(pu.flatten_list(self.gene_class_dict.values()))
257
- missing_genes = [
258
- gene
259
- for gene in all_gene_class_values
260
- if gene not in self.gene_token_dict.keys()
261
- ]
262
- if len(missing_genes) == len(all_gene_class_values):
263
- logger.error(
264
- "None of the provided genes to classify are in token dictionary."
265
- )
266
- raise
267
- elif len(missing_genes) > 0:
268
- logger.warning(
269
- f"Genes to classify {missing_genes} are not in token dictionary."
270
- )
271
- self.gene_class_dict = {
272
- k: list(set([self.gene_token_dict.get(gene) for gene in v]))
273
- for k, v in self.gene_class_dict.items()
274
- }
275
- empty_classes = []
276
- for k, v in self.gene_class_dict.items():
277
- if len(v) == 0:
278
- empty_classes += [k]
279
- if len(empty_classes) > 0:
280
- logger.error(
281
- f"Class(es) {empty_classes} did not contain any genes in the token dictionary."
282
- )
283
- raise
284
-
285
- def validate_options(self):
286
- # confirm arguments are within valid options and compatible with each other
287
- for attr_name, valid_options in self.valid_option_dict.items():
288
- attr_value = self.__dict__[attr_name]
289
- if not isinstance(attr_value, (list, dict)):
290
- if attr_value in valid_options:
291
- continue
292
- valid_type = False
293
- for option in valid_options:
294
- if (option in [int, float, list, dict, bool, str]) and isinstance(
295
- attr_value, option
296
- ):
297
- valid_type = True
298
- break
299
- if valid_type:
300
- continue
301
- logger.error(
302
- f"Invalid option for {attr_name}. "
303
- f"Valid options for {attr_name}: {valid_options}"
304
- )
305
- raise
306
-
307
- if self.filter_data is not None:
308
- for key, value in self.filter_data.items():
309
- if not isinstance(value, list):
310
- self.filter_data[key] = [value]
311
- logger.warning(
312
- "Values in filter_data dict must be lists. "
313
- f"Changing {key} value to list ([{value}])."
314
- )
315
-
316
- if self.classifier == "cell":
317
- if set(self.cell_state_dict.keys()) != set(["state_key", "states"]):
318
- logger.error(
319
- "Invalid keys for cell_state_dict. "
320
- "The cell_state_dict should have only 2 keys: state_key and states"
321
- )
322
- raise
323
-
324
- if self.cell_state_dict["states"] != "all":
325
- if not isinstance(self.cell_state_dict["states"], list):
326
- logger.error(
327
- "States in cell_state_dict should be list of states to model."
328
- )
329
- raise
330
- if len(self.cell_state_dict["states"]) < 2:
331
- logger.error(
332
- "States in cell_state_dict should contain at least 2 states to classify."
333
- )
334
- raise
335
-
336
- if self.classifier == "gene":
337
- if len(self.gene_class_dict.keys()) < 2:
338
- logger.error(
339
- "Gene_class_dict should contain at least 2 gene classes to classify."
340
- )
341
- raise
342
- if sum(self.split_sizes.values()) != 1:
343
- logger.error("Train, validation, and test proportions should sum to 1.")
344
- raise
345
-
346
- def prepare_data(
347
- self,
348
- input_data_file,
349
- output_directory,
350
- output_prefix,
351
- split_id_dict=None,
352
- test_size=None,
353
- attr_to_split=None,
354
- attr_to_balance=None,
355
- max_trials=100,
356
- pval_threshold=0.1,
357
- ):
358
- """
359
- Prepare data for cell state or gene classification.
360
-
361
- **Parameters**
362
-
363
- input_data_file : Path
364
- | Path to directory containing .dataset input
365
- output_directory : Path
366
- | Path to directory where prepared data will be saved
367
- output_prefix : str
368
- | Prefix for output file
369
- split_id_dict : None, dict
370
- | Dictionary of IDs for train and test splits
371
- | Three-item dictionary with keys: attr_key, train, test
372
- | attr_key: key specifying name of column in .dataset that contains the IDs for the data splits
373
- | train: list of IDs in the attr_key column to include in the train split
374
- | test: list of IDs in the attr_key column to include in the test split
375
- | For example: {"attr_key": "individual",
376
- | "train": ["patient1", "patient2", "patient3", "patient4"],
377
- | "test": ["patient5", "patient6"]}
378
- test_size : None, float
379
- | Proportion of data to be saved separately and held out for test set
380
- | (e.g. 0.2 if intending hold out 20%)
381
- | If None, will inherit from split_sizes["test"] from Classifier
382
- | The training set will be further split to train / validation in self.validate
383
- | Note: only available for CellClassifiers
384
- attr_to_split : None, str
385
- | Key for attribute on which to split data while balancing potential confounders
386
- | e.g. "patient_id" for splitting by patient while balancing other characteristics
387
- | Note: only available for CellClassifiers
388
- attr_to_balance : None, list
389
- | List of attribute keys on which to balance data while splitting on attr_to_split
390
- | e.g. ["age", "sex"] for balancing these characteristics while splitting by patient
391
- | Note: only available for CellClassifiers
392
- max_trials : None, int
393
- | Maximum number of trials of random splitting to try to achieve balanced other attributes
394
- | If no split is found without significant (p<0.05) differences in other attributes, will select best
395
- | Note: only available for CellClassifiers
396
- pval_threshold : None, float
397
- | P-value threshold to use for attribute balancing across splits
398
- | E.g. if set to 0.1, will accept trial if p >= 0.1 for all attributes in attr_to_balance
399
- """
400
-
401
- if test_size is None:
402
- test_size = self.oos_test_size
403
-
404
- # prepare data and labels for classification
405
- data = pu.load_and_filter(self.filter_data, self.nproc, input_data_file)
406
-
407
- if self.classifier == "cell":
408
- if "label" in data.features:
409
- logger.error(
410
- "Column name 'label' must be reserved for class IDs. Please rename column."
411
- )
412
- raise
413
- elif self.classifier == "gene":
414
- if "labels" in data.features:
415
- logger.error(
416
- "Column name 'labels' must be reserved for class IDs. Please rename column."
417
- )
418
- raise
419
-
420
- if (attr_to_split is not None) and (attr_to_balance is None):
421
- logger.error(
422
- "Splitting by attribute while balancing confounders requires both attr_to_split and attr_to_balance to be defined."
423
- )
424
- raise
425
-
426
- if not isinstance(attr_to_balance, list):
427
- attr_to_balance = [attr_to_balance]
428
-
429
- if self.classifier == "cell":
430
- # remove cell states representing < rare_threshold of cells
431
- data = cu.remove_rare(
432
- data, self.rare_threshold, self.cell_state_dict["state_key"], self.nproc
433
- )
434
- # downsample max cells and max per class
435
- data = cu.downsample_and_shuffle(
436
- data, self.max_ncells, self.max_ncells_per_class, self.cell_state_dict
437
- )
438
- # rename cell state column to "label"
439
- data = cu.rename_cols(data, self.cell_state_dict["state_key"])
440
-
441
- # convert classes to numerical labels and save as id_class_dict
442
- # of note, will label all genes in gene_class_dict
443
- # if (cross-)validating, genes will be relabeled in column "labels" for each split
444
- # at the time of training with Classifier.validate
445
- data, id_class_dict = cu.label_classes(
446
- self.classifier, data, self.gene_class_dict, self.nproc
447
- )
448
-
449
- # save id_class_dict for future reference
450
- id_class_output_path = (
451
- Path(output_directory) / f"{output_prefix}_id_class_dict"
452
- ).with_suffix(".pkl")
453
- with open(id_class_output_path, "wb") as f:
454
- pickle.dump(id_class_dict, f)
455
-
456
- if split_id_dict is not None:
457
- data_dict = dict()
458
- data_dict["train"] = pu.filter_by_dict(
459
- data, {split_id_dict["attr_key"]: split_id_dict["train"]}, self.nproc
460
- )
461
- data_dict["test"] = pu.filter_by_dict(
462
- data, {split_id_dict["attr_key"]: split_id_dict["test"]}, self.nproc
463
- )
464
- train_data_output_path = (
465
- Path(output_directory) / f"{output_prefix}_labeled_train"
466
- ).with_suffix(".dataset")
467
- test_data_output_path = (
468
- Path(output_directory) / f"{output_prefix}_labeled_test"
469
- ).with_suffix(".dataset")
470
- data_dict["train"].save_to_disk(str(train_data_output_path))
471
- data_dict["test"].save_to_disk(str(test_data_output_path))
472
- elif (test_size is not None) and (self.classifier == "cell"):
473
- if 1 > test_size > 0:
474
- if attr_to_split is None:
475
- data_dict = data.train_test_split(
476
- test_size=test_size,
477
- stratify_by_column=self.stratify_splits_col,
478
- seed=42,
479
- )
480
- train_data_output_path = (
481
- Path(output_directory) / f"{output_prefix}_labeled_train"
482
- ).with_suffix(".dataset")
483
- test_data_output_path = (
484
- Path(output_directory) / f"{output_prefix}_labeled_test"
485
- ).with_suffix(".dataset")
486
- data_dict["train"].save_to_disk(str(train_data_output_path))
487
- data_dict["test"].save_to_disk(str(test_data_output_path))
488
- else:
489
- data_dict, balance_df = cu.balance_attr_splits(
490
- data,
491
- attr_to_split,
492
- attr_to_balance,
493
- test_size,
494
- max_trials,
495
- pval_threshold,
496
- self.cell_state_dict["state_key"],
497
- self.nproc,
498
- )
499
- balance_df.to_csv(
500
- f"{output_directory}/{output_prefix}_train_test_balance_df.csv"
501
- )
502
- train_data_output_path = (
503
- Path(output_directory) / f"{output_prefix}_labeled_train"
504
- ).with_suffix(".dataset")
505
- test_data_output_path = (
506
- Path(output_directory) / f"{output_prefix}_labeled_test"
507
- ).with_suffix(".dataset")
508
- data_dict["train"].save_to_disk(str(train_data_output_path))
509
- data_dict["test"].save_to_disk(str(test_data_output_path))
510
- else:
511
- data_output_path = (
512
- Path(output_directory) / f"{output_prefix}_labeled"
513
- ).with_suffix(".dataset")
514
- data.save_to_disk(str(data_output_path))
515
- print(data_output_path)
516
- else:
517
- data_output_path = (
518
- Path(output_directory) / f"{output_prefix}_labeled"
519
- ).with_suffix(".dataset")
520
- data.save_to_disk(str(data_output_path))
521
-
522
- def train_all_data(
523
- self,
524
- model_directory,
525
- prepared_input_data_file,
526
- id_class_dict_file,
527
- output_directory,
528
- output_prefix,
529
- save_eval_output=True,
530
- gene_balance=False,
531
- ):
532
- """
533
- Train cell state or gene classifier using all data.
534
-
535
- **Parameters**
536
-
537
- model_directory : Path
538
- | Path to directory containing model
539
- prepared_input_data_file : Path
540
- | Path to directory containing _labeled.dataset previously prepared by Classifier.prepare_data
541
- id_class_dict_file : Path
542
- | Path to _id_class_dict.pkl previously prepared by Classifier.prepare_data
543
- | (dictionary of format: numerical IDs: class_labels)
544
- output_directory : Path
545
- | Path to directory where model and eval data will be saved
546
- output_prefix : str
547
- | Prefix for output files
548
- save_eval_output : bool
549
- | Whether to save cross-fold eval output
550
- | Saves as pickle file of dictionary of eval metrics
551
- gene_balance : None, bool
552
- | Whether to automatically balance genes in training set.
553
- | Only available for binary gene classifications.
554
-
555
- **Output**
556
-
557
- Returns trainer after fine-tuning with all data.
558
-
559
- """
560
-
561
- if (gene_balance is True) and (len(self.gene_class_dict.values()) != 2):
562
- logger.error(
563
- "Automatically balancing gene sets for training is only available for binary gene classifications."
564
- )
565
- raise
566
-
567
- ##### Load data and prepare output directory #####
568
- # load numerical id to class dictionary (id:class)
569
- with open(id_class_dict_file, "rb") as f:
570
- id_class_dict = pickle.load(f)
571
- class_id_dict = {v: k for k, v in id_class_dict.items()}
572
-
573
- # load previously filtered and prepared data
574
- data = pu.load_and_filter(None, self.nproc, prepared_input_data_file)
575
- data = data.shuffle(seed=42) # reshuffle in case users provide unshuffled data
576
-
577
- # define output directory path
578
- current_date = datetime.datetime.now()
579
- datestamp = f"{str(current_date.year)[-2:]}{current_date.month:02d}{current_date.day:02d}"
580
- if output_directory[-1:] != "/": # add slash for dir if not present
581
- output_directory = output_directory + "/"
582
- output_dir = f"{output_directory}{datestamp}_geneformer_{self.classifier}Classifier_{output_prefix}/"
583
- subprocess.call(f"mkdir {output_dir}", shell=True)
584
-
585
- # get number of classes for classifier
586
- num_classes = cu.get_num_classes(id_class_dict)
587
-
588
- if self.classifier == "gene":
589
- targets = pu.flatten_list(self.gene_class_dict.values())
590
- labels = pu.flatten_list(
591
- [
592
- [class_id_dict[label]] * len(targets)
593
- for label, targets in self.gene_class_dict.items()
594
- ]
595
- )
596
- assert len(targets) == len(labels)
597
- data = cu.prep_gene_classifier_all_data(
598
- data, targets, labels, self.max_ncells, self.nproc, gene_balance
599
- )
600
-
601
- trainer = self.train_classifier(
602
- model_directory, num_classes, data, None, output_dir
603
- )
604
-
605
- return trainer
606
-
607
- def validate(
608
- self,
609
- model_directory,
610
- prepared_input_data_file,
611
- id_class_dict_file,
612
- output_directory,
613
- output_prefix,
614
- split_id_dict=None,
615
- attr_to_split=None,
616
- attr_to_balance=None,
617
- gene_balance=False,
618
- max_trials=100,
619
- pval_threshold=0.1,
620
- save_eval_output=True,
621
- predict_eval=True,
622
- predict_trainer=False,
623
- n_hyperopt_trials=0,
624
- save_gene_split_datasets=True,
625
- debug_gene_split_datasets=False,
626
- ):
627
- """
628
- (Cross-)validate cell state or gene classifier.
629
-
630
- **Parameters**
631
-
632
- model_directory : Path
633
- | Path to directory containing model
634
- prepared_input_data_file : Path
635
- | Path to directory containing _labeled.dataset previously prepared by Classifier.prepare_data
636
- id_class_dict_file : Path
637
- | Path to _id_class_dict.pkl previously prepared by Classifier.prepare_data
638
- | (dictionary of format: numerical IDs: class_labels)
639
- output_directory : Path
640
- | Path to directory where model and eval data will be saved
641
- output_prefix : str
642
- | Prefix for output files
643
- split_id_dict : None, dict
644
- | Dictionary of IDs for train and eval splits
645
- | Three-item dictionary with keys: attr_key, train, eval
646
- | attr_key: key specifying name of column in .dataset that contains the IDs for the data splits
647
- | train: list of IDs in the attr_key column to include in the train split
648
- | eval: list of IDs in the attr_key column to include in the eval split
649
- | For example: {"attr_key": "individual",
650
- | "train": ["patient1", "patient2", "patient3", "patient4"],
651
- | "eval": ["patient5", "patient6"]}
652
- | Note: only available for CellClassifiers with 1-fold split (self.classifier="cell"; self.num_crossval_splits=1)
653
- attr_to_split : None, str
654
- | Key for attribute on which to split data while balancing potential confounders
655
- | e.g. "patient_id" for splitting by patient while balancing other characteristics
656
- | Note: only available for CellClassifiers with 1-fold split (self.classifier="cell"; self.num_crossval_splits=1)
657
- attr_to_balance : None, list
658
- | List of attribute keys on which to balance data while splitting on attr_to_split
659
- | e.g. ["age", "sex"] for balancing these characteristics while splitting by patient
660
- gene_balance : None, bool
661
- | Whether to automatically balance genes in training set.
662
- | Only available for binary gene classifications.
663
- max_trials : None, int
664
- | Maximum number of trials of random splitting to try to achieve balanced other attributes
665
- | If no split is found without significant (p < pval_threshold) differences in other attributes, will select best
666
- pval_threshold : None, float
667
- | P-value threshold to use for attribute balancing across splits
668
- | E.g. if set to 0.1, will accept trial if p >= 0.1 for all attributes in attr_to_balance
669
- save_eval_output : bool
670
- | Whether to save cross-fold eval output
671
- | Saves as pickle file of dictionary of eval metrics
672
- predict_eval : bool
673
- | Whether or not to save eval predictions
674
- | Saves as a pickle file of self.evaluate predictions
675
- predict_trainer : bool
676
- | Whether or not to save eval predictions from trainer
677
- | Saves as a pickle file of trainer predictions
678
- n_hyperopt_trials : int
679
- | Number of trials to run for hyperparameter optimization
680
- | If 0, will not optimize hyperparameters
681
- save_gene_split_datasets : bool
682
- | Whether or not to save train, valid, and test gene-labeled datasets
683
- """
684
- if self.num_crossval_splits == 0:
685
- logger.error("num_crossval_splits must be 1 or 5 to validate.")
686
- raise
687
-
688
- if (gene_balance is True) and (len(self.gene_class_dict.values()) != 2):
689
- logger.error(
690
- "Automatically balancing gene sets for training is only available for binary gene classifications."
691
- )
692
- raise
693
-
694
- # ensure number of genes in each class is > 5 if validating model
695
- if self.classifier == "gene":
696
- insuff_classes = [k for k, v in self.gene_class_dict.items() if len(v) < 5]
697
- if (self.num_crossval_splits > 0) and (len(insuff_classes) > 0):
698
- logger.error(
699
- f"Insufficient # of members in class(es) {insuff_classes} to (cross-)validate."
700
- )
701
- raise
702
-
703
- ##### Load data and prepare output directory #####
704
- # load numerical id to class dictionary (id:class)
705
- with open(id_class_dict_file, "rb") as f:
706
- id_class_dict = pickle.load(f)
707
- class_id_dict = {v: k for k, v in id_class_dict.items()}
708
-
709
- # load previously filtered and prepared data
710
- data = pu.load_and_filter(None, self.nproc, prepared_input_data_file)
711
- data = data.shuffle(seed=42) # reshuffle in case users provide unshuffled data
712
-
713
- # define output directory path
714
- current_date = datetime.datetime.now()
715
- datestamp = f"{str(current_date.year)[-2:]}{current_date.month:02d}{current_date.day:02d}"
716
- if output_directory[-1:] != "/": # add slash for dir if not present
717
- output_directory = output_directory + "/"
718
- output_dir = f"{output_directory}{datestamp}_geneformer_{self.classifier}Classifier_{output_prefix}/"
719
- subprocess.call(f"mkdir {output_dir}", shell=True)
720
-
721
- # get number of classes for classifier
722
- num_classes = cu.get_num_classes(id_class_dict)
723
-
724
- ##### (Cross-)validate the model #####
725
- results = []
726
- all_conf_mat = np.zeros((num_classes, num_classes))
727
- iteration_num = 1
728
- if self.classifier == "cell":
729
- for i in trange(self.num_crossval_splits):
730
- print(
731
- f"****** Validation split: {iteration_num}/{self.num_crossval_splits} ******\n"
732
- )
733
- ksplit_output_dir = os.path.join(output_dir, f"ksplit{iteration_num}")
734
- if self.num_crossval_splits == 1:
735
- # single split with proportions (1 - eval_size) : eval_size
736
- if split_id_dict is not None:
737
- data_dict = dict()
738
- data_dict["train"] = pu.filter_by_dict(
739
- data,
740
- {split_id_dict["attr_key"]: split_id_dict["train"]},
741
- self.nproc,
742
- )
743
- data_dict["test"] = pu.filter_by_dict(
744
- data,
745
- {split_id_dict["attr_key"]: split_id_dict["eval"]},
746
- self.nproc,
747
- )
748
- elif attr_to_split is not None:
749
- data_dict, balance_df = cu.balance_attr_splits(
750
- data,
751
- attr_to_split,
752
- attr_to_balance,
753
- self.eval_size,
754
- max_trials,
755
- pval_threshold,
756
- self.cell_state_dict["state_key"],
757
- self.nproc,
758
- )
759
-
760
- balance_df.to_csv(
761
- f"{output_dir}/{output_prefix}_train_valid_balance_df.csv"
762
- )
763
- else:
764
- data_dict = data.train_test_split(
765
- test_size=self.eval_size,
766
- stratify_by_column=self.stratify_splits_col,
767
- seed=42,
768
- )
769
- train_data = data_dict["train"]
770
- eval_data = data_dict["test"]
771
- else:
772
- # 5-fold cross-validate
773
- num_cells = len(data)
774
- fifth_cells = int(np.floor(num_cells * 0.2))
775
- num_eval = int(min((self.eval_size * num_cells), fifth_cells))  # int so range() below accepts it
776
- start = i * fifth_cells
777
- end = start + num_eval
778
- eval_indices = [j for j in range(start, end)]
779
- train_indices = [
780
- j for j in range(num_cells) if j not in eval_indices
781
- ]
782
- eval_data = data.select(eval_indices)
783
- train_data = data.select(train_indices)
784
- if n_hyperopt_trials == 0:
785
- trainer = self.train_classifier(
786
- model_directory,
787
- num_classes,
788
- train_data,
789
- eval_data,
790
- ksplit_output_dir,
791
- predict_trainer,
792
- )
793
- else:
794
- trainer = self.hyperopt_classifier(
795
- model_directory,
796
- num_classes,
797
- train_data,
798
- eval_data,
799
- ksplit_output_dir,
800
- n_trials=n_hyperopt_trials,
801
- )
802
- if iteration_num == self.num_crossval_splits:
803
- return
804
- else:
805
- iteration_num = iteration_num + 1
806
- continue
807
-
808
- result = self.evaluate_model(
809
- trainer.model,
810
- num_classes,
811
- id_class_dict,
812
- eval_data,
813
- predict_eval,
814
- ksplit_output_dir,
815
- output_prefix,
816
- )
817
- results += [result]
818
- all_conf_mat = all_conf_mat + result["conf_mat"]
819
- iteration_num = iteration_num + 1
820
-
821
- elif self.classifier == "gene":
822
- # set up (cross-)validation splits
823
- targets = pu.flatten_list(self.gene_class_dict.values())
824
- labels = pu.flatten_list(
825
- [
826
- [class_id_dict[label]] * len(targets)
827
- for label, targets in self.gene_class_dict.items()
828
- ]
829
- )
830
- assert len(targets) == len(labels)
831
- n_splits = int(1 / (1 - self.train_size))
832
- skf = cu.StratifiedKFold3(n_splits=n_splits, random_state=0, shuffle=True)
833
- # (Cross-)validate
834
- test_ratio = self.oos_test_size / (self.eval_size + self.oos_test_size)
835
- for train_index, eval_index, test_index in tqdm(
836
- skf.split(targets, labels, test_ratio)
837
- ):
838
- print(
839
- f"****** Validation split: {iteration_num}/{self.num_crossval_splits} ******\n"
840
- )
841
- ksplit_output_dir = os.path.join(output_dir, f"ksplit{iteration_num}")
842
- # filter data for examples containing classes for this split
843
- # subsample to max_ncells and relabel data in column "labels"
844
- train_data, eval_data = cu.prep_gene_classifier_train_eval_split(
845
- data,
846
- targets,
847
- labels,
848
- train_index,
849
- eval_index,
850
- self.max_ncells,
851
- iteration_num,
852
- self.nproc,
853
- gene_balance,
854
- )
855
-
856
- if save_gene_split_datasets is True:
857
- for split_name in ["train", "valid"]:
858
- labeled_dataset_output_path = (
859
- Path(output_dir)
860
- / f"{output_prefix}_{split_name}_gene_labeled_ksplit{iteration_num}"
861
- ).with_suffix(".dataset")
862
- if split_name == "train":
863
- train_data.save_to_disk(str(labeled_dataset_output_path))
864
- elif split_name == "valid":
865
- eval_data.save_to_disk(str(labeled_dataset_output_path))
866
-
867
- if self.oos_test_size > 0:
868
- test_data = cu.prep_gene_classifier_split(
869
- data,
870
- targets,
871
- labels,
872
- test_index,
873
- "test",
874
- self.max_ncells,
875
- iteration_num,
876
- self.nproc,
877
- )
878
- if save_gene_split_datasets is True:
879
- test_labeled_dataset_output_path = (
880
- Path(output_dir)
881
- / f"{output_prefix}_test_gene_labeled_ksplit{iteration_num}"
882
- ).with_suffix(".dataset")
883
- test_data.save_to_disk(str(test_labeled_dataset_output_path))
884
- if debug_gene_split_datasets is True:
885
- logger.error(
886
- "Exiting after saving gene split datasets given debug_gene_split_datasets = True."
887
- )
888
- raise
889
- if n_hyperopt_trials == 0:
890
- trainer = self.train_classifier(
891
- model_directory,
892
- num_classes,
893
- train_data,
894
- eval_data,
895
- ksplit_output_dir,
896
- predict_trainer,
897
- )
898
- result = self.evaluate_model(
899
- trainer.model,
900
- num_classes,
901
- id_class_dict,
902
- eval_data,
903
- predict_eval,
904
- ksplit_output_dir,
905
- output_prefix,
906
- )
907
- else:
908
- trainer = self.hyperopt_classifier(
909
- model_directory,
910
- num_classes,
911
- train_data,
912
- eval_data,
913
- ksplit_output_dir,
914
- n_trials=n_hyperopt_trials,
915
- )
916
-
917
- model = cu.load_best_model(
918
- ksplit_output_dir, self.model_type, num_classes
919
- )
920
-
921
- if self.oos_test_size > 0:
922
- result = self.evaluate_model(
923
- model,
924
- num_classes,
925
- id_class_dict,
926
- test_data,
927
- predict_eval,
928
- ksplit_output_dir,
929
- output_prefix,
930
- )
931
- else:
932
- if iteration_num == self.num_crossval_splits:
933
- return
934
- else:
935
- iteration_num = iteration_num + 1
936
- continue
937
- results += [result]
938
- all_conf_mat = all_conf_mat + result["conf_mat"]
939
- # break after 1 or 5 splits, each with train/eval proportions dictated by eval_size
940
- if iteration_num == self.num_crossval_splits:
941
- break
942
- iteration_num = iteration_num + 1
943
-
944
- all_conf_mat_df = pd.DataFrame(
945
- all_conf_mat, columns=id_class_dict.values(), index=id_class_dict.values()
946
- )
947
- all_metrics = {
948
- "conf_matrix": all_conf_mat_df,
949
- "macro_f1": [result["macro_f1"] for result in results],
950
- "acc": [result["acc"] for result in results],
951
- }
952
- all_roc_metrics = None # roc metrics not reported for multiclass
953
- if num_classes == 2:
954
- mean_fpr = np.linspace(0, 1, 100)
955
- all_tpr = [result["roc_metrics"]["interp_tpr"] for result in results]
956
- all_roc_auc = [result["roc_metrics"]["auc"] for result in results]
957
- all_tpr_wt = [result["roc_metrics"]["tpr_wt"] for result in results]
958
- mean_tpr, roc_auc, roc_auc_sd = eu.get_cross_valid_roc_metrics(
959
- all_tpr, all_roc_auc, all_tpr_wt
960
- )
961
- all_roc_metrics = {
962
- "mean_tpr": mean_tpr,
963
- "mean_fpr": mean_fpr,
964
- "all_roc_auc": all_roc_auc,
965
- "roc_auc": roc_auc,
966
- "roc_auc_sd": roc_auc_sd,
967
- }
968
- all_metrics["all_roc_metrics"] = all_roc_metrics
969
- if save_eval_output is True:
970
- eval_metrics_output_path = (
971
- Path(output_dir) / f"{output_prefix}_eval_metrics_dict"
972
- ).with_suffix(".pkl")
973
- with open(eval_metrics_output_path, "wb") as f:
974
- pickle.dump(all_metrics, f)
975
-
976
- return all_metrics
977
-
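For orientation, a minimal usage sketch of the validate() method above, assuming a Classifier instance cc configured with classifier="cell" and num_crossval_splits=1; all paths below are hypothetical placeholders.

    split_id_dict = {
        "attr_key": "individual",
        "train": ["patient1", "patient2", "patient3", "patient4"],
        "eval": ["patient5", "patient6"],
    }
    all_metrics = cc.validate(
        model_directory="/path/to/pretrained_model",
        prepared_input_data_file="/path/to/example_labeled.dataset",
        id_class_dict_file="/path/to/example_id_class_dict.pkl",
        output_directory="/path/to/output",
        output_prefix="example",
        split_id_dict=split_id_dict,  # only valid with num_crossval_splits=1
    )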
978
- def hyperopt_classifier(
979
- self,
980
- model_directory,
981
- num_classes,
982
- train_data,
983
- eval_data,
984
- output_directory,
985
- n_trials=100,
986
- ):
987
- """
988
- Optimize hyperparameters for fine-tuning a cell state or gene classification model.
989
-
990
- **Parameters**
991
-
992
- model_directory : Path
993
- | Path to directory containing model
994
- num_classes : int
995
- | Number of classes for classifier
996
- train_data : Dataset
997
- | Loaded training .dataset input
998
- | For cell classifier, labels in column "label".
999
- | For gene classifier, labels in column "labels".
1000
- eval_data : None, Dataset
1001
- | (Optional) Loaded evaluation .dataset input
1002
- | For cell classifier, labels in column "label".
1003
- | For gene classifier, labels in column "labels".
1004
- output_directory : Path
1005
- | Path to directory where fine-tuned model will be saved
1006
- n_trials : int
1007
- | Number of trials to run for hyperparameter optimization
1008
- """
1009
-
1010
- # initiate runtime environment for raytune
1011
- import ray
1012
- from ray import tune
1013
- from ray.tune.search.hyperopt import HyperOptSearch
1014
-
1015
- ray.shutdown() # engage new ray session
1016
- ray.init()
1017
-
1018
- ##### Validate and prepare data #####
1019
- train_data, eval_data = cu.validate_and_clean_cols(
1020
- train_data, eval_data, self.classifier
1021
- )
1022
-
1023
- if (self.no_eval is True) and (eval_data is not None):
1024
- logger.warning(
1025
- "no_eval is set to True, but hyperparameter optimization requires an eval set; proceeding with evaluation."
1026
- )
1027
-
1028
- # ensure not overwriting previously saved model
1029
- saved_model_test = os.path.join(output_directory, "pytorch_model.bin")
1030
- if os.path.isfile(saved_model_test) is True:
1031
- logger.error("Model already saved to this designated output directory.")
1032
- raise
1033
- # make output directory
1034
- subprocess.call(f"mkdir {output_directory}", shell=True)
1035
-
1036
- ##### Load model and training args #####
1037
- model = pu.load_model(
1038
- self.model_type,
1039
- num_classes,
1040
- model_directory,
1041
- "train",
1042
- quantize=self.quantize,
1043
- )
1044
- def_training_args, def_freeze_layers = cu.get_default_train_args(
1045
- model, self.classifier, train_data, output_directory
1046
- )
1047
- del model
1048
-
1049
- if self.training_args is not None:
1050
- def_training_args.update(self.training_args)
1051
- logging_steps = round(
1052
- len(train_data) / def_training_args["per_device_train_batch_size"] / 10
1053
- )
1054
- def_training_args["logging_steps"] = logging_steps
1055
- def_training_args["output_dir"] = output_directory
1056
- if eval_data is None:
1057
- def_training_args["evaluation_strategy"] = "no"
1058
- def_training_args["load_best_model_at_end"] = False
1059
- def_training_args.update(
1060
- {"save_strategy": "epoch", "save_total_limit": 1}
1061
- ) # only save last model for each run
1062
- training_args_init = TrainingArguments(**def_training_args)
1063
-
1064
- ##### Fine-tune the model #####
1065
- # define the data collator
1066
- if self.classifier == "cell":
1067
- data_collator = DataCollatorForCellClassification(
1068
- token_dictionary=self.gene_token_dict
1069
- )
1070
- elif self.classifier == "gene":
1071
- data_collator = DataCollatorForGeneClassification(
1072
- token_dictionary=self.gene_token_dict
1073
- )
1074
-
1075
- # define function to initiate model
1076
- def model_init():
1077
- model = pu.load_model(
1078
- self.model_type,
1079
- num_classes,
1080
- model_directory,
1081
- "train",
1082
- quantize=self.quantize,
1083
- )
1084
-
1085
- # bind a local name first: assigning def_freeze_layers inside model_init
- # would otherwise make it local and raise UnboundLocalError when
- # self.freeze_layers is None
- freeze_layers = def_freeze_layers
- if self.freeze_layers is not None:
- freeze_layers = self.freeze_layers
-
- if freeze_layers > 0:
- modules_to_freeze = model.bert.encoder.layer[:freeze_layers]
1090
- for module in modules_to_freeze:
1091
- for param in module.parameters():
1092
- param.requires_grad = False
1093
-
1094
- if self.quantize is False:
1095
- model = model.to("cuda:0")
1096
- return model
1097
-
1098
- # create the trainer
1099
- trainer = Trainer(
1100
- model_init=model_init,
1101
- args=training_args_init,
1102
- data_collator=data_collator,
1103
- train_dataset=train_data,
1104
- eval_dataset=eval_data,
1105
- compute_metrics=cu.compute_metrics,
1106
- )
1107
-
1108
- # specify raytune hyperparameter search space
1109
- if self.ray_config is None:
1110
- logger.warning(
1111
- "No ray_config provided. Proceeding with default, but ranges may need adjustment depending on model."
1112
- )
1113
- def_ray_config = {
1114
- "num_train_epochs": tune.choice([1]),
1115
- "learning_rate": tune.loguniform(1e-6, 1e-3),
1116
- "weight_decay": tune.uniform(0.0, 0.3),
1117
- "lr_scheduler_type": tune.choice(["linear", "cosine", "polynomial"]),
1118
- "warmup_steps": tune.uniform(100, 2000),
1119
- "seed": tune.uniform(0, 100),
1120
- "per_device_train_batch_size": tune.choice(
1121
- [def_training_args["per_device_train_batch_size"]]
1122
- ),
1123
- }
1124
-
1125
- hyperopt_search = HyperOptSearch(metric="eval_macro_f1", mode="max")
1126
-
1127
- # optimize hyperparameters
1128
- trainer.hyperparameter_search(
1129
- direction="maximize",
1130
- backend="ray",
1131
- resources_per_trial={"cpu": int(self.nproc / self.ngpu), "gpu": 1},
1132
- hp_space=lambda _: def_ray_config
1133
- if self.ray_config is None
1134
- else self.ray_config,
1135
- search_alg=hyperopt_search,
1136
- n_trials=n_trials, # number of trials
1137
- progress_reporter=tune.CLIReporter(
1138
- max_report_frequency=600,
1139
- sort_by_metric=True,
1140
- max_progress_rows=n_trials,
1141
- mode="max",
1142
- metric="eval_macro_f1",
1143
- metric_columns=["loss", "eval_loss", "eval_accuracy", "eval_macro_f1"],
1144
- ),
1145
- storage_path=output_directory,
1146
- )
1147
-
1148
- return trainer
1149
-
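A sketch of a user-supplied ray_config mirroring the default search space above; this would be passed at Classifier construction (a hedged example, assuming ray[tune] and hyperopt are installed).

    from ray import tune

    ray_config = {
        "num_train_epochs": tune.choice([1]),
        "learning_rate": tune.loguniform(1e-6, 1e-3),
        "weight_decay": tune.uniform(0.0, 0.3),
        "lr_scheduler_type": tune.choice(["linear", "cosine", "polynomial"]),
        "warmup_steps": tune.uniform(100, 2000),
        "seed": tune.uniform(0, 100),
        "per_device_train_batch_size": tune.choice([12]),  # default batch size
    }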
1150
- def train_classifier(
1151
- self,
1152
- model_directory,
1153
- num_classes,
1154
- train_data,
1155
- eval_data,
1156
- output_directory,
1157
- predict=False,
1158
- ):
1159
- """
1160
- Fine-tune model for cell state or gene classification.
1161
-
1162
- **Parameters**
1163
-
1164
- model_directory : Path
1165
- | Path to directory containing model
1166
- num_classes : int
1167
- | Number of classes for classifier
1168
- train_data : Dataset
1169
- | Loaded training .dataset input
1170
- | For cell classifier, labels in column "label".
1171
- | For gene classifier, labels in column "labels".
1172
- eval_data : None, Dataset
1173
- | (Optional) Loaded evaluation .dataset input
1174
- | For cell classifier, labels in column "label".
1175
- | For gene classifier, labels in column "labels".
1176
- output_directory : Path
1177
- | Path to directory where fine-tuned model will be saved
1178
- predict : bool
1179
- | Whether or not to save eval predictions from trainer
1180
- """
1181
-
1182
- ##### Validate and prepare data #####
1183
- train_data, eval_data = cu.validate_and_clean_cols(
1184
- train_data, eval_data, self.classifier
1185
- )
1186
-
1187
- if (self.no_eval is True) and (eval_data is not None):
1188
- logger.warning(
1189
- "no_eval set to True; model will be trained without evaluation."
1190
- )
1191
- eval_data = None
1192
-
1193
- if (self.classifier == "gene") and (predict is True):
1194
- logger.warning(
1195
- "Predictions during training not currently available for gene classifiers; setting predict to False."
1196
- )
1197
- predict = False
1198
-
1199
- # ensure not overwriting previously saved model
1200
- saved_model_test = os.path.join(output_directory, "pytorch_model.bin")
1201
- if os.path.isfile(saved_model_test) is True:
1202
- logger.error("Model already saved to this designated output directory.")
1203
- raise
1204
- # make output directory
1205
- subprocess.call(f"mkdir {output_directory}", shell=True)
1206
-
1207
- ##### Load model and training args #####
1208
- model = pu.load_model(
1209
- self.model_type,
1210
- num_classes,
1211
- model_directory,
1212
- "train",
1213
- quantize=self.quantize,
1214
- )
1215
-
1216
- def_training_args, def_freeze_layers = cu.get_default_train_args(
1217
- model, self.classifier, train_data, output_directory
1218
- )
1219
-
1220
- if self.training_args is not None:
1221
- def_training_args.update(self.training_args)
1222
- logging_steps = round(
1223
- len(train_data) / def_training_args["per_device_train_batch_size"] / 10
1224
- )
1225
- def_training_args["logging_steps"] = logging_steps
1226
- def_training_args["output_dir"] = output_directory
1227
- if eval_data is None:
1228
- def_training_args["evaluation_strategy"] = "no"
1229
- def_training_args["load_best_model_at_end"] = False
1230
- training_args_init = TrainingArguments(**def_training_args)
1231
-
1232
- if self.freeze_layers is not None:
1233
- def_freeze_layers = self.freeze_layers
1234
-
1235
- if def_freeze_layers > 0:
1236
- modules_to_freeze = model.bert.encoder.layer[:def_freeze_layers]
1237
- for module in modules_to_freeze:
1238
- for param in module.parameters():
1239
- param.requires_grad = False
1240
-
1241
- ##### Fine-tune the model #####
1242
- # define the data collator
1243
- if self.classifier == "cell":
1244
- data_collator = DataCollatorForCellClassification(
1245
- token_dictionary=self.gene_token_dict
1246
- )
1247
- elif self.classifier == "gene":
1248
- data_collator = DataCollatorForGeneClassification(
1249
- token_dictionary=self.gene_token_dict
1250
- )
1251
-
1252
- # create the trainer
1253
- trainer = Trainer(
1254
- model=model,
1255
- args=training_args_init,
1256
- data_collator=data_collator,
1257
- train_dataset=train_data,
1258
- eval_dataset=eval_data,
1259
- compute_metrics=cu.compute_metrics,
1260
- )
1261
-
1262
- # train the classifier
1263
- trainer.train()
1264
- trainer.save_model(output_directory)
1265
- if predict is True:
1266
- # make eval predictions and save predictions and metrics
1267
- predictions = trainer.predict(eval_data)
1268
- prediction_output_path = f"{output_directory}/predictions.pkl"
1269
- with open(prediction_output_path, "wb") as f:
1270
- pickle.dump(predictions, f)
1271
- trainer.save_metrics("eval", predictions.metrics)
1272
- return trainer
1273
-
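Custom training arguments supplied at Classifier construction override the defaults via def_training_args.update(self.training_args) above; a hedged sketch, with keys following Hugging Face TrainingArguments and placeholder (untuned) values mirroring the defaults:

    training_args = {
        "num_train_epochs": 1,
        "learning_rate": 5e-5,
        "lr_scheduler_type": "linear",
        "warmup_steps": 500,
        "weight_decay": 0.001,
        "per_device_train_batch_size": 12,
        "seed": 42,
    }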
1274
- def evaluate_model(
1275
- self,
1276
- model,
1277
- num_classes,
1278
- id_class_dict,
1279
- eval_data,
1280
- predict=False,
1281
- output_directory=None,
1282
- output_prefix=None,
1283
- ):
1284
- """
1285
- Evaluate the fine-tuned model.
1286
-
1287
- **Parameters**
1288
-
1289
- model : nn.Module
1290
- | Loaded fine-tuned model (e.g. trainer.model)
1291
- num_classes : int
1292
- | Number of classes for classifier
1293
- id_class_dict : dict
1294
- | Loaded _id_class_dict.pkl previously prepared by Classifier.prepare_data
1295
- | (dictionary of format: numerical IDs: class_labels)
1296
- eval_data : Dataset
1297
- | Loaded evaluation .dataset input
1298
- predict : bool
1299
- | Whether or not to save eval predictions
1300
- output_directory : Path
1301
- | Path to directory where eval data will be saved
1302
- output_prefix : str
1303
- | Prefix for output files
1304
- """
1305
-
1306
- ##### Evaluate the model #####
1307
- labels = id_class_dict.keys()
1308
- y_pred, y_true, logits_list = eu.classifier_predict(
1309
- model, self.classifier, eval_data, self.forward_batch_size
1310
- )
1311
- conf_mat, macro_f1, acc, roc_metrics = eu.get_metrics(
1312
- y_pred, y_true, logits_list, num_classes, labels
1313
- )
1314
- if predict is True:
1315
- pred_dict = {
1316
- "pred_ids": y_pred,
1317
- "label_ids": y_true,
1318
- "predictions": logits_list,
1319
- }
1320
- pred_dict_output_path = (
1321
- Path(output_directory) / f"{output_prefix}_pred_dict"
1322
- ).with_suffix(".pkl")
1323
- with open(pred_dict_output_path, "wb") as f:
1324
- pickle.dump(pred_dict, f)
1325
- return {
1326
- "conf_mat": conf_mat,
1327
- "macro_f1": macro_f1,
1328
- "acc": acc,
1329
- "roc_metrics": roc_metrics,
1330
- }
1331
-
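The pred_dict pickle written above (when predict=True) can be reloaded as a plain dictionary; a minimal sketch with a hypothetical path:

    import pickle

    with open("/path/to/output/example_pred_dict.pkl", "rb") as f:
        pred_dict = pickle.load(f)
    y_pred = pred_dict["pred_ids"]      # predicted class IDs
    y_true = pred_dict["label_ids"]     # true class IDs
    logits = pred_dict["predictions"]   # per-class logits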
1332
- def evaluate_saved_model(
1333
- self,
1334
- model_directory,
1335
- id_class_dict_file,
1336
- test_data_file,
1337
- output_directory,
1338
- output_prefix,
1339
- predict=True,
1340
- ):
1341
- """
1342
- Evaluate the fine-tuned model.
1343
-
1344
- **Parameters**
1345
-
1346
- model_directory : Path
1347
- | Path to directory containing model
1348
- id_class_dict_file : Path
1349
- | Path to _id_class_dict.pkl previously prepared by Classifier.prepare_data
1350
- | (dictionary of format: numerical IDs: class_labels)
1351
- test_data_file : Path
1352
- | Path to directory containing test .dataset
1353
- output_directory : Path
1354
- | Path to directory where eval data will be saved
1355
- output_prefix : str
1356
- | Prefix for output files
1357
- predict : bool
1358
- | Whether or not to save eval predictions
1359
- """
1360
-
1361
- # load numerical id to class dictionary (id:class)
1362
- with open(id_class_dict_file, "rb") as f:
1363
- id_class_dict = pickle.load(f)
1364
-
1365
- # get number of classes for classifier
1366
- num_classes = cu.get_num_classes(id_class_dict)
1367
-
1368
- # load previously filtered and prepared data
1369
- test_data = pu.load_and_filter(None, self.nproc, test_data_file)
1370
-
1371
- # load previously fine-tuned model
1372
- model = pu.load_model(
1373
- self.model_type,
1374
- num_classes,
1375
- model_directory,
1376
- "eval",
1377
- quantize=self.quantize,
1378
- )
1379
-
1380
- # evaluate the model
1381
- result = self.evaluate_model(
1382
- model,
1383
- num_classes,
1384
- id_class_dict,
1385
- test_data,
1386
- predict=predict,
1387
- output_directory=output_directory,
1388
- output_prefix=output_prefix,
1389
- )
1390
-
1391
- all_conf_mat_df = pd.DataFrame(
1392
- result["conf_mat"],
1393
- columns=id_class_dict.values(),
1394
- index=id_class_dict.values(),
1395
- )
1396
- all_metrics = {
1397
- "conf_matrix": all_conf_mat_df,
1398
- "macro_f1": result["macro_f1"],
1399
- "acc": result["acc"],
1400
- }
1401
- all_roc_metrics = None # roc metrics not reported for multiclass
1402
-
1403
- if num_classes == 2:
1404
- mean_fpr = np.linspace(0, 1, 100)
1405
- mean_tpr = result["roc_metrics"]["interp_tpr"]
1406
- all_roc_auc = result["roc_metrics"]["auc"]
1407
- all_roc_metrics = {
1408
- "mean_tpr": mean_tpr,
1409
- "mean_fpr": mean_fpr,
1410
- "all_roc_auc": all_roc_auc,
1411
- }
1412
- all_metrics["all_roc_metrics"] = all_roc_metrics
1413
- test_metrics_output_path = (
1414
- Path(output_directory) / f"{output_prefix}_test_metrics_dict"
1415
- ).with_suffix(".pkl")
1416
- with open(test_metrics_output_path, "wb") as f:
1417
- pickle.dump(all_metrics, f)
1418
-
1419
- return all_metrics
1420
-
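A corresponding usage sketch for evaluate_saved_model (paths hypothetical; the id_class_dict pickle and test .dataset come from an earlier Classifier.prepare_data run):

    all_metrics = cc.evaluate_saved_model(
        model_directory="/path/to/fine_tuned_model",
        id_class_dict_file="/path/to/example_id_class_dict.pkl",
        test_data_file="/path/to/example_labeled_test.dataset",
        output_directory="/path/to/output",
        output_prefix="example",
    )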
1421
- def plot_conf_mat(
1422
- self,
1423
- conf_mat_dict,
1424
- output_directory,
1425
- output_prefix,
1426
- custom_class_order=None,
1427
- ):
1428
- """
1429
- Plot confusion matrix results of evaluating the fine-tuned model.
1430
-
1431
- **Parameters**
1432
-
1433
- conf_mat_dict : dict
1434
- | Dictionary of model_name : confusion_matrix_DataFrame
1435
- | (all_metrics["conf_matrix"] from self.validate)
1436
- output_directory : Path
1437
- | Path to directory where plots will be saved
1438
- output_prefix : str
1439
- | Prefix for output file
1440
- custom_class_order : None, list
1441
- | List of classes in custom order for plots.
1442
- | Same order will be used for all models.
1443
- """
1444
-
1445
- for model_name in conf_mat_dict.keys():
1446
- eu.plot_confusion_matrix(
1447
- conf_mat_dict[model_name],
1448
- model_name,
1449
- output_directory,
1450
- output_prefix,
1451
- custom_class_order,
1452
- )
1453
-
1454
- def plot_roc(
1455
- self,
1456
- roc_metric_dict,
1457
- model_style_dict,
1458
- title,
1459
- output_directory,
1460
- output_prefix,
1461
- ):
1462
- """
1463
- Plot ROC curve results of evaluating the fine-tuned model.
1464
-
1465
- **Parameters**
1466
-
1467
- roc_metric_dict : dict
1468
- | Dictionary of model_name : roc_metrics
1469
- | (all_metrics["all_roc_metrics"] from self.validate)
1470
- model_style_dict : dict[dict]
1471
- | Dictionary of model_name : dictionary of style_attribute : style
1472
- | where style includes color and linestyle
1473
- | e.g. {'Model_A': {'color': 'black', 'linestyle': '-'}, 'Model_B': ...}
1474
- title : str
1475
- | Title of plot (e.g. 'Dosage-sensitive vs -insensitive factors')
1476
- output_directory : Path
1477
- | Path to directory where plots will be saved
1478
- output_prefix : str
1479
- | Prefix for output file
1480
- """
1481
-
1482
- eu.plot_ROC(
1483
- roc_metric_dict, model_style_dict, title, output_directory, output_prefix
1484
- )
1485
-
1486
- def plot_predictions(
1487
- self,
1488
- predictions_file,
1489
- id_class_dict_file,
1490
- title,
1491
- output_directory,
1492
- output_prefix,
1493
- custom_class_order=None,
1494
- kwargs_dict=None,
1495
- ):
1496
- """
1497
- Plot prediction results of evaluating the fine-tuned model.
1498
-
1499
- **Parameters**
1500
-
1501
- predictions_file : path
1502
- | Path of model predictions output to plot
1503
- | (saved output from self.validate if predict_eval=True)
1504
- | (or saved output from self.evaluate_saved_model)
1505
- id_class_dict_file : Path
1506
- | Path to _id_class_dict.pkl previously prepared by Classifier.prepare_data
1507
- | (dictionary of format: numerical IDs: class_labels)
1508
- title : str
1509
- | Title for legend containing class labels.
1510
- output_directory : Path
1511
- | Path to directory where plots will be saved
1512
- output_prefix : str
1513
- | Prefix for output file
1514
- custom_class_order : None, list
1515
- | List of classes in custom order for plots.
1516
- | Same order will be used for all models.
1517
- kwargs_dict : None, dict
1518
- | Dictionary of kwargs to pass to plotting function.
1519
- """
1520
- # load predictions
1521
- with open(predictions_file, "rb") as f:
1522
- predictions = pickle.load(f)
1523
-
1524
- # load numerical id to class dictionary (id:class)
1525
- with open(id_class_dict_file, "rb") as f:
1526
- id_class_dict = pickle.load(f)
1527
-
1528
- if isinstance(predictions, dict):
1529
- if all(
1530
- [
1531
- key in predictions.keys()
1532
- for key in ["pred_ids", "label_ids", "predictions"]
1533
- ]
1534
- ):
1535
- # format is output from self.evaluate_saved_model
1536
- predictions_logits = np.array(predictions["predictions"])
1537
- true_ids = predictions["label_ids"]
1538
- else:
1539
- # format is output from self.validate if predict_eval=True
1540
- predictions_logits = predictions.predictions
1541
- true_ids = predictions.label_ids
1542
-
1543
- num_classes = len(id_class_dict.keys())
1544
- num_predict_classes = predictions_logits.shape[1]
1545
- assert num_classes == num_predict_classes
1546
- classes = id_class_dict.values()
1547
- true_labels = [id_class_dict[idx] for idx in true_ids]
1548
- predictions_df = pd.DataFrame(predictions_logits, columns=classes)
1549
- if custom_class_order is not None:
1550
- predictions_df = predictions_df.reindex(columns=custom_class_order)
1551
- predictions_df["true"] = true_labels
1552
- custom_dict = dict(zip(classes, [i for i in range(len(classes))]))
1553
- if custom_class_order is not None:
1554
- custom_dict = dict(
1555
- zip(custom_class_order, [i for i in range(len(custom_class_order))])
1556
- )
1557
- predictions_df = predictions_df.sort_values(
1558
- by=["true"], key=lambda x: x.map(custom_dict)
1559
- )
1560
-
1561
- eu.plot_predictions(
1562
- predictions_df, title, output_directory, output_prefix, kwargs_dict
1563
- )
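Tying the plotting helpers above to saved validate() output; a hedged sketch assuming the eval metrics pickle written by validate with save_eval_output=True (paths hypothetical):

    import pickle

    with open("/path/to/output/example_eval_metrics_dict.pkl", "rb") as f:
        all_metrics = pickle.load(f)
    cc.plot_conf_mat(
        conf_mat_dict={"Geneformer": all_metrics["conf_matrix"]},
        output_directory="/path/to/output",
        output_prefix="example",
    )
    # ROC plotting applies only to binary classifiers
    # (all_roc_metrics is None for multiclass)
    cc.plot_roc(
        roc_metric_dict={"Geneformer": all_metrics["all_roc_metrics"]},
        model_style_dict={"Geneformer": {"color": "red", "linestyle": "-"}},
        title="Dosage-sensitive vs -insensitive factors",
        output_directory="/path/to/output",
        output_prefix="example",
    )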
 
 
geneformer/classifier_utils.py DELETED
@@ -1,648 +0,0 @@
1
- import json
2
- import logging
3
- import os
4
- import random
5
- from collections import Counter, defaultdict
6
-
7
- import numpy as np
8
- import pandas as pd
9
- from scipy.stats import chisquare, ranksums
10
- from sklearn.metrics import accuracy_score, f1_score
11
- from sklearn.model_selection import StratifiedKFold, train_test_split
12
-
13
- from . import perturber_utils as pu
14
-
15
- logger = logging.getLogger(__name__)
16
-
17
-
18
- def downsample_and_shuffle(data, max_ncells, max_ncells_per_class, cell_state_dict):
19
- data = data.shuffle(seed=42)
20
- num_cells = len(data)
21
- # if max number of cells is defined, then subsample to this max number
22
- if max_ncells is not None:
23
- if num_cells > max_ncells:
24
- data = data.select([i for i in range(max_ncells)])
25
- if max_ncells_per_class is not None:
26
- class_labels = data[cell_state_dict["state_key"]]
27
- random.seed(42)
28
- subsample_indices = subsample_by_class(class_labels, max_ncells_per_class)
29
- data = data.select(subsample_indices)
30
- return data
31
-
32
-
33
- # subsample labels to maximum number N per class and return indices
34
- def subsample_by_class(labels, N):
35
- label_indices = defaultdict(list)
36
- # Gather indices for each label
37
- for idx, label in enumerate(labels):
38
- label_indices[label].append(idx)
39
- selected_indices = []
40
- # Select up to N indices for each label
41
- for label, indices in label_indices.items():
42
- if len(indices) > N:
43
- selected_indices.extend(random.sample(indices, N))
44
- else:
45
- selected_indices.extend(indices)
46
- return selected_indices
47
-
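A worked toy example of subsample_by_class above (assuming the function is in scope); with N=2, each class is capped at two indices while smaller classes are kept whole:

    labels = ["B", "B", "B", "T", "T", "NK"]
    idx = subsample_by_class(labels, 2)  # 2 of the "B"s, both "T"s, the one "NK"
    assert len(idx) == 5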
48
-
49
- def rename_cols(data, state_key):
50
- data = data.rename_column(state_key, "label")
51
- return data
52
-
53
-
54
- def validate_and_clean_cols(train_data, eval_data, classifier):
55
- # validate that data has expected label column and remove others
56
- if classifier == "cell":
57
- label_col = "label"
58
- elif classifier == "gene":
59
- label_col = "labels"
60
-
61
- cols_to_keep = [label_col] + ["input_ids", "length"]
62
- if label_col not in train_data.column_names:
63
- logger.error(f"train_data must contain column {label_col} with class labels.")
64
- raise
65
- else:
66
- train_data = remove_cols(train_data, cols_to_keep)
67
-
68
- if eval_data is not None:
69
- if label_col not in eval_data.column_names:
70
- logger.error(
71
- f"eval_data must contain column {label_col} with class labels."
72
- )
73
- raise
74
- else:
75
- eval_data = remove_cols(eval_data, cols_to_keep)
76
- return train_data, eval_data
77
-
78
-
79
- def remove_cols(data, cols_to_keep):
80
- other_cols = list(data.features.keys())
81
- other_cols = [ele for ele in other_cols if ele not in cols_to_keep]
82
- data = data.remove_columns(other_cols)
83
- return data
84
-
85
-
86
- def remove_rare(data, rare_threshold, label, nproc):
87
- if rare_threshold > 0:
88
- total_cells = len(data)
89
- label_counter = Counter(data[label])
90
- nonrare_label_dict = {
91
- label: [k for k, v in label_counter.items() if (v / total_cells) > rare_threshold]
92
- }
93
- data = pu.filter_by_dict(data, nonrare_label_dict, nproc)
94
- return data
95
-
96
-
97
- def label_classes(classifier, data, gene_class_dict, nproc):
98
- if classifier == "cell":
99
- label_set = set(data["label"])
100
- elif classifier == "gene":
101
- # remove cells without any of the target genes
102
- def if_contains_label(example):
103
- a = pu.flatten_list(gene_class_dict.values())
104
- b = example["input_ids"]
105
- return not set(a).isdisjoint(b)
106
-
107
- data = data.filter(if_contains_label, num_proc=nproc)
108
- label_set = gene_class_dict.keys()
109
-
110
- if len(data) == 0:
111
- logger.error(
112
- "No cells remain after filtering for target genes. Check target gene list."
113
- )
114
- raise
115
-
116
- class_id_dict = dict(zip(label_set, [i for i in range(len(label_set))]))
117
- id_class_dict = {v: k for k, v in class_id_dict.items()}
118
-
119
- def classes_to_ids(example):
120
- if classifier == "cell":
121
- example["label"] = class_id_dict[example["label"]]
122
- elif classifier == "gene":
123
- example["labels"] = label_gene_classes(
124
- example, class_id_dict, gene_class_dict
125
- )
126
- return example
127
-
128
- data = data.map(classes_to_ids, num_proc=nproc)
129
- return data, id_class_dict
130
-
131
-
132
- def label_gene_classes(example, class_id_dict, gene_class_dict):
133
- return [
134
- class_id_dict.get(gene_class_dict.get(token_id, -100), -100)
135
- for token_id in example["input_ids"]
136
- ]
137
-
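An illustration of the token-level labels produced by label_gene_classes above, using hypothetical toy token IDs; here gene_class_dict is assumed to map token_id -> class name (the inverted form of the user-facing class -> genes dictionary):

    gene_class_dict = {101: "dosage_sensitive", 202: "dosage_insensitive"}
    class_id_dict = {"dosage_sensitive": 0, "dosage_insensitive": 1}
    example = {"input_ids": [7, 101, 202, 9]}
    labels = label_gene_classes(example, class_id_dict, gene_class_dict)
    assert labels == [-100, 0, 1, -100]  # non-target tokens are masked with -100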
138
-
139
- def prep_gene_classifier_train_eval_split(
140
- data,
141
- targets,
142
- labels,
143
- train_index,
144
- eval_index,
145
- max_ncells,
146
- iteration_num,
147
- num_proc,
148
- balance=False,
149
- ):
150
- # generate cross-validation splits
151
- train_data = prep_gene_classifier_split(
152
- data,
153
- targets,
154
- labels,
155
- train_index,
156
- "train",
157
- max_ncells,
158
- iteration_num,
159
- num_proc,
160
- balance,
161
- )
162
- eval_data = prep_gene_classifier_split(
163
- data,
164
- targets,
165
- labels,
166
- eval_index,
167
- "eval",
168
- max_ncells,
169
- iteration_num,
170
- num_proc,
171
- balance,
172
- )
173
- return train_data, eval_data
174
-
175
-
176
- def prep_gene_classifier_split(
177
- data,
178
- targets,
179
- labels,
180
- index,
181
- subset_name,
182
- max_ncells,
183
- iteration_num,
184
- num_proc,
185
- balance=False,
186
- ):
187
- # generate cross-validation splits
188
- targets = np.array(targets)
189
- labels = np.array(labels)
190
- targets_subset = targets[index]
191
- labels_subset = labels[index]
192
- label_dict_subset = dict(zip(targets_subset, labels_subset))
193
-
194
- # function to filter by whether contains train or eval labels
195
- def if_contains_subset_label(example):
196
- a = targets_subset
197
- b = example["input_ids"]
198
- return not set(a).isdisjoint(b)
199
-
200
- # filter dataset for examples containing classes for this split
201
- logger.info(f"Filtering data for {subset_name} genes in split {iteration_num}")
202
- subset_data = data.filter(if_contains_subset_label, num_proc=num_proc)
203
- logger.info(
204
- f"Filtered {round((1-len(subset_data)/len(data))*100)}%; {len(subset_data)} remain\n"
205
- )
206
-
207
- # balance gene subsets if train
208
- if (subset_name == "train") and (balance is True):
209
- subset_data, label_dict_subset = balance_gene_split(
210
- subset_data, label_dict_subset, num_proc
211
- )
212
-
213
- # subsample to max_ncells
214
- subset_data = downsample_and_shuffle(subset_data, max_ncells, None, None)
215
-
216
- # relabel genes for this split
217
- def subset_classes_to_ids(example):
218
- example["labels"] = [
219
- label_dict_subset.get(token_id, -100) for token_id in example["input_ids"]
220
- ]
221
- return example
222
-
223
- subset_data = subset_data.map(subset_classes_to_ids, num_proc=num_proc)
224
-
225
- return subset_data
226
-
227
-
228
- def prep_gene_classifier_all_data(
229
- data, targets, labels, max_ncells, num_proc, balance=False
230
- ):
231
- targets = np.array(targets)
232
- labels = np.array(labels)
233
- label_dict_train = dict(zip(targets, labels))
234
-
235
- # function to filter by whether contains train labels
236
- def if_contains_train_label(example):
237
- a = targets
238
- b = example["input_ids"]
239
- return not set(a).isdisjoint(b)
240
-
241
- # filter dataset for examples containing classes for this split
242
- logger.info("Filtering training data for genes to classify.")
243
- train_data = data.filter(if_contains_train_label, num_proc=num_proc)
244
- logger.info(
245
- f"Filtered {round((1-len(train_data)/len(data))*100)}%; {len(train_data)} remain\n"
246
- )
247
-
248
- if balance is True:
249
- train_data, label_dict_train = balance_gene_split(
250
- train_data, label_dict_train, num_proc
251
- )
252
-
253
- # subsample to max_ncells
254
- train_data = downsample_and_shuffle(train_data, max_ncells, None, None)
255
-
256
- # relabel genes for this split
257
- def train_classes_to_ids(example):
258
- example["labels"] = [
259
- label_dict_train.get(token_id, -100) for token_id in example["input_ids"]
260
- ]
261
- return example
262
-
263
- train_data = train_data.map(train_classes_to_ids, num_proc=num_proc)
264
-
265
- return train_data
266
-
267
-
268
- def balance_gene_split(subset_data, label_dict_subset, num_proc):
269
- # count occurrence of genes in each label category
270
- label0_counts, label1_counts = count_genes_for_balancing(
271
- subset_data, label_dict_subset, num_proc
272
- )
273
- label_ratio_0to1 = label0_counts / label1_counts
274
-
275
- if 8 / 10 <= label_ratio_0to1 <= 10 / 8:
276
- # gene sets already balanced
277
- logger.info(
278
- "Gene sets were already balanced within 0.8-1.25 fold and did not require balancing.\n"
279
- )
280
- return subset_data, label_dict_subset
281
- else:
282
- label_ratio_0to1_orig = label_ratio_0to1 + 0
283
- label_dict_subset_orig = label_dict_subset.copy()
284
- # balance gene sets
285
- max_ntrials = 25
286
- boost = 1
287
- if label_ratio_0to1 > 10 / 8:
288
- # downsample label 0
289
- for i in range(max_ntrials):
290
- label0 = 0
291
- label0_genes = [k for k, v in label_dict_subset.items() if v == label0]
292
- label0_ngenes = len(label0_genes)
293
- label0_nremove = max(
294
- 1,
295
- int(
296
- np.floor(
297
- label0_ngenes - label0_ngenes / (label_ratio_0to1 * boost)
298
- )
299
- ),
300
- )
301
- random.seed(i)
302
- label0_remove_genes = random.sample(label0_genes, label0_nremove)
303
- label_dict_subset_new = {
304
- k: v
305
- for k, v in label_dict_subset.items()
306
- if k not in label0_remove_genes
307
- }
308
- label0_counts, label1_counts = count_genes_for_balancing(
309
- subset_data, label_dict_subset_new, num_proc
310
- )
311
- label_ratio_0to1 = label0_counts / label1_counts
312
- if 8 / 10 <= label_ratio_0to1 <= 10 / 8:
313
- # if gene sets now balanced, return new filtered data and new label_dict_subset
314
- return filter_data_balanced_genes(
315
- subset_data, label_dict_subset_new, num_proc
316
- )
317
- elif label_ratio_0to1 > 10 / 8:
318
- boost = boost * 1.1
319
- elif label_ratio_0to1 < 8 / 10:
320
- boost = boost * 0.9
321
- else:
322
- # downsample label 1
323
- for i in range(max_ntrials):
324
- label1 = 1
325
- label1_genes = [k for k, v in label_dict_subset.items() if v == label1]
326
- label1_ngenes = len(label1_genes)
327
- label1_nremove = max(
328
- 1,
329
- int(
330
- np.floor(
331
- label1_ngenes
332
- - label1_ngenes / ((1 / label_ratio_0to1) * boost)
333
- )
334
- ),
335
- )
336
- random.seed(i)
337
- label1_remove_genes = random.sample(label1_genes, label1_nremove)
338
- label_dict_subset_new = {
339
- k: v
340
- for k, v in label_dict_subset.items()
341
- if k not in label1_remove_genes
342
- }
343
- label0_counts, label1_counts = count_genes_for_balancing(
344
- subset_data, label_dict_subset_new, num_proc
345
- )
346
- label_ratio_0to1 = label0_counts / label1_counts
347
- if 8 / 10 <= label_ratio_0to1 <= 10 / 8:
348
- # if gene sets now balanced, return new filtered data and new label_dict_subset
349
- return filter_data_balanced_genes(
350
- subset_data, label_dict_subset_new, num_proc
351
- )
352
- elif label_ratio_0to1 < 8 / 10:
353
- boost = boost * 1.1
354
- elif label_ratio_0to1 > 10 / 8:
355
- boost = boost * 0.9
356
-
357
- assert i + 1 == max_ntrials
358
- if (label_ratio_0to1 <= label_ratio_0to1_orig < 8 / 10) or (
359
- 10 / 8 > label_ratio_0to1_orig >= label_ratio_0to1
360
- ):
361
- label_ratio_0to1 = label_ratio_0to1_orig
362
- label_dict_subset_new = label_dict_subset_orig
363
- logger.warning(
364
- f"Gene sets were not able to be balanced within 0.8-1.25 fold after {max_ntrials} trials. Imbalance level: {label_ratio_0to1}\n"
365
- )
366
- return filter_data_balanced_genes(subset_data, label_dict_subset_new, num_proc)
367
-
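Toy arithmetic for the 0.8-1.25-fold balance rule applied above: a ratio outside [8/10, 10/8] triggers iterative downsampling of the overrepresented label.

    label0_counts, label1_counts = 1000, 400
    label_ratio_0to1 = label0_counts / label1_counts  # 2.5 -> imbalanced
    assert not (8 / 10 <= label_ratio_0to1 <= 10 / 8)
    # the first downsampling trial removes roughly
    # label0_ngenes - label0_ngenes / label_ratio_0to1 of the label-0 genes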
368
-
369
- def count_genes_for_balancing(subset_data, label_dict_subset, num_proc):
370
- def count_targets(example):
371
- labels = [
372
- label_dict_subset.get(token_id, -100) for token_id in example["input_ids"]
373
- ]
374
- counter_labels = Counter(labels)
375
- # get count of labels 0 or 1, or if absent, return 0
376
- example["labels_counts"] = [counter_labels.get(0, 0), counter_labels.get(1, 0)]
377
- return example
378
-
379
- subset_data = subset_data.map(count_targets, num_proc=num_proc)
380
-
381
- label0_counts = sum([counts[0] for counts in subset_data["labels_counts"]])
382
- label1_counts = sum([counts[1] for counts in subset_data["labels_counts"]])
383
-
384
- subset_data = subset_data.remove_columns("labels_counts")
385
-
386
- return label0_counts, label1_counts
387
-
388
-
389
- def filter_data_balanced_genes(subset_data, label_dict_subset, num_proc):
390
- # function to filter by whether contains labels
391
- def if_contains_subset_label(example):
392
- a = list(label_dict_subset.keys())
393
- b = example["input_ids"]
394
- return not set(a).isdisjoint(b)
395
-
396
- # filter dataset for examples containing classes for this split
397
- logger.info("Filtering data for balanced genes")
398
- subset_data_len_orig = len(subset_data)
399
- subset_data = subset_data.filter(if_contains_subset_label, num_proc=num_proc)
400
- logger.info(
401
- f"Filtered {round((1-len(subset_data)/subset_data_len_orig)*100)}%; {len(subset_data)} remain\n"
402
- )
403
-
404
- return subset_data, label_dict_subset
405
-
406
-
407
- def balance_attr_splits(
408
- data,
409
- attr_to_split,
410
- attr_to_balance,
411
- eval_size,
412
- max_trials,
413
- pval_threshold,
414
- state_key,
415
- nproc,
416
- ):
417
- metadata_df = pd.DataFrame({"split_attr_ids": data[attr_to_split]})
418
- for attr in attr_to_balance:
419
- if attr == state_key:
420
- metadata_df[attr] = data["label"]
421
- else:
422
- metadata_df[attr] = data[attr]
423
- metadata_df = metadata_df.drop_duplicates()
424
-
425
- split_attr_ids = list(metadata_df["split_attr_ids"])
426
- assert len(split_attr_ids) == len(set(split_attr_ids))
427
- eval_num = round(len(split_attr_ids) * eval_size)
428
- colnames = (
429
- ["trial_num", "train_ids", "eval_ids"]
430
- + pu.flatten_list(
431
- [
432
- [
433
- f"{attr}_train_mean_or_counts",
434
- f"{attr}_eval_mean_or_counts",
435
- f"{attr}_pval",
436
- ]
437
- for attr in attr_to_balance
438
- ]
439
- )
440
- + ["mean_pval"]
441
- )
442
- balance_df = pd.DataFrame(columns=colnames)
443
- data_dict = dict()
444
- trial_num = 1
445
- for i in range(max_trials):
446
- if not all(
447
- count > 1 for count in list(Counter(metadata_df[state_key]).values())
448
- ):
449
- logger.error(
450
- f"Cannot balance by {attr_to_split} while retaining at least 1 occurrence of each {state_key} class in both data splits. "
451
- )
452
- raise
453
- eval_base = []
454
- for state in set(metadata_df[state_key]):
455
- eval_base += list(
456
- metadata_df.loc[
457
- metadata_df[state_key][metadata_df[state_key].eq(state)]
458
- .sample(1, random_state=i)
459
- .index
460
- ]["split_attr_ids"]
461
- )
462
- non_eval_base = [idx for idx in split_attr_ids if idx not in eval_base]
463
- random.seed(i)
464
- eval_ids = random.sample(non_eval_base, eval_num - len(eval_base)) + eval_base
465
- train_ids = [idx for idx in split_attr_ids if idx not in eval_ids]
466
- df_vals = [trial_num, train_ids, eval_ids]
467
- pvals = []
468
- for attr in attr_to_balance:
469
- train_attr = list(
470
- metadata_df[metadata_df["split_attr_ids"].isin(train_ids)][attr]
471
- )
472
- eval_attr = list(
473
- metadata_df[metadata_df["split_attr_ids"].isin(eval_ids)][attr]
474
- )
475
- if attr == state_key:
476
- # ensure IDs are interpreted as categorical
477
- train_attr = [str(item) for item in train_attr]
478
- eval_attr = [str(item) for item in eval_attr]
479
- if all(isinstance(item, (int, float)) for item in train_attr + eval_attr):
480
- train_attr_mean = np.nanmean(train_attr)
481
- eval_attr_mean = np.nanmean(eval_attr)
482
- pval = ranksums(train_attr, eval_attr, nan_policy="omit").pvalue
483
- df_vals += [train_attr_mean, eval_attr_mean, pval]
484
- elif all(isinstance(item, (str)) for item in train_attr + eval_attr):
485
- obs_counts = Counter(train_attr)
486
- exp_counts = Counter(eval_attr)
487
- all_categ = set(obs_counts.keys()).union(set(exp_counts.keys()))
488
- obs = [obs_counts[cat] for cat in all_categ]
489
- exp = [
490
- exp_counts[cat] * sum(obs) / sum(exp_counts.values())
491
- for cat in all_categ
492
- ]
493
- pval = chisquare(f_obs=obs, f_exp=exp).pvalue
494
- train_attr_counts = str(obs_counts).strip("Counter(").strip(")")
495
- eval_attr_counts = str(exp_counts).strip("Counter(").strip(")")
496
- df_vals += [train_attr_counts, eval_attr_counts, pval]
497
- else:
498
- logger.error(
499
- f"Inconsistent data types in attribute {attr}. "
500
- "Cannot infer if continuous or categorical. "
501
- "Must be all numeric (continuous) or all strings (categorical) to balance."
502
- )
503
- raise
504
- pvals += [pval]
505
-
506
- df_vals += [np.nanmean(pvals)]
507
- balance_df_i = pd.DataFrame(df_vals, index=colnames).T
508
- balance_df = pd.concat([balance_df, balance_df_i], ignore_index=True)
509
- valid_pvals = [
510
- pval_i
511
- for pval_i in pvals
512
- if isinstance(pval_i, (int, float)) and not np.isnan(pval_i)
513
- ]
514
- if all(i >= pval_threshold for i in valid_pvals):
515
- data_dict["train"] = pu.filter_by_dict(
516
- data, {attr_to_split: balance_df_i["train_ids"][0]}, nproc
517
- )
518
- data_dict["test"] = pu.filter_by_dict(
519
- data, {attr_to_split: balance_df_i["eval_ids"][0]}, nproc
520
- )
521
- return data_dict, balance_df
522
- trial_num = trial_num + 1
523
- balance_max_df = balance_df.iloc[balance_df["mean_pval"].idxmax(), :]
524
- # use the optimal completed trial rather than the last one
- data_dict["train"] = pu.filter_by_dict(
- data, {attr_to_split: balance_max_df["train_ids"]}, nproc
- )
- data_dict["test"] = pu.filter_by_dict(
- data, {attr_to_split: balance_max_df["eval_ids"]}, nproc
529
- )
530
- logger.warning(
531
- f"No splits found without significant difference in attr_to_balance among {max_trials} trials. "
532
- f"Selecting optimal split (trial #{balance_max_df['trial_num']}) from completed trials."
533
- )
534
- return data_dict, balance_df
535
-
536
-
537
- def get_num_classes(id_class_dict):
538
- return len(set(id_class_dict.values()))
539
-
540
-
541
- def compute_metrics(pred):
542
- labels = pred.label_ids
543
- preds = pred.predictions.argmax(-1)
544
-
545
- # calculate accuracy and macro f1 using sklearn's function
546
- if len(labels.shape) == 1:
547
- acc = accuracy_score(labels, preds)
548
- macro_f1 = f1_score(labels, preds, average="macro")
549
- else:
550
- flat_labels = labels.flatten().tolist()
551
- flat_preds = preds.flatten().tolist()
552
- logit_label_paired = [
553
- item for item in list(zip(flat_preds, flat_labels)) if item[1] != -100
554
- ]
555
- y_pred = [item[0] for item in logit_label_paired]
556
- y_true = [item[1] for item in logit_label_paired]
557
-
558
- acc = accuracy_score(y_true, y_pred)
559
- macro_f1 = f1_score(y_true, y_pred, average="macro")
560
-
561
- return {"accuracy": acc, "macro_f1": macro_f1}
562
-
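A sketch of the gene-classifier branch of compute_metrics above, showing how -100 ignore labels are dropped before scoring:

    import numpy as np
    from sklearn.metrics import accuracy_score, f1_score

    labels = np.array([[0, 1, -100], [1, -100, 0]])  # token-level labels; -100 = ignore
    preds = np.array([[0, 1, 1], [1, 0, 0]])         # argmaxed predictions
    pairs = [(p, l) for p, l in zip(preds.flatten(), labels.flatten()) if l != -100]
    y_pred = [p for p, _ in pairs]
    y_true = [l for _, l in pairs]
    assert accuracy_score(y_true, y_pred) == 1.0
    assert f1_score(y_true, y_pred, average="macro") == 1.0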
563
-
564
- def get_default_train_args(model, classifier, data, output_dir):
565
- num_layers = pu.quant_layers(model)
566
- freeze_layers = 0
567
- batch_size = 12
568
- if classifier == "cell":
569
- epochs = 10
570
- evaluation_strategy = "epoch"
571
- load_best_model_at_end = True
572
- else:
573
- epochs = 1
574
- evaluation_strategy = "no"
575
- load_best_model_at_end = False
576
-
577
- if num_layers == 6:
578
- default_training_args = {
579
- "learning_rate": 5e-5,
580
- "lr_scheduler_type": "linear",
581
- "warmup_steps": 500,
582
- "per_device_train_batch_size": batch_size,
583
- "per_device_eval_batch_size": batch_size,
584
- }
585
- else:
586
- default_training_args = {
587
- "per_device_train_batch_size": batch_size,
588
- "per_device_eval_batch_size": batch_size,
589
- }
590
-
591
- training_args = {
592
- "num_train_epochs": epochs,
593
- "do_train": True,
594
- "do_eval": True,
595
- "evaluation_strategy": evaluation_strategy,
596
- "logging_steps": np.floor(len(data) / batch_size / 8), # log 8 times per epoch
597
- "save_strategy": "epoch",
598
- "group_by_length": False,
599
- "length_column_name": "length",
600
- "disable_tqdm": False,
601
- "weight_decay": 0.001,
602
- "load_best_model_at_end": load_best_model_at_end,
603
- }
604
- training_args.update(default_training_args)
605
-
606
- return training_args, freeze_layers
607
-
608
-
609
- def load_best_model(directory, model_type, num_classes, mode="eval"):
610
- file_dict = dict()
611
- for subdir, dirs, files in os.walk(directory):
612
- for file in files:
613
- if file.endswith("result.json"):
614
- with open(f"{subdir}/{file}", "rb") as fp:
615
- result_json = json.load(fp)
616
- file_dict[f"{subdir}"] = result_json["eval_macro_f1"]
617
- file_df = pd.DataFrame(
618
- {"dir": file_dict.keys(), "eval_macro_f1": file_dict.values()}
619
- )
620
- model_superdir = (
621
- "run-"
622
- + file_df.iloc[file_df["eval_macro_f1"].idxmax()]["dir"]
623
- .split("_objective_")[2]
624
- .split("_")[0]
625
- )
626
-
627
- for subdir, dirs, files in os.walk(f"{directory}/{model_superdir}"):
628
- for file in files:
629
- if file.endswith("model.safetensors"):
630
- model = pu.load_model(model_type, num_classes, f"{subdir}", mode)
631
- return model
632
-
633
-
634
- class StratifiedKFold3(StratifiedKFold):
635
- def split(self, targets, labels, test_ratio=0.5, groups=None):
636
- s = super().split(targets, labels, groups)
637
- for train_indxs, test_indxs in s:
638
- if test_ratio == 0:
639
- yield train_indxs, test_indxs, None
640
- else:
641
- labels_test = np.array(labels)[test_indxs]
642
- valid_indxs, test_indxs = train_test_split(
643
- test_indxs,
644
- stratify=labels_test,
645
- test_size=test_ratio,
646
- random_state=0,
647
- )
648
- yield train_indxs, valid_indxs, test_indxs
 
 
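A hedged sketch of the three-way splitter above: each stratified fold's held-out set is further split into validation and out-of-sample test indices (assuming the class is importable).

    targets = list(range(20))
    labels = [0, 1] * 10
    skf = StratifiedKFold3(n_splits=5, random_state=0, shuffle=True)
    for train_idx, valid_idx, test_idx in skf.split(targets, labels, test_ratio=0.5):
        assert len(train_idx) == 16
        assert len(valid_idx) == len(test_idx) == 2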
geneformer/collator_for_classification.py CHANGED
@@ -1,22 +1,24 @@
1
  """
2
  Geneformer collator for gene and cell classification.
 
3
  Huggingface data collator modified to accommodate single-cell transcriptomics data for gene and cell classification.
4
  """
5
-
 
6
  import warnings
7
  from enum import Enum
8
  from typing import Dict, List, Optional, Union
9
 
10
- import numpy as np
11
- import torch
12
  from transformers import (
13
- BatchEncoding,
14
  DataCollatorForTokenClassification,
15
  SpecialTokensMixin,
 
16
  )
17
  from transformers.utils import is_tf_available, is_torch_available, logging, to_py_obj
18
  from transformers.utils.generic import _is_tensorflow, _is_torch
19
 
 
 
20
  EncodedInput = List[int]
21
  logger = logging.get_logger(__name__)
22
  VERY_LARGE_INTEGER = int(
@@ -28,7 +30,6 @@ LARGE_INTEGER = int(
28
 
29
  # precollator functions
30
 
31
-
32
  class ExplicitEnum(Enum):
33
  """
34
  Enum with more explicit error message for missing values.
@@ -41,7 +42,6 @@ class ExplicitEnum(Enum):
41
  % (value, cls.__name__, str(list(cls._value2member_map_.keys())))
42
  )
43
 
44
-
45
  class TruncationStrategy(ExplicitEnum):
46
  """
47
  Possible values for the ``truncation`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for
@@ -54,6 +54,7 @@ class TruncationStrategy(ExplicitEnum):
54
  DO_NOT_TRUNCATE = "do_not_truncate"
55
 
56
 
 
57
  class PaddingStrategy(ExplicitEnum):
58
  """
59
  Possible values for the ``padding`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for tab-completion
@@ -65,6 +66,7 @@ class PaddingStrategy(ExplicitEnum):
65
  DO_NOT_PAD = "do_not_pad"
66
 
67
 
 
68
  class TensorType(ExplicitEnum):
69
  """
70
  Possible values for the ``return_tensors`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for
@@ -76,41 +78,21 @@ class TensorType(ExplicitEnum):
76
  NUMPY = "np"
77
  JAX = "jax"
78
 
79
-
80
  class PrecollatorForGeneAndCellClassification(SpecialTokensMixin):
81
- def __init__(self, *args, **kwargs) -> None:
82
- super().__init__(mask_token="<mask>", pad_token="<pad>")
83
-
84
- self.token_dictionary = kwargs.get("token_dictionary")
85
- self.padding_side = "right"
86
- self.model_input_names = ["input_ids"]
87
- self._mask_token_id = self.token_dictionary.get("<mask>")
88
- self._pad_token_id = self.token_dictionary.get("<pad>")
89
- self._all_special_ids = [
90
- self.token_dictionary.get("<mask>"),
91
- self.token_dictionary.get("<pad>"),
92
- ]
93
-
94
- @property
95
- def all_special_ids(self):
96
- return self._all_special_ids
97
-
98
- @property
99
- def mask_token_id(self):
100
- return self._mask_token_id
101
-
102
- @property
103
- def pad_token_id(self):
104
- return self._pad_token_id
105
 
106
  def _get_padding_truncation_strategies(
107
- self,
108
- padding=True,
109
- truncation=False,
110
- max_length=None,
111
- pad_to_multiple_of=None,
112
- verbose=True,
113
- **kwargs,
114
  ):
115
  """
116
  Find the correct padding/truncation strategy with backward compatibility for old arguments (truncation_strategy
@@ -123,9 +105,7 @@ class PrecollatorForGeneAndCellClassification(SpecialTokensMixin):
123
  # If you only set max_length, it activates truncation for max_length
124
  if max_length is not None and padding is False and truncation is False:
125
  if verbose:
126
- if not self.deprecation_warnings.get(
127
- "Truncation-not-explicitly-activated", False
128
- ):
129
  logger.warning(
130
  "Truncation was not explicitly activated but `max_length` is provided a specific value, "
131
  "please use `truncation=True` to explicitly truncate examples to max length. "
@@ -153,9 +133,7 @@ class PrecollatorForGeneAndCellClassification(SpecialTokensMixin):
153
  padding_strategy = PaddingStrategy.MAX_LENGTH
154
  elif padding is not False:
155
  if padding is True:
156
- padding_strategy = (
157
- PaddingStrategy.LONGEST
158
- ) # Default to pad to the longest sequence in the batch
159
  elif not isinstance(padding, PaddingStrategy):
160
  padding_strategy = PaddingStrategy(padding)
161
  elif isinstance(padding, PaddingStrategy):
@@ -195,9 +173,7 @@ class PrecollatorForGeneAndCellClassification(SpecialTokensMixin):
195
  if padding_strategy == PaddingStrategy.MAX_LENGTH:
196
  if self.model_max_length > LARGE_INTEGER:
197
  if verbose:
198
- if not self.deprecation_warnings.get(
199
- "Asking-to-pad-to-max_length", False
200
- ):
201
  logger.warning(
202
  "Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. "
203
  "Default to no padding."
@@ -210,24 +186,18 @@ class PrecollatorForGeneAndCellClassification(SpecialTokensMixin):
210
  if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE:
211
  if self.model_max_length > LARGE_INTEGER:
212
  if verbose:
213
- if not self.deprecation_warnings.get(
214
- "Asking-to-truncate-to-max_length", False
215
- ):
216
  logger.warning(
217
  "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. "
218
  "Default to no truncation."
219
  )
220
- self.deprecation_warnings[
221
- "Asking-to-truncate-to-max_length"
222
- ] = True
223
  truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE
224
  else:
225
  max_length = self.model_max_length
226
 
227
  # Test if we have a padding token
228
- if padding_strategy != PaddingStrategy.DO_NOT_PAD and (
229
- not self.pad_token or self.pad_token_id < 0
230
- ):
231
  raise ValueError(
232
  "Asking to pad but the tokenizer does not have a padding token. "
233
  "Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` "
@@ -258,7 +228,7 @@ class PrecollatorForGeneAndCellClassification(SpecialTokensMixin):
258
  Dict[str, List[EncodedInput]],
259
  List[Dict[str, EncodedInput]],
260
  ],
261
- class_type, # options: "gene" or "cell"
262
  padding: Union[bool, str, PaddingStrategy] = True,
263
  max_length: Optional[int] = None,
264
  pad_to_multiple_of: Optional[int] = None,
@@ -269,23 +239,29 @@ class PrecollatorForGeneAndCellClassification(SpecialTokensMixin):
269
  """
270
  Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length
271
  in the batch.
 
272
  Padding side (left/right) and padding token ids are defined at the tokenizer level (with ``self.padding_side``,
273
  ``self.pad_token_id`` and ``self.pad_token_type_id``)
 
274
  .. note::
 
275
  If the ``encoded_inputs`` passed are a dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, the
276
  result will use the same type unless you provide a different tensor type with ``return_tensors``. In the
277
  case of PyTorch tensors, however, you will lose the specific device of your tensors.
 
278
  Args:
279
  encoded_inputs (:class:`~transformers.BatchEncoding`, list of :class:`~transformers.BatchEncoding`, :obj:`Dict[str, List[int]]`, :obj:`Dict[str, List[List[int]]` or :obj:`List[Dict[str, List[int]]]`):
280
  Tokenized inputs. Can represent one input (:class:`~transformers.BatchEncoding` or :obj:`Dict[str,
281
  List[int]]`) or a batch of tokenized inputs (list of :class:`~transformers.BatchEncoding`, `Dict[str,
282
  List[List[int]]]` or `List[Dict[str, List[int]]]`) so you can use this method during preprocessing as
283
  well as in a PyTorch Dataloader collate function.
 
284
  Instead of :obj:`List[int]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors),
285
  see the note above for the return type.
286
  padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
287
  Select a strategy to pad the returned sequences (according to the model's padding side and padding
288
  index) among:
 
289
  * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
290
  single sequence is provided).
291
  * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
@@ -296,14 +272,17 @@ class PrecollatorForGeneAndCellClassification(SpecialTokensMixin):
296
  Maximum length of the returned list and optionally padding length (see above).
297
  pad_to_multiple_of (:obj:`int`, `optional`):
298
  If set, will pad the sequence to a multiple of the provided value.
 
299
  This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
300
  >= 7.5 (Volta).
301
  return_attention_mask (:obj:`bool`, `optional`):
302
  Whether to return the attention mask. If left to the default, will return the attention mask according
303
  to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute.
 
304
  `What are attention masks? <../glossary.html#attention-mask>`__
305
  return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`):
306
  If set, will return tensors instead of list of python integers. Acceptable values are:
 
307
  * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
308
  * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
309
  * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
@@ -312,13 +291,8 @@ class PrecollatorForGeneAndCellClassification(SpecialTokensMixin):
312
  """
313
  # If we have a list of dicts, let's convert it in a dict of lists
314
  # We do this to allow using this method as a collate_fn function in PyTorch Dataloader
315
- if isinstance(encoded_inputs, (list, tuple)) and isinstance(
316
- encoded_inputs[0], (dict, BatchEncoding)
317
- ):
318
- encoded_inputs = {
319
- key: [example[key] for example in encoded_inputs]
320
- for key in encoded_inputs[0].keys()
321
- }
322
 
323
  # The model's main input name, usually `input_ids`, has to be passed for padding
324
  if self.model_input_names[0] not in encoded_inputs:
@@ -412,7 +386,7 @@ class PrecollatorForGeneAndCellClassification(SpecialTokensMixin):
412
  def _pad(
413
  self,
414
  encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
415
- class_type, # options: "gene" or "cell"
416
  max_length: Optional[int] = None,
417
  padding_strategy: PaddingStrategy = PaddingStrategy.LONGEST,
418
  pad_to_multiple_of: Optional[int] = None,
@@ -420,15 +394,18 @@ class PrecollatorForGeneAndCellClassification(SpecialTokensMixin):
420
  ) -> dict:
421
  """
422
  Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
 
423
  Args:
424
  encoded_inputs: Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
425
  max_length: maximum length of the returned list and optionally padding length (see below).
426
  Will truncate by taking into account the special tokens.
427
  padding_strategy: PaddingStrategy to use for padding.
 
428
  - PaddingStrategy.LONGEST: Pad to the longest sequence in the batch
429
  - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
430
  - PaddingStrategy.DO_NOT_PAD: Do not pad
431
  The tokenizer padding sides are defined in self.padding_side:
 
432
  - 'left': pads on the left of the sequences
433
  - 'right': pads on the right of the sequences
434
  pad_to_multiple_of: (optional) Integer; if set, will pad the sequence to a multiple of the provided value.
@@ -445,73 +422,46 @@ class PrecollatorForGeneAndCellClassification(SpecialTokensMixin):
445
  if padding_strategy == PaddingStrategy.LONGEST:
446
  max_length = len(required_input)
447
 
448
- if (
449
- max_length is not None
450
- and pad_to_multiple_of is not None
451
- and (max_length % pad_to_multiple_of != 0)
452
- ):
453
  max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
454
 
455
- needs_to_be_padded = (
456
- padding_strategy != PaddingStrategy.DO_NOT_PAD
457
- and len(required_input) != max_length
458
- )
459
 
460
  if needs_to_be_padded:
461
  difference = max_length - len(required_input)
462
  if self.padding_side == "right":
463
  if return_attention_mask:
464
- encoded_inputs["attention_mask"] = [1] * len(required_input) + [
465
- 0
466
- ] * difference
467
  if "token_type_ids" in encoded_inputs:
468
  encoded_inputs["token_type_ids"] = (
469
- encoded_inputs["token_type_ids"]
470
- + [self.pad_token_type_id] * difference
471
  )
472
  if "special_tokens_mask" in encoded_inputs:
473
- encoded_inputs["special_tokens_mask"] = (
474
- encoded_inputs["special_tokens_mask"] + [1] * difference
475
- )
476
- encoded_inputs[self.model_input_names[0]] = (
477
- required_input + [self.pad_token_id] * difference
478
- )
479
  if class_type == "gene":
480
- encoded_inputs["labels"] = (
481
- encoded_inputs["labels"] + [-100] * difference
482
- )
483
  elif self.padding_side == "left":
484
  if return_attention_mask:
485
- encoded_inputs["attention_mask"] = [0] * difference + [1] * len(
486
- required_input
487
- )
488
  if "token_type_ids" in encoded_inputs:
489
- encoded_inputs["token_type_ids"] = [
490
- self.pad_token_type_id
491
- ] * difference + encoded_inputs["token_type_ids"]
492
  if "special_tokens_mask" in encoded_inputs:
493
- encoded_inputs["special_tokens_mask"] = [
494
- 1
495
- ] * difference + encoded_inputs["special_tokens_mask"]
496
- encoded_inputs[self.model_input_names[0]] = [
497
- self.pad_token_id
498
- ] * difference + required_input
499
  if class_type == "gene":
500
- encoded_inputs["labels"] = [-100] * difference + encoded_inputs[
501
- "labels"
502
- ]
503
  else:
504
  raise ValueError("Invalid padding strategy:" + str(self.padding_side))
505
  elif return_attention_mask and "attention_mask" not in encoded_inputs:
506
  encoded_inputs["attention_mask"] = [1] * len(required_input)
507
-
508
  return encoded_inputs
509
 
510
  def get_special_tokens_mask(
511
- self,
512
- token_ids_0: List[int],
513
- token_ids_1: Optional[List[int]] = None,
514
- already_has_special_tokens: bool = False,
515
  ) -> List[int]:
516
  """
517
  Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
@@ -535,15 +485,11 @@ class PrecollatorForGeneAndCellClassification(SpecialTokensMixin):
535
 
536
  all_special_ids = self.all_special_ids # cache the property
537
 
538
- special_tokens_mask = [
539
- 1 if token in all_special_ids else 0 for token in token_ids_0
540
- ]
541
 
542
  return special_tokens_mask
543
 
544
- def convert_tokens_to_ids(
545
- self, tokens: Union[str, List[str]]
546
- ) -> Union[int, List[int]]:
547
  """
548
  Converts a token string (or a sequence of tokens) into a single integer id (or a sequence of ids), using the
549
  vocabulary.
@@ -567,15 +513,14 @@ class PrecollatorForGeneAndCellClassification(SpecialTokensMixin):
567
  if token is None:
568
  return None
569
 
570
- return self.token_dictionary.get(token)
571
 
572
  def __len__(self):
573
- return len(self.token_dictionary)
574
 
575
 
576
  # collator functions
577
 
578
-
579
  class DataCollatorForGeneClassification(DataCollatorForTokenClassification):
580
  """
581
  Data collator that will dynamically pad the inputs received, as well as the labels.
@@ -601,33 +546,25 @@ class DataCollatorForGeneClassification(DataCollatorForTokenClassification):
601
  The id to use when padding the labels (-100 will be automatically ignored by PyTorch loss functions).
602
  """
603
 
 
604
  class_type = "gene"
605
  padding: Union[bool, str, PaddingStrategy] = True
606
  max_length: Optional[int] = None
607
  pad_to_multiple_of: Optional[int] = None
608
  label_pad_token_id: int = -100
609
-
610
  def __init__(self, *args, **kwargs) -> None:
611
- self.token_dictionary = kwargs.pop("token_dictionary")
612
  super().__init__(
613
- tokenizer=PrecollatorForGeneAndCellClassification(
614
- token_dictionary=self.token_dictionary
615
- ),
616
  padding=self.padding,
617
  max_length=self.max_length,
618
  pad_to_multiple_of=self.pad_to_multiple_of,
619
  label_pad_token_id=self.label_pad_token_id,
620
- *args,
621
- **kwargs,
622
- )
623
 
624
  def _prepare_batch(self, features):
625
  label_name = "label" if "label" in features[0].keys() else "labels"
626
- labels = (
627
- [feature[label_name] for feature in features]
628
- if label_name in features[0].keys()
629
- else None
630
- )
631
  batch = self.tokenizer.pad(
632
  features,
633
  class_type=self.class_type,
@@ -637,31 +574,29 @@ class DataCollatorForGeneClassification(DataCollatorForTokenClassification):
637
  return_tensors="pt",
638
  )
639
  return batch
640
-
641
  def __call__(self, features):
642
  batch = self._prepare_batch(features)
643
 
644
  batch = {k: torch.tensor(v, dtype=torch.int64) for k, v in batch.items()}
645
  return batch
646
 
647
-
648
  class DataCollatorForCellClassification(DataCollatorForGeneClassification):
 
649
  class_type = "cell"
650
 
651
  def _prepare_batch(self, features):
 
652
  batch = super()._prepare_batch(features)
653
-
654
  # Special handling for labels.
655
  # Ensure that tensor is created with the correct type
656
  # (it should be automatically the case, but let's make sure of it.)
657
  first = features[0]
658
  if "label" in first and first["label"] is not None:
659
- label = (
660
- first["label"].item()
661
- if isinstance(first["label"], torch.Tensor)
662
- else first["label"]
663
- )
664
  dtype = torch.long if isinstance(label, int) else torch.float
665
  batch["labels"] = torch.tensor([f["label"] for f in features], dtype=dtype)
666
-
667
  return batch
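
The `_get_padding_truncation_strategies` method above resolves the user-facing `padding` argument into a `PaddingStrategy` before any padding happens. A minimal standalone sketch of that resolution rule, ignoring the deprecated backward-compatibility paths and using a simplified enum rather than this module's classes (illustrative only, not part of the diff):

# Sketch of the padding-strategy resolution implemented above (simplified).
from enum import Enum

class PaddingStrategy(Enum):
    LONGEST = "longest"
    MAX_LENGTH = "max_length"
    DO_NOT_PAD = "do_not_pad"

def resolve_padding(padding):
    if padding is True:
        # padding=True defaults to padding to the longest sequence in the batch
        return PaddingStrategy.LONGEST
    if padding is False:
        return PaddingStrategy.DO_NOT_PAD
    # strings ("longest", "max_length", "do_not_pad") map onto the enum values
    return PaddingStrategy(padding)

assert resolve_padding(True) is PaddingStrategy.LONGEST
assert resolve_padding("max_length") is PaddingStrategy.MAX_LENGTH
assert resolve_padding(False) is PaddingStrategy.DO_NOT_PAD
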
 
1
  """
2
  Geneformer collator for gene and cell classification.
3
+
4
  Huggingface data collator modified to accommodate single-cell transcriptomics data for gene and cell classification.
5
  """
6
+ import numpy as np
7
+ import torch
8
  import warnings
9
  from enum import Enum
10
  from typing import Dict, List, Optional, Union
11
 
 
 
12
  from transformers import (
 
13
  DataCollatorForTokenClassification,
14
  SpecialTokensMixin,
15
+ BatchEncoding,
16
  )
17
  from transformers.utils import is_tf_available, is_torch_available, logging, to_py_obj
18
  from transformers.utils.generic import _is_tensorflow, _is_torch
19
 
20
+ from .pretrainer import token_dictionary
21
+
22
  EncodedInput = List[int]
23
  logger = logging.get_logger(__name__)
24
  VERY_LARGE_INTEGER = int(
 
30
 
31
  # precollator functions
32
 
 
33
  class ExplicitEnum(Enum):
34
  """
35
  Enum with more explicit error message for missing values.
 
42
  % (value, cls.__name__, str(list(cls._value2member_map_.keys())))
43
  )
44
 
 
45
  class TruncationStrategy(ExplicitEnum):
46
  """
47
  Possible values for the ``truncation`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for
 
54
  DO_NOT_TRUNCATE = "do_not_truncate"
55
 
56
 
57
+
58
  class PaddingStrategy(ExplicitEnum):
59
  """
60
  Possible values for the ``padding`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for tab-completion
 
66
  DO_NOT_PAD = "do_not_pad"
67
 
68
 
69
+
70
  class TensorType(ExplicitEnum):
71
  """
72
  Possible values for the ``return_tensors`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for
 
78
  NUMPY = "np"
79
  JAX = "jax"
80
 
81
+
82
  class PrecollatorForGeneAndCellClassification(SpecialTokensMixin):
83
+ mask_token = "<mask>"
84
+ mask_token_id = token_dictionary.get("<mask>")
85
+ pad_token = "<pad>"
86
+ pad_token_id = token_dictionary.get("<pad>")
87
+ padding_side = "right"
88
+ all_special_ids = [
89
+ token_dictionary.get("<mask>"),
90
+ token_dictionary.get("<pad>")
91
+ ]
92
+ model_input_names = ["input_ids"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
 
94
  def _get_padding_truncation_strategies(
95
+ self, padding=True, truncation=False, max_length=None, pad_to_multiple_of=None, verbose=True, **kwargs
 
 
 
 
 
 
96
  ):
97
  """
98
  Find the correct padding/truncation strategy with backward compatibility for old arguments (truncation_strategy
 
105
  # If you only set max_length, it activates truncation for max_length
106
  if max_length is not None and padding is False and truncation is False:
107
  if verbose:
108
+ if not self.deprecation_warnings.get("Truncation-not-explicitly-activated", False):
 
 
109
  logger.warning(
110
  "Truncation was not explicitly activated but `max_length` is provided a specific value, "
111
  "please use `truncation=True` to explicitly truncate examples to max length. "
 
133
  padding_strategy = PaddingStrategy.MAX_LENGTH
134
  elif padding is not False:
135
  if padding is True:
136
+ padding_strategy = PaddingStrategy.LONGEST # Default to pad to the longest sequence in the batch
 
 
137
  elif not isinstance(padding, PaddingStrategy):
138
  padding_strategy = PaddingStrategy(padding)
139
  elif isinstance(padding, PaddingStrategy):
 
173
  if padding_strategy == PaddingStrategy.MAX_LENGTH:
174
  if self.model_max_length > LARGE_INTEGER:
175
  if verbose:
176
+ if not self.deprecation_warnings.get("Asking-to-pad-to-max_length", False):
 
 
177
  logger.warning(
178
  "Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. "
179
  "Default to no padding."
 
186
  if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE:
187
  if self.model_max_length > LARGE_INTEGER:
188
  if verbose:
189
+ if not self.deprecation_warnings.get("Asking-to-truncate-to-max_length", False):
 
 
190
  logger.warning(
191
  "Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. "
192
  "Default to no truncation."
193
  )
194
+ self.deprecation_warnings["Asking-to-truncate-to-max_length"] = True
 
 
195
  truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE
196
  else:
197
  max_length = self.model_max_length
198
 
199
  # Test if we have a padding token
200
+ if padding_strategy != PaddingStrategy.DO_NOT_PAD and (not self.pad_token or self.pad_token_id < 0):
 
 
201
  raise ValueError(
202
  "Asking to pad but the tokenizer does not have a padding token. "
203
  "Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` "
 
228
  Dict[str, List[EncodedInput]],
229
  List[Dict[str, EncodedInput]],
230
  ],
231
+ class_type, # options: "gene" or "cell"
232
  padding: Union[bool, str, PaddingStrategy] = True,
233
  max_length: Optional[int] = None,
234
  pad_to_multiple_of: Optional[int] = None,
 
239
  """
240
  Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length
241
  in the batch.
242
+
243
  Padding side (left/right) and padding token ids are defined at the tokenizer level (with ``self.padding_side``,
244
  ``self.pad_token_id`` and ``self.pad_token_type_id``)
245
+
246
  .. note::
247
+
248
  If the ``encoded_inputs`` passed are a dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, the
249
  result will use the same type unless you provide a different tensor type with ``return_tensors``. In the
250
  case of PyTorch tensors, however, you will lose the specific device of your tensors.
251
+
252
  Args:
253
  encoded_inputs (:class:`~transformers.BatchEncoding`, list of :class:`~transformers.BatchEncoding`, :obj:`Dict[str, List[int]]`, :obj:`Dict[str, List[List[int]]` or :obj:`List[Dict[str, List[int]]]`):
254
  Tokenized inputs. Can represent one input (:class:`~transformers.BatchEncoding` or :obj:`Dict[str,
255
  List[int]]`) or a batch of tokenized inputs (list of :class:`~transformers.BatchEncoding`, `Dict[str,
256
  List[List[int]]]` or `List[Dict[str, List[int]]]`) so you can use this method during preprocessing as
257
  well as in a PyTorch Dataloader collate function.
258
+
259
  Instead of :obj:`List[int]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors),
260
  see the note above for the return type.
261
  padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
262
  Select a strategy to pad the returned sequences (according to the model's padding side and padding
263
  index) among:
264
+
265
  * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
266
  single sequence is provided).
267
  * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
 
272
  Maximum length of the returned list and optionally padding length (see above).
273
  pad_to_multiple_of (:obj:`int`, `optional`):
274
  If set, will pad the sequence to a multiple of the provided value.
275
+
276
  This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
277
  >= 7.5 (Volta).
278
  return_attention_mask (:obj:`bool`, `optional`):
279
  Whether to return the attention mask. If left to the default, will return the attention mask according
280
  to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute.
281
+
282
  `What are attention masks? <../glossary.html#attention-mask>`__
283
  return_tensors (:obj:`str` or :class:`~transformers.tokenization_utils_base.TensorType`, `optional`):
284
  If set, will return tensors instead of list of python integers. Acceptable values are:
285
+
286
  * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
287
  * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
288
  * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
 
291
  """
292
  # If we have a list of dicts, let's convert it in a dict of lists
293
  # We do this to allow using this method as a collate_fn function in PyTorch Dataloader
294
+ if isinstance(encoded_inputs, (list, tuple)) and isinstance(encoded_inputs[0], (dict, BatchEncoding)):
295
+ encoded_inputs = {key: [example[key] for example in encoded_inputs] for key in encoded_inputs[0].keys()}
 
 
 
 
 
296
 
297
  # The model's main input name, usually `input_ids`, has to be passed for padding
298
  if self.model_input_names[0] not in encoded_inputs:
 
386
  def _pad(
387
  self,
388
  encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
389
+ class_type, # options: "gene" or "cell"
390
  max_length: Optional[int] = None,
391
  padding_strategy: PaddingStrategy = PaddingStrategy.LONGEST,
392
  pad_to_multiple_of: Optional[int] = None,
 
394
  ) -> dict:
395
  """
396
  Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
397
+
398
  Args:
399
  encoded_inputs: Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
400
  max_length: maximum length of the returned list and optionally padding length (see below).
401
  Will truncate by taking into account the special tokens.
402
  padding_strategy: PaddingStrategy to use for padding.
403
+
404
  - PaddingStrategy.LONGEST: Pad to the longest sequence in the batch
405
  - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
406
  - PaddingStrategy.DO_NOT_PAD: Do not pad
407
  The tokenizer padding sides are defined in self.padding_side:
408
+
409
  - 'left': pads on the left of the sequences
410
  - 'right': pads on the right of the sequences
411
  pad_to_multiple_of: (optional) Integer; if set, will pad the sequence to a multiple of the provided value.
 
422
  if padding_strategy == PaddingStrategy.LONGEST:
423
  max_length = len(required_input)
424
 
425
+ if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
 
 
 
 
426
  max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
427
 
428
+ needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
 
 
 
429
 
430
  if needs_to_be_padded:
431
  difference = max_length - len(required_input)
432
  if self.padding_side == "right":
433
  if return_attention_mask:
434
+ encoded_inputs["attention_mask"] = [1] * len(required_input) + [0] * difference
 
 
435
  if "token_type_ids" in encoded_inputs:
436
  encoded_inputs["token_type_ids"] = (
437
+ encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference
 
438
  )
439
  if "special_tokens_mask" in encoded_inputs:
440
+ encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
441
+ encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
 
 
 
 
442
  if class_type == "gene":
443
+ encoded_inputs["labels"] = encoded_inputs["labels"] + [-100] * difference
 
 
444
  elif self.padding_side == "left":
445
  if return_attention_mask:
446
+ encoded_inputs["attention_mask"] = [0] * difference + [1] * len(required_input)
 
 
447
  if "token_type_ids" in encoded_inputs:
448
+ encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[
449
+ "token_type_ids"
450
+ ]
451
  if "special_tokens_mask" in encoded_inputs:
452
+ encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
453
+ encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
 
 
 
 
454
  if class_type == "gene":
455
+ encoded_inputs["labels"] = [-100] * difference + encoded_inputs["labels"]
 
 
456
  else:
457
  raise ValueError("Invalid padding strategy:" + str(self.padding_side))
458
  elif return_attention_mask and "attention_mask" not in encoded_inputs:
459
  encoded_inputs["attention_mask"] = [1] * len(required_input)
460
+
461
  return encoded_inputs
462
 
463
  def get_special_tokens_mask(
464
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
 
 
 
465
  ) -> List[int]:
466
  """
467
  Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
 
485
 
486
  all_special_ids = self.all_special_ids # cache the property
487
 
488
+ special_tokens_mask = [1 if token in all_special_ids else 0 for token in token_ids_0]
 
 
489
 
490
  return special_tokens_mask
491
 
492
+ def convert_tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
 
 
493
  """
494
  Converts a token string (or a sequence of tokens) in a single integer id (or a sequence of ids), using the
495
  vocabulary.
 
513
  if token is None:
514
  return None
515
 
516
+ return token_dictionary.get(token)
517
 
518
  def __len__(self):
519
+ return len(token_dictionary)
520
 
521
 
522
  # collator functions
523
 
 
524
  class DataCollatorForGeneClassification(DataCollatorForTokenClassification):
525
  """
526
  Data collator that will dynamically pad the inputs received, as well as the labels.
 
546
  The id to use when padding the labels (-100 will be automatically ignored by PyTorch loss functions).
547
  """
548
 
549
+ tokenizer = PrecollatorForGeneAndCellClassification()
550
  class_type = "gene"
551
  padding: Union[bool, str, PaddingStrategy] = True
552
  max_length: Optional[int] = None
553
  pad_to_multiple_of: Optional[int] = None
554
  label_pad_token_id: int = -100
555
+
556
  def __init__(self, *args, **kwargs) -> None:
 
557
  super().__init__(
558
+ tokenizer=self.tokenizer,
 
 
559
  padding=self.padding,
560
  max_length=self.max_length,
561
  pad_to_multiple_of=self.pad_to_multiple_of,
562
  label_pad_token_id=self.label_pad_token_id,
563
+ *args, **kwargs)
 
 
564
 
565
  def _prepare_batch(self, features):
566
  label_name = "label" if "label" in features[0].keys() else "labels"
567
+ labels = [feature[label_name] for feature in features] if label_name in features[0].keys() else None
 
 
 
 
568
  batch = self.tokenizer.pad(
569
  features,
570
  class_type=self.class_type,
 
574
  return_tensors="pt",
575
  )
576
  return batch
577
+
578
  def __call__(self, features):
579
  batch = self._prepare_batch(features)
580
 
581
  batch = {k: torch.tensor(v, dtype=torch.int64) for k, v in batch.items()}
582
  return batch
583
 
584
+
585
  class DataCollatorForCellClassification(DataCollatorForGeneClassification):
586
+
587
  class_type = "cell"
588
 
589
  def _prepare_batch(self, features):
590
+
591
  batch = super()._prepare_batch(features)
592
+
593
  # Special handling for labels.
594
  # Ensure that tensor is created with the correct type
595
  # (it should be automatically the case, but let's make sure of it.)
596
  first = features[0]
597
  if "label" in first and first["label"] is not None:
598
+ label = first["label"].item() if isinstance(first["label"], torch.Tensor) else first["label"]
 
 
 
 
599
  dtype = torch.long if isinstance(label, int) else torch.float
600
  batch["labels"] = torch.tensor([f["label"] for f in features], dtype=dtype)
601
+
602
  return batch
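
Taken together, the collator version above right-pads `input_ids` with the pad token, extends `attention_mask` with zeros, and pads gene-level `labels` with -100 so that loss computation skips the padded positions. A minimal usage sketch, assuming the v1 layout in which `token_dictionary` is importable from `.pretrainer`; the feature values below are placeholders, not data from this repository:

# Illustrative only: exercises the right-padding behavior documented in _pad.
import torch

from geneformer.collator_for_classification import DataCollatorForGeneClassification

# Two pre-tokenized cells of unequal length; per-gene labels align with input_ids.
features = [
    {"input_ids": [5, 17, 42], "labels": [0, 1, 0]},
    {"input_ids": [5, 9], "labels": [1, 0]},
]

collator = DataCollatorForGeneClassification()
batch = collator(features)

# The shorter example is padded on the right: input_ids with pad_token_id,
# attention_mask with 0, and labels with -100 (ignored by PyTorch losses).
print(batch["input_ids"].shape)    # torch.Size([2, 3])
print(batch["labels"][1])          # tensor([   1,    0, -100])
print(batch["attention_mask"][1])  # tensor([1, 1, 0])
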
geneformer/emb_extractor.py CHANGED
@@ -1,419 +1,253 @@
1
  """
2
  Geneformer embedding extractor.
3
 
4
- **Description:**
5
-
6
- | Extracts gene or cell embeddings.
7
- | Plots cell embeddings as heatmaps or UMAPs.
8
- | Generates cell state embedding dictionary for use with InSilicoPerturber.
9
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  """
11
 
12
  # imports
13
  import logging
14
- import pickle
15
- from collections import Counter
16
- from pathlib import Path
17
-
18
  import anndata
19
  import matplotlib.pyplot as plt
 
20
  import pandas as pd
 
 
21
  import scanpy as sc
22
  import seaborn as sns
23
  import torch
24
- from tdigest import TDigest
25
- from tqdm.auto import trange
 
 
26
 
27
- from . import TOKEN_DICTIONARY_FILE
28
- from . import perturber_utils as pu
29
 
30
- logger = logging.getLogger(__name__)
 
 
 
 
 
 
 
31
 
 
32
 
33
  # extract embeddings
34
- def get_embs(
35
- model,
36
- filtered_input_data,
37
- emb_mode,
38
- layer_to_quant,
39
- pad_token_id,
40
- forward_batch_size,
41
- token_gene_dict,
42
- special_token=False,
43
- summary_stat=None,
44
- silent=False,
45
- ):
46
- model_input_size = pu.get_model_input_size(model)
47
  total_batch_length = len(filtered_input_data)
48
-
49
  if summary_stat is None:
50
  embs_list = []
51
  elif summary_stat is not None:
52
- # get # of emb dims
53
- emb_dims = pu.get_model_emb_dims(model)
54
- if emb_mode == "cell":
55
- # initiate tdigests for # of emb dims
56
- embs_tdigests = [TDigest() for _ in range(emb_dims)]
57
- if emb_mode == "gene":
58
- gene_set = list(
59
- {
60
- element
61
- for sublist in filtered_input_data["input_ids"]
62
- for element in sublist
63
- }
64
- )
65
- # initiate dict with genes as keys and tdigests for # of emb dims as values
66
- embs_tdigests_dict = {
67
- k: [TDigest() for _ in range(emb_dims)] for k in gene_set
68
- }
69
-
70
- # Check if CLS and EOS token is present in the token dictionary
71
- cls_present = any("<cls>" in value for value in token_gene_dict.values())
72
- eos_present = any("<eos>" in value for value in token_gene_dict.values())
73
- if emb_mode == "cls":
74
- assert cls_present, "<cls> token missing in token dictionary"
75
- # Check to make sure that the first token of the filtered input data is cls token
76
- gene_token_dict = {v: k for k, v in token_gene_dict.items()}
77
- cls_token_id = gene_token_dict["<cls>"]
78
- assert (
79
- filtered_input_data["input_ids"][0][0] == cls_token_id
80
- ), "First token is not <cls> token value"
81
- elif emb_mode == "cell":
82
- if cls_present:
83
- logger.warning(
84
- "CLS token present in token dictionary, excluding from average."
85
- )
86
- if eos_present:
87
- logger.warning(
88
- "EOS token present in token dictionary, excluding from average."
89
- )
90
-
91
- overall_max_len = 0
92
 
93
- for i in trange(0, total_batch_length, forward_batch_size, leave=(not silent)):
94
- max_range = min(i + forward_batch_size, total_batch_length)
95
 
96
  minibatch = filtered_input_data.select([i for i in range(i, max_range)])
97
-
98
- max_len = int(max(minibatch["length"]))
99
- original_lens = torch.tensor(minibatch["length"], device="cuda")
100
  minibatch.set_format(type="torch")
101
 
102
  input_data_minibatch = minibatch["input_ids"]
103
- input_data_minibatch = pu.pad_tensor_list(
104
- input_data_minibatch, max_len, pad_token_id, model_input_size
105
- )
106
-
 
107
  with torch.no_grad():
108
  outputs = model(
109
- input_ids=input_data_minibatch.to("cuda"),
110
- attention_mask=pu.gen_attention_mask(minibatch),
111
  )
112
 
113
  embs_i = outputs.hidden_states[layer_to_quant]
114
-
115
  if emb_mode == "cell":
116
- if cls_present:
117
- non_cls_embs = embs_i[:, 1:, :]  # all token positions except the leading CLS embedding
118
- if eos_present:
119
- mean_embs = pu.mean_nonpadding_embs(non_cls_embs, original_lens - 2)
120
- else:
121
- mean_embs = pu.mean_nonpadding_embs(non_cls_embs, original_lens - 1)
122
- else:
123
- mean_embs = pu.mean_nonpadding_embs(embs_i, original_lens)
124
  if summary_stat is None:
125
- embs_list.append(mean_embs)
126
  elif summary_stat is not None:
127
  # update tdigests with current batch for each emb dim
128
- accumulate_tdigests(embs_tdigests, mean_embs, emb_dims)
129
- del mean_embs
130
- elif emb_mode == "gene":
131
- if summary_stat is None:
132
- embs_list.append(embs_i)
133
- elif summary_stat is not None:
134
- for h in trange(len(minibatch)):
135
- length_h = minibatch[h]["length"]
136
- input_ids_h = minibatch[h]["input_ids"][0:length_h]
137
-
138
- # double check dimensions before unsqueezing
139
- embs_i_dim = embs_i.dim()
140
- if embs_i_dim != 3:
141
- logger.error(
142
- f"Embedding tensor should have 3 dimensions, not {embs_i_dim}"
143
- )
144
- raise
145
-
146
- embs_h = embs_i[h, :, :].unsqueeze(dim=1)
147
- dict_h = dict(zip(input_ids_h, embs_h))
148
- for k in dict_h.keys():
149
- accumulate_tdigests(
150
- embs_tdigests_dict[int(k)], dict_h[k], emb_dims
151
- )
152
- del embs_h
153
- del dict_h
154
- elif emb_mode == "cls":
155
- cls_embs = embs_i[:, 0, :].clone().detach() # CLS token layer
156
- embs_list.append(cls_embs)
157
- del cls_embs
158
-
159
- overall_max_len = max(overall_max_len, max_len)
160
  del outputs
161
  del minibatch
162
  del input_data_minibatch
163
  del embs_i
164
-
165
- torch.cuda.empty_cache()
166
-
167
  if summary_stat is None:
168
- if (emb_mode == "cell") or (emb_mode == "cls"):
169
- embs_stack = torch.cat(embs_list, dim=0)
170
- elif emb_mode == "gene":
171
- embs_stack = pu.pad_tensor_list(
172
- embs_list,
173
- overall_max_len,
174
- pad_token_id,
175
- model_input_size,
176
- 1,
177
- pu.pad_3d_tensor,
178
- )
179
-
180
  # calculate summary stat embs from approximated tdigests
181
  elif summary_stat is not None:
182
- if emb_mode == "cell":
183
- if summary_stat == "mean":
184
- summary_emb_list = tdigest_mean(embs_tdigests, emb_dims)
185
- elif summary_stat == "median":
186
- summary_emb_list = tdigest_median(embs_tdigests, emb_dims)
187
- embs_stack = torch.tensor(summary_emb_list)
188
- elif emb_mode == "gene":
189
- if summary_stat == "mean":
190
- [
191
- update_tdigest_dict_mean(embs_tdigests_dict, gene, emb_dims)
192
- for gene in embs_tdigests_dict.keys()
193
- ]
194
- elif summary_stat == "median":
195
- [
196
- update_tdigest_dict_median(embs_tdigests_dict, gene, emb_dims)
197
- for gene in embs_tdigests_dict.keys()
198
- ]
199
- return embs_tdigests_dict
200
 
201
  return embs_stack
202
 
 
 
 
 
 
203
 
204
- def accumulate_tdigests(embs_tdigests, mean_embs, emb_dims):
205
- # note: tdigest batch update known to be slow so updating serially
206
- [
207
- embs_tdigests[j].update(mean_embs[i, j].item())
208
- for i in range(mean_embs.size(0))
209
- for j in range(emb_dims)
210
- ]
211
-
212
-
213
- def update_tdigest_dict(embs_tdigests_dict, gene, gene_embs, emb_dims):
214
- embs_tdigests_dict[gene] = accumulate_tdigests(
215
- embs_tdigests_dict[gene], gene_embs, emb_dims
216
- )
217
-
218
-
219
- def update_tdigest_dict_mean(embs_tdigests_dict, gene, emb_dims):
220
- embs_tdigests_dict[gene] = tdigest_mean(embs_tdigests_dict[gene], emb_dims)
221
-
222
-
223
- def update_tdigest_dict_median(embs_tdigests_dict, gene, emb_dims):
224
- embs_tdigests_dict[gene] = tdigest_median(embs_tdigests_dict[gene], emb_dims)
225
-
226
-
227
- def summarize_gene_embs(h, minibatch, embs_i, embs_tdigests_dict, emb_dims):
228
- length_h = minibatch[h]["length"]
229
- input_ids_h = minibatch[h]["input_ids"][0:length_h]
230
- embs_h = embs_i[h, :, :].unsqueeze(dim=1)
231
- dict_h = dict(zip(input_ids_h, embs_h))
232
- [
233
- update_tdigest_dict(embs_tdigests_dict, k, dict_h[k], emb_dims)
234
- for k in dict_h.keys()
235
- ]
236
-
237
-
238
- def tdigest_mean(embs_tdigests, emb_dims):
239
- return [embs_tdigests[i].trimmed_mean(0, 100) for i in range(emb_dims)]
240
-
241
-
242
- def tdigest_median(embs_tdigests, emb_dims):
243
- return [embs_tdigests[i].percentile(50) for i in range(emb_dims)]
244
-
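
The tdigest helpers removed here stream per-dimension summary statistics so that mean or median embeddings can be approximated without holding every cell embedding in memory. A minimal sketch of the same pattern, assuming the `tdigest` package this code depends on; the shapes and batch loop are placeholders:

# Approximate a per-dimension median of streamed embeddings with t-digests,
# mirroring the accumulate_tdigests/tdigest_median helpers above.
import torch
from tdigest import TDigest

emb_dims = 4
embs_tdigests = [TDigest() for _ in range(emb_dims)]

for _ in range(10):                       # stand-in for forward-pass minibatches
    mean_embs = torch.randn(8, emb_dims)  # batch_size x emb_dims
    # tdigest batch updates are known to be slow, so update serially
    for i in range(mean_embs.size(0)):
        for j in range(emb_dims):
            embs_tdigests[j].update(mean_embs[i, j].item())

approx_median = [embs_tdigests[j].percentile(50) for j in range(emb_dims)]
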
245
 
246
- def label_cell_embs(embs, downsampled_data, emb_labels):
247
- embs_df = pd.DataFrame(embs.cpu().numpy())
248
  if emb_labels is not None:
249
  for label in emb_labels:
250
  emb_label = downsampled_data[label]
251
  embs_df[label] = emb_label
252
  return embs_df
253
 
254
-
255
- def label_gene_embs(embs, downsampled_data, token_gene_dict):
256
- gene_set = {
257
- element for sublist in downsampled_data["input_ids"] for element in sublist
258
- }
259
- gene_emb_dict = {k: [] for k in gene_set}
260
- for i in range(embs.size()[0]):
261
- length = downsampled_data[i]["length"]
262
- dict_i = dict(
263
- zip(
264
- downsampled_data[i]["input_ids"][0:length],
265
- embs[i, :, :].unsqueeze(dim=1),
266
- )
267
- )
268
- for k in dict_i.keys():
269
- gene_emb_dict[k].append(dict_i[k])
270
- for k in gene_emb_dict.keys():
271
- gene_emb_dict[k] = (
272
- torch.squeeze(torch.mean(torch.stack(gene_emb_dict[k]), dim=0), dim=0)
273
- .cpu()
274
- .numpy()
275
- )
276
- embs_df = pd.DataFrame(gene_emb_dict).T
277
- embs_df.index = [token_gene_dict[token] for token in embs_df.index]
278
- return embs_df
279
-
280
-
281
- def plot_umap(embs_df, emb_dims, label, output_file, kwargs_dict, seed=0):
282
- only_embs_df = embs_df.iloc[:, :emb_dims]
283
  only_embs_df.index = pd.RangeIndex(0, only_embs_df.shape[0], name=None).astype(str)
284
- only_embs_df.columns = pd.RangeIndex(0, only_embs_df.shape[1], name=None).astype(
285
- str
286
- )
287
  vars_dict = {"embs": only_embs_df.columns}
288
- obs_dict = {"cell_id": list(only_embs_df.index), f"{label}": list(embs_df[label])}
 
289
  adata = anndata.AnnData(X=only_embs_df, obs=obs_dict, var=vars_dict)
290
- sc.tl.pca(adata, svd_solver="arpack")
291
- sc.pp.neighbors(adata, random_state=seed)
292
- sc.tl.umap(adata, random_state=seed)
293
- sns.set(rc={"figure.figsize": (10, 10)}, font_scale=2.3)
294
  sns.set_style("white")
295
- default_kwargs_dict = {"size": 200}
296
  if kwargs_dict is not None:
297
  default_kwargs_dict.update(kwargs_dict)
298
-
299
- cats = set(embs_df[label])
300
-
301
- with plt.rc_context():
302
- ax = sc.pl.umap(adata, color=label, show=False, **default_kwargs_dict)
303
- ax.legend(
304
- markerscale=2,
305
- frameon=False,
306
- loc="center left",
307
- bbox_to_anchor=(1, 0.5),
308
- ncol=(1 if len(cats) <= 14 else 2 if len(cats) <= 30 else 3),
309
- )
310
- plt.show()
311
- plt.savefig(output_file, bbox_inches="tight")
312
-
313
 
314
  def gen_heatmap_class_colors(labels, df):
315
- pal = sns.cubehelix_palette(
316
- len(Counter(labels).keys()),
317
- light=0.9,
318
- dark=0.1,
319
- hue=1,
320
- reverse=True,
321
- start=1,
322
- rot=-2,
323
- )
324
  lut = dict(zip(map(str, Counter(labels).keys()), pal))
325
  colors = pd.Series(labels, index=df.index).map(lut)
326
  return colors
327
-
328
-
329
  def gen_heatmap_class_dict(classes, label_colors_series):
330
- class_color_dict_df = pd.DataFrame(
331
- {"classes": classes, "color": label_colors_series}
332
- )
333
  class_color_dict_df = class_color_dict_df.drop_duplicates(subset=["classes"])
334
- return dict(zip(class_color_dict_df["classes"], class_color_dict_df["color"]))
335
-
336
-
337
  def make_colorbar(embs_df, label):
338
- labels = list(embs_df[label])
339
 
 
 
340
  cell_type_colors = gen_heatmap_class_colors(labels, embs_df)
341
  label_colors = pd.DataFrame(cell_type_colors, columns=[label])
342
 
 
 
 
 
 
 
 
343
  # create dictionary for colors and classes
344
  label_color_dict = gen_heatmap_class_dict(labels, label_colors[label])
345
  return label_colors, label_color_dict
346
-
347
-
348
  def plot_heatmap(embs_df, emb_dims, label, output_file, kwargs_dict):
349
  sns.set_style("white")
350
  sns.set(font_scale=2)
351
  plt.figure(figsize=(15, 15), dpi=150)
352
  label_colors, label_color_dict = make_colorbar(embs_df, label)
353
-
354
- default_kwargs_dict = {
355
- "row_cluster": True,
356
- "col_cluster": True,
357
- "row_colors": label_colors,
358
- "standard_scale": 1,
359
- "linewidths": 0,
360
- "xticklabels": False,
361
- "yticklabels": False,
362
- "figsize": (15, 15),
363
- "center": 0,
364
- "cmap": "magma",
365
- }
366
-
367
  if kwargs_dict is not None:
368
  default_kwargs_dict.update(kwargs_dict)
369
- g = sns.clustermap(
370
- embs_df.iloc[:, 0:emb_dims].apply(pd.to_numeric), **default_kwargs_dict
371
- )
372
 
373
  plt.setp(g.ax_row_colors.get_xmajorticklabels(), rotation=45, ha="right")
374
 
375
  for label_color in list(label_color_dict.keys()):
376
- g.ax_col_dendrogram.bar(
377
- 0, 0, color=label_color_dict[label_color], label=label_color, linewidth=0
378
- )
379
 
380
- g.ax_col_dendrogram.legend(
381
- title=f"{label}",
382
- loc="lower center",
383
- ncol=4,
384
- bbox_to_anchor=(0.5, 1),
385
- facecolor="white",
386
- )
387
- plt.show()
388
- logger.info(f"Output file: {output_file}")
389
- plt.savefig(output_file, bbox_inches="tight")
390
 
 
391
 
392
  class EmbExtractor:
393
  valid_option_dict = {
394
- "model_type": {"Pretrained", "GeneClassifier", "CellClassifier"},
395
  "num_classes": {int},
396
- "emb_mode": {"cls", "cell", "gene"},
397
  "cell_emb_style": {"mean_pool"},
398
- "gene_emb_style": {"mean_pool"},
399
  "filter_data": {None, dict},
400
  "max_ncells": {None, int},
401
  "emb_layer": {-1, 0},
402
  "emb_label": {None, list},
403
  "labels_to_plot": {None, list},
404
  "forward_batch_size": {int},
405
- "token_dictionary_file": {None, str},
406
  "nproc": {int},
407
- "summary_stat": {None, "mean", "median", "exact_mean", "exact_median"},
408
  }
409
-
410
  def __init__(
411
  self,
412
  model_type="Pretrained",
413
  num_classes=0,
414
- emb_mode="cls",
415
  cell_emb_style="mean_pool",
416
- gene_emb_style="mean_pool",
417
  filter_data=None,
418
  max_ncells=1000,
419
  emb_layer=-1,
@@ -422,442 +256,238 @@ class EmbExtractor:
422
  forward_batch_size=100,
423
  nproc=4,
424
  summary_stat=None,
425
- token_dictionary_file=None,
426
  ):
427
  """
428
  Initialize embedding extractor.
429
 
430
- **Parameters:**
431
-
432
- model_type : {"Pretrained", "GeneClassifier", "CellClassifier"}
433
- | Whether model is the pretrained Geneformer or a fine-tuned gene or cell classifier.
434
  num_classes : int
435
- | If model is a gene or cell classifier, specify number of classes it was trained to classify.
436
- | For the pretrained Geneformer model, number of classes is 0 as it is not a classifier.
437
- emb_mode : {"cls", "cell", "gene"}
438
- | Whether to output CLS, cell, or gene embeddings.
439
- | CLS embeddings are cell embeddings derived from the CLS token in the front of the rank value encoding.
440
- cell_emb_style : {"mean_pool"}
441
- | Method for summarizing cell embeddings if not using CLS token.
442
- | Currently only option is mean pooling of gene embeddings for given cell.
443
- gene_emb_style : "mean_pool"
444
- | Method for summarizing gene embeddings.
445
- | Currently only option is mean pooling of contextual gene embeddings for given gene.
446
  filter_data : None, dict
447
- | Default is to extract embeddings from all input data.
448
- | Otherwise, dictionary specifying .dataset column name and list of values to filter by.
449
  max_ncells : None, int
450
- | Maximum number of cells to extract embeddings from.
451
- | Default is 1000 cells randomly sampled from input data.
452
- | If None, will extract embeddings from all cells.
453
  emb_layer : {-1, 0}
454
- | Embedding layer to extract.
455
- | The last layer is most specifically weighted to optimize the given learning objective.
456
- | Generally, it is best to extract the 2nd to last layer to get a more general representation.
457
- | -1: 2nd to last layer
458
- | 0: last layer
459
  emb_label : None, list
460
- | List of column name(s) in .dataset to add as labels to embedding output.
461
  labels_to_plot : None, list
462
- | Cell labels to plot.
463
- | Shown as color bar in heatmap.
464
- | Shown as cell color in umap.
465
- | Plotting umap requires labels to plot.
466
  forward_batch_size : int
467
- | Batch size for forward pass.
468
  nproc : int
469
- | Number of CPU processes to use.
470
- summary_stat : {None, "mean", "median", "exact_mean", "exact_median"}
471
- | If exact_mean or exact_median, outputs only exact mean or median embedding of input data.
472
- | If mean or median, outputs only approximated mean or median embedding of input data.
473
- | Non-exact recommended if encountering memory constraints while generating goal embedding positions.
474
- | Non-exact is slower but more memory-efficient.
475
  token_dictionary_file : Path
476
- | Default is the Geneformer token dictionary
477
- | Path to pickle file containing token dictionary (Ensembl ID:token).
478
-
479
- **Examples:**
480
-
481
- .. code-block :: python
482
-
483
- >>> from geneformer import EmbExtractor
484
- >>> embex = EmbExtractor(model_type="CellClassifier",
485
- ... num_classes=3,
486
- ... emb_mode="cell",
487
- ... filter_data={"cell_type":["cardiomyocyte"]},
488
- ... max_ncells=1000,
489
- ... emb_layer=-1,
490
- ... emb_label=["disease", "cell_type"],
491
- ... labels_to_plot=["disease", "cell_type"])
492
-
493
  """
494
 
495
  self.model_type = model_type
496
  self.num_classes = num_classes
497
  self.emb_mode = emb_mode
498
  self.cell_emb_style = cell_emb_style
499
- self.gene_emb_style = gene_emb_style
500
  self.filter_data = filter_data
501
  self.max_ncells = max_ncells
502
  self.emb_layer = emb_layer
503
  self.emb_label = emb_label
504
  self.labels_to_plot = labels_to_plot
505
- self.token_dictionary_file = token_dictionary_file
506
  self.forward_batch_size = forward_batch_size
507
  self.nproc = nproc
508
- if (summary_stat is not None) and ("exact" in summary_stat):
509
- self.summary_stat = None
510
- self.exact_summary_stat = summary_stat
511
- else:
512
- self.summary_stat = summary_stat
513
- self.exact_summary_stat = None
514
 
515
  self.validate_options()
516
 
517
  # load token dictionary (Ensembl IDs:token)
518
- if self.token_dictionary_file is None:
519
- token_dictionary_file = TOKEN_DICTIONARY_FILE
520
  with open(token_dictionary_file, "rb") as f:
521
  self.gene_token_dict = pickle.load(f)
522
 
523
- self.token_gene_dict = {v: k for k, v in self.gene_token_dict.items()}
524
  self.pad_token_id = self.gene_token_dict.get("<pad>")
525
-
 
526
  def validate_options(self):
 
 
 
 
 
 
 
 
527
  # confirm arguments are within valid options and compatible with each other
528
- for attr_name, valid_options in self.valid_option_dict.items():
529
  attr_value = self.__dict__[attr_name]
530
- if not isinstance(attr_value, (list, dict)):
531
  if attr_value in valid_options:
532
  continue
533
  valid_type = False
534
  for option in valid_options:
535
- if (option in [int, list, dict, bool, str]) and isinstance(
536
- attr_value, option
537
- ):
538
  valid_type = True
539
  break
540
  if valid_type:
541
  continue
542
  logger.error(
543
- f"Invalid option for {attr_name}. "
544
  f"Valid options for {attr_name}: {valid_options}"
545
  )
546
  raise
547
-
548
  if self.filter_data is not None:
549
- for key, value in self.filter_data.items():
550
- if not isinstance(value, list):
551
  self.filter_data[key] = [value]
552
  logger.warning(
553
- "Values in filter_data dict must be lists. "
554
- f"Changing {key} value to list ([{value}])."
555
- )
556
-
557
- def extract_embs(
558
- self,
559
- model_directory,
560
- input_data_file,
561
- output_directory,
562
- output_prefix,
563
- output_torch_embs=False,
564
- cell_state=None,
565
- ):
566
  """
567
  Extract embeddings from input data and save as results in output_directory.
568
 
569
- **Parameters:**
570
-
571
  model_directory : Path
572
- | Path to directory containing model
573
  input_data_file : Path
574
- | Path to directory containing .dataset inputs
575
  output_directory : Path
576
- | Path to directory where embedding data will be saved as csv
577
  output_prefix : str
578
- | Prefix for output file
579
- output_torch_embs : bool
580
- | Whether or not to also output the embeddings as a tensor.
581
- | Note, if true, will output embeddings as both dataframe and tensor.
582
- cell_state : dict
583
- | Cell state key and value for state embedding extraction.
584
-
585
- **Examples:**
586
-
587
- .. code-block :: python
588
-
589
- >>> embs = embex.extract_embs("path/to/model",
590
- ... "path/to/input_data",
591
- ... "path/to/output_directory",
592
- ... "output_prefix")
593
-
594
  """
595
 
596
- filtered_input_data = pu.load_and_filter(
597
- self.filter_data, self.nproc, input_data_file
598
- )
599
-
600
- # Check to make sure that all the labels exist in the tokenized data:
601
- if self.emb_label is not None:
602
- for label in self.emb_label:
603
- assert label in filtered_input_data.features.keys(), f"Attribute `{label}` not present in dataset features"
604
-
605
- if cell_state is not None:
606
- filtered_input_data = pu.filter_by_dict(
607
- filtered_input_data, cell_state, self.nproc
608
- )
609
- downsampled_data = pu.downsample_and_sort(filtered_input_data, self.max_ncells)
610
- model = pu.load_model(
611
- self.model_type, self.num_classes, model_directory, mode="eval"
612
- )
613
- layer_to_quant = pu.quant_layers(model) + self.emb_layer
614
- embs = get_embs(
615
- model=model,
616
- filtered_input_data=downsampled_data,
617
- emb_mode=self.emb_mode,
618
- layer_to_quant=layer_to_quant,
619
- pad_token_id=self.pad_token_id,
620
- forward_batch_size=self.forward_batch_size,
621
- token_gene_dict=self.token_gene_dict,
622
- summary_stat=self.summary_stat,
623
- )
624
-
625
- if self.emb_mode == "cell":
626
- if self.summary_stat is None:
627
- embs_df = label_cell_embs(embs, downsampled_data, self.emb_label)
628
- elif self.summary_stat is not None:
629
- embs_df = pd.DataFrame(embs.cpu().numpy()).T
630
- elif self.emb_mode == "gene":
631
- if self.summary_stat is None:
632
- embs_df = label_gene_embs(embs, downsampled_data, self.token_gene_dict)
633
- elif self.summary_stat is not None:
634
- embs_df = pd.DataFrame(embs).T
635
- embs_df.index = [self.token_gene_dict[token] for token in embs_df.index]
636
- elif self.emb_mode == "cls":
637
- embs_df = label_cell_embs(embs, downsampled_data, self.emb_label)
638
 
639
  # save embeddings to output_path
640
- if cell_state is None:
641
- output_path = (Path(output_directory) / output_prefix).with_suffix(".csv")
642
- embs_df.to_csv(output_path)
643
-
644
- if self.exact_summary_stat == "exact_mean":
645
- embs = embs.mean(dim=0)
646
- emb_dims = pu.get_model_emb_dims(model)
647
- embs_df = pd.DataFrame(
648
- embs_df[0 : emb_dims - 1].mean(axis="rows"),
649
- columns=[self.exact_summary_stat],
650
- ).T
651
- elif self.exact_summary_stat == "exact_median":
652
- embs = torch.median(embs, dim=0)[0]
653
- emb_dims = pu.get_model_emb_dims(model)
654
- embs_df = pd.DataFrame(
655
- embs_df[0 : emb_dims - 1].median(axis="rows"),
656
- columns=[self.exact_summary_stat],
657
- ).T
658
-
659
- if cell_state is not None:
660
- return embs
661
- else:
662
- if output_torch_embs:
663
- return embs_df, embs
664
- else:
665
- return embs_df
666
-
667
- def get_state_embs(
668
- self,
669
- cell_states_to_model,
670
- model_directory,
671
- input_data_file,
672
- output_directory,
673
- output_prefix,
674
- output_torch_embs=True,
675
- ):
676
- """
677
- Extract exact mean or exact median cell state embedding positions from input data and save as results in output_directory.
678
-
679
- **Parameters:**
680
-
681
- cell_states_to_model : None, dict
682
- | Cell states to model if testing perturbations that achieve goal state change.
683
- | Four-item dictionary with keys: state_key, start_state, goal_state, and alt_states
684
- | state_key: key specifying name of column in .dataset that defines the start/goal states
685
- | start_state: value in the state_key column that specifies the start state
686
- | goal_state: value in the state_key column that specifies the goal end state
687
- | alt_states: list of values in the state_key column that specify the alternate end states
688
- | For example:
689
- | {"state_key": "disease",
690
- | "start_state": "dcm",
691
- | "goal_state": "nf",
692
- | "alt_states": ["hcm", "other1", "other2"]}
693
- model_directory : Path
694
- | Path to directory containing model
695
- input_data_file : Path
696
- | Path to directory containing .dataset inputs
697
- output_directory : Path
698
- | Path to directory where embedding data will be saved as csv
699
- output_prefix : str
700
- | Prefix for output file
701
- output_torch_embs : bool
702
- | Whether or not to also output the embeddings as a tensor.
703
- | Note, if true, will output embeddings as both dataframe and tensor.
704
-
705
- **Outputs**
706
-
707
- | Outputs state_embs_dict for use with in silico perturber.
708
- | Format is dictionary of embedding positions of each cell state to model shifts from/towards.
709
- | Keys specify each possible cell state to model.
710
- | Values are target embedding positions as torch.tensor.
711
- | For example:
712
- | {"nf": emb_nf,
713
- | "hcm": emb_hcm,
714
- | "dcm": emb_dcm,
715
- | "other1": emb_other1,
716
- | "other2": emb_other2}
717
- """
718
-
719
- pu.validate_cell_states_to_model(cell_states_to_model)
720
- valid_summary_stats = ["exact_mean", "exact_median"]
721
- if self.exact_summary_stat not in valid_summary_stats:
722
- logger.error(
723
- "For extracting state embs, summary_stat in EmbExtractor "
724
- f"must be set to option in {valid_summary_stats}"
725
- )
726
- raise
727
-
728
- if self.emb_label is not None:
729
- logger.error(
730
- "For extracting state embs, emb_label should be None since labels are based on state embs dict keys."
731
- )
732
- raise
733
 
734
- state_embs_dict = dict()
735
- state_key = cell_states_to_model["state_key"]
736
- for k, v in cell_states_to_model.items():
737
- if k == "state_key":
738
- continue
739
- elif (k == "start_state") or (k == "goal_state"):
740
- state_embs_dict[v] = self.extract_embs(
741
- model_directory,
742
- input_data_file,
743
- output_directory,
744
- output_prefix,
745
- output_torch_embs,
746
- cell_state={state_key: v},
747
- )
748
- else: # k == "alt_states"
749
- for alt_state in v:
750
- state_embs_dict[alt_state] = self.extract_embs(
751
- model_directory,
752
- input_data_file,
753
- output_directory,
754
- output_prefix,
755
- output_torch_embs,
756
- cell_state={state_key: alt_state},
757
- )
758
-
759
- output_path = (Path(output_directory) / output_prefix).with_suffix(".pkl")
760
- with open(output_path, "wb") as fp:
761
- pickle.dump(state_embs_dict, fp)
762
-
763
- return state_embs_dict
764
-
-     def plot_embs(
-         self,
-         embs,
-         plot_style,
-         output_directory,
-         output_prefix,
-         max_ncells_to_plot=1000,
-         kwargs_dict=None,
-     ):
          """
          Plot embeddings, coloring by provided labels.

-         **Parameters:**
-
          embs : pandas.core.frame.DataFrame
-             | Pandas dataframe containing embeddings output from extract_embs
          plot_style : str
-             | Style of plot: "heatmap" or "umap"
          output_directory : Path
-             | Path to directory where plots will be saved as pdf
          output_prefix : str
-             | Prefix for output file
          max_ncells_to_plot : None, int
-             | Maximum number of cells to plot.
-             | Default is 1000 cells randomly sampled from embeddings.
-             | If None, will plot embeddings from all cells.
          kwargs_dict : dict
-             | Dictionary of kwargs to pass to plotting function.
-
-         **Examples:**
-
-         .. code-block :: python
-
-             >>> embex.plot_embs(embs=embs,
-             ...                 plot_style="heatmap",
-             ...                 output_directory="path/to/output_directory",
-             ...                 output_prefix="output_prefix")
-
          """
-
-         if plot_style not in ["heatmap", "umap"]:
              logger.error(
-                 "Invalid option for 'plot_style'. " "Valid options: {'heatmap','umap'}"
              )
              raise
-
          if (plot_style == "umap") and (self.labels_to_plot is None):
-             logger.error("Plotting UMAP requires 'labels_to_plot'. ")
              raise
-
-         if max_ncells_to_plot is not None:
-             if max_ncells_to_plot > self.max_ncells:
-                 max_ncells_to_plot = self.max_ncells
-                 logger.warning(
-                     "max_ncells_to_plot must be <= max_ncells. "
-                     f"Changing max_ncells_to_plot to {self.max_ncells}."
-                 )
-             elif max_ncells_to_plot < self.max_ncells:
-                 embs = embs.sample(max_ncells_to_plot, axis=0)
-
          if self.emb_label is None:
              label_len = 0
          else:
              label_len = len(self.emb_label)
-
          emb_dims = embs.shape[1] - label_len
-
          if self.emb_label is None:
              emb_labels = None
          else:
              emb_labels = embs.columns[emb_dims:]
-
          if plot_style == "umap":
              for label in self.labels_to_plot:
                  if label not in emb_labels:
                      logger.warning(
-                         f"Label {label} from labels_to_plot "
-                         f"not present in provided embeddings dataframe."
-                     )
                      continue
-                 output_prefix_label = output_prefix + f"_umap_{label}"
-                 output_file = (
-                     Path(output_directory) / output_prefix_label
-                 ).with_suffix(".pdf")
-                 plot_umap(embs, emb_dims, label, output_file, kwargs_dict)
-
          if plot_style == "heatmap":
              for label in self.labels_to_plot:
                  if label not in emb_labels:
                      logger.warning(
-                         f"Label {label} from labels_to_plot "
-                         f"not present in provided embeddings dataframe."
-                     )
                      continue
                  output_prefix_label = output_prefix + f"_heatmap_{label}"
-                 output_file = (
-                     Path(output_directory) / output_prefix_label
-                 ).with_suffix(".pdf")
-                 plot_heatmap(embs, emb_dims, label, output_file, kwargs_dict)
 
  """
  Geneformer embedding extractor.

+ Usage:
+   from geneformer import EmbExtractor
+   embex = EmbExtractor(model_type="CellClassifier",
+                        num_classes=3,
+                        emb_mode="cell",
+                        cell_emb_style="mean_pool",
+                        filter_data={"cell_type":["cardiomyocyte"]},
+                        max_ncells=1000,
+                        max_ncells_to_plot=1000,
+                        emb_layer=-1,
+                        emb_label=["disease","cell_type"],
+                        labels_to_plot=["disease","cell_type"],
+                        forward_batch_size=100,
+                        nproc=16,
+                        summary_stat=None)
+   embs = embex.extract_embs("path/to/model",
+                             "path/to/input_data",
+                             "path/to/output_directory",
+                             "output_prefix")
+   embex.plot_embs(embs=embs,
+                   plot_style="heatmap",
+                   output_directory="path/to/output_directory",
+                   output_prefix="output_prefix")
+
  """

  # imports
  import logging

  import anndata
  import matplotlib.pyplot as plt
+ import numpy as np
  import pandas as pd
+ import pickle
+ from tdigest import TDigest
  import scanpy as sc
  import seaborn as sns
  import torch
+ from collections import Counter
+ from pathlib import Path
+ from tqdm.notebook import trange
+ from transformers import BertForMaskedLM, BertForTokenClassification, BertForSequenceClassification

+ from .tokenizer import TOKEN_DICTIONARY_FILE

+ from .in_silico_perturber import downsample_and_sort, \
+     gen_attention_mask, \
+     get_model_input_size, \
+     load_and_filter, \
+     load_model, \
+     mean_nonpadding_embs, \
+     pad_tensor_list, \
+     quant_layers

+ logger = logging.getLogger(__name__)

  # extract embeddings
+ def get_embs(model,
+              filtered_input_data,
+              emb_mode,
+              layer_to_quant,
+              pad_token_id,
+              forward_batch_size,
+              summary_stat):
+
+     model_input_size = get_model_input_size(model)
      total_batch_length = len(filtered_input_data)
+
      if summary_stat is None:
          embs_list = []
      elif summary_stat is not None:
+         # test embedding extraction for example cell and extract # emb dims
+         example = filtered_input_data.select([i for i in range(1)])
+         example.set_format(type="torch")
+         emb_dims = test_emb(model, example["input_ids"], layer_to_quant)
+         # initiate tdigests for # of emb dims
+         embs_tdigests = [TDigest() for _ in range(emb_dims)]

+     for i in trange(0, total_batch_length, forward_batch_size):
+         max_range = min(i+forward_batch_size, total_batch_length)
          minibatch = filtered_input_data.select([i for i in range(i, max_range)])
+         max_len = max(minibatch["length"])
+         original_lens = torch.tensor(minibatch["length"]).to("cuda")
          minibatch.set_format(type="torch")

          input_data_minibatch = minibatch["input_ids"]
+         input_data_minibatch = pad_tensor_list(input_data_minibatch,
+                                                max_len,
+                                                pad_token_id,
+                                                model_input_size)
+
          with torch.no_grad():
              outputs = model(
+                 input_ids = input_data_minibatch.to("cuda"),
+                 attention_mask = gen_attention_mask(minibatch)
              )

          embs_i = outputs.hidden_states[layer_to_quant]
+
          if emb_mode == "cell":
+             mean_embs = mean_nonpadding_embs(embs_i, original_lens)
              if summary_stat is None:
+                 embs_list += [mean_embs]
              elif summary_stat is not None:
                  # update tdigests with current batch for each emb dim
+                 # note: tdigest batch update known to be slow so updating serially
+                 # (the comprehension's i ranges over rows of mean_embs, shadowing
+                 # the batch loop's i only within the comprehension's own scope)
+                 [embs_tdigests[j].update(mean_embs[i,j].item()) for i in range(mean_embs.size(0)) for j in range(emb_dims)]
+
          del outputs
          del minibatch
          del input_data_minibatch
          del embs_i
+         del mean_embs
+         torch.cuda.empty_cache()
+
      if summary_stat is None:
+         embs_stack = torch.cat(embs_list)
      # calculate summary stat embs from approximated tdigests
      elif summary_stat is not None:
+         if summary_stat == "mean":
+             summary_emb_list = [embs_tdigests[i].trimmed_mean(0,100) for i in range(emb_dims)]
+         elif summary_stat == "median":
+             summary_emb_list = [embs_tdigests[i].percentile(50) for i in range(emb_dims)]
+         embs_stack = torch.tensor(summary_emb_list)

      return embs_stack

+ def test_emb(model, example, layer_to_quant):
+     with torch.no_grad():
+         outputs = model(
+             input_ids = example.to("cuda")
+         )
+
+     embs_test = outputs.hidden_states[layer_to_quant]
+     return embs_test.size()[2]
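The tdigest-based path above is the memory-efficient alternative to stacking all embeddings in embs_list. As a minimal standalone sketch of the same idea (the values are made up): one t-digest per embedding dimension absorbs values serially and is then queried for an approximate summary statistic, exactly as done with trimmed_mean(0, 100) and percentile(50) above.

.. code-block:: python

    from tdigest import TDigest

    digest = TDigest()  # stands in for one entry of embs_tdigests
    for x in [0.1, 0.4, -0.2, 0.9, 0.3]:  # one embedding dimension's values, streamed
        digest.update(x)

    approx_median = digest.percentile(50)      # summary_stat="median"
    approx_mean = digest.trimmed_mean(0, 100)  # summary_stat="mean" (untrimmed)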
+ def label_embs(embs, downsampled_data, emb_labels):
+     embs_df = pd.DataFrame(embs.cpu())
      if emb_labels is not None:
          for label in emb_labels:
              emb_label = downsampled_data[label]
              embs_df[label] = emb_label
      return embs_df
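mean_nonpadding_embs is imported from in_silico_perturber rather than defined in this file; as a sketch of what the "mean_pool" cell embedding style amounts to (an illustrative reimplementation, not the imported function), each cell's gene embeddings are averaged over only its true, non-padded positions:

.. code-block:: python

    import torch

    def mean_nonpadding_embs_sketch(embs, original_lens):
        # embs: (batch, max_len, emb_dim); original_lens: (batch,) true sequence lengths
        mask = (torch.arange(embs.size(1), device=embs.device)[None, :]
                < original_lens[:, None])                # True at real gene positions
        summed = (embs * mask[:, :, None]).sum(dim=1)    # zero out padding, sum over genes
        return summed / original_lens[:, None].float()   # divide by each cell's true length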
+ def plot_umap(embs_df, emb_dims, label, output_file, kwargs_dict):
+     only_embs_df = embs_df.iloc[:,:emb_dims]
      only_embs_df.index = pd.RangeIndex(0, only_embs_df.shape[0], name=None).astype(str)
+     only_embs_df.columns = pd.RangeIndex(0, only_embs_df.shape[1], name=None).astype(str)
      vars_dict = {"embs": only_embs_df.columns}
+     obs_dict = {"cell_id": list(only_embs_df.index),
+                 f"{label}": list(embs_df[label])}
      adata = anndata.AnnData(X=only_embs_df, obs=obs_dict, var=vars_dict)
+     sc.tl.pca(adata, svd_solver='arpack')
+     sc.pp.neighbors(adata)
+     sc.tl.umap(adata)
+     sns.set(rc={'figure.figsize':(10,10)}, font_scale=2.3)
      sns.set_style("white")
+     default_kwargs_dict = {"palette":"Set2", "size":200}
      if kwargs_dict is not None:
          default_kwargs_dict.update(kwargs_dict)
+
+     # note: scanpy appends the string passed to `save` to its default figure
+     # filename, so a filename suffix (rather than a full path) is expected here
+     sc.pl.umap(adata, color=label, save=output_file, **default_kwargs_dict)

  def gen_heatmap_class_colors(labels, df):
+     pal = sns.cubehelix_palette(len(Counter(labels).keys()), light=0.9, dark=0.1, hue=1, reverse=True, start=1, rot=-2)
      lut = dict(zip(map(str, Counter(labels).keys()), pal))
      colors = pd.Series(labels, index=df.index).map(lut)
      return colors
+
  def gen_heatmap_class_dict(classes, label_colors_series):
+     class_color_dict_df = pd.DataFrame({"classes": classes, "color": label_colors_series})
      class_color_dict_df = class_color_dict_df.drop_duplicates(subset=["classes"])
+     return dict(zip(class_color_dict_df["classes"], class_color_dict_df["color"]))
+
  def make_colorbar(embs_df, label):
+     labels = list(embs_df[label])
+
      cell_type_colors = gen_heatmap_class_colors(labels, embs_df)
      label_colors = pd.DataFrame(cell_type_colors, columns=[label])

+     # sanity check: print any malformed (non-RGB or NaN) color entries
+     for i, row in label_colors.iterrows():
+         colors = row[0]
+         if len(colors) != 3 or any(np.isnan(colors)):
+             print(i, colors)
+
      # create dictionary for colors and classes
      label_color_dict = gen_heatmap_class_dict(labels, label_colors[label])
      return label_colors, label_color_dict
+
  def plot_heatmap(embs_df, emb_dims, label, output_file, kwargs_dict):
      sns.set_style("white")
      sns.set(font_scale=2)
      plt.figure(figsize=(15, 15), dpi=150)
      label_colors, label_color_dict = make_colorbar(embs_df, label)
+
+     default_kwargs_dict = {"row_cluster": True,
+                            "col_cluster": True,
+                            "row_colors": label_colors,
+                            "standard_scale": 1,
+                            "linewidths": 0,
+                            "xticklabels": False,
+                            "yticklabels": False,
+                            "figsize": (15,15),
+                            "center": 0,
+                            "cmap": "magma"}
+
      if kwargs_dict is not None:
          default_kwargs_dict.update(kwargs_dict)
+     g = sns.clustermap(embs_df.iloc[:,0:emb_dims].apply(pd.to_numeric), **default_kwargs_dict)

      plt.setp(g.ax_row_colors.get_xmajorticklabels(), rotation=45, ha="right")

      for label_color in list(label_color_dict.keys()):
+         g.ax_col_dendrogram.bar(0, 0, color=label_color_dict[label_color], label=label_color, linewidth=0)

+     g.ax_col_dendrogram.legend(title=f"{label}",
+                                loc="lower center",
+                                ncol=4,
+                                bbox_to_anchor=(0.5, 1),
+                                facecolor="white")

+     plt.savefig(output_file, bbox_inches='tight')
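Because kwargs_dict is merged over default_kwargs_dict, any seaborn clustermap option can be overridden per call. A hypothetical invocation (the emb_dims value and paths are placeholders; 256 matches the hidden size of the 6-layer Geneformer models):

.. code-block:: python

    plot_heatmap(embs_df,
                 emb_dims=256,
                 label="cell_type",
                 output_file="path/to/output_directory/output_prefix_heatmap_cell_type.pdf",
                 kwargs_dict={"cmap": "vlag", "standard_scale": 0})  # overrides the "magma" default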
  class EmbExtractor:
      valid_option_dict = {
+         "model_type": {"Pretrained","GeneClassifier","CellClassifier"},
          "num_classes": {int},
+         "emb_mode": {"cell","gene"},
          "cell_emb_style": {"mean_pool"},
          "filter_data": {None, dict},
          "max_ncells": {None, int},
          "emb_layer": {-1, 0},
          "emb_label": {None, list},
          "labels_to_plot": {None, list},
          "forward_batch_size": {int},
          "nproc": {int},
+         "summary_stat": {None, "mean", "median"},
      }

      def __init__(
          self,
          model_type="Pretrained",
          num_classes=0,
+         emb_mode="cell",
          cell_emb_style="mean_pool",
          filter_data=None,
          max_ncells=1000,
          emb_layer=-1,
          emb_label=None,
          labels_to_plot=None,
          forward_batch_size=100,
          nproc=4,
          summary_stat=None,
+         token_dictionary_file=TOKEN_DICTIONARY_FILE,
      ):
          """
          Initialize embedding extractor.

+         Parameters
+         ----------
+         model_type : {"Pretrained","GeneClassifier","CellClassifier"}
+             Whether model is the pretrained Geneformer or a fine-tuned gene or cell classifier.
          num_classes : int
+             If model is a gene or cell classifier, specify number of classes it was trained to classify.
+             For the pretrained Geneformer model, number of classes is 0 as it is not a classifier.
+         emb_mode : {"cell","gene"}
+             Whether to output cell or gene embeddings.
+         cell_emb_style : "mean_pool"
+             Method for summarizing cell embeddings.
+             Currently the only option is mean pooling of gene embeddings for a given cell.
          filter_data : None, dict
+             Default is to extract embeddings from all input data.
+             Otherwise, a dictionary specifying a .dataset column name and a list of values to filter by.
          max_ncells : None, int
+             Maximum number of cells to extract embeddings from.
+             Default is 1000 cells randomly sampled from input data.
+             If None, will extract embeddings from all cells.
          emb_layer : {-1, 0}
+             Embedding layer to extract.
+             The last layer is most specifically weighted to optimize the given learning objective.
+             Generally, it is best to extract the 2nd to last layer to get a more general representation.
+             -1: 2nd to last layer
+             0: last layer
          emb_label : None, list
+             List of column name(s) in .dataset to add as labels to embedding output.
          labels_to_plot : None, list
+             Cell labels to plot.
+             Shown as color bar in heatmap.
+             Shown as cell color in umap.
+             Plotting umap requires labels to plot.
          forward_batch_size : int
+             Batch size for forward pass.
          nproc : int
+             Number of CPU processes to use.
+         summary_stat : {None, "mean", "median"}
+             If not None, outputs only the approximated mean or median embedding of the input data.
+             Recommended if encountering memory constraints while generating goal embedding positions.
+             Slower but more memory-efficient.
          token_dictionary_file : Path
+             Path to pickle file containing token dictionary (Ensembl ID:token).
          """

          self.model_type = model_type
          self.num_classes = num_classes
          self.emb_mode = emb_mode
          self.cell_emb_style = cell_emb_style
          self.filter_data = filter_data
          self.max_ncells = max_ncells
          self.emb_layer = emb_layer
          self.emb_label = emb_label
          self.labels_to_plot = labels_to_plot
          self.forward_batch_size = forward_batch_size
          self.nproc = nproc
+         self.summary_stat = summary_stat

          self.validate_options()

          # load token dictionary (Ensembl IDs:token)
          with open(token_dictionary_file, "rb") as f:
              self.gene_token_dict = pickle.load(f)

          self.pad_token_id = self.gene_token_dict.get("<pad>")
+
      def validate_options(self):
+         # first disallow options under development
+         if self.emb_mode == "gene":
+             logger.error(
+                 "Extraction and plotting of gene-level embeddings currently under development. "
+                 "Current valid option for 'emb_mode': 'cell'"
+             )
+             raise
+
          # confirm arguments are within valid options and compatible with each other
+         for attr_name, valid_options in self.valid_option_dict.items():
              attr_value = self.__dict__[attr_name]
+             if type(attr_value) not in {list, dict}:
                  if attr_value in valid_options:
                      continue
              valid_type = False
              for option in valid_options:
+                 if (option in [int, list, dict]) and isinstance(attr_value, option):
                      valid_type = True
                      break
              if valid_type:
                  continue
              logger.error(
+                 f"Invalid option for {attr_name}. "
                  f"Valid options for {attr_name}: {valid_options}"
              )
              raise
+
          if self.filter_data is not None:
+             for key, value in self.filter_data.items():
+                 if type(value) != list:
                      self.filter_data[key] = [value]
                      logger.warning(
+                         "Values in filter_data dict must be lists. "
+                         f"Changing {key} value to list ([{value}]).")
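To make the coercion above concrete: a bare (non-list) value passed in filter_data is wrapped into a single-item list during construction, since validate_options() runs inside __init__. A hypothetical example:

.. code-block:: python

    embex = EmbExtractor(filter_data={"cell_type": "cardiomyocyte"})  # value is not a list
    # validate_options() logs a warning and rewrites the entry:
    assert embex.filter_data == {"cell_type": ["cardiomyocyte"]}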
+     def extract_embs(self,
+                      model_directory,
+                      input_data_file,
+                      output_directory,
+                      output_prefix):
          """
          Extract embeddings from input data and save results in output_directory.

+         Parameters
+         ----------
          model_directory : Path
+             Path to directory containing model
          input_data_file : Path
+             Path to directory containing .dataset inputs
          output_directory : Path
+             Path to directory where embedding data will be saved as csv
          output_prefix : str
+             Prefix for output file
          """

+         filtered_input_data = load_and_filter(self.filter_data, self.nproc, input_data_file)
+         downsampled_data = downsample_and_sort(filtered_input_data, self.max_ncells)
+         model = load_model(self.model_type, self.num_classes, model_directory)
+         # hidden_states holds one entry per encoder layer plus the input embeddings,
+         # so quant_layers(model) + emb_layer indexes the last (0) or 2nd-to-last (-1) layer
+         layer_to_quant = quant_layers(model)+self.emb_layer
+         embs = get_embs(model,
+                         downsampled_data,
+                         self.emb_mode,
+                         layer_to_quant,
+                         self.pad_token_id,
+                         self.forward_batch_size,
+                         self.summary_stat)
+
+         if self.summary_stat is None:
+             embs_df = label_embs(embs, downsampled_data, self.emb_label)
+         elif self.summary_stat is not None:
+             embs_df = pd.DataFrame(embs.cpu()).T

          # save embeddings to output_path
+         output_path = (Path(output_directory) / output_prefix).with_suffix(".csv")
+         embs_df.to_csv(output_path)
+
+         return embs_df
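End to end, the returned dataframe has one row per cell, emb_dims numeric columns, and any emb_label columns appended on the right. A usage sketch with placeholder paths, mirroring the module docstring:

.. code-block:: python

    embex = EmbExtractor(model_type="Pretrained",
                         num_classes=0,
                         max_ncells=1000,
                         emb_label=["disease"],
                         nproc=8)
    embs_df = embex.extract_embs("path/to/model",
                                 "path/to/input_data",
                                 "path/to/output_directory",
                                 "output_prefix")
    emb_dims = embs_df.shape[1] - 1  # the final column holds the "disease" label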
+     def plot_embs(self,
+                   embs,
+                   plot_style,
+                   output_directory,
+                   output_prefix,
+                   max_ncells_to_plot=1000,
+                   kwargs_dict=None):
          """
          Plot embeddings, coloring by provided labels.

+         Parameters
+         ----------
          embs : pandas.core.frame.DataFrame
+             Pandas dataframe containing embeddings output from extract_embs
          plot_style : str
+             Style of plot: "heatmap" or "umap"
          output_directory : Path
+             Path to directory where plots will be saved as pdf
          output_prefix : str
+             Prefix for output file
          max_ncells_to_plot : None, int
+             Maximum number of cells to plot.
+             Default is 1000 cells randomly sampled from embeddings.
+             If None, will plot embeddings from all cells.
          kwargs_dict : dict
+             Dictionary of kwargs to pass to plotting function.
          """
+
+         if plot_style not in ["heatmap", "umap"]:
              logger.error(
+                 "Invalid option for 'plot_style'. "
+                 "Valid options: {'heatmap','umap'}"
              )
              raise
+
          if (plot_style == "umap") and (self.labels_to_plot is None):
+             logger.error(
+                 "Plotting UMAP requires 'labels_to_plot'. "
+             )
              raise
+
+         # max_ncells_to_plot may be None, so guard before comparing
+         if (max_ncells_to_plot is not None) \
+             and (max_ncells_to_plot > self.max_ncells):
+             max_ncells_to_plot = self.max_ncells
+             logger.warning(
+                 "max_ncells_to_plot must be <= max_ncells. "
+                 f"Changing max_ncells_to_plot to {self.max_ncells}.")
+
+         if (max_ncells_to_plot is not None) \
+             and (max_ncells_to_plot < self.max_ncells):
+             embs = embs.sample(max_ncells_to_plot, axis=0)
+
          if self.emb_label is None:
              label_len = 0
          else:
              label_len = len(self.emb_label)
+
          emb_dims = embs.shape[1] - label_len
+
          if self.emb_label is None:
              emb_labels = None
          else:
              emb_labels = embs.columns[emb_dims:]
+
          if plot_style == "umap":
              for label in self.labels_to_plot:
                  if label not in emb_labels:
                      logger.warning(
+                         f"Label {label} from labels_to_plot "
+                         f"not present in provided embeddings dataframe.")
                      continue
+                 output_prefix_label = "_" + output_prefix + f"_umap_{label}"
+                 output_file = (Path(output_directory) / output_prefix_label).with_suffix(".pdf")
+                 # the prefix (not output_file) is passed through to scanpy's `save`
+                 # argument, which expects a filename suffix rather than a full path
+                 plot_umap(embs, emb_dims, label, output_prefix_label, kwargs_dict)
+
          if plot_style == "heatmap":
              for label in self.labels_to_plot:
                  if label not in emb_labels:
                      logger.warning(
+                         f"Label {label} from labels_to_plot "
+                         f"not present in provided embeddings dataframe.")
                      continue
                  output_prefix_label = output_prefix + f"_heatmap_{label}"
+                 output_file = (Path(output_directory) / output_prefix_label).with_suffix(".pdf")
+                 plot_heatmap(embs, emb_dims, label, output_file, kwargs_dict)
geneformer/ensembl_mapping_dict_gc95M.pkl DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:0819bcbd869cfa14279449b037eb9ed1d09a91310e77bd1a19d927465030e95c
- size 3957652
geneformer/evaluation_utils.py DELETED
@@ -1,287 +0,0 @@
- import logging
- import math
- import pickle
- from pathlib import Path
-
- import matplotlib.pyplot as plt
- import numpy as np
- import pandas as pd
- import seaborn as sns
- import torch
- from datasets.utils.logging import disable_progress_bar, enable_progress_bar
- from sklearn import preprocessing
- from sklearn.metrics import (
-     ConfusionMatrixDisplay,
-     accuracy_score,
-     auc,
-     confusion_matrix,
-     f1_score,
-     roc_curve,
- )
- from tqdm.auto import trange
-
- from . import TOKEN_DICTIONARY_FILE
- from .emb_extractor import make_colorbar
-
- logger = logging.getLogger(__name__)
-
-
- def preprocess_classifier_batch(cell_batch, max_len, label_name):
-     if max_len is None:
-         max_len = max([len(i) for i in cell_batch["input_ids"]])
-
-     # load token dictionary (Ensembl IDs:token)
-     with open(TOKEN_DICTIONARY_FILE, "rb") as f:
-         gene_token_dict = pickle.load(f)
-
-     def pad_label_example(example):
-         example[label_name] = np.pad(
-             example[label_name],
-             (0, max_len - len(example["input_ids"])),
-             mode="constant",
-             constant_values=-100,
-         )
-         example["input_ids"] = np.pad(
-             example["input_ids"],
-             (0, max_len - len(example["input_ids"])),
-             mode="constant",
-             constant_values=gene_token_dict.get("<pad>"),
-         )
-         example["attention_mask"] = (
-             example["input_ids"] != gene_token_dict.get("<pad>")
-         ).astype(int)
-         return example
-
-     padded_batch = cell_batch.map(pad_label_example)
-     return padded_batch
-
-
- # Function to find the largest number smaller
- # than or equal to N that is divisible by K
- def find_largest_div(N, K):
-     rem = N % K
-     if rem == 0:
-         return N
-     else:
-         return N - rem
-
-
- def vote(logit_list):
-     m = max(logit_list)
-     indices = [i for i, x in enumerate(logit_list) if x == m]
-     if len(indices) > 1:
-         return "tie"
-     else:
-         return indices[0]
-
-
- def py_softmax(vector):
-     e = np.exp(vector)
-     return e / e.sum()
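A quick worked example of the two helpers above, with made-up logits: vote returns the argmax index when it is unique and the string "tie" otherwise, while py_softmax normalizes a logit vector into probabilities.

.. code-block:: python

    import numpy as np

    vote([0.1, 2.3, 0.7])             # -> 1 (unique maximum at index 1)
    vote([2.3, 2.3, 0.7])             # -> "tie" (maximum occurs twice)
    py_softmax(np.array([1.0, 1.0]))  # -> array([0.5, 0.5])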
-
-
- def classifier_predict(model, classifier_type, evalset, forward_batch_size):
-     if classifier_type == "gene":
-         label_name = "labels"
-     elif classifier_type == "cell":
-         label_name = "label"
-
-     predict_logits = []
-     predict_labels = []
-     model.eval()
-
-     # ensure there are at least 2 examples in each batch to avoid incorrect tensor dims
-     evalset_len = len(evalset)
-     max_divisible = find_largest_div(evalset_len, forward_batch_size)
-     if len(evalset) - max_divisible == 1:
-         evalset_len = max_divisible
-
-     max_evalset_len = max(evalset.select([i for i in range(evalset_len)])["length"])
-
-     disable_progress_bar()  # disable progress bar for preprocess_classifier_batch mapping
-     for i in trange(0, evalset_len, forward_batch_size):
-         max_range = min(i + forward_batch_size, evalset_len)
-         batch_evalset = evalset.select([i for i in range(i, max_range)])
-         padded_batch = preprocess_classifier_batch(
-             batch_evalset, max_evalset_len, label_name
-         )
-         padded_batch.set_format(type="torch")
-
-         input_data_batch = padded_batch["input_ids"]
-         attn_msk_batch = padded_batch["attention_mask"]
-         label_batch = padded_batch[label_name]
-         with torch.no_grad():
-             outputs = model(
-                 input_ids=input_data_batch.to("cuda"),
-                 attention_mask=attn_msk_batch.to("cuda"),
-                 labels=label_batch.to("cuda"),
-             )
-             predict_logits += [torch.squeeze(outputs.logits.to("cpu"))]
-             predict_labels += [torch.squeeze(label_batch.to("cpu"))]
-
-     enable_progress_bar()
-     logits_by_cell = torch.cat(predict_logits)
-     last_dim = len(logits_by_cell.shape) - 1
-     all_logits = logits_by_cell.reshape(-1, logits_by_cell.shape[last_dim])
-     labels_by_cell = torch.cat(predict_labels)
-     all_labels = torch.flatten(labels_by_cell)
-     logit_label_paired = [
-         item
-         for item in list(zip(all_logits.tolist(), all_labels.tolist()))
-         if item[1] != -100
-     ]
-     y_pred = [vote(item[0]) for item in logit_label_paired]
-     y_true = [item[1] for item in logit_label_paired]
-     logits_list = [item[0] for item in logit_label_paired]
-     return y_pred, y_true, logits_list
-
-
- def get_metrics(y_pred, y_true, logits_list, num_classes, labels):
-     conf_mat = confusion_matrix(y_true, y_pred, labels=list(labels))
-     macro_f1 = f1_score(y_true, y_pred, average="macro")
-     acc = accuracy_score(y_true, y_pred)
-     roc_metrics = None  # roc metrics not reported for multiclass
-     if num_classes == 2:
-         y_score = [py_softmax(item)[1] for item in logits_list]
-         fpr, tpr, _ = roc_curve(y_true, y_score)
-         mean_fpr = np.linspace(0, 1, 100)
-         interp_tpr = np.interp(mean_fpr, fpr, tpr)
-         interp_tpr[0] = 0.0
-         tpr_wt = len(tpr)
-         roc_auc = auc(fpr, tpr)
-         roc_metrics = {
-             "fpr": fpr,
-             "tpr": tpr,
-             "interp_tpr": interp_tpr,
-             "auc": roc_auc,
-             "tpr_wt": tpr_wt,
-         }
-     return conf_mat, macro_f1, acc, roc_metrics
-
-
- # get cross-validated mean and sd metrics
- def get_cross_valid_roc_metrics(all_tpr, all_roc_auc, all_tpr_wt):
-     wts = [count / sum(all_tpr_wt) for count in all_tpr_wt]
-     all_weighted_tpr = [a * b for a, b in zip(all_tpr, wts)]
-     mean_tpr = np.sum(all_weighted_tpr, axis=0)
-     mean_tpr[-1] = 1.0
-     all_weighted_roc_auc = [a * b for a, b in zip(all_roc_auc, wts)]
-     roc_auc = np.sum(all_weighted_roc_auc)
-     roc_auc_sd = math.sqrt(np.average((all_roc_auc - roc_auc) ** 2, weights=wts))
-     return mean_tpr, roc_auc, roc_auc_sd
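The cross-validation helper above weights each fold by its ROC point count (tpr_wt). With two made-up folds, the weighted AUC differs from the unweighted average:

.. code-block:: python

    import numpy as np

    all_roc_auc = np.array([0.80, 0.90])
    all_tpr_wt = [100, 300]                                  # second fold carries 3x the weight
    wts = [count / sum(all_tpr_wt) for count in all_tpr_wt]  # [0.25, 0.75]
    roc_auc = np.sum([a * b for a, b in zip(all_roc_auc, wts)])
    # roc_auc == 0.875, versus 0.85 for the unweighted mean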
-
-
- # plot ROC curve
- def plot_ROC(roc_metric_dict, model_style_dict, title, output_dir, output_prefix):
-     fig = plt.figure()
-     fig.set_size_inches(10, 8)
-     sns.set(font_scale=2)
-     sns.set_style("white")
-     lw = 3
-     for model_name in roc_metric_dict.keys():
-         mean_fpr = roc_metric_dict[model_name]["mean_fpr"]
-         mean_tpr = roc_metric_dict[model_name]["mean_tpr"]
-         roc_auc = roc_metric_dict[model_name]["roc_auc"]
-         roc_auc_sd = roc_metric_dict[model_name]["roc_auc_sd"]
-         color = model_style_dict[model_name]["color"]
-         linestyle = model_style_dict[model_name]["linestyle"]
-         if len(roc_metric_dict[model_name]["all_roc_auc"]) > 1:
-             label = rf"{model_name} (AUC {roc_auc:0.2f} $\pm$ {roc_auc_sd:0.2f})"
-         else:
-             label = f"{model_name} (AUC {roc_auc:0.2f})"
-         plt.plot(
-             mean_fpr, mean_tpr, color=color, linestyle=linestyle, lw=lw, label=label
-         )
-
-     plt.plot([0, 1], [0, 1], color="black", lw=lw, linestyle="--")
-     plt.xlim([0.0, 1.0])
-     plt.ylim([0.0, 1.05])
-     plt.xlabel("False Positive Rate")
-     plt.ylabel("True Positive Rate")
-     plt.title(title)
-     plt.legend(loc="lower right")
-
-     output_file = (Path(output_dir) / f"{output_prefix}_roc").with_suffix(".pdf")
-     plt.savefig(output_file, bbox_inches="tight")
-     plt.show()
-
-
- # plot confusion matrix
- def plot_confusion_matrix(
-     conf_mat_df, title, output_dir, output_prefix, custom_class_order
- ):
-     fig = plt.figure()
-     fig.set_size_inches(10, 10)
-     sns.set(font_scale=1)
-     sns.set_style("whitegrid", {"axes.grid": False})
-     if custom_class_order is not None:
-         conf_mat_df = conf_mat_df.reindex(
-             index=custom_class_order, columns=custom_class_order
-         )
-     display_labels = generate_display_labels(conf_mat_df)
-     conf_mat = preprocessing.normalize(conf_mat_df.to_numpy(), norm="l1")
-     display = ConfusionMatrixDisplay(
-         confusion_matrix=conf_mat, display_labels=display_labels
-     )
-     display.plot(cmap="Blues", values_format=".2g")
-     plt.title(title)
-     plt.show()
-
-     output_file = (Path(output_dir) / f"{output_prefix}_conf_mat").with_suffix(".pdf")
-     display.figure_.savefig(output_file, bbox_inches="tight")
-
-
- def generate_display_labels(conf_mat_df):
-     display_labels = []
-     i = 0
-     for label in conf_mat_df.index:
-         display_labels += [f"{label}\nn={conf_mat_df.iloc[i,:].sum():.0f}"]
-         i = i + 1
-     return display_labels
-
-
- def plot_predictions(predictions_df, title, output_dir, output_prefix, kwargs_dict):
-     sns.set(font_scale=2)
-     plt.figure(figsize=(10, 10), dpi=150)
-     label_colors, label_color_dict = make_colorbar(predictions_df, "true")
-     predictions_df = predictions_df.drop(columns=["true"])
-     predict_colors_list = [label_color_dict[label] for label in predictions_df.columns]
-     predict_label_list = [label for label in predictions_df.columns]
-     predict_colors = pd.DataFrame(
-         pd.Series(predict_colors_list, index=predict_label_list), columns=["predicted"]
-     )
-
-     default_kwargs_dict = {
-         "row_cluster": False,
-         "col_cluster": False,
-         "row_colors": label_colors,
-         "col_colors": predict_colors,
-         "linewidths": 0,
-         "xticklabels": False,
-         "yticklabels": False,
-         "center": 0,
-         "cmap": "vlag",
-     }
-
-     if kwargs_dict is not None:
-         default_kwargs_dict.update(kwargs_dict)
-     g = sns.clustermap(predictions_df, **default_kwargs_dict)
-
-     plt.setp(g.ax_row_colors.get_xmajorticklabels(), rotation=45, ha="right")
-
-     for label_color in list(label_color_dict.keys()):
-         g.ax_col_dendrogram.bar(
-             0, 0, color=label_color_dict[label_color], label=label_color, linewidth=0
-         )
-
-     g.ax_col_dendrogram.legend(
-         title=f"{title}",
-         loc="lower center",
-         ncol=4,
-         bbox_to_anchor=(0.5, 1),
-         facecolor="white",
-     )
-
-     output_file = (Path(output_dir) / f"{output_prefix}_pred").with_suffix(".pdf")
-     plt.savefig(output_file, bbox_inches="tight")
geneformer/gene_dictionaries_30m/ensembl_mapping_dict_gc30M.pkl DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:eac0fb0b3007267871b6305ac0003ceba19d4f28d85686cb9067ecf142787869
- size 584125