issue with single gene perturbation
#358
by
cstrlln
- opened
I'm getting this error when trying to run an in silico perturbation with a single gene:
ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
Here is my code:
# first obtain start, goal, and alt embedding positions
# this function was changed to be separate from perturb_data
# to avoid repeating calculations when parallelizing perturb_data
cell_states_to_model={"state_key": "cell_type",
"start_state": "Mid",
"goal_state": "Late_3",
"alt_states": ["Late_1","Late_2","Late_4"]}
filter_data_dict={"cell_type":["Mid","Late_3","Late_1","Late_2","Late_4"]}
# embex = EmbExtractor(model_type="CellClassifier",
# num_classes=3,
# filter_data=filter_data_dict,
# max_ncells=1000,
# emb_layer=0,
# summary_stat="exact_mean",
# forward_batch_size=32,
# nproc=16)
embex = EmbExtractor(model_type="CellClassifier",
num_classes=10,
max_ncells=1000,
emb_layer=0,
summary_stat="exact_mean",
forward_batch_size=10,
nproc=8)
state_embs_dict = embex.get_state_embs(cell_states_to_model,
"classifier/240625155408/240625_geneformer_cellClassifier_asc_classifier_test/ksplit1",
"data_for_geneformer/asc_organs.dataset",
output_directory = output_dir,
output_prefix = output_prefix)
isp = InSilicoPerturber(perturb_type="overexpress",
perturb_rank_shift=None,
genes_to_perturb= ['ENSG00000171791'],
combos=0,
anchor_gene=None,
model_type="CellClassifier",
num_classes=10,
emb_mode="cell",
cell_emb_style="mean_pool",
cell_states_to_model=cell_states_to_model,
state_embs_dict=state_embs_dict,
max_ncells=2000,
emb_layer=0,
forward_batch_size=4,
nproc=1)
isp.perturb_data("classifier/240625155408/240625_geneformer_cellClassifier_asc_classifier_test/ksplit1",
"data_for_geneformer/asc_organs.dataset",
output_directory = output_dir,
output_prefix = "pert")
ispstats = InSilicoPerturberStats(mode="goal_state_shift",
genes_perturbed=genes,
combos=0,
anchor_gene=None,
cell_states_to_model=cell_states_to_model)
ispstats.get_stats(input_data_directory = output_dir,
null_dist_data_directory = None,
output_directory = output_dir,
output_prefix = "stats_bcl2_over2")
Here is the full error; it fails only in the last step:
100%
4/4 [00:00<00:00, 437.56it/s]
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[30], line 2
1 # extracts data from intermediate files and processes stats to output in final .csv
----> 2 ispstats.get_stats(input_data_directory = output_dir,
3 null_dist_data_directory = None,
4 output_directory = output_dir,
5 output_prefix = "stats_bcl2_over2")
File ~/miniforge3/envs/geneformer/lib/python3.10/site-packages/geneformer/in_silico_perturber_stats.py:975, in InSilicoPerturberStats.get_stats(self, input_data_directory, null_dist_data_directory, output_directory, output_prefix, null_dict_list)
966 else:
967 # cos sim data for effect of gene perturbation on the embedding of each cell
968 dict_list = read_dictionaries(
969 input_data_directory,
970 "cell",
(...)
973 self.pickle_suffix,
974 )
--> 975 gene_list = get_gene_list(dict_list, "cell")
977 # initiate results dataframe
978 cos_sims_df_initial = pd.DataFrame(
979 {
980 "Gene": gene_list,
(...)
991 index=[i for i in range(len(gene_list))],
992 )
File ~/miniforge3/envs/geneformer/lib/python3.10/site-packages/geneformer/in_silico_perturber_stats.py:153, in get_gene_list(dict_list, mode)
151 if mode == "gene":
152 gene_list.remove("cell_emb")
--> 153 gene_list.sort()
154 return gene_list
ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
Thank you for bringing this up! We don't encounter this error when running the current code, so please pull the updated version. If you continue to encounter this issue after updating, please reopen this discussion.
ctheodoris
changed discussion status to
closed