Update geneformer/tokenizer.py
geneformer/tokenizer.py CHANGED (+13 -0)
@@ -100,6 +100,9 @@ def sum_ensembl_ids(
         "ensembl_id" in data.ra.keys()
     ), "'ensembl_id' column missing from data.ra.keys()"
 
+    assert (
+        "ensembl_id_collapsed" not in data.ra.keys()
+    ), "'ensembl_id_collapsed' column already exists in data.ra.keys()"
     # Check for duplicate Ensembl IDs if collapse_gene_ids is False.
     # Comparing to gene_token_dict here, would not perform any mapping steps
     gene_ids_in_dict = [
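The first hunk guards the loom branch of sum_ensembl_ids against input that has already been collapsed. A minimal sketch of how the new guard fires, assuming loompy and numpy are installed; the file name and gene IDs below are hypothetical, for illustration only:

import numpy as np
import loompy as lp

# Toy loom file that already carries the leftover attribute
# (hypothetical file name and Ensembl IDs).
row_attrs = {
    "ensembl_id": np.array(["ENSG00000000003", "ENSG00000000005"]),
    "ensembl_id_collapsed": np.array(["ENSG00000000003", "ENSG00000000005"]),
}
col_attrs = {"cell_id": np.array(["c1", "c2", "c3"])}
lp.create("already_collapsed.loom", np.ones((2, 3)), row_attrs, col_attrs)

with lp.connect("already_collapsed.loom") as data:
    # Same check the commit adds: fail fast instead of silently
    # clobbering the attribute on a repeat run.
    assert (
        "ensembl_id_collapsed" not in data.ra.keys()
    ), "'ensembl_id_collapsed' column already exists in data.ra.keys()"

Running this raises the AssertionError, which is the fail-fast behavior the commit is after.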
@@ -197,6 +200,10 @@ def sum_ensembl_ids(
         "ensembl_id" in data.var.columns
     ), "'ensembl_id' column missing from data.var"
 
+    assert (
+        "ensembl_id_collapsed" not in data.var.columns
+    ), "'ensembl_id_collapsed' column already exists in data.var"
+
     # Check for duplicate Ensembl IDs if collapse_gene_ids is False.
     # Comparing to gene_token_dict here, would not perform any mapping steps
     gene_ids_in_dict = [
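The second hunk adds the same guard to the AnnData branch of sum_ensembl_ids, where gene metadata lives in data.var rather than data.ra. A sketch under the same caveats (in-memory toy object, hypothetical IDs), assuming anndata and pandas are installed:

import anndata as ad
import numpy as np
import pandas as pd

# Toy AnnData whose var table already has the leftover column.
adata = ad.AnnData(
    X=np.ones((3, 2), dtype=np.float32),
    var=pd.DataFrame(
        {
            "ensembl_id": ["ENSG00000000003", "ENSG00000000005"],
            "ensembl_id_collapsed": ["ENSG00000000003", "ENSG00000000005"],
        }
    ),
)

# Same check the commit adds for the .h5ad branch; raises here.
assert (
    "ensembl_id_collapsed" not in adata.var.columns
), "'ensembl_id_collapsed' column already exists in data.var"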
@@ -516,6 +523,7 @@ class TranscriptomeTokenizer:
         file_cell_metadata = {
             attr_key: [] for attr_key in self.custom_attr_name_dict.keys()
         }
+        loom_file_path_original = loom_file_path
 
         dedup_filename = loom_file_path.with_name(loom_file_path.stem + "__dedup.loom")
         loom_file_path = sum_ensembl_ids(
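The third hunk stashes the input path before sum_ensembl_ids runs, because that call can hand back a separate "__dedup" copy while the cleanup added in the last hunk needs the original file. The path arithmetic, shown with a hypothetical file name:

from pathlib import Path

loom_file_path = Path("data/my_cells.loom")  # hypothetical input
loom_file_path_original = loom_file_path     # kept for cleanup after tokenizing

# Matches the dedup_filename construction visible in the diff above.
dedup_filename = loom_file_path.with_name(loom_file_path.stem + "__dedup.loom")
print(dedup_filename)  # data/my_cells__dedup.loom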
@@ -591,6 +599,11 @@ class TranscriptomeTokenizer:
         if str(dedup_filename) == str(loom_file_path):
             os.remove(str(dedup_filename))
 
+        with lp.connect(str(loom_file_path_original)) as data:
+            if "ensembl_id_collapsed" in data.ra.keys():
+                del data.ra["ensembl_id_collapsed"]
+
+
         return tokenized_cells, file_cell_metadata
 
     def create_dataset(
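The last hunk closes the loop: sum_ensembl_ids evidently writes a temporary "ensembl_id_collapsed" row attribute into the original loom file as a side effect, so without this cleanup a second tokenization pass over the same file would trip the assertion added in the first hunk. The same idempotent cleanup as a standalone sketch, with a hypothetical path:

import loompy as lp

loom_file_path_original = "data/my_cells.loom"  # hypothetical

# Strip the helper attribute if present; deleting through the .ra
# attribute manager is the same pattern the commit itself uses.
with lp.connect(loom_file_path_original) as data:
    if "ensembl_id_collapsed" in data.ra.keys():
        del data.ra["ensembl_id_collapsed"]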