bioclip-demo / lib.py
Samuel Stevens
wip: hierarchical prediction
290c238
raw
history blame
3.28 kB
import json
import itertools
class TaxonomicNode:
__slots__ = ("name", "index", "root", "_children")
def __init__(self, name, index, root):
self.name = name
self.index = index
self.root = root
self._children = {}
def add(self, name):
added = 0
if not name:
return added
first, rest = name[0], name[1:]
if first not in self._children:
self._children[first] = TaxonomicNode(first, self.root.size, self.root)
self.root.size += 1
self._children[first].add(rest)
def children(self, name):
if not name:
return set((child.name, child.index) for child in self._children.values())
first, rest = name[0], name[1:]
if first not in self._children:
return set()
return self._children[first].children(rest)
def __iter__(self):
yield self.name, self.index
for child in self._children.values():
for name, index in child:
yield f"{self.name} {name}", index
@classmethod
def from_dict(cls, dct, root):
node = cls(dct["name"], dct["index"], root)
node._children = {child["name"]: cls.from_dict(child, root) for child in dct["children"]}
return node
class TaxonomicTree:
"""
Efficient structure for finding taxonomic names and their descendants.
Also returns an integer index i for each possible name.
"""
def __init__(self):
self.kingdoms = {}
self.size = 0
def add(self, name: list[str]):
if not name:
return
first, rest = name[0], name[1:]
if first not in self.kingdoms:
self.kingdoms[first] = TaxonomicNode(first, self.size, self)
self.size += 1
self.kingdoms[first].add(rest)
def children(self, name=None):
if not name:
return set(
(kingdom.name, kingdom.index) for kingdom in self.kingdoms.values()
)
first, rest = name[0], name[1:]
if first not in self.kingdoms:
return set()
return self.kingdoms[first].children(rest)
def __iter__(self):
for kingdom in self.kingdoms.values():
yield from kingdom
@classmethod
def from_dict(cls, dct):
tree = cls()
tree.kingdoms = {
kingdom["name"]: TaxonomicNode.from_dict(kingdom, tree) for kingdom in dct["kingdoms"]
}
tree.size = dct["size"]
return tree
class TaxonomicJsonEncoder(json.JSONEncoder):
def default(self, obj):
if isinstance(obj, TaxonomicNode):
return {
"name": obj.name,
"index": obj.index,
"children": list(obj._children.values()),
}
elif isinstance(obj, TaxonomicTree):
return {
"kingdoms": list(obj.kingdoms.values()),
"size": obj.size,
}
else:
super().default(self, obj)
def batched(iterable, n):
# batched('ABCDEFG', 3) --> ABC DEF G
if n < 1:
raise ValueError('n must be at least one')
it = iter(iterable)
while batch := tuple(itertools.islice(it, n)):
yield zip(*batch)