File size: 4,692 Bytes
a33c93d
 
 
 
 
 
 
 
 
 
290c238
2cfb891
290c238
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a33c93d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
290c238
 
a33c93d
 
290c238
 
 
 
2cfb891
 
 
290c238
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a33c93d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
290c238
a33c93d
290c238
2cfb891
 
 
290c238
 
 
 
2cfb891
 
290c238
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2cfb891
290c238
 
2cfb891
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
"""
Defines the TaxonomicTree class, which implements a taxonomy, plus some helpers for
easily walking and searching the tree.

A tree is an arrangement of TaxonomicNodes.


"""


import itertools
import json


class TaxonomicNode:
    """A single named entry in a taxonomic tree.

    Every node stores its own ``name``, the integer ``index`` it was given when
    it was created, a reference to the owning tree (``root``) and a mapping from
    child name to child node.
    """

    __slots__ = ("name", "index", "root", "_children")

    def __init__(self, name, index, root):
        self.name = name
        self.index = index
        # The owning tree; its ``size`` counter supplies the index of new nodes.
        self.root = root
        self._children = {}

    def add(self, name):
        """Insert the path ``name`` below this node, creating missing nodes.

        ``name`` is a sequence of child names relative to this node (it does
        not include this node's own name). Each node that has to be created
        takes the root's current ``size`` as its index, after which ``size``
        is incremented. Nodes that already exist are reused.

        Returns ``None``; an empty (or ``None``) ``name`` is a no-op.
        """
        if not name:
            return

        # Walk down one level per path element instead of re-slicing the path
        # on every level.
        node = self
        for part in name:
            child = node._children.get(part)
            if child is None:
                child = TaxonomicNode(part, node.root.size, node.root)
                node._children[part] = child
                node.root.size += 1
            node = child

    def children(self, name):
        """Return the direct children of the node reached by following ``name``.

        ``name`` is a path relative to this node. The result is a set of
        ``(child_name, child_index)`` tuples. An empty ``name`` selects this
        node's own children; a path that does not exist yields an empty set.
        """
        if not name:
            return {(child.name, child.index) for child in self._children.values()}

        first, rest = name[0], name[1:]
        if first not in self._children:
            return set()

        return self._children[first].children(rest)

    def descendants(self, prefix=None):
        """Iterate over all values in the subtree that match ``prefix``.

        ``prefix`` is a path relative to this node. Yields ``(names, index)``
        pairs where ``names`` is a tuple that starts with this node's own name.
        With an empty prefix this node is yielded first, followed by everything
        below it; with a non-empty prefix only the node addressed by the prefix
        and its subtree are yielded. An unknown prefix yields nothing.
        """
        if not prefix:
            yield (self.name,), self.index
            for child in self._children.values():
                for name, i in child.descendants():
                    # Re-root the child's path under this node's name.
                    yield (self.name, *name), i
            return

        first, rest = prefix[0], prefix[1:]
        if first not in self._children:
            return

        for name, i in self._children[first].descendants(rest):
            yield (self.name, *name), i

    def values(self):
        """Iterate over all ``(names, index)`` pairs in this node's subtree."""
        # Identical to an unfiltered descendants() walk.
        yield from self.descendants()

    @classmethod
    def from_dict(cls, dct, root):
        """Rebuild a node (and, recursively, its subtree) from a plain dict.

        ``dct`` must provide the keys ``"name"``, ``"index"`` and
        ``"children"`` (a list of dicts of the same form). Every rebuilt node
        is attached to ``root``.
        """
        node = cls(dct["name"], dct["index"], root)
        node._children = {
            child["name"]: cls.from_dict(child, root) for child in dct["children"]
        }
        return node


class TaxonomicTree:
    """
    Lookup structure for taxonomic names and everything nested below them.

    Top-level entries ("kingdoms") are kept in a dict keyed by name. ``size``
    counts the nodes in the tree and doubles as the index handed to the next
    node that gets created.
    """

    def __init__(self):
        self.kingdoms = {}
        self.size = 0

    def add(self, name: list[str]):
        """Insert the full path ``name``, creating the kingdom if needed."""
        if not name:
            return

        head, tail = name[0], name[1:]
        kingdom = self.kingdoms.get(head)
        if kingdom is None:
            # A new kingdom takes the current size as its index.
            kingdom = TaxonomicNode(head, self.size, self)
            self.kingdoms[head] = kingdom
            self.size += 1

        kingdom.add(tail)

    def children(self, name=None):
        """Return ``(name, index)`` pairs for the direct children of ``name``.

        Without a name the kingdoms themselves are returned; an unknown
        kingdom gives an empty set.
        """
        if name:
            head, tail = name[0], name[1:]
            kingdom = self.kingdoms.get(head)
            if kingdom is None:
                return set()
            return kingdom.children(tail)

        return {(kingdom.name, kingdom.index) for kingdom in self.kingdoms.values()}

    def descendants(self, prefix=None):
        """Yield every entry of the tree whose path starts with ``prefix``."""
        if prefix:
            head, tail = prefix[0], prefix[1:]
            kingdom = self.kingdoms.get(head)
            if kingdom is not None:
                yield from kingdom.descendants(tail)
            return

        # No prefix: walk every kingdom in full.
        for kingdom in self.kingdoms.values():
            yield from kingdom.descendants()

    def values(self):
        """Yield all (name, i) pairs stored in the tree."""
        for kingdom in self.kingdoms.values():
            for pair in kingdom.values():
                yield pair

    def __len__(self):
        return self.size

    @classmethod
    def from_dict(cls, dct):
        """Rebuild a tree from a dict with the keys "kingdoms" and "size"."""
        tree = cls()
        rebuilt = {}
        for entry in dct["kingdoms"]:
            key = entry["name"]
            rebuilt[key] = TaxonomicNode.from_dict(entry, tree)
        tree.kingdoms = rebuilt
        tree.size = dct["size"]
        return tree


class TaxonomicJsonEncoder(json.JSONEncoder):
    """JSON encoder that can serialize TaxonomicTree and TaxonomicNode objects.

    Pass it as ``cls=TaxonomicJsonEncoder`` to ``json.dump``/``json.dumps``.
    Nodes become ``{"name", "index", "children"}`` dicts and trees become
    ``{"kingdoms", "size"}`` dicts; nested nodes are handed back to the encoder
    as objects, so it converts them through this same method.
    """

    def default(self, obj):
        """Return a JSON-serializable stand-in for ``obj``.

        Raises:
            TypeError: if ``obj`` is neither a TaxonomicNode nor a
                TaxonomicTree (raised by the base class).
        """
        if isinstance(obj, TaxonomicNode):
            return {
                "name": obj.name,
                "index": obj.index,
                "children": list(obj._children.values()),
            }
        if isinstance(obj, TaxonomicTree):
            return {
                "kingdoms": list(obj.kingdoms.values()),
                "size": obj.size,
            }
        # Defer to the base class so unsupported objects raise the standard
        # "not JSON serializable" TypeError. super() already binds self, so
        # only obj is passed.
        return super().default(obj)


def batched(iterable, n):
    """Yield ``iterable`` in transposed batches of at most ``n`` items.

    Up to ``n`` items are pulled from ``iterable`` at a time and each batch is
    yielded as ``zip(*batch)``, i.e. transposed. For an iterable of
    ``(name, index)`` pairs every yielded object therefore unpacks into one
    tuple of names and one tuple of indices:

        batched([(1, "a"), (2, "b"), (3, "c")], 2)
        --> zip -> (1, 2), ("a", "b")   then   zip -> (3,), ("c",)

    The last batch may be shorter than ``n``. Because this is a generator, the
    ValueError for ``n < 1`` is raised when iteration starts, not at call time.
    """
    if n < 1:
        raise ValueError("n must be at least one")

    source = iter(iterable)
    while True:
        chunk = tuple(itertools.islice(source, n))
        if not chunk:
            # Source exhausted.
            return
        yield zip(*chunk)