File size: 1,216 Bytes
cb5b71d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
from core.data_types import convert_dtype
from core.names import find_unique_name
from core.state import Field
from core.state import FileObject
from core.state import FileSet
from core.state import RecordSet
import mlcroissant as mlc


def infer_record_sets(file: FileObject | FileSet, names: set[str]) -> list[RecordSet]:
    """Infers one or several ml:RecordSets from a FileOject/FileSet."""
    # For the moment, there is no inference support for FileSets.
    if isinstance(file, FileSet):
        return []
    # We can infer only if the underlying `pd.DataFrame` could be built.
    if file.df is None:
        return []
    fields = []
    for column, value in file.df.dtypes.items():
        source = mlc.Source(
            uid=file.name,
            node_type="distribution",
            extract=mlc.Extract(column=column),
        )
        field = Field(
            name=column,
            data_types=[convert_dtype(value)],
            source=source,
            references=mlc.Source(),
        )
        fields.append(field)
    return [
        RecordSet(
            fields=fields,
            name=find_unique_name(names, file.name + "_record_set"),
            description="",
        )
    ]