Spaces:
Running
Running
File size: 1,216 Bytes
cb5b71d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 |
from core.data_types import convert_dtype
from core.names import find_unique_name
from core.state import Field
from core.state import FileObject
from core.state import FileSet
from core.state import RecordSet
import mlcroissant as mlc
def infer_record_sets(file: FileObject | FileSet, names: set[str]) -> list[RecordSet]:
    """Infer ml:RecordSets from a FileObject/FileSet.

    One field is created per column of `file.df`, each sourced from the
    distribution named `file.name`, and all fields are gathered into a single
    RecordSet.

    Args:
        file: The resource to inspect.
        names: Names handed to `find_unique_name` when naming the RecordSet.

    Returns:
        A one-element list holding the inferred RecordSet, or an empty list
        when `file` is a FileSet or when `file.df` is None.
    """
    # FileSets have no inference support for the moment, and a FileObject can
    # only be inspected if its underlying `pd.DataFrame` could be built.
    if isinstance(file, FileSet) or file.df is None:
        return []

    def build_field(column_name, dtype):
        # The source points at the column of the same name in this file.
        column_source = mlc.Source(
            uid=file.name,
            node_type="distribution",
            extract=mlc.Extract(column=column_name),
        )
        return Field(
            name=column_name,
            data_types=[convert_dtype(dtype)],
            source=column_source,
            references=mlc.Source(),
        )

    inferred_fields = [
        build_field(column_name, dtype)
        for column_name, dtype in file.df.dtypes.items()
    ]
    record_set_name = find_unique_name(names, file.name + "_record_set")
    record_set = RecordSet(
        fields=inferred_fields,
        name=record_set_name,
        description="",
    )
    return [record_set]
|