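"""Fetch dataset metadata from the Hugging Face Hub and cache it as Parquet/JSON."""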
import os
from datetime import datetime
from typing import Any, Dict, List

import pandas as pd
from dotenv import load_dotenv
from httpx import AsyncClient, Client
from huggingface_hub import HfApi
from huggingface_hub.utils import logging
from tqdm.auto import tqdm

load_dotenv()

HF_TOKEN = os.getenv("HF_TOKEN")
assert HF_TOKEN is not None, "You need to set HF_TOKEN in your environment variables"
USER_AGENT = os.getenv("USER_AGENT")
assert (
    USER_AGENT is not None
), "You need to set USER_AGENT in your environment variables"

logger = logging.get_logger(__name__)
headers = {
    "authorization": f"Bearer ${HF_TOKEN}",
    "user-agent": USER_AGENT,
}
client = Client(headers=headers)
async_client = AsyncClient(headers=headers)
api = HfApi(token=HF_TOKEN)


def has_card_data(dataset):
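    """Return True if the dataset object exposes a `card_data` attribute."""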
    return hasattr(dataset, "card_data")


def check_dataset_has_dataset_info(dataset):
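    """Return True if the dataset's card data includes a non-empty `dataset_info` entry."""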
    return bool(
        has_card_data(dataset)
        and hasattr(dataset.card_data, "dataset_info")
        and dataset.card_data.dataset_info is not None
    )


def parse_single_config_dataset(data):
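    """Extract the config name, column names, and raw features from a single `dataset_info` dict."""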
    config_name = data.get("config_name", "default")
    features = data.get("features", [])
    column_names = [feature.get("name") for feature in features]
    return {
        "config_name": config_name,
        "column_names": column_names,
        "features": features,
    }


def parse_multiple_config_dataset(data: List[Dict[str, Any]]):
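    """Parse each config entry of a multi-config `dataset_info` list."""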
    return [parse_single_config_dataset(d) for d in data]


def parse_dataset(dataset):
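    """Collect hub-level metadata (id, likes, downloads, tags, dates, license, language) into a flat dict."""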
    hub_id = dataset.id
    likes = dataset.likes
    downloads = dataset.downloads
    tags = dataset.tags
    created_at = dataset.created_at
    last_modified = dataset.last_modified
    license = dataset.card_data.license
    language = dataset.card_data.language
    return {
        "hub_id": hub_id,
        "likes": likes,
        "downloads": downloads,
        "tags": tags,
        "created_at": created_at,
        "last_modified": last_modified,
        "license": license,
        "language": language,
    }


def parsed_column_info(dataset_info):
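    """Normalise `dataset_info` into a list of parsed configs; return None for unexpected types."""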
    if isinstance(dataset_info, dict):
        return [parse_single_config_dataset(dataset_info)]
    elif isinstance(dataset_info, list):
        return parse_multiple_config_dataset(dataset_info)
    return None


def ensure_list_of_strings(value):
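    """Coerce a value into a list of strings (an empty list for None)."""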
    if value is None:
        return []
    if isinstance(value, list):
        return [str(item) for item in value]
    return [str(value)]


def refresh_data() -> List[Dict[str, Any]]:
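    """Build today's dataset metadata snapshot (or load a cached one), save it to Parquet/JSON, and return it as records."""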
    # current date as string
    now = datetime.now()
    # check if a file for the current date exists
    if os.path.exists(f"datasets_{now.strftime('%Y-%m-%d')}.parquet"):
        df = pd.read_parquet(f"datasets_{now.strftime('%Y-%m-%d')}.parquet")
        return df.to_dict(orient="records")

    # List all datasets
    datasets = list(api.list_datasets(limit=None, full=True))

    # Filter datasets with dataset info
    datasets = [
        dataset for dataset in tqdm(datasets) if check_dataset_has_dataset_info(dataset)
    ]

    parsed_datasets = []
    for dataset in tqdm(datasets):
        try:
            dataset_info = parse_dataset(dataset)
            column_info = parsed_column_info(dataset.card_data.dataset_info)
            # skip datasets whose dataset_info has an unexpected shape
            if not column_info:
                continue
            parsed_datasets.extend({**dataset_info, **info} for info in column_info)
        except Exception as e:
            logger.error(f"Error processing dataset {dataset.id}: {e}")
            continue

    # Convert to DataFrame
    df = pd.DataFrame(parsed_datasets)

    # Ensure 'license', 'tags', and 'language' are lists of strings
    df["license"] = df["license"].apply(ensure_list_of_strings)
    df["tags"] = df["tags"].apply(ensure_list_of_strings)
    df["language"] = df["language"].apply(ensure_list_of_strings)

    # Convert 'features' column to string
    df["features"] = df["features"].apply(lambda x: str(x) if x is not None else None)
    df = df.astype({"hub_id": "string", "config_name": "string"})

    # save to parquet file with current date
    df.to_parquet(f"datasets_{now.strftime('%Y-%m-%d')}.parquet")

    # save to JSON file with current date
    df.to_json(
        f"datasets_{now.strftime('%Y-%m-%d')}.json", orient="records", lines=True
    )

    # return a list of dictionaries
    return df.to_dict(orient="records")


if __name__ == "__main__":
    refresh_data()