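"""Load example datasets into a running Argilla instance.

The script polls the local Argilla endpoint (http://0.0.0.0:6900/) until it is
reachable and then logs several demo datasets (text classification, text
summarization, programmatic/weak labeling, and token classification) to the
"team" workspace. Assumed invocation (script name illustrative): the Argilla
API key is passed as the last command-line argument, e.g.
`python load_data.py <API_KEY>`.
"""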
import sys
import requests
import time
import pandas as pd
import argilla as rg
from datasets import load_dataset
from argilla.labeling.text_classification import Rule, add_rules


def load_datasets():
    # This code runs once the Argilla endpoint is available
    print("Argilla is available! Loading datasets")
    api_key = sys.argv[-1]
    rg.init(api_key=api_key, workspace="team")

    # load dataset from json
    my_dataframe = pd.read_json(
        "https://raw.githubusercontent.com/recognai/datasets/main/sst-sentimentclassification.json")

    # convert pandas dataframe to DatasetForTextClassification
    dataset_rg = rg.DatasetForTextClassification.from_pandas(my_dataframe)

    # Define labeling schema to avoid UI user modification
    settings = rg.TextClassificationSettings(label_schema=["POSITIVE", "NEGATIVE"])
    rg.configure_dataset(name="sst-sentiment-explainability", settings=settings)

    # log the dataset
    rg.log(
        dataset_rg,
        name="sst-sentiment-explainability",
        tags={
            "description": "The sst2 sentiment dataset with predictions from a pretrained pipeline and explanations from Transformers Interpret."
        }
    )

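    # load the first 100 records of the news-summary dataset from the Hub and read them as a Text2Text dataset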
    dataset = load_dataset("argilla/news-summary", split="train").select(range(100))
    dataset_rg = rg.read_datasets(dataset, task="Text2Text")

    # log the dataset
    rg.log(
        dataset_rg,
        name="news-text-summarization",
        tags={
            "description": "A text summarization dataset with news pieces and their predicted summaries."
        }
    )

    # Read dataset from Hub
    dataset_rg = rg.read_datasets(
        load_dataset("argilla/agnews_weak_labeling", split="train"),
        task="TextClassification",
    )

    # Define labeling schema to avoid UI user modification
    settings = rg.TextClassificationSettings(label_schema=["World", "Sports", "Sci/Tech", "Business"])
    rg.configure_dataset(name="news-programmatic-labeling", settings=settings)

    # log the dataset
    rg.log(
        dataset_rg,
        name="news-programmatic-labeling",
        tags={
            "description": "The AG News with programmatic labeling rules (see weak labeling mode in the UI)."
        }
    )

    # define queries and patterns for each category (using ES DSL)
    queries = [
        (["money", "financ*", "dollar*"], "Business"),
        (["war", "gov*", "minister*", "conflict"], "World"),
        (["*ball", "sport*", "game", "play*"], "Sports"),
        (["sci*", "techno*", "computer*", "software", "web"], "Sci/Tech"),
    ]

    # create one Rule per query term, assigning its category label
    rules = [Rule(query=term, label=label) for terms, label in queries for term in terms]

    # add rules to the dataset
    add_rules(dataset="news-programmatic-labeling", rules=rules)

    # load dataset from the hub
    dataset = load_dataset("argilla/gutenberg_spacy-ner", split="train")

    # read in the dataset, assuming it's a dataset for token classification
    dataset_rg = rg.read_datasets(dataset, task="TokenClassification")

    # Define labeling schema to avoid UI user modification
    labels = ["CARDINAL", "DATE", "EVENT", "FAC", "GPE", "LANGUAGE", "LAW", "LOC", "MONEY", "NORP", "ORDINAL", "ORG",
              "PERCENT", "PERSON", "PRODUCT", "QUANTITY", "TIME", "WORK_OF_ART"]
    settings = rg.TokenClassificationSettings(label_schema=labels)
    rg.configure_dataset(name="gutenberg_spacy-ner-monitoring", settings=settings)

    # log the dataset
    rg.log(
        dataset_rg,
        "gutenberg_spacy-ner-monitoring",
        tags={
            "description": "A dataset containing text from books with predictions from two spaCy NER pre-trained models."
        }
    )


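# poll the Argilla endpoint until it responds, then load the datasets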
while True:
    try:
        response = requests.get("http://0.0.0.0:6900/")
        if response.status_code == 200:
            load_datasets()
            break
        else:
            time.sleep(10)
    except Exception as e:
        print(e)
        time.sleep(10)