import sys
import time

import pandas as pd
import requests

import argilla as rg
from argilla.labeling.text_classification import Rule, add_rules
from datasets import load_dataset


def load_datasets():
    # This is the code that you want to execute when the endpoint is available
    print("Argilla is available! Loading datasets")
    api_key = sys.argv[-1]
    rg.init(api_key=api_key, workspace="admin")

    # load dataset from json
    my_dataframe = pd.read_json(
        "https://raw.githubusercontent.com/recognai/datasets/main/sst-sentimentclassification.json"
    )

    # convert pandas dataframe to DatasetForTextClassification
    dataset_rg = rg.DatasetForTextClassification.from_pandas(my_dataframe)

    # Define labeling schema to avoid UI user modification
    settings = rg.TextClassificationSettings(label_schema=["POSITIVE", "NEGATIVE"])
    rg.configure_dataset(name="sst-sentiment-explainability", settings=settings)

    # log the dataset
    rg.log(
        dataset_rg,
        name="sst-sentiment-explainability",
        tags={
            "description": "The sst2 sentiment dataset with predictions from a pretrained pipeline and explanations from Transformers Interpret."
        },
    )

    dataset = load_dataset("argilla/news-summary", split="train").select(range(100))
    dataset_rg = rg.read_datasets(dataset, task="Text2Text")

    # log the dataset
    rg.log(
        dataset_rg,
        name="news-text-summarization",
        tags={
            "description": "A text summarization dataset with news pieces and their predicted summaries."
        },
    )

    # Read dataset from Hub
    dataset_rg = rg.read_datasets(
        load_dataset("argilla/agnews_weak_labeling", split="train"),
        task="TextClassification",
    )

    # Define labeling schema to avoid UI user modification
    settings = rg.TextClassificationSettings(
        label_schema=["World", "Sports", "Sci/Tech", "Business"]
    )
    rg.configure_dataset(name="news-programmatic-labeling", settings=settings)

    # log the dataset
    rg.log(
        dataset_rg,
        name="news-programmatic-labeling",
        tags={
            "description": "The AG News with programmatic labeling rules (see weak labeling mode in the UI)."
        },
    )

    # define queries and patterns for each category (using ES DSL)
    queries = [
        (["money", "financ*", "dollar*"], "Business"),
        (["war", "gov*", "minister*", "conflict"], "World"),
        (["*ball", "sport*", "game", "play*"], "Sports"),
        (["sci*", "techno*", "computer*", "software", "web"], "Sci/Tech"),
    ]

    # define rules
    rules = [
        Rule(query=term, label=label) for terms, label in queries for term in terms
    ]

    # add rules to the dataset
    add_rules(dataset="news-programmatic-labeling", rules=rules)

    # load dataset from the hub
    dataset = load_dataset("argilla/gutenberg_spacy-ner", split="train")

    # read in the dataset, assuming it's a dataset for token classification
    dataset_rg = rg.read_datasets(dataset, task="TokenClassification")

    # Define labeling schema to avoid UI user modification
    labels = [
        "CARDINAL", "DATE", "EVENT", "FAC", "GPE", "LANGUAGE", "LAW", "LOC",
        "MONEY", "NORP", "ORDINAL", "ORG", "PERCENT", "PERSON", "PRODUCT",
        "QUANTITY", "TIME", "WORK_OF_ART",
    ]
    settings = rg.TokenClassificationSettings(label_schema=labels)
    rg.configure_dataset(name="gutenberg_spacy-ner-monitoring", settings=settings)

    # log the dataset
    rg.log(
        dataset_rg,
        "gutenberg_spacy-ner-monitoring",
        tags={
            "description": "A dataset containing text from books with predictions from two spaCy NER pre-trained models."
        },
    )


# Poll the local Argilla endpoint until it responds, then load the datasets once
while True:
    try:
        response = requests.get("http://0.0.0.0:6900/")
        if response.status_code == 200:
            load_datasets()
            break
        else:
            time.sleep(10)
    except Exception as e:
        print(e)
        time.sleep(10)