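"""Populate a running Argilla instance with example datasets.

The script waits for the Argilla server to become reachable, then logs four
datasets (sentiment explainability, text summarization, programmatic labeling,
and token classification monitoring) so the UI has content to explore.
"""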
import sys
import time

import pandas as pd
import requests
from datasets import load_dataset

import argilla as rg
from argilla.labeling.text_classification import Rule, add_rules


def load_datasets():
    # This code runs once the Argilla endpoint becomes available
    print("Argilla is available! Loading datasets")
    api_key = sys.argv[-1]
    rg.init(api_key=api_key, workspace="team")

    # Load the dataset from a JSON file
    my_dataframe = pd.read_json(
        "https://raw.githubusercontent.com/recognai/datasets/main/sst-sentimentclassification.json"
    )

    # Convert the pandas DataFrame to a DatasetForTextClassification
    dataset_rg = rg.DatasetForTextClassification.from_pandas(my_dataframe)

    # Define the labeling schema to prevent modification from the UI
    settings = rg.TextClassificationSettings(label_schema={"POSITIVE", "NEGATIVE"})
    rg.configure_dataset(name="sst-sentiment-explainability", settings=settings)

    # Log the dataset
    rg.log(
        dataset_rg,
        name="sst-sentiment-explainability",
        tags={
            "description": "The sst2 sentiment dataset with predictions from a pretrained pipeline and explanations "
            "from Transformers Interpret."
        },
    )
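
    # Load the first 100 records of the news-summary dataset from the Hub
    # and read them in as a Text2Text (summarization) dataset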
dataset = load_dataset("argilla/news-summary", split="train").select(range(100))
dataset_rg = rg.read_datasets(dataset, task="Text2Text")
# log the dataset
rg.log(
dataset_rg,
name="news-text-summarization",
tags={
"description": "A text summarization dataset with news pieces and their predicted summaries."
},
)

    # Read the dataset from the Hub
    dataset_rg = rg.read_datasets(
        load_dataset("argilla/agnews_weak_labeling", split="train"),
        task="TextClassification",
    )

    # Define the labeling schema to prevent modification from the UI
    settings = rg.TextClassificationSettings(
        label_schema={"World", "Sports", "Sci/Tech", "Business"}
    )
    rg.configure_dataset(name="news-programmatic-labeling", settings=settings)

    # Log the dataset
    rg.log(
        dataset_rg,
        name="news-programmatic-labeling",
        tags={
            "description": "The AG News dataset with programmatic labeling rules (see the weak labeling mode in the UI)."
        },
    )

    # Define queries and patterns for each category (using the Elasticsearch DSL)
    queries = [
        (["money", "financ*", "dollar*"], "Business"),
        (["war", "gov*", "minister*", "conflict"], "World"),
        (["*ball", "sport*", "game", "play*"], "Sports"),
        (["sci*", "techno*", "computer*", "software", "web"], "Sci/Tech"),
    ]

    # Create one rule per term, each labeling its matches with that category
    rules = [
        Rule(query=term, label=label) for terms, label in queries for term in terms
    ]

    # Add the rules to the dataset
    add_rules(dataset="news-programmatic-labeling", rules=rules)
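
    # A possible follow-up (not executed here): once the rules are added, their
    # coverage could be inspected with Argilla's weak supervision tooling, e.g.
    #   from argilla.labeling.text_classification import WeakLabels
    #   weak_labels = WeakLabels(dataset="news-programmatic-labeling")
    #   weak_labels.summary()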

    # Load the dataset from the Hub
    dataset = load_dataset("argilla/gutenberg_spacy-ner", split="train")

    # Read in the dataset, assuming it is a dataset for token classification
    dataset_rg = rg.read_datasets(dataset, task="TokenClassification")

    # Define the labeling schema to prevent modification from the UI
    labels = {
        "CARDINAL",
        "DATE",
        "EVENT",
        "FAC",
        "GPE",
        "LANGUAGE",
        "LAW",
        "LOC",
        "MONEY",
        "NORP",
        "ORDINAL",
        "ORG",
        "PERCENT",
        "PERSON",
        "PRODUCT",
        "QUANTITY",
        "TIME",
        "WORK_OF_ART",
    }
    settings = rg.TokenClassificationSettings(label_schema=labels)
    rg.configure_dataset(name="gutenberg_spacy-ner-monitoring", settings=settings)

    # Log the dataset
    rg.log(
        dataset_rg,
        "gutenberg_spacy-ner-monitoring",
        tags={
            "description": "A dataset containing text from books with predictions from two spaCy NER pre-trained "
            "models."
        },
    )
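

# Poll the local Argilla server (6900 is Argilla's default port) until it
# responds, then load the datasets. The API key is expected as the last
# command-line argument, e.g.: python <this_script>.py <api_key>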
if __name__ == "__main__":
    while True:
        try:
            response = requests.get("http://0.0.0.0:6900/")
            if response.status_code == 200:
                load_datasets()
                break
        except requests.exceptions.ConnectionError:
            pass
        except Exception as e:
            print(e)
            time.sleep(10)
        time.sleep(5)