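"""Load demo datasets into a running Argilla instance.

Intended for a Hugging Face Space: the script polls the local Argilla
server until it responds, then logs example datasets for text
classification, text summarization, weak labeling, and NER.

Assumed invocation (from the Space's start script):

    python load_datasets.py <argilla-api-key>
"""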
import sys
import time

import pandas as pd
import requests
from datasets import load_dataset

import argilla as rg
from argilla.labeling.text_classification import Rule, add_rules


def load_datasets():
    # This code is executed once the Argilla endpoint is available
    print("Argilla is available! Loading datasets")
    api_key = sys.argv[-1]
    rg.init(api_key=api_key, workspace="team")

    # load dataset from json
    my_dataframe = pd.read_json(
        "https://raw.githubusercontent.com/recognai/datasets/main/sst-sentimentclassification.json"
    )
    # convert pandas dataframe to DatasetForTextClassification
    dataset_rg = rg.DatasetForTextClassification.from_pandas(my_dataframe)
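    # Assumption: the JSON columns already follow the TextClassificationRecord
    # schema (fields like "text", "prediction", "annotation"), which is what
    # from_pandas expects when mapping dataframe columns onto records.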

    # Define labeling schema to avoid UI user modification
    settings = rg.TextClassificationSettings(label_schema={"POSITIVE", "NEGATIVE"})
    rg.configure_dataset(name="sst-sentiment-explainability", settings=settings)

    # log the dataset
    rg.log(
        dataset_rg,
        name="sst-sentiment-explainability",
        tags={
            "description": "The sst2 sentiment dataset with predictions from a pretrained pipeline and explanations "
            "from Transformers Interpret."
        },
    )
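    # Calling configure_dataset before rg.log pins the label schema, so
    # annotators can only pick POSITIVE or NEGATIVE in the UI.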

    # load the first 100 examples of the news-summary dataset from the Hub
    dataset = load_dataset("argilla/news-summary", split="train").select(range(100))
    dataset_rg = rg.read_datasets(dataset, task="Text2Text")

    # log the dataset
    rg.log(
        dataset_rg,
        name="news-text-summarization",
        tags={
            "description": "A text summarization dataset with news pieces and their predicted summaries."
        },
    )
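    # read_datasets with task="Text2Text" turns each row into a Text2TextRecord
    # (the news text plus its predicted summaries).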

    # Read the AG News weak labeling dataset from the Hub
    dataset_rg = rg.read_datasets(
        load_dataset("argilla/agnews_weak_labeling", split="train"),
        task="TextClassification",
    )

    # Define labeling schema to avoid UI user modification
    settings = rg.TextClassificationSettings(
        label_schema={"World", "Sports", "Sci/Tech", "Business"}
    )
    rg.configure_dataset(name="news-programmatic-labeling", settings=settings)

    # log the dataset
    rg.log(
        dataset_rg,
        name="news-programmatic-labeling",
        tags={
            "description": "The AG News dataset with programmatic labeling rules (see the weak labeling mode in the UI)."
        },
    )

    # define queries and patterns for each category (using the Elasticsearch DSL)
    queries = [
        (["money", "financ*", "dollar*"], "Business"),
        (["war", "gov*", "minister*", "conflict"], "World"),
        (["*ball", "sport*", "game", "play*"], "Sports"),
        (["sci*", "techno*", "computer*", "software", "web"], "Sci/Tech"),
    ]

    # define one labeling rule per term
    rules = [
        Rule(query=term, label=label) for terms, label in queries for term in terms
    ]

    # add the rules to the dataset
    add_rules(dataset="news-programmatic-labeling", rules=rules)
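    # The comprehension expands to one Rule per term, e.g.
    # Rule(query="money", label="Business"), Rule(query="financ*", label="Business"), ...
    # Wildcards are resolved by Elasticsearch, and the rules appear in the
    # dataset's weak labeling view in the UI.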

    # load dataset from the Hub
    dataset = load_dataset("argilla/gutenberg_spacy-ner", split="train")

    # read in the dataset, assuming it's a dataset for token classification
    dataset_rg = rg.read_datasets(dataset, task="TokenClassification")

    # Define labeling schema to avoid UI user modification
    labels = {
        "CARDINAL",
        "DATE",
        "EVENT",
        "FAC",
        "GPE",
        "LANGUAGE",
        "LAW",
        "LOC",
        "MONEY",
        "NORP",
        "ORDINAL",
        "ORG",
        "PERCENT",
        "PERSON",
        "PRODUCT",
        "QUANTITY",
        "TIME",
        "WORK_OF_ART",
    }
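    # Note: these are the 18 OntoNotes 5 entity types that spaCy's pretrained
    # English NER models predict, matching the predictions in this dataset.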
    settings = rg.TokenClassificationSettings(label_schema=labels)
    rg.configure_dataset(name="gutenberg_spacy-ner-monitoring", settings=settings)

    # log the dataset
    rg.log(
        dataset_rg,
        name="gutenberg_spacy-ner-monitoring",
        tags={
            "description": "A dataset containing text from books with predictions from two spaCy NER pre-trained "
            "models."
        },
    )
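

# Poll the local Argilla endpoint until it responds with HTTP 200, then load
# the demo datasets once. Retry every 5 seconds; unexpected errors get an
# extra 10-second backoff before the next attempt.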
if __name__ == "__main__":
    while True:
        try:
            response = requests.get("http://0.0.0.0:6900/")
            if response.status_code == 200:
                load_datasets()
                break
        except requests.exceptions.ConnectionError:
            pass
        except Exception as e:
            print(e)
            time.sleep(10)
        time.sleep(5)