# Distilabel pipeline script: generate domain-specific SelfInstruct data from a
# seed dataset hosted on the Hugging Face Hub.
# Standard library
import json
import os
import sys

# Third-party
from distilabel.llms import InferenceEndpointsLLM
from distilabel.pipeline import Pipeline
from distilabel.steps import (
    ExpandColumns,
    LoadDataFromDicts,
    TextGenerationToArgilla,
)
from distilabel.steps.tasks import SelfInstruct
from huggingface_hub import hf_hub_download
def run(repo_id):
    """Build and run a SelfInstruct distilabel pipeline configured from a Hub repo.

    Downloads ``pipeline_params.json`` and ``seed_data.json`` from the given
    dataset repo, builds a two-step pipeline (load seed terms -> SelfInstruct
    against an inference endpoint), and runs it.

    Args:
        repo_id: Hugging Face dataset repo id that holds the two config files.
    """
    # Token for the inference endpoint; may be None if HF_TOKEN is unset.
    hub_token = os.environ.get("HF_TOKEN")

    # Pipeline parameters published alongside the dataset.
    with open(
        hf_hub_download(
            repo_id=repo_id, filename="pipeline_params.json", repo_type="dataset"
        ),
        "r",
        encoding="utf-8",
    ) as f:
        params = json.load(f)
    self_instruct_base_url = params.get("self_instruct_base_url")
    self_instruct_num_generations = params.get("self_instruct_num_generations", 2)
    self_instruct_temperature = params.get("self_instruct_temperature", 0.9)
    self_instruct_max_new_tokens = params.get("self_instruct_max_new_tokens", 1024)
    # NOTE(review): pipeline_params.json also carries domain_expert_* settings,
    # but no domain-expert step exists below, so they are intentionally not read
    # here; restore them when that step is added.

    # Seed data: the application description and the terms to expand.
    with open(
        hf_hub_download(
            repo_id=repo_id, filename="seed_data.json", repo_type="dataset"
        ),
        "r",
        encoding="utf-8",
    ) as f:
        seed_data = json.load(f)
    application_instruction = seed_data.get("application_instruction")
    domain_name = seed_data.get("domain")
    terms = seed_data.get("seed_terms")

    with Pipeline(domain_name) as pipeline:
        load_data = LoadDataFromDicts(
            name="load_data",
            batch_size=64,
            data=[{"input": term} for term in terms],
        )
        self_instruct = SelfInstruct(
            name="self_instruct",
            num_instructions=self_instruct_num_generations,
            input_batch_size=8,
            llm=InferenceEndpointsLLM(
                api_key=hub_token,
                base_url=self_instruct_base_url,
            ),
            application_description=application_instruction,
        )
        # Connect up the pipeline: seed terms feed the SelfInstruct task.
        load_data.connect(self_instruct)

        # Run the pipeline; generation kwargs are supplied at run time so they
        # stay configurable via pipeline_params.json.
        pipeline.run(
            use_cache=False,
            parameters={
                "self_instruct": {
                    "llm": {
                        "generation_kwargs": {
                            "max_new_tokens": self_instruct_max_new_tokens,
                            "temperature": self_instruct_temperature,
                        },
                    }
                },
            },
        )
if __name__ == "__main__":
    # Usage: python <script> <hub-dataset-repo-id>
    if len(sys.argv) != 2:
        sys.exit("usage: python <script> <repo_id>")
    run(sys.argv[1])