gen-synth-data

Sleeping

App Files Files Community

burtenshaw HF staff committed on Apr 23

Commit

4b83e74

•

1 Parent(s): 8773ff3

Upload 16 files

Browse files

Files changed (6) hide show

defaults.py +22 -11
pages/2_👩🏼‍🔬 Describe Domain.py +10 -2
pages/3_🌱 Generate Dataset.py +65 -65
pipeline.py +29 -5
pipeline.yaml +6 -6
project_config.json +1 -1

defaults.py CHANGED Viewed

@@ -1,12 +1,14 @@
 import json
 SEED_DATA_PATH = "seed_data.json"
 PIPELINE_PATH = "pipeline.yaml"
-REMOTE_CODE_PATHS = ["defaults.py", "domain.py", "pipeline.py"]
 DIBT_PARENT_APP_URL = "https://argilla-domain-specific-datasets-welcome.hf.space/"
 N_PERSPECTIVES = 5
 N_TOPICS = 5
 N_EXAMPLES = 5
 ################################################
 # DEFAULTS ON FARMING
@@ -25,14 +27,23 @@ DEFAULT_SYSTEM_PROMPT = DEFAULT_DATA["domain_expert_prompt"]
 # PROJECT CONFIG FROM PARENT APP
 ################################################
-with open("project_config.json") as f:
-    PROJECT_CONFIG = json.load(f)
-PROJECT_NAME = PROJECT_CONFIG["project_name"]
-ARGILLA_SPACE_REPO_ID = PROJECT_CONFIG["argilla_space_repo_id"]
-DATASET_REPO_ID = PROJECT_CONFIG["dataset_repo_id"]
-ARGILLA_SPACE_NAME = ARGILLA_SPACE_REPO_ID.replace("/", "-").replace("_", "-")
-ARGILLA_URL = f"https://{ARGILLA_SPACE_NAME}.hf.space"
-PROJECT_SPACE_REPO_ID = PROJECT_CONFIG["project_space_repo_id"]
-DATASET_URL = f"https://huggingface.co/datasets/{DATASET_REPO_ID}"
-HUB_USERNAME = DATASET_REPO_ID.split("/")[0]

+import os
 import json
 SEED_DATA_PATH = "seed_data.json"
 PIPELINE_PATH = "pipeline.yaml"
+REMOTE_CODE_PATHS = ["defaults.py", "domain.py", "pipeline.py", "requirements.txt"]
 DIBT_PARENT_APP_URL = "https://argilla-domain-specific-datasets-welcome.hf.space/"
 N_PERSPECTIVES = 5
 N_TOPICS = 5
 N_EXAMPLES = 5
+CODELESS_DISTILABEL = os.environ.get("CODELESS_DISTILABEL", True)
 ################################################
 # DEFAULTS ON FARMING
 # PROJECT CONFIG FROM PARENT APP
 ################################################
+try:
+    with open("project_config.json") as f:
+        PROJECT_CONFIG = json.load(f)
+    PROJECT_NAME = PROJECT_CONFIG["project_name"]
+    ARGILLA_SPACE_REPO_ID = PROJECT_CONFIG["argilla_space_repo_id"]
+    DATASET_REPO_ID = PROJECT_CONFIG["dataset_repo_id"]
+    ARGILLA_SPACE_NAME = ARGILLA_SPACE_REPO_ID.replace("/", "-").replace("_", "-")
+    ARGILLA_URL = f"https://{ARGILLA_SPACE_NAME}.hf.space"
+    PROJECT_SPACE_REPO_ID = PROJECT_CONFIG["project_space_repo_id"]
+    DATASET_URL = f"https://huggingface.co/datasets/{DATASET_REPO_ID}"
+    HUB_USERNAME = DATASET_REPO_ID.split("/")[0]
+except FileNotFoundError:
+    PROJECT_NAME = "DEFAULT_DOMAIN"
+    ARGILLA_SPACE_REPO_ID = ""
+    DATASET_REPO_ID = ""
+    ARGILLA_URL = ""
+    PROJECT_SPACE_REPO_ID = ""
+    DATASET_URL = ""
+    HUB_USERNAME = ""

pages/2_👩🏼‍🔬 Describe Domain.py CHANGED Viewed

@@ -14,7 +14,6 @@ from defaults import (
     N_TOPICS,
     SEED_DATA_PATH,
     PIPELINE_PATH,
-    PROJECT_NAME,
     DATASET_REPO_ID,
 )
 from utils import project_sidebar
@@ -231,9 +230,18 @@ if st.button("🤗 Push Dataset Seed") and all(
         pipeline_path=PIPELINE_PATH,
     )
-    st.sidebar.success(
         f"Dataset seed created and pushed to the Hub. Check it out [here](https://huggingface.co/datasets/{hub_username}/{project_name})"
     )
 else:
     st.info(
         "Please fill in all the required domain fields to push the dataset seed to the Hub"

     N_TOPICS,
     SEED_DATA_PATH,
     PIPELINE_PATH,
     DATASET_REPO_ID,
 )
 from utils import project_sidebar
         pipeline_path=PIPELINE_PATH,
     )
+    st.success(
         f"Dataset seed created and pushed to the Hub. Check it out [here](https://huggingface.co/datasets/{hub_username}/{project_name})"
     )
+    st.write("You can now move on to runnning your distilabel pipeline.")
+    st.page_link(
+        page="pages/3_🌱 Generate Dataset.py",
+        label="Generate Dataset",
+        icon="🌱",
+    )
 else:
     st.info(
         "Please fill in all the required domain fields to push the dataset seed to the Hub"

pages/3_🌱 Generate Dataset.py CHANGED Viewed

@@ -1,17 +1,13 @@
 import streamlit as st
-from streamlit.errors import EntryNotFoundError
 from hub import pull_seed_data_from_repo, push_pipeline_to_hub
 from defaults import (
     DEFAULT_SYSTEM_PROMPT,
     PIPELINE_PATH,
     PROJECT_NAME,
-    ARGILLA_SPACE_REPO_ID,
-    DATASET_REPO_ID,
-    ARGILLA_SPACE_NAME,
     ARGILLA_URL,
-    PROJECT_SPACE_REPO_ID,
     HUB_USERNAME,
 )
 from utils import project_sidebar
@@ -75,20 +71,21 @@ st.divider()
 st.markdown("### Run the pipeline")
 st.write(
-    "Once you've defined the pipeline configuration, you can run the pipeline locally or on this space."
 )
-st.write(
-    """We recommend running the pipeline locally if you're planning on generating a large dataset. \
-        But running the pipeline on this space is a handy way to get started quickly. Your synthetic
-        samples will be pushed to Argilla and available for review.
-        """
-)
-st.write(
-    """If you're planning on running the pipeline on the space, be aware that it \
-        will take some time to complete and you will need to maintain a \
-        connection to the space."""
-)
 if st.button("💻 Run pipeline locally", key="run_pipeline_local"):
@@ -147,13 +144,16 @@ if st.button("💻 Run pipeline locally", key="run_pipeline_local"):
             hub_token=hub_token,
             pipeline_config_path=PIPELINE_PATH,
             argilla_dataset_name=argilla_dataset_name,
         )
         st.code(
             f"""
             pip install git+https://github.com/argilla-io/distilabel.git
-            git clone https://huggingface.co/{hub_username}/{project_name}
             cd {project_name}
-            {' '.join(command_to_run[2:])}
         """,
             language="bash",
         )
@@ -163,57 +163,57 @@ if st.button("💻 Run pipeline locally", key="run_pipeline_local"):
 ###############################################################
 # SPACE
 ###############################################################
-if st.button("🔥 Run pipeline right here, right now!"):
-    if all(
-        [
-            argilla_api_key,
-            argilla_url,
-            base_url,
-            hub_username,
-            project_name,
-            hub_token,
-            argilla_dataset_name,
-        ]
-    ):
-        with st.spinner("Pulling seed data from the Hub..."):
-            try:
                 seed_data = pull_seed_data_from_repo(
                     repo_id=f"{hub_username}/{project_name}",
                     hub_token=hub_token,
                 )
-            except EntryNotFoundError:
-                st.error(
-                    "Seed data not found. Please make sure you pushed the data seed in Step 2."
                 )
-            domain = seed_data["domain"]
-            perspectives = seed_data["perspectives"]
-            topics = seed_data["topics"]
-            examples = seed_data["examples"]
-            domain_expert_prompt = seed_data["domain_expert_prompt"]
-        with st.spinner("Serializing the pipeline configuration..."):
-            serialize_pipeline(
-                argilla_api_key=argilla_api_key,
-                argilla_dataset_name=argilla_dataset_name,
-                argilla_api_url=argilla_url,
-                topics=topics,
-                perspectives=perspectives,
-                pipeline_config_path=PIPELINE_PATH,
-                domain_expert_prompt=domain_expert_prompt or DEFAULT_SYSTEM_PROMPT,
-                hub_token=hub_token,
-                endpoint_base_url=base_url,
-                examples=examples,
-            )
-        with st.spinner("Starting the pipeline..."):
-            logs = run_pipeline(PIPELINE_PATH)
-        st.success(f"Pipeline started successfully! 🚀")
-        with st.expander(label="View Logs", expanded=True):
-            for out in logs:
-                st.text(out)
-    else:
-        st.error("Please fill all the required fields.")

 import streamlit as st
 from hub import pull_seed_data_from_repo, push_pipeline_to_hub
 from defaults import (
     DEFAULT_SYSTEM_PROMPT,
     PIPELINE_PATH,
     PROJECT_NAME,
     ARGILLA_URL,
     HUB_USERNAME,
+    CODELESS_DISTILABEL,
 )
 from utils import project_sidebar
 st.markdown("### Run the pipeline")
 st.write(
+    "Once you've defined the pipeline configuration, you can run the pipeline from your local machine."
 )
+if CODELESS_DISTILABEL:
+    st.write(
+        """We recommend running the pipeline locally if you're planning on generating a large dataset. \
+            But running the pipeline on this space is a handy way to get started quickly. Your synthetic
+            samples will be pushed to Argilla and available for review.
+            """
+    )
+    st.write(
+        """If you're planning on running the pipeline on the space, be aware that it \
+            will take some time to complete and you will need to maintain a \
+            connection to the space."""
+    )
 if st.button("💻 Run pipeline locally", key="run_pipeline_local"):
             hub_token=hub_token,
             pipeline_config_path=PIPELINE_PATH,
             argilla_dataset_name=argilla_dataset_name,
+            argilla_api_key=argilla_api_key,
+            argilla_api_url=argilla_url,
         )
         st.code(
             f"""
             pip install git+https://github.com/argilla-io/distilabel.git
+            git clone https://huggingface.co/datasets/{hub_username}/{project_name}
             cd {project_name}
+            pip install -r requirements.txt
+            {' '.join(["python"] + command_to_run[1:])}
         """,
             language="bash",
         )
 ###############################################################
 # SPACE
 ###############################################################
+if CODELESS_DISTILABEL:
+    if st.button("🔥 Run pipeline right here, right now!"):
+        if all(
+            [
+                argilla_api_key,
+                argilla_url,
+                base_url,
+                hub_username,
+                project_name,
+                hub_token,
+                argilla_dataset_name,
+            ]
+        ):
+            with st.spinner("Pulling seed data from the Hub..."):
                 seed_data = pull_seed_data_from_repo(
                     repo_id=f"{hub_username}/{project_name}",
                     hub_token=hub_token,
                 )
+                domain = seed_data["domain"]
+                perspectives = seed_data["perspectives"]
+                topics = seed_data["topics"]
+                examples = seed_data["examples"]
+                domain_expert_prompt = seed_data["domain_expert_prompt"]
+            with st.spinner("Serializing the pipeline configuration..."):
+                serialize_pipeline(
+                    argilla_api_key=argilla_api_key,
+                    argilla_dataset_name=argilla_dataset_name,
+                    argilla_api_url=argilla_url,
+                    topics=topics,
+                    perspectives=perspectives,
+                    pipeline_config_path=PIPELINE_PATH,
+                    domain_expert_prompt=domain_expert_prompt or DEFAULT_SYSTEM_PROMPT,
+                    hub_token=hub_token,
+                    endpoint_base_url=base_url,
+                    examples=examples,
                 )
+            with st.spinner("Starting the pipeline..."):
+                logs = run_pipeline(
+                    pipeline_config_path=PIPELINE_PATH,
+                    argilla_api_key=argilla_api_key,
+                    argilla_api_url=argilla_url,
+                    hub_token=hub_token,
+                    argilla_dataset_name=argilla_dataset_name,
+                )
+            st.success(f"Pipeline started successfully! 🚀")
+            with st.expander(label="View Logs", expanded=True):
+                for out in logs:
+                    st.text(out)
+        else:
+            st.error("Please fill all the required fields.")

pipeline.py CHANGED Viewed

@@ -1,5 +1,5 @@
-import os
 import subprocess
 import time
 from typing import List
@@ -82,10 +82,11 @@ def define_pipeline(
             input_batch_size=8,
             input_mappings={"instruction": "evolved_questions"},
             output_mappings={"generation": "domain_expert_answer"},
-            _system_prompt=domain_expert_prompt,
-            _template=template,
         )
         keep_columns = KeepColumns(
             name="keep_columns",
             columns=["model_name", "evolved_questions", "domain_expert_answer"],
@@ -142,12 +143,15 @@ def serialize_pipeline(
 def create_pipelines_run_command(
     pipeline_config_path: str = "pipeline.yaml",
     argilla_dataset_name: str = "domain_specific_datasets",
 ):
     """Create the command to run the pipeline."""
     command_to_run = [
-        "python",
         "-m",
         "distilabel",
         "pipeline",
@@ -156,24 +160,44 @@ def create_pipelines_run_command(
         pipeline_config_path,
         "--param",
         f"text_generation_to_argilla.dataset_name={argilla_dataset_name}",
     ]
     return command_to_run
 def run_pipeline(
     pipeline_config_path: str = "pipeline.yaml",
     argilla_dataset_name: str = "domain_specific_datasets",
 ):
     """Run the pipeline and yield the output as a generator of logs."""
     command_to_run = create_pipelines_run_command(
         pipeline_config_path=pipeline_config_path,
         argilla_dataset_name=argilla_dataset_name,
     )
     # Run the script file
     process = subprocess.Popen(
-        command_to_run, stdout=subprocess.PIPE, stderr=subprocess.PIPE
     )
     while process.stdout and process.stdout.readable():

 import subprocess
+import sys
 import time
 from typing import List
             input_batch_size=8,
             input_mappings={"instruction": "evolved_questions"},
             output_mappings={"generation": "domain_expert_answer"},
         )
+        domain_expert._system_prompt = domain_expert_prompt
+        domain_expert._template = template
         keep_columns = KeepColumns(
             name="keep_columns",
             columns=["model_name", "evolved_questions", "domain_expert_answer"],
 def create_pipelines_run_command(
+    hub_token: str,
+    argilla_api_key: str,
+    argilla_api_url: str,
     pipeline_config_path: str = "pipeline.yaml",
     argilla_dataset_name: str = "domain_specific_datasets",
 ):
     """Create the command to run the pipeline."""
     command_to_run = [
+        sys.executable,
         "-m",
         "distilabel",
         "pipeline",
         pipeline_config_path,
         "--param",
         f"text_generation_to_argilla.dataset_name={argilla_dataset_name}",
+        "--param",
+        f"text_generation_to_argilla.api_key={argilla_api_key}",
+        "--param",
+        f"text_generation_to_argilla.api_url={argilla_api_url}",
+        "--param",
+        f"self-instruct.llm.api_key={hub_token}",
+        "--param",
+        f"evol_instruction_complexity.llm.api_key={hub_token}",
+        "--param",
+        f"domain_expert.llm.api_key={hub_token}",
+        "--ignore-cache",
     ]
     return command_to_run
 def run_pipeline(
+    hub_token: str,
+    argilla_api_key: str,
+    argilla_api_url: str,
     pipeline_config_path: str = "pipeline.yaml",
     argilla_dataset_name: str = "domain_specific_datasets",
 ):
     """Run the pipeline and yield the output as a generator of logs."""
     command_to_run = create_pipelines_run_command(
+        hub_token=hub_token,
         pipeline_config_path=pipeline_config_path,
         argilla_dataset_name=argilla_dataset_name,
+        argilla_api_key=argilla_api_key,
+        argilla_api_url=argilla_api_url,
     )
     # Run the script file
     process = subprocess.Popen(
+        args=command_to_run,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        env={"HF_TOKEN": hub_token},
     )
     while process.stdout and process.stdout.readable():

pipeline.yaml CHANGED Viewed

@@ -1,5 +1,5 @@
 distilabel:
-  version: 1.0.0
 pipeline:
   name: farming
   description: null
@@ -54,7 +54,7 @@ pipeline:
         model_id: null
         endpoint_name: null
         endpoint_namespace: null
-        base_url: https://hh1rkuymnetmkw9m.eu-west-1.aws.endpoints.huggingface.cloud
         tokenizer_id: null
         model_display_name: null
         use_openai_client: false
@@ -163,7 +163,7 @@ pipeline:
         model_id: null
         endpoint_name: null
         endpoint_namespace: null
-        base_url: https://hh1rkuymnetmkw9m.eu-west-1.aws.endpoints.huggingface.cloud
         tokenizer_id: null
         model_display_name: null
         use_openai_client: false
@@ -390,7 +390,7 @@ pipeline:
         model_id: null
         endpoint_name: null
         endpoint_namespace: null
-        base_url: https://hh1rkuymnetmkw9m.eu-west-1.aws.endpoints.huggingface.cloud
         tokenizer_id: null
         model_display_name: null
         use_openai_client: false
@@ -489,9 +489,9 @@ pipeline:
         generation: domain_expert_answer
       output_mappings: {}
       input_batch_size: 50
-      dataset_name: farming
       dataset_workspace: admin
-      api_url: https://argilla-farming.hf.space
       runtime_parameters_info:
       - name: input_batch_size
         optional: true

 distilabel:
+  version: 1.0.1
 pipeline:
   name: farming
   description: null
         model_id: null
         endpoint_name: null
         endpoint_namespace: null
+        base_url: https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2
         tokenizer_id: null
         model_display_name: null
         use_openai_client: false
         model_id: null
         endpoint_name: null
         endpoint_namespace: null
+        base_url: https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2
         tokenizer_id: null
         model_display_name: null
         use_openai_client: false
         model_id: null
         endpoint_name: null
         endpoint_namespace: null
+        base_url: https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.2
         tokenizer_id: null
         model_display_name: null
         use_openai_client: false
         generation: domain_expert_answer
       output_mappings: {}
       input_batch_size: 50
+      dataset_name: test_3
       dataset_workspace: admin
+      api_url: https://burtenshaw-test-3-argilla-space.hf.space
       runtime_parameters_info:
       - name: input_batch_size
         optional: true

project_config.json CHANGED Viewed

	@@ -1 +1 @@
1	- {"project_name": "~~DEFAULT_DOMAIN~~", "argilla_space_repo_id": "burtenshaw/~~domain_test_4_argilla_space~~", "project_space_repo_id": "burtenshaw/~~domain_test_4_config_space~~", "dataset_repo_id": "burtenshaw/~~domain_test_4~~"}


1	+ {"project_name": "test_3", "argilla_space_repo_id": "burtenshaw/test_3_argilla_space", "project_space_repo_id": "burtenshaw/test_3_config_space", "dataset_repo_id": "burtenshaw/test_3"}