rhoitjadhav committed
Commit f79718f
1 Parent(s): fcffc62

update dockerfile
Files changed (3)
  1. Dockerfile +6 -2
  2. load_data.py +56 -27
  3. start.sh +18 -7
Dockerfile CHANGED
@@ -1,6 +1,7 @@
 FROM docker.elastic.co/elasticsearch/elasticsearch:8.5.3
 
 # Environment variable
+ENV ARGILLA_LOCAL_AUTH_USERS_DB_FILE=/usr/share/elasticsearch/users.yml
 ENV TEAM_PASSWORD=1234
 ENV ARGILLA_PASSWORD=1234
 ENV TEAM_API_KEY=team.apikey
@@ -23,13 +24,16 @@ RUN pip3 install datasets
 
 COPY start.sh /
 RUN chmod +x /start.sh
+
+COPY scripts/load_data.py /
 COPY *.whl /packages/
 
 # Install argilla
 RUN for wheel in /packages/*.whl; do pip install "$wheel"[server]; done
 
 USER elasticsearch
-RUN touch $HOME/users.yml
-RUN chown -R elasticsearch:elasticsearch $HOME/users.yml
+RUN touch "$HOME"/users.yml
+RUN chown -R elasticsearch:elasticsearch "$HOME"/users.yml
+RUN chmod 777 "$HOME"/users.yml
 
 CMD ["/start.sh"]
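The new ARGILLA_LOCAL_AUTH_USERS_DB_FILE variable points Argilla's local authentication at the users.yml that start.sh writes into the elasticsearch user's home directory (/usr/share/elasticsearch). A quick way to sanity-check that file from inside the container is a minimal sketch like the following, assuming PyYAML is available; the expected field set is taken from the heredoc in start.sh:

import yaml

# Path matches the ARGILLA_LOCAL_AUTH_USERS_DB_FILE env var set in the Dockerfile.
USERS_DB = "/usr/share/elasticsearch/users.yml"

with open(USERS_DB) as f:
    users = yaml.safe_load(f)

for user in users:
    # Fields written by the heredoc in start.sh.
    assert {"username", "api_key", "hashed_password", "workspaces"} <= user.keys()
    print(user["username"], "->", user["workspaces"])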
load_data.py CHANGED
@@ -1,10 +1,11 @@
-import os
 import sys
-import requests
 import time
+
 import pandas as pd
-import argilla as rg
+import requests
 from datasets import load_dataset
+
+import argilla as rg
 from argilla.labeling.text_classification import Rule, add_rules
 
 
@@ -16,13 +17,14 @@ def load_datasets():
 
     # load dataset from json
     my_dataframe = pd.read_json(
-        "https://raw.githubusercontent.com/recognai/datasets/main/sst-sentimentclassification.json")
+        "https://raw.githubusercontent.com/recognai/datasets/main/sst-sentimentclassification.json"
+    )
 
     # convert pandas dataframe to DatasetForTextClassification
     dataset_rg = rg.DatasetForTextClassification.from_pandas(my_dataframe)
 
     # Define labeling schema to avoid UI user modification
-    settings = rg.TextClassificationSettings(label_schema=["POSITIVE", "NEGATIVE"])
+    settings = rg.TextClassificationSettings(label_schema={"POSITIVE", "NEGATIVE"})
     rg.configure_dataset(name="sst-sentiment-explainability", settings=settings)
 
     # log the dataset
@@ -30,8 +32,9 @@ def load_datasets():
         dataset_rg,
         name="sst-sentiment-explainability",
         tags={
-            "description": "The sst2 sentiment dataset with predictions from a pretrained pipeline and explanations from Transformers Interpret."
-        }
+            "description": "The sst2 sentiment dataset with predictions from a pretrained pipeline and explanations "
+            "from Transformers Interpret. "
+        },
     )
 
     dataset = load_dataset("argilla/news-summary", split="train").select(range(100))
@@ -43,7 +46,7 @@ def load_datasets():
         name="news-text-summarization",
         tags={
             "description": "A text summarization dataset with news pieces and their predicted summaries."
-        }
+        },
     )
 
     # Read dataset from Hub
@@ -53,7 +56,9 @@ def load_datasets():
     )
 
     # Define labeling schema to avoid UI user modification
-    settings = rg.TextClassificationSettings(label_schema=["World", "Sports", "Sci/Tech", "Business"])
+    settings = rg.TextClassificationSettings(
+        label_schema={"World", "Sports", "Sci/Tech", "Business"}
+    )
     rg.configure_dataset(name="news-programmatic-labeling", settings=settings)
 
     # log the dataset
@@ -62,7 +67,7 @@ def load_datasets():
         name="news-programmatic-labeling",
         tags={
             "description": "The AG News with programmatic labeling rules (see weak labeling mode in the UI)."
-        }
+        },
     )
 
     # define queries and patterns for each category (using ES DSL)
@@ -74,7 +79,9 @@ def load_datasets():
     ]
 
     # define rules
-    rules = [Rule(query=term, label=label) for terms, label in queries for term in terms]
+    rules = [
+        Rule(query=term, label=label) for terms, label in queries for term in terms
+    ]
 
     # add rules to the dataset
     add_rules(dataset="news-programmatic-labeling", rules=rules)
@@ -82,12 +89,30 @@ def load_datasets():
     # load dataset from the hub
     dataset = load_dataset("argilla/gutenberg_spacy-ner", split="train")
 
-    # read in dataset, assuming its a dataset for token classification
+    # read in dataset, assuming it's a dataset for token classification
    dataset_rg = rg.read_datasets(dataset, task="TokenClassification")
 
     # Define labeling schema to avoid UI user modification
-    labels = ["CARDINAL", "DATE", "EVENT", "FAC", "GPE", "LANGUAGE", "LAW", "LOC", "MONEY", "NORP", "ORDINAL", "ORG",
-              "PERCENT", "PERSON", "PRODUCT", "QUANTITY", "TIME", "WORK_OF_ART"]
+    labels = {
+        "CARDINAL",
+        "DATE",
+        "EVENT",
+        "FAC",
+        "GPE",
+        "LANGUAGE",
+        "LAW",
+        "LOC",
+        "MONEY",
+        "NORP",
+        "ORDINAL",
+        "ORG",
+        "PERCENT",
+        "PERSON",
+        "PRODUCT",
+        "QUANTITY",
+        "TIME",
+        "WORK_OF_ART",
+    }
     settings = rg.TokenClassificationSettings(label_schema=labels)
     rg.configure_dataset(name="gutenberg_spacy-ner-monitoring", settings=settings)
 
@@ -96,20 +121,24 @@ def load_datasets():
         dataset_rg,
         "gutenberg_spacy-ner-monitoring",
         tags={
-            "description": "A dataset containing text from books with predictions from two spaCy NER pre-trained models."
-        }
+            "description": "A dataset containing text from books with predictions from two spaCy NER pre-trained "
+            "models. "
+        },
     )
 
 
-while True:
-    try:
-        response = requests.get("http://0.0.0.0:6900/")
-        if response.status_code == 200:
-            load_datasets()
-            break
-        else:
+if __name__ == "__main__":
+    while True:
+        try:
+            response = requests.get("http://0.0.0.0:6900/")
+            if response.status_code == 200:
+                load_datasets()
+                break
+        except requests.exceptions.ConnectionError:
+            pass
+        except Exception as e:
+            print(e)
             time.sleep(10)
-    except Exception as e:
-        print(e)
-        time.sleep(10)
-        pass
+            pass
+
+        time.sleep(5)
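The -74,7 +79,9 hunk shows only the tail of the queries list, so the shape the reformatted comprehension iterates over is not visible here. Based on the unchanged context (each entry pairs a list of ES DSL query terms with a label), it presumably looks something like this sketch; the query strings and labels below are illustrative stand-ins, not the ones in the file:

from argilla.labeling.text_classification import Rule

# Hypothetical stand-in for the elided `queries` list: each entry pairs
# several ES DSL query strings with the label those queries should assign.
queries = [
    (["money", "financ*", "dollar*"], "Business"),
    (["war", "gov*", "minister*"], "World"),
]

# Same comprehension as in the diff: one Rule per (term, label) pair.
rules = [
    Rule(query=term, label=label) for terms, label in queries for term in terms
]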
start.sh CHANGED
@@ -2,32 +2,43 @@
 
 set -e
 
-# Start Elasticsearch
-echo "Starting Elasticsearch"
-elasticsearch 1>/dev/null 2>/dev/null &
 whoami
 
+# Generate hashed passwords
+team_password=$(htpasswd -nbB "" "$TEAM_PASSWORD" | cut -d ":" -f 2 | tr -d "\n")
+argilla_password=$(htpasswd -nbB "" "$ARGILLA_PASSWORD" | cut -d ":" -f 2 | tr -d "\n")
+
 # Create users.yml file
 echo "Creating users schema"
 cat >"$HOME"/users.yml <<EOF
 - username: "team"
-  api_key: TEAM_API_KEY
+  api_key: $TEAM_API_KEY
   full_name: Team
   email: [email protected]
-  hashed_password: TEAM_PASSWORD
+  hashed_password: $team_password
   workspaces: []
 
 - username: "argilla"
-  api_key: ARGILLA_API_KEY
+  api_key: $ARGILLA_API_KEY
   full_name: Argilla
   email: [email protected]
-  hashed_password: ARGILLA_PASSWORD
+  hashed_password: $argilla_password
   workspaces: ["team"]
 EOF
 
+# Start Elasticsearch
+echo "Starting Elasticsearch"
+elasticsearch 1>/dev/null 2>/dev/null &
+
 echo "Waiting for elasticsearch to start"
 sleep 15
 
+# Load data
+if [ "$LOAD_DATA_ENABLE" == "true" ]; then
+    echo "Starting to load data"
+    python3.9 /load_data.py "$TEAM_API_KEY" &
+fi
+
 # Start Argilla
 echo "Starting Argilla"
 uvicorn argilla:app --host "0.0.0.0"
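start.sh now passes $TEAM_API_KEY to load_data.py as a positional argument, but the hunks above never show the script consuming it. Presumably it feeds client initialization; a hedged sketch of what the top of the script might do (rg.init with api_url/api_key is the standard argilla client call, while the argument handling here is an assumption):

import sys

import argilla as rg

# Assumption: start.sh invokes `python3.9 /load_data.py "$TEAM_API_KEY" &`,
# so the team API key arrives as the first positional argument.
api_key = sys.argv[1] if len(sys.argv) > 1 else "team.apikey"

# Point the client at the local server started in this same container.
rg.init(api_url="http://0.0.0.0:6900", api_key=api_key)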