rhoitjadhav committed
Commit f79718f
1 Parent(s): fcffc62

update dockerfile
Files changed (3)
  1. Dockerfile +6 -2
  2. load_data.py +56 -27
  3. start.sh +18 -7
Dockerfile CHANGED
@@ -1,6 +1,7 @@
 FROM docker.elastic.co/elasticsearch/elasticsearch:8.5.3
 
 # Environment variable
+ENV ARGILLA_LOCAL_AUTH_USERS_DB_FILE=/usr/share/elasticsearch/users.yml
 ENV TEAM_PASSWORD=1234
 ENV ARGILLA_PASSWORD=1234
 ENV TEAM_API_KEY=team.apikey
@@ -23,13 +24,16 @@ RUN pip3 install datasets
 
 COPY start.sh /
 RUN chmod +x /start.sh
+
+COPY scripts/load_data.py /
 COPY *.whl /packages/
 
 # Install argilla
 RUN for wheel in /packages/*.whl; do pip install "$wheel"[server]; done
 
 USER elasticsearch
-RUN touch $HOME/users.yml
-RUN chown -R elasticsearch:elasticsearch $HOME/users.yml
+RUN touch "$HOME"/users.yml
+RUN chown -R elasticsearch:elasticsearch "$HOME"/users.yml
+RUN chmod 777 "$HOME"/users.yml
 
 CMD ["/start.sh"]
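The new ARGILLA_LOCAL_AUTH_USERS_DB_FILE variable points Argilla's local authentication at the users.yml that start.sh writes into the elasticsearch user's home directory (/usr/share/elasticsearch). A quick way to sanity-check that file from inside the container is a minimal sketch like the following, assuming PyYAML is available; the expected field set is taken from the heredoc in start.sh:

import yaml

# Path matches the ARGILLA_LOCAL_AUTH_USERS_DB_FILE env var set in the Dockerfile.
USERS_DB = "/usr/share/elasticsearch/users.yml"

with open(USERS_DB) as f:
    users = yaml.safe_load(f)

for user in users:
    # Fields written by the heredoc in start.sh.
    assert {"username", "api_key", "hashed_password", "workspaces"} <= user.keys()
    print(user["username"], "->", user["workspaces"])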
load_data.py CHANGED
@@ -1,10 +1,11 @@
-import os
 import sys
-import requests
 import time
+
 import pandas as pd
-import argilla as rg
+import requests
 from datasets import load_dataset
+
+import argilla as rg
 from argilla.labeling.text_classification import Rule, add_rules
 
 
@@ -16,13 +17,14 @@ def load_datasets():
 
     # load dataset from json
     my_dataframe = pd.read_json(
-        "https://raw.githubusercontent.com/recognai/datasets/main/sst-sentimentclassification.json")
+        "https://raw.githubusercontent.com/recognai/datasets/main/sst-sentimentclassification.json"
+    )
 
     # convert pandas dataframe to DatasetForTextClassification
     dataset_rg = rg.DatasetForTextClassification.from_pandas(my_dataframe)
 
     # Define labeling schema to avoid UI user modification
-    settings = rg.TextClassificationSettings(label_schema=["POSITIVE", "NEGATIVE"])
+    settings = rg.TextClassificationSettings(label_schema={"POSITIVE", "NEGATIVE"})
     rg.configure_dataset(name="sst-sentiment-explainability", settings=settings)
 
     # log the dataset
@@ -30,8 +32,9 @@ def load_datasets():
         dataset_rg,
         name="sst-sentiment-explainability",
         tags={
-            "description": "The sst2 sentiment dataset with predictions from a pretrained pipeline and explanations from Transformers Interpret."
-        }
+            "description": "The sst2 sentiment dataset with predictions from a pretrained pipeline and explanations "
+            "from Transformers Interpret. "
+        },
     )
 
     dataset = load_dataset("argilla/news-summary", split="train").select(range(100))
@@ -43,7 +46,7 @@ def load_datasets():
         name="news-text-summarization",
         tags={
             "description": "A text summarization dataset with news pieces and their predicted summaries."
-        }
+        },
     )
 
     # Read dataset from Hub
@@ -53,7 +56,9 @@ def load_datasets():
     )
 
     # Define labeling schema to avoid UI user modification
-    settings = rg.TextClassificationSettings(label_schema=["World", "Sports", "Sci/Tech", "Business"])
+    settings = rg.TextClassificationSettings(
+        label_schema={"World", "Sports", "Sci/Tech", "Business"}
+    )
     rg.configure_dataset(name="news-programmatic-labeling", settings=settings)
 
     # log the dataset
@@ -62,7 +67,7 @@ def load_datasets():
         name="news-programmatic-labeling",
         tags={
             "description": "The AG News with programmatic labeling rules (see weak labeling mode in the UI)."
-        }
+        },
     )
 
     # define queries and patterns for each category (using ES DSL)
@@ -74,7 +79,9 @@ def load_datasets():
     ]
 
     # define rules
-    rules = [Rule(query=term, label=label) for terms, label in queries for term in terms]
+    rules = [
+        Rule(query=term, label=label) for terms, label in queries for term in terms
+    ]
 
     # add rules to the dataset
     add_rules(dataset="news-programmatic-labeling", rules=rules)
@@ -82,12 +89,30 @@ def load_datasets():
     # load dataset from the hub
     dataset = load_dataset("argilla/gutenberg_spacy-ner", split="train")
 
-    # read in dataset, assuming its a dataset for token classification
+    # read in dataset, assuming it's a dataset for token classification
    dataset_rg = rg.read_datasets(dataset, task="TokenClassification")
 
     # Define labeling schema to avoid UI user modification
-    labels = ["CARDINAL", "DATE", "EVENT", "FAC", "GPE", "LANGUAGE", "LAW", "LOC", "MONEY", "NORP", "ORDINAL", "ORG",
-              "PERCENT", "PERSON", "PRODUCT", "QUANTITY", "TIME", "WORK_OF_ART"]
+    labels = {
+        "CARDINAL",
+        "DATE",
+        "EVENT",
+        "FAC",
+        "GPE",
+        "LANGUAGE",
+        "LAW",
+        "LOC",
+        "MONEY",
+        "NORP",
+        "ORDINAL",
+        "ORG",
+        "PERCENT",
+        "PERSON",
+        "PRODUCT",
+        "QUANTITY",
+        "TIME",
+        "WORK_OF_ART",
+    }
     settings = rg.TokenClassificationSettings(label_schema=labels)
     rg.configure_dataset(name="gutenberg_spacy-ner-monitoring", settings=settings)
 
@@ -96,20 +121,24 @@ def load_datasets():
         dataset_rg,
         "gutenberg_spacy-ner-monitoring",
         tags={
-            "description": "A dataset containing text from books with predictions from two spaCy NER pre-trained models."
-        }
+            "description": "A dataset containing text from books with predictions from two spaCy NER pre-trained "
+            "models. "
+        },
     )
 
 
-while True:
-    try:
-        response = requests.get("http://0.0.0.0:6900/")
-        if response.status_code == 200:
-            load_datasets()
-            break
-        else:
+if __name__ == "__main__":
+    while True:
+        try:
+            response = requests.get("http://0.0.0.0:6900/")
+            if response.status_code == 200:
+                load_datasets()
+                break
+        except requests.exceptions.ConnectionError:
+            pass
+        except Exception as e:
+            print(e)
             time.sleep(10)
-    except Exception as e:
-        print(e)
-        time.sleep(10)
-        pass
+            pass
+
+        time.sleep(5)
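The -74,7 +79,9 hunk shows only the tail of the queries list, so the shape the reformatted comprehension iterates over is not visible here. Based on the unchanged context (each entry pairs a list of ES DSL query terms with a label), it presumably looks something like this sketch; the query strings and labels below are illustrative stand-ins, not the ones in the file:

from argilla.labeling.text_classification import Rule

# Hypothetical stand-in for the elided `queries` list: each entry pairs
# several ES DSL query strings with the label those queries should assign.
queries = [
    (["money", "financ*", "dollar*"], "Business"),
    (["war", "gov*", "minister*"], "World"),
]

# Same comprehension as in the diff: one Rule per (term, label) pair.
rules = [
    Rule(query=term, label=label) for terms, label in queries for term in terms
]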
start.sh CHANGED
@@ -2,32 +2,43 @@
 
 set -e
 
-# Start Elasticsearch
-echo "Starting Elasticsearch"
-elasticsearch 1>/dev/null 2>/dev/null &
 whoami
 
+# Generate hashed passwords
+team_password=$(htpasswd -nbB "" "$TEAM_PASSWORD" | cut -d ":" -f 2 | tr -d "\n")
+argilla_password=$(htpasswd -nbB "" "$ARGILLA_PASSWORD" | cut -d ":" -f 2 | tr -d "\n")
+
 # Create users.yml file
 echo "Creating users schema"
 cat >"$HOME"/users.yml <<EOF
 - username: "team"
-  api_key: TEAM_API_KEY
+  api_key: $TEAM_API_KEY
   full_name: Team
   email: [email protected]
-  hashed_password: TEAM_PASSWORD
+  hashed_password: $team_password
   workspaces: []
 
 - username: "argilla"
-  api_key: ARGILLA_API_KEY
+  api_key: $ARGILLA_API_KEY
   full_name: Argilla
   email: [email protected]
-  hashed_password: ARGILLA_PASSWORD
+  hashed_password: $argilla_password
   workspaces: ["team"]
 EOF
 
+# Start Elasticsearch
+echo "Starting Elasticsearch"
+elasticsearch 1>/dev/null 2>/dev/null &
+
 echo "Waiting for elasticsearch to start"
 sleep 15
 
+# Load data
+if [ "$LOAD_DATA_ENABLE" == "true" ]; then
+    echo "Starting to load data"
+    python3.9 /load_data.py "$TEAM_API_KEY" &
+fi
+
 # Start Argilla
 echo "Starting Argilla"
 uvicorn argilla:app --host "0.0.0.0"
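start.sh now passes $TEAM_API_KEY to load_data.py as a positional argument, but the hunks above never show the script consuming it. Presumably it feeds client initialization; a hedged sketch of what the top of the script might do (rg.init with api_url/api_key is the standard argilla client call, while the argument handling here is an assumption):

import sys

import argilla as rg

# Assumption: start.sh invokes `python3.9 /load_data.py "$TEAM_API_KEY" &`,
# so the team API key arrives as the first positional argument.
api_key = sys.argv[1] if len(sys.argv) > 1 else "team.apikey"

# Point the client at the local server started in this same container.
rg.init(api_url="http://0.0.0.0:6900", api_key=api_key)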