musfiqdehan commited on
Commit
5fb40ca
1 Parent(s): c8b4f90

Upload app.py and requirements

Browse files
Files changed (3) hide show
  1. app.py +396 -0
  2. dumpy.py +52 -0
  3. requirements.txt +72 -0
app.py ADDED
@@ -0,0 +1,396 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from apscheduler.schedulers.background import BackgroundScheduler
2
+ import datetime
3
+ import os
4
+ from typing import Dict, Tuple
5
+ from uuid import UUID
6
+
7
+ import altair as alt
8
+ import argilla as rg
9
+ from argilla.feedback import FeedbackDataset
10
+ from argilla.client.feedback.dataset.remote.dataset import RemoteFeedbackDataset
11
+ from huggingface_hub import restart_space
12
+ import gradio as gr
13
+ import pandas as pd
14
+
15
+ """
16
+ This is the main file for the dashboard application. It contains the main function and the functions to obtain the data and create the charts.
17
+ It's designed as a template to recreate the dashboard for the prompt translation project of any language.
18
+
19
+ To create a new dashboard, you need several environment variables, that you can easily set in the HuggingFace Space that you are using to host the dashboard:
20
+
21
+ - HF_TOKEN: Token with write access from your Hugging Face account: https://huggingface.co/settings/tokens
22
+ - SOURCE_DATASET: The dataset id of the source dataset
23
+ - SOURCE_WORKSPACE: The workspace id of the source dataset
24
+ - TARGET_RECORDS: The number of records that you have as a target to annotate. We usually set this to 500.
25
+ - ARGILLA_API_URL: Link to the Huggingface Space where the annotation effort is being hosted. For example, the Spanish one is https://somosnlp-dibt-prompt-translation-for-es.hf.space/
26
+ - ARGILLA_API_KEY: The API key to access the Huggingface Space. Please, write this as a secret in the Huggingface Space configuration.
27
+ """
28
+
29
+ # Translation of legends and titles
30
+ ANNOTATED = "Annotations"
31
+ NUMBER_ANNOTATED = "Total Annotations"
32
+ PENDING = "Pending"
33
+
34
+ NUMBER_ANNOTATORS = "Number of annotators"
35
+ NAME = "Username"
36
+ NUMBER_ANNOTATIONS = "Number of annotations"
37
+
38
+ CATEGORY = "Category"
39
+
40
+
41
def restart() -> None:
    """Restart the Hugging Face Space hosting this dashboard.

    Restarting re-runs ``main()``, which refreshes the cached dataset
    statistics. This is scheduled periodically from ``main()``.
    """
    # SPACE_ID is set automatically by Hugging Face inside a running Space;
    # fall back to the original hard-coded template Space name when unset,
    # so existing deployments keep their behavior.
    space_id = os.getenv("SPACE_ID", "ignacioct/TryingRestartDashboard")
    gr.Info("Restarting space at " + str(datetime.datetime.now()))
    restart_space(
        space_id,
        token=os.getenv("HF_TOKEN"),
        # factory_reboot=True,
    )
53
+
54
+
55
def obtain_source_target_datasets() -> (
    Tuple[
        FeedbackDataset | RemoteFeedbackDataset, FeedbackDataset | RemoteFeedbackDataset
    ]
):
    """Fetch the annotation dataset from Argilla and split it by status.

    Returns:
        A ``(pending, submitted)`` tuple: the records still waiting for an
        annotation, and the records that already received one.
    """
    # Both views are filters over the same public dataset, identified by the
    # SOURCE_DATASET / SOURCE_WORKSPACE environment variables.
    dataset = rg.FeedbackDataset.from_argilla(
        os.getenv("SOURCE_DATASET"), workspace=os.getenv("SOURCE_WORKSPACE")
    )
    pending = dataset.filter_by(response_status=["pending"])
    submitted = dataset.filter_by(response_status=["submitted"])
    return pending, submitted
82
+
83
+
84
def get_user_annotations_dictionary(
    dataset: FeedbackDataset | RemoteFeedbackDataset,
) -> Dict[str, int]:
    """Count how many annotations each user has submitted.

    Args:
        dataset: The dataset whose records' responses are tallied.

    Returns:
        A dictionary mapping Argilla usernames to their annotation count.
    """
    # Tally responses per user id. dict.get avoids the original
    # `if key not in d.keys()` double-lookup anti-pattern.
    counts: Dict[str, int] = {}
    for record in dataset:
        for response in record.responses:
            user_id = str(response.user_id)
            counts[user_id] = counts.get(user_id, 0) + 1

    # Replace raw user ids with human-readable usernames.
    # NOTE(review): one rg.User.from_id() call per distinct annotator — each is
    # presumably a network round-trip; fine for small annotator counts.
    return {
        rg.User.from_id(UUID(user_id)).username: count
        for user_id, count in counts.items()
    }
108
+
109
+
110
def donut_chart_total() -> alt.Chart:
    """Build a donut chart of annotated vs. pending record counts.

    Counts each record that has been annotated at least once, using the
    module-level ``target_dataset`` and the TARGET_RECORDS env var.

    Returns:
        A layered altair chart: donut arcs plus value labels.
    """
    completed = len(target_dataset)
    remaining = int(os.getenv("TARGET_RECORDS")) - completed

    # Blue slice for completed work, orange for what is left.
    frame = pd.DataFrame(
        {
            "values": [completed, remaining],
            "category": [ANNOTATED, PENDING],
            "colors": ["#4682b4", "#e68c39"],
        }
    )

    base = alt.Chart(frame).encode(
        theta=alt.Theta("values:Q", stack=True),
        radius=alt.Radius(
            "values", scale=alt.Scale(type="sqrt", zero=True, rangeMin=20)
        ),
        color=alt.Color(
            field="category",
            type="nominal",
            scale=alt.Scale(
                domain=frame["category"].tolist(), range=frame["colors"].tolist()
            ),
            legend=alt.Legend(title=CATEGORY),
        ),
    )

    arcs = base.mark_arc(innerRadius=20, stroke="#fff")
    labels = base.mark_text(radiusOffset=20).encode(text="values:Q")
    return arcs + labels
158
+
159
+
160
def kpi_chart_remaining() -> alt.Chart:
    """Build a KPI card showing how many records remain to be annotated.

    Returns:
        An altair text chart displaying the pending-record count.
    """
    remaining = int(os.getenv("TARGET_RECORDS")) - len(target_dataset)
    frame = pd.DataFrame({"Category": [PENDING], "Value": [remaining]})

    # Single large orange number, titled with the PENDING label.
    return (
        alt.Chart(frame)
        .mark_text(fontSize=100, align="center", baseline="middle", color="#e68b39")
        .encode(text="Value:N")
        .properties(title=PENDING, width=250, height=200)
    )
180
+
181
+
182
def kpi_chart_submitted() -> alt.Chart:
    """Build a KPI card showing the total number of annotated records.

    Returns:
        An altair text chart displaying the annotated-record count.
    """
    submitted = len(target_dataset)
    frame = pd.DataFrame({"Category": [NUMBER_ANNOTATED], "Value": [submitted]})

    # Single large blue number, titled with the NUMBER_ANNOTATED label.
    return (
        alt.Chart(frame)
        .mark_text(fontSize=100, align="center", baseline="middle", color="steelblue")
        .encode(text="Value:N")
        .properties(title=NUMBER_ANNOTATED, width=250, height=200)
    )
203
+
204
+
205
def kpi_chart_total_annotators() -> alt.Chart:
    """Build a KPI card showing the total number of annotators.

    Returns:
        An altair text chart displaying the annotator count.
    """
    # One entry per distinct annotator in the module-level counts dict.
    annotators = len(user_ids_annotations)
    frame = pd.DataFrame({"Category": [NUMBER_ANNOTATORS], "Value": [annotators]})

    # Single large blue number, titled with the NUMBER_ANNOTATORS label.
    return (
        alt.Chart(frame)
        .mark_text(fontSize=100, align="center", baseline="middle", color="steelblue")
        .encode(text="Value:N")
        .properties(title=NUMBER_ANNOTATORS, width=250, height=200)
    )
228
+
229
+
230
def render_hub_user_link(hub_id: str) -> str:
    """Render a Hugging Face username as an HTML profile link.

    Args:
        hub_id: The user's id on Hugging Face.

    Returns:
        An HTML anchor tag pointing at the user's profile page.
    """
    profile_url = f"https://huggingface.co/{hub_id}"
    return (
        f'<a target="_blank" href="{profile_url}" '
        'style="color: var(--link-text-color); text-decoration: underline;'
        f'text-decoration-style: dotted;">{hub_id}</a>'
    )
242
+
243
+
244
def obtain_top_users(user_ids_annotations: Dict[str, int], N: int = 50) -> pd.DataFrame:
    """Rank annotators by contribution count.

    Args:
        user_ids_annotations: Mapping of username to number of annotations.
        N: How many of the most active users to keep (50 by default).

    Returns:
        A DataFrame of the top N users, usernames rendered as profile links.
    """
    ranking = pd.DataFrame(
        user_ids_annotations.items(), columns=[NAME, NUMBER_ANNOTATIONS]
    ).sort_values(by=NUMBER_ANNOTATIONS, ascending=False)
    # Turn plain usernames into clickable Hugging Face profile links.
    ranking[NAME] = ranking[NAME].apply(render_hub_user_link)
    return ranking.head(N)
261
+
262
+
263
def fetch_data() -> None:
    """Refresh the module-level dataset statistics from Argilla.

    Updates the globals the chart functions read: the source/target datasets,
    the per-user annotation counts, and the progress counters.
    """
    print(f"Starting to fetch data: {datetime.datetime.now()}")

    global source_dataset, target_dataset, user_ids_annotations, annotated, remaining, percentage_completed, top_dataframe
    source_dataset, target_dataset = obtain_source_target_datasets()
    user_ids_annotations = get_user_annotations_dictionary(target_dataset)

    # Parse the annotation target once (the original parsed the env var twice).
    target_records = int(os.getenv("TARGET_RECORDS"))
    annotated = len(target_dataset)
    remaining = target_records - annotated
    percentage_completed = round((annotated / target_records) * 100, 1)

    # Print the completion time so the Space logs show how long the fetch took.
    print(f"Data fetched: {datetime.datetime.now()}")
282
+
283
+
284
def get_top(N=50) -> pd.DataFrame:
    """Return the top N annotators from the cached global counts.

    Args:
        N: The number of users to return. 50 by default.

    Returns:
        A pandas DataFrame with the top N users and their annotation counts.
    """
    # Thin wrapper so Gradio can call the leaderboard without arguments.
    top_users = obtain_top_users(user_ids_annotations, N=N)
    return top_users
296
+
297
+
298
def main() -> None:
    """Entry point: connect to Argilla, fetch stats, and serve the dashboard."""

    # Connect to the space with rg.init()
    rg.init(
        api_url=os.getenv("ARGILLA_API_URL"),
        api_key=os.getenv("ARGILLA_API_KEY"),
    )

    # Fetch the data initially
    fetch_data()

    # To avoid the orange border for the Gradio elements that are in constant loading
    css = """
    .generating {
        border: none;
    }
    """

    with gr.Blocks(css=css, delete_cache=(300, 300)) as demo:
        # Intro section. The [YOUR LANGUAGE] placeholders are meant to be
        # replaced when this template is cloned for a specific language.
        gr.Markdown(
            """
            # 🌍 [YOUR LANGUAGE] - Multilingual Prompt Evaluation Project

            Hugging Face and @argilla are developing [Multilingual Prompt Evaluation Project](https://github.com/huggingface/data-is-better-together/tree/main/prompt_translation) project. It is an open multilingual benchmark for evaluating language models, and of course, also for [YOUR LANGUAGE].

            ## The goal is to translate 500 Prompts
            And as always: data is needed for that! The community selected the best 500 prompts that will form the benchmark. In English, of course.
            **That's why we need your help**: if we all translate the 500 prompts, we can add [YOUR LANGUAGE] to the leaderboard.

            ## How to participate
            Participating is easy. Go to the [annotation space][add a link to your annotation dataset], log in or create a Hugging Face account, and you can start working.
            Thanks in advance! Oh, and we'll give you a little push: GPT4 has already prepared a translation suggestion for you.
            """
        )

        gr.Markdown(
            f"""
            ## 🚀 Current Progress
            This is what we've achieved so far!
            """
        )
        # Progress row: annotated KPI, remaining KPI, and the donut chart.
        # Each chart is produced on page load from the cached globals.
        with gr.Row():

            kpi_submitted_plot = gr.Plot(label="Plot")
            demo.load(
                kpi_chart_submitted,
                inputs=[],
                outputs=[kpi_submitted_plot],
            )

            kpi_remaining_plot = gr.Plot(label="Plot")
            demo.load(
                kpi_chart_remaining,
                inputs=[],
                outputs=[kpi_remaining_plot],
            )

            donut_total_plot = gr.Plot(label="Plot")
            demo.load(
                donut_chart_total,
                inputs=[],
                outputs=[donut_total_plot],
            )

        gr.Markdown(
            """
            ## 👾 Hall of Fame
            Here you can see the top contributors and the number of annotations they have made.
            """
        )

        # Leaderboard row: annotator-count KPI plus the top-50 table.
        with gr.Row():

            kpi_hall_plot = gr.Plot(label="Plot")
            demo.load(kpi_chart_total_annotators, inputs=[], outputs=[kpi_hall_plot])

            # "markdown" datatype renders the HTML profile links from
            # render_hub_user_link as clickable cells.
            top_df_plot = gr.Dataframe(
                headers=[NAME, NUMBER_ANNOTATIONS],
                datatype=[
                    "markdown",
                    "number",
                ],
                row_count=50,
                col_count=(2, "fixed"),
                interactive=False,
            )
            demo.load(get_top, None, [top_df_plot])

    # Manage background refresh: restarting the Space every 30 minutes
    # re-runs main() and therefore re-fetches the data.
    scheduler = BackgroundScheduler()
    _ = scheduler.add_job(restart, "interval", minutes=30)
    scheduler.start()

    # Launch the Gradio interface
    demo.launch()
393
+
394
+
395
+ if __name__ == "__main__":
396
+ main()
dumpy.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ import os
4
+
5
+ import argilla as rg
6
+ from huggingface_hub import HfApi
7
+
8
# One-off helper script: computes per-user annotation counts for the
# "submitted" records of the source dataset and uploads them as stats.json
# to a Hugging Face Space repo.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

if __name__ == "__main__":
    logger.info("*** Initializing Argilla session ***")
    rg.init(
        api_url=os.getenv("ARGILLA_API_URL"),
        api_key=os.getenv("ARGILLA_API_KEY"),
        # Extra bearer token lets Argilla reach a private Hugging Face Space.
        extra_headers={"Authorization": f"Bearer {os.getenv('HF_TOKEN')}"},
    )

    logger.info("*** Fetching dataset from Argilla ***")
    dataset = rg.FeedbackDataset.from_argilla(
        os.getenv("SOURCE_DATASET"),
        workspace=os.getenv("SOURCE_WORKSPACE"),
    )
    logger.info("*** Filtering records by `response_status` ***")
    dataset = dataset.filter_by(response_status=["submitted"])  # type: ignore

    logger.info("*** Calculating users and annotation count ***")
    # Tally responses per user id across all submitted records.
    output = {}
    for record in dataset.records:
        for response in record.responses:
            if response.user_id not in output:
                output[response.user_id] = 0
            output[response.user_id] += 1

    # Replace raw user ids with usernames (one Argilla lookup per annotator).
    for key in list(output.keys()):
        output[rg.User.from_id(key).username] = output.pop(key)

    logger.info("*** Users and annotation count successfully calculated! ***")

    logger.info("*** Dumping Python dict into `stats.json` ***")
    with open("stats.json", "w") as file:
        json.dump(output, file, indent=4)

    logger.info("*** Uploading `stats.json` to Hugging Face Hub ***")
    # NOTE(review): repo_id is hard-coded to the DIBT dashboard Space —
    # confirm this is the intended destination when cloning the template.
    api = HfApi(token=os.getenv("HF_TOKEN"))
    api.upload_file(
        path_or_fileobj="stats.json",
        path_in_repo="stats.json",
        repo_id="DIBT/prompt-collective-dashboard",
        repo_type="space",
    )
    logger.info("*** `stats.json` successfully uploaded to Hugging Face Hub! ***")
requirements.txt ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiofiles
2
+ altair
3
+ annotated-types
4
+ anyio
5
+ apscheduler
6
+ argilla
7
+ attrs
8
+ backoff
9
+ certifi
10
+ charset-normalizer
11
+ click
12
+ colorama
13
+ contourpy
14
+ cycler
15
+ Deprecated
16
+ exceptiongroup
17
+ fastapi
18
+ ffmpy
19
+ filelock
20
+ fonttools
21
+ fsspec
22
+ gradio
23
+ gradio_client
24
+ h11
25
+ httpcore
26
+ httpx
27
+ huggingface-hub
28
+ idna
29
+ importlib-resources
30
+ Jinja2
31
+ jsonschema
32
+ jsonschema-specifications
33
+ kiwisolver
34
+ markdown-it-py
35
+ MarkupSafe
36
+ matplotlib
37
+ mdurl
38
+ monotonic
39
+ numpy
40
+ orjson
41
+ packaging
42
+ pandas
43
+ pillow
44
+ pydantic
45
+ pydantic_core
46
+ pydub
47
+ Pygments
48
+ pyparsing
49
+ python-dateutil
50
+ python-multipart
51
+ pytz
52
+ PyYAML
53
+ referencing
54
+ requests
55
+ rich
56
+ rpds-py
57
+ ruff
58
+ semantic-version
59
+ shellingham
60
+ six
61
+ sniffio
62
+ starlette
63
+ tomlkit
64
+ toolz
65
+ tqdm
66
+ typer
67
+ typing_extensions
68
+ urllib3
69
+ uvicorn
70
+ vega-datasets
71
+ websockets
72
+ wrapt