davanstrien (HF staff) committed
Commit bbb3627 • 1 parent: 3941018

Update Dockerfile and requirements.txt

Files changed (3)
  1. Dockerfile +6 -12
  2. app.py → main.py +45 -36
  3. requirements.txt +24 -74
Dockerfile CHANGED
@@ -1,17 +1,11 @@
-# Set the base image using Python 3.12 and Debian Bookworm
-FROM python:3.12-slim-bookworm
-
-# Set the working directory to /app
-WORKDIR /app
-
-# Copy only the necessary files to the working directory
-COPY . /app
-
-# Install the requirements
-RUN pip install --no-cache-dir --upgrade -r /app/requirements.txt
-
-# Expose the port the app runs on
-EXPOSE 80
-
-# Run the app with the Litestar CLI
-CMD ["litestar", "run", "--host", "0.0.0.0", "--port", "80"]
+FROM python:3.11
+
+WORKDIR /code
+
+COPY ./requirements.txt /code/requirements.txt
+
+RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+
+COPY . .
+
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
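Note: the image now starts uvicorn directly, serving main:app on port 7860 (the default port for Docker Spaces on the Hub), and copies requirements.txt before the rest of the source so the pip install layer stays cached across code-only changes. For local debugging, a minimal programmatic equivalent of the CMD; this sketch is not part of the commit, and only "main:app" and the port are taken from the Dockerfile above (reload=True is an added dev convenience):

# run_dev.py (hypothetical helper): start the app the way the Dockerfile CMD does
import uvicorn

if __name__ == "__main__":
    # "main:app" and port 7860 mirror the CMD above; reload only makes sense locally
    uvicorn.run("main:app", host="0.0.0.0", port=7860, reload=True)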
 
app.py → main.py RENAMED
@@ -1,26 +1,18 @@
-import gradio as gr
-from httpx import Client
-import random
 import os
+import random
+from statistics import mean
+from typing import Iterator, Union, Any
 import fasttext
-from huggingface_hub import hf_hub_download
-from typing import Union
-from typing import Iterator
+import gradio as gr
 from dotenv import load_dotenv
-from toolz import groupby, valmap, concat
-from statistics import mean
-from httpx import Timeout
+from httpx import Client, Timeout
+from huggingface_hub import hf_hub_download
 from huggingface_hub.utils import logging
-from litestar import get
+from toolz import concat, groupby, valmap
+from fastapi import FastAPI
 from httpx import AsyncClient
 
-import random
-import asyncio
-import httpx
-
-# ...
-from litestar import Litestar, get
-
+app = FastAPI()
 logger = logging.get_logger(__name__)
 load_dotenv()
 HF_TOKEN = os.getenv("HF_TOKEN")
@@ -56,26 +48,35 @@ TARGET_COLUMN_NAMES = {
 
 
 def datasets_server_valid_rows(hub_id: str):
-    resp = client.get(f"{BASE_DATASETS_SERVER_URL}/is-valid?dataset={hub_id}")
-    resp.raise_for_status()
-    return resp.json()["viewer"]
-
+    try:
+        resp = client.get(f"{BASE_DATASETS_SERVER_URL}/is-valid?dataset={hub_id}")
+        return resp.json()["viewer"]
+    except Exception as e:
+        logger.error(f"Failed to get is-valid for {hub_id}: {e}")
+        return False
+
+
+async def get_first_config_and_split_name(hub_id: str):
+    try:
+        resp = await async_client.get(
+            f"https://datasets-server.huggingface.co/splits?dataset={hub_id}"
+        )
 
-def get_first_config_and_split_name(hub_id: str):
-    resp = client.get(f"https://datasets-server.huggingface.co/splits?dataset={hub_id}")
-    resp.raise_for_status()
-    data = resp.json()
-    return data["splits"][0]["config"], data["splits"][0]["split"]
+        data = resp.json()
+        return data["splits"][0]["config"], data["splits"][0]["split"]
+    except Exception as e:
+        logger.error(f"Failed to get splits for {hub_id}: {e}")
+        return None
 
 
-def get_dataset_info(hub_id: str, config: str | None = None):
+async def get_dataset_info(hub_id: str, config: str | None = None):
     if config is None:
         config = get_first_config_and_split_name(hub_id)
         if config is None:
             return None
         else:
             config = config[0]
-    resp = client.get(
+    resp = await async_client.get(
         f"{BASE_DATASETS_SERVER_URL}/info?dataset={hub_id}&config={config}"
     )
     resp.raise_for_status()
@@ -98,6 +99,8 @@ async def get_random_rows(
     for _ in range(min(max_request_calls, number_of_rows // rows_per_call)):
        offset = random.randint(0, total_length - rows_per_call)
        url = f"https://datasets-server.huggingface.co/rows?dataset={hub_id}&config={config}&split={split}&offset={offset}&length={rows_per_call}"
+       logger.info(f"Fetching {url}")
+       print(url)
        response = await async_client.get(url)
        if response.status_code == 200:
            data = response.json()
@@ -186,31 +189,30 @@ def predict_rows(rows, target_column, language_threshold_percent=0.2):
     }
 
 
-@get("/predict_language/")
+@app.get("/items/{hub_id}")
 async def predict_language(
     hub_id: str,
     config: str | None = None,
     split: str | None = None,
     max_request_calls: int = 10,
     number_of_rows: int = 1000,
-) -> dict[str, float | str]:
+) -> dict[Any, Any]:
     is_valid = datasets_server_valid_rows(hub_id)
     if not is_valid:
         gr.Error(f"Dataset {hub_id} is not accessible via the datasets server.")
     if not config:
-        config, split = get_first_config_and_split_name(hub_id)
-    info = get_dataset_info(hub_id, config)
+        config, split = await get_first_config_and_split_name(hub_id)
+    info = await get_dataset_info(hub_id, config)
     if info is None:
         gr.Error(f"Dataset {hub_id} is not accessible via the datasets server.")
     if dataset_info := info.get("dataset_info"):
         total_rows_for_split = dataset_info.get("splits").get(split).get("num_examples")
-        logger.info(f"Total rows for split {split}: {total_rows_for_split}")
         features = dataset_info.get("features")
         column_names = set(features.keys())
         logger.info(f"Column names: {column_names}")
         if not set(column_names).intersection(TARGET_COLUMN_NAMES):
             raise gr.Error(
-                f"Dataset {hub_id} does not contain any of the target columns {TARGET_COLUMN_NAMES}"
+                f"Dataset {hub_id} {column_names} is not in any of the target columns {TARGET_COLUMN_NAMES}"
             )
     for column in TARGET_COLUMN_NAMES:
         if column in column_names:
@@ -233,7 +235,8 @@ async def predict_language(
     return predictions
 
 
-app = Litestar([predict_language])
+# app_title = "Dataset Language Detection"
+# app_description = "Detect the language of a dataset on the Hub"
 # inputs = [
 #     gr.Text(label="dataset id"),
 #     gr.Textbox(
@@ -242,6 +245,12 @@ app = Litestar([predict_language])
 #     ),
 #     gr.Textbox(None, label="split"),
 # ]
-# interface = gr.Interface(predict_language, inputs=inputs, outputs="json")
+# interface = gr.Interface(
+#     predict_language,
+#     inputs=inputs,
+#     outputs="json",
+#     title=app_title,
+#     article=app_description,
+# )
 # interface.queue()
 # interface.launch()
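Note: the framework swap also moves the route: the Litestar handler at /predict_language/ becomes a FastAPI GET route at /items/{hub_id}. A minimal client sketch, assuming the server is running locally on port 7860; the dataset id is a placeholder, chosen without a namespace because the {hub_id} path parameter as declared will not match the "/" in user/dataset ids:

# client sketch (not part of the commit): call the renamed endpoint
import httpx

resp = httpx.get(
    "http://localhost:7860/items/imdb",  # "imdb" is a placeholder dataset id
    params={"number_of_rows": 500},  # optional; the handler defaults to 1000
    timeout=60.0,
)
resp.raise_for_status()
print(resp.json())  # the handler returns a dict of predictions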
requirements.txt CHANGED
@@ -6,12 +6,6 @@
 #
 aiofiles==23.2.1
     # via gradio
-aiohttp==3.9.1
-    # via
-    #   datasets
-    #   fsspec
-aiosignal==1.3.1
-    # via aiohttp
 altair==5.2.0
     # via gradio
 annotated-types==0.6.0
@@ -19,11 +13,10 @@ annotated-types==0.6.0
 anyio==4.2.0
     # via
     #   httpx
-    #   litestar
     #   starlette
+    #   watchfiles
 attrs==23.2.0
     # via
-    #   aiohttp
     #   jsonschema
     #   referencing
 certifi==2023.11.17
@@ -35,8 +28,6 @@ charset-normalizer==3.3.2
     # via requests
 click==8.1.7
     # via
-    #   litestar
-    #   rich-click
     #   typer
     #   uvicorn
 colorama==0.4.6
@@ -45,16 +36,10 @@ contourpy==1.2.0
     # via matplotlib
 cycler==0.12.1
     # via matplotlib
-datasets==2.14.4
-    # via -r requirements.in
-dill==0.3.7
-    # via
-    #   datasets
-    #   multiprocess
-faker==22.5.0
-    # via polyfactory
 fastapi==0.109.0
-    # via gradio
+    # via
+    #   -r requirements.in
+    #   gradio
 fasttext==0.9.2
     # via -r requirements.in
 ffmpy==0.3.1
@@ -63,13 +48,8 @@ filelock==3.13.1
     # via huggingface-hub
 fonttools==4.47.2
     # via matplotlib
-frozenlist==1.4.1
-    # via
-    #   aiohttp
-    #   aiosignal
-fsspec[http]==2023.12.2
+fsspec==2023.12.2
     # via
-    #   datasets
     #   gradio-client
     #   huggingface-hub
 gradio==4.15.0
@@ -82,16 +62,16 @@ h11==0.14.0
     #   uvicorn
 httpcore==1.0.2
     # via httpx
+httptools==0.6.1
+    # via uvicorn
 httpx==0.26.0
     # via
     #   -r requirements.in
     #   gradio
     #   gradio-client
-    #   litestar
 huggingface-hub==0.20.3
     # via
     #   -r requirements.in
-    #   datasets
     #   gradio
     #   gradio-client
 idna==3.6
@@ -99,11 +79,8 @@ idna==3.6
     #   anyio
     #   httpx
     #   requests
-    #   yarl
 importlib-resources==6.1.1
     # via gradio
-iso639-lang==2.2.2
-    # via -r requirements.in
 jinja2==3.1.3
     # via
     #   altair
@@ -114,8 +91,6 @@ jsonschema-specifications==2023.12.1
     # via jsonschema
 kiwisolver==1.4.5
     # via matplotlib
-litestar==2.5.1
-    # via -r requirements.in
 markdown-it-py==3.0.0
     # via rich
 markupsafe==2.1.4
@@ -126,31 +101,19 @@ matplotlib==3.8.2
     # via gradio
 mdurl==0.1.2
     # via markdown-it-py
-msgspec==0.18.6
-    # via litestar
-multidict==6.0.4
-    # via
-    #   aiohttp
-    #   litestar
-    #   yarl
-multiprocess==0.70.15
-    # via datasets
 numpy==1.26.3
     # via
     #   altair
     #   contourpy
-    #   datasets
     #   fasttext
     #   gradio
     #   matplotlib
     #   pandas
-    #   pyarrow
 orjson==3.9.12
     # via gradio
 packaging==23.2
     # via
     #   altair
-    #   datasets
     #   gradio
     #   gradio-client
     #   huggingface-hub
@@ -158,16 +121,11 @@ packaging==23.2
 pandas==2.2.0
     # via
     #   altair
-    #   datasets
     #   gradio
 pillow==10.2.0
     # via
     #   gradio
     #   matplotlib
-polyfactory==2.14.1
-    # via litestar
-pyarrow==15.0.0
-    # via datasets
 pybind11==2.11.1
     # via fasttext
 pydantic==2.5.3
@@ -184,38 +142,31 @@ pyparsing==3.1.1
     # via matplotlib
 python-dateutil==2.8.2
     # via
-    #   faker
     #   matplotlib
     #   pandas
 python-dotenv==1.0.1
-    # via -r requirements.in
+    # via
+    #   -r requirements.in
+    #   uvicorn
 python-multipart==0.0.6
     # via gradio
 pytz==2023.3.post1
     # via pandas
 pyyaml==6.0.1
     # via
-    #   datasets
     #   gradio
     #   huggingface-hub
-    #   litestar
+    #   uvicorn
 referencing==0.32.1
     # via
     #   jsonschema
     #   jsonschema-specifications
 requests==2.31.0
-    # via
-    #   datasets
-    #   fsspec
-    #   huggingface-hub
+    # via huggingface-hub
 rich==13.7.0
     # via
     #   -r requirements.in
-    #   litestar
-    #   rich-click
     #   typer
-rich-click==1.7.3
-    # via litestar
 rpds-py==0.17.1
     # via
     #   jsonschema
@@ -241,9 +192,7 @@ toolz==0.12.0
     #   -r requirements.in
     #   altair
 tqdm==4.66.1
-    # via
-    #   datasets
-    #   huggingface-hub
+    # via huggingface-hub
 typer[all]==0.9.0
     # via
     #   gradio
@@ -254,24 +203,25 @@ typing-extensions==4.9.0
     #   gradio
     #   gradio-client
     #   huggingface-hub
-    #   litestar
-    #   polyfactory
     #   pydantic
     #   pydantic-core
-    #   rich-click
     #   typer
 tzdata==2023.4
     # via pandas
 urllib3==2.1.0
     # via requests
-uvicorn==0.27.0
-    # via gradio
+uvicorn[standard]==0.27.0
+    # via
+    #   -r requirements.in
+    #   gradio
+uvloop==0.19.0
+    # via uvicorn
+watchfiles==0.21.0
+    # via uvicorn
 websockets==11.0.3
-    # via gradio-client
-xxhash==3.4.1
-    # via datasets
-yarl==1.9.4
-    # via aiohttp
+    # via
+    #   gradio-client
+    #   uvicorn
 
 # The following packages are considered to be unsafe in a requirements file:
 # setuptools
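Note: the dependency churn mirrors the framework swap: litestar and its transitive set (msgspec, polyfactory, rich-click) drop out along with the unused datasets/aiohttp stack, while fastapi and uvicorn[standard] come in; the [standard] extra is what pulls the new uvloop, httptools, and watchfiles pins and the extra python-dotenv, pyyaml, and websockets back-references. A quick sanity check of the resolved environment, as a sketch, after installing the pinned file:

# verify the key pins resolved (sketch; names taken from requirements.txt above)
from importlib.metadata import version

for pkg in ("fastapi", "uvicorn", "uvloop", "httptools", "watchfiles", "httpx"):
    print(f"{pkg}=={version(pkg)}")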