davanstrien (HF staff) committed
Commit bbb3627 • 1 parent: 3941018

Update Dockerfile and requirements.txt

Files changed (3)
  1. Dockerfile +6 -12
  2. app.py → main.py +45 -36
  3. requirements.txt +24 -74
Dockerfile CHANGED
@@ -1,17 +1,11 @@
-# Set the base image using Python 3.12 and Debian Bookworm
-FROM python:3.12-slim-bookworm
-
-# Set the working directory to /app
-WORKDIR /app
-
-# Copy only the necessary files to the working directory
-COPY . /app
-
-# Install the requirements
-RUN pip install --no-cache-dir --upgrade -r /app/requirements.txt
-
-# Expose the port the app runs on
-EXPOSE 80
-
-# Run the app with the Litestar CLI
-CMD ["litestar", "run", "--host", "0.0.0.0", "--port", "80"]
+FROM python:3.11
+
+WORKDIR /code
+
+COPY ./requirements.txt /code/requirements.txt
+
+RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+
+COPY . .
+
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
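Note: the image now starts uvicorn directly, serving main:app on port 7860 (the default port for Docker Spaces on the Hub), and copies requirements.txt before the rest of the source so the pip install layer stays cached across code-only changes. For local debugging, a minimal programmatic equivalent of the CMD; this sketch is not part of the commit, and only "main:app" and the port are taken from the Dockerfile above (reload=True is an added dev convenience):

# run_dev.py (hypothetical helper): start the app the way the Dockerfile CMD does
import uvicorn

if __name__ == "__main__":
    # "main:app" and port 7860 mirror the CMD above; reload only makes sense locally
    uvicorn.run("main:app", host="0.0.0.0", port=7860, reload=True)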
 
app.py → main.py RENAMED
@@ -1,26 +1,18 @@
-import gradio as gr
-from httpx import Client
-import random
 import os
+import random
+from statistics import mean
+from typing import Iterator, Union, Any
 import fasttext
-from huggingface_hub import hf_hub_download
-from typing import Union
-from typing import Iterator
+import gradio as gr
 from dotenv import load_dotenv
-from toolz import groupby, valmap, concat
-from statistics import mean
-from httpx import Timeout
+from httpx import Client, Timeout
+from huggingface_hub import hf_hub_download
 from huggingface_hub.utils import logging
-from litestar import get
+from toolz import concat, groupby, valmap
+from fastapi import FastAPI
 from httpx import AsyncClient
 
-import random
-import asyncio
-import httpx
-
-# ...
-from litestar import Litestar, get
-
+app = FastAPI()
 logger = logging.get_logger(__name__)
 load_dotenv()
 HF_TOKEN = os.getenv("HF_TOKEN")
@@ -56,26 +48,35 @@ TARGET_COLUMN_NAMES = {
 
 
 def datasets_server_valid_rows(hub_id: str):
-    resp = client.get(f"{BASE_DATASETS_SERVER_URL}/is-valid?dataset={hub_id}")
-    resp.raise_for_status()
-    return resp.json()["viewer"]
-
+    try:
+        resp = client.get(f"{BASE_DATASETS_SERVER_URL}/is-valid?dataset={hub_id}")
+        return resp.json()["viewer"]
+    except Exception as e:
+        logger.error(f"Failed to get is-valid for {hub_id}: {e}")
+        return False
+
+
+async def get_first_config_and_split_name(hub_id: str):
+    try:
+        resp = await async_client.get(
+            f"https://datasets-server.huggingface.co/splits?dataset={hub_id}"
+        )
 
-def get_first_config_and_split_name(hub_id: str):
-    resp = client.get(f"https://datasets-server.huggingface.co/splits?dataset={hub_id}")
-    resp.raise_for_status()
-    data = resp.json()
-    return data["splits"][0]["config"], data["splits"][0]["split"]
+        data = resp.json()
+        return data["splits"][0]["config"], data["splits"][0]["split"]
+    except Exception as e:
+        logger.error(f"Failed to get splits for {hub_id}: {e}")
+        return None
 
 
-def get_dataset_info(hub_id: str, config: str | None = None):
+async def get_dataset_info(hub_id: str, config: str | None = None):
     if config is None:
         config = get_first_config_and_split_name(hub_id)
         if config is None:
             return None
         else:
             config = config[0]
-    resp = client.get(
+    resp = await async_client.get(
         f"{BASE_DATASETS_SERVER_URL}/info?dataset={hub_id}&config={config}"
     )
     resp.raise_for_status()
@@ -98,6 +99,8 @@ async def get_random_rows(
     for _ in range(min(max_request_calls, number_of_rows // rows_per_call)):
        offset = random.randint(0, total_length - rows_per_call)
        url = f"https://datasets-server.huggingface.co/rows?dataset={hub_id}&config={config}&split={split}&offset={offset}&length={rows_per_call}"
+       logger.info(f"Fetching {url}")
+       print(url)
        response = await async_client.get(url)
        if response.status_code == 200:
            data = response.json()
@@ -186,31 +189,30 @@ def predict_rows(rows, target_column, language_threshold_percent=0.2):
     }
 
 
-@get("/predict_language/")
+@app.get("/items/{hub_id}")
 async def predict_language(
     hub_id: str,
     config: str | None = None,
     split: str | None = None,
     max_request_calls: int = 10,
     number_of_rows: int = 1000,
-) -> dict[str, float | str]:
+) -> dict[Any, Any]:
     is_valid = datasets_server_valid_rows(hub_id)
     if not is_valid:
         gr.Error(f"Dataset {hub_id} is not accessible via the datasets server.")
     if not config:
-        config, split = get_first_config_and_split_name(hub_id)
-    info = get_dataset_info(hub_id, config)
+        config, split = await get_first_config_and_split_name(hub_id)
+    info = await get_dataset_info(hub_id, config)
     if info is None:
         gr.Error(f"Dataset {hub_id} is not accessible via the datasets server.")
     if dataset_info := info.get("dataset_info"):
         total_rows_for_split = dataset_info.get("splits").get(split).get("num_examples")
-        logger.info(f"Total rows for split {split}: {total_rows_for_split}")
         features = dataset_info.get("features")
         column_names = set(features.keys())
         logger.info(f"Column names: {column_names}")
         if not set(column_names).intersection(TARGET_COLUMN_NAMES):
             raise gr.Error(
-                f"Dataset {hub_id} does not contain any of the target columns {TARGET_COLUMN_NAMES}"
+                f"Dataset {hub_id} {column_names} is not in any of the target columns {TARGET_COLUMN_NAMES}"
             )
     for column in TARGET_COLUMN_NAMES:
         if column in column_names:
@@ -233,7 +235,8 @@ async def predict_language(
     return predictions
 
 
-app = Litestar([predict_language])
+# app_title = "Dataset Language Detection"
+# app_description = "Detect the language of a dataset on the Hub"
 # inputs = [
 #     gr.Text(label="dataset id"),
 #     gr.Textbox(
@@ -242,6 +245,12 @@ app = Litestar([predict_language])
 #     ),
 #     gr.Textbox(None, label="split"),
 # ]
-# interface = gr.Interface(predict_language, inputs=inputs, outputs="json")
+# interface = gr.Interface(
+#     predict_language,
+#     inputs=inputs,
+#     outputs="json",
+#     title=app_title,
+#     article=app_description,
+# )
 # interface.queue()
 # interface.launch()
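Note: the framework swap also moves the route: the Litestar handler at /predict_language/ becomes a FastAPI GET route at /items/{hub_id}. A minimal client sketch, assuming the server is running locally on port 7860; the dataset id is a placeholder, chosen without a namespace because the {hub_id} path parameter as declared will not match the "/" in user/dataset ids:

# client sketch (not part of the commit): call the renamed endpoint
import httpx

resp = httpx.get(
    "http://localhost:7860/items/imdb",  # "imdb" is a placeholder dataset id
    params={"number_of_rows": 500},  # optional; the handler defaults to 1000
    timeout=60.0,
)
resp.raise_for_status()
print(resp.json())  # the handler returns a dict of predictions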
requirements.txt CHANGED
@@ -6,12 +6,6 @@
 #
 aiofiles==23.2.1
     # via gradio
-aiohttp==3.9.1
-    # via
-    #   datasets
-    #   fsspec
-aiosignal==1.3.1
-    # via aiohttp
 altair==5.2.0
     # via gradio
 annotated-types==0.6.0
@@ -19,11 +13,10 @@ annotated-types==0.6.0
 anyio==4.2.0
     # via
     #   httpx
-    #   litestar
     #   starlette
+    #   watchfiles
 attrs==23.2.0
     # via
-    #   aiohttp
     #   jsonschema
     #   referencing
 certifi==2023.11.17
@@ -35,8 +28,6 @@ charset-normalizer==3.3.2
     # via requests
 click==8.1.7
     # via
-    #   litestar
-    #   rich-click
     #   typer
     #   uvicorn
 colorama==0.4.6
@@ -45,16 +36,10 @@ contourpy==1.2.0
     # via matplotlib
 cycler==0.12.1
     # via matplotlib
-datasets==2.14.4
-    # via -r requirements.in
-dill==0.3.7
-    # via
-    #   datasets
-    #   multiprocess
-faker==22.5.0
-    # via polyfactory
 fastapi==0.109.0
-    # via gradio
+    # via
+    #   -r requirements.in
+    #   gradio
 fasttext==0.9.2
     # via -r requirements.in
 ffmpy==0.3.1
@@ -63,13 +48,8 @@ filelock==3.13.1
     # via huggingface-hub
 fonttools==4.47.2
     # via matplotlib
-frozenlist==1.4.1
-    # via
-    #   aiohttp
-    #   aiosignal
-fsspec[http]==2023.12.2
+fsspec==2023.12.2
     # via
-    #   datasets
     #   gradio-client
     #   huggingface-hub
 gradio==4.15.0
@@ -82,16 +62,16 @@ h11==0.14.0
     #   uvicorn
 httpcore==1.0.2
     # via httpx
+httptools==0.6.1
+    # via uvicorn
 httpx==0.26.0
     # via
     #   -r requirements.in
     #   gradio
     #   gradio-client
-    #   litestar
 huggingface-hub==0.20.3
     # via
     #   -r requirements.in
-    #   datasets
     #   gradio
     #   gradio-client
 idna==3.6
@@ -99,11 +79,8 @@ idna==3.6
     #   anyio
     #   httpx
     #   requests
-    #   yarl
 importlib-resources==6.1.1
     # via gradio
-iso639-lang==2.2.2
-    # via -r requirements.in
 jinja2==3.1.3
     # via
     #   altair
@@ -114,8 +91,6 @@ jsonschema-specifications==2023.12.1
     # via jsonschema
 kiwisolver==1.4.5
     # via matplotlib
-litestar==2.5.1
-    # via -r requirements.in
 markdown-it-py==3.0.0
     # via rich
 markupsafe==2.1.4
@@ -126,31 +101,19 @@ matplotlib==3.8.2
     # via gradio
 mdurl==0.1.2
     # via markdown-it-py
-msgspec==0.18.6
-    # via litestar
-multidict==6.0.4
-    # via
-    #   aiohttp
-    #   litestar
-    #   yarl
-multiprocess==0.70.15
-    # via datasets
 numpy==1.26.3
     # via
     #   altair
     #   contourpy
-    #   datasets
     #   fasttext
     #   gradio
     #   matplotlib
     #   pandas
-    #   pyarrow
 orjson==3.9.12
     # via gradio
 packaging==23.2
     # via
     #   altair
-    #   datasets
     #   gradio
     #   gradio-client
     #   huggingface-hub
@@ -158,16 +121,11 @@ packaging==23.2
 pandas==2.2.0
     # via
     #   altair
-    #   datasets
     #   gradio
 pillow==10.2.0
     # via
     #   gradio
     #   matplotlib
-polyfactory==2.14.1
-    # via litestar
-pyarrow==15.0.0
-    # via datasets
 pybind11==2.11.1
     # via fasttext
 pydantic==2.5.3
@@ -184,38 +142,31 @@ pyparsing==3.1.1
     # via matplotlib
 python-dateutil==2.8.2
     # via
-    #   faker
     #   matplotlib
     #   pandas
 python-dotenv==1.0.1
-    # via -r requirements.in
+    # via
+    #   -r requirements.in
+    #   uvicorn
 python-multipart==0.0.6
     # via gradio
 pytz==2023.3.post1
     # via pandas
 pyyaml==6.0.1
     # via
-    #   datasets
     #   gradio
     #   huggingface-hub
-    #   litestar
+    #   uvicorn
 referencing==0.32.1
     # via
     #   jsonschema
     #   jsonschema-specifications
 requests==2.31.0
-    # via
-    #   datasets
-    #   fsspec
-    #   huggingface-hub
+    # via huggingface-hub
 rich==13.7.0
     # via
     #   -r requirements.in
-    #   litestar
-    #   rich-click
     #   typer
-rich-click==1.7.3
-    # via litestar
 rpds-py==0.17.1
     # via
     #   jsonschema
@@ -241,9 +192,7 @@ toolz==0.12.0
     #   -r requirements.in
     #   altair
 tqdm==4.66.1
-    # via
-    #   datasets
-    #   huggingface-hub
+    # via huggingface-hub
 typer[all]==0.9.0
     # via
     #   gradio
@@ -254,24 +203,25 @@ typing-extensions==4.9.0
     #   gradio
     #   gradio-client
     #   huggingface-hub
-    #   litestar
-    #   polyfactory
     #   pydantic
     #   pydantic-core
-    #   rich-click
     #   typer
 tzdata==2023.4
     # via pandas
 urllib3==2.1.0
     # via requests
-uvicorn==0.27.0
-    # via gradio
+uvicorn[standard]==0.27.0
+    # via
+    #   -r requirements.in
+    #   gradio
+uvloop==0.19.0
+    # via uvicorn
+watchfiles==0.21.0
+    # via uvicorn
 websockets==11.0.3
-    # via gradio-client
-xxhash==3.4.1
-    # via datasets
-yarl==1.9.4
-    # via aiohttp
+    # via
+    #   gradio-client
+    #   uvicorn
 
 # The following packages are considered to be unsafe in a requirements file:
 # setuptools
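Note: the dependency churn mirrors the framework swap: litestar and its transitive set (msgspec, polyfactory, rich-click) drop out along with the unused datasets/aiohttp stack, while fastapi and uvicorn[standard] come in; the [standard] extra is what pulls the new uvloop, httptools, and watchfiles pins and the extra python-dotenv, pyyaml, and websockets back-references. A quick sanity check of the resolved environment, as a sketch, after installing the pinned file:

# verify the key pins resolved (sketch; names taken from requirements.txt above)
from importlib.metadata import version

for pkg in ("fastapi", "uvicorn", "uvloop", "httptools", "watchfiles", "httpx"):
    print(f"{pkg}=={version(pkg)}")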