Spaces:

klasocki
/

comma-fixer

Sleeping

App Files Community

klasocki committed on Aug 19, 2023

Commit

f42ec01

•

1 Parent(s): a5fed35

Migrate to FastAPI from Flask, Docker works

Browse files

Files changed (8) hide show

.dockerignore +4 -0
Dockerfile +4 -3
app.py +16 -13
docker-compose.yml +23 -21
requirements.txt +7 -5
src/baseline.py +17 -9
tests/test_baseline.py +7 -7
tests/test_integration.py +6 -10

.dockerignore CHANGED Viewed

@@ -1,3 +1,7 @@
 .idea
 data/
 .pytest_cache

 .idea
 data/
 .pytest_cache
+.gitignore
+README.txt
+openapi.yaml

Dockerfile CHANGED Viewed

@@ -5,9 +5,10 @@ WORKDIR /comma-fixer
 COPY requirements.txt .
 RUN pip install -r requirements.txt
-COPY . .
-COPY ~/.cache/huggingface/hub/models--oliverguhr--fullstop-punctuation-multilang-large/ ~/.cache/huggingface/hub/models--oliverguhr--fullstop-punctuation-multilang-large/
 EXPOSE 8000
-#CMD gunicorn "app:app"

 COPY requirements.txt .
 RUN pip install -r requirements.txt
+COPY src/baseline.py src/baseline.py
+RUN python src/baseline.py  # This pre-downloads models and tokenizers
+COPY . .
 EXPOSE 8000
+CMD uvicorn "app:app" --port 8000 --host "0.0.0.0"

app.py CHANGED Viewed

@@ -1,32 +1,35 @@
-from flask import Flask, request, jsonify, make_response
-from src.baseline import fix_commas, create_baseline_pipeline
 import logging
 logger = logging.Logger(__name__)
 logging.basicConfig(level=logging.INFO)
-app = Flask(__name__)
-logging.info('Loading the baseline model...')
-app.baseline_pipeline = create_baseline_pipeline()
-@app.route('/', methods=['GET'])
-def root():
     return ("Welcome to the comma fixer. Send a POST request to /fix-commas or /baseline/fix-commas with a string "
             "'s' in the JSON body to try "
             "out the functionality.")
-@app.route('/baseline/fix-commas/', methods=['POST'])
-def fix_commas_with_baseline():
     json_field_name = 's'
-    data = request.get_json()
     if json_field_name in data:
-        return make_response(jsonify({json_field_name: fix_commas(app.baseline_pipeline, data['s'])}), 200)
     else:
-        return make_response(f"Parameter '{json_field_name}' missing", 400)
 if __name__ == '__main__':
-    app.run(debug=True)

+import uvicorn
+from fastapi import FastAPI, HTTPException
+from src.baseline import BaselineCommaFixer
 import logging
 logger = logging.Logger(__name__)
 logging.basicConfig(level=logging.INFO)
+app = FastAPI() #TODO router?
+logger.info('Loading the baseline model...')
+app.baseline_model = BaselineCommaFixer()
+@app.get('/')
+async def root():
     return ("Welcome to the comma fixer. Send a POST request to /fix-commas or /baseline/fix-commas with a string "
             "'s' in the JSON body to try "
             "out the functionality.")
+@app.post('/baseline/fix-commas/')
+async def fix_commas_with_baseline(data: dict):
     json_field_name = 's'
     if json_field_name in data:
+        logger.debug('Fixing commas.')
+        return {json_field_name: app.baseline_model.fix_commas(data['s'])}
     else:
+        msg = f"Text '{json_field_name}' missing"
+        logger.debug(msg)
+        raise HTTPException(status_code=400, detail=msg)
 if __name__ == '__main__':
+    uvicorn.run("app:app", reload=True, port=8000)

docker-compose.yml CHANGED Viewed

@@ -1,28 +1,30 @@
 services:
-  nginx:
-    image: nginx:latest
-    container_name: nginx
-    volumes:
-      - ./:/comma-fixer
-      - ./nginx.conf:/etc/nginx/conf.d/default.conf
-    ports:
-      - 8001:80
-    networks:
-      - my-network
-    depends_on:
-      - flask
-  flask:
     build:
       context: ./
       dockerfile: Dockerfile
     container_name: comma-fixer
-    command: gunicorn --bind 0.0.0.0:8000 "app:app" --timeout 300 #--workers 4
     volumes:
       - ./:/comma-fixer
-    networks:
-      my-network:
-        aliases:
-          - flask-app
-networks:
-  my-network:

+version: '3.1'
 services:
+#  nginx:
+#    image: nginx:latest
+#    container_name: nginx
+#    volumes:
+#      - ./:/comma-fixer
+#      - ./nginx.conf:/etc/nginx/conf.d/default.conf
+#    ports:
+#      - 8001:80
+#    networks:
+#      - my-network
+#    depends_on:
+#      - flask
+  comma-fixer:
     build:
       context: ./
       dockerfile: Dockerfile
     container_name: comma-fixer
+    command: uvicorn --host 0.0.0.0 --port 8000 "app:app"
     volumes:
       - ./:/comma-fixer
+#    networks:
+#      my-network:
+#        aliases:
+#          - comma-fixer
+#
+#networks:
+#  my-network:

requirements.txt CHANGED Viewed

@@ -1,9 +1,11 @@
-flask == 2.2.2
-gunicorn == 21.2.0
 pytest
-torch == 2.0.1
-transformers == 4.31.0
 # for the tokenizer of the baseline model
-protobuf == 4.24.0
 sentencepiece==0.1.99

+fastapi==0.101.1
+gunicorn==21.2.0
+uvicorn==0.23.2
 pytest
+httpx
+torch==2.0.1
+transformers==4.31.0
 # for the tokenizer of the baseline model
+protobuf==4.24.0
 sentencepiece==0.1.99

src/baseline.py CHANGED Viewed

@@ -1,19 +1,23 @@
 from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline, NerPipeline
-def create_baseline_pipeline(model_name="oliverguhr/fullstop-punctuation-multilang-large") -> NerPipeline:
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     model = AutoModelForTokenClassification.from_pretrained(model_name)
     return pipeline('ner', model=model, tokenizer=tokenizer)
-def fix_commas(ner_pipeline: NerPipeline, s: str) -> str:
-    return _fix_commas_based_on_pipeline_output(
-        ner_pipeline(_remove_punctuation(s)),
-        s
-    )
 def _remove_punctuation(s: str) -> str:
     to_remove = ".,?-:"
     for char in to_remove:
@@ -29,7 +33,7 @@ def _fix_commas_based_on_pipeline_output(pipeline_json: list[dict], original_s:
         current_offset = _find_current_token(current_offset, i, pipeline_json, result)
         if _should_insert_comma(i, pipeline_json):
             result = result[:current_offset] + ',' + result[current_offset:]
-            current_offset += 1
     return result
@@ -43,3 +47,7 @@ def _find_current_token(current_offset, i, pipeline_json, result, new_word_indic
     # Find the current word in the result string, starting looking at current offset
     current_offset = result.find(current_word, current_offset) + len(current_word)
     return current_offset

 from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline, NerPipeline
+class BaselineCommaFixer:
+    def __init__(self):
+        self._ner = _create_baseline_pipeline()
+    def fix_commas(self, s: str) -> str:
+        return _fix_commas_based_on_pipeline_output(
+            self._ner(_remove_punctuation(s)),
+            s
+        )
+def _create_baseline_pipeline(model_name="oliverguhr/fullstop-punctuation-multilang-large") -> NerPipeline:
     tokenizer = AutoTokenizer.from_pretrained(model_name)
     model = AutoModelForTokenClassification.from_pretrained(model_name)
     return pipeline('ner', model=model, tokenizer=tokenizer)
 def _remove_punctuation(s: str) -> str:
     to_remove = ".,?-:"
     for char in to_remove:
         current_offset = _find_current_token(current_offset, i, pipeline_json, result)
         if _should_insert_comma(i, pipeline_json):
             result = result[:current_offset] + ',' + result[current_offset:]
+        current_offset += 1
     return result
     # Find the current word in the result string, starting looking at current offset
     current_offset = result.find(current_word, current_offset) + len(current_word)
     return current_offset
+if __name__ == "__main__":
+    BaselineCommaFixer()  # to pre-download the model and tokenizer

tests/test_baseline.py CHANGED Viewed

@@ -1,10 +1,10 @@
 import pytest
-from baseline import create_baseline_pipeline, fix_commas, _remove_punctuation
 @pytest.fixture()
-def baseline_pipeline():
-    yield create_baseline_pipeline()
 @pytest.mark.parametrize(
@@ -14,8 +14,8 @@ def baseline_pipeline():
      'This test string should not have any commas inside it.',
      'aAaalLL the.. weird?~! punctuation.should also . be kept-as is! Only fixing-commas.']
 )
-def test_fix_commas_leaves_correct_strings_unchanged(baseline_pipeline, test_input):
-    result = fix_commas(baseline_pipeline, s=test_input)
     assert result == test_input
@@ -32,8 +32,8 @@ def test_fix_commas_leaves_correct_strings_unchanged(baseline_pipeline, test_inp
         ['I had no Creativity left, therefore, I come here, and write useless examples, for this test.',
          'I had no Creativity left therefore, I come here and write useless examples for this test.']]
 )
-def test_fix_commas_fixes_incorrect_commas(baseline_pipeline, test_input, expected):
-    result = fix_commas(baseline_pipeline, s=test_input)
     assert result == expected

 import pytest
+from baseline import BaselineCommaFixer, _remove_punctuation
 @pytest.fixture()
+def baseline_fixer():
+    yield BaselineCommaFixer()
 @pytest.mark.parametrize(
      'This test string should not have any commas inside it.',
      'aAaalLL the.. weird?~! punctuation.should also . be kept-as is! Only fixing-commas.']
 )
+def test_fix_commas_leaves_correct_strings_unchanged(baseline_fixer, test_input):
+    result = baseline_fixer.fix_commas(s=test_input)
     assert result == test_input
         ['I had no Creativity left, therefore, I come here, and write useless examples, for this test.',
          'I had no Creativity left therefore, I come here and write useless examples for this test.']]
 )
+def test_fix_commas_fixes_incorrect_commas(baseline_fixer, test_input, expected):
+    result = baseline_fixer.fix_commas(s=test_input)
     assert result == expected

tests/test_integration.py CHANGED Viewed

@@ -1,21 +1,17 @@
-from flask import json
 import pytest
 from app import app
-from baseline import create_baseline_pipeline
 @pytest.fixture()
 def client():
-    app.config["DEBUG"] = True
-    app.config["TESTING"] = True
-    app.baseline_pipeline = create_baseline_pipeline()
-    yield app.test_client()
 def test_fix_commas_fails_on_no_parameter(client):
     response = client.post('/baseline/fix-commas/')
-    assert response.status_code == 400
 def test_fix_commas_fails_on_wrong_parameters(client):
@@ -33,7 +29,7 @@ def test_fix_commas_correct_string_unchanged(client, test_input: str):
     response = client.post('/baseline/fix-commas/', json={'s': test_input})
     assert response.status_code == 200
-    assert response.get_json().get('s') == test_input
 @pytest.mark.parametrize(
@@ -46,7 +42,7 @@ def test_fix_commas_fixes_wrong_commas(client, test_input: str, expected: str):
     response = client.post('/baseline/fix-commas/', json={'s': test_input})
     assert response.status_code == 200
-    assert response.get_json().get('s') == expected
 def test_with_a_very_long_string(client):
@@ -54,4 +50,4 @@ def test_with_a_very_long_string(client):
     response = client.post('/baseline/fix-commas/', json={'s': s})
     assert response.status_code == 200
-    assert response.get_json().get('s') == s

+from fastapi.testclient import TestClient
 import pytest
 from app import app
 @pytest.fixture()
 def client():
+    yield TestClient(app)
 def test_fix_commas_fails_on_no_parameter(client):
     response = client.post('/baseline/fix-commas/')
+    assert response.status_code == 422
 def test_fix_commas_fails_on_wrong_parameters(client):
     response = client.post('/baseline/fix-commas/', json={'s': test_input})
     assert response.status_code == 200
+    assert response.json().get('s') == test_input
 @pytest.mark.parametrize(
     response = client.post('/baseline/fix-commas/', json={'s': test_input})
     assert response.status_code == 200
+    assert response.json().get('s') == expected
 def test_with_a_very_long_string(client):
     response = client.post('/baseline/fix-commas/', json={'s': s})
     assert response.status_code == 200
+    assert response.json().get('s') == s