eloukas committed on
Commit
7928a0f
1 Parent(s): d48b193
Files changed (3) hide show
  1. Dockerfile +16 -0
  2. app.py +404 -0
  3. requirements.txt +4 -0
Dockerfile ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
2
+ # you will also find guides on how best to write your Dockerfile
3
+
4
+ FROM python:3.9
5
+
6
+ RUN useradd -m -u 1000 user
7
+ USER user
8
+ ENV PATH="/home/user/.local/bin:$PATH"
9
+
10
+ WORKDIR /app
11
+
12
+ COPY --chown=user ./requirements.txt requirements.txt
13
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
14
+
15
+ COPY --chown=user . /app
16
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,404 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List
2
+
3
+ from fastapi import FastAPI, HTTPException, Query
4
+ from fastapi.responses import RedirectResponse
5
+ from gr_nlp_toolkit import Pipeline
6
+ from pydantic import BaseModel, Field
7
+
8
# Contact details shown in the generated OpenAPI document.
_API_CONTACT = {
    "name": "Natural Language Processing Group - Athens University of Economics and Business (AUEB)",
    "url": "http://nlp.cs.aueb.gr/",
    "api_author": "Lefteris Loukas",
}

# The FastAPI application object; route decorators below register on it.
app = FastAPI(
    title="The Grεεk NLP API 🇬🇷",
    description=(
        "State-of-the-art API for Greek NLP tasks including Greeklish to Greek "
        "conversion (G2G), Named Entity Recognition (NER), Part-of-Speech (POS) "
        "tagging, and Dependency Parsing (DP). Powered by the Grεεk NLP Toolkit, "
        "available on PyPI (`pip install gr-nlp-toolkit`)."
    ),
    version="1.0.0",
    contact=_API_CONTACT,
)

# One pipeline with every processor enabled, built once at import time and
# reused by all endpoints.
nlp_pos_ner_dp_with_g2g = Pipeline("pos,ner,dp,g2g")
21
+
22
+
23
+ # Pydantic models for responses
24
class G2GOutput(BaseModel):
    # Response body of the /g2g endpoint: the transliterated sentence.
    # Documented with `#` comments so the generated schema gains no extra
    # model description.
    greek_text: str = Field(
        ...,
        description="Converted Greek text",
        # Pydantic v2 deprecates free-form Field keywords such as `example=`;
        # json_schema_extra emits the same "example" key in the schema.
        json_schema_extra={"example": "η θεσσαλονικη ειναι ωραια πολη"},
    )
30
+
31
+
32
class NERItem(BaseModel):
    # One token paired with its named-entity tag.
    # `example` is passed through json_schema_extra because Pydantic v2
    # deprecates arbitrary keyword arguments on Field; the schema is unchanged.
    token: str = Field(..., json_schema_extra={"example": "αργεντινη"})
    ner_value: str = Field(..., json_schema_extra={"example": "S-ORG"})
35
+
36
+
37
class POSItem(BaseModel):
    # One token with its UPOS tag and a feature-name -> value mapping.
    # `example` is passed through json_schema_extra because Pydantic v2
    # deprecates arbitrary keyword arguments on Field; the schema is unchanged.
    token: str = Field(..., json_schema_extra={"example": "μου"})
    upos: str = Field(..., json_schema_extra={"example": "PRON"})
    morphological_features: Dict[str, str] = Field(
        ...,
        json_schema_extra={
            "example": {
                "Case": "Gen",
                "Gender": "Masc",
                "Number": "Sing",
                "Person": "1",
                "Poss": "_",
                "PronType": "Prs",
            }
        },
    )
51
+
52
+
53
class POSResponse(BaseModel):
    # Response body of the /pos endpoint: one POSItem per token.
    # The sample payload is attached via json_schema_extra because Pydantic v2
    # deprecates arbitrary keyword arguments (such as `example=`) on Field;
    # the emitted schema key is the same.
    pos_results: List[POSItem] = Field(
        ...,
        description="Part-of-Speech tagging information",
        json_schema_extra={
            "example": [
                {
                    "token": "μου",
                    "upos": "PRON",
                    "morphological_features": {
                        "Case": "Gen",
                        "Gender": "Masc",
                        "Number": "Sing",
                        "Person": "1",
                        "Poss": "_",
                        "PronType": "Prs",
                    },
                },
                {
                    "token": "αρεσει",
                    "upos": "VERB",
                    "morphological_features": {
                        "Aspect": "Imp",
                        "Case": "_",
                        "Gender": "_",
                        "Mood": "Ind",
                        "Number": "Sing",
                        "Person": "3",
                        "Tense": "Pres",
                        "VerbForm": "Fin",
                        "Voice": "Act",
                    },
                },
                {
                    "token": "να",
                    "upos": "AUX",
                    "morphological_features": {
                        "Aspect": "_",
                        "Mood": "_",
                        "Number": "_",
                        "Person": "_",
                        "Tense": "_",
                        "VerbForm": "_",
                        "Voice": "_",
                    },
                },
                {
                    "token": "διαβαζω",
                    "upos": "VERB",
                    "morphological_features": {
                        "Aspect": "Imp",
                        "Case": "_",
                        "Gender": "_",
                        "Mood": "Ind",
                        "Number": "Sing",
                        "Person": "1",
                        "Tense": "Pres",
                        "VerbForm": "Fin",
                        "Voice": "Act",
                    },
                },
                {
                    "token": "τα",
                    "upos": "DET",
                    "morphological_features": {
                        "Case": "Acc",
                        "Definite": "Def",
                        "Gender": "Neut",
                        "Number": "Plur",
                        "PronType": "Art",
                    },
                },
                {
                    "token": "post",
                    "upos": "X",
                    "morphological_features": {"Foreign": "Yes"},
                },
                {
                    "token": "του",
                    "upos": "DET",
                    "morphological_features": {
                        "Case": "Gen",
                        "Definite": "Def",
                        "Gender": "Masc",
                        "Number": "Sing",
                        "PronType": "Art",
                    },
                },
                {
                    "token": "andrew",
                    "upos": "X",
                    "morphological_features": {"Foreign": "Yes"},
                },
                {
                    "token": "ng",
                    "upos": "X",
                    "morphological_features": {"Foreign": "Yes"},
                },
                {"token": "στο", "upos": "_", "morphological_features": {}},
                {
                    "token": "twitter",
                    "upos": "X",
                    "morphological_features": {"Foreign": "Yes"},
                },
                {"token": ".", "upos": "PUNCT", "morphological_features": {}},
            ]
        },
    )
159
+
160
+
161
class DPItem(BaseModel):
    # One token with the index of its syntactic head and the relation label.
    # `example` is passed through json_schema_extra because Pydantic v2
    # deprecates arbitrary keyword arguments on Field; the schema is unchanged.
    token: str = Field(..., json_schema_extra={"example": "προτιμω"})
    head: int = Field(..., json_schema_extra={"example": 0})
    deprel: str = Field(..., json_schema_extra={"example": "root"})
165
+
166
+
167
class DPResponse(BaseModel):
    # Response body of the /dp endpoint: one DPItem per token.
    # The sample payload is attached via json_schema_extra because Pydantic v2
    # deprecates arbitrary keyword arguments (such as `example=`) on Field;
    # the emitted schema key is the same.
    dp_results: List[DPItem] = Field(
        ...,
        description="Dependency Parsing information",
        json_schema_extra={
            "example": [
                {"token": "προτιμω", "head": 0, "deprel": "root"},
                {"token": "την", "head": 4, "deprel": "det"},
                {"token": "πρωινη", "head": 4, "deprel": "amod"},
                {"token": "πτηση", "head": 1, "deprel": "obj"},
                {"token": "απο", "head": 7, "deprel": "case"},
                {"token": "την", "head": 7, "deprel": "det"},
                {"token": "αθηνα", "head": 4, "deprel": "nmod"},
                {"token": "στη", "head": 9, "deprel": "case"},
                {"token": "θεσσαλονικη", "head": 4, "deprel": "nmod"},
                {"token": ".", "head": 1, "deprel": "punct"},
            ]
        },
    )
184
+
185
+
186
+ # API endpoints
187
@app.post("/g2g", response_model=G2GOutput, summary="Convert Greeklish to Greek")
async def greeklish_to_greek(
    text: str = Query(
        ...,
        description="The Greeklish text to convert",
        example="H thessaloniki einai wraia polh",
    ),
):
    """
    Convert Greeklish (Greek written with Latin characters) to Greek.

    This endpoint takes Greeklish text (Greek written with Latin characters) as input and returns the
    transliterated Greek text.
    """
    # The docstring above is published as the endpoint description, so
    # maintainer notes live in comments instead.
    #
    # Runs the shared pipeline on `text` and joins the resulting token texts
    # with single spaces. Any failure is reported as HTTP 500 carrying the
    # exception message.
    #
    # NOTE(review): the pipeline call looks synchronous; inside an `async def`
    # handler it would hold the event loop for its whole duration. Confirm,
    # and check the pipeline tolerates concurrent calls before moving it to a
    # worker thread.
    try:
        doc = nlp_pos_ner_dp_with_g2g(text)
        greek_text = " ".join(token.text for token in doc.tokens)
        return G2GOutput(greek_text=greek_text)
    except Exception as exc:
        # Chain the original error so its traceback is kept as the cause.
        raise HTTPException(status_code=500, detail=str(exc)) from exc
208
+
209
+
210
class NERResponse(BaseModel):
    # Response body of the /ner endpoint: one NERItem per token.
    # The sample payload is attached via json_schema_extra because Pydantic v2
    # deprecates arbitrary keyword arguments (such as `example=`) on Field;
    # the emitted schema key is the same.
    ner_results: List[NERItem] = Field(
        ...,
        description="Named Entity Recognition information",
        json_schema_extra={
            "example": [
                {"token": "η", "ner_value": "O"},
                {"token": "αργεντινη", "ner_value": "S-ORG"},
                {"token": "κερδισε", "ner_value": "O"},
                {"token": "το", "ner_value": "O"},
                {"token": "παγκοσμιο", "ner_value": "B-EVENT"},
                {"token": "κυπελλο", "ner_value": "E-EVENT"},
                {"token": "το", "ner_value": "O"},
                {"token": "2022", "ner_value": "S-DATE"},
            ]
        },
    )
225
+
226
+
227
@app.post("/ner", response_model=NERResponse, summary="Named Entity Recognition")
async def process_ner(
    text: str = Query(
        ...,
        description="The text to process for NER",
        example="Η Αργεντινή κέρδισε το Παγκόσμιο Κύπελλο το 2022",
    ),
):
    """
    The NER endpoint takes Greek text as input and returns a list of dictionaries with the token and the NER value.

    Named Entity Recognition (NER) Labels:
    ```python
    ner_possible_labels = [
        'O', 'S-GPE', 'S-ORG', 'S-CARDINAL', 'B-ORG', 'E-ORG', 'B-DATE', 'E-DATE', 'S-NORP',
        'B-GPE', 'E-GPE', 'S-EVENT', 'S-DATE', 'S-PRODUCT', 'S-LOC', 'I-ORG', 'S-PERSON',
        'S-ORDINAL', 'B-PERSON', 'I-PERSON', 'E-PERSON', 'B-LAW', 'I-LAW', 'E-LAW', 'B-MONEY',
        'I-MONEY', 'E-MONEY', 'B-EVENT', 'I-EVENT', 'E-EVENT', 'B-FAC', 'E-FAC', 'I-DATE',
        'S-PERCENT', 'B-QUANTITY', 'E-QUANTITY', 'B-WORK_OF_ART', 'I-WORK_OF_ART', 'E-WORK_OF_ART',
        'I-FAC', 'S-LAW', 'S-TIME', 'B-LOC', 'E-LOC', 'I-LOC', 'S-FAC', 'B-TIME', 'E-TIME',
        'S-WORK_OF_ART', 'B-PRODUCT', 'E-PRODUCT', 'B-CARDINAL', 'E-CARDINAL', 'S-MONEY',
        'S-LANGUAGE', 'I-TIME', 'I-PRODUCT', 'I-GPE', 'I-QUANTITY', 'B-NORP', 'E-NORP',
        'S-QUANTITY', 'B-PERCENT', 'I-PERCENT', 'E-PERCENT', 'I-CARDINAL', 'B-ORDINAL',
        'I-ORDINAL', 'E-ORDINAL'
    ]
    ```
    """
    # The docstring above is published as the endpoint description, so
    # maintainer notes live in comments instead.
    #
    # Runs the shared pipeline on `text` and returns
    # {"ner_results": [{"token": ..., "ner_value": ...}, ...]}, one entry per
    # token. Any failure is reported as HTTP 500 carrying the exception
    # message.
    #
    # NOTE(review): the pipeline call looks synchronous; inside an `async def`
    # handler it would hold the event loop for its whole duration. Confirm,
    # and check the pipeline tolerates concurrent calls before moving it to a
    # worker thread.
    try:
        doc = nlp_pos_ner_dp_with_g2g(text)
        ner_list = [
            {"token": token.text, "ner_value": token.ner} for token in doc.tokens
        ]
        return {"ner_results": ner_list}
    except Exception as exc:
        # Chain the original error so its traceback is kept as the cause.
        raise HTTPException(status_code=500, detail=str(exc)) from exc
267
+
268
+
269
@app.post("/pos", response_model=POSResponse, summary="Part-of-Speech Tagging")
async def process_pos(
    text: str = Query(
        ...,
        description="The text to process for POS tagging",
        example="Μου αρέσει να διαβάζω τα post του Andrew Ng στο Twitter.",
    ),
):
    """
    The POS Tagging endpoint analyzes the input text and provides Universal POS (UPOS) tags and detailed morphological features.

    It returns a list of dictionaries with "token", "upos", and "morphological_features" keys.
    The "morphological_features" key contains a dictionary itself with detailed morphological features.

    The UPOS and morphological features are based on the Universal Dependencies (UD) framework: [https://universaldependencies.org/u/pos/](https://universaldependencies.org/u/pos/)

    Complete list of the Universal POS (UPOS) tags and morphological features:
    ```python
    {'ADJ': ['Degree', 'Number', 'Gender', 'Case'],
     'ADP': ['Number', 'Gender', 'Case'],
     'ADV': ['Degree', 'Abbr'],
     'AUX': ['Mood',
             'Aspect',
             'Tense',
             'Number',
             'Person',
             'VerbForm',
             'Voice'],
     'CCONJ': [],
     'DET': ['Number', 'Gender', 'PronType', 'Definite', 'Case'],
     'NOUN': ['Number', 'Gender', 'Abbr', 'Case'],
     'NUM': ['NumType', 'Number', 'Gender', 'Case'],
     'PART': [],
     'PRON': ['Number', 'Gender', 'Person', 'Poss', 'PronType', 'Case'],
     'PROPN': ['Number', 'Gender', 'Case'],
     'PUNCT': [],
     'SCONJ': [],
     'SYM': [],
     'VERB': ['Mood',
              'Aspect',
              'Tense',
              'Number',
              'Gender',
              'Person',
              'VerbForm',
              'Voice',
              'Case'],
     'X': ['Foreign']}
    ```

    ```python
    {'Abbr': ['_', 'Yes'],
     'Aspect': ['Perf', '_', 'Imp'],
     'Case': ['Dat', '_', 'Acc', 'Gen', 'Nom', 'Voc'],
     'Definite': ['Ind', 'Def', '_'],
     'Degree': ['Cmp', 'Sup', '_'],
     'Foreign': ['_', 'Yes'],
     'Gender': ['Fem', 'Masc', '_', 'Neut'],
     'Mood': ['Ind', '_', 'Imp'],
     'NumType': ['Mult', 'Card', '_', 'Ord', 'Sets'],
     'Number': ['Plur', '_', 'Sing'],
     'Person': ['3', '1', '_', '2'],
     'Poss': ['_', 'Yes'],
     'PronType': ['Ind', 'Art', '_', 'Rel', 'Dem', 'Prs', 'Ind,Rel', 'Int'],
     'Tense': ['Pres', 'Past', '_'],
     'VerbForm': ['Part', 'Conv', '_', 'Inf', 'Fin'],
     'Voice': ['Pass', 'Act', '_']}
    ```
    """
    # The docstring above is published as the endpoint description, so
    # maintainer notes live in comments instead. Its two dict samples were
    # missing their closing braces; they are balanced now.
    #
    # Runs the shared pipeline on `text` and returns
    # {"pos_results": [{"token", "upos", "morphological_features"}, ...]},
    # one entry per token. Any failure is reported as HTTP 500 carrying the
    # exception message.
    #
    # NOTE(review): the pipeline call looks synchronous; inside an `async def`
    # handler it would hold the event loop for its whole duration. Confirm,
    # and check the pipeline tolerates concurrent calls before moving it to a
    # worker thread.
    try:
        doc = nlp_pos_ner_dp_with_g2g(text)
        pos_list = [
            {
                "token": token.text,
                "upos": token.upos,
                "morphological_features": token.feats,
            }
            for token in doc.tokens
        ]
        return {"pos_results": pos_list}
    except Exception as exc:
        # Chain the original error so its traceback is kept as the cause.
        raise HTTPException(status_code=500, detail=str(exc)) from exc
357
+
358
+
359
@app.post("/dp", response_model=DPResponse, summary="Dependency Parsing")
async def process_dp(
    text: str = Query(
        ...,
        description="The text to process for Dependency Parsing",
        example="Προτιμώ την πρωινή πτήση από την Αθήνα στη Θεσσαλονίκη",
    ),
):
    """
    The Dependency Parsing endpoint analyzes the syntactic structure of the input text.
    It provides the tokens' (syntactic) heads and dependency relations. A head value of 0 indicates the root.
    More specifically, the endpoint returns a list of dictionaries with "token", "head", and "deprel" keys.

    Dependency Parsing Labels:
    ```python
    dp_possible_labels = ['obl', 'obj', 'dep', 'mark', 'case', 'flat', 'nummod', 'obl:arg', 'punct', 'cop',
    'acl:relcl', 'expl', 'nsubj', 'csubj:pass', 'root', 'advmod', 'nsubj:pass', 'ccomp',
    'conj', 'amod', 'xcomp', 'aux', 'appos', 'csubj', 'fixed', 'nmod', 'iobj', 'parataxis',
    'orphan', 'det', 'advcl', 'vocative', 'compound', 'cc', 'discourse', 'acl', 'obl:agent']
    ```
    """
    # The docstring above is published as the endpoint description, so
    # maintainer notes live in comments instead.
    #
    # Runs the shared pipeline on `text` and returns
    # {"dp_results": [{"token", "head", "deprel"}, ...]}, one entry per token.
    # Any failure is reported as HTTP 500 carrying the exception message.
    #
    # NOTE(review): the pipeline call looks synchronous; inside an `async def`
    # handler it would hold the event loop for its whole duration. Confirm,
    # and check the pipeline tolerates concurrent calls before moving it to a
    # worker thread.
    try:
        doc = nlp_pos_ner_dp_with_g2g(text)
        dp_list = [
            {"token": token.text, "head": token.head, "deprel": token.deprel}
            for token in doc.tokens
        ]
        return {"dp_results": dp_list}
    except Exception as exc:
        # Chain the original error so its traceback is kept as the cause.
        raise HTTPException(status_code=500, detail=str(exc)) from exc
394
+
395
+
396
@app.get("/", include_in_schema=False)
async def root():
    # Requests to the bare root URL are redirected to the docs page; the
    # route itself is excluded from the published schema.
    docs_location = "/docs#"
    return RedirectResponse(url=docs_location)
399
+
400
+
401
if __name__ == "__main__":
    # Direct-execution entry point using uvicorn's defaults. uvicorn is
    # imported here so importing this module does not require it.
    from uvicorn import run as serve

    serve(app)
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ fastapi==0.112.2
2
+ gr-nlp-toolkit
3
+ pydantic==2.8.2
4
+ uvicorn==0.30.6