Spaces:
Sleeping
Sleeping
from typing import Dict, List | |
from fastapi import FastAPI, HTTPException, Query | |
from fastapi.responses import RedirectResponse | |
from gr_nlp_toolkit import Pipeline | |
from pydantic import BaseModel, Field | |
# Long-form API description handed to FastAPI; the text carries link and
# inline-code markup, so it is kept as one logical string split over lines.
_API_DESCRIPTION = (
    "State-of-the-art API for Greek NLP tasks including Greeklish to Greek conversion (G2G), "
    "Named Entity Recognition (NER), Part-of-Speech (POS) tagging, and Dependency Parsing (DP). "
    "The API is powered by the Grεεk NLP Toolkit "
    "([https://github.com/nlpaueb/gr-nlp-toolkit/](https://github.com/nlpaueb/gr-nlp-toolkit/)), "
    "which is also available via PyPI (`pip install gr-nlp-toolkit`). "
)

# Contact metadata passed through to the FastAPI constructor.
_API_CONTACT = {
    "name": "Natural Language Processing Group - Athens University of Economics and Business (AUEB)",
    "url": "http://nlp.cs.aueb.gr/",
    "api_author": "Lefteris Loukas",
}

# The application object that the rest of this module (and uvicorn) refers to.
app = FastAPI(
    title="The Grεεk NLP API 🇬🇷",
    description=_API_DESCRIPTION,
    version="1.0.0",
    contact=_API_CONTACT,
)

# A single pipeline instance created at import time with the "pos", "ner",
# "dp" and "g2g" processors; the endpoint functions in this file all call it.
nlp_pos_ner_dp_with_g2g = Pipeline("pos,ner,dp,g2g")
# Pydantic models for responses | |
# Response schema for the Greeklish-to-Greek conversion.
# (Documented with comments rather than a class docstring so that the
# generated schema text stays exactly as before.)
class G2GOutput(BaseModel):
    # Required string field holding the converted text.
    greek_text: str = Field(
        default=...,
        description="Converted Greek text",
        example="η θεσσαλονικη ειναι ωραια πολη",
    )
# One row of a Named Entity Recognition result: a token plus its NER label.
# Both fields are required (Ellipsis default).
class NERItem(BaseModel):
    token: str = Field(default=..., example="αργεντινη")
    ner_value: str = Field(default=..., example="S-ORG")
# One row of a Part-of-Speech result: the token, its UPOS tag, and a
# string-to-string mapping of morphological features. All fields are required.
class POSItem(BaseModel):
    token: str = Field(default=..., example="μου")
    upos: str = Field(default=..., example="PRON")
    morphological_features: Dict[str, str] = Field(
        default=...,
        # Keyword form builds the same dict, in the same key order.
        example=dict(
            Case="Gen",
            Gender="Masc",
            Number="Sing",
            Person="1",
            Poss="_",
            PronType="Prs",
        ),
    )
def _pos_example(token, upos, **features):
    # Build one example entry; `features` keeps the keyword order it was given.
    return {"token": token, "upos": upos, "morphological_features": features}


# Example payload attached to POSResponse.pos_results (one entry per token).
_POS_RESULTS_EXAMPLE = [
    _pos_example(
        "μου", "PRON",
        Case="Gen", Gender="Masc", Number="Sing", Person="1", Poss="_", PronType="Prs",
    ),
    _pos_example(
        "αρεσει", "VERB",
        Aspect="Imp", Case="_", Gender="_", Mood="Ind", Number="Sing",
        Person="3", Tense="Pres", VerbForm="Fin", Voice="Act",
    ),
    _pos_example(
        "να", "AUX",
        Aspect="_", Mood="_", Number="_", Person="_", Tense="_", VerbForm="_", Voice="_",
    ),
    _pos_example(
        "διαβαζω", "VERB",
        Aspect="Imp", Case="_", Gender="_", Mood="Ind", Number="Sing",
        Person="1", Tense="Pres", VerbForm="Fin", Voice="Act",
    ),
    _pos_example(
        "τα", "DET",
        Case="Acc", Definite="Def", Gender="Neut", Number="Plur", PronType="Art",
    ),
    _pos_example("post", "X", Foreign="Yes"),
    _pos_example(
        "του", "DET",
        Case="Gen", Definite="Def", Gender="Masc", Number="Sing", PronType="Art",
    ),
    _pos_example("andrew", "X", Foreign="Yes"),
    _pos_example("ng", "X", Foreign="Yes"),
    _pos_example("στο", "_"),
    _pos_example("twitter", "X", Foreign="Yes"),
    _pos_example(".", "PUNCT"),
]


# Envelope returned by the POS endpoint: a required list of POSItem rows
# under the "pos_results" key.
class POSResponse(BaseModel):
    pos_results: List[POSItem] = Field(
        default=...,
        description="Part-of-Speech tagging information",
        example=_POS_RESULTS_EXAMPLE,
    )
# One row of a Dependency Parsing result: the token, the integer head value,
# and the dependency relation label. All fields are required.
class DPItem(BaseModel):
    token: str = Field(default=..., example="προτιμω")
    head: int = Field(default=..., example=0)
    deprel: str = Field(default=..., example="root")
# (token, head, deprel) rows used to build the example payload below.
_DP_EXAMPLE_ROWS = (
    ("προτιμω", 0, "root"),
    ("την", 4, "det"),
    ("πρωινη", 4, "amod"),
    ("πτηση", 1, "obj"),
    ("απο", 7, "case"),
    ("την", 7, "det"),
    ("αθηνα", 4, "nmod"),
    ("στη", 9, "case"),
    ("θεσσαλονικη", 4, "nmod"),
    (".", 1, "punct"),
)

# Expanded into the list-of-dicts shape the schema example uses.
_DP_RESULTS_EXAMPLE = [
    {"token": word, "head": head_index, "deprel": relation}
    for word, head_index, relation in _DP_EXAMPLE_ROWS
]


# Envelope returned by the DP endpoint: a required list of DPItem rows
# under the "dp_results" key.
class DPResponse(BaseModel):
    dp_results: List[DPItem] = Field(
        default=...,
        description="Dependency Parsing information",
        example=_DP_RESULTS_EXAMPLE,
    )
# API endpoints | |
# Greeklish-to-Greek handler.
#   text: required query parameter with the Greeklish input.
#   Returns a G2GOutput whose greek_text is the pipeline's token texts joined
#   by single spaces.
#   Raises HTTPException(500) carrying the error message if anything fails.
# The docstring below is left untouched because it is user-facing text.
# NOTE(review): no active route decorator is visible on this function in this
# view - confirm it is registered on `app`.
async def greeklish_to_greek(
    text: str = Query(
        ...,
        description="The Greeklish text to convert",
        example="H thessaloniki einai wraia polh",
    ),
):
    """
    The G2G (Greeklish-to-Greek) endpoint takes Greeklish text (Greek written with Latin characters) as input and transliterates it to Greek text.
    """
    try:
        doc = nlp_pos_ner_dp_with_g2g(text)
        # A generator is enough here; no intermediate list is needed for join.
        greek_text = " ".join(token.text for token in doc.tokens)
        return G2GOutput(greek_text=greek_text)
    except Exception as e:
        # Chain explicitly so the original failure is recorded as the direct
        # cause of the HTTP error in tracebacks and logs.
        raise HTTPException(status_code=500, detail=str(e)) from e
# (token, ner_value) pairs used to build the example payload below.
_NER_EXAMPLE_PAIRS = (
    ("η", "O"),
    ("αργεντινη", "S-ORG"),
    ("κερδισε", "O"),
    ("το", "O"),
    ("παγκοσμιο", "B-EVENT"),
    ("κυπελλο", "E-EVENT"),
    ("το", "O"),
    ("2022", "S-DATE"),
)

# Expanded into the list-of-dicts shape the schema example uses.
_NER_RESULTS_EXAMPLE = [
    {"token": word, "ner_value": label} for word, label in _NER_EXAMPLE_PAIRS
]


# Envelope returned by the NER endpoint: a required list of NERItem rows
# under the "ner_results" key.
class NERResponse(BaseModel):
    ner_results: List[NERItem] = Field(
        default=...,
        description="Named Entity Recognition information",
        example=_NER_RESULTS_EXAMPLE,
    )
# @app.post("/ner", response_model=List[NERItem], summary="Named Entity Recognition") | |
# Named Entity Recognition handler.
#   text: required query parameter with the text to analyse.
#   Returns {"ner_results": [...]} with one {"token", "ner_value"} dict per
#   pipeline token.
#   Raises HTTPException(500) carrying the error message if anything fails.
# The docstring below is left untouched because it is user-facing text.
# NOTE(review): no active route decorator is visible on this function in this
# view - confirm it is registered on `app`.
async def process_ner(
    text: str = Query(
        ...,
        description="The text to process for NER",
        example="Η Αργεντινή κέρδισε το Παγκόσμιο Κύπελλο το 2022",
    ),
):
    """
    The NER endpoint takes Greek text as input and returns a list of dictionaries with the token and the NER value.
    Named Entity Recognition (NER) Labels:
    ```python
    ner_possible_labels = [
    'O', 'S-GPE', 'S-ORG', 'S-CARDINAL', 'B-ORG', 'E-ORG', 'B-DATE', 'E-DATE', 'S-NORP',
    'B-GPE', 'E-GPE', 'S-EVENT', 'S-DATE', 'S-PRODUCT', 'S-LOC', 'I-ORG', 'S-PERSON',
    'S-ORDINAL', 'B-PERSON', 'I-PERSON', 'E-PERSON', 'B-LAW', 'I-LAW', 'E-LAW', 'B-MONEY',
    'I-MONEY', 'E-MONEY', 'B-EVENT', 'I-EVENT', 'E-EVENT', 'B-FAC', 'E-FAC', 'I-DATE',
    'S-PERCENT', 'B-QUANTITY', 'E-QUANTITY', 'B-WORK_OF_ART', 'I-WORK_OF_ART', 'E-WORK_OF_ART',
    'I-FAC', 'S-LAW', 'S-TIME', 'B-LOC', 'E-LOC', 'I-LOC', 'S-FAC', 'B-TIME', 'E-TIME',
    'S-WORK_OF_ART', 'B-PRODUCT', 'E-PRODUCT', 'B-CARDINAL', 'E-CARDINAL', 'S-MONEY',
    'S-LANGUAGE', 'I-TIME', 'I-PRODUCT', 'I-GPE', 'I-QUANTITY', 'B-NORP', 'E-NORP',
    'S-QUANTITY', 'B-PERCENT', 'I-PERCENT', 'E-PERCENT', 'I-CARDINAL', 'B-ORDINAL',
    'I-ORDINAL', 'E-ORDINAL'
    ]
    ```
    """
    try:
        doc = nlp_pos_ner_dp_with_g2g(text)
        # One dict per token, each with "token" and "ner_value".
        ner_list = [
            {"token": token.text, "ner_value": token.ner} for token in doc.tokens
        ]
        return {"ner_results": ner_list}
    except Exception as e:
        # Chain explicitly so the original failure is recorded as the direct
        # cause of the HTTP error in tracebacks and logs.
        raise HTTPException(status_code=500, detail=str(e)) from e
# @app.post("/pos", response_model=List[POSItem], summary="Part-of-Speech Tagging") | |
# Part-of-Speech tagging handler.
#   text: required query parameter with the text to analyse.
#   Returns {"pos_results": [...]} with one {"token", "upos",
#   "morphological_features"} dict per pipeline token.
#   Raises HTTPException(500) carrying the error message if anything fails.
# The docstring below is left untouched because it is user-facing text.
# NOTE(review): no active route decorator is visible on this function in this
# view - confirm it is registered on `app`.
async def process_pos(
    text: str = Query(
        ...,
        description="The text to process for POS tagging",
        example="Μου αρέσει να διαβάζω τα post του Andrew Ng στο Twitter.",
    ),
):
    """
    The POS Tagging endpoint analyzes the input text and provides Universal POS (UPOS) tags and detailed morphological features.
    It returns a list of dictionaries with "token", "upos", and "morphological_features" keys.
    The "morphological_features" key contains a dictionary itself with detailed morphological features.
    The UPOS and morphological features are based on the Universal Dependencies (UD) framework: [https://universaldependencies.org/u/pos/](https://universaldependencies.org/u/pos/)
    Complete list of the Universal POS (UPOS) tags and morphological features:
    ```python
    {'ADJ': ['Degree', 'Number', 'Gender', 'Case'],
    'ADP': ['Number', 'Gender', 'Case'],
    'ADV': ['Degree', 'Abbr'],
    'AUX': ['Mood',
    'Aspect',
    'Tense',
    'Number',
    'Person',
    'VerbForm',
    'Voice'],
    'CCONJ': [],
    'DET': ['Number', 'Gender', 'PronType', 'Definite', 'Case'],
    'NOUN': ['Number', 'Gender', 'Abbr', 'Case'],
    'NUM': ['NumType', 'Number', 'Gender', 'Case'],
    'PART': [],
    'PRON': ['Number', 'Gender', 'Person', 'Poss', 'PronType', 'Case'],
    'PROPN': ['Number', 'Gender', 'Case'],
    'PUNCT': [],
    'SCONJ': [],
    'SYM': [],
    'VERB': ['Mood',
    'Aspect',
    'Tense',
    'Number',
    'Gender',
    'Person',
    'VerbForm',
    'Voice',
    'Case'],
    'X': ['Foreign'],
    ```
    ```python
    {'Abbr': ['_', 'Yes'],
    'Aspect': ['Perf', '_', 'Imp'],
    'Case': ['Dat', '_', 'Acc', 'Gen', 'Nom', 'Voc'],
    'Definite': ['Ind', 'Def', '_'],
    'Degree': ['Cmp', 'Sup', '_'],
    'Foreign': ['_', 'Yes'],
    'Gender': ['Fem', 'Masc', '_', 'Neut'],
    'Mood': ['Ind', '_', 'Imp'],
    'NumType': ['Mult', 'Card', '_', 'Ord', 'Sets'],
    'Number': ['Plur', '_', 'Sing'],
    'Person': ['3', '1', '_', '2'],
    'Poss': ['_', 'Yes'],
    'PronType': ['Ind', 'Art', '_', 'Rel', 'Dem', 'Prs', 'Ind,Rel', 'Int'],
    'Tense': ['Pres', 'Past', '_'],
    'VerbForm': ['Part', 'Conv', '_', 'Inf', 'Fin'],
    'Voice': ['Pass', 'Act', '_'],
    ```
    """
    try:
        doc = nlp_pos_ner_dp_with_g2g(text)
        # One dict per token with "token", "upos" and "morphological_features".
        pos_list = [
            {
                "token": token.text,
                "upos": token.upos,
                "morphological_features": token.feats,
            }
            for token in doc.tokens
        ]
        return {"pos_results": pos_list}
    except Exception as e:
        # Chain explicitly so the original failure is recorded as the direct
        # cause of the HTTP error in tracebacks and logs.
        raise HTTPException(status_code=500, detail=str(e)) from e
# @app.post("/dp", response_model=List[DPItem], summary="Dependency Parsing") | |
# Dependency Parsing handler.
#   text: required query parameter with the text to analyse.
#   Returns {"dp_results": [...]} with one {"token", "head", "deprel"} dict
#   per pipeline token.
#   Raises HTTPException(500) carrying the error message if anything fails.
# The docstring below is left untouched because it is user-facing text.
# NOTE(review): no active route decorator is visible on this function in this
# view - confirm it is registered on `app`.
async def process_dp(
    text: str = Query(
        ...,
        description="The text to process for Dependency Parsing",
        example="Προτιμώ την πρωινή πτήση από την Αθήνα στη Θεσσαλονίκη",
    ),
):
    """
    The Dependency Parsing endpoint analyzes the syntactic structure of the input text.
    It provides the tokens' (syntactic) heads and dependency relations. A head value of 0 indicates the root.
    More specifically, the endpoint returns a list of dictionaries with "token", "head", and "deprel" keys.
    Dependency Parsing Labels:
    ```python
    dp_possible_labels = ['obl', 'obj', 'dep', 'mark', 'case', 'flat', 'nummod', 'obl:arg', 'punct', 'cop',
    'acl:relcl', 'expl', 'nsubj', 'csubj:pass', 'root', 'advmod', 'nsubj:pass', 'ccomp',
    'conj', 'amod', 'xcomp', 'aux', 'appos', 'csubj', 'fixed', 'nmod', 'iobj', 'parataxis',
    'orphan', 'det', 'advcl', 'vocative', 'compound', 'cc', 'discourse', 'acl', 'obl:agent']
    ```
    """
    try:
        doc = nlp_pos_ner_dp_with_g2g(text)
        # One dict per token with "token", "head" and "deprel".
        dp_list = [
            {"token": token.text, "head": token.head, "deprel": token.deprel}
            for token in doc.tokens
        ]
        return {"dp_results": dp_list}
    except Exception as e:
        # Chain explicitly so the original failure is recorded as the direct
        # cause of the HTTP error in tracebacks and logs.
        raise HTTPException(status_code=500, detail=str(e)) from e
# Handler that answers with a redirect to the "/docs#" URL.
# NOTE(review): no active route decorator is visible on this function in this
# view - confirm it is registered on `app`.
async def root():
    docs_redirect = RedirectResponse(url="/docs#")
    return docs_redirect
# Script entry point: start uvicorn with this module's `app` using uvicorn's
# default settings. uvicorn is imported here, so it is only needed when the
# file is run directly.
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app=app)