Spaces:
Sleeping
Sleeping
File size: 7,648 Bytes
92218bf |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 |
# !pip install gr-nlp-toolkit
from gr_nlp_toolkit import Pipeline
# Build a single shared Pipeline at import time; the helper functions in this
# module all call this same instance instead of constructing their own.
# The processor string requests "pos", "ner", "dp" and "g2g" together
# (presumably POS tagging, NER, dependency parsing and Greeklish-to-Greek --
# confirm against the gr-nlp-toolkit documentation).
nlp_pos_ner_dp_with_g2g = Pipeline("pos,ner,dp,g2g")
def greeklish_to_greek(text: str) -> str:
    """
    Transliterate Greeklish (Greek typed with Latin characters) into Greek.

    The text is run through the shared module-level pipeline and the ``text``
    of every resulting token is joined back together with single spaces, so
    the original spacing of the input is not preserved.

    Args:
        text (str): The Greeklish text to convert.

    Returns:
        str: The token texts produced by the pipeline, separated by one space.

    Examples:
        >>> greeklish_to_greek("H thessaloniki einai wraia polh")
        'η θεσσαλονικη ειναι ωραια πολη'
    """
    processed = nlp_pos_ner_dp_with_g2g(text)
    words = (tok.text for tok in processed.tokens)
    return " ".join(words)
def process_ner(text: str) -> dict:
    """
    Run Named Entity Recognition (NER) on the text and map each token to its tag.

    The text is passed through the shared module-level pipeline; the result maps
    every token's ``text`` to that token's ``ner`` label.

    Note:
        The result is keyed by token text. If the same token text occurs more
        than once, only one entry is kept: it stays at the position of the
        first occurrence and holds the label of the last occurrence.

    Args:
        text (str): The text to process.

    Returns:
        dict: Token text -> NER label.

    Examples:
        >>> process_ner("Η Αργεντινή κέρδισε το Παγκόσμιο Κύπελλο το 2022")
        {
            'η': 'O',
            'αργεντινη': 'S-ORG',
            'κερδισε': 'O',
            'το': 'O',
            'παγκοσμιο': 'B-EVENT',
            'κυπελλο': 'E-EVENT',
            '2022': 'S-DATE'
        }

    NER Possible Labels List:
        ner_labels = [
            'O', 'S-GPE', 'S-ORG', 'S-CARDINAL', 'B-ORG', 'E-ORG', 'B-DATE', 'E-DATE', 'S-NORP',
            'B-GPE', 'E-GPE', 'S-EVENT', 'S-DATE', 'S-PRODUCT', 'S-LOC', 'I-ORG', 'S-PERSON',
            'S-ORDINAL', 'B-PERSON', 'I-PERSON', 'E-PERSON', 'B-LAW', 'I-LAW', 'E-LAW', 'B-MONEY',
            'I-MONEY', 'E-MONEY', 'B-EVENT', 'I-EVENT', 'E-EVENT', 'B-FAC', 'E-FAC', 'I-DATE',
            'S-PERCENT', 'B-QUANTITY', 'E-QUANTITY', 'B-WORK_OF_ART', 'I-WORK_OF_ART', 'E-WORK_OF_ART',
            'I-FAC', 'S-LAW', 'S-TIME', 'B-LOC', 'E-LOC', 'I-LOC', 'S-FAC', 'B-TIME', 'E-TIME',
            'S-WORK_OF_ART', 'B-PRODUCT', 'E-PRODUCT', 'B-CARDINAL', 'E-CARDINAL', 'S-MONEY',
            'S-LANGUAGE', 'I-TIME', 'I-PRODUCT', 'I-GPE', 'I-QUANTITY', 'B-NORP', 'E-NORP',
            'S-QUANTITY', 'B-PERCENT', 'I-PERCENT', 'E-PERCENT', 'I-CARDINAL', 'B-ORDINAL',
            'I-ORDINAL', 'E-ORDINAL'
        ]
    """
    processed = nlp_pos_ner_dp_with_g2g(text)
    tags_by_word = {}
    for tok in processed.tokens:
        # Plain assignment: a repeated token text overwrites its earlier label.
        tags_by_word[tok.text] = tok.ner
    return tags_by_word
def process_pos(text: str) -> dict:
    """
    Run Part-of-Speech tagging on the text (UPOS tags and morphological features).

    The text is passed through the shared module-level pipeline; the result maps
    every token's ``text`` to a small dictionary holding that token's ``upos``
    and ``feats`` values.

    Complete list of UPOS tags (https://universaldependencies.org/u/pos/ and
    https://github.com/nlpaueb/gr-nlp-toolkit/blob/main/gr_nlp_toolkit/configs/pos_labels.py):
        ADJ: adjective
        ADP: adposition
        ADV: adverb
        AUX: auxiliary
        CCONJ: coordinating conjunction
        DET: determiner
        INTJ: interjection
        NOUN: noun
        NUM: numeral
        PART: particle
        PRON: pronoun
        PROPN: proper noun
        PUNCT: punctuation
        SCONJ: subordinating conjunction
        SYM: symbol
        VERB: verb
        X: other

    The complete list of morphological features can be found at
    https://github.com/nlpaueb/gr-nlp-toolkit/blob/main/gr_nlp_toolkit/configs/pos_labels.py
    Due to the large number of features, only the most common ones are listed here:
        - Aspect
        - Case
        - Definite
        - Mood
        - Number
        - Person
        - PronType
        - Tense
        - Gender
        - VerbForm
        - Voice

    Note:
        The result is keyed by token text. If the same token text occurs more
        than once, only one entry is kept: it stays at the position of the
        first occurrence and holds the information of the last occurrence.

    Args:
        text (str): The text to process.

    Returns:
        dict: Token text -> {'UPOS': <tag>, 'Morphological_Features': <features>}.

    Examples:
        >>> process_pos("Μου αρέσει να διαβάζω τα post του Andrew Ng στο Twitter.")
        {
            'μου': {'UPOS': 'PRON', 'Morphological_Features': {'Case': 'Gen', 'Gender': 'Masc', 'Number': 'Sing', 'Person': '1', 'Poss': '_', 'PronType': 'Prs'}},
            'αρεσει': {'UPOS': 'VERB', 'Morphological_Features': {'Aspect': 'Imp', 'Case': '_', 'Gender': '_', 'Mood': 'Ind', 'Number': 'Sing', 'Person': '3', 'Tense': 'Pres', 'VerbForm': 'Fin', 'Voice': 'Act'}},
            'να': {'UPOS': 'AUX', 'Morphological_Features': {'Aspect': '_', 'Mood': '_', 'Number': '_', 'Person': '_', 'Tense': '_', 'VerbForm': '_', 'Voice': '_'}},
            'διαβαζω': {'UPOS': 'VERB', 'Morphological_Features': {'Aspect': 'Imp', 'Case': '_', 'Gender': '_', 'Mood': 'Ind', 'Number': 'Sing', 'Person': '1', 'Tense': 'Pres', 'VerbForm': 'Fin', 'Voice': 'Act'}},
            'τα': {'UPOS': 'DET', 'Morphological_Features': {'Case': 'Acc', 'Definite': 'Def', 'Gender': 'Neut', 'Number': 'Plur', 'PronType': 'Art'}},
            'post': {'UPOS': 'X', 'Morphological_Features': {'Foreign': 'Yes'}},
            'του': {'UPOS': 'DET', 'Morphological_Features': {'Case': 'Gen', 'Definite': 'Def', 'Gender': 'Masc', 'Number': 'Sing', 'PronType': 'Art'}},
            'andrew': {'UPOS': 'X', 'Morphological_Features': {'Foreign': 'Yes'}},
            'ng': {'UPOS': 'X', 'Morphological_Features': {'Foreign': 'Yes'}},
            'στο': {'UPOS': '_', 'Morphological_Features': {}},
            'twitter': {'UPOS': 'X', 'Morphological_Features': {'Foreign': 'Yes'}},
            '.': {'UPOS': 'PUNCT', 'Morphological_Features': {}}
        }
    """
    processed = nlp_pos_ner_dp_with_g2g(text)
    pos_by_word = {}
    for tok in processed.tokens:
        # Plain assignment: a repeated token text overwrites its earlier entry.
        pos_by_word[tok.text] = {
            "UPOS": tok.upos,
            "Morphological_Features": tok.feats,
        }
    return pos_by_word
def process_dp(text: str) -> dict:
    """
    Run Dependency Parsing on the text and map each token to its head and relation.

    The text is passed through the shared module-level pipeline; the result maps
    every token's ``text`` to a small dictionary holding that token's ``head``
    and ``deprel`` values.

    Note:
        The result is keyed by token text. If the same token text occurs more
        than once, only one entry is kept: it stays at the position of the
        first occurrence and holds the information of the last occurrence.
        'Head' values still refer to token positions in the full sentence, so
        they cannot always be resolved from the returned dictionary alone.

    Args:
        text (str): The text to process.

    Returns:
        dict: A dictionary where each key is a token text from the input, and the
            value is another dictionary containing:
            - 'Head': The position of the syntactic head of the word (0 indicates the root).
            - 'Deprel': The dependency relation to the head.

    Examples:
        The sentence below contains 'την' twice (heads 4 and 7); only the last
        occurrence survives in the result.

        >>> process_dp("Προτιμώ την πρωινή πτήση από την Αθήνα στη Θεσσαλονίκη.")
        {
            'προτιμω': {'Head': 0, 'Deprel': 'root'},
            'την': {'Head': 7, 'Deprel': 'det'},
            'πρωινη': {'Head': 4, 'Deprel': 'amod'},
            'πτηση': {'Head': 1, 'Deprel': 'obj'},
            'απο': {'Head': 7, 'Deprel': 'case'},
            'αθηνα': {'Head': 4, 'Deprel': 'nmod'},
            'στη': {'Head': 9, 'Deprel': 'case'},
            'θεσσαλονικη': {'Head': 4, 'Deprel': 'nmod'},
            '.': {'Head': 1, 'Deprel': 'punct'}
        }

    Dependency Parsing Possible Labels List:
        dp_labels = [
            'obl', 'obj', 'dep', 'mark', 'case', 'flat', 'nummod', 'obl:arg', 'punct', 'cop',
            'acl:relcl', 'expl', 'nsubj', 'csubj:pass', 'root', 'advmod', 'nsubj:pass', 'ccomp',
            'conj', 'amod', 'xcomp', 'aux', 'appos', 'csubj', 'fixed', 'nmod', 'iobj', 'parataxis',
            'orphan', 'det', 'advcl', 'vocative', 'compound', 'cc', 'discourse', 'acl', 'obl:agent'
        ]
    """
    processed = nlp_pos_ner_dp_with_g2g(text)
    arcs_by_word = {}
    for tok in processed.tokens:
        # Plain assignment: a repeated token text overwrites its earlier entry.
        arcs_by_word[tok.text] = {"Head": tok.head, "Deprel": tok.deprel}
    return arcs_by_word
|