import re
import warnings
from typing import Iterator, List, Tuple


def align_tokens(tokens, sentence):
    """
    Attempt to find the offsets of the tokens in *sentence*, as a sequence
    of ``(start, end)`` tuples, given the tokens and the source string.

    >>> from nltk.tokenize import TreebankWordTokenizer
    >>> from nltk.tokenize.util import align_tokens
    >>> s = str("The plane, bound for St Petersburg, crashed in Egypt's "
    ... "Sinai desert just 23 minutes after take-off from Sharm el-Sheikh "
    ... "on Saturday.")
    >>> tokens = TreebankWordTokenizer().tokenize(s)
    >>> expected = [(0, 3), (4, 9), (9, 10), (11, 16), (17, 20), (21, 23),
    ... (24, 34), (34, 35), (36, 43), (44, 46), (47, 52), (52, 54),
    ... (55, 60), (61, 67), (68, 72), (73, 75), (76, 83), (84, 89),
    ... (90, 98), (99, 103), (104, 109), (110, 119), (120, 122),
    ... (123, 131), (131, 132)]
    >>> output = list(align_tokens(tokens, s))
    >>> len(tokens) == len(expected) == len(output)  # Check that the lengths of tokens, expected and output are the same.
    True
    >>> expected == list(align_tokens(tokens, s))  # Check that the output is as expected.
    True
    >>> tokens == [s[start:end] for start, end in output]  # Check that the slices of the string correspond to the tokens.
    True

    :param tokens: The list of strings that are the result of tokenization
    :type tokens: list(str)
    :param sentence: The original string
    :type sentence: str
    :rtype: list(tuple(int,int))
    """
    point = 0
    offsets = []
    for token in tokens:
        try:
            start = sentence.index(token, point)
        except ValueError as e:
            raise ValueError(f'substring "{token}" not found in "{sentence}"') from e
        point = start + len(token)
        offsets.append((start, point))
    return offsets
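
# A hand-checked illustration of the span convention used by ``align_tokens``:
# end offsets are exclusive, and characters between tokens (such as spaces)
# are simply skipped over.
#
#     align_tokens(["Hello", ",", "world"], "Hello, world")
#     -> [(0, 5), (5, 6), (7, 12)]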


class NLTKWordTokenizer:
    """
    The NLTK tokenizer that has improved upon the TreebankWordTokenizer.

    This is the tokenizer that is invoked by ``word_tokenize()``. It assumes that the
    text has already been segmented into sentences, e.g. using ``sent_tokenize()``.

    The tokenizer is "destructive" in that the regexes applied may munge the
    input string into a state beyond reconstruction. It is possible to apply
    `TreebankWordDetokenizer.detokenize` to the tokenized output of
    `NLTKWordTokenizer.tokenize`, but there is no guarantee that the original
    string will be recovered.
    """

    # Starting quotes.
    STARTING_QUOTES = [
        (re.compile("([«“‘„]|[`]+)", re.U), r" \1 "),
        (re.compile(r"^\""), r' " '),
        (re.compile(r"(``)"), r" \1 "),
        (re.compile(r"([ \(\[{<])(\"|\'{2})"), r'\1 " '),
    ]

    # Ending quotes.
    ENDING_QUOTES = [
        (re.compile("([»”’])", re.U), r" \1 "),
        (re.compile(r"''"), " '' "),
        (re.compile(r'"'), ' " '),
        (re.compile(r"([^' ])('[sS]|'[mM]|'[dD]|') "), r"\1 \2 "),
    ]

    # Punctuation.
    PUNCTUATION = [
        (re.compile(r'([^\.])(\.)([\]\)}>"\'' "»”’ " r"]*)\s*$", re.U), r"\1 \2 \3 "),
        (re.compile(r"([:,])([^\d])"), r" \1 \2"),
        (re.compile(r"([:,])$"), r" \1 "),
        (
            # Runs of two or more periods, e.g. ellipses.
            re.compile(r"\.{2,}", re.U),
            r" \g<0> ",
        ),
        (re.compile(r"[;@#$%&]"), r" \g<0> "),
        (
            # Sentence-final period, optionally followed by closing punctuation.
            re.compile(r'([^\.])(\.)([\]\)}>"\']*)\s*$'),
            r"\1 \2\3 ",
        ),
        (re.compile(r"[?!]"), r" \g<0> "),
        (re.compile(r"([^'])' "), r"\1 ' "),
        (
            re.compile(r"[*]", re.U),
            r" \g<0> ",
        ),
    ]
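
    # Note the position-sensitivity of the rules above: a comma is only split
    # off when it is not followed by a digit ("3,36" stays intact), and a bare
    # period is only split off at the end of the string ("York." in the middle
    # of a sentence is left alone).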

    # Parentheses, brackets, and braces.
    PARENS_BRACKETS = (re.compile(r"[\]\[\(\)\{\}\<\>]"), r" \g<0> ")

    # PTB-style replacements applied when ``convert_parentheses=True``.
    CONVERT_PARENTHESES = [
        (re.compile(r"\("), "-LRB-"),
        (re.compile(r"\)"), "-RRB-"),
        (re.compile(r"\["), "-LSB-"),
        (re.compile(r"\]"), "-RSB-"),
        (re.compile(r"\{"), "-LCB-"),
        (re.compile(r"\}"), "-RCB-"),
    ]

    # Double dashes.
    DOUBLE_DASHES = (re.compile(r"--"), r" -- ")
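
    # A hand-worked illustration (an editorial note, not executed anywhere) of
    # how the staged substitutions split a raw sentence once ``tokenize`` has
    # applied them in order:
    #
    #     'He said, "wow--$5!"'
    #     -> ['He', 'said', ',', '"', 'wow', '--', '$', '5', '!', '"']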

    def tokenize(
        self, text: str, convert_parentheses: bool = False, return_str: bool = False
    ) -> List[str]:
        r"""Return a tokenized copy of `text`.

        >>> from nltk.tokenize import NLTKWordTokenizer
        >>> s = '''Good muffins cost $3.88 (roughly 3,36 euros)\nin New York. Please buy me\ntwo of them.\nThanks.'''
        >>> NLTKWordTokenizer().tokenize(s)  # doctest: +NORMALIZE_WHITESPACE
        ['Good', 'muffins', 'cost', '$', '3.88', '(', 'roughly', '3,36',
        'euros', ')', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
        'of', 'them.', 'Thanks', '.']
        >>> NLTKWordTokenizer().tokenize(s, convert_parentheses=True)  # doctest: +NORMALIZE_WHITESPACE
        ['Good', 'muffins', 'cost', '$', '3.88', '-LRB-', 'roughly', '3,36',
        'euros', '-RRB-', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
        'of', 'them.', 'Thanks', '.']

        :param text: A string with a sentence or sentences.
        :type text: str
        :param convert_parentheses: if True, replace parentheses with PTB symbols,
            e.g. `(` to `-LRB-`. Defaults to False.
        :type convert_parentheses: bool, optional
        :param return_str: If True, return tokens as space-separated string;
            deprecated, a list is always returned and a ``DeprecationWarning``
            is issued. Defaults to False.
        :type return_str: bool, optional
        :return: List of tokens from `text`.
        :rtype: List[str]
        """
        if return_str:
            warnings.warn(
                "Parameter 'return_str' has been deprecated and should be 'False' by default.",
                DeprecationWarning,
            )

        for regexp, substitution in self.STARTING_QUOTES:
            text = regexp.sub(substitution, text)

        for regexp, substitution in self.PUNCTUATION:
            text = regexp.sub(substitution, text)

        # Handle parentheses, brackets, and braces.
        regexp, substitution = self.PARENS_BRACKETS
        text = regexp.sub(substitution, text)

        # Optionally convert parentheses and brackets to PTB symbols.
        if convert_parentheses:
            for regexp, substitution in self.CONVERT_PARENTHESES:
                text = regexp.sub(substitution, text)

        # Handle double dashes.
        regexp, substitution = self.DOUBLE_DASHES
        text = regexp.sub(substitution, text)

        # Add extra space to simplify the ending-quote rules.
        text = " " + text + " "

        for regexp, substitution in self.ENDING_QUOTES:
            text = regexp.sub(substitution, text)

        return text.split()

    def span_tokenize(self, text: str) -> Iterator[Tuple[int, int]]:
        r"""
        Returns the spans of the tokens in ``text``.
        Uses the post-hoc ``align_tokens`` helper defined above to recover the
        offset spans.

        >>> from nltk.tokenize import NLTKWordTokenizer
        >>> s = '''Good muffins cost $3.88\nin New (York).  Please (buy) me\ntwo of them.\n(Thanks).'''
        >>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
        ... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
        ... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
        ... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)]
        >>> list(NLTKWordTokenizer().span_tokenize(s)) == expected
        True
        >>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
        ... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')',
        ... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']
        >>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)] == expected
        True

        :param text: A string with a sentence or sentences.
        :type text: str
        :yield: Tuple[int, int]
        """
        raw_tokens = self.tokenize(text)

        # Quote characters may have been rewritten by the rules above (for
        # example, a doubled single quote can come out as a double quote), so
        # map quote-like tokens back onto the spellings that actually occur in
        # ``text`` before aligning.
        if ('"' in text) or ("''" in text):
            # The quote spellings as they appear in the source, in order.
            matched = [m.group() for m in re.finditer(r"``|'{2}|\"", text)]

            # Substitute the source spellings back into the token stream.
            tokens = [
                matched.pop(0) if tok in ['"', "``", "''"] else tok
                for tok in raw_tokens
            ]
        else:
            tokens = raw_tokens

        yield from align_tokens(tokens, text)
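

# A minimal usage sketch (not part of the library; the demo text below is made
# up for illustration): running this module directly prints each token
# alongside the span it was recovered from.
if __name__ == "__main__":
    demo_text = 'The hotel (a "nice" one) costs $3.88 -- per night.'
    tokenizer = NLTKWordTokenizer()
    for start, end in tokenizer.span_tokenize(demo_text):
        print((start, end), repr(demo_text[start:end]))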