Spaces:
Sleeping
Sleeping
from typing import List | |
from spacy.lang.en import English | |
class SentenceHandler:
    """Split raw text into sentences with spaCy and filter them by length.

    On construction the given spaCy language class is instantiated and a
    ``sentencizer`` pipe is added to it. Instances are callable: calling
    one is the same as calling :meth:`process`.
    """

    def __init__(self, language=English):
        """
        Base Sentence Handler with Spacy support.

        :param language: Determines the language to use with spacy. It is
            called with no arguments and the result is stored on ``self.nlp``.
        """
        self.nlp = language()

        try:
            # spaCy 2.x style: create the component object, then add it.
            self.nlp.add_pipe(self.nlp.create_pipe('sentencizer'))
            self.is_spacy_3 = False
        except Exception:
            # spaCy 3.x style: the component is added by name.
            # The broad catch is deliberate: whatever the 2.x-style call
            # raised, the 3.x-style call is attempted next, and any error
            # from that call propagates to the caller.
            self.nlp.add_pipe("sentencizer")
            self.is_spacy_3 = True

    def sentence_processor(self, doc,
                           min_length: int = 40,
                           max_length: int = 600) -> List[str]:
        """
        Processes a given spacy document and turns them into sentences.

        Each sentence is stripped of surrounding whitespace and kept only if
        its stripped length lies strictly between the two bounds, i.e.
        ``min_length < len(sentence) < max_length``.

        :param doc: The document to use from spacy (must expose ``sents``).
        :param min_length: Exclusive lower bound on the stripped sentence length.
        :param max_length: Exclusive upper bound on the stripped sentence length.
        :return: The stripped sentences that passed the length filter, in
            document order.
        """
        # ``Span.text`` is read for every spaCy version: the length filter has
        # always relied on it, so the stripped value is computed once and
        # reused instead of falling back to the legacy ``Span.string``.
        stripped = (span.text.strip() for span in doc.sents)
        return [text for text in stripped
                if min_length < len(text) < max_length]

    def process(self, body: str,
                min_length: int = 40,
                max_length: int = 600) -> List[str]:
        """
        Processes the content sentences.

        :param body: The raw string body to process
        :param min_length: Length that the sentences must exceed
        :param max_length: Length that the sentences must fall under
        :return: Returns a list of sentences.
        """
        doc = self.nlp(body)
        return self.sentence_processor(doc, min_length, max_length)

    def __call__(self, body: str,
                 min_length: int = 40,
                 max_length: int = 600) -> List[str]:
        """
        Processes the content sentences. Delegates to ``process``.

        :param body: The raw string body to process
        :param min_length: Length that the sentences must exceed
        :param max_length: Length that the sentences must fall under
        :return: Returns a list of sentences.
        """
        return self.process(body, min_length, max_length)