zhengr's picture
First version
19dc0f3
raw
history blame
7.48 kB
"""
This module contains utils for preprocessing the text before converting it to embeddings.
- TextPreprocessorBuilder preprocesses individual strings.
* lowering cases
* converting numbers to words or characters
* merging and stripping spaces
* removing punctuation
* removing stop words
* lemmatizing
* removing specific parts of speech (adverbs and interjections)
- TextSummarizer extracts the most important sentences from a long string using text-ranking.
"""
import math
import re
import string
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from num2words import num2words
class TextPreprocessorBuilder:
# Define class variables as None initially
_stop_words = set(stopwords.words('english'))
_lemmatizer = WordNetLemmatizer()
# Some of the functions are expensive. We cache the results.
_lemmatizer_cache = {}
_pos_remove_cache = {}
def __init__(self, text: str):
self.text = text
def to_lower(self):
# Match both words and non-word characters
tokens = re.findall(r'\b\w+\b|\W+', self.text)
for i, token in enumerate(tokens):
# Check if token is a word
if re.match(r'^\w+$', token):
# Check if token is not an abbreviation or constant
if not re.match(r'^[A-Z]+$', token) and not re.match(r'^[A-Z_]+$', token):
tokens[i] = token.lower()
self.text = "".join(tokens)
return self
def num_to_word(self, min_len: int = 1):
# Match both words and non-word characters
tokens = re.findall(r'\b\w+\b|\W+', self.text)
for i, token in enumerate(tokens):
# Check if token is a number of length `min_len` or more
if token.isdigit() and len(token) >= min_len:
# This is done to pay better attention to numbers (e.g. ticket numbers, thread numbers, post numbers)
# 740700 will become "seven hundred and forty thousand seven hundred".
tokens[i] = num2words(int(token)).replace(",", "") # Remove commas from num2words.
self.text = "".join(tokens)
return self
def num_to_char_long(self, min_len: int = 1):
# Match both words and non-word characters
tokens = re.findall(r'\b\w+\b|\W+', self.text)
for i, token in enumerate(tokens):
# Check if token is a number of length `min_len` or more
if token.isdigit() and len(token) >= min_len:
# This is done to pay better attention to numbers (e.g. ticket numbers, thread numbers, post numbers)
# 740700 will become HHHHHHEEEEEAAAAHHHAAA
def convert_token(token):
return ''.join((chr(int(digit) + 65) * (i + 1)) for i, digit in enumerate(token[::-1]))[::-1]
tokens[i] = convert_token(tokens[i])
self.text = "".join(tokens)
return self
def num_to_char(self, min_len: int = 1):
# Match both words and non-word characters
tokens = re.findall(r'\b\w+\b|\W+', self.text)
for i, token in enumerate(tokens):
# Check if token is a number of length `min_len` or more
if token.isdigit() and len(token) >= min_len:
# This is done to pay better attention to numbers (e.g. ticket numbers, thread numbers, post numbers)
# 740700 will become HEAHAA
tokens[i] = ''.join(chr(int(digit) + 65) for digit in token)
self.text = "".join(tokens)
return self
def merge_spaces(self):
self.text = re.sub(' +', ' ', self.text)
return self
def strip(self):
self.text = self.text.strip()
return self
def remove_punctuation(self):
self.text = self.text.translate(str.maketrans('', '', string.punctuation))
return self
def remove_stopwords(self):
self.text = "".join([word for word in re.findall(r'\b\w+\b|\W+', self.text) if word not in TextPreprocessorBuilder._stop_words])
return self
def remove_specific_pos(self):
"""
In the English language, adverbs and interjections rarely provide meaningul information.
Removing them improves the embedding precision. Don't tell JK Rowling, though.
"""
processed_text = TextPreprocessorBuilder._pos_remove_cache.get(self.text)
if processed_text:
self.text = processed_text
return self
# Match both words and non-word characters
tokens = re.findall(r'\b\w+\b|\W+', self.text)
# Exclude adverbs and interjections
excluded_tags = ['RB', 'RBR', 'RBS', 'UH']
for i, token in enumerate(tokens):
# Check if token is a word
if re.match(r'^\w+$', token):
# Part-of-speech tag the word
pos = nltk.pos_tag([token])[0][1]
# If the word's POS tag is in the excluded list, remove the word
if pos in excluded_tags:
tokens[i] = ''
new_text = "".join(tokens)
TextPreprocessorBuilder._pos_remove_cache[self.text] = new_text
self.text = new_text
return self
def lemmatize(self):
processed_text = TextPreprocessorBuilder._lemmatizer_cache.get(self.text)
if processed_text:
self.text = processed_text
return self
new_text = "".join([TextPreprocessorBuilder._lemmatizer.lemmatize(word) for word in re.findall(r'\b\w+\b|\W+', self.text)])
TextPreprocessorBuilder._lemmatizer_cache[self.text] = new_text
self.text = new_text
return self
def build(self):
return self.text
class TextSummarizer:
_nlp_pipeline = None
_cache = {}
@staticmethod
def _load_nlp_pipeline():
# Lazy-load it.
if TextSummarizer._nlp_pipeline is None:
TextSummarizer._nlp_pipeline = spacy.load('en_core_web_sm')
TextSummarizer._nlp_pipeline.add_pipe("textrank", last=True)
return TextSummarizer._nlp_pipeline
@staticmethod
def process_long_text(text: str, min_num_sent: int) -> list[str]:
"""
This function applies a text summarization process on a given text string, extracting
the most important sentences based on the principle that 20% of the content is responsible
for 80% of the meaning (the Pareto Principle).
Returns:
list: A list of the most important sentences
"""
# Attempt to get the result from cache
cache_key = (text, min_num_sent)
cached_result = TextSummarizer._cache.get(cache_key, None)
if cached_result is not None:
return cached_result
nlp_pipeline = TextSummarizer._load_nlp_pipeline()
doc = nlp_pipeline(text)
num_sent = len(list(doc.sents))
result = []
if num_sent >= min_num_sent:
limit_phrases = math.ceil(len(doc._.phrases) * 0.20) # 20% of the phrases, rounded up
limit_sentences = math.ceil(num_sent * 0.20) # 20% of the sentences, rounded up
result = [str(sent) for sent in doc._.textrank.summary(limit_phrases=limit_phrases, limit_sentences=limit_sentences)]
else:
result = [text]
# Store the result in cache before returning it
TextSummarizer._cache[cache_key] = result
return result