pszemraj committed on
Commit
ca983bc
1 Parent(s): f84fce9

⚗️ ⚡️ better stopwords and splitting

Browse files

Signed-off-by: peter szemraj <[email protected]>

Files changed (1) hide show
  1. utils.py +39 -21
utils.py CHANGED
@@ -19,11 +19,11 @@ logging.basicConfig(
19
 
20
  import torch
21
  from natsort import natsorted
22
- from nltk.tokenize import word_tokenize, WhitespaceTokenizer
23
  from rapidfuzz import fuzz
24
 
25
  STOPWORDS = set(
26
- "a about above after again against all am an and any are aren't as at be because been before being below between both but by can't cannot could couldn't did didn't do does doesn't doing don't down during each few for from further had hadn't has hasn't have haven't having he he'd he'll he's her here here's hers herself him himself his how how's i i'd i'll i'm i've if in into is isn't it it's its itself let's me more most mustn't my myself no nor not of off on once only or other ought our ours ourselves out over own same shan't she she'd she'll she's should shouldn't so some such than that that's the their theirs them themselves then there there's these they they'd they'll they're they've this those through to too under until up very was wasn't we we'd we'll we're we've were weren't what what's when when's where where's which while who who's whom why why's with won't would wouldn't you you'd you'll you're you've your yours yourself yourselves".split()
27
  )
28
 
29
 
@@ -66,30 +66,48 @@ def remove_stopwords(
66
  :param bool contraction_tokenize: use custom apostrophe tokenizer, defaults to True
67
  :return str: text with stopwords removed
68
  """
69
- words = (
70
- contraction_aware_tokenize(text)
71
- if contraction_tokenize
72
- else word_tokenize(text)
73
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
 
75
- filtered_words = []
76
- for word in words:
77
- # Remove leading and trailing punctuation marks
78
- word = word.strip(string.punctuation)
79
 
80
- if word.lower() not in stopwords:
81
- filtered_words.append(word)
 
82
 
83
- filtered_text = " ".join(filtered_words)
84
 
85
- # Replace multiple consecutive whitespaces with a single space
86
- filtered_text = re.sub(r"\s+", " ", filtered_text)
87
- filtered_text = filtered_text.strip()
88
 
89
- # Restore original whitespaces around punctuation marks
90
- filtered_text = re.sub(
91
- r"\s*([{}])\s*".format(re.escape(string.punctuation)), r"\1", filtered_text
92
- )
 
 
 
93
 
94
  return filtered_text
95
 
 
19
 
20
  import torch
21
  from natsort import natsorted
22
+ from nltk.tokenize import word_tokenize, WhitespaceTokenizer, sent_tokenize
23
  from rapidfuzz import fuzz
24
 
25
  STOPWORDS = set(
26
+ "a about above after again all also am an and any are aren't as at back be because been before being below between both but by can't cannot could couldn't did didn't do does doesn't doing don't down during each few for from further had hadn't has hasn't have haven't having he'd he'll he's hence her here here's hers herself him himself his how how's however i'd i'll i'm i've if in into is isn't it's its itself just let's me more moreover most mustn't my myself new nor now of off on once only or other ought our ours ourselves out over own really same shan't she'd she'll she's should shouldn't so some such than that that's the their theirs them themselves then there there's therefore these they they'd they'll they're they've this those through thus to too under until up use used using very was wasn't we we'd we'll we're we've were weren't what what's when when's where where's which while who who's whom why why's with won't would wouldn't you'd you'll you're you've your yours yourself yourselves".split()
27
  )
28
 
29
 
 
66
  :param bool contraction_tokenize: use custom apostrophe tokenizer, defaults to True
67
  :return str: text with stopwords removed
68
  """
69
+ lines = text.split("\n")
70
+ filtered_lines = []
71
+
72
+ def fix_commas(text: str) -> str:
73
+ """fixes commas in text to have a space after them"""
74
+ spaced_text = text.replace(",", ", ")
75
+ return spaced_text.replace(" ", " ").strip()
76
+
77
+ for line in lines:
78
+ sentences = sent_tokenize(line)
79
+ filtered_sentences = []
80
+
81
+ for sentence in sentences:
82
+ # Add space around punctuations for the regex to work correctly, only if they are followed by a letter
83
+ sentence_with_spaces = re.sub(r"([.,!?])(\w)", r"\1 \2", sentence[:-1])
84
+
85
+ words = (
86
+ contraction_aware_tokenize(sentence_with_spaces)
87
+ if contraction_tokenize
88
+ else word_tokenize(sentence_with_spaces)
89
+ )
90
 
91
+ filtered_words = []
92
+ for word in words:
93
+ if word.lower() not in stopwords:
94
+ filtered_words.append(word)
95
 
96
+ filtered_sentence = " ".join(filtered_words)
97
+ # Restore original spaces around punctuation marks
98
+ filtered_sentence = re.sub(r"([.,!?])\s*", r"\1", filtered_sentence)
99
 
100
+ filtered_sentences.append(filtered_sentence + sentence[-1])
101
 
102
+ filtered_line = " ".join(filtered_sentences)
 
 
103
 
104
+ # Replace multiple consecutive whitespaces with a single space
105
+ filtered_line = re.sub(r"\s+", " ", filtered_line)
106
+ filtered_line = fix_commas(filtered_line.strip())
107
+
108
+ filtered_lines.append(filtered_line)
109
+
110
+ filtered_text = "\n".join(filtered_lines)
111
 
112
  return filtered_text
113