Diego-0121 committed on
Commit
5178166
1 Parent(s): ed8b14c

Update tokenizer.py

Browse files
Files changed (1) hide show
  1. tokenizer.py +34 -0
tokenizer.py CHANGED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from data_processing import load_data, spotify_data, path
2
+ import pandas
3
+ import nltk
4
+ from nltk.corpus import stopwords
5
+ from nltk.tokenize import word_tokenize
6
+ import string
7
+
8
+ #---------------------------Download the requirements NLTK--------------------------------
9
+
10
+ #nltk.download('punkt')
11
+ #nltk.download('stopwords')
12
+
13
def clean_lyrics(lyrics):
    """Tokenize song lyrics and return a list of cleaned tokens.

    Steps: word-tokenize, lowercase, strip punctuation characters,
    remove English stop words, and drop tokens that become empty after
    punctuation stripping.

    Requires the nltk 'punkt' and 'stopwords' data to be downloaded
    (see the commented nltk.download calls at the top of this file).

    Args:
        lyrics (str): raw lyric text.

    Returns:
        list[str]: cleaned, lowercased tokens with punctuation and
        English stop words removed.
    """
    # Tokenization
    tokens = word_tokenize(lyrics)

    # To lower case
    tokens = [word.lower() for word in tokens]

    # Delete punctuation characters. translate() turns punctuation-only
    # tokens (e.g. ',' or "''" emitted by word_tokenize) into '' strings.
    table = str.maketrans('', '', string.punctuation)
    stripped_tokens = [word.translate(table) for word in tokens]

    # Stop words — also drop the empty strings produced above.
    # (Fix: the original kept '' tokens for punctuation-only words,
    # polluting the cleaned output.)
    stop_words = set(stopwords.words('english'))
    return [word for word in stripped_tokens if word and word not in stop_words]
29
+
30
# Run the cleaning step over every lyric and persist the result.
# NOTE(review): assumes spotify_data has a 'text' column — confirm in data_processing.
cleaned_lyrics = spotify_data['text'].apply(clean_lyrics)
spotify_data['cleaned_text'] = cleaned_lyrics
spotify_data.to_csv('spotify_data_processed.csv', index=False)
33
+
34
+ #print(spotify_data['cleaned_text'].head())