# Harmonize / tokenizer.py
import string

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from data_processing import spotify_data
# --------------------- Download the required NLTK resources (first run only) ---------------------
# nltk.download('punkt')
# nltk.download('stopwords')
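
# Optional convenience (a sketch, not part of the original script): check for the
# resources and fetch them only when missing, instead of downloading manually.
def ensure_nltk_resources():
    """Download the punkt tokenizer and stopword list if they are not installed."""
    for resource_path, name in (('tokenizers/punkt', 'punkt'),
                                ('corpora/stopwords', 'stopwords')):
        try:
            nltk.data.find(resource_path)
        except LookupError:
            nltk.download(name, quiet=True)

ensure_nltk_resources()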
def clean_lyrics(lyrics):
    # Tokenization
    tokens = word_tokenize(lyrics)
    # Lowercase every token
    tokens = [word.lower() for word in tokens]
    # Strip punctuation, then drop tokens that become empty (e.g. "," -> "")
    table = str.maketrans('', '', string.punctuation)
    stripped_tokens = [word.translate(table) for word in tokens]
    stripped_tokens = [word for word in stripped_tokens if word]
    # Remove English stop words
    stop_words = set(stopwords.words('english'))
    tokens_without_sw = [word for word in stripped_tokens if word not in stop_words]
    return tokens_without_sw
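
# A quick sanity check on a made-up line (hypothetical input, not from the dataset);
# with NLTK's English stop word list this should yield ['hello', 'looking']:
# print(clean_lyrics("Hello, is it me you're looking for?"))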
# Apply the cleaning step to every lyric and save the result
spotify_data['cleaned_text'] = spotify_data['text'].apply(clean_lyrics)
spotify_data.to_csv('spotify_data_processed.csv', index=False)
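
# Note: 'cleaned_text' holds Python lists, so to_csv writes them as list reprs
# like "['hello', 'looking']". If downstream code expects plain text, join the
# tokens back into a string first (an optional step, not in the original script):
# spotify_data['cleaned_text'] = spotify_data['cleaned_text'].str.join(' ')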
#print(spotify_data['cleaned_text'].head())