|
import functools
import string

import nltk
import pandas
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from data_processing import load_data, spotify_data, path
|
|
|
|
|
|
|
|
|
|
|
|
|
@functools.lru_cache(maxsize=None)
def _english_stop_words():
    """Return NLTK's English stop-word list as a frozenset, built only once."""
    return frozenset(stopwords.words('english'))


# Translation table that deletes every character in ``string.punctuation``
# (ASCII punctuation only). Built once instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_lyrics(lyrics):
    """Tokenise a lyric string into lowercase, punctuation-free content words.

    The text is split with ``word_tokenize``; each token is lowercased and
    stripped of ASCII punctuation; tokens that end up empty (they consisted
    only of punctuation) or that are English stop words are discarded.

    Args:
        lyrics: The raw lyric text. Any non-string value (for example a
            missing value such as ``None`` or NaN) is treated as having
            no lyrics.

    Returns:
        A list of cleaned tokens in their original order. The list is empty
        for empty or non-string input.
    """
    # Missing lyrics would otherwise make word_tokenize raise on a non-string.
    if not isinstance(lyrics, str):
        return []

    stop_words = _english_stop_words()

    # Lowercase before the stop-word check: the stop-word list is lowercase.
    stripped_tokens = (
        token.lower().translate(_PUNCTUATION_TABLE)
        for token in word_tokenize(lyrics)
    )

    # `word` is falsy when the token was pure punctuation; drop those too,
    # so the result never contains '' entries.
    return [
        word for word in stripped_tokens
        if word and word not in stop_words
    ]
|
|
|
|
|
# Run the cleaner over every row of the 'text' column, store the token lists
# in a new 'cleaned_text' column, then write the augmented frame to disk
# without the index column.
_cleaned_column = spotify_data['text'].apply(clean_lyrics)
spotify_data['cleaned_text'] = _cleaned_column

spotify_data.to_csv(
    'spotify_data_processed.csv',
    index=False,
)
|
|
|
|
|
|