from data_processing import load_data, spotify_data, path
import pandas 
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

#---------------------------Download the required NLTK resources--------------------------------

#nltk.download('punkt')
#nltk.download('stopwords')

# Lazily populated cache of the English stop words, filled on first use so
# the NLTK corpus is read once instead of once per cleaned lyric.
_STOP_WORDS_CACHE = None

# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE


def clean_lyrics(lyrics):
    """Tokenize a lyric string and return its normalized, filtered tokens.

    Processing steps, applied to each token produced by ``word_tokenize``:

    1. lower-case the token;
    2. delete every ``string.punctuation`` character from it;
    3. discard it if nothing is left (the token was pure punctuation) or if
       it is an English stop word.

    Args:
        lyrics: The raw lyric text. Any non-string value (for example a
            missing value in a DataFrame column) is treated as having no
            text.

    Returns:
        A list of cleaned tokens in their original order; empty when
        ``lyrics`` is not a string or contains no usable tokens.
    """
    # Missing lyrics would make word_tokenize fail; treat them as empty.
    if not isinstance(lyrics, str):
        return []

    stop_words = _english_stop_words()

    cleaned_tokens = []
    for token in word_tokenize(lyrics):
        word = token.lower().translate(_PUNCTUATION_TABLE)
        # `word` is empty when the token consisted only of punctuation.
        if word and word not in stop_words:
            cleaned_tokens.append(word)

    return cleaned_tokens

# Clean every entry of the 'text' column, store the resulting token lists in
# a new 'cleaned_text' column, then write the data to CSV without the index.
_lyrics_column = spotify_data['text']
spotify_data['cleaned_text'] = _lyrics_column.apply(clean_lyrics)
spotify_data.to_csv('spotify_data_processed.csv', index=False)

#print(spotify_data['cleaned_text'].head())