# Harmonize / tokenizer.py
import string

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from data_processing import spotify_data
# --------------------- Download the required NLTK resources (first run only) ---------------------
# nltk.download('punkt')
# nltk.download('stopwords')
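
# Optional convenience (a sketch, not part of the original script): check for the
# resources and fetch them only when missing, instead of downloading manually.
def ensure_nltk_resources():
    """Download the punkt tokenizer and stopword list if they are not installed."""
    for resource_path, name in (('tokenizers/punkt', 'punkt'),
                                ('corpora/stopwords', 'stopwords')):
        try:
            nltk.data.find(resource_path)
        except LookupError:
            nltk.download(name, quiet=True)

ensure_nltk_resources()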
def clean_lyrics(lyrics):
    # Tokenization
    tokens = word_tokenize(lyrics)
    # Lowercase every token
    tokens = [word.lower() for word in tokens]
    # Strip punctuation, then drop tokens that become empty (e.g. "," -> "")
    table = str.maketrans('', '', string.punctuation)
    stripped_tokens = [word.translate(table) for word in tokens]
    stripped_tokens = [word for word in stripped_tokens if word]
    # Remove English stop words
    stop_words = set(stopwords.words('english'))
    tokens_without_sw = [word for word in stripped_tokens if word not in stop_words]
    return tokens_without_sw
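
# A quick sanity check on a made-up line (hypothetical input, not from the dataset);
# with NLTK's English stop word list this should yield ['hello', 'looking']:
# print(clean_lyrics("Hello, is it me you're looking for?"))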
# Apply the cleaning step to every lyric and save the result
spotify_data['cleaned_text'] = spotify_data['text'].apply(clean_lyrics)
spotify_data.to_csv('spotify_data_processed.csv', index=False)
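
# Note: 'cleaned_text' holds Python lists, so to_csv writes them as list reprs
# like "['hello', 'looking']". If downstream code expects plain text, join the
# tokens back into a string first (an optional step, not in the original script):
# spotify_data['cleaned_text'] = spotify_data['cleaned_text'].str.join(' ')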
#print(spotify_data['cleaned_text'].head())