import os
import subprocess

from pytube import YouTube

from src.music.config import MAX_LEN
from src.music.utils import RATE_AUDIO_SAVE, slugify

# Define filtering keywords.
# Short keywords ('live', 'trio') are only matched when surrounded by one of
# these delimiters, so that they are not matched inside longer words.
start_keywords = [' ', '(', ',', ':']
end_keywords = [')', ' ', '.', ',', '!', ':']


def get_all_keywords(k):
    """Return `k` wrapped in every (start, end) delimiter combination.

    The result is ordered by start delimiter first, then by end delimiter,
    e.g. ' live)', ' live ', ' live.', ... '(live)', ...
    """
    return [s + k + e for s in start_keywords for e in end_keywords]


# A (lower-cased) title containing any of these substrings is a candidate for
# filtering in `should_be_filtered`.
filtered_keywords = ['duet', 'duo', 'quartet', 'orchestre', 'orchestra', 'quintet', 'sixtet', 'septet', 'octet',
                     'backing track', 'accompaniment', 'string', 'contrebrasse', 'drums', 'guitar'] \
                    + get_all_keywords('live') + get_all_keywords('trio')

# list of playlist for which no filtering should occur on keywords
# (they were prefiltered already, it's supposed to be only piano)
playlist_and_channel_not_to_filter = ["https://www.youtube.com/c/MySheetMusicTranscriptions",
                                      "https://www.youtube.com/c/PianoNotion",
                                      "https://www.youtube.com/c/PianoNotion",
                                      "https://www.youtube.com/watch?v=3F5glYefwio&list=PLFv3ZQw-ZPxi2DH3Bau7lBC5K6zfPJZxc",
                                      "https://www.youtube.com/user/Mercuziopianist",
                                      "https://www.youtube.com/channel/UCy6NPK6-xeX7MZLaMARa5qg",
                                      "https://www.youtube.com/channel/UCKMRNFV2dWTWIJnymtA9_Iw",
                                      "https://www.youtube.com/c/pianomaedaful",
                                      "https://www.youtube.com/c/FrancescoParrinoMusic",
                                      "https://www.youtube.com/c/itsremco"]
# A url that is a substring of this string is exempt from keyword filtering.
playlist_ok = "https://www.youtube.com/watch?v=sYv_vk6bJtk&list=PLO9E3V4rGLD9-0BEd3t-AvvMcVF1zOJPj"


def should_be_filtered(title, length, url, playlist_url, max_length):
    """Decide whether a video should be skipped, and say why.

    Args:
        title: video title; compared case-insensitively against
            `filtered_keywords`.
        length: video duration, in the same unit as `max_length` (the reason
            message divides by 60 to print minutes).
        url: url of the video, or None when it is not known.
        playlist_url: url of the playlist/channel the video comes from.
        max_length: maximum accepted duration.

    Returns:
        (to_filter, reason): a bool and a human-readable string listing every
        reason that applied ('' when the video is kept).
    """
    to_filter = False
    reason = ''
    lower_title = title.lower()

    if length > max_length:
        reason += f'it is too long (>{max_length/60:.1f} min), '
        to_filter = True

    has_keyword = any(keyword in lower_title for keyword in filtered_keywords)
    prefiltered_source = playlist_url in playlist_and_channel_not_to_filter
    # 'to live' / 'alive' titles are explicitly exempted from keyword filtering.
    exempt_title = 'to live' in lower_title or 'alive' in lower_title
    # `playlist_ok` is a string, so this is a substring test; it raises
    # TypeError for a non-string url (e.g. None), hence the isinstance guard.
    exempt_url = isinstance(url, str) and url in playlist_ok

    if has_keyword and not prefiltered_source and not exempt_title and not exempt_url:
        reason += 'it contains a filtered keyword, '
        to_filter = True

    return to_filter, reason


def convert_mp4_to_mp3(path, verbose=True):
    """Convert the mp4 at `path` to a mono mp3 next to it, then delete the mp4.

    The mp3 has the same path with the extension replaced, and is resampled to
    RATE_AUDIO_SAVE.

    Args:
        path: path to the source file; must end with '.mp4'.
        verbose: print progress messages.

    Raises:
        AssertionError: if `path` does not end with '.mp4'.
        subprocess.CalledProcessError: if ffmpeg exits with a non-zero status.
        OSError: if ffmpeg cannot be launched or the mp4 cannot be removed.
    """
    if verbose:
        print(f"Converting mp4 to mp3, in {path}\n")
    assert '.mp4' == path[-4:]
    mp3_path = path[:-4] + '.mp3'
    # Argument list (no shell): paths containing quotes, spaces or shell
    # metacharacters are passed to ffmpeg verbatim.
    command = ['ffmpeg', '-i', path,
               '-loglevel', 'panic',
               '-y',  # overwrite an existing output file
               '-ac', '1',  # single audio channel
               '-ar', str(int(RATE_AUDIO_SAVE)),  # output sampling rate
               mp3_path]
    try:
        # check=True: a failed conversion must not be reported as a success.
        subprocess.run(command, check=True)
    finally:
        # The mp4 is only an intermediate file: drop it whatever the outcome.
        if os.path.exists(path):
            os.remove(path)
    if verbose:
        print('\tDone.')


def pipeline_video(video, playlist_path, filename):
    """Download the audio of `video` and convert it to mp3.

    Args:
        video: object exposing a pytube-style `streams` query.
        playlist_path: directory in which the file is written.
        filename: file name without extension.

    Returns:
        The bitrate (kbps, int) of the downloaded audio stream.
    """
    # extract best stream for this video
    stream, kbps = extract_best_stream(video.streams)
    mp4_name = filename + '.mp4'
    stream.download(output_path=playlist_path, filename=mp4_name)
    # convert to mp3
    # os.path.join equals plain concatenation when playlist_path already ends
    # with a separator, and still points at the downloaded file when it does not.
    convert_mp4_to_mp3(os.path.join(playlist_path, mp4_name), verbose=False)
    return kbps


def extract_best_stream(streams):
    """Pick the audio-only stream and return it with its bitrate.

    Returns:
        (stream, kbps) where kbps is the integer parsed from `stream.abr` after
        dropping its last four characters (presumably a 'kbps' suffix).
    """
    # extract best audio stream
    stream_out = streams.get_audio_only()
    kbps = int(stream_out.abr[:-4])
    return stream_out, kbps


def get_title_and_length(video):
    """Return (title, slugified title used as file name, length, metadata)."""
    title = video.title
    filename = slugify(title)
    length = video.length
    return title, filename, length, video.metadata


def url2audio(playlist_path, video_url=None, video=None, playlist_url='', apply_filters=False, verbose=False, level=0):
    """Download one video as an mp3 into `playlist_path`.

    Args:
        playlist_path: output directory.
        video_url: url of the video; used to build the video object when
            `video` is not given, and recorded in the returned data.
        video: already-built video object (takes precedence over `video_url`).
        playlist_url: url of the originating playlist, used by the filters.
        apply_filters: when True, skip videos rejected by `should_be_filtered`.
        verbose: print progress messages.
        level: indentation (number of spaces) of the printed messages.

    Returns:
        (audio_path, data, error_msg):
        - downloaded: (path, dict describing the song, '')
        - mp3 already on disk: (path, None, '')
        - filtered out: (None, None, 'Filtered because ...')
        - failure: (None, None, message naming the step that failed)

    Raises:
        AssertionError: if neither `video_url` nor `video` is provided.
    """
    assert video_url is not None or video is not None, 'needs either video or url'
    error_msg = 'Error in loading video?'
    # Path of the file this call may have (partially) written; stays None until
    # the download starts, so the error handler never touches an unbound name
    # nor deletes a file that was already there before the call.
    created_path = None
    try:
        if not video:
            video = YouTube(video_url)
        error_msg += ' Nope. In extracting title and length?'
        title, filename, length, video_meta_data = get_title_and_length(video)

        if apply_filters:
            to_filter, reason = should_be_filtered(title, length, video_url, playlist_url, MAX_LEN)
            if to_filter:
                return None, None, f'Filtered because {reason}'

        audio_path = os.path.join(playlist_path, filename + ".mp3")
        if verbose:
            print(' ' * level + f'Downloading {title}, Url: {video_url}')

        if os.path.exists(audio_path):
            if verbose:
                print(' ' * (level + 2) + 'Song already downloaded')
            return audio_path, None, ''

        created_path = audio_path
        if length > MAX_LEN and verbose:
            # NOTE(review): nothing in this file trims the audio; confirm the
            # cut announced here happens elsewhere.
            print(' ' * (level + 2) + f'Long video ({int(length/60)} min), will be cut after {int(MAX_LEN/60)} min.')
        error_msg += ' Nope. In pipeline video?'
        kbps = None
        # Best-effort retries; only regular errors are swallowed so that
        # KeyboardInterrupt / SystemExit still stop the program.
        for _ in range(5):
            try:
                kbps = pipeline_video(video, playlist_path, filename)
                break
            except Exception:
                continue
        if kbps is None:
            # Explicit raise (not `assert`) so the check survives `python -O`.
            raise RuntimeError('all download attempts failed')
        error_msg += ' Nope. In dict filling?'
        data = dict(title=title, filename=filename, length=length, kbps=kbps, url=video_url, meta=video_meta_data)
        error_msg += ' Nope. '
        return audio_path, data, ''
    except Exception:
        if verbose:
            print(' ' * (level + 2) + f'Download failed with error {error_msg}')
        # Remove a possibly incomplete mp3 left behind by the failed attempt.
        if created_path is not None and os.path.exists(created_path):
            os.remove(created_path)
        return None, None, error_msg + ' Yes.'