File size: 5,243 Bytes
e775f6d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import os
import subprocess

from pytube import YouTube

from src.music.utils import RATE_AUDIO_SAVE, slugify
from src.music.config import MAX_LEN

# Define filtering keywords: a video whose lower-cased title contains one of
# the `filtered_keywords` substrings is rejected by the keyword filter.

# Delimiters allowed directly before / after a keyword built by
# get_all_keywords. Requiring them keeps 'live' and 'trio' from matching as
# plain substrings of longer words.
start_keywords = [' ', '(', ',', ':']
end_keywords = [')', ' ', '.', ',', '!', ':']


def get_all_keywords(k):
    """Return `k` wrapped in every (start, end) delimiter combination.

    The result is ordered by start delimiter first, then by end delimiter,
    e.g. ``' live)'``, ``' live '``, ``' live.'``, ...
    """
    return [s + k + e for s in start_keywords for e in end_keywords]


filtered_keywords = [
    'duet', 'duo', 'quartet', 'orchestre', 'orchestra',
    'quintet', 'sixtet', 'septet', 'octet', 'backing track', 'accompaniment', 'string',
    'contrebrasse', 'drums', 'guitar',
    # Correct spellings of 'sixtet' and 'contrebrasse' above. The misspelled
    # entries are kept so the list stays a superset of what it used to be.
    'sextet', 'contrebasse',
] + get_all_keywords('live') + get_all_keywords('trio')

# Playlists and channels for which no keyword filtering should occur (they
# were pre-filtered already and are supposed to contain only piano).
playlist_and_channel_not_to_filter = [
    "https://www.youtube.com/c/MySheetMusicTranscriptions",
    "https://www.youtube.com/c/PianoNotion",
    "https://www.youtube.com/watch?v=3F5glYefwio&list=PLFv3ZQw-ZPxi2DH3Bau7lBC5K6zfPJZxc",
    "https://www.youtube.com/user/Mercuziopianist",
    "https://www.youtube.com/channel/UCy6NPK6-xeX7MZLaMARa5qg",
    "https://www.youtube.com/channel/UCKMRNFV2dWTWIJnymtA9_Iw",
    "https://www.youtube.com/c/pianomaedaful",
    "https://www.youtube.com/c/FrancescoParrinoMusic",
    "https://www.youtube.com/c/itsremco",
]
# Single URL that is also exempt from keyword filtering: should_be_filtered
# skips the keyword check when the video url is contained in this string.
playlist_ok = "https://www.youtube.com/watch?v=sYv_vk6bJtk&list=PLO9E3V4rGLD9-0BEd3t-AvvMcVF1zOJPj"


def should_be_filtered(title, length, url, playlist_url, max_length):
    """Decide whether a video should be skipped, and say why.

    Args:
        title: Video title; matched case-insensitively against
            ``filtered_keywords``.
        length: Video duration, compared against ``max_length``.
        url: URL of the video, or None / '' when it is not known.
        playlist_url: URL of the playlist or channel the video comes from.
        max_length: Longest accepted duration (the reason text reports
            ``max_length / 60`` as minutes).

    Returns:
        ``(to_filter, reason)``: a bool, and a string listing each reason
        followed by ``', '`` (empty when the video is kept).
    """
    to_filter = False
    reason = ''
    lower_title = title.lower()
    if length > max_length:
        reason += f'it is too long (>{max_length/60:.1f} min), '
        to_filter = True
    # The keyword filter is skipped for whitelisted playlists/channels, for
    # titles containing 'to live' or 'alive', and for a whitelisted url.
    # `playlist_ok` is a single string, so `url in playlist_ok` is a substring
    # test: a None url would raise TypeError and an empty url would always
    # match, hence a missing url is never treated as whitelisted.
    if any(keyword in lower_title for keyword in filtered_keywords) \
            and playlist_url not in playlist_and_channel_not_to_filter \
            and 'to live' not in lower_title and 'alive' not in lower_title \
            and not (url and url in playlist_ok):
        reason += 'it contains a filtered keyword, '
        to_filter = True
    return to_filter, reason

def convert_mp4_to_mp3(path, verbose=True):
    """Convert the mp4 at `path` to a mono mp3 beside it, then delete the mp4.

    The mp3 is written to the same path with the '.mp4' extension replaced by
    '.mp3', overwriting any existing file (ffmpeg `-y`), with one audio
    channel (`-ac 1`) and a sampling rate of ``int(RATE_AUDIO_SAVE)`` (`-ar`).

    Args:
        path: Path of the source file; must end in '.mp4'.
        verbose: Print progress messages when True.

    Raises:
        AssertionError: If `path` does not end in '.mp4'.
        FileNotFoundError: If the ffmpeg executable cannot be found.
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero
            status. The source mp4 is left in place in that case.
    """
    if verbose:
        print(f"Converting mp4 to mp3, in {path}\n")
    assert '.mp4' == path[-4:]
    mp3_path = path[:-4] + ".mp3"
    # Pass an argument list (no shell) so that quotes, spaces or '$' in the
    # path reach ffmpeg verbatim instead of being interpreted by a shell.
    command = [
        'ffmpeg', '-i', path, '-loglevel', 'panic', '-y',
        '-ac', '1', '-ar', str(int(RATE_AUDIO_SAVE)), mp3_path,
    ]
    # check=True: a failed conversion must not fall through to os.remove and
    # destroy the only copy of the download.
    subprocess.run(command, check=True)
    os.remove(path)
    if verbose:
        print('\tDone.')

def pipeline_video(video, playlist_path, filename):
    """Download the audio of `video` and convert it to mp3.

    The stream chosen by extract_best_stream is downloaded into
    `playlist_path` as ``filename + '.mp4'``; the converter is then given
    ``playlist_path + filename + '.mp4'``.

    Returns:
        The bitrate (kbps) reported by extract_best_stream.
    """
    best_stream, bitrate = extract_best_stream(video.streams)
    mp4_name = filename + '.mp4'
    best_stream.download(output_path=playlist_path, filename=mp4_name)
    # The mp4 is only an intermediate file; conversion runs quietly.
    convert_mp4_to_mp3(playlist_path + mp4_name, verbose=False)
    return bitrate

def extract_best_stream(streams):
    """Select the audio-only stream and parse its bitrate.

    Returns:
        ``(stream, kbps)`` where `stream` is what ``streams.get_audio_only()``
        returns and `kbps` is its ``abr`` string converted to int after
        dropping the last four characters (presumably the 'kbps' suffix).
    """
    audio_stream = streams.get_audio_only()
    bitrate_text = audio_stream.abr
    return audio_stream, int(bitrate_text[:-4])

def get_title_and_length(video):
    """Collect the descriptive fields of `video`.

    Returns:
        ``(title, filename, length, metadata)`` where `filename` is the
        slugified title and the other values are read from the
        ``title``, ``length`` and ``metadata`` attributes of `video`.
    """
    title = video.title
    return title, slugify(title), video.length, video.metadata


def url2audio(playlist_path, video_url=None, video=None, playlist_url='', apply_filters=False, verbose=False, level=0):
    """Download one video as an mp3 under `playlist_path`.

    Args:
        playlist_path: Prefix of the output path. The mp3 path is built by
            plain concatenation (``playlist_path + filename + ".mp3"``).
        video_url: URL of the video; used to build the YouTube object when
            `video` is not given.
        video: Already constructed video object, used instead of `video_url`.
        playlist_url: Passed to should_be_filtered when filtering is on.
        apply_filters: Apply should_be_filtered (with MAX_LEN) before
            downloading.
        verbose: Print progress messages.
        level: Number of spaces used to indent the printed messages.

    Returns:
        ``(audio_path, data, error)``:
        - downloaded: the mp3 path, a dict describing the video, ``''``;
        - mp3 already on disk: the mp3 path, ``None``, ``''``;
        - filtered out: ``None, None, 'Filtered because ...'``;
        - failure: ``None, None`` and the trail of completed steps ending in
          ``' Yes.'`` after the step that failed.

    Raises:
        AssertionError: If neither `video_url` nor `video` is given.
    """
    assert video_url is not None or video is not None, 'needs either video or url'
    error_msg = 'Error in loading video?'
    # Path of the mp3 this call has started to produce. It stays None until
    # the download pipeline begins, so the error handler never touches an
    # unbound name and never deletes a file that existed beforehand.
    partial_output = None
    try:
        if not video:
            video = YouTube(video_url)
        error_msg += ' Nope. In extracting title and length?'
        title, filename, length, video_meta_data = get_title_and_length(video)

        to_filter, reason = False, ''
        if apply_filters:
            to_filter, reason = should_be_filtered(title, length, video_url, playlist_url, MAX_LEN)
        if to_filter:
            return None, None, f'Filtered because {reason}'

        audio_path = playlist_path + filename + ".mp3"
        if verbose:
            print(' ' * level + f'Downloading {title}, Url: {video_url}')
        if os.path.exists(audio_path):
            if verbose:
                print(' ' * (level + 2) + 'Song already downloaded')
            return audio_path, None, ''

        if length > MAX_LEN and verbose:
            print(' ' * (level + 2) + f'Long video ({int(length/60)} min), will be cut after {int(MAX_LEN/60)} min.')
        error_msg += ' Nope. In pipeline video?'
        partial_output = audio_path
        kbps = pipeline_video(video, playlist_path, filename)
        error_msg += ' Nope. In dict filling?'
        data = dict(title=title, filename=filename, length=length, kbps=kbps, url=video_url, meta=video_meta_data)
        error_msg += ' Nope. '
        return audio_path, data, ''
    except Exception:
        # `Exception` rather than a bare except, so KeyboardInterrupt and
        # SystemExit still stop a batch of downloads.
        if verbose:
            print(' ' * (level + 2) + f'Download failed with error {error_msg}')
        if partial_output is not None and os.path.exists(partial_output):
            os.remove(partial_output)
        return None, None, error_msg + ' Yes.'