RexChan committed on
Commit
74372b3
1 Parent(s): cd54693

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +82 -0
app.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from transformers import WhisperForConditionalGeneration, WhisperProcessor
3
+ from transformers import pipeline
4
+ import librosa
5
+ import torch
6
+ from spleeter.separator import Separator
7
+ from pydub import AudioSegment
8
+ from IPython.display import Audio
9
+ import os
10
+ import accelerate
11
+
12
+
13
+ # Destination directory handed to Spleeter; the separated stems are written beneath it
14
+ output_file = "/content/"
15
+
16
+ # preprocess and crop audio file
17
def audio_preprocess(input_file=None, start_time=60000, end_time=110000):
    """Separate the vocals from a song and save a cropped excerpt of them.

    Spleeter's 2-stem model splits the song into stems under the module-level
    ``output_file`` directory; the vocal stem is then sliced to
    ``[start_time, end_time)`` and exported as a WAV file.

    Args:
        input_file: Path of the song on disk. When omitted, the module-level
            name ``input_file`` is used if it has been set, which keeps the
            original zero-argument call form working.
        start_time: Start of the excerpt in milliseconds (default 60 s).
        end_time: End of the excerpt in milliseconds (default 110 s).

    Returns:
        Path of the exported cropped vocal file.

    Raises:
        ValueError: If no input file is available from either route.
        TypeError: If the input is not a ``str`` / path-like object.
    """
    if input_file is None:
        # The original body read a bare ``input_file`` name that no code in
        # this module defines globally; look it up explicitly so a missing
        # value becomes a clear error instead of a NameError.
        input_file = globals().get("input_file")
    if input_file is None:
        raise ValueError("audio_preprocess() needs the path of the song to process")

    song_path = os.fspath(input_file)

    # separate music and vocal
    separator = Separator('spleeter:2stems')
    separator.separate_to_file(song_path, output_file)

    # Spleeter stores the stems in <destination>/<input name without extension>/,
    # so derive that folder from the input instead of hard-coding one song name.
    song_name = os.path.splitext(os.path.basename(song_path))[0]
    vocals_path = os.path.join(output_file, song_name, 'vocals.wav')

    # Crop the audio; pydub slices are indexed in milliseconds.
    audio = AudioSegment.from_file(vocals_path)
    cropped_audio = audio[start_time:end_time]

    cropped_path = '/content/cropped_vocals.wav'
    cropped_audio.export(cropped_path, format='wav')  # save vocal audio file
    return cropped_path
29
+
30
+
31
+ # ASR transcription
32
def asr_model(audio_path='/content/cropped_vocals.wav'):
    """Transcribe the cropped vocal track with the fine-tuned Whisper model.

    Args:
        audio_path: Audio file to transcribe. Defaults to the location that
            audio_preprocess() exports the cropped vocals to; the previous
            relative name ``'cropped_vocals.wav'`` only resolved when the
            working directory happened to be ``/content``.

    Returns:
        The decoded transcription of the first (only) generated sequence.
    """
    # load audio file, resampled to 16 kHz
    y, sr = librosa.load(audio_path, sr=16000)

    # ASR model
    MODEL_NAME = "RexChan/ISOM5240-whisper-small-zhhk_1"
    processor = WhisperProcessor.from_pretrained(MODEL_NAME)
    model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME, low_cpu_mem_usage=True)

    # Clear the decoding constraints stored in the checkpoint config.
    model.config.forced_decoder_ids = None
    model.config.suppress_tokens = []
    model.config.use_cache = False

    # NOTE(review): Whisper's feature extractor presumably pads/truncates to a
    # 30-second window, so a longer clip would lose its tail -- confirm whether
    # chunked decoding is needed for the 50-second default crop.
    processed_in = processor(y, sampling_rate=sr, return_tensors="pt")
    gout = model.generate(
        input_features=processed_in.input_features,
        output_scores=True, return_dict_in_generate=True
    )
    transcription = processor.batch_decode(gout.sequences, skip_special_tokens=True)[0]

    # print result
    print(f"Song lyrics = {transcription}")

    return transcription
56
+
57
+
58
+ # sentiment analysis
59
def senti_model(transcription):
    """Run sentiment classification on the transcribed lyrics.

    Args:
        transcription: Text to classify.

    Returns:
        The raw output of the text-classification pipeline; its first entry
        carries the ``'label'`` and ``'score'`` that are printed here.
    """
    classifier = pipeline(
        "text-classification",
        model="lxyuan/distilbert-base-multilingual-cased-sentiments-student",
    )
    final_result = classifier(transcription)

    # Report the top prediction with its score as a percentage.
    top = final_result[0]
    label = top['label']
    confidence_pct = top['score'] * 100
    print(f"Sentiment Analysis shows that this song is {label}. Confident level of this analysis is {confidence_pct:.1f}%.")

    return final_result
66
+
67
+
68
+ # main
69
def main():
    """Streamlit entry point: upload a song, transcribe it, analyse sentiment.

    Nothing is processed until a file has been uploaded. The upload is written
    to disk because the separation step works on a file path, and the results
    are rendered in the page as well as printed by the helper functions.
    """
    # audio_preprocess() is called without arguments and finds the song
    # through the module-level name ``input_file``, so the saved path is
    # published there.
    global input_file

    uploaded_song = st.file_uploader("upload a song in mp3 format", type="mp3")
    if uploaded_song is None:
        st.write("No file uploaded.")
        # Streamlit reruns the script on every interaction; without this
        # early exit the pipeline would run with no input at all.
        return
    st.write("File uploaded successfully!")

    # Persist the in-memory upload. The name "test1" matches the stem folder
    # ('/content/test1/vocals.wav') that the preprocessing step reads from.
    os.makedirs(output_file, exist_ok=True)
    input_file = os.path.join(output_file, "test1.mp3")
    with open(input_file, "wb") as song_file:
        song_file.write(uploaded_song.getvalue())

    audio_preprocess()
    transcription = asr_model()
    final_result = senti_model(transcription)

    # Show the results in the app; the helpers only print to the console.
    top = final_result[0]
    st.write(f"Song lyrics = {transcription}")
    st.write(
        f"Sentiment Analysis shows that this song is {top['label']}. "
        f"Confident level of this analysis is {top['score']*100:.1f}%."
    )
79
+
80
+
81
+ if __name__ == '__main__':  # only start the app when this file is run as a script
82
+ main()