File size: 1,615 Bytes
c6d8763
 
 
 
16e821f
 
 
c6d8763
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16e821f
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
from flask import Flask, render_template, request, jsonify
import torch
from transformers import pipeline
import gradio as gr

app = Flask(__name__)

# Automatic speech recognition pipeline (Whisper large-v3), run in fp16.
# NOTE(review): device is hard-coded to "cuda:0" — this will crash on a
# CPU-only host; confirm the deployment target before shipping.
pipe = pipeline("automatic-speech-recognition",
               "openai/whisper-large-v3",
               torch_dtype=torch.float16,
               device="cuda:0")

# Emotion classification pipeline. `top_k=None` returns the score for every
# label and replaces the deprecated `return_all_scores=True` flag while
# producing the same list-of-label-score-dicts output.
emotion_classifier = pipeline(
    "text-classification",
    model="j-hartmann/emotion-english-distilroberta-base",
    top_k=None
)

def transcribe(audio_file, task):
    """Run Whisper speech recognition on *audio_file*.

    Parameters
    ----------
    audio_file : bytes, str, file-like, or None
        Raw audio bytes, a filesystem path, or an uploaded file object.
        ``None`` yields a user-facing prompt string instead of a result.
    task : str
        Whisper task forwarded to the pipeline (e.g. "transcribe" or
        "translate").

    Returns
    -------
    str
        The recognized text, or a prompt message when no audio was given.
    """
    if audio_file is None:
        return "Please upload or record an audio file."

    if isinstance(audio_file, (bytes, str)):
        # Raw bytes (drag-and-drop) or an on-disk path: the pipeline
        # accepts both directly.
        source = audio_file
    elif hasattr(audio_file, "read"):
        # File-like upload (e.g. werkzeug FileStorage from the Flask
        # endpoint). The original code passed `audio_file.name`, which for
        # Flask uploads is the *form field* name, not a real path — read
        # the raw bytes instead so the pipeline gets valid input.
        source = audio_file.read()
    else:
        # Fall back to `.name` for objects that expose a temp-file path
        # (e.g. gradio file wrappers).
        source = audio_file.name

    return pipe(source, generate_kwargs={"task": task}, return_timestamps=True)["text"]

@app.route('/')
def index():
    """Serve the single-page UI from the index template."""
    template_name = 'index.html'
    return render_template(template_name)

@app.route('/transcribe', methods=['POST'])
def transcribe_endpoint():
    """Accept a POSTed audio file and return its transcription as JSON."""
    uploaded = request.files.get('audio_file')
    requested_task = request.form.get('task')
    transcription = transcribe(uploaded, requested_task)
    return jsonify({'text': transcription})

@app.route('/classify_emotion', methods=['POST'])
def classify_emotion_endpoint():
    """Classify the emotion of POSTed text and return label scores as JSON.

    Responds with HTTP 400 when the 'text' form field is missing or empty,
    instead of crashing the classifier with ``None`` input.
    """
    text = request.form.get('text')
    if not text:
        return jsonify({'error': "Missing 'text' form field."}), 400
    result = emotion_classifier(text)
    return jsonify(result)

if __name__ == '__main__':
    # Development entry point only: debug=True enables the Werkzeug
    # reloader and interactive debugger — never run with this in production.
    app.run(debug=True)