Spaces:

mskov
/

Speech-Trigger-Detection

Runtime error

App Files Files Community

Speech-Trigger-Detection / app.py

mskov

Update app.py

6615174 about 1 year ago

raw

history blame

6.02 kB

	import os
	os.system("pip install git+https://github.com/openai/whisper.git")
	import evaluate
	from evaluate.utils import launch_gradio_widget
	import gradio as gr
	import torch
	import classify
	from whisper.model import Whisper
	from whisper.tokenizer import get_tokenizer
	from speechbrain.pretrained.interfaces import foreign_class
	from transformers import AutoModelForSequenceClassification, pipeline, WhisperTokenizer, RobertaForSequenceClassification, RobertaTokenizer, AutoTokenizer


	# pull in emotion detection
	# --- Add element for specification
	# pull in text classification
	# --- Add custom labels
	# --- Associate labels with radio elements
	# add logic to initiate mock notification when detected
	# pull in misophonia-specific model

	# Loaded models are stored here, keyed by model name, so they can be reused.
	model_cache = dict()

	# Maps the short emotion codes produced by the emotion classifier to the
	# display names returned to the user.
	emo_dict = dict(
	    sad="Sad",
	    hap="Happy",
	    ang="Anger",
	    neu="Neutral",
	)

	# Candidate labels per trigger category. The keys are the category names the
	# user picks from; the values are the labels scored for that category.
	# Static for now, but it would be best to have the user select from multiple
	# categories and to enter their own.
	class_options = {
	    "racism": [
	        "racism", "hate speech", "bigotry", "racially targeted",
	        "racially diminutive", "racial slur", "ethnic slur", "ethnic hate",
	        "pro-white nationalism",
	    ],
	    "LGBTQ+ hate": [
	        "gay slur", "trans slur", "homophobic slur", "transphobia",
	        # Spelling fixed from "anti-LBGTQ+": the label text is fed to the
	        # classifier and shown in its output.
	        "anti-LGBTQ+", "hate speech",
	    ],
	    "sexually explicit": [
	        "sexually explicit", "sexually coercive", "sexual exploitation",
	        "vulgar", "raunchy", "sexually demeaning", "sexual violence",
	        "victim blaming",
	    ],
	    "misophonia": [
	        "chewing", "breathing", "mouthsounds", "popping", "sneezing",
	        "yawning", "smacking", "sniffling", "panting",
	    ],
	}

	# Whisper speech-recognition pipeline, created once at import time and used
	# by the prediction function to transcribe uploaded audio.
	pipe = pipeline(
	    task="automatic-speech-recognition",
	    model="openai/whisper-large",
	)

	# Prediction function used by the Gradio interface; takes an audio file and/or text input
	def classify_toxicity(audio_file, text_input, classify_anxiety):
	    """Score an audio clip or a text snippet against the selected category.

	    Args:
	        audio_file: Path to an audio file, or ``None`` when only text is
	            supplied.
	        text_input: Text to analyse when ``audio_file`` is ``None``.
	        classify_anxiety: Selected category name; looked up in
	            ``class_options`` to obtain the candidate labels.

	    Returns:
	        For every category except ``"misophonia"``: the tuple
	        ``(toxicity_score, classification_output, emotion, transcribed_text)``.
	        ``emotion`` is ``None`` for text-only input (there is no audio to
	        analyse) and ``classification_output`` is ``None`` when the category
	        has no candidate labels.
	        For ``"misophonia"``: a dict mapping each candidate label to its score.

	    Raises:
	        ValueError: If ``"misophonia"`` is selected without an audio file.
	    """
	    if classify_anxiety == "misophonia":
	        return _classify_misophonia(audio_file, classify_anxiety)

	    # Transcribe the audio with the Whisper ASR pipeline, or fall back to the
	    # text the user typed.
	    if audio_file is not None:
	        transcribed_text = pipe(audio_file)["text"]
	    else:
	        transcribed_text = text_input or ""

	    #### Toxicity classifier ####
	    toxicity_module = _cached(
	        "toxicity",
	        lambda: evaluate.load(
	            "toxicity", "facebook/roberta-hate-speech-dynabench-r4-target"
	        ),
	    )
	    toxicity_results = toxicity_module.compute(predictions=[transcribed_text])
	    toxicity_score = toxicity_results["toxicity"][0]
	    print(toxicity_score)

	    #### Text classification ####
	    print(classify_anxiety, class_options)
	    candidate_labels = class_options.get(classify_anxiety, [])
	    if candidate_labels:
	        text_classifier = _cached(
	            "zero-shot",
	            lambda: pipeline(
	                "zero-shot-classification", model="facebook/bart-large-mnli"
	            ),
	        )
	        classification_output = text_classifier(
	            transcribed_text, candidate_labels, multi_label=True
	        )
	    else:
	        # No category selected (or an unknown one): there is nothing to score
	        # the text against, so skip the classifier instead of calling it
	        # with an empty label list.
	        classification_output = None
	    print(classification_output)

	    #### Emotion classification ####
	    # Only possible when audio was supplied; the classifier reads the file.
	    emotion = _classify_emotion(audio_file) if audio_file is not None else None

	    return toxicity_score, classification_output, emotion, transcribed_text


	def _cached(key, factory):
	    """Return ``model_cache[key]``, creating it with ``factory()`` on first use."""
	    if key not in model_cache:
	        model_cache[key] = factory()
	    return model_cache[key]


	def _classify_emotion(audio_file):
	    """Return the display name of the emotion detected in ``audio_file``."""
	    emotion_classifier = _cached(
	        "emotion",
	        lambda: foreign_class(
	            source="speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
	            pymodule_file="custom_interface.py",
	            classname="CustomEncoderWav2vec2Classifier",
	        ),
	    )
	    _out_prob, _score, _index, text_lab = emotion_classifier.classify_file(audio_file)
	    label = text_lab[0]
	    # Fall back to the raw code if the classifier emits one not in emo_dict.
	    return emo_dict.get(label, label)


	def _classify_misophonia(audio_file, classify_anxiety):
	    """Score ``audio_file`` against the candidate labels of the category."""
	    if audio_file is None:
	        raise ValueError("An audio file is required for misophonia classification.")

	    # Function-scope import: at file level only whisper submodules are
	    # imported, so the bare name `whisper` is otherwise undefined.
	    import whisper

	    model_name = "large"
	    model = _cached(model_name, lambda: whisper.load_model(model_name))

	    # Use the label list directly; joining with "," and splitting again
	    # appended an empty class name.
	    class_names = list(class_options.get(classify_anxiety, []))
	    print("class names ", class_names, "classify_anxiety ", classify_anxiety)

	    # NOTE(review): the `classify` helpers are not visible here. If they
	    # expect whisper's own tokenizer (see the unused `get_tokenizer` import),
	    # replace this with `get_tokenizer(multilingual=".en" not in model_name)`.
	    tokenizer = _cached(
	        "whisper-tokenizer",
	        lambda: WhisperTokenizer.from_pretrained("openai/whisper-large"),
	    )

	    internal_lm_average_logprobs = classify.calculate_internal_lm_average_logprobs(
	        model=model,
	        class_names=class_names,
	        tokenizer=tokenizer,
	    )
	    audio_features = classify.calculate_audio_features(audio_file, model)
	    average_logprobs = classify.calculate_average_logprobs(
	        model=model,
	        audio_features=audio_features,
	        class_names=class_names,
	        tokenizer=tokenizer,
	    )
	    # Remove the internal-LM scores computed above before normalising the
	    # per-class scores with a softmax.
	    average_logprobs -= internal_lm_average_logprobs
	    scores = average_logprobs.softmax(-1).tolist()
	    return dict(zip(class_names, scores))

	# Gradio UI: a category selector, audio/text inputs, a run button and one text
	# box that shows whatever classify_toxicity returns.
	with gr.Blocks() as iface:
	    with gr.Column():
	        # Choices are the keys of class_options so the selector cannot drift
	        # from the label table.
	        anxiety_class = gr.Radio(list(class_options))
	    with gr.Column():
	        aud_input = gr.Audio(source="upload", type="filepath", label="Upload Audio File")
	        text = gr.Textbox(label="Enter Text", placeholder="Enter text here...")
	        # The button caption is the `value` argument (first positional);
	        # `label` does not set it.
	        submit_btn = gr.Button("Run")
	    with gr.Column():
	        out_text = gr.Textbox()
	    submit_btn.click(
	        fn=classify_toxicity,
	        inputs=[aud_input, text, anxiety_class],
	        outputs=out_text,
	    )

	iface.launch()