Spaces:

Xenova
/

sponsorblock-ml

Running

Joshua Lochner

Remove redundant count

a6de017 almost 3 years ago

7.94 kB

	from utils import jaccard
	from datasets import load_dataset
	from transformers import (
	AutoModelForSeq2SeqLM,
	AutoTokenizer,
	HfArgumentParser
	)
	from preprocess import DatasetArguments, ProcessedArguments, get_words
	from shared import device, GeneralArguments
	from predict import ClassifierArguments, predict, TrainingOutputArguments
	from segment import word_start, word_end, SegmentationArguments, add_labels_to_words
	import pandas as pd
	from dataclasses import dataclass, field
	from typing import Optional
	from tqdm import tqdm
	import json
	import os
	import random


	def _clone_dataclass_field(owner, name):
	    """Return an independent shallow copy of dataclass field `name` of `owner`.

	    The `@dataclass` decorator stores the new attribute name and annotation
	    on whatever `Field` object it finds as a class-level default. Passing
	    it the very object held in `owner.__dataclass_fields__` would therefore
	    rename/retype that field on `owner` as well; copying first keeps
	    `owner` untouched while preserving the default and metadata.
	    """
	    import copy  # local import so this block does not rely on file-level changes

	    return copy.copy(owner.__dataclass_fields__[name])


	@dataclass
	class EvaluationArguments(TrainingOutputArguments):
	    """
	    Arguments that control an evaluation run: how many videos to test on,
	    where the evaluation dataset lives, and where the metrics are saved.
	    """

	    # Upper bound on the number of videos evaluated (None = no limit).
	    max_videos: Optional[int] = field(
	        default=None,
	        metadata={
	            'help': 'The number of videos to test on'
	        }
	    )

	    # Reuse the definitions (default + help text) declared on
	    # DatasetArguments, via copies so DatasetArguments itself is not mutated.
	    # `dataset` mirrors DatasetArguments' `validation_file` field.
	    data_dir: Optional[str] = _clone_dataclass_field(
	        DatasetArguments, 'data_dir')
	    dataset: Optional[str] = _clone_dataclass_field(
	        DatasetArguments, 'validation_file')

	    # Destination passed to DataFrame.to_csv for the per-video metrics.
	    output_file: Optional[str] = field(
	        default='metrics.csv',
	        metadata={
	            'help': 'Save metrics to output file'
	        }
	    )


	def attach_predictions_to_sponsor_segments(predictions, sponsor_segments):
	    """Pair each labelled sponsor segment with its best-overlapping prediction.

	    Both inputs are annotated in place:

	    * every prediction gets `best_overlap` and `best_sponsorship`
	    * every sponsor segment gets `best_overlap` and `best_prediction`

	    Overlap is the value returned by `jaccard` for the two
	    (start, end) intervals. A match is only recorded when the overlap is
	    strictly greater than the best seen so far, so a zero overlap never
	    produces a match and ties keep the earlier candidate.

	    Returns the same `sponsor_segments` object that was passed in.
	    """
	    # Reset the bookkeeping on every prediction before matching starts.
	    for candidate in predictions:
	        candidate['best_overlap'] = 0
	        candidate['best_sponsorship'] = None

	    # Compare every labelled segment against every prediction.
	    for segment in sponsor_segments:
	        segment['best_overlap'] = 0
	        segment['best_prediction'] = None

	        for candidate in predictions:
	            overlap = jaccard(
	                candidate['start'], candidate['end'],
	                segment['start'], segment['end'],
	            )

	            # Best prediction for this segment so far?
	            if overlap > segment['best_overlap']:
	                segment['best_overlap'] = overlap
	                segment['best_prediction'] = candidate

	            # Best segment for this prediction so far?
	            if overlap > candidate['best_overlap']:
	                candidate['best_overlap'] = overlap
	                candidate['best_sponsorship'] = segment

	    return sponsor_segments


	def _ratio_or_default(numerator, denominator, default):
	    """Return numerator / denominator, or `default` if the denominator is not positive."""
	    return (numerator / denominator) if denominator > 0 else default


	def calculate_metrics(labelled_words, predictions):
	    """Compute time-weighted classification metrics for one video.

	    Each word contributes the time between its own start and the start of
	    the next word; the final word therefore contributes nothing. A word
	    counts as "predicted sponsor" when its start lies inside any
	    prediction's [start, end] interval, and as "actual sponsor" when its
	    `category` is not None.

	    Args:
	        labelled_words: sequence of word dicts with at least `start` and
	            `category` keys. May be empty.
	        predictions: iterable of dicts with `start` and `end` keys.

	    Returns:
	        dict with the accumulated durations `true_positive`,
	        `false_negative`, `false_positive`, `true_negative`, plus
	        `video_duration`, `accuracy`, `precision`, `recall` and `f-score`.
	        For an empty `labelled_words` all durations are 0 and the ratios
	        take their division-by-zero defaults.
	    """
	    metrics = {
	        'true_positive': 0,  # Is sponsor, predicted sponsor
	        # Is sponsor, predicted not sponsor (i.e., missed it - bad)
	        'false_negative': 0,
	        # Is not sponsor, predicted sponsor (classified incorrectly, not that
	        # bad since we do manual checking afterwards)
	        'false_positive': 0,
	        'true_negative': 0,  # Is not sponsor, predicted not sponsor
	    }

	    # An empty transcript has no first/last word to measure; report 0
	    # instead of failing on labelled_words[-1].
	    if labelled_words:
	        metrics['video_duration'] = word_end(
	            labelled_words[-1]) - word_start(labelled_words[0])
	    else:
	        metrics['video_duration'] = 0

	    # Walk adjacent pairs so every word except the last gets a duration.
	    for word, next_word in zip(labelled_words, labelled_words[1:]):
	        # TODO make sure words with manual transcripts
	        duration = next_word['start'] - word['start']

	        # Is the word inside some prediction?
	        predicted_sponsor = any(
	            p['start'] <= word['start'] <= p['end'] for p in predictions)
	        actual_sponsor = word['category'] is not None

	        if predicted_sponsor:
	            key = 'true_positive' if actual_sponsor else 'false_positive'
	        else:
	            key = 'false_negative' if actual_sponsor else 'true_negative'
	        metrics[key] += duration

	    # NOTE In cases where we encounter division by 0, we say that the value is 1
	    # https://stats.stackexchange.com/a/1775
	    # (Precision) TP+FP=0: means that all instances were predicted as negative
	    # (Recall) TP+FN=0: means that there were no positive cases in the input data

	    true_positive = metrics['true_positive']

	    # The fraction of predictions our model got right
	    # Can simplify, but use full formula
	    total = true_positive + metrics['true_negative'] + \
	        metrics['false_positive'] + metrics['false_negative']
	    metrics['accuracy'] = _ratio_or_default(
	        true_positive + metrics['true_negative'], total, 1)

	    # What proportion of positive identifications was actually correct?
	    metrics['precision'] = _ratio_or_default(
	        true_positive, true_positive + metrics['false_positive'], 1)

	    # What proportion of actual positives was identified correctly?
	    metrics['recall'] = _ratio_or_default(
	        true_positive, true_positive + metrics['false_negative'], 1)

	    # https://deepai.org/machine-learning-glossary-and-terms/f-score
	    # Harmonic mean of precision and recall; 0 when both are 0.
	    metrics['f-score'] = _ratio_or_default(
	        2 * (metrics['precision'] * metrics['recall']),
	        metrics['precision'] + metrics['recall'],
	        0)

	    return metrics


	def main():
	    """Evaluate a trained model on labelled videos and save per-video metrics.

	    Steps:
	      1. Parse the command-line arguments into the argument dataclasses.
	      2. Load the model and tokenizer from `evaluation_args.model_path`.
	      3. Read the video ids from the JSON dataset, shuffle them and
	         optionally keep only the first `max_videos`.
	      4. For every video with a transcript, run `predict`, score the result
	         with `calculate_metrics`, and print any labelled segment that no
	         prediction overlaps.
	      5. Write all collected metrics to `evaluation_args.output_file` as CSV
	         and print the mean of every numeric column.

	    Pressing Ctrl+C during step 4 stops the loop early; the metrics gathered
	    up to that point are still written and summarised.
	    """
	    hf_parser = HfArgumentParser((
	        EvaluationArguments,
	        ProcessedArguments,
	        SegmentationArguments,
	        ClassifierArguments,
	        GeneralArguments
	    ))

	    # GeneralArguments is parsed only for its side effects / validation here.
	    evaluation_args, processed_args, segmentation_args, classifier_args, _ = \
	        hf_parser.parse_args_into_dataclasses()

	    model = AutoModelForSeq2SeqLM.from_pretrained(evaluation_args.model_path)
	    model.to(device())

	    tokenizer = AutoTokenizer.from_pretrained(evaluation_args.model_path)

	    dataset = load_dataset('json', data_files=os.path.join(
	        evaluation_args.data_dir, evaluation_args.dataset))['train']

	    video_ids = [row['video_id'] for row in dataset]
	    random.shuffle(video_ids)  # TODO Make param

	    if evaluation_args.max_videos is not None:
	        video_ids = video_ids[:evaluation_args.max_videos]

	    # Load labelled data:
	    final_path = os.path.join(
	        processed_args.processed_dir, processed_args.processed_file)

	    with open(final_path) as fp:
	        final_data = json.load(fp)

	    # Running sums used only for the live averages in the progress bar.
	    totals = {'accuracy': 0, 'precision': 0, 'recall': 0, 'f-score': 0}

	    out_metrics = []

	    try:
	        with tqdm(video_ids) as progress:
	            for video_id in progress:
	                progress.set_description(f'Processing {video_id}')
	                sponsor_segments = final_data.get(video_id, [])

	                words = get_words(video_id)
	                if not words:
	                    continue

	                # Make predictions
	                predictions = predict(video_id, model, tokenizer,
	                                      segmentation_args, words, classifier_args)

	                labelled_words = add_labels_to_words(words, sponsor_segments)
	                met = calculate_metrics(labelled_words, predictions)
	                met['video_id'] = video_id

	                out_metrics.append(met)

	                for name in totals:
	                    totals[name] += met[name]

	                progress.set_postfix({
	                    name: total / len(out_metrics)
	                    for name, total in totals.items()
	                })

	                labelled_predicted_segments = attach_predictions_to_sponsor_segments(
	                    predictions, sponsor_segments)
	                for seg in labelled_predicted_segments:
	                    if seg['best_prediction'] is None:
	                        print('\nNo match found for', seg)

	    except KeyboardInterrupt:
	        # Deliberate: an interrupted run still saves the metrics collected so far.
	        pass

	    df = pd.DataFrame(out_metrics)

	    df.to_csv(evaluation_args.output_file)
	    # `video_id` is a string column; without numeric_only=True recent pandas
	    # versions raise TypeError instead of silently skipping it.
	    print(df.mean(numeric_only=True))


	# Run the evaluation only when this file is executed as a script, so that
	# importing it (e.g. to reuse calculate_metrics) has no side effects.
	if __name__ == '__main__':
	    main()