import itertools
import base64
import re
import requests
from model import get_model_tokenizer
from utils import jaccard
from datasets import load_dataset
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    HfArgumentParser
)
from preprocess import DatasetArguments, get_words
from shared import device, seconds_to_time, GeneralArguments
from predict import ClassifierArguments, predict, TrainingOutputArguments
from segment import (
    extract_segment,
    word_start,
    word_end,
    SegmentationArguments,
    add_labels_to_words
)
import pandas as pd
from dataclasses import dataclass, field
from typing import Optional
from tqdm import tqdm
import json
import os
import random
from urllib.parse import quote


@dataclass
class EvaluationArguments(TrainingOutputArguments):
    """
    Arguments pertaining to the model/config/tokenizer we are going to evaluate.
    """
    max_videos: Optional[int] = field(
        default=None,
        metadata={
            'help': 'The number of videos to test on'
        }
    )
    start_index: Optional[int] = field(
        default=None,
        metadata={
            'help': 'Video to start the evaluation at.'
        }
    )
    output_file: Optional[str] = field(
        default='metrics.csv',
        metadata={
            'help': 'Save metrics to output file'
        }
    )
    channel_id: Optional[str] = field(
        default=None,
        metadata={
            'help': 'Used to evaluate a channel'
        }
    )


def attach_predictions_to_sponsor_segments(predictions, sponsor_segments):
    """Match labelled sponsor segments and predictions by best Jaccard overlap."""
    for prediction in predictions:
        prediction['best_overlap'] = 0
        prediction['best_sponsorship'] = None

    # Assign predictions to actual (labelled) sponsored segments
    for sponsor_segment in sponsor_segments:
        sponsor_segment['best_overlap'] = 0
        sponsor_segment['best_prediction'] = None

        for prediction in predictions:
            j = jaccard(prediction['start'], prediction['end'],
                        sponsor_segment['start'], sponsor_segment['end'])
            if sponsor_segment['best_overlap'] < j:
                sponsor_segment['best_overlap'] = j
                sponsor_segment['best_prediction'] = prediction

            if prediction['best_overlap'] < j:
                prediction['best_overlap'] = j
                prediction['best_sponsorship'] = sponsor_segment

    return sponsor_segments


def calculate_metrics(labelled_words, predictions):
    metrics = {
        'true_positive': 0,  # Is sponsor, predicted sponsor
        # Is sponsor, predicted not sponsor (i.e., missed it - bad)
        'false_negative': 0,
        # Is not sponsor, predicted sponsor (classified incorrectly;
        # not that bad since we do manual checking afterwards)
        'false_positive': 0,
        'true_negative': 0,  # Is not sponsor, predicted not sponsor
    }

    metrics['video_duration'] = word_end(
        labelled_words[-1]) - word_start(labelled_words[0])

    # Accumulate durations rather than word counts, so that long segments
    # weigh more than short ones
    for index, word in enumerate(labelled_words[:-1]):
        # TODO make sure words with manual transcripts
        duration = labelled_words[index + 1]['start'] - word['start']

        # Check whether the word falls inside some prediction
        predicted_sponsor = False
        for p in predictions:
            if p['start'] <= word['start'] <= p['end']:
                predicted_sponsor = True
                break

        if predicted_sponsor:
            if word.get('category') is not None:  # Is an actual sponsor
                metrics['true_positive'] += duration
            else:
                metrics['false_positive'] += duration
        else:
            if word.get('category') is not None:  # Is an actual sponsor
                metrics['false_negative'] += duration
            else:
                metrics['true_negative'] += duration

    # NOTE In cases where we encounter division by 0, we say that the value is 1
    # https://stats.stackexchange.com/a/1775
    # (Precision) TP+FP=0: means that all instances were predicted as negative
    # (Recall) TP+FN=0: means that there were no positive cases in the input data
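    # As a hypothetical sanity check of the formulas below (these numbers are
    # made up, measured in seconds of video time): with TP=40, TN=500, FP=10
    # and FN=20, we would get
    #   accuracy  = (40 + 500) / 570 ~ 0.947
    #   precision = 40 / (40 + 10)   = 0.800
    #   recall    = 40 / (40 + 20)   ~ 0.667
    #   f-score   = 2 * (0.8 * 0.667) / (0.8 + 0.667) ~ 0.727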
    # The fraction of predictions our model got right
    # (could be simplified, but we use the full formula)
    z = metrics['true_positive'] + metrics['true_negative'] + \
        metrics['false_positive'] + metrics['false_negative']
    metrics['accuracy'] = ((metrics['true_positive'] +
                            metrics['true_negative']) / z) if z > 0 else 1

    # What proportion of positive identifications was actually correct?
    z = metrics['true_positive'] + metrics['false_positive']
    metrics['precision'] = (metrics['true_positive'] / z) if z > 0 else 1

    # What proportion of actual positives was identified correctly?
    z = metrics['true_positive'] + metrics['false_negative']
    metrics['recall'] = (metrics['true_positive'] / z) if z > 0 else 1

    # https://deepai.org/machine-learning-glossary-and-terms/f-score
    s = metrics['precision'] + metrics['recall']
    metrics['f-score'] = (
        2 * metrics['precision'] * metrics['recall'] / s) if s > 0 else 0

    return metrics


# Public Innertube API key (base64-encoded so that it is not incorrectly flagged)
INNERTUBE_KEY = base64.b64decode(
    b'QUl6YVN5QU9fRkoyU2xxVThRNFNURUhMR0NpbHdfWTlfMTFxY1c4').decode()

YT_CONTEXT = {
    'client': {
        'userAgent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36,gzip(gfe)',
        'clientName': 'WEB',
        'clientVersion': '2.20211221.00.00',
    }
}

# Pattern (as used by youtube-dl) for extracting the ytInitialData JSON
# object from a YouTube watch page
_YT_INITIAL_DATA_RE = r'(?:window\s*\[\s*["\']ytInitialData["\']\s*\]|ytInitialData)\s*=\s*({.+?})\s*;\s*(?:var\s+meta|</script|\n)'


def main():
    # ... (argument parsing, model loading and the per-video prediction loop,
    # which produce `video_id`, `words`, `missed_segments`,
    # `incorrect_segments` and `out_metrics`, are elided here) ...

    try:
        # Potentially missed segments (model predicted, but not in database)
        if missed_segments:
            print(' - Missed segments:')
            segments_to_submit = []
            for i, missed_segment in enumerate(missed_segments, start=1):
                print(f'\t#{i}:', seconds_to_time(missed_segment['start']),
                      '-->', seconds_to_time(missed_segment['end']))
                print('\t\tText: "', ' '.join(
                    [w['text'] for w in missed_segment['words']]), '"', sep='')
                print('\t\tCategory:', missed_segment.get('category'))
                print('\t\tProbability:', missed_segment.get('probability'))

                segments_to_submit.append({
                    'segment': [missed_segment['start'], missed_segment['end']],
                    'category': missed_segment['category'].lower(),
                    'actionType': 'skip'
                })

            json_data = quote(json.dumps(segments_to_submit))
            print(
                f'\tSubmit: https://www.youtube.com/watch?v={video_id}#segments={json_data}')

        # Potentially incorrect segments (model didn't predict, but in database)
        if incorrect_segments:
            print(' - Incorrect segments:')
            for i, incorrect_segment in enumerate(incorrect_segments, start=1):
                print(f'\t#{i}:', seconds_to_time(incorrect_segment['start']),
                      '-->', seconds_to_time(incorrect_segment['end']))

                seg_words = extract_segment(
                    words, incorrect_segment['start'], incorrect_segment['end'])
                print('\t\tText: "', ' '.join(
                    [w['text'] for w in seg_words]), '"', sep='')
                print('\t\tUUID:', incorrect_segment['uuid'])
                print('\t\tCategory:', incorrect_segment['category'])
                print('\t\tVotes:', incorrect_segment['votes'])
                print('\t\tViews:', incorrect_segment['views'])
                print('\t\tLocked:', incorrect_segment['locked'])

        print()

    except KeyboardInterrupt:
        pass

    df = pd.DataFrame(out_metrics)
    df.to_csv(evaluation_args.output_file)
    print(df.mean())


if __name__ == '__main__':
    main()
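# Only the YouTube-related constants survive in this excerpt; the helper that
# actually used them is not shown above. As a rough, illustrative sketch only
# (the name `get_initial_data` and its details are assumptions, not
# necessarily what this script defines), `_YT_INITIAL_DATA_RE` would
# typically be applied to a fetched watch page like this:
def get_initial_data(video_id):
    """Fetch a watch page and extract its embedded ytInitialData JSON."""
    watch_html = requests.get(
        f'https://www.youtube.com/watch?v={video_id}',
        headers={'User-Agent': YT_CONTEXT['client']['userAgent']}
    ).text
    match = re.search(_YT_INITIAL_DATA_RE, watch_html)
    # The single capture group in the pattern holds the JSON object literal
    return json.loads(match.group(1)) if match is not None else None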