File size: 1,678 Bytes
d810840
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d4f3259
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import re
from thefuzz import fuzz
import numpy as np


def match_mask_and_transcript(
    split_punct, transcript, classification, *, num_std=2, max_gap=5
):
    """Locate the sponsored sentences inside the timestamped transcript.

    The sentences flagged as sponsored are joined into one string, every
    transcript entry is fuzzy-matched against that string, and the entries
    whose score is an outlier (above mean + ``num_std`` * stdev) are merged
    into contiguous time ranges.

    Args:
        split_punct: The punctuated text split into sentences (on "?", "!"
            or a period followed by whitespace).
        transcript: Sequence of dicts, each read via the keys ``"text"``,
            ``"start"`` and ``"duration"``. Entries are presumably ordered
            by start time, in seconds -- confirm against the caller.
        classification: One label per sentence in ``split_punct``; a label
            equal to 1 marks the sentence as sponsored.
        num_std: Number of standard deviations above the mean score an
            entry must exceed to count as a match.
        max_gap: Largest gap, in the same unit as ``"start"``, between the
            end of the current segment and the start of the next matching
            entry for the two to be merged into one segment.

    Returns:
        A tuple ``(times, timestamps)``. ``timestamps`` lists the
        ``(start, duration)`` pair of every matching transcript entry;
        ``times`` lists the merged ``(start, end)`` segments.
    """
    # Pairing with zip() stops at the shorter input, so a classification
    # list longer than split_punct cannot index past the last sentence.
    sponsored_text = " ".join(
        sentence
        for sentence, label in zip(split_punct, classification)
        if label == 1
    )

    # Nothing to match against; also avoids NumPy's mean-of-empty warning.
    if not transcript:
        return [], []

    # Similarity between the sponsored text and each transcript entry.
    scores = np.array(
        [fuzz.partial_ratio(sponsored_text, entry["text"]) for entry in transcript]
    )

    # Keep only the entries whose score stands out from the rest.
    threshold = np.mean(scores) + num_std * np.std(scores)
    timestamps = [
        (entry["start"], entry["duration"])
        for entry, is_match in zip(transcript, scores > threshold)
        if is_match
    ]

    # Merge matching entries into segments. A new segment starts when the
    # entry begins more than max_gap after the current segment ends; closer
    # entries extend the current segment so short fragments are joined.
    times = []
    for start, duration in timestamps:
        end = start + duration
        # `not times` makes the first match always open a segment, even when
        # it starts within max_gap of time zero (previously an IndexError).
        if not times or start > times[-1][1] + max_gap:
            times.append((start, end))
        else:
            times[-1] = (times[-1][0], end)
    return times, timestamps