File size: 9,036 Bytes
92da9af
 
 
 
 
 
 
 
 
ffa6434
92da9af
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
# -*- coding: utf-8 -*-

# from datetime import datetime
from collections import defaultdict

from numba import njit, jit
# import pandas as pd
import numpy as np

from modules.constants import MAPPING_TITLES, MAPPING_SCORES_INDEX


@njit(fastmath=True)
def compute_distance(reference, prediction, distance):
    """Fill ``distance`` in place with the edit-distance table of two sequences.

    Rows follow the prediction and columns follow the reference, both shifted
    by one. Row 0 and column 0 are only read here, so the caller has to
    initialise them before calling.

    Args:
        reference: reference sequence (indexable, supports ``len``)
        prediction: prediction sequence (indexable, supports ``len``)
        distance: 2-D array indexable up to
            ``[len(prediction), len(reference)]``

    Returns:
        The ``distance`` array that was passed in, once filled.
    """
    n_pred = len(prediction)
    n_ref = len(reference)
    for row in range(n_pred):
        pred_item = prediction[row]
        for col in range(n_ref):
            # A step along the diagonal is free only when both items agree.
            if pred_item == reference[col]:
                cost = 0
            else:
                cost = 1
            via_diagonal = distance[row, col] + cost
            via_prev_row = distance[row, col + 1] + 1
            via_prev_col = distance[row + 1, col] + 1
            distance[row + 1, col + 1] = min(via_diagonal, via_prev_row, via_prev_col)

    return distance


@jit(nopython=True, nogil=True)
def check_back_direction(direction, char_ref, char_pred):
    """Move the backtrace cursor one step in the given direction.

    The diagonal marker (a single backslash) decrements both indices,
    ``"<-"`` decrements only ``char_pred`` and ``"^"`` decrements only
    ``char_ref``. Any other value leaves both indices unchanged.

    Args:
        direction: direction marker
        char_ref: current index on the reference axis
        char_pred: current index on the prediction axis

    Returns:
        tuple: ``(char_ref, char_pred)`` after the move
    """
    on_diagonal = direction == "\\"
    if on_diagonal or direction == "<-":
        char_pred -= 1
    if on_diagonal or direction == "^":
        char_ref -= 1
    return char_ref, char_pred


def show_diff_color_html(reference: str, prediction: str) -> dict:
    """Align two sequences with a Levenshtein backtrace and emit HTML spans.

    Every character is wrapped in a ``<span>``. In the merged view the CSS
    class tells how the character was aligned: ``exact-match`` (same
    character in both sequences), ``delSubts`` (character taken from the
    reference) or ``insertion`` (character taken from the prediction). The
    ``data-id`` of a span in the merged view equals the ``id`` of the
    matching span in the per-sequence views.

    Example
    --------
    >>> show_diff_color_html("a", "a")
    {'comparaison': ["<span data-id='em-1' class='exact-match line'>a</span>"],
     'reference': ["<span id='em-1'>a</span>"],
     'prediction': ["<span id='em-1'>a</span>"]}

    Args:
        reference (str): reference sequence
        prediction (str): prediction sequence

    Returns:
        dict: three lists of HTML span strings, in reading order, under the
        keys ``"comparaison"`` (merged view), ``"reference"`` and
        ``"prediction"``
    """
    # NOTE(review): characters are interpolated into the markup unescaped;
    # confirm that callers escape "<", ">" and "&" before rendering.
    result = []
    res_r = []
    res_p = []

    # Row 0 / column 0 hold the cost of building a prefix from nothing.
    distance = np.zeros((len(prediction) + 1, len(reference) + 1), dtype=int)
    distance[0, 1:] = range(1, len(reference) + 1)
    distance[1:, 0] = range(1, len(prediction) + 1)

    distance = compute_distance(reference, prediction, distance)

    # Sequence alignment: walk the matrix from the bottom-right corner back
    # towards the origin, collecting spans in reverse reading order.
    char_pred = len(prediction)
    char_ref = len(reference)
    counter = 0
    while char_pred > 0 and char_ref > 0:
        counter += 1
        current = distance[char_pred, char_ref]
        diagonal = distance[char_pred - 1, char_ref - 1]
        upper = distance[char_pred, char_ref - 1]
        left = distance[char_pred - 1, char_ref]

        # Pick the cheapest predecessor; the diagonal wins ties.
        if diagonal <= upper and diagonal <= left:
            direction = "\\"
        elif left < diagonal and left <= upper:
            direction = "<-"
        else:
            direction = "^"
        char_ref, char_pred = check_back_direction(direction, char_ref, char_pred)

        if direction == "\\":
            ref_char = reference[char_ref]
            pred_char = prediction[char_pred]
            if current == diagonal:
                # exact match
                result.append(f"<span data-id='em-{counter}' class='exact-match line'>{pred_char}</span>")
                res_r.append(f"<span id='em-{counter}'>{ref_char}</span>")
                res_p.append(f"<span id='em-{counter}'>{pred_char}</span>")
            elif current > diagonal:
                # substitution
                result.append(f"<span data-id='ref-{counter}' class='delSubts line'>{ref_char}</span>")
                result.append(f"<span data-id='pred-{counter}' class='insertion line'>{pred_char}</span>")
                res_r.append(f"<span id='ref-{counter}'>{ref_char}</span>")
                res_p.append(f"<span id='pred-{counter}'>{pred_char}</span>")
            else:
                # Defensive branch kept from the original; not expected to
                # trigger for a matrix produced by compute_distance.
                result.append(f"<span data-id='pred-{counter}' class='insertion line'>{pred_char}</span>")
                result.append(f"<span data-id='ref-{counter}' class='delSubts line'>{ref_char}</span>")
                res_r.append(f"<span id='ref-{counter}'>{ref_char}</span>")
                res_p.append(f"<span id='pred-{counter}'>{pred_char}</span>")
        elif direction == "<-":
            pred_char = prediction[char_pred]
            result.append(f"<span data-id='pred-{counter}' class='insertion line'>{pred_char}</span>")
            res_p.append(f"<span id='pred-{counter}'>{pred_char}</span>")
        else:
            ref_char = reference[char_ref]
            result.append(f"<span data-id='ref-{counter}' class='delSubts line'>{ref_char}</span>")
            res_r.append(f"<span id='ref-{counter}'>{ref_char}</span>")

    # The loop above stops as soon as one sequence is exhausted. Whatever is
    # left at the start of the other one must still be emitted, otherwise the
    # leading characters silently vanish from the output. At most one of the
    # two loops below runs.
    while char_ref > 0:
        counter += 1
        char_ref -= 1
        ref_char = reference[char_ref]
        result.append(f"<span data-id='ref-{counter}' class='delSubts line'>{ref_char}</span>")
        res_r.append(f"<span id='ref-{counter}'>{ref_char}</span>")
    while char_pred > 0:
        counter += 1
        char_pred -= 1
        pred_char = prediction[char_pred]
        result.append(f"<span data-id='pred-{counter}' class='insertion line'>{pred_char}</span>")
        res_p.append(f"<span id='pred-{counter}'>{pred_char}</span>")

    # Spans were collected back to front; restore reading order.
    return {"comparaison": result[::-1], "reference": res_r[::-1], "prediction": res_p[::-1]}


def serialize_scores(board: dict) -> dict:
    """Turn a Kami score board into rows and column headers for an HTML table.

    Two layouts are handled. When ``board`` has a ``"default"`` key, every
    dict-valued entry is read as one preprocessing variant and contributes a
    column; non-dict entries are ignored. Otherwise ``board`` is read as a
    single flat metric mapping with one ``"default"`` column. Metrics keyed
    ``"wer_hunt"`` are left out in both layouts.

    Args:
        board (dict): Kami dict that contains transcription metrics and
            preprocessing keys

    Returns:
        dict: ``"scores"`` is a list of rows, each one a metric label
        followed by its values; ``"columns"`` is the list of header titles,
        starting with an empty string for the label column
    """
    # Leading empty header: the first table column carries the metric labels.
    columns = [""]

    if "default" not in board:
        # Flat board: a single column of values.
        columns.append(MAPPING_TITLES["default"])
        rows = []
        for metric_key, value in board.items():
            if metric_key == "wer_hunt":
                continue
            rows.append([MAPPING_SCORES_INDEX[metric_key], value])
        return {"scores": rows, "columns": columns}

    # Board with preprocessing variants: group values by display label, one
    # value per variant, e.g.
    # {"Levensthein Distance (Char.)": [4, 4, 4, 4], ...}
    grouped = {}
    for preprocess_key, metrics in board.items():
        if not isinstance(metrics, dict):
            continue
        columns.append(MAPPING_TITLES[preprocess_key])
        for metric_key, value in metrics.items():
            if metric_key == "wer_hunt":
                continue
            grouped.setdefault(MAPPING_SCORES_INDEX[metric_key], []).append(value)

    # Final rows, e.g.
    # [["Levensthein Distance (Char.)", 4, 4, 4, 4], ["Word Error Rate (WER)", 14, 35.54, 46.6, 20], ...]
    rows = []
    for label, values in grouped.items():
        if label != "wer_hunt":
            rows.append([label] + values)
    return {"scores": rows, "columns": columns}


"""
LEGACY: former pandas-based implementation, kept for reference only (never executed).
def make_dataframe(score_board, reference):
    metadata_keys = ['levensthein_distance_char', 'levensthein_distance_words', 'hamming_distance', 'wer', 'cer',
                     'wacc', 'mer', 'cil', 'cip', 'hits', 'substitutions', 'deletions', 'insertions']
    now = datetime.now()
    metadatas = {}
    metrics = {}
    metadatas["DATETIME"] = now.strftime("%d_%m_%Y_%H:%M:%S")
    metadatas["IMAGE"] = None  # TODO changer quand implémenté
    metadatas["REFERENCE"] = reference
    metadatas["MODEL"] = None  # TODO changer quand implémenté

    for key, value in score_board.items():
        if type(value) != dict and key not in metadata_keys:
            metadatas[key] = value
        else:
            metrics[key] = value
    try:
        df_metrics = pd.DataFrame.from_dict(metrics)
    except:
        df_metrics = pd.DataFrame.from_dict(metrics, orient='index')

    displayable_titles = {0: "Default",
                          "0": "Default",
                          "default": "Default",
                          "non_digits": "Ignoring digits",
                          "lowercase": "Ignoring case",
                          "remove_punctuation": "Ignoring punctuation",
                          "remove_diacritics": "Ignoring diacritics",
                          "all_transforms": "Combining all options"}
    displayable_index = {"cer": "Char. Error Rate (CER)", "wer": "Word Error Rate (WER)",
                         "levensthein_distance_char": "Levensthein Distance (Char.)",
                         "levensthein_distance_words": "Levensthein Distance (Words)",
                         "hamming_distance": "Hamming Distance",
                         "wacc": "Word Accuracy (Wacc)",
                         "mer": "Match Error Rate (MER)",
                         "cil": "Char. Information Lost (CIL)",
                         "cip": "Char. Information Preserved (CIP)",
                         "hits": "Hits",
                         "substitutions": "Substitutions",
                         "deletions": "Deletions",
                         "insertions": "Insertions"}

    df_metrics.rename(columns=displayable_titles, index=displayable_index, inplace=True)

    tables = [df_metrics.to_html(classes=["data", "table", "table-hover", "table-bordered", "table-result-metrics"],
                                 justify='center')]
    titles = [df_metrics.columns.values]
    return tables, titles, metrics
"""