# -*- coding: utf-8 -*-

# from datetime import datetime
from collections import defaultdict

from numba import njit, jit
# import pandas as pd
import numpy as np

from modules.constants import MAPPING_TITLES, MAPPING_SCORES_INDEX


@njit(fastmath=True)
def compute_distance(reference, prediction, distance):
    """Fill an edit-distance matrix in place and return it.

    ``distance`` is indexed as ``distance[prediction_index, reference_index]``
    and must have shape ``(len(prediction) + 1, len(reference) + 1)``. Row 0
    and column 0 are read but never written here, so the caller has to
    initialise them before calling.

    Args:
        reference: reference sequence
        prediction: prediction sequence
        distance: pre-initialised 2-D matrix, modified in place

    Returns:
        The same ``distance`` matrix, filled with the minimal edit costs
        (each substitution, insertion and deletion costs 1).
    """
    for char_pred in range(1, len(prediction) + 1):
        for char_ref in range(1, len(reference) + 1):
            # substitution is free when both characters are identical
            delt = 1 if prediction[char_pred - 1] != reference[char_ref - 1] else 0
            distance[char_pred, char_ref] = min(
                distance[char_pred - 1, char_ref - 1] + delt,
                distance[char_pred - 1, char_ref] + 1,
                distance[char_pred, char_ref - 1] + 1)
    return distance


@jit(nopython=True, nogil=True)
def check_back_direction(direction, char_ref, char_pred):
    """Move the backtrace cursor one step in the given direction.

    Args:
        direction: ``"\\\\"`` (diagonal) decrements both indices, ``"<-"``
            decrements only ``char_pred`` and ``"^"`` decrements only
            ``char_ref``; any other value leaves both indices unchanged
        char_ref: current index in the reference
        char_pred: current index in the prediction

    Returns:
        tuple: the updated ``(char_ref, char_pred)`` pair (in that order)
    """
    char_pred = char_pred - 1 if direction == "<-" or direction == "\\" else char_pred
    char_ref = char_ref - 1 if direction == "^" or direction == "\\" else char_ref
    return char_ref, char_pred


def show_diff_color_html(reference: str, prediction: str) -> dict:
    """Align reference and prediction character by character.

    The alignment is obtained by backtracking through the Levenshtein
    distance matrix, from the bottom-right cell to the top-left one.

    NOTE(review): the function name and its earlier documentation mention
    colour-coded HTML tags (insertions, deletions, exact matches), but the
    f-strings in this version emit the bare characters. Confirm whether the
    markup is expected to be added here.

    Example
    --------
    >>> show_diff_color_html("Chat", "Chien")["comparaison"]
    ['C', 'h', 'i', 'e', 'a', 'n', 't']
    >>> show_diff_color_html("Chat", "Chien")["reference"]
    ['C', 'h', 'a', 't']
    >>> show_diff_color_html("Chat", "Chien")["prediction"]
    ['C', 'h', 'i', 'e', 'n']

    Args:
        reference (str): reference sequence
        prediction (str): prediction sequence

    Returns:
        dict: three lists, in reading order:
            "comparaison" - merged view; a substitution contributes the
                prediction character followed by the reference character,
            "reference" - every character of the reference,
            "prediction" - every character of the prediction.
    """
    result = []
    res_r = []
    res_p = []

    # distance[i, j] = edit distance between prediction[:i] and reference[:j]
    distance = np.zeros((len(prediction) + 1, len(reference) + 1), dtype=int)
    distance[0, 1:] = np.arange(1, len(reference) + 1)
    distance[1:, 0] = np.arange(1, len(prediction) + 1)
    distance = compute_distance(reference, prediction, distance)

    # sequences alignment
    # iterate the matrix's values from back to forward
    char_pred = len(prediction)
    char_ref = len(reference)

    # Keep going until BOTH sequences are consumed. Stopping as soon as one
    # index reaches 0 would silently drop the leading characters of the other
    # sequence (e.g. reference "abc" vs prediction "c" would lose "ab").
    while char_pred > 0 or char_ref > 0:
        if char_pred == 0:
            # prediction exhausted: only reference characters remain
            direction = "^"
        elif char_ref == 0:
            # reference exhausted: only prediction characters remain
            direction = "<-"
        else:
            diagonal = distance[char_pred - 1, char_ref - 1]
            upper = distance[char_pred, char_ref - 1]
            left = distance[char_pred - 1, char_ref]

            # check back direction (ties favour the diagonal)
            if diagonal <= upper and diagonal <= left:
                direction = "\\"
            elif left < diagonal and left <= upper:
                direction = "<-"
            else:
                direction = "^"

        char_ref, char_pred = check_back_direction(direction, char_ref, char_pred)

        # Collect characters; lists are built backwards and reversed at the end
        if direction == "\\":
            # cost of the cell we just left, compared with its diagonal parent
            if distance[char_pred + 1, char_ref + 1] == diagonal:
                # exact match
                result.append(f"{prediction[char_pred]}")
                res_r.append(f"{reference[char_ref]}")
                res_p.append(f"{prediction[char_pred]}")
            elif distance[char_pred + 1, char_ref + 1] > diagonal:
                # substitution
                result.append(f"{reference[char_ref]}")
                result.append(f"{prediction[char_pred]}")
                res_r.append(f"{reference[char_ref]}")
                res_p.append(f"{prediction[char_pred]}")
            else:
                # not expected with a valid distance matrix (the cost never
                # decreases along the diagonal); kept as a defensive branch
                result.append(f"{prediction[char_pred]}")
                result.append(f"{reference[char_ref]}")
                res_r.append(f"{reference[char_ref]}")
                res_p.append(f"{prediction[char_pred]}")
        elif direction == "<-":
            # character present only in the prediction
            result.append(f"{prediction[char_pred]}")
            res_p.append(f"{prediction[char_pred]}")
        elif direction == "^":
            # character present only in the reference
            result.append(f"{reference[char_ref]}")
            res_r.append(f"{reference[char_ref]}")

    # reverse the list of result
    return {"comparaison": result[::-1],
            "reference": res_r[::-1],
            "prediction": res_p[::-1]}


def serialize_scores(board: dict) -> dict:
    """Serialize Kami board in correct format to display in HTML table

    Two layouts are handled:

    * ``board`` has a ``"default"`` key: every value that is itself a dict is
      treated as one preprocessing variant and becomes one table column;
      values that are not dicts are skipped.
    * otherwise ``board`` is treated as a flat ``metric -> score`` mapping
      and a single "default" column is produced.

    The ``"wer_hunt"`` metric is never included.

    Args:
        board (dict): Kami dict that contains transcription
            metrics and preprocessing keys

    Returns:
        dict: ``{"scores": rows, "columns": titles}`` where each row is
            ``[metric title, score, ...]`` and ``titles`` starts with an
            empty string (header of the metric-title column).

    Raises:
        KeyError: if a preprocessing key or metric key has no entry in
            ``MAPPING_TITLES`` / ``MAPPING_SCORES_INDEX``.
    """
    # set empty value in columns list to represent score legend title in final table
    columns = [""]

    # case with text preprocessing actions
    if "default" in board:
        scores_by_title = defaultdict(list)
        for type_preprocess, results in board.items():
            if isinstance(results, dict):
                # convert (from mapping) and add correct preprocessing
                # titles display in final table
                columns.append(MAPPING_TITLES[type_preprocess])
                # convert and add correct metrics titles
                # display in final table
                for type_metric, score in results.items():
                    if type_metric != "wer_hunt":
                        scores_by_title[MAPPING_SCORES_INDEX[type_metric]].append(score)
        # final score list eg.
        # [["Levensthein Distance (Char.)", 4, 4, 4, 4],
        #  ["Word Error Rate (WER)", 14, 35.54, 46.6, 20], ...]
        scores = [[title] + values
                  for title, values in scores_by_title.items()
                  if title != "wer_hunt"]
    else:
        columns.append(MAPPING_TITLES["default"])
        scores = [[MAPPING_SCORES_INDEX[type_metric], score]
                  for type_metric, score in board.items()
                  if type_metric != "wer_hunt"]

    return {
        "scores": scores,
        "columns": columns
    }


"""
LEGACY

def make_dataframe(score_board, reference):
    metadata_keys = ['levensthein_distance_char', 'levensthein_distance_words', 'hamming_distance', 'wer', 'cer', 'wacc', 'mer', 'cil', 'cip', 'hits', 'substitutions', 'deletions', 'insertions']
    now = datetime.now()
    metadatas = {}
    metrics = {}
    metadatas["DATETIME"] = now.strftime("%d_%m_%Y_%H:%M:%S")
    metadatas["IMAGE"] = None  # TODO changer quand implémenté
    metadatas["REFERENCE"] = reference
    metadatas["MODEL"] = None  # TODO changer quand implémenté
    for key, value in score_board.items():
        if type(value) != dict and key not in metadata_keys:
            metadatas[key] = value
        else:
            metrics[key] = value
    try:
        df_metrics = pd.DataFrame.from_dict(metrics)
    except:
        df_metrics = pd.DataFrame.from_dict(metrics, orient='index')
    displayable_titles = {0: "Default", "0": "Default", "default": "Default", "non_digits": "Ignoring digits", "lowercase": "Ignoring case", "remove_punctuation": "Ignoring punctuation", "remove_diacritics": "Ignoring diacritics", "all_transforms": "Combining all options"}
    displayable_index = {"cer": "Char. Error Rate (CER)", "wer": "Word Error Rate (WER)", "levensthein_distance_char": "Levensthein Distance (Char.)", "levensthein_distance_words": "Levensthein Distance (Words)", "hamming_distance": "Hamming Distance", "wacc": "Word Accuracy (Wacc)", "mer": "Match Error Rate (MER)", "cil": "Char. Information Lost (CIL)", "cip": "Char. Information Preserved (CIP)", "hits": "Hits", "substitutions": "Substitutions", "deletions": "Deletions", "insertions": "Insertions"}
    df_metrics.rename(columns=displayable_titles, index=displayable_index, inplace=True)
    tables = [df_metrics.to_html(classes=["data", "table", "table-hover", "table-bordered", "table-result-metrics"], justify='center')]
    titles = [df_metrics.columns.values]
    return tables, titles, metrics
"""