aseifert committed on
Commit
dbdb640
1 Parent(s): b69b754

initial commit - highlighter.py

Browse files
Files changed (1) hide show
  1. highlighter.py +122 -0
highlighter.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This module is taken with slight modifications from:
3
+ https://github.com/PrithivirajDamodaran/Gramformer/blob/main/gramformer/gramformer.py
4
+ """
5
+
6
+ import re
7
+
8
+ import pandas as pd
9
+ from annotated_text import annotated_text
10
+ from bs4 import BeautifulSoup
11
+
12
+
13
def show_highlights(annotator, input_text, corrected_sentence):
    """Render the edits between two sentences with ``annotated_text``.

    The tagged string produced by ``highlight`` is split into plain-text runs
    and ``<d>``/``<a>``/``<c>`` tags. Plain runs are passed through as strings;
    each tag becomes a ``(text, type, color)`` tuple.

    Args:
        annotator: Passed through unchanged to ``highlight``.
        input_text: The original sentence.
        corrected_sentence: The corrected sentence.

    Returns:
        None. The result is emitted by calling ``annotated_text``.
    """
    color_map = {"d": "#faa", "a": "#afa", "c": "#fea"}
    highlight_text = highlight(annotator, input_text, corrected_sentence)
    # With one capturing group, re.split alternates plain text (even indexes)
    # and matched tags (odd indexes).
    tokens = re.split(r"(<[dac]\s.*?<\/[dac]>)", highlight_text)
    annotations = []
    for index, token in enumerate(tokens):
        # Only matched tags are parsed as markup; plain text that happens to
        # contain angle brackets is shown verbatim instead of raising KeyError.
        is_tag = index % 2 == 1
        tags = BeautifulSoup(token, "html.parser").find_all() if is_tag else []
        if not tags:
            annotations.append(token)
            continue
        tag = tags[0]
        if tag.name == "d":
            # Deletions show the removed text with U+0336 (combining long
            # stroke overlay) after every character.
            text = "\u0336".join(tag.text) + "\u0336"
        else:
            text = tag["edit"]
        annotations.append((text, tag["type"], color_map[tag.name]))
    annotated_text(*annotations)
33
+
34
+
35
def show_edits(annotator, input_text, corrected_sentence):
    """Return the edits between two sentences as a DataFrame indexed by type.

    Args:
        annotator: Passed through unchanged to ``get_edits``.
        input_text: The original sentence.
        corrected_sentence: The corrected sentence.

    Returns:
        A ``pandas.DataFrame`` with one row per edit tuple returned by
        ``get_edits``, using the ``type`` column as its index.
    """
    columns = [
        "type",
        "original word",
        "original start",
        "original end",
        "correct word",
        "correct start",
        "correct end",
    ]
    edits = get_edits(annotator, input_text, corrected_sentence)
    return pd.DataFrame(edits, columns=columns).set_index("type")
50
+
51
+
52
def highlight(annotator, orig, cor):
    """Return ``orig`` with every edit wrapped in a ``<d>``, ``<a>`` or ``<c>`` tag.

    ``orig`` is split on whitespace and each edit from ``get_edits`` replaces
    the token at its start position:

    * ``<a type=… edit=…>anchor</a>`` for an insertion (empty original text),
      attached to a neighbouring token because the insertion has no token of
      its own;
    * ``<d type=… edit=''>old</d>`` for a deletion (empty corrected text);
    * ``<c type=… edit='new'>old</c>`` for a replacement.

    Args:
        annotator: Passed through unchanged to ``get_edits``.
        orig: The original sentence.
        cor: The corrected sentence.

    Returns:
        The tagged tokens joined by single spaces.

    NOTE(review): edit positions come from the annotator's tokenisation while
    the tokens here come from ``str.split``; presumably the two agree for the
    intended inputs. If they do not, the indexing below can hit the wrong
    token or raise ``IndexError`` -- confirm against the annotator.
    """
    edits = get_edits(annotator, orig, cor)
    orig_tokens = orig.split()
    ignore_indexes = []
    for edit in edits:
        edit_type, old_text, start, end, new_text = edit[:5]
        # A multi-token original span keeps only its first slot; the remaining
        # slots are removed once all edits have been applied.
        ignore_indexes.extend(range(start + 1, end))
        if old_text == "":
            if start >= 1:
                # Attach the insertion to the token that precedes it.
                start -= 1
                anchor = orig_tokens[start]
                combined = anchor + " " + new_text
            else:
                # Insertion at the very beginning: attach it to the first
                # token and keep the inserted text in front of it.
                if not orig_tokens:
                    orig_tokens.append("")
                anchor = orig_tokens[0]
                combined = new_text + " " + anchor if anchor else new_text
            # NOTE(review): for PUNCT the edit attribute carries only the
            # inserted text, without the anchor word -- kept as in the original.
            edit_attr = new_text if edit_type == "PUNCT" else combined
            st = _edit_tag("a", edit_type, edit_attr, anchor)
        elif new_text == "":
            st = _edit_tag("d", edit_type, "", old_text)
        else:
            st = _edit_tag("c", edit_type, new_text, old_text)
        orig_tokens[start] = st
    # Delete from the back so earlier indexes stay valid; the set guards
    # against removing the wrong token if an index was recorded twice.
    for i in sorted(set(ignore_indexes), reverse=True):
        del orig_tokens[i]
    return " ".join(orig_tokens)


def _edit_tag(name, edit_type, edit, text):
    """Build one ``<name type='…' edit='…'>text</name>`` tag with escaped values."""
    # Function-scope import keeps this block independent of the file's
    # import section.
    import html

    # Escaping keeps apostrophes and angle brackets inside words (e.g. "don't")
    # from terminating the single-quoted attributes or the tag itself.
    return "<{name} type='{type}' edit='{edit}'>{text}</{name}>".format(
        name=name,
        type=html.escape(edit_type, quote=True),
        edit=html.escape(edit, quote=True),
        text=html.escape(text, quote=True),
    )
106
+
107
+
108
def get_edits(annotator, orig, cor):
    """Return the edits that turn ``orig`` into ``cor`` as plain tuples.

    Both sentences are parsed with ``annotator.parse`` and aligned with
    ``annotator.align``; the alignment is merged into edits with
    ``annotator.merge`` and each edit is passed through ``annotator.classify``.

    Args:
        annotator: Object providing ``parse``, ``align``, ``merge`` and
            ``classify``.
        orig: The original sentence.
        cor: The corrected sentence.

    Returns:
        A list of ``(type, o_str, o_start, o_end, c_str, c_start, c_end)``
        tuples, empty when there are no edits. ``type`` is the classified
        edit's ``type`` with its first two characters removed (presumably an
        operation prefix such as ``"R:"`` -- confirm against the annotator).
    """
    orig_doc = annotator.parse(orig)
    cor_doc = annotator.parse(cor)
    alignment = annotator.align(orig_doc, cor_doc)
    classified = map(annotator.classify, annotator.merge(alignment))
    return [
        (e.type[2:], e.o_str, e.o_start, e.o_end, e.c_str, e.c_start, e.c_end)
        for e in classified
    ]