Spaces:
Sleeping
Sleeping
Upload event_detection_dataclean.py
Browse files- event_detection_dataclean.py +118 -0
event_detection_dataclean.py
ADDED
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import json
|
3 |
+
from collections import Counter
|
4 |
+
|
5 |
+
|
6 |
+
def load_texttag_file(texttag_filename):
    """Load a ``token<TAB>tag`` file and drop tokens that look like noise.

    The file is read in one go and cut into chunks at blank lines (the
    separating line may also contain a single tab). Inside a chunk every
    non-empty line is split on the tab into a token and its tag. Tokens that
    match the noise pattern (selected punctuation, slashes, backslashes,
    ``http...``/``www...`` strings) are skipped together with their tag; the
    remaining tokens are lower-cased.

    Args:
        texttag_filename: Path of the text file to read.

    Returns:
        A tuple ``(texts_selected, tags_selected, tags_all)`` where
        ``texts_selected`` and ``tags_selected`` hold one list per chunk
        (kept tokens / their tags) and ``tags_all`` is the flat list of every
        kept tag. All three are empty lists when the file does not exist.

    Raises:
        ValueError: If a non-empty line does not contain exactly one tab.
    """
    texts_selected = []
    tags_selected = []
    tags_all = []

    # Only the read can raise FileNotFoundError, so keep the try body minimal.
    try:
        with open(texttag_filename, "r") as data_file:
            data_all = data_file.read()
    except FileNotFoundError as error:
        # Report using the path we were given; the file handle never existed.
        print("Sorry, the file " + str(texttag_filename) + " does not exist.")
        print("error: " + str(error))
        return texts_selected, tags_selected, tags_all

    # Compiled once instead of being looked up for every token.
    noise_pattern = re.compile(
        r"[@|?|!+?|:|(|)]|\\|\.*?\|-|/|/|/.*?/|http\S+|www\S+"
    )

    for chunk in re.split(r'\n\t?\n', data_all):
        if not chunk:
            continue
        texts_line = []
        tags_line = []
        for item in chunk.split("\n"):
            if not item:
                continue
            text, tag = item.split("\t")
            if noise_pattern.search(text) is None:
                texts_line.append(text.lower())
                tags_line.append(tag)
                tags_all.append(tag)
        # A chunk is recorded even if all of its tokens were filtered out.
        texts_selected.append(texts_line)
        tags_selected.append(tags_line)

    return texts_selected, tags_selected, tags_all
|
34 |
+
|
35 |
+
|
36 |
+
def tag_ids_map(tags_all, tags2ids_name, ids2tags_name):
    """Build tag<->id lookup tables and write both to disk as JSON.

    The distinct tags are sorted and numbered consecutively from 0.

    Args:
        tags_all: Iterable of tags (duplicates allowed).
        tags2ids_name: Path of the JSON file receiving the tag -> id mapping.
        ids2tags_name: Path of the JSON file receiving the id -> tag mapping.

    Returns:
        A tuple ``(tags2ids, ids2tags)`` of dictionaries. The returned
        ``ids2tags`` has integer keys; in the JSON file they are stored as
        strings because JSON object keys are always strings.
    """
    ordered_tags = sorted(set(tags_all))

    tags2ids = {}
    ids2tags = {}
    for tag_id, tag in enumerate(ordered_tags):
        tags2ids[tag] = tag_id
        ids2tags[tag_id] = tag

    for target_path, mapping in ((tags2ids_name, tags2ids), (ids2tags_name, ids2tags)):
        with open(target_path, "w") as handle:
            json.dump(mapping, handle)

    return tags2ids, ids2tags
|
52 |
+
|
53 |
+
|
54 |
+
def add_tagids(tags_selected, tags2ids, ids2tags):
    """Translate every tag in a nested tag list into its integer id.

    Args:
        tags_selected: List of lists of tags.
        tags2ids: Mapping from tag to id; a tag missing from it raises
            ``KeyError``.
        ids2tags: Unused; accepted so the signature stays as callers expect.

    Returns:
        A list of lists with the same shape as ``tags_selected`` holding the
        id of each tag.
    """
    return [
        [tags2ids[tag] for tag in line_tags]
        for line_tags in tags_selected
    ]
|
63 |
+
|
64 |
+
|
65 |
+
def add_text_tagid(tags_selected, tags2ids, ids2tags):
    """Derive one line-level tag (and its id) from each line's word tags.

    For every line the distinct tags are collected. If more than one distinct
    tag is present, ``"O"`` is ignored. The line tag is then the greatest
    remaining tag in string ordering.

    A line without any tags (for example when all of its tokens were filtered
    out earlier) yields empty lists instead of raising ``ValueError``.

    Args:
        tags_selected: List of lists of word-level tags.
        tags2ids: Mapping from tag to id; a chosen tag missing from it raises
            ``KeyError``.
        ids2tags: Unused; accepted so the signature stays as callers expect.

    Returns:
        A tuple ``(tags_chunk, tagids_chunk)``. Each has one entry per input
        line: a single-element list with the chosen tag / its id, or an empty
        list for a line without tags.
    """
    tags_chunk = []
    tagids_chunk = []

    for tags_line in tags_selected:
        candidates = set(tags_line)
        # "O" only wins when it is the sole tag on the line.
        if len(candidates) > 1:
            candidates.discard("O")

        if not candidates:
            # Nothing to choose from; keep the output aligned with the input.
            tags_chunk.append([])
            tagids_chunk.append([])
            continue

        # NOTE(review): this picks the lexicographically greatest tag, as the
        # original did via max() over the Counter keys — confirm whether the
        # most frequent tag was intended instead.
        chosen = max(candidates)
        tags_chunk.append([chosen])
        tagids_chunk.append([tags2ids[chosen]])

    return tags_chunk, tagids_chunk
|
84 |
+
|
85 |
+
def save_json(json_filename, texts_selected, tags_selected, tagids_selected, tags_chunk, tagids_chunk, limit=None):
    """Write the per-line records to ``json_filename`` as a JSON list.

    Each record is a dictionary with the keys ``text``, ``word_tag``,
    ``word_tag_id``, ``text_tag`` and ``text_tag_id``, taken from the same
    index of the five input lists.

    Args:
        json_filename: Path of the JSON file to write.
        texts_selected: List of token lists; its length determines how many
            records are written.
        tags_selected: Word-level tags per line.
        tagids_selected: Word-level tag ids per line.
        tags_chunk: Line-level tag per line.
        tagids_chunk: Line-level tag id per line.
        limit: Optional maximum number of records to write. ``None`` (the
            default) writes every record; previously a hard-coded 32 was
            always used, which failed on shorter inputs and truncated longer
            ones.

    Raises:
        IndexError: If one of the other lists is shorter than the number of
            records being written.
    """
    texts_to_save = texts_selected if limit is None else texts_selected[:limit]

    save_datalist = []
    for index, text in enumerate(texts_to_save):
        save_datalist.append({
            "text": text,
            "word_tag": tags_selected[index],
            "word_tag_id": tagids_selected[index],
            "text_tag": tags_chunk[index],
            "text_tag_id": tagids_chunk[index],
        })

    with open(json_filename, 'w') as file:
        json.dump(save_datalist, file)

    return
|
102 |
+
|
103 |
+
def main(data_filename, json_filename, tags2ids_name, ids2tags_name):
    """Run the cleaning pipeline from a raw tagged file to JSON outputs.

    Loads the tagged text, builds and stores the tag/id mappings, derives
    word-level tag ids and line-level tags, and saves the combined records.

    Args:
        data_filename: Path of the raw ``token<TAB>tag`` input file.
        json_filename: Path of the JSON file receiving the records.
        tags2ids_name: Path of the JSON file for the tag -> id mapping.
        ids2tags_name: Path of the JSON file for the id -> tag mapping.
    """
    texts, word_tags, every_tag = load_texttag_file(data_filename)
    tag_to_id, id_to_tag = tag_ids_map(every_tag, tags2ids_name, ids2tags_name)

    word_tag_ids = add_tagids(word_tags, tag_to_id, id_to_tag)
    line_tags, line_tag_ids = add_text_tagid(word_tags, tag_to_id, id_to_tag)

    save_json(
        json_filename,
        texts,
        word_tags,
        word_tag_ids,
        line_tags,
        line_tag_ids,
    )
|
111 |
+
|
112 |
+
|
113 |
+
if __name__ == "__main__":
    # Script entry point: clean the dev split and write its JSON outputs
    # next to the raw file.
    main(
        "../data/raw_EDT/Event_detection/dev.txt",
        '../data/raw_EDT/Event_detection/dev.json',
        "../data/raw_EDT/Event_detection/tags2ids.json",
        "../data/raw_EDT/Event_detection/ids2tags.json",
    )
|