Lycoris53 committed on
Commit
9346eed
1 Parent(s): a448270

Create ParseAmitaroHTML.py

Browse files
Files changed (1) hide show
  1. ParseAmitaroHTML.py +68 -0
ParseAmitaroHTML.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Dependency: beautifulsoup4.
# The original notebook cell installed it with the shell command below. It is
# kept as a comment because a bare shell command is a SyntaxError in a .py file:
#   pip --cert /etc/pki/ca-trust/source/anchors/tri-ace-CA-2015.cer install \
#       --trusted-host pypi.org --trusted-host files.pythonhosted.org beautifulsoup4

from bs4 import BeautifulSoup

# Read the saved HTML page as raw bytes so Beautiful Soup can detect the
# document's own encoding instead of relying on the platform locale; the
# context manager guarantees the handle is closed.
with open("./amitaro.htm", "rb") as f:
    txt = f.read()

# Name the parser explicitly so the parse tree does not depend on which
# optional parsers happen to be installed (and to avoid the bs4 warning).
soup = BeautifulSoup(txt, "html.parser")
print(soup.prettify())

import json

# Load the metadata from amitaro.json. Later code treats `file_list` as a
# mapping whose values are dicts with a "path" entry and (after annotation)
# a "kana" entry. JSON is UTF-8 by specification, so decode it explicitly,
# and close the handle as soon as parsing is done.
with open("amitaro.json", encoding="utf-8") as f:
    file_list = json.load(f)

# Walk every <td> cell in document order. When the first child of a cell is a
# key of `file_list`, take the text of the first <a> found in the first child
# of the *preceding* cell and store it as that entry's "kana" value.
td = soup.find_all("td")
for i, val in enumerate(td):
    # The very first cell has no predecessor; td[i - 1] would silently wrap
    # around to the last cell of the page, so skip it.
    if i == 0 or not val.contents:
        continue

    key = val.contents[0]
    if key not in file_list:
        continue

    previous_contents = td[i - 1].contents
    if not previous_contents:
        continue

    # Re-parse only the first child of the preceding cell, so that anchors
    # located in later children are ignored.
    fragment = BeautifulSoup(str(previous_contents[0]), "html.parser")
    anchor = fragment.find("a")
    if anchor is None or not anchor.contents:
        # No link (or an empty link) in the preceding cell: nothing to record.
        continue

    kana = str(anchor.contents[0])
    print(kana)
    file_list[key]["kana"] = kana

# Prefix every entry's "path" with the dataset directory, then persist the
# annotated metadata.
# NOTE: the original dumped the JSON once before this loop and once after it,
# both to the same file; the first dump was always overwritten, so only the
# final one is kept.
for val in file_list.values():
    val["path"] = "./data_amitaro22k/" + val["path"]

# ensure_ascii=False writes non-ASCII text verbatim, so the file encoding must
# be fixed explicitly and not left to the platform default.
with open("./amitaro_with_kana.json", "w", encoding="utf-8") as outfile:
    outfile.write(json.dumps(file_list, indent=4, ensure_ascii=False))

# Build the filelist lines in the form "<path>|10|<kana>".
# (The constant "10" is copied from the original; presumably a speaker id —
# confirm against the consumer of these files.)
# Entries are skipped when they have no kana (missing, None or empty — the
# original raised KeyError on a missing key) or when the kana contains "(".
file = [
    f"{val['path']}|10|{kana}"
    for val in file_list.values()
    if (kana := val.get("kana") or "") and "(" not in kana
]

import random

# The training list is a copy of every filelist line.
# NOTE(review): lines chosen for validation below are NOT removed from the
# training list, so the two sets overlap. This matches the original behaviour;
# confirm whether a disjoint split was intended.
amitaro_train = list(file)

# Validation size is ceil(len(file) / 10), exactly what the original loop
# `while len(rands) < len(file)/10` produced.
val_count = -(-len(file) // 10)

# Draw distinct indices in one call; this replaces the original rejection
# sampling loop, which did an O(n) list membership test on every draw.
rands = random.sample(range(len(file)), val_count)
amitaro_val = [file[index] for index in rands]

# Write one filelist line per row to each output file. Context managers close
# the files even if a write fails, and UTF-8 is forced so the kana text does
# not depend on the platform's default encoding.
with open("amitaro_train.txt", "w", encoding="utf-8") as f:
    f.writelines(f"{line}\n" for line in amitaro_train)

with open("amitaro_val.txt", "w", encoding="utf-8") as f:
    f.writelines(f"{line}\n" for line in amitaro_val)