Create ParseAmitaroHTML.py
Browse files- ParseAmitaroHTML.py +68 -0
ParseAmitaroHTML.py
ADDED
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Setup (run this in a shell, NOT inside Python -- as a bare statement it is a
# SyntaxError):
#   pip --cert /etc/pki/ca-trust/source/anchors/tri-ace-CA-2015.cer install --trusted-host pypi.org --trusted-host files.pythonhosted.org beautifulsoup4

from bs4 import BeautifulSoup

# Read the saved HTML page and parse it into a BeautifulSoup tree.
# NOTE(review): the file is decoded with the platform default encoding, as in
# the original; pass encoding=... if the page is not in the locale's encoding.
with open("./amitaro.htm", "r") as f:
    txt = f.read()

# Name the parser explicitly so the resulting tree does not depend on which
# optional parsers (lxml, html5lib) happen to be installed.
soup = BeautifulSoup(txt, "html.parser")
print(soup.prettify())
|
9 |
+
|
10 |
+
import json

# Load the per-file metadata: a JSON object whose values are dicts (later
# code reads their "path" and "kana" entries).  The context manager closes
# the handle, and JSON text is UTF-8, so decode it as such.
with open('amitaro.json', encoding="utf-8") as f:
    file_list = json.load(f)
|
13 |
+
|
14 |
+
# Walk every table cell.  When the first child of a cell is a key of
# file_list, take the text of the first <a> found in the first child of the
# cell immediately before it and store it as that entry's "kana".
td = soup.find_all('td')
for i, val in enumerate(td):
    if not val.contents:
        continue
    key = val.contents[0]
    if key not in file_list:
        continue
    # There is no "previous cell" for the very first <td>; td[i - 1] would
    # silently wrap around to the last cell of the document.
    if i == 0:
        continue
    prev_contents = td[i - 1].contents
    if not prev_contents:
        continue
    # Re-parse the first child on its own so that an <a> which *is* that
    # child (not only one nested inside it) is found as well.
    temp = BeautifulSoup(str(prev_contents[0]), "html.parser")
    a = temp.find('a')
    # Skip cells without a link, or with an empty link, instead of raising
    # IndexError.
    if a is None or not a.contents:
        continue
    kana = str(a.contents[0])
    print(kana)
    file_list[key]["kana"] = kana
|
27 |
+
|
28 |
+
# Prefix every entry's path with the dataset directory.
for val in file_list.values():
    val["path"] = "./data_amitaro22k/" + val["path"]

# Write the augmented metadata once.  The original also dumped the same file
# before the path rewrite, but that output was immediately overwritten here.
# ensure_ascii=False emits non-ASCII characters verbatim, so the file is
# opened explicitly as UTF-8 rather than with the locale default.
with open("./amitaro_with_kana.json", "w", encoding="utf-8") as outfile:
    json.dump(file_list, outfile, indent=4, ensure_ascii=False)
|
36 |
+
|
37 |
+
# Build one "path|10|kana" line per usable entry.  Entries are skipped when
# they have no kana (missing or empty) or when the kana contains "(".
# .get() is used because "kana" is only set for entries matched in the HTML.
# NOTE(review): the constant 10 in the second field is presumably a speaker
# id -- confirm against the consumer of these files.
file = [
    f"{val['path']}|10|{val['kana']}"
    for val in file_list.values()
    if val.get('kana') and "(" not in val['kana']
]
|
44 |
+
|
45 |
+
import random

# The training list is a copy of every line.
amitaro_train = list(file)

# Draw ceil(len(file) / 10) distinct indices for the validation list.
# random.sample picks without replacement, replacing the original
# retry-until-unique loop and its O(n) list membership test.
rands = random.sample(range(len(file)), (len(file) + 9) // 10)
amitaro_val = [file[idx] for idx in rands]
# NOTE(review): as in the original, every validation line also remains in
# amitaro_train -- confirm whether the two sets are meant to overlap.
|
59 |
+
|
60 |
+
# Write the training and validation lists, one entry per line.  Context
# managers close the handles even if a write fails, and the lines contain
# non-ASCII text, so the encoding is fixed to UTF-8 instead of the locale
# default.
with open("amitaro_train.txt", "w", encoding="utf-8") as f:
    f.writelines(f"{val}\n" for val in amitaro_train)

with open("amitaro_val.txt", "w", encoding="utf-8") as f:
    f.writelines(f"{val}\n" for val in amitaro_val)
|