Spaces:
Running
Running
mylibrar
committed on
Commit
•
3dd0859
1
Parent(s):
cb27b88
Add comments to each topic graph
Browse files
- data/topic_charts.json +40 -20
- results.py +3 -3
data/topic_charts.json
CHANGED
@@ -149,7 +149,8 @@
|
|
149 |
],
|
150 |
"pctdistance": 1.2,
|
151 |
"labeldistance": 1.5
|
152 |
-
}
|
|
|
153 |
}
|
154 |
],
|
155 |
[
|
@@ -304,7 +305,8 @@
|
|
304 |
"subplots_adjust": {
|
305 |
"left": 0.37,
|
306 |
"right": 0.98
|
307 |
-
}
|
|
|
308 |
}
|
309 |
],
|
310 |
[
|
@@ -459,7 +461,8 @@
|
|
459 |
"subplots_adjust": {
|
460 |
"left": 0.37,
|
461 |
"right": 0.98
|
462 |
-
}
|
|
|
463 |
}
|
464 |
],
|
465 |
[
|
@@ -614,7 +617,8 @@
|
|
614 |
"subplots_adjust": {
|
615 |
"left": 0.37,
|
616 |
"right": 0.98
|
617 |
-
}
|
|
|
618 |
}
|
619 |
],
|
620 |
[
|
@@ -769,7 +773,8 @@
|
|
769 |
"subplots_adjust": {
|
770 |
"left": 0.37,
|
771 |
"right": 0.98
|
772 |
-
}
|
|
|
773 |
}
|
774 |
],
|
775 |
[
|
@@ -924,7 +929,8 @@
|
|
924 |
"subplots_adjust": {
|
925 |
"left": 0.37,
|
926 |
"right": 0.98
|
927 |
-
}
|
|
|
928 |
}
|
929 |
],
|
930 |
[
|
@@ -1079,7 +1085,8 @@
|
|
1079 |
"subplots_adjust": {
|
1080 |
"left": 0.37,
|
1081 |
"right": 0.98
|
1082 |
-
}
|
|
|
1083 |
}
|
1084 |
],
|
1085 |
[
|
@@ -1234,7 +1241,8 @@
|
|
1234 |
"subplots_adjust": {
|
1235 |
"left": 0.37,
|
1236 |
"right": 0.98
|
1237 |
-
}
|
|
|
1238 |
}
|
1239 |
],
|
1240 |
[
|
@@ -1389,7 +1397,8 @@
|
|
1389 |
"subplots_adjust": {
|
1390 |
"left": 0.37,
|
1391 |
"right": 0.98
|
1392 |
-
}
|
|
|
1393 |
}
|
1394 |
],
|
1395 |
[
|
@@ -1544,7 +1553,8 @@
|
|
1544 |
"subplots_adjust": {
|
1545 |
"left": 0.37,
|
1546 |
"right": 0.98
|
1547 |
-
}
|
|
|
1548 |
}
|
1549 |
],
|
1550 |
[
|
@@ -1699,7 +1709,8 @@
|
|
1699 |
"subplots_adjust": {
|
1700 |
"left": 0.37,
|
1701 |
"right": 0.98
|
1702 |
-
}
|
|
|
1703 |
}
|
1704 |
],
|
1705 |
[
|
@@ -1854,7 +1865,8 @@
|
|
1854 |
"subplots_adjust": {
|
1855 |
"left": 0.37,
|
1856 |
"right": 0.98
|
1857 |
-
}
|
|
|
1858 |
}
|
1859 |
],
|
1860 |
[
|
@@ -2009,7 +2021,8 @@
|
|
2009 |
"subplots_adjust": {
|
2010 |
"left": 0.37,
|
2011 |
"right": 0.98
|
2012 |
-
}
|
|
|
2013 |
}
|
2014 |
],
|
2015 |
[
|
@@ -2164,7 +2177,8 @@
|
|
2164 |
"subplots_adjust": {
|
2165 |
"left": 0.37,
|
2166 |
"right": 0.98
|
2167 |
-
}
|
|
|
2168 |
}
|
2169 |
],
|
2170 |
[
|
@@ -2319,7 +2333,8 @@
|
|
2319 |
"subplots_adjust": {
|
2320 |
"left": 0.37,
|
2321 |
"right": 0.98
|
2322 |
-
}
|
|
|
2323 |
}
|
2324 |
],
|
2325 |
[
|
@@ -2474,7 +2489,8 @@
|
|
2474 |
"subplots_adjust": {
|
2475 |
"left": 0.37,
|
2476 |
"right": 0.98
|
2477 |
-
}
|
|
|
2478 |
}
|
2479 |
],
|
2480 |
[
|
@@ -2629,7 +2645,8 @@
|
|
2629 |
"subplots_adjust": {
|
2630 |
"left": 0.37,
|
2631 |
"right": 0.98
|
2632 |
-
}
|
|
|
2633 |
}
|
2634 |
],
|
2635 |
[
|
@@ -2784,7 +2801,8 @@
|
|
2784 |
"subplots_adjust": {
|
2785 |
"left": 0.37,
|
2786 |
"right": 0.98
|
2787 |
-
}
|
|
|
2788 |
}
|
2789 |
],
|
2790 |
[
|
@@ -2939,7 +2957,8 @@
|
|
2939 |
"subplots_adjust": {
|
2940 |
"left": 0.37,
|
2941 |
"right": 0.98
|
2942 |
-
}
|
|
|
2943 |
}
|
2944 |
],
|
2945 |
[
|
@@ -3094,7 +3113,8 @@
|
|
3094 |
"subplots_adjust": {
|
3095 |
"left": 0.37,
|
3096 |
"right": 0.98
|
3097 |
-
}
|
|
|
3098 |
}
|
3099 |
],
|
3100 |
[
|
|
|
149 |
],
|
150 |
"pctdistance": 1.2,
|
151 |
"labeldistance": 1.5
|
152 |
+
},
|
153 |
+
"comment": "As shown in the graph above, over 20% of the documents are related to Business & Economics & Finance, which makes it the largest topic group in the dataset. In contrast, the Culture & Cultural geography group contains the smallest number of documents among all topics."
|
154 |
}
|
155 |
],
|
156 |
[
|
|
|
305 |
"subplots_adjust": {
|
306 |
"left": 0.37,
|
307 |
"right": 0.98
|
308 |
+
},
|
309 |
+
"comment": "On average, documents related to Shopping & Commodity have a larger fraction of words corrected in lines."
|
310 |
}
|
311 |
],
|
312 |
[
|
|
|
461 |
"subplots_adjust": {
|
462 |
"left": 0.37,
|
463 |
"right": 0.98
|
464 |
+
},
|
465 |
+
"comment": "Compared with other topics, documents related to Personal Development & Human Resources & Career on average contain more lines ending with an ellipsis."
|
466 |
}
|
467 |
],
|
468 |
[
|
|
|
617 |
"subplots_adjust": {
|
618 |
"left": 0.37,
|
619 |
"right": 0.98
|
620 |
+
},
|
621 |
+
"comment": "Shopping & Commodity related documents have a higher percentage of lines starting with a bullet point."
|
622 |
}
|
623 |
],
|
624 |
[
|
|
|
773 |
"subplots_adjust": {
|
774 |
"left": 0.37,
|
775 |
"right": 0.98
|
776 |
+
},
|
777 |
+
"comment": "On average, Personal Development & Human Resources & Career has more lines with toxic words."
|
778 |
}
|
779 |
],
|
780 |
[
|
|
|
929 |
"subplots_adjust": {
|
930 |
"left": 0.37,
|
931 |
"right": 0.98
|
932 |
+
},
|
933 |
+
"comment": "On average, Daily Life & Home & Lifestyle has more toxic words."
|
934 |
}
|
935 |
],
|
936 |
[
|
|
|
1085 |
"subplots_adjust": {
|
1086 |
"left": 0.37,
|
1087 |
"right": 0.98
|
1088 |
+
},
|
1089 |
+
"comment": "Documents in the topic of Personal Development & Human Resources & Career on average contain more words than those in other topics."
|
1090 |
}
|
1091 |
],
|
1092 |
[
|
|
|
1241 |
"subplots_adjust": {
|
1242 |
"left": 0.37,
|
1243 |
"right": 0.98
|
1244 |
+
},
|
1245 |
+
"comment": "There is no significant variance in the average word length for different topic groups. However, Education related data contain longer words than others in general."
|
1246 |
}
|
1247 |
],
|
1248 |
[
|
|
|
1397 |
"subplots_adjust": {
|
1398 |
"left": 0.37,
|
1399 |
"right": 0.98
|
1400 |
+
},
|
1401 |
+
"comment": "Documents in the topic of Personal Development & Human Resources & Career usually contain more sentences."
|
1402 |
}
|
1403 |
],
|
1404 |
[
|
|
|
1553 |
"subplots_adjust": {
|
1554 |
"left": 0.37,
|
1555 |
"right": 0.98
|
1556 |
+
},
|
1557 |
+
"comment": "Documents related to Daily Life & Home & Lifestyle usually have a higher percentage of symbols."
|
1558 |
}
|
1559 |
],
|
1560 |
[
|
|
|
1709 |
"subplots_adjust": {
|
1710 |
"left": 0.37,
|
1711 |
"right": 0.98
|
1712 |
+
},
|
1713 |
+
"comment": "The fraction of words with alpha character seems to be relatively consistent across different topics."
|
1714 |
}
|
1715 |
],
|
1716 |
[
|
|
|
1865 |
"subplots_adjust": {
|
1866 |
"left": 0.37,
|
1867 |
"right": 0.98
|
1868 |
+
},
|
1869 |
+
"comment": "On average, Culture & Cultural geography contains more stop words."
|
1870 |
}
|
1871 |
],
|
1872 |
[
|
|
|
2021 |
"subplots_adjust": {
|
2022 |
"left": 0.37,
|
2023 |
"right": 0.98
|
2024 |
+
},
|
2025 |
+
"comment": "Natural Science & Formal Science & Technology has a significantly higher percentage of documents that contain a curly bracket. This might be related to the coding data."
|
2026 |
}
|
2027 |
],
|
2028 |
[
|
|
|
2177 |
"subplots_adjust": {
|
2178 |
"left": 0.37,
|
2179 |
"right": 0.98
|
2180 |
+
},
|
2181 |
+
"comment": "Sports related documents have a higher duplication count."
|
2182 |
}
|
2183 |
],
|
2184 |
[
|
|
|
2333 |
"subplots_adjust": {
|
2334 |
"left": 0.37,
|
2335 |
"right": 0.98
|
2336 |
+
},
|
2337 |
+
"comment": "On average, Culture & Cultural geography related documents are duplicated across a higher number of common crawl dumps. Duplication of Shopping & Commodity appears in fewer dumps than others."
|
2338 |
}
|
2339 |
],
|
2340 |
[
|
|
|
2489 |
"subplots_adjust": {
|
2490 |
"left": 0.37,
|
2491 |
"right": 0.98
|
2492 |
+
},
|
2493 |
+
"comment": "On average, Culture & Cultural geography related documents are duplicated across more years than other topics."
|
2494 |
}
|
2495 |
],
|
2496 |
[
|
|
|
2645 |
"subplots_adjust": {
|
2646 |
"left": 0.37,
|
2647 |
"right": 0.98
|
2648 |
+
},
|
2649 |
+
"comment": "On average, Culture & Cultural geography related documents are duplicated across a wider span of years."
|
2650 |
}
|
2651 |
],
|
2652 |
[
|
|
|
2801 |
"subplots_adjust": {
|
2802 |
"left": 0.37,
|
2803 |
"right": 0.98
|
2804 |
+
},
|
2805 |
+
"comment": "Average language scores of different topic groups are mostly consistent. No significant differences are observed."
|
2806 |
}
|
2807 |
],
|
2808 |
[
|
|
|
2957 |
"subplots_adjust": {
|
2958 |
"left": 0.37,
|
2959 |
"right": 0.98
|
2960 |
+
},
|
2961 |
+
"comment": "On average, Shopping & Commodity has a larger fraction of duplicate lines than others."
|
2962 |
}
|
2963 |
],
|
2964 |
[
|
|
|
3113 |
"subplots_adjust": {
|
3114 |
"left": 0.37,
|
3115 |
"right": 0.98
|
3116 |
+
},
|
3117 |
+
"comment": "Shopping & Commodity usually has a larger fraction of characters in duplicate lines than others."
|
3118 |
}
|
3119 |
],
|
3120 |
[
|
results.py
CHANGED
@@ -990,7 +990,7 @@ for title, data in topic_charts:
|
|
990 |
for rgb in data["kwargs"]["color"]
|
991 |
]
|
992 |
)))
|
993 |
-
|
994 |
topic_graphs.append(go.Figure(go.Pie(
|
995 |
values=data["kwargs"]['x'],
|
996 |
labels=data["kwargs"]["labels"],
|
@@ -1014,8 +1014,8 @@ cluster_div = Div(
|
|
1014 |
)),
|
1015 |
H3("Results Analysis"),
|
1016 |
*(
|
1017 |
-
Section(H4(title), plotly2fasthtml(topic_graphs[i]))
|
1018 |
-
for i, (title,
|
1019 |
)
|
1020 |
)
|
1021 |
)
|
|
|
990 |
for rgb in data["kwargs"]["color"]
|
991 |
]
|
992 |
)))
|
993 |
+
elif data["type"] == "pie":
|
994 |
topic_graphs.append(go.Figure(go.Pie(
|
995 |
values=data["kwargs"]['x'],
|
996 |
labels=data["kwargs"]["labels"],
|
|
|
1014 |
)),
|
1015 |
H3("Results Analysis"),
|
1016 |
*(
|
1017 |
+
Section(H4(title), plotly2fasthtml(topic_graphs[i]), P(data.get("comment", '')))
|
1018 |
+
for i, (title, data) in enumerate(topic_charts)
|
1019 |
)
|
1020 |
)
|
1021 |
)
|