diff --git "a/data/topic_charts.json" "b/data/topic_charts.json" new file mode 100644--- /dev/null +++ "b/data/topic_charts.json" @@ -0,0 +1,4515 @@ +[ + [ + "Number of Document of Each Topic", + { + "type": "pie", + "kwargs": { + "x": [ + 535838, + 206990, + 368022, + 200460, + 435310, + 250450, + 933732, + 271801, + 639890, + 387594, + 271359, + 1473798, + 459519, + 1101903, + 31659, + 2254859, + 591041 + ], + "labels": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "autopct": "%1.1f%%", + "colors": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 + ], + [ + 0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 0.4980392156862745, + 0.4980392156862745, + 1.0 + ], + [ + 
0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ], + "pctdistance": 1.2, + "labeldistance": 1.5 + }, + "comment": "As shown in the graph above, over 20% of the documents are related to Business & Economics & Finance, which makes it the largest topic group in dataset. On the contrary, the group of Culture & Cultural geography contains the smallest number of documents among all topics." + } + ], + [ + "Fraction of Words Corrected in Lines", + { + "type": "barh", + "kwargs": { + "y": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "width": [ + 0.005599317351029421, + 0.005491440909735792, + 0.010611897213357221, + 0.0061721529486005915, + 0.005040363960665401, + 0.0042498218252128035, + 0.008174887952855342, + 0.005098232906967347, + 0.005905725848762689, + 0.008048438948020924, + 0.005920233062429675, + 0.00738773833987446, + 0.006788916830535338, + 0.007824824620615435, + 0.007817009319252808, + 0.006894261391191716, + 0.007759051322619051 + ], + "color": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 
1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 + ], + [ + 0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 0.4980392156862745, + 0.4980392156862745, + 1.0 + ], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ] + }, + "x_label": "Metrics", + "subplots_adjust": { + "left": 0.37, + "right": 0.98 + }, + "comment": "On average, documents related to Shopping & Commodity have the largest fraction of words corrected in lines." 
+ } + ], + [ + "Fraction of Lines Ending with Ellipsis", + { + "type": "barh", + "kwargs": { + "y": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "width": [ + 0.013608683903284204, + 0.01187771888948645, + 0.010704198151112872, + 0.013181499370177098, + 0.012342863597933462, + 0.01669603038717465, + 0.013958760786106517, + 0.011481605295821474, + 0.011727508302172751, + 0.013890752469918237, + 0.012950109439490815, + 0.015828153615401713, + 0.011233498318616135, + 0.013063106813702607, + 0.013101045053120094, + 0.012854514904197168, + 0.014225441730661032 + ], + "color": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 + ], + [ + 0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 
0.4980392156862745, + 0.4980392156862745, + 0.4980392156862745, + 1.0 + ], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ] + }, + "x_label": "Metrics", + "subplots_adjust": { + "left": 0.37, + "right": 0.98 + }, + "comment": "Compared with other topics, Personal Development & Human Resources & Career in average contain more lines ending with ellipsis." + } + ], + [ + "Fraction of Lines Starting with Bullet Point", + { + "type": "barh", + "kwargs": { + "y": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "width": [ + 0.05958846605103529, + 0.06540916901994907, + 0.10871161367473074, + 0.057639202535687495, + 0.05391125998418046, + 0.048856823399157104, + 0.0919025139411848, + 0.06361059519326412, + 0.08348033701472354, + 0.09887120370776314, + 0.0654760782941809, + 0.07275273301463199, + 0.08648053868877607, + 0.0728023788334523, + 0.059507615068158916, + 0.08230576538579888, + 0.06015758928408362 + ], + "color": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, 
+ 0.15294117647058825, + 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 + ], + [ + 0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 0.4980392156862745, + 0.4980392156862745, + 1.0 + ], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ] + }, + "x_label": "Metrics", + "subplots_adjust": { + "left": 0.37, + "right": 0.98 + }, + "comment": "Shopping & Commodity related documents have higher percentage of lines starting with bullet point." 
+ } + ], + [ + "Number of Lines with Toxic Words", + { + "type": "barh", + "kwargs": { + "y": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "width": [ + 0.18993986988604764, + 0.8879124595391081, + 0.25990565781393504, + 0.26195250922877383, + 0.25880866508924677, + 1.059369135555999, + 0.13686689542609656, + 0.41953855946078195, + 0.8275813030364594, + 0.15215921815095176, + 0.13490615752563948, + 0.7103062970637767, + 0.10924031432867846, + 0.983178192635831, + 0.1341482674752835, + 0.14871528552339636, + 0.44260888838506973 + ], + "color": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 + ], + [ + 0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 0.4980392156862745, + 
0.4980392156862745, + 1.0 + ], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ] + }, + "x_label": "Metrics", + "subplots_adjust": { + "left": 0.37, + "right": 0.98 + }, + "comment": "Personal Development & Human Resources & Career in average has more lines with toxic words." + } + ], + [ + "Number of Toxic Words", + { + "type": "barh", + "kwargs": { + "y": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "width": [ + 0.2548288848495254, + 1.5926663123822407, + 0.4235181592404802, + 0.38067444876783396, + 0.32550136684202063, + 2.0770772609303254, + 0.20720185235163838, + 0.590086129190106, + 1.571774836299989, + 0.20227609302517582, + 0.18648727331689754, + 1.453566228207665, + 0.15104924932374938, + 2.337839174591593, + 0.20351242932499447, + 0.24778267732040007, + 0.7902395942075084 + ], + "color": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 
+ ], + [ + 0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 0.4980392156862745, + 0.4980392156862745, + 1.0 + ], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ] + }, + "x_label": "Metrics", + "subplots_adjust": { + "left": 0.37, + "right": 0.98 + }, + "comment": "On average, Daily Life & Home & Lifestyle has more toxic words than any other topic." 
+ } + ], + [ + "Word Count", + { + "type": "barh", + "kwargs": { + "y": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "width": [ + 524.2469683001206, + 634.8099570027538, + 332.5969724636027, + 654.5120023944927, + 634.4970021364086, + 747.0358714314234, + 624.2853688210322, + 570.2685052667209, + 746.3173279782462, + 427.6056492102561, + 603.5602799243807, + 470.1159928294108, + 559.1577497339609, + 450.07929463845727, + 682.6580435263274, + 559.7302638435485, + 514.1515901604118 + ], + "color": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 + ], + [ + 0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 0.4980392156862745, + 0.4980392156862745, + 1.0 + ], + [ + 
0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ] + }, + "x_label": "Metrics", + "subplots_adjust": { + "left": 0.37, + "right": 0.98 + }, + "comment": "Documents in the topic of Personal Development & Human Resources & Career in average contain more words than other topics." + } + ], + [ + "Mean Word Length", + { + "type": "barh", + "kwargs": { + "y": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "width": [ + 4.851116616082301, + 5.17314698008811, + 4.951553714759433, + 4.8636771295932055, + 5.165523097115738, + 4.64498800138652, + 5.233981234962708, + 5.094122002544284, + 5.191578081429402, + 4.872407702558401, + 5.077044932121297, + 4.911569182027774, + 5.25771470252484, + 4.990336313339119, + 5.138998450653204, + 5.165914329275205, + 4.943227231080612 + ], + "color": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 + ], + [ + 0.5803921568627451, + 
0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 0.4980392156862745, + 0.4980392156862745, + 1.0 + ], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ] + }, + "x_label": "Metrics", + "subplots_adjust": { + "left": 0.37, + "right": 0.98 + }, + "comment": "There is no significant variance in the average word length for different topic groups. However, Education related data contain longer words than others in general." 
+ } + ], + [ + "Number of Sentences", + { + "type": "barh", + "kwargs": { + "y": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "width": [ + 23.816802839664227, + 28.88356925455336, + 17.167653020743327, + 32.65256909109049, + 26.743545978727802, + 41.80899580754642, + 28.010818950191275, + 25.435358957472562, + 35.18096235290441, + 22.968376703457743, + 28.56101327024348, + 22.844366731397383, + 26.802678452904015, + 22.603309910218957, + 30.825168198616506, + 26.01027691753675, + 27.965867004150304 + ], + "color": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 + ], + [ + 0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 0.4980392156862745, + 0.4980392156862745, + 1.0 + 
], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ] + }, + "x_label": "Metrics", + "subplots_adjust": { + "left": 0.37, + "right": 0.98 + }, + "comment": "Documents in the topic of Personal Development & Human Resources & Career usually contain more sentences." + } + ], + [ + "Symbol to Word Ratio", + { + "type": "barh", + "kwargs": { + "y": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "width": [ + 0.0029508316364481296, + 0.002339527014691741, + 0.002746622681352375, + 0.0031207893125393786, + 0.0024594503072570637, + 0.003732116125668388, + 0.0029521717963945683, + 0.002009846839273012, + 0.0023335875319153666, + 0.0032912280108721562, + 0.0026740153080243275, + 0.0037401276658117497, + 0.0022685436825723537, + 0.0034624173472893424, + 0.0022837896768252673, + 0.002565854536163215, + 0.0035536009817103663 + ], + "color": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 
0.5882352941176471, + 1.0 + ], + [ + 0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 0.4980392156862745, + 0.4980392156862745, + 1.0 + ], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ] + }, + "x_label": "Metrics", + "subplots_adjust": { + "left": 0.37, + "right": 0.98 + }, + "comment": "Documents related to Entertainment & Travel & Hobby and Personal Development & Human Resources & Career have the highest symbol-to-word ratios, followed by Arts and Daily Life & Home & Lifestyle." 
+ } + ], + [ + "Fraction of Words with Alpha Character", + { + "type": "barh", + "kwargs": { + "y": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "width": [ + 0.9554513833362817, + 0.9672667625084445, + 0.945038227724378, + 0.9650443058450766, + 0.9662993498435797, + 0.9795101768513954, + 0.949647348401343, + 0.9644024275136092, + 0.9651040360235426, + 0.9515637138100507, + 0.9638773263904938, + 0.9544175710037947, + 0.9602638724414636, + 0.9533095901329957, + 0.9536863995733356, + 0.9573400271816177, + 0.9613916720605239 + ], + "color": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 + ], + [ + 0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 0.4980392156862745, + 
0.4980392156862745, + 1.0 + ], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ] + }, + "x_label": "Metrics", + "subplots_adjust": { + "left": 0.37, + "right": 0.98 + }, + "comment": "The fraction of words with alpha character seems to be relatively consistent across different topics." + } + ], + [ + "Number of Stop Words", + { + "type": "barh", + "kwargs": { + "y": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "width": [ + 106.35206163056745, + 139.14766896951542, + 61.915719712408496, + 150.11113937942733, + 141.82980634490363, + 156.21242563385906, + 122.75635942647355, + 122.98374178167114, + 152.60597915266686, + 83.42474857711936, + 128.65106740517174, + 93.49815985637109, + 114.57335387655353, + 86.25348147704472, + 162.30932752139992, + 114.21801717978818, + 106.4132116046095 + ], + "color": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 
+ ], + [ + 0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 0.4980392156862745, + 0.4980392156862745, + 1.0 + ], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ] + }, + "x_label": "Metrics", + "subplots_adjust": { + "left": 0.37, + "right": 0.98 + }, + "comment": "On average, Culture & Cultural geography contains more stop words than any other topic." 
+ } + ], + [ + "Has Curly Bracket", + { + "type": "barh", + "kwargs": { + "y": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "width": [ + 0.005337434075224228, + 0.0067539494661577855, + 0.01028199401122759, + 0.009842362566097974, + 0.011575658725965403, + 0.00931123976841685, + 0.02773600990434086, + 0.006582021405366427, + 0.009203144290424917, + 0.01040779785032792, + 0.008158933368710822, + 0.007557345036429687, + 0.010752547772779798, + 0.011963847997509762, + 0.012824157427587732, + 0.009383291815585807, + 0.008704979857573332 + ], + "color": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 + ], + [ + 0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 
0.4980392156862745, + 0.4980392156862745, + 1.0 + ], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ] + }, + "x_label": "Metrics", + "subplots_adjust": { + "left": 0.37, + "right": 0.98 + }, + "comment": "Natural Science & Formal Science & Technology has a significantly higher percentage of documents that contain curly brackets. It might be related to the coding data." + } + ], + [ + "Number of Document Duplication", + { + "type": "barh", + "kwargs": { + "y": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "width": [ + 12.971086410444947, + 7.7029131842117975, + 6.170992495013885, + 7.104888755861518, + 8.650198708966025, + 6.623561589139549, + 6.508078335111145, + 9.093410252353744, + 6.089149697604276, + 7.057779532190901, + 7.702302116384568, + 6.5227466722033824, + 6.954972482095409, + 6.535254918082626, + 9.99308253577182, + 5.590145547903439, + 6.865564317873041 + ], + "color": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, 
+ 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 + ], + [ + 0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 0.4980392156862745, + 0.4980392156862745, + 1.0 + ], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ] + }, + "x_label": "Metrics", + "subplots_adjust": { + "left": 0.37, + "right": 0.98 + }, + "comment": "Sports-related documents have a higher duplication count." 
+ } + ], + [ + "Number of Dump Duplication", + { + "type": "barh", + "kwargs": { + "y": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "width": [ + 3.8719109133730716, + 3.26455384318083, + 2.2721848150382313, + 3.265644018756859, + 3.444853093197951, + 3.1923417847873825, + 2.7517906637022187, + 3.3698330764051643, + 2.710181437434559, + 3.1639266861716124, + 3.206342888940481, + 2.7590002157690536, + 2.8303421621304015, + 2.6106544768459656, + 3.9413752803310276, + 2.4888664878823907, + 3.094817449212491 + ], + "color": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 + ], + [ + 0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 0.4980392156862745, + 0.4980392156862745, + 
1.0 + ], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ] + }, + "x_label": "Metrics", + "subplots_adjust": { + "left": 0.37, + "right": 0.98 + }, + "comment": "On average, Culture & Cultural geography related documents are duplicated across a higher number of Common Crawl dumps. Duplication of Shopping & Commodity appears in fewer dumps than others." + } + ], + [ + "Number of Year Duplication", + { + "type": "barh", + "kwargs": { + "y": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "width": [ + 1.4484135130393887, + 1.4291463355717668, + 1.229893865040677, + 1.4503641624264192, + 1.452872665456801, + 1.442735076861649, + 1.3276539735170263, + 1.4222795353953812, + 1.328097016674741, + 1.406236938652301, + 1.4158586964132385, + 1.3305229074812153, + 1.3344910656577857, + 1.2914712093532734, + 1.5438579866704571, + 1.2835525414227675, + 1.404339123681775 + ], + "color": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, 
+ 1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 + ], + [ + 0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 0.4980392156862745, + 0.4980392156862745, + 1.0 + ], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ] + }, + "x_label": "Metrics", + "subplots_adjust": { + "left": 0.37, + "right": 0.98 + }, + "comment": "On average, Culture & Cultural geography related documents are duplicated across more years than other topics." 
+ } + ], + [ + "Maximum Span of Year Duplication", + { + "type": "barh", + "kwargs": { + "y": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "width": [ + 1.4810222492619038, + 1.4615875163051355, + 1.2437028221138953, + 1.4792776613788288, + 1.49291539362753, + 1.4697025354362148, + 1.3531216666024084, + 1.4549983259811405, + 1.3486599259247682, + 1.432375114165854, + 1.4460216908228583, + 1.352992743917416, + 1.3557807185339452, + 1.309416527589089, + 1.582425218737168, + 1.3025537295236642, + 1.4325029904862776 + ], + "color": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 + ], + [ + 0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 0.4980392156862745, + 
0.4980392156862745, + 1.0 + ], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ] + }, + "x_label": "Metrics", + "subplots_adjust": { + "left": 0.37, + "right": 0.98 + }, + "comment": "On average, Culture & Cultural geography related documents are duplicated across a wider span of years." + } + ], + [ + "Language Score", + { + "type": "barh", + "kwargs": { + "y": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "width": [ + 0.9413726660750416, + 0.9347130364355554, + 0.8847180050664069, + 0.9336572405289453, + 0.9420075430577804, + 0.9522977107155225, + 0.8831956938165678, + 0.9481278901144439, + 0.9241279717677588, + 0.9066709862541587, + 0.9270825804900252, + 0.9117954084131167, + 0.921528771738386, + 0.8992133008305735, + 0.9224377655046135, + 0.9152426551412108, + 0.9178671893764959 + ], + "color": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 
+ ], + [ + 0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 0.4980392156862745, + 0.4980392156862745, + 1.0 + ], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ] + }, + "x_label": "Metrics", + "subplots_adjust": { + "left": 0.37, + "right": 0.98 + }, + "comment": "Average language scores of different topic groups are mostly consistent. No significant differences are observed." 
+ } + ], + [ + "Fraction of Duplicate Lines", + { + "type": "barh", + "kwargs": { + "y": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "width": [ + 0.01235014200385377, + 0.012056246914591116, + 0.018525258494333133, + 0.012726935235443207, + 0.01165333793386552, + 0.010444387257042395, + 0.016149995700960602, + 0.012705431934865763, + 0.01519943556613772, + 0.014809953345215319, + 0.012686293057054212, + 0.01603496888664195, + 0.01596207137084465, + 0.016014032499666292, + 0.013610478505124169, + 0.01580386009616988, + 0.015060041023072804 + ], + "color": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 + ], + [ + 0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 
0.4980392156862745, + 0.4980392156862745, + 1.0 + ], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ] + }, + "x_label": "Metrics", + "subplots_adjust": { + "left": 0.37, + "right": 0.98 + }, + "comment": "On average, Shopping & Commodity has a larger fraction of duplicate lines than others." + } + ], + [ + "Fraction of Characters in Duplicate Lines", + { + "type": "barh", + "kwargs": { + "y": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "width": [ + 0.00501345725991589, + 0.004299959504074716, + 0.0073601226054879785, + 0.004651424152553605, + 0.00450348053495509, + 0.003909584113418541, + 0.0063485903557626774, + 0.00521503913729261, + 0.005782503341128245, + 0.005962335751386622, + 0.004749891704712697, + 0.006420052544922626, + 0.0063561887111620065, + 0.006466672218067342, + 0.004978436253072214, + 0.006108371322424041, + 0.0057332990952240126 + ], + "color": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 
1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 + ], + [ + 0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 0.4980392156862745, + 0.4980392156862745, + 1.0 + ], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ] + }, + "x_label": "Metrics", + "subplots_adjust": { + "left": 0.37, + "right": 0.98 + }, + "comment": "Shopping & Commodity usually has a larger fraction of characters in duplicate lines than others." 
+ } + ], + [ + "Fraction of Characters in Most Common Bigram", + { + "type": "barh", + "kwargs": { + "y": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "width": [ + 0.02614438964212445, + 0.02549244163757135, + 0.03593997020714517, + 0.026520648762908574, + 0.023796693998532542, + 0.019517664362790295, + 0.03146900938445295, + 0.026900122790576828, + 0.027486920194835916, + 0.029735671266585457, + 0.02724062185263462, + 0.030402249730981233, + 0.03031798250174187, + 0.034936591389516845, + 0.02730012031746535, + 0.029329317288923955, + 0.028440195287636943 + ], + "color": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 + ], + [ + 0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 
0.4980392156862745, + 0.4980392156862745, + 0.4980392156862745, + 1.0 + ], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ] + }, + "x_label": "Metrics", + "subplots_adjust": { + "left": 0.37, + "right": 0.98 + } + } + ], + [ + "Fraction of Characters in Most Common 3-gram", + { + "type": "barh", + "kwargs": { + "y": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "width": [ + 0.025877442339206684, + 0.026073122171118526, + 0.03794965393907832, + 0.02756936824343807, + 0.024589084236341825, + 0.019970321326854976, + 0.031104349287997282, + 0.027138921074492478, + 0.02674544851177018, + 0.03082668946385283, + 0.027642774270487825, + 0.031311152209273344, + 0.030596143210215625, + 0.0352048856850328, + 0.028135774349846692, + 0.029182052353507664, + 0.03023015052666528 + ], + "color": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 + ], + [ + 
0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 0.4980392156862745, + 0.4980392156862745, + 1.0 + ], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ] + }, + "x_label": "Metrics", + "subplots_adjust": { + "left": 0.37, + "right": 0.98 + } + } + ], + [ + "Fraction of Characters in Most Common 4-gram", + { + "type": "barh", + "kwargs": { + "y": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "width": [ + 0.026649882448510714, + 0.026904261739744192, + 0.04086332064828129, + 0.0286899321496711, + 0.025495383586610822, + 0.020748509542508307, + 0.03171918073481819, + 0.027563495776633813, + 0.02693813171261885, + 0.03243470539147362, + 0.0287992899739741, + 0.032589127100319, + 0.031139178077804624, + 0.03630423964958027, + 0.029809457289325606, + 0.029522356378146167, + 
0.032375986416410006 + ], + "color": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 + ], + [ + 0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 0.4980392156862745, + 0.4980392156862745, + 1.0 + ], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ] + }, + "x_label": "Metrics", + "subplots_adjust": { + "left": 0.37, + "right": 0.98 + } + } + ], + [ + "Fraction of Characters in Duplicate 5-grams", + { + "type": "barh", + "kwargs": { + "y": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & 
Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "width": [ + 0.016989093741368057, + 0.01874268974292254, + 0.02001249239006167, + 0.01893345653851295, + 0.017576185062959156, + 0.013966567341084396, + 0.026648000310062814, + 0.021239561601745963, + 0.022547189937081085, + 0.016903077473431387, + 0.018277127900190513, + 0.019079382613460993, + 0.023467347573746446, + 0.021192854307303135, + 0.019157826340526964, + 0.021183180653813184, + 0.016589870490142093 + ], + "color": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 + ], + [ + 0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 0.4980392156862745, + 0.4980392156862745, + 1.0 + ], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ] + }, + "x_label": "Metrics", + "subplots_adjust": { + "left": 0.37, + "right": 0.98 + } + } + ], + [ + "Fraction of Characters 
in Duplicate 6-grams", + { + "type": "barh", + "kwargs": { + "y": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "width": [ + 0.011946923836249373, + 0.013297904416841108, + 0.014258151562338789, + 0.013153098583726782, + 0.012601072651000291, + 0.009837626317910313, + 0.01949595975959962, + 0.014924056163499448, + 0.015889641140216917, + 0.011840004108930956, + 0.012940087820238557, + 0.013424858515603134, + 0.016468963654372438, + 0.014839192401791004, + 0.01388493355575309, + 0.01498376261489033, + 0.011812586464028073 + ], + "color": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 + ], + [ + 0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 0.4980392156862745, + 
0.4980392156862745, + 1.0 + ], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ] + }, + "x_label": "Metrics", + "subplots_adjust": { + "left": 0.37, + "right": 0.98 + } + } + ], + [ + "Fraction of Characters in Duplicate 7-grams", + { + "type": "barh", + "kwargs": { + "y": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "width": [ + 0.009095872215498261, + 0.010146405377015994, + 0.010487557518535542, + 0.009868429864354638, + 0.009802808055168035, + 0.007541438580109868, + 0.015129318997269138, + 0.011302686364124783, + 0.011969695536420487, + 0.00892604076557906, + 0.009759568234633746, + 0.010070709856859254, + 0.012419860047704056, + 0.011070038486862109, + 0.010547925069683646, + 0.011327481696653985, + 0.008957792606056945 + ], + "color": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 + ], + [ + 0.5803921568627451, + 0.403921568627451, + 
0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 0.4980392156862745, + 0.4980392156862745, + 1.0 + ], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ] + }, + "x_label": "Metrics", + "subplots_adjust": { + "left": 0.37, + "right": 0.98 + } + } + ], + [ + "Fraction of Characters in Duplicate 8-grams", + { + "type": "barh", + "kwargs": { + "y": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "width": [ + 0.0073128979182685684, + 0.008109486840255576, + 0.00813393016693516, + 0.007792984494504855, + 0.008002590936702558, + 0.006117199534770664, + 0.012284551039331444, + 0.009023639757214827, + 0.009496488981527608, + 0.007086539674993228, + 0.0076650824522217454, + 0.007877075837565403, + 0.00987496717434344, + 0.008652258777583252, + 0.008392133389867372, + 0.00901948167673584, + 0.007053229339496676 + ], + "color": [ + [ + 1.0, 
+ 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 + ], + [ + 0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 0.4980392156862745, + 0.4980392156862745, + 1.0 + ], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ] + }, + "x_label": "Metrics", + "subplots_adjust": { + "left": 0.37, + "right": 0.98 + } + } + ], + [ + "Fraction of Characters in Duplicate 9-grams", + { + "type": "barh", + "kwargs": { + "y": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & 
Economics & Finance", + "Arts" + ], + "width": [ + 0.006148310898840968, + 0.00676529697013875, + 0.00643290688721836, + 0.006434352312383364, + 0.0067735701471297, + 0.005172565516416477, + 0.010288525380088334, + 0.007482544617336476, + 0.00780204339328974, + 0.005852660603046196, + 0.006240040171999708, + 0.006465362460507409, + 0.008165651028577293, + 0.006986331812620781, + 0.006928899525750178, + 0.007487698229960434, + 0.005728220555701086 + ], + "color": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 + ], + [ + 0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 0.4980392156862745, + 0.4980392156862745, + 1.0 + ], + [ + 0.7803921568627451, + 0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ] + }, + "x_label": "Metrics", + "subplots_adjust": { + "left": 0.37, + "right": 0.98 + } + } + ], + [ + "Fraction of Characters in Duplicate 10-grams", + { + "type": "barh", + "kwargs": { + 
"y": [ + "Sports", + "Society & Social Issues & Human Rights", + "Shopping & Commodity", + "Religion & Spirituality", + "Politics & Government", + "Personal Development & Human Resources & Career", + "Natural Science & Formal Science & Technology", + "Law & Justice", + "Health & Wellness & Medicine", + "Food & Drink & Cooking", + "Environment", + "Entertainment & Travel & Hobby", + "Education", + "Daily Life & Home & Lifestyle", + "Culture & Cultural geography", + "Business & Economics & Finance", + "Arts" + ], + "width": [ + 0.005325364079705381, + 0.005797357629820572, + 0.005283647214644124, + 0.005467491111249268, + 0.005879825006312822, + 0.004529332536092203, + 0.008882676950579147, + 0.006399831899960353, + 0.006645377495475746, + 0.005021569571200667, + 0.005306020206939719, + 0.00550360123328725, + 0.007013864844056383, + 0.005835955446724545, + 0.005845555947354781, + 0.00641447975612288, + 0.004809876486057196 + ], + "color": [ + [ + 1.0, + 0.4980392156862745, + 0.054901960784313725, + 1.0 + ], + [ + 1.0, + 0.7333333333333333, + 0.47058823529411764, + 1.0 + ], + [ + 0.17254901960784313, + 0.6274509803921569, + 0.17254901960784313, + 1.0 + ], + [ + 0.596078431372549, + 0.8745098039215686, + 0.5411764705882353, + 1.0 + ], + [ + 0.8392156862745098, + 0.15294117647058825, + 0.1568627450980392, + 1.0 + ], + [ + 1.0, + 0.596078431372549, + 0.5882352941176471, + 1.0 + ], + [ + 0.5803921568627451, + 0.403921568627451, + 0.7411764705882353, + 1.0 + ], + [ + 0.7725490196078432, + 0.6901960784313725, + 0.8352941176470589, + 1.0 + ], + [ + 0.5490196078431373, + 0.33725490196078434, + 0.29411764705882354, + 1.0 + ], + [ + 0.7686274509803922, + 0.611764705882353, + 0.5803921568627451, + 1.0 + ], + [ + 0.8901960784313725, + 0.4666666666666667, + 0.7607843137254902, + 1.0 + ], + [ + 0.9686274509803922, + 0.7137254901960784, + 0.8235294117647058, + 1.0 + ], + [ + 0.4980392156862745, + 0.4980392156862745, + 0.4980392156862745, + 1.0 + ], + [ + 0.7803921568627451, + 
0.7803921568627451, + 0.7803921568627451, + 1.0 + ], + [ + 0.7372549019607844, + 0.7411764705882353, + 0.13333333333333333, + 1.0 + ], + [ + 0.8588235294117647, + 0.8588235294117647, + 0.5529411764705883, + 1.0 + ], + [ + 0.09019607843137255, + 0.7450980392156863, + 0.8117647058823529, + 1.0 + ] + ] + }, + "x_label": "Metrics", + "subplots_adjust": { + "left": 0.37, + "right": 0.98 + } + } + ] +] \ No newline at end of file