diff --git a/.gitattributes b/.gitattributes index 063d87ab99357bb953a656f0880d140b4105cb53..2f05e84ca24036c5faa84698514fbe91b48cb48d 100644 --- a/.gitattributes +++ b/.gitattributes @@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text datasets/* filter=lfs diff=lfs merge=lfs -text +datasets/** filter=lfs diff=lfs merge=lfs -text diff --git a/datasets/allenai_c4_en/fqdn/avg_line_length/metric.json b/datasets/allenai_c4_en/fqdn/avg_line_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..37013d784c4412a668750edbfc5d1c85557f8ce8 --- /dev/null +++ b/datasets/allenai_c4_en/fqdn/avg_line_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13e295d246a9b709aebd04e178ba14a42d004148b45359f865b342bbb8c283a1 +size 1623484 diff --git a/datasets/allenai_c4_en/fqdn/avg_sentence_length/metric.json b/datasets/allenai_c4_en/fqdn/avg_sentence_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..ee9583a6810d80ad62407d2eb21a39b92eaf0762 --- /dev/null +++ b/datasets/allenai_c4_en/fqdn/avg_sentence_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:844455b9c507a8b176149482b26b3520f96b39e126b983fb7143a326c8932bb8 +size 1674861 diff --git a/datasets/allenai_c4_en/fqdn/avg_word_length/metric.json b/datasets/allenai_c4_en/fqdn/avg_word_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..22d61f743b58995e46ffc2f96b58a570b8592fb4 --- /dev/null +++ b/datasets/allenai_c4_en/fqdn/avg_word_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:929c07146bbe8a77500af69f52414a97026d8cdb2a7ce277cbf2282437b4b1b8 +size 1761053 diff --git a/datasets/allenai_c4_en/fqdn/avg_words_per_line/metric.json b/datasets/allenai_c4_en/fqdn/avg_words_per_line/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..7f017e5f43c846edf19408381494e6eb3bd40545 --- /dev/null +++ b/datasets/allenai_c4_en/fqdn/avg_words_per_line/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:524677a4969079244caf66ac8b92326b86d0733f24accd4c01abcb98e3318901 +size 1616865 diff --git a/datasets/allenai_c4_en/fqdn/capitalized_ratio/metric.json b/datasets/allenai_c4_en/fqdn/capitalized_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..af30cd8230b8b89e8eb03a57ca435b6aed66b6f2 --- /dev/null +++ b/datasets/allenai_c4_en/fqdn/capitalized_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3eaa4cffbf0cab6b0a12feaaf383d4e827c2b658872017c0cbe32cb5a06573f5 +size 1794500 diff --git a/datasets/allenai_c4_en/fqdn/ccnet_perplexity_wikipedia_en/metric.json b/datasets/allenai_c4_en/fqdn/ccnet_perplexity_wikipedia_en/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..d30d2048189c95b5b9dec79de3f05aee1049afd3 --- /dev/null +++ b/datasets/allenai_c4_en/fqdn/ccnet_perplexity_wikipedia_en/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffbbe7122e820473190be77a92836a26690f2a6f0bd54eb7e40b520aaddac755 +size 1569467 diff --git a/datasets/allenai_c4_en/fqdn/digit_ratio/metric.json b/datasets/allenai_c4_en/fqdn/digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..e0a292b06f6b131e767e83bf1ffdddf627924a00 --- /dev/null +++ b/datasets/allenai_c4_en/fqdn/digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a637bfb0478881101ea1a4fd471e286c7e97c673875fdffd6ce359b23583447 +size 1746356 diff --git a/datasets/allenai_c4_en/fqdn/elipsis_ratio/metric.json b/datasets/allenai_c4_en/fqdn/elipsis_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..75f5fd906027f37fca6d09f5b9e4ebb066c66d5e --- /dev/null +++ b/datasets/allenai_c4_en/fqdn/elipsis_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4fdec8ce0fbe952a130d4874e14686a724da961ae26dcc3f6a9458b8eb0ba9c5 +size 1592486 diff --git a/datasets/allenai_c4_en/fqdn/fasttext_en/metric.json b/datasets/allenai_c4_en/fqdn/fasttext_en/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..d74c738ac81be0893c68d9c55ca0bade5e5a7be9 --- /dev/null +++ b/datasets/allenai_c4_en/fqdn/fasttext_en/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0e57979bbc0ad4ab11760fef7b675ca5d3a2bd891926ffad327f0e9860622a7 +size 1805069 diff --git a/datasets/allenai_c4_en/fqdn/length/metric.json b/datasets/allenai_c4_en/fqdn/length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..81a812b914b5bf6a945d3d428e25922aa466d0ec --- /dev/null +++ b/datasets/allenai_c4_en/fqdn/length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ba340b60da923dbf4425d4cea30d5277c3d618d6eb13e2075b0e78c2845dac8 +size 1468886 diff --git a/datasets/allenai_c4_en/fqdn/line_char_duplicates/metric.json b/datasets/allenai_c4_en/fqdn/line_char_duplicates/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..671b9d7b6399b0031aebd9d9c951d8f326eecd25 --- /dev/null +++ b/datasets/allenai_c4_en/fqdn/line_char_duplicates/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d375b628e2f34bc839a1124ae6ffab4957c275ad3335271e6bde2a49de652a5 +size 201296 diff --git a/datasets/allenai_c4_en/fqdn/line_duplicates/metric.json b/datasets/allenai_c4_en/fqdn/line_duplicates/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..671b9d7b6399b0031aebd9d9c951d8f326eecd25 --- /dev/null +++ b/datasets/allenai_c4_en/fqdn/line_duplicates/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d375b628e2f34bc839a1124ae6ffab4957c275ad3335271e6bde2a49de652a5 +size 201296 diff --git a/datasets/allenai_c4_en/fqdn/lines_ending_with_terminal_mark_ratio/metric.json b/datasets/allenai_c4_en/fqdn/lines_ending_with_terminal_mark_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..d0c6d82f63a788ca0ef941e4035a697198b4053f --- /dev/null +++ b/datasets/allenai_c4_en/fqdn/lines_ending_with_terminal_mark_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2fddfac9f4d3f26ff639798cadf388b3f7066c6b1ebd79ce49112181a93d12d9 +size 457088 diff --git a/datasets/allenai_c4_en/fqdn/long_line_ratio_chars_10000/metric.json b/datasets/allenai_c4_en/fqdn/long_line_ratio_chars_10000/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..9bf01c8d790f31ae3727149239e1a8fc1eaf783b --- /dev/null +++ b/datasets/allenai_c4_en/fqdn/long_line_ratio_chars_10000/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36bf4f9d8ea410a63b9bafcd5309e8a3c02131f72c38dccf9809bf5e0f18c0dd +size 273594 diff --git a/datasets/allenai_c4_en/fqdn/long_line_ratio_chars_2000/metric.json b/datasets/allenai_c4_en/fqdn/long_line_ratio_chars_2000/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..fa021353424d584457e1ca46b51660b211c66ca3 --- /dev/null +++ b/datasets/allenai_c4_en/fqdn/long_line_ratio_chars_2000/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7870d486f759a4281d64777842f1b2ffc9000ab9fbbc156b2f060aa186fb905d +size 1007208 diff --git a/datasets/allenai_c4_en/fqdn/long_sentence_ratio_75/metric.json b/datasets/allenai_c4_en/fqdn/long_sentence_ratio_75/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..795d721a74a498f2903ba926502f2ce43b26e270 --- /dev/null +++ b/datasets/allenai_c4_en/fqdn/long_sentence_ratio_75/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17ecfb49ba00c0b76beb322e54b276ec687e32d95ad376801b68264e5c9faaae +size 1605361 diff --git a/datasets/allenai_c4_en/fqdn/long_word_ratio_7/metric.json b/datasets/allenai_c4_en/fqdn/long_word_ratio_7/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..fa7fb045467e1a9c2b5856e0a190c89f09554f63 --- /dev/null +++ b/datasets/allenai_c4_en/fqdn/long_word_ratio_7/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ed493dfe25ed5ca3fe2b84088c55f217697a01456b00c8a1c17b44c04641a71 +size 1792368 diff --git a/datasets/allenai_c4_en/fqdn/n_lines/metric.json b/datasets/allenai_c4_en/fqdn/n_lines/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..543376a1a4de8dc4cd9baa3671225776f1b1326a --- /dev/null +++ b/datasets/allenai_c4_en/fqdn/n_lines/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd437b80fe4f2a4e7d717cd8868a9f779a723e0e988691fac8e841b106990a8c +size 1401311 diff --git a/datasets/allenai_c4_en/fqdn/n_sentences/metric.json b/datasets/allenai_c4_en/fqdn/n_sentences/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..162fe8b505fd2aad3603a5b763f9f8669fc76a4c --- /dev/null +++ b/datasets/allenai_c4_en/fqdn/n_sentences/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a6589be3e061a255ed88dc51c6b068c7d4c30211c636aad36b635a31bef3a1b +size 1416594 diff --git a/datasets/allenai_c4_en/fqdn/n_words/metric.json b/datasets/allenai_c4_en/fqdn/n_words/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..d80c94c1034e73b5ef485eb649f869fd9f0e2958 --- /dev/null +++ b/datasets/allenai_c4_en/fqdn/n_words/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19b6aebbb119d8dc9524917b223e023a91f6e6f7c04e230e257212f52903813a +size 1451188 diff --git a/datasets/allenai_c4_en/fqdn/non_alpha_digit_ratio/metric.json b/datasets/allenai_c4_en/fqdn/non_alpha_digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..1d680f14cd123e5852eaa1bd4cb39ed35013f1ab --- /dev/null +++ b/datasets/allenai_c4_en/fqdn/non_alpha_digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56a6ed5050c14e6db1d8919c9ae4b1c990485389f748881cd707d78bab938262 +size 1846055 diff --git a/datasets/allenai_c4_en/fqdn/punctuation_ratio/metric.json b/datasets/allenai_c4_en/fqdn/punctuation_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..15ecab35b724f054bf31e5c863ef9f75c4984342 --- /dev/null +++ b/datasets/allenai_c4_en/fqdn/punctuation_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d30a82db9526d7cb87c7aea1497b1cc2e9c372058720e4572738a9f3c04ada07 +size 1863971 diff --git a/datasets/allenai_c4_en/fqdn/short_line_ratio_chars_10/metric.json b/datasets/allenai_c4_en/fqdn/short_line_ratio_chars_10/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..b1ba569718efe5d370364b765cfd77893b82ac35 --- /dev/null +++ b/datasets/allenai_c4_en/fqdn/short_line_ratio_chars_10/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa3aa96062ab15f0a5ee70c48fa5417c5bec9f6ed56f38959a50f004fcc554e6 +size 224470 diff --git a/datasets/allenai_c4_en/fqdn/short_line_ratio_chars_30/metric.json b/datasets/allenai_c4_en/fqdn/short_line_ratio_chars_30/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..cebf45f590ab6e0b48dd1fd2fcc75f1eb3c031b0 --- /dev/null +++ b/datasets/allenai_c4_en/fqdn/short_line_ratio_chars_30/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06aa0809dbedf7bad9ad3aef7e6a8dd7a63eee2121addd38cc4a641dd5918d34 +size 1516332 diff --git a/datasets/allenai_c4_en/fqdn/short_sentence_ratio_20/metric.json b/datasets/allenai_c4_en/fqdn/short_sentence_ratio_20/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..1f265deff6d11baccd2a27744d7364d008315b58 --- /dev/null +++ b/datasets/allenai_c4_en/fqdn/short_sentence_ratio_20/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c3af2c2ebc293af79d69f17d9a959a45afffd5a8b87ddc8a32224f6c25f2b2c +size 1619393 diff --git a/datasets/allenai_c4_en/fqdn/short_word_ratio_3/metric.json b/datasets/allenai_c4_en/fqdn/short_word_ratio_3/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..2267abbb394179325348b5d199b6eadbffa20158 --- /dev/null +++ b/datasets/allenai_c4_en/fqdn/short_word_ratio_3/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd76ee88aefd796ee002ff0bbe19f0e534633f1c6ebae120e0a96409cfd2d53b +size 1807004 diff --git a/datasets/allenai_c4_en/fqdn/stop_word_ratio/metric.json b/datasets/allenai_c4_en/fqdn/stop_word_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..a2ec83302cb854254f7cc90ebac21fee610b01fe --- /dev/null +++ b/datasets/allenai_c4_en/fqdn/stop_word_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4fcf0909a13e9b5fa4f2474e23e2ed6c933cdeb8442ec39165be187edf1df2e +size 1758157 diff --git a/datasets/allenai_c4_en/fqdn/type_token_ratio/metric.json b/datasets/allenai_c4_en/fqdn/type_token_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..1475e3c74bc393adeec76546a733a73cdd8d838b --- /dev/null +++ b/datasets/allenai_c4_en/fqdn/type_token_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82fbe81626621522e1844b6f0db3d1c91bf049bdcac627bfd98e71401a7dcd2a +size 1796474 diff --git a/datasets/allenai_c4_en/fqdn/uppercase_ratio/metric.json b/datasets/allenai_c4_en/fqdn/uppercase_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..6e7de169ae3dc1c6753d8258be4774e6113d914c --- /dev/null +++ b/datasets/allenai_c4_en/fqdn/uppercase_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d25f8fc76974e04ca7f1b18e5bcbf32c04d726e7e718f89ec9ad93ed21a06711 +size 1836030 diff --git a/datasets/allenai_c4_en/fqdn/white_space_ratio/metric.json b/datasets/allenai_c4_en/fqdn/white_space_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..f825066aaf5157be1cf975b69e8b23469fc0c2f8 --- /dev/null +++ b/datasets/allenai_c4_en/fqdn/white_space_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b481dc7f412871a09f9ae24bbaa1015f90e620efca35763930a0b326d3497c41 +size 1846756 diff --git a/datasets/allenai_c4_en/histogram/avg_line_length/metric.json b/datasets/allenai_c4_en/histogram/avg_line_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..b0ae72bbaaa4de5a76b211803af2b6ed23181c14 --- /dev/null +++ b/datasets/allenai_c4_en/histogram/avg_line_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64706836547dd92505c462d14b288aa45eb1237b0f62b8d54dc5ead6e68577ce +size 32776755 diff --git a/datasets/allenai_c4_en/histogram/avg_sentence_length/metric.json b/datasets/allenai_c4_en/histogram/avg_sentence_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..a15ce646399b1e08403ed95304e5b77946298ada --- /dev/null +++ b/datasets/allenai_c4_en/histogram/avg_sentence_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58fbbcd53e5e6edc22dd7e1f145e3d30d494bc534615a00c30aa989e49c3a648 +size 12712015 diff --git a/datasets/allenai_c4_en/histogram/avg_word_length/metric.json b/datasets/allenai_c4_en/histogram/avg_word_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..be16461f27a027aed28b208e3141ba6378b1f136 --- /dev/null +++ b/datasets/allenai_c4_en/histogram/avg_word_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49fec58610045ca6c70cc840b4ee5063dce64127d4b0b8ee99e682681050b380 +size 492624 diff --git a/datasets/allenai_c4_en/histogram/avg_words_per_line/metric.json b/datasets/allenai_c4_en/histogram/avg_words_per_line/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..0b8949b8a113bea49cf454e454dd537364017da9 --- /dev/null +++ b/datasets/allenai_c4_en/histogram/avg_words_per_line/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2675b3b6c52e8f6cc4e002c7f2256e40618bae71c2d5e718f6f78c4a5a7cce15 +size 8711193 diff --git a/datasets/allenai_c4_en/histogram/capitalized_ratio/metric.json b/datasets/allenai_c4_en/histogram/capitalized_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..468d8099ec94388e69968a310238973d1d40d96a --- /dev/null +++ b/datasets/allenai_c4_en/histogram/capitalized_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:635f1112a515a1215a03d501cc3a36718202959b8a9258b3df1afd3547223c8e +size 41710 diff --git a/datasets/allenai_c4_en/histogram/ccnet_perplexity_wikipedia_en/metric.json b/datasets/allenai_c4_en/histogram/ccnet_perplexity_wikipedia_en/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..634475e6bc8446581cc652aa4fc4ebfe8a095a25 --- /dev/null +++ b/datasets/allenai_c4_en/histogram/ccnet_perplexity_wikipedia_en/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4fe377ec9f760fd50fdb88b2fb0d1f82366a059099a64575195cd6cfd4b8b3e +size 6708687 diff --git a/datasets/allenai_c4_en/histogram/digit_ratio/metric.json b/datasets/allenai_c4_en/histogram/digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..48ba7eb70e7ebed16305f7d69d424746534fc9c9 --- /dev/null +++ b/datasets/allenai_c4_en/histogram/digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46c808cf4362d15b756fa6adb4156c86234d11607e6ef06433896da8027469fe +size 28476 diff --git a/datasets/allenai_c4_en/histogram/elipsis_ratio/metric.json b/datasets/allenai_c4_en/histogram/elipsis_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..4f4909889a3c6b2dc745f2df5c15a27f8b5c39b1 --- /dev/null +++ b/datasets/allenai_c4_en/histogram/elipsis_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3706de37bdeae4d16468d6334f23929e78099faa71e8ee6372af38c5ac1f47f +size 22912 diff --git a/datasets/allenai_c4_en/histogram/fasttext_en/metric.json b/datasets/allenai_c4_en/histogram/fasttext_en/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..4e6b27da4270e713ec9a64b90c72e4d437527a09 --- /dev/null +++ b/datasets/allenai_c4_en/histogram/fasttext_en/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ab49879c8588e296a222ade352f6a08ac11fbb12eb33e86d6ffa451889c9ed7 +size 40664 diff --git a/datasets/allenai_c4_en/histogram/length/metric.json b/datasets/allenai_c4_en/histogram/length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..230120f6148944db2dc4376d6ca0b8469426c541 --- /dev/null +++ b/datasets/allenai_c4_en/histogram/length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1e5f6915cd97e5eb1cfb366aabd5f877e40778b9636ddeca69bfaefd9496181 +size 3984610 diff --git a/datasets/allenai_c4_en/histogram/line_char_duplicates/metric.json b/datasets/allenai_c4_en/histogram/line_char_duplicates/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..b1a8183cab45d128a82def2ec328ff6fd8b3b0da --- /dev/null +++ b/datasets/allenai_c4_en/histogram/line_char_duplicates/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9f35e4d190e53a300f84d066f45da136febaf7f900d0a264fbedc24e616e053 +size 45 diff --git a/datasets/allenai_c4_en/histogram/line_duplicates/metric.json b/datasets/allenai_c4_en/histogram/line_duplicates/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..b1a8183cab45d128a82def2ec328ff6fd8b3b0da --- /dev/null +++ b/datasets/allenai_c4_en/histogram/line_duplicates/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9f35e4d190e53a300f84d066f45da136febaf7f900d0a264fbedc24e616e053 +size 45 diff --git a/datasets/allenai_c4_en/histogram/lines_ending_with_terminal_mark_ratio/metric.json b/datasets/allenai_c4_en/histogram/lines_ending_with_terminal_mark_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..a48202acd392f9d806a739234aa2bc16539ec1f1 --- /dev/null +++ b/datasets/allenai_c4_en/histogram/lines_ending_with_terminal_mark_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37c7c78e254802853ca66bc3bb163f1436285eacdc2bb06b4fc11ea00ad2438a +size 45 diff --git a/datasets/allenai_c4_en/histogram/long_line_ratio_chars_10000/metric.json b/datasets/allenai_c4_en/histogram/long_line_ratio_chars_10000/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..93ad502ccf483b1dae4c4b05fd997164e4fd24d1 --- /dev/null +++ b/datasets/allenai_c4_en/histogram/long_line_ratio_chars_10000/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:adf0a27248386ac8ecbc79e1e722370833d647aa16ef3794e7400404f7627fd1 +size 5864 diff --git a/datasets/allenai_c4_en/histogram/long_line_ratio_chars_2000/metric.json b/datasets/allenai_c4_en/histogram/long_line_ratio_chars_2000/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..cc926afa37c66d5777e58a0cc7b1fbe0141beb59 --- /dev/null +++ b/datasets/allenai_c4_en/histogram/long_line_ratio_chars_2000/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e10d45e9aa9f9549bde1bfc627905e1e8984706525df1b5605c3ca2794435616 +size 28442 diff --git a/datasets/allenai_c4_en/histogram/long_sentence_ratio_75/metric.json b/datasets/allenai_c4_en/histogram/long_sentence_ratio_75/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..08df06a51bb4f928f46dae4ac470a31449dfc586 --- /dev/null +++ b/datasets/allenai_c4_en/histogram/long_sentence_ratio_75/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5013be4054e7fb0f0c3c7b128fc83be203a26e3ec1bfe934f2946346d0c2918 +size 42874 diff --git a/datasets/allenai_c4_en/histogram/long_word_ratio_5/metric.json b/datasets/allenai_c4_en/histogram/long_word_ratio_5/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..ef554eb666344640e7f0b835e983f8007062a89b --- /dev/null +++ b/datasets/allenai_c4_en/histogram/long_word_ratio_5/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c4f984de0c13737617248d94c1370fdd82cf7a275d56df051d40e188c227cbb +size 4946 diff --git a/datasets/allenai_c4_en/histogram/long_word_ratio_7/metric.json b/datasets/allenai_c4_en/histogram/long_word_ratio_7/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..cc5831c65cfb150a2f980218bb09ff4033bb65c1 --- /dev/null +++ b/datasets/allenai_c4_en/histogram/long_word_ratio_7/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:127e2c2c254335993754dcc3ffd3f42c411d9dc510319d546d2d4d263c538d67 +size 37781 diff --git a/datasets/allenai_c4_en/histogram/n_lines/metric.json b/datasets/allenai_c4_en/histogram/n_lines/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..79b13f728a345789ddbd40953f0c659a70d0b292 --- /dev/null +++ b/datasets/allenai_c4_en/histogram/n_lines/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce2f177430bb3b565a7bd822bcf28707ace7f0d3dab095a8a5f6f736178903b6 +size 51276 diff --git a/datasets/allenai_c4_en/histogram/n_sentences/metric.json b/datasets/allenai_c4_en/histogram/n_sentences/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..4560ff640932ac97e3962720988c23739f69a950 --- /dev/null +++ b/datasets/allenai_c4_en/histogram/n_sentences/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c6f1cbaa6a516c517f66b3efb80f51f3d5a9f0063cd32800f38653f1d4217c4 +size 111395 diff --git a/datasets/allenai_c4_en/histogram/n_words/metric.json b/datasets/allenai_c4_en/histogram/n_words/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..632312f25bf798c02a4744725ed734c5fc739612 --- /dev/null +++ b/datasets/allenai_c4_en/histogram/n_words/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dae039d94d8d0d1f3edaac9e30481ba102919909234ba6784d6fb62eed3ed74e +size 1027744 diff --git a/datasets/allenai_c4_en/histogram/non_alpha_digit_ratio/metric.json b/datasets/allenai_c4_en/histogram/non_alpha_digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..a0f2719161f83a5b97bf23b5b2075d7b517d48b0 --- /dev/null +++ b/datasets/allenai_c4_en/histogram/non_alpha_digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47c6ab97769999ea9b4df634671e0ed6507d78935df36773e1b63129174c3500 +size 39714 diff --git a/datasets/allenai_c4_en/histogram/punctuation_ratio/metric.json b/datasets/allenai_c4_en/histogram/punctuation_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..84779a67903067ed3525e09b552bc5ba65193b31 --- /dev/null +++ b/datasets/allenai_c4_en/histogram/punctuation_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf79e2f0a53f9722c34f7e51fedd4e7fecb4e6f52565208657fcead01016aefa +size 38714 diff --git a/datasets/allenai_c4_en/histogram/short_line_ratio_chars_10/metric.json b/datasets/allenai_c4_en/histogram/short_line_ratio_chars_10/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..ce8c87d7e1f79191a799f4eb48825fe513845956 --- /dev/null +++ b/datasets/allenai_c4_en/histogram/short_line_ratio_chars_10/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66c7090e7e29249586971f6d7f2e0ec43d6cf1df6f97844e329d814c9123db75 +size 5617 diff --git a/datasets/allenai_c4_en/histogram/short_line_ratio_chars_30/metric.json b/datasets/allenai_c4_en/histogram/short_line_ratio_chars_30/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..cfb3007154006765dd8b185f1448ea8be82c40dc --- /dev/null +++ b/datasets/allenai_c4_en/histogram/short_line_ratio_chars_30/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:022c4f698bf3243fe8f6c2ce1817dc8d576334207f96cb5919de1cf85c2d282f +size 40359 diff --git a/datasets/allenai_c4_en/histogram/short_sentence_ratio_20/metric.json b/datasets/allenai_c4_en/histogram/short_sentence_ratio_20/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..99fec28d66f70ec827bbdb38b604024ffe31e2d7 --- /dev/null +++ b/datasets/allenai_c4_en/histogram/short_sentence_ratio_20/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:197fb939fcd9bdf01bedd3edb6616667cde5dd39164cce74153c6f2920b0b666 +size 41757 diff --git a/datasets/allenai_c4_en/histogram/short_word_ratio_3/metric.json b/datasets/allenai_c4_en/histogram/short_word_ratio_3/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..6f1e400ba3ee8aac2addc91b5a6e7b58c8c6af11 --- /dev/null +++ b/datasets/allenai_c4_en/histogram/short_word_ratio_3/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1000a8cd672824695e30f74f9f7edc99bf621e45d568b3a9d8a65c7e3c2b3fd3 +size 41801 diff --git a/datasets/allenai_c4_en/histogram/stop_word_ratio/metric.json b/datasets/allenai_c4_en/histogram/stop_word_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..7a25f84d34b62520dc6f75021c8941d7a6969910 --- /dev/null +++ b/datasets/allenai_c4_en/histogram/stop_word_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a86dcc3812d8b2963feca00b24f2896a6d0e788d3433a207542f041d66a37edd +size 19127 diff --git a/datasets/allenai_c4_en/histogram/type_token_ratio/metric.json b/datasets/allenai_c4_en/histogram/type_token_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..b62732b1f6e1178b41094fa281069a009e5a266f --- /dev/null +++ b/datasets/allenai_c4_en/histogram/type_token_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:522ba8c32a1842c034ed44c6b8c791336177b0c083e01ffca3f4299decb2596b +size 42760 diff --git a/datasets/allenai_c4_en/histogram/uppercase_ratio/metric.json b/datasets/allenai_c4_en/histogram/uppercase_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..5b84815578a1e575fbe5085fd14ab930cf9bcece --- /dev/null +++ b/datasets/allenai_c4_en/histogram/uppercase_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aaa2222ba2168c873610700cab837db2e280a48e96e3db0dee862024262f3aa1 +size 37437 diff --git a/datasets/allenai_c4_en/histogram/white_space_ratio/metric.json b/datasets/allenai_c4_en/histogram/white_space_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..18c92cf7366006e91661810f8788e4c6e7e866fd --- /dev/null +++ b/datasets/allenai_c4_en/histogram/white_space_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed124a12c323981a6a7b36cba1eb724c8c10057d8bcce727ace81c7fa99831f5 +size 29164 diff --git a/datasets/allenai_c4_en/none/avg_line_length/metric.json b/datasets/allenai_c4_en/none/avg_line_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..26592f7bb8542c822379083d58fb32c6940bc50f --- /dev/null +++ b/datasets/allenai_c4_en/none/avg_line_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe9f5f5bbe46291624a9701ada2ffc7e4e60fbf3bc3e0419d42575741c5d8d9f +size 189 diff --git a/datasets/allenai_c4_en/none/avg_word_length/metric.json b/datasets/allenai_c4_en/none/avg_word_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..f3a6b4fdad1a6d7ce542b73bf6ca027e73cf6b8a --- /dev/null +++ b/datasets/allenai_c4_en/none/avg_word_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33f7f9d1067c47f911a4a108b983496f6c2db3b5996b8576eb13dc2625325bb5 +size 202 diff --git a/datasets/allenai_c4_en/none/avg_words_per_line/metric.json b/datasets/allenai_c4_en/none/avg_words_per_line/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..773497afa70ca10fbcb35d65bef951d472f0568d --- /dev/null +++ b/datasets/allenai_c4_en/none/avg_words_per_line/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:43343d6a9662b61d6126d3cd74459d678e6141d856708adedb8682b83896b3f4 +size 190 diff --git a/datasets/allenai_c4_en/none/digit_ratio/metric.json b/datasets/allenai_c4_en/none/digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..800e5d2cb7c5169b1c76254c1a7dc8e0d46c32a3 --- /dev/null +++ b/datasets/allenai_c4_en/none/digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65871937d6e3dbfdece9c44bb6782e9a629fda563d4eb07819f6d4d5b5af31df +size 210 diff --git a/datasets/allenai_c4_en/none/fasttext_en/metric.json b/datasets/allenai_c4_en/none/fasttext_en/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..d8df7acb6be07213e67bcc39ce5194c0ea203594 --- /dev/null +++ b/datasets/allenai_c4_en/none/fasttext_en/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:780c9e30031a45049e0eeb34aa44da497891200792a0670ae7702d7596968297 +size 208 diff --git a/datasets/allenai_c4_en/none/length/metric.json b/datasets/allenai_c4_en/none/length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..15aabf387bc6dcd5b783acae6d4692b4239d1010 --- /dev/null +++ b/datasets/allenai_c4_en/none/length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37f408bdd40810a16f94be7a8326e02b9d2e3b452c2fdc07193aa9a3cc6171f9 +size 185 diff --git a/datasets/allenai_c4_en/none/line_char_duplicates/metric.json b/datasets/allenai_c4_en/none/line_char_duplicates/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..1ea0e0d75aebca9088e8f2419632dcd45e44abab --- /dev/null +++ b/datasets/allenai_c4_en/none/line_char_duplicates/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a6f01c247cd19807f8b10d02d1a3d68e814635d04dd59513b07607284c24c5c +size 14 diff --git a/datasets/allenai_c4_en/none/line_duplicates/metric.json b/datasets/allenai_c4_en/none/line_duplicates/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..1ea0e0d75aebca9088e8f2419632dcd45e44abab --- /dev/null +++ b/datasets/allenai_c4_en/none/line_duplicates/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a6f01c247cd19807f8b10d02d1a3d68e814635d04dd59513b07607284c24c5c +size 14 diff --git a/datasets/allenai_c4_en/none/lines_ending_with_terminal_mark_ratio/metric.json b/datasets/allenai_c4_en/none/lines_ending_with_terminal_mark_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..8609885b049f0ccc693c78e0e5f27a17d1694bdc --- /dev/null +++ b/datasets/allenai_c4_en/none/lines_ending_with_terminal_mark_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41a5c94e51b095f28ef4212cb105e199f0ad05cb3e3714ed065a46923fe11a9f +size 51 diff --git a/datasets/allenai_c4_en/none/long_line_ratio_chars_10000/metric.json b/datasets/allenai_c4_en/none/long_line_ratio_chars_10000/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..06f3375fde2520d50f91a683e6ed8326daa7724d --- /dev/null +++ b/datasets/allenai_c4_en/none/long_line_ratio_chars_10000/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06713828378b7783fb3180b5cedfa87a12490d99c5359b1baab7d1174fdfaab2 +size 197 diff --git a/datasets/allenai_c4_en/none/long_line_ratio_chars_2000/metric.json b/datasets/allenai_c4_en/none/long_line_ratio_chars_2000/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..556db30bbd7c90460ebabc6c6610f245a031b00c --- /dev/null +++ b/datasets/allenai_c4_en/none/long_line_ratio_chars_2000/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e82802c7d5dc2b934bafc082d9a9d91b72488c86a611e8b7a1040f9585721255 +size 195 diff --git a/datasets/allenai_c4_en/none/long_word_ratio_5/metric.json b/datasets/allenai_c4_en/none/long_word_ratio_5/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..e6543a6449fda3d8741bd4e26161ebefb79c4fcb --- /dev/null +++ b/datasets/allenai_c4_en/none/long_word_ratio_5/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0da2001f73ec3a43228f1549d2029642a77ae71c6309e20ad24d8ab8477c02b +size 207 diff --git a/datasets/allenai_c4_en/none/long_word_ratio_7/metric.json b/datasets/allenai_c4_en/none/long_word_ratio_7/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..1a21c2aa6a451e37bab08f374fd83ee4511655fe --- /dev/null +++ b/datasets/allenai_c4_en/none/long_word_ratio_7/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bb91a30a897c1df1cf4b7a1013218e4b15f1374a70da8223983da9e42d68b05 +size 205 diff --git a/datasets/allenai_c4_en/none/n_lines/metric.json b/datasets/allenai_c4_en/none/n_lines/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..9af5b243294220621094ebf474921165c001b66c --- /dev/null +++ b/datasets/allenai_c4_en/none/n_lines/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac18b6d77562467950b66d17f68e3ec0e5187dcef88e027023274688b62a05ef +size 179 diff --git a/datasets/allenai_c4_en/none/n_words/metric.json b/datasets/allenai_c4_en/none/n_words/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..f39d1a968d6b6ec1fed43888fd29101b07e0e9e5 --- /dev/null +++ b/datasets/allenai_c4_en/none/n_words/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4de7f5473e2a1347fa332f59d768bc1396d94704fb4fe432f976b65dd13eb388 +size 181 diff --git a/datasets/allenai_c4_en/none/non_alpha_digit_ratio/metric.json b/datasets/allenai_c4_en/none/non_alpha_digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..e3a426159a59e4f1ec0d49b8b7aadb3128120691 --- /dev/null +++ b/datasets/allenai_c4_en/none/non_alpha_digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e849f5e5aa5d9fab521ebe9a592d97a886b2ca4226701a2094235518097c10cc +size 224 diff --git a/datasets/allenai_c4_en/none/short_line_ratio_chars_10/metric.json b/datasets/allenai_c4_en/none/short_line_ratio_chars_10/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..4dbbaf9d5d389b700bb983928c4b8e8d4c36853c --- /dev/null +++ b/datasets/allenai_c4_en/none/short_line_ratio_chars_10/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9010d74b31a2f6f66fecdaeaba07abc0efe070842394c6406ad5603807150104 +size 199 diff --git a/datasets/allenai_c4_en/none/short_line_ratio_chars_30/metric.json b/datasets/allenai_c4_en/none/short_line_ratio_chars_30/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..49d08c35befc7a7c56d4d26fac5951e79eee738d --- /dev/null +++ b/datasets/allenai_c4_en/none/short_line_ratio_chars_30/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e51f9b3bf39a0cef85d15f2fe60f5a9f492168ab65a90b4ac8491658ba46207b +size 195 diff --git a/datasets/allenai_c4_en/none/short_word_ratio_3/metric.json b/datasets/allenai_c4_en/none/short_word_ratio_3/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..a43ef8c1c380ec17fa861268a30c2fc9ddd7b172 --- /dev/null +++ b/datasets/allenai_c4_en/none/short_word_ratio_3/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e83b469275d0e450c059222754551fa3a41d19374ffb5b559c86160ba899dc71 +size 207 diff --git a/datasets/allenai_c4_en/none/white_space_ratio/metric.json b/datasets/allenai_c4_en/none/white_space_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..c4ad43c68f6ca9e5fbeed7c5336d613e56c3d561 --- /dev/null +++ b/datasets/allenai_c4_en/none/white_space_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75ccf0093f94be65ac6e14059926d66f0497dae631955077a4ef901ad0d08b80 +size 226 diff --git a/datasets/allenai_c4_en/suffix/avg_line_length/metric.json b/datasets/allenai_c4_en/suffix/avg_line_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..682c78fdedcc15bc78727a2d79218572c91d6850 --- /dev/null +++ b/datasets/allenai_c4_en/suffix/avg_line_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb0a5ffd822589526cc00eec79aa698003da78da113697dd3fa388777f2ab826 +size 498874 diff --git a/datasets/allenai_c4_en/suffix/avg_sentence_length/metric.json b/datasets/allenai_c4_en/suffix/avg_sentence_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..2bd2f2d20248eb1f10507216386c20f5fa08c4c8 --- /dev/null +++ b/datasets/allenai_c4_en/suffix/avg_sentence_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:248522cf9598cbb86729f04e309787faaa4a2851e5acaa91d07096d664b48181 +size 517553 diff --git a/datasets/allenai_c4_en/suffix/avg_word_length/metric.json b/datasets/allenai_c4_en/suffix/avg_word_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..3ef2dfcee144a057b56a873d64c9fa0f4124f4af --- /dev/null +++ b/datasets/allenai_c4_en/suffix/avg_word_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12b7f0a7dafd7462676fba53430ff178c39bb9badeb6379ccceb25cbcbfc748d +size 547186 diff --git a/datasets/allenai_c4_en/suffix/avg_words_per_line/metric.json b/datasets/allenai_c4_en/suffix/avg_words_per_line/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..fe9ecb1fbf634478e4442504becf15b0dd7fec8d --- /dev/null +++ b/datasets/allenai_c4_en/suffix/avg_words_per_line/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d638718d7df26ada248cf2053948a9e9bdf26e5de151324017526180b5fc6962 +size 494975 diff --git a/datasets/allenai_c4_en/suffix/capitalized_ratio/metric.json b/datasets/allenai_c4_en/suffix/capitalized_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..2b0df6cd5e70d322cd3a1f01df45db718084aabf --- /dev/null +++ b/datasets/allenai_c4_en/suffix/capitalized_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e380ed9f9cd6ae7972b10d3b318f89db7902be5fb2aa1902c0db259bbef9dd8 +size 555954 diff --git a/datasets/allenai_c4_en/suffix/ccnet_perplexity_wikipedia_en/metric.json b/datasets/allenai_c4_en/suffix/ccnet_perplexity_wikipedia_en/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..a826239443c6af22cbb4bc5c3dc78b60d9bcba9c --- /dev/null +++ b/datasets/allenai_c4_en/suffix/ccnet_perplexity_wikipedia_en/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5d3bd560f5c02c4ffa2cdf056b8c0fbbd406da7a1ecaf7354ce77eb80724041 +size 472389 diff --git a/datasets/allenai_c4_en/suffix/digit_ratio/metric.json b/datasets/allenai_c4_en/suffix/digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..c76425a21fff0b8d858d44610ff0c1166965235d --- /dev/null +++ b/datasets/allenai_c4_en/suffix/digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:331093218c1a54273cf4b58dd9e4f77287a756ef763531cd8c5dc74fe249b123 +size 533631 diff --git a/datasets/allenai_c4_en/suffix/elipsis_ratio/metric.json b/datasets/allenai_c4_en/suffix/elipsis_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..38f6f3a3bf11c089ac31cbb155c8b9d29d49cc86 --- /dev/null +++ b/datasets/allenai_c4_en/suffix/elipsis_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4baa369e061950b9a181220cc5245fe346cb07b33ea9d7f5cc083bfe46adb525 +size 396558 diff --git a/datasets/allenai_c4_en/suffix/fasttext_en/metric.json b/datasets/allenai_c4_en/suffix/fasttext_en/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..d7a34c8038634f5ca90042f31ee338d4a095d967 --- /dev/null +++ b/datasets/allenai_c4_en/suffix/fasttext_en/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9839d4eb1edcb2ae50d26ee42c084a6a14e6896e9bb832672a9bf0da3fdd2626 +size 545228 diff --git a/datasets/allenai_c4_en/suffix/length/metric.json b/datasets/allenai_c4_en/suffix/length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..1e09e796575482c4e0994a06df950de358c30d3a --- /dev/null +++ b/datasets/allenai_c4_en/suffix/length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49649fc9ea6071d85e5b4ac86fd2e4c88734f83e95d22fa0641d1644c1188acc +size 446806 diff --git a/datasets/allenai_c4_en/suffix/line_char_duplicates/metric.json b/datasets/allenai_c4_en/suffix/line_char_duplicates/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..23d7467bd16caa438b4dd5f599a781c847083b0a --- /dev/null +++ b/datasets/allenai_c4_en/suffix/line_char_duplicates/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b1a8a9ec6fe9000b4d14c142cd98ab25f6d2286dd58705c3a3ff84f3fa4350b +size 40836 diff --git a/datasets/allenai_c4_en/suffix/line_duplicates/metric.json b/datasets/allenai_c4_en/suffix/line_duplicates/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..23d7467bd16caa438b4dd5f599a781c847083b0a --- /dev/null +++ b/datasets/allenai_c4_en/suffix/line_duplicates/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b1a8a9ec6fe9000b4d14c142cd98ab25f6d2286dd58705c3a3ff84f3fa4350b +size 40836 diff --git a/datasets/allenai_c4_en/suffix/lines_ending_with_terminal_mark_ratio/metric.json b/datasets/allenai_c4_en/suffix/lines_ending_with_terminal_mark_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..453d52a00e9e0c02119424a112316f40818b6f5d --- /dev/null +++ b/datasets/allenai_c4_en/suffix/lines_ending_with_terminal_mark_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e3aedb66693560304797581754621203e248e35a38f9f1a22f408bb5457cd1d +size 127388 diff --git a/datasets/allenai_c4_en/suffix/long_line_ratio_chars_10000/metric.json b/datasets/allenai_c4_en/suffix/long_line_ratio_chars_10000/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..d4c208bdde1132e871c637f999ce04ef61f88c59 --- /dev/null +++ b/datasets/allenai_c4_en/suffix/long_line_ratio_chars_10000/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21fa8edc7d559f406879a1c66b7f48e1efcf782f230429a4fe6df7a5fcdfffd7 +size 93151 diff --git a/datasets/allenai_c4_en/suffix/long_line_ratio_chars_2000/metric.json b/datasets/allenai_c4_en/suffix/long_line_ratio_chars_2000/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..a9d72a009f40d1314b034e9942b26e9bb0a36ffe --- /dev/null +++ b/datasets/allenai_c4_en/suffix/long_line_ratio_chars_2000/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ece1db23dce05b1d7984998d8e88a9be5d7bc38dfd3039c701f4950c8ee2e038 +size 256922 diff --git a/datasets/allenai_c4_en/suffix/long_sentence_ratio_75/metric.json b/datasets/allenai_c4_en/suffix/long_sentence_ratio_75/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..44c70179ba0c457fd8af08be6e6dd1eeea296f5f --- /dev/null +++ b/datasets/allenai_c4_en/suffix/long_sentence_ratio_75/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd2c4a36154d033dc2243436e6e7d1f889d2cbef725355bbad0e0b0a538453e8 +size 488469 diff --git a/datasets/allenai_c4_en/suffix/long_word_ratio_5/metric.json b/datasets/allenai_c4_en/suffix/long_word_ratio_5/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..d76dd134a147521712ce2c9cd66719f088a212b4 --- /dev/null +++ b/datasets/allenai_c4_en/suffix/long_word_ratio_5/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b11e9b33dd8046f9742d03295d2fa65c59b64b6ca7575c6334597e71cddfe7a4 +size 568795 diff --git a/datasets/allenai_c4_en/suffix/long_word_ratio_7/metric.json b/datasets/allenai_c4_en/suffix/long_word_ratio_7/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..c78ad06b46e59878abaa2a59c938c7475bf901dd --- /dev/null +++ b/datasets/allenai_c4_en/suffix/long_word_ratio_7/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42a76629d46bcc2260f91b2fc4b652429a210348fa2b0079c56397a6e74bbc08 +size 556495 diff --git a/datasets/allenai_c4_en/suffix/n_lines/metric.json b/datasets/allenai_c4_en/suffix/n_lines/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..66e965d734c04b030fe4906c00cdfb77533ae9e5 --- /dev/null +++ b/datasets/allenai_c4_en/suffix/n_lines/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ec25619d85e221d12682443b46390c3429faed13ea9330f501d68ead7569573 +size 422404 diff --git a/datasets/allenai_c4_en/suffix/n_sentences/metric.json b/datasets/allenai_c4_en/suffix/n_sentences/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..514370f0e0982e0039be91241b74a398d6ba8811 --- /dev/null +++ b/datasets/allenai_c4_en/suffix/n_sentences/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1961eb63ea9716a8253760c293bb8bd9b77148b8cef5bd92e6138b23be6fbf2a +size 429036 diff --git a/datasets/allenai_c4_en/suffix/n_words/metric.json b/datasets/allenai_c4_en/suffix/n_words/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..ed736e47deaa24b10d9786d9d07d5e68d7538720 --- /dev/null +++ b/datasets/allenai_c4_en/suffix/n_words/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d39dbe133dfc113fc66da0f3491880206c00a4d2df6aa83862119c9db40e6eb0 +size 439806 diff --git a/datasets/allenai_c4_en/suffix/non_alpha_digit_ratio/metric.json b/datasets/allenai_c4_en/suffix/non_alpha_digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..4381f5caae1f18c7776e4a42fa36b8eba3a67748 --- /dev/null +++ b/datasets/allenai_c4_en/suffix/non_alpha_digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:023bc0bb769196dc43aaddafb8e2affbca56376c5529347b48e8ead577224059 +size 576606 diff --git a/datasets/allenai_c4_en/suffix/punctuation_ratio/metric.json b/datasets/allenai_c4_en/suffix/punctuation_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..e967a22fe697f9b6c224eb60ef46ebbd7894e26d --- /dev/null +++ b/datasets/allenai_c4_en/suffix/punctuation_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b8449f14a58ea87e3fd8c3733783e84a94a51f141b7cb360a3463bb07a541c4 +size 584051 diff --git a/datasets/allenai_c4_en/suffix/short_line_ratio_chars_10/metric.json b/datasets/allenai_c4_en/suffix/short_line_ratio_chars_10/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..8d53e8d90dd0a23dd15117b5c76826f95ad0f0f8 --- /dev/null +++ b/datasets/allenai_c4_en/suffix/short_line_ratio_chars_10/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9d2793e5b4b35bdd8d92fc79806c39def7e69e58cd49961d866b950f9de6b61 +size 57097 diff --git a/datasets/allenai_c4_en/suffix/short_line_ratio_chars_30/metric.json b/datasets/allenai_c4_en/suffix/short_line_ratio_chars_30/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..1beda05590902ddd2a302c129f49137e03defb80 --- /dev/null +++ b/datasets/allenai_c4_en/suffix/short_line_ratio_chars_30/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9d04c6ddf4b5747f58c38c7c466d9ba26a569cc50c16063633a3c45e3d47a5b +size 399187 diff --git a/datasets/allenai_c4_en/suffix/short_sentence_ratio_20/metric.json b/datasets/allenai_c4_en/suffix/short_sentence_ratio_20/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..e166ce383275a6208787aa1edcde6e0535ce0839 --- /dev/null +++ b/datasets/allenai_c4_en/suffix/short_sentence_ratio_20/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8be8b41ce01afc56e3fb2d98d0a13d94e81f98a52c6ba9982a7ea1fcd091908f +size 462679 diff --git a/datasets/allenai_c4_en/suffix/short_word_ratio_3/metric.json b/datasets/allenai_c4_en/suffix/short_word_ratio_3/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..3d7a1291b217479d2faf6ae2b606ecd4b490c163 --- /dev/null +++ b/datasets/allenai_c4_en/suffix/short_word_ratio_3/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f948e24a3817013e27fde0bca5ee822062dd5154452630012c2d4f1b25b15a6 +size 563232 diff --git a/datasets/allenai_c4_en/suffix/stop_word_ratio/metric.json b/datasets/allenai_c4_en/suffix/stop_word_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..3971e2049d01662aa043d6f22e0fe521a0ec8843 --- /dev/null +++ b/datasets/allenai_c4_en/suffix/stop_word_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa6d8db5228132b99e0f78d018466a491e48c701868a3a68df735ceee9a6f37a +size 538874 diff --git a/datasets/allenai_c4_en/suffix/type_token_ratio/metric.json b/datasets/allenai_c4_en/suffix/type_token_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..043dc4d9a00b185fa3c2094c8fee591c953d949a --- /dev/null +++ b/datasets/allenai_c4_en/suffix/type_token_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c66e0e9cb84a94414d2db57af23a8f3f26692e9c8d637cb7580b831fe051b07 +size 557838 diff --git a/datasets/allenai_c4_en/suffix/uppercase_ratio/metric.json b/datasets/allenai_c4_en/suffix/uppercase_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..cd4470d389224c4b6c2e0ce3ecc3aff8a2216c2a --- /dev/null +++ b/datasets/allenai_c4_en/suffix/uppercase_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e85b8580572033cabd64773dd91b783dd425f90823746e716b95d282aba76a4 +size 571024 diff --git a/datasets/allenai_c4_en/suffix/white_space_ratio/metric.json b/datasets/allenai_c4_en/suffix/white_space_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..19aee3819d015968932dbe27f4cf25b7d63a38e8 --- /dev/null +++ b/datasets/allenai_c4_en/suffix/white_space_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98284077031a78fc3e5477ddd982c1f42259e8f3a26f6144a245ba795f0d94d5 +size 578000 diff --git a/datasets/allenai_c4_en/summary/avg_line_length/metric.json b/datasets/allenai_c4_en/summary/avg_line_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..26592f7bb8542c822379083d58fb32c6940bc50f --- /dev/null +++ b/datasets/allenai_c4_en/summary/avg_line_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe9f5f5bbe46291624a9701ada2ffc7e4e60fbf3bc3e0419d42575741c5d8d9f +size 189 diff --git a/datasets/allenai_c4_en/summary/avg_sentence_length/metric.json b/datasets/allenai_c4_en/summary/avg_sentence_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..a1839e73165845ee0df08e386be1e52c78d44f08 --- /dev/null +++ b/datasets/allenai_c4_en/summary/avg_sentence_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a32d32afde1c304aac0ca60c43506161498a9a2baa1a2f592b06dc9eb58c565 +size 205 diff --git a/datasets/allenai_c4_en/summary/avg_word_length/metric.json b/datasets/allenai_c4_en/summary/avg_word_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..f3a6b4fdad1a6d7ce542b73bf6ca027e73cf6b8a --- /dev/null +++ b/datasets/allenai_c4_en/summary/avg_word_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33f7f9d1067c47f911a4a108b983496f6c2db3b5996b8576eb13dc2625325bb5 +size 202 diff --git a/datasets/allenai_c4_en/summary/avg_words_per_line/metric.json b/datasets/allenai_c4_en/summary/avg_words_per_line/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..773497afa70ca10fbcb35d65bef951d472f0568d --- /dev/null +++ b/datasets/allenai_c4_en/summary/avg_words_per_line/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:43343d6a9662b61d6126d3cd74459d678e6141d856708adedb8682b83896b3f4 +size 190 diff --git a/datasets/allenai_c4_en/summary/capitalized_ratio/metric.json b/datasets/allenai_c4_en/summary/capitalized_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..f5b7054c459677dff613ce59ab932f800b776a6d --- /dev/null +++ b/datasets/allenai_c4_en/summary/capitalized_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef71716f91d26176c2359394f151fd88d4a9f53be8d5bdb5aea955b89f2a08ee +size 206 diff --git a/datasets/allenai_c4_en/summary/ccnet_perplexity_wikipedia_en/metric.json b/datasets/allenai_c4_en/summary/ccnet_perplexity_wikipedia_en/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..1593e29ffbaa82aea17753e955f5bce1a7414f52 --- /dev/null +++ b/datasets/allenai_c4_en/summary/ccnet_perplexity_wikipedia_en/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11cd0c78ade816db1727c88b098752911be19a7e160fcffba06a9d218473bc4c +size 191 diff --git a/datasets/allenai_c4_en/summary/digit_ratio/metric.json b/datasets/allenai_c4_en/summary/digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..800e5d2cb7c5169b1c76254c1a7dc8e0d46c32a3 --- /dev/null +++ b/datasets/allenai_c4_en/summary/digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65871937d6e3dbfdece9c44bb6782e9a629fda563d4eb07819f6d4d5b5af31df +size 210 diff --git a/datasets/allenai_c4_en/summary/elipsis_ratio/metric.json b/datasets/allenai_c4_en/summary/elipsis_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..6048f1b7be838260595b2b78a4351f6866126b2f --- /dev/null +++ b/datasets/allenai_c4_en/summary/elipsis_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16cab3740ac02175571e78570bee2d644ad6ae3711b2fb28f36b306e164442ef +size 212 diff --git a/datasets/allenai_c4_en/summary/fasttext_en/metric.json b/datasets/allenai_c4_en/summary/fasttext_en/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..d8df7acb6be07213e67bcc39ce5194c0ea203594 --- /dev/null +++ b/datasets/allenai_c4_en/summary/fasttext_en/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:780c9e30031a45049e0eeb34aa44da497891200792a0670ae7702d7596968297 +size 208 diff --git a/datasets/allenai_c4_en/summary/length/metric.json b/datasets/allenai_c4_en/summary/length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..15aabf387bc6dcd5b783acae6d4692b4239d1010 --- /dev/null +++ b/datasets/allenai_c4_en/summary/length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37f408bdd40810a16f94be7a8326e02b9d2e3b452c2fdc07193aa9a3cc6171f9 +size 185 diff --git a/datasets/allenai_c4_en/summary/line_char_duplicates/metric.json b/datasets/allenai_c4_en/summary/line_char_duplicates/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..1ea0e0d75aebca9088e8f2419632dcd45e44abab --- /dev/null +++ b/datasets/allenai_c4_en/summary/line_char_duplicates/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a6f01c247cd19807f8b10d02d1a3d68e814635d04dd59513b07607284c24c5c +size 14 diff --git a/datasets/allenai_c4_en/summary/line_duplicates/metric.json b/datasets/allenai_c4_en/summary/line_duplicates/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..1ea0e0d75aebca9088e8f2419632dcd45e44abab --- /dev/null +++ b/datasets/allenai_c4_en/summary/line_duplicates/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a6f01c247cd19807f8b10d02d1a3d68e814635d04dd59513b07607284c24c5c +size 14 diff --git a/datasets/allenai_c4_en/summary/lines_ending_with_terminal_mark_ratio/metric.json b/datasets/allenai_c4_en/summary/lines_ending_with_terminal_mark_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..8609885b049f0ccc693c78e0e5f27a17d1694bdc --- /dev/null +++ b/datasets/allenai_c4_en/summary/lines_ending_with_terminal_mark_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41a5c94e51b095f28ef4212cb105e199f0ad05cb3e3714ed065a46923fe11a9f +size 51 diff --git a/datasets/allenai_c4_en/summary/long_line_ratio_chars_10000/metric.json b/datasets/allenai_c4_en/summary/long_line_ratio_chars_10000/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..06f3375fde2520d50f91a683e6ed8326daa7724d --- /dev/null +++ b/datasets/allenai_c4_en/summary/long_line_ratio_chars_10000/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06713828378b7783fb3180b5cedfa87a12490d99c5359b1baab7d1174fdfaab2 +size 197 diff --git a/datasets/allenai_c4_en/summary/long_line_ratio_chars_2000/metric.json b/datasets/allenai_c4_en/summary/long_line_ratio_chars_2000/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..556db30bbd7c90460ebabc6c6610f245a031b00c --- /dev/null +++ b/datasets/allenai_c4_en/summary/long_line_ratio_chars_2000/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e82802c7d5dc2b934bafc082d9a9d91b72488c86a611e8b7a1040f9585721255 +size 195 diff --git a/datasets/allenai_c4_en/summary/long_sentence_ratio_75/metric.json b/datasets/allenai_c4_en/summary/long_sentence_ratio_75/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..615c93a32abb33a1e90b29e95e06be4f9b89ca9b --- /dev/null +++ b/datasets/allenai_c4_en/summary/long_sentence_ratio_75/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e15e132c9ee5365d9a083689946a70760a6c26783f764cdb694659cc8a7e2adc +size 190 diff --git a/datasets/allenai_c4_en/summary/long_word_ratio_5/metric.json b/datasets/allenai_c4_en/summary/long_word_ratio_5/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..e6543a6449fda3d8741bd4e26161ebefb79c4fcb --- /dev/null +++ b/datasets/allenai_c4_en/summary/long_word_ratio_5/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0da2001f73ec3a43228f1549d2029642a77ae71c6309e20ad24d8ab8477c02b +size 207 diff --git a/datasets/allenai_c4_en/summary/long_word_ratio_7/metric.json b/datasets/allenai_c4_en/summary/long_word_ratio_7/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..1a21c2aa6a451e37bab08f374fd83ee4511655fe --- /dev/null +++ b/datasets/allenai_c4_en/summary/long_word_ratio_7/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bb91a30a897c1df1cf4b7a1013218e4b15f1374a70da8223983da9e42d68b05 +size 205 diff --git a/datasets/allenai_c4_en/summary/n_lines/metric.json b/datasets/allenai_c4_en/summary/n_lines/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..9af5b243294220621094ebf474921165c001b66c --- /dev/null +++ b/datasets/allenai_c4_en/summary/n_lines/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac18b6d77562467950b66d17f68e3ec0e5187dcef88e027023274688b62a05ef +size 179 diff --git a/datasets/allenai_c4_en/summary/n_sentences/metric.json b/datasets/allenai_c4_en/summary/n_sentences/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..a45a57c438b4c90601867a4c935792c06017b4b1 --- /dev/null +++ b/datasets/allenai_c4_en/summary/n_sentences/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3d0576c46d3022cae11ea3a128c972368caca7d1526cd7bb2746e518ca182ac +size 179 diff --git a/datasets/allenai_c4_en/summary/n_words/metric.json b/datasets/allenai_c4_en/summary/n_words/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..f39d1a968d6b6ec1fed43888fd29101b07e0e9e5 --- /dev/null +++ b/datasets/allenai_c4_en/summary/n_words/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4de7f5473e2a1347fa332f59d768bc1396d94704fb4fe432f976b65dd13eb388 +size 181 diff --git a/datasets/allenai_c4_en/summary/non_alpha_digit_ratio/metric.json b/datasets/allenai_c4_en/summary/non_alpha_digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..e3a426159a59e4f1ec0d49b8b7aadb3128120691 --- /dev/null +++ b/datasets/allenai_c4_en/summary/non_alpha_digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e849f5e5aa5d9fab521ebe9a592d97a886b2ca4226701a2094235518097c10cc +size 224 diff --git a/datasets/allenai_c4_en/summary/punctuation_ratio/metric.json b/datasets/allenai_c4_en/summary/punctuation_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..566e5dd0620f0b0243c9179b762fd057a305518c --- /dev/null +++ b/datasets/allenai_c4_en/summary/punctuation_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11214e1ffe28454ca6204f4839127c3281965003cd8613fd5961abc70dc6997f +size 228 diff --git a/datasets/allenai_c4_en/summary/short_line_ratio_chars_10/metric.json b/datasets/allenai_c4_en/summary/short_line_ratio_chars_10/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..4dbbaf9d5d389b700bb983928c4b8e8d4c36853c --- /dev/null +++ b/datasets/allenai_c4_en/summary/short_line_ratio_chars_10/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9010d74b31a2f6f66fecdaeaba07abc0efe070842394c6406ad5603807150104 +size 199 diff --git a/datasets/allenai_c4_en/summary/short_line_ratio_chars_30/metric.json b/datasets/allenai_c4_en/summary/short_line_ratio_chars_30/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..49d08c35befc7a7c56d4d26fac5951e79eee738d --- /dev/null +++ b/datasets/allenai_c4_en/summary/short_line_ratio_chars_30/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e51f9b3bf39a0cef85d15f2fe60f5a9f492168ab65a90b4ac8491658ba46207b +size 195 diff --git a/datasets/allenai_c4_en/summary/short_sentence_ratio_20/metric.json b/datasets/allenai_c4_en/summary/short_sentence_ratio_20/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..6add22638a88e45c272f46aed3cc398e4a0eeace --- /dev/null +++ b/datasets/allenai_c4_en/summary/short_sentence_ratio_20/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2da854c432f79af1851d0b500a231b52c0b3c6e1d2e5f681d5b28ecb8a5b07de +size 191 diff --git a/datasets/allenai_c4_en/summary/short_word_ratio_3/metric.json b/datasets/allenai_c4_en/summary/short_word_ratio_3/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..a43ef8c1c380ec17fa861268a30c2fc9ddd7b172 --- /dev/null +++ b/datasets/allenai_c4_en/summary/short_word_ratio_3/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e83b469275d0e450c059222754551fa3a41d19374ffb5b559c86160ba899dc71 +size 207 diff --git a/datasets/allenai_c4_en/summary/stop_word_ratio/metric.json b/datasets/allenai_c4_en/summary/stop_word_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..79e7a8cae68ef1e3b4989ed91d00dbb803da63c1 --- /dev/null +++ b/datasets/allenai_c4_en/summary/stop_word_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b06a2d4b5b5c43bbfc988941cac6e80373fe9f4bb19f380a1846eb262b912e8b +size 193 diff --git a/datasets/allenai_c4_en/summary/type_token_ratio/metric.json b/datasets/allenai_c4_en/summary/type_token_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..eac8199e991a266ecaa807125ed21e4328ed858a --- /dev/null +++ b/datasets/allenai_c4_en/summary/type_token_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96d307a4e85d9dd8a11ecc2e701339679e175723aadaca96be55054faea64be4 +size 210 diff --git a/datasets/allenai_c4_en/summary/uppercase_ratio/metric.json b/datasets/allenai_c4_en/summary/uppercase_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..04bf5ec530547725ac12454b3f422a351adc098c --- /dev/null +++ b/datasets/allenai_c4_en/summary/uppercase_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da881151c847e365cb23a2f1fd8cd55be9c22085551f85e2fe38460894051237 +size 204 diff --git a/datasets/allenai_c4_en/summary/white_space_ratio/metric.json b/datasets/allenai_c4_en/summary/white_space_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..c4ad43c68f6ca9e5fbeed7c5336d613e56c3d561 --- /dev/null +++ b/datasets/allenai_c4_en/summary/white_space_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75ccf0093f94be65ac6e14059926d66f0497dae631955077a4ef901ad0d08b80 +size 226 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/fqdn/avg_line_length/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/avg_line_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..8c6beb5a37aa6ab44280cda4411e22008a8d2104 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/avg_line_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fddf633c79d6acb392d8ecc57d5af348b916dc4b87f680f091e2706019256de7 +size 1674535 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/fqdn/avg_sentence_length/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/avg_sentence_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..3e6e527796bd06c78c6f1a743923c232c32b97de --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/avg_sentence_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b93bdfc57b95d696e811b22a4bc7ee5b1baeafaeded9894f322381928e54323f +size 1649610 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/fqdn/avg_word_length/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/avg_word_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..1c067f6ab13f10bc10a457dc8abdf30413f65cdd --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/avg_word_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:028d9b89130f4949fed2bd2afa6d5df4eb3d8548ac9fdc1ecc1210a1983af7d9 +size 1766774 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/fqdn/avg_words_per_line/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/avg_words_per_line/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..a5dbf7ee616d7c9884aeea89fa7558979bb19e7b --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/avg_words_per_line/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66a6c71c68adf681e6f0409ecde9ae06192123f9689fefc3a93d027a0777e16c +size 1669794 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/fqdn/capitalized_ratio/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/capitalized_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..c031c45a82372f72317425f40e8b194798fc5275 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/capitalized_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:577b1047ea3f0f0d7a257ef7e339f145e6e82bd9bc28a80b4a18a4423f62cea4 +size 1803269 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/fqdn/ccnet_perplexity_wikipedia_en/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/ccnet_perplexity_wikipedia_en/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..fc411f7f352b33bfc02abaf7e91813a7197706cb --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/ccnet_perplexity_wikipedia_en/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e338366eab258120adda116b4f5398152c917271bdc3377a1ed4be9ae47a6591 +size 1560725 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/fqdn/digit_ratio/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..39dcb442c12ee27eba30e3843ed492b31f55ab1a --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64fe6f68eb81ac50e27569431d305cef790309feb4e6396ed1bae11d920ea85b +size 1746770 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/fqdn/elipsis_ratio/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/elipsis_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..0f059c025e72e19b55fdb0576e5b83abc2b61f9f --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/elipsis_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe3e23199d79ed1faa4b2b9a10f9d676147de4e1ef1927a9f451cf9205d9d99c +size 1735005 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/fqdn/fasttext_en/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/fasttext_en/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..a305fa3fd1c9c84908b43ed1fdd2964ba870a7c3 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/fasttext_en/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:062b7cd16d2b5f392a1bc5e788b5d18d5c438a6440ef0565c05c76c5a9cff505 +size 1815172 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/fqdn/length/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..ea9da04da67ca7251af38e9acac5021d99be35eb --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d36eb7b0b17f957bb78b9e02c9f162745896383f1703fafbbfe4d2fc0bea57f +size 1463845 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/fqdn/line_char_duplicates/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/line_char_duplicates/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..ef7313ca99c956fd22c0b61c35f85fc6a46e992b --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/line_char_duplicates/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:170d282eb87875ca643dc36c05c9a76ce70db451fb210be6529bcc8f1e05c6f3 +size 1711759 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/fqdn/line_duplicates/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/line_duplicates/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..e0d92120e58a80097d51128e1a2bf3c940302e7b --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/line_duplicates/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0583442f1306a76a6857ec620fd8e359c5c217f6f0f16a63c23c3223a714280 +size 1634065 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/fqdn/lines_ending_with_terminal_mark_ratio/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/lines_ending_with_terminal_mark_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..3a0305284d5021a3d5b1e69d34817804e1337c85 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/lines_ending_with_terminal_mark_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9334d6bbdc7ed46c3ff3b534f2a9fabc06ee36a6d1399823208ff93bf7e22809 +size 1610499 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/fqdn/long_line_ratio_chars_10000/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/long_line_ratio_chars_10000/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..e3322c93d0846693ca0d488e6002b38c28d81166 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/long_line_ratio_chars_10000/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bf85df624624b11026a1c63f12f6246fba686425babbe693cc7b5497069ffad +size 255169 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/fqdn/long_line_ratio_chars_2000/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/long_line_ratio_chars_2000/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..7cd5b62a279419964d198d18af1c7dccd1c6fe07 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/long_line_ratio_chars_2000/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:adb192c0110243b77dd57651304bc4eee9c43d123bb80631027cd87530d819ac +size 842420 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/fqdn/long_sentence_ratio_75/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/long_sentence_ratio_75/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..c303cd8ba18d22b42c385635910a8ed23a009bae --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/long_sentence_ratio_75/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85307134989937c62b01898c5363d377e0f44b9e0a82b9e58f065a4cd7c46e84 +size 1642052 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/fqdn/long_word_ratio_5/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/long_word_ratio_5/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..ee6b38ec90812340c58cb343357449fb8c456f20 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/long_word_ratio_5/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aaba5e9237891af269ba4a47e24a1c007fbbd394242808151004771e99c89f62 +size 257421299 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/fqdn/long_word_ratio_7/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/long_word_ratio_7/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..6f9290b3d506792c06526bd11cb9fc2cfe5bdbb5 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/long_word_ratio_7/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47848cf5fdc432acff28651db053f432418dd878a0be564438bf0da8d4e36618 +size 1819951 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/fqdn/n_lines/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/n_lines/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..8829502e412ff7a91d8c1a0dcf07c8aa3d48a1e3 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/n_lines/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b76c554b4c014d3761a68930cc29273a87e88272053171ef4d5e20e3f8f2fe2 +size 1415067 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/fqdn/n_sentences/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/n_sentences/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..4e5203af40e14dfc9b115a19e5b822ea491a3c48 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/n_sentences/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82b6694a1cdf6f5311ca32cbd4e47548bcc6c6476125a4a3b03433c0cc49c66b +size 1412774 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/fqdn/n_words/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/n_words/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..dcfb0e1e084cf5839e60f140aa72a11c3f3ec332 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/n_words/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0692073dc53436cacf4c829f469d5f786f556e5aa2d89937b19ec65b48e82e5 +size 1444244 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/fqdn/non_alpha_digit_ratio/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/non_alpha_digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..726d2de500ce4cb1293b90996251412191bc450d --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/non_alpha_digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26e7a68e0949b90c887affcaab4d8e044ee6b00aeea3e260bf906df007b48177 +size 1841719 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/fqdn/punctuation_ratio/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/punctuation_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..1130da904e2224f868b46c90cc2c1cc674c3fa1a --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/punctuation_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:069997a964c16a2751bc5621f4b8e1dbde5ad18f54f9ef93e06058a1a5fe2f6a +size 1853773 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/fqdn/short_line_ratio_chars_10/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/short_line_ratio_chars_10/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..ccd00dc489e32e528638d96b27ac99caa14eb7ab --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/short_line_ratio_chars_10/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd68076b1310515c966acae04928bf68d1b73531db58e35497bdd364684a0cc4 +size 1657360 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/fqdn/short_line_ratio_chars_30/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/short_line_ratio_chars_30/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..20c658be0f793dfd200bb2710573bfb778c56d64 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/short_line_ratio_chars_30/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4cd0f2b05c09c3621a922bec44e245b613b39ca111497810ecb58b78050c2a09 +size 1673808 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/fqdn/short_sentence_ratio_20/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/short_sentence_ratio_20/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..e5693d9a7752f8823e61448060e8dbe57c4d6f24 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/short_sentence_ratio_20/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9308f7c1d8b25fad082486241fc145e825d0c6449fdf9e7d5522abb7e6b0c66d +size 1644677 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/fqdn/short_word_ratio_3/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/short_word_ratio_3/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..6e5c6d2aa006d5e08e87d28c5b19896c4d0799c2 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/short_word_ratio_3/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b8be02420983f7b1bfce6be0a66d3f6414c1f8e7a4d52fd29f5808e862a890c +size 1812299 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/fqdn/stop_word_ratio/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/stop_word_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..0b1015ed51ba08a49e1fdfa3f53fa72680e138de --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/stop_word_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb68ae791c62ce3511e04a5afc78500cf0f204843083f24d74696a4a0cbf9aba +size 1836446 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/fqdn/type_token_ratio/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/type_token_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..7d91e8fc9a8d89239621f556fb58a563e907adf2 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/type_token_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:639121cd833159e70bc8a42686ee955161a3b9811330ebcd377505f8ed0f7265 +size 1804574 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/fqdn/uppercase_ratio/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/uppercase_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..e79db3de3e94e299959f7576439dbd3cf0a7c5e9 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/uppercase_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99a7ec042f786858fba7f35959a785d309e68164bb4feb7a63979679e433dced +size 1839824 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/fqdn/white_space_ratio/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/white_space_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..16cf462d49823d4c204f46043427f71160d156c1 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/fqdn/white_space_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8de904a504ecaa4747a80569e601398579b5e98d291d2ca61d6f6637d2cd07ca +size 1842921 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/histogram/avg_line_length/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/histogram/avg_line_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..7c51000b92d149857ad09c188e0eee5b2b2ff5e4 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/histogram/avg_line_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6369f934c34066bbf39d7b7e16ac4f697e7abd90b1e0c615eca01e7966549f6 +size 17198043 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/histogram/avg_sentence_length/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/histogram/avg_sentence_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..ef6aaa029a3a4939ed7402582e192fb072fbb8d9 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/histogram/avg_sentence_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ebd3ddeb41e221d5e43e798503eab6fddac4c0b64563d06df0f26a52a02a9f9 +size 12253696 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/histogram/avg_word_length/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/histogram/avg_word_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..f7e989ec320cfd44735c81d66f1d1218baa44305 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/histogram/avg_word_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:618e1952ed313315f82540c54184c39d2b48ff9c61f00cbaad54641f96d81733 +size 284547 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/histogram/avg_words_per_line/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/histogram/avg_words_per_line/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..25523853d332de8839d517f97168eb03f56f4388 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/histogram/avg_words_per_line/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a4a9671ca85bf752089a23e0f48261fdfbaa8bd61b7f40f2452a56fa61074f3 +size 4868633 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/histogram/capitalized_ratio/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/histogram/capitalized_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..30bb620c1f7669da9e6d222e9a8c5f390c115300 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/histogram/capitalized_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cfe747b41ca264538bba1a2fa88e4244985cddb84ab1b9e4892c2eaad7e7a2f +size 39688 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/histogram/ccnet_perplexity_wikipedia_en/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/histogram/ccnet_perplexity_wikipedia_en/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..f2813c9d6b2af0da8eced91148b9f6e7cb1b6eb5 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/histogram/ccnet_perplexity_wikipedia_en/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49bc58abe7133bcdff1ee5e5586ea7e269d448289e87134cf5097bb7380bab15 +size 3156721 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/histogram/digit_ratio/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/histogram/digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..13fb66563f6d4a15afb3be6e19df45a733be201c --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/histogram/digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f88ccf0e688cf67a523fd5d8bb8046758149d01606f4c6a02df64dce2e03a7d +size 19076 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/histogram/elipsis_ratio/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/histogram/elipsis_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..cb10906324d024943d63f9b268faad1de9da2c05 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/histogram/elipsis_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e28ae0ec4691ce5a703646894391126c7778a8566547cb5266e575cedf58667a +size 2946 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/histogram/fasttext_en/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/histogram/fasttext_en/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..48cc9cd70a393cde493780f07f7c4439ebb46e12 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/histogram/fasttext_en/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e724bb17b3341a9638c35ba83d689230f6a397b0f345036c419764170bcd0e3a +size 15143 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/histogram/length/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/histogram/length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..f10904548dcd5a439f4d6dfb4c09d27099fb06de --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/histogram/length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8359d2135281e5e2629b2b66bf4dbf86ef45c2b97d3f9a5d4f1c168761adf261 +size 3013944 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/histogram/line_char_duplicates/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/histogram/line_char_duplicates/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..cf75a49d1cb8b28d6440a81d8afc6f1d382ffaca --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/histogram/line_char_duplicates/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a2f71a30583553daf9aa9c31e1d290579f1369a2e691d56e6bd6d0aba152d33 +size 9111 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/histogram/line_duplicates/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/histogram/line_duplicates/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..3599ddd0219d782a03633ce72166c4d94219704a --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/histogram/line_duplicates/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8846275705e587b48543c0c6ceabbb0434bb898ab29cef26851f821deca36f6 +size 12937 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/histogram/lines_ending_with_terminal_mark_ratio/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/histogram/lines_ending_with_terminal_mark_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..8c06eab4ecb7d5022e22ee225907ece7ee9a34a5 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/histogram/lines_ending_with_terminal_mark_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b60dddb1f3882678780b929ad671cfcefb42c633b4087b0084b5141930544593 +size 42112 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/histogram/long_line_ratio_chars_10000/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/histogram/long_line_ratio_chars_10000/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..794a6433ebb4bef676a39b9fbedcdd11aaa2eec6 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/histogram/long_line_ratio_chars_10000/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f364aa715c3a5c8d5b395037fc6dd3a57c2b440e511f984b046c2571e89e7715 +size 5517 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/histogram/long_line_ratio_chars_2000/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/histogram/long_line_ratio_chars_2000/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..8159b9caef7aaee85b532d4cb372977df6c45f19 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/histogram/long_line_ratio_chars_2000/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68ae2042b5556dc2062a37985869a8618b4dfce4c6b2c1c3b1b1a65f28db497b +size 17875 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/histogram/long_sentence_ratio_75/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/histogram/long_sentence_ratio_75/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..6fdc7ee8d4f5e070f4acd3fca5c01e7e62f933d4 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/histogram/long_sentence_ratio_75/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5716757972bc322f3199df13ad81b85f2274f757e9807611160117c2a0a8ca0 +size 41774 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/histogram/long_word_ratio_5/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/histogram/long_word_ratio_5/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..8c6e120f85d1c888d2fab32d0d368b6dea6db514 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/histogram/long_word_ratio_5/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:069d3c3123c16c1e687d1789ecff43a2264c13d2380ca3436b83ddd07119e320 +size 4187 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/histogram/long_word_ratio_7/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/histogram/long_word_ratio_7/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..99da53cb3a8ad958add73ee6e9bd3b9d31d31c7f --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/histogram/long_word_ratio_7/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f74013e504a25832f22821c0a2fe2ae3e8947afe81b920ec1d54f9c4ab193d1 +size 31076 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/histogram/n_lines/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/histogram/n_lines/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..98d9afa73e310452c5d42ff79f9f66fce2dbb033 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/histogram/n_lines/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:baa62c0cafcd2e766325839461220dfc5175850e5200eafb8da6c0923f42f8c0 +size 133746 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/histogram/n_sentences/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/histogram/n_sentences/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..55c2703e4271bf28e411b6d4e033526e22c1d54c --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/histogram/n_sentences/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e50847be9a428559f52501813811cb13d437218dd0ee4c3a4ab445653334cedf +size 118513 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/histogram/n_words/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/histogram/n_words/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..3f4b0a259145a3ef5c761559a2fb2b92ed7620ef --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/histogram/n_words/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:754045fad74542291032fc53be267ba8e5f61fcc3bd0304753e23003edc90a89 +size 1002685 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/histogram/non_alpha_digit_ratio/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/histogram/non_alpha_digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..6daa1ead113cc8e4916d5b926bc4b78a6c2ec3d7 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/histogram/non_alpha_digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cb15dd7af664558fd564d0f404c325b71386b29f951d70baea2962581883d8d +size 19287 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/histogram/punctuation_ratio/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/histogram/punctuation_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..1eadf148b5774c7d34ec29d83ea7e8ee477ceeef --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/histogram/punctuation_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2739eb7253431b199d95f107ea835b7f04ae1fe00ba66dbd9185589a6f7aeb2 +size 18854 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/histogram/short_line_ratio_chars_10/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/histogram/short_line_ratio_chars_10/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..3a98eef4484b3f084f8dc6e79900af1f0c92da85 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/histogram/short_line_ratio_chars_10/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7cf98c764e858ee0e3ace65c912e21efd71ee3e1172024f0ae773316f8ad2103 +size 40239 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/histogram/short_line_ratio_chars_30/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/histogram/short_line_ratio_chars_30/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..6ab2e0f32e5cd288f6bf0668d7a5e1e4ab09e7a5 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/histogram/short_line_ratio_chars_30/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a587106481e1d20625b828f5ecd3839e83a0db79663d79301b002e61346adaf +size 42231 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/histogram/short_sentence_ratio_20/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/histogram/short_sentence_ratio_20/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..60f035c152cdf3b99a311328c07b80b55a9bf932 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/histogram/short_sentence_ratio_20/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:812cd0be5071aaeebc586149b240d91e7d8e5f8be31a49a0facdb83ab86effad +size 38635 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/histogram/short_word_ratio_3/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/histogram/short_word_ratio_3/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..93c2eae329f24d144b21f1d4ba5d77927143a9fa --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/histogram/short_word_ratio_3/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:421343de03cd6fc9a680a2bbb24e49593b65212f2cb3cd42d60a9529e6b3f6ac +size 34058 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/histogram/stop_word_ratio/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/histogram/stop_word_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..0727079ec518b1b03660707462ae01b47607dd88 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/histogram/stop_word_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e71df0646933c557beaa872a7de8ca5757dbdab1629600e5980efdd33a5887f +size 15584 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/histogram/type_token_ratio/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/histogram/type_token_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..f8558d611759908c1d10737c8f247417b4ceab9d --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/histogram/type_token_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:852db1500ce77a5b7b74c469919f4aaca02791f35f5e294d0f8583d0e8942498 +size 39830 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/histogram/uppercase_ratio/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/histogram/uppercase_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..e1d7a74c90da986c1e06a45012062cf0514b4502 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/histogram/uppercase_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9bd6b073a37da84d66f18ec01335b3850bb2b33d3bbef6f3d026067a10bb43b5 +size 27749 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/histogram/white_space_ratio/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/histogram/white_space_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..5430b61e392fa1b5a20704e7f3bf387be1cab13c --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/histogram/white_space_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4477804d60c4a6b11f43065ca9885715c765809785d65cca8be0609d7746f958 +size 12088 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/none/avg_line_length/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/none/avg_line_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..4ccbb705b7d46ded4ececa39952ebb147514a591 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/none/avg_line_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:450c5f61b9838efb08ccfa7e217310075c7bf08df8c4713e075586be0dcbecf7 +size 214 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/none/avg_word_length/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/none/avg_word_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..c5994261674bbe9a01c2c801911ee0d2d3b8abb2 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/none/avg_word_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fabde06f81853e33bb18b14562e91c9ed4b13779c9e552c400bd459a2237a5f +size 211 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/none/avg_words_per_line/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/none/avg_words_per_line/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..502bc5d3dbef13031df2a283d8cb759e7d413adc --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/none/avg_words_per_line/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89ea5111d80945958457e3b507f5a93a4dd91f2575d23c0a78f7110d6c48c663 +size 200 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/none/digit_ratio/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/none/digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..09082ffd56ef7501015bb56107d117649ce8bc8f --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/none/digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf7c6c2fbe176cd1a626652c7b865a6fda8f83808fb2d66cc12f744b81f32d57 +size 217 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/none/fasttext_en/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/none/fasttext_en/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..005d87ff86a4c0bc43d52985e53d807ccc600eb4 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/none/fasttext_en/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d32e89bcced69be77d95ba8a8c9cbf054054b298cf865950e72a2e16c75df9fe +size 221 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/none/length/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/none/length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..f692f829313ddac1443cf2e1098e7b28fab6a72f --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/none/length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa103d682f1319373583e7cd29957379f5b5bf2c1265285dbdeb5ae55df99e2d +size 186 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/none/line_char_duplicates/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/none/line_char_duplicates/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..822931b32b61c0a25eec4a39a14e9b1c52915b3a --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/none/line_char_duplicates/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d44b549f2905ababe83d4dc8c965e2ae988c99bd6876869bf900982accb533b +size 194 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/none/line_duplicates/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/none/line_duplicates/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..9e225f6f4c86696704501ca3ab0627551cce3d72 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/none/line_duplicates/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c53752761a1c99a2cf65f5b1b9e553ff37ac96041e2effa1519f24877b6c6c7 +size 192 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/none/lines_ending_with_terminal_mark_ratio/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/none/lines_ending_with_terminal_mark_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..e7edd14b47737167403aeb2a96189b893b8bd10b --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/none/lines_ending_with_terminal_mark_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee606206061325d5c803cca5935dfbfd74de01c01cf99480e3780db6b5e41e42 +size 197 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/none/long_word_ratio_5/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/none/long_word_ratio_5/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..ae2e07164881b43a3cb490d6d465d011ca8a26f6 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/none/long_word_ratio_5/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7e135f8cdafdee33e6cae93f82d35ba2db2fa421bbe4b27b999d2d5638d078f +size 215 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/none/long_word_ratio_7/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/none/long_word_ratio_7/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..ae2e07164881b43a3cb490d6d465d011ca8a26f6 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/none/long_word_ratio_7/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7e135f8cdafdee33e6cae93f82d35ba2db2fa421bbe4b27b999d2d5638d078f +size 215 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/none/n_lines/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/none/n_lines/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..4ba753e221aea9372fffb395d3bab5800ee2bb1f --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/none/n_lines/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fddd7ed1fb2a741b6c06bc079abf76ba4d8ccb48c733f57ee35a0befb5a8d6b8 +size 179 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/none/n_words/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/none/n_words/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..78e3542f99ad69839959cd18c8682352d41a3ba1 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/none/n_words/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ceb8224de5f5c37a137b48caf55d49fc40a5a1d83a87397ad2f2d188f42cffbd +size 183 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/none/non_alpha_digit_ratio/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/none/non_alpha_digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..f6ac13a6c0b13f9ec3b25a043810eca3e7b28d2d --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/none/non_alpha_digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a13dfea96ce6d9032f261752f03c20cd70190016745c98e4268bfaa14835339f +size 231 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/none/short_word_ratio_3/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/none/short_word_ratio_3/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..e09bdee1c50566eb644a3d8a485bec5adc633d7a --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/none/short_word_ratio_3/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39294ca3521b685b21bb88c95d14c656bcd8e38a92d2929d81d892d83a683113 +size 233 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/none/white_space_ratio/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/none/white_space_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..a7ffe4cb19d42e6597e0b2558fef7b05b2a41802 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/none/white_space_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53847f9de52893988813383ce6497e5104aad9f3cd31799917474bc5d4384775 +size 235 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/suffix/avg_line_length/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/suffix/avg_line_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..fe90efc16e153474350865858da0210404738ef3 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/suffix/avg_line_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:918bb05237a6e7fcdc38e7c70dfdf070b9fe0ba99ea0f2cc55dc6883f7b0b152 +size 205705 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/suffix/avg_sentence_length/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/suffix/avg_sentence_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..6852bc78b96f3c5efb6c2f5781f89457fe1df6c7 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/suffix/avg_sentence_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62cd578d4e5947ed6149f930b5166cbfa0d4e76a33da73b8c54ac0aa1ed8ce41 +size 201715 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/suffix/avg_word_length/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/suffix/avg_word_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..92b99f68e83532d1d4440a96e56c5838606d206e --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/suffix/avg_word_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11ec2f81bb993cb9bb550868520014a309bdd3141558c605192214d8bdce63e7 +size 218232 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/suffix/avg_words_per_line/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/suffix/avg_words_per_line/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..63e6a034086c8b27ac67d1011178ec7c5d6199d6 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/suffix/avg_words_per_line/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbb31323f2c30a01a5a8e49a6cac9b36731cc2ef0163ae6437cfc182c24acb9f +size 205175 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/suffix/capitalized_ratio/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/suffix/capitalized_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..2de8fb29e56f57e95afbad582bb523e2a5f1ce92 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/suffix/capitalized_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9bc5128231e88baf731ef008dc74d02433e36e12e777975702e9f93dc53c4743 +size 215193 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/suffix/ccnet_perplexity_wikipedia_en/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/suffix/ccnet_perplexity_wikipedia_en/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..b9e244fa8ac114d397909d2e49a4219442b3335c --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/suffix/ccnet_perplexity_wikipedia_en/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33a44afe2c15a0e4e3af4c9df9f1d27e14816cf49958e5adcc506e29b056bcf1 +size 183136 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/suffix/digit_ratio/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/suffix/digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..5dcc231e1d094f356ca70969b21883f9c634cc2c --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/suffix/digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:858a0f229d7edd0b360ae5ba74f460f740965a574f7c438874faaa6fdaf24061 +size 217118 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/suffix/elipsis_ratio/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/suffix/elipsis_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..19703029d319c863fed3b183a44c66cff2e7ffbd --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/suffix/elipsis_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4222f84166bd7f9c0171190c7d13cf34f953a3ffdd1277aec2580c3f4c4e8d1c +size 170194 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/suffix/fasttext_en/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/suffix/fasttext_en/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..6857a0ae06146ac5407c434c5c74f8e366c7180e --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/suffix/fasttext_en/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2820cf9f439eaf71248e5ba5f23ac87d60425dc870face1e0f88a534df4d18b3 +size 224653 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/suffix/length/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/suffix/length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..1eca1480077f4af41ff21facaccede69d7dc699f --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/suffix/length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c310dafd8dfcffe23204e32aabcbe6aee7abf55c970c78d5896c3ab9b19d0b6 +size 173769 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/suffix/line_char_duplicates/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/suffix/line_char_duplicates/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..e9ce8c0627fcf32cc0e0de7298035943cee21b71 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/suffix/line_char_duplicates/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f20782e6248888fca87578ea684b36ac319daf593c70b5488f88b0a79e397ec8 +size 187300 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/suffix/line_duplicates/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/suffix/line_duplicates/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..1f4be95590e855494cdc7c72d70873c653eef5db --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/suffix/line_duplicates/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5b5d1b88e83564c9b663c0629064af74079901b530b13fbdfe7599db8377204 +size 176861 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/suffix/lines_ending_with_terminal_mark_ratio/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/suffix/lines_ending_with_terminal_mark_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..5961f68f841e6aed40bb71f4fd92efaf31879d18 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/suffix/lines_ending_with_terminal_mark_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78083bf724374c82ab3b9f0cdf6703dd988200f5c541101ded1cf7445d425c17 +size 196638 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/suffix/long_line_ratio_chars_10000/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/suffix/long_line_ratio_chars_10000/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..6a4d1671507ef162f660f60c1ff3c19bb4cec9e5 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/suffix/long_line_ratio_chars_10000/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3de2ea81d882ace182747943c0efc8da528e42dfbe0b2803f138f0fd2d7ad98e +size 26847 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/suffix/long_line_ratio_chars_2000/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/suffix/long_line_ratio_chars_2000/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..f74e665f4efe543db6cf1a5266e827ab649dfce4 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/suffix/long_line_ratio_chars_2000/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:192e2114556f114c3e81ace26ee63b888f333d5449367b70b8398a28564de0f6 +size 66518 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/suffix/long_sentence_ratio_75/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/suffix/long_sentence_ratio_75/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..734a5628bdf8a6725a28a567491ace1f47d2ae6d --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/suffix/long_sentence_ratio_75/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7c563d7543e92171626758a086c3de20d36027e016343e9925a1630f0108814 +size 198394 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/suffix/long_word_ratio_5/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/suffix/long_word_ratio_5/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..45a8da512d9bf6f89c9ddadb48de6c6ab62e5f3b --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/suffix/long_word_ratio_5/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:336ae76092e334ed561323a5dcdb5f186051ab617a69397d795a78324b7ffd5e +size 218209 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/suffix/long_word_ratio_7/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/suffix/long_word_ratio_7/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..9797c2adf6a26f72e045f1a12274f3f58027e1d3 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/suffix/long_word_ratio_7/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75322ea9acc3018197fd1c5fa882fc54aafda45f6e57783858917f0d3c09a02a +size 224831 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/suffix/n_lines/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/suffix/n_lines/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..41088479d39f461a4882015531889803f8e4436d --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/suffix/n_lines/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56c0088445b9ecc8c0a1df57b030dc5f6eefc2d1b004c51a3e518efce3460fa4 +size 164985 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/suffix/n_sentences/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/suffix/n_sentences/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..67d954222a812ee7eeb54a63ab306b304ddd78fc --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/suffix/n_sentences/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61ed95304f8270537322fedd86d7b64a74e05e51ed954e50de5d151b1c21bcc9 +size 163680 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/suffix/n_words/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/suffix/n_words/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..98d1fe6e093f07c8f49558e3c66fc4a418a1a937 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/suffix/n_words/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fa1de620c85b8260ad9832ae78689fe5da4cc3730b4e5ac39cabdcae5cf7ecf +size 170707 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/suffix/non_alpha_digit_ratio/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/suffix/non_alpha_digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..03bc5eab840635de676845052ed6334937cab9e3 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/suffix/non_alpha_digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42ac53a0d8aed0788b1e59d9c544422c49fb5d5081b863cc566f20136763b1b7 +size 228748 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/suffix/punctuation_ratio/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/suffix/punctuation_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..c31a6239244d896a2002ccb026eae975cb4d09eb --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/suffix/punctuation_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe54e46576edb5656d185272db713144969060f7fa49d4fd4046d5c6aea44835 +size 230880 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/suffix/short_line_ratio_chars_10/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/suffix/short_line_ratio_chars_10/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..a1b07514d8d93a746a82cdb78fcfd6274d13f4dc --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/suffix/short_line_ratio_chars_10/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36ec364797be7dd88c277d003fa3cfd9748f08382d86d0de1f779a38def09092 +size 193475 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/suffix/short_line_ratio_chars_30/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/suffix/short_line_ratio_chars_30/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..5661423c21660f00d19b2b150a061c9d518967da --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/suffix/short_line_ratio_chars_30/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31ed0fc05ba2e628892aa0a157d0ab8d42c35c69a39595fbe42f3ab9eb592cf9 +size 202215 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/suffix/short_sentence_ratio_20/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/suffix/short_sentence_ratio_20/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..e6bdf686ce8b9597a6b660023be01bd34d3dc8d6 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/suffix/short_sentence_ratio_20/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68678e99e0e3115b9bc6ddc5dc234ff993ec5beeb0b6142bddd2acc04dc4f991 +size 174456 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/suffix/short_word_ratio_3/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/suffix/short_word_ratio_3/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..9217bc042aba98a2ae22476ec98ea56f7bba454f --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/suffix/short_word_ratio_3/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f81b99e34ac14bfd0ce6ac349f319492a1a4ffaa6954403cc907d5c4f83e543 +size 224938 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/suffix/stop_word_ratio/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/suffix/stop_word_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..f9152a392ccde1eb1fe5b8b37737bebe30036bbf --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/suffix/stop_word_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb26833c47016ede6b094667810db2fbae3ed1ce7db46a54a50ffa25a7aaf16d +size 227376 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/suffix/type_token_ratio/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/suffix/type_token_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..56f8759893c8b25ff027cbc1a129869029eaff4e --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/suffix/type_token_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa9337fb57fc1ef0a8673a0d3435f9657b77da7cb838dca6d3d729d94d76a523 +size 222848 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/suffix/uppercase_ratio/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/suffix/uppercase_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..db49854b3e68ec83b9f0f3222c5eb218f942c908 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/suffix/uppercase_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9723eeedebe8788c98c35bf90d6d04d61bef6b2030687330460fa561028522cf +size 228593 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/suffix/white_space_ratio/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/suffix/white_space_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..6bbf15163eee8d759214f33437d53868e35b7235 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/suffix/white_space_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a5e0818451a078902c9a77c8299460b0d81f5420edd2f89859314f1d08e83d7 +size 229146 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/summary/avg_line_length/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/summary/avg_line_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..5ca1f71ee50963fa9eedd3b5b72315df0e75869c --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/summary/avg_line_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ffe53f67e1eb9761e7421b1c25efede0eff73fa022f1f6051f71371c29b6316 +size 191 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/summary/avg_sentence_length/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/summary/avg_sentence_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..306fe6c9d2bb66ee813d1eee42c24b7da6d569f0 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/summary/avg_sentence_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb529322c2932d48bc438366d3e420738fc9e9d28c0a057669a697c705520236 +size 204 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/summary/avg_word_length/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/summary/avg_word_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..025c76c3510597a2da18c2c9e999b5226cdce338 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/summary/avg_word_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8e2f80cfd53edbb2f7b6685de7fc90808f61d5dcd62a46af680fb683150df66 +size 216 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/summary/avg_words_per_line/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/summary/avg_words_per_line/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..4f8f9a508af2d67125c60574d791397c7963a949 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/summary/avg_words_per_line/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e0534c826b412c5ce297e84b8ca3e24431e29212722dec3ca6421b1bddf990c +size 192 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/summary/capitalized_ratio/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/summary/capitalized_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..aceff1e4c2a95bf8623e1dd7fbb6448bc444e075 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/summary/capitalized_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d932b1c02d78431e76df6ba302288ec2a562ace40e677bbe6f05a3aef99e9fcb +size 205 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/summary/ccnet_perplexity_wikipedia_en/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/summary/ccnet_perplexity_wikipedia_en/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..0ffcdd9478a3dbbcd348555b22a83cacb26ead1d --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/summary/ccnet_perplexity_wikipedia_en/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:720c5854066e718e9149019064929494f0dc4735fc605fd4daddb222913ff8d3 +size 189 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/summary/digit_ratio/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/summary/digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..521d3c0dacd9cea870e011307e119a44bf2c1a3a --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/summary/digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4182b6caf9043aa3cbfd941d3710f24e5bae20d2a1752f89d49fdaad7386b8e1 +size 208 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/summary/elipsis_ratio/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/summary/elipsis_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..70abc87b704e0357221a354895be63930fc9c55d --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/summary/elipsis_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:994c657745f6d51b914f87e5c43a9ebc5dd96ee05daa743b0d0b36aa12dd81cb +size 212 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/summary/fasttext_en/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/summary/fasttext_en/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..78726e6280a24819c5f7e49f4e3fa9143b68d9c9 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/summary/fasttext_en/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2cc8edaf5fa79e19e155db8c4cd051191b01fcbac5680b2a9fb29e91da04f964 +size 219 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/summary/length/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/summary/length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..4d5b0c684163a09ac496b85f3a138cbc64924cef --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/summary/length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d4ca475451799d796d293718c724b5df0689240fde813c45a7b9a2df9f1ee8b +size 183 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/summary/line_char_duplicates/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/summary/line_char_duplicates/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..cf46b2fe27af85cc58ce44ac6d50b910d542ac5d --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/summary/line_char_duplicates/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7d1e013e7961eecd903ce7f332b63e2e2426ea94f6bf38462fe0a1fa89bcac8 +size 210 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/summary/line_duplicates/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/summary/line_duplicates/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..c862825485ba7d59d4c7091d6397663519e77b7b --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/summary/line_duplicates/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87eca0d483dbb962fa9bf597b9efdac5b7fcc7d50cf5310141fe7f3e2018109b +size 207 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/summary/lines_ending_with_terminal_mark_ratio/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/summary/lines_ending_with_terminal_mark_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..f0c19f18df6fe74100ee91302cfb4f3b4cebb183 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/summary/lines_ending_with_terminal_mark_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb2d55b47cf05a3db9acdc3a5385f2dc9c608e5339665842db88dfc5f44cd583 +size 185 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/summary/long_line_ratio_chars_10000/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/summary/long_line_ratio_chars_10000/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..f84298db1d1471166931289199474220161dac8b --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/summary/long_line_ratio_chars_10000/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:275fd522bacdc6fbbc19b09cabc4fb5fd61490843ff41faf7fb51dd1506ebed8 +size 197 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/summary/long_line_ratio_chars_2000/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/summary/long_line_ratio_chars_2000/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..e599aa0af68f956d8cdb7252237b72a8800720a8 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/summary/long_line_ratio_chars_2000/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0451cbeb6039e82b4bcf38d94b5ea479e56945945de1f4b930c96ee3d047b1c8 +size 194 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/summary/long_sentence_ratio_75/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/summary/long_sentence_ratio_75/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..d15b60520f812616363d0126b0c03bd8e5011916 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/summary/long_sentence_ratio_75/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76912b07208cacc58690aa812c567531891b44db1abcacc43ce78126b06c38c0 +size 190 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/summary/long_word_ratio_5/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/summary/long_word_ratio_5/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..ae2e07164881b43a3cb490d6d465d011ca8a26f6 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/summary/long_word_ratio_5/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7e135f8cdafdee33e6cae93f82d35ba2db2fa421bbe4b27b999d2d5638d078f +size 215 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/summary/long_word_ratio_7/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/summary/long_word_ratio_7/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..b1fac044493838a8b433babc3c3f0ec29e853193 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/summary/long_word_ratio_7/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:704d7b419be733b6e2baa7ae6e7bb4f8e2918b39f6699095a443aab6ea671068 +size 207 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/summary/n_lines/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/summary/n_lines/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..7190044280ffe368d322558e6d136b7eed9d9543 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/summary/n_lines/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de1cbea5344deb2a3e52793944dccc17ab28172697001e62f3869a9b36b195ff +size 179 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/summary/n_sentences/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/summary/n_sentences/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..bfca78ea8444959b0dfb0dfe008e9af895e6d933 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/summary/n_sentences/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0384d5468cc7782e4170c9224a753c0df6fb028c822e6b2ba9422e7ecd02aa9 +size 176 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/summary/n_words/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/summary/n_words/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..c5b0dbc0d866adef786fb37949cef787569ed969 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/summary/n_words/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd0d927236ebdff6bc3702ff5be3a221497e5e8cdae53b5422ad1d1afd72b3c2 +size 181 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/summary/non_alpha_digit_ratio/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/summary/non_alpha_digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..aedd62902b6332f663713ae35f66b4023af2fb01 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/summary/non_alpha_digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afcfff81e63091669456c92952ac006d972e032384b349f81eabb88bf56e54e7 +size 221 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/summary/punctuation_ratio/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/summary/punctuation_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..09dbe6723891486bee78fbc397b137ce26602e9f --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/summary/punctuation_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bb65b3eeb7460b010bd1482d2a5d0f5f22d8547355d2e054b52cb5bba4437b2 +size 208 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/summary/short_line_ratio_chars_10/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/summary/short_line_ratio_chars_10/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..cc722a8383dfdf1f1756d4c8e72df9835bd14a02 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/summary/short_line_ratio_chars_10/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02c0778ed35ee6bf34b4d70f06bd63851be09629c4c99e40d1888c7c97e79e16 +size 205 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/summary/short_line_ratio_chars_30/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/summary/short_line_ratio_chars_30/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..79733d1d41bdcc52f2ba2e3ee5683972238e6dda --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/summary/short_line_ratio_chars_30/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59ec579fd3c54afc9becbd7d135f8187e2d57ce6984cde1e6c0b2fbb8fcdc47b +size 188 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/summary/short_sentence_ratio_20/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/summary/short_sentence_ratio_20/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..fea2f6d1bfcf8a9ae75313064ba2800481e630db --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/summary/short_sentence_ratio_20/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d65ed875d7ece337b78618248f2a9ecd5e3f256a7f49b0c8750c03d230aa37b +size 189 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/summary/short_word_ratio_3/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/summary/short_word_ratio_3/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..3302de66741948e9a4f546a15267d8f4bead3e43 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/summary/short_word_ratio_3/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86b035aacb929ce74d7fe0c3a2185f777db2d6e5d100eedc4442b628549f14df +size 224 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/summary/stop_word_ratio/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/summary/stop_word_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..9ab9d307c58c6b439a6d25dcc64e08abba588958 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/summary/stop_word_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f346e09ddc5c83e4afa14fe7b0671706a5743c56366a5a0a7d6aa1f830270107 +size 226 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/summary/type_token_ratio/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/summary/type_token_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..dc10bad2e98be6aefba608be280ccfb273a1e83b --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/summary/type_token_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb9162d9cc270aea38808816bffded638a012d0798b36771b284823a960731ef +size 208 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/summary/uppercase_ratio/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/summary/uppercase_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..a4efb193ead37b152b078b1fb934b9686e81c99f --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/summary/uppercase_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3896ca834bc3e2b27916a8f3ffb89a94a3d742ea8a0081ef656d6e11d601eb3a +size 210 diff --git a/datasets/dedup_global_CC-MAIN-2013-48/summary/white_space_ratio/metric.json b/datasets/dedup_global_CC-MAIN-2013-48/summary/white_space_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..29f4a53eb61d8ef3cb125bd7ab78a7b259257f11 --- /dev/null +++ b/datasets/dedup_global_CC-MAIN-2013-48/summary/white_space_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:656afcb743e14a3b85da40fdebf9c0f419b2ecdf67a07b22feaf52b20c432b6b +size 225 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/avg_line_length/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/avg_line_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..d23a13b680aff47d1c639e1eb2759a8890452d8f --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/avg_line_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5937970d969da1cf0010ad6dbe4f8a60372f33a4f9e19946b2a12dd92e2e3ce +size 1677124 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/avg_sentence_length/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/avg_sentence_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..f0189584a04bbc26daad61f907f6c1d4d244df51 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/avg_sentence_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89f4d52a4daf6b0ce48357bfac77e0f2292c71eb46e30ae735f5ebfac49dd314 +size 1660546 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/avg_word_length/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/avg_word_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..982e9c7d1cf1bd9d37f4f1ff27e01bf5003f3183 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/avg_word_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46528a1c86120b57ff764779e3aa1e070c615b6e0fc44a6c499b9c3b0b4ae576 +size 1772489 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/avg_words_per_line/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/avg_words_per_line/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..b41aa83f0a2c03c5e1d4e554e232af45c331397d --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/avg_words_per_line/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf54617cd035fb8c4ae73c25556d40a17acaed5d6bc9134f3082beeb30900ecb +size 1672166 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/capitalized_ratio/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/capitalized_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..80d1c42f03eb8262e97191b121b95e4660a562c9 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/capitalized_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6f7b2d334fd0b9f42cba22b4f7d9c53c5ccf24f1c232611f9a0e853960cfb35 +size 1803691 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/ccnet_perplexity_wikipedia_en/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/ccnet_perplexity_wikipedia_en/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..eb729a7d962e00ad1a9454118f254ddd6c36e2b5 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/ccnet_perplexity_wikipedia_en/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3eed1dc77693a0d5099882d29b421a57eabfac96f2243e1a56dd94e7455426f9 +size 1567338 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/digit_ratio/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..ecf41c59e01004c431f0021337b86b523ff1a867 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68c730c68d19a28fb4b29ffcf7cd0dec745e40d2f707e97133d18452a43e410f +size 1737674 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/elipsis_ratio/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/elipsis_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..b41da395f537806e55401f05a0cfcbd8a8082fcb --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/elipsis_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ef358ab35deb1dc0bb3921e3af44e34979464582e341d91d295dd8650f1b4a3 +size 1757293 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/fasttext_en/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/fasttext_en/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..aa258e53abe347e9836b6fc52049ab7b33c86a65 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/fasttext_en/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eebe23ff2745f07493010486f0cfcef44ab89e0da72df8b517075b6bedf63221 +size 1821555 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/length/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..a032ae27bdb780d1f02b1b51aaede325d4ee7b11 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:781618b532a397e49899f13ec139257e86e50b515d69755918bde332067f45f4 +size 1479894 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/line_char_duplicates/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/line_char_duplicates/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..e3070182c49ef0c5e4a408183ec50032d25157d3 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/line_char_duplicates/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c04adef4617ee5666ea739fbf5847b4fb756ba6aefee05210fb328c24ce7578f +size 1736830 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/line_duplicates/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/line_duplicates/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..2191cd98b3106523f06027a38c1310812ae2cda4 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/line_duplicates/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10f0e371be35cd8b6f6981ea73939cf0d7a2049848d172b9363320ecd9e791b4 +size 1633227 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/lines_ending_with_terminal_mark_ratio/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/lines_ending_with_terminal_mark_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..8e1c60859cd07c798dcfde13abff75fd31bf8351 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/lines_ending_with_terminal_mark_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7971a61b10be1bfb0d11072312d819c0626654fc106bb1d1b573348c669e0a7 +size 1582099 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/long_line_ratio_chars_10000/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/long_line_ratio_chars_10000/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..5d21f92408ef457af7ef24e7e8342f4ac0602650 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/long_line_ratio_chars_10000/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4683fed3a5973b201f70685223e321729ef0f7e6667d4bd79756c9f0d14886cd +size 438037 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/long_line_ratio_chars_2000/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/long_line_ratio_chars_2000/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..499b137c14d40f7ec47339a3843b7831e1d85d84 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/long_line_ratio_chars_2000/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29657c91ad864ad911d8b2ee506c0e8c918f15f6528b0136e9c22ccf2de755f6 +size 1454653 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/long_sentence_ratio_75/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/long_sentence_ratio_75/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..e56cd00021e824dfb814a405967cd9625909bc66 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/long_sentence_ratio_75/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c75108831cb65c85ebe68ee795a354a19acf2b059c6dc5c8ee720502dd9381ff +size 1617041 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/long_word_ratio_7/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/long_word_ratio_7/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..0aaf7006d81d4d66f161901a827a7a027f8dcaf7 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/long_word_ratio_7/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e466b27034509aa713b8b66848e54911b452f2279221ce43140a20d4297e47b6 +size 1819984 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/n_lines/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/n_lines/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..247c50247a4872d4f4310d7207080ded8ad68d8f --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/n_lines/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9161440e24fbb88930f7fa71b213d69f72811419d40e7af0c20018d93694f76 +size 1431112 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/n_sentences/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/n_sentences/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..60086487ddc477fe4afbbac9ed9c8e430413d056 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/n_sentences/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:873495eeb0afc93d9a2168ac8015f3b06f2145d54f422b45eb3d759e0dfeb1e2 +size 1431226 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/n_words/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/n_words/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..807829dad2c9c73730f022b633093b3e94ed0991 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/n_words/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7710be78fba89a0b1c8a61b3eb9eae5c412057c2a99a52d81c67e0fa66baf49b +size 1458931 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/non_alpha_digit_ratio/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/non_alpha_digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..f3d306de369dad615efe9db77e715c79b1a87b42 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/non_alpha_digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94dbd4384635742c52e7f9edd5ddb7d187a81d6bd57176b8dbdbfa4f30a04bcc +size 1846962 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/punctuation_ratio/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/punctuation_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..65520ac31de713ec89df3beba379db8d880b5d04 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/punctuation_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd913dfe18af01ba8ddc068954feff7ead27108eb0612fa3775014ab12c37c43 +size 1851364 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/short_line_ratio_chars_10/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/short_line_ratio_chars_10/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..9338a19a4ac501707a0cfef682b785e5b490f9db --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/short_line_ratio_chars_10/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9690bd63a6b6174471eac5d71fa040247fa9060bffd55fb62e7e6c9e2fb47d25 +size 1664306 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/short_line_ratio_chars_30/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/short_line_ratio_chars_30/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..db43ee4d09a62c37f92c05b6fbb11d6406e24451 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/short_line_ratio_chars_30/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:493866e32c24ca682abfe1e8316549d616911f5dee56b4884fdeeb6ad32a9341 +size 1673767 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/short_sentence_ratio_20/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/short_sentence_ratio_20/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..c54d5787f6c9e8f43c64d41fe9e4999cff4ee43b --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/short_sentence_ratio_20/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9af96877db5e61e32797c6eccb0764b4f578e56f4a85fc33ec3af77b25fb296f +size 1666087 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/short_word_ratio_3/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/short_word_ratio_3/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..10f10a405ad527d7d825bda3e11e9e62b1fa0da5 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/short_word_ratio_3/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fdbb35f3d1c7a3ff21d4ba6af5f8e46aa6ef70bba42e6c764c2e861f2b2a0742 +size 1816683 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/stop_word_ratio/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/stop_word_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..771fb993e84718232304694a1c4679c924d3812b --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/stop_word_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9acbcbb3a6a8449d3aadb39d412ca674e2133439e96dcc266722961d67e19ff0 +size 1839923 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/type_token_ratio/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/type_token_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..0b1645b50551c0f59c891af9f0da532733024965 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/type_token_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3820886e4a87fec4a621667c449e6f4e97f61149917fe84f5b26485daeb3e8b5 +size 1812097 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/uppercase_ratio/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/uppercase_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..792192e757c0b56ae4ef0e11984fb2ce155f446b --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/uppercase_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca426978e0387bc873aeb76dc45e1b70a922c01dd084edb7bab4bdeab76389a8 +size 1832997 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/white_space_ratio/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/white_space_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..f97b20cfd8862c8d2b25ce0c80d6d16cdae4b127 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/fqdn/white_space_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73262b6a61ada0164a5504b02aeeb6ad51585e20be1985190bb475974156be11 +size 1847947 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/histogram/avg_line_length/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/avg_line_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..a8e1fe31fafb6537dd473a7a1297c2002e9d7e3f --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/avg_line_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0da077224910704bda2b73b084ad0cd6697362beb6e17a42a5a53eeb2f148844 +size 30473875 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/histogram/avg_sentence_length/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/avg_sentence_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..8163925e404f226b786cc147c7098e90faa1fd42 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/avg_sentence_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab9fc0ea01704190dc41e3be95f48f9529c7b0ce968175db94e4906dbd83089f +size 17312620 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/histogram/avg_word_length/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/avg_word_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..77afaffd52325d758ca6404a285815b2f4ad9530 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/avg_word_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a932bb047f04ac3ea7d7ad50fd326899804486b9d2cd88082fbafd281d8f0aa +size 305819 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/histogram/avg_words_per_line/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/avg_words_per_line/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..ae6f07197a89fad1874680be815e6b30ea3c5001 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/avg_words_per_line/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8795d920d2488866023d2c0bba9165e07d80a006acf81c92801bdaaf92a595e +size 8582139 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/histogram/capitalized_ratio/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/capitalized_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..12bdbc783804983dc4a2fae8252390b550409e97 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/capitalized_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d36c22fa77e1504e687a5cd3dd09153997b03c338d04e306aec062287c25c1b +size 37302 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/histogram/ccnet_perplexity_wikipedia_en/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/ccnet_perplexity_wikipedia_en/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..db14a760de2b115e13660e0fc34541f2bc8b7315 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/ccnet_perplexity_wikipedia_en/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d93c80625af74c43beb1a586e1084a5e8960452365c1eaf22af0b1c33ca640b3 +size 3546837 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/histogram/digit_ratio/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..5d1bb2a5481fe033da753b2dbac3558419309e95 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:097da08ed8726b30329a76474606ef4664d4f21685ce7a35e4ef92f433cd6bf7 +size 21058 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/histogram/elipsis_ratio/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/elipsis_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..d76deebd92f91990dbd59a67b3f8cff71de4981e --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/elipsis_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:878564cff8224714c5d406478cf223593a30dc56173fa1d49fff249466026a3c +size 3140 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/histogram/fasttext_en/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/fasttext_en/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..cf8f5650e2d1a3b47e5d1acf364489cd9c633672 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/fasttext_en/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8521d3981d04bf7993eaba6a0d1ff9ad3feaefff2dedfb6deb6f93137913130 +size 15350 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/histogram/length/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..496d7ba0a81b8975c28beb91a1b51682b921cd80 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe3151b713326bb0d4c5b3af894d128da55f79113b84782072da121cf3bfe042 +size 6722317 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/histogram/line_char_duplicates/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/line_char_duplicates/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..428aa13c5d0facedafae5c43bb67ebd8eabc7e44 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/line_char_duplicates/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3b8834f61d1a3e04e7f371ffb4204b2b338316d28e2e753ee16c924eef6b010 +size 9202 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/histogram/line_duplicates/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/line_duplicates/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..8b741dffc584c00d786b48ffddb974e48176a4cb --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/line_duplicates/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5f98688c2d347252386b7c927dc4b609af0e1c8d903551d3867763c754ce9db +size 13042 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/histogram/lines_ending_with_terminal_mark_ratio/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/lines_ending_with_terminal_mark_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..e34719b0963b94b3ca094e929656bdc14d2932d1 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/lines_ending_with_terminal_mark_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f64fe35423f9f1ce8ba6769c3f72e65a61c02854a5b772269a17f8e70417a062 +size 42879 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/histogram/long_line_ratio_chars_10000/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/long_line_ratio_chars_10000/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..31c261799caca0d62bb8016a4e7087e701ad4ea6 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/long_line_ratio_chars_10000/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6450ce7e62996dde638d4ebc02235b526d92c9279c97343c57c082a36c41b8da +size 9446 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/histogram/long_line_ratio_chars_2000/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/long_line_ratio_chars_2000/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..a8e614e0ab9048c0948226761b8f12b3b71edb83 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/long_line_ratio_chars_2000/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7b56b0a0584a97c3fa951eb4e31a9138130b3d5d1bdb1d5a322315eb947d7be +size 30738 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/histogram/long_sentence_ratio_75/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/long_sentence_ratio_75/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..9af9728f06d80a8fa90a7588a2acdf5fa998cea7 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/long_sentence_ratio_75/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05e86ec221ecf0c0b887f6f0fe4f89d87fc2a4d260d196ec2188232573b8fc98 +size 42692 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/histogram/long_word_ratio_5/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/long_word_ratio_5/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..51dd18a2c6037c7e4d31a7891a948814d45c2c4a --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/long_word_ratio_5/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc28048282149381383613d26f989ae9f09a129ea95d3a6401914a7b9762ad61 +size 4598 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/histogram/long_word_ratio_7/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/long_word_ratio_7/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..de4b76b6c1e2e2e4a53fe1507f88421e26b58962 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/long_word_ratio_7/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f38f2ea281e6f5e05dba3ae0d0bdb758019633f535632fea236915bb2365a17d +size 33218 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/histogram/n_lines/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/n_lines/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..7fd4d057afe1355e39660c8ccccc9e2a7029b674 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/n_lines/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:124363a30c541a8a92c0aef0cc4aaff3d8f2c6dd6fd3e22148b513bd18d06a8b +size 308778 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/histogram/n_sentences/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/n_sentences/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..37d76136e9c06e4c38debf11af38c8137ed299a7 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/n_sentences/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23d55f1b08330ad7ff0798efdd33b164506bbd992e52be8192999ed7e54256a4 +size 226643 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/histogram/n_words/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/n_words/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..e66ce9520814aacee41eae7af305cf33c16b77da --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/n_words/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b180e2e689cef51c845c5bc65aeca155a77e512db0a42ba7d8d16cc6988eecc +size 2229347 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/histogram/non_alpha_digit_ratio/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/non_alpha_digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..8b14854de5bd98c2b2df37c9fafbb7cec1f2c452 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/non_alpha_digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd5894dc42f5f9955e677aaf1df6104e17094935c0c8ca32d6a74a338a003e35 +size 21837 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/histogram/punctuation_ratio/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/punctuation_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..7b4c883e00bc63b1f6853d66a0a9e022957ad800 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/punctuation_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f66b815f5d45c7f8f72afc3c2d62ac994ca751312935f1cd17b2caf2d6539a1 +size 20922 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/histogram/short_line_ratio_chars_10/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/short_line_ratio_chars_10/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..c5c82c2990a53d0eb91ca30ad3efea1288a4afd7 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/short_line_ratio_chars_10/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64b9648997de8078952b8530bee16c6665c70d3346584ac6d5bdfd2dfe902b5a +size 41186 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/histogram/short_line_ratio_chars_30/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/short_line_ratio_chars_30/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..3bbb483b71334db51f7520c338ae24bd90eb9af4 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/short_line_ratio_chars_30/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94d392c3c166a4680369760e7e90f212ee0c97b4087778e4b299b874b92fca4a +size 42904 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/histogram/short_sentence_ratio_20/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/short_sentence_ratio_20/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..3ead65f10905ff5161bb4073485961f27f7aaf9d --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/short_sentence_ratio_20/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8fb09ac0844d8ab45228d0ec1312cb6d4b06700459ae8bea0492f4814cd386b +size 40796 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/histogram/short_word_ratio_3/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/short_word_ratio_3/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..a137fde8d3cde7e8053de77514d4c4d2c39eee7b --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/short_word_ratio_3/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7caefec8b43c8448f9403b957f59ddf54c87bfcad11730061e8059fd9828605 +size 36552 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/histogram/stop_word_ratio/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/stop_word_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..4d53af7d0ae0cfb1f052d4ce2e939e6229128c97 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/stop_word_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:195194009a01136a76ff27599bbbe6356a45633e74a66492bdb857585e586e6f +size 17498 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/histogram/type_token_ratio/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/type_token_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..5eae0bc1160e9b5d7b015e4d03b0fe61882c3cd7 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/type_token_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a43000defc4523e47c9c4fc4d6181e83a5b2d9bef610ddf0a6496c6b8afe677 +size 41360 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/histogram/uppercase_ratio/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/uppercase_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..17effc5f5028ebc8b4fdb96663eeb70370671cd8 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/uppercase_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d5e7f3a471844207bfbff563eee02a658d5e8762541b99894bd381c5e28ba3d +size 29107 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/histogram/white_space_ratio/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/white_space_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..95e7627139bcdeb376c26b7ce7b1f5effa947916 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/histogram/white_space_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41808d5c75723f5e2de6eaa34a6687615e77cf7e7ebc6851fa0db2e7779ca2fa +size 13411 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/none/avg_line_length/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/none/avg_line_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..2053670a0b48c44a72d7e6dd5475e32d14abf5ff --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/none/avg_line_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ab9d11295d4903544a9562446c2bba82ef88ceff52621d37d908b0f30403608 +size 193 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/none/avg_word_length/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/none/avg_word_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..3287a6942a82b1e1fc2db8e9621d4a0e99f2bb24 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/none/avg_word_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8120df4bf6e235172e38fb470b7421b78cd1cd1d0201a1665e1d3d0b2cb45f5e +size 204 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/none/avg_words_per_line/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/none/avg_words_per_line/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..de2c3430a63da4add79fb29bdbb010770c9fdd82 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/none/avg_words_per_line/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b829e8f6d21d0f313f96202b9f7a2bc0a4d72e62755fe350d868a8ca261b7903 +size 192 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/none/digit_ratio/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/none/digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..0044141f6d4ec081fb5ad3bf2381d9ecd89ffee9 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/none/digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e10dc2247db7c49f8803bee84275318236faa77f95797ece72a4e4fd06adee0 +size 212 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/none/fasttext_en/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/none/fasttext_en/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..627b3fb840571087fbbe88ebbbea4e0a4e432e09 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/none/fasttext_en/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d769e56c6e5b9b97f35083ae704b8eccf98fed88f232c7b62c14254890fb2cce +size 222 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/none/length/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/none/length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..78c1ead6efa300010aa8b5b2ce438be3a1342faf --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/none/length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4efe9c94d45d5141b7568b4ab2256ed689178186ffa02be51b66c0328809367 +size 185 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/none/line_char_duplicates/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/none/line_char_duplicates/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..85627005bee1adcccd44288046b676476c73f883 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/none/line_char_duplicates/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:497bdec038462b72eab364c7f6e43200699ba419646e476c76d3cd4d27b8147c +size 195 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/none/line_duplicates/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/none/line_duplicates/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..ccd0b8bdfa2d2112ed220cd4801f30bd0ea15665 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/none/line_duplicates/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d0b22e4cfdcec2cf09d80b81019e5e9536e36a7c5fb5ab99a42ddae61c6ede1 +size 192 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/none/lines_ending_with_terminal_mark_ratio/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/none/lines_ending_with_terminal_mark_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..5dfa50eb55917f9c5314fa1cd5b5d7ef33fe6ee0 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/none/lines_ending_with_terminal_mark_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:911d308df972d1a7f6363447d059b27ecc0a2558e4664346726152668039f74a +size 189 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/none/long_line_ratio_chars_10000/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/none/long_line_ratio_chars_10000/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..f4b7ce1b8c561784990357755e4b70d408351c55 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/none/long_line_ratio_chars_10000/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f4fc3741f56881cf4ac7c45dc4e437817467c2650af34c549d92bd4b3e91c23 +size 195 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/none/long_line_ratio_chars_2000/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/none/long_line_ratio_chars_2000/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..b936cafb8eee1aa38fc882733af126a30ea15442 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/none/long_line_ratio_chars_2000/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5911c0c489e243480569793b6f95ac6e9043e0597fa188dc3f4611c06a782521 +size 194 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/none/long_word_ratio_5/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/none/long_word_ratio_5/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..5226484d9d0d35c5333a1abb12558ae6898c4c76 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/none/long_word_ratio_5/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38973f39e8a2fb654309b5589c72b8ba429eb5e6a2f775215f98e2df05bc293d +size 216 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/none/long_word_ratio_7/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/none/long_word_ratio_7/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..b0c21c104f7015157f914f7ce69c29f865beadf1 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/none/long_word_ratio_7/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce8d44e54ce32688edc5e13576006e9f71dcd282d593a5e086f38018fa0a7213 +size 205 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/none/n_lines/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/none/n_lines/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..c538a70475f45d51aeedadafb23f68507b42ef49 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/none/n_lines/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:658e305b1fe00438575fcbb8128457cac9d0f81b770db5da08fd407877e21f68 +size 180 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/none/n_words/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/none/n_words/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..e4d585d8ab66334003b66a2a1bbdbc8420c8eafb --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/none/n_words/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b69dd2b89b5584282d519a1ee2fc4d0f9990394afe9b6b814900ec7320c11816 +size 181 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/none/non_alpha_digit_ratio/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/none/non_alpha_digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..1e12ff3e9e9b0f2d2f436940e26da217bee6f391 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/none/non_alpha_digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3b91c3c8b66106dc81f94b66f4c81278ce9451503d2cd6935be5d763d3fc28e +size 223 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/none/short_line_ratio_chars_10/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/none/short_line_ratio_chars_10/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..8d0ce76d82b85f9241ba2682ca83e06f10f45eb6 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/none/short_line_ratio_chars_10/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94e6b94550fd4a1ff9a93ad5edcc1049dc942cad614bb0252b7a8d67da612aa9 +size 206 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/none/short_line_ratio_chars_30/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/none/short_line_ratio_chars_30/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..ab5f9f3de262d87b8ac44ac370f7965a9f8695c4 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/none/short_line_ratio_chars_30/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a08e2b60d02279492138dd72eee37d4d6ea51a864c1850461e62b58755ece7b +size 191 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/none/short_word_ratio_3/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/none/short_word_ratio_3/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..6808da1ce1628aaf4713a18ab04c10881625bc45 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/none/short_word_ratio_3/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:966172bb6397af992f456a08550537df4692ddb2af9a61aaf82c14aa7e92fd77 +size 225 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/none/white_space_ratio/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/none/white_space_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..e6409ce2b5f053c03c1b0384ff1f90f3eb846508 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/none/white_space_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e60fcb87073d7385ee0e9e0d08bf4c9fc059b939c0d5cf28b566a680032b455 +size 227 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/suffix/avg_line_length/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/avg_line_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..f5293c666475530ea95ae1c9c4b78405931f6e38 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/avg_line_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:865cb05dc87b885f50cc59e5fa2d66f895ebe225b59995b7ce722e7f2619c4ce +size 283365 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/suffix/avg_sentence_length/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/avg_sentence_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..7600ce6368d6ea84f97f054b149cd4cf0b19965c --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/avg_sentence_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8148e019c366b29a970c9dba5d52f0bcee5d0a7513d0376fb97bac3e453ba810 +size 279887 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/suffix/avg_word_length/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/avg_word_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..3f4ca878956cc057e6be28827fe09b335f0d1f27 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/avg_word_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba81d61abac2157756500a4eb9f4b46a365a69d5ff5e6783dc80085a26e71a7e +size 300382 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/suffix/avg_words_per_line/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/avg_words_per_line/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..3d7e3f4cb504de8d207aeba203be2aa4c571ad7c --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/avg_words_per_line/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:015c893eca8ad614adeb6b68ab3b1091e581f4630f33c05e987989613cd9c907 +size 282445 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/suffix/capitalized_ratio/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/capitalized_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..ac0edbe767626bbe1f0abe844d56ad0fcc5bdce8 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/capitalized_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c676239b23a2cc090354d5a11293429c0e6ebcca7a31b7acdbdd39e02898d541 +size 169985 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/suffix/ccnet_perplexity_wikipedia_en/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/ccnet_perplexity_wikipedia_en/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..e561b61083338a38bd810d95f65ca3cef48779b2 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/ccnet_perplexity_wikipedia_en/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a976b78ba05a1006e4527fe5670f1b5806bb5677c0cc034873a0d8703d2d54e +size 253225 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/suffix/digit_ratio/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..23caa4605079bcae8cc74c0c1e4de82d8d9cc6b7 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46230bc7774658951a82d548c8b03a3d351b84b50fc7a7b87671f887c4a0ebff +size 297340 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/suffix/elipsis_ratio/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/elipsis_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..715593bd86897e4844f0e8caa5e7e0239728ce3f --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/elipsis_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9435ef007f3ecd41ccc5467d02bb278ffec1d380feaefd316fbb9cd1dd0bf647 +size 233110 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/suffix/fasttext_en/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/fasttext_en/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..164ca1ee36d3bd87716fcaa8f1b3a9e390bbc95c --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/fasttext_en/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c64ee0174ff44eb61fa346827412474ff859d8b5f5cdddb722f9b2a9dc7c2dad +size 309349 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/suffix/length/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..2c7ffa5161c049e8e58547ac0f45d59c4de06d0a --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53b00a6ce0aff3409a8d27f19747996c22d98849a864bb63c365c92151bc5276 +size 240613 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/suffix/line_char_duplicates/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/line_char_duplicates/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..169c958381e43a5d4ad24fc9b35e159dd6e9c408 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/line_char_duplicates/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75207fc787e09f5e8eb7f4f7bec0c06c0995227872b002b058b4ce15e9ea9fae +size 262211 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/suffix/line_duplicates/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/line_duplicates/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..280da988d665b8c7361668267f27193b007b23f8 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/line_duplicates/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:228e1abb8408c6162ace8c211fb8aa4b838b49d2eb941c7a9cf1dfa025e80928 +size 245690 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/suffix/lines_ending_with_terminal_mark_ratio/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/lines_ending_with_terminal_mark_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..e6ef93d1f2d6164e2799811e9e22272014a2d353 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/lines_ending_with_terminal_mark_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88bcb060ccef633f7032e4a0a5578c597d31bfac7b28fb7afc488eb8dbc85534 +size 272126 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/suffix/long_line_ratio_chars_10000/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/long_line_ratio_chars_10000/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..071a3dac69fcacd2717d8ec7056e36386d6c59b3 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/long_line_ratio_chars_10000/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ef8d1b0bba258adbc0233a41bcd9d40f2ad5e0d3e9b195f00d09de7510ca9e7 +size 51750 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/suffix/long_line_ratio_chars_2000/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/long_line_ratio_chars_2000/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..75917a1ace86091f7c10ac5e253c9b9a7850bae5 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/long_line_ratio_chars_2000/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dedf145b9148b5bb4718bba2450b6d21ba5f7bd9802df3e1884209eb80519e43 +size 119498 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/suffix/long_sentence_ratio_75/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/long_sentence_ratio_75/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..790b67ba998eeabd85ee99423ae6933611ef8e88 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/long_sentence_ratio_75/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12d6cadaae84bdf11bf90db8bc3603076eb6670e4001f7be0bc86be27b8e3aef +size 275260 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/suffix/long_word_ratio_5/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/long_word_ratio_5/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..a261b543b66874af1d7a2fd6b337179795a168eb --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/long_word_ratio_5/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d565e75562cdd214c476ad25d5e8eb866f93cda2617bc201871d16d0d7734244 +size 301317 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/suffix/long_word_ratio_7/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/long_word_ratio_7/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..8cba408083ff1b0872059d72d1c1f401f010a855 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/long_word_ratio_7/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ff2cf66c90031859c49ac673ed305a4c62b400cafa56b8d0b37a222eb13c5bf +size 309501 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/suffix/n_lines/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/n_lines/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..499cb3b1a9b10d16aa2d844688a9d657f94df987 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/n_lines/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac9693e83fe49fd88bee056ef4f69a8d27811530a21d4c45f984d98680da905c +size 228960 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/suffix/n_sentences/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/n_sentences/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..50aea7b989df0acdfde6aca9988074e220dbe7cb --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/n_sentences/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87176198c3bd473e04acba5b30b52b6a87b1a7da587a18bbbca9d4bd33ce1940 +size 228628 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/suffix/n_words/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/n_words/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..c0d73113b2705eb86d7ddbd1ee468fa04789116a --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/n_words/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:190646c3cf0e8457a696084a2353be59d95ad2232aed353550612a2b2f50a07a +size 236299 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/suffix/non_alpha_digit_ratio/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/non_alpha_digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..39fe4b7ccb8d15c1ec891a13ce8b300f3ab9d726 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/non_alpha_digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d550ef3698d7ece140a87b644c65b44225891993aa49a5b4db861d2e545b8ba +size 314682 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/suffix/punctuation_ratio/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/punctuation_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..ce9455fd47d47bac41fe0faef32a44140fa592b1 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/punctuation_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e3dd2d777ddc89e84d84dc1b93b3d020460b8f488ec248e83c8f16dc3a59011 +size 317597 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/suffix/short_line_ratio_chars_10/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/short_line_ratio_chars_10/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..1cae90cedf3f8b0acc1886cd4050fc7f6d1add3f --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/short_line_ratio_chars_10/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bbb41b64301117c1f814e3f40a387e1f42303cfd6e5cfa9fbab0a9ea040e58c0 +size 267153 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/suffix/short_line_ratio_chars_30/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/short_line_ratio_chars_30/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..c9904110ac13b44522ba407ec34b4eec6a9aa3fd --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/short_line_ratio_chars_30/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:077a1eb4caa230c8e7e753d20ba8c60250cb805aec36a4bfc96666b1a2ed0bfb +size 277357 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/suffix/short_sentence_ratio_20/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/short_sentence_ratio_20/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..1b897249f931b1951ac71fdc3b079460a917b11e --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/short_sentence_ratio_20/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ff541e946361017d30a31e392cc3f7f5cdc687fc93246d2ea8481471367f44a +size 247698 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/suffix/short_word_ratio_3/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/short_word_ratio_3/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..ccacae995592fe2ef2b5615cb7143ba253878d52 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/short_word_ratio_3/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:979bbd395b38b11bd248f0bbbd81adf0bd9f233876fe7e299d818a9b8e8e1b11 +size 309080 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/suffix/stop_word_ratio/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/stop_word_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..66b6cf91f93215c750b55657fbb862b8070c28bb --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/stop_word_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5262b025f3216c4a63a5e7224c481883ae2b09610ea94472c8072387ba6d7d1e +size 312507 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/suffix/type_token_ratio/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/type_token_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..ac27e875980766d323abbabc343b116b14001065 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/type_token_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf8d594087ab0501b33b9b60de4dcd5b7ff5d06b585eff49c68d392aeb3f00fa +size 306711 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/suffix/uppercase_ratio/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/uppercase_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..f613c50aaa8d54b92bb47cda83e5e35a360130ba --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/uppercase_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74f47ed183576f93e59dc78e5f4b3d5ed722e278a625491900b30491ce12b510 +size 314865 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/suffix/white_space_ratio/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/white_space_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..50f30eca4023aa363e0395f1be65d6432c662ccc --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/suffix/white_space_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9af96039dabad319255622465584e4add3f97739e8a13cf859bdb40b97010ef +size 315345 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/summary/avg_line_length/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/summary/avg_line_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..018548e8f24b5db79c5bb764e9ce4c0f2c6588f4 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/summary/avg_line_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28edaa997d3523169d8d8d8e3458d800c4a1a0177850adef6de9fac51f530ff9 +size 207 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/summary/avg_sentence_length/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/summary/avg_sentence_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..f29e98fcdb482bf493479547f3d1e841b428529a --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/summary/avg_sentence_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f1407868bce18a251c051d93ad56e323ccaff5fd63343fbe8ed6d6124647394 +size 203 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/summary/avg_word_length/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/summary/avg_word_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..4ac7e2039750e49ca9031c47fbb33ab1bbac4b7a --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/summary/avg_word_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:797e4495c377b8cbbeb1de4d66a4b2314a2d98f2daf04cb9578e77d4207675c1 +size 218 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/summary/avg_words_per_line/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/summary/avg_words_per_line/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..a613d943ff343c1a2e23ab1bb0d0e8a877e1726d --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/summary/avg_words_per_line/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:151746fe1cc0c54c40a58268e45036f7201e11e30c8c8330010ac3e5adc34c03 +size 191 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/summary/capitalized_ratio/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/summary/capitalized_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..30698b80e54f52f5e21cedc7a2dd9d92b2f6fd81 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/summary/capitalized_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74c7b1fd3209963b23979b76719cadfc56822bafe3d122d52a85aa2e7fbfae8d +size 206 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/summary/ccnet_perplexity_wikipedia_en/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/summary/ccnet_perplexity_wikipedia_en/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..19f93bf04fe9f7c61b636da576c434d187a27a2d --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/summary/ccnet_perplexity_wikipedia_en/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ca92e8952ab5f0f7e52b28f3c60bfcc07c4421b9648f418217bd339f4706848 +size 192 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/summary/digit_ratio/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/summary/digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..1ec8e8bbbcfc9844e92480bee67a0f164f24dd2a --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/summary/digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14a9d38be43c5d1b8016a55b0a7952582e12a28a1abd570353278d34cfd4bf71 +size 210 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/summary/elipsis_ratio/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/summary/elipsis_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..32c17e9c60493681005b23ed054ac9928218298e --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/summary/elipsis_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:504daf02a31c4c63630eeb8f846d659d9d1eb44446655dc9419a65c5e411c01e +size 215 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/summary/fasttext_en/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/summary/fasttext_en/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..ed866054aa37d34f8ffde13b6bd88989f89e4b1b --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/summary/fasttext_en/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3db118bfd2cc684dd219150e0f9793c362e65ebde708346cf740662ac4239dd +size 222 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/summary/length/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/summary/length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..d2a57811d26c3ace3d7626a5c8015520ed25e1f8 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/summary/length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5470b44b1ac9dd31ad9219cb6789b5a05005acb8dd8129df916170d475ebd812 +size 185 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/summary/line_char_duplicates/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/summary/line_char_duplicates/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..b1eef7bc8a66c918baeeab178dc03be21488d61a --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/summary/line_char_duplicates/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ff47ca89ca4b9c91befc95d4b27e8bb2ee3fd3c045726ea4fdf24fab715af99 +size 210 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/summary/line_duplicates/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/summary/line_duplicates/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..310dc7dcd44aacce22695ca6dd73367be3e3befc --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/summary/line_duplicates/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30bc00d26d6e6a9b522c463949d7de400bd46df6829b6193cdb489dae8920daa +size 209 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/summary/lines_ending_with_terminal_mark_ratio/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/summary/lines_ending_with_terminal_mark_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..9966b0409b15aed71da66be7d12a30fd8f0e8bd1 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/summary/lines_ending_with_terminal_mark_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:417af43a4cbdd897c6512331262de9fbe398aa956f4b4e0932d9d0b4b290647c +size 191 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/summary/long_line_ratio_chars_10000/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/summary/long_line_ratio_chars_10000/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..01821fbac0653902b070c7e0d8ab18cbe0913d82 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/summary/long_line_ratio_chars_10000/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b8c01f9ee69f75d8766e400f994d7642c582974e89ab2c65e6a3adb13175f8c +size 197 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/summary/long_line_ratio_chars_2000/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/summary/long_line_ratio_chars_2000/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..468cf6e86c0590a4de5def80727b6ab936375c4f --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/summary/long_line_ratio_chars_2000/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8050f41a47f92ba8654112564fc4c4acfb6bf12f3c10954bfa036189047ec26f +size 195 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/summary/long_sentence_ratio_75/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/summary/long_sentence_ratio_75/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..8e496f4f771effd554d3440ab49bdce9ea42d436 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/summary/long_sentence_ratio_75/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97756f63c837722b1a2b2f24c007d84c572d95452db8d716710162d8d4df7f20 +size 190 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/summary/long_word_ratio_5/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/summary/long_word_ratio_5/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..5226484d9d0d35c5333a1abb12558ae6898c4c76 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/summary/long_word_ratio_5/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38973f39e8a2fb654309b5589c72b8ba429eb5e6a2f775215f98e2df05bc293d +size 216 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/summary/long_word_ratio_7/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/summary/long_word_ratio_7/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..acc1ede16cbbb02f9a8e36c4ae80ec92ba036a6e --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/summary/long_word_ratio_7/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9dece4d1f4fd653465399dbe1561ee04aa777f024a74f7ceaf04f3e7c4ccf3b7 +size 205 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/summary/n_lines/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/summary/n_lines/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..6f76cd4750bd5184d368151739c70a39479f102b --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/summary/n_lines/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61dbcf7260180cf2773e78a3f562db3012a8d789caa9bf62278ae7f25d6ae664 +size 178 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/summary/n_sentences/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/summary/n_sentences/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..8d5b4c845cf7e6d3312dff860b8d6620b97cbcb5 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/summary/n_sentences/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:738f4e259df3a45cfd02155ada0e6125d02b6349f0291b4f0513df440ce088d7 +size 178 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/summary/n_words/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/summary/n_words/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..42bf28c885e9d4c97e4319b801fbe9a21155ed50 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/summary/n_words/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b3cbd211d8e39cedb6c0fdefd998a764f2aeb55ed9b5f1f3b169e64f0c498ee +size 183 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/summary/non_alpha_digit_ratio/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/summary/non_alpha_digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..9940ff0350850cf7d5872db48f243265f5bbbb40 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/summary/non_alpha_digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fc94cc13afa07fd86b9501ba5e2e4c3a28618a6ccaec73bceee73b22bf8b3d0 +size 225 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/summary/punctuation_ratio/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/summary/punctuation_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..822bf1b017f3d939276e13246886fd36b5ec10e1 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/summary/punctuation_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49c9816637e9b6c29cc1c97d0d874be15872ac0f22da5a694ece21800bd89f0c +size 211 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/summary/short_line_ratio_chars_10/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/summary/short_line_ratio_chars_10/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..f2896bc1240fab1d7c02acaf29b05cc630fe1449 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/summary/short_line_ratio_chars_10/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12639a677a785d655c15c6b3654350065c0e39c27a618540284cc35bcad719b9 +size 207 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/summary/short_line_ratio_chars_30/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/summary/short_line_ratio_chars_30/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..978880b216f627a5529bcd0e75fa8f4976619737 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/summary/short_line_ratio_chars_30/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0bb9d954cc9ac4174f86ef1b4ebfb95fd212ba9cd499b0bdffead300a463e66 +size 189 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/summary/short_sentence_ratio_20/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/summary/short_sentence_ratio_20/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..e78a9592b22335572bedf4bd535187c75c41b6b0 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/summary/short_sentence_ratio_20/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be9f75022a6d05549d6940ac89684e0e2669b5719d081fd0d1e22e59f073c166 +size 192 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/summary/short_word_ratio_3/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/summary/short_word_ratio_3/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..27e0ed42bead787572191d4c124cce729d6ea1e7 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/summary/short_word_ratio_3/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:992791f179a92c7bf39cf2d6128be9dda0caec4ec1a2a71030feb80456267737 +size 208 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/summary/stop_word_ratio/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/summary/stop_word_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..6ddf407688dcafa91da9f8b410669835589133bf --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/summary/stop_word_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47c482b279cc4a0012e37d7d2e8860d5558b23c236a2a501a15b29e78974b2ec +size 226 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/summary/type_token_ratio/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/summary/type_token_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..58481f0e9cfb2d0a0c20692116c2d32abc73d8b3 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/summary/type_token_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5cbafdec3ba6bdfec9325720ff98a172b7fe3f5ac6e22e6b56b68bbbef0eca10 +size 206 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/summary/uppercase_ratio/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/summary/uppercase_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..ef0c164b2fd8c8f6159f94139304d90791bf3ebf --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/summary/uppercase_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9d16cfa2719968f9ed0ba41682cd76c4e741b1d24cea90d98a56d23824449de +size 209 diff --git a/datasets/dedup_independant_CC-MAIN-2013-48/summary/white_space_ratio/metric.json b/datasets/dedup_independant_CC-MAIN-2013-48/summary/white_space_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..001fc6019fb736576d2d40d6b91952cf3f635aa3 --- /dev/null +++ b/datasets/dedup_independant_CC-MAIN-2013-48/summary/white_space_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:daf4a5f184e235bb234ffc00ed59d01736e2bd6b383765a3540c241fd33792ed +size 228 diff --git a/datasets/dolma-v1.7-cc/fqdn/avg_line_length/metric.json b/datasets/dolma-v1.7-cc/fqdn/avg_line_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..f76faf3812ca460c79abaef1106af610dd14ee4c --- /dev/null +++ b/datasets/dolma-v1.7-cc/fqdn/avg_line_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3aedda7fb795c83e48afaad852cd3da022a96132547c9e26ccbef4a07edf105 +size 1618876 diff --git a/datasets/dolma-v1.7-cc/fqdn/avg_sentence_length/metric.json b/datasets/dolma-v1.7-cc/fqdn/avg_sentence_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..e0671a97aee6179e680eb2e8b953ab29d56d8c86 --- /dev/null +++ b/datasets/dolma-v1.7-cc/fqdn/avg_sentence_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cbff4cb8e9714bf0b95968c1def2da1d8affb6df11e9e84bb7f45a52f3b2571 +size 1646797 diff --git a/datasets/dolma-v1.7-cc/fqdn/avg_word_length/metric.json b/datasets/dolma-v1.7-cc/fqdn/avg_word_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..a2face977154a349f8d601775404f8a04952b6c6 --- /dev/null +++ b/datasets/dolma-v1.7-cc/fqdn/avg_word_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32c6c458585e9d16ecd6f3fc4a339d679a1d06c4cb1def96b1dcd39ce425bafe +size 1750688 diff --git a/datasets/dolma-v1.7-cc/fqdn/avg_words_per_line/metric.json b/datasets/dolma-v1.7-cc/fqdn/avg_words_per_line/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..29c830af9ddf8645f5a84d81436f4cbebf0fc151 --- /dev/null +++ b/datasets/dolma-v1.7-cc/fqdn/avg_words_per_line/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:758287bc5a3b554eeb4612ffd20beac8cba32f314caeb7ce9e7d10a224ee40c2 +size 1609228 diff --git a/datasets/dolma-v1.7-cc/fqdn/capitalized_ratio/metric.json b/datasets/dolma-v1.7-cc/fqdn/capitalized_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..9dc7f4b659684ed96804c8e7652dcd944a020225 --- /dev/null +++ b/datasets/dolma-v1.7-cc/fqdn/capitalized_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fcc054a47984867b8420f1850be915c50b5c10b11f38d30ae714ef9cc4ad03c +size 1802312 diff --git a/datasets/dolma-v1.7-cc/fqdn/ccnet_perplexity_wikipedia_en/metric.json b/datasets/dolma-v1.7-cc/fqdn/ccnet_perplexity_wikipedia_en/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..c00e507657c7af36917e7ae2dac5ec4348b53139 --- /dev/null +++ b/datasets/dolma-v1.7-cc/fqdn/ccnet_perplexity_wikipedia_en/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f80ab238797e9b8fc51bf2926de336b4e517f148aa096a65adaef8f3ec51d27 +size 1542048 diff --git a/datasets/dolma-v1.7-cc/fqdn/digit_ratio/metric.json b/datasets/dolma-v1.7-cc/fqdn/digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..ce4782f7043f808c07927107989b9e5ae91e7817 --- /dev/null +++ b/datasets/dolma-v1.7-cc/fqdn/digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f80c32de14f87c9fd31ccacfc9e73a39531e1bd34a469c9d42c05f12ebdbf45 +size 1727094 diff --git a/datasets/dolma-v1.7-cc/fqdn/elipsis_ratio/metric.json b/datasets/dolma-v1.7-cc/fqdn/elipsis_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..4e2e231ddfa2a4e8326dc957755f2a6f9e149a8e --- /dev/null +++ b/datasets/dolma-v1.7-cc/fqdn/elipsis_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b7cb5aeb444e871469ba53c93dccaad8090dbf6a3a73c4ff35b4057acefbbe4 +size 1694755 diff --git a/datasets/dolma-v1.7-cc/fqdn/fasttext_en/metric.json b/datasets/dolma-v1.7-cc/fqdn/fasttext_en/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..f9bc0f08e25a96874c43c59a65c852f6c24e1980 --- /dev/null +++ b/datasets/dolma-v1.7-cc/fqdn/fasttext_en/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4bbaf6eb4126dee72f467849b32e10ba8d2c62a31978fbe659ef540e3d6203a +size 1569461 diff --git a/datasets/dolma-v1.7-cc/fqdn/length/metric.json b/datasets/dolma-v1.7-cc/fqdn/length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..90720811ef56a3623833b9bdfd828c1c2157545d --- /dev/null +++ b/datasets/dolma-v1.7-cc/fqdn/length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d47d2e4b6d69c3f8e27ce99afe053a89398de638157169b7a62db639201265f6 +size 1460496 diff --git a/datasets/dolma-v1.7-cc/fqdn/line_char_duplicates/metric.json b/datasets/dolma-v1.7-cc/fqdn/line_char_duplicates/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..83dba7e462bb92ff10a52eb41e4e5c8f2357a4a6 --- /dev/null +++ b/datasets/dolma-v1.7-cc/fqdn/line_char_duplicates/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c51374e7868940dc6471a97f9f2a24dbc685ac19577a1005287a8561977dfb2 +size 215008 diff --git a/datasets/dolma-v1.7-cc/fqdn/line_duplicates/metric.json b/datasets/dolma-v1.7-cc/fqdn/line_duplicates/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..1eb8a24c5e525cd4d81a590857225b855c005edc --- /dev/null +++ b/datasets/dolma-v1.7-cc/fqdn/line_duplicates/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70aa2bb84e02cb9145ae1429a6a6916b3cda80e1deda97ff47b030b6cf45b269 +size 1325266 diff --git a/datasets/dolma-v1.7-cc/fqdn/lines_ending_with_terminal_mark_ratio/metric.json b/datasets/dolma-v1.7-cc/fqdn/lines_ending_with_terminal_mark_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..39b6a82cc3c9106a9bd77919c7bd7bcc501c4e75 --- /dev/null +++ b/datasets/dolma-v1.7-cc/fqdn/lines_ending_with_terminal_mark_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b3803450988be8ed8b278856afebbafc03b9c99e74bf83194aaf7c3f37a3e03 +size 1609470 diff --git a/datasets/dolma-v1.7-cc/fqdn/long_line_ratio_chars_10000/metric.json b/datasets/dolma-v1.7-cc/fqdn/long_line_ratio_chars_10000/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..352d08cc8f9eab67700f753949dce25fcafa58da --- /dev/null +++ b/datasets/dolma-v1.7-cc/fqdn/long_line_ratio_chars_10000/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4fb5cc3043c0c804a17862ab889ab347debf8168e11a6da818272475513e8f6a +size 228306 diff --git a/datasets/dolma-v1.7-cc/fqdn/long_line_ratio_chars_2000/metric.json b/datasets/dolma-v1.7-cc/fqdn/long_line_ratio_chars_2000/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..7e234811cb632498c20c9c69a13a80c1df87d311 --- /dev/null +++ b/datasets/dolma-v1.7-cc/fqdn/long_line_ratio_chars_2000/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3728aa46e139df7eee4a32f40bd7c2e1ba835a4554f6b34acd2200ed2c324446 +size 726692 diff --git a/datasets/dolma-v1.7-cc/fqdn/long_sentence_ratio_75/metric.json b/datasets/dolma-v1.7-cc/fqdn/long_sentence_ratio_75/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..09b6e95815095054297a4af0715692ade2511b5b --- /dev/null +++ b/datasets/dolma-v1.7-cc/fqdn/long_sentence_ratio_75/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b999512f00bcce0082d1f4547fcd6c6dbf3b7b6ee5456bb09b820239c7d0e691 +size 1612844 diff --git a/datasets/dolma-v1.7-cc/fqdn/long_word_ratio_7/metric.json b/datasets/dolma-v1.7-cc/fqdn/long_word_ratio_7/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..987a6614f22b1e8222549313766acbd8f240221c --- /dev/null +++ b/datasets/dolma-v1.7-cc/fqdn/long_word_ratio_7/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16d3c5d7fd63c48fd1aa5544222722f65ff47d5da3b280b316f183890da337c1 +size 1801732 diff --git a/datasets/dolma-v1.7-cc/fqdn/n_lines/metric.json b/datasets/dolma-v1.7-cc/fqdn/n_lines/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..d490a47358e99669a6463c6f337220d2b1ca0dd8 --- /dev/null +++ b/datasets/dolma-v1.7-cc/fqdn/n_lines/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bd2d2bb19d8e77d5775d81fe8f239ff90e87bbe3321faf17f4317c620966ea9 +size 1400764 diff --git a/datasets/dolma-v1.7-cc/fqdn/n_sentences/metric.json b/datasets/dolma-v1.7-cc/fqdn/n_sentences/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..15ecc46134bd92373be72bc3ece9d0ff5c36463c --- /dev/null +++ b/datasets/dolma-v1.7-cc/fqdn/n_sentences/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7806624cf69c554fb276e73156d31b18c1d58fb87cd4f28ae89038aed605c32 +size 1410080 diff --git a/datasets/dolma-v1.7-cc/fqdn/n_words/metric.json b/datasets/dolma-v1.7-cc/fqdn/n_words/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..cb4e5c35a3f0641dca960699426275c186a3cbee --- /dev/null +++ b/datasets/dolma-v1.7-cc/fqdn/n_words/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7643c3218a5a75992e2d708ef9994d37d015c3eb5d2eeecde8e874a4906e1e64 +size 1439873 diff --git a/datasets/dolma-v1.7-cc/fqdn/non_alpha_digit_ratio/metric.json b/datasets/dolma-v1.7-cc/fqdn/non_alpha_digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..51c8e32a05d2915fa335fdb6c6b51175ba7491bc --- /dev/null +++ b/datasets/dolma-v1.7-cc/fqdn/non_alpha_digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4123bd7f8114e0cd9a1d4350ae5835f15e5ba6ca421b04eb1674b147a07e3ad7 +size 1836353 diff --git a/datasets/dolma-v1.7-cc/fqdn/punctuation_ratio/metric.json b/datasets/dolma-v1.7-cc/fqdn/punctuation_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..fd8c31c833af5062ad4d50393d432bdf63a8a525 --- /dev/null +++ b/datasets/dolma-v1.7-cc/fqdn/punctuation_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f13bfa71d1dacc0a7b57f7b8fc902b7808d8b216e85627e54e0c58d70a13ee2 +size 1852778 diff --git a/datasets/dolma-v1.7-cc/fqdn/short_line_ratio_chars_10/metric.json b/datasets/dolma-v1.7-cc/fqdn/short_line_ratio_chars_10/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..2b9a3391eda8d2c9df315d4fe955a9cc5d1cc55c --- /dev/null +++ b/datasets/dolma-v1.7-cc/fqdn/short_line_ratio_chars_10/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b8ef13b2f9b94921e46862d9f65f0351b9ea4e8106b36c64346173c8d12f11c +size 1598982 diff --git a/datasets/dolma-v1.7-cc/fqdn/short_line_ratio_chars_30/metric.json b/datasets/dolma-v1.7-cc/fqdn/short_line_ratio_chars_30/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..1710799b457fe9fd2c2bd9a4d0bd41c23c6505fb --- /dev/null +++ b/datasets/dolma-v1.7-cc/fqdn/short_line_ratio_chars_30/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ad3dc5f4b15b3ef7ed9a61ad7e1ddc674af4a8bdcdbef236cba51b7ff6f0f15 +size 1605004 diff --git a/datasets/dolma-v1.7-cc/fqdn/short_sentence_ratio_20/metric.json b/datasets/dolma-v1.7-cc/fqdn/short_sentence_ratio_20/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..4256f16408ff7f53bde098def5a82d0772f65096 --- /dev/null +++ b/datasets/dolma-v1.7-cc/fqdn/short_sentence_ratio_20/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2f47df94ed2ef093ff0f655b7f6d078e2d960f822396bcb44707e21498c5341 +size 1643924 diff --git a/datasets/dolma-v1.7-cc/fqdn/short_word_ratio_3/metric.json b/datasets/dolma-v1.7-cc/fqdn/short_word_ratio_3/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..583818e80802c7cd980c8987a629683967ae52df --- /dev/null +++ b/datasets/dolma-v1.7-cc/fqdn/short_word_ratio_3/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c0410b9395806a321ea0a866035a921bb25d91cd646e4b7d2c2f39c9574cd22 +size 1795723 diff --git a/datasets/dolma-v1.7-cc/fqdn/stop_word_ratio/metric.json b/datasets/dolma-v1.7-cc/fqdn/stop_word_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..14059e356390246448ed4a3136da8fb84dbaba64 --- /dev/null +++ b/datasets/dolma-v1.7-cc/fqdn/stop_word_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2bc21c9e6988ab26a6817286473f34f1adb81e160c7003e61e284100bc4aaba9 +size 1789067 diff --git a/datasets/dolma-v1.7-cc/fqdn/type_token_ratio/metric.json b/datasets/dolma-v1.7-cc/fqdn/type_token_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..79860d3a4a42029faef97947a5fb9b3e36ec83c7 --- /dev/null +++ b/datasets/dolma-v1.7-cc/fqdn/type_token_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:383ef5021b9c65fdb00864cc6f7383be142ecf25ee82b10e6f6c51402a3fc0af +size 1756964 diff --git a/datasets/dolma-v1.7-cc/fqdn/uppercase_ratio/metric.json b/datasets/dolma-v1.7-cc/fqdn/uppercase_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..7bc31b29a486e64004dd0e9893ec5c51bfed5fa1 --- /dev/null +++ b/datasets/dolma-v1.7-cc/fqdn/uppercase_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f7d72bcfbd1217ade3755a280fad699a1ab8a446f36aa4ee08da0487e88e549 +size 1842294 diff --git a/datasets/dolma-v1.7-cc/fqdn/white_space_ratio/metric.json b/datasets/dolma-v1.7-cc/fqdn/white_space_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..326823ca928fd8ddf133682ef51d306ae411f882 --- /dev/null +++ b/datasets/dolma-v1.7-cc/fqdn/white_space_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b76767425200c1150a33c01bb8bbddf669152df13686689b004454985bec104 +size 1834413 diff --git a/datasets/dolma-v1.7-cc/histogram/avg_line_length/metric.json b/datasets/dolma-v1.7-cc/histogram/avg_line_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..a191e4701c3d7e5624b60a364193de95fc7f8051 --- /dev/null +++ b/datasets/dolma-v1.7-cc/histogram/avg_line_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:216371f4cb4a938c6f19f04076bde7728617e76a67d6c33c287ef2709510267d +size 16963981 diff --git a/datasets/dolma-v1.7-cc/histogram/avg_sentence_length/metric.json b/datasets/dolma-v1.7-cc/histogram/avg_sentence_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..b5464199cc5cd2a6c406ffcb068d6aad41433b3b --- /dev/null +++ b/datasets/dolma-v1.7-cc/histogram/avg_sentence_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:145b40c06edce6038644b2724461e35d41ecb8a0b7c8d2de8a304c6a21217ff9 +size 8037351 diff --git a/datasets/dolma-v1.7-cc/histogram/avg_word_length/metric.json b/datasets/dolma-v1.7-cc/histogram/avg_word_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..7fb65b31367b6fd890fef5a3f5c8a837ba049318 --- /dev/null +++ b/datasets/dolma-v1.7-cc/histogram/avg_word_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ead804e85be2d2a27be219b5666fa82a8bcb5abdda71c88b5289afd80c524ff +size 234200 diff --git a/datasets/dolma-v1.7-cc/histogram/avg_words_per_line/metric.json b/datasets/dolma-v1.7-cc/histogram/avg_words_per_line/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..2fda316b1bc48952e820cad433d0ba6bff13754c --- /dev/null +++ b/datasets/dolma-v1.7-cc/histogram/avg_words_per_line/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3301e3309d293a880ba0f62f7ae34a00380812504049c7600a506fc58405c077 +size 5669236 diff --git a/datasets/dolma-v1.7-cc/histogram/capitalized_ratio/metric.json b/datasets/dolma-v1.7-cc/histogram/capitalized_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..60393105eea6670a7beef250eedd5d241d7223b9 --- /dev/null +++ b/datasets/dolma-v1.7-cc/histogram/capitalized_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7718ea2442e61e9fd6cf2d443c9ce03173786fabe72d718c469c81b499e35e2e +size 36211 diff --git a/datasets/dolma-v1.7-cc/histogram/ccnet_perplexity_wikipedia_en/metric.json b/datasets/dolma-v1.7-cc/histogram/ccnet_perplexity_wikipedia_en/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..f74da87d7e3f6ece9040d72d359d6d575f4b498d --- /dev/null +++ b/datasets/dolma-v1.7-cc/histogram/ccnet_perplexity_wikipedia_en/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:984e2a1e44f8be1b468d400a11dc1f6ddaadb3cf8e759338d1b71c65f6912637 +size 1792560 diff --git a/datasets/dolma-v1.7-cc/histogram/digit_ratio/metric.json b/datasets/dolma-v1.7-cc/histogram/digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..8188b67f0e9b5c1bcad59586b175f73b6e151338 --- /dev/null +++ b/datasets/dolma-v1.7-cc/histogram/digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:feeb4e844f9a272a17c6ddb1b2bd45dd1b1d6094d9255da0bf13efca9b28b892 +size 18869 diff --git a/datasets/dolma-v1.7-cc/histogram/elipsis_ratio/metric.json b/datasets/dolma-v1.7-cc/histogram/elipsis_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..c5f18f0147605342d29e30d37779356a1c3bffb8 --- /dev/null +++ b/datasets/dolma-v1.7-cc/histogram/elipsis_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:589e567ef2aa61a4b54d95bf5904892b78ff34aa4fa4fc2336210d3777cdb036 +size 8068 diff --git a/datasets/dolma-v1.7-cc/histogram/fasttext_en/metric.json b/datasets/dolma-v1.7-cc/histogram/fasttext_en/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..05d5cdac978594788ad52342794628588ae73a7b --- /dev/null +++ b/datasets/dolma-v1.7-cc/histogram/fasttext_en/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:696947c204633b8102dc17d0dd3e5aa5aacf8916e400e54bd2d339c2a9d8acb9 +size 2139 diff --git a/datasets/dolma-v1.7-cc/histogram/length/metric.json b/datasets/dolma-v1.7-cc/histogram/length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..d2ec5f2cc95e0f8f7f285a3750f3b3b3fcaf4983 --- /dev/null +++ b/datasets/dolma-v1.7-cc/histogram/length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5bc9c35ab476f542f2e4daf08da2bc6d793776ec1e05117688f88e8ada23d3f +size 2485462 diff --git a/datasets/dolma-v1.7-cc/histogram/line_char_duplicates/metric.json b/datasets/dolma-v1.7-cc/histogram/line_char_duplicates/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..a2f1fb911c7f6e12875f2009e9fb2988eb4f21b6 --- /dev/null +++ b/datasets/dolma-v1.7-cc/histogram/line_char_duplicates/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f2ce611d87aa1512eefa87bff263456256ce58f54b65a56ff0c877f9642776f +size 473 diff --git a/datasets/dolma-v1.7-cc/histogram/line_duplicates/metric.json b/datasets/dolma-v1.7-cc/histogram/line_duplicates/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..edf422dfd3c8a158813a732b4eca7603621682fd --- /dev/null +++ b/datasets/dolma-v1.7-cc/histogram/line_duplicates/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dcb13378369f51075a35c0a9f2603db639f81e3cf2e233f3bd8cbfebe9fb00d4 +size 13509 diff --git a/datasets/dolma-v1.7-cc/histogram/lines_ending_with_terminal_mark_ratio/metric.json b/datasets/dolma-v1.7-cc/histogram/lines_ending_with_terminal_mark_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..ab33e7690bf35a3fd09b8d24e0bcb7da87e9e8aa --- /dev/null +++ b/datasets/dolma-v1.7-cc/histogram/lines_ending_with_terminal_mark_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa9e30c23ecb2f9b98b4f1779dc25a50abb3fc1a83a1d4b304749b4589d08874 +size 32603 diff --git a/datasets/dolma-v1.7-cc/histogram/long_line_ratio_chars_10000/metric.json b/datasets/dolma-v1.7-cc/histogram/long_line_ratio_chars_10000/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..beb0c4767e4466aea3c434ae3fd24d11a4bd4169 --- /dev/null +++ b/datasets/dolma-v1.7-cc/histogram/long_line_ratio_chars_10000/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b30b264c94953b201c1e9ea3361dcffb998b00a197a1375aa4ab244950414fc9 +size 5518 diff --git a/datasets/dolma-v1.7-cc/histogram/long_line_ratio_chars_2000/metric.json b/datasets/dolma-v1.7-cc/histogram/long_line_ratio_chars_2000/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..0e794d67cb2910eb32e3789191bb37b027bd05d4 --- /dev/null +++ b/datasets/dolma-v1.7-cc/histogram/long_line_ratio_chars_2000/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61e7229b6c340f65d726f1a2f9bd2968be157808fd49e605e9bc52b0d2ab38da +size 26420 diff --git a/datasets/dolma-v1.7-cc/histogram/long_sentence_ratio_75/metric.json b/datasets/dolma-v1.7-cc/histogram/long_sentence_ratio_75/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..6697c914be6cad537672ff963326d4ec1287d100 --- /dev/null +++ b/datasets/dolma-v1.7-cc/histogram/long_sentence_ratio_75/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c2b98ca4b0702907fbdecd3f1ef5b340d3689f4d4b349ef123fcb4caf1d327c +size 41707 diff --git a/datasets/dolma-v1.7-cc/histogram/long_word_ratio_7/metric.json b/datasets/dolma-v1.7-cc/histogram/long_word_ratio_7/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..257b779c73eee805128baba4f46272eba392895f --- /dev/null +++ b/datasets/dolma-v1.7-cc/histogram/long_word_ratio_7/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8786490a8280c82e41a1f5bdbfe26f7e2c62809a188910f798343265c1693ad0 +size 28366 diff --git a/datasets/dolma-v1.7-cc/histogram/n_lines/metric.json b/datasets/dolma-v1.7-cc/histogram/n_lines/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..4030140b94525f5ecdb8a51e9022f4afd14b3457 --- /dev/null +++ b/datasets/dolma-v1.7-cc/histogram/n_lines/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19af8dd2d231839213e6759a5ef61f3692d40c3ff4dad04f559c33a9458408f4 +size 54599 diff --git a/datasets/dolma-v1.7-cc/histogram/n_sentences/metric.json b/datasets/dolma-v1.7-cc/histogram/n_sentences/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..b59db1c9189c2b0bd6113cfb9265fb43fb10deda --- /dev/null +++ b/datasets/dolma-v1.7-cc/histogram/n_sentences/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8d3aa8152e86b88458d094408053eec11d7555785f3c362a45c90977ded06c0 +size 109093 diff --git a/datasets/dolma-v1.7-cc/histogram/n_words/metric.json b/datasets/dolma-v1.7-cc/histogram/n_words/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..2deb27685ed41cd83477f8b406002001d29d7745 --- /dev/null +++ b/datasets/dolma-v1.7-cc/histogram/n_words/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5822eaa4df719eb0e521ebe29b6f8c2f1f2e283c4165a8a8cbf96218f1770001 +size 804403 diff --git a/datasets/dolma-v1.7-cc/histogram/non_alpha_digit_ratio/metric.json b/datasets/dolma-v1.7-cc/histogram/non_alpha_digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..7f28a288812e913cd14f586c657d47583bf2b48e --- /dev/null +++ b/datasets/dolma-v1.7-cc/histogram/non_alpha_digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cfcdd08ffaefd3736f646c245228baa376f9a257f4763f387f2462b2458d74ea +size 20795 diff --git a/datasets/dolma-v1.7-cc/histogram/punctuation_ratio/metric.json b/datasets/dolma-v1.7-cc/histogram/punctuation_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..700eb952c8fb10e18aceb755c1d6301a0eea4f06 --- /dev/null +++ b/datasets/dolma-v1.7-cc/histogram/punctuation_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19fe469b111c0e69bef1b7ff78e54faedb5cf620a5ce27525170fd1dc199bd4f +size 15396 diff --git a/datasets/dolma-v1.7-cc/histogram/short_line_ratio_chars_10/metric.json b/datasets/dolma-v1.7-cc/histogram/short_line_ratio_chars_10/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..b3af903a70d60bf97725a6eb360c7d77a0f3998f --- /dev/null +++ b/datasets/dolma-v1.7-cc/histogram/short_line_ratio_chars_10/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3be24b62bd0035388c3756407f81c50f72492a9ec52fc3b31c34fadb2b613ce8 +size 15643 diff --git a/datasets/dolma-v1.7-cc/histogram/short_line_ratio_chars_30/metric.json b/datasets/dolma-v1.7-cc/histogram/short_line_ratio_chars_30/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..0059527b3cc59dc6c84381f981afdc85d1833286 --- /dev/null +++ b/datasets/dolma-v1.7-cc/histogram/short_line_ratio_chars_30/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9d12be688511eb154ee4a9a0515b938c99b183eea406cacc0ad09429bebd189 +size 37402 diff --git a/datasets/dolma-v1.7-cc/histogram/short_sentence_ratio_20/metric.json b/datasets/dolma-v1.7-cc/histogram/short_sentence_ratio_20/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..8cb43f739460241b6022418fbdee673af4d7a8d9 --- /dev/null +++ b/datasets/dolma-v1.7-cc/histogram/short_sentence_ratio_20/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:522a5e4d85aef217c2acd992633771de94a02b2fe019d31145867c16103ec8c5 +size 35830 diff --git a/datasets/dolma-v1.7-cc/histogram/short_word_ratio_3/metric.json b/datasets/dolma-v1.7-cc/histogram/short_word_ratio_3/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..47ccc4dc730c108801a5f1707b37ca254b287794 --- /dev/null +++ b/datasets/dolma-v1.7-cc/histogram/short_word_ratio_3/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:163657ad3089139316adfb22bd7e4b9d24677df5cefb63ce83cdcb32bb76e449 +size 33947 diff --git a/datasets/dolma-v1.7-cc/histogram/stop_word_ratio/metric.json b/datasets/dolma-v1.7-cc/histogram/stop_word_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..b18cc09067de7433d0984438490f7f95263b116b --- /dev/null +++ b/datasets/dolma-v1.7-cc/histogram/stop_word_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:448749f7bff3b9b7c3820be17f54be6ce553c937142a9ac78c27ceb7e88cf7a0 +size 17125 diff --git a/datasets/dolma-v1.7-cc/histogram/type_token_ratio/metric.json b/datasets/dolma-v1.7-cc/histogram/type_token_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..a67d69bf023280b8048b926be5edd563ffb9b861 --- /dev/null +++ b/datasets/dolma-v1.7-cc/histogram/type_token_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0523a0880f686683efad2194a153c974f61f5da8c0ae80f52e53a70e767c93bb +size 39293 diff --git a/datasets/dolma-v1.7-cc/histogram/uppercase_ratio/metric.json b/datasets/dolma-v1.7-cc/histogram/uppercase_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..caff34db9d9a85204c843f7bc38e074cc14c8caf --- /dev/null +++ b/datasets/dolma-v1.7-cc/histogram/uppercase_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67058e1018d9f89b625823793401c557a93f5b8c9d95160849ea5e1238a7f3cb +size 34227 diff --git a/datasets/dolma-v1.7-cc/histogram/white_space_ratio/metric.json b/datasets/dolma-v1.7-cc/histogram/white_space_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..5b7c76b1090df7ee0d622cd7f5780f24bfa5c664 --- /dev/null +++ b/datasets/dolma-v1.7-cc/histogram/white_space_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1a6abf478604eb4fe972ceac29a1b72d04b556ca27347bf210f124bca779611 +size 15243 diff --git a/datasets/dolma-v1.7-cc/none/avg_line_length/metric.json b/datasets/dolma-v1.7-cc/none/avg_line_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..dabdb0c1a23c5619638aa1900922edae2c93ca9d --- /dev/null +++ b/datasets/dolma-v1.7-cc/none/avg_line_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ee4aa2e2960085e8460aecb814d24365374f61054d2a6b606782aba29cd24c6 +size 204 diff --git a/datasets/dolma-v1.7-cc/none/avg_word_length/metric.json b/datasets/dolma-v1.7-cc/none/avg_word_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..f5a05daf30b6323752a26f0e6026d60f5d3fc4b6 --- /dev/null +++ b/datasets/dolma-v1.7-cc/none/avg_word_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0864872e233d8e2d22c400efd4115e1a9d6b34b1e34d55d744db777b12dda2e4 +size 204 diff --git a/datasets/dolma-v1.7-cc/none/avg_words_per_line/metric.json b/datasets/dolma-v1.7-cc/none/avg_words_per_line/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..ee85ddee9437e25b3d236bf377b51fd6f551ea47 --- /dev/null +++ b/datasets/dolma-v1.7-cc/none/avg_words_per_line/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39788119e779a6231f5d3c11563fee201b7c5e4788b192d2c7324d5169061951 +size 205 diff --git a/datasets/dolma-v1.7-cc/none/digit_ratio/metric.json b/datasets/dolma-v1.7-cc/none/digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..d91a8730834d70b97737a9964e55a7693645f2d3 --- /dev/null +++ b/datasets/dolma-v1.7-cc/none/digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7ecae7ab485f7bdd11a11708e8bcd51543f50b1e2168e8dff234969e13c61a9 +size 209 diff --git a/datasets/dolma-v1.7-cc/none/length/metric.json b/datasets/dolma-v1.7-cc/none/length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..334f8ea7bd64666659d15dfd99f486447153a875 --- /dev/null +++ b/datasets/dolma-v1.7-cc/none/length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17b28fc703bbed815ef20656d19dd5399148a33675d8da159afef493e8ad9d1d +size 180 diff --git a/datasets/dolma-v1.7-cc/none/lines_ending_with_terminal_mark_ratio/metric.json b/datasets/dolma-v1.7-cc/none/lines_ending_with_terminal_mark_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..dd34a1fc6485f3dd1817d719d419a008f2b943bf --- /dev/null +++ b/datasets/dolma-v1.7-cc/none/lines_ending_with_terminal_mark_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:673456a7b9b1d46a4e632a5fd189944871fb6c53cc6fdda37ed1b5fa9af5419f +size 188 diff --git a/datasets/dolma-v1.7-cc/none/long_line_ratio_chars_10000/metric.json b/datasets/dolma-v1.7-cc/none/long_line_ratio_chars_10000/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..519628a619bbb06a55c8efccf39100530fdea42d --- /dev/null +++ b/datasets/dolma-v1.7-cc/none/long_line_ratio_chars_10000/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad0976364b4018345dd3c40b2247aa3778ece15a15a4168084366959f62377c5 +size 194 diff --git a/datasets/dolma-v1.7-cc/none/long_line_ratio_chars_2000/metric.json b/datasets/dolma-v1.7-cc/none/long_line_ratio_chars_2000/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..e26a02115cf1f0b7469ba8ffa65b4ebb8f981cc9 --- /dev/null +++ b/datasets/dolma-v1.7-cc/none/long_line_ratio_chars_2000/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c71656788c12c392c64046bb959a6d49b19024768f7cc5959f96bce49a70273b +size 194 diff --git a/datasets/dolma-v1.7-cc/none/long_word_ratio_7/metric.json b/datasets/dolma-v1.7-cc/none/long_word_ratio_7/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..d02bec71451d0a0e2da42f749d802cbee67c5f78 --- /dev/null +++ b/datasets/dolma-v1.7-cc/none/long_word_ratio_7/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2082c8d7d0129aed260e024d00581c8cdc1e28b7434ab8aba3f77903b79c9204 +size 207 diff --git a/datasets/dolma-v1.7-cc/none/n_lines/metric.json b/datasets/dolma-v1.7-cc/none/n_lines/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..a53f9a1dd2289c8672166e87247cd1d87567892c --- /dev/null +++ b/datasets/dolma-v1.7-cc/none/n_lines/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2184a3e2aa32e51d6b70478deb43aae1450be42bdfcfc31ccc4bba8ce850425d +size 176 diff --git a/datasets/dolma-v1.7-cc/none/n_words/metric.json b/datasets/dolma-v1.7-cc/none/n_words/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..e58be62d1945172da4eb7ea18f985634f96a102c --- /dev/null +++ b/datasets/dolma-v1.7-cc/none/n_words/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1e8b44818a61f8528bda44574663b2c72559d8b76df3ae82075f827308e3238 +size 180 diff --git a/datasets/dolma-v1.7-cc/none/non_alpha_digit_ratio/metric.json b/datasets/dolma-v1.7-cc/none/non_alpha_digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..bae417628c3be91395fe1ce6347471711940dbfa --- /dev/null +++ b/datasets/dolma-v1.7-cc/none/non_alpha_digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b992adb9acfad0d4a10801db490404b73badb6d70d81804da3a23a260e105292 +size 222 diff --git a/datasets/dolma-v1.7-cc/none/short_line_ratio_chars_10/metric.json b/datasets/dolma-v1.7-cc/none/short_line_ratio_chars_10/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..04b537eb5e24b9324ab70e275c138d23f7d28cbb --- /dev/null +++ b/datasets/dolma-v1.7-cc/none/short_line_ratio_chars_10/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89886f5fa7741a2d95f386d30c59dc135b8912decafe064a75c0781cf10e2130 +size 189 diff --git a/datasets/dolma-v1.7-cc/none/short_line_ratio_chars_30/metric.json b/datasets/dolma-v1.7-cc/none/short_line_ratio_chars_30/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..7cc8ecb1075b8020708b4bb23362b23db53bc4be --- /dev/null +++ b/datasets/dolma-v1.7-cc/none/short_line_ratio_chars_30/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b4bd4041bf82e41c0e554406af175ca172127273a9d977d73823c3e7cac5860 +size 192 diff --git a/datasets/dolma-v1.7-cc/none/short_word_ratio_3/metric.json b/datasets/dolma-v1.7-cc/none/short_word_ratio_3/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..f422a49f265b6ead4358700f37e0e079f936277b --- /dev/null +++ b/datasets/dolma-v1.7-cc/none/short_word_ratio_3/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f1804c3fb433426aa83845249454c51a5722fea8f29e8797f6e281dbb255b88 +size 193 diff --git a/datasets/dolma-v1.7-cc/none/white_space_ratio/metric.json b/datasets/dolma-v1.7-cc/none/white_space_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..cf8cfbdb63d4fbe2f0a20ac23ed9da8f68075052 --- /dev/null +++ b/datasets/dolma-v1.7-cc/none/white_space_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5b6146686c7fb088fef11e402b4fd08763dd5e75df947de77cba1424ffdcb42 +size 226 diff --git a/datasets/dolma-v1.7-cc/suffix/avg_line_length/metric.json b/datasets/dolma-v1.7-cc/suffix/avg_line_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..2c851a1acb3afe451148692412d879e91202d4cf --- /dev/null +++ b/datasets/dolma-v1.7-cc/suffix/avg_line_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d34c6e076eb8048f3896b31b9a416b458608ef9928c1bd270588ce99e8a0299 +size 365596 diff --git a/datasets/dolma-v1.7-cc/suffix/avg_sentence_length/metric.json b/datasets/dolma-v1.7-cc/suffix/avg_sentence_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..4fe0f00931b2bb12541cb0c7d902d3435383b4c0 --- /dev/null +++ b/datasets/dolma-v1.7-cc/suffix/avg_sentence_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b39f716710e9da7b44d5db36409a95891ce82e3221e0ad017ea27796932cbe5 +size 375960 diff --git a/datasets/dolma-v1.7-cc/suffix/avg_word_length/metric.json b/datasets/dolma-v1.7-cc/suffix/avg_word_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..aacfae5650731622207e4b1f8b482d971e010170 --- /dev/null +++ b/datasets/dolma-v1.7-cc/suffix/avg_word_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a91d87d93625a30d00caa5919fb9a98fad2bef40d00b622e8712290ef510b00 +size 402084 diff --git a/datasets/dolma-v1.7-cc/suffix/avg_words_per_line/metric.json b/datasets/dolma-v1.7-cc/suffix/avg_words_per_line/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..6e5cecc9e219e1738eae434f0f4a29f9a9bd6230 --- /dev/null +++ b/datasets/dolma-v1.7-cc/suffix/avg_words_per_line/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75b5dbb11a4ef748981e3477b7ae8fcc9dbf18219133192815d847ff74f5522f +size 360976 diff --git a/datasets/dolma-v1.7-cc/suffix/capitalized_ratio/metric.json b/datasets/dolma-v1.7-cc/suffix/capitalized_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..03b869aacce92e510dc783bf0b20bcf02b300493 --- /dev/null +++ b/datasets/dolma-v1.7-cc/suffix/capitalized_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:761f34f3e9773f3a335962eb9de0c7ab4a62ec6238b1a3494d95a5b2e7ba9977 +size 412403 diff --git a/datasets/dolma-v1.7-cc/suffix/ccnet_perplexity_wikipedia_en/metric.json b/datasets/dolma-v1.7-cc/suffix/ccnet_perplexity_wikipedia_en/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..1a7f309eb97890242dc4ddb25744e5ee31084052 --- /dev/null +++ b/datasets/dolma-v1.7-cc/suffix/ccnet_perplexity_wikipedia_en/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:216f6e711cdde74a0b30e639fa8ccf502a6b32b94e72123c28a2d18bf655a2d8 +size 340130 diff --git a/datasets/dolma-v1.7-cc/suffix/digit_ratio/metric.json b/datasets/dolma-v1.7-cc/suffix/digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..4be8a8591f5883c9c1c4ae75f07a9767c8179e23 --- /dev/null +++ b/datasets/dolma-v1.7-cc/suffix/digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:779bcd5b9053086e9318e53b99f22b14ce52ef03c79daf54d0ab8364bda1157d +size 396463 diff --git a/datasets/dolma-v1.7-cc/suffix/elipsis_ratio/metric.json b/datasets/dolma-v1.7-cc/suffix/elipsis_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..bbe33c7960e97d70a49dc01f05098804a985e477 --- /dev/null +++ b/datasets/dolma-v1.7-cc/suffix/elipsis_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db6e912e7a7b394b821c53457a54bf499acbefd3e0ba3efd55a024c6c3f687ca +size 295981 diff --git a/datasets/dolma-v1.7-cc/suffix/fasttext_en/metric.json b/datasets/dolma-v1.7-cc/suffix/fasttext_en/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..21bbfeb9c11407a864010014dc47f87b34d45d27 --- /dev/null +++ b/datasets/dolma-v1.7-cc/suffix/fasttext_en/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac228d20d8a4e627c3b0c90fd322e7241ade700284d6be5aa1d2f18790682e5c +size 343034 diff --git a/datasets/dolma-v1.7-cc/suffix/length/metric.json b/datasets/dolma-v1.7-cc/suffix/length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..867ea1b1325753336df4a1442421fd3fc8c8247a --- /dev/null +++ b/datasets/dolma-v1.7-cc/suffix/length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15e1031db386f0b9dde94a389e4731553c4058f294d3fa7eeae1f9fe5b46e4e6 +size 323450 diff --git a/datasets/dolma-v1.7-cc/suffix/line_char_duplicates/metric.json b/datasets/dolma-v1.7-cc/suffix/line_char_duplicates/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..fe24e6d9e7ced9c0c1ca1e1b7eb41630fef242e1 --- /dev/null +++ b/datasets/dolma-v1.7-cc/suffix/line_char_duplicates/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bbe2e894d54822f1e076835e8e22f8122ceb743606482e6e79189183c155701 +size 39351 diff --git a/datasets/dolma-v1.7-cc/suffix/line_duplicates/metric.json b/datasets/dolma-v1.7-cc/suffix/line_duplicates/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..859dff1d50807e817ba6a4f20f2548f39f21a945 --- /dev/null +++ b/datasets/dolma-v1.7-cc/suffix/line_duplicates/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab4bef45134a0bcaeb97c16ce6089e64e10fdaff6ab3ccc38973429d870abcee +size 171191 diff --git a/datasets/dolma-v1.7-cc/suffix/lines_ending_with_terminal_mark_ratio/metric.json b/datasets/dolma-v1.7-cc/suffix/lines_ending_with_terminal_mark_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..7e1e16ddb0b4a130c98441275fe3a8ae9273c17e --- /dev/null +++ b/datasets/dolma-v1.7-cc/suffix/lines_ending_with_terminal_mark_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c968feffa09a89613ce95c23880691670ca5cedd6c5e16b3013f6a8f675ff87 +size 344027 diff --git a/datasets/dolma-v1.7-cc/suffix/long_line_ratio_chars_10000/metric.json b/datasets/dolma-v1.7-cc/suffix/long_line_ratio_chars_10000/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..f773800bae975297c1cdae6b050a662a667613d6 --- /dev/null +++ b/datasets/dolma-v1.7-cc/suffix/long_line_ratio_chars_10000/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10c2e5a1b16248cc66dc94bdcba178d3a42267258056f44e4a9823024c7f1ab0 +size 90874 diff --git a/datasets/dolma-v1.7-cc/suffix/long_line_ratio_chars_2000/metric.json b/datasets/dolma-v1.7-cc/suffix/long_line_ratio_chars_2000/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..1abff126453d234ea61558c50e9234b4c53c8c15 --- /dev/null +++ b/datasets/dolma-v1.7-cc/suffix/long_line_ratio_chars_2000/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46ddeeb97b1a0f3e7b3f57feda3119d728d4c3f4617066d46b279957b622bb48 +size 169347 diff --git a/datasets/dolma-v1.7-cc/suffix/long_sentence_ratio_75/metric.json b/datasets/dolma-v1.7-cc/suffix/long_sentence_ratio_75/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..51ea746af85238bde00122bda986a9a18cd0a331 --- /dev/null +++ b/datasets/dolma-v1.7-cc/suffix/long_sentence_ratio_75/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb0cd51979cd979ba2a9942ddba6710c6d7ef4ba96216627ee4efe0230c3fbbf +size 364855 diff --git a/datasets/dolma-v1.7-cc/suffix/long_word_ratio_7/metric.json b/datasets/dolma-v1.7-cc/suffix/long_word_ratio_7/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..71a258903dfaccc34272a463b6b0886b3f22892c --- /dev/null +++ b/datasets/dolma-v1.7-cc/suffix/long_word_ratio_7/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0c1f05e66d44e3f4bc7b9e6a0123b79764b518a4b3313193b30c6eade1f1b52 +size 411723 diff --git a/datasets/dolma-v1.7-cc/suffix/n_lines/metric.json b/datasets/dolma-v1.7-cc/suffix/n_lines/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..991bd5df21677e41e9a5462ed89ed6d5530d5974 --- /dev/null +++ b/datasets/dolma-v1.7-cc/suffix/n_lines/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2522e3d3ca08165508780558ce54c7efddf018596b14f0007fa2dd7b37606162 +size 302902 diff --git a/datasets/dolma-v1.7-cc/suffix/n_sentences/metric.json b/datasets/dolma-v1.7-cc/suffix/n_sentences/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..97284a3e7ef692dedae09615f28b036035b6067f --- /dev/null +++ b/datasets/dolma-v1.7-cc/suffix/n_sentences/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0baa99ee4b5f04d125bd58961ce406383c093322e64bc9b97175021acf912e9 +size 307879 diff --git a/datasets/dolma-v1.7-cc/suffix/n_words/metric.json b/datasets/dolma-v1.7-cc/suffix/n_words/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..cabc029027cb84306a5393d856fb820e6b3871e2 --- /dev/null +++ b/datasets/dolma-v1.7-cc/suffix/n_words/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de6e7a3e2b7c36e8a4c16cf400bc241b9f2beee8ba101b9398b10a09c82a9488 +size 317626 diff --git a/datasets/dolma-v1.7-cc/suffix/non_alpha_digit_ratio/metric.json b/datasets/dolma-v1.7-cc/suffix/non_alpha_digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..28cc5437372966951bbe472c3f9f08312e5a774c --- /dev/null +++ b/datasets/dolma-v1.7-cc/suffix/non_alpha_digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8841f32af59bc6c1cf89abd10b1164a91d8dd206f91d8e83c7b20fe78c82557d +size 422846 diff --git a/datasets/dolma-v1.7-cc/suffix/punctuation_ratio/metric.json b/datasets/dolma-v1.7-cc/suffix/punctuation_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..3b399a78d72a0e048c211af8990e832009ecaa5b --- /dev/null +++ b/datasets/dolma-v1.7-cc/suffix/punctuation_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b19a6f79ff882c9b263cc606c6d4e1ebb5e32937bd7dbe724b54ee3f3886e865 +size 429600 diff --git a/datasets/dolma-v1.7-cc/suffix/short_line_ratio_chars_10/metric.json b/datasets/dolma-v1.7-cc/suffix/short_line_ratio_chars_10/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..d84299ead081fc2bf3985ff4b619264a3b83d970 --- /dev/null +++ b/datasets/dolma-v1.7-cc/suffix/short_line_ratio_chars_10/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:381fcf88bf0d0ad940a7d20f665f163e9c4354af61605b2d4207e8e9da5d6323 +size 335631 diff --git a/datasets/dolma-v1.7-cc/suffix/short_line_ratio_chars_30/metric.json b/datasets/dolma-v1.7-cc/suffix/short_line_ratio_chars_30/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..0d13d4d01919e9fb157679f64c6cbbfbe1fe383f --- /dev/null +++ b/datasets/dolma-v1.7-cc/suffix/short_line_ratio_chars_30/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:feda2a48acc8a88b5d854ede4424f96c421d0362f4eb7516db8ffe1f25554aa3 +size 340189 diff --git a/datasets/dolma-v1.7-cc/suffix/short_sentence_ratio_20/metric.json b/datasets/dolma-v1.7-cc/suffix/short_sentence_ratio_20/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..da6f7165bdb98dfa7a65b4cb61d4ff4a6ac398d5 --- /dev/null +++ b/datasets/dolma-v1.7-cc/suffix/short_sentence_ratio_20/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec6a29638903ba8b61cff7d52662c9b99ec6f0a63bc531eddac3a3e43a586593 +size 322343 diff --git a/datasets/dolma-v1.7-cc/suffix/short_word_ratio_3/metric.json b/datasets/dolma-v1.7-cc/suffix/short_word_ratio_3/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..0bd082889977985f80ac814c699fd6cde63cbfba --- /dev/null +++ b/datasets/dolma-v1.7-cc/suffix/short_word_ratio_3/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:432e0d5525dbc8beb73192cf7e91f798034089b482d455a88701cc7b9e431040 +size 413473 diff --git a/datasets/dolma-v1.7-cc/suffix/stop_word_ratio/metric.json b/datasets/dolma-v1.7-cc/suffix/stop_word_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..890166c81533c696ee3e28de5d4665df32f71a0a --- /dev/null +++ b/datasets/dolma-v1.7-cc/suffix/stop_word_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f07e842aa11fafbe05c2a23b1e3612beb43a129df3310025d3114c5f1aedcdeb +size 409316 diff --git a/datasets/dolma-v1.7-cc/suffix/type_token_ratio/metric.json b/datasets/dolma-v1.7-cc/suffix/type_token_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..63cffda9c357d208ca87b726f72f74b90a6c229e --- /dev/null +++ b/datasets/dolma-v1.7-cc/suffix/type_token_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf3485d98790baee7172ed6833b4dd4c074025ce3c73fbfb44f6d8084334ecf5 +size 404889 diff --git a/datasets/dolma-v1.7-cc/suffix/uppercase_ratio/metric.json b/datasets/dolma-v1.7-cc/suffix/uppercase_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..c0569f7f9002a7a87ee36e1174d02b744b728f79 --- /dev/null +++ b/datasets/dolma-v1.7-cc/suffix/uppercase_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0843672f2dc0175aff8822a8648de9ba321d35e9e6d74783d90fab84b4e40ff9 +size 423736 diff --git a/datasets/dolma-v1.7-cc/suffix/white_space_ratio/metric.json b/datasets/dolma-v1.7-cc/suffix/white_space_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..e31927591eb486d039ac706d3fdca6dff3c43250 --- /dev/null +++ b/datasets/dolma-v1.7-cc/suffix/white_space_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a95dc7b1191d82610fb7760f34d41b377295e690b8527abb23ef5201ea782830 +size 423582 diff --git a/datasets/dolma-v1.7-cc/summary/avg_line_length/metric.json b/datasets/dolma-v1.7-cc/summary/avg_line_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..dabdb0c1a23c5619638aa1900922edae2c93ca9d --- /dev/null +++ b/datasets/dolma-v1.7-cc/summary/avg_line_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ee4aa2e2960085e8460aecb814d24365374f61054d2a6b606782aba29cd24c6 +size 204 diff --git a/datasets/dolma-v1.7-cc/summary/avg_sentence_length/metric.json b/datasets/dolma-v1.7-cc/summary/avg_sentence_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..ce592c6a37088245a63fe81dadd4fa5aa210b484 --- /dev/null +++ b/datasets/dolma-v1.7-cc/summary/avg_sentence_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6e88935b4e2cf29e8de0c2668445ec6e1ffebe5c5ff074dc3386a22a93f0f47 +size 188 diff --git a/datasets/dolma-v1.7-cc/summary/avg_word_length/metric.json b/datasets/dolma-v1.7-cc/summary/avg_word_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..f5a05daf30b6323752a26f0e6026d60f5d3fc4b6 --- /dev/null +++ b/datasets/dolma-v1.7-cc/summary/avg_word_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0864872e233d8e2d22c400efd4115e1a9d6b34b1e34d55d744db777b12dda2e4 +size 204 diff --git a/datasets/dolma-v1.7-cc/summary/avg_words_per_line/metric.json b/datasets/dolma-v1.7-cc/summary/avg_words_per_line/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..ee85ddee9437e25b3d236bf377b51fd6f551ea47 --- /dev/null +++ b/datasets/dolma-v1.7-cc/summary/avg_words_per_line/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39788119e779a6231f5d3c11563fee201b7c5e4788b192d2c7324d5169061951 +size 205 diff --git a/datasets/dolma-v1.7-cc/summary/capitalized_ratio/metric.json b/datasets/dolma-v1.7-cc/summary/capitalized_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..9821b376e4a11bec2de23ad33e65153f126882b8 --- /dev/null +++ b/datasets/dolma-v1.7-cc/summary/capitalized_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da7c055f4104feeea8d2b75e244e9032600b0e3f5898655ffc537d9c3ac529ce +size 206 diff --git a/datasets/dolma-v1.7-cc/summary/ccnet_perplexity_wikipedia_en/metric.json b/datasets/dolma-v1.7-cc/summary/ccnet_perplexity_wikipedia_en/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..6ff57c0e9a7f2b680306ac7cdd55df19b08ce176 --- /dev/null +++ b/datasets/dolma-v1.7-cc/summary/ccnet_perplexity_wikipedia_en/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a480be34b447a40cf5200fae5acc5510d66f253e6ba9fb8950d4e58453cc8315 +size 189 diff --git a/datasets/dolma-v1.7-cc/summary/digit_ratio/metric.json b/datasets/dolma-v1.7-cc/summary/digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..d91a8730834d70b97737a9964e55a7693645f2d3 --- /dev/null +++ b/datasets/dolma-v1.7-cc/summary/digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7ecae7ab485f7bdd11a11708e8bcd51543f50b1e2168e8dff234969e13c61a9 +size 209 diff --git a/datasets/dolma-v1.7-cc/summary/elipsis_ratio/metric.json b/datasets/dolma-v1.7-cc/summary/elipsis_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..eec6834a4233cdf25d72fd727ff643c7bb482ca2 --- /dev/null +++ b/datasets/dolma-v1.7-cc/summary/elipsis_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7789d20c3437b36de102d7842a7db7304445466bb7c21c84bc639b1458f550c0 +size 210 diff --git a/datasets/dolma-v1.7-cc/summary/fasttext_en/metric.json b/datasets/dolma-v1.7-cc/summary/fasttext_en/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..7c43c110975460e9c9ebb7afd64cf41804b29c17 --- /dev/null +++ b/datasets/dolma-v1.7-cc/summary/fasttext_en/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cec01c61bff3c3478b4b3e60c226fa0117eef1c677007456f71fd068b604aab6 +size 191 diff --git a/datasets/dolma-v1.7-cc/summary/length/metric.json b/datasets/dolma-v1.7-cc/summary/length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..334f8ea7bd64666659d15dfd99f486447153a875 --- /dev/null +++ b/datasets/dolma-v1.7-cc/summary/length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17b28fc703bbed815ef20656d19dd5399148a33675d8da159afef493e8ad9d1d +size 180 diff --git a/datasets/dolma-v1.7-cc/summary/line_char_duplicates/metric.json b/datasets/dolma-v1.7-cc/summary/line_char_duplicates/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..e01f5c87fafd6343cea30d845de36060a09d156c --- /dev/null +++ b/datasets/dolma-v1.7-cc/summary/line_char_duplicates/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc887090e961a2a878959fe3179d15e0c6b2c7cdb385d25be2433e15224deae1 +size 214 diff --git a/datasets/dolma-v1.7-cc/summary/line_duplicates/metric.json b/datasets/dolma-v1.7-cc/summary/line_duplicates/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..5214b933c8771726bd90f74731b3464ba79589bb --- /dev/null +++ b/datasets/dolma-v1.7-cc/summary/line_duplicates/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2d8c36674c1bc82dcc89ddee0d86e7df1e5171febc0bb8671d08bd8c837b5e0 +size 208 diff --git a/datasets/dolma-v1.7-cc/summary/lines_ending_with_terminal_mark_ratio/metric.json b/datasets/dolma-v1.7-cc/summary/lines_ending_with_terminal_mark_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..dd34a1fc6485f3dd1817d719d419a008f2b943bf --- /dev/null +++ b/datasets/dolma-v1.7-cc/summary/lines_ending_with_terminal_mark_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:673456a7b9b1d46a4e632a5fd189944871fb6c53cc6fdda37ed1b5fa9af5419f +size 188 diff --git a/datasets/dolma-v1.7-cc/summary/long_line_ratio_chars_10000/metric.json b/datasets/dolma-v1.7-cc/summary/long_line_ratio_chars_10000/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..519628a619bbb06a55c8efccf39100530fdea42d --- /dev/null +++ b/datasets/dolma-v1.7-cc/summary/long_line_ratio_chars_10000/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad0976364b4018345dd3c40b2247aa3778ece15a15a4168084366959f62377c5 +size 194 diff --git a/datasets/dolma-v1.7-cc/summary/long_line_ratio_chars_2000/metric.json b/datasets/dolma-v1.7-cc/summary/long_line_ratio_chars_2000/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..e26a02115cf1f0b7469ba8ffa65b4ebb8f981cc9 --- /dev/null +++ b/datasets/dolma-v1.7-cc/summary/long_line_ratio_chars_2000/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c71656788c12c392c64046bb959a6d49b19024768f7cc5959f96bce49a70273b +size 194 diff --git a/datasets/dolma-v1.7-cc/summary/long_sentence_ratio_75/metric.json b/datasets/dolma-v1.7-cc/summary/long_sentence_ratio_75/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..a65abf62d0db4b0b4091bc9e9ddfb9cf1612969a --- /dev/null +++ b/datasets/dolma-v1.7-cc/summary/long_sentence_ratio_75/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c518cd5ecf7e5e49517f1f808c33d6fd2d36bdbc0ad974405df36a5bb02777f +size 187 diff --git a/datasets/dolma-v1.7-cc/summary/long_word_ratio_7/metric.json b/datasets/dolma-v1.7-cc/summary/long_word_ratio_7/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..d02bec71451d0a0e2da42f749d802cbee67c5f78 --- /dev/null +++ b/datasets/dolma-v1.7-cc/summary/long_word_ratio_7/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2082c8d7d0129aed260e024d00581c8cdc1e28b7434ab8aba3f77903b79c9204 +size 207 diff --git a/datasets/dolma-v1.7-cc/summary/n_lines/metric.json b/datasets/dolma-v1.7-cc/summary/n_lines/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..a53f9a1dd2289c8672166e87247cd1d87567892c --- /dev/null +++ b/datasets/dolma-v1.7-cc/summary/n_lines/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2184a3e2aa32e51d6b70478deb43aae1450be42bdfcfc31ccc4bba8ce850425d +size 176 diff --git a/datasets/dolma-v1.7-cc/summary/n_sentences/metric.json b/datasets/dolma-v1.7-cc/summary/n_sentences/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..88f137d7c536c6f38a871ea41feb4186c734b0f2 --- /dev/null +++ b/datasets/dolma-v1.7-cc/summary/n_sentences/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b822d10dcb23dfb45ee315a39c27e9ad49012281fafaa97cde6d7e3c8481ace +size 173 diff --git a/datasets/dolma-v1.7-cc/summary/n_words/metric.json b/datasets/dolma-v1.7-cc/summary/n_words/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..e58be62d1945172da4eb7ea18f985634f96a102c --- /dev/null +++ b/datasets/dolma-v1.7-cc/summary/n_words/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1e8b44818a61f8528bda44574663b2c72559d8b76df3ae82075f827308e3238 +size 180 diff --git a/datasets/dolma-v1.7-cc/summary/non_alpha_digit_ratio/metric.json b/datasets/dolma-v1.7-cc/summary/non_alpha_digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..bae417628c3be91395fe1ce6347471711940dbfa --- /dev/null +++ b/datasets/dolma-v1.7-cc/summary/non_alpha_digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b992adb9acfad0d4a10801db490404b73badb6d70d81804da3a23a260e105292 +size 222 diff --git a/datasets/dolma-v1.7-cc/summary/punctuation_ratio/metric.json b/datasets/dolma-v1.7-cc/summary/punctuation_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..22cec1c2c92b1e8a70b3c13bec5e1923af03220a --- /dev/null +++ b/datasets/dolma-v1.7-cc/summary/punctuation_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e22cfb2a13c6775510643654b4564513beee3b3dc63f56a23e113f69227e985 +size 228 diff --git a/datasets/dolma-v1.7-cc/summary/short_line_ratio_chars_10/metric.json b/datasets/dolma-v1.7-cc/summary/short_line_ratio_chars_10/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..04b537eb5e24b9324ab70e275c138d23f7d28cbb --- /dev/null +++ b/datasets/dolma-v1.7-cc/summary/short_line_ratio_chars_10/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89886f5fa7741a2d95f386d30c59dc135b8912decafe064a75c0781cf10e2130 +size 189 diff --git a/datasets/dolma-v1.7-cc/summary/short_line_ratio_chars_30/metric.json b/datasets/dolma-v1.7-cc/summary/short_line_ratio_chars_30/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..7cc8ecb1075b8020708b4bb23362b23db53bc4be --- /dev/null +++ b/datasets/dolma-v1.7-cc/summary/short_line_ratio_chars_30/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b4bd4041bf82e41c0e554406af175ca172127273a9d977d73823c3e7cac5860 +size 192 diff --git a/datasets/dolma-v1.7-cc/summary/short_sentence_ratio_20/metric.json b/datasets/dolma-v1.7-cc/summary/short_sentence_ratio_20/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..2218ca5112a3d294a540c2b89cb2fc19e35e9418 --- /dev/null +++ b/datasets/dolma-v1.7-cc/summary/short_sentence_ratio_20/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2288b211bda77518ed49079628dd1709a53ffa81b3c642f83e957dfe060fd645 +size 191 diff --git a/datasets/dolma-v1.7-cc/summary/short_word_ratio_3/metric.json b/datasets/dolma-v1.7-cc/summary/short_word_ratio_3/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..f422a49f265b6ead4358700f37e0e079f936277b --- /dev/null +++ b/datasets/dolma-v1.7-cc/summary/short_word_ratio_3/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f1804c3fb433426aa83845249454c51a5722fea8f29e8797f6e281dbb255b88 +size 193 diff --git a/datasets/dolma-v1.7-cc/summary/stop_word_ratio/metric.json b/datasets/dolma-v1.7-cc/summary/stop_word_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..58f92c755e9c8042064358c4d09cae79eeeabbd4 --- /dev/null +++ b/datasets/dolma-v1.7-cc/summary/stop_word_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74bd0f43d6d20c136901ff7aea88ce420705611384bdd3dca697ad7c09775648 +size 207 diff --git a/datasets/dolma-v1.7-cc/summary/type_token_ratio/metric.json b/datasets/dolma-v1.7-cc/summary/type_token_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..8b006f48ca3d668b8ea9d9a48ebde237a84dfabe --- /dev/null +++ b/datasets/dolma-v1.7-cc/summary/type_token_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:706c5083f6de90b735f9fa5d7fbcacafa33eea5aaa50cb132656b01e7a1d8611 +size 205 diff --git a/datasets/dolma-v1.7-cc/summary/uppercase_ratio/metric.json b/datasets/dolma-v1.7-cc/summary/uppercase_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..ef46ad635b740929769ed110b7ce7efe4377fab9 --- /dev/null +++ b/datasets/dolma-v1.7-cc/summary/uppercase_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1943d89e83eb34707aaaf9ba25a96d5cff77631d2a73a5719e3bafffda2aa23d +size 207 diff --git a/datasets/dolma-v1.7-cc/summary/white_space_ratio/metric.json b/datasets/dolma-v1.7-cc/summary/white_space_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..cf8cfbdb63d4fbe2f0a20ac23ed9da8f68075052 --- /dev/null +++ b/datasets/dolma-v1.7-cc/summary/white_space_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5b6146686c7fb088fef11e402b4fd08763dd5e75df947de77cba1424ffdcb42 +size 226 diff --git a/datasets/fineweb/fqdn/avg_line_length/metric.json b/datasets/fineweb/fqdn/avg_line_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..87c95db8ffd8661315f1630a2d33b99bf241436d --- /dev/null +++ b/datasets/fineweb/fqdn/avg_line_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:417630e8171afe58da5a233d5a4036b9f83d96e9a776f6f87627f3f188d333fe +size 1658511 diff --git a/datasets/fineweb/fqdn/avg_sentence_length/metric.json b/datasets/fineweb/fqdn/avg_sentence_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..99ce718b5d55eb33827b92fa8243cf16c35deed9 --- /dev/null +++ b/datasets/fineweb/fqdn/avg_sentence_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e66429f86816ba95ab55c0d005d4ee06b4d91761050ec3d4d69e41a5cafc18d +size 1666726 diff --git a/datasets/fineweb/fqdn/avg_word_length/metric.json b/datasets/fineweb/fqdn/avg_word_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..fe7e555155f6a5367c4f8ca550ab45d2a75e68f1 --- /dev/null +++ b/datasets/fineweb/fqdn/avg_word_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e393cc52d2135dc13fb176bd74bb947817236ef203052a1a2fe0d4f32846803 +size 1768057 diff --git a/datasets/fineweb/fqdn/avg_words_per_line/metric.json b/datasets/fineweb/fqdn/avg_words_per_line/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..7bfd30cc635faff91a36e444f18ff636cfac1c88 --- /dev/null +++ b/datasets/fineweb/fqdn/avg_words_per_line/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4fbdd99514f4bb8a00e07b98214503fd20734e3f9707ea44f4b9b100236ff2bf +size 1646080 diff --git a/datasets/fineweb/fqdn/capitalized_ratio/metric.json b/datasets/fineweb/fqdn/capitalized_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..787fb3b17b3276549b793948a79445cf9787237b --- /dev/null +++ b/datasets/fineweb/fqdn/capitalized_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46021c90c6fe3d1bd8643ae0d8695f27d91add5555e9017ff8d34583dd16786f +size 1808247 diff --git a/datasets/fineweb/fqdn/ccnet_perplexity_wikipedia_en/metric.json b/datasets/fineweb/fqdn/ccnet_perplexity_wikipedia_en/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..76281954932e7ab2d67e177ad1dd2951ae25531b --- /dev/null +++ b/datasets/fineweb/fqdn/ccnet_perplexity_wikipedia_en/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ead8e8247b4aff6c9d0eaddaf273491544ad22d66111aaa49c53f7ec43e965e4 +size 1567464 diff --git a/datasets/fineweb/fqdn/digit_ratio/metric.json b/datasets/fineweb/fqdn/digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..d89b458b2e5833366a2074daf3ac17af842e2e99 --- /dev/null +++ b/datasets/fineweb/fqdn/digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4baf97a0f763f4f9a3b9f31cf7d98533e99c933641ab45883f524f8d7b67a6cd +size 1735014 diff --git a/datasets/fineweb/fqdn/elipsis_ratio/metric.json b/datasets/fineweb/fqdn/elipsis_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..08807292ffabf341578e75bdc30c1c9cffae77dc --- /dev/null +++ b/datasets/fineweb/fqdn/elipsis_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d020e654aca44d3de33a5c43847df5a0adbbaa3b82f15de490cd4e53ba9b41b +size 1746817 diff --git a/datasets/fineweb/fqdn/fasttext_en/metric.json b/datasets/fineweb/fqdn/fasttext_en/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..b826739e67221cba3adec3449ecce0c22e360bac --- /dev/null +++ b/datasets/fineweb/fqdn/fasttext_en/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f117ee9ebf3131404007b7a896e887ae971993cbd77accf679ea67e3ea9ccfd +size 1820525 diff --git a/datasets/fineweb/fqdn/length/metric.json b/datasets/fineweb/fqdn/length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..29b89d08ac651d5e1d3d8b43236d4e6ac01e5611 --- /dev/null +++ b/datasets/fineweb/fqdn/length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:786673894d3038904ee2dc48a7df60c4d88f33256d08db265630975cdf3c52c8 +size 1472082 diff --git a/datasets/fineweb/fqdn/line_char_duplicates/metric.json b/datasets/fineweb/fqdn/line_char_duplicates/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..b07c076cded6849a6347235d9ec1d270b204090a --- /dev/null +++ b/datasets/fineweb/fqdn/line_char_duplicates/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:278367d58bb24399df892f73ed757c526fc1211f9cc70af0e0afb2d00ddb548c +size 1704265 diff --git a/datasets/fineweb/fqdn/line_duplicates/metric.json b/datasets/fineweb/fqdn/line_duplicates/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..86012707aa7565c6052e6ee2f4ffb1a19a680070 --- /dev/null +++ b/datasets/fineweb/fqdn/line_duplicates/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b243552dca7b85dda7138bf9037b22ee687311c2adb375c76b85d03e8da4cf9f +size 1591658 diff --git a/datasets/fineweb/fqdn/lines_ending_with_terminal_mark_ratio/metric.json b/datasets/fineweb/fqdn/lines_ending_with_terminal_mark_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..94643e00eb41bd4d0275dd7eedc0a410c3afb43c --- /dev/null +++ b/datasets/fineweb/fqdn/lines_ending_with_terminal_mark_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ebcecfc28d9f19a391ff2d9205b7449188d408ba2649bd5a229585baa8af57a9 +size 1645923 diff --git a/datasets/fineweb/fqdn/long_line_ratio_chars_10000/metric.json b/datasets/fineweb/fqdn/long_line_ratio_chars_10000/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..2e8bced1c9fd4795b468d013203ecf690779fbd6 --- /dev/null +++ b/datasets/fineweb/fqdn/long_line_ratio_chars_10000/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90200d65416813d646c6f30b21c727b6a227273e182d83e5642609d9a4a5687d +size 363948 diff --git a/datasets/fineweb/fqdn/long_line_ratio_chars_2000/metric.json b/datasets/fineweb/fqdn/long_line_ratio_chars_2000/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..606b6daba14d20780a9c59e621341f9d7c148259 --- /dev/null +++ b/datasets/fineweb/fqdn/long_line_ratio_chars_2000/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1ef5ba7a678081158d7a8c09d6b3fd2cd1cec28999ae9ccca930a4b81dfc273 +size 1275566 diff --git a/datasets/fineweb/fqdn/long_sentence_ratio_75/metric.json b/datasets/fineweb/fqdn/long_sentence_ratio_75/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..34a1bc5fcac437cce2b60b225a6b9857127a79b1 --- /dev/null +++ b/datasets/fineweb/fqdn/long_sentence_ratio_75/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bae3661af2b4ecba6bfc93b2400d530b46f4789793a34cc58b36ae50208436ef +size 1630201 diff --git a/datasets/fineweb/fqdn/long_word_ratio_7/metric.json b/datasets/fineweb/fqdn/long_word_ratio_7/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..3d05a2f6581acda136f0c232cfa4fff3b139999a --- /dev/null +++ b/datasets/fineweb/fqdn/long_word_ratio_7/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2bc8d8168c95ea4204b306496440a762c6ced84b1fb0f373e17b632dd9ae5d9e +size 1822666 diff --git a/datasets/fineweb/fqdn/n_lines/metric.json b/datasets/fineweb/fqdn/n_lines/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..e570f5b569b94af72706d41d77262819044ca9b5 --- /dev/null +++ b/datasets/fineweb/fqdn/n_lines/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba3cffaa668d344d36684ad4ae50b9ab0453127e386ae863075828f52b09a220 +size 1420064 diff --git a/datasets/fineweb/fqdn/n_sentences/metric.json b/datasets/fineweb/fqdn/n_sentences/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..9167349248d6df044fad76ec92c5c41633f46250 --- /dev/null +++ b/datasets/fineweb/fqdn/n_sentences/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:969c316c175d554d75f23ecac925ce3536955b0ee0b761ff53d9283765a31a4f +size 1423561 diff --git a/datasets/fineweb/fqdn/n_words/metric.json b/datasets/fineweb/fqdn/n_words/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..a8cdf67435537aa91aca11f5362037b5b4d5f781 --- /dev/null +++ b/datasets/fineweb/fqdn/n_words/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1f42df041326637fc6cf85d613f17d5c0e24813f1f9f1985c2129892e34c997 +size 1449787 diff --git a/datasets/fineweb/fqdn/non_alpha_digit_ratio/metric.json b/datasets/fineweb/fqdn/non_alpha_digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..cd206b49b075b9e0dd908e3b7f9114518f9ef83d --- /dev/null +++ b/datasets/fineweb/fqdn/non_alpha_digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4964dccdf8b2fe653ad5a61f40ee74a7feefaf508c1f9c07f7b6b16a00bd518 +size 1847644 diff --git a/datasets/fineweb/fqdn/punctuation_ratio/metric.json b/datasets/fineweb/fqdn/punctuation_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..bae45e00dfa90c9bada0bb062385205699cd9d88 --- /dev/null +++ b/datasets/fineweb/fqdn/punctuation_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1cad7bd2614f0ccf5421f696221e514bfcf0b430c73a3fd6fb3dc47967721e16 +size 1861442 diff --git a/datasets/fineweb/fqdn/short_line_ratio_chars_10/metric.json b/datasets/fineweb/fqdn/short_line_ratio_chars_10/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..623820fbf6f5006c44ca7da61bf432406c48d37c --- /dev/null +++ b/datasets/fineweb/fqdn/short_line_ratio_chars_10/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e98416867e51daa20969f9f0b7cef6bc35e9907e109da8347fa78a2b372a4df +size 1606305 diff --git a/datasets/fineweb/fqdn/short_line_ratio_chars_30/metric.json b/datasets/fineweb/fqdn/short_line_ratio_chars_30/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..49d5cb2c57c97fd7fbd6e3ba2827a227bbbbd0d7 --- /dev/null +++ b/datasets/fineweb/fqdn/short_line_ratio_chars_30/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34b4597207cb1ae412cfe36ff993626d386f82cd8c16a32cf4c2aabf4aa2fd69 +size 1683798 diff --git a/datasets/fineweb/fqdn/short_sentence_ratio_20/metric.json b/datasets/fineweb/fqdn/short_sentence_ratio_20/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..6f4feb508b2668ed888ac2f2f43f4c6f4f73d9ae --- /dev/null +++ b/datasets/fineweb/fqdn/short_sentence_ratio_20/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1caef8078234f739763651e2d7adcd72a0ca8fa7b17f50b8eaac5e0b3f1361f +size 1658015 diff --git a/datasets/fineweb/fqdn/short_word_ratio_3/metric.json b/datasets/fineweb/fqdn/short_word_ratio_3/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..812f87151dc1dc4485fe9e909e997702ac4c0a76 --- /dev/null +++ b/datasets/fineweb/fqdn/short_word_ratio_3/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cda1d701aeef482babacb168efc9163ce0af3a04de6c59f4f5c6bbee161d86c3 +size 1811992 diff --git a/datasets/fineweb/fqdn/stop_word_ratio/metric.json b/datasets/fineweb/fqdn/stop_word_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..99466c6ba06a1bf3c6bf8b89be4073b131f34ce3 --- /dev/null +++ b/datasets/fineweb/fqdn/stop_word_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c69e34ac1827b8dd6a79fdb091410998f202e275d632c3bdbd6dd883b2380673 +size 1836309 diff --git a/datasets/fineweb/fqdn/type_token_ratio/metric.json b/datasets/fineweb/fqdn/type_token_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..bc633a5d83a3d02011e838e88b1d959b4df7cb71 --- /dev/null +++ b/datasets/fineweb/fqdn/type_token_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bae34fd33323fe3cfa7da97c735722264c44b2fc4c325191e6f2a3613d08f71b +size 1804757 diff --git a/datasets/fineweb/fqdn/uppercase_ratio/metric.json b/datasets/fineweb/fqdn/uppercase_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..d11162d4701b05f026ccc539d9443986046b972b --- /dev/null +++ b/datasets/fineweb/fqdn/uppercase_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d09e28bf7f64b37b49939af47658e8b6db68e0f2e6e01344fab7655b84a10988 +size 1840861 diff --git a/datasets/fineweb/fqdn/white_space_ratio/metric.json b/datasets/fineweb/fqdn/white_space_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..d03f1cd5265cb115207973514d30fb94bb0f8f12 --- /dev/null +++ b/datasets/fineweb/fqdn/white_space_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ecadd98566bc1e50cd156a28161b5318a3e36a48e259838eada69e4e58c26ee +size 1845176 diff --git a/datasets/fineweb/histogram/avg_line_length/metric.json b/datasets/fineweb/histogram/avg_line_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..b59d9ab5d8309152b54ca1579ba17bc5b3a42ba5 --- /dev/null +++ b/datasets/fineweb/histogram/avg_line_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6965f16c0e9c5db5b723d8611e85018c22f3531fcab7ffc79de9f878e9cd8302 +size 31533770 diff --git a/datasets/fineweb/histogram/avg_sentence_length/metric.json b/datasets/fineweb/histogram/avg_sentence_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..dc7a13580155b703021baff488b147b537f3b51a --- /dev/null +++ b/datasets/fineweb/histogram/avg_sentence_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7999b5fab9e95f7107d954f05ab00102d7ba979133e15f2dc10101a17c903be +size 13433418 diff --git a/datasets/fineweb/histogram/avg_word_length/metric.json b/datasets/fineweb/histogram/avg_word_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..3607eea69a4313cd4351f683b7cbb50a1012ab9b --- /dev/null +++ b/datasets/fineweb/histogram/avg_word_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b7091e4090b59b64b72ba83d9554444d3f052c726f5e16e7c6c954f9f39cd6b +size 270845 diff --git a/datasets/fineweb/histogram/avg_words_per_line/metric.json b/datasets/fineweb/histogram/avg_words_per_line/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..a7657d99d9f2dfa651cca8c53301d3cd66b94572 --- /dev/null +++ b/datasets/fineweb/histogram/avg_words_per_line/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1abf5ae993df74b00814e7c05280bed31f2863c6b4a7935600bac7686d44bf22 +size 8844815 diff --git a/datasets/fineweb/histogram/capitalized_ratio/metric.json b/datasets/fineweb/histogram/capitalized_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..c82dd3bf1f87484cc644bc3c8175dda2f304ddf3 --- /dev/null +++ b/datasets/fineweb/histogram/capitalized_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd3af87ed61cd63a72fc180111a6af7ac793343e3a234c1e23c8d6f136cb1f45 +size 39809 diff --git a/datasets/fineweb/histogram/ccnet_perplexity_wikipedia_en/metric.json b/datasets/fineweb/histogram/ccnet_perplexity_wikipedia_en/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..790df3a054fb212804203ce1eb55aeb72de3c40b --- /dev/null +++ b/datasets/fineweb/histogram/ccnet_perplexity_wikipedia_en/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eca0951bc72e1318aea1072b5cbd4cff6b340a3205653c8b553572a3a485a065 +size 3309407 diff --git a/datasets/fineweb/histogram/digit_ratio/metric.json b/datasets/fineweb/histogram/digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..3fe8dd702d1be95621a1fc60293dce1f668e3e59 --- /dev/null +++ b/datasets/fineweb/histogram/digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80440a4a6262eb8447c59463a1f8be507b3f78f26fd104e04b23e7ef78d26687 +size 16611 diff --git a/datasets/fineweb/histogram/elipsis_ratio/metric.json b/datasets/fineweb/histogram/elipsis_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..6669d69d5652bbe718685152c5cea0ed759f9f18 --- /dev/null +++ b/datasets/fineweb/histogram/elipsis_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46cc366db1d805b0d7b23dddebe602317d8ffa02effe7ed8449d08c6a3255642 +size 3500 diff --git a/datasets/fineweb/histogram/fasttext_en/metric.json b/datasets/fineweb/histogram/fasttext_en/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..f1084372d712ca13e4309569043f9ef91a642b0b --- /dev/null +++ b/datasets/fineweb/histogram/fasttext_en/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7da721694e49d9892c153cb118551cae1ab98b771b6c8aea4ede1014f0ec0af2 +size 15261 diff --git a/datasets/fineweb/histogram/length/metric.json b/datasets/fineweb/histogram/length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..d56019facab2580313995eb8850fc8b95c5e628d --- /dev/null +++ b/datasets/fineweb/histogram/length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7594566127671cd9d42b8fed3ca765a69e139cd11804a03b058fb243037e6e7b +size 5260684 diff --git a/datasets/fineweb/histogram/line_char_duplicates/metric.json b/datasets/fineweb/histogram/line_char_duplicates/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..310936874321bd33b238cb09df03cd14510bd729 --- /dev/null +++ b/datasets/fineweb/histogram/line_char_duplicates/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc5473a5115a7c00296d2f494b1c9bc75fd642b9d8d0dbef1aa0dc38230d8a74 +size 7218 diff --git a/datasets/fineweb/histogram/line_duplicates/metric.json b/datasets/fineweb/histogram/line_duplicates/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..02f314688fe35bba9b4c963db2a9e657736f0b9f --- /dev/null +++ b/datasets/fineweb/histogram/line_duplicates/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d840141cfb177c7bbb03d29afe12a00fa0389e8931e40a56db01f09219acce09 +size 14153 diff --git a/datasets/fineweb/histogram/lines_ending_with_terminal_mark_ratio/metric.json b/datasets/fineweb/histogram/lines_ending_with_terminal_mark_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..aa34a77aadb843962000a999fc807b138caa837f --- /dev/null +++ b/datasets/fineweb/histogram/lines_ending_with_terminal_mark_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93541659e85458208c25b5d9b4f4807e86cba344db2084edfb4b1a25ba49d4db +size 37545 diff --git a/datasets/fineweb/histogram/long_line_ratio_chars_10000/metric.json b/datasets/fineweb/histogram/long_line_ratio_chars_10000/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..a6fedeec841f69a549c08278a1649560cfded124 --- /dev/null +++ b/datasets/fineweb/histogram/long_line_ratio_chars_10000/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47a66eb44e69a103b3136d7a43b4f42dde739db9aee2cadd2660311f297feb3a +size 12830 diff --git a/datasets/fineweb/histogram/long_line_ratio_chars_2000/metric.json b/datasets/fineweb/histogram/long_line_ratio_chars_2000/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..d5ecd8b8519f1fd154b08dc9598a17022541f1cc --- /dev/null +++ b/datasets/fineweb/histogram/long_line_ratio_chars_2000/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6851a082bb5110b242455816efea93d7fe2948086b7bdde6116935a77870a20 +size 33157 diff --git a/datasets/fineweb/histogram/long_sentence_ratio_75/metric.json b/datasets/fineweb/histogram/long_sentence_ratio_75/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..df788b26da1e6f5ca56e87478db10d249b99a6dc --- /dev/null +++ b/datasets/fineweb/histogram/long_sentence_ratio_75/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb23bd3d173a735cf3afb839c863e93bd1e126414f762e6feff1de2905adc1d3 +size 42633 diff --git a/datasets/fineweb/histogram/long_word_ratio_7/metric.json b/datasets/fineweb/histogram/long_word_ratio_7/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..ec5405ed991671b6d7eb3c6df32854a479e788b9 --- /dev/null +++ b/datasets/fineweb/histogram/long_word_ratio_7/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2500d1cd5b19a34311ebb64db2f4e7369a376532e157465568376505c2cb56a5 +size 30231 diff --git a/datasets/fineweb/histogram/n_lines/metric.json b/datasets/fineweb/histogram/n_lines/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..7c2b21c426b317965d91895de399876072d5a090 --- /dev/null +++ b/datasets/fineweb/histogram/n_lines/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6b3df3998a7f7229268f1098cf2cd37347d3251fe613d757482ed1943c8fb13 +size 147899 diff --git a/datasets/fineweb/histogram/n_sentences/metric.json b/datasets/fineweb/histogram/n_sentences/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..28c3c735cab88a3101aca6021d62b2b8d9d0655e --- /dev/null +++ b/datasets/fineweb/histogram/n_sentences/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe8fe83aba00902f4e69dabf967a534a595460c194ed034a7583a14299eac40e +size 184573 diff --git a/datasets/fineweb/histogram/n_words/metric.json b/datasets/fineweb/histogram/n_words/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..4e6d920e170fa060b310a211e44c1719dcd1ecc7 --- /dev/null +++ b/datasets/fineweb/histogram/n_words/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64b7c1d38359dc2671ecedbf0021fbcd9a23ad81e2b20968944a22f0901f046f +size 1702931 diff --git a/datasets/fineweb/histogram/non_alpha_digit_ratio/metric.json b/datasets/fineweb/histogram/non_alpha_digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..979d12097e3510c211b549ffdfd73e94ddf6b331 --- /dev/null +++ b/datasets/fineweb/histogram/non_alpha_digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdea7fefed04464cb78752d78b6c517d14f050b4acac00986407a2a7bb073b61 +size 18765 diff --git a/datasets/fineweb/histogram/punctuation_ratio/metric.json b/datasets/fineweb/histogram/punctuation_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..2ac4191182a00d9e8b40f7cb5da076e5ea0ae773 --- /dev/null +++ b/datasets/fineweb/histogram/punctuation_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46e514d3fc6ea28879b39d8b15b16d406fad50911e0d2a340520bee0cc779214 +size 17180 diff --git a/datasets/fineweb/histogram/short_line_ratio_chars_10/metric.json b/datasets/fineweb/histogram/short_line_ratio_chars_10/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..4d98392d6a8305a57e4b7a30aed00cccad54a800 --- /dev/null +++ b/datasets/fineweb/histogram/short_line_ratio_chars_10/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61df4801ca046152dfe37943415b87fd792c962b6aff05ffa65e032c2c90c884 +size 24797 diff --git a/datasets/fineweb/histogram/short_line_ratio_chars_30/metric.json b/datasets/fineweb/histogram/short_line_ratio_chars_30/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..dedabc5aefd3b4f4959b2b92f3fcc75e6e5dd5e6 --- /dev/null +++ b/datasets/fineweb/histogram/short_line_ratio_chars_30/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4931cea605f20e3524070d5c92ebded0a9320cee5b76633d183a2fac2d12a54 +size 30312 diff --git a/datasets/fineweb/histogram/short_sentence_ratio_20/metric.json b/datasets/fineweb/histogram/short_sentence_ratio_20/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..ad2bc8837d2c2123305300a5154f74f5402aafd7 --- /dev/null +++ b/datasets/fineweb/histogram/short_sentence_ratio_20/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d70cda4b0f8fd2b2d9dc0e789d330ca477b91c35b485434fbf9e23b4b056f3d0 +size 39509 diff --git a/datasets/fineweb/histogram/short_word_ratio_3/metric.json b/datasets/fineweb/histogram/short_word_ratio_3/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..1a370baa07f84c0dd3d0f5f40d0138c31957ec1f --- /dev/null +++ b/datasets/fineweb/histogram/short_word_ratio_3/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35ad8f0d69e4321bfdcbb94e7681f8060e3f224dd4d77f3cf75c146e1d9cb370 +size 34185 diff --git a/datasets/fineweb/histogram/stop_word_ratio/metric.json b/datasets/fineweb/histogram/stop_word_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..82ccb8567712866059cd37b19f4e5ec195641f69 --- /dev/null +++ b/datasets/fineweb/histogram/stop_word_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2746e96bd5b4969ea4b85768ffbf7508033ea46320469bcb58bdbeaf4a22d28d +size 16663 diff --git a/datasets/fineweb/histogram/type_token_ratio/metric.json b/datasets/fineweb/histogram/type_token_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..8a042011c570680f2f59b314edf2f804fa857925 --- /dev/null +++ b/datasets/fineweb/histogram/type_token_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3ec3012f095902b53a7e9925342247d1aa38da1fad4b1470b34038713d32f62 +size 41567 diff --git a/datasets/fineweb/histogram/uppercase_ratio/metric.json b/datasets/fineweb/histogram/uppercase_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..75e93df67004544ce72876957b567ee33c8ab3bc --- /dev/null +++ b/datasets/fineweb/histogram/uppercase_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a679a925228e273238ea54a38fdc0216ebed258dd251f0a02eb26b45712ba28c +size 30488 diff --git a/datasets/fineweb/histogram/white_space_ratio/metric.json b/datasets/fineweb/histogram/white_space_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..b0b60e81534bd9d8979968e60390adb11f500382 --- /dev/null +++ b/datasets/fineweb/histogram/white_space_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6835851b350ded2a1e75c0f4405db2ab73d4e612ba0960484436d2c6816a032 +size 11585 diff --git a/datasets/fineweb/none/avg_line_length/metric.json b/datasets/fineweb/none/avg_line_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..6266b44829885706722e99b75b2d48e086091b0f --- /dev/null +++ b/datasets/fineweb/none/avg_line_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29c63cc1185710c8e80db27f65133aaec5516dcbea6c07606303664d6e89e4e7 +size 208 diff --git a/datasets/fineweb/none/avg_word_length/metric.json b/datasets/fineweb/none/avg_word_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..94d9a64f60851b83baaa9c49dfa8cd6eb975e70f --- /dev/null +++ b/datasets/fineweb/none/avg_word_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:259cafd4f1952508cab6b851eb10c0c0ac00155eadc465edbcc08c57027469ef +size 218 diff --git a/datasets/fineweb/none/avg_words_per_line/metric.json b/datasets/fineweb/none/avg_words_per_line/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..b3c2fd7e5e009e8db2d3d4b74f4c69a87d39f4f9 --- /dev/null +++ b/datasets/fineweb/none/avg_words_per_line/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a923850f99b655095d64216aafab16079daf577eec6d76f8e063c2d96e6e21b6 +size 208 diff --git a/datasets/fineweb/none/digit_ratio/metric.json b/datasets/fineweb/none/digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..80cce4153fade5043102216dc9e13c84cedfc22a --- /dev/null +++ b/datasets/fineweb/none/digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2884d6169089cf4a105eec660835f0df11003f8245da71036d869648b46c91a0 +size 209 diff --git a/datasets/fineweb/none/fasttext_en/metric.json b/datasets/fineweb/none/fasttext_en/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..5038457ea483aa6e621a4d4a09430f2349172310 --- /dev/null +++ b/datasets/fineweb/none/fasttext_en/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8825943428cb03c9fd30cb1d75fb1b12ad5faffeb3281d73bf30b4a1c3f8f3f +size 220 diff --git a/datasets/fineweb/none/length/metric.json b/datasets/fineweb/none/length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..25571190255339abb3d1a29ead791e72d9d05c54 --- /dev/null +++ b/datasets/fineweb/none/length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:508bd7b66685e22e11d34a81fd33b30800f4e7e77f66bc0093fb6c2a9fb256c8 +size 183 diff --git a/datasets/fineweb/none/lines_ending_with_terminal_mark_ratio/metric.json b/datasets/fineweb/none/lines_ending_with_terminal_mark_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..80bd31dee248fe358387542a17cf4793d7839cd9 --- /dev/null +++ b/datasets/fineweb/none/lines_ending_with_terminal_mark_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3e1ecbed0f3c3ca250c41f7a59df6d1c319453bc0ef64115dfb9a67428c4315 +size 207 diff --git a/datasets/fineweb/none/long_line_ratio_chars_10000/metric.json b/datasets/fineweb/none/long_line_ratio_chars_10000/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..ef8cf175656fcf90a49c169ef8cd6a7273547966 --- /dev/null +++ b/datasets/fineweb/none/long_line_ratio_chars_10000/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6edaaafd67503b7266593c9e4d0a111fe370740ee2dfc69d9ec08eac00e3f634 +size 196 diff --git a/datasets/fineweb/none/long_line_ratio_chars_2000/metric.json b/datasets/fineweb/none/long_line_ratio_chars_2000/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..4003524de9d30883f4acbc6b877f1e7053107c9d --- /dev/null +++ b/datasets/fineweb/none/long_line_ratio_chars_2000/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dfe1a748a75adde3003019216a0a681f82ceba098cae19ebbdee4a327e954b41 +size 192 diff --git a/datasets/fineweb/none/long_word_ratio_7/metric.json b/datasets/fineweb/none/long_word_ratio_7/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..da732cfc47a0e9914be4144a21f26497a92bbbb3 --- /dev/null +++ b/datasets/fineweb/none/long_word_ratio_7/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:219cdfe8fc7c158d556e6395270bbb3ad66dfac92090bc5df819b32e2ab72bf9 +size 205 diff --git a/datasets/fineweb/none/n_lines/metric.json b/datasets/fineweb/none/n_lines/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..d16eff7e1bb7b6b9ad3a05b2241de82cc8c95e1e --- /dev/null +++ b/datasets/fineweb/none/n_lines/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6dd7e5178f0933f1b9af76b538b10034b996685943c82e1a02300cebbf3cebd7 +size 180 diff --git a/datasets/fineweb/none/n_words/metric.json b/datasets/fineweb/none/n_words/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..d5776989c19ac9e82c3e9bc2cc9b5c1c64c3bbcf --- /dev/null +++ b/datasets/fineweb/none/n_words/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1afeee7262798667d3bdfb902efde91bca4e3e6abe3d73b03a3e03e3d2dc2e17 +size 184 diff --git a/datasets/fineweb/none/non_alpha_digit_ratio/metric.json b/datasets/fineweb/none/non_alpha_digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..803575b69f9a8e2f90a4b9a0520c246c7b90fef8 --- /dev/null +++ b/datasets/fineweb/none/non_alpha_digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e70cff4d1395cbc7c7173ea4bbb2f586e58c232a1c475d0980a69e71476d4074 +size 224 diff --git a/datasets/fineweb/none/short_line_ratio_chars_10/metric.json b/datasets/fineweb/none/short_line_ratio_chars_10/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..66f78e6273780a8ae203a7a4212ca9e31f2f0e75 --- /dev/null +++ b/datasets/fineweb/none/short_line_ratio_chars_10/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c95f9d2c6e06b4618e8a203b5fa8fc6865ce560474355a88a65fa9360963b30e +size 211 diff --git a/datasets/fineweb/none/short_line_ratio_chars_30/metric.json b/datasets/fineweb/none/short_line_ratio_chars_30/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..30af41068a2d9a1de7cf1aeb31ef7b329078398c --- /dev/null +++ b/datasets/fineweb/none/short_line_ratio_chars_30/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c92f8fe6e4b802431c0337e37891984ae7f9ff98c43e3bf0ac67f42b57607fc7 +size 206 diff --git a/datasets/fineweb/none/short_word_ratio_3/metric.json b/datasets/fineweb/none/short_word_ratio_3/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..0e671bf8f4b9c5b68225e7156926caddc2d76d5c --- /dev/null +++ b/datasets/fineweb/none/short_word_ratio_3/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:883d298d764708328a33c5f3016d53d5a77d34f3a5bf9dc0996a9970d168b26d +size 226 diff --git a/datasets/fineweb/none/white_space_ratio/metric.json b/datasets/fineweb/none/white_space_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..115679c2ab6d091bb494ec1373f483cd1d596210 --- /dev/null +++ b/datasets/fineweb/none/white_space_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66e8be471033706e67279fc15c29eac34a4bcab2ff8fc44a9df4b6b61a36aee5 +size 227 diff --git a/datasets/fineweb/suffix/avg_line_length/metric.json b/datasets/fineweb/suffix/avg_line_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..685754c84683998a5a3cc52121e00b3b4a5091dc --- /dev/null +++ b/datasets/fineweb/suffix/avg_line_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a28383e38e93fe19dac043299b9f9f11f7d642451fd40483ea0f4c27843990e5 +size 495248 diff --git a/datasets/fineweb/suffix/avg_sentence_length/metric.json b/datasets/fineweb/suffix/avg_sentence_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..8d43b131437236120ede293a269c8b6b76920123 --- /dev/null +++ b/datasets/fineweb/suffix/avg_sentence_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb0caaa26f2020355bc4f591ed19948ffbeda0dd05ca0046b8dbc374561dbf57 +size 499421 diff --git a/datasets/fineweb/suffix/avg_word_length/metric.json b/datasets/fineweb/suffix/avg_word_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..92d7c63748e5333323695373eb88afa94c2b3b64 --- /dev/null +++ b/datasets/fineweb/suffix/avg_word_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e88f85073b749bbe8e753fbaf25afa7977ea858d80a8f10ce31f881ecb86170a +size 531707 diff --git a/datasets/fineweb/suffix/avg_words_per_line/metric.json b/datasets/fineweb/suffix/avg_words_per_line/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..dc9ff1975403da19f12d699bc252f5713886f56b --- /dev/null +++ b/datasets/fineweb/suffix/avg_words_per_line/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf7899399d9e0824bbdb0b4934292e48a60320b6cb451162c483dc87e23b75e6 +size 492097 diff --git a/datasets/fineweb/suffix/capitalized_ratio/metric.json b/datasets/fineweb/suffix/capitalized_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..7cc3cb9e23af66ecb22436029ee5dcf75178078a --- /dev/null +++ b/datasets/fineweb/suffix/capitalized_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2604df13fb2583f85981166ab757e2a6756c0829959ad8034788708ae067134 +size 546359 diff --git a/datasets/fineweb/suffix/ccnet_perplexity_wikipedia_en/metric.json b/datasets/fineweb/suffix/ccnet_perplexity_wikipedia_en/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..280f22ae3de70a0ee5015fa3ddb0af6655460eb9 --- /dev/null +++ b/datasets/fineweb/suffix/ccnet_perplexity_wikipedia_en/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82e043b1d33dce97a2ebf86c68821241d4c5a0a4a9ff1f3354660d3817c6f436 +size 452858 diff --git a/datasets/fineweb/suffix/digit_ratio/metric.json b/datasets/fineweb/suffix/digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..95d5a0d2fe7e13950682bf3330fd1bc2055a02fb --- /dev/null +++ b/datasets/fineweb/suffix/digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24e6fc81a32cf56a669dc91f7aa2acf7ffd6e0619a46781b4fdd2a23bfbc0eb4 +size 521448 diff --git a/datasets/fineweb/suffix/elipsis_ratio/metric.json b/datasets/fineweb/suffix/elipsis_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..e173cfaa9f1c154f62cf2d0d9d4f94b634c10f53 --- /dev/null +++ b/datasets/fineweb/suffix/elipsis_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02c338935c7ceb55e74f7d69adde9f033be5f4f73be49ff51d59bd4835dea153 +size 420008 diff --git a/datasets/fineweb/suffix/fasttext_en/metric.json b/datasets/fineweb/suffix/fasttext_en/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..2677686d1667a6ad213a90270569e950455265e9 --- /dev/null +++ b/datasets/fineweb/suffix/fasttext_en/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78fb8ff8f07edb82bf37df2bc6ee8fe8cc57cb89d355566ecaa30e623af8acef +size 585338 diff --git a/datasets/fineweb/suffix/length/metric.json b/datasets/fineweb/suffix/length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..f6ea41dcce3b149753dadc2f1c8463f896f98902 --- /dev/null +++ b/datasets/fineweb/suffix/length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fc80c45d03b61b1d0d9cdf0626e7470781003176021aed52b7681ba9ed62842 +size 430976 diff --git a/datasets/fineweb/suffix/line_char_duplicates/metric.json b/datasets/fineweb/suffix/line_char_duplicates/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..e4b5376df7b8ca12ecd6374f44aac469d212501e --- /dev/null +++ b/datasets/fineweb/suffix/line_char_duplicates/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:793f506a7bbe89a37933a21eba4d361d357c56ef17484b02939982caa99e3d6b +size 327080 diff --git a/datasets/fineweb/suffix/line_duplicates/metric.json b/datasets/fineweb/suffix/line_duplicates/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..1c2a730dee2ff6942e52494a147ceae7686e0c52 --- /dev/null +++ b/datasets/fineweb/suffix/line_duplicates/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69a7d42f83e182de274d01bfca2db90db171ca505a961eff7fbe3bd64befca0e +size 317226 diff --git a/datasets/fineweb/suffix/lines_ending_with_terminal_mark_ratio/metric.json b/datasets/fineweb/suffix/lines_ending_with_terminal_mark_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..be0bd5f229fb088434d9fa9565f2c38bed140c6e --- /dev/null +++ b/datasets/fineweb/suffix/lines_ending_with_terminal_mark_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:43db42ce9e96670427ced78c3bd6f109b2d63a75b9035a9802c993f15f7a061f +size 489379 diff --git a/datasets/fineweb/suffix/long_line_ratio_chars_10000/metric.json b/datasets/fineweb/suffix/long_line_ratio_chars_10000/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..d42c401e244637228c3321a5dbd3dca44ffd01d9 --- /dev/null +++ b/datasets/fineweb/suffix/long_line_ratio_chars_10000/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05891fdf137d9409ea07a881b4c97a9d34753798809affb56d0f5b4cc4524bc1 +size 152172 diff --git a/datasets/fineweb/suffix/long_line_ratio_chars_2000/metric.json b/datasets/fineweb/suffix/long_line_ratio_chars_2000/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..3544e077faf89c91219df726c3a386d8886e0d2a --- /dev/null +++ b/datasets/fineweb/suffix/long_line_ratio_chars_2000/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59abe1ff9dc4d7e0c94c79833aec683cce4489de64b5906764b3ce2b40d1a6b8 +size 262340 diff --git a/datasets/fineweb/suffix/long_sentence_ratio_75/metric.json b/datasets/fineweb/suffix/long_sentence_ratio_75/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..02ba5a044e060ec4c05b00fc84dc0932a2f5c784 --- /dev/null +++ b/datasets/fineweb/suffix/long_sentence_ratio_75/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b7ee3a65a9d3d3c41717490785e3f39e2d4193616cfc87d233e472615dceb0c +size 488264 diff --git a/datasets/fineweb/suffix/long_word_ratio_7/metric.json b/datasets/fineweb/suffix/long_word_ratio_7/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..626cbe4f0624b828ada3a93c1398ef45df124247 --- /dev/null +++ b/datasets/fineweb/suffix/long_word_ratio_7/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2118eb1859f2fbb2fe71abee5dd9e948c200437a1e2af6679697f543a3b20bea +size 548728 diff --git a/datasets/fineweb/suffix/n_lines/metric.json b/datasets/fineweb/suffix/n_lines/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..27d5ffc7d01afecee377c1735e7f7fd4c104fa77 --- /dev/null +++ b/datasets/fineweb/suffix/n_lines/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eadd8d16163a5f9cce24424a73a6e8b522289b8b8340e30d0834f52457664006 +size 456462 diff --git a/datasets/fineweb/suffix/n_sentences/metric.json b/datasets/fineweb/suffix/n_sentences/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..64165afc18cfc4c229b0f07bfffd8f4fbd06bf63 --- /dev/null +++ b/datasets/fineweb/suffix/n_sentences/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:839e7e394aed5fd945aa747e2238e23391018fcf7b3a4027eaadd9fc7cf51966 +size 411554 diff --git a/datasets/fineweb/suffix/n_words/metric.json b/datasets/fineweb/suffix/n_words/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..491fbce65d11305285eb708fd4f9286536d92ef8 --- /dev/null +++ b/datasets/fineweb/suffix/n_words/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9789e811b7e5cf339d56e6c8081e5a2a84baa1db56192c24c1a60973c3cc1fc7 +size 423454 diff --git a/datasets/fineweb/suffix/non_alpha_digit_ratio/metric.json b/datasets/fineweb/suffix/non_alpha_digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..3621a62984b6531c0c27912158ae23f8e6b61be4 --- /dev/null +++ b/datasets/fineweb/suffix/non_alpha_digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d98b8e8640f962a780cdf5c17618a7112cbabda0c0fe0f312623ed4efe0865db +size 556967 diff --git a/datasets/fineweb/suffix/punctuation_ratio/metric.json b/datasets/fineweb/suffix/punctuation_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..b6e68e4a030eafff3b35a5b1c6cd6387f65024f6 --- /dev/null +++ b/datasets/fineweb/suffix/punctuation_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe7c431a90c1807183646c1bd1cafdfeade89d9fda6ac5339760f60fc90a440f +size 607103 diff --git a/datasets/fineweb/suffix/short_line_ratio_chars_10/metric.json b/datasets/fineweb/suffix/short_line_ratio_chars_10/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..ac00ebaa04db64bcde8193c8205444330e8a786b --- /dev/null +++ b/datasets/fineweb/suffix/short_line_ratio_chars_10/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e085f2a9be91003f910e6181624f9b020ba6be5f78733b7ad9d2f279bc93b813 +size 303809 diff --git a/datasets/fineweb/suffix/short_line_ratio_chars_30/metric.json b/datasets/fineweb/suffix/short_line_ratio_chars_30/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..649aeb96eb5d75e9a6b41c20da6eef7c4f175b76 --- /dev/null +++ b/datasets/fineweb/suffix/short_line_ratio_chars_30/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:764a62ac7541c4fe08838ed4ecaeca845d7ca68c82903ea0c4ca69348ad48898 +size 480606 diff --git a/datasets/fineweb/suffix/short_sentence_ratio_20/metric.json b/datasets/fineweb/suffix/short_sentence_ratio_20/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..2943e4d2aa1e0157ba59ccc45f7e06c8f6d9d533 --- /dev/null +++ b/datasets/fineweb/suffix/short_sentence_ratio_20/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62130532381ef575924c864c7cdac7bea246170af3655ca3129dae86648b6b92 +size 451064 diff --git a/datasets/fineweb/suffix/short_word_ratio_3/metric.json b/datasets/fineweb/suffix/short_word_ratio_3/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..fe53c9947f9a838fb5540cb8677086c634bd0674 --- /dev/null +++ b/datasets/fineweb/suffix/short_word_ratio_3/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e95967b857922a6cf08042628e76adf0338831a9fb9d50dc59bb522a3911803 +size 547092 diff --git a/datasets/fineweb/suffix/stop_word_ratio/metric.json b/datasets/fineweb/suffix/stop_word_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..c21b206e8e7bfb3b48bc6d7ab26e99d9cea44b9c --- /dev/null +++ b/datasets/fineweb/suffix/stop_word_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81be797a032b8a3bded7d2978e6418eae02211c708aebaaa431a3f9b0431d577 +size 551221 diff --git a/datasets/fineweb/suffix/type_token_ratio/metric.json b/datasets/fineweb/suffix/type_token_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..43d0df977ef12a3e7cc4d009c6b218379c2a7759 --- /dev/null +++ b/datasets/fineweb/suffix/type_token_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a2ae08bce0ce5fcf6fc40820f3ffcecacf372414c3ff719a55a12b1b2b74744 +size 543227 diff --git a/datasets/fineweb/suffix/uppercase_ratio/metric.json b/datasets/fineweb/suffix/uppercase_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..a174627ed2a2f2786b2c03f973313230feb10f42 --- /dev/null +++ b/datasets/fineweb/suffix/uppercase_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:508b5e43234a2ca5b514e1bb2dc31dc5f0d0a3813e1c23e7deb2b345c6783a26 +size 557400 diff --git a/datasets/fineweb/suffix/white_space_ratio/metric.json b/datasets/fineweb/suffix/white_space_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..c625604a7a2384190aec232cec49edf705e6631f --- /dev/null +++ b/datasets/fineweb/suffix/white_space_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29b603556f716738b8797ea7286de4a8ffb5666c763e4ff43d0be265c7f336c1 +size 557261 diff --git a/datasets/fineweb/summary/avg_line_length/metric.json b/datasets/fineweb/summary/avg_line_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..6266b44829885706722e99b75b2d48e086091b0f --- /dev/null +++ b/datasets/fineweb/summary/avg_line_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29c63cc1185710c8e80db27f65133aaec5516dcbea6c07606303664d6e89e4e7 +size 208 diff --git a/datasets/fineweb/summary/avg_sentence_length/metric.json b/datasets/fineweb/summary/avg_sentence_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..8700fbdae3da528260966229dd6f7effbdac8ac9 --- /dev/null +++ b/datasets/fineweb/summary/avg_sentence_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd14f2ab1b2bc1ec55872abd29a7b39766c7b4d5e02e3e7ce7253b161e24d689 +size 205 diff --git a/datasets/fineweb/summary/avg_word_length/metric.json b/datasets/fineweb/summary/avg_word_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..94d9a64f60851b83baaa9c49dfa8cd6eb975e70f --- /dev/null +++ b/datasets/fineweb/summary/avg_word_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:259cafd4f1952508cab6b851eb10c0c0ac00155eadc465edbcc08c57027469ef +size 218 diff --git a/datasets/fineweb/summary/avg_words_per_line/metric.json b/datasets/fineweb/summary/avg_words_per_line/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..b3c2fd7e5e009e8db2d3d4b74f4c69a87d39f4f9 --- /dev/null +++ b/datasets/fineweb/summary/avg_words_per_line/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a923850f99b655095d64216aafab16079daf577eec6d76f8e063c2d96e6e21b6 +size 208 diff --git a/datasets/fineweb/summary/capitalized_ratio/metric.json b/datasets/fineweb/summary/capitalized_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..793f5627e0d98387d8057eed407754471005a6a4 --- /dev/null +++ b/datasets/fineweb/summary/capitalized_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb128bd639447982cf076fb6afa78aa43c6b60dc80d336f7492040e8bc56ee41 +size 207 diff --git a/datasets/fineweb/summary/ccnet_perplexity_wikipedia_en/metric.json b/datasets/fineweb/summary/ccnet_perplexity_wikipedia_en/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..9489abdb5ca7e721f7da78e6e0e2e900ccb66149 --- /dev/null +++ b/datasets/fineweb/summary/ccnet_perplexity_wikipedia_en/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc7f711a9d4a1f87874b71f3a14e466e52fbe2ea965e503fb8cecd0e213e4a54 +size 193 diff --git a/datasets/fineweb/summary/digit_ratio/metric.json b/datasets/fineweb/summary/digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..80cce4153fade5043102216dc9e13c84cedfc22a --- /dev/null +++ b/datasets/fineweb/summary/digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2884d6169089cf4a105eec660835f0df11003f8245da71036d869648b46c91a0 +size 209 diff --git a/datasets/fineweb/summary/elipsis_ratio/metric.json b/datasets/fineweb/summary/elipsis_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..07f2bac17e3d5e708a7968a2f29de807e3c3745c --- /dev/null +++ b/datasets/fineweb/summary/elipsis_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39a5418b53e7a9c2037fbb19bb5f840ef285a7335965a0d97769c3e521a66f58 +size 211 diff --git a/datasets/fineweb/summary/fasttext_en/metric.json b/datasets/fineweb/summary/fasttext_en/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..5038457ea483aa6e621a4d4a09430f2349172310 --- /dev/null +++ b/datasets/fineweb/summary/fasttext_en/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8825943428cb03c9fd30cb1d75fb1b12ad5faffeb3281d73bf30b4a1c3f8f3f +size 220 diff --git a/datasets/fineweb/summary/length/metric.json b/datasets/fineweb/summary/length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..25571190255339abb3d1a29ead791e72d9d05c54 --- /dev/null +++ b/datasets/fineweb/summary/length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:508bd7b66685e22e11d34a81fd33b30800f4e7e77f66bc0093fb6c2a9fb256c8 +size 183 diff --git a/datasets/fineweb/summary/line_char_duplicates/metric.json b/datasets/fineweb/summary/line_char_duplicates/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..e35fef9163ea283e2f92b33f4604c68fe1a94b41 --- /dev/null +++ b/datasets/fineweb/summary/line_char_duplicates/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b838ca9bdce922ab0403ac42429221de4544a6a44383cb4f615a8b3a597d8f8 +size 214 diff --git a/datasets/fineweb/summary/line_duplicates/metric.json b/datasets/fineweb/summary/line_duplicates/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..93ed1854421297bb46c0cc2800ee3f0c4c51dbca --- /dev/null +++ b/datasets/fineweb/summary/line_duplicates/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0ed1074a0f270b5c202d3bf201e0bf38d947840d80eea74ba1f97529ceeea34 +size 212 diff --git a/datasets/fineweb/summary/lines_ending_with_terminal_mark_ratio/metric.json b/datasets/fineweb/summary/lines_ending_with_terminal_mark_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..80bd31dee248fe358387542a17cf4793d7839cd9 --- /dev/null +++ b/datasets/fineweb/summary/lines_ending_with_terminal_mark_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3e1ecbed0f3c3ca250c41f7a59df6d1c319453bc0ef64115dfb9a67428c4315 +size 207 diff --git a/datasets/fineweb/summary/long_line_ratio_chars_10000/metric.json b/datasets/fineweb/summary/long_line_ratio_chars_10000/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..ef8cf175656fcf90a49c169ef8cd6a7273547966 --- /dev/null +++ b/datasets/fineweb/summary/long_line_ratio_chars_10000/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6edaaafd67503b7266593c9e4d0a111fe370740ee2dfc69d9ec08eac00e3f634 +size 196 diff --git a/datasets/fineweb/summary/long_line_ratio_chars_2000/metric.json b/datasets/fineweb/summary/long_line_ratio_chars_2000/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..4003524de9d30883f4acbc6b877f1e7053107c9d --- /dev/null +++ b/datasets/fineweb/summary/long_line_ratio_chars_2000/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dfe1a748a75adde3003019216a0a681f82ceba098cae19ebbdee4a327e954b41 +size 192 diff --git a/datasets/fineweb/summary/long_sentence_ratio_75/metric.json b/datasets/fineweb/summary/long_sentence_ratio_75/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..82c62e9b7cdfebe7d25a43bd71d69ef12213675f --- /dev/null +++ b/datasets/fineweb/summary/long_sentence_ratio_75/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd021e2e69c5a151a9d8cd749378f5bd596a3bd9c395dc3f6cefb31681babbd9 +size 189 diff --git a/datasets/fineweb/summary/long_word_ratio_7/metric.json b/datasets/fineweb/summary/long_word_ratio_7/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..da732cfc47a0e9914be4144a21f26497a92bbbb3 --- /dev/null +++ b/datasets/fineweb/summary/long_word_ratio_7/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:219cdfe8fc7c158d556e6395270bbb3ad66dfac92090bc5df819b32e2ab72bf9 +size 205 diff --git a/datasets/fineweb/summary/n_lines/metric.json b/datasets/fineweb/summary/n_lines/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..d16eff7e1bb7b6b9ad3a05b2241de82cc8c95e1e --- /dev/null +++ b/datasets/fineweb/summary/n_lines/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6dd7e5178f0933f1b9af76b538b10034b996685943c82e1a02300cebbf3cebd7 +size 180 diff --git a/datasets/fineweb/summary/n_sentences/metric.json b/datasets/fineweb/summary/n_sentences/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..9c28487a3ecb50cddf9fa2386089325681d290af --- /dev/null +++ b/datasets/fineweb/summary/n_sentences/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d95a6a59887b13db0ae8c152d59ea0e432d86500ceca3f71e09c518b3270bdd6 +size 177 diff --git a/datasets/fineweb/summary/n_words/metric.json b/datasets/fineweb/summary/n_words/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..d5776989c19ac9e82c3e9bc2cc9b5c1c64c3bbcf --- /dev/null +++ b/datasets/fineweb/summary/n_words/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1afeee7262798667d3bdfb902efde91bca4e3e6abe3d73b03a3e03e3d2dc2e17 +size 184 diff --git a/datasets/fineweb/summary/non_alpha_digit_ratio/metric.json b/datasets/fineweb/summary/non_alpha_digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..803575b69f9a8e2f90a4b9a0520c246c7b90fef8 --- /dev/null +++ b/datasets/fineweb/summary/non_alpha_digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e70cff4d1395cbc7c7173ea4bbb2f586e58c232a1c475d0980a69e71476d4074 +size 224 diff --git a/datasets/fineweb/summary/punctuation_ratio/metric.json b/datasets/fineweb/summary/punctuation_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..3e0feb42b9384b7713c2fd6a6596aef033f642cc --- /dev/null +++ b/datasets/fineweb/summary/punctuation_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:068395afdeb65fe9887dc6d0c790eac3bacce8707b9aa5f5cf95a44e51d6c844 +size 217 diff --git a/datasets/fineweb/summary/short_line_ratio_chars_10/metric.json b/datasets/fineweb/summary/short_line_ratio_chars_10/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..66f78e6273780a8ae203a7a4212ca9e31f2f0e75 --- /dev/null +++ b/datasets/fineweb/summary/short_line_ratio_chars_10/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c95f9d2c6e06b4618e8a203b5fa8fc6865ce560474355a88a65fa9360963b30e +size 211 diff --git a/datasets/fineweb/summary/short_line_ratio_chars_30/metric.json b/datasets/fineweb/summary/short_line_ratio_chars_30/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..30af41068a2d9a1de7cf1aeb31ef7b329078398c --- /dev/null +++ b/datasets/fineweb/summary/short_line_ratio_chars_30/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c92f8fe6e4b802431c0337e37891984ae7f9ff98c43e3bf0ac67f42b57607fc7 +size 206 diff --git a/datasets/fineweb/summary/short_sentence_ratio_20/metric.json b/datasets/fineweb/summary/short_sentence_ratio_20/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..5de6ecc56508fcef20816cc82f3c3c6fa5260d26 --- /dev/null +++ b/datasets/fineweb/summary/short_sentence_ratio_20/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b9bf4c312a4f5d39303e15bd655ff995ceb53e9628c025756461057fc7854ba +size 192 diff --git a/datasets/fineweb/summary/short_word_ratio_3/metric.json b/datasets/fineweb/summary/short_word_ratio_3/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..0e671bf8f4b9c5b68225e7156926caddc2d76d5c --- /dev/null +++ b/datasets/fineweb/summary/short_word_ratio_3/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:883d298d764708328a33c5f3016d53d5a77d34f3a5bf9dc0996a9970d168b26d +size 226 diff --git a/datasets/fineweb/summary/stop_word_ratio/metric.json b/datasets/fineweb/summary/stop_word_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..03c763e048b3e09451aef63d0a520c4ee4ad8340 --- /dev/null +++ b/datasets/fineweb/summary/stop_word_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d25beb7eeaa7d943d6199fdb70754dc87bca1e5b1da4032a62b89eb02d382c07 +size 209 diff --git a/datasets/fineweb/summary/type_token_ratio/metric.json b/datasets/fineweb/summary/type_token_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..8768455d47f480eeedc032d400ac6abc741736f4 --- /dev/null +++ b/datasets/fineweb/summary/type_token_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40f073a245cdddb4964b31bbffdc38f5ebb4c9dcd0e839f18e0b93018bfce193 +size 207 diff --git a/datasets/fineweb/summary/uppercase_ratio/metric.json b/datasets/fineweb/summary/uppercase_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..a2639766369877b0544aa5097d9f856100ab9e5a --- /dev/null +++ b/datasets/fineweb/summary/uppercase_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2c1223acb653d44b1b4ca4e600d6fdba5401a3ba4698076c6385613edd7d1b5 +size 209 diff --git a/datasets/fineweb/summary/white_space_ratio/metric.json b/datasets/fineweb/summary/white_space_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..115679c2ab6d091bb494ec1373f483cd1d596210 --- /dev/null +++ b/datasets/fineweb/summary/white_space_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66e8be471033706e67279fc15c29eac34a4bcab2ff8fc44a9df4b6b61a36aee5 +size 227 diff --git a/datasets/red_pajama_v2/fqdn/avg_line_length/metric.json b/datasets/red_pajama_v2/fqdn/avg_line_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..46c8ceceb5ed54ad6bdeded1db90dfc55d427059 --- /dev/null +++ b/datasets/red_pajama_v2/fqdn/avg_line_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5a7d3c864a73270de3ef6df34c0a3540679375bf0f362b7de80bb0868e91e7d +size 1642462 diff --git a/datasets/red_pajama_v2/fqdn/avg_sentence_length/metric.json b/datasets/red_pajama_v2/fqdn/avg_sentence_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..681139b84079b99f9fb4714431c6654d2fb8facf --- /dev/null +++ b/datasets/red_pajama_v2/fqdn/avg_sentence_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e06c14b04741f95da5ecab5a656bde7ac6834912fa1f1368eae38488c6d61629 +size 1635628 diff --git a/datasets/red_pajama_v2/fqdn/avg_word_length/metric.json b/datasets/red_pajama_v2/fqdn/avg_word_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..a496094d0ba15e27d4b5fe852bc1e328a37a53a4 --- /dev/null +++ b/datasets/red_pajama_v2/fqdn/avg_word_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2def0d91e6ecdef85347d9452ae347ea1b70f49848ea958a75f1fa60d1dc273e +size 1741743 diff --git a/datasets/red_pajama_v2/fqdn/avg_words_per_line/metric.json b/datasets/red_pajama_v2/fqdn/avg_words_per_line/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..8192c21bb95e6cccb7fd91a3ec897cd7b212eed1 --- /dev/null +++ b/datasets/red_pajama_v2/fqdn/avg_words_per_line/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f871ab26a44089eec412f253a4908de0566003914523dd2f401eaf81949d604 +size 1636004 diff --git a/datasets/red_pajama_v2/fqdn/capitalized_ratio/metric.json b/datasets/red_pajama_v2/fqdn/capitalized_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..7b09c5451d5e1b5751d83207df2e881cfd8df41f --- /dev/null +++ b/datasets/red_pajama_v2/fqdn/capitalized_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5250cc6842d520a761fb45d14358208f4943dbe71a00c3f47ea4f7da868a87c +size 1788681 diff --git a/datasets/red_pajama_v2/fqdn/ccnet_perplexity_wikipedia_en/metric.json b/datasets/red_pajama_v2/fqdn/ccnet_perplexity_wikipedia_en/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..0d73a6e710b3b1229ff8aed21e399930b45eed67 --- /dev/null +++ b/datasets/red_pajama_v2/fqdn/ccnet_perplexity_wikipedia_en/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e183ed3a57dfa81480ba3af0f65833682a481480ec89104ef08a46aecd5d1c1 +size 1538926 diff --git a/datasets/red_pajama_v2/fqdn/digit_ratio/metric.json b/datasets/red_pajama_v2/fqdn/digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..c77641206eb78093290449655f8c0fbe72aa0712 --- /dev/null +++ b/datasets/red_pajama_v2/fqdn/digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf1ed3bd746de2b7847ad76b86b7b1e5d97b3e00261fc2df19b27e0b02690ca1 +size 1749678 diff --git a/datasets/red_pajama_v2/fqdn/elipsis_ratio/metric.json b/datasets/red_pajama_v2/fqdn/elipsis_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..30638e829382b2bda8a64c0b43263ed769f062d7 --- /dev/null +++ b/datasets/red_pajama_v2/fqdn/elipsis_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d81508a528c9644d956a7f93dfb93428920fcda42d70b398717c6e40a2523760 +size 1690608 diff --git a/datasets/red_pajama_v2/fqdn/fasttext_en/metric.json b/datasets/red_pajama_v2/fqdn/fasttext_en/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..88ef28ddd0b8759441d35c40158e976a86e043a9 --- /dev/null +++ b/datasets/red_pajama_v2/fqdn/fasttext_en/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2de70ec8951482e5f512aeaabfd1b39cc400fbc9ae6b5d3bd2262e16b90aeb3a +size 1561224 diff --git a/datasets/red_pajama_v2/fqdn/length/metric.json b/datasets/red_pajama_v2/fqdn/length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..fe2725dd6a4e40e57434363b526d5427eb01bb18 --- /dev/null +++ b/datasets/red_pajama_v2/fqdn/length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8afbeaddbca94db3bf08bd752ef1001759db8e824b06e42f2735778a6b32410 +size 1465728 diff --git a/datasets/red_pajama_v2/fqdn/line_char_duplicates/metric.json b/datasets/red_pajama_v2/fqdn/line_char_duplicates/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..c87d7bde08a12930e0265fa0cd8de43183569d88 --- /dev/null +++ b/datasets/red_pajama_v2/fqdn/line_char_duplicates/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2947012dd082d20aacb29611df6ff755597a3ad59e04580277347d2d6ea7c720 +size 199368 diff --git a/datasets/red_pajama_v2/fqdn/line_duplicates/metric.json b/datasets/red_pajama_v2/fqdn/line_duplicates/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..c87d7bde08a12930e0265fa0cd8de43183569d88 --- /dev/null +++ b/datasets/red_pajama_v2/fqdn/line_duplicates/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2947012dd082d20aacb29611df6ff755597a3ad59e04580277347d2d6ea7c720 +size 199368 diff --git a/datasets/red_pajama_v2/fqdn/lines_ending_with_terminal_mark_ratio/metric.json b/datasets/red_pajama_v2/fqdn/lines_ending_with_terminal_mark_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..6fc540639bd295f398e34b70fdf4ce1af20df3a6 --- /dev/null +++ b/datasets/red_pajama_v2/fqdn/lines_ending_with_terminal_mark_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17b3e1498a9d7b9a42b815b5575d570c797fae301c223d3794abe5db1a623de9 +size 1619788 diff --git a/datasets/red_pajama_v2/fqdn/long_line_ratio_chars_10000/metric.json b/datasets/red_pajama_v2/fqdn/long_line_ratio_chars_10000/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..59fabff69e6c94cdfc4f7b0dca96b2881909b279 --- /dev/null +++ b/datasets/red_pajama_v2/fqdn/long_line_ratio_chars_10000/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d80222813a94ec7956a967b49996d675ea9996cba183003ec4812a01953ceb02 +size 492037 diff --git a/datasets/red_pajama_v2/fqdn/long_line_ratio_chars_2000/metric.json b/datasets/red_pajama_v2/fqdn/long_line_ratio_chars_2000/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..6190e5a1a9bd693a87bfe57ba4b64b8b90833e1d --- /dev/null +++ b/datasets/red_pajama_v2/fqdn/long_line_ratio_chars_2000/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01ce23fd97a4ff32319e8a5e50edca0d67f26d7145d4c7bf6fd3af632709f99e +size 1267100 diff --git a/datasets/red_pajama_v2/fqdn/long_sentence_ratio_75/metric.json b/datasets/red_pajama_v2/fqdn/long_sentence_ratio_75/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..be0f03e8eaa492bcf503b3fd3cd30f074ce87ca2 --- /dev/null +++ b/datasets/red_pajama_v2/fqdn/long_sentence_ratio_75/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08603b695e3a302f2d17625e74450d065e23e737455b87c4c19eb10ad3bbf9ac +size 1612179 diff --git a/datasets/red_pajama_v2/fqdn/long_word_ratio_7/metric.json b/datasets/red_pajama_v2/fqdn/long_word_ratio_7/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..892c5de174d8ded90291f67ce50a13ca2648b2d5 --- /dev/null +++ b/datasets/red_pajama_v2/fqdn/long_word_ratio_7/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40ac4bcfb529f22085e6289daa8214efb11d0db83ce8c31924c7eba02d37da12 +size 1784156 diff --git a/datasets/red_pajama_v2/fqdn/n_lines/metric.json b/datasets/red_pajama_v2/fqdn/n_lines/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..0a823719e5b1694970b9d63a08bbedae96f1a41b --- /dev/null +++ b/datasets/red_pajama_v2/fqdn/n_lines/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d40fbdd23532c71dabdc8028f6131d00f0d733930f530760d966acdda1e2f91 +size 1409603 diff --git a/datasets/red_pajama_v2/fqdn/n_sentences/metric.json b/datasets/red_pajama_v2/fqdn/n_sentences/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..1e03a639c3250967d621e64d476af52bad0ff8ce --- /dev/null +++ b/datasets/red_pajama_v2/fqdn/n_sentences/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d8a03d5ec29a57e0c37f5883dbc3495bded14ef7f4b2ad19705603512b43519 +size 1409493 diff --git a/datasets/red_pajama_v2/fqdn/n_words/metric.json b/datasets/red_pajama_v2/fqdn/n_words/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..2aa01bbf4d1dd10bcd22b876ccd93c36db1eb2d0 --- /dev/null +++ b/datasets/red_pajama_v2/fqdn/n_words/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e64ac560f9a9606323b76a60dad5dd8760019f2c570187d7ebf62d584e14741a +size 1446743 diff --git a/datasets/red_pajama_v2/fqdn/non_alpha_digit_ratio/metric.json b/datasets/red_pajama_v2/fqdn/non_alpha_digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..dc7ee724c95eb4b4a810f6670e9a7d546f1fec67 --- /dev/null +++ b/datasets/red_pajama_v2/fqdn/non_alpha_digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d06a3e51a87a2abc5ababc440654f18524ee0c35296168f2c4f76d9e6177e1f +size 1828650 diff --git a/datasets/red_pajama_v2/fqdn/punctuation_ratio/metric.json b/datasets/red_pajama_v2/fqdn/punctuation_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..32fee810d7272c3077f9db3dc9c6fb946d58b076 --- /dev/null +++ b/datasets/red_pajama_v2/fqdn/punctuation_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a3c996f4797d338277e0708877aa732c6a1305b2eed2a5eadfdefdfb719e75d +size 1832485 diff --git a/datasets/red_pajama_v2/fqdn/short_line_ratio_chars_10/metric.json b/datasets/red_pajama_v2/fqdn/short_line_ratio_chars_10/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..88d6fffd72e2dae82021ab4e19d971894d88783d --- /dev/null +++ b/datasets/red_pajama_v2/fqdn/short_line_ratio_chars_10/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8af5ed6ac4f511db645a02127322c169de254ed0d4211c90a9471ae5401d3ffd +size 1612541 diff --git a/datasets/red_pajama_v2/fqdn/short_line_ratio_chars_30/metric.json b/datasets/red_pajama_v2/fqdn/short_line_ratio_chars_30/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..763ea78143d614fe9145a17bf0604b02b59d9e9b --- /dev/null +++ b/datasets/red_pajama_v2/fqdn/short_line_ratio_chars_30/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9cedd8b4a99cdf27386540c2822a0c0de6ed7c87afe584dd6127ca110aeb5578 +size 1617531 diff --git a/datasets/red_pajama_v2/fqdn/short_sentence_ratio_20/metric.json b/datasets/red_pajama_v2/fqdn/short_sentence_ratio_20/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..ccd2416ad03cd51b3cd04b733db790641b245fce --- /dev/null +++ b/datasets/red_pajama_v2/fqdn/short_sentence_ratio_20/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff78ad9f0fc9b2a2f72a62c85d0b540924cd857d1d4ff8c7ad918b3e2badd6bc +size 1628125 diff --git a/datasets/red_pajama_v2/fqdn/short_word_ratio_3/metric.json b/datasets/red_pajama_v2/fqdn/short_word_ratio_3/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..2b5f35a6cbf3af3c4cd8a6393d3ca0558f898883 --- /dev/null +++ b/datasets/red_pajama_v2/fqdn/short_word_ratio_3/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e89bec9e112e4169d12a39a623cf9fba1ec375baba20a520264b290b0eeea650 +size 1787457 diff --git a/datasets/red_pajama_v2/fqdn/stop_word_ratio/metric.json b/datasets/red_pajama_v2/fqdn/stop_word_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..fee111b4a199a6dcb63e4ec32178ca35dba318ac --- /dev/null +++ b/datasets/red_pajama_v2/fqdn/stop_word_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:883b10a7b98270917cedb30bfac0c858f5b7d831daccf1eeb73883c59a775555 +size 1757920 diff --git a/datasets/red_pajama_v2/fqdn/type_token_ratio/metric.json b/datasets/red_pajama_v2/fqdn/type_token_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..f7028beaa6b35876cbe1b0f860fc9a295019fadc --- /dev/null +++ b/datasets/red_pajama_v2/fqdn/type_token_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:943372131e8299a3409ae7a40b64ac062869ba2f131a7dc32f8ae561332fdf7d +size 1754016 diff --git a/datasets/red_pajama_v2/fqdn/uppercase_ratio/metric.json b/datasets/red_pajama_v2/fqdn/uppercase_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..2ccc13490e4073cbf79f79ec2c159f1f8d0a2498 --- /dev/null +++ b/datasets/red_pajama_v2/fqdn/uppercase_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a6fd53fd0158138e1cb567bc0c0c67903cbd0f968d8a4d14700623ac4b3d7f2 +size 1835674 diff --git a/datasets/red_pajama_v2/fqdn/white_space_ratio/metric.json b/datasets/red_pajama_v2/fqdn/white_space_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..15149c7667b74724f33f4d11ce8ebf6dd3e50bd6 --- /dev/null +++ b/datasets/red_pajama_v2/fqdn/white_space_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:825792aebe81d9d1d6e8e582505a148a96c572381617157bbbe2516867987d56 +size 1827891 diff --git a/datasets/red_pajama_v2/histogram/avg_line_length/metric.json b/datasets/red_pajama_v2/histogram/avg_line_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..f9efc048270a8fdc8e24307d721b2b90e48cfb86 --- /dev/null +++ b/datasets/red_pajama_v2/histogram/avg_line_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0562bcacd0be25196a116f805ea243f006e7d74318ed64f336589a5d74f7544d +size 14732005 diff --git a/datasets/red_pajama_v2/histogram/avg_sentence_length/metric.json b/datasets/red_pajama_v2/histogram/avg_sentence_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..5567462a8e4d4cc0513bdb4bd71d55b5d95a88fc --- /dev/null +++ b/datasets/red_pajama_v2/histogram/avg_sentence_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a5d9f7b57ce31df277eaa9d5df805f571fc33fd04f9c3b8ee3d9b755c7ae4c3 +size 9110626 diff --git a/datasets/red_pajama_v2/histogram/avg_word_length/metric.json b/datasets/red_pajama_v2/histogram/avg_word_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..353753c0f9e5561b7164aca60b7e44d816712455 --- /dev/null +++ b/datasets/red_pajama_v2/histogram/avg_word_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06861e82a2a0781aeb1c2f3be5e029bd2ca6fd98a4890873862727384a33aaa5 +size 423743 diff --git a/datasets/red_pajama_v2/histogram/avg_words_per_line/metric.json b/datasets/red_pajama_v2/histogram/avg_words_per_line/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..394cd31b6acc53d6c1bb71386e31961c18cbe526 --- /dev/null +++ b/datasets/red_pajama_v2/histogram/avg_words_per_line/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:752770b3e7de10a632b07428c6cf50c4aec1e336d353dba8155436ce4ef64c3a +size 4811670 diff --git a/datasets/red_pajama_v2/histogram/capitalized_ratio/metric.json b/datasets/red_pajama_v2/histogram/capitalized_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..293ace4bfc3c886592afedbc1347f4339f72fc28 --- /dev/null +++ b/datasets/red_pajama_v2/histogram/capitalized_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:572ee511ab72cc0fe3e93cdad8a11dacdbde758a17cb07753bbd6d9793fcad41 +size 39853 diff --git a/datasets/red_pajama_v2/histogram/ccnet_perplexity_wikipedia_en/metric.json b/datasets/red_pajama_v2/histogram/ccnet_perplexity_wikipedia_en/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..b6aba3ca53df0e6e9fb74ab80d2a44c569bf720c --- /dev/null +++ b/datasets/red_pajama_v2/histogram/ccnet_perplexity_wikipedia_en/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a32ddb1fe7dae2cd85f5abad048f28cb678172fd097fbf40d722cb358d0b686 +size 1319938 diff --git a/datasets/red_pajama_v2/histogram/digit_ratio/metric.json b/datasets/red_pajama_v2/histogram/digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..e031a8582aa8b17665c0cbc11605666ff971cfdf --- /dev/null +++ b/datasets/red_pajama_v2/histogram/digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:935b4198f03f680d28ac5991ab73d5f4ab75551ddc97ccf822657921245692a5 +size 30427 diff --git a/datasets/red_pajama_v2/histogram/elipsis_ratio/metric.json b/datasets/red_pajama_v2/histogram/elipsis_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..41ac75292cc5353ec44ce4fd31f6180cc39af5cb --- /dev/null +++ b/datasets/red_pajama_v2/histogram/elipsis_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c1062072c8ae6406fe479762fb1ae4727ef29ea6549d1b6f804c0b70e006789 +size 15455 diff --git a/datasets/red_pajama_v2/histogram/fasttext_en/metric.json b/datasets/red_pajama_v2/histogram/fasttext_en/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..4d73338a582f3f59a3121d7dd254c5596b843bad --- /dev/null +++ b/datasets/red_pajama_v2/histogram/fasttext_en/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0eb496159ba6456f7f07556917828f2c213086699d45d5d7a0acd8dd4d3e34eb +size 2120 diff --git a/datasets/red_pajama_v2/histogram/length/metric.json b/datasets/red_pajama_v2/histogram/length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..7f2106b33396ef71b2bb06e01d86b68f676e2573 --- /dev/null +++ b/datasets/red_pajama_v2/histogram/length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f75412adf4d39b435d63395ff62e2b90746d83bf85e4959cf65adab8d02a372 +size 2979840 diff --git a/datasets/red_pajama_v2/histogram/line_char_duplicates/metric.json b/datasets/red_pajama_v2/histogram/line_char_duplicates/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..ca05e2826cc8f54277ff41d63ed7ecf8f490aae7 --- /dev/null +++ b/datasets/red_pajama_v2/histogram/line_char_duplicates/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93d66e5f7ccb4e90bc7ecefeef5e6e9fd4dc108149b9716389a954aa9a3c7713 +size 43 diff --git a/datasets/red_pajama_v2/histogram/line_duplicates/metric.json b/datasets/red_pajama_v2/histogram/line_duplicates/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..ca05e2826cc8f54277ff41d63ed7ecf8f490aae7 --- /dev/null +++ b/datasets/red_pajama_v2/histogram/line_duplicates/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93d66e5f7ccb4e90bc7ecefeef5e6e9fd4dc108149b9716389a954aa9a3c7713 +size 43 diff --git a/datasets/red_pajama_v2/histogram/lines_ending_with_terminal_mark_ratio/metric.json b/datasets/red_pajama_v2/histogram/lines_ending_with_terminal_mark_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..8356bb147b4682a2df8d6f797f914eb0c5656599 --- /dev/null +++ b/datasets/red_pajama_v2/histogram/lines_ending_with_terminal_mark_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67f98008a2146a719789126f0603531161eeea8bd164be36092bf9143507f26e +size 41386 diff --git a/datasets/red_pajama_v2/histogram/long_line_ratio_chars_10000/metric.json b/datasets/red_pajama_v2/histogram/long_line_ratio_chars_10000/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..c33cbc40ad519770d64e0d587977f4df9e5c0757 --- /dev/null +++ b/datasets/red_pajama_v2/histogram/long_line_ratio_chars_10000/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d60d1b5c8a9e37213da61b976bab70752d6a98ee9d845113101a77a13f88182 +size 7601 diff --git a/datasets/red_pajama_v2/histogram/long_line_ratio_chars_2000/metric.json b/datasets/red_pajama_v2/histogram/long_line_ratio_chars_2000/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..cc6a90f2c4c7a98b5d8aa1d0ec4c2bec6181149e --- /dev/null +++ b/datasets/red_pajama_v2/histogram/long_line_ratio_chars_2000/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e04d4f669b3ae0c53ce3fb55521f9e9094456d27a4b7da9d4f20f46b3778121 +size 20820 diff --git a/datasets/red_pajama_v2/histogram/long_sentence_ratio_75/metric.json b/datasets/red_pajama_v2/histogram/long_sentence_ratio_75/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..45e04cf089a8ed8e968576f95822b40c7e817888 --- /dev/null +++ b/datasets/red_pajama_v2/histogram/long_sentence_ratio_75/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:184b62a1244b6b6fb332c7621e18b4e1211a87c2cc80c802463248f2244a31ed +size 41297 diff --git a/datasets/red_pajama_v2/histogram/long_word_ratio_7/metric.json b/datasets/red_pajama_v2/histogram/long_word_ratio_7/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..82ed799f25e5bfc03bae9c35b53f73054e76301a --- /dev/null +++ b/datasets/red_pajama_v2/histogram/long_word_ratio_7/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50ee751f5bcf59f21f24694bc028626b66bab873c162ca5ed444da3b8ad444e0 +size 30654 diff --git a/datasets/red_pajama_v2/histogram/n_lines/metric.json b/datasets/red_pajama_v2/histogram/n_lines/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..ef7788f51338629eb5b5423301750632f0107ec5 --- /dev/null +++ b/datasets/red_pajama_v2/histogram/n_lines/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8ec9c011f098e1d9508a8d76d2db5b3b2dc71111ddb03a30b27d12b70732da5 +size 88314 diff --git a/datasets/red_pajama_v2/histogram/n_sentences/metric.json b/datasets/red_pajama_v2/histogram/n_sentences/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..10501ea5c0f067e367d147f4dccdc7cbaa84a203 --- /dev/null +++ b/datasets/red_pajama_v2/histogram/n_sentences/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcb1acb87ec9bf1f137fe4b44a59bc3b1e58806641c89bced1fc04e0a292204d +size 126460 diff --git a/datasets/red_pajama_v2/histogram/n_words/metric.json b/datasets/red_pajama_v2/histogram/n_words/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..d3925afcc10ee2cc4d5829f3de9e1f53145560f2 --- /dev/null +++ b/datasets/red_pajama_v2/histogram/n_words/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4625d9032ece1fe4b404be62d7fdbc2e917acf732f39e9bee478b842b5d6d4ff +size 1021482 diff --git a/datasets/red_pajama_v2/histogram/non_alpha_digit_ratio/metric.json b/datasets/red_pajama_v2/histogram/non_alpha_digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..32935166352b12cd8bec4c3aeae81505c48fe96b --- /dev/null +++ b/datasets/red_pajama_v2/histogram/non_alpha_digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe1fc7b12d9aa6629fed1217ab1d0c24bbcfa1f8af59afe921a0709afd200f7f +size 29892 diff --git a/datasets/red_pajama_v2/histogram/punctuation_ratio/metric.json b/datasets/red_pajama_v2/histogram/punctuation_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..5666cec44634611c5573c72831dad071585f0b5d --- /dev/null +++ b/datasets/red_pajama_v2/histogram/punctuation_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c6c45be7f6034f6bb260bfa90490acd644495d894b038ec207654f308af5c16 +size 23628 diff --git a/datasets/red_pajama_v2/histogram/short_line_ratio_chars_10/metric.json b/datasets/red_pajama_v2/histogram/short_line_ratio_chars_10/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..93085542c52ba6585e0d0fb4fc63466eb4b57576 --- /dev/null +++ b/datasets/red_pajama_v2/histogram/short_line_ratio_chars_10/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f7b6c83e5b41fec2b9b1c2eda538cbbf269faea51fdf02728ca6b1f8d6d83c7 +size 26378 diff --git a/datasets/red_pajama_v2/histogram/short_line_ratio_chars_30/metric.json b/datasets/red_pajama_v2/histogram/short_line_ratio_chars_30/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..708989edb02e537f9d3f750cdadabe77b70c8ed7 --- /dev/null +++ b/datasets/red_pajama_v2/histogram/short_line_ratio_chars_30/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36dfc87d28e998a12c3b30110630b209218180aa78e224c12aa487024bb8041a +size 40890 diff --git a/datasets/red_pajama_v2/histogram/short_sentence_ratio_20/metric.json b/datasets/red_pajama_v2/histogram/short_sentence_ratio_20/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..24d4a530acfa13743608390518cd71b2b0652406 --- /dev/null +++ b/datasets/red_pajama_v2/histogram/short_sentence_ratio_20/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb5ba7f3431360b1f7c647181a0848018aa3c4127a4838e3205be36f93f40ea8 +size 39736 diff --git a/datasets/red_pajama_v2/histogram/short_word_ratio_3/metric.json b/datasets/red_pajama_v2/histogram/short_word_ratio_3/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..8729753b3dee01992baab297f5c8523caef79392 --- /dev/null +++ b/datasets/red_pajama_v2/histogram/short_word_ratio_3/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4c29983edadac68cd66fe7b26dfc96482c7f0c14ca3f232172d4c7f50470a60 +size 39909 diff --git a/datasets/red_pajama_v2/histogram/stop_word_ratio/metric.json b/datasets/red_pajama_v2/histogram/stop_word_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..4578aaa5ff68ecba04391cb9f090917b1c174c38 --- /dev/null +++ b/datasets/red_pajama_v2/histogram/stop_word_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3f94f16f23bd138c7bdb9412b99e36e4fd9142710c1ec2b259a3d090922ee2b +size 16254 diff --git a/datasets/red_pajama_v2/histogram/type_token_ratio/metric.json b/datasets/red_pajama_v2/histogram/type_token_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..c25b05845b2f40d7fe7713753fd74ee19dd5b7ae --- /dev/null +++ b/datasets/red_pajama_v2/histogram/type_token_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:944a6b447eeaae829872afd15cf03f42de5a14a0f28acf744d67c76d6ee6e2e1 +size 41022 diff --git a/datasets/red_pajama_v2/histogram/uppercase_ratio/metric.json b/datasets/red_pajama_v2/histogram/uppercase_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..e7280164bfe76d6b926fb573d0e68a474a489801 --- /dev/null +++ b/datasets/red_pajama_v2/histogram/uppercase_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bff7ab8a038dad3879f9ac2547f73e483bf2de75137ca35328a506dec1eb5d9 +size 34138 diff --git a/datasets/red_pajama_v2/histogram/white_space_ratio/metric.json b/datasets/red_pajama_v2/histogram/white_space_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..bd189b8a1a83fd124775415b2514a0c1a9653df3 --- /dev/null +++ b/datasets/red_pajama_v2/histogram/white_space_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:674d05330a250bae90dc9140e391151df6703bc4854d6556ff7b39d228c36bfc +size 22054 diff --git a/datasets/red_pajama_v2/none/avg_line_length/metric.json b/datasets/red_pajama_v2/none/avg_line_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..0ad2c9e3169b947c7b4a7ccc2758ca3c4bc5a97a --- /dev/null +++ b/datasets/red_pajama_v2/none/avg_line_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99bf408fd168a234f69c66472e84bcfd9100d25da4c5bde1c6fedb51d690abdd +size 192 diff --git a/datasets/red_pajama_v2/none/avg_word_length/metric.json b/datasets/red_pajama_v2/none/avg_word_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..5c59972e735cc14490e2d9b8dc84cf2a729beeb8 --- /dev/null +++ b/datasets/red_pajama_v2/none/avg_word_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e1c7d1c9c63504f7dc687931e4b052a5834e73a967e18810e6e4d83b2495024 +size 188 diff --git a/datasets/red_pajama_v2/none/avg_words_per_line/metric.json b/datasets/red_pajama_v2/none/avg_words_per_line/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..e94f4655d3406a8a7d31fe3db2a5583be2d48a45 --- /dev/null +++ b/datasets/red_pajama_v2/none/avg_words_per_line/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6be67ab0cbde6963fa843b0a7754f4bd03cd970f8cb141c8b3bf118a7144511 +size 188 diff --git a/datasets/red_pajama_v2/none/digit_ratio/metric.json b/datasets/red_pajama_v2/none/digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..699fd2a55fc2f443f871b4736afe7762109b5fbb --- /dev/null +++ b/datasets/red_pajama_v2/none/digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59671076d094d070decbe26d866f5f8cf87625dcb6c4743ea3bfb5037d247190 +size 206 diff --git a/datasets/red_pajama_v2/none/length/metric.json b/datasets/red_pajama_v2/none/length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..6b9ca28a23d0e7c9f886dab4403884cf4912cb6c --- /dev/null +++ b/datasets/red_pajama_v2/none/length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46039db109b384e7042e3c55978c9c3cd2eed3df0d2c876d38ec1b2303b24c11 +size 181 diff --git a/datasets/red_pajama_v2/none/lines_ending_with_terminal_mark_ratio/metric.json b/datasets/red_pajama_v2/none/lines_ending_with_terminal_mark_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..1c84eeb5b8aad3c8e31f641e0f5746e1c8e9ac2d --- /dev/null +++ b/datasets/red_pajama_v2/none/lines_ending_with_terminal_mark_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2aa0b151dddea7a76e2f97acbd7adafa638f43945d21e6a8d301102460ad0b12 +size 188 diff --git a/datasets/red_pajama_v2/none/long_line_ratio_chars_10000/metric.json b/datasets/red_pajama_v2/none/long_line_ratio_chars_10000/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..e0d053c407519006a75dca1791405fd1f32d5f03 --- /dev/null +++ b/datasets/red_pajama_v2/none/long_line_ratio_chars_10000/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:959aece966c5b9241a7d46b70c8e019fdbd4800f89d9213785b3fbd4e8bb7d7d +size 194 diff --git a/datasets/red_pajama_v2/none/long_line_ratio_chars_2000/metric.json b/datasets/red_pajama_v2/none/long_line_ratio_chars_2000/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..06c74649ea96659ed707e412978c27b1cafab25c --- /dev/null +++ b/datasets/red_pajama_v2/none/long_line_ratio_chars_2000/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:322f555d51fa22597bd65b1df1f4e460883cfb947c6bb3fa1eb39b9ad026cc82 +size 191 diff --git a/datasets/red_pajama_v2/none/long_word_ratio_7/metric.json b/datasets/red_pajama_v2/none/long_word_ratio_7/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..57254841d2d93aa14d0d79d717d95cb5c93750cf --- /dev/null +++ b/datasets/red_pajama_v2/none/long_word_ratio_7/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb30104ef22a341e4e3fd3eea705db96783e5758ea5630f0138ec89ac2101d76 +size 189 diff --git a/datasets/red_pajama_v2/none/n_lines/metric.json b/datasets/red_pajama_v2/none/n_lines/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..7aa815c2baf4735fa61a7d49bb1d8375401c3c49 --- /dev/null +++ b/datasets/red_pajama_v2/none/n_lines/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b57a9cb8d3db184ae00007bdad45871222a6f6ccefd9d9534c87175ad68fec8 +size 174 diff --git a/datasets/red_pajama_v2/none/n_words/metric.json b/datasets/red_pajama_v2/none/n_words/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..60d9bfb7268dfdc12ecafa91c46077b19175ead0 --- /dev/null +++ b/datasets/red_pajama_v2/none/n_words/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9655978b2e83b8dfcabdb0e76e7036938cb65b3f0bb68f6e0ef0654babd345c6 +size 179 diff --git a/datasets/red_pajama_v2/none/non_alpha_digit_ratio/metric.json b/datasets/red_pajama_v2/none/non_alpha_digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..d3ea712ea0bcfc6fed45fec8ef4b230f05f68453 --- /dev/null +++ b/datasets/red_pajama_v2/none/non_alpha_digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8388dbbcbb1175ae975a49011d132b7728b9c65dcb5885dfc1dd48a3c6ee8083 +size 190 diff --git a/datasets/red_pajama_v2/none/short_line_ratio_chars_10/metric.json b/datasets/red_pajama_v2/none/short_line_ratio_chars_10/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..bda599f917e6916431883a7c86bede386785f58a --- /dev/null +++ b/datasets/red_pajama_v2/none/short_line_ratio_chars_10/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed3999291d917bda550fa81a824406efb50e44985dde5df95a55f0c0e4c64c5f +size 192 diff --git a/datasets/red_pajama_v2/none/short_line_ratio_chars_30/metric.json b/datasets/red_pajama_v2/none/short_line_ratio_chars_30/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..4b09a6a9591e880e1cca0b1ebfc9f414a3615237 --- /dev/null +++ b/datasets/red_pajama_v2/none/short_line_ratio_chars_30/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fde9302f7fa82f571a6766dc57eebd9ffc1c85c1622277c14a38662d302fa9fa +size 190 diff --git a/datasets/red_pajama_v2/none/short_word_ratio_3/metric.json b/datasets/red_pajama_v2/none/short_word_ratio_3/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..a7f5c5dba0cfd5c36fd811ca2c96daca1158afcb --- /dev/null +++ b/datasets/red_pajama_v2/none/short_word_ratio_3/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16706ccd2b7f6a9517ffa803c8f8bc34a6c9d4d5f826626fa51990cc98809d62 +size 191 diff --git a/datasets/red_pajama_v2/none/white_space_ratio/metric.json b/datasets/red_pajama_v2/none/white_space_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..0c0443e6e96ce7fa6a53a93a0aee8cfd5281a304 --- /dev/null +++ b/datasets/red_pajama_v2/none/white_space_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a588963db9bce8f31b4fd439eedd67a91f37ae47d97c2962f4033f597a69e96 +size 207 diff --git a/datasets/red_pajama_v2/suffix/avg_line_length/metric.json b/datasets/red_pajama_v2/suffix/avg_line_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..550efadcf62885098d742dc3f13a547d71026498 --- /dev/null +++ b/datasets/red_pajama_v2/suffix/avg_line_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e89212908754271c18a22a7bc5ccfdef1e7dc54087084b4d94466107d4ada07 +size 295763 diff --git a/datasets/red_pajama_v2/suffix/avg_sentence_length/metric.json b/datasets/red_pajama_v2/suffix/avg_sentence_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..67e5c58581a4304849776e6fdea638fc3d16c9bc --- /dev/null +++ b/datasets/red_pajama_v2/suffix/avg_sentence_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:164657c052a381092c8d72587fd7def4164428f222a0eecae152cbc63800b6ed +size 293925 diff --git a/datasets/red_pajama_v2/suffix/avg_word_length/metric.json b/datasets/red_pajama_v2/suffix/avg_word_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..e3c3c439836e3aa8e96ee3dcbe80d00cfc45497f --- /dev/null +++ b/datasets/red_pajama_v2/suffix/avg_word_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4fcfe634d9a49283a04a9031bad0b5defaa27ac166ecfe3e074de88060e65242 +size 315697 diff --git a/datasets/red_pajama_v2/suffix/avg_words_per_line/metric.json b/datasets/red_pajama_v2/suffix/avg_words_per_line/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..04409eb1dbd3ae3e179a5ae7ace4f22ac8a515f4 --- /dev/null +++ b/datasets/red_pajama_v2/suffix/avg_words_per_line/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38612ae4e042ae9eb25f83102508250c968d9eabd3933fcdde6ebfaa94b42386 +size 294813 diff --git a/datasets/red_pajama_v2/suffix/capitalized_ratio/metric.json b/datasets/red_pajama_v2/suffix/capitalized_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..de3e9364b8ead5b489c9af46696cb760e7af76d4 --- /dev/null +++ b/datasets/red_pajama_v2/suffix/capitalized_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9cade082e45138a904182b847c2f36e333bf6d1045baf3d95ae104f95f39fd07 +size 324013 diff --git a/datasets/red_pajama_v2/suffix/ccnet_perplexity_wikipedia_en/metric.json b/datasets/red_pajama_v2/suffix/ccnet_perplexity_wikipedia_en/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..51fbd7ee0ea8a9e9ab3edb68ac5242dfd3eabf45 --- /dev/null +++ b/datasets/red_pajama_v2/suffix/ccnet_perplexity_wikipedia_en/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3495d17df0535177a3422626973754fdce36ea29810995c69c4c8cb102c04a9 +size 266552 diff --git a/datasets/red_pajama_v2/suffix/digit_ratio/metric.json b/datasets/red_pajama_v2/suffix/digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..3995bf30a0f118160605f4f3e6104ef87ef31ae8 --- /dev/null +++ b/datasets/red_pajama_v2/suffix/digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d209f38f54b0b45bb7c2248ad9d589e1c8b010058353cabbbf58d92b2550c1a +size 317884 diff --git a/datasets/red_pajama_v2/suffix/elipsis_ratio/metric.json b/datasets/red_pajama_v2/suffix/elipsis_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..5f7c033835c8c0f7ece459dd5458b91ba804c1c4 --- /dev/null +++ b/datasets/red_pajama_v2/suffix/elipsis_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d043c474fed63126439abdce4a70d72f9ec55838adebc3a8a7ad9e77c22db179 +size 243024 diff --git a/datasets/red_pajama_v2/suffix/fasttext_en/metric.json b/datasets/red_pajama_v2/suffix/fasttext_en/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..6c4959a00a9cf5acf97ea13e875b6976c2758b4b --- /dev/null +++ b/datasets/red_pajama_v2/suffix/fasttext_en/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b487a947950c756247a18ba9af910a25016b5bd4ca2d3f1a3c6fad40414c16d +size 268118 diff --git a/datasets/red_pajama_v2/suffix/length/metric.json b/datasets/red_pajama_v2/suffix/length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..eede9b067586fe81ba74bf91574d1d3292ce9f34 --- /dev/null +++ b/datasets/red_pajama_v2/suffix/length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:826e14fd3db21e2b4036b990ad06342ca9da29eec5a16027e72fe638ed1a27fa +size 253537 diff --git a/datasets/red_pajama_v2/suffix/line_char_duplicates/metric.json b/datasets/red_pajama_v2/suffix/line_char_duplicates/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..fdc2cf7e62758c5c071cdcc6e8c4f9a1e49dba9e --- /dev/null +++ b/datasets/red_pajama_v2/suffix/line_char_duplicates/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85d74adce829869e44bac4b8acc1e8ffa3ae74ef80a3c453e1def95e59f1cbb5 +size 22832 diff --git a/datasets/red_pajama_v2/suffix/line_duplicates/metric.json b/datasets/red_pajama_v2/suffix/line_duplicates/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..fdc2cf7e62758c5c071cdcc6e8c4f9a1e49dba9e --- /dev/null +++ b/datasets/red_pajama_v2/suffix/line_duplicates/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85d74adce829869e44bac4b8acc1e8ffa3ae74ef80a3c453e1def95e59f1cbb5 +size 22832 diff --git a/datasets/red_pajama_v2/suffix/lines_ending_with_terminal_mark_ratio/metric.json b/datasets/red_pajama_v2/suffix/lines_ending_with_terminal_mark_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..feb25a2d30b3ba03b6454a03d66bd07c7252c971 --- /dev/null +++ b/datasets/red_pajama_v2/suffix/lines_ending_with_terminal_mark_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14265fd93ffc8bb13ecc0b74e8ac6d2eadfbb69b31dffda06b82ad1d737f0971 +size 290138 diff --git a/datasets/red_pajama_v2/suffix/long_line_ratio_chars_10000/metric.json b/datasets/red_pajama_v2/suffix/long_line_ratio_chars_10000/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..db1c998a21b9fce67f01eb23acca084dac9846af --- /dev/null +++ b/datasets/red_pajama_v2/suffix/long_line_ratio_chars_10000/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a190f43a0f7697c777197e69aeef0763173ba0e0ce2232415b67ee127764372 +size 66161 diff --git a/datasets/red_pajama_v2/suffix/long_line_ratio_chars_2000/metric.json b/datasets/red_pajama_v2/suffix/long_line_ratio_chars_2000/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..899bc8c3d6ed6aca32bca511d2dca10ec334bf08 --- /dev/null +++ b/datasets/red_pajama_v2/suffix/long_line_ratio_chars_2000/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59e175d970e70db6dde69f9da9f387241e3c9b232a7fc2fff2c3709192050a8d +size 131598 diff --git a/datasets/red_pajama_v2/suffix/long_sentence_ratio_75/metric.json b/datasets/red_pajama_v2/suffix/long_sentence_ratio_75/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..5b4b2d507474f01ef63539d4372736e765ddc7b9 --- /dev/null +++ b/datasets/red_pajama_v2/suffix/long_sentence_ratio_75/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0eda8bc4bca5e418bf0e50597a8c156b3e0baaa0c637fb7da9f2ee28ad9e0850 +size 286823 diff --git a/datasets/red_pajama_v2/suffix/long_word_ratio_7/metric.json b/datasets/red_pajama_v2/suffix/long_word_ratio_7/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..60cf7d15722b0b87eea1bdac20269852b10a75ee --- /dev/null +++ b/datasets/red_pajama_v2/suffix/long_word_ratio_7/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:964f2be43d286003466fba373fc2b56a062ec36181da6f44dcd77e6b98d7660e +size 322960 diff --git a/datasets/red_pajama_v2/suffix/n_lines/metric.json b/datasets/red_pajama_v2/suffix/n_lines/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..9be5f973b073ebc62b91831f015e6e8c7ea864d2 --- /dev/null +++ b/datasets/red_pajama_v2/suffix/n_lines/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d9ab05211eac3e4c0f7fc9784bc8c3e270932ff4c94e37c90a6cb5976e2c3cf +size 238701 diff --git a/datasets/red_pajama_v2/suffix/n_sentences/metric.json b/datasets/red_pajama_v2/suffix/n_sentences/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..4089c90a4a081eafddd16eb09bc36ff71027ad0c --- /dev/null +++ b/datasets/red_pajama_v2/suffix/n_sentences/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a46949982c69c440977078a739d3f9f62dbe3d1bf200cbf9b21bc7bcd9cc218 +size 238960 diff --git a/datasets/red_pajama_v2/suffix/n_words/metric.json b/datasets/red_pajama_v2/suffix/n_words/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..9f6d1cfbc893882a0be3e3d75b8fab487b2b9ce2 --- /dev/null +++ b/datasets/red_pajama_v2/suffix/n_words/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aea9414f6f725df67b3fdbd8779a8bcab16e57b44529ff274699ef6d7605e044 +size 248647 diff --git a/datasets/red_pajama_v2/suffix/non_alpha_digit_ratio/metric.json b/datasets/red_pajama_v2/suffix/non_alpha_digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..4b7141db8c0d57bf5c85c223c4bc71539ede9678 --- /dev/null +++ b/datasets/red_pajama_v2/suffix/non_alpha_digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:115f40280ffbc77a6d72a069c190051bef287ebbe6ff5778c458c1927ccb1e3c +size 334169 diff --git a/datasets/red_pajama_v2/suffix/punctuation_ratio/metric.json b/datasets/red_pajama_v2/suffix/punctuation_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..ae7e8db6cefa4ce3373532baa86788e17510e243 --- /dev/null +++ b/datasets/red_pajama_v2/suffix/punctuation_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2fca9035097fb31da1c69cf64dbb445535ad645eb57a21fcfa6f96870cae1cd0 +size 334736 diff --git a/datasets/red_pajama_v2/suffix/short_line_ratio_chars_10/metric.json b/datasets/red_pajama_v2/suffix/short_line_ratio_chars_10/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..0203a595f027c4edc4bb7edd2d83de19849acd3a --- /dev/null +++ b/datasets/red_pajama_v2/suffix/short_line_ratio_chars_10/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0fd414d77a9a9293cff56692204735978a2eed844e08bf450ab63e313f73a88 +size 233806 diff --git a/datasets/red_pajama_v2/suffix/short_line_ratio_chars_30/metric.json b/datasets/red_pajama_v2/suffix/short_line_ratio_chars_30/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..f9e9f446eb8d10a4bdbfcf4dcbf7e41d9016348d --- /dev/null +++ b/datasets/red_pajama_v2/suffix/short_line_ratio_chars_30/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2501b3f55a47a02772f2b8e81b00939865f6e4be0f51241f2aea86031e81fa3 +size 286060 diff --git a/datasets/red_pajama_v2/suffix/short_sentence_ratio_20/metric.json b/datasets/red_pajama_v2/suffix/short_sentence_ratio_20/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..1d314339fafad2959c7c40dfb3e60761ec2906aa --- /dev/null +++ b/datasets/red_pajama_v2/suffix/short_sentence_ratio_20/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:548b96058f87acb16fadab262baf50eb4636c55ab04d94cdd0a10445afc65370 +size 254100 diff --git a/datasets/red_pajama_v2/suffix/short_word_ratio_3/metric.json b/datasets/red_pajama_v2/suffix/short_word_ratio_3/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..c013d70dcbcbd690e4377146477a7271ceccfa7f --- /dev/null +++ b/datasets/red_pajama_v2/suffix/short_word_ratio_3/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a416336cf8fce29af43a3081ec883896d3bc8aed0eab584f64f6026e16be606 +size 324248 diff --git a/datasets/red_pajama_v2/suffix/stop_word_ratio/metric.json b/datasets/red_pajama_v2/suffix/stop_word_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..0ce78987d854fa7941d82d6be7ea2fc222c879c6 --- /dev/null +++ b/datasets/red_pajama_v2/suffix/stop_word_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bbb79ecd4d5f15230dc8c829044de4c7b6b40046db58a0d094746e328bb13b64 +size 319158 diff --git a/datasets/red_pajama_v2/suffix/type_token_ratio/metric.json b/datasets/red_pajama_v2/suffix/type_token_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..50ea73837690de4e15273f8bab4a3c61fbb8be45 --- /dev/null +++ b/datasets/red_pajama_v2/suffix/type_token_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d7b32172ea00947f322ce675a38d72f1ccfdbbc66473a24e6bb7acdbe490a7e +size 317835 diff --git a/datasets/red_pajama_v2/suffix/uppercase_ratio/metric.json b/datasets/red_pajama_v2/suffix/uppercase_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..1b984593c76284cce0c6e3b43e37e63d661d400b --- /dev/null +++ b/datasets/red_pajama_v2/suffix/uppercase_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5f5ae422744d82f509e72c1b22dd77ba02568d0b18976a4f666a5d8aae8cffc +size 334942 diff --git a/datasets/red_pajama_v2/suffix/white_space_ratio/metric.json b/datasets/red_pajama_v2/suffix/white_space_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..67f1185be2caa3533314e9365be4561384664b4c --- /dev/null +++ b/datasets/red_pajama_v2/suffix/white_space_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74a4fc0178e14288ba29b63d5c5034015fbe3bb9ce8b2cac557fa56b3dcc63f5 +size 333208 diff --git a/datasets/red_pajama_v2/summary/avg_line_length/metric.json b/datasets/red_pajama_v2/summary/avg_line_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..0ad2c9e3169b947c7b4a7ccc2758ca3c4bc5a97a --- /dev/null +++ b/datasets/red_pajama_v2/summary/avg_line_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99bf408fd168a234f69c66472e84bcfd9100d25da4c5bde1c6fedb51d690abdd +size 192 diff --git a/datasets/red_pajama_v2/summary/avg_sentence_length/metric.json b/datasets/red_pajama_v2/summary/avg_sentence_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..d99e6c2ea79c6977aeaffa732663fcf488c6320a --- /dev/null +++ b/datasets/red_pajama_v2/summary/avg_sentence_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46e5c5462f6bef00e429b115d5bea8e6121557bcfeb938ce98c299870be6d6bb +size 191 diff --git a/datasets/red_pajama_v2/summary/avg_word_length/metric.json b/datasets/red_pajama_v2/summary/avg_word_length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..5c59972e735cc14490e2d9b8dc84cf2a729beeb8 --- /dev/null +++ b/datasets/red_pajama_v2/summary/avg_word_length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e1c7d1c9c63504f7dc687931e4b052a5834e73a967e18810e6e4d83b2495024 +size 188 diff --git a/datasets/red_pajama_v2/summary/avg_words_per_line/metric.json b/datasets/red_pajama_v2/summary/avg_words_per_line/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..e94f4655d3406a8a7d31fe3db2a5583be2d48a45 --- /dev/null +++ b/datasets/red_pajama_v2/summary/avg_words_per_line/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6be67ab0cbde6963fa843b0a7754f4bd03cd970f8cb141c8b3bf118a7144511 +size 188 diff --git a/datasets/red_pajama_v2/summary/capitalized_ratio/metric.json b/datasets/red_pajama_v2/summary/capitalized_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..4e64f5844fcc940a8a73621d6124189bd52433c0 --- /dev/null +++ b/datasets/red_pajama_v2/summary/capitalized_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9358318e0d5302e38b66304aa3039dc4d8932a2bd583ed6883f10efec823904c +size 190 diff --git a/datasets/red_pajama_v2/summary/ccnet_perplexity_wikipedia_en/metric.json b/datasets/red_pajama_v2/summary/ccnet_perplexity_wikipedia_en/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..209465f147c917dcc1fdf04e6b31ee1d912e6c77 --- /dev/null +++ b/datasets/red_pajama_v2/summary/ccnet_perplexity_wikipedia_en/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a388cf56bcf2eca1b75ec9b4ea1c1facb62622003123ff49202e0226ea8a8fa +size 191 diff --git a/datasets/red_pajama_v2/summary/digit_ratio/metric.json b/datasets/red_pajama_v2/summary/digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..699fd2a55fc2f443f871b4736afe7762109b5fbb --- /dev/null +++ b/datasets/red_pajama_v2/summary/digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59671076d094d070decbe26d866f5f8cf87625dcb6c4743ea3bfb5037d247190 +size 206 diff --git a/datasets/red_pajama_v2/summary/elipsis_ratio/metric.json b/datasets/red_pajama_v2/summary/elipsis_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..9525537457aaa301c06a50821c76a99a26938454 --- /dev/null +++ b/datasets/red_pajama_v2/summary/elipsis_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2565704d1feff19317c77597efd97c3ac7b896c52d01eb79df1600329201698f +size 193 diff --git a/datasets/red_pajama_v2/summary/fasttext_en/metric.json b/datasets/red_pajama_v2/summary/fasttext_en/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..c0b24d5745ac255ef8d52b8134e3e7c9ad658648 --- /dev/null +++ b/datasets/red_pajama_v2/summary/fasttext_en/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d28f665ff907200e4f88678c84969b9c5de86e31df924f18b39a4904b763adc6 +size 188 diff --git a/datasets/red_pajama_v2/summary/length/metric.json b/datasets/red_pajama_v2/summary/length/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..6b9ca28a23d0e7c9f886dab4403884cf4912cb6c --- /dev/null +++ b/datasets/red_pajama_v2/summary/length/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46039db109b384e7042e3c55978c9c3cd2eed3df0d2c876d38ec1b2303b24c11 +size 181 diff --git a/datasets/red_pajama_v2/summary/line_char_duplicates/metric.json b/datasets/red_pajama_v2/summary/line_char_duplicates/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..1ea0e0d75aebca9088e8f2419632dcd45e44abab --- /dev/null +++ b/datasets/red_pajama_v2/summary/line_char_duplicates/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a6f01c247cd19807f8b10d02d1a3d68e814635d04dd59513b07607284c24c5c +size 14 diff --git a/datasets/red_pajama_v2/summary/line_duplicates/metric.json b/datasets/red_pajama_v2/summary/line_duplicates/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..1ea0e0d75aebca9088e8f2419632dcd45e44abab --- /dev/null +++ b/datasets/red_pajama_v2/summary/line_duplicates/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a6f01c247cd19807f8b10d02d1a3d68e814635d04dd59513b07607284c24c5c +size 14 diff --git a/datasets/red_pajama_v2/summary/lines_ending_with_terminal_mark_ratio/metric.json b/datasets/red_pajama_v2/summary/lines_ending_with_terminal_mark_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..1c84eeb5b8aad3c8e31f641e0f5746e1c8e9ac2d --- /dev/null +++ b/datasets/red_pajama_v2/summary/lines_ending_with_terminal_mark_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2aa0b151dddea7a76e2f97acbd7adafa638f43945d21e6a8d301102460ad0b12 +size 188 diff --git a/datasets/red_pajama_v2/summary/long_line_ratio_chars_10000/metric.json b/datasets/red_pajama_v2/summary/long_line_ratio_chars_10000/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..e0d053c407519006a75dca1791405fd1f32d5f03 --- /dev/null +++ b/datasets/red_pajama_v2/summary/long_line_ratio_chars_10000/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:959aece966c5b9241a7d46b70c8e019fdbd4800f89d9213785b3fbd4e8bb7d7d +size 194 diff --git a/datasets/red_pajama_v2/summary/long_line_ratio_chars_2000/metric.json b/datasets/red_pajama_v2/summary/long_line_ratio_chars_2000/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..06c74649ea96659ed707e412978c27b1cafab25c --- /dev/null +++ b/datasets/red_pajama_v2/summary/long_line_ratio_chars_2000/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:322f555d51fa22597bd65b1df1f4e460883cfb947c6bb3fa1eb39b9ad026cc82 +size 191 diff --git a/datasets/red_pajama_v2/summary/long_sentence_ratio_75/metric.json b/datasets/red_pajama_v2/summary/long_sentence_ratio_75/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..48437e3bfb3dd8e5164c0635b58ea02dd15d3651 --- /dev/null +++ b/datasets/red_pajama_v2/summary/long_sentence_ratio_75/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc386afafef0fa04d75fb62a913c47ed609bf59a158cd64924070e59668298ce +size 188 diff --git a/datasets/red_pajama_v2/summary/long_word_ratio_7/metric.json b/datasets/red_pajama_v2/summary/long_word_ratio_7/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..57254841d2d93aa14d0d79d717d95cb5c93750cf --- /dev/null +++ b/datasets/red_pajama_v2/summary/long_word_ratio_7/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb30104ef22a341e4e3fd3eea705db96783e5758ea5630f0138ec89ac2101d76 +size 189 diff --git a/datasets/red_pajama_v2/summary/n_lines/metric.json b/datasets/red_pajama_v2/summary/n_lines/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..7aa815c2baf4735fa61a7d49bb1d8375401c3c49 --- /dev/null +++ b/datasets/red_pajama_v2/summary/n_lines/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b57a9cb8d3db184ae00007bdad45871222a6f6ccefd9d9534c87175ad68fec8 +size 174 diff --git a/datasets/red_pajama_v2/summary/n_sentences/metric.json b/datasets/red_pajama_v2/summary/n_sentences/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..aab58bfef97ae11d7eb51c5bdf913dcc3603c7d3 --- /dev/null +++ b/datasets/red_pajama_v2/summary/n_sentences/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e78bcb6e5025f1589923be841f7c7e25deee849ac028df2cab86fafad935dce +size 176 diff --git a/datasets/red_pajama_v2/summary/n_words/metric.json b/datasets/red_pajama_v2/summary/n_words/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..60d9bfb7268dfdc12ecafa91c46077b19175ead0 --- /dev/null +++ b/datasets/red_pajama_v2/summary/n_words/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9655978b2e83b8dfcabdb0e76e7036938cb65b3f0bb68f6e0ef0654babd345c6 +size 179 diff --git a/datasets/red_pajama_v2/summary/non_alpha_digit_ratio/metric.json b/datasets/red_pajama_v2/summary/non_alpha_digit_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..d3ea712ea0bcfc6fed45fec8ef4b230f05f68453 --- /dev/null +++ b/datasets/red_pajama_v2/summary/non_alpha_digit_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8388dbbcbb1175ae975a49011d132b7728b9c65dcb5885dfc1dd48a3c6ee8083 +size 190 diff --git a/datasets/red_pajama_v2/summary/punctuation_ratio/metric.json b/datasets/red_pajama_v2/summary/punctuation_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..9463b368f903b329cf6cf054b0c4d6945aca1be0 --- /dev/null +++ b/datasets/red_pajama_v2/summary/punctuation_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a1df7c7869567ba6de31de8be156ca29c2e1cfc4aa263b55ca4c39e66331629 +size 192 diff --git a/datasets/red_pajama_v2/summary/short_line_ratio_chars_10/metric.json b/datasets/red_pajama_v2/summary/short_line_ratio_chars_10/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..bda599f917e6916431883a7c86bede386785f58a --- /dev/null +++ b/datasets/red_pajama_v2/summary/short_line_ratio_chars_10/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed3999291d917bda550fa81a824406efb50e44985dde5df95a55f0c0e4c64c5f +size 192 diff --git a/datasets/red_pajama_v2/summary/short_line_ratio_chars_30/metric.json b/datasets/red_pajama_v2/summary/short_line_ratio_chars_30/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..4b09a6a9591e880e1cca0b1ebfc9f414a3615237 --- /dev/null +++ b/datasets/red_pajama_v2/summary/short_line_ratio_chars_30/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fde9302f7fa82f571a6766dc57eebd9ffc1c85c1622277c14a38662d302fa9fa +size 190 diff --git a/datasets/red_pajama_v2/summary/short_sentence_ratio_20/metric.json b/datasets/red_pajama_v2/summary/short_sentence_ratio_20/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..92d733b68f6b1bd6c27fca750a8ad7291d873a7c --- /dev/null +++ b/datasets/red_pajama_v2/summary/short_sentence_ratio_20/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e3413e7a3445381e7eb24a08999942f879c29120b18937b91d579a442ab4fd4 +size 190 diff --git a/datasets/red_pajama_v2/summary/short_word_ratio_3/metric.json b/datasets/red_pajama_v2/summary/short_word_ratio_3/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..a7f5c5dba0cfd5c36fd811ca2c96daca1158afcb --- /dev/null +++ b/datasets/red_pajama_v2/summary/short_word_ratio_3/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16706ccd2b7f6a9517ffa803c8f8bc34a6c9d4d5f826626fa51990cc98809d62 +size 191 diff --git a/datasets/red_pajama_v2/summary/stop_word_ratio/metric.json b/datasets/red_pajama_v2/summary/stop_word_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..4ad489d2ebf80332039ba3873703028bdadbb77e --- /dev/null +++ b/datasets/red_pajama_v2/summary/stop_word_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85e2afd1beeb09ba88691e57ed797fa8a67c4ef272851af4c07a98ac3eff085e +size 192 diff --git a/datasets/red_pajama_v2/summary/type_token_ratio/metric.json b/datasets/red_pajama_v2/summary/type_token_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..5831c4b0c86086c7be733d00281fc778c0ddd040 --- /dev/null +++ b/datasets/red_pajama_v2/summary/type_token_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a78ff4a484341347622d49725a96b5a737e496e6f834b603c08c4a0cf2d576b4 +size 208 diff --git a/datasets/red_pajama_v2/summary/uppercase_ratio/metric.json b/datasets/red_pajama_v2/summary/uppercase_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..535cccc711aff00e7d0e13b4ec37e44c1fe714ea --- /dev/null +++ b/datasets/red_pajama_v2/summary/uppercase_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b9e3ebb7339aa4a1f89334eec40d8f522ed330344df5a49c2440cdfc370dd18 +size 193 diff --git a/datasets/red_pajama_v2/summary/white_space_ratio/metric.json b/datasets/red_pajama_v2/summary/white_space_ratio/metric.json new file mode 100644 index 0000000000000000000000000000000000000000..0c0443e6e96ce7fa6a53a93a0aee8cfd5281a304 --- /dev/null +++ b/datasets/red_pajama_v2/summary/white_space_ratio/metric.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a588963db9bce8f31b4fd439eedd67a91f37ae47d97c2962f4033f597a69e96 +size 207