- avg_line_length
- avg_sentence_length
- avg_word_length
- avg_words_per_line
- capitalized_ratio
- ccnet_perplexity_wikipedia_en
- digit_ratio
- elipsis_ratio
- fasttext_en
- length
- line_char_duplicates
- line_duplicates
- lines_ending_with_terminal_mark_ratio
- long_line_ratio_chars_10000
- long_line_ratio_chars_2000
- long_sentence_ratio_75
- long_word_ratio_5
- long_word_ratio_7
- n_lines
- n_sentences
- n_words
- non_alpha_digit_ratio
- punctuation_ratio
- short_line_ratio_chars_10
- short_line_ratio_chars_30
- short_sentence_ratio_20
- short_word_ratio_3
- stop_word_ratio
- type_token_ratio
- uppercase_ratio
- white_space_ratio