Create readability pipeline.
Browse files- .ipynb_checkpoints/config-checkpoint.cfg +170 -0
- README.md +3 -4
- attribute_ruler/patterns +0 -0
- config.cfg +3 -2
- en_readability-any-py3-none-any.whl +2 -2
- meta.json +3 -2
- parser/model +1 -1
- parser/moves +1 -2
- tagger/cfg +1 -0
- tagger/model +2 -2
- tok2vec/model +2 -2
- tokenizer +0 -0
- vocab/strings.json +2 -2
.ipynb_checkpoints/config-checkpoint.cfg
ADDED
@@ -0,0 +1,170 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[paths]
|
2 |
+
train = null
|
3 |
+
dev = null
|
4 |
+
vectors = null
|
5 |
+
init_tok2vec = null
|
6 |
+
|
7 |
+
[system]
|
8 |
+
seed = 0
|
9 |
+
gpu_allocator = null
|
10 |
+
|
11 |
+
[nlp]
|
12 |
+
lang = "en"
|
13 |
+
pipeline = ["tok2vec","tagger","parser","attribute_ruler","readability"]
|
14 |
+
disabled = []
|
15 |
+
before_creation = null
|
16 |
+
after_creation = null
|
17 |
+
after_pipeline_creation = null
|
18 |
+
batch_size = 1000
|
19 |
+
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
|
20 |
+
|
21 |
+
[components]
|
22 |
+
|
23 |
+
[components.attribute_ruler]
|
24 |
+
factory = "attribute_ruler"
|
25 |
+
scorer = {"@scorers":"spacy.attribute_ruler_scorer.v1"}
|
26 |
+
validate = false
|
27 |
+
|
28 |
+
[components.parser]
|
29 |
+
factory = "parser"
|
30 |
+
learn_tokens = false
|
31 |
+
min_action_freq = 30
|
32 |
+
moves = null
|
33 |
+
scorer = {"@scorers":"spacy.parser_scorer.v1"}
|
34 |
+
update_with_oracle_cut_size = 100
|
35 |
+
|
36 |
+
[components.parser.model]
|
37 |
+
@architectures = "spacy.TransitionBasedParser.v2"
|
38 |
+
state_type = "parser"
|
39 |
+
extra_state_tokens = false
|
40 |
+
hidden_width = 64
|
41 |
+
maxout_pieces = 2
|
42 |
+
use_upper = true
|
43 |
+
nO = null
|
44 |
+
|
45 |
+
[components.parser.model.tok2vec]
|
46 |
+
@architectures = "spacy.Tok2VecListener.v1"
|
47 |
+
width = 96
|
48 |
+
upstream = "tok2vec"
|
49 |
+
|
50 |
+
[components.readability]
|
51 |
+
factory = "readability"
|
52 |
+
|
53 |
+
[components.tagger]
|
54 |
+
factory = "tagger"
|
55 |
+
neg_prefix = "!"
|
56 |
+
overwrite = false
|
57 |
+
scorer = {"@scorers":"spacy.tagger_scorer.v1"}
|
58 |
+
|
59 |
+
[components.tagger.model]
|
60 |
+
@architectures = "spacy.Tagger.v1"
|
61 |
+
nO = null
|
62 |
+
|
63 |
+
[components.tagger.model.tok2vec]
|
64 |
+
@architectures = "spacy.Tok2VecListener.v1"
|
65 |
+
width = 96
|
66 |
+
upstream = "tok2vec"
|
67 |
+
|
68 |
+
[components.tok2vec]
|
69 |
+
factory = "tok2vec"
|
70 |
+
|
71 |
+
[components.tok2vec.model]
|
72 |
+
@architectures = "spacy.Tok2Vec.v2"
|
73 |
+
|
74 |
+
[components.tok2vec.model.embed]
|
75 |
+
@architectures = "spacy.MultiHashEmbed.v2"
|
76 |
+
width = 96
|
77 |
+
attrs = ["NORM","PREFIX","SUFFIX","SHAPE","SPACY"]
|
78 |
+
rows = [5000,2500,2500,2500,100]
|
79 |
+
include_static_vectors = false
|
80 |
+
|
81 |
+
[components.tok2vec.model.encode]
|
82 |
+
@architectures = "spacy.MaxoutWindowEncoder.v2"
|
83 |
+
width = 96
|
84 |
+
depth = 4
|
85 |
+
window_size = 1
|
86 |
+
maxout_pieces = 3
|
87 |
+
|
88 |
+
[corpora]
|
89 |
+
|
90 |
+
[corpora.dev]
|
91 |
+
@readers = "spacy.Corpus.v1"
|
92 |
+
path = ${paths.dev}
|
93 |
+
gold_preproc = false
|
94 |
+
max_length = 0
|
95 |
+
limit = 0
|
96 |
+
augmenter = null
|
97 |
+
|
98 |
+
[corpora.train]
|
99 |
+
@readers = "spacy.Corpus.v1"
|
100 |
+
path = ${paths.train}
|
101 |
+
gold_preproc = false
|
102 |
+
max_length = 0
|
103 |
+
limit = 0
|
104 |
+
augmenter = null
|
105 |
+
|
106 |
+
[training]
|
107 |
+
seed = ${system.seed}
|
108 |
+
gpu_allocator = ${system.gpu_allocator}
|
109 |
+
dropout = 0.1
|
110 |
+
accumulate_gradient = 1
|
111 |
+
patience = 1600
|
112 |
+
max_epochs = 0
|
113 |
+
max_steps = 20000
|
114 |
+
eval_frequency = 200
|
115 |
+
frozen_components = []
|
116 |
+
annotating_components = []
|
117 |
+
dev_corpus = "corpora.dev"
|
118 |
+
train_corpus = "corpora.train"
|
119 |
+
before_to_disk = null
|
120 |
+
|
121 |
+
[training.batcher]
|
122 |
+
@batchers = "spacy.batch_by_words.v1"
|
123 |
+
discard_oversize = false
|
124 |
+
tolerance = 0.2
|
125 |
+
get_length = null
|
126 |
+
|
127 |
+
[training.batcher.size]
|
128 |
+
@schedules = "compounding.v1"
|
129 |
+
start = 100
|
130 |
+
stop = 1000
|
131 |
+
compound = 1.001
|
132 |
+
t = 0.0
|
133 |
+
|
134 |
+
[training.logger]
|
135 |
+
@loggers = "spacy.ConsoleLogger.v1"
|
136 |
+
progress_bar = false
|
137 |
+
|
138 |
+
[training.optimizer]
|
139 |
+
@optimizers = "Adam.v1"
|
140 |
+
beta1 = 0.9
|
141 |
+
beta2 = 0.999
|
142 |
+
L2_is_weight_decay = true
|
143 |
+
L2 = 0.01
|
144 |
+
grad_clip = 1.0
|
145 |
+
use_averages = false
|
146 |
+
eps = 0.00000001
|
147 |
+
learn_rate = 0.001
|
148 |
+
|
149 |
+
[training.score_weights]
|
150 |
+
tag_acc = 0.5
|
151 |
+
dep_uas = 0.25
|
152 |
+
dep_las = 0.25
|
153 |
+
dep_las_per_type = null
|
154 |
+
sents_p = null
|
155 |
+
sents_r = null
|
156 |
+
sents_f = 0.0
|
157 |
+
|
158 |
+
[pretraining]
|
159 |
+
|
160 |
+
[initialize]
|
161 |
+
vectors = ${paths.vectors}
|
162 |
+
init_tok2vec = ${paths.init_tok2vec}
|
163 |
+
vocab_data = null
|
164 |
+
lookups = null
|
165 |
+
before_init = null
|
166 |
+
after_init = null
|
167 |
+
|
168 |
+
[initialize.components]
|
169 |
+
|
170 |
+
[initialize.tokenizer]
|
README.md
CHANGED
@@ -1,5 +1,4 @@
|
|
1 |
---
|
2 |
-
license: other
|
3 |
tags:
|
4 |
- spacy
|
5 |
- text-classification
|
@@ -12,7 +11,7 @@ A Spacy pipeline for generating readability scores
|
|
12 |
| --- | --- |
|
13 |
| **Name** | `en_readability` |
|
14 |
| **Version** | `0.1` |
|
15 |
-
| **spaCy** | `>=3.
|
16 |
| **Default Pipeline** | `tok2vec`, `tagger`, `parser`, `attribute_ruler`, `readability` |
|
17 |
| **Components** | `tok2vec`, `tagger`, `parser`, `attribute_ruler`, `readability` |
|
18 |
| **Vectors** | 0 keys, 0 unique vectors (0 dimensions) |
|
@@ -24,11 +23,11 @@ A Spacy pipeline for generating readability scores
|
|
24 |
|
25 |
<details>
|
26 |
|
27 |
-
<summary>View label scheme (
|
28 |
|
29 |
| Component | Labels |
|
30 |
| --- | --- |
|
31 |
-
| **`tagger`** | `$`, `''`, `,`, `-LRB-`, `-RRB-`, `.`, `:`, `ADD`, `AFX`, `CC`, `CD`, `DT`, `EX`, `FW`, `HYPH`, `IN`, `JJ`, `JJR`, `JJS`, `LS`, `MD`, `NFP`, `NN`, `NNP`, `NNPS`, `NNS`, `PDT`, `POS`, `PRP`, `PRP$`, `RB`, `RBR`, `RBS`, `RP`, `SYM`, `TO`, `UH`, `VB`, `VBD`, `VBG`, `VBN`, `VBP`, `VBZ`, `WDT`, `WP`, `WP$`, `WRB`, `XX`, ```` |
|
32 |
| **`parser`** | `ROOT`, `acl`, `acomp`, `advcl`, `advmod`, `agent`, `amod`, `appos`, `attr`, `aux`, `auxpass`, `case`, `cc`, `ccomp`, `compound`, `conj`, `csubj`, `csubjpass`, `dative`, `dep`, `det`, `dobj`, `expl`, `intj`, `mark`, `meta`, `neg`, `nmod`, `npadvmod`, `nsubj`, `nsubjpass`, `nummod`, `oprd`, `parataxis`, `pcomp`, `pobj`, `poss`, `preconj`, `predet`, `prep`, `prt`, `punct`, `quantmod`, `relcl`, `xcomp` |
|
33 |
|
34 |
</details>
|
|
|
1 |
---
|
|
|
2 |
tags:
|
3 |
- spacy
|
4 |
- text-classification
|
|
|
11 |
| --- | --- |
|
12 |
| **Name** | `en_readability` |
|
13 |
| **Version** | `0.1` |
|
14 |
+
| **spaCy** | `>=3.4.0,<3.5.0` |
|
15 |
| **Default Pipeline** | `tok2vec`, `tagger`, `parser`, `attribute_ruler`, `readability` |
|
16 |
| **Components** | `tok2vec`, `tagger`, `parser`, `attribute_ruler`, `readability` |
|
17 |
| **Vectors** | 0 keys, 0 unique vectors (0 dimensions) |
|
|
|
23 |
|
24 |
<details>
|
25 |
|
26 |
+
<summary>View label scheme (95 labels for 2 components)</summary>
|
27 |
|
28 |
| Component | Labels |
|
29 |
| --- | --- |
|
30 |
+
| **`tagger`** | `$`, `''`, `,`, `-LRB-`, `-RRB-`, `.`, `:`, `ADD`, `AFX`, `CC`, `CD`, `DT`, `EX`, `FW`, `HYPH`, `IN`, `JJ`, `JJR`, `JJS`, `LS`, `MD`, `NFP`, `NN`, `NNP`, `NNPS`, `NNS`, `PDT`, `POS`, `PRP`, `PRP$`, `RB`, `RBR`, `RBS`, `RP`, `SYM`, `TO`, `UH`, `VB`, `VBD`, `VBG`, `VBN`, `VBP`, `VBZ`, `WDT`, `WP`, `WP$`, `WRB`, `XX`, `_SP`, ```` |
|
31 |
| **`parser`** | `ROOT`, `acl`, `acomp`, `advcl`, `advmod`, `agent`, `amod`, `appos`, `attr`, `aux`, `auxpass`, `case`, `cc`, `ccomp`, `compound`, `conj`, `csubj`, `csubjpass`, `dative`, `dep`, `det`, `dobj`, `expl`, `intj`, `mark`, `meta`, `neg`, `nmod`, `npadvmod`, `nsubj`, `nsubjpass`, `nummod`, `oprd`, `parataxis`, `pcomp`, `pobj`, `poss`, `preconj`, `predet`, `prep`, `prt`, `punct`, `quantmod`, `relcl`, `xcomp` |
|
32 |
|
33 |
</details>
|
attribute_ruler/patterns
CHANGED
Binary files a/attribute_ruler/patterns and b/attribute_ruler/patterns differ
|
|
config.cfg
CHANGED
@@ -57,8 +57,9 @@ overwrite = false
|
|
57 |
scorer = {"@scorers":"spacy.tagger_scorer.v1"}
|
58 |
|
59 |
[components.tagger.model]
|
60 |
-
@architectures = "spacy.Tagger.
|
61 |
nO = null
|
|
|
62 |
|
63 |
[components.tagger.model.tok2vec]
|
64 |
@architectures = "spacy.Tok2VecListener.v1"
|
@@ -75,7 +76,7 @@ factory = "tok2vec"
|
|
75 |
@architectures = "spacy.MultiHashEmbed.v2"
|
76 |
width = 96
|
77 |
attrs = ["NORM","PREFIX","SUFFIX","SHAPE","SPACY"]
|
78 |
-
rows = [5000,
|
79 |
include_static_vectors = false
|
80 |
|
81 |
[components.tok2vec.model.encode]
|
|
|
57 |
scorer = {"@scorers":"spacy.tagger_scorer.v1"}
|
58 |
|
59 |
[components.tagger.model]
|
60 |
+
@architectures = "spacy.Tagger.v2"
|
61 |
nO = null
|
62 |
+
normalize = false
|
63 |
|
64 |
[components.tagger.model.tok2vec]
|
65 |
@architectures = "spacy.Tok2VecListener.v1"
|
|
|
76 |
@architectures = "spacy.MultiHashEmbed.v2"
|
77 |
width = 96
|
78 |
attrs = ["NORM","PREFIX","SUFFIX","SHAPE","SPACY"]
|
79 |
+
rows = [5000,1000,2500,2500,50]
|
80 |
include_static_vectors = false
|
81 |
|
82 |
[components.tok2vec.model.encode]
|
en_readability-any-py3-none-any.whl
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:53adcc14fe186b430af7cda67efa72e3bf21f519ff28ce2eb9bf091cc27cbc16
|
3 |
+
size 6324285
|
meta.json
CHANGED
@@ -7,8 +7,8 @@
|
|
7 |
"email":"",
|
8 |
"url":"www.valurank.com",
|
9 |
"license":"",
|
10 |
-
"spacy_version":">=3.
|
11 |
-
"spacy_git_version":"
|
12 |
"vectors":{
|
13 |
"width":0,
|
14 |
"vectors":0,
|
@@ -68,6 +68,7 @@
|
|
68 |
"WP$",
|
69 |
"WRB",
|
70 |
"XX",
|
|
|
71 |
"``"
|
72 |
],
|
73 |
"parser":[
|
|
|
7 |
"email":"",
|
8 |
"url":"www.valurank.com",
|
9 |
"license":"",
|
10 |
+
"spacy_version":">=3.4.0,<3.5.0",
|
11 |
+
"spacy_git_version":"d583626a8",
|
12 |
"vectors":{
|
13 |
"width":0,
|
14 |
"vectors":0,
|
|
|
68 |
"WP$",
|
69 |
"WRB",
|
70 |
"XX",
|
71 |
+
"_SP",
|
72 |
"``"
|
73 |
],
|
74 |
"parser":[
|
parser/model
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 319909
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e80971fd38f1f20f11dabe644a485c6ef0846064256c7b2e929148a8b3ce6b97
|
3 |
size 319909
|
parser/moves
CHANGED
@@ -1,2 +1 @@
|
|
1 |
-
��moves�
|
2 |
-
{"0":{"":995932},"1":{"":989662},"2":{"det":172430,"nsubj":165679,"compound":116803,"amod":106128,"aux":87078,"punct":65505,"advmod":62711,"poss":36427,"mark":27913,"nummod":22583,"auxpass":15597,"prep":13989,"nsubjpass":13867,"neg":12358,"cc":10694,"nmod":9572,"advcl":9063,"npadvmod":8135,"quantmod":7071,"intj":6557,"ccomp":5899,"dobj":3427,"expl":3360,"dep":3191,"predet":1945,"parataxis":1826,"csubj":1431,"preconj":620,"pobj||prep":615,"attr":578,"meta":448,"advmod||conj":367,"dobj||xcomp":352,"acomp":284,"nsubj||ccomp":224,"dative":206,"advmod||xcomp":149,"dobj||ccomp":70,"csubjpass":64,"dobj||conj":62,"prep||conj":51,"acl":48,"prep||nsubj":41,"prep||dobj":36,"xcomp":34,"advmod||ccomp":32,"oprd":31},"3":{"punct":183437,"pobj":182256,"prep":173845,"dobj":89650,"conj":59689,"cc":51858,"ccomp":30404,"advmod":22820,"xcomp":21045,"relcl":20968,"advcl":19833,"attr":17739,"acomp":16824,"appos":14963,"case":13361,"acl":12091,"pcomp":10345,"npadvmod":9702,"prt":8179,"agent":3884,"dative":3867,"nsubj":3465,"intj":2898,"neg":2871,"amod":2843,"nummod":2510,"oprd":2304,"dep":1518,"parataxis":1261,"quantmod":317,"nmod":296,"acl||dobj":202,"prep||dobj":190,"prep||nsubj":162,"acl||nsubj":159,"appos||nsubj":145,"relcl||dobj":134,"relcl||nsubj":111,"aux":103,"expl":96,"meta":93,"appos||dobj":86,"preconj":71,"csubj":65,"prep||nsubjpass":55,"prep||advmod":54,"prep||acomp":53,"det":51,"nsubjpass":45,"acl||nsubjpass":42,"relcl||pobj":41,"mark":40,"auxpass":39,"prep||pobj":36,"relcl||nsubjpass":32,"appos||nsubjpass":31},"4":{"ROOT":110979}}�cfg��neg_key�
|
|
|
1 |
+
��moves�{"0":{"":994332},"1":{"":999432},"2":{"det":172595,"nsubj":165748,"compound":116623,"amod":105184,"aux":86667,"punct":65478,"advmod":62763,"poss":36443,"mark":27941,"nummod":22598,"auxpass":15594,"prep":14001,"nsubjpass":13856,"neg":12357,"cc":10739,"nmod":9562,"advcl":9062,"npadvmod":8168,"quantmod":7101,"intj":6464,"ccomp":5896,"dobj":3427,"expl":3360,"dep":2871,"predet":1944,"parataxis":1837,"csubj":1428,"preconj":621,"pobj||prep":616,"attr":578,"meta":376,"advmod||conj":368,"dobj||xcomp":352,"acomp":284,"nsubj||ccomp":224,"dative":206,"advmod||xcomp":149,"dobj||ccomp":70,"csubjpass":64,"dobj||conj":62,"prep||conj":51,"acl":48,"prep||nsubj":41,"prep||dobj":36,"xcomp":34,"advmod||ccomp":32,"oprd":31},"3":{"punct":183790,"pobj":182191,"prep":174008,"dobj":89615,"conj":59687,"cc":51930,"ccomp":30385,"advmod":22861,"xcomp":21021,"relcl":20969,"advcl":19828,"attr":17741,"acomp":16922,"appos":15265,"case":13388,"acl":12085,"pcomp":10324,"dep":10116,"npadvmod":9796,"prt":8179,"agent":3903,"dative":3866,"nsubj":3470,"neg":2906,"amod":2839,"intj":2819,"nummod":2732,"oprd":2301,"parataxis":1261,"quantmod":319,"nmod":294,"acl||dobj":200,"prep||dobj":190,"prep||nsubj":162,"acl||nsubj":159,"appos||nsubj":145,"relcl||dobj":134,"relcl||nsubj":111,"aux":103,"expl":96,"meta":92,"appos||dobj":86,"preconj":71,"csubj":65,"prep||nsubjpass":55,"prep||advmod":54,"prep||acomp":53,"det":51,"nsubjpass":45,"relcl||pobj":42,"acl||nsubjpass":42,"mark":40,"auxpass":39,"prep||pobj":36,"relcl||nsubjpass":32,"appos||nsubjpass":31},"4":{"ROOT":111664}}�cfg��neg_key�
|
|
tagger/cfg
CHANGED
@@ -48,6 +48,7 @@
|
|
48 |
"WP$",
|
49 |
"WRB",
|
50 |
"XX",
|
|
|
51 |
"``"
|
52 |
],
|
53 |
"neg_prefix":"!",
|
|
|
48 |
"WP$",
|
49 |
"WRB",
|
50 |
"XX",
|
51 |
+
"_SP",
|
52 |
"``"
|
53 |
],
|
54 |
"neg_prefix":"!",
|
tagger/model
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d62054e74f89be08b720157a45ddf3a5a5a9e8c51f191cdea364e390c0032d7e
|
3 |
+
size 19829
|
tok2vec/model
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6967e88ec7b0680d94a75500c46fe19a1b1e01ef5f608a58826077e45af5010d
|
3 |
+
size 6139229
|
tokenizer
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
vocab/strings.json
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3b2696502155e027d7e26609065b911a03ee6c5004b150fa989e2d03a3ca4338
|
3 |
+
size 1104000
|