arxyzan committed on
Commit
25fc230
1 Parent(s): e74fd92

Upload 5 files

Browse files
model_config.yaml ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: roberta_text_classification
2
+ config_type: model
3
+ task: text_classification
4
+ num_labels: 3
5
+ id2label:
6
+ 0: negative
7
+ 1: positive
8
+ 2: neutral
9
+ attention_probs_dropout_prob: 0.1
10
+ bos_token_id: 0
11
+ eos_token_id: 2
12
+ gradient_checkpointing: false
13
+ hidden_act: gelu
14
+ hidden_dropout_prob: 0.1
15
+ hidden_size: 768
16
+ initializer_range: 0.02
17
+ intermediate_size: 3072
18
+ layer_norm_eps: 1.0e-12
19
+ max_position_embeddings: 514
20
+ num_attention_heads: 12
21
+ num_hidden_layers: 12
22
+ pad_token_id: 1
23
+ position_embedding_type: absolute
24
+ type_vocab_size: 1
25
+ use_cache: true
26
+ vocab_size: 42000
preprocessor/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
preprocessor/tokenizer_config.yaml ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: bpe_tokenizer
2
+ config_type: preprocessor
3
+ pretrained_path: hezarai/roberta-base-fa
4
+ max_length: 512
5
+ truncation_strategy: longest_first
6
+ truncation_direction: right
7
+ stride: 0
8
+ padding_strategy: longest
9
+ padding_direction: right
10
+ pad_to_multiple_of: 0
11
+ pad_token_id: 0
12
+ pad_token: <pad>
13
+ pad_token_type_id: 0
14
+ unk_token: <unk>
15
+ special_tokens:
16
+ - <s>
17
+ - <pad>
18
+ - </s>
19
+ - <unk>
20
+ - <mask>
21
+ - <|endoftext|>
22
+ - <|startoftext|>
23
+ - <nl>
24
+ - <hs>
25
+ - <sep>
26
+ - <cls>
27
+ continuing_subword_prefix: ''
28
+ end_of_word_suffix: ''
29
+ fuse_unk: false
30
+ train_config:
31
+ name: bpe_tokenizer
32
+ config_type: preprocessor
33
+ vocab_size: 30000
34
+ min_frequency: 2
35
+ limit_alphabet: 1000
36
+ show_progress: true
train/dataset_config.yaml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: text_classification
2
+ config_type: dataset
3
+ task: text_classification
4
+ path: hezarai/sentiment_digikala_snappfood
5
+ tokenizer_path: hezarai/roberta-base-fa
6
+ label_field: label
7
+ text_field: text
8
+ id2label:
9
+ 0: negative
10
+ 1: positive
11
+ 2: neutral
12
+ label2id:
13
+ negative: 0
14
+ positive: 1
15
+ neutral: 2
16
+ num_labels: 3
train/train_config.yaml ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: roberta_text_classification
2
+ config_type: train
3
+ device: cuda
4
+ init_weights_from: hezarai/roberta-base-fa
5
+ seed: 42
6
+ batch_size: 8
7
+ metrics:
8
+ f1:
9
+ task: multiclass
10
+ num_train_epochs: 5
11
+ checkpoints_dir: checkpoints/