2024-07-01 01:31:45,393 - INFO - allennlp.common.params - random_seed = 13370
2024-07-01 01:31:45,393 - INFO - allennlp.common.params - numpy_seed = 1337
2024-07-01 01:31:45,393 - INFO - allennlp.common.params - pytorch_seed = 133
2024-07-01 01:31:45,394 - INFO - allennlp.common.checks - Pytorch version: 2.3.1+cu121
2024-07-01 01:31:45,394 - INFO - allennlp.common.params - type = default
2024-07-01 01:31:45,395 - INFO - allennlp.common.params - dataset_reader.type = compreno_ud_dataset_reader
2024-07-01 01:31:45,395 - INFO - allennlp.common.params - dataset_reader.token_indexers.tokens.type = pretrained_transformer_mismatched
2024-07-01 01:31:45,395 - INFO - allennlp.common.params - dataset_reader.token_indexers.tokens.token_min_padding_length = 0
2024-07-01 01:31:45,395 - INFO - allennlp.common.params - dataset_reader.token_indexers.tokens.model_name = xlm-roberta-base
2024-07-01 01:31:45,395 - INFO - allennlp.common.params - dataset_reader.token_indexers.tokens.namespace = tags
2024-07-01 01:31:45,395 - INFO - allennlp.common.params - dataset_reader.token_indexers.tokens.max_length = None
2024-07-01 01:31:45,395 - INFO - allennlp.common.params - dataset_reader.token_indexers.tokens.tokenizer_kwargs = None
2024-07-01 01:31:47,056 - INFO - allennlp.common.params - train_data_path = data/train.conllu
2024-07-01 01:31:47,056 - INFO - allennlp.common.params - datasets_for_vocab_creation = None
2024-07-01 01:31:47,056 - INFO - allennlp.common.params - validation_dataset_reader = None
2024-07-01 01:31:47,056 - INFO - allennlp.common.params - validation_data_path = data/validation.conllu
2024-07-01 01:31:47,056 - INFO - allennlp.common.params - test_data_path = None
2024-07-01 01:31:47,057 - INFO - allennlp.common.params - evaluate_on_test = False
2024-07-01 01:31:47,057 - INFO - allennlp.common.params - batch_weight_key = 
2024-07-01 01:31:47,057 - INFO - allennlp.common.params - data_loader.type = multiprocess
2024-07-01 01:31:47,057 - INFO - allennlp.common.params - data_loader.batch_size = 24
2024-07-01 01:31:47,057 - INFO - allennlp.common.params - data_loader.drop_last = False
2024-07-01 01:31:47,057 - INFO - allennlp.common.params - data_loader.shuffle = True
2024-07-01 01:31:47,057 - INFO - allennlp.common.params - data_loader.batch_sampler = None
2024-07-01 01:31:47,057 - INFO - allennlp.common.params - data_loader.batches_per_epoch = None
2024-07-01 01:31:47,057 - INFO - allennlp.common.params - data_loader.num_workers = 0
2024-07-01 01:31:47,057 - INFO - allennlp.common.params - data_loader.max_instances_in_memory = None
2024-07-01 01:31:47,057 - INFO - allennlp.common.params - data_loader.start_method = fork
2024-07-01 01:31:47,057 - INFO - allennlp.common.params - data_loader.cuda_device = None
2024-07-01 01:31:47,057 - INFO - allennlp.common.params - data_loader.quiet = False
2024-07-01 01:31:47,057 - INFO - allennlp.common.params - data_loader.collate_fn = 
2024-07-01 01:31:47,057 - INFO - tqdm - loading instances: 0it [00:00, ?it/s]
2024-07-01 01:31:57,129 - INFO - tqdm - loading instances: 25551it [00:10, 2339.78it/s]
2024-07-01 01:31:57,696 - INFO - allennlp.common.params - validation_data_loader.type = multiprocess
2024-07-01 01:31:57,696 - INFO - allennlp.common.params - validation_data_loader.batch_size = 24
2024-07-01 01:31:57,696 - INFO - allennlp.common.params - validation_data_loader.drop_last = False
2024-07-01 01:31:57,696 - INFO - allennlp.common.params - validation_data_loader.shuffle = False
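The data pipeline logged above can be sketched with the AllenNLP 2.x Python API roughly as follows. This is a hedged reconstruction, not the project's actual code: compreno_ud_dataset_reader is the project's own registered reader, and passing token_indexers to its constructor is an assumption made for illustration.

    # Rough sketch of the logged data pipeline (assumptions noted above).
    from allennlp.data import DatasetReader
    from allennlp.data.data_loaders import MultiProcessDataLoader
    from allennlp.data.token_indexers import PretrainedTransformerMismatchedIndexer

    # Word-level indexer backed by the xlm-roberta-base subword tokenizer, as logged.
    token_indexers = {
        "tokens": PretrainedTransformerMismatchedIndexer(
            model_name="xlm-roberta-base", namespace="tags"
        )
    }

    # "compreno_ud_dataset_reader" is this project's custom reader; the
    # token_indexers constructor argument is an assumption.
    reader = DatasetReader.by_name("compreno_ud_dataset_reader")(token_indexers=token_indexers)

    # "multiprocess" data loaders with the logged batch size and shuffling.
    train_loader = MultiProcessDataLoader(reader, "data/train.conllu", batch_size=24, shuffle=True, num_workers=0)
    dev_loader = MultiProcessDataLoader(reader, "data/validation.conllu", batch_size=24, shuffle=False, num_workers=0)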
2024-07-01 01:31:57,696 - INFO - allennlp.common.params - validation_data_loader.batch_sampler = None
2024-07-01 01:31:57,696 - INFO - allennlp.common.params - validation_data_loader.batches_per_epoch = None
2024-07-01 01:31:57,696 - INFO - allennlp.common.params - validation_data_loader.num_workers = 0
2024-07-01 01:31:57,696 - INFO - allennlp.common.params - validation_data_loader.max_instances_in_memory = None
2024-07-01 01:31:57,696 - INFO - allennlp.common.params - validation_data_loader.start_method = fork
2024-07-01 01:31:57,696 - INFO - allennlp.common.params - validation_data_loader.cuda_device = None
2024-07-01 01:31:57,697 - INFO - allennlp.common.params - validation_data_loader.quiet = False
2024-07-01 01:31:57,697 - INFO - allennlp.common.params - validation_data_loader.collate_fn = 
2024-07-01 01:31:57,697 - INFO - tqdm - loading instances: 0it [00:00, ?it/s]
2024-07-01 01:32:00,794 - INFO - allennlp.common.params - vocabulary.type = from_instances
2024-07-01 01:32:00,794 - INFO - allennlp.common.params - vocabulary.max_vocab_size = None
2024-07-01 01:32:00,794 - INFO - allennlp.common.params - vocabulary.non_padded_namespaces = ('*tags', '*labels')
2024-07-01 01:32:00,794 - INFO - allennlp.common.params - vocabulary.pretrained_files = None
2024-07-01 01:32:00,794 - INFO - allennlp.common.params - vocabulary.only_include_pretrained_words = False
2024-07-01 01:32:00,794 - INFO - allennlp.common.params - vocabulary.min_pretrained_embeddings = None
2024-07-01 01:32:00,794 - INFO - allennlp.common.params - vocabulary.padding_token = @@PADDING@@
2024-07-01 01:32:00,794 - INFO - allennlp.common.params - vocabulary.oov_token = @@UNKNOWN@@
2024-07-01 01:32:00,794 - INFO - allennlp.data.vocabulary - Fitting token dictionary from dataset.
2024-07-01 01:32:00,794 - INFO - tqdm - building vocab: 0it [00:00, ?it/s]
2024-07-01 01:32:01,418 - INFO - allennlp.common.params - model.type = morpho_syntax_semantic_parser
2024-07-01 01:32:01,418 - INFO - allennlp.common.params - model.indexer.type = pretrained_transformer_mismatched
2024-07-01 01:32:01,418 - INFO - allennlp.common.params - model.indexer.token_min_padding_length = 0
2024-07-01 01:32:01,418 - INFO - allennlp.common.params - model.indexer.model_name = xlm-roberta-base
2024-07-01 01:32:01,418 - INFO - allennlp.common.params - model.indexer.namespace = tags
2024-07-01 01:32:01,418 - INFO - allennlp.common.params - model.indexer.max_length = None
2024-07-01 01:32:01,418 - INFO - allennlp.common.params - model.indexer.tokenizer_kwargs = None
2024-07-01 01:32:01,419 - INFO - allennlp.common.params - model.embedder.type = pretrained_transformer_mismatched
2024-07-01 01:32:01,419 - INFO - allennlp.common.params - model.embedder.model_name = xlm-roberta-base
2024-07-01 01:32:01,419 - INFO - allennlp.common.params - model.embedder.max_length = None
2024-07-01 01:32:01,419 - INFO - allennlp.common.params - model.embedder.sub_module = None
2024-07-01 01:32:01,419 - INFO - allennlp.common.params - model.embedder.train_parameters = True
2024-07-01 01:32:01,419 - INFO - allennlp.common.params - model.embedder.last_layer_only = True
2024-07-01 01:32:01,419 - INFO - allennlp.common.params - model.embedder.override_weights_file = None
2024-07-01 01:32:01,419 - INFO - allennlp.common.params - model.embedder.override_weights_strip_prefix = None
2024-07-01 01:32:01,419 - INFO - allennlp.common.params - model.embedder.load_weights = True
2024-07-01 01:32:01,419 - INFO - allennlp.common.params - model.embedder.gradient_checkpointing = None
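The vocabulary and embedder parameters logged above correspond, approximately, to the following AllenNLP calls. A hedged sketch: it reuses the hypothetical train_loader/dev_loader names from the previous sketch, and chaining both loaders for vocabulary creation is an assumption consistent with datasets_for_vocab_creation = None.

    import itertools
    from allennlp.data import Vocabulary
    from allennlp.modules.token_embedders import PretrainedTransformerMismatchedEmbedder

    # Vocabulary fitted from the loaded instances with the logged namespaces
    # and special tokens.
    vocab = Vocabulary.from_instances(
        itertools.chain(train_loader.iter_instances(), dev_loader.iter_instances()),
        non_padded_namespaces=("*tags", "*labels"),
        padding_token="@@PADDING@@",
        oov_token="@@UNKNOWN@@",
    )

    # Mismatched embedder: xlm-roberta-base subword vectors averaged back onto
    # whole tokens (sub_token_mode = avg), transformer weights kept trainable.
    embedder = PretrainedTransformerMismatchedEmbedder(
        model_name="xlm-roberta-base",
        train_parameters=True,
        last_layer_only=True,
        sub_token_mode="avg",
    )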
2024-07-01 01:32:01,419 - INFO - allennlp.common.params - model.embedder.tokenizer_kwargs = None
2024-07-01 01:32:01,419 - INFO - allennlp.common.params - model.embedder.transformer_kwargs = None
2024-07-01 01:32:01,419 - INFO - allennlp.common.params - model.embedder.sub_token_mode = avg
2024-07-01 01:32:01,947 - INFO - allennlp.common.params - model.lemma_rule_classifier.hid_dim = 512
2024-07-01 01:32:01,948 - INFO - allennlp.common.params - model.lemma_rule_classifier.activation = relu
2024-07-01 01:32:01,948 - INFO - allennlp.common.params - model.lemma_rule_classifier.dropout = 0.1
2024-07-01 01:32:01,948 - INFO - allennlp.common.params - model.lemma_rule_classifier.dictionaries = []
2024-07-01 01:32:01,948 - INFO - allennlp.common.params - model.lemma_rule_classifier.topk = None
2024-07-01 01:32:01,950 - INFO - allennlp.common.params - model.pos_feats_classifier.hid_dim = 256
2024-07-01 01:32:01,950 - INFO - allennlp.common.params - model.pos_feats_classifier.activation = relu
2024-07-01 01:32:01,950 - INFO - allennlp.common.params - model.pos_feats_classifier.dropout = 0.1
2024-07-01 01:32:01,952 - INFO - allennlp.common.params - model.depencency_classifier.hid_dim = 128
2024-07-01 01:32:01,952 - INFO - allennlp.common.params - model.depencency_classifier.activation = relu
2024-07-01 01:32:01,952 - INFO - allennlp.common.params - model.depencency_classifier.dropout = 0.1
2024-07-01 01:32:01,974 - INFO - allennlp.common.params - model.misc_classifier.hid_dim = 128
2024-07-01 01:32:01,974 - INFO - allennlp.common.params - model.misc_classifier.activation = relu
2024-07-01 01:32:01,974 - INFO - allennlp.common.params - model.misc_classifier.dropout = 0.1
2024-07-01 01:32:01,975 - INFO - allennlp.common.params - model.semslot_classifier.hid_dim = 1024
2024-07-01 01:32:01,975 - INFO - allennlp.common.params - model.semslot_classifier.activation = relu
2024-07-01 01:32:01,975 - INFO - allennlp.common.params - model.semslot_classifier.dropout = 0.1
2024-07-01 01:32:01,979 - INFO - allennlp.common.params - model.semclass_classifier.hid_dim = 1024
2024-07-01 01:32:01,979 - INFO - allennlp.common.params - model.semclass_classifier.activation = relu
2024-07-01 01:32:01,979 - INFO - allennlp.common.params - model.semclass_classifier.dropout = 0.1
2024-07-01 01:32:01,983 - INFO - allennlp.common.params - model.null_classifier.hid_dim = 512
2024-07-01 01:32:01,983 - INFO - allennlp.common.params - model.null_classifier.activation = relu
2024-07-01 01:32:01,983 - INFO - allennlp.common.params - model.null_classifier.dropout = 0.1
2024-07-01 01:32:01,983 - INFO - allennlp.common.params - model.null_classifier.positive_class_weight = 1.0
2024-07-01 01:32:16,071 - INFO - allennlp.common.params - trainer.type = gradient_descent
2024-07-01 01:32:16,071 - INFO - allennlp.common.params - trainer.cuda_device = 0
2024-07-01 01:32:16,071 - INFO - allennlp.common.params - trainer.distributed = False
2024-07-01 01:32:16,071 - INFO - allennlp.common.params - trainer.world_size = 1
2024-07-01 01:32:16,071 - INFO - allennlp.common.params - trainer.patience = None
2024-07-01 01:32:16,071 - INFO - allennlp.common.params - trainer.validation_metric = +Avg
2024-07-01 01:32:16,072 - INFO - allennlp.common.params - trainer.num_epochs = 10
2024-07-01 01:32:16,072 - INFO - allennlp.common.params - trainer.grad_norm = False
2024-07-01 01:32:16,072 - INFO - allennlp.common.params - trainer.grad_clipping = 5
2024-07-01 01:32:16,072 - INFO - allennlp.common.params - trainer.num_gradient_accumulation_steps = 1
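All of the classifier heads configured above share the same hid_dim / relu / dropout = 0.1 pattern. Below is a minimal PyTorch sketch of what such a head might look like; the exact layer ordering is only inferred from the classifier.1 / classifier.4 parameter names that appear later in this log, and the 768-dimensional input (the xlm-roberta-base hidden size) and the class count are illustrative assumptions.

    import torch.nn as nn

    def make_head(input_dim: int, hid_dim: int, num_classes: int, dropout: float = 0.1) -> nn.Sequential:
        # Two-layer feed-forward head; indices 1 and 4 hold the Linear layers,
        # matching the "classifier.1" / "classifier.4" names in the log.
        return nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(input_dim, hid_dim),    # -> classifier.1
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hid_dim, num_classes),  # -> classifier.4
        )

    # Example: a semantic-slot head with the logged hid_dim = 1024
    # (num_classes = 100 is a placeholder, not taken from the log).
    semslot_head = make_head(input_dim=768, hid_dim=1024, num_classes=100)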
trainer.use_amp = False 2024-07-01 01:32:16,072 - INFO - allennlp.common.params - trainer.no_grad = None 2024-07-01 01:32:16,072 - INFO - allennlp.common.params - trainer.momentum_scheduler = None 2024-07-01 01:32:16,072 - INFO - allennlp.common.params - trainer.moving_average = None 2024-07-01 01:32:16,072 - INFO - allennlp.common.params - trainer.checkpointer = 2024-07-01 01:32:16,072 - INFO - allennlp.common.params - trainer.enable_default_callbacks = True 2024-07-01 01:32:16,072 - INFO - allennlp.common.params - trainer.run_confidence_checks = True 2024-07-01 01:32:16,072 - INFO - allennlp.common.params - trainer.grad_scaling = True 2024-07-01 01:32:16,285 - INFO - allennlp.common.params - trainer.optimizer.type = adam 2024-07-01 01:32:16,286 - INFO - allennlp.common.params - trainer.optimizer.lr = 0.01 2024-07-01 01:32:16,286 - INFO - allennlp.common.params - trainer.optimizer.betas = (0.9, 0.999) 2024-07-01 01:32:16,286 - INFO - allennlp.common.params - trainer.optimizer.eps = 1e-08 2024-07-01 01:32:16,286 - INFO - allennlp.common.params - trainer.optimizer.weight_decay = 0.0 2024-07-01 01:32:16,286 - INFO - allennlp.common.params - trainer.optimizer.amsgrad = False 2024-07-01 01:32:16,287 - INFO - allennlp.training.optimizers - Done constructing parameter groups. 2024-07-01 01:32:16,287 - INFO - allennlp.training.optimizers - Group 0: ['embedder._matched_embedder.transformer_model.encoder.layer.0.attention.output.LayerNorm.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.10.intermediate.dense.bias', 'embedder._matched_embedder.transformer_model.embeddings.LayerNorm.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.8.output.LayerNorm.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.10.attention.self.query.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.11.attention.self.query.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.9.output.dense.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.8.attention.output.LayerNorm.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.6.attention.output.LayerNorm.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.7.attention.self.value.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.7.attention.self.query.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.2.attention.output.LayerNorm.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.0.attention.self.key.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.2.intermediate.dense.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.7.attention.self.key.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.2.attention.self.key.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.2.output.LayerNorm.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.3.output.LayerNorm.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.8.attention.self.value.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.6.attention.output.LayerNorm.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.7.attention.output.LayerNorm.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.1.attention.self.query.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.6.attention.self.key.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.0.attention.self.query.weight', 
'embedder._matched_embedder.transformer_model.encoder.layer.8.attention.output.dense.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.3.output.dense.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.2.attention.output.dense.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.3.output.LayerNorm.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.0.attention.self.query.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.5.attention.self.query.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.1.attention.output.LayerNorm.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.0.attention.output.LayerNorm.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.4.attention.self.key.bias', 'embedder._matched_embedder.transformer_model.embeddings.word_embeddings.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.6.intermediate.dense.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.11.attention.self.key.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.10.output.LayerNorm.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.3.attention.self.key.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.7.output.dense.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.10.attention.output.LayerNorm.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.9.attention.output.dense.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.11.attention.output.dense.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.5.attention.self.key.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.5.output.LayerNorm.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.9.attention.self.key.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.11.attention.output.dense.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.4.attention.self.key.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.7.output.LayerNorm.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.4.attention.self.query.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.11.intermediate.dense.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.3.attention.output.LayerNorm.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.10.intermediate.dense.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.5.intermediate.dense.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.7.output.LayerNorm.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.2.attention.self.query.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.0.intermediate.dense.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.5.attention.self.query.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.6.attention.output.dense.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.8.attention.self.key.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.4.output.dense.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.4.output.dense.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.9.output.dense.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.2.output.LayerNorm.weight', 
'embedder._matched_embedder.transformer_model.encoder.layer.6.attention.output.dense.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.1.attention.self.key.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.8.attention.output.dense.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.6.output.dense.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.10.attention.self.key.weight', 'embedder._matched_embedder.transformer_model.embeddings.position_embeddings.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.3.intermediate.dense.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.7.output.dense.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.10.attention.output.dense.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.2.intermediate.dense.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.2.attention.output.dense.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.6.attention.self.value.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.8.intermediate.dense.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.9.attention.self.key.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.7.attention.output.dense.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.11.attention.self.value.weight', 'embedder._matched_embedder.transformer_model.embeddings.token_type_embeddings.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.0.attention.output.dense.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.3.attention.self.key.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.3.attention.self.value.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.5.attention.output.dense.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.1.output.dense.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.4.attention.output.LayerNorm.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.0.attention.self.value.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.11.output.dense.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.6.output.LayerNorm.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.3.output.dense.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.1.attention.self.value.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.1.attention.self.value.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.4.attention.self.query.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.8.attention.self.query.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.4.output.LayerNorm.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.2.output.dense.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.11.attention.output.LayerNorm.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.9.attention.output.LayerNorm.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.10.output.LayerNorm.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.0.output.LayerNorm.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.5.attention.self.value.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.10.attention.self.value.weight', 
'embedder._matched_embedder.transformer_model.encoder.layer.9.intermediate.dense.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.11.intermediate.dense.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.11.attention.self.key.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.10.attention.output.LayerNorm.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.5.attention.self.key.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.5.attention.output.LayerNorm.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.4.output.LayerNorm.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.5.output.dense.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.9.attention.self.query.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.10.attention.self.key.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.0.output.LayerNorm.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.1.output.dense.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.7.intermediate.dense.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.3.attention.self.value.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.4.attention.output.LayerNorm.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.1.attention.output.LayerNorm.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.1.intermediate.dense.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.1.attention.self.query.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.10.attention.self.value.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.6.attention.self.query.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.6.attention.self.value.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.2.output.dense.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.6.output.dense.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.10.output.dense.weight', 'embedder._matched_embedder.transformer_model.pooler.dense.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.5.attention.output.dense.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.3.attention.output.LayerNorm.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.9.output.LayerNorm.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.4.attention.output.dense.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.2.attention.self.key.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.9.attention.output.dense.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.5.intermediate.dense.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.1.intermediate.dense.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.11.output.LayerNorm.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.3.attention.self.query.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.1.output.LayerNorm.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.9.attention.self.query.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.0.attention.output.dense.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.11.attention.output.LayerNorm.bias', 
'embedder._matched_embedder.transformer_model.encoder.layer.3.attention.output.dense.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.5.attention.output.LayerNorm.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.6.attention.self.key.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.9.attention.self.value.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.4.attention.output.dense.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.0.attention.self.key.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.7.attention.output.LayerNorm.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.9.output.LayerNorm.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.0.attention.self.value.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.4.attention.self.value.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.1.attention.output.dense.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.6.intermediate.dense.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.8.output.dense.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.10.output.dense.bias', 'embedder._matched_embedder.transformer_model.embeddings.LayerNorm.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.7.attention.self.value.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.7.intermediate.dense.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.11.output.dense.weight', 'embedder._matched_embedder.transformer_model.pooler.dense.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.8.attention.output.LayerNorm.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.1.attention.self.key.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.1.output.LayerNorm.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.7.attention.self.query.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.0.intermediate.dense.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.3.intermediate.dense.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.8.attention.self.query.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.4.intermediate.dense.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.2.attention.self.query.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.8.intermediate.dense.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.11.attention.self.value.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.2.attention.output.LayerNorm.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.2.attention.self.value.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.8.attention.self.value.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.0.output.dense.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.4.attention.self.value.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.8.attention.self.key.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.5.output.dense.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.3.attention.self.query.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.10.attention.output.dense.bias', 
'embedder._matched_embedder.transformer_model.encoder.layer.6.attention.self.query.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.7.attention.output.dense.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.8.output.dense.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.5.attention.self.value.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.1.attention.output.dense.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.4.intermediate.dense.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.5.output.LayerNorm.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.2.attention.self.value.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.3.attention.output.dense.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.11.output.LayerNorm.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.0.output.dense.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.8.output.LayerNorm.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.10.attention.self.query.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.9.attention.output.LayerNorm.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.9.intermediate.dense.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.7.attention.self.key.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.9.attention.self.value.bias', 'embedder._matched_embedder.transformer_model.encoder.layer.11.attention.self.query.weight', 'embedder._matched_embedder.transformer_model.encoder.layer.6.output.LayerNorm.weight'], {} 2024-07-01 01:32:16,287 - INFO - allennlp.training.optimizers - Group 1: ['dependency_classifier.arc_head_mlp.1.bias', 'dependency_classifier.rel_attention_ud._bias', 'lemma_rule_classifier.classifier.1.weight', 'semslot_classifier.classifier.4.bias', 'semslot_classifier.classifier.1.weight', 'dependency_classifier.rel_attention_eud._weight_matrix', 'lemma_rule_classifier.classifier.1.bias', 'pos_feats_classifier.classifier.4.bias', 'dependency_classifier.arc_attention_eud._weight_matrix', 'null_classifier.classifier.4.bias', 'dependency_classifier.rel_dep_mlp.1.weight', 'dependency_classifier.rel_attention_ud._weight_matrix', 'dependency_classifier.rel_dep_mlp.1.bias', 'semclass_classifier.classifier.4.weight', 'misc_classifier.classifier.1.bias', 'semslot_classifier.classifier.1.bias', 'dependency_classifier.arc_attention_ud._bias', 'semslot_classifier.classifier.4.weight', 'semclass_classifier.classifier.4.bias', 'dependency_classifier.arc_attention_ud._weight_matrix', 'dependency_classifier.arc_attention_eud._bias', 'misc_classifier.classifier.4.weight', 'pos_feats_classifier.classifier.4.weight', 'pos_feats_classifier.classifier.1.bias', 'dependency_classifier.rel_head_mlp.1.bias', 'dependency_classifier.arc_dep_mlp.1.bias', 'null_classifier.classifier.4.weight', 'lemma_rule_classifier.classifier.4.weight', 'null_classifier.classifier.1.bias', 'pos_feats_classifier.classifier.1.weight', 'lemma_rule_classifier.classifier.4.bias', 'misc_classifier.classifier.1.weight', 'misc_classifier.classifier.4.bias', 'dependency_classifier.rel_head_mlp.1.weight', 'semclass_classifier.classifier.1.weight', 'semclass_classifier.classifier.1.bias', 'null_classifier.classifier.1.weight', 'dependency_classifier.arc_dep_mlp.1.weight', 'dependency_classifier.rel_attention_eud._bias', 
'dependency_classifier.arc_head_mlp.1.weight'], {} 2024-07-01 01:32:16,287 - INFO - allennlp.training.optimizers - Group 2: [], {} 2024-07-01 01:32:16,287 - INFO - allennlp.training.optimizers - Number of trainable parameters: 287203493 2024-07-01 01:32:16,288 - INFO - allennlp.common.util - The following parameters are Frozen (without gradient): 2024-07-01 01:32:16,288 - INFO - allennlp.common.util - The following parameters are Tunable (with gradient): 2024-07-01 01:32:16,288 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.embeddings.word_embeddings.weight 2024-07-01 01:32:16,288 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.embeddings.position_embeddings.weight 2024-07-01 01:32:16,288 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.embeddings.token_type_embeddings.weight 2024-07-01 01:32:16,288 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.embeddings.LayerNorm.weight 2024-07-01 01:32:16,289 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.embeddings.LayerNorm.bias 2024-07-01 01:32:16,289 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.0.attention.self.query.weight 2024-07-01 01:32:16,289 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.0.attention.self.query.bias 2024-07-01 01:32:16,289 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.0.attention.self.key.weight 2024-07-01 01:32:16,289 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.0.attention.self.key.bias 2024-07-01 01:32:16,289 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.0.attention.self.value.weight 2024-07-01 01:32:16,289 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.0.attention.self.value.bias 2024-07-01 01:32:16,289 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.0.attention.output.dense.weight 2024-07-01 01:32:16,289 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.0.attention.output.dense.bias 2024-07-01 01:32:16,289 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.0.attention.output.LayerNorm.weight 2024-07-01 01:32:16,289 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.0.attention.output.LayerNorm.bias 2024-07-01 01:32:16,289 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.0.intermediate.dense.weight 2024-07-01 01:32:16,289 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.0.intermediate.dense.bias 2024-07-01 01:32:16,289 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.0.output.dense.weight 2024-07-01 01:32:16,289 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.0.output.dense.bias 2024-07-01 01:32:16,289 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.0.output.LayerNorm.weight 2024-07-01 01:32:16,289 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.0.output.LayerNorm.bias 2024-07-01 01:32:16,289 - INFO - allennlp.common.util - 
embedder._matched_embedder.transformer_model.encoder.layer.1.attention.self.query.weight 2024-07-01 01:32:16,289 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.1.attention.self.query.bias 2024-07-01 01:32:16,289 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.1.attention.self.key.weight 2024-07-01 01:32:16,289 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.1.attention.self.key.bias 2024-07-01 01:32:16,289 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.1.attention.self.value.weight 2024-07-01 01:32:16,289 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.1.attention.self.value.bias 2024-07-01 01:32:16,289 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.1.attention.output.dense.weight 2024-07-01 01:32:16,289 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.1.attention.output.dense.bias 2024-07-01 01:32:16,289 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.1.attention.output.LayerNorm.weight 2024-07-01 01:32:16,289 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.1.attention.output.LayerNorm.bias 2024-07-01 01:32:16,289 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.1.intermediate.dense.weight 2024-07-01 01:32:16,289 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.1.intermediate.dense.bias 2024-07-01 01:32:16,289 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.1.output.dense.weight 2024-07-01 01:32:16,289 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.1.output.dense.bias 2024-07-01 01:32:16,289 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.1.output.LayerNorm.weight 2024-07-01 01:32:16,289 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.1.output.LayerNorm.bias 2024-07-01 01:32:16,289 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.2.attention.self.query.weight 2024-07-01 01:32:16,289 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.2.attention.self.query.bias 2024-07-01 01:32:16,289 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.2.attention.self.key.weight 2024-07-01 01:32:16,289 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.2.attention.self.key.bias 2024-07-01 01:32:16,289 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.2.attention.self.value.weight 2024-07-01 01:32:16,289 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.2.attention.self.value.bias 2024-07-01 01:32:16,290 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.2.attention.output.dense.weight 2024-07-01 01:32:16,290 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.2.attention.output.dense.bias 2024-07-01 01:32:16,290 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.2.attention.output.LayerNorm.weight 2024-07-01 
01:32:16,290 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.2.attention.output.LayerNorm.bias 2024-07-01 01:32:16,290 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.2.intermediate.dense.weight 2024-07-01 01:32:16,290 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.2.intermediate.dense.bias 2024-07-01 01:32:16,290 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.2.output.dense.weight 2024-07-01 01:32:16,290 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.2.output.dense.bias 2024-07-01 01:32:16,290 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.2.output.LayerNorm.weight 2024-07-01 01:32:16,290 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.2.output.LayerNorm.bias 2024-07-01 01:32:16,290 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.3.attention.self.query.weight 2024-07-01 01:32:16,290 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.3.attention.self.query.bias 2024-07-01 01:32:16,290 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.3.attention.self.key.weight 2024-07-01 01:32:16,290 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.3.attention.self.key.bias 2024-07-01 01:32:16,290 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.3.attention.self.value.weight 2024-07-01 01:32:16,290 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.3.attention.self.value.bias 2024-07-01 01:32:16,290 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.3.attention.output.dense.weight 2024-07-01 01:32:16,290 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.3.attention.output.dense.bias 2024-07-01 01:32:16,290 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.3.attention.output.LayerNorm.weight 2024-07-01 01:32:16,290 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.3.attention.output.LayerNorm.bias 2024-07-01 01:32:16,290 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.3.intermediate.dense.weight 2024-07-01 01:32:16,290 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.3.intermediate.dense.bias 2024-07-01 01:32:16,290 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.3.output.dense.weight 2024-07-01 01:32:16,290 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.3.output.dense.bias 2024-07-01 01:32:16,290 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.3.output.LayerNorm.weight 2024-07-01 01:32:16,290 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.3.output.LayerNorm.bias 2024-07-01 01:32:16,290 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.4.attention.self.query.weight 2024-07-01 01:32:16,290 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.4.attention.self.query.bias 
2024-07-01 01:32:16,290 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.4.attention.self.key.weight 2024-07-01 01:32:16,290 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.4.attention.self.key.bias 2024-07-01 01:32:16,290 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.4.attention.self.value.weight 2024-07-01 01:32:16,290 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.4.attention.self.value.bias 2024-07-01 01:32:16,290 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.4.attention.output.dense.weight 2024-07-01 01:32:16,290 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.4.attention.output.dense.bias 2024-07-01 01:32:16,290 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.4.attention.output.LayerNorm.weight 2024-07-01 01:32:16,290 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.4.attention.output.LayerNorm.bias 2024-07-01 01:32:16,290 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.4.intermediate.dense.weight 2024-07-01 01:32:16,290 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.4.intermediate.dense.bias 2024-07-01 01:32:16,290 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.4.output.dense.weight 2024-07-01 01:32:16,291 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.4.output.dense.bias 2024-07-01 01:32:16,291 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.4.output.LayerNorm.weight 2024-07-01 01:32:16,291 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.4.output.LayerNorm.bias 2024-07-01 01:32:16,291 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.5.attention.self.query.weight 2024-07-01 01:32:16,291 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.5.attention.self.query.bias 2024-07-01 01:32:16,291 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.5.attention.self.key.weight 2024-07-01 01:32:16,291 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.5.attention.self.key.bias 2024-07-01 01:32:16,291 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.5.attention.self.value.weight 2024-07-01 01:32:16,291 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.5.attention.self.value.bias 2024-07-01 01:32:16,291 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.5.attention.output.dense.weight 2024-07-01 01:32:16,291 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.5.attention.output.dense.bias 2024-07-01 01:32:16,291 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.5.attention.output.LayerNorm.weight 2024-07-01 01:32:16,291 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.5.attention.output.LayerNorm.bias 2024-07-01 01:32:16,291 - INFO - allennlp.common.util - 
embedder._matched_embedder.transformer_model.encoder.layer.5.intermediate.dense.weight 2024-07-01 01:32:16,291 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.5.intermediate.dense.bias 2024-07-01 01:32:16,291 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.5.output.dense.weight 2024-07-01 01:32:16,291 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.5.output.dense.bias 2024-07-01 01:32:16,291 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.5.output.LayerNorm.weight 2024-07-01 01:32:16,291 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.5.output.LayerNorm.bias 2024-07-01 01:32:16,291 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.6.attention.self.query.weight 2024-07-01 01:32:16,291 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.6.attention.self.query.bias 2024-07-01 01:32:16,291 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.6.attention.self.key.weight 2024-07-01 01:32:16,291 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.6.attention.self.key.bias 2024-07-01 01:32:16,291 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.6.attention.self.value.weight 2024-07-01 01:32:16,291 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.6.attention.self.value.bias 2024-07-01 01:32:16,291 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.6.attention.output.dense.weight 2024-07-01 01:32:16,291 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.6.attention.output.dense.bias 2024-07-01 01:32:16,291 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.6.attention.output.LayerNorm.weight 2024-07-01 01:32:16,291 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.6.attention.output.LayerNorm.bias 2024-07-01 01:32:16,291 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.6.intermediate.dense.weight 2024-07-01 01:32:16,291 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.6.intermediate.dense.bias 2024-07-01 01:32:16,291 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.6.output.dense.weight 2024-07-01 01:32:16,291 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.6.output.dense.bias 2024-07-01 01:32:16,291 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.6.output.LayerNorm.weight 2024-07-01 01:32:16,291 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.6.output.LayerNorm.bias 2024-07-01 01:32:16,291 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.7.attention.self.query.weight 2024-07-01 01:32:16,291 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.7.attention.self.query.bias 2024-07-01 01:32:16,291 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.7.attention.self.key.weight 2024-07-01 01:32:16,291 - INFO - allennlp.common.util - 
embedder._matched_embedder.transformer_model.encoder.layer.7.attention.self.key.bias 2024-07-01 01:32:16,292 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.7.attention.self.value.weight 2024-07-01 01:32:16,292 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.7.attention.self.value.bias 2024-07-01 01:32:16,292 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.7.attention.output.dense.weight 2024-07-01 01:32:16,292 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.7.attention.output.dense.bias 2024-07-01 01:32:16,292 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.7.attention.output.LayerNorm.weight 2024-07-01 01:32:16,292 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.7.attention.output.LayerNorm.bias 2024-07-01 01:32:16,292 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.7.intermediate.dense.weight 2024-07-01 01:32:16,292 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.7.intermediate.dense.bias 2024-07-01 01:32:16,292 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.7.output.dense.weight 2024-07-01 01:32:16,292 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.7.output.dense.bias 2024-07-01 01:32:16,292 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.7.output.LayerNorm.weight 2024-07-01 01:32:16,292 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.7.output.LayerNorm.bias 2024-07-01 01:32:16,292 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.8.attention.self.query.weight 2024-07-01 01:32:16,292 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.8.attention.self.query.bias 2024-07-01 01:32:16,292 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.8.attention.self.key.weight 2024-07-01 01:32:16,292 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.8.attention.self.key.bias 2024-07-01 01:32:16,292 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.8.attention.self.value.weight 2024-07-01 01:32:16,292 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.8.attention.self.value.bias 2024-07-01 01:32:16,292 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.8.attention.output.dense.weight 2024-07-01 01:32:16,292 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.8.attention.output.dense.bias 2024-07-01 01:32:16,292 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.8.attention.output.LayerNorm.weight 2024-07-01 01:32:16,292 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.8.attention.output.LayerNorm.bias 2024-07-01 01:32:16,292 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.8.intermediate.dense.weight 2024-07-01 01:32:16,292 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.8.intermediate.dense.bias 2024-07-01 
01:32:16,292 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.8.output.dense.weight 2024-07-01 01:32:16,292 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.8.output.dense.bias 2024-07-01 01:32:16,292 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.8.output.LayerNorm.weight 2024-07-01 01:32:16,292 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.8.output.LayerNorm.bias 2024-07-01 01:32:16,292 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.9.attention.self.query.weight 2024-07-01 01:32:16,292 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.9.attention.self.query.bias 2024-07-01 01:32:16,292 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.9.attention.self.key.weight 2024-07-01 01:32:16,292 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.9.attention.self.key.bias 2024-07-01 01:32:16,292 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.9.attention.self.value.weight 2024-07-01 01:32:16,292 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.9.attention.self.value.bias 2024-07-01 01:32:16,292 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.9.attention.output.dense.weight 2024-07-01 01:32:16,292 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.9.attention.output.dense.bias 2024-07-01 01:32:16,292 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.9.attention.output.LayerNorm.weight 2024-07-01 01:32:16,292 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.9.attention.output.LayerNorm.bias 2024-07-01 01:32:16,292 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.9.intermediate.dense.weight 2024-07-01 01:32:16,292 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.9.intermediate.dense.bias 2024-07-01 01:32:16,293 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.9.output.dense.weight 2024-07-01 01:32:16,293 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.9.output.dense.bias 2024-07-01 01:32:16,293 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.9.output.LayerNorm.weight 2024-07-01 01:32:16,293 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.9.output.LayerNorm.bias 2024-07-01 01:32:16,293 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.10.attention.self.query.weight 2024-07-01 01:32:16,293 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.10.attention.self.query.bias 2024-07-01 01:32:16,293 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.10.attention.self.key.weight 2024-07-01 01:32:16,293 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.10.attention.self.key.bias 2024-07-01 01:32:16,293 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.10.attention.self.value.weight 
2024-07-01 01:32:16,293 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.10.attention.self.value.bias 2024-07-01 01:32:16,293 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.10.attention.output.dense.weight 2024-07-01 01:32:16,293 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.10.attention.output.dense.bias 2024-07-01 01:32:16,293 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.10.attention.output.LayerNorm.weight 2024-07-01 01:32:16,293 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.10.attention.output.LayerNorm.bias 2024-07-01 01:32:16,293 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.10.intermediate.dense.weight 2024-07-01 01:32:16,293 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.10.intermediate.dense.bias 2024-07-01 01:32:16,293 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.10.output.dense.weight 2024-07-01 01:32:16,293 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.10.output.dense.bias 2024-07-01 01:32:16,293 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.10.output.LayerNorm.weight 2024-07-01 01:32:16,293 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.10.output.LayerNorm.bias 2024-07-01 01:32:16,293 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.11.attention.self.query.weight 2024-07-01 01:32:16,293 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.11.attention.self.query.bias 2024-07-01 01:32:16,293 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.11.attention.self.key.weight 2024-07-01 01:32:16,293 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.11.attention.self.key.bias 2024-07-01 01:32:16,293 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.11.attention.self.value.weight 2024-07-01 01:32:16,293 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.11.attention.self.value.bias 2024-07-01 01:32:16,293 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.11.attention.output.dense.weight 2024-07-01 01:32:16,293 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.11.attention.output.dense.bias 2024-07-01 01:32:16,293 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.11.attention.output.LayerNorm.weight 2024-07-01 01:32:16,293 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.11.attention.output.LayerNorm.bias 2024-07-01 01:32:16,293 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.11.intermediate.dense.weight 2024-07-01 01:32:16,293 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.11.intermediate.dense.bias 2024-07-01 01:32:16,293 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.11.output.dense.weight 2024-07-01 01:32:16,293 - INFO - allennlp.common.util - 
embedder._matched_embedder.transformer_model.encoder.layer.11.output.dense.bias 2024-07-01 01:32:16,293 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.11.output.LayerNorm.weight 2024-07-01 01:32:16,293 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.encoder.layer.11.output.LayerNorm.bias 2024-07-01 01:32:16,293 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.pooler.dense.weight 2024-07-01 01:32:16,293 - INFO - allennlp.common.util - embedder._matched_embedder.transformer_model.pooler.dense.bias 2024-07-01 01:32:16,293 - INFO - allennlp.common.util - lemma_rule_classifier.classifier.1.weight 2024-07-01 01:32:16,294 - INFO - allennlp.common.util - lemma_rule_classifier.classifier.1.bias 2024-07-01 01:32:16,294 - INFO - allennlp.common.util - lemma_rule_classifier.classifier.4.weight 2024-07-01 01:32:16,294 - INFO - allennlp.common.util - lemma_rule_classifier.classifier.4.bias 2024-07-01 01:32:16,294 - INFO - allennlp.common.util - pos_feats_classifier.classifier.1.weight 2024-07-01 01:32:16,294 - INFO - allennlp.common.util - pos_feats_classifier.classifier.1.bias 2024-07-01 01:32:16,294 - INFO - allennlp.common.util - pos_feats_classifier.classifier.4.weight 2024-07-01 01:32:16,294 - INFO - allennlp.common.util - pos_feats_classifier.classifier.4.bias 2024-07-01 01:32:16,294 - INFO - allennlp.common.util - dependency_classifier.arc_dep_mlp.1.weight 2024-07-01 01:32:16,294 - INFO - allennlp.common.util - dependency_classifier.arc_dep_mlp.1.bias 2024-07-01 01:32:16,294 - INFO - allennlp.common.util - dependency_classifier.arc_head_mlp.1.weight 2024-07-01 01:32:16,294 - INFO - allennlp.common.util - dependency_classifier.arc_head_mlp.1.bias 2024-07-01 01:32:16,294 - INFO - allennlp.common.util - dependency_classifier.rel_dep_mlp.1.weight 2024-07-01 01:32:16,294 - INFO - allennlp.common.util - dependency_classifier.rel_dep_mlp.1.bias 2024-07-01 01:32:16,294 - INFO - allennlp.common.util - dependency_classifier.rel_head_mlp.1.weight 2024-07-01 01:32:16,294 - INFO - allennlp.common.util - dependency_classifier.rel_head_mlp.1.bias 2024-07-01 01:32:16,294 - INFO - allennlp.common.util - dependency_classifier.arc_attention_ud._weight_matrix 2024-07-01 01:32:16,294 - INFO - allennlp.common.util - dependency_classifier.arc_attention_ud._bias 2024-07-01 01:32:16,294 - INFO - allennlp.common.util - dependency_classifier.rel_attention_ud._weight_matrix 2024-07-01 01:32:16,294 - INFO - allennlp.common.util - dependency_classifier.rel_attention_ud._bias 2024-07-01 01:32:16,294 - INFO - allennlp.common.util - dependency_classifier.arc_attention_eud._weight_matrix 2024-07-01 01:32:16,294 - INFO - allennlp.common.util - dependency_classifier.arc_attention_eud._bias 2024-07-01 01:32:16,294 - INFO - allennlp.common.util - dependency_classifier.rel_attention_eud._weight_matrix 2024-07-01 01:32:16,294 - INFO - allennlp.common.util - dependency_classifier.rel_attention_eud._bias 2024-07-01 01:32:16,294 - INFO - allennlp.common.util - misc_classifier.classifier.1.weight 2024-07-01 01:32:16,294 - INFO - allennlp.common.util - misc_classifier.classifier.1.bias 2024-07-01 01:32:16,294 - INFO - allennlp.common.util - misc_classifier.classifier.4.weight 2024-07-01 01:32:16,294 - INFO - allennlp.common.util - misc_classifier.classifier.4.bias 2024-07-01 01:32:16,294 - INFO - allennlp.common.util - semslot_classifier.classifier.1.weight 2024-07-01 01:32:16,294 - INFO - allennlp.common.util - 
semslot_classifier.classifier.1.bias 2024-07-01 01:32:16,294 - INFO - allennlp.common.util - semslot_classifier.classifier.4.weight 2024-07-01 01:32:16,294 - INFO - allennlp.common.util - semslot_classifier.classifier.4.bias 2024-07-01 01:32:16,294 - INFO - allennlp.common.util - semclass_classifier.classifier.1.weight 2024-07-01 01:32:16,294 - INFO - allennlp.common.util - semclass_classifier.classifier.1.bias 2024-07-01 01:32:16,294 - INFO - allennlp.common.util - semclass_classifier.classifier.4.weight 2024-07-01 01:32:16,294 - INFO - allennlp.common.util - semclass_classifier.classifier.4.bias 2024-07-01 01:32:16,294 - INFO - allennlp.common.util - null_classifier.classifier.1.weight 2024-07-01 01:32:16,294 - INFO - allennlp.common.util - null_classifier.classifier.1.bias 2024-07-01 01:32:16,294 - INFO - allennlp.common.util - null_classifier.classifier.4.weight 2024-07-01 01:32:16,295 - INFO - allennlp.common.util - null_classifier.classifier.4.bias 2024-07-01 01:32:16,295 - INFO - allennlp.common.params - trainer.learning_rate_scheduler.type = slanted_triangular 2024-07-01 01:32:16,295 - INFO - allennlp.common.params - trainer.learning_rate_scheduler.cut_frac = 0 2024-07-01 01:32:16,295 - INFO - allennlp.common.params - trainer.learning_rate_scheduler.ratio = 32 2024-07-01 01:32:16,295 - INFO - allennlp.common.params - trainer.learning_rate_scheduler.last_epoch = -1 2024-07-01 01:32:16,295 - INFO - allennlp.common.params - trainer.learning_rate_scheduler.gradual_unfreezing = True 2024-07-01 01:32:16,295 - INFO - allennlp.common.params - trainer.learning_rate_scheduler.discriminative_fine_tuning = True 2024-07-01 01:32:16,295 - INFO - allennlp.common.params - trainer.learning_rate_scheduler.decay_factor = 0.001 2024-07-01 01:32:16,295 - INFO - allennlp.training.learning_rate_schedulers.slanted_triangular - Gradual unfreezing. Training only the top 1 layers. 2024-07-01 01:32:16,295 - INFO - allennlp.common.params - type = default 2024-07-01 01:32:16,295 - INFO - allennlp.common.params - save_completed_epochs = True 2024-07-01 01:32:16,295 - INFO - allennlp.common.params - save_every_num_seconds = None 2024-07-01 01:32:16,295 - INFO - allennlp.common.params - save_every_num_batches = None 2024-07-01 01:32:16,295 - INFO - allennlp.common.params - keep_most_recent_by_count = 2 2024-07-01 01:32:16,295 - INFO - allennlp.common.params - keep_most_recent_by_age = None 2024-07-01 01:32:16,295 - INFO - allennlp.common.params - trainer.callbacks.0.type = tensorboard 2024-07-01 01:32:16,296 - INFO - allennlp.common.params - trainer.callbacks.0.summary_interval = 100 2024-07-01 01:32:16,296 - INFO - allennlp.common.params - trainer.callbacks.0.distribution_interval = None 2024-07-01 01:32:16,296 - INFO - allennlp.common.params - trainer.callbacks.0.batch_size_interval = None 2024-07-01 01:32:16,296 - INFO - allennlp.common.params - trainer.callbacks.0.should_log_parameter_statistics = False 2024-07-01 01:32:16,296 - INFO - allennlp.common.params - trainer.callbacks.0.should_log_learning_rate = True 2024-07-01 01:32:16,297 - WARNING - allennlp.training.gradient_descent_trainer - You provided a validation dataset but patience was set to None, meaning that early stopping is disabled 2024-07-01 01:32:16,298 - INFO - allennlp.training.gradient_descent_trainer - Beginning training. 
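The parameter names registered above outline the model's structure: each tagging task (lemma rules, POS+feats, misc, semslot, semclass, null prediction) gets a small feed-forward head whose trainable layers sit at Sequential indices `classifier.1` and `classifier.4`, and the dependency component uses separate dep/head MLPs feeding biaffine attention matrices for basic (UD) and enhanced (EUD) arcs. The following is a minimal, hypothetical sketch of what such modules typically look like; the class names, dimensions, and dropout placement are assumptions inferred from the parameter names, not this project's actual code.

```python
import torch
import torch.nn as nn

# Hypothetical sketch only: names and hyperparameters below are inferred from the
# parameter listing in the log (e.g. "lemma_rule_classifier.classifier.1.weight",
# "dependency_classifier.arc_attention_ud._weight_matrix"), not taken from the codebase.

class FeedForwardHead(nn.Module):
    """A small classification head whose trainable weights sit at Sequential indices
    1 and 4, matching the "classifier.1.*" / "classifier.4.*" names above; indices
    0, 2 and 3 would then be parameter-free layers such as dropout and activation."""
    def __init__(self, input_dim: int, hidden_dim: int, num_classes: int, dropout: float = 0.1):
        super().__init__()
        self.classifier = nn.Sequential(
            nn.Dropout(dropout),                  # index 0: no parameters
            nn.Linear(input_dim, hidden_dim),     # index 1: weight + bias
            nn.ReLU(),                            # index 2
            nn.Dropout(dropout),                  # index 3
            nn.Linear(hidden_dim, num_classes),   # index 4: weight + bias
        )

    def forward(self, embeddings: torch.Tensor) -> torch.Tensor:
        return self.classifier(embeddings)


class BiaffineArcScorer(nn.Module):
    """Dozat & Manning-style arc scorer suggested by the arc_dep_mlp / arc_head_mlp /
    arc_attention_* parameter names."""
    def __init__(self, encoder_dim: int, arc_dim: int = 512, dropout: float = 0.1):
        super().__init__()
        self.arc_dep_mlp = nn.Sequential(nn.Dropout(dropout), nn.Linear(encoder_dim, arc_dim), nn.ReLU())
        self.arc_head_mlp = nn.Sequential(nn.Dropout(dropout), nn.Linear(encoder_dim, arc_dim), nn.ReLU())
        # Biaffine attention: an extra input column on the head side acts as a bias feature.
        self._weight_matrix = nn.Parameter(torch.zeros(arc_dim + 1, arc_dim))
        self._bias = nn.Parameter(torch.zeros(1))

    def forward(self, embeddings: torch.Tensor) -> torch.Tensor:
        # embeddings: [batch, seq_len, encoder_dim]
        head = self.arc_head_mlp(embeddings)   # [B, T, arc_dim]
        dep = self.arc_dep_mlp(embeddings)     # [B, T, arc_dim]
        head = torch.cat([head, torch.ones_like(head[..., :1])], dim=-1)  # append bias feature
        # scores[b, i, j] = score of token i being the syntactic head of token j
        return torch.einsum("bif,fg,bjg->bij", head, self._weight_matrix, dep) + self._bias


# Shape check with toy inputs (2 sentences, 5 tokens, 768-dim encoder states):
scores = BiaffineArcScorer(encoder_dim=768)(torch.randn(2, 5, 768))
print(scores.shape)  # torch.Size([2, 5, 5])
```

In the logged run there is one weight matrix and bias per objective (arc_attention_ud, rel_attention_ud, arc_attention_eud, rel_attention_eud); only a single arc scorer is shown here for brevity.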
2024-07-01 01:32:16,298 - INFO - allennlp.training.gradient_descent_trainer - Epoch 0/9 2024-07-01 01:32:16,299 - INFO - allennlp.training.gradient_descent_trainer - Worker 0 memory usage: 4.5G 2024-07-01 01:32:16,299 - INFO - allennlp.training.gradient_descent_trainer - GPU 0 memory usage: 1.1G 2024-07-01 01:32:16,300 - INFO - allennlp.training.gradient_descent_trainer - Training 2024-07-01 01:32:16,300 - INFO - tqdm - 0%| | 0/1147 [00:00") 2024-07-01 01:32:16,713 - INFO - allennlp.training.callbacks.console_logger - batch_input/lemma_rule_labels (Shape: 24 x 37) tensor([[ 0, 0, 0, ..., 0, 0, 0], [ 0, 27, 11, ..., 0, 0, 0], [ 0, 0, 5, ..., 0, 0, 0], ..., [ 5, 0, 0, ..., 0, 0, 0], [ 0, 0, 4, ..., 0, 0, 0], [ 0, 4, 0, ..., 0, 0, 0]], device='cuda:0') 2024-07-01 01:32:16,714 - INFO - allennlp.training.callbacks.console_logger - batch_input/pos_feats_labels (Shape: 24 x 37) tensor([[ 8, 74, 91, ..., 0, 0, 0], [ 1, 222, 144, ..., 0, 0, 0], [ 2, 0, 152, ..., 0, 0, 0], ..., [ 95, 38, 1, ..., 0, 0, 0], [ 2, 22, 18, ..., 0, 0, 0], [ 48, 131, 0, ..., 0, 0, 0]], device='cuda:0') 2024-07-01 01:32:16,715 - INFO - allennlp.training.callbacks.console_logger - batch_input/deprel_labels (Shape: 24 x 37 x 37) tensor([[[-1, -1, -1, ..., -1, -1, -1], [ 1, -1, -1, ..., -1, -1, -1], [-1, 29, -1, ..., -1, -1, -1], ..., [-1, -1, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1]], [[-1, 2, -1, ..., -1, -1, -1], [-1, -1, 4, ..., -1, -1, -1], [-1, -1, 5, ..., -1, -1, -1], ..., [-1, -1, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1]], [[-1, -1, -1, ..., -1, -1, -1], [ 0, -1, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1], ..., [-1, -1, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1]], ..., [[-1, -1, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1], ..., [-1, -1, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1]], [[-1, 11, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1], [-1, 1, -1, ..., -1, -1, -1], ..., [-1, -1, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1]], [[-1, 2, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1], [-1, 0, -1, ..., -1, -1, -1], ..., [-1, -1, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1]]], device='cuda:0') 2024-07-01 01:32:16,722 - INFO - allennlp.training.callbacks.console_logger - batch_input/deps_labels (Shape: 24 x 37 x 37) tensor([[[-1, -1, -1, ..., -1, -1, -1], [ 4, -1, -1, ..., -1, -1, -1], [-1, 44, -1, ..., -1, -1, -1], ..., [-1, -1, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1]], [[-1, 1, -1, ..., -1, -1, -1], [-1, -1, 22, ..., -1, -1, -1], [-1, -1, 2, ..., -1, -1, -1], ..., [-1, -1, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1]], [[-1, -1, -1, ..., -1, -1, -1], [ 0, -1, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1], ..., [-1, -1, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1]], ..., [[-1, -1, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1], ..., [-1, -1, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1]], [[-1, 11, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1], [-1, 4, -1, ..., -1, -1, -1], ..., [-1, -1, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1]], [[-1, 1, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, 
-1], [-1, 0, -1, ..., -1, -1, -1], ..., [-1, -1, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1]]], device='cuda:0') 2024-07-01 01:32:16,728 - INFO - allennlp.training.callbacks.console_logger - batch_input/misc_labels (Shape: 24 x 37) tensor([[0, 0, 0, ..., 0, 0, 0], [0, 0, 0, ..., 0, 0, 0], [1, 0, 0, ..., 0, 0, 0], ..., [0, 2, 0, ..., 0, 0, 0], [0, 0, 0, ..., 0, 0, 0], [0, 0, 0, ..., 0, 0, 0]], device='cuda:0') 2024-07-01 01:32:16,729 - INFO - allennlp.training.callbacks.console_logger - batch_input/semslot_labels (Shape: 24 x 37) tensor([[ 2, 3, 0, ..., 0, 0, 0], [ 0, 25, 1, ..., 0, 0, 0], [19, 0, 7, ..., 0, 0, 0], ..., [13, 21, 0, ..., 0, 0, 0], [23, 2, 12, ..., 0, 0, 0], [ 0, 23, 0, ..., 0, 0, 0]], device='cuda:0') 2024-07-01 01:32:16,731 - INFO - allennlp.training.callbacks.console_logger - batch_input/semclass_labels (Shape: 24 x 37) tensor([[189, 20, 20, ..., 0, 0, 0], [ 1, 15, 19, ..., 0, 0, 0], [ 11, 0, 9, ..., 0, 0, 0], ..., [ 3, 15, 1, ..., 0, 0, 0], [ 11, 37, 2, ..., 0, 0, 0], [ 0, 11, 0, ..., 0, 0, 0]], device='cuda:0') 2024-07-01 01:32:16,732 - INFO - allennlp.training.callbacks.console_logger - Field : "batch_input/metadata" : (Length 24 of type "") 2024-07-01 01:32:26,396 - INFO - tqdm - NullAccuracy: 0.9818, NullF1: 0.0769, Lemma: 0.7297, PosFeats: 0.6471, UD-UAS: 0.4263, UD-LAS: 0.4118, EUD-UAS: 0.1424, EUD-LAS: 0.1220, Misc: 0.8961, SS: 0.6627, SC: 0.6433, Avg: 0.5201, batch_loss: 7.6293, loss: 11.3000 ||: 16%|#5 | 179/1147 [00:10<00:53, 17.95it/s] 2024-07-01 01:32:36,457 - INFO - tqdm - NullAccuracy: 0.9848, NullF1: 0.1274, Lemma: 0.7833, PosFeats: 0.7376, UD-UAS: 0.4983, UD-LAS: 0.4867, EUD-UAS: 0.2040, EUD-LAS: 0.1826, Misc: 0.9107, SS: 0.7025, SC: 0.7055, Avg: 0.5790, batch_loss: 5.4980, loss: 8.8952 ||: 32%|###1 | 362/1147 [00:20<00:45, 17.12it/s] 2024-07-01 01:32:46,538 - INFO - tqdm - NullAccuracy: 0.9857, NullF1: 0.1675, Lemma: 0.8084, PosFeats: 0.7764, UD-UAS: 0.5376, UD-LAS: 0.5274, EUD-UAS: 0.2393, EUD-LAS: 0.2183, Misc: 0.9185, SS: 0.7222, SC: 0.7351, Avg: 0.6093, batch_loss: 4.8169, loss: 7.8232 ||: 48%|####7 | 547/1147 [00:30<00:30, 19.42it/s] 2024-07-01 01:32:56,590 - INFO - tqdm - NullAccuracy: 0.9864, NullF1: 0.1917, Lemma: 0.8242, PosFeats: 0.7990, UD-UAS: 0.5641, UD-LAS: 0.5552, EUD-UAS: 0.2649, EUD-LAS: 0.2447, Misc: 0.9236, SS: 0.7351, SC: 0.7546, Avg: 0.6295, batch_loss: 5.2682, loss: 7.1602 ||: 64%|######3 | 730/1147 [00:40<00:23, 17.81it/s] 2024-07-01 01:33:06,690 - INFO - tqdm - NullAccuracy: 0.9869, NullF1: 0.2129, Lemma: 0.8366, PosFeats: 0.8144, UD-UAS: 0.5816, UD-LAS: 0.5735, EUD-UAS: 0.2811, EUD-LAS: 0.2618, Misc: 0.9269, SS: 0.7444, SC: 0.7686, Avg: 0.6432, batch_loss: 5.2146, loss: 6.7194 ||: 79%|#######9 | 911/1147 [00:50<00:13, 18.11it/s] 2024-07-01 01:33:16,753 - INFO - tqdm - NullAccuracy: 0.9873, NullF1: 0.2327, Lemma: 0.8466, PosFeats: 0.8269, UD-UAS: 0.5948, UD-LAS: 0.5873, EUD-UAS: 0.2932, EUD-LAS: 0.2747, Misc: 0.9296, SS: 0.7514, SC: 0.7794, Avg: 0.6538, batch_loss: 4.8879, loss: 6.3822 ||: 95%|#########4| 1089/1147 [01:00<00:03, 17.51it/s] 2024-07-01 01:33:19,729 - INFO - tqdm - NullAccuracy: 0.9874, NullF1: 0.2370, Lemma: 0.8490, PosFeats: 0.8300, UD-UAS: 0.5989, UD-LAS: 0.5916, EUD-UAS: 0.2972, EUD-LAS: 0.2789, Misc: 0.9304, SS: 0.7534, SC: 0.7824, Avg: 0.6569, batch_loss: 4.7072, loss: 6.2913 ||: 100%|#########9| 1143/1147 [01:03<00:00, 17.37it/s] 2024-07-01 01:33:19,839 - INFO - tqdm - NullAccuracy: 0.9874, NullF1: 0.2372, Lemma: 0.8490, PosFeats: 0.8301, UD-UAS: 0.5991, UD-LAS: 0.5917, 
EUD-UAS: 0.2973, EUD-LAS: 0.2790, Misc: 0.9304, SS: 0.7535, SC: 0.7825, Avg: 0.6570, batch_loss: 4.8284, loss: 6.2886 ||: 100%|#########9| 1145/1147 [01:03<00:00, 17.62it/s] 2024-07-01 01:33:19,925 - INFO - tqdm - NullAccuracy: 0.9874, NullF1: 0.2375, Lemma: 0.8491, PosFeats: 0.8302, UD-UAS: 0.5993, UD-LAS: 0.5919, EUD-UAS: 0.2975, EUD-LAS: 0.2792, Misc: 0.9305, SS: 0.7536, SC: 0.7826, Avg: 0.6571, batch_loss: 3.4145, loss: 6.2846 ||: 100%|##########| 1147/1147 [01:03<00:00, 18.03it/s] 2024-07-01 01:33:19,925 - INFO - allennlp.training.gradient_descent_trainer - Validating 2024-07-01 01:33:19,926 - INFO - tqdm - 0%| | 0/287 [00:00") 2024-07-01 01:33:20,023 - INFO - allennlp.training.callbacks.console_logger - batch_input/lemma_rule_labels (Shape: 24 x 35) tensor([[ 8, 0, 0, ..., 0, 0, 0], [ 0, 8, 0, ..., 0, 0, 0], [ 0, 13, 62, ..., 0, 0, 0], ..., [ 0, 11, 0, ..., 0, 0, 0], [ 0, 0, 13, ..., 0, 0, 0], [124, 41, 36, ..., 0, 0, 0]], device='cuda:0') 2024-07-01 01:33:20,024 - INFO - allennlp.training.callbacks.console_logger - batch_input/pos_feats_labels (Shape: 24 x 35) tensor([[158, 0, 34, ..., 0, 0, 0], [ 14, 41, 12, ..., 0, 0, 0], [197, 24, 116, ..., 0, 0, 0], ..., [ 8, 33, 2, ..., 22, 0, 0], [ 0, 12, 24, ..., 0, 0, 0], [721, 94, 57, ..., 0, 0, 0]], device='cuda:0') 2024-07-01 01:33:20,025 - INFO - allennlp.training.callbacks.console_logger - batch_input/deprel_labels (Shape: 24 x 35 x 35) tensor([[[ 5, -1, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1], ..., [-1, -1, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1]], [[-1, 13, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1], [-1, 3, -1, ..., -1, -1, -1], ..., [-1, -1, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1]], [[-1, -1, 3, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1], [-1, -1, 5, ..., -1, -1, -1], ..., [-1, -1, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1]], ..., [[-1, 3, -1, ..., -1, -1, -1], [-1, 5, -1, ..., -1, -1, -1], [-1, 7, -1, ..., -1, -1, -1], ..., [-1, -1, -1, ..., -1, -1, -1], [-1, 0, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1]], [[-1, -1, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1], ..., [-1, -1, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1]], [[-1, 10, -1, ..., -1, -1, -1], [-1, -1, 21, ..., -1, -1, -1], [-1, -1, 5, ..., -1, -1, -1], ..., [-1, -1, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1]]], device='cuda:0') 2024-07-01 01:33:20,032 - INFO - allennlp.training.callbacks.console_logger - batch_input/deps_labels (Shape: 24 x 35 x 35) tensor([[[ 2, -1, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1], ..., [-1, -1, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1]], [[-1, 14, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1], [-1, 3, -1, ..., -1, -1, -1], ..., [-1, -1, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1]], [[-1, -1, 3, ..., -1, -1, -1], [-1, -1, 21, ..., -1, -1, -1], [-1, -1, 2, ..., -1, -1, -1], ..., [-1, -1, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1]], ..., [[-1, 3, -1, ..., -1, -1, -1], [-1, 2, -1, ..., -1, -1, -1], [-1, 6, -1, ..., -1, -1, -1], ..., [-1, -1, -1, ..., -1, -1, -1], [-1, 0, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1]], [[-1, -1, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, 
-1], [-1, -1, -1, ..., -1, -1, -1], ..., [-1, -1, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1]], [[-1, 9, -1, ..., -1, -1, -1], [-1, -1, 24, ..., -1, -1, -1], [-1, -1, 2, ..., -1, -1, -1], ..., [-1, -1, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1], [-1, -1, -1, ..., -1, -1, -1]]], device='cuda:0') 2024-07-01 01:33:20,038 - INFO - allennlp.training.callbacks.console_logger - batch_input/misc_labels (Shape: 24 x 35) tensor([[1, 0, 0, ..., 0, 0, 0], [0, 0, 0, ..., 0, 0, 0], [0, 2, 0, ..., 0, 0, 0], ..., [0, 0, 0, ..., 1, 0, 0], [1, 0, 2, ..., 0, 0, 0], [0, 0, 0, ..., 0, 0, 0]], device='cuda:0') 2024-07-01 01:33:20,039 - INFO - allennlp.training.callbacks.console_logger - batch_input/semslot_labels (Shape: 24 x 35) tensor([[ 1, 0, 10, ..., 0, 0, 0], [ 0, 19, 20, ..., 0, 0, 0], [ 3, 1, 15, ..., 0, 0, 0], ..., [ 2, 1, 23, ..., 82, 0, 0], [ 0, 3, 1, ..., 0, 0, 0], [13, 31, 1, ..., 0, 0, 0]], device='cuda:0') 2024-07-01 01:33:20,041 - INFO - allennlp.training.callbacks.console_logger - batch_input/semclass_labels (Shape: 24 x 35) tensor([[ 28, 0, 373, ..., 0, 0, 0], [ 10, 6, 60, ..., 0, 0, 0], [ 24, 8, 12, ..., 0, 0, 0], ..., [ 87, 22, 120, ..., 222, 0, 0], [ 0, 2, 8, ..., 0, 0, 0], [ 3, 16, 6, ..., 0, 0, 0]], device='cuda:0') 2024-07-01 01:33:20,042 - INFO - allennlp.training.callbacks.console_logger - Field : "batch_input/metadata" : (Length 24 of type "") 2024-07-01 01:33:29,991 - INFO - tqdm - NullAccuracy: 0.9893, NullF1: 0.3847, Lemma: 0.9299, PosFeats: 0.9255, UD-UAS: 0.7652, UD-LAS: 0.7662, EUD-UAS: 0.4128, EUD-LAS: 0.3992, Misc: 0.9596, SS: 0.8146, SC: 0.8776, Avg: 0.7612, batch_loss: 3.0156, loss: 3.4289 ||: 45%|####5 | 130/287 [00:10<00:10, 14.94it/s] 2024-07-01 01:33:40,019 - INFO - tqdm - NullAccuracy: 0.9896, NullF1: 0.3925, Lemma: 0.9335, PosFeats: 0.9291, UD-UAS: 0.7714, UD-LAS: 0.7721, EUD-UAS: 0.4227, EUD-LAS: 0.4091, Misc: 0.9610, SS: 0.8174, SC: 0.8809, Avg: 0.7663, batch_loss: 3.4157, loss: 3.2924 ||: 95%|#########4| 272/287 [00:20<00:01, 13.98it/s] 2024-07-01 01:33:40,964 - INFO - tqdm - NullAccuracy: 0.9896, NullF1: 0.3936, Lemma: 0.9336, PosFeats: 0.9295, UD-UAS: 0.7720, UD-LAS: 0.7727, EUD-UAS: 0.4236, EUD-LAS: 0.4098, Misc: 0.9609, SS: 0.8173, SC: 0.8810, Avg: 0.7667, batch_loss: 2.6802, loss: 3.2845 ||: 100%|#########9| 286/287 [00:21<00:00, 14.80it/s] 2024-07-01 01:33:41,017 - INFO - tqdm - NullAccuracy: 0.9896, NullF1: 0.3941, Lemma: 0.9336, PosFeats: 0.9296, UD-UAS: 0.7722, UD-LAS: 0.7729, EUD-UAS: 0.4236, EUD-LAS: 0.4098, Misc: 0.9609, SS: 0.8173, SC: 0.8810, Avg: 0.7668, batch_loss: 2.8979, loss: 3.2831 ||: 100%|##########| 287/287 [00:21<00:00, 13.61it/s] 2024-07-01 01:33:41,017 - INFO - allennlp.training.learning_rate_schedulers.slanted_triangular - Gradual unfreezing finished. Training all layers. 
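The `deprel_labels` and `deps_labels` batches dumped during the epoch are square `[batch, len, len]` matrices in which -1 marks cells with no annotated arc. Purely as an illustrative sketch (not taken from this codebase), such a layout is commonly consumed by flattening the matrix and letting `ignore_index` drop the padded cells:

```python
import torch
import torch.nn.functional as F

def masked_arc_label_loss(rel_logits: torch.Tensor, rel_labels: torch.Tensor) -> torch.Tensor:
    """rel_logits: [batch, seq, seq, num_labels]; rel_labels: [batch, seq, seq],
    where -1 marks cells without an annotated arc (as in the dumps above)."""
    num_labels = rel_logits.size(-1)
    return F.cross_entropy(
        rel_logits.reshape(-1, num_labels),  # one row per (dependent, head) cell
        rel_labels.reshape(-1),              # -1 cells contribute nothing to the loss
        ignore_index=-1,
    )

# Tiny usage example with shapes matching the validation batch above (24 x 35 x 35);
# the relation id and cell position are made up purely for illustration.
logits = torch.randn(24, 35, 35, 40)
labels = torch.full((24, 35, 35), -1, dtype=torch.long)
labels[:, 1, 0] = 3
print(masked_arc_label_loss(logits, labels))
```

With this convention the mostly -1 cells visible in the dumps contribute nothing to the gradient; only annotated arcs are scored.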
2024-07-01 01:33:41,020 - INFO - allennlp.training.callbacks.console_logger - Training | Validation
2024-07-01 01:33:41,021 - INFO - allennlp.training.callbacks.console_logger - Avg | 0.657 | 0.767
2024-07-01 01:33:41,021 - INFO - allennlp.training.callbacks.console_logger - EUD-LAS | 0.279 | 0.410
2024-07-01 01:33:41,021 - INFO - allennlp.training.callbacks.console_logger - EUD-UAS | 0.298 | 0.424
2024-07-01 01:33:41,021 - INFO - allennlp.training.callbacks.console_logger - Lemma | 0.849 | 0.934
2024-07-01 01:33:41,021 - INFO - allennlp.training.callbacks.console_logger - Misc | 0.930 | 0.961
2024-07-01 01:33:41,021 - INFO - allennlp.training.callbacks.console_logger - NullAccuracy | 0.987 | 0.990
2024-07-01 01:33:41,021 - INFO - allennlp.training.callbacks.console_logger - NullF1 | 0.237 | 0.394
2024-07-01 01:33:41,021 - INFO - allennlp.training.callbacks.console_logger - PosFeats | 0.830 | 0.930
2024-07-01 01:33:41,021 - INFO - allennlp.training.callbacks.console_logger - SC | 0.783 | 0.881
2024-07-01 01:33:41,021 - INFO - allennlp.training.callbacks.console_logger - SS | 0.754 | 0.817
2024-07-01 01:33:41,021 - INFO - allennlp.training.callbacks.console_logger - UD-LAS | 0.592 | 0.773
2024-07-01 01:33:41,021 - INFO - allennlp.training.callbacks.console_logger - UD-UAS | 0.599 | 0.772
2024-07-01 01:33:41,021 - INFO - allennlp.training.callbacks.console_logger - gpu_0_memory_MB | 1096.931 | N/A
2024-07-01 01:33:41,021 - INFO - allennlp.training.callbacks.console_logger - loss | 6.285 | 3.283
2024-07-01 01:33:41,021 - INFO - allennlp.training.callbacks.console_logger - worker_0_memory_MB | 4585.289 | N/A
2024-07-01 01:33:42,564 - INFO - allennlp.training.gradient_descent_trainer - Epoch duration: 0:01:26.265534
2024-07-01 01:33:42,564 - INFO - allennlp.training.gradient_descent_trainer - Estimated training time remaining: 0:12:42
2024-07-01 01:33:42,564 - INFO - allennlp.training.gradient_descent_trainer - Epoch 1/9
2024-07-01 01:33:42,564 - INFO - allennlp.training.gradient_descent_trainer - Worker 0 memory usage: 4.9G
2024-07-01 01:33:42,565 - INFO - allennlp.training.gradient_descent_trainer - GPU 0 memory usage: 3.0G
2024-07-01 01:33:42,566 - INFO - allennlp.training.gradient_descent_trainer - Training
2024-07-01 01:33:42,566 - INFO - tqdm - 0%| | 0/1147 [00:00