Update modelling_hat.py
Browse files- modelling_hat.py +113 -110
modelling_hat.py
CHANGED
@@ -319,116 +319,119 @@ class SentenceClassifierOutput(ModelOutput):
|
|
319 |
sentence_attentions: Optional[Tuple[torch.FloatTensor]] = None
|
320 |
|
321 |
|
322 |
-
class HATConfig(PretrainedConfig):
|
323 |
-
|
324 |
-
|
325 |
-
|
326 |
-
|
327 |
-
|
328 |
-
|
329 |
-
|
330 |
-
|
331 |
-
|
332 |
-
|
333 |
-
|
334 |
-
|
335 |
-
|
336 |
-
|
337 |
-
|
338 |
-
|
339 |
-
|
340 |
-
|
341 |
-
|
342 |
-
|
343 |
-
|
344 |
-
|
345 |
-
|
346 |
-
|
347 |
-
|
348 |
-
|
349 |
-
|
350 |
-
|
351 |
-
|
352 |
-
|
353 |
-
|
354 |
-
|
355 |
-
|
356 |
-
|
357 |
-
|
358 |
-
|
359 |
-
|
360 |
-
|
361 |
-
|
362 |
-
|
363 |
-
|
364 |
-
|
365 |
-
|
366 |
-
|
367 |
-
|
368 |
-
|
369 |
-
|
370 |
-
|
371 |
-
|
372 |
-
|
373 |
-
|
374 |
-
|
375 |
-
|
376 |
-
|
377 |
-
|
378 |
-
|
379 |
-
|
380 |
-
|
381 |
-
|
382 |
-
|
383 |
-
|
384 |
-
|
385 |
-
|
386 |
-
|
387 |
-
|
388 |
-
|
389 |
-
|
390 |
-
|
391 |
-
|
392 |
-
|
393 |
-
|
394 |
-
|
395 |
-
|
396 |
-
|
397 |
-
|
398 |
-
|
399 |
-
|
400 |
-
|
401 |
-
|
402 |
-
|
403 |
-
|
404 |
-
|
405 |
-
|
406 |
-
|
407 |
-
|
408 |
-
|
409 |
-
|
410 |
-
|
411 |
-
|
412 |
-
|
413 |
-
|
414 |
-
|
415 |
-
|
416 |
-
|
417 |
-
|
418 |
-
|
419 |
-
|
420 |
-
|
421 |
-
|
422 |
-
|
423 |
-
|
424 |
-
|
425 |
-
|
426 |
-
|
427 |
-
|
428 |
-
|
429 |
-
|
430 |
-
|
431 |
-
|
|
|
|
|
|
|
432 |
|
433 |
class HATEmbeddings(nn.Module):
|
434 |
"""
|
|
|
319 |
sentence_attentions: Optional[Tuple[torch.FloatTensor]] = None
|
320 |
|
321 |
|
322 |
+
# class HATConfig(PretrainedConfig):
|
323 |
+
# r"""
|
324 |
+
# This is the configuration class to store the configuration of a :class:`~transformers.HAT`.
|
325 |
+
# It is used to instantiate a HAT model according to the specified arguments,
|
326 |
+
# defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration
|
327 |
+
# to that of the HAT `kiddothe2b/hat-base-4096 <https://huggingface.co/kiddothe2b/hat-base-4096>`__ architecture.
|
328 |
+
|
329 |
+
# Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
|
330 |
+
# outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
|
331 |
+
|
332 |
+
|
333 |
+
# Args:
|
334 |
+
# vocab_size (:obj:`int`, `optional`, defaults to 30522):
|
335 |
+
# Vocabulary size of the HAT model. Defines the number of different tokens that can be represented by the
|
336 |
+
# :obj:`input_ids` passed when calling :class:`~transformers.BertModel` or
|
337 |
+
# :class:`~transformers.TFBertModel`.
|
338 |
+
# max_sentences (:obj:`int`, `optional`, defaults to 64):
|
339 |
+
# The maximum number of sentences that this model might ever be used with.
|
340 |
+
# max_sentence_size (:obj:`int`, `optional`, defaults to 128):
|
341 |
+
# The maximum sentence length that this model might ever be used with.
|
342 |
+
# model_max_length (:obj:`int`, `optional`, defaults to 8192):
|
343 |
+
# The maximum sequence length (max_sentences * max_sentence_size) that this model might ever be used with.
|
344 |
+
# encoder_layout (:obj:`Dict`):
|
345 |
+
# The sentence/document encoder layout.
|
346 |
+
# hidden_size (:obj:`int`, `optional`, defaults to 768):
|
347 |
+
# Dimensionality of the encoder layers and the pooler layer.
|
348 |
+
# num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
|
349 |
+
# Number of hidden layers in the Transformer encoder.
|
350 |
+
# num_attention_heads (:obj:`int`, `optional`, defaults to 12):
|
351 |
+
# Number of attention heads for each attention layer in the Transformer encoder.
|
352 |
+
# intermediate_size (:obj:`int`, `optional`, defaults to 3072):
|
353 |
+
# Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
|
354 |
+
# hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
|
355 |
+
# The non-linear activation function (function or string) in the encoder and pooler. If string,
|
356 |
+
# :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
|
357 |
+
# hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
|
358 |
+
# The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
|
359 |
+
# attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
|
360 |
+
# The dropout ratio for the attention probabilities.
|
361 |
+
# max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
|
362 |
+
# The maximum sequence length that this model might ever be used with. Typically set this to something large
|
363 |
+
# just in case (e.g., 512 or 1024 or 2048).
|
364 |
+
# type_vocab_size (:obj:`int`, `optional`, defaults to 2):
|
365 |
+
# The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.BertModel` or
|
366 |
+
# :class:`~transformers.TFBertModel`.
|
367 |
+
# initializer_range (:obj:`float`, `optional`, defaults to 0.02):
|
368 |
+
# The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
369 |
+
# layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
|
370 |
+
# The epsilon used by the layer normalization layers.
|
371 |
+
# position_embedding_type (:obj:`str`, `optional`, defaults to :obj:`"absolute"`):
|
372 |
+
# Type of position embedding. Choose one of :obj:`"absolute"`, :obj:`"relative_key"`,
|
373 |
+
# :obj:`"relative_key_query"`. For positional embeddings use :obj:`"absolute"`. For more information on
|
374 |
+
# :obj:`"relative_key"`, please refer to `Self-Attention with Relative Position Representations (Shaw et al.)
|
375 |
+
# <https://arxiv.org/abs/1803.02155>`__. For more information on :obj:`"relative_key_query"`, please refer to
|
376 |
+
# `Method 4` in `Improve Transformer Models with Better Relative Position Embeddings (Huang et al.)
|
377 |
+
# <https://arxiv.org/abs/2009.13658>`__.
|
378 |
+
# use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
379 |
+
# Whether or not the model should return the last key/values attentions (not used by all models). Only
|
380 |
+
# relevant if ``config.is_decoder=True``.
|
381 |
+
# classifier_dropout (:obj:`float`, `optional`):
|
382 |
+
# The dropout ratio for the classification head.
|
383 |
+
# """
|
384 |
+
# model_type = "hierarchical-transformer"
|
385 |
+
|
386 |
+
# def __init__(
|
387 |
+
# self,
|
388 |
+
# vocab_size=30522,
|
389 |
+
# hidden_size=768,
|
390 |
+
# max_sentences=64,
|
391 |
+
# max_sentence_size=128,
|
392 |
+
# model_max_length=8192,
|
393 |
+
# num_hidden_layers=12,
|
394 |
+
# num_attention_heads=12,
|
395 |
+
# intermediate_size=3072,
|
396 |
+
# hidden_act="gelu",
|
397 |
+
# hidden_dropout_prob=0.1,
|
398 |
+
# attention_probs_dropout_prob=0.1,
|
399 |
+
# max_position_embeddings=512,
|
400 |
+
# type_vocab_size=2,
|
401 |
+
# initializer_range=0.02,
|
402 |
+
# layer_norm_eps=1e-12,
|
403 |
+
# pad_token_id=0,
|
404 |
+
# position_embedding_type="absolute",
|
405 |
+
# encoder_layout=None,
|
406 |
+
# use_cache=True,
|
407 |
+
# classifier_dropout=None,
|
408 |
+
# **kwargs
|
409 |
+
# ):
|
410 |
+
# super().__init__(pad_token_id=pad_token_id, **kwargs)
|
411 |
+
|
412 |
+
# self.vocab_size = vocab_size
|
413 |
+
# self.hidden_size = hidden_size
|
414 |
+
# self.max_sentences = max_sentences
|
415 |
+
# self.max_sentence_size = max_sentence_size
|
416 |
+
# self.model_max_length = model_max_length
|
417 |
+
# self.encoder_layout = encoder_layout
|
418 |
+
# self.num_hidden_layers = num_hidden_layers
|
419 |
+
# self.num_attention_heads = num_attention_heads
|
420 |
+
# self.hidden_act = hidden_act
|
421 |
+
# self.intermediate_size = intermediate_size
|
422 |
+
# self.hidden_dropout_prob = hidden_dropout_prob
|
423 |
+
# self.attention_probs_dropout_prob = attention_probs_dropout_prob
|
424 |
+
# self.max_position_embeddings = max_position_embeddings
|
425 |
+
# self.type_vocab_size = type_vocab_size
|
426 |
+
# self.initializer_range = initializer_range
|
427 |
+
# self.layer_norm_eps = layer_norm_eps
|
428 |
+
# self.position_embedding_type = position_embedding_type
|
429 |
+
# self.use_cache = use_cache
|
430 |
+
# self.classifier_dropout = classifier_dropout
|
431 |
+
|
432 |
+
|
433 |
+
|
434 |
+
from configuration_hat import HATConfig
|
435 |
|
436 |
class HATEmbeddings(nn.Module):
|
437 |
"""
|