File size: 10,400 Bytes
a599152 d73ed40 a599152 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 |
# coding=utf-8
# Copyright 2022 Microsoft, clefourrier and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Graphormer model configuration"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
logger = logging.get_logger(__name__)
GRAPHORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
# pcqm4mv1 now deprecated
"graphormer-base": "https://huggingface.co/clefourrier/graphormer-base-pcqm4mv2/resolve/main/config.json",
# See all Graphormer models at https://huggingface.co/models?filter=graphormer
}
class GraphormerConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`~GraphormerModel`]. It is used to instantiate an
Graphormer model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the Graphormer
[graphormer-base-pcqm4mv1](https://huggingface.co/graphormer-base-pcqm4mv1) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
num_classes (`int`, *optional*, defaults to 1):
Number of target classes or labels, set to n for binary classification of n tasks.
num_atoms (`int`, *optional*, defaults to 512*9):
Number of node types in the graphs.
num_edges (`int`, *optional*, defaults to 512*3):
Number of edges types in the graph.
num_in_degree (`int`, *optional*, defaults to 512):
Number of in degrees types in the input graphs.
num_out_degree (`int`, *optional*, defaults to 512):
Number of out degrees types in the input graphs.
num_edge_dis (`int`, *optional*, defaults to 128):
Number of edge dis in the input graphs.
multi_hop_max_dist (`int`, *optional*, defaults to 20):
Maximum distance of multi hop edges between two nodes.
spatial_pos_max (`int`, *optional*, defaults to 1024):
Maximum distance between nodes in the graph attention bias matrices, used during preprocessing and
collation.
edge_type (`str`, *optional*, defaults to multihop):
Type of edge relation chosen.
max_nodes (`int`, *optional*, defaults to 512):
Maximum number of nodes which can be parsed for the input graphs.
share_input_output_embed (`bool`, *optional*, defaults to `False`):
Shares the embedding layer between encoder and decoder - careful, True is not implemented.
num_layers (`int`, *optional*, defaults to 12):
Number of layers.
embedding_dim (`int`, *optional*, defaults to 768):
Dimension of the embedding layer in encoder.
ffn_embedding_dim (`int`, *optional*, defaults to 768):
Dimension of the "intermediate" (often named feed-forward) layer in encoder.
num_attention_heads (`int`, *optional*, defaults to 32):
Number of attention heads in the encoder.
self_attention (`bool`, *optional*, defaults to `True`):
Model is self attentive (False not implemented).
activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"silu"` and `"gelu_new"` are supported.
dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for the attention weights.
layerdrop (`float`, *optional*, defaults to 0.0):
The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
for more details.
bias (`bool`, *optional*, defaults to `True`):
Uses bias in the attention module - unsupported at the moment.
embed_scale(`float`, *optional*, defaults to None):
Scaling factor for the node embeddings.
num_trans_layers_to_freeze (`int`, *optional*, defaults to 0):
Number of transformer layers to freeze.
encoder_normalize_before (`bool`, *optional*, defaults to `False`):
Normalize features before encoding the graph.
pre_layernorm (`bool`, *optional*, defaults to `False`):
Apply layernorm before self attention and the feed forward network. Without this, post layernorm will be
used.
apply_graphormer_init (`bool`, *optional*, defaults to `False`):
Apply a custom graphormer initialisation to the model before training.
freeze_embeddings (`bool`, *optional*, defaults to `False`):
Freeze the embedding layer, or train it along the model.
encoder_normalize_before (`bool`, *optional*, defaults to `False`):
Apply the layer norm before each encoder block.
q_noise (`float`, *optional*, defaults to 0.0):
Amount of quantization noise (see "Training with Quantization Noise for Extreme Model Compression"). (For
more detail, see fairseq's documentation on quant_noise).
qn_block_size (`int`, *optional*, defaults to 8):
Size of the blocks for subsequent quantization with iPQ (see q_noise).
kdim (`int`, *optional*, defaults to None):
Dimension of the key in the attention, if different from the other values.
vdim (`int`, *optional*, defaults to None):
Dimension of the value in the attention, if different from the other values.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models).
traceable (`bool`, *optional*, defaults to `False`):
Changes return value of the encoder's inner_state to stacked tensors.
Example:
```python
>>> from transformers import GraphormerForGraphClassification, GraphormerConfig
>>> # Initializing a Graphormer graphormer-base-pcqm4mv2 style configuration
>>> configuration = GraphormerConfig()
>>> # Initializing a model from the graphormer-base-pcqm4mv1 style configuration
>>> model = GraphormerForGraphClassification(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "graphormer"
keys_to_ignore_at_inference = ["past_key_values"]
def __init__(
self,
num_classes: int = 7,
num_atoms: int = 512 * 9,
num_edges: int = 512 * 3,
num_in_degree: int = 512,
num_out_degree: int = 512,
num_spatial: int = 512,
num_edge_dis: int = 128,
multi_hop_max_dist: int = 5, # sometimes is 20
spatial_pos_max: int = 1024,
edge_type: str = "multi_hop",
max_nodes: int = 512,
share_input_output_embed: bool = False,
num_hidden_layers: int = 12,
embedding_dim: int = 768,
ffn_embedding_dim: int = 768,
num_attention_heads: int = 32,
dropout: float = 0.1,
attention_dropout: float = 0.1,
layerdrop: float = 0.0,
encoder_normalize_before: bool = False,
pre_layernorm: bool = False,
apply_graphormer_init: bool = False,
activation_fn: str = "gelu",
embed_scale: float = None,
freeze_embeddings: bool = False,
num_trans_layers_to_freeze: int = 0,
traceable: bool = False,
q_noise: float = 0.0,
qn_block_size: int = 8,
kdim: int = None,
vdim: int = None,
bias: bool = True,
self_attention: bool = True,
pad_token_id=0,
bos_token_id=1,
eos_token_id=2,
**kwargs,
):
self.num_classes = num_classes
self.num_atoms = num_atoms
self.num_in_degree = num_in_degree
self.num_out_degree = num_out_degree
self.num_edges = num_edges
self.num_spatial = num_spatial
self.num_edge_dis = num_edge_dis
self.edge_type = edge_type
self.multi_hop_max_dist = multi_hop_max_dist
self.spatial_pos_max = spatial_pos_max
self.max_nodes = max_nodes
self.num_hidden_layers = num_hidden_layers
self.embedding_dim = embedding_dim
self.hidden_size = embedding_dim
self.ffn_embedding_dim = ffn_embedding_dim
self.num_attention_heads = num_attention_heads
self.dropout = dropout
self.attention_dropout = attention_dropout
self.layerdrop = layerdrop
self.encoder_normalize_before = encoder_normalize_before
self.pre_layernorm = pre_layernorm
self.apply_graphormer_init = apply_graphormer_init
self.activation_fn = activation_fn
self.embed_scale = embed_scale
self.freeze_embeddings = freeze_embeddings
self.num_trans_layers_to_freeze = num_trans_layers_to_freeze
self.share_input_output_embed = share_input_output_embed
self.traceable = traceable
self.q_noise = q_noise
self.qn_block_size = qn_block_size
# These parameters are here for future extensions
# atm, the model only supports self attention
self.kdim = kdim
self.vdim = vdim
self.self_attention = self_attention
self.bias = bias
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
**kwargs,
) |