{ "_name_or_path": "/home/dinalt/ai_assets/models/walsh_instruct", "activation_args": {}, "activation_cls": "torch.nn.GELU", "architectures": [ "HFCausalModel" ], "attention_args": { "beta": 0.25, "dropout": 0.1 }, "attention_cls": ".CausalSelfAttention", "auto_map": { "AutoConfig": "modelling_walsh.Config", "AutoModelForCausalLM": "modelling_walsh.HFCausalModel" }, "d_embed": 2048, "dim_feedforward": 8192, "dropout": 0.1, "embdding_cls": "torch.nn.Embedding", "embedding_args": {}, "feedforward_args": { "beta": 0.25, "bias": true }, "feedforward_cls": ".FeedforwardLayer", "head_args": {}, "head_cls": ".Transformer", "init_gain": 1.0, "layer_args": { "alpha": 2.828427124746 }, "layer_cls": ".DeepnetLayer", "layer_stack_args": {}, "layer_stack_cls": ".TransformerLayerStack", "loss_function": ".causal_loss", "max_sequence_length": 16384, "model_type": "walsh-causal-v1", "norm_args": { "normalized_shape": 2084 }, "norm_cls": "torch.nn.LayerNorm", "num_attention_heads": 32, "num_hidden_layers": 32, "output_proj_args": {}, "output_proj_cls": "torch.nn.Linear", "pad_index": null, "positional_encoder_args": { "d_embed": 2048, "gain": 0.3333, "max_seq": 16384 }, "positional_encoder_cls": ".RSWalshPositionalEncoder", "torch_dtype": "bfloat16", "transformer_args": {}, "transformer_cls": ".Transformer", "transformers_version": "4.37.2", "vocab_size": 32000 }