louislu9911 committed
Commit 8b497b0 • 1 Parent(s): 3c02200

End of training

Browse files:
- README.md +21 -25
- config.json +1 -1
- configuration_moe.py +3 -3
- model.safetensors +2 -2
- modeling_moe.py +19 -34
- training_args.bin +1 -1
README.md CHANGED

@@ -17,8 +17,8 @@ should probably proofread and complete it, then remove this comment. -->
 
 This model is a fine-tuned version of [](https://huggingface.co/) on the imagefolder dataset.
 It achieves the following results on the evaluation set:
-- Loss: 0.
-- Accuracy: 0.
+- Loss: 0.3758
+- Accuracy: 0.8762
 
 ## Model description
 
@@ -38,11 +38,11 @@ More information needed
 
 The following hyperparameters were used during training:
 - learning_rate: 5e-05
-- train_batch_size:
-- eval_batch_size:
+- train_batch_size: 2000
+- eval_batch_size: 2000
 - seed: 42
 - gradient_accumulation_steps: 4
-- total_train_batch_size:
+- total_train_batch_size: 8000
 - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
 - lr_scheduler_type: linear
 - lr_scheduler_warmup_ratio: 0.1
@@ -52,26 +52,22 @@ The following hyperparameters were used during training:
 
 | Training Loss | Epoch | Step | Validation Loss | Accuracy |
 |:-------------:|:-----:|:----:|:---------------:|:--------:|
-[16 earlier rows; values truncated in the source]
-| 0.2767 | 16.99 | 412 | 0.4369 | 0.8678 |
-| 0.2692 | 17.98 | 436 | 0.4292 | 0.8678 |
-| 0.2543 | 18.97 | 460 | 0.4328 | 0.8659 |
-| 0.2743 | 19.79 | 480 | 0.4329 | 0.8668 |
+| No log | 0.8 | 2 | 0.4078 | 0.8724 |
+| No log | 2.0 | 5 | 0.3771 | 0.8776 |
+| No log | 2.8 | 7 | 0.3776 | 0.8762 |
+| 0.3197 | 4.0 | 10 | 0.3773 | 0.8762 |
+| 0.3197 | 4.8 | 12 | 0.3768 | 0.8762 |
+| 0.3197 | 6.0 | 15 | 0.3762 | 0.8762 |
+| 0.3197 | 6.8 | 17 | 0.3760 | 0.8762 |
+| 0.2995 | 8.0 | 20 | 0.3759 | 0.8762 |
+| 0.2995 | 8.8 | 22 | 0.3759 | 0.8762 |
+| 0.2995 | 10.0 | 25 | 0.3758 | 0.8762 |
+| 0.2995 | 10.8 | 27 | 0.3758 | 0.8762 |
+| 0.2996 | 12.0 | 30 | 0.3758 | 0.8762 |
+| 0.2996 | 12.8 | 32 | 0.3758 | 0.8762 |
+| 0.2996 | 14.0 | 35 | 0.3758 | 0.8762 |
+| 0.2996 | 14.8 | 37 | 0.3758 | 0.8762 |
+| 0.3024 | 16.0 | 40 | 0.3758 | 0.8762 |
 
 
 ### Framework versions
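The batch-size changes above are self-consistent: with one device, total_train_batch_size = train_batch_size × gradient_accumulation_steps = 2000 × 4 = 8000. A minimal sketch of the equivalent `transformers.TrainingArguments` setup follows; `output_dir` is hypothetical, and the README's flat names map onto the `per_device_*` arguments:

```python
# Sketch only: maps the README's flat hyperparameter names onto
# transformers.TrainingArguments. output_dir is hypothetical; the rest are
# the values shown in the diff above.
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="moe-leaf-disease",      # hypothetical
    learning_rate=5e-5,
    per_device_train_batch_size=2000,   # README: train_batch_size
    per_device_eval_batch_size=2000,    # README: eval_batch_size
    gradient_accumulation_steps=4,
    seed=42,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,                   # README: lr_scheduler_warmup_ratio
)

# total_train_batch_size = per-device batch * accumulation steps * n_devices
assert 2000 * 4 * 1 == 8000
```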
config.json CHANGED

@@ -6,7 +6,7 @@
     "AutoConfig": "configuration_moe.MoEConfig",
     "AutoModelForImageClassification": "modeling_moe.MoEModelForImageClassification"
   },
-  "
+  "baseline_model": "louislu9911/BaseModel-leaf-disease-convnextv2-base-1k-224-0_1_2_3_4",
   "expert_class_mapping": {
     "0": [
       0,
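Since the `auto_map` entries route `AutoConfig` to the custom `configuration_moe.MoEConfig`, the new `baseline_model` key becomes a config attribute once the repo is loaded with remote code enabled. A minimal sketch, with a placeholder repo id since this commit's repo name is not shown here:

```python
# Sketch: loading this config through the auto_map above.
from transformers import AutoConfig

config = AutoConfig.from_pretrained(
    "louislu9911/<this-moe-repo>",  # hypothetical repo id
    trust_remote_code=True,         # required: MoEConfig is custom code
)
print(config.baseline_model)
# -> louislu9911/BaseModel-leaf-disease-convnextv2-base-1k-224-0_1_2_3_4
```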
configuration_moe.py CHANGED

@@ -13,7 +13,7 @@ EXPERTS = [
 SWITCH_GATE = (
     f"{DEFAULT_HUGGINGFACE_ACCOUNT}/switch_gate-leaf-disease-{model_checkpoint}"
 )
-BASE_MODEL = (
+BASELINE_MODEL = (
     f"{DEFAULT_HUGGINGFACE_ACCOUNT}/BaseModel-leaf-disease-{model_checkpoint}-0_1_2_3_4"
 )
 
@@ -25,14 +25,14 @@ class MoEConfig(PretrainedConfig):
         self,
         experts: List[str] = EXPERTS,
         switch_gate: str = SWITCH_GATE,
-        base_model: str = BASE_MODEL,
+        baseline_model: str = BASELINE_MODEL,
         num_classes: int = 5,
         expert_class_mapping: Dict[int, List[int]] = None,
         **kwargs,
     ):
         self.experts = experts
         self.switch_gate = switch_gate
-        self.base_model = base_model
+        self.baseline_model = baseline_model
         self.num_classes = num_classes
         self.expert_class_mapping = expert_class_mapping
         super().__init__(**kwargs)
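A minimal construction sketch for the updated config; the `expert_class_mapping` below is an illustrative split of the five classes across the two experts, not the repo's actual mapping, and it assumes `configuration_moe.py` is importable locally:

```python
# Sketch: constructing the config directly. expert_class_mapping is a made-up
# example split of the 5 classes across the two experts.
from configuration_moe import MoEConfig

config = MoEConfig(
    num_classes=5,
    expert_class_mapping={0: [0, 1, 2], 1: [3, 4]},  # hypothetical
)
print(config.baseline_model)  # falls back to the BASELINE_MODEL default
config.save_pretrained("./moe-config")  # writes a config.json like the diff above
```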
model.safetensors CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:d274c9e2d535934cc02d89e5d0c7f84c2988a384bb531a7af68804378247063b
+size 2022035716
modeling_moe.py CHANGED

@@ -5,14 +5,14 @@ from transformers import PreTrainedModel, AutoModelForImageClassification
 from .configuration_moe import MoEConfig
 
 
-def subgate(
+def subgate(num_out):
     layers = nn.Sequential(
         nn.Flatten(),
         nn.Linear(224 * 224 * 3, 1024),
         nn.ReLU(),
         nn.Linear(1024, 512),
         nn.ReLU(),
-        nn.Linear(512,
+        nn.Linear(512, num_out),
     )
     return layers
 
@@ -26,11 +26,8 @@ class MoEModelForImageClassification(PreTrainedModel):
         self.switch_gate_model = AutoModelForImageClassification.from_pretrained(
             config.switch_gate
         )
-        self.base_model1 = AutoModelForImageClassification.from_pretrained(
-            config.base_model
-        )
-        self.base_model2 = AutoModelForImageClassification.from_pretrained(
-            config.base_model
+        self.baseline_model = AutoModelForImageClassification.from_pretrained(
+            config.baseline_model
         )
         self.expert_model_1 = AutoModelForImageClassification.from_pretrained(
             config.experts[0]
@@ -39,13 +36,12 @@ class MoEModelForImageClassification(PreTrainedModel):
             config.experts[1]
         )
 
-        self.subgate1 = subgate(config.num_classes)
-        self.subgate2 = subgate(config.num_classes)
+        self.subgate = subgate(2)
 
         # Freeze all params
         for module in [
-            self.
-            self.
+            self.switch_gate_model,
+            self.baseline_model,
             self.expert_model_1,
             self.expert_model_2,
         ]:
@@ -54,36 +50,25 @@
 
     def forward(self, pixel_values, labels=None):
         switch_gate_result = self.switch_gate_model(pixel_values).logits
-        base_model1_result = self.base_model1(pixel_values).logits
-        base_model2_result = self.base_model2(pixel_values).logits
-
         expert1_result = self.expert_model_1(pixel_values).logits
         expert2_result = self.expert_model_2(pixel_values).logits
 
-
-
+        # Gating Network
+        experts_result = torch.stack(
+            [expert1_result, expert2_result], dim=1
+        ) * switch_gate_result.unsqueeze(-1)
 
-
-
+        experts_result = experts_result.sum(dim=1)
+        baseline_model_result = self.baseline_model(pixel_values).logits
 
-        expert1_and_base_res = (
-            expert1_result * subgate1_result[0, :, :]
-            + base_model1_result * subgate1_result[1, :, :]
-        )
-        expert2_and_base_res = (
-            expert2_result * subgate2_result[0, :, :]
-            + base_model2_result * subgate2_result[1, :, :]
-        )
+        subgate_result = self.subgate(pixel_values)
+        subgate_prob = F.softmax(subgate_result, dim=-1)
 
-
-        expert1_and_base_res = expert1_and_base_res * switch_gate_result[
-            :, 0
-        ].unsqueeze(1)
-        expert2_and_base_res = expert2_and_base_res * switch_gate_result[
-            :, 1
-        ].unsqueeze(1)
+        experts_and_base_result = torch.stack(
+            [experts_result, baseline_model_result], dim=1
+        ) * subgate_prob.unsqueeze(-1)
 
-        logits =
+        logits = experts_and_base_result.sum(dim=1)
         if labels is not None:
             loss = F.cross_entropy(logits, labels)
             return {"loss": loss, "logits": logits}
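The rewritten `forward` collapses the old per-expert blending into two stacked weighted sums: the switch gate's logits weight the experts, and the subgate's softmax blends that mixture with the frozen baseline. Below is a shape-level sketch on dummy tensors (batch 4, 5 classes, 2 experts); as in the code above, the switch-gate logits are applied without a softmax:

```python
# Standalone sketch of the two-level gating in forward(), dummy tensors only.
import torch
import torch.nn.functional as F

batch, num_classes = 4, 5
expert1_result = torch.randn(batch, num_classes)
expert2_result = torch.randn(batch, num_classes)
baseline_model_result = torch.randn(batch, num_classes)
switch_gate_result = torch.randn(batch, 2)         # raw gate logits, no softmax
subgate_prob = F.softmax(torch.randn(batch, 2), dim=-1)

# Level 1: (batch, 2, classes) * (batch, 2, 1), summed over the expert axis.
experts_result = (
    torch.stack([expert1_result, expert2_result], dim=1)
    * switch_gate_result.unsqueeze(-1)
).sum(dim=1)

# Level 2: blend the expert mixture with the baseline using subgate weights.
logits = (
    torch.stack([experts_result, baseline_model_result], dim=1)
    * subgate_prob.unsqueeze(-1)
).sum(dim=1)

assert logits.shape == (batch, num_classes)
```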
training_args.bin CHANGED

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:f0b7c5c3611717751700245126f1467e9b2d7ee3ebb544c05ba4806732954b7e
 size 4984
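`training_args.bin` is the `TrainingArguments` object that `Trainer` saves with `torch.save`, so its hash changes whenever the hyperparameters do. A minimal inspection sketch (on torch >= 2.6, `weights_only` must be disabled explicitly to unpickle it):

```python
# Sketch: inspecting the pickled TrainingArguments from this commit.
import torch

args = torch.load("training_args.bin", weights_only=False)
print(args.per_device_train_batch_size)  # 2000
print(args.gradient_accumulation_steps)  # 4
```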