Upload SegformerForSemanticSegmentation
- config.json +3 -56
- model.py +63 -3
config.json
CHANGED
```diff
@@ -2,27 +2,10 @@
   "architectures": [
     "SegformerForSemanticSegmentation"
   ],
-  "attention_probs_dropout_prob": 0.0,
   "auto_map": {
+    "AutoConfig": "model.FaceSegmenterConfig",
     "AutoModelForImageSegmentation": "model.SegformerForSemanticSegmentation"
   },
-  "classifier_dropout_prob": 0.1,
-  "decoder_hidden_size": 256,
-  "depths": [
-    2,
-    2,
-    2,
-    2
-  ],
-  "drop_path_rate": 0.1,
-  "hidden_act": "gelu",
-  "hidden_dropout_prob": 0.0,
-  "hidden_sizes": [
-    32,
-    64,
-    160,
-    256
-  ],
   "id2label": {
     "0": "skin",
     "1": "l_brow",
@@ -43,7 +26,6 @@
     "16": "hair",
     "17": "hat"
   },
-  "initializer_range": 0.02,
   "label2id": {
     "cloth": 15,
     "ear_r": 8,
@@ -64,43 +46,8 @@
     "skin": 0,
     "u_lip": 11
   },
-  "
-  "mlp_ratios": [
-    4,
-    4,
-    4,
-    4
-  ],
-  "model_type": "segformer",
+  "model_type": "image-segmentation",
-  "num_attention_heads": [
-    1,
-    2,
-    5,
-    8
-  ],
-  "num_channels": 3,
   "num_classes": 18,
-  "num_encoder_blocks": 4,
-  "patch_sizes": [
-    7,
-    3,
-    3,
-    3
-  ],
-  "reshape_last_stage": true,
-  "semantic_loss_ignore_index": 255,
-  "sr_ratios": [
-    8,
-    4,
-    2,
-    1
-  ],
-  "strides": [
-    4,
-    2,
-    2,
-    2
-  ],
   "torch_dtype": "float32",
-  "transformers_version": "4.
+  "transformers_version": "4.37.0"
 }
```
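With the `auto_map` entries above, the `AutoConfig` and `AutoModelForImageSegmentation` factories can resolve the custom classes defined in the repo's `model.py` at load time. A minimal loading sketch (the repo id `user/face-segmenter` is a placeholder, not taken from this commit; `trust_remote_code=True` is required because the classes live in the repo rather than in `transformers` itself):

```python
# Minimal sketch of loading via auto_map; "user/face-segmenter" is a
# placeholder repo id, not part of this commit.
from transformers import AutoConfig, AutoModelForImageSegmentation

# trust_remote_code=True lets transformers import model.FaceSegmenterConfig
# and model.SegformerForSemanticSegmentation from the repo's model.py.
config = AutoConfig.from_pretrained("user/face-segmenter", trust_remote_code=True)
model = AutoModelForImageSegmentation.from_pretrained(
    "user/face-segmenter", trust_remote_code=True
)
print(config.id2label[16])  # "hair"
```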
model.py
CHANGED
```diff
@@ -4,6 +4,65 @@ from torch import nn
 from transformers.modeling_outputs import SemanticSegmenterOutput
 
 
+class FaceSegmenterConfig(transformers.PretrainedConfig):
+    model_type = "image-segmentation"
+
+    _id2label = {
+        0: "skin",
+        1: "l_brow",
+        2: "r_brow",
+        3: "l_eye",
+        4: "r_eye",
+        5: "eye_g",
+        6: "l_ear",
+        7: "r_ear",
+        8: "ear_r",
+        9: "nose",
+        10: "mouth",
+        11: "u_lip",
+        12: "l_lip",
+        13: "neck",
+        14: "neck_l",
+        15: "cloth",
+        16: "hair",
+        17: "hat",
+    }
+
+    _label2id = {
+        "skin": 0,
+        "l_brow": 1,
+        "r_brow": 2,
+        "l_eye": 3,
+        "r_eye": 4,
+        "eye_g": 5,
+        "l_ear": 6,
+        "r_ear": 7,
+        "ear_r": 8,
+        "nose": 9,
+        "mouth": 10,
+        "u_lip": 11,
+        "l_lip": 12,
+        "neck": 13,
+        "neck_l": 14,
+        "cloth": 15,
+        "hair": 16,
+        "hat": 17,
+    }
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.id2label = kwargs.get("id2label", self._id2label)
+
+        # For some reason these get converted to strings when used in pipelines.
+        id_keys = list(self.id2label.keys())
+        for label_id in id_keys:
+            label_value = self.id2label.pop(label_id)
+            self.id2label[int(label_id)] = label_value
+
+        self.label2id = kwargs.get("label2id", self._label2id)
+        self.num_classes = kwargs.get("num_classes", len(self.id2label))
+
+
 def encode_down(c_in: int, c_out: int):
     return nn.Sequential(
         nn.Conv2d(in_channels=c_in, out_channels=c_out, kernel_size=3, padding=1),
@@ -28,7 +87,7 @@ class FaceUNet(nn.Module):
     def __init__(self, num_classes: int):
         super().__init__()
         self.num_classes = num_classes
-
+        # unet
         self.down_1 = nn.Conv2d(
             in_channels=3,
             out_channels=64,
@@ -42,6 +101,7 @@ class FaceUNet(nn.Module):
 
         self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
 
+        # Below, `in_channels` again becomes 1024 as we are concatenating.
         self.up_1 = decode_up(1024)
         self.up_c1 = encode_down(1024, 512)
         self.up_2 = decode_up(512)
@@ -83,7 +143,7 @@ class FaceUNet(nn.Module):
 
 
 class Segformer(transformers.PreTrainedModel):
-    config_class =
+    config_class = FaceSegmenterConfig
 
     def __init__(self, config):
         super().__init__(config)
@@ -95,7 +155,7 @@ class Segformer(transformers.PreTrainedModel):
 
 
 class SegformerForSemanticSegmentation(transformers.PreTrainedModel):
-    config_class =
+    config_class = FaceSegmenterConfig
 
     def __init__(self, config):
         super().__init__(config)
```
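The int-cast loop in `FaceSegmenterConfig.__init__` compensates for JSON serialization: JSON object keys are always strings, so an `id2label` that round-trips through `config.json` (as happens inside pipelines) comes back keyed by `"0"`, `"1"`, … rather than `0`, `1`, …. A self-contained sketch of that failure mode and the same fix:

```python
# Standalone illustration of why FaceSegmenterConfig re-casts id2label keys:
# JSON object keys are always strings, so int keys don't survive a round trip.
import json

id2label = {0: "skin", 1: "l_brow"}
restored = json.loads(json.dumps(id2label))
print(restored)       # {'0': 'skin', '1': 'l_brow'} -- keys are now str
print(0 in restored)  # False: lookups by int class id would fail

# The same fix as in the config class: cast keys back to int.
restored = {int(k): v for k, v in restored.items()}
print(restored[0])    # 'skin'
```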