JaesungHuh committed · Commit 931ef66 · Parent(s): ac6a529

change to from_pretrained
Browse files
- __pycache__/model.cpython-38.pyc +0 -0
- app.py +3 -3
- model.py +37 -43
- requirements.txt +2 -1
__pycache__/model.cpython-38.pyc CHANGED
Binary files a/__pycache__/model.cpython-38.pyc and b/__pycache__/model.cpython-38.pyc differ
app.py CHANGED
@@ -1,9 +1,9 @@
 import gradio as gr
 import torch
 from model import ECAPA_gender
-
-model = ECAPA_gender(
-model.load_state_dict(torch.load("gender_classifier.model", map_location="cpu"))
+# Load the model
+model = ECAPA_gender.from_pretrained("JaesungHuh/ecapa-gender")
+# model.load_state_dict(torch.load("gender_classifier.model", map_location="cpu"))
 
 model.eval()
 
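The app.py change above swaps manual checkpoint loading for ECAPA_gender.from_pretrained, which model.py enables below by mixing PyTorchModelHubMixin into the model class. For context, a minimal sketch of that mixin pattern — the class name, hidden size, and repo id here are illustrative, not part of this commit:

import torch
import torch.nn as nn
from huggingface_hub import PyTorchModelHubMixin

# Inheriting the mixin adds save_pretrained / from_pretrained / push_to_hub.
class TinyNet(nn.Module, PyTorchModelHubMixin):
    def __init__(self, hidden: int = 16):
        super().__init__()
        self.fc = nn.Linear(hidden, 2)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.fc(x)

# In recent huggingface_hub versions, JSON-serializable __init__ kwargs are
# saved to config.json next to the weights, so a single call restores both
# the hyperparameters and the state dict:
# model = TinyNet.from_pretrained("some-user/tiny-net")  # hypothetical repo id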
model.py CHANGED
@@ -1,14 +1,18 @@
+import math
+from typing import Optional
+
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 
 import torchaudio
 from torchaudio.functional import resample
-
+
+from huggingface_hub import PyTorchModelHubMixin
 
 
 class SEModule(nn.Module):
-    def __init__(self, channels, bottleneck=128):
+    def __init__(self, channels : int , bottleneck : int = 128) -> None:
         super(SEModule, self).__init__()
         self.se = nn.Sequential(
             nn.AdaptiveAvgPool1d(1),
@@ -19,13 +23,13 @@ class SEModule(nn.Module):
             nn.Sigmoid(),
         )
 
-    def forward(self, input):
+    def forward(self, input : torch.Tensor) -> torch.Tensor:
         x = self.se(input)
         return input * x
 
-class Bottle2neck(nn.Module):
 
-    def __init__(self, inplanes, planes, kernel_size=None, dilation=None, scale=8):
+class Bottle2neck(nn.Module):
+    def __init__(self, inplanes : int, planes : int, kernel_size : Optional[int] = None, dilation : Optional[int] = None, scale : int = 8) -> None:
         super(Bottle2neck, self).__init__()
         width = int(math.floor(planes / scale))
         self.conv1 = nn.Conv1d(inplanes, width*scale, kernel_size=1)
@@ -45,7 +49,7 @@ class Bottle2neck(nn.Module):
         self.width = width
         self.se = SEModule(planes)
 
-    def forward(self, x):
+    def forward(self, x : torch.Tensor) -> torch.Tensor:
         residual = x
         out = self.conv1(x)
         out = self.relu(out)
@@ -73,34 +77,12 @@ class Bottle2neck(nn.Module):
         out = self.se(out)
         out += residual
         return out
+
 
-class PreEmphasis(torch.nn.Module):
-
-    def __init__(self, coef: float = 0.97):
-        super().__init__()
-        self.coef = coef
-        self.register_buffer(
-            'flipped_filter', torch.FloatTensor([-self.coef, 1.]).unsqueeze(0).unsqueeze(0)
-        )
-
-    def forward(self, input: torch.tensor) -> torch.tensor:
-        input = input.unsqueeze(1)
-        input = F.pad(input, (1, 0), 'reflect')
-        return F.conv1d(input, self.flipped_filter).squeeze(1)
-
-
-class ECAPA_gender(nn.Module):
-    def __init__(self, config):
+class ECAPA_gender(nn.Module, PyTorchModelHubMixin):
+    def __init__(self, C : int = 1024):
         super(ECAPA_gender, self).__init__()
-        self.
-        C = config["C"]
-
-        self.torchfbank = torch.nn.Sequential(
-            PreEmphasis(),
-            torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_fft=512, win_length=400, hop_length=160, \
-                                                 f_min = 20, f_max = 7600, window_fn=torch.hamming_window, n_mels=80),
-        )
-
+        self.C = C
         self.conv1 = nn.Conv1d(80, C, kernel_size=5, stride=1, padding=2)
         self.relu = nn.ReLU()
         self.bn1 = nn.BatchNorm1d(C)
@@ -121,13 +103,26 @@ class ECAPA_gender(nn.Module):
         self.fc6 = nn.Linear(3072, 192)
         self.bn6 = nn.BatchNorm1d(192)
         self.fc7 = nn.Linear(192, 2)
-        self.pred2gender = {0 : 'male', 1 : 'female'}
-
-    def forward(self, x):
-        with torch.no_grad():
-            x = self.torchfbank(x) + 1e-6
-            x = x.log()
-            x = x - torch.mean(x, dim=-1, keepdim=True)
+        self.pred2gender = {0 : 'male', 1 : 'female'}
+
+    def logtorchfbank(self, x : torch.Tensor) -> torch.Tensor:
+        # Preemphasis
+        flipped_filter = torch.FloatTensor([-0.97, 1.]).unsqueeze(0).unsqueeze(0)
+        x = x.unsqueeze(1)
+        x = F.pad(x, (1, 0), 'reflect')
+        x = F.conv1d(x, flipped_filter).squeeze(1)
+
+        # Melspectrogram
+        x = torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_fft=512, win_length=400, hop_length=160, \
+                                                 f_min = 20, f_max = 7600, window_fn=torch.hamming_window, n_mels=80)(x) + 1e-6
+
+        # Log and normalize
+        x = x.log()
+        x = x - torch.mean(x, dim=-1, keepdim=True)
+        return x
+
+    def forward(self, x : torch.Tensor) -> torch.Tensor:
+        x = self.logtorchfbank(x)
 
         x = self.conv1(x)
         x = self.relu(x)
@@ -158,17 +153,16 @@ class ECAPA_gender(nn.Module):
 
         return x
 
-    def load_audio(self, path):
+    def load_audio(self, path : str) -> torch.Tensor:
         audio, sr = torchaudio.load(path)
         if sr != 16000:
             audio = resample(audio, sr, 16000)
         return audio
 
-    def predict(self, audio):
+    def predict(self, audio : torch.Tensor) -> torch.Tensor:
         audio = self.load_audio(audio)
         self.eval()
         with torch.no_grad():
             output = self.forward(audio)
         _, pred = output.max(1)
-        return self.pred2gender[pred.item()]
-
+        return self.pred2gender[pred.item()]
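Two things stand out in the model.py rewrite: the standalone PreEmphasis module and the buffered torchfbank Sequential are folded into logtorchfbank, which now builds the filter and the MelSpectrogram transform on every call; and the pre-emphasis itself is the textbook filter y[t] = x[t] - 0.97 * x[t-1], implemented as a 1-D convolution with the flipped kernel [-0.97, 1]. A self-contained sketch (mine, not the repo's) checking that equivalence:

import torch
import torch.nn.functional as F

x = torch.randn(1, 16000)  # (batch, samples), as predict() feeds forward()
flipped_filter = torch.FloatTensor([-0.97, 1.]).unsqueeze(0).unsqueeze(0)

# conv1d on the left-padded signal computes y[t] = -0.97 * padded[t] + padded[t + 1]
padded = F.pad(x.unsqueeze(1), (1, 0), 'reflect')
y = F.conv1d(padded, flipped_filter).squeeze(1)

# Direct form of the same filter; reflect padding means the first output
# sample uses x[1] as its "previous" value.
y_ref = torch.empty_like(x)
y_ref[:, 1:] = x[:, 1:] - 0.97 * x[:, :-1]
y_ref[:, 0] = x[:, 0] - 0.97 * x[:, 1]
assert torch.allclose(y, y_ref, atol=1e-6)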
requirements.txt CHANGED
@@ -1,2 +1,3 @@
 torch
-torchaudio
+torchaudio
+pysoundfile
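pysoundfile is presumably added so that torchaudio.load in load_audio() has an audio backend available in the Space. Putting the three files together, a hypothetical smoke test of the updated pipeline (the wav filename is illustrative; note that despite its torch.Tensor annotation, predict() passes its argument to load_audio(), which expects a file path):

from model import ECAPA_gender

model = ECAPA_gender.from_pretrained("JaesungHuh/ecapa-gender")
model.eval()

# predict() loads the file, resamples to 16 kHz if needed, and maps the
# argmax over the two logits through pred2gender.
print(model.predict("sample.wav"))  # -> 'male' or 'female'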