Marcos12886 committed
Commit 206b5fc • 1 Parent(s): f72ba5a
Use label2id less
model.py
CHANGED
@@ -14,17 +14,15 @@ from transformers import (
     EarlyStoppingCallback
 )
 
-MODEL = "ntu-spml/distilhubert" # base model
-FEATURE_EXTRACTOR = Wav2Vec2FeatureExtractor.from_pretrained(MODEL)
+MODEL = "ntu-spml/distilhubert" # base model
+FEATURE_EXTRACTOR = Wav2Vec2FeatureExtractor.from_pretrained(MODEL) # feature extractor of the base model
 seed = 123
-MAX_DURATION = 1.00
-SAMPLING_RATE = FEATURE_EXTRACTOR.sampling_rate #
+MAX_DURATION = 1.00 # maximum duration of the audio clips
+SAMPLING_RATE = FEATURE_EXTRACTOR.sampling_rate # 16kHz
 token = os.getenv("HF_TOKEN") # TODO: try saving the token in a local file
 config_file = "models_config.json"
-
-
-batch_size = 16
-num_workers = 12
+batch_size = 1024 # TODO: check whether this is still necessary
+num_workers = 12 # CPU cores
 
 class AudioDataset(Dataset):
     def __init__(self, dataset_path, label2id, filter_white_noise):
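For reference, a minimal sketch (not part of the commit) of how these module-level constants are typically combined when preprocessing a clip with the feature extractor; the audio array below is a placeholder:

import numpy as np

# Hypothetical input: 1 s of silence at the extractor's sampling rate.
audio = np.zeros(int(SAMPLING_RATE * MAX_DURATION), dtype=np.float32)
inputs = FEATURE_EXTRACTOR(
    audio,
    sampling_rate=SAMPLING_RATE,
    max_length=int(SAMPLING_RATE * MAX_DURATION),  # truncate/pad to MAX_DURATION seconds
    truncation=True,
    padding="max_length",
    return_tensors="pt",
)
# inputs.input_values has shape (1, SAMPLING_RATE * MAX_DURATION)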
@@ -120,13 +118,12 @@ def create_dataloader(dataset_path, filter_white_noise, test_size=0.2, shuffle=T
     test_dataloader = DataLoader(
         test_dataset, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers, pin_memory=pin_memory
     )
-    return train_dataloader, test_dataloader,
+    return train_dataloader, test_dataloader, id2label
 
-def load_model(model_path, label2id, id2label, num_labels):
+def load_model(model_path, id2label, num_labels):
     config = HubertConfig.from_pretrained(
         pretrained_model_name_or_path=model_path,
         num_labels=num_labels,
-        label2id=label2id,
         id2label=id2label,
         finetuning_task="audio-classification"
     )
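For context, the new signature only takes the id2label mapping. A minimal sketch (the label names below are hypothetical, not from the commit) of building that mapping and calling the updated helper:

labels = ["class_a", "class_b"]  # placeholder class names
id2label = {i: lab for i, lab in enumerate(labels)}
model = load_model(MODEL, id2label, num_labels=len(id2label))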
@@ -140,13 +137,13 @@ def load_model(model_path, label2id, id2label, num_labels):
     return model
 
 def train_params(dataset_path, filter_white_noise):
-    train_dataloader, test_dataloader,
-    model = load_model(MODEL,
+    train_dataloader, test_dataloader, id2label = create_dataloader(dataset_path, filter_white_noise)
+    model = load_model(MODEL, id2label, num_labels=len(id2label))
     return model, train_dataloader, test_dataloader, id2label
 
 def predict_params(dataset_path, model_path, filter_white_noise):
-    _, _,
-    model = load_model(model_path,
+    _, _, id2label = create_dataloader(dataset_path, filter_white_noise)
+    model = load_model(model_path, id2label, num_labels=len(id2label))
     return model, id2label
 
 def compute_metrics(eval_pred):
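A hedged usage sketch of the two updated helpers; the dataset path and checkpoint id are placeholders, not values from the commit:

# Hypothetical paths; replace with the real dataset folder and Hub checkpoint.
model, train_dl, test_dl, id2label = train_params("data/audios", filter_white_noise=True)
model, id2label = predict_params("data/audios", "user/model-checkpoint", filter_white_noise=True)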
@@ -173,11 +170,11 @@ def main(training_args, output_dir, dataset_path, filter_white_noise):
         callbacks=[TensorBoardCallback(), EarlyStoppingCallback(early_stopping_patience=3)]
     )
     torch.cuda.empty_cache() # free GPU memory
-    trainer.train() #
+    trainer.train() # resume_from_checkpoint to resume training
     # trainer.save_model(output_dir) # Save the model locally.
-    os.makedirs(output_dir, exist_ok=True)
+    os.makedirs(output_dir, exist_ok=True)
     trainer.push_to_hub(token=token) # upload the model to the profile
-    upload_folder(repo_id=
+    upload_folder(repo_id=output_dir, folder_path=output_dir, token=token) # upload to the organization and locally
 
 def load_config(model_name):
     with open(config_file, 'r') as f:
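As a reference for the last hunk, a minimal sketch of pushing the trained folder with huggingface_hub.upload_folder; the commit passes output_dir as repo_id, while the repo id below is a placeholder and assumes the target repo already exists:

from huggingface_hub import upload_folder

upload_folder(
    repo_id="my-org/my-model",  # hypothetical target repo on the Hub
    folder_path=output_dir,     # local folder written by the Trainer
    token=token,                # HF_TOKEN read from the environment in model.py
)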