Marcos12886 committed
Commit
206b5fc
1 Parent(s): f72ba5a

Use label2id less

Files changed (1)
  1. model.py +15 -18
model.py CHANGED
@@ -14,17 +14,15 @@ from transformers import (
     EarlyStoppingCallback
 )
 
-MODEL = "ntu-spml/distilhubert" # base model used; to use a different one, just change this
-FEATURE_EXTRACTOR = Wav2Vec2FeatureExtractor.from_pretrained(MODEL)
+MODEL = "ntu-spml/distilhubert" # base model
+FEATURE_EXTRACTOR = Wav2Vec2FeatureExtractor.from_pretrained(MODEL) # feature extractor of the base model
 seed = 123
-MAX_DURATION = 1.00
-SAMPLING_RATE = FEATURE_EXTRACTOR.sampling_rate # 16000
+MAX_DURATION = 1.00 # maximum duration of the audio clips
+SAMPLING_RATE = FEATURE_EXTRACTOR.sampling_rate # 16 kHz
 token = os.getenv("HF_TOKEN") # TODO: try storing the token in a local file
 config_file = "models_config.json"
-clasificador = "class"
-monitor = "mon"
-batch_size = 16
-num_workers = 12
+batch_size = 1024 # TODO: check whether this is still needed
+num_workers = 12 # CPU cores
 
 class AudioDataset(Dataset):
     def __init__(self, dataset_path, label2id, filter_white_noise):
@@ -120,13 +118,12 @@ def create_dataloader(dataset_path, filter_white_noise, test_size=0.2, shuffle=T
     test_dataloader = DataLoader(
         test_dataset, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers, pin_memory=pin_memory
     )
-    return train_dataloader, test_dataloader, label2id, id2label
+    return train_dataloader, test_dataloader, id2label
 
-def load_model(model_path, label2id, id2label, num_labels):
+def load_model(model_path, id2label, num_labels):
     config = HubertConfig.from_pretrained(
         pretrained_model_name_or_path=model_path,
         num_labels=num_labels,
-        label2id=label2id,
         id2label=id2label,
         finetuning_task="audio-classification"
     )
@@ -140,13 +137,13 @@ def load_model(model_path, label2id, id2label, num_labels):
     return model
 
 def train_params(dataset_path, filter_white_noise):
-    train_dataloader, test_dataloader, label2id, id2label = create_dataloader(dataset_path, filter_white_noise)
-    model = load_model(MODEL, label2id, id2label, num_labels=len(id2label))
+    train_dataloader, test_dataloader, id2label = create_dataloader(dataset_path, filter_white_noise)
+    model = load_model(MODEL, id2label, num_labels=len(id2label))
     return model, train_dataloader, test_dataloader, id2label
 
 def predict_params(dataset_path, model_path, filter_white_noise):
-    _, _, label2id, id2label = create_dataloader(dataset_path, filter_white_noise)
-    model = load_model(model_path, label2id, id2label, num_labels=len(id2label))
+    _, _, id2label = create_dataloader(dataset_path, filter_white_noise)
+    model = load_model(model_path, id2label, num_labels=len(id2label))
     return model, id2label
 
 def compute_metrics(eval_pred):
@@ -173,11 +170,11 @@ def main(training_args, output_dir, dataset_path, filter_white_noise):
         callbacks=[TensorBoardCallback(), EarlyStoppingCallback(early_stopping_patience=3)]
     )
     torch.cuda.empty_cache() # free GPU memory
-    trainer.train() # parameters can be tweaked to resume training
+    trainer.train() # resume_from_checkpoint to resume training
     # trainer.save_model(output_dir) # Save the model locally.
-    os.makedirs(output_dir, exist_ok=True) # Create the folder
+    os.makedirs(output_dir, exist_ok=True)
     trainer.push_to_hub(token=token) # Upload the model to the profile
-    upload_folder(repo_id=f"A-POR-LOS-8000/{output_dir}", folder_path=output_dir, token=token) # upload to the organization and locally
+    upload_folder(repo_id=output_dir, folder_path=output_dir, token=token) # upload to the organization and locally
 
 def load_config(model_name):
     with open(config_file, 'r') as f:
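
The net effect of the commit is that only id2label flows through create_dataloader, load_model and predict_params; the reverse mapping is no longer threaded through every call. A minimal sketch of that pattern follows, with hypothetical helper names that are not part of the commit, in case a caller still needs label2id or wants to turn logits into class names:

import torch

def invert_id2label(id2label):
    # label2id is just the inverse of id2label, so it can be rebuilt on demand
    return {label: idx for idx, label in id2label.items()}

def name_prediction(logits, id2label):
    # logits: 1-D tensor with one score per class; pick the arg-max class
    class_id = int(torch.argmax(logits, dim=-1))
    return id2label[class_id]

id2label = {0: "class_a", 1: "class_b"}       # made-up labels for illustration
label2id = invert_id2label(id2label)          # {"class_a": 0, "class_b": 1}
print(name_prediction(torch.tensor([0.2, 1.3]), id2label))  # class_b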
 
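
For context, the diff elides how the config is turned into a model (the load_model body between the second and third hunks). A sketch of what a load_model along these lines could look like after the change, assuming the checkpoint is loaded with HubertForSequenceClassification (that class name is an assumption; the diff only shows the config construction):

from transformers import HubertConfig, HubertForSequenceClassification

def load_model_sketch(model_path, id2label, num_labels):
    # Mirror the commit: build the config from id2label alone, no label2id argument
    config = HubertConfig.from_pretrained(
        pretrained_model_name_or_path=model_path,
        num_labels=num_labels,
        id2label=id2label,
        finetuning_task="audio-classification",
    )
    # Assumption: a Hubert classification head is attached to the checkpoint
    return HubertForSequenceClassification.from_pretrained(model_path, config=config)

# model = load_model_sketch("ntu-spml/distilhubert", {0: "class_a", 1: "class_b"}, num_labels=2)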