Marcos12886 committed on
Commit d99bc5f
1 Parent(s): 52733a5

Update model.py

Files changed (1): model.py +111 -56
model.py CHANGED
@@ -1,104 +1,160 @@
- import torch
- import numpy as np
import os
+ import json
+ import random
+ import torch
+ import torchaudio
+ from torch.utils.data import Dataset, DataLoader
from huggingface_hub import login, upload_folder
- from datasets import load_dataset, Audio
from transformers.integrations import TensorBoardCallback
from transformers import (
-     Wav2Vec2FeatureExtractor, AutoModelForAudioClassification,
+     Wav2Vec2FeatureExtractor, HubertConfig, HubertForSequenceClassification,
    Trainer, TrainingArguments,
    EarlyStoppingCallback
)
- import json
- # FLOAT32 IS USED IN THE ORIGINAL MODEL
+
MODEL = "ntu-spml/distilhubert" # base model used; to use another one, just change this
FEATURE_EXTRACTOR = Wav2Vec2FeatureExtractor.from_pretrained(MODEL)
seed = 123
MAX_DURATION = 1.00
- SAMPLING_RATE = FEATURE_EXTRACTOR.sampling_rate # 16000 # this was float16 before
+ SAMPLING_RATE = FEATURE_EXTRACTOR.sampling_rate # 16000
token = os.getenv('MODEL_REPO_ID')
config_file = "models_config.json"
clasificador = "class"
monitor = "mon"
+ batch_size = 4096
+
+ class AudioDataset(Dataset):
+     def __init__(self, dataset_path, label2id):
+         self.dataset_path = dataset_path
+         self.label2id = label2id
+         self.file_paths = []
+         self.labels = []
+         for label_dir, label_id in self.label2id.items():
+             label_path = os.path.join(self.dataset_path, label_dir)
+             if os.path.isdir(label_path):
+                 for file_name in os.listdir(label_path):
+                     audio_path = os.path.join(label_path, file_name)
+                     self.file_paths.append(audio_path)
+                     self.labels.append(label_id)
+
+     def __len__(self):
+         return len(self.file_paths)
+
+     def __getitem__(self, idx):
+         audio_path = self.file_paths[idx]
+         label = self.labels[idx]
+         input_values = self.preprocess_audio(audio_path)
+         return {
+             "input_values": input_values,
+             "labels": torch.tensor(label)
+         }
+
+     def preprocess_audio(self, audio_path):
+         waveform, sample_rate = torchaudio.load(
+             audio_path,
+             normalize=True, # converts to float32
+             # num_frames= # TODO: try this so the audios don't have to be trimmed
+         )
+         if sample_rate != SAMPLING_RATE: # resample if it isn't 16 kHz
+             resampler = torchaudio.transforms.Resample(sample_rate, SAMPLING_RATE)
+             waveform = resampler(waveform)
+         if waveform.shape[0] > 1: # if stereo, convert to mono
+             waveform = waveform.mean(dim=0)
+         waveform = waveform / torch.max(torch.abs(waveform))
+         inputs = FEATURE_EXTRACTOR(
+             waveform,
+             sampling_rate=SAMPLING_RATE,
+             return_tensors="pt",
+             max_length=int(SAMPLING_RATE * MAX_DURATION),
+             truncation=True,
+             padding=True,
+         )
+         return inputs.input_values.squeeze()

def seed_everything():
-     np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':16384:8'

- def preprocess_audio(audio_arrays, batch=True):
-     if batch:
-         audios = [x["array"] for x in audio_arrays["audio"]] # used here
-     else:
-         audios = [audio_arrays] # used in realtime.py
-     inputs = Wav2Vec2FeatureExtractor.from_pretrained(MODEL)(
-         raw_speech=audios,
-         sampling_rate=SAMPLING_RATE,
-         return_tensors="pt", # return PyTorch tensors
-         max_length=int(SAMPLING_RATE * MAX_DURATION), # needed for truncation
-         truncation=True, # much faster
-         padding=True, # vectors of equal length
-         do_normalize=True, # doesn't affect the 1st epoch, not sure if needed
-         # return_attention_mask=True, # worsens the 1st epoch; not sure if needed
-         padding_value=0.0, # doesn't affect the 1st epoch, not sure if needed
-         float=32 # doesn't affect the 1st epoch, not sure if needed
-     )
-     return inputs
+ def build_label_mappings(dataset_path):
+     label2id = {}
+     id2label = {}
+     label_id = 0
+     for label_dir in os.listdir(dataset_path):
+         if os.path.isdir(os.path.join(dataset_path, label_dir)):
+             label2id[label_dir] = label_id
+             id2label[label_id] = label_dir
+             label_id += 1
+     return label2id, id2label

- def load_and_prepare_dataset(dataset_path):
-     dataset = load_dataset(dataset_path, split="train") # split so it doesn't just show train at first
-     # dataset = dataset.cast_column("audio", Audio(sampling_rate=SAMPLING_RATE)) # gives better accuracy but I think it changes the preprocessing
-     encoded_dataset = dataset.map(preprocess_audio, remove_columns=["audio"], batched=True) # num_proc keeps it from running realtime
-     labels = encoded_dataset.features["label"].names
-     label2id = {label: str(i) for i, label in enumerate(labels)}
-     id2label = {str(i): label for i, label in enumerate(labels)}
-     encoded_dataset = encoded_dataset.train_test_split(test_size=0.2, seed=seed, stratify_by_column="label")
-     return encoded_dataset, label2id, id2label
+ def create_dataloader(dataset_path, test_size=0.2, num_workers=12, shuffle=True, pin_memory=True):
+     label2id, id2label = build_label_mappings(dataset_path)
+     dataset = AudioDataset(dataset_path, label2id)
+     dataset_size = len(dataset)
+     indices = list(range(dataset_size))
+     random.shuffle(indices)
+     split_idx = int(dataset_size * (1 - test_size))
+     train_indices = indices[:split_idx]
+     test_indices = indices[split_idx:]
+     train_dataset = torch.utils.data.Subset(dataset, train_indices)
+     test_dataset = torch.utils.data.Subset(dataset, test_indices)
+     train_dataloader = DataLoader(
+         train_dataset, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers, pin_memory=pin_memory
+     )
+     test_dataloader = DataLoader(
+         test_dataset, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers, pin_memory=pin_memory
+     )
+     return train_dataloader, test_dataloader, label2id, id2label

def load_model(num_labels, label2id, id2label):
-     model = AutoModelForAudioClassification.from_pretrained(
+     config = HubertConfig.from_pretrained(
        MODEL,
        num_labels=num_labels,
        label2id=label2id,
-         id2label=id2label
+         id2label=id2label,
+         finetuning_task="audio-classification"
+     )
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     model = HubertForSequenceClassification.from_pretrained( # TODO: review parameters; possible optimizations
+         MODEL,
+         config=config,
+         torch_dtype=torch.float32, # doesn't affect the 1st epoch; better to set it
    )
+     model.to(device)
    return model

def model_params(dataset_path):
-     login(token, add_to_git_credential=True)
-     seed_everything()
-     encoded_dataset, label2id, id2label = load_and_prepare_dataset(dataset_path)
-     model = load_model(len(id2label), label2id, id2label)
-     return model, encoded_dataset, id2label
+     train_dataloader, test_dataloader, label2id, id2label = create_dataloader(dataset_path)
+     model = load_model(num_labels=len(id2label), label2id=label2id, id2label=id2label)
+     return model, train_dataloader, test_dataloader, id2label

def compute_metrics(eval_pred):
-     predictions = np.argmax(eval_pred.predictions, axis=1)
+     predictions = torch.argmax(input=eval_pred.predictions)
    references = eval_pred.label_ids
    return {
-         "accuracy": np.mean(predictions == references),
+         "accuracy": torch.mean(predictions == references),
    }

- def model_training(training_args, output_dir, dataset_path):
-     model, encoded_dataset, _ = model_params(dataset_path)
-     tensorboard_callback = TensorBoardCallback()
-     early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=3)
+ def main(training_args, output_dir, dataset_path):
+     seed_everything()
+     model, train_dataloader, test_dataloader, _ = model_params(dataset_path)
    trainer = Trainer(
        model=model,
        args=training_args,
        compute_metrics=compute_metrics,
-         train_dataset=encoded_dataset["train"],
-         eval_dataset=encoded_dataset["test"],
-         callbacks=[tensorboard_callback, early_stopping_callback]
+         train_dataset=train_dataloader.dataset,
+         eval_dataset=test_dataloader.dataset,
+         callbacks=[TensorBoardCallback(), EarlyStoppingCallback(early_stopping_patience=3)]
    )
    torch.cuda.empty_cache() # free GPU memory
    trainer.train() # the parameters can be modified to resume training
+     login(token, add_to_git_credential=True)
    trainer.push_to_hub(token=token) # Upload the model to my account. Needed for prediction, not sure why.
    trainer.save_model(output_dir) # to upload the model to Hugging Face. Needed for prediction, not sure why.
    os.makedirs(output_dir, exist_ok=True) # create the model folder if it doesn't exist
-     upload_folder(repo_id=f"A-POR-LOS-8000/{output_dir}",folder_path=output_dir, token=token) # upload the model to the organization
+     # upload_folder(repo_id=f"A-POR-LOS-8000/{output_dir}",folder_path=output_dir, token=token) # upload the model to the organization

def load_config(model_name):
    with open(config_file, 'r') as f:
@@ -109,10 +165,9 @@ def load_config(model_name):
    return model_config

if __name__ == "__main__":
-     config = load_config(clasificador) # TO SWITCH MODELS
-     # config = load_config(monitor) # TO SWITCH MODELS
+     # config = load_config(clasificador) # TO SWITCH MODELS
+     config = load_config(monitor) # TO SWITCH MODELS
    training_args = config["training_args"]
    output_dir = config["output_dir"]
    dataset_path = config["dataset_path"]
-     model_training(training_args, output_dir, dataset_path)
-
+     main(training_args, output_dir, dataset_path)
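
Review note: the new compute_metrics will not run as written. Trainer passes eval_pred.predictions and eval_pred.label_ids as NumPy arrays, so torch.argmax(input=eval_pred.predictions) raises a TypeError; it also omits a dim, so even on a tensor it would reduce over the flattened logits, and torch.mean is not defined for the boolean tensor that == produces. A minimal sketch of a working version in this commit's torch-only style, assuming predictions are logits of shape (batch, num_labels):

import torch

def compute_metrics(eval_pred):
    # Trainer hands over NumPy arrays; wrap them in tensors first.
    predictions = torch.argmax(torch.from_numpy(eval_pred.predictions), dim=-1)
    references = torch.from_numpy(eval_pred.label_ids)
    return {
        # Cast the boolean matches to float so mean() is defined.
        "accuracy": (predictions == references).float().mean().item(),
    }

Two related caveats: EarlyStoppingCallback only takes effect when the TrainingArguments enable periodic evaluation with load_best_model_at_end=True and a metric_for_best_model; and since Trainer builds its own DataLoaders from train_dataset/eval_dataset, the batch_size, num_workers, and pin_memory configured in create_dataloader are ignored during training and would need to be mirrored in TrainingArguments. Also, if config["training_args"] is a plain dict read from models_config.json (an assumption about that file's contents), it likely has to be unpacked into a TrainingArguments object, for example TrainingArguments(**config["training_args"]), before being passed to Trainer.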