Marcos12886 committed on
Commit
3a98934
1 Parent(s): fe16cc3

Update model.py

Browse files
Files changed (1) hide show
  1. model.py +57 -125
model.py CHANGED
@@ -1,172 +1,104 @@
1
- import os
2
- import json
3
- import random
4
  import torch
5
- import torchaudio
6
- from torch.utils.data import Dataset, DataLoader
7
  from huggingface_hub import login, upload_folder
 
8
  from transformers.integrations import TensorBoardCallback
9
- from sklearn.metrics import accuracy_score, precision_recall_fscore_support
10
  from transformers import (
11
- Wav2Vec2FeatureExtractor, HubertConfig, HubertForSequenceClassification,
12
  Trainer, TrainingArguments,
13
  EarlyStoppingCallback
14
  )
15
-
 
16
# Base checkpoint; to fine-tune a different model, change only this identifier.
MODEL = "ntu-spml/distilhubert"
FEATURE_EXTRACTOR = Wav2Vec2FeatureExtractor.from_pretrained(MODEL)

seed = 123
MAX_DURATION = 1.00  # multiplied by SAMPLING_RATE to get the clip length in samples
SAMPLING_RATE = FEATURE_EXTRACTOR.sampling_rate  # 16000
batch_size = 16

token = os.getenv("HF_TOKEN")

config_file = "models_config.json"
# Names handed to load_config() to select a configuration entry.
clasificador = "class"
monitor = "mon"
26
-
27
class AudioDataset(Dataset):
    """Map-style dataset over a directory tree with one sub-folder per label.

    Every entry found in ``dataset_path/<label>`` is treated as an audio file
    whose class id is ``label2id[<label>]``.
    """

    def __init__(self, dataset_path, label2id):
        """Index all files below each label folder.

        Args:
            dataset_path: Root directory holding one sub-directory per label.
            label2id: Mapping from sub-directory name to integer class id.
        """
        self.dataset_path = dataset_path
        self.label2id = label2id
        self.file_paths = []
        self.labels = []
        for label_dir, label_id in self.label2id.items():
            label_path = os.path.join(self.dataset_path, label_dir)
            if not os.path.isdir(label_path):
                continue
            # os.listdir order is arbitrary; sort so indices are stable between runs.
            for file_name in sorted(os.listdir(label_path)):
                self.file_paths.append(os.path.join(label_path, file_name))
                self.labels.append(label_id)

    def __len__(self):
        """Return the number of indexed audio files."""
        return len(self.file_paths)

    def __getitem__(self, idx):
        """Return ``{"input_values": tensor, "labels": tensor}`` for item ``idx``."""
        return {
            "input_values": self.preprocess_audio(self.file_paths[idx]),
            "labels": torch.tensor(self.labels[idx]),
        }

    def preprocess_audio(self, audio_path):
        """Load one file and turn it into fixed-length model input.

        Steps: load as float, resample to ``SAMPLING_RATE`` if needed, down-mix
        to a single channel, peak-normalise, cut or zero-pad to
        ``SAMPLING_RATE * MAX_DURATION`` samples, then run ``FEATURE_EXTRACTOR``.

        Args:
            audio_path: Path of the audio file to load.

        Returns:
            The extractor's ``input_values`` with singleton dimensions squeezed.
        """
        waveform, sample_rate = torchaudio.load(
            audio_path,
            normalize=True,  # converts to float32
            # num_frames=  # TODO: try this so clips need no manual trimming
        )
        if sample_rate != SAMPLING_RATE:  # resample anything that is not 16 kHz
            resampler = torchaudio.transforms.Resample(sample_rate, SAMPLING_RATE)
            waveform = resampler(waveform)
        # Always collapse the channel axis. The previous `shape[0] > 1` guard left
        # mono clips as (1, T), so the length checks below compared the channel
        # count against max_length: mono audio was never truncated and was padded
        # by max_length - 1 samples. Averaging a single channel is a no-op on values.
        waveform = waveform.mean(dim=0)
        # Author's note: accuracy is very poor without the 1e-6 term.
        waveform = waveform / (torch.max(torch.abs(waveform)) + 1e-6)
        max_length = int(SAMPLING_RATE * MAX_DURATION)
        if waveform.shape[0] > max_length:
            waveform = waveform[:max_length]
        else:
            # Zero-pad on the right up to max_length.
            waveform = torch.nn.functional.pad(waveform, (0, max_length - waveform.shape[0]))
        inputs = FEATURE_EXTRACTOR(
            waveform,
            sampling_rate=SAMPLING_RATE,
            return_tensors="pt",
            # max_length=int(SAMPLING_RATE * MAX_DURATION),
            # truncation=True,
            padding=True,
        )
        return inputs.input_values.squeeze()
80
 
81
def seed_everything():
    """Seed every RNG this script relies on and request deterministic kernels.

    Uses the module-level ``seed``. Python's ``random`` is seeded as well
    because the train/test split is produced with ``random.shuffle``; without
    it the split differed on every run.
    """
    import random  # local import keeps this block self-contained

    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Workspace setting read by cuBLAS; see PyTorch's reproducibility notes.
    os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':16384:8'
87
 
88
def build_label_mappings(dataset_path):
    """Derive label <-> id mappings from the sub-directories of ``dataset_path``.

    Sub-directory names are sorted before ids are assigned, so a given dataset
    always yields the same mapping. ``os.listdir`` alone returns entries in
    arbitrary order, which made class ids vary between machines and runs.

    Args:
        dataset_path: Directory whose immediate sub-directories name the labels.

    Returns:
        ``(label2id, id2label)``: name -> int and int -> name dictionaries.
        Plain files directly inside ``dataset_path`` are ignored.
    """
    label_dirs = sorted(
        entry
        for entry in os.listdir(dataset_path)
        if os.path.isdir(os.path.join(dataset_path, entry))
    )
    label2id = {name: index for index, name in enumerate(label_dirs)}
    id2label = {index: name for name, index in label2id.items()}
    return label2id, id2label
 
 
 
 
 
 
 
 
98
 
99
def create_dataloader(dataset_path, test_size=0.2, num_workers=12, shuffle=True, pin_memory=True):
    """Build train and test DataLoaders from a folder-per-label dataset.

    Args:
        dataset_path: Root directory passed to ``build_label_mappings`` and
            ``AudioDataset``.
        test_size: Fraction of the samples assigned to the test subset.
        num_workers: Forwarded to both ``DataLoader`` instances.
        shuffle: Forwarded to both ``DataLoader`` instances.
        pin_memory: Forwarded to both ``DataLoader`` instances.

    Returns:
        ``(train_dataloader, test_dataloader, label2id, id2label)``.
    """
    label2id, id2label = build_label_mappings(dataset_path)
    dataset = AudioDataset(dataset_path, label2id)
    indices = list(range(len(dataset)))
    # Dedicated seeded RNG: the split is the same on every run and neither
    # depends on nor disturbs the global `random` state.
    random.Random(seed).shuffle(indices)
    split_idx = int(len(indices) * (1 - test_size))
    train_dataset = torch.utils.data.Subset(dataset, indices[:split_idx])
    test_dataset = torch.utils.data.Subset(dataset, indices[split_idx:])
    loader_options = dict(
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        pin_memory=pin_memory,
    )
    train_dataloader = DataLoader(train_dataset, **loader_options)
    test_dataloader = DataLoader(test_dataset, **loader_options)
    return train_dataloader, test_dataloader, label2id, id2label
117
 
118
def load_model(num_labels, label2id, id2label):
    """Create the HuBERT sequence classifier from ``MODEL`` and move it to a device.

    Args:
        num_labels: Number of output classes.
        label2id: Label name -> class id mapping stored in the config.
        id2label: Class id -> label name mapping stored in the config.

    Returns:
        The model, placed on CUDA when available and on CPU otherwise.
    """
    hubert_config = HubertConfig.from_pretrained(
        MODEL,
        num_labels=num_labels,
        label2id=label2id,
        id2label=id2label,
        finetuning_task="audio-classification",
    )
    # TODO: review the remaining from_pretrained parameters for possible optimisations.
    classifier = HubertForSequenceClassification.from_pretrained(
        MODEL,
        config=hubert_config,
        torch_dtype=torch.float32,  # author's note: no effect on 1st epoch, kept explicit
    )
    target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    classifier.to(target_device)
    return classifier
134
 
135
def model_params(dataset_path):
    """Prepare the data loaders and a matching model for ``dataset_path``.

    Returns:
        ``(model, train_dataloader, test_dataloader, id2label)``.
    """
    loaders_and_maps = create_dataloader(dataset_path)
    train_dataloader, test_dataloader, label2id, id2label = loaders_and_maps
    class_count = len(id2label)
    model = load_model(class_count, label2id, id2label)
    return model, train_dataloader, test_dataloader, id2label
 
 
139
 
140
def compute_metrics(eval_pred):
    """Score predictions with accuracy and weighted precision/recall/F1.

    Args:
        eval_pred: Object exposing ``predictions`` (arg-maxed over the last
            axis to obtain class ids) and ``label_ids``.

    Returns:
        Dict with keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    predicted = torch.tensor(eval_pred.predictions).argmax(dim=-1)
    expected = torch.tensor(eval_pred.label_ids)
    weighted_scores = precision_recall_fscore_support(expected, predicted, average='weighted')
    metrics = {"accuracy": accuracy_score(expected, predicted)}
    # zip stops after three names, dropping the trailing support value.
    metrics.update(zip(("precision", "recall", "f1"), weighted_scores))
    return metrics
151
 
152
def main(training_args, output_dir, dataset_path):
    """Train the classifier, push it to the Hub and save it locally.

    Args:
        training_args: Passed unchanged to ``Trainer(args=...)``.
        output_dir: Local directory where the trained model is saved.
        dataset_path: Dataset root handed to ``model_params``.
    """
    seed_everything()
    model, train_dataloader, test_dataloader, _ = model_params(dataset_path)
    trainer = Trainer(
        model=model,
        args=training_args,
        compute_metrics=compute_metrics,
        # Trainer takes datasets, so unwrap them from the loaders.
        train_dataset=train_dataloader.dataset,
        eval_dataset=test_dataloader.dataset,
        callbacks=[TensorBoardCallback(), EarlyStoppingCallback(early_stopping_patience=3)],
    )
    torch.cuda.empty_cache()  # release cached GPU memory
    trainer.train()  # arguments can be supplied here to resume training
    login(token, add_to_git_credential=True)
    # Upload to the personal account. Author's note: required for prediction
    # to work, reason unknown.
    trainer.push_to_hub(token=token)
    # Create the target folder before writing into it; previously this ran
    # after save_model, where it could no longer have any effect.
    os.makedirs(output_dir, exist_ok=True)
    trainer.save_model(output_dir)
    # upload_folder(repo_id=f"A-POR-LOS-8000/{output_dir}",folder_path=output_dir, token=token)  # upload model to the organization
170
 
171
  def load_config(model_name):
172
  with open(config_file, 'r') as f:
@@ -177,9 +109,9 @@ def load_config(model_name):
177
  return model_config
178
 
179
if __name__ == "__main__":
    # Switch models by loading `clasificador` instead of `monitor` here.
    config = load_config(monitor)
    training_args, output_dir, dataset_path = (
        config["training_args"],
        config["output_dir"],
        config["dataset_path"],
    )
    main(training_args, output_dir, dataset_path)
 
 
 
 
1
  import torch
2
+ import numpy as np
3
+ import os
4
  from huggingface_hub import login, upload_folder
5
+ from datasets import load_dataset, Audio
6
  from transformers.integrations import TensorBoardCallback
 
7
  from transformers import (
8
+ Wav2Vec2FeatureExtractor, AutoModelForAudioClassification,
9
  Trainer, TrainingArguments,
10
  EarlyStoppingCallback
11
  )
12
+ import json
13
# Author's note: the original model uses float32.
# Base checkpoint; to fine-tune a different model, change only this identifier.
MODEL = "ntu-spml/distilhubert"
FEATURE_EXTRACTOR = Wav2Vec2FeatureExtractor.from_pretrained(MODEL)

seed = 123
MAX_DURATION = 1.00  # multiplied by SAMPLING_RATE to get the clip length in samples
SAMPLING_RATE = FEATURE_EXTRACTOR.sampling_rate  # 16000 (author's note: "was float16 before")

# NOTE(review): the variable is named MODEL_REPO_ID but its value is used as the
# Hugging Face auth token (login / push_to_hub / upload_folder) - confirm.
token = os.getenv('MODEL_REPO_ID')

config_file = "models_config.json"
# Names handed to load_config() to select a configuration entry.
clasificador = "class"
monitor = "mon"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
def seed_everything():
    """Seed NumPy and PyTorch with the module-level ``seed`` and pin cuDNN flags."""
    for seed_fn in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
    # Workspace setting read by cuBLAS; see PyTorch's reproducibility notes.
    os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':16384:8'
31
 
32
def preprocess_audio(audio_arrays, batch=True):
    """Run the shared feature extractor over one clip or a batch of clips.

    Args:
        audio_arrays: With ``batch=True``, a mapping whose ``"audio"`` entry is
            a sequence of items each exposing an ``"array"`` waveform (the
            layout used by ``dataset.map(..., batched=True)`` in this file).
            With ``batch=False``, a single raw waveform (per the original
            comment, used from realtime.py).
        batch: Selects which of the two input layouts is expected.

    Returns:
        The feature extractor output with PyTorch tensors, truncated to
        ``SAMPLING_RATE * MAX_DURATION`` samples and padded to equal length.
    """
    if batch:
        audios = [item["array"] for item in audio_arrays["audio"]]
    else:
        audios = [audio_arrays]
    # Reuse the module-level extractor instead of calling from_pretrained() on
    # every invocation. The former do_normalize / padding_value / float keyword
    # arguments were dropped: the extractor call signature does not list them,
    # which matches the author's remark that they made no difference.
    return FEATURE_EXTRACTOR(
        raw_speech=audios,
        sampling_rate=SAMPLING_RATE,
        return_tensors="pt",  # return PyTorch tensors
        max_length=int(SAMPLING_RATE * MAX_DURATION),  # required by truncation
        truncation=True,  # author's note: much faster
        padding=True,  # equal-length vectors
        # return_attention_mask=True,  # author's note: worsens the first epoch
    )
50
 
51
def load_and_prepare_dataset(dataset_path):
    """Load a dataset, encode its audio column and split it 80/20.

    Args:
        dataset_path: Passed to ``load_dataset``.

    Returns:
        ``(splits, label2id, id2label)`` where ``splits`` is the result of a
        stratified ``train_test_split`` and both mappings use string ids.
    """
    # Requesting split="train" yields the dataset itself, not a dict keyed by split.
    raw_dataset = load_dataset(dataset_path, split="train")
    # raw_dataset = raw_dataset.cast_column("audio", Audio(sampling_rate=SAMPLING_RATE))
    # ^ author's note: better accuracy, but believed to change the preprocessing.
    # Author's note: num_proc is left unset because it breaks the realtime path.
    encoded = raw_dataset.map(preprocess_audio, remove_columns=["audio"], batched=True)

    label2id = {}
    id2label = {}
    for position, class_name in enumerate(encoded.features["label"].names):
        class_id = str(position)
        label2id[class_name] = class_id
        id2label[class_id] = class_name

    splits = encoded.train_test_split(test_size=0.2, seed=seed, stratify_by_column="label")
    return splits, label2id, id2label
 
 
 
 
 
 
 
 
 
60
 
61
def load_model(num_labels, label2id, id2label):
    """Instantiate an audio-classification model from the ``MODEL`` checkpoint.

    Args:
        num_labels: Number of output classes.
        label2id: Label name -> id mapping forwarded to ``from_pretrained``.
        id2label: Id -> label name mapping forwarded to ``from_pretrained``.

    Returns:
        The loaded model.
    """
    label_settings = {
        "num_labels": num_labels,
        "label2id": label2id,
        "id2label": id2label,
    }
    return AutoModelForAudioClassification.from_pretrained(MODEL, **label_settings)
69
 
70
def model_params(dataset_path):
    """Log in to the Hub, seed the RNGs, then build the dataset and model.

    Returns:
        ``(model, encoded_dataset, id2label)``.
    """
    login(token, add_to_git_credential=True)
    seed_everything()
    prepared = load_and_prepare_dataset(dataset_path)
    encoded_dataset, label2id, id2label = prepared
    model = load_model(num_labels=len(id2label), label2id=label2id, id2label=id2label)
    return model, encoded_dataset, id2label
76
 
77
def compute_metrics(eval_pred):
    """Return the accuracy of arg-maxed predictions against the reference labels.

    Args:
        eval_pred: Object exposing ``predictions`` (scores, arg-maxed along
            axis 1) and ``label_ids``.

    Returns:
        ``{"accuracy": <fraction of matching entries>}``.
    """
    predicted_ids = np.argmax(eval_pred.predictions, axis=1)
    hits = predicted_ids == eval_pred.label_ids
    return {"accuracy": np.mean(hits)}
83
 
84
def model_training(training_args, output_dir, dataset_path):
    """Train the classifier, then publish it and save it locally.

    Args:
        training_args: Passed unchanged to ``Trainer(args=...)``.
        output_dir: Local directory for the saved model; also used as the
            repository name under the ``A-POR-LOS-8000`` organization.
        dataset_path: Dataset location handed to ``model_params``.
    """
    model, encoded_dataset, _ = model_params(dataset_path)
    tensorboard_callback = TensorBoardCallback()
    early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=3)
    trainer = Trainer(
        model=model,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=encoded_dataset["train"],
        eval_dataset=encoded_dataset["test"],
        callbacks=[tensorboard_callback, early_stopping_callback],
    )
    torch.cuda.empty_cache()  # release cached GPU memory
    trainer.train()  # arguments can be supplied here to resume training
    # Upload to the personal account. Author's note: required for prediction
    # to work, reason unknown.
    trainer.push_to_hub(token=token)
    # Create the target folder before writing into it; previously this ran
    # after save_model, where it could no longer have any effect.
    os.makedirs(output_dir, exist_ok=True)
    trainer.save_model(output_dir)
    # Upload the saved folder to the organization.
    upload_folder(repo_id=f"A-POR-LOS-8000/{output_dir}", folder_path=output_dir, token=token)
102
 
103
  def load_config(model_name):
104
  with open(config_file, 'r') as f:
 
109
  return model_config
110
 
111
if __name__ == "__main__":
    # Switch models by loading `monitor` instead of `clasificador` here.
    config = load_config(clasificador)
    training_args, output_dir, dataset_path = (
        config["training_args"],
        config["output_dir"],
        config["dataset_path"],
    )
    model_training(training_args, output_dir, dataset_path)