Marcos12886 committed on
Commit
5195c9e
1 Parent(s): dc03e3e

Upload model.py

Browse files
Files changed (1) hide show
  1. model.py +117 -0
model.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import numpy as np
3
+ import os
4
+ from huggingface_hub import login, upload_folder
5
+ from datasets import load_dataset, Audio
6
+ from transformers.integrations import TensorBoardCallback
7
+ from transformers import (
8
+ Wav2Vec2FeatureExtractor, AutoModelForAudioClassification,
9
+ Trainer, TrainingArguments,
10
+ EarlyStoppingCallback
11
+ )
12
+ import json
13
# NOTE: float32 is used in the original model.

# Base checkpoint; to try a different model it is enough to change this id.
MODEL = "ntu-spml/distilhubert"

# Feature extractor for the base checkpoint, loaded once at import time.
FEATURE_EXTRACTOR = Wav2Vec2FeatureExtractor.from_pretrained(MODEL)

# Seed shared by seed_everything() and the train/test split.
seed = 123

# Multiplied by SAMPLING_RATE to get the truncation length in samples
# (presumably a duration in seconds -- confirm).
MAX_DURATION = 1.00

# Author's note: 16000 for this checkpoint; "previously float16".
SAMPLING_RATE = FEATURE_EXTRACTOR.sampling_rate

# Hugging Face token read from the environment (None when the variable is
# unset). Author's note: could be changed to ask for it on every run.
access_token = os.environ.get("HF_ACCESS_TOKEN")

# JSON file read by load_config(); `clasificador` and `monitor` are the
# top-level keys used to select one model's settings from it.
config_file = "models_config.json"
clasificador = "class"
monitor = "mon"
23
+
24
def seed_everything():
    """Seed NumPy and PyTorch and switch cuDNN/cuBLAS to reproducible settings.

    Every generator is seeded with the module-level ``seed``.
    """
    # Same seeding order as before: NumPy, PyTorch CPU, PyTorch CUDA.
    for apply_seed in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    # cuBLAS workspace configuration, exported through the process environment.
    os.environ.update({'CUBLAS_WORKSPACE_CONFIG': ':16384:8'})
31
+
32
def preprocess_audio(audio_arrays, batch=True):
    """Convert raw audio into feature-extractor output ready for the model.

    Args:
        audio_arrays: When ``batch`` is true, a mapping whose ``"audio"`` entry
            is a sequence of items that each expose an ``"array"`` key (this is
            how ``load_and_prepare_dataset`` calls it through a batched
            ``map``). When ``batch`` is false, a single raw waveform, which is
            wrapped in a one-element list (used from realtime.py).
        batch: Selects between the two input layouts described above.

    Returns:
        The feature extractor's output, requested as PyTorch tensors, with
        truncation to ``int(SAMPLING_RATE * MAX_DURATION)`` samples and padding
        enabled.
    """
    if batch:
        audios = [item["array"] for item in audio_arrays["audio"]]
    else:
        audios = [audio_arrays]

    # Reuse the extractor that was loaded once at import time from the same
    # checkpoint, instead of calling from_pretrained() again on every batch /
    # every real-time clip.
    return FEATURE_EXTRACTOR(
        raw_speech=audios,
        sampling_rate=SAMPLING_RATE,
        return_tensors="pt",  # return PyTorch tensors
        max_length=int(SAMPLING_RATE * MAX_DURATION),  # required for truncation
        truncation=True,  # author's note: much faster
        padding=True,  # pad clips to a common length
        # Author's notes: return_attention_mask=True made the first epoch
        # worse, so it is left off; the three arguments below did not change
        # the first epoch.
        # NOTE(review): these three do not look like named parameters of the
        # extractor's __call__ and may be silently ignored -- confirm against
        # the installed transformers version before relying on them.
        do_normalize=True,
        padding_value=0.0,
        float=32,
    )
50
+
51
def load_and_prepare_dataset(dataset_path):
    """Load a dataset, encode its audio and split it into train/test.

    Args:
        dataset_path: Passed straight to ``load_dataset``.

    Returns:
        A tuple ``(splits, label2id, id2label)``: the stratified 80/20
        train/test split of the encoded dataset, and the two label mappings
        (ids are stored as strings).
    """
    # Ask for the "train" split directly so the result is not wrapped in a
    # dict keyed by split name.
    raw = load_dataset(dataset_path, split="train")

    # Author's notes: casting "audio" to Audio(sampling_rate=SAMPLING_RATE)
    # gave better accuracy but is believed to change the preprocessing, so it
    # is not applied; num_proc is not used because it breaks the realtime path.
    encoded = raw.map(preprocess_audio, remove_columns=["audio"], batched=True)

    label2id = {}
    id2label = {}
    for index, name in enumerate(encoded.features["label"].names):
        key = str(index)
        label2id[name] = key
        id2label[key] = name

    splits = encoded.train_test_split(
        test_size=0.2,
        seed=seed,
        stratify_by_column="label",
    )
    return splits, label2id, id2label
60
+
61
def load_model(num_labels, label2id, id2label):
    """Instantiate the audio-classification model from the base checkpoint.

    Args:
        num_labels: Number of target classes.
        label2id: Mapping from label name to id.
        id2label: Mapping from id to label name.

    Returns:
        The model returned by
        ``AutoModelForAudioClassification.from_pretrained`` for ``MODEL``.
    """
    label_settings = {
        "num_labels": num_labels,
        "label2id": label2id,
        "id2label": id2label,
    }
    return AutoModelForAudioClassification.from_pretrained(MODEL, **label_settings)
69
+
70
def model_params(dataset_path):
    """Log in to the Hub, seed the RNGs, and build the dataset and the model.

    Args:
        dataset_path: Dataset identifier forwarded to
            ``load_and_prepare_dataset``.

    Returns:
        A tuple ``(model, encoded_dataset, id2label)``.
    """
    # The token lives in the module-level `access_token`; the previous code
    # referenced an undefined name `token` and raised NameError here.
    login(token=access_token, add_to_git_credential=True)
    seed_everything()
    encoded_dataset, label2id, id2label = load_and_prepare_dataset(dataset_path)
    model = load_model(len(id2label), label2id, id2label)
    return model, encoded_dataset, id2label
76
+
77
def compute_metrics(eval_pred):
    """Compute classification accuracy for an evaluation pass.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores, one row
            per example) and ``label_ids`` (the reference class ids).

    Returns:
        A dict with a single ``"accuracy"`` entry: the fraction of rows whose
        arg-max class equals the reference id.
    """
    predicted_ids = np.argmax(eval_pred.predictions, axis=1)
    hits = predicted_ids == eval_pred.label_ids
    return {"accuracy": np.mean(hits)}
83
+
84
def model_training(training_args, output_dir, dataset_path):
    """Train the classifier, then publish it to the Hugging Face Hub.

    Args:
        training_args: ``TrainingArguments`` instance handed to the ``Trainer``.
        output_dir: Local folder the model is saved to; also used as the
            repository name under the ``A-POR-LOS-8000`` organisation.
        dataset_path: Dataset identifier forwarded to ``model_params``.
    """
    model, encoded_dataset, _ = model_params(dataset_path)

    trainer = Trainer(
        model=model,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=encoded_dataset["train"],
        eval_dataset=encoded_dataset["test"],
        callbacks=[
            TensorBoardCallback(),
            EarlyStoppingCallback(early_stopping_patience=3),
        ],
    )

    torch.cuda.empty_cache()  # free GPU memory before training starts
    trainer.train()  # arguments can be supplied here to resume a previous run

    # The token is the module-level `access_token`; the previous code used an
    # undefined name `token` and raised NameError after training had finished.
    # Author's note: pushing to the personal account is needed for prediction
    # to work, for reasons the author had not identified.
    trainer.push_to_hub(token=access_token)

    # Make sure the target folder exists *before* anything is written to it
    # (this call used to run after the save).
    os.makedirs(output_dir, exist_ok=True)
    trainer.save_model(output_dir)

    # Publish the saved folder to the organisation repository.
    upload_folder(
        repo_id=f"A-POR-LOS-8000/{output_dir}",
        folder_path=output_dir,
        token=access_token,
    )
102
+
103
def load_config(model_name):
    """Read one model's settings from the JSON configuration file.

    Args:
        model_name: Top-level key in ``config_file`` selecting the model.

    Returns:
        The selected settings dict, with its ``"training_args"`` entry
        replaced by a ``TrainingArguments`` instance built from the original
        mapping.

    Raises:
        KeyError: If ``model_name`` or ``"training_args"`` is missing.
    """
    # Decode explicitly as UTF-8 rather than relying on the platform's
    # locale-dependent default encoding.
    with open(config_file, "r", encoding="utf-8") as f:
        model_config = json.load(f)[model_name]
    model_config["training_args"] = TrainingArguments(**model_config["training_args"])
    return model_config
110
+
111
if __name__ == "__main__":
    # Select which entry of the config file to train: `clasificador` here;
    # pass `monitor` to load_config() instead to train the other model.
    config = load_config(clasificador)
    model_training(
        config["training_args"],
        config["output_dir"],
        config["dataset_path"],
    )