File size: 1,884 Bytes
89831ff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
786d3cf
ca29927
89831ff
ca29927
4d2bc87
ca29927
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55

---
language: ko
tags:
- whisper
- speech-recognition
datasets:
- maxseats/aihub-464-preprocessed-680GB-set-0
metrics:
- cer
---
# Model Name : SungBeom/whisper-small-ko
# Description

- 파인튜닝 데이터셋 : maxseats/aihub-464-preprocessed-680GB-set-0
- AI hub의 주요 영역별 회의 음성 데이터셋 680GB 중 첫번째 데이터(10GB)를 파인튜닝한 모델입니다.
- 데이터셋 링크 : https://huggingface.co/datasets/maxseats/aihub-464-preprocessed-680GB-set-0

# 파라미터

```
model_name = "SungBeom/whisper-small-ko" # 대안 : "SungBeom/whisper-small-ko"
dataset_name = "maxseats/aihub-464-preprocessed-680GB-set-0"  # 불러올 데이터셋(허깅페이스 기준)

CACHE_DIR = '/mnt/a/maxseats/.finetuning_cache'  # 캐시 디렉토리 지정
is_test = False  # True: 소량의 샘플 데이터로 테스트, False: 실제 파인튜닝

token = "hf_" # 허깅페이스 토큰 입력

training_args = Seq2SeqTrainingArguments(
    output_dir=model_dir,  # 원하는 리포지토리 이름을 입력한다.
    per_device_train_batch_size=16,
    gradient_accumulation_steps=2,  # 배치 크기가 2배 감소할 때마다 2배씩 증가
    learning_rate=1e-5,
    warmup_steps=1000,
    # max_steps=2,  # epoch 대신 설정
    num_train_epochs=1,     # epoch 수 설정 / max_steps와 이것 중 하나만 설정
    gradient_checkpointing=True,
    fp16=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=16,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=1000,
    eval_steps=1000,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="cer",  # 한국어의 경우 'wer'보다는 'cer'이 더 적합할 것
    greater_is_better=False,
    push_to_hub=True,
    save_total_limit=5,           # 최대 저장할 모델 수 지정
)
```