maxseats committed on
Commit
ca29927
•
1 Parent(s): 89831ff

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +37 -1
README.md CHANGED
@@ -16,5 +16,41 @@ metrics:
16
 
17
  # 설명
18
  - 주요 영역별 회의 음성 데이터셋 680GB 중 첫번째 데이터(10GB)를 파인튜닝한 모델입니다.
19
- - 링크 : https://huggingface.co/datasets/maxseats/aihub-464-preprocessed-680GB-set-0
20
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
  # 설명
18
  - 주요 영역별 회의 음성 데이터셋 680GB 중 첫번째 데이터(10GB)를 파인튜닝한 모델입니다.
19
+ - 데이터셋 링크 : https://huggingface.co/datasets/maxseats/aihub-464-preprocessed-680GB-set-0
20
 
21
+ # 파라미터
22
+ # model_name = "openai/whisper-base"
23
+ ```
24
+ model_name = "SungBeom/whisper-small-ko" # 대안 : "SungBeom/whisper-small-ko"
25
+ dataset_name = "maxseats/aihub-464-preprocessed-680GB-set-0" # 불러올 데이터셋(허깅페이스 기준)
26
+
27
+ CACHE_DIR = '/mnt/a/maxseats/.finetuning_cache' # 캐시 디렉토리 지정
28
+ is_test = False # True: 소량의 샘플 데이터로 테스트, False: 실제 파인튜닝
29
+
30
+ token = "hf_" # 허깅페이스 토큰 입력
31
+
32
+ training_args = Seq2SeqTrainingArguments(
33
+ output_dir=model_dir, # 원하는 리포지토리 이름을 입력한다.
34
+ per_device_train_batch_size=16,
35
+ gradient_accumulation_steps=2, # 배치 크기가 2배 감소할 때마다 2배씩 증가
36
+ learning_rate=1e-5,
37
+ warmup_steps=1000,
38
+ # max_steps=2, # epoch 대신 설정
39
+ num_train_epochs=1, # epoch 수 설정 / max_steps와 이것 중 하나만 설정
40
+ gradient_checkpointing=True,
41
+ fp16=True,
42
+ evaluation_strategy="steps",
43
+ per_device_eval_batch_size=16,
44
+ predict_with_generate=True,
45
+ generation_max_length=225,
46
+ save_steps=1000,
47
+ eval_steps=1000,
48
+ logging_steps=25,
49
+ report_to=["tensorboard"],
50
+ load_best_model_at_end=True,
51
+ metric_for_best_model="cer", # 한국어의 경우 'wer'보다는 'cer'이 더 적합할 것
52
+ greater_is_better=False,
53
+ push_to_hub=True,
54
+ save_total_limit=5, # 최대 저장할 모델 수 지정
55
+ )
56
+ ```