Update README.md
README.md (CHANGED)
@@ -9,15 +9,65 @@ model-index:
 - name: LLaMA-3-8B-Instruct-TR-DPO
   results:
   - task:
-      type:
+      type: multiple-choice
     dataset:
-      type:
+      type: multiple-choice
       name: MMLU_TR_V0.2
     metrics:
     - name: 5-shot
       type: 5-shot
       value: 0.4983
       verified: false
+  - task:
+      type: multiple-choice
+    dataset:
+      type: multiple-choice
+      name: Truthful_QA_V0.2
+    metrics:
+    - name: 0-shot
+      type: 0-shot
+      value: 0.5232
+      verified: false
+  - task:
+      type: multiple-choice
+    dataset:
+      type: multiple-choice
+      name: ARC_TR_V0.2
+    metrics:
+    - name: 25-shot
+      type: 25-shot
+      value: 0.4437
+      verified: false
+  - task:
+      type: multiple-choice
+    dataset:
+      type: multiple-choice
+      name: HellaSwag_TR_V0.2
+    metrics:
+    - name: 10-shot
+      type: 10-shot
+      value: 0.4558
+      verified: false
+  - task:
+      type: multiple-choice
+    dataset:
+      type: multiple-choice
+      name: GSM8K_TR_V0.2
+    metrics:
+    - name: 5-shot
+      type: 5-shot
+      value: 0.5421
+      verified: false
+  - task:
+      type: multiple-choice
+    dataset:
+      type: multiple-choice
+      name: Winogrande_TR_V0.2
+    metrics:
+    - name: 5-shot
+      type: 5-shot
+      value: 0.5506
+      verified: false
 ---
 
 <img src="https://huggingface.co/Metin/LLaMA-3-8B-Instruct-TR-DPO/resolve/main/llama.png"
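The `model-index` block edited above is the metadata the Hugging Face Hub parses into structured evaluation results on the model page. As a minimal sketch of reading those entries back programmatically, assuming the current `huggingface_hub` ModelCard API (this code is illustrative and not part of the commit):

```python
from huggingface_hub import ModelCard

# Load the card straight from the Hub; the YAML front matter above is
# parsed into card.data, including the model-index entries.
card = ModelCard.load("Metin/LLaMA-3-8B-Instruct-TR-DPO")

# eval_results may be None if the model-index block is missing or malformed.
for result in card.data.eval_results or []:
    # e.g. "MMLU_TR_V0.2 5-shot 0.4983"
    print(result.dataset_name, result.metric_name, result.metric_value)
```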
@@ -101,12 +151,14 @@ print(outputs[0]["generated_text"][len(prompt):])
 
 ## OpenLLMTurkishLeaderboard_v0.2 benchmark results
 
-MMLU_TR_V0.2
-
-
-
-
-
+- **MMLU_TR_V0.2**: 49.83%
+- **Truthful_QA_TR_V0.2**: 52.32%
+- **ARC_TR_V0.2**: 44.37%
+- **HellaSwag_TR_V0.2**: 45.58%
+- **GSM8K_TR_V0.2**: 54.21%
+- **Winogrande_TR_V0.2**: 55.06%
+
+These scores may differ from what you will get when you run the same benchmarks, as I did not use any inference engine (vLLM, TensorRT-LLM, etc.)
 
 ## Output Example (DPO Model vs Base Model)
 
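The added caveat refers to the scores being produced with plain `transformers` generation rather than an inference engine such as vLLM or TensorRT-LLM, which matches the usage pattern in the card's own example (the `print(outputs[0]["generated_text"][len(prompt):])` context line in the hunk header above). A minimal sketch of that setup, with an illustrative prompt and sampling parameters that may differ from the ones in the card:

```python
import torch
from transformers import pipeline

# Plain-transformers generation (no vLLM / TensorRT-LLM).
pipe = pipeline(
    "text-generation",
    model="Metin/LLaMA-3-8B-Instruct-TR-DPO",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# Example Turkish prompt: "What is the capital of Turkey?"
messages = [{"role": "user", "content": "Türkiye'nin başkenti neresidir?"}]
prompt = pipe.tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

# Illustrative sampling parameters; the card's own example may use different ones.
outputs = pipe(prompt, max_new_tokens=128, do_sample=True, temperature=0.6)
print(outputs[0]["generated_text"][len(prompt):])
```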