eval
#6
by
qianxiao1111
- opened
This view is limited to 50 files because it contains too many changes.
See the raw diff here.
- .gitattributes +4 -0
- added_tokens.json +3 -24
- config.json +3 -28
- evaluation/.gitignore +190 -0
- evaluation/README.md +181 -0
- evaluation/general_benchmarks/HumanEval/README.md +74 -0
- evaluation/general_benchmarks/HumanEval/data/humaneval-cpp +0 -0
- evaluation/general_benchmarks/HumanEval/data/humaneval-cpp.jsonl +3 -0
- evaluation/general_benchmarks/HumanEval/data/humaneval-cs +0 -0
- evaluation/general_benchmarks/HumanEval/data/humaneval-cs-bu.jsonl +3 -0
- evaluation/general_benchmarks/HumanEval/data/humaneval-cs.jsonl +3 -0
- evaluation/general_benchmarks/HumanEval/data/humaneval-d.jsonl +3 -0
- evaluation/general_benchmarks/HumanEval/data/humaneval-go.jsonl +3 -0
- evaluation/general_benchmarks/HumanEval/data/humaneval-java +0 -0
- evaluation/general_benchmarks/HumanEval/data/humaneval-java.jsonl +3 -0
- evaluation/general_benchmarks/HumanEval/data/humaneval-jl.jsonl +3 -0
- evaluation/general_benchmarks/HumanEval/data/humaneval-js.jsonl +3 -0
- evaluation/general_benchmarks/HumanEval/data/humaneval-lua.jsonl +3 -0
- evaluation/general_benchmarks/HumanEval/data/humaneval-php +0 -0
- evaluation/general_benchmarks/HumanEval/data/humaneval-php.jsonl +3 -0
- evaluation/general_benchmarks/HumanEval/data/humaneval-pl.jsonl +3 -0
- evaluation/general_benchmarks/HumanEval/data/humaneval-python.jsonl +3 -0
- evaluation/general_benchmarks/HumanEval/data/humaneval-r.jsonl +3 -0
- evaluation/general_benchmarks/HumanEval/data/humaneval-rb.jsonl +3 -0
- evaluation/general_benchmarks/HumanEval/data/humaneval-rkt.jsonl +3 -0
- evaluation/general_benchmarks/HumanEval/data/humaneval-rs.jsonl +3 -0
- evaluation/general_benchmarks/HumanEval/data/humaneval-scala.jsonl +3 -0
- evaluation/general_benchmarks/HumanEval/data/humaneval-sh +0 -0
- evaluation/general_benchmarks/HumanEval/data/humaneval-sh.jsonl +3 -0
- evaluation/general_benchmarks/HumanEval/data/humaneval-swift.jsonl +3 -0
- evaluation/general_benchmarks/HumanEval/data/humaneval-ts +0 -0
- evaluation/general_benchmarks/HumanEval/data/humaneval-ts.jsonl +3 -0
- evaluation/general_benchmarks/HumanEval/eval.sh +4 -0
- evaluation/general_benchmarks/HumanEval/eval_base_vllm.py +162 -0
- evaluation/general_benchmarks/HumanEval/eval_instruct.py +168 -0
- evaluation/general_benchmarks/HumanEval/eval_instruct_vllm.py +225 -0
- evaluation/general_benchmarks/HumanEval/eval_pal.py +62 -0
- evaluation/general_benchmarks/HumanEval/human_eval/__init__.py +0 -0
- evaluation/general_benchmarks/HumanEval/human_eval/data.py +48 -0
- evaluation/general_benchmarks/HumanEval/human_eval/evaluate_functional_correctness.py +32 -0
- evaluation/general_benchmarks/HumanEval/human_eval/evaluation.py +351 -0
- evaluation/general_benchmarks/HumanEval/human_eval/execution.py +817 -0
- evaluation/general_benchmarks/HumanEval/humaneval.py +217 -0
- evaluation/general_benchmarks/HumanEval/javatuples-1.2.jar +0 -0
- evaluation/general_benchmarks/HumanEval/test_config.yaml +15 -0
- evaluation/general_benchmarks/HumanEval/utils/dataset.py +72 -0
- evaluation/general_benchmarks/HumanEval/utils/utils.py +161 -0
- evaluation/general_benchmarks/MATH/LICENSE +21 -0
- evaluation/general_benchmarks/MATH/README.md +52 -0
- evaluation/general_benchmarks/MATH/data/aime24/test.jsonl +3 -0
.gitattributes
CHANGED
@@ -47,3 +47,7 @@ rng_state_2.pth filter=lfs diff=lfs merge=lfs -text
|
|
47 |
model-00003-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text
|
48 |
rng_state_4.pth filter=lfs diff=lfs merge=lfs -text
|
49 |
model-00002-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
47 |
model-00003-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text
|
48 |
rng_state_4.pth filter=lfs diff=lfs merge=lfs -text
|
49 |
model-00002-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text
|
50 |
+
*.csv filter=lfs diff=lfs merge=lfs -text
|
51 |
+
*.json filter=lfs diff=lfs merge=lfs -text
|
52 |
+
*.jsonl filter=lfs diff=lfs merge=lfs -text
|
53 |
+
*.sqlite filter=lfs diff=lfs merge=lfs -text
|
added_tokens.json
CHANGED
@@ -1,24 +1,3 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
"<|box_end|>": 151649,
|
5 |
-
"<|box_start|>": 151648,
|
6 |
-
"<|endoftext|>": 151643,
|
7 |
-
"<|file_sep|>": 151664,
|
8 |
-
"<|fim_middle|>": 151660,
|
9 |
-
"<|fim_pad|>": 151662,
|
10 |
-
"<|fim_prefix|>": 151659,
|
11 |
-
"<|fim_suffix|>": 151661,
|
12 |
-
"<|im_end|>": 151645,
|
13 |
-
"<|im_start|>": 151644,
|
14 |
-
"<|image_pad|>": 151655,
|
15 |
-
"<|object_ref_end|>": 151647,
|
16 |
-
"<|object_ref_start|>": 151646,
|
17 |
-
"<|quad_end|>": 151651,
|
18 |
-
"<|quad_start|>": 151650,
|
19 |
-
"<|repo_name|>": 151663,
|
20 |
-
"<|video_pad|>": 151656,
|
21 |
-
"<|vision_end|>": 151653,
|
22 |
-
"<|vision_pad|>": 151654,
|
23 |
-
"<|vision_start|>": 151652
|
24 |
-
}
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:58b54bbe36fc752f79a24a271ef66a0a0830054b4dfad94bde757d851968060b
|
3 |
+
size 605
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
config.json
CHANGED
@@ -1,28 +1,3 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
4 |
-
"Qwen2ForCausalLM"
|
5 |
-
],
|
6 |
-
"attention_dropout": 0.0,
|
7 |
-
"bos_token_id": 151643,
|
8 |
-
"eos_token_id": 151645,
|
9 |
-
"hidden_act": "silu",
|
10 |
-
"hidden_size": 3584,
|
11 |
-
"initializer_range": 0.02,
|
12 |
-
"intermediate_size": 18944,
|
13 |
-
"max_position_embeddings": 32768,
|
14 |
-
"max_window_layers": 28,
|
15 |
-
"model_type": "qwen2",
|
16 |
-
"num_attention_heads": 28,
|
17 |
-
"num_hidden_layers": 28,
|
18 |
-
"num_key_value_heads": 4,
|
19 |
-
"rms_norm_eps": 1e-06,
|
20 |
-
"rope_theta": 1000000.0,
|
21 |
-
"sliding_window": null,
|
22 |
-
"tie_word_embeddings": false,
|
23 |
-
"torch_dtype": "bfloat16",
|
24 |
-
"transformers_version": "4.44.2",
|
25 |
-
"use_cache": false,
|
26 |
-
"use_sliding_window": false,
|
27 |
-
"vocab_size": 152064
|
28 |
-
}
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:82c97ddf1f855ce947c97a62859a064efb2499cb7cbb82aa9c67bdc65b678c17
|
3 |
+
size 709
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evaluation/.gitignore
ADDED
@@ -0,0 +1,190 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
# Byte-compiled / optimized / DLL files
|
3 |
+
__pycache__/
|
4 |
+
*.py[cod]
|
5 |
+
*$py.class
|
6 |
+
*.sql
|
7 |
+
*.sqlite
|
8 |
+
*.splite
|
9 |
+
*.desc
|
10 |
+
*.txt
|
11 |
+
*.DS_Store
|
12 |
+
.DS_Store
|
13 |
+
!eval_retriever/data/*.json
|
14 |
+
!eval_retriever/preds/*.json
|
15 |
+
!reject_eval/*.json
|
16 |
+
!evalset/*/*.json
|
17 |
+
!evalset/*.json
|
18 |
+
|
19 |
+
# C extensions
|
20 |
+
*.so
|
21 |
+
|
22 |
+
# Distribution / packaging
|
23 |
+
.Python
|
24 |
+
build/
|
25 |
+
develop-eggs/
|
26 |
+
dist/
|
27 |
+
downloads/
|
28 |
+
eggs/
|
29 |
+
.eggs/
|
30 |
+
lib/
|
31 |
+
lib64/
|
32 |
+
parts/
|
33 |
+
sdist/
|
34 |
+
var/
|
35 |
+
wheels/
|
36 |
+
share/python-wheels/
|
37 |
+
*.egg-info/
|
38 |
+
.installed.cfg
|
39 |
+
*.egg
|
40 |
+
MANIFEST
|
41 |
+
|
42 |
+
# PyInstaller
|
43 |
+
# Usually these files are written by a python script from a template
|
44 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
45 |
+
*.manifest
|
46 |
+
*.spec
|
47 |
+
|
48 |
+
# Installer logs
|
49 |
+
pip-log.txt
|
50 |
+
pip-delete-this-directory.txt
|
51 |
+
|
52 |
+
# Unit test / coverage reports
|
53 |
+
htmlcov/
|
54 |
+
.tox/
|
55 |
+
.nox/
|
56 |
+
.coverage
|
57 |
+
.coverage.*
|
58 |
+
.cache
|
59 |
+
nosetests.xml
|
60 |
+
coverage.xml
|
61 |
+
*.cover
|
62 |
+
*.py,cover
|
63 |
+
.hypothesis/
|
64 |
+
.pytest_cache/
|
65 |
+
cover/
|
66 |
+
|
67 |
+
# Translations
|
68 |
+
*.mo
|
69 |
+
*.pot
|
70 |
+
|
71 |
+
# Django stuff:
|
72 |
+
*.log
|
73 |
+
local_settings.py
|
74 |
+
db.sqlite3
|
75 |
+
db.sqlite3-journal
|
76 |
+
|
77 |
+
# Flask stuff:
|
78 |
+
instance/
|
79 |
+
.webassets-cache
|
80 |
+
|
81 |
+
# Scrapy stuff:
|
82 |
+
.scrapy
|
83 |
+
|
84 |
+
# Sphinx documentation
|
85 |
+
docs/_build/
|
86 |
+
|
87 |
+
# PyBuilder
|
88 |
+
.pybuilder/
|
89 |
+
target/
|
90 |
+
|
91 |
+
# Jupyter Notebook
|
92 |
+
.ipynb_checkpoints
|
93 |
+
|
94 |
+
# IPython
|
95 |
+
profile_default/
|
96 |
+
ipython_config.py
|
97 |
+
|
98 |
+
# pyenv
|
99 |
+
# For a library or package, you might want to ignore these files since the code is
|
100 |
+
# intended to run in multiple environments; otherwise, check them in:
|
101 |
+
# .python-version
|
102 |
+
|
103 |
+
# pipenv
|
104 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
105 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
106 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
107 |
+
# install all needed dependencies.
|
108 |
+
#Pipfile.lock
|
109 |
+
|
110 |
+
# poetry
|
111 |
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
112 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
113 |
+
# commonly ignored for libraries.
|
114 |
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
115 |
+
#poetry.lock
|
116 |
+
|
117 |
+
# pdm
|
118 |
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
119 |
+
#pdm.lock
|
120 |
+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
121 |
+
# in version control.
|
122 |
+
# https://pdm.fming.dev/#use-with-ide
|
123 |
+
.pdm.toml
|
124 |
+
|
125 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
126 |
+
__pypackages__/
|
127 |
+
|
128 |
+
# Celery stuff
|
129 |
+
celerybeat-schedule
|
130 |
+
celerybeat.pid
|
131 |
+
|
132 |
+
# SageMath parsed files
|
133 |
+
*.sage.py
|
134 |
+
|
135 |
+
# Environments
|
136 |
+
.env
|
137 |
+
.venv
|
138 |
+
env/
|
139 |
+
venv/
|
140 |
+
ENV/
|
141 |
+
env.bak/
|
142 |
+
venv.bak/
|
143 |
+
|
144 |
+
# Spyder project settings
|
145 |
+
.spyderproject
|
146 |
+
.spyproject
|
147 |
+
|
148 |
+
# Rope project settings
|
149 |
+
.ropeproject
|
150 |
+
|
151 |
+
# mkdocs documentation
|
152 |
+
/site
|
153 |
+
|
154 |
+
# mypy
|
155 |
+
.mypy_cache/
|
156 |
+
.dmypy.json
|
157 |
+
dmypy.json
|
158 |
+
|
159 |
+
# Pyre type checker
|
160 |
+
.pyre/
|
161 |
+
|
162 |
+
# pytype static type analyzer
|
163 |
+
.pytype/
|
164 |
+
|
165 |
+
# Cython debug symbols
|
166 |
+
cython_debug/
|
167 |
+
|
168 |
+
# PyCharm
|
169 |
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
170 |
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
171 |
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
172 |
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
173 |
+
#.idea/
|
174 |
+
|
175 |
+
output/
|
176 |
+
images/
|
177 |
+
.vscode
|
178 |
+
vllm_encoder/
|
179 |
+
|
180 |
+
!table_related_benchmarks/evalset/bird_data/*.sql
|
181 |
+
!table_related_benchmarks/evalset/spider_data/*.sql
|
182 |
+
|
183 |
+
table_related_benchmarks/evalset/spider_data/test_database/*
|
184 |
+
table_related_benchmarks/evalset/bird_data/dev_databases/*
|
185 |
+
table_related_benchmarks/evalset/spider_data/dev_database/*
|
186 |
+
|
187 |
+
|
188 |
+
!table_related_benchmarks/evalset/spider_data/test_database/README.md
|
189 |
+
!table_related_benchmarks/evalset/bird_data/dev_databases/README.md
|
190 |
+
!table_related_benchmarks/evalset/spider_data/dev_database/README.md
|
evaluation/README.md
ADDED
@@ -0,0 +1,181 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Benchmarks evaluations for tablegpt
|
2 |
+
|
3 |
+
<p align="center">
|
4 |
+
<a href="#-About">🔥About</a> •
|
5 |
+
<a href="#-Usage">💻Usage</a> •
|
6 |
+
</p>
|
7 |
+
|
8 |
+
## About
|
9 |
+
|
10 |
+
</div>
|
11 |
+
|
12 |
+
This is a repo opened for evaluation on different table-related benchmarks for tablegpt.
|
13 |
+
|
14 |
+
Given the complexity of table QA tasks and the uncertainty of input instructions, we provide evaluation datasets and scripts for 7 capabilities:
|
15 |
+
|
16 |
+
- ✨Code correction based on tables
|
17 |
+
- ✨Refusal of ambiguous questions
|
18 |
+
- ✨Table & field recall in multi-table scenarios
|
19 |
+
- ✨Table QA output code executable
|
20 |
+
- ✨Table-Bench.
|
21 |
+
- ✨Text2Sql.
|
22 |
+
- ✨TableInstruct, which includes a series of table-related evaluation benchmarks.
|
23 |
+
|
24 |
+
In addition, we have integrated other general-ability benchmarks such as HumanEval, MBPP, and MMLU/CMMLU.
|
25 |
+
We have built an inference method that loads a model from a local path using vLLM as the backend, and defined a set of example prompt templates for the above benchmarks.
|
26 |
+
|
27 |
+
## Usage
|
28 |
+
|
29 |
+
</div>
|
30 |
+
</details>
|
31 |
+
|
32 |
+
⏬ To use this framework, please first install the repository from GitHub:
|
33 |
+
|
34 |
+
```shell
|
35 |
+
git clone https://github.com/tablegpt/tablegpt-eval
|
36 |
+
cd tablegpt-eval
|
37 |
+
pip install -r requirements.txt
|
38 |
+
```
|
39 |
+
|
40 |
+
</div>
|
41 |
+
</details>
|
42 |
+
|
43 |
+
[!Tips]
|
44 |
+
1. You can run all the benchmarks with the default params by running command `bash run_benchmarks.sh`.
|
45 |
+
2. If you want more control over the run parameters, refer to the command-line arguments of the corresponding Python script for each benchmark.
|
46 |
+
3. Download the .db files before running the text2sql evaluation scripts. The download URLs are listed in `table_related_benchmarks/evalset/bird_data/dev_databases/README.md` (Bird dev), `table_related_benchmarks/evalset/spider_data/dev_database/README.md` (Spider dev), and `table_related_benchmarks/evalset/spider_data/test_database/README.md` (Spider test).
|
47 |
+
|
48 |
+
|
49 |
+
### Code correction eval
|
50 |
+
|
51 |
+
We provide a non-executable eval dataset based on the Python language. Eval dataset path:
|
52 |
+
|
53 |
+
```python
|
54 |
+
table_related_benchmarks/evalset/code_correction_test/correction_set.json
|
55 |
+
```
|
56 |
+
|
57 |
+
We use the ***executable_pass_rate*** and ***absolute_match_rate*** of the corrected code in pass-1 to evaluate the model's code correction ability. You can perform code-correction evaluation by running the following Python command:
|
58 |
+
|
59 |
+
```bash
|
60 |
+
python table_related_benchmarks/run_code_correction_eval.py \
|
61 |
+
--model_path <EVAL MODEL PATH> \
|
62 |
+
--template <CHAT_TEMPLATE_NAME, support [llama3, baichuan, chatglm, None], default None> \
|
63 |
+
--eval_results_save_path <PATH TO SAVE THE EVAL RESULTS> \
|
64 |
+
--gpus_num <NUMBER OF GPU TO RUN INFERENCE> \
|
65 |
+
--temperature <ONE OF THE INFERENCE PARAMETER>
|
66 |
+
```
|
67 |
+
|
68 |
+
### Ambiguous reject eval
|
69 |
+
|
70 |
+
We provide 298 table-based queries, with a ratio of about 1:3 between queries marked as ambiguous (to be rejected) and queries that should be accepted and correctly answered. Dataset path:
|
71 |
+
|
72 |
+
```python
|
73 |
+
# test queries
|
74 |
+
evalset/reject_test/test_query.json
|
75 |
+
# queries with ground truth
|
76 |
+
evalset/reject_test/ground_truth.json
|
77 |
+
```
|
78 |
+
|
79 |
+
We use **accuracy**, **recall**, and **F1 score** as metrics to evaluate the LLM's ability in this task. You can perform reject evaluation by running the following Python command:
|
80 |
+
|
81 |
+
```bash
|
82 |
+
python table_related_benchmarks/run_reject_eval.py \
|
83 |
+
--model_path <EVAL MODEL PATH> \
|
84 |
+
--save_path <LLM OUTPUT CONTENT SAVE PATH> \
|
85 |
+
--gpus_num <NUMBER OF GPU TO RUN INFERENCE> \
|
86 |
+
--temperature <ONE OF THE INFERENCE PARAMETER>
|
87 |
+
```
|
88 |
+
|
89 |
+
### Table&Fields recall eval
|
90 |
+
|
91 |
+
The provided eval dataset path:
|
92 |
+
|
93 |
+
```python
|
94 |
+
table_related_benchmarks/evalset/retrieval_test/recall_set.json
|
95 |
+
```
|
96 |
+
|
97 |
+
We use a series of evaluation metrics such as **recall**, **precision**, **Jaccard similarity**, and **Hamming loss** to assess the LLM's performance in table and field retrieval tasks. You can perform recall evaluation by running the following Python command:
|
98 |
+
|
99 |
+
```bash
|
100 |
+
python table_related_benchmarks/run_recall_eval.py \
|
101 |
+
--model_path <EVAL MODEL PATH> \
|
102 |
+
--temperature <TEMPERATURE> \
|
103 |
+
--gpus_num <NUMBER OF GPU TO RUN INFERENCE>
|
104 |
+
```
|
105 |
+
|
106 |
+
### Table QA executable
|
107 |
+
|
108 |
+
We provide 2178 table-based queries. Eval dataset path:
|
109 |
+
|
110 |
+
```python
|
111 |
+
table_related_benchmarks/evalset/table_qa_execuate_test/tableqa_samples_with_paths.jsonl
|
112 |
+
```
|
113 |
+
|
114 |
+
We use the ***executable_pass_rate*** of pass-1 to evaluate the model's tableQA code generation ability. You can perform tableQA evaluation by running the following Python command:
|
115 |
+
|
116 |
+
```bash
|
117 |
+
python table_related_benchmarks/run_tableqa_execution_eval.py \
|
118 |
+
--model_path <EVAL MODEL PATH> \
|
119 |
+
--temperature <ONE OF THE INFERENCE PARAMETER> \
|
120 |
+
--gpus_num <NUMBER OF GPU TO RUN INFERENCE>
|
121 |
+
```
|
122 |
+
|
123 |
+
### TableBench evaluation
|
124 |
+
|
125 |
+
The provided eval dataset path:
|
126 |
+
|
127 |
+
```python
|
128 |
+
table_related_benchmarks/evalset/TableBench
|
129 |
+
```
|
130 |
+
|
131 |
+
In the evaluation of TableBench, ROUGE-L is used to assess general QA questions, while pass@1 is used as the evaluation metric for visualization-type samples. You can perform TableBench evaluation by running the following command:
|
132 |
+
|
133 |
+
```bash
|
134 |
+
python table_related_benchmarks/run_table_bench_eval.py \
|
135 |
+
--model_path <EVAL MODEL PATH> \
|
136 |
+
--temperature <ONE OF THE INFERENCE PARAMETER> \
|
137 |
+
--gpus_num <NUMBER OF GPU TO RUN INFERENCE>
|
138 |
+
```
|
139 |
+
|
140 |
+
### TableInstruct
|
141 |
+
|
142 |
+
The provided eval dataset path:
|
143 |
+
|
144 |
+
```python
|
145 |
+
table_related_benchmarks/evalset/TableInstruct
|
146 |
+
```
|
147 |
+
|
148 |
+
You can perform TableInstruct evaluation by the following command:
|
149 |
+
|
150 |
+
```bash
|
151 |
+
python table_related_benchmarks/run_table_instruct_eval.py \
|
152 |
+
--model_path <EVAL MODEL PATH> \
|
153 |
+
--temperature <ONE OF THE INFERENCE PARAMETER> \
|
154 |
+
--gpus_num <NUMBER OF GPU TO RUN INFERENCE>
|
155 |
+
```
|
156 |
+
|
157 |
+
### Text2Sql
|
158 |
+
```bash
|
159 |
+
python table_related_benchmarks/run_text2sql_eval.py --model_path <EVAL MODEL PATH>
|
160 |
+
```
|
161 |
+
|
162 |
+
### HumanEval
|
163 |
+
Perform HumanEval evaluation by the following command:
|
164 |
+
|
165 |
+
```bash
|
166 |
+
python general_benchmarks/HumanEval/eval_instruct_vllm.py --model_path <EVAL MODEL PATH>
|
167 |
+
```
|
168 |
+
|
169 |
+
### MBPP
|
170 |
+
Perform MBPP evaluation by the following command:
|
171 |
+
|
172 |
+
```bash
|
173 |
+
python general_benchmarks/MBPP/eval_instruct_vllm.py --model_path <EVAL MODEL PATH>
|
174 |
+
```
|
175 |
+
|
176 |
+
### MMLU & CMMLU
|
177 |
+
|
178 |
+
```bash
|
179 |
+
python general_benchmarks/MMLU/evaluator.py --task <mmlu or cmmlu> --lang <en or zh> --model_path <EVAL MODEL PATH>
|
180 |
+
```
|
181 |
+
|
evaluation/general_benchmarks/HumanEval/README.md
ADDED
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
## 1. Introduction
|
2 |
+
|
3 |
+
We provide a test script to evaluate the performance of the **deepseek-coder** model on code generation benchmarks. We select the widely-used benchmarks: **[HumanEval-Python](https://huggingface.co/datasets/openai_humaneval), [HumanEval-Multilingual](https://huggingface.co/datasets/nuprl/MultiPL-E)**.
|
4 |
+
|
5 |
+
|
6 |
+
|
7 |
+
## 2. Setup
|
8 |
+
|
9 |
+
```
|
10 |
+
pip install accelerate
|
11 |
+
pip install attrdict
|
12 |
+
pip install transformers
|
13 |
+
pip install torch
|
14 |
+
```
|
15 |
+
|
16 |
+
|
17 |
+
## 3. Evaluation
|
18 |
+
|
19 |
+
We've created a sample script, **eval.sh**, that demonstrates how to test the **DeepSeek-Coder-1.3b-Base** model on the HumanEval dataset leveraging **8** GPUs. If your use case involves a different model or dataset, simply adjust the script to fit your needs.
|
20 |
+
|
21 |
+
Additionally, the paths to compilers and interpreters may differ for each programming language. Please make sure you update the appropriate paths in the **human_eval/execution.py** file accordingly.
|
22 |
+
|
23 |
+
```bash
|
24 |
+
MODEL_NAME_OR_PATH="deepseek-ai/deepseek-coder-1.3b-base"
|
25 |
+
DATASET_ROOT="data/"
|
26 |
+
LANGUAGE="python"
|
27 |
+
python -m accelerate.commands.launch --config_file test_config.yaml eval_pal.py --logdir ${MODEL_NAME_OR_PATH} --language ${LANGUAGE} --dataroot ${DATASET_ROOT}
|
28 |
+
```
|
29 |
+
|
30 |
+
To evaluate the instruction-based model, please follow the script below:
|
31 |
+
```bash
|
32 |
+
LANG="python"
|
33 |
+
OUPUT_DIR="output"
|
34 |
+
MODEL="deepseek-coder-33b-instruct"
|
35 |
+
|
36 |
+
CUDA_VISIBLE_DEVICES=0,1 python eval_instruct.py \
|
37 |
+
--model "deepseek-ai/$MODEL" \
|
38 |
+
--output_path "$OUPUT_DIR/${LANG}.$MODEL.jsonl" \
|
39 |
+
--language $LANG \
|
40 |
+
--temp_dir $OUPUT_DIR
|
41 |
+
```
|
42 |
+
|
43 |
+
## 4. Experimental Results
|
44 |
+
|
45 |
+
We report experimental results here for 8 main-stream programming languages, **python**, **c++**, **java**, **PHP**, **TypeScript**, **C#**, **Bash**, and **JavaScript**. For all open-source models, we utilize this repository to obtain the performance of the models on the HumanEval dataset. We set the maximum input length to **4096** and the maximum output length to **500**, and employ the **greedy search strategy**.
|
46 |
+
|
47 |
+
|
48 |
+
#### (1) Multilingual Base Models
|
49 |
+
|
50 |
+
| Model | Size | Python | C++ | Java | PHP | TS | C# | Bash | JS | Avg |
|
51 |
+
|-------------------|------|--------|-------|------|------|------|------|------|------|------|
|
52 |
+
| code-cushman-001 | 12B | 33.5% | 31.9% | 30.6%| 28.9%| 31.3%| 22.1%| 11.7%| - | - |
|
53 |
+
| CodeShell | 7B | 35.4% | 32.9% | 34.2%| 31.7%| 30.2%| 38.0%| 7.0% | 33.5%| 30.4%|
|
54 |
+
| CodeGeeX2 | 6B | 36.0% | 29.2% | 25.9%| 23.6%| 20.8%| 29.7%| 6.3% | 24.8%| 24.5%|
|
55 |
+
| StarCoderBase | 16B | 31.7% | 31.1% | 28.5%| 25.4%| 34.0%| 34.8%| 8.9% | 29.8%| 28.0%|
|
56 |
+
| CodeLLama | 7B | 31.7% | 29.8% | 34.2%| 23.6%| 36.5%| 36.7%| 12.0%| 29.2%| 29.2%|
|
57 |
+
| CodeLLama | 13B | 36.0% | 37.9% | 38.0%| 34.2%| 45.2%| 43.0%| 16.5%| 32.3%| 35.4%|
|
58 |
+
| CodeLLama | 34B | 48.2% | 44.7% | 44.9%| 41.0%| 42.1%| 48.7%| 15.8%| 42.2%| 41.0%|
|
59 |
+
| | | | | | | | | | | |
|
60 |
+
| DeepSeek-Coder-Base| 1.3B | 34.8% | 31.1% | 32.3%| 24.2%| 28.9%| 36.7%| 10.1%| 28.6%| 28.3%|
|
61 |
+
| DeepSeek-Coder-Base| 5.7B | 48.7% | 45.3% | 41.1%| 39.7%| 44.7%| 41.1%| 27.8%| 42.2%| 41.3%|
|
62 |
+
| DeepSeek-Coder-Base| 6.7B | 49.4% | 50.3% | 43.0%| 38.5%| 49.7%| 50.0%| 28.5%| 48.4%| 44.7%|
|
63 |
+
| DeepSeek-Coder-Base|33B | **56.1%** | **58.4%** | **51.9%**| **44.1%**| **52.8%**| **51.3%**| **32.3%**| **55.3%**| **50.3%**|
|
64 |
+
|
65 |
+
#### (2) Instruction-Tuned Models
|
66 |
+
| Model | Size | Python | C++ | Java | PHP | TS | C# | Bash | JS | Avg |
|
67 |
+
|---------------------|------|--------|-------|------|------|------|------|------|------|------|
|
68 |
+
| GPT-3.5-Turbo | - | 76.2% | 63.4% | 69.2%| 60.9%| 69.1%| 70.8%| 42.4%| 67.1%| 64.9%|
|
69 |
+
| GPT-4 | - | **84.1%** | **76.4%** | **81.6%**| **77.2%**| **77.4%**| **79.1%**| **58.2%**| **78.0%**| **76.5%**|
|
70 |
+
| | | | | | | | | | | |
|
71 |
+
| DeepSeek-Coder-Instruct | 1.3B | 65.2% | 45.3% | 51.9% | 45.3% | 59.7% |55.1% | 12.7% | 52.2% | 48.4% |
|
72 |
+
| DeepSeek-Coder-Instruct | 6.7B | 78.9% | 63.4% | 68.4% | 68.9%| 67.2%| 72.8%| 36.7%| 72.7%| 66.1%|
|
73 |
+
| DeepSeek-Coder-Instruct | 33B | **79.3%** | **68.9%** | **73.4%** | **72.7%**| **67.9%**| **74.1%**| **43.0%**| **73.9%**| **69.2%**|
|
74 |
+
|
evaluation/general_benchmarks/HumanEval/data/humaneval-cpp
ADDED
The diff for this file is too large to render.
See raw diff
|
|
evaluation/general_benchmarks/HumanEval/data/humaneval-cpp.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8717eabdf202137158c84506144b0fb1e73d5ecccbe5363ec79009ca014df629
|
3 |
+
size 388688
|
evaluation/general_benchmarks/HumanEval/data/humaneval-cs
ADDED
The diff for this file is too large to render.
See raw diff
|
|
evaluation/general_benchmarks/HumanEval/data/humaneval-cs-bu.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d281b53b24e0f44cb76f1b1a8702b1ca668ff2a29c7621276ee8b658f5c124c6
|
3 |
+
size 448701
|
evaluation/general_benchmarks/HumanEval/data/humaneval-cs.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1c3fe18be10addc2d0b96311f4db192ae3232e08628d17768d889d6ab87be224
|
3 |
+
size 452021
|
evaluation/general_benchmarks/HumanEval/data/humaneval-d.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:226938c015f90d713a3e30d8f174f4a6d2c88820cf50512379a16890dda70332
|
3 |
+
size 289365
|
evaluation/general_benchmarks/HumanEval/data/humaneval-go.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7f5dfe3ff001049b1221e9d21a8119b5cbc38eb87c97fefc5d57fa7adc1df888
|
3 |
+
size 432325
|
evaluation/general_benchmarks/HumanEval/data/humaneval-java
ADDED
The diff for this file is too large to render.
See raw diff
|
|
evaluation/general_benchmarks/HumanEval/data/humaneval-java.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b21b015763452ec4f9f4e0e6425148ec331ace4da1232c8b4d441186185f6265
|
3 |
+
size 454059
|
evaluation/general_benchmarks/HumanEval/data/humaneval-jl.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ef21bd920889c9c1ab0e87a825cc26bd895e7a715f569f8ba7de577f870b6815
|
3 |
+
size 268754
|
evaluation/general_benchmarks/HumanEval/data/humaneval-js.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:634e2eee8d6e22de07c121b972207683dd96be76256bf44bdfc1a3386b739287
|
3 |
+
size 297853
|
evaluation/general_benchmarks/HumanEval/data/humaneval-lua.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d2e91f3603f6aee63db2c2d5754d165397603a8c9bd6130842af7988b27a96fc
|
3 |
+
size 298314
|
evaluation/general_benchmarks/HumanEval/data/humaneval-php
ADDED
The diff for this file is too large to render.
See raw diff
|
|
evaluation/general_benchmarks/HumanEval/data/humaneval-php.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bebd86875b1d8e65a8b7e692ed7bdf64612b44aa388825ea4dab40c7047c786b
|
3 |
+
size 388096
|
evaluation/general_benchmarks/HumanEval/data/humaneval-pl.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:949048bff3eaea7ae47cd5759042c088a7c227d53c74fe95a80728fd5aefbf77
|
3 |
+
size 437506
|
evaluation/general_benchmarks/HumanEval/data/humaneval-python.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0eae07adadbb00d51962fdb78b9e2a26bfa8ade85dc54eb57cae9bffed2f5c54
|
3 |
+
size 342974
|
evaluation/general_benchmarks/HumanEval/data/humaneval-r.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c2a3268ea54d2bfb8ce65bb2c808437ac0d9934c6caf99fcf75d6b6a4fb3f911
|
3 |
+
size 311904
|
evaluation/general_benchmarks/HumanEval/data/humaneval-rb.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4204395b835019e32d55513325a0fad01c6d382fd1eb97b516f0458d00058302
|
3 |
+
size 312806
|
evaluation/general_benchmarks/HumanEval/data/humaneval-rkt.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:083325daf55daf431ae6d6d17cf017afb18a6ec790d4c841b7c2b4752c5807ff
|
3 |
+
size 312006
|
evaluation/general_benchmarks/HumanEval/data/humaneval-rs.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e33f562838e973b7e6b8d76dbc1fb84b076d7426629b4cdc624f12678778d2fa
|
3 |
+
size 306470
|
evaluation/general_benchmarks/HumanEval/data/humaneval-scala.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:482b72a16613a563755d642e37792da68471399789b069fba5b1249e831445f3
|
3 |
+
size 384243
|
evaluation/general_benchmarks/HumanEval/data/humaneval-sh
ADDED
The diff for this file is too large to render.
See raw diff
|
|
evaluation/general_benchmarks/HumanEval/data/humaneval-sh.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2f702106e9ce4aa7385de568d9248d6c57c90382d782b162cb9e072fbd01ccf8
|
3 |
+
size 274180
|
evaluation/general_benchmarks/HumanEval/data/humaneval-swift.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e4418d6f532e4298969e727690cda80dc88b94e4abf836f8ff79ac18a737eaaa
|
3 |
+
size 344436
|
evaluation/general_benchmarks/HumanEval/data/humaneval-ts
ADDED
The diff for this file is too large to render.
See raw diff
|
|
evaluation/general_benchmarks/HumanEval/data/humaneval-ts.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4b1af1050c9f226335d54a8c80553c39454a525cace3e67bdcfcc9092ba02637
|
3 |
+
size 304732
|
evaluation/general_benchmarks/HumanEval/eval.sh
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Launch the HumanEval evaluation (HumanEval/eval_pal.py) through accelerate.
# All paths are relative, so run this from the directory containing HumanEval/.
MODEL_NAME_OR_PATH="/data3/models/DeepSeek/deepseek-coder-6.7b-base"
DATASET_ROOT="HumanEval/data"
LANGUAGE="python"

# Only GPUs 5, 6 and 7 are made visible to the launched processes.
# Expansions are quoted so paths containing spaces survive word-splitting.
CUDA_VISIBLE_DEVICES=5,6,7 python -m accelerate.commands.launch \
  --config_file HumanEval/test_config.yaml \
  HumanEval/eval_pal.py \
  --model_path "${MODEL_NAME_OR_PATH}" \
  --language "${LANGUAGE}" \
  --dataroot "${DATASET_ROOT}"
|
evaluation/general_benchmarks/HumanEval/eval_base_vllm.py
ADDED
@@ -0,0 +1,162 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import os
|
3 |
+
import time
|
4 |
+
from argparse import ArgumentParser
|
5 |
+
# from accelerate import Accelerator
|
6 |
+
# from accelerate import DistributedDataParallelKwargs
|
7 |
+
from pathlib import Path
|
8 |
+
|
9 |
+
import numpy as np
|
10 |
+
import torch
|
11 |
+
import torch.nn.functional as F
|
12 |
+
from human_eval.evaluation import evaluate_functional_correctness
|
13 |
+
from tqdm import tqdm
|
14 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
15 |
+
from utils.dataset import HumanEvalDataset
|
16 |
+
from utils.utils import cleanup_code
|
17 |
+
from vllm import LLM, SamplingParams
|
18 |
+
|
19 |
+
|
20 |
+
class HumanEval:
    """
    HumanEval evaluation driver that generates completions with a vLLM engine.

    Attributes set by the constructor:
        data_root: directory holding the ``humaneval-<language>.jsonl`` files.
        k: the ``k`` of the ``pass@k`` score that is printed.
        n_sample: forwarded to ``HumanEvalDataset`` as ``sample_num``.
        language: benchmark language code, e.g. ``"python"``.
        log_dir: directory for the intermediate generation log.
        sft: whether the evaluated model is an SFT model.
        inference_increment: stored but not read by this class.
    """

    def __init__(
        self,
        data_root,
        language="python",
        log_dir=None,
        issft=False,
        inference_increment=True,
        n_sample=1,
        k_sample=1,
    ):
        self.data_root = data_root
        self.k = k_sample
        self.n_sample = n_sample
        self.language = language
        self.log_dir = log_dir
        self.sft = issft
        self.inference_increment = inference_increment
        # log_dir defaults to None; os.makedirs(None) would raise TypeError,
        # so only create the directory when one was actually given.
        # eval_model() still asserts that log_dir is set before using it.
        if self.log_dir is not None:
            os.makedirs(self.log_dir, exist_ok=True)

    @torch.no_grad()
    def eval_model(self, args):
        """
        Evaluate the model on HumanEval.

        Args:
            args: parsed CLI namespace; only ``args.model_path`` is read.

        Generates one completion per dataset item with temperature 0, writes
        the results as JSON lines to ``<log_dir>/<language>.json`` and then
        scores them via ``_calculate_final_score``.
        """
        assert (
            self.log_dir is not None
        ), "log_dir should not be None when evaluating humaneval"
        dataset = HumanEvalDataset(
            self.data_root,
            sample_num=self.n_sample,
            language=self.language,
            issft=self.sft,
        )
        model_name_or_path = args.model_path
        print("model", model_name_or_path)
        tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
        print(
            "load tokenizer {} from {} over.".format(
                tokenizer.__class__, model_name_or_path
            )
        )

        llm = LLM(
            model=model_name_or_path,
            tensor_parallel_size=1,
            max_model_len=4096,
            trust_remote_code=True,
            enforce_eager=True,
        )
        sampling_params = SamplingParams(
            temperature=0,
            max_tokens=1024,
            top_p=0.95,
            stop_token_ids=[tokenizer.eos_token_id],
        )
        messages_list = []
        for i in range(len(dataset)):
            data = dataset[i]
            prompt = data["prompt"].strip()
            messages_list.append(prompt)
        outputs = llm.generate(messages_list, sampling_params=sampling_params)
        assert len(dataset) == len(outputs), "dataset and outputs different lengths."
        log_file = os.path.join(self.log_dir, f"{self.language}.json")
        # The context manager closes the log even if cleanup or JSON
        # serialisation raises midway through the loop.
        with open(log_file, "w") as tmpfile:
            for i, output in enumerate(tqdm(outputs)):
                data = dataset[i]
                completion = cleanup_code(
                    output.outputs[0].text,
                    self.language,
                    "humaneval",
                    self.sft,
                    dataset.stopwords,
                )
                # sft mode does not need the original prompt; base mode
                # prepends it so the generation is a complete program.
                if self.sft:
                    suffixprediction = completion
                else:
                    suffixprediction = data["original_prompt"] + "\n" + completion
                res = {
                    "task_id": data["task_id"],
                    "generation": suffixprediction,
                    "prompt": data["original_prompt"],
                }
                tmpfile.write(json.dumps(res) + "\n")

        # calculate the final score of pass@k
        self._calculate_final_score(log_file)
        return

    def _calculate_final_score(self, logfilepath):
        """
        Score the generations in ``logfilepath`` and print ``pass@k``.

        The log file is deleted once the score has been printed.
        """
        res = evaluate_functional_correctness(
            input_file=logfilepath,
            problem_file=os.path.join(
                self.data_root, f"humaneval-{self.language}.jsonl"
            ),
            tmp_dir=self.log_dir,
            language=self.language,
        )
        print("score is", res["pass@%d" % self.k])
        os.remove(logfilepath)
        return
|
129 |
+
|
130 |
+
|
131 |
+
if __name__ == "__main__":
    # Command-line entry point: parse options, build the evaluator, run it.
    parser = ArgumentParser()
    parser.add_argument("--logdir", type=str, default="")
    parser.add_argument(
        "--model_path",
        type=str,
        help="model name or path",
        default="/data0/pretrained-models/qwen2-7b",
    )
    parser.add_argument("--language", type=str, default="python")
    parser.add_argument("--dataroot", type=str, default="HumanEval/data")
    args = parser.parse_args()

    language = args.language
    # An empty --logdir selects the default scratch directory.
    logdir = args.logdir if args.logdir != "" else "output/tmp/"

    evaluator = HumanEval(
        data_root=args.dataroot,
        log_dir=logdir,
        n_sample=1,
        language=language,
    )
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    evaluator.eval_model(args)
|
evaluation/general_benchmarks/HumanEval/eval_instruct.py
ADDED
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
|
2 |
+
import json
|
3 |
+
import os
|
4 |
+
from pathlib import Path
|
5 |
+
|
6 |
+
import torch
|
7 |
+
from tqdm import tqdm
|
8 |
+
|
9 |
+
data_abs_dir = Path(__file__).parent / "data"
|
10 |
+
|
11 |
+
from human_eval.evaluation import evaluate_functional_correctness
|
12 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
13 |
+
from utils.utils import extract_generation_code, languge_settings
|
14 |
+
|
15 |
+
|
16 |
+
def build_deepseekcoder_instruction(languge: str, question: str):
    """
    Build the code-completion instruction sent to the chat model.

    Args:
        languge: language name; lower-cased and used as the code-fence tag.
        question: code to complete; surrounding whitespace is stripped.

    Returns:
        The instruction sentence followed by the code in a fenced block.
    """
    instruction = (
        "Please continue to complete the function. "
        "You are not allowed to modify the given code and do the completion only. "
        "Please return all completed function in a codeblock. "
        "Here is the given code to do completion:"
    )
    fence = "`" * 3
    return "\n".join(
        [instruction, fence + languge.lower(), question.strip(), fence]
    )
|
25 |
+
|
26 |
+
|
27 |
+
def generate_one(example, lang, tokenizer, model):
    """
    Generate a completion for one benchmark example.

    The example's prompt is wrapped in the completion instruction, rendered
    with the tokenizer's chat template and decoded without sampling. The
    decoded continuation is stored in ``example["output"]`` and the example is
    then passed through ``extract_generation_code``.
    """
    instruction = build_deepseekcoder_instruction(
        languge_settings[lang]["full_name"], example["prompt"]
    )
    conversation = [{"role": "user", "content": instruction}]
    inputs = tokenizer.apply_chat_template(
        conversation,
        return_tensors="pt",
        add_generation_prompt=True,
    ).to(model.device)

    # "<|EOT|>" serves as both the end-of-sequence and the padding token.
    stop_id = tokenizer.convert_tokens_to_ids("<|EOT|>")
    assert isinstance(stop_id, int), "Invalid tokenizer, EOT id not found"

    generation_kwargs = dict(
        max_new_tokens=1024,
        do_sample=False,
        pad_token_id=stop_id,
        eos_token_id=stop_id,
    )
    outputs = model.generate(inputs, **generation_kwargs)

    # Drop the prompt tokens so only the newly generated part is decoded.
    prompt_length = len(inputs[0])
    example["output"] = tokenizer.decode(
        outputs[0][prompt_length:], skip_special_tokens=True
    )

    return extract_generation_code(example, lang_code=lang)
|
54 |
+
|
55 |
+
|
56 |
+
def generate_main(args):
    """
    Generate completions for every HumanEval problem and score them.

    Reads ``args.model``, ``args.language``, ``args.output_path`` and
    ``args.temp_dir``. Generations are written as JSON lines to
    ``args.output_path`` and then evaluated for functional correctness.
    """
    model_name_or_path = args.model
    lang = args.language
    saved_path = args.output_path
    temp_dir = args.temp_dir
    os.makedirs(temp_dir, exist_ok=True)
    problem_file = os.path.join(data_abs_dir, f"humaneval-{lang}.jsonl")

    print("model", model_name_or_path)
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
    print(
        "load tokenizer {} from {} over.".format(
            tokenizer.__class__, model_name_or_path
        )
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_name_or_path,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        # use_flash_attention_2=True
    )
    model.eval()
    # Close the problem file deterministically instead of leaking the handle.
    with open(problem_file) as fp:
        examples = [json.loads(x) for x in fp if x.strip()]
    print("Read {} examples for evaluation over.".format(len(examples)))

    generated_examples = []
    for ex in tqdm(examples, desc="Generating"):
        gen_example = generate_one(ex, args.language, tokenizer, model)
        generated_examples.append(gen_example)

    print("Generate all over!!!")
    with open(saved_path, "w", encoding="utf-8") as fw:
        for ex in generated_examples:
            fw.write(json.dumps(ex) + "\n")
        print(
            "Save {} processed examples into {} over!".format(
                len(generated_examples), saved_path
            )
        )

    result = evaluate_functional_correctness(
        input_file=saved_path,
        tmp_dir=temp_dir,
        n_workers=8,
        timeout=3.0,
        problem_file=problem_file,
        language=lang,
    )
    print(lang, result, model_name_or_path)
|
106 |
+
|
107 |
+
|
108 |
+
def evaluation_only(args):
    """
    Re-score an existing generation file without running the model.

    Reads ``args.language``, ``args.temp_dir`` and ``args.output_path``. Each
    stored example is passed through ``extract_generation_code`` again, the
    result is written to ``<temp_dir>/<basename of output_path>`` and that
    file is evaluated for functional correctness.
    """
    lang = args.language
    temp_dir = args.temp_dir
    assert os.path.exists(args.output_path), "Not found output file: {}".format(
        args.output_path
    )
    os.makedirs(temp_dir, exist_ok=True)

    output_name = os.path.basename(args.output_path)
    # Close the input file deterministically instead of leaking the handle.
    with open(args.output_path) as fp:
        output_examples = [json.loads(x) for x in fp if x.strip()]

    processed_examples = [
        extract_generation_code(ex, lang) for ex in tqdm(output_examples, "Processing")
    ]
    processed_path = os.path.join(temp_dir, output_name)
    with open(processed_path, "w", encoding="utf-8") as fw:
        for ex in processed_examples:
            fw.write(json.dumps(ex) + "\n")
        print(
            "Save {} processed examples into {} over!".format(
                len(processed_examples), processed_path
            )
        )

    problem_file = os.path.join(data_abs_dir, f"humaneval-{lang}.jsonl")

    # evaluate_functional_correctness is already imported at module level.
    result = evaluate_functional_correctness(
        input_file=processed_path,
        tmp_dir=temp_dir,
        n_workers=8,
        timeout=3.0,
        problem_file=problem_file,
        language=lang,
    )
    print(lang, result)
|
144 |
+
|
145 |
+
|
146 |
+
if __name__ == "__main__":
    # Command-line entry point: generate completions, then score them.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model",
        type=str,
        help="model name or path",
        default="/data0/pretrained-models/deepseek-coder-6.7b-instruct",
    )
    # NOTE(review): this default is a user-specific absolute path; callers
    # presumably always override it - confirm.
    parser.add_argument(
        "--output_path",
        type=str,
        help="output path of your generation",
        default="/home/qyhuang/DeepSeek-Coder/outputs/deepseek-chat.json",
    )
    parser.add_argument("--language", type=str, help="language", default="python")
    parser.add_argument(
        "--temp_dir", type=str, help="temp dir for evaluation", default="tmp"
    )
    args = parser.parse_args()

    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    generate_main(args)
|
evaluation/general_benchmarks/HumanEval/eval_instruct_vllm.py
ADDED
@@ -0,0 +1,225 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import argparse
|
2 |
+
import json
|
3 |
+
import os
|
4 |
+
import shutil
|
5 |
+
from pathlib import Path
|
6 |
+
|
7 |
+
import torch
|
8 |
+
import transformers
|
9 |
+
from human_eval.evaluation import evaluate_functional_correctness
|
10 |
+
from tqdm import tqdm
|
11 |
+
from transformers import AutoTokenizer
|
12 |
+
from utils.utils import extract_generation_code, languge_settings
|
13 |
+
from vllm import LLM, SamplingParams
|
14 |
+
|
15 |
+
data_abs_dir = Path(__file__).parent / "data"
|
16 |
+
|
17 |
+
|
18 |
+
def build_deepseekcoder_instruction(languge: str, question: str):
    """
    Build the code-completion instruction sent to the chat model.

    Args:
        languge: language name; lower-cased and used as the code-fence tag.
        question: code to complete; surrounding whitespace is stripped.

    Returns:
        The instruction sentence followed by the code in a fenced block.
    """
    instruction = (
        "Please continue to complete the function. "
        "You are not allowed to modify the given code and do the completion only. "
        "Please return all completed function in a codeblock. "
        "Here is the given code to do completion:"
    )
    fence = "`" * 3
    return "\n".join(
        [instruction, fence + languge.lower(), question.strip(), fence]
    )
|
27 |
+
|
28 |
+
|
29 |
+
def create_dir(output_dir):
    """
    Make sure ``output_dir`` exists with mode 0o777.

    A missing directory is created. An existing writable directory is left
    untouched. An existing directory that is not writable is removed and
    recreated. Each outcome is reported on stdout.
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        os.chmod(output_dir, 0o777)
        print("makedir:", output_dir)
        return

    if os.access(output_dir, os.W_OK):
        print(f"{output_dir} exists!")
        return

    # Present but not writable: replace it with a fresh directory.
    shutil.rmtree(output_dir)
    os.makedirs(output_dir)
    os.chmod(output_dir, 0o777)
    print("not write permission, makedir:", output_dir)
|
42 |
+
|
43 |
+
|
44 |
+
def get_client_res(messages, example, output_key, open_ai_key=False):
    """
    Send one chat request and store the reply on ``example``.

    Args:
        messages: chat messages to send.
        example: dict updated in place and returned.
        output_key: key under which the reply text is stored.
        open_ai_key: when true, call the Azure OpenAI deployment using the
            ``OPENAI_API_KEY`` environment variable; otherwise call the
            OpenAI-compatible server at ``http://localhost:8080/v1``.

    Returns:
        ``example`` with ``example[output_key]`` set to the reply text, or to
        ``None`` if anything failed, and ``example["input"]`` set to
        ``messages``.
    """
    try:
        # Imported before branching so both branches see both client classes;
        # importing inside one branch leaves the other with an unbound name.
        from openai import AzureOpenAI, OpenAI

        if open_ai_key:
            try:
                api_key = os.environ["OPENAI_API_KEY"]
            except KeyError:
                print("环境变量 OPENAI_API_KEY 未设置")
                api_key = "default_value"

            client = AzureOpenAI(
                api_key=api_key,
                api_version="2024-07-01-preview",
                azure_endpoint="https://zju-tablegpt.openai.azure.com/",
            )
            chat_response = client.chat.completions.create(
                model="gpt-4o",
                # model="gpt-4o-mini",
                messages=messages,
                top_p=0.95,
                temperature=0,
                max_tokens=1024,
                timeout=40,
            )
        else:
            # Set OpenAI's API key and API base to use vLLM's API server.
            openai_api_key = "EMPTY"
            openai_api_base = "http://localhost:8080/v1"

            client = OpenAI(
                api_key=openai_api_key,
                base_url=openai_api_base,
            )
            chat_response = client.chat.completions.create(
                model="qwen2-7b-sft",
                messages=messages,
                top_p=0.3,
                temperature=0.1,
                max_tokens=1024,
            )
        example[output_key] = chat_response.choices[0].message.content
    except Exception as e:
        # Best effort: record the failure as None so the caller can retry.
        print(f"An unexpected error occurred: {e}")
        example[output_key] = None
    example["input"] = messages
    return example
|
90 |
+
|
91 |
+
|
92 |
+
|
93 |
+
def generate_main(args):
    """
    Generate completions for every HumanEval problem and score them.

    With ``args.api`` set, requests go through ``get_client_res`` in parallel;
    otherwise a local vLLM engine is built from ``args.model_path`` and
    ``args.gpus_num``. Results are written as JSON lines to
    ``<args.save_dir>/results_humaneval.json`` and evaluated for functional
    correctness, using ``args.temp_dir`` as scratch space.
    """
    model_name_or_path = args.model_path
    lang = args.language
    temp_dir = args.temp_dir
    create_dir(temp_dir)
    # os.makedirs(temp_dir, exist_ok=True)
    problem_file = os.path.join(data_abs_dir, f"humaneval-{lang}.jsonl")
    if not args.api:
        print("model", model_name_or_path)
        tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
        print(
            "load tokenizer {} from {} over.".format(
                tokenizer.__class__, model_name_or_path
            )
        )
        llm_args = {
            "model": model_name_or_path,
            "gpu_memory_utilization": 0.95,
            "trust_remote_code": True,
            "tensor_parallel_size": args.gpus_num,
            "dtype": "half",
            "max_model_len": 8192,
            "enforce_eager": True,
        }

        llm = LLM(**llm_args)
        sampling_params = SamplingParams(
            temperature=0,
            max_tokens=1024,
            top_p=0.95,
            stop_token_ids=[tokenizer.eos_token_id],
        )

    # Close the problem file deterministically instead of leaking the handle.
    with open(problem_file) as fp:
        examples = [json.loads(x) for x in fp if x.strip()]
    print("Read {} examples for evaluation over.".format(len(examples)))
    messages_list = []
    for example in tqdm(examples, desc="Generating"):
        prompt = build_deepseekcoder_instruction(
            languge_settings[lang]["full_name"], example["prompt"]
        )
        message = [{"role": "user", "content": prompt}]
        if args.api:
            # The API client takes the raw message list.
            messages_list.append(message)
        else:
            # vLLM takes text, so render the chat template to a string.
            messages_list.append(
                tokenizer.apply_chat_template(
                    message, tokenize=False, add_generation_prompt=True
                )
            )
    if args.api:
        from joblib import Parallel, delayed

        examples_ = Parallel(n_jobs=24)(
            delayed(get_client_res)(inp, examples[i], "output", open_ai_key=True)
            for i, inp in enumerate(tqdm(messages_list))
        )

        # Retry once any request that failed.
        # NOTE(review): a second failure leaves "output" as None - confirm
        # that extract_generation_code tolerates that.
        examples = []
        for example in examples_:
            if example["output"] is None:
                example = get_client_res(
                    example["input"], example, "output", open_ai_key=True
                )
            del example["input"]
            examples.append(example)

        generated_examples = []
        for example in examples:
            example = extract_generation_code(example, lang_code=lang)
            generated_examples.append(example)
    else:
        outputs = llm.generate(messages_list, sampling_params=sampling_params)
        generated_examples = []
        for i, output in enumerate(tqdm(outputs)):
            output = output.outputs[0].text
            example = examples[i]
            example["output"] = output
            example = extract_generation_code(example, lang_code=lang)
            generated_examples.append(example)

    print("Generate all over!!!")
    # os.makedirs(args.save_dir, exist_ok=True)
    create_dir(args.save_dir)
    saved_path = os.path.join(args.save_dir, "results_humaneval.json")
    with open(saved_path, "w", encoding="utf-8") as fw:
        for ex in generated_examples:
            fw.write(json.dumps(ex) + "\n")
        print(
            "Save {} processed examples into {} over!".format(
                len(generated_examples), saved_path
            )
        )

    result = evaluate_functional_correctness(
        input_file=saved_path,
        tmp_dir=temp_dir,
        n_workers=8,
        timeout=3.0,
        problem_file=problem_file,
        language=lang,
        out_path=saved_path,
    )
    print(lang, result, model_name_or_path)
|
196 |
+
|
197 |
+
|
198 |
+
if __name__ == "__main__":
    # Command-line entry point: parse options, seed, generate and score.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_path",
        type=str,
        help="model name or path",
        default="/data4/sft_output/qwen2-instruct-0709/checkpoint-1400",
    )
    parser.add_argument(
        "--gpus_num", type=int, default=1, help="the number of GPUs you want to use."
    )
    parser.add_argument(
        "--save_dir",
        type=str,
        help="output path of your generation",
        default="output",
    )
    parser.add_argument("--api", action="store_true", help="infer api type")
    parser.add_argument("--language", type=str, help="language", default="python")
    parser.add_argument(
        "--temp_dir", type=str, help="temp dir for evaluation", default="output/tmp"
    )
    parser.add_argument("--seed", type=int, help="seed", default=42)
    args = parser.parse_args()

    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    transformers.set_seed(args.seed)
    generate_main(args)
|
evaluation/general_benchmarks/HumanEval/eval_pal.py
ADDED
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import os
|
3 |
+
import subprocess
|
4 |
+
import sys
|
5 |
+
from argparse import ArgumentParser
|
6 |
+
from pathlib import Path
|
7 |
+
|
8 |
+
import numpy as np
|
9 |
+
import pandas as pd
|
10 |
+
import torch
|
11 |
+
import torch.distributed as dist
|
12 |
+
import torch.nn.functional as F
|
13 |
+
from accelerate import Accelerator, DistributedDataParallelKwargs
|
14 |
+
from humaneval import HumanEval as evaltor
|
15 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
16 |
+
|
17 |
+
if __name__ == "__main__":
    # The accelerator is created before argument parsing, as in the original.
    ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=True)
    accelerator = Accelerator(mixed_precision="bf16", kwargs_handlers=[ddp_kwargs])

    parser = ArgumentParser()
    parser.add_argument("--logdir", type=str, default="./output")
    parser.add_argument(
        "--model_path",
        type=str,
        default="/data3/models/DeepSeek/deepseek-coder-6.7b-base",
    )
    parser.add_argument("--language", type=str, default="python")
    parser.add_argument("--dataroot", type=str, default="HumanEval/data")
    args = parser.parse_args()

    language = args.language
    model_path = args.model_path
    dataroot = args.dataroot
    # An explicitly empty --logdir falls back to "tmp/".
    logdir = args.logdir if args.logdir != "" else "tmp/"

    # Tokenizer configuration handed to the evaluator.
    tokenizer = {"cls": AutoTokenizer, "model_path": model_path}

    evaluator = evaltor(
        data_root=dataroot,
        max_seq_len=4096,
        tokenizer_cfg=tokenizer,
        log_dir=logdir,
        n_sample=1,
        batch_size=1,
        language=language,
        max_gen_len=500,
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        device_map=accelerator.device,
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
    )
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    evaluator.eval_model(model, accelerator)
|
evaluation/general_benchmarks/HumanEval/human_eval/__init__.py
ADDED
File without changes
|
evaluation/general_benchmarks/HumanEval/human_eval/data.py
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gzip
|
2 |
+
import json
|
3 |
+
import os
|
4 |
+
from typing import Dict, Iterable
|
5 |
+
|
6 |
+
ROOT = os.path.dirname(os.path.abspath(__file__))
|
7 |
+
HUMAN_EVAL = os.path.join(ROOT, "..", "data", "HumanEval.jsonl.gz")
|
8 |
+
|
9 |
+
|
10 |
+
def read_problems(evalset_file: str = HUMAN_EVAL) -> Dict[str, Dict]:
    """
    Load an evaluation set and index its tasks by ``task_id``.

    If several tasks share a ``task_id``, the last one read wins.
    """
    problems = {}
    for task in stream_jsonl(evalset_file):
        problems[task["task_id"]] = task
    return problems
|
12 |
+
|
13 |
+
|
14 |
+
def stream_jsonl(filename: str) -> Iterable[Dict]:
    """
    Parses each jsonl line and yields it as a dictionary.

    Files whose name ends in ``.gz`` are decompressed on the fly. Both
    variants are decoded as UTF-8, and whitespace-only lines are skipped.
    """
    # Both openers accept the same (path, mode, encoding) arguments, which
    # guarantees the gzip path decodes as UTF-8 just like the plain path.
    open_text = gzip.open if filename.endswith(".gz") else open
    with open_text(filename, "rt", encoding="utf-8") as fp:
        for line in fp:
            if line.strip():
                yield json.loads(line)
|
29 |
+
|
30 |
+
|
31 |
+
def write_jsonl(filename: str, data: Iterable[Dict], append: bool = False):
    """
    Writes an iterable of dictionaries to jsonl.

    ``~`` in ``filename`` is expanded. The file is appended to when ``append``
    is true and truncated otherwise. Names ending in ``.gz`` are written
    gzip-compressed.
    """
    mode = "ab" if append else "wb"
    filename = os.path.expanduser(filename)

    def encoded_lines():
        # One UTF-8 encoded JSON document per line.
        for record in data:
            yield (json.dumps(record) + "\n").encode("utf-8")

    with open(filename, mode) as fp:
        if filename.endswith(".gz"):
            with gzip.GzipFile(fileobj=fp, mode="wb") as gzfp:
                for chunk in encoded_lines():
                    gzfp.write(chunk)
        else:
            for chunk in encoded_lines():
                fp.write(chunk)
|
evaluation/general_benchmarks/HumanEval/human_eval/evaluate_functional_correctness.py
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sys
|
2 |
+
|
3 |
+
import fire
|
4 |
+
|
5 |
+
from .data import HUMAN_EVAL
|
6 |
+
from .evaluation import evaluate_functional_correctness
|
7 |
+
|
8 |
+
|
9 |
+
def _parse_k(k):
    """Normalise the ``k`` CLI value into a list of ints."""
    if isinstance(k, str):
        return [int(part) for part in k.split(",")]
    if isinstance(k, int):
        return [k]
    return [int(part) for part in k]


def entry_point(
    sample_file: str,
    k: str = "1,10,100",
    n_workers: int = 4,
    timeout: float = 3.0,
    problem_file: str = "",
    is_mbpp: bool = False,
):
    """
    Evaluates the functional correctness of generated samples and prints the
    result returned by ``evaluate_functional_correctness``.

    Args:
        sample_file: path of the file holding the generated samples.
        k: comma-separated ``k`` values for pass@k, e.g. ``"1,10,100"``.
        n_workers: forwarded to the evaluator.
        timeout: forwarded to the evaluator.
        problem_file: forwarded to the evaluator.
        is_mbpp: forwarded to the evaluator.
    """
    # python-fire parses literals, so ``--k=1`` arrives as an int and
    # ``--k=1,10`` as a tuple rather than a string; accept all three forms.
    k = _parse_k(k)
    # NOTE(review): the arguments are passed positionally, while other callers
    # in this project use keywords (input_file, tmp_dir, ...) - confirm the
    # positional order matches evaluate_functional_correctness's signature.
    results = evaluate_functional_correctness(
        sample_file, k, n_workers, timeout, problem_file, is_mbpp
    )
    print(results)
|
26 |
+
|
27 |
+
|
28 |
+
def main():
    """Expose ``entry_point`` as a command-line interface through python-fire."""
    fire.Fire(component=entry_point)


# Runs unconditionally at import time; main() returns None, so exit status is 0.
sys.exit(main())
|
evaluation/general_benchmarks/HumanEval/human_eval/evaluation.py
ADDED
@@ -0,0 +1,351 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gzip
|
2 |
+
import itertools
|
3 |
+
import json
|
4 |
+
import os
|
5 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
6 |
+
from typing import *
|
7 |
+
|
8 |
+
import numpy as np
|
9 |
+
from tqdm.auto import tqdm
|
10 |
+
|
11 |
+
from human_eval.data import stream_jsonl
|
12 |
+
from human_eval.execution import check_correctness
|
13 |
+
|
14 |
+
# Per-language import / include / using lines, keyed by language code.
IMPORT_HELPER = {
    "python": [
        "import math",
        "import re",
        "import sys",
        "import copy",
        "import datetime",
        "import itertools",
        "import collections",
        "import heapq",
        "import functools",
        "import hashlib",
        "import numpy",
        "import numpy as np",
        "import string",
        "from typing import *",
        "from collections import *",
    ],
    "go": [
        "math",
        "strings",
        "fmt",
        "strconv",
        "time",
        "bytes",
        "regexp",
        "sort",
        "math/rand",
        "crypto/md5",
    ],
    "cpp": [
        "#include<stdlib.h>",
        "#include<algorithm>",
        "#include<math.h>",
        "#include<stdio.h>",
        "#include<vector>",
        "#include<string>",
        "#include<climits>",
        "#include<cstring>",
        "#include<iostream>",
        "#include<cassert>",
    ],
    # "using System.Collections.Generic;" was listed twice; one copy suffices.
    "cs": [
        "using System.Numerics;",
        "using System.Diagnostics;",
        "using System.Collections.Generic;",
        "using System.Linq;",
        "using System.Text;",
        "using System.Security.Cryptography;",
    ],
}
|
66 |
+
|
67 |
+
|
68 |
+
# Display name for each supported language code.
LANGUAGE_NAME = dict(
    cpp="CPP",
    go="Go",
    java="Java",
    js="JavaScript",
    python="Python",
)
|
75 |
+
|
76 |
+
|
77 |
+
def read_dataset(
    data_file: str = None,
    dataset_type: str = "humaneval",
    num_shot=None,
) -> Dict:
    """
    Reads a dataset and returns a dictionary of tasks keyed by ``task_id``.

    Args:
        data_file: path of the jsonl(.gz) file; when ``None`` the bundled
            ``../humaneval-x/python/data/humaneval_python.jsonl.gz`` path
            relative to this module is used.
        dataset_type: must contain ``"humaneval"`` (case-insensitive).
        num_shot: only reported on stdout when not ``None``.

    Raises:
        ValueError: if ``dataset_type`` is not a HumanEval variant.
    """
    if num_shot is not None:
        print(f"{num_shot}-shot setting...")

    if "humaneval" not in dataset_type.lower():
        # Raising a bare string is itself a TypeError; raise a real exception.
        raise ValueError(f"Dataset: {dataset_type} not supported.")

    if data_file is None:
        current_path = os.path.dirname(os.path.abspath(__file__))
        data_file = os.path.join(
            current_path,
            "..",
            "humaneval-x",
            "python",
            "data",
            "humaneval_python.jsonl.gz",
        )
    return {task["task_id"]: task for task in stream_jsonl(data_file)}
|
103 |
+
|
104 |
+
|
105 |
+
def estimate_pass_at_k(
    num_samples: Union[int, List[int], np.ndarray],
    num_correct: Union[List[int], np.ndarray],
    k: int,
) -> np.ndarray:
    """
    Estimates pass@k of each problem and returns them in an array.

    Args:
        num_samples: samples drawn per problem; a single int applies to every
            problem, otherwise one entry per problem.
        num_correct: number of correct samples per problem.
        k: the ``k`` in pass@k.
    """

    def pass_at_k(n: int, c: int, k: int) -> float:
        # 1 - comb(n - c, k) / comb(n, k), evaluated as a running product.
        if n - c < k:
            return 1.0
        return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

    if isinstance(num_samples, int):
        samples_per_problem = [num_samples] * len(num_correct)
    else:
        assert len(num_samples) == len(num_correct)
        samples_per_problem = num_samples

    estimates = []
    for n, c in zip(samples_per_problem, num_correct):
        estimates.append(pass_at_k(int(n), int(c), k))
    return np.array(estimates)
|
131 |
+
|
132 |
+
|
133 |
+
def process_humaneval_test(
    sample, problems, example_test=False, is_mbpp=False, language="python"
):
    """
    Builds the complete test program for one generated sample.

    Args:
        sample: Dict with at least ``task_id`` and ``generation``; ``prompt``
            is also required unless ``is_mbpp`` is set.
        problems: Dict mapping ``task_id`` to the problem record (provides
            ``test`` and optionally ``example_test``, plus language-specific
            fields such as ``import``/``test_setup``/``declaration``).
        example_test: Prefer the problem's non-empty ``example_test`` over
            ``test`` when available.
        is_mbpp: When true, return the generation followed by the problem's
            ``test`` entries joined by newlines, ignoring ``language``.
        language: Target language key selecting how the pieces are assembled.

    Returns:
        The source text to execute, or ``None`` when ``language`` is not one
        of the handled keys (callers skip samples whose test code is ``None``).
    """
    task_id = sample["task_id"]
    if is_mbpp:
        return sample["generation"] + "\n" + "\n".join(problems[task_id]["test"])

    prompt = sample["prompt"]
    if (
        example_test
        and "example_test" in problems[task_id]
        and problems[task_id]["example_test"] != ""
    ):
        test = problems[task_id]["example_test"]
    else:
        test = problems[task_id]["test"]
    code = sample["generation"]

    # Stays None for an unrecognised language instead of leaving the name
    # unbound, which previously surfaced as UnboundLocalError at the return.
    test_string = None

    # Pre-process for different languages
    if language == "python":
        test_setup = "\n".join(IMPORT_HELPER["python"]) + "\n"
        test_string = test_setup + code + "\n" + test + "\n"
    elif language == "cpp":
        # Only add the headers the prompt does not already include.
        test_set_up = ""
        for s in IMPORT_HELPER["cpp"]:
            if s not in prompt:
                test_set_up += s + "\n"
        test_string = test_set_up + "\n" + code + "\n" + test
    elif language == "java":
        test_string = code + "\n" + test
    elif language == "cs":
        test_set_up = ""
        for s in IMPORT_HELPER["cs"]:
            test_set_up += s + "\n"
        test_string = test_set_up + "\n" + code + "\n" + test
    elif language in ["js", "javascript", "ts", "sh", "go"]:
        test_string = code + "\n" + test
    elif language == "go232":
        import_string = problems[task_id]["import"]
        prompt = prompt.replace(import_string, "")
        if example_test and "example_test" in problems[task_id]:
            test = problems[task_id]["example_test"]
        else:
            test = problems[task_id]["test"]
        test_setup = problems[task_id]["test_setup"]
        # Import helper packages that the generated code references
        # (as "<name>.") but the test setup does not already pull in.
        other_pkgs = []
        for pkg in IMPORT_HELPER["go"]:
            if pkg not in test_setup:
                p = pkg.split("/")[-1]
                if p + "." in code:
                    other_pkgs.append(f'"{pkg}"')
        if other_pkgs:
            import_other_pkgs = (
                "import (\n" + " ".join([p + "\n" for p in other_pkgs]) + ")"
            )
            test_string = (
                test_setup
                + "\n"
                + import_other_pkgs
                + "\n"
                + prompt
                + code
                + "\n"
                + test
            )
        else:
            test_string = test_setup + "\n" + prompt + code + "\n" + test
    elif language == "rust":
        main = "\nfn main(){ \n } \n"
        declaration = problems[task_id]["declaration"]
        test_string = main + declaration + prompt + code + test
    elif language == "php":
        if code[:5] != "<?php":
            code = "<?php\n" + code
        test_string = code + "\n" + test + "?>"
    return test_string
|
212 |
+
|
213 |
+
|
214 |
+
def stream_jsonl_all(filename: str) -> Iterable[Dict]:
    """
    Reads an entire JSONL file into memory and returns its records as a list.

    Files whose name ends with ``.gz`` are transparently decompressed.
    Lines consisting only of whitespace are skipped.

    Args:
        filename: Path to a ``.jsonl`` or ``.jsonl.gz`` file.

    Returns:
        List with one decoded JSON value per non-blank line, in file order.
    """
    # Text-mode gzip.open on the path owns (and closes) the underlying file,
    # unlike wrapping a separately opened raw handle.
    opener = gzip.open if filename.endswith(".gz") else open
    mode = "rt" if filename.endswith(".gz") else "r"

    results = []
    # Context manager guarantees the handle is released even if a line
    # fails to parse.
    with opener(filename, mode) as fp:
        for line in fp:
            if line.strip():
                results.append(json.loads(line))
    return results
|
229 |
+
|
230 |
+
|
231 |
+
def evaluate_functional_correctness(
    input_file: str = None,
    tmp_dir: str = "./",
    n_workers: int = 32,
    timeout: float = 10.0,
    problem_file: str = "../data/humaneval_python.jsonl.gz",
    out_path: str = None,
    k: List[int] = [1, 10, 100],
    test_groundtruth: bool = False,
    example_test: bool = False,
    is_mbpp: bool = False,
    language: str = "python",
):
    """
    Evaluates the functional correctness of generated samples.

    Args:
        input_file: JSONL file of samples (each with ``task_id`` and
            ``generation``). Not read when ``test_groundtruth`` is set.
        tmp_dir: Base directory; each run uses ``<tmp_dir>/<lang>/evaluation``.
        n_workers: Size of the thread pool that runs ``check_correctness``.
        timeout: Per-sample timeout forwarded to ``check_correctness``.
        problem_file: Problem file passed to ``read_dataset``.
        out_path: When set, the per-task results are dumped there as JSON.
        k: The k values for which pass@k is reported. Never mutated here.
        test_groundtruth: Evaluate each problem's ``canonical_solution``
            instead of reading samples from ``input_file``.
        example_test: Forwarded to ``process_humaneval_test``.
        is_mbpp: Treat samples as MBPP-style (execution language is python).
        language: Language key used to build the test program.

    Returns:
        Dict mapping ``"pass@k"`` to the mean estimate. It is empty when not
        every problem received at least one sample; in that case only the
        total/correct counts are printed.

    Note:
        Results are stored per ``task_id``, so if several completions share a
        task id only the last one to finish is kept and each task counts as a
        single sample in the pass@k computation.
    """
    if example_test:
        print("Example test...")

    problems = read_dataset(problem_file, dataset_type="humaneval")

    with ThreadPoolExecutor(max_workers=n_workers) as executor:
        futures = []
        completion_id = Counter()
        results = {}

        if test_groundtruth:
            print("Testing ground truth...")
            for sample in tqdm(problems.values()):
                task_id = sample["task_id"]
                # Execution language comes from the task id prefix,
                # e.g. "Python/0" -> "python".
                lang = task_id.split("/")[0].lower()
                if lang == "javascript":
                    lang = "js"
                tmp_dir_ = os.path.join(tmp_dir, lang, "evaluation")
                sample["generation"] = sample["canonical_solution"]
                # Keyword arguments: the previous positional call put
                # `language` into the `is_mbpp` slot.
                sample["test_code"] = process_humaneval_test(
                    sample,
                    problems,
                    example_test=example_test,
                    is_mbpp=is_mbpp,
                    language=language,
                )
                if sample["test_code"] is None:
                    continue
                args = (
                    task_id,
                    sample,
                    lang,
                    timeout,
                    tmp_dir_,
                    completion_id[task_id],
                )
                futures.append(executor.submit(check_correctness, *args))
                completion_id[task_id] += 1
        else:
            print("Reading samples...")
            # Samples are loaded only on this path, so ground-truth runs do
            # not require an input file.
            for sample in tqdm(stream_jsonl_all(input_file)):
                task_id = sample["task_id"]
                if is_mbpp:
                    lang = "python"
                elif language == "javascript":
                    lang = "js"
                else:
                    lang = language
                tmp_dir_ = os.path.join(tmp_dir, lang, "evaluation")
                sample["test_code"] = process_humaneval_test(
                    sample,
                    problems,
                    example_test=example_test,
                    is_mbpp=is_mbpp,
                    language=language,
                )
                if sample["test_code"] is None:
                    continue
                if "completion_id" in sample:
                    completion_id_ = sample["completion_id"]
                else:
                    completion_id_ = completion_id[task_id]
                args = (task_id, sample, lang, timeout, tmp_dir_, completion_id_)
                futures.append(executor.submit(check_correctness, *args))
                completion_id[task_id] += 1

        # pass@k is only meaningful when every problem has a sample.
        evaluate_pass_at_k = len(completion_id) == len(problems)

        print("Running test suites...")
        for future in tqdm(as_completed(futures), total=len(futures)):
            result = future.result()
            results[result["task_id"]] = result

    # Calculate pass@k.
    total, correct = [], []
    for result in results.values():
        passed = [result["passed"]]
        total.append(len(passed))
        correct.append(sum(passed))
    total = np.array(total)
    correct = np.array(correct)

    # Bound up front so the return below is valid on both paths.
    pass_at_k = {}
    if evaluate_pass_at_k:
        pass_at_k = {
            f"pass@{cutoff}": estimate_pass_at_k(total, correct, cutoff).mean()
            for cutoff in k
            if (total >= cutoff).all()
        }
        print(pass_at_k)
    else:
        print("Total:", np.sum(total))
        print("Correct:", np.sum(correct))

    if out_path:
        with open(out_path, "w") as f:
            json.dump(list(results.values()), f, ensure_ascii=False)

    return pass_at_k
|
evaluation/general_benchmarks/HumanEval/human_eval/execution.py
ADDED
@@ -0,0 +1,817 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import contextlib
|
2 |
+
import faulthandler
|
3 |
+
import gzip
|
4 |
+
import io
|
5 |
+
import json
|
6 |
+
import multiprocessing
|
7 |
+
import os
|
8 |
+
import platform
|
9 |
+
import random
|
10 |
+
import signal
|
11 |
+
import subprocess
|
12 |
+
import tempfile
|
13 |
+
import traceback
|
14 |
+
from typing import *
|
15 |
+
|
16 |
+
# Optional path prefixes prepended to each toolchain's executable name when a
# command is built; an empty string means the tool is looked up as-is.
java_exec, node_exec, tsc_exec = "", "", ""
go_exec, php_exec, cs_exec = "", "", ""
|
22 |
+
|
23 |
+
|
24 |
+
def check_correctness(
|
25 |
+
task_id: str,
|
26 |
+
sample: dict,
|
27 |
+
language_type: str,
|
28 |
+
timeout: float = 3.0,
|
29 |
+
tmp_dir: str = None,
|
30 |
+
completion_id: Optional[int] = None,
|
31 |
+
) -> Dict:
|
32 |
+
"""
|
33 |
+
Evaluates the functional correctness of a completion by running the test
|
34 |
+
suite provided in the problem.
|
35 |
+
"""
|
36 |
+
|
37 |
+
def unsafe_execute(tmp_dir):
|
38 |
+
random_id = random.randint(1, 100000)
|
39 |
+
if "python" in language_type.lower():
|
40 |
+
with create_tempdir():
|
41 |
+
|
42 |
+
# These system calls are needed when cleaning up tempdir.
|
43 |
+
import os
|
44 |
+
import shutil
|
45 |
+
|
46 |
+
rmtree = shutil.rmtree
|
47 |
+
rmdir = os.rmdir
|
48 |
+
chdir = os.chdir
|
49 |
+
|
50 |
+
# Disable functionalities that can make destructive changes to the test.
|
51 |
+
reliability_guard()
|
52 |
+
|
53 |
+
try:
|
54 |
+
exec_globals = {}
|
55 |
+
with swallow_io():
|
56 |
+
with time_limit(timeout):
|
57 |
+
# WARNING
|
58 |
+
# This program exists to execute untrusted model-generated code. Although
|
59 |
+
# it is highly unlikely that model-generated code will do something overtly
|
60 |
+
# malicious in response to this test suite, model-generated code may act
|
61 |
+
# destructively due to a lack of model capability or alignment.
|
62 |
+
# Users are strongly encouraged to sandbox this evaluation suite so that it
|
63 |
+
# does not perform destructive actions on their host or network.
|
64 |
+
# Once you have read this disclaimer and taken appropriate precautions,
|
65 |
+
# uncomment the following line and proceed at your own risk:
|
66 |
+
exec(sample["test_code"], exec_globals)
|
67 |
+
result.append("passed")
|
68 |
+
except TimeoutException:
|
69 |
+
result.append("timed out")
|
70 |
+
except AssertionError as e:
|
71 |
+
result.append(f"failed: AssertionError")
|
72 |
+
except BaseException as e:
|
73 |
+
result.append(f"failed: {e}")
|
74 |
+
# print(sample["test_code"])
|
75 |
+
# print(result)
|
76 |
+
# Needed for cleaning up.
|
77 |
+
shutil.rmtree = rmtree
|
78 |
+
os.rmdir = rmdir
|
79 |
+
os.chdir = chdir
|
80 |
+
|
81 |
+
elif "go" in language_type.lower():
|
82 |
+
assert (
|
83 |
+
tmp_dir is not None
|
84 |
+
), "Go should be evaluated in a dir where necessary module files installed."
|
85 |
+
|
86 |
+
import os
|
87 |
+
import shutil
|
88 |
+
|
89 |
+
if "tmp" not in tmp_dir:
|
90 |
+
tmp_dir = os.path.join(tmp_dir, "tmp")
|
91 |
+
tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}")
|
92 |
+
if not os.path.exists(tmp_dir):
|
93 |
+
os.makedirs(tmp_dir)
|
94 |
+
origin_path = os.getcwd()
|
95 |
+
os.chdir(tmp_dir)
|
96 |
+
open(f"main_test.go", "w").write(sample["test_code"])
|
97 |
+
try:
|
98 |
+
exec_result = None
|
99 |
+
with time_limit(timeout):
|
100 |
+
# WARNING
|
101 |
+
# This program exists to execute untrusted model-generated code. Although
|
102 |
+
# it is highly unlikely that model-generated code will do something overtly
|
103 |
+
# malicious in response to this test suite, model-generated code may act
|
104 |
+
# destructively due to a lack of model capability or alignment.
|
105 |
+
# Users are strongly encouraged to sandbox this evaluation suite so that it
|
106 |
+
# does not perform destructive actions on their host or network.
|
107 |
+
# Once you have read this disclaimer and taken appropriate precautions,
|
108 |
+
# uncomment the following line and proceed at your own risk:
|
109 |
+
exec_result = subprocess.run(
|
110 |
+
[
|
111 |
+
f"{go_exec}go",
|
112 |
+
"test",
|
113 |
+
f"-timeout={timeout}s",
|
114 |
+
"main_test.go",
|
115 |
+
],
|
116 |
+
timeout=timeout,
|
117 |
+
capture_output=True,
|
118 |
+
)
|
119 |
+
|
120 |
+
if exec_result.returncode == 0:
|
121 |
+
result.append("passed")
|
122 |
+
else:
|
123 |
+
if exec_result.stderr:
|
124 |
+
try:
|
125 |
+
err = exec_result.stderr.decode()
|
126 |
+
except:
|
127 |
+
err = exec_result.stderr
|
128 |
+
else:
|
129 |
+
try:
|
130 |
+
err = exec_result.stdout.decode()
|
131 |
+
except:
|
132 |
+
err = exec_result.stdout
|
133 |
+
result.append(f"failed: {err}")
|
134 |
+
|
135 |
+
except TimeoutException:
|
136 |
+
result.append("timed out")
|
137 |
+
os.chdir(origin_path)
|
138 |
+
shutil.rmtree(tmp_dir)
|
139 |
+
elif "js" in language_type.lower():
|
140 |
+
import os
|
141 |
+
import shutil
|
142 |
+
|
143 |
+
if "tmp" not in tmp_dir:
|
144 |
+
tmp_dir = os.path.join(tmp_dir, "tmp")
|
145 |
+
tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}")
|
146 |
+
if not os.path.exists(tmp_dir):
|
147 |
+
os.makedirs(tmp_dir)
|
148 |
+
origin_path = os.getcwd()
|
149 |
+
os.chdir(tmp_dir)
|
150 |
+
open(f"test.js", "w").write(sample["test_code"])
|
151 |
+
try:
|
152 |
+
exec_result = None
|
153 |
+
with time_limit(timeout):
|
154 |
+
# WARNING
|
155 |
+
# This program exists to execute untrusted model-generated code. Although
|
156 |
+
# it is highly unlikely that model-generated code will do something overtly
|
157 |
+
# malicious in response to this test suite, model-generated code may act
|
158 |
+
# destructively due to a lack of model capability or alignment.
|
159 |
+
# Users are strongly encouraged to sandbox this evaluation suite so that it
|
160 |
+
# does not perform destructive actions on their host or network.
|
161 |
+
# Once you have read this disclaimer and taken appropriate precautions,
|
162 |
+
# uncomment the following line and proceed at your own risk:
|
163 |
+
exec_result = subprocess.run(
|
164 |
+
[f"{node_exec}node", "test.js"],
|
165 |
+
timeout=timeout,
|
166 |
+
capture_output=True,
|
167 |
+
)
|
168 |
+
|
169 |
+
if exec_result.stderr.decode():
|
170 |
+
err = exec_result.stderr.decode()
|
171 |
+
result.append(f"failed: {err}")
|
172 |
+
elif exec_result.stdout.decode():
|
173 |
+
err = exec_result.stdout.decode()
|
174 |
+
result.append(f"failed: {err}")
|
175 |
+
else:
|
176 |
+
result.append("passed")
|
177 |
+
|
178 |
+
except TimeoutException:
|
179 |
+
result.append("timed out")
|
180 |
+
os.chdir(origin_path)
|
181 |
+
shutil.rmtree(tmp_dir)
|
182 |
+
elif "cpp" in language_type.lower():
|
183 |
+
import os
|
184 |
+
import shutil
|
185 |
+
|
186 |
+
origin_path = os.getcwd()
|
187 |
+
if "tmp" not in tmp_dir:
|
188 |
+
tmp_dir = os.path.join(tmp_dir, "tmp")
|
189 |
+
tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}")
|
190 |
+
if not os.path.exists(tmp_dir):
|
191 |
+
os.makedirs(tmp_dir)
|
192 |
+
|
193 |
+
os.chdir(tmp_dir)
|
194 |
+
open(f"test.cpp", "w").write(sample["test_code"])
|
195 |
+
if "162" in task_id:
|
196 |
+
compilation_result = subprocess.run(
|
197 |
+
["/usr/bin/g++", "-std=c++17", "test.cpp", "-lcrypto", "-lssl"],
|
198 |
+
timeout=timeout,
|
199 |
+
capture_output=True,
|
200 |
+
)
|
201 |
+
else:
|
202 |
+
compilation_result = subprocess.run(
|
203 |
+
["/usr/bin/g++", "-std=c++17", "test.cpp"],
|
204 |
+
timeout=timeout,
|
205 |
+
capture_output=True,
|
206 |
+
)
|
207 |
+
if compilation_result.returncode != 0:
|
208 |
+
if compilation_result.stderr:
|
209 |
+
err = compilation_result.stderr.decode()
|
210 |
+
else:
|
211 |
+
err = compilation_result.stdout.decode()
|
212 |
+
result.append(f"failed: compilation error: {err}")
|
213 |
+
else:
|
214 |
+
try:
|
215 |
+
exec_result = None
|
216 |
+
with time_limit(timeout):
|
217 |
+
# WARNING
|
218 |
+
# This program exists to execute untrusted model-generated code. Although
|
219 |
+
# it is highly unlikely that model-generated code will do something overtly
|
220 |
+
# malicious in response to this test suite, model-generated code may act
|
221 |
+
# destructively due to a lack of model capability or alignment.
|
222 |
+
# Users are strongly encouraged to sandbox this evaluation suite so that it
|
223 |
+
# does not perform destructive actions on their host or network.
|
224 |
+
# Once you have read this disclaimer and taken appropriate precautions,
|
225 |
+
# uncomment the following line and proceed at your own risk:
|
226 |
+
exec_result = subprocess.run(
|
227 |
+
["./a.out"], timeout=timeout, capture_output=True
|
228 |
+
)
|
229 |
+
|
230 |
+
if exec_result.returncode == 0:
|
231 |
+
result.append("passed")
|
232 |
+
else:
|
233 |
+
if exec_result.stderr:
|
234 |
+
try:
|
235 |
+
err = exec_result.stderr.decode()
|
236 |
+
except:
|
237 |
+
err = exec_result.stderr
|
238 |
+
else:
|
239 |
+
try:
|
240 |
+
err = exec_result.stdout.decode()
|
241 |
+
except:
|
242 |
+
err = exec_result.stdout
|
243 |
+
result.append(f"failed: {err}")
|
244 |
+
except TimeoutException:
|
245 |
+
result.append("timed out")
|
246 |
+
# print(result[-1])
|
247 |
+
# print(sample["test_code"])
|
248 |
+
os.chdir(origin_path)
|
249 |
+
shutil.rmtree(tmp_dir)
|
250 |
+
elif "php" in language_type.lower():
|
251 |
+
import os
|
252 |
+
import shutil
|
253 |
+
|
254 |
+
origin_path = os.getcwd()
|
255 |
+
if "tmp" not in tmp_dir:
|
256 |
+
tmp_dir = os.path.join(tmp_dir, "tmp")
|
257 |
+
tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}")
|
258 |
+
if not os.path.exists(tmp_dir):
|
259 |
+
os.makedirs(tmp_dir)
|
260 |
+
|
261 |
+
os.chdir(tmp_dir)
|
262 |
+
open(f"test.php", "w").write(sample["test_code"])
|
263 |
+
try:
|
264 |
+
exec_result = None
|
265 |
+
with time_limit(timeout):
|
266 |
+
cmd = f"{php_exec}php -f test.php"
|
267 |
+
exec_result = subprocess.run(
|
268 |
+
cmd, timeout=timeout, capture_output=True, shell=True
|
269 |
+
)
|
270 |
+
|
271 |
+
if exec_result.returncode == 0:
|
272 |
+
result.append("passed")
|
273 |
+
else:
|
274 |
+
if exec_result.stderr:
|
275 |
+
try:
|
276 |
+
err = exec_result.stderr.decode()
|
277 |
+
except:
|
278 |
+
err = exec_result.stderr
|
279 |
+
else:
|
280 |
+
try:
|
281 |
+
err = exec_result.stdout.decode()
|
282 |
+
except:
|
283 |
+
err = exec_result.stdout
|
284 |
+
result.append(f"failed: {err}")
|
285 |
+
except TimeoutException:
|
286 |
+
result.append("timed out")
|
287 |
+
print(result[-1])
|
288 |
+
print(sample["test_code"])
|
289 |
+
os.chdir(origin_path)
|
290 |
+
shutil.rmtree(tmp_dir)
|
291 |
+
elif "sh" in language_type.lower():
|
292 |
+
import os
|
293 |
+
import shutil
|
294 |
+
|
295 |
+
origin_path = os.getcwd()
|
296 |
+
if "tmp" not in tmp_dir:
|
297 |
+
tmp_dir = os.path.join(tmp_dir, "tmp")
|
298 |
+
tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}")
|
299 |
+
if not os.path.exists(tmp_dir):
|
300 |
+
os.makedirs(tmp_dir)
|
301 |
+
|
302 |
+
os.chdir(tmp_dir)
|
303 |
+
open(f"test.sh", "w").write(sample["test_code"])
|
304 |
+
try:
|
305 |
+
exec_result = None
|
306 |
+
with time_limit(timeout):
|
307 |
+
cmd = "/bin/bash test.sh"
|
308 |
+
exec_result = subprocess.run(
|
309 |
+
cmd, timeout=10, capture_output=True, shell=True
|
310 |
+
)
|
311 |
+
|
312 |
+
if exec_result.returncode == 0:
|
313 |
+
result.append("passed")
|
314 |
+
else:
|
315 |
+
if exec_result.stderr:
|
316 |
+
try:
|
317 |
+
err = exec_result.stderr.decode()
|
318 |
+
except:
|
319 |
+
err = exec_result.stderr
|
320 |
+
else:
|
321 |
+
try:
|
322 |
+
err = exec_result.stdout.decode()
|
323 |
+
except:
|
324 |
+
err = exec_result.stdout
|
325 |
+
result.append(f"failed: {err}")
|
326 |
+
except TimeoutException:
|
327 |
+
result.append("timed out")
|
328 |
+
# print(result[-1])
|
329 |
+
# print(sample["test_code"])
|
330 |
+
os.chdir(origin_path)
|
331 |
+
shutil.rmtree(tmp_dir)
|
332 |
+
elif "ts" in language_type.lower():
|
333 |
+
import os
|
334 |
+
import shutil
|
335 |
+
|
336 |
+
origin_path = os.getcwd()
|
337 |
+
if "tmp" not in tmp_dir:
|
338 |
+
tmp_dir = os.path.join(tmp_dir, "tmp")
|
339 |
+
tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}")
|
340 |
+
if not os.path.exists(tmp_dir):
|
341 |
+
os.makedirs(tmp_dir)
|
342 |
+
|
343 |
+
os.chdir(tmp_dir)
|
344 |
+
env = {"PATH": f"{node_exec}:" + subprocess.os.environ["PATH"]}
|
345 |
+
open(f"test.ts", "w").write(sample["test_code"])
|
346 |
+
cmd = f"{tsc_exec}tsc test.ts --target ES2015 --lib ES2015,DOM"
|
347 |
+
compilation_result = subprocess.run(
|
348 |
+
cmd, timeout=timeout, capture_output=True, env=env, shell=True
|
349 |
+
)
|
350 |
+
if compilation_result.returncode != 0:
|
351 |
+
if compilation_result.stderr:
|
352 |
+
err = compilation_result.stderr.decode()
|
353 |
+
else:
|
354 |
+
err = compilation_result.stdout.decode()
|
355 |
+
result.append(f"failed: compilation error: {err}")
|
356 |
+
else:
|
357 |
+
try:
|
358 |
+
exec_result = None
|
359 |
+
with time_limit(timeout):
|
360 |
+
exec_result = subprocess.run(
|
361 |
+
[f"{node_exec}node", "test.js"],
|
362 |
+
timeout=timeout,
|
363 |
+
capture_output=True,
|
364 |
+
)
|
365 |
+
|
366 |
+
if exec_result.returncode == 0:
|
367 |
+
result.append("passed")
|
368 |
+
else:
|
369 |
+
if exec_result.stderr:
|
370 |
+
try:
|
371 |
+
err = exec_result.stderr.decode()
|
372 |
+
except:
|
373 |
+
err = exec_result.stderr
|
374 |
+
else:
|
375 |
+
try:
|
376 |
+
err = exec_result.stdout.decode()
|
377 |
+
except:
|
378 |
+
err = exec_result.stdout
|
379 |
+
result.append(f"failed: {err}")
|
380 |
+
except TimeoutException:
|
381 |
+
result.append("timed out")
|
382 |
+
if result[-1] != "passed":
|
383 |
+
env = {"PATH": f"{node_exec}:" + subprocess.os.environ["PATH"]}
|
384 |
+
cmd = f"{tsc_exec}tsc test.ts"
|
385 |
+
compilation_result = subprocess.run(
|
386 |
+
cmd, timeout=timeout, capture_output=True, env=env, shell=True
|
387 |
+
)
|
388 |
+
if compilation_result.returncode != 0:
|
389 |
+
if compilation_result.stderr:
|
390 |
+
err = compilation_result.stderr.decode()
|
391 |
+
else:
|
392 |
+
err = compilation_result.stdout.decode()
|
393 |
+
result[-1] = f"failed: compilation error: {err}"
|
394 |
+
else:
|
395 |
+
try:
|
396 |
+
exec_result = None
|
397 |
+
with time_limit(timeout):
|
398 |
+
exec_result = subprocess.run(
|
399 |
+
[f"{node_exec}node", "test.js"],
|
400 |
+
timeout=timeout,
|
401 |
+
capture_output=True,
|
402 |
+
)
|
403 |
+
|
404 |
+
if exec_result.returncode == 0:
|
405 |
+
result[-1] = "passed"
|
406 |
+
else:
|
407 |
+
if exec_result.stderr:
|
408 |
+
try:
|
409 |
+
err = exec_result.stderr.decode()
|
410 |
+
except:
|
411 |
+
err = exec_result.stderr
|
412 |
+
else:
|
413 |
+
try:
|
414 |
+
err = exec_result.stdout.decode()
|
415 |
+
except:
|
416 |
+
err = exec_result.stdout
|
417 |
+
result[-1] = f"failed: {err}"
|
418 |
+
except TimeoutException:
|
419 |
+
result[-1] = "timed out"
|
420 |
+
|
421 |
+
os.chdir(origin_path)
|
422 |
+
shutil.rmtree(tmp_dir)
|
423 |
+
elif "cs" in language_type.lower():
|
424 |
+
import os
|
425 |
+
import shutil
|
426 |
+
|
427 |
+
origin_path = os.getcwd()
|
428 |
+
if "tmp" not in tmp_dir:
|
429 |
+
tmp_dir = os.path.join(tmp_dir, "tmp")
|
430 |
+
tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}")
|
431 |
+
if not os.path.exists(tmp_dir):
|
432 |
+
os.makedirs(tmp_dir)
|
433 |
+
os.chdir(tmp_dir)
|
434 |
+
open(f"Program.cs", "w").write(sample["test_code"])
|
435 |
+
cmd = f"{cs_exec}mcs -d:DEBUG Program.cs"
|
436 |
+
compilation_result = subprocess.run(cmd, shell=True, capture_output=True)
|
437 |
+
if compilation_result.returncode != 0:
|
438 |
+
if compilation_result.stderr:
|
439 |
+
err = compilation_result.stderr.decode()
|
440 |
+
else:
|
441 |
+
err = compilation_result.stdout.decode()
|
442 |
+
result.append(f"failed: compilation error: {err}")
|
443 |
+
else:
|
444 |
+
try:
|
445 |
+
exec_result = None
|
446 |
+
cmd = f"{cs_exec}mono Program.exe"
|
447 |
+
env = dict(MONO_TRACE_LISTENER="Console.Error")
|
448 |
+
with time_limit(timeout):
|
449 |
+
exec_result = subprocess.run(
|
450 |
+
cmd,
|
451 |
+
timeout=timeout,
|
452 |
+
shell=True,
|
453 |
+
capture_output=True,
|
454 |
+
env=env,
|
455 |
+
)
|
456 |
+
|
457 |
+
if "Fail" not in exec_result.stderr.decode():
|
458 |
+
result.append("passed")
|
459 |
+
else:
|
460 |
+
if exec_result.stderr:
|
461 |
+
try:
|
462 |
+
err = exec_result.stderr.decode()
|
463 |
+
except:
|
464 |
+
err = exec_result.stderr
|
465 |
+
else:
|
466 |
+
try:
|
467 |
+
err = exec_result.stdout.decode()
|
468 |
+
except:
|
469 |
+
err = exec_result.stdout
|
470 |
+
result.append(f"failed: {err}")
|
471 |
+
except TimeoutException:
|
472 |
+
result.append("timed out")
|
473 |
+
except Exception as e:
|
474 |
+
result.append(f"failed: {e}")
|
475 |
+
os.chdir(origin_path)
|
476 |
+
shutil.rmtree(tmp_dir)
|
477 |
+
elif "rust" in language_type.lower():
|
478 |
+
import os
|
479 |
+
|
480 |
+
WD: str = os.path.dirname(os.path.abspath(__file__))
|
481 |
+
RUST_DIR: str = os.path.join(WD, "rust")
|
482 |
+
RUST_SRC: str = os.path.join(RUST_DIR, "src")
|
483 |
+
RUST_BIN: str = os.path.join(RUST_SRC, "bin")
|
484 |
+
RUST_TMP_DIR: str = os.path.join(RUST_DIR, "tmp")
|
485 |
+
RUST_LOGS: str = os.path.join(RUST_TMP_DIR, "logs")
|
486 |
+
RUST_EXT: str = ".rs"
|
487 |
+
|
488 |
+
# Create mandatory tmp directories
|
489 |
+
os.makedirs(RUST_TMP_DIR, exist_ok=True)
|
490 |
+
os.makedirs(RUST_LOGS, exist_ok=True)
|
491 |
+
os.makedirs(RUST_SRC, exist_ok=True)
|
492 |
+
os.makedirs(RUST_BIN, exist_ok=True)
|
493 |
+
|
494 |
+
with tempfile.NamedTemporaryFile(dir=RUST_BIN, delete=False) as f:
|
495 |
+
# temporal file name
|
496 |
+
file_prefix = sample["task_id"].lower().replace("/", "_")
|
497 |
+
file_name: str = file_prefix + RUST_EXT
|
498 |
+
|
499 |
+
os.rename(f.name, os.path.join(RUST_BIN, file_name))
|
500 |
+
|
501 |
+
# Sample to pure Rust function
|
502 |
+
rust_code: str = sample["test_code"]
|
503 |
+
|
504 |
+
# dump the rust source code in the target temporal file
|
505 |
+
f.write(rust_code.encode("utf-8"))
|
506 |
+
|
507 |
+
# Proceed towards Rust binaries compilation. Therefore move to Rust module root dir.
|
508 |
+
os.chdir(RUST_DIR)
|
509 |
+
|
510 |
+
# Two possible outcomes
|
511 |
+
# Pass OR Fail compilation
|
512 |
+
log_filename: str = file_prefix + ".jsonl"
|
513 |
+
log_path: str = os.path.join(RUST_LOGS, log_filename)
|
514 |
+
cargo_check: str = (
|
515 |
+
"cargo check --bin "
|
516 |
+
+ file_prefix
|
517 |
+
+ " --message-format json >> "
|
518 |
+
+ log_path
|
519 |
+
)
|
520 |
+
# Compilation build status
|
521 |
+
returned_val_compilation: int
|
522 |
+
|
523 |
+
# Overwrite file content
|
524 |
+
if os.path.exists(log_path):
|
525 |
+
if (file_size := os.path.getsize(log_path)) >= 0:
|
526 |
+
os.remove(log_path)
|
527 |
+
returned_val_compilation = os.system(cargo_check)
|
528 |
+
|
529 |
+
else:
|
530 |
+
returned_val_compilation = os.system(cargo_check)
|
531 |
+
|
532 |
+
# 0 means success
|
533 |
+
if returned_val_compilation == 0:
|
534 |
+
|
535 |
+
# Execution pipeline
|
536 |
+
cargo_test: str = (
|
537 |
+
"cargo test --bin "
|
538 |
+
+ file_prefix
|
539 |
+
+ " --message-format json >> "
|
540 |
+
+ log_path
|
541 |
+
)
|
542 |
+
returned_val_execution = os.system(cargo_test)
|
543 |
+
|
544 |
+
if returned_val_execution == 0:
|
545 |
+
result.append("passed")
|
546 |
+
else:
|
547 |
+
result.append(f"failed: execution error")
|
548 |
+
|
549 |
+
else:
|
550 |
+
result.append(f"failed: compilation error")
|
551 |
+
|
552 |
+
elif "java" in language_type.lower():
|
553 |
+
assert tmp_dir is not None, "Java should be evaluated in a temporary dir."
|
554 |
+
|
555 |
+
import os
|
556 |
+
import shutil
|
557 |
+
|
558 |
+
if "tmp" not in tmp_dir:
|
559 |
+
tmp_dir = os.path.join(tmp_dir, "tmp")
|
560 |
+
tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}")
|
561 |
+
if not os.path.exists(tmp_dir):
|
562 |
+
os.makedirs(tmp_dir)
|
563 |
+
open(os.path.join(tmp_dir, "Problem.java"), "w").write(sample["test_code"])
|
564 |
+
origin_path = os.getcwd()
|
565 |
+
os.system(f"cp ./javatuples-1.2.jar {tmp_dir}/")
|
566 |
+
os.chdir(tmp_dir)
|
567 |
+
res = "failed: unknown error"
|
568 |
+
compile_returncode = -1
|
569 |
+
for _ in range(5):
|
570 |
+
try:
|
571 |
+
cmd = f"{java_exec}javac -cp javatuples-1.2.jar Problem.java"
|
572 |
+
compilation_result = subprocess.run(
|
573 |
+
cmd, timeout=60, capture_output=True, shell=True
|
574 |
+
)
|
575 |
+
compile_returncode = compilation_result.returncode
|
576 |
+
break
|
577 |
+
except subprocess.TimeoutExpired as e:
|
578 |
+
continue
|
579 |
+
if compile_returncode != 0:
|
580 |
+
res = "failed: compilation error"
|
581 |
+
else:
|
582 |
+
exec_result = None
|
583 |
+
try:
|
584 |
+
# WARNING
|
585 |
+
# This program exists to execute untrusted model-generated code. Although
|
586 |
+
# it is highly unlikely that model-generated code will do something overtly
|
587 |
+
# malicious in response to this test suite, model-generated code may act
|
588 |
+
# destructively due to a lack of model capability or alignment.
|
589 |
+
# Users are strongly encouraged to sandbox this evaluation suite so that it
|
590 |
+
# does not perform destructive actions on their host or network.
|
591 |
+
# Once you have read this disclaimer and taken appropriate precautions,
|
592 |
+
# uncomment the following line and proceed at your own risk:
|
593 |
+
cmd = f"{java_exec}java -ea -cp .:javatuples-1.2.jar Problem"
|
594 |
+
exec_result = subprocess.run(
|
595 |
+
cmd, timeout=timeout, capture_output=True, shell=True
|
596 |
+
)
|
597 |
+
if exec_result.returncode == 0:
|
598 |
+
res = "passed"
|
599 |
+
elif exec_result.returncode == 1:
|
600 |
+
if "AssertionError" in exec_result.stderr.decode(
|
601 |
+
"unicode-escape"
|
602 |
+
):
|
603 |
+
res = "failed: wrong answer"
|
604 |
+
else:
|
605 |
+
res = f"failed: {exec_result.stderr.decode()}"
|
606 |
+
except subprocess.TimeoutExpired as e:
|
607 |
+
res = "time out"
|
608 |
+
except BaseException as e:
|
609 |
+
res = f"failed: {e}"
|
610 |
+
|
611 |
+
result.append(res)
|
612 |
+
os.chdir(origin_path)
|
613 |
+
shutil.rmtree(tmp_dir)
|
614 |
+
|
615 |
+
manager = multiprocessing.Manager()
|
616 |
+
result = manager.list()
|
617 |
+
|
618 |
+
p = multiprocessing.Process(target=unsafe_execute, args=(tmp_dir,))
|
619 |
+
p.start()
|
620 |
+
p.join(timeout=timeout + 1)
|
621 |
+
if p.is_alive():
|
622 |
+
p.kill()
|
623 |
+
|
624 |
+
if not result:
|
625 |
+
result.append("timed out")
|
626 |
+
|
627 |
+
return {
|
628 |
+
"task_id": task_id,
|
629 |
+
"completion_id": completion_id,
|
630 |
+
"result": result[0],
|
631 |
+
"passed": result[0] == "passed",
|
632 |
+
"finish": -1 if "finish" not in sample else sample["finish"],
|
633 |
+
"test_code": sample["test_code"],
|
634 |
+
"prompt": sample["prompt"],
|
635 |
+
# "canonical_solution" : sample["canonical_solution"],
|
636 |
+
# "test" : sample["test"],
|
637 |
+
# "text" : sample["text"],
|
638 |
+
# "output" : sample["output"],
|
639 |
+
# "generation" : sample["generation"],
|
640 |
+
}
|
641 |
+
|
642 |
+
|
643 |
+
# Copyright (c) OpenAI (https://openai.com)
|
644 |
+
|
645 |
+
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
646 |
+
# of this software and associated documentation files (the "Software"), to deal
|
647 |
+
# in the Software without restriction, including without limitation the rights
|
648 |
+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
649 |
+
# copies of the Software, and to permit persons to whom the Software is
|
650 |
+
# furnished to do so, subject to the following conditions:
|
651 |
+
|
652 |
+
# The above copyright notice and this permission notice shall be included in
|
653 |
+
# all copies or substantial portions of the Software.
|
654 |
+
|
655 |
+
|
656 |
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
657 |
+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
658 |
+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
659 |
+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
660 |
+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
661 |
+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
662 |
+
# THE SOFTWARE.
|
663 |
+
# ============================================================================
|
664 |
+
@contextlib.contextmanager
def time_limit(seconds: float):
    """Context manager that interrupts the enclosed block after ``seconds``.

    A real-time interval timer (``ITIMER_REAL``) is armed for ``seconds``;
    when it expires, the SIGALRM handler raises ``TimeoutException`` inside
    the block. The timer is always disarmed on exit, and the SIGALRM handler
    that was active before entering is put back.

    Args:
        seconds: Wall-clock budget for the enclosed block.

    Raises:
        TimeoutException: Inside the ``with`` body, if the timer expires.
    """

    def signal_handler(signum, frame):
        raise TimeoutException("Timed out!")

    # Install the handler BEFORE arming the timer. In the opposite order a
    # very short timeout can deliver SIGALRM while the default action
    # (terminate the process) is still in effect.
    previous_handler = signal.signal(signal.SIGALRM, signal_handler)
    signal.setitimer(signal.ITIMER_REAL, seconds)
    try:
        yield
    finally:
        # Disarm first so no alarm can fire while the handler is swapped back.
        signal.setitimer(signal.ITIMER_REAL, 0)
        # signal.signal() returns None when the old handler was not installed
        # from Python; None cannot be passed back in, so leave ours in place.
        if previous_handler is not None:
            signal.signal(signal.SIGALRM, previous_handler)
|
675 |
+
|
676 |
+
|
677 |
+
@contextlib.contextmanager
def swallow_io():
    """Silence stdout/stderr and make stdin unreadable for the enclosed block.

    All three standard streams are pointed at one ``WriteOnlyStringIO``
    instance: anything written is captured and discarded with the buffer,
    and the buffer's read methods raise.
    """
    sink = WriteOnlyStringIO()
    # Same nesting order as three stacked ``with`` statements:
    # stdout, then stderr, then stdin.
    with contextlib.redirect_stdout(sink), contextlib.redirect_stderr(
        sink
    ), redirect_stdin(sink):
        yield
|
684 |
+
|
685 |
+
|
686 |
+
@contextlib.contextmanager
def create_tempdir():
    """Create a temporary directory, make it the cwd, and yield its path.

    On exit the previous working directory is restored (by ``chdir``) and the
    directory is removed (by ``tempfile.TemporaryDirectory``).
    """
    with tempfile.TemporaryDirectory() as workdir, chdir(workdir):
        yield workdir
|
691 |
+
|
692 |
+
|
693 |
+
class TimeoutException(Exception):
    """Raised by the SIGALRM handler in ``time_limit`` when the time budget expires."""

    pass
|
695 |
+
|
696 |
+
|
697 |
+
class WriteOnlyStringIO(io.StringIO):
    """In-memory text buffer that accepts writes but refuses every read.

    ``read``, ``readline`` and ``readlines`` raise ``IOError``; ``readable``
    reports ``False``. Everything else is inherited from ``io.StringIO``.
    """

    def read(self, *args, **kwargs):
        """Always fail: this stream is write-only."""
        raise IOError

    def readline(self, *args, **kwargs):
        """Always fail: this stream is write-only."""
        raise IOError

    def readlines(self, *args, **kwargs):
        """Always fail: this stream is write-only."""
        raise IOError

    def readable(self, *args, **kwargs):
        """Report that the stream cannot be read from."""
        return False
|
712 |
+
|
713 |
+
|
714 |
+
class redirect_stdin(contextlib._RedirectStream):  # type: ignore
    """stdin counterpart of ``contextlib.redirect_stdout`` / ``redirect_stderr``.

    Built on the same private contextlib base class; used by ``swallow_io``
    to replace stdin for the duration of a ``with`` block.
    """

    # Name of the ``sys`` attribute the base class swaps out.
    _stream = "stdin"
|
716 |
+
|
717 |
+
|
718 |
+
@contextlib.contextmanager
def chdir(root):
    """Temporarily change the working directory to ``root``.

    Args:
        root: Directory to switch to. The literal ``"."`` is a no-op: the
            block runs without any directory change.

    The previous working directory is restored when the block exits, whether
    normally or through an exception; exceptions propagate unchanged.
    """
    if root == ".":
        yield
        return
    cwd = os.getcwd()
    os.chdir(root)
    try:
        yield
    finally:
        # ``finally`` alone covers the error path; catching and re-raising
        # the exception would only add a frame to its traceback.
        os.chdir(cwd)
|
731 |
+
|
732 |
+
|
733 |
+
def reliability_guard(maximum_memory_bytes: Optional[int] = None):
    """
    This disables various destructive functions and prevents the generated code
    from interfering with the test (e.g. fork bomb, killing other processes,
    removing filesystem files, etc.)

    Args:
        maximum_memory_bytes: When given, used as both the soft and hard limit
            for RLIMIT_AS and RLIMIT_DATA, and (except on Darwin) RLIMIT_STACK.

    The changes are process-wide and are not undone by this function: the
    listed attributes of ``os``, ``shutil``, ``subprocess`` and ``builtins``
    are overwritten with ``None``, and several module names are mapped to
    ``None`` in ``sys.modules``.

    WARNING
    This function is NOT a security sandbox. Untrusted code, including, model-
    generated code, should not be blindly executed outside of one. See the
    Codex paper for more information about OpenAI's code sandbox, and proceed
    with caution.
    """

    if maximum_memory_bytes is not None:
        import resource

        resource.setrlimit(
            resource.RLIMIT_AS, (maximum_memory_bytes, maximum_memory_bytes)
        )
        resource.setrlimit(
            resource.RLIMIT_DATA, (maximum_memory_bytes, maximum_memory_bytes)
        )
        if not platform.uname().system == "Darwin":
            resource.setrlimit(
                resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes)
            )

    faulthandler.disable()

    import builtins

    builtins.exit = None
    builtins.quit = None

    import os

    os.environ["OMP_NUM_THREADS"] = "1"

    # Assigned in the original order; attributes missing on the current
    # platform are simply created with the value None.
    for name in (
        "kill",
        "system",
        "putenv",
        "remove",
        "removedirs",
        "rmdir",
        "fchdir",
        "setuid",
        "fork",
        "forkpty",
        "killpg",
        "rename",
        "renames",
        "truncate",
        "replace",
        "unlink",
        "fchmod",
        "fchown",
        "chmod",
        "chown",
        "chroot",
        "lchflags",
        "lchmod",
        "lchown",
        "getcwd",
        "chdir",
    ):
        setattr(os, name, None)

    import shutil

    shutil.rmtree = None
    shutil.move = None
    shutil.chown = None

    import subprocess

    subprocess.Popen = None  # type: ignore

    # Go through the ``builtins`` module rather than ``__builtins__``: the
    # latter is a dict in imported modules but a module object in
    # ``__main__``, where item assignment raises TypeError.
    builtins.help = None

    import sys

    sys.modules["ipdb"] = None
    sys.modules["joblib"] = None
    sys.modules["resource"] = None
    sys.modules["psutil"] = None
    sys.modules["tkinter"] = None
|
evaluation/general_benchmarks/HumanEval/humaneval.py
ADDED
@@ -0,0 +1,217 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import datetime
|
2 |
+
import json
|
3 |
+
import multiprocessing
|
4 |
+
import os
|
5 |
+
import re
|
6 |
+
import string
|
7 |
+
import subprocess
|
8 |
+
import time
|
9 |
+
|
10 |
+
import numpy as np
|
11 |
+
import torch
|
12 |
+
import torch.distributed as dist
|
13 |
+
# from attrdict import AttrDict
|
14 |
+
from human_eval.evaluation import evaluate_functional_correctness
|
15 |
+
from transformers import AutoTokenizer
|
16 |
+
from utils.dataset import HumanEvalDataset
|
17 |
+
from utils.utils import cleanup_code
|
18 |
+
|
19 |
+
|
20 |
+
class HumanEval:
    """
    HumanEval evaluation class.

    Generates completions for every HumanEval prompt with a causal LM, writes
    them to per-rank log files under ``log_dir``, merges those files on the
    local main process and scores them with
    ``evaluate_functional_correctness``.
    """

    def __init__(
        self,
        data_root,
        max_seq_len=2048,
        language="python",
        max_gen_len=200,
        batch_size=512,
        log_dir=None,
        temperature=0,
        issft=False,
        top_p=0.95,
        model_name="",
        inference_increment=True,
        tokenizer_cfg=None,
        n_sample=40,
        k_sample=1,
    ):
        """
        Args:
            data_root: Directory containing ``humaneval-{language}.jsonl``.
            max_seq_len: Stored on the instance; not read by this class.
            language: Language key used for the data file name and cleanup.
            max_gen_len: ``max_new_tokens`` passed to ``generate``.
            batch_size: Number of samples generated per ``generate`` call.
            log_dir: Directory for intermediate and merged log files; created
                if missing. Must not be None by the time ``eval_model`` runs.
            temperature: 0 selects greedy decoding, anything else sampling.
            issft: Whether prompts/outputs use the SFT instruction format.
            top_p: Nucleus-sampling threshold (sampling mode only).
            model_name: Accepted for compatibility but not used; the name in
                log file names is derived from ``tokenizer_cfg["model_path"]``.
            inference_increment: Stored on the instance; not read by this class.
            tokenizer_cfg: Mapping with at least ``"model_path"``. It is only
                read, never modified.
            n_sample: Number of samples generated per prompt.
            k_sample: The ``k`` of the reported pass@k score.

        Raises:
            ValueError: If ``tokenizer_cfg`` lacks ``"model_path"``.
        """
        if tokenizer_cfg is None or "model_path" not in tokenizer_cfg:
            raise ValueError('tokenizer_cfg must provide a "model_path" entry')
        model_path = tokenizer_cfg["model_path"]

        self.data_root = data_root
        self.max_seq_len = max_seq_len
        self.max_gen_len = max_gen_len
        self.batch_size = batch_size
        self.k = k_sample
        self.n_sample = n_sample
        self.language = language
        self.log_dir = log_dir
        self.sft = issft
        self.temperature = temperature
        self.top_p = top_p
        self.model_name = model_path.replace("/", "_")
        self.inference_increment = inference_increment
        if self.log_dir is not None:
            os.makedirs(self.log_dir, exist_ok=True)
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(
                model_path, trust_remote_code=True
            )
        except Exception as e:
            # Re-raise instead of ``assert False``: asserts vanish under
            # ``python -O`` and would leave the instance without a tokenizer.
            print(e)
            raise

    @torch.no_grad()
    def eval_model(self, gpt, accelerator):
        """
        Evaluate the model on HumanEval.

        Args:
            gpt: Model exposing ``eval()`` and ``generate(...)``.
            accelerator: Object providing ``process_index``, ``num_processes``,
                ``device``, ``wait_for_everyone()`` and
                ``is_local_main_process``.
        """
        assert (
            self.log_dir is not None
        ), "log_dir should not be None when evaluating humaneval"
        dataset = HumanEvalDataset(
            self.data_root,
            sample_num=self.n_sample,
            language=self.language,
            issft=self.sft,
        )
        nprompt = len(dataset) // self.n_sample
        dp_rank = accelerator.process_index
        dp_size = accelerator.num_processes
        if self.k > 1:
            assert self.n_sample >= 100, "HumanEval PASS@100 needs n_sample >= 100"
        gpt.eval()
        # each process will process a subset of the dataset
        prompt_indices_split = np.array_split(range(nprompt), dp_size)
        prompt_indices = prompt_indices_split[dp_rank]
        indices = [
            x * self.n_sample + j for x in prompt_indices for j in range(self.n_sample)
        ]
        all_num = len(indices)
        processed_num = 0
        log_file = os.path.join(
            self.log_dir,
            f"{self.model_name}_rank{dp_rank}_bs{self.batch_size}_shot_log_{self.language}.json",
        )
        start_time = time.time()
        # ``with`` guarantees the per-rank log is closed even if generation fails.
        with open(log_file, "w") as tmpfile:
            # split the dataset into batches and construct a list of inputs
            for idx in range(0, len(indices), self.batch_size):
                prompt_lens = []
                original_prompt_list = []
                tokenized_prompts = []
                taskid = []
                # get the prompts from the dataset
                for j in indices[idx : idx + self.batch_size]:
                    data = dataset[j]
                    fprompt = data["prompt"].strip()
                    original_prompt_list.append(data["original_prompt"])
                    # character length, used below to cut the prompt off the
                    # decoded text
                    prompt_lens.append(len(fprompt))
                    tokenized_prompts.append(self.tokenizer.encode(fprompt))
                    taskid.append(data["task_id"])
                # NOTE(review): no padding is applied, so every prompt in the
                # batch must tokenize to the same length for torch.tensor to
                # accept the nested list -- confirm batch_size/n_sample are
                # chosen accordingly.
                input_ids = torch.tensor(tokenized_prompts).to(accelerator.device)
                # generate the code
                gen_kwargs = dict(
                    input_ids=input_ids,
                    max_new_tokens=self.max_gen_len,
                    eos_token_id=self.tokenizer.eos_token_id,
                    pad_token_id=self.tokenizer.eos_token_id,
                )
                if self.temperature != 0:
                    gen_kwargs.update(
                        do_sample=True,
                        temperature=self.temperature,
                        top_p=self.top_p,
                    )
                else:
                    gen_kwargs["do_sample"] = False
                decoded = gpt.generate(**gen_kwargs)
                # save the results to a file
                for local_idx, token_ids in enumerate(decoded):
                    prediction = self.tokenizer.decode(
                        token_ids, skip_special_tokens=True
                    )
                    suffixprediction = prediction[prompt_lens[local_idx] :]
                    suffixprediction = cleanup_code(
                        suffixprediction,
                        self.language,
                        "humaneval",
                        self.sft,
                        dataset.stopwords,
                    )
                    # sft mode does not need original prompt
                    if not self.sft:
                        suffixprediction = (
                            original_prompt_list[local_idx] + "\n" + suffixprediction
                        )
                    res = {
                        "task_id": taskid[local_idx],
                        "generation": suffixprediction,
                        "prompt": original_prompt_list[local_idx],
                        "wholecode": prediction,
                    }
                    tmpfile.write(json.dumps(res) + "\n")
                    tmpfile.flush()
                    processed_num += 1
                self.log_score(
                    dp_rank, processed_num, all_num, start_time, self.batch_size
                )
        accelerator.wait_for_everyone()
        # calculate the final score of pass@k
        self._calculate_final_score(accelerator)
        accelerator.wait_for_everyone()
        return

    def log_score(self, dp_rank, processed_num, all_num, start_time, bs):
        """
        Log the score.

        Prints progress, average time per batch, a remaining-time estimate
        and the peak CUDA memory allocated so far.
        """
        mem = torch.cuda.max_memory_allocated() / (1 << 30)
        avg_time = (time.time() - start_time) / processed_num * bs
        print(
            f"DP RANK:{dp_rank} process_num/all_num:{int(processed_num)}/{all_num} "
            f"avg_time_per_batch:{avg_time:.2f} s "
            f"still_need:{((all_num - processed_num) // bs + 1) * avg_time / 60:.2f} m",
            f"mem:{mem:.3f} GiB bs:{bs}",
            flush=True,
        )
        if processed_num == all_num:
            print(
                f"EVAL DONE! Process time {(time.time() - start_time) / 60:.2f} m",
                flush=True,
            )

    def _calculate_final_score(self, accelerator):
        """
        Calculate the final score.

        On the local main process only: concatenates every rank's log into
        ``final_{model_name}.jsonl``, deletes the per-rank logs, runs
        ``evaluate_functional_correctness`` on the merged file, prints the
        pass@k score and deletes the merged file.
        """
        if accelerator.is_local_main_process:
            logfilepath = os.path.join(self.log_dir, f"final_{self.model_name}.jsonl")
            with open(logfilepath, "w") as logfile:
                for i in range(accelerator.num_processes):
                    tmplogfile = os.path.join(
                        self.log_dir,
                        f"{self.model_name}_rank{i}_bs{self.batch_size}_shot_log_{self.language}.json",
                    )
                    with open(tmplogfile) as part:
                        logfile.write(part.read().strip() + "\n")
                    os.remove(tmplogfile)
            timeout = 10
            res = evaluate_functional_correctness(
                input_file=logfilepath,
                problem_file=os.path.join(
                    self.data_root, f"humaneval-{self.language}.jsonl"
                ),
                tmp_dir=self.log_dir,
                timeout=timeout,
                language=self.language,
            )
            print("score is", res["pass@%d" % self.k])
            os.remove(logfilepath)
        return
|
evaluation/general_benchmarks/HumanEval/javatuples-1.2.jar
ADDED
Binary file (65.5 kB). View file
|
|
evaluation/general_benchmarks/HumanEval/test_config.yaml
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
compute_environment: LOCAL_MACHINE
|
2 |
+
distributed_type: MULTI_GPU
|
3 |
+
downcast_bf16: 'no'
|
4 |
+
gpu_ids: all
|
5 |
+
machine_rank: 0
|
6 |
+
main_training_function: main
|
7 |
+
mixed_precision: 'no'
|
8 |
+
num_machines: 1
|
9 |
+
num_processes: 3
|
10 |
+
rdzv_backend: static
|
11 |
+
same_network: true
|
12 |
+
tpu_env: []
|
13 |
+
tpu_use_cluster: false
|
14 |
+
tpu_use_sudo: false
|
15 |
+
use_cpu: false
|
evaluation/general_benchmarks/HumanEval/utils/dataset.py
ADDED
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import os
|
3 |
+
|
4 |
+
import numpy as np
|
5 |
+
|
6 |
+
|
7 |
+
class HumanEvalDataset:
    """List-like view over a HumanEval ``.jsonl`` file.

    Every problem is repeated ``sample_num`` times in a row so that index
    ``p * sample_num + j`` is the j-th sample slot of problem ``p``.
    """

    def __init__(self, root, sample_num=1, language="python", issft=False):
        """
        root: the path to the HumanEval dataset
        sample_num: the number of samples for each prompt
        language: the language of the HumanEval dataset
        issft: whether to use the SFT setting
        """
        self.root = root
        data_path = os.path.join(self.root, f"humaneval-{language}.jsonl")
        # Close the file deterministically instead of leaking the handle.
        with open(data_path, encoding="utf-8") as data_file:
            self.data = data_file.readlines()

        tmp = self.get_qa_only_data(self.data, issft)
        # The same dict object is reused for all repetitions of a problem.
        self.clean_data = [item for item in tmp for _ in range(sample_num)]
        # Stop words are taken from the first problem; an empty dataset
        # (no problems, or sample_num == 0) yields no stop words.
        self.stopwords = self.clean_data[0]["stopwords"] if self.clean_data else []
        np.random.seed(1234)
        print(f"Read HumanEval from {root}, number of samples {len(self.clean_data)}")

    def get_qa_only_data(self, data_json, sft=False):
        """
        data_json: the lines of the HumanEval jsonl file (blank lines are skipped)
        sft: whether to use the SFT setting
        return: a list of dict, each dict contains the prompt, task_id,
            original_prompt and stopwords
        """
        ans = []
        for line in data_json:
            if not line.strip():
                # e.g. a trailing empty line at the end of the file
                continue
            line = json.loads(line)
            prompt = line["prompt"].strip()
            if "prefix" in line:
                origin_prompt = line["prefix"]
            else:
                origin_prompt = line["prompt"]

            if sft:
                prompt = f"""Below is an instruction that describes a task, paired with an input that provides further context.\nWrite a response that appropriately completes the request.\n\n### Instruction:\nWrite a program to perform the given task.\n\nInput:\n{prompt}\n\n### Response:\n"""
            if "stop_tokens" in line:
                s = line["stop_tokens"]
            else:
                s = []
            ans.append(
                {
                    "prompt": prompt,
                    "task_id": line["task_id"],
                    "original_prompt": origin_prompt,
                    "stopwords": s,
                }
            )
        return ans

    def __len__(self):
        """
        return the number of samples in the dataset
        """
        return len(self.clean_data)

    def __getitem__(self, index):
        """
        return the sample at index
        """
        sample = self.clean_data[index]
        return sample
|
evaluation/general_benchmarks/HumanEval/utils/utils.py
ADDED
@@ -0,0 +1,161 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
|
3 |
+
# Per-language settings, keyed by short language code.
#   full_name: fence label looked up (case-insensitively) in model output
#   indent:    default indentation width used when locating the closing brace
#   main:      optional marker; extracted code is cut off where it first appears
languge_settings = {
    "python": dict(full_name="Python", indent=4),
    "cpp": dict(full_name="cpp", indent=0, main="int main()"),
    "java": dict(full_name="Java", indent=4, main="public static void main"),
    "cs": dict(full_name="csharp", indent=0, main="public static void Main"),
    "php": dict(full_name="PHP", indent=0),
    "ts": dict(full_name="TypeScript", indent=0),
    "js": dict(full_name="JavaScript", indent=0),
    "sh": dict(full_name="Bash", indent=0),
}
|
34 |
+
|
35 |
+
|
36 |
+
def get_function_name(question: str, lang: str):
    """Split a prompt into its function header and the text preceding it.

    Blank lines are ignored. For Python (case-insensitive match on ``lang``)
    the header is the last line starting with ``"def "``, truncated at the
    first ``"("`` -- so the returned name still carries the ``def`` keyword.
    For every other language the header is the last non-blank line, truncated
    at the first ``"{"``.

    Returns:
        ``(func_name, func_prefix)`` where ``func_prefix`` is the non-blank
        lines before the header joined with newlines.

    Raises:
        IndexError: If no header line can be found.
    """
    lines = [ln for ln in question.strip().split("\n") if ln.strip()]

    if lang.lower() != "python":
        header = lines[-1]
        return header.split("{")[0].strip(), "\n".join(lines[:-1])

    def_positions = [pos for pos, ln in enumerate(lines) if ln.startswith("def ")]
    last_def = def_positions[-1]
    return lines[last_def].split("(")[0].strip(), "\n".join(lines[:last_def])
|
50 |
+
|
51 |
+
|
52 |
+
def extract_generation_code(example: dict, lang_code: str, verbose: bool = False):
    """Extract the target function from a fenced model answer.

    Takes the first code fence labelled with the language's full name from
    ``example["output"]`` (or ``example["gpt_completion"]``), drops everything
    from the language's ``main`` marker on, locates the function named in the
    prompt and stores ``prefix + function body`` in ``example["generation"]``.
    If anything in that process fails, the error is printed and
    ``example["generation"]`` falls back to the prompt followed by the raw
    output.

    Args:
        example: Mapping with ``task_id``, ``prompt`` and the model output.
            It is modified in place.
        lang_code: Key into ``languge_settings``.
        verbose: Print the extracted code block when True.

    Returns:
        The same ``example`` mapping, with ``"generation"`` set.
    """
    task_id = example["task_id"]
    output = example.get("output", example.get("gpt_completion"))
    question = example["prompt"].strip()
    setting = languge_settings[lang_code]
    lang = setting["full_name"]
    indent = setting["indent"]

    try:
        code_block: str = re.findall(
            f"```{lang.lower()}\n(.*?)```", output, re.DOTALL | re.IGNORECASE
        )[0]
        if verbose:
            print(">>> Task: {}\n{}".format(task_id, code_block))

        # Remove main
        main_marker = setting.get("main", None)
        if main_marker and main_marker in code_block:
            code_block = code_block[: code_block.index(main_marker)]

        func_name, func_prefix = get_function_name(question, lang)

        try:
            start = code_block.lower().index(func_name.lower())
        except ValueError:
            # Function header not found: keep the whole block and the
            # language's default indent.
            start = 0
        else:
            # Count the spaces directly in front of the header. Stop at the
            # start of the string: index -1 would wrap around to the last
            # character of the block.
            indent = 0
            while start - indent > 0 and code_block[start - indent - 1] == " ":
                indent += 1

        # The body ends at the last closing brace on the header's indent
        # level, or at the end of the block if there is none.
        end = code_block.rfind("\n" + " " * indent + "}")
        if end == -1:
            end = len(code_block)

        body = code_block[start:end]

        if lang_code.lower() in ["php", "ts", "js"]:
            body += "\n" + " " * indent + "}"

        generation = func_prefix + "\n" + body + "\n"
        example["generation"] = generation

    except Exception as ex:
        print(
            "Failed to extract code block with error `{}`:\n>>> Task: {}\n>>> Output:\n{}".format(
                ex, task_id, output
            )
        )
        # ``output`` is None when neither output key is present; avoid
        # raising a second error from inside the fallback.
        example["generation"] = (
            example["prompt"] + "\n" + (output if output is not None else "")
        )

    return example
|
108 |
+
|
109 |
+
|
110 |
+
def cleanup_code(
    code: str,
    language_type: str = None,
    dataset: str = None,
    issft: bool = False,
    stop_words=[],
):
    """
    Cleans up the generated code.

    Truncates ``code`` at the first stop word. Python uses a fixed stop-word
    list (ignoring ``stop_words``) and, in SFT mode, first extracts the
    fenced code; TypeScript extends ``stop_words`` with module-level
    keywords; all other languages use ``stop_words`` as given.
    ``dataset`` is accepted but not used.
    """
    lang = language_type.lower()

    if lang == "python":
        if issft:
            code = _clean_python_code_for_sft(code)
        python_stops = ["\ndef", "\nclass", "\nif", "\n#", "\nprint"]
        return _truncate_code_at_stopwords(code, python_stops)

    if lang == "ts":
        ts_extra_stops = [
            "\nexport",
            "\nimport",
            "\nexport default",
            "\nimport default",
            "\nconsole.log",
        ]
        return _truncate_code_at_stopwords(code, stop_words + ts_extra_stops)

    return _truncate_code_at_stopwords(code, stop_words)
|
142 |
+
|
143 |
+
|
144 |
+
def _clean_python_code_for_sft(code):
|
145 |
+
code = code.replace("\r", "")
|
146 |
+
if "```python" in code:
|
147 |
+
code_start_idx = code.index("```python")
|
148 |
+
code = code[code_start_idx:].replace("```python", "").strip()
|
149 |
+
end_idx = code.find("```") if "```" in code else len(code)
|
150 |
+
code = code[:end_idx].strip()
|
151 |
+
|
152 |
+
return code
|
153 |
+
|
154 |
+
|
155 |
+
def _truncate_code_at_stopwords(code, stop_words):
|
156 |
+
min_stop_idx = len(code)
|
157 |
+
for stop_word in stop_words:
|
158 |
+
stop_index = code.find(stop_word)
|
159 |
+
if 0 <= stop_index < min_stop_idx:
|
160 |
+
min_stop_idx = stop_index
|
161 |
+
return code[:min_stop_idx]
|
evaluation/general_benchmarks/MATH/LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MIT License
|
2 |
+
|
3 |
+
Copyright (c) 2024 Zhibin Gou
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
+
of this software and associated documentation files (the "Software"), to deal
|
7 |
+
in the Software without restriction, including without limitation the rights
|
8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
+
copies of the Software, and to permit persons to whom the Software is
|
10 |
+
furnished to do so, subject to the following conditions:
|
11 |
+
|
12 |
+
The above copyright notice and this permission notice shall be included in all
|
13 |
+
copies or substantial portions of the Software.
|
14 |
+
|
15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
SOFTWARE.
|
evaluation/general_benchmarks/MATH/README.md
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
### Requirements
|
2 |
+
You can install the required packages with the following command:
|
3 |
+
```bash
|
4 |
+
cd latex2sympy
|
5 |
+
pip install -e .
|
6 |
+
cd ..
|
7 |
+
pip install -r requirements.txt
|
8 |
+
pip install vllm==0.5.1 --no-build-isolation
|
9 |
+
pip install transformers==4.42.3
|
10 |
+
```
|
11 |
+
|
12 |
+
### Evaluation
|
13 |
+
You can evaluate the Qwen2.5-Math-Instruct and Qwen2-Math-Instruct series models with the following commands:
|
14 |
+
```bash
|
15 |
+
# Qwen2.5-Math-Instruct Series
|
16 |
+
PROMPT_TYPE="qwen25-math-cot"
|
17 |
+
# Qwen2.5-Math-1.5B-Instruct
|
18 |
+
export CUDA_VISIBLE_DEVICES="0"
|
19 |
+
MODEL_NAME_OR_PATH="Qwen/Qwen2.5-Math-1.5B-Instruct"
|
20 |
+
bash sh/eval.sh $PROMPT_TYPE $MODEL_NAME_OR_PATH
|
21 |
+
|
22 |
+
# Qwen2.5-Math-7B-Instruct
|
23 |
+
export CUDA_VISIBLE_DEVICES="0"
|
24 |
+
MODEL_NAME_OR_PATH="Qwen/Qwen2.5-Math-7B-Instruct"
|
25 |
+
bash sh/eval.sh $PROMPT_TYPE $MODEL_NAME_OR_PATH
|
26 |
+
|
27 |
+
# Qwen2.5-Math-72B-Instruct
|
28 |
+
export CUDA_VISIBLE_DEVICES="0,1,2,3"
|
29 |
+
MODEL_NAME_OR_PATH="Qwen/Qwen2.5-Math-72B-Instruct"
|
30 |
+
bash sh/eval.sh $PROMPT_TYPE $MODEL_NAME_OR_PATH
|
31 |
+
|
32 |
+
|
33 |
+
# Qwen2-Math-Instruct Series
|
34 |
+
PROMPT_TYPE="qwen-boxed"
|
35 |
+
# Qwen2-Math-1.5B-Instruct
|
36 |
+
export CUDA_VISIBLE_DEVICES="0"
|
37 |
+
MODEL_NAME_OR_PATH="Qwen/Qwen2-Math-1.5B-Instruct"
|
38 |
+
bash sh/eval.sh $PROMPT_TYPE $MODEL_NAME_OR_PATH
|
39 |
+
|
40 |
+
# Qwen2-Math-7B-Instruct
|
41 |
+
export CUDA_VISIBLE_DEVICES="0"
|
42 |
+
MODEL_NAME_OR_PATH="Qwen/Qwen2-Math-7B-Instruct"
|
43 |
+
bash sh/eval.sh $PROMPT_TYPE $MODEL_NAME_OR_PATH
|
44 |
+
|
45 |
+
# Qwen2-Math-72B-Instruct
|
46 |
+
export CUDA_VISIBLE_DEVICES="0,1,2,3"
|
47 |
+
MODEL_NAME_OR_PATH="Qwen/Qwen2-Math-72B-Instruct"
|
48 |
+
bash sh/eval.sh $PROMPT_TYPE $MODEL_NAME_OR_PATH
|
49 |
+
```
|
50 |
+
|
51 |
+
## Acknowledgement
|
52 |
+
The codebase is adapted from [math-evaluation-harness](https://github.com/ZubinGou/math-evaluation-harness).
|
evaluation/general_benchmarks/MATH/data/aime24/test.jsonl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:af2b8bd2aa911b6333ad0df32f3ca05c7ae8ed10f1731f4372c8ae26990bf7ac
|
3 |
+
size 156944
|