qianxiao1111 committed on
Commit
2a26d3b
1 Parent(s): 0751062

upgrade: add benchmarks eval

This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50)
  1. .gitattributes +4 -0
  2. added_tokens.json +3 -24
  3. config.json +3 -28
  4. evaluation/.gitignore +190 -0
  5. evaluation/README.md +181 -0
  6. evaluation/general_benchmarks/HumanEval/README.md +74 -0
  7. evaluation/general_benchmarks/HumanEval/data/humaneval-cpp +0 -0
  8. evaluation/general_benchmarks/HumanEval/data/humaneval-cpp.jsonl +3 -0
  9. evaluation/general_benchmarks/HumanEval/data/humaneval-cs +0 -0
  10. evaluation/general_benchmarks/HumanEval/data/humaneval-cs-bu.jsonl +3 -0
  11. evaluation/general_benchmarks/HumanEval/data/humaneval-cs.jsonl +3 -0
  12. evaluation/general_benchmarks/HumanEval/data/humaneval-d.jsonl +3 -0
  13. evaluation/general_benchmarks/HumanEval/data/humaneval-go.jsonl +3 -0
  14. evaluation/general_benchmarks/HumanEval/data/humaneval-java +0 -0
  15. evaluation/general_benchmarks/HumanEval/data/humaneval-java.jsonl +3 -0
  16. evaluation/general_benchmarks/HumanEval/data/humaneval-jl.jsonl +3 -0
  17. evaluation/general_benchmarks/HumanEval/data/humaneval-js.jsonl +3 -0
  18. evaluation/general_benchmarks/HumanEval/data/humaneval-lua.jsonl +3 -0
  19. evaluation/general_benchmarks/HumanEval/data/humaneval-php +0 -0
  20. evaluation/general_benchmarks/HumanEval/data/humaneval-php.jsonl +3 -0
  21. evaluation/general_benchmarks/HumanEval/data/humaneval-pl.jsonl +3 -0
  22. evaluation/general_benchmarks/HumanEval/data/humaneval-python.jsonl +3 -0
  23. evaluation/general_benchmarks/HumanEval/data/humaneval-r.jsonl +3 -0
  24. evaluation/general_benchmarks/HumanEval/data/humaneval-rb.jsonl +3 -0
  25. evaluation/general_benchmarks/HumanEval/data/humaneval-rkt.jsonl +3 -0
  26. evaluation/general_benchmarks/HumanEval/data/humaneval-rs.jsonl +3 -0
  27. evaluation/general_benchmarks/HumanEval/data/humaneval-scala.jsonl +3 -0
  28. evaluation/general_benchmarks/HumanEval/data/humaneval-sh +0 -0
  29. evaluation/general_benchmarks/HumanEval/data/humaneval-sh.jsonl +3 -0
  30. evaluation/general_benchmarks/HumanEval/data/humaneval-swift.jsonl +3 -0
  31. evaluation/general_benchmarks/HumanEval/data/humaneval-ts +0 -0
  32. evaluation/general_benchmarks/HumanEval/data/humaneval-ts.jsonl +3 -0
  33. evaluation/general_benchmarks/HumanEval/eval.sh +4 -0
  34. evaluation/general_benchmarks/HumanEval/eval_base_vllm.py +162 -0
  35. evaluation/general_benchmarks/HumanEval/eval_instruct.py +168 -0
  36. evaluation/general_benchmarks/HumanEval/eval_instruct_vllm.py +225 -0
  37. evaluation/general_benchmarks/HumanEval/eval_pal.py +62 -0
  38. evaluation/general_benchmarks/HumanEval/human_eval/__init__.py +0 -0
  39. evaluation/general_benchmarks/HumanEval/human_eval/data.py +48 -0
  40. evaluation/general_benchmarks/HumanEval/human_eval/evaluate_functional_correctness.py +32 -0
  41. evaluation/general_benchmarks/HumanEval/human_eval/evaluation.py +351 -0
  42. evaluation/general_benchmarks/HumanEval/human_eval/execution.py +817 -0
  43. evaluation/general_benchmarks/HumanEval/humaneval.py +217 -0
  44. evaluation/general_benchmarks/HumanEval/javatuples-1.2.jar +0 -0
  45. evaluation/general_benchmarks/HumanEval/test_config.yaml +15 -0
  46. evaluation/general_benchmarks/HumanEval/utils/dataset.py +72 -0
  47. evaluation/general_benchmarks/HumanEval/utils/utils.py +161 -0
  48. evaluation/general_benchmarks/MATH/LICENSE +21 -0
  49. evaluation/general_benchmarks/MATH/README.md +52 -0
  50. evaluation/general_benchmarks/MATH/data/aime24/test.jsonl +3 -0
.gitattributes CHANGED
@@ -47,3 +47,7 @@ rng_state_2.pth filter=lfs diff=lfs merge=lfs -text
47
  model-00003-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text
48
  rng_state_4.pth filter=lfs diff=lfs merge=lfs -text
49
  model-00002-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text
50
+ *.csv filter=lfs diff=lfs merge=lfs -text
51
+ *.json filter=lfs diff=lfs merge=lfs -text
52
+ *.jsonl filter=lfs diff=lfs merge=lfs -text
53
+ *.sqlite filter=lfs diff=lfs merge=lfs -text
added_tokens.json CHANGED
@@ -1,24 +1,3 @@
1
- {
2
- "</tool_call>": 151658,
3
- "<tool_call>": 151657,
4
- "<|box_end|>": 151649,
5
- "<|box_start|>": 151648,
6
- "<|endoftext|>": 151643,
7
- "<|file_sep|>": 151664,
8
- "<|fim_middle|>": 151660,
9
- "<|fim_pad|>": 151662,
10
- "<|fim_prefix|>": 151659,
11
- "<|fim_suffix|>": 151661,
12
- "<|im_end|>": 151645,
13
- "<|im_start|>": 151644,
14
- "<|image_pad|>": 151655,
15
- "<|object_ref_end|>": 151647,
16
- "<|object_ref_start|>": 151646,
17
- "<|quad_end|>": 151651,
18
- "<|quad_start|>": 151650,
19
- "<|repo_name|>": 151663,
20
- "<|video_pad|>": 151656,
21
- "<|vision_end|>": 151653,
22
- "<|vision_pad|>": 151654,
23
- "<|vision_start|>": 151652
24
- }
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:58b54bbe36fc752f79a24a271ef66a0a0830054b4dfad94bde757d851968060b
3
+ size 605
 
config.json CHANGED
@@ -1,28 +1,3 @@
1
- {
2
- "_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
3
- "architectures": [
4
- "Qwen2ForCausalLM"
5
- ],
6
- "attention_dropout": 0.0,
7
- "bos_token_id": 151643,
8
- "eos_token_id": 151645,
9
- "hidden_act": "silu",
10
- "hidden_size": 3584,
11
- "initializer_range": 0.02,
12
- "intermediate_size": 18944,
13
- "max_position_embeddings": 32768,
14
- "max_window_layers": 28,
15
- "model_type": "qwen2",
16
- "num_attention_heads": 28,
17
- "num_hidden_layers": 28,
18
- "num_key_value_heads": 4,
19
- "rms_norm_eps": 1e-06,
20
- "rope_theta": 1000000.0,
21
- "sliding_window": null,
22
- "tie_word_embeddings": false,
23
- "torch_dtype": "bfloat16",
24
- "transformers_version": "4.44.2",
25
- "use_cache": false,
26
- "use_sliding_window": false,
27
- "vocab_size": 152064
28
- }
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82c97ddf1f855ce947c97a62859a064efb2499cb7cbb82aa9c67bdc65b678c17
3
+ size 709
evaluation/.gitignore ADDED
@@ -0,0 +1,190 @@
1
+
2
+ # Byte-compiled / optimized / DLL files
3
+ __pycache__/
4
+ *.py[cod]
5
+ *$py.class
6
+ *.sql
7
+ *.sqlite
8
+ *.splite
9
+ *.desc
10
+ *.txt
11
+ *.DS_Store
12
+ .DS_Store
13
+ !eval_retriever/data/*.json
14
+ !eval_retriever/preds/*.json
15
+ !reject_eval/*.json
16
+ !evalset/*/*.json
17
+ !evalset/*.json
18
+
19
+ # C extensions
20
+ *.so
21
+
22
+ # Distribution / packaging
23
+ .Python
24
+ build/
25
+ develop-eggs/
26
+ dist/
27
+ downloads/
28
+ eggs/
29
+ .eggs/
30
+ lib/
31
+ lib64/
32
+ parts/
33
+ sdist/
34
+ var/
35
+ wheels/
36
+ share/python-wheels/
37
+ *.egg-info/
38
+ .installed.cfg
39
+ *.egg
40
+ MANIFEST
41
+
42
+ # PyInstaller
43
+ # Usually these files are written by a python script from a template
44
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
45
+ *.manifest
46
+ *.spec
47
+
48
+ # Installer logs
49
+ pip-log.txt
50
+ pip-delete-this-directory.txt
51
+
52
+ # Unit test / coverage reports
53
+ htmlcov/
54
+ .tox/
55
+ .nox/
56
+ .coverage
57
+ .coverage.*
58
+ .cache
59
+ nosetests.xml
60
+ coverage.xml
61
+ *.cover
62
+ *.py,cover
63
+ .hypothesis/
64
+ .pytest_cache/
65
+ cover/
66
+
67
+ # Translations
68
+ *.mo
69
+ *.pot
70
+
71
+ # Django stuff:
72
+ *.log
73
+ local_settings.py
74
+ db.sqlite3
75
+ db.sqlite3-journal
76
+
77
+ # Flask stuff:
78
+ instance/
79
+ .webassets-cache
80
+
81
+ # Scrapy stuff:
82
+ .scrapy
83
+
84
+ # Sphinx documentation
85
+ docs/_build/
86
+
87
+ # PyBuilder
88
+ .pybuilder/
89
+ target/
90
+
91
+ # Jupyter Notebook
92
+ .ipynb_checkpoints
93
+
94
+ # IPython
95
+ profile_default/
96
+ ipython_config.py
97
+
98
+ # pyenv
99
+ # For a library or package, you might want to ignore these files since the code is
100
+ # intended to run in multiple environments; otherwise, check them in:
101
+ # .python-version
102
+
103
+ # pipenv
104
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
105
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
106
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
107
+ # install all needed dependencies.
108
+ #Pipfile.lock
109
+
110
+ # poetry
111
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
112
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
113
+ # commonly ignored for libraries.
114
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
115
+ #poetry.lock
116
+
117
+ # pdm
118
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
119
+ #pdm.lock
120
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
121
+ # in version control.
122
+ # https://pdm.fming.dev/#use-with-ide
123
+ .pdm.toml
124
+
125
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
126
+ __pypackages__/
127
+
128
+ # Celery stuff
129
+ celerybeat-schedule
130
+ celerybeat.pid
131
+
132
+ # SageMath parsed files
133
+ *.sage.py
134
+
135
+ # Environments
136
+ .env
137
+ .venv
138
+ env/
139
+ venv/
140
+ ENV/
141
+ env.bak/
142
+ venv.bak/
143
+
144
+ # Spyder project settings
145
+ .spyderproject
146
+ .spyproject
147
+
148
+ # Rope project settings
149
+ .ropeproject
150
+
151
+ # mkdocs documentation
152
+ /site
153
+
154
+ # mypy
155
+ .mypy_cache/
156
+ .dmypy.json
157
+ dmypy.json
158
+
159
+ # Pyre type checker
160
+ .pyre/
161
+
162
+ # pytype static type analyzer
163
+ .pytype/
164
+
165
+ # Cython debug symbols
166
+ cython_debug/
167
+
168
+ # PyCharm
169
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
170
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
171
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
172
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
173
+ #.idea/
174
+
175
+ output/
176
+ images/
177
+ .vscode
178
+ vllm_encoder/
179
+
180
+ !table_related_benchmarks/evalset/bird_data/*.sql
181
+ !table_related_benchmarks/evalset/spider_data/*.sql
182
+
183
+ table_related_benchmarks/evalset/spider_data/test_database/*
184
+ table_related_benchmarks/evalset/bird_data/dev_databases/*
185
+ table_related_benchmarks/evalset/spider_data/dev_database/*
186
+
187
+
188
+ !table_related_benchmarks/evalset/spider_data/test_database/README.md
189
+ !table_related_benchmarks/evalset/bird_data/dev_databases/README.md
190
+ !table_related_benchmarks/evalset/spider_data/dev_database/README.md
evaluation/README.md ADDED
@@ -0,0 +1,181 @@
1
+ # Benchmark evaluations for TableGPT
2
+
3
+ <p align="center">
4
+ <a href="#-About">🔥About</a> •
5
+ <a href="#-Usage">💻Usage</a> •
6
+ </p>
7
+
8
+ ## About
9
+
11
+
12
+ This repository provides evaluation datasets and scripts for TableGPT on a variety of table-related benchmarks.
13
+
14
+ Given the complexity of table QA tasks and the uncertainty of input instructions, we provide evaluation datasets and scripts for 7 capabilities:
15
+
16
+ - ✨Code correction based on tables
17
+ - ✨Refusal of ambiguous questions
18
+ - ✨Table & field recall in multi-table scenarios
19
+ - ✨Executability of generated table QA code
20
+ - ✨TableBench
21
+ - ✨Text2SQL
22
+ - ✨TableInstruct, which includes a series of table-related evaluation benchmarks.
23
+
24
+ In addition, we have integrated general-ability benchmarks such as HumanEval, MBPP, and MMLU/CMMLU.
25
+ Inference runs from a local model path with vLLM as the backend, and we define a set of example prompt templates for the benchmarks above.
26
+
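+ For orientation, here is a minimal sketch of the vLLM-based inference pattern these scripts follow (the model path and question below are placeholders, not values from this repo):
+
+ ```python
+ from transformers import AutoTokenizer
+ from vllm import LLM, SamplingParams
+
+ MODEL_PATH = "/path/to/local/model"  # placeholder local model path
+
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
+ llm = LLM(model=MODEL_PATH, tensor_parallel_size=1, trust_remote_code=True)
+ params = SamplingParams(temperature=0, max_tokens=1024)
+
+ # Render a chat-style prompt with the model's chat template, then generate.
+ prompt = tokenizer.apply_chat_template(
+     [{"role": "user", "content": "How many rows does the table have?"}],
+     tokenize=False,
+     add_generation_prompt=True,
+ )
+ outputs = llm.generate([prompt], params)
+ print(outputs[0].outputs[0].text)
+ ```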
27
+ ## Usage
28
+
31
+
32
+ ⏬ To use this framework, first clone the repository from GitHub and install its dependencies:
33
+
34
+ ```shell
35
+ git clone https://github.com/tablegpt/tablegpt-eval
36
+ cd tablegpt-eval
37
+ pip install -r requirements.txt
38
+ ```
39
+
42
+
43
+ **Tips**
44
+ 1. You can run all the benchmarks with default parameters via `bash run_benchmarks.sh`.
45
+ 2. If you need finer control over the run parameters, refer to the corresponding Python script.
46
+ 3. Download the .db files before running the Text2SQL evaluation scripts. Download URLs are listed in `table_related_benchmarks/evalset/bird_data/dev_databases/README.md` (BIRD dev), `table_related_benchmarks/evalset/spider_data/dev_database/README.md` (Spider dev), and `table_related_benchmarks/evalset/spider_data/test_database/README.md` (Spider test).
47
+
48
+
49
+ ### Code correction eval
50
+
51
+ We provide an eval dataset of non-executable Python code to be corrected. Eval dataset path:
52
+
53
+ ```python
54
+ table_related_benchmarks/evalset/code_correction_test/correction_set.json
55
+ ```
56
+
57
+ We use the ***executable_pass_rate*** and ***absolute_match_rate*** of the corrected code at pass@1 to evaluate the model's code-correction ability. You can run the code-correction evaluation with the following command:
58
+
59
+ ```bash
60
+ python table_related_benchmarks/run_code_correction_eval.py \
61
+ --model_path <EVAL MODEL PATH> \
62
+ --template <CHAT_TEMPLATE_NAME, support [llama3, baichuan, chatglm, None], default None> \
63
+ --eval_results_save_path <PATH TO SAVE THE EVAL RESULTS> \
64
+ --gpus_num <NUMBER OF GPU TO RUN INFERENCE> \
65
+ --temperature <ONE OF THE INFERENCE PARAMETER>
66
+ ```
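+ For reference, a minimal sketch of how these two pass@1 metrics can be computed from per-sample results; the `execute` callable and the field names are illustrative assumptions, not the script's actual interface:
+
+ ```python
+ # Sketch: pass@1 executable_pass_rate and absolute_match_rate over corrected code.
+ # `samples` is a list of dicts holding the model's corrected code and the expected
+ # output; `execute` runs a code string and returns (ok, output).
+ def code_correction_metrics(samples, execute):
+     executable = 0
+     exact_match = 0
+     for s in samples:
+         ok, output = execute(s["corrected_code"])
+         executable += int(ok)
+         exact_match += int(ok and output == s["expected_output"])
+     n = len(samples)
+     return {
+         "executable_pass_rate": executable / n,
+         "absolute_match_rate": exact_match / n,
+     }
+ ```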
67
+
68
+ ### Ambiguous reject eval
69
+
70
+ We provide 298 table-based queries, with a ratio of about 1:3 between queries marked as ambiguous (to be rejected) and queries that should be accepted and correctly answered. Dataset path:
71
+
72
+ ```python
73
+ # test queries
74
+ evalset/reject_test/test_query.json
75
+ # queries with ground truth
76
+ evalset/reject_test/ground_truth.json
77
+ ```
78
+
79
+ We use **accuracy**, **recall**, and **F1 score** as metrics to evaluate the LLM's ability in this task. You can perform reject evaluation by running the following Python command:
80
+
81
+ ```bash
82
+ python table_related_benchmarks/run_reject_eval.py \
83
+ --model_path <EVAL MODEL PATH> \
84
+ --save_path <LLM OUTPUT CONTENT SAVE PATH> \
85
+ --gpus_num <NUMBER OF GPU TO RUN INFERENCE> \
86
+ --temperature <ONE OF THE INFERENCE PARAMETER>
87
+ ```
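+ A minimal sketch of the metric computation for this task, assuming binary labels where 1 means "the query should be rejected" (the real script derives labels from the model's responses):
+
+ ```python
+ # Sketch: accuracy, recall and F1 for the ambiguous-query rejection task.
+ # `gold` and `pred` are parallel lists of 0/1 labels (1 = rejected).
+ def reject_metrics(gold, pred):
+     tp = sum(g == 1 and p == 1 for g, p in zip(gold, pred))
+     fp = sum(g == 0 and p == 1 for g, p in zip(gold, pred))
+     fn = sum(g == 1 and p == 0 for g, p in zip(gold, pred))
+     accuracy = sum(g == p for g, p in zip(gold, pred)) / len(gold)
+     precision = tp / (tp + fp) if tp + fp else 0.0
+     recall = tp / (tp + fn) if tp + fn else 0.0
+     f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
+     return {"accuracy": accuracy, "recall": recall, "f1": f1}
+ ```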
88
+
89
+ ### Table&Fields recall eval
90
+
91
+ The provided eval dataset path:
92
+
93
+ ```python
94
+ table_related_benchmarks/evalset/retrieval_test/recall_set.json
95
+ ```
96
+
97
+ We use a series of evaluation metrics such as **recall**, **precision**, **Jaccard similarity**, and **Hamming loss** to assess the LLM's performance in table and field retrieval tasks. You can perform recall evaluation by running the following Python command:
98
+
99
+ ```bash
100
+ python table_related_benchmarks/run_recall_eval.py \
101
+ --model_path <EVAL MODEL PATH> \
102
+ --temperature <TEMPERATURE> \
103
+ --gpus_num <NUMBER OF GPU TO RUN INFERENCE>
104
+ ```
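+ A minimal per-sample sketch of the set-based retrieval metrics, assuming the predicted and gold table/field names are plain string sets (a simplification of the actual evaluation):
+
+ ```python
+ # Sketch: recall, precision and Jaccard similarity for one sample; the reported
+ # scores are averages over all samples.
+ def retrieval_metrics(pred: set, gold: set):
+     inter = pred & gold
+     recall = len(inter) / len(gold) if gold else 1.0
+     precision = len(inter) / len(pred) if pred else 0.0
+     jaccard = len(inter) / len(pred | gold) if (pred | gold) else 1.0
+     return {"recall": recall, "precision": precision, "jaccard": jaccard}
+
+ print(retrieval_metrics({"orders", "users"}, {"orders", "users", "items"}))
+ ```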
105
+
106
+ ### Table QA executable
107
+
108
+ We provide 2,178 table-based queries. Eval dataset path:
109
+
110
+ ```python
111
+ table_related_benchmarks/evalset/table_qa_execuate_test/tableqa_samples_with_paths.jsonl
112
+ ```
113
+
114
+ We use the ***executable_pass_rate*** at pass@1 to assess the model's table QA code generation ability. You can run the table QA evaluation with the following command:
115
+
116
+ ```bash
117
+ python table_related_benchmarks/run_tableqa_execution_eval.py \
118
+ --model_path <EVAL MODEL PATH> \
119
+ --temperature <ONE OF THE INFERENCE PARAMETER> \
120
+ --gpus_num <NUMBER OF GPU TO RUN INFERENCE>
121
+ ```
122
+
123
+ ### TableBench evaluation
124
+
125
+ The provided eval dataset path:
126
+
127
+ ```python
128
+ table_related_benchmarks/evalset/TableBench
129
+ ```
130
+
131
+ In the TableBench evaluation, ROUGE-L is used to score general QA questions, while pass@1 is the metric for visualization-type samples. You can run the TableBench evaluation with the following command:
132
+
133
+ ```bash
134
+ python table_related_benchmarks/run_table_bench_eval.py \
135
+ --model_path <EVAL MODEL PATH> \
136
+ --temperature <ONE OF THE INFERENCE PARAMETER> \
137
+ --gpus_num <NUMBER OF GPU TO RUN INFERENCE>
138
+ ```
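+ For reference, a minimal LCS-based ROUGE-L F-measure on whitespace tokens; the actual evaluation may rely on a library implementation with different tokenization:
+
+ ```python
+ # Sketch: ROUGE-L F1 between a prediction and a reference, computed from the
+ # longest common subsequence of whitespace-separated tokens.
+ def rouge_l(pred: str, ref: str) -> float:
+     p, r = pred.split(), ref.split()
+     dp = [[0] * (len(r) + 1) for _ in range(len(p) + 1)]
+     for i, pt in enumerate(p, 1):
+         for j, rt in enumerate(r, 1):
+             dp[i][j] = dp[i - 1][j - 1] + 1 if pt == rt else max(dp[i - 1][j], dp[i][j - 1])
+     lcs = dp[len(p)][len(r)]
+     if lcs == 0:
+         return 0.0
+     prec, rec = lcs / len(p), lcs / len(r)
+     return 2 * prec * rec / (prec + rec)
+ ```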
139
+
140
+ ### TableInstruct
141
+
142
+ The provided eval dataset path:
143
+
144
+ ```python
145
+ table_related_benchmarks/evalset/TableInstruct
146
+ ```
147
+
148
+ You can perform TableInstruct evaluation by the following command:
149
+
150
+ ```bash
151
+ python table_related_benchmarks/run_table_instruct_eval.py \
152
+ --model_path <EVAL MODEL PATH> \
153
+ --temperature <ONE OF THE INFERENCE PARAMETER> \
154
+ --gpus_num <NUMBER OF GPU TO RUN INFERENCE>
155
+ ```
156
+
157
+ ### Text2SQL
158
+ ```bash
159
+ python table_related_benchmarks/run_text2sql_eval.py --model_path <EVAL MODEL PATH>
160
+ ```
161
+
162
+ ### HumanEval
163
+ Perform HumanEval evaluation by the following command:
164
+
165
+ ```bash
166
+ python general_benchmarks/HumanEval/eval_instruct_vllm.py --model_path <EVAL MODEL PATH>
167
+ ```
168
+
169
+ ### MBPP
170
+ Perform MBPP evaluation by the following command:
171
+
172
+ ```bash
173
+ python general_benchmarks/MBPP/eval_instruct_vllm.py --model_path <EVAL MODEL PATH>
174
+ ```
175
+
176
+ ### MMLU & CMMLU
177
+
178
+ ```bash
179
+ python general_benchmarks/MMLU/evaluator.py --task <mmlu or cmmlu> --lang <en or zh> --model_path <EVAL MODEL PATH>
180
+ ```
181
+
evaluation/general_benchmarks/HumanEval/README.md ADDED
@@ -0,0 +1,74 @@
1
+ ## 1. Introduction
2
+
3
+ We provide a test script to evaluate the performance of the **deepseek-coder** model on code generation benchmarks. We select the widely-used benchmarks: **[HumanEval-Python](https://huggingface.co/datasets/openai_humaneval), [HumanEval-Multilingual](https://huggingface.co/datasets/nuprl/MultiPL-E)**.
4
+
5
+
6
+
7
+ ## 2. Setup
8
+
9
+ ```
10
+ pip install accelerate
11
+ pip install attrdict
12
+ pip install transformers
13
+ pip install torch
14
+ ```
15
+
16
+
17
+ ## 3. Evaluation
18
+
19
+ We've created a sample script, **eval.sh**, that demonstrates how to test the **DeepSeek-Coder-1.3b-Base** model on the HumanEval dataset leveraging **8** GPUs. If your use case involves a different model or dataset, simply adjust the script to fit your needs.
20
+
21
+ Additionally, for various programming languages, the execution path may differ. Please ensure you update the appropriate paths in the **human_eval/execution.py** file accordingly.
22
+
23
+ ```bash
24
+ MODEL_NAME_OR_PATH="deepseek-ai/deepseek-coder-1.3b-base"
25
+ DATASET_ROOT="data/"
26
+ LANGUAGE="python"
27
+ python -m accelerate.commands.launch --config_file test_config.yaml eval_pal.py --logdir ${MODEL_NAME_OR_PATH} --language ${LANGUAGE} --dataroot ${DATASET_ROOT}
28
+ ```
29
+
30
+ To evaluate the instruction-based model, please follow the script below:
31
+ ```bash
32
+ LANG="python"
33
+ OUTPUT_DIR="output"
34
+ MODEL="deepseek-coder-33b-instruct"
35
+
36
+ CUDA_VISIBLE_DEVICES=0,1 python eval_instruct.py \
37
+ --model "deepseek-ai/$MODEL" \
38
+ --output_path "$OUTPUT_DIR/${LANG}.$MODEL.jsonl" \
39
+ --language $LANG \
40
+ --temp_dir $OUTPUT_DIR
41
+ ```
42
+
43
+ ## 4. Experimental Results
44
+
45
+ We report experimental results here for 8 mainstream programming languages: **Python**, **C++**, **Java**, **PHP**, **TypeScript**, **C#**, **Bash**, and **JavaScript**. For all open-source models, we utilize this repository to obtain their performance on the HumanEval dataset. We set the maximum input length to **4096** and the maximum output length to **500**, and employ the **greedy search strategy**.
46
+
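+ Expressed with vLLM's `SamplingParams`, the decoding setup above corresponds roughly to the following sketch (the scripts in this folder configure their own values):
+
+ ```python
+ from vllm import SamplingParams
+
+ # Greedy decoding with at most 500 new tokens per completion; inputs are
+ # truncated to 4096 tokens by the model-loading configuration upstream.
+ greedy_params = SamplingParams(temperature=0, max_tokens=500)
+ ```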
47
+
48
+ #### (1) Multilingual Base Models
49
+
50
+ | Model | Size | Python | C++ | Java | PHP | TS | C# | Bash | JS | Avg |
51
+ |-------------------|------|--------|-------|------|------|------|------|------|------|------|
52
+ | code-cushman-001 | 12B | 33.5% | 31.9% | 30.6%| 28.9%| 31.3%| 22.1%| 11.7%| - | - |
53
+ | CodeShell | 7B | 35.4% | 32.9% | 34.2%| 31.7%| 30.2%| 38.0%| 7.0% | 33.5%| 30.4%|
54
+ | CodeGeeX2 | 6B | 36.0% | 29.2% | 25.9%| 23.6%| 20.8%| 29.7%| 6.3% | 24.8%| 24.5%|
55
+ | StarCoderBase | 16B | 31.7% | 31.1% | 28.5%| 25.4%| 34.0%| 34.8%| 8.9% | 29.8%| 28.0%|
56
+ | CodeLLama | 7B | 31.7% | 29.8% | 34.2%| 23.6%| 36.5%| 36.7%| 12.0%| 29.2%| 29.2%|
57
+ | CodeLLama | 13B | 36.0% | 37.9% | 38.0%| 34.2%| 45.2%| 43.0%| 16.5%| 32.3%| 35.4%|
58
+ | CodeLLama | 34B | 48.2% | 44.7% | 44.9%| 41.0%| 42.1%| 48.7%| 15.8%| 42.2%| 41.0%|
59
+ | | | | | | | | | | | |
60
+ | DeepSeek-Coder-Base| 1.3B | 34.8% | 31.1% | 32.3%| 24.2%| 28.9%| 36.7%| 10.1%| 28.6%| 28.3%|
61
+ | DeepSeek-Coder-Base| 5.7B | 48.7% | 45.3% | 41.1%| 39.7%| 44.7%| 41.1%| 27.8%| 42.2%| 41.3%|
62
+ | DeepSeek-Coder-Base| 6.7B | 49.4% | 50.3% | 43.0%| 38.5%| 49.7%| 50.0%| 28.5%| 48.4%| 44.7%|
63
+ | DeepSeek-Coder-Base|33B | **56.1%** | **58.4%** | **51.9%**| **44.1%**| **52.8%**| **51.3%**| **32.3%**| **55.3%**| **50.3%**|
64
+
65
+ #### (2) Instruction-Tuned Models
66
+ | Model | Size | Python | C++ | Java | PHP | TS | C# | Bash | JS | Avg |
67
+ |---------------------|------|--------|-------|------|------|------|------|------|------|------|
68
+ | GPT-3.5-Turbo | - | 76.2% | 63.4% | 69.2%| 60.9%| 69.1%| 70.8%| 42.4%| 67.1%| 64.9%|
69
+ | GPT-4 | - | **84.1%** | **76.4%** | **81.6%**| **77.2%**| **77.4%**| **79.1%**| **58.2%**| **78.0%**| **76.5%**|
70
+ | | | | | | | | | | | |
71
+ | DeepSeek-Coder-Instruct | 1.3B | 65.2% | 45.3% | 51.9% | 45.3% | 59.7% |55.1% | 12.7% | 52.2% | 48.4% |
72
+ | DeepSeek-Coder-Instruct | 6.7B | 78.9% | 63.4% | 68.4% | 68.9%| 67.2%| 72.8%| 36.7%| 72.7%| 66.1%|
73
+ | DeepSeek-Coder-Instruct | 33B | **79.3%** | **68.9%** | **73.4%** | **72.7%**| **67.9%**| **74.1%**| **43.0%**| **73.9%**| **69.2%**|
74
+
evaluation/general_benchmarks/HumanEval/data/humaneval-cpp ADDED
The diff for this file is too large to render. See raw diff
 
evaluation/general_benchmarks/HumanEval/data/humaneval-cpp.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8717eabdf202137158c84506144b0fb1e73d5ecccbe5363ec79009ca014df629
3
+ size 388688
evaluation/general_benchmarks/HumanEval/data/humaneval-cs ADDED
The diff for this file is too large to render. See raw diff
 
evaluation/general_benchmarks/HumanEval/data/humaneval-cs-bu.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d281b53b24e0f44cb76f1b1a8702b1ca668ff2a29c7621276ee8b658f5c124c6
3
+ size 448701
evaluation/general_benchmarks/HumanEval/data/humaneval-cs.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c3fe18be10addc2d0b96311f4db192ae3232e08628d17768d889d6ab87be224
3
+ size 452021
evaluation/general_benchmarks/HumanEval/data/humaneval-d.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:226938c015f90d713a3e30d8f174f4a6d2c88820cf50512379a16890dda70332
3
+ size 289365
evaluation/general_benchmarks/HumanEval/data/humaneval-go.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f5dfe3ff001049b1221e9d21a8119b5cbc38eb87c97fefc5d57fa7adc1df888
3
+ size 432325
evaluation/general_benchmarks/HumanEval/data/humaneval-java ADDED
The diff for this file is too large to render. See raw diff
 
evaluation/general_benchmarks/HumanEval/data/humaneval-java.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b21b015763452ec4f9f4e0e6425148ec331ace4da1232c8b4d441186185f6265
3
+ size 454059
evaluation/general_benchmarks/HumanEval/data/humaneval-jl.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef21bd920889c9c1ab0e87a825cc26bd895e7a715f569f8ba7de577f870b6815
3
+ size 268754
evaluation/general_benchmarks/HumanEval/data/humaneval-js.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:634e2eee8d6e22de07c121b972207683dd96be76256bf44bdfc1a3386b739287
3
+ size 297853
evaluation/general_benchmarks/HumanEval/data/humaneval-lua.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d2e91f3603f6aee63db2c2d5754d165397603a8c9bd6130842af7988b27a96fc
3
+ size 298314
evaluation/general_benchmarks/HumanEval/data/humaneval-php ADDED
The diff for this file is too large to render. See raw diff
 
evaluation/general_benchmarks/HumanEval/data/humaneval-php.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bebd86875b1d8e65a8b7e692ed7bdf64612b44aa388825ea4dab40c7047c786b
3
+ size 388096
evaluation/general_benchmarks/HumanEval/data/humaneval-pl.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:949048bff3eaea7ae47cd5759042c088a7c227d53c74fe95a80728fd5aefbf77
3
+ size 437506
evaluation/general_benchmarks/HumanEval/data/humaneval-python.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0eae07adadbb00d51962fdb78b9e2a26bfa8ade85dc54eb57cae9bffed2f5c54
3
+ size 342974
evaluation/general_benchmarks/HumanEval/data/humaneval-r.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c2a3268ea54d2bfb8ce65bb2c808437ac0d9934c6caf99fcf75d6b6a4fb3f911
3
+ size 311904
evaluation/general_benchmarks/HumanEval/data/humaneval-rb.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4204395b835019e32d55513325a0fad01c6d382fd1eb97b516f0458d00058302
3
+ size 312806
evaluation/general_benchmarks/HumanEval/data/humaneval-rkt.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:083325daf55daf431ae6d6d17cf017afb18a6ec790d4c841b7c2b4752c5807ff
3
+ size 312006
evaluation/general_benchmarks/HumanEval/data/humaneval-rs.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e33f562838e973b7e6b8d76dbc1fb84b076d7426629b4cdc624f12678778d2fa
3
+ size 306470
evaluation/general_benchmarks/HumanEval/data/humaneval-scala.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:482b72a16613a563755d642e37792da68471399789b069fba5b1249e831445f3
3
+ size 384243
evaluation/general_benchmarks/HumanEval/data/humaneval-sh ADDED
The diff for this file is too large to render. See raw diff
 
evaluation/general_benchmarks/HumanEval/data/humaneval-sh.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f702106e9ce4aa7385de568d9248d6c57c90382d782b162cb9e072fbd01ccf8
3
+ size 274180
evaluation/general_benchmarks/HumanEval/data/humaneval-swift.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e4418d6f532e4298969e727690cda80dc88b94e4abf836f8ff79ac18a737eaaa
3
+ size 344436
evaluation/general_benchmarks/HumanEval/data/humaneval-ts ADDED
The diff for this file is too large to render. See raw diff
 
evaluation/general_benchmarks/HumanEval/data/humaneval-ts.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4b1af1050c9f226335d54a8c80553c39454a525cace3e67bdcfcc9092ba02637
3
+ size 304732
evaluation/general_benchmarks/HumanEval/eval.sh ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ MODEL_NAME_OR_PATH="/data3/models/DeepSeek/deepseek-coder-6.7b-base"
2
+ DATASET_ROOT="HumanEval/data"
3
+ LANGUAGE="python"
4
+ CUDA_VISIBLE_DEVICES=5,6,7 python -m accelerate.commands.launch --config_file HumanEval/test_config.yaml HumanEval/eval_pal.py --model_path ${MODEL_NAME_OR_PATH} --language ${LANGUAGE} --dataroot ${DATASET_ROOT}
evaluation/general_benchmarks/HumanEval/eval_base_vllm.py ADDED
@@ -0,0 +1,162 @@
1
+ import json
2
+ import os
3
+ import time
4
+ from argparse import ArgumentParser
5
+ # from accelerate import Accelerator
6
+ # from accelerate import DistributedDataParallelKwargs
7
+ from pathlib import Path
8
+
9
+ import numpy as np
10
+ import torch
11
+ import torch.nn.functional as F
12
+ from human_eval.evaluation import evaluate_functional_correctness
13
+ from tqdm import tqdm
14
+ from transformers import AutoModelForCausalLM, AutoTokenizer
15
+ from utils.dataset import HumanEvalDataset
16
+ from utils.utils import cleanup_code
17
+ from vllm import LLM, SamplingParams
18
+
19
+
20
+ class HumanEval:
21
+ """
22
+ HumanEval evaluation class.
23
+ """
24
+
25
+ def __init__(
26
+ self,
27
+ data_root,
28
+ language="python",
29
+ log_dir=None,
30
+ issft=False,
31
+ inference_increment=True,
32
+ n_sample=1,
33
+ k_sample=1,
34
+ ):
35
+ self.data_root = data_root
36
+ self.k = k_sample
37
+ self.n_sample = n_sample
38
+ self.language = language
39
+ self.log_dir = log_dir
40
+ self.sft = issft
41
+ self.inference_increment = inference_increment
42
+ os.makedirs(self.log_dir, exist_ok=True)
43
+
44
+ @torch.no_grad()
45
+ def eval_model(self, args):
46
+ """
47
+ Evaluate the model on HumanEval.
48
+ """
49
+ assert (
50
+ self.log_dir is not None
51
+ ), "log_dir should not be None when evaluating humaneval"
52
+ dataset = HumanEvalDataset(
53
+ self.data_root,
54
+ sample_num=self.n_sample,
55
+ language=self.language,
56
+ issft=self.sft,
57
+ )
58
+ model_name_or_path = args.model_path
59
+ print("model", model_name_or_path)
60
+ tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
61
+ print(
62
+ "load tokenizer {} from {} over.".format(
63
+ tokenizer.__class__, model_name_or_path
64
+ )
65
+ )
66
+
67
+ llm = LLM(
68
+ model=model_name_or_path,
69
+ tensor_parallel_size=1,
70
+ max_model_len=4096,
71
+ trust_remote_code=True,
72
+ enforce_eager=True,
73
+ )
74
+ sampling_params = SamplingParams(
75
+ temperature=0,
76
+ max_tokens=1024,
77
+ top_p=0.95,
78
+ stop_token_ids=[tokenizer.eos_token_id],
79
+ )
80
+ messages_list = []
81
+ for i in range(len(dataset)):
82
+ data = dataset[i]
83
+ prompt = data["prompt"].strip()
84
+ messages_list.append(prompt)
85
+ outputs = llm.generate(messages_list, sampling_params=sampling_params)
86
+ assert len(dataset) == len(outputs), "dataset and outputs different lengths."
87
+ log_file = os.path.join(self.log_dir, f"{self.language}.json")
88
+ tmpfile = open(log_file, "w")
89
+ for i, output in enumerate(tqdm(outputs)):
90
+ data = dataset[i]
91
+ output = output.outputs[0].text
92
+ output = cleanup_code(
93
+ output,
94
+ self.language,
95
+ "humaneval",
96
+ self.sft,
97
+ dataset.stopwords,
98
+ )
99
+ # sft mode does not need original prompt
100
+ if not self.sft:
101
+ suffixprediction = data["original_prompt"] + "\n" + output
102
+ res = {
103
+ "task_id": data["task_id"],
104
+ "generation": suffixprediction,
105
+ "prompt": data["original_prompt"],
106
+ }
107
+ tmpfile.write(json.dumps(res) + "\n")
108
+
109
+ tmpfile.close()
110
+ # calculate the final score of pass@k
111
+ self._calculate_final_score(log_file)
112
+ return
113
+
114
+ def _calculate_final_score(self, logfilepath):
115
+ """
116
+ Calculate the final score.
117
+ """
118
+ res = evaluate_functional_correctness(
119
+ input_file=logfilepath,
120
+ problem_file=os.path.join(
121
+ self.data_root, f"humaneval-{self.language}.jsonl"
122
+ ),
123
+ tmp_dir=self.log_dir,
124
+ language=self.language,
125
+ )
126
+ print("score is", res["pass@%d" % self.k])
127
+ os.remove(logfilepath)
128
+ return
129
+
130
+
131
+ if __name__ == "__main__":
132
+ parser = ArgumentParser()
133
+ parser.add_argument("--logdir", type=str, default="")
134
+ parser.add_argument(
135
+ "--model_path",
136
+ type=str,
137
+ help="model name or path",
138
+ default="/data0/pretrained-models/qwen2-7b",
139
+ )
140
+
141
+ parser.add_argument("--language", type=str, default="python")
142
+ parser.add_argument(
143
+ "--dataroot",
144
+ type=str,
145
+ default="HumanEval/data",
146
+ )
147
+ args = parser.parse_args()
148
+
149
+ logdir = args.logdir
150
+ language = args.language
151
+
152
+ if logdir == "":
153
+ logdir = "output/tmp/"
154
+
155
+ evaluator = HumanEval(
156
+ data_root=args.dataroot,
157
+ log_dir=logdir,
158
+ n_sample=1,
159
+ language=language,
160
+ )
161
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
162
+ evaluator.eval_model(args)
evaluation/general_benchmarks/HumanEval/eval_instruct.py ADDED
@@ -0,0 +1,168 @@
1
+ import argparse
2
+ import json
3
+ import os
4
+ from pathlib import Path
5
+
6
+ import torch
7
+ from tqdm import tqdm
8
+
9
+ data_abs_dir = Path(__file__).parent / "data"
10
+
11
+ from human_eval.evaluation import evaluate_functional_correctness
12
+ from transformers import AutoModelForCausalLM, AutoTokenizer
13
+ from utils.utils import extract_generation_code, languge_settings
14
+
15
+
16
+ def build_deepseekcoder_instruction(languge: str, question: str):
17
+ return """
18
+ Please continue to complete the function. You are not allowed to modify the given code and do the completion only. Please return all completed function in a codeblock. Here is the given code to do completion:
19
+ ```{}
20
+ {}
21
+ ```
22
+ """.strip().format(
23
+ languge.lower(), question.strip()
24
+ )
25
+
26
+
27
+ def generate_one(example, lang, tokenizer, model):
28
+ prompt = build_deepseekcoder_instruction(
29
+ languge_settings[lang]["full_name"], example["prompt"]
30
+ )
31
+ inputs = tokenizer.apply_chat_template(
32
+ [{"role": "user", "content": prompt}],
33
+ return_tensors="pt",
34
+ add_generation_prompt=True,
35
+ ).to(model.device)
36
+
37
+ stop_id = tokenizer.convert_tokens_to_ids("<|EOT|>")
38
+ assert isinstance(stop_id, int), "Invalid tokenizer, EOT id not found"
39
+
40
+ outputs = model.generate(
41
+ inputs,
42
+ max_new_tokens=1024,
43
+ do_sample=False,
44
+ # top_p=0.95,
45
+ # temperature=temperature,
46
+ pad_token_id=stop_id,
47
+ eos_token_id=stop_id,
48
+ )
49
+
50
+ output = tokenizer.decode(outputs[0][len(inputs[0]) :], skip_special_tokens=True)
51
+ example["output"] = output
52
+
53
+ return extract_generation_code(example, lang_code=lang)
54
+
55
+
56
+ def generate_main(args):
57
+ model_name_or_path = args.model
58
+ lang = args.language
59
+ saved_path = args.output_path
60
+ temp_dir = args.temp_dir
61
+ os.makedirs(temp_dir, exist_ok=True)
62
+ problem_file = os.path.join(data_abs_dir, f"humaneval-{lang}.jsonl")
63
+
64
+ print("model", model_name_or_path)
65
+ tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
66
+ print(
67
+ "load tokenizer {} from {} over.".format(
68
+ tokenizer.__class__, model_name_or_path
69
+ )
70
+ )
71
+ model = AutoModelForCausalLM.from_pretrained(
72
+ model_name_or_path,
73
+ torch_dtype=torch.bfloat16,
74
+ device_map="auto",
75
+ # use_flash_attention_2=True
76
+ )
77
+ model.eval()
78
+ examples = [json.loads(x) for x in open(problem_file) if x.strip()]
79
+ print("Read {} examples for evaluation over.".format(len(examples)))
80
+
81
+ generated_examples = []
82
+ for ex in tqdm(examples, desc="Generating"):
83
+ gen_example = generate_one(ex, args.language, tokenizer, model)
84
+ generated_examples.append(gen_example)
85
+
86
+ print("Generate all over!!!")
87
+ with open(saved_path, "w", encoding="utf-8") as fw:
88
+ for ex in generated_examples:
89
+ fw.write(json.dumps(ex) + "\n")
90
+ print(
91
+ "Save {} processed examples into {} over!".format(
92
+ len(generated_examples), saved_path
93
+ )
94
+ )
95
+
96
+ result = evaluate_functional_correctness(
97
+ input_file=saved_path,
98
+ tmp_dir=temp_dir,
99
+ n_workers=8,
100
+ timeout=3.0,
101
+ problem_file=problem_file,
102
+ language=lang,
103
+ )
104
+ print(lang, result, model_name_or_path)
105
+ pass
106
+
107
+
108
+ def evaluation_only(args):
109
+ lang = args.language
110
+ temp_dir = args.temp_dir
111
+ assert os.path.exists(args.output_path), "Output file not found: {}".format(
112
+ args.output_path
113
+ )
114
+ os.makedirs(temp_dir, exist_ok=True)
115
+
116
+ output_name = os.path.basename(args.output_path)
117
+ output_examples = [json.loads(x) for x in open(args.output_path) if x.strip()]
118
+
119
+ processed_examples = [
120
+ extract_generation_code(ex, lang) for ex in tqdm(output_examples, "Processing")
121
+ ]
122
+ processed_path = os.path.join(temp_dir, output_name)
123
+ with open(processed_path, "w", encoding="utf-8") as fw:
124
+ for ex in processed_examples:
125
+ fw.write(json.dumps(ex) + "\n")
126
+ print(
127
+ "Save {} processed examples into {} over!".format(
128
+ len(processed_examples), processed_path
129
+ )
130
+ )
131
+
132
+ problem_file = os.path.join(data_abs_dir, f"humaneval-{lang}.jsonl")
133
+ from human_eval.evaluation import evaluate_functional_correctness
134
+
135
+ result = evaluate_functional_correctness(
136
+ input_file=processed_path,
137
+ tmp_dir=temp_dir,
138
+ n_workers=8,
139
+ timeout=3.0,
140
+ problem_file=problem_file,
141
+ language=lang,
142
+ )
143
+ print(lang, result)
144
+
145
+
146
+ if __name__ == "__main__":
147
+ parser = argparse.ArgumentParser()
148
+ parser.add_argument(
149
+ "--model",
150
+ type=str,
151
+ help="model name or path",
152
+ default="/data0/pretrained-models/deepseek-coder-6.7b-instruct",
153
+ )
154
+ parser.add_argument(
155
+ "--output_path",
156
+ type=str,
157
+ help="output path of your generation",
158
+ default="/home/qyhuang/DeepSeek-Coder/outputs/deepseek-chat.json",
159
+ )
160
+ parser.add_argument("--language", type=str, help="langauge", default="python")
161
+ parser.add_argument(
162
+ "--temp_dir", type=str, help="temp dir for evaluation", default="tmp"
163
+ )
164
+ args = parser.parse_args()
165
+
166
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
167
+ generate_main(args)
168
+ pass
evaluation/general_benchmarks/HumanEval/eval_instruct_vllm.py ADDED
@@ -0,0 +1,225 @@
1
+ import argparse
2
+ import json
3
+ import os
4
+ import shutil
5
+ from pathlib import Path
6
+
7
+ import torch
8
+ import transformers
9
+ from human_eval.evaluation import evaluate_functional_correctness
10
+ from tqdm import tqdm
11
+ from transformers import AutoTokenizer
12
+ from utils.utils import extract_generation_code, languge_settings
13
+ from vllm import LLM, SamplingParams
14
+
15
+ data_abs_dir = Path(__file__).parent / "data"
16
+
17
+
18
+ def build_deepseekcoder_instruction(languge: str, question: str):
19
+ return """
20
+ Please continue to complete the function. You are not allowed to modify the given code and do the completion only. Please return all completed function in a codeblock. Here is the given code to do completion:
21
+ ```{}
22
+ {}
23
+ ```
24
+ """.strip().format(
25
+ languge.lower(), question.strip()
26
+ )
27
+
28
+
29
+ def create_dir(output_dir):
30
+ if os.path.exists(output_dir):
31
+ if not os.access(output_dir, os.W_OK):
32
+ shutil.rmtree(output_dir)
33
+ os.makedirs(output_dir)
34
+ os.chmod(output_dir, 0o777)
35
+ print("not write permission, makedir:", output_dir)
36
+ else:
37
+ print(f"{output_dir} exists!")
38
+ else:
39
+ os.makedirs(output_dir)
40
+ os.chmod(output_dir, 0o777)
41
+ print("makedir:", output_dir)
42
+
43
+
44
+ def get_client_res(messages, example, output_key, open_ai_key=False):
45
+ try:
46
+ if open_ai_key:
47
+ from openai import AzureOpenAI, OpenAI
48
+ try:
49
+ api_key = os.environ["OPENAI_API_KEY"]
50
+ except KeyError:
51
+ print("环境变量 OPENAI_API_KEY 未设置")
52
+ api_key = "default_value"
53
+
54
+ client = AzureOpenAI(
55
+ api_key=api_key,
56
+ api_version="2024-07-01-preview",
57
+ azure_endpoint="https://zju-tablegpt.openai.azure.com/",
58
+ )
59
+ chat_response = client.chat.completions.create(
60
+ model="gpt-4o",
61
+ # model="gpt-4o-mini",
62
+ messages=messages,
63
+ top_p=0.95,
64
+ temperature=0,
65
+ max_tokens=1024,
66
+ timeout=40,
67
+ )
68
+ else:
69
+ from openai import OpenAI  # imported here too, so this branch works when open_ai_key is False
+ # Set OpenAI's API key and API base to use vLLM's API server.
70
+ openai_api_key = "EMPTY"
71
+ openai_api_base = "http://localhost:8080/v1"
72
+
73
+ client = OpenAI(
74
+ api_key=openai_api_key,
75
+ base_url=openai_api_base,
76
+ )
77
+ chat_response = client.chat.completions.create(
78
+ model="qwen2-7b-sft",
79
+ messages=messages,
80
+ top_p=0.3,
81
+ temperature=0.1,
82
+ max_tokens=1024,
83
+ )
84
+ example[output_key] = chat_response.choices[0].message.content
85
+ except Exception as e:
86
+ print(f"An unexpected error occurred: {e}")
87
+ example[output_key] = None
88
+ example["input"] = messages
89
+ return example
90
+
91
+
92
+
93
+ def generate_main(args):
94
+ model_name_or_path = args.model_path
95
+ lang = args.language
96
+ temp_dir = args.temp_dir
97
+ create_dir(temp_dir)
98
+ # os.makedirs(temp_dir, exist_ok=True)
99
+ problem_file = os.path.join(data_abs_dir, f"humaneval-{lang}.jsonl")
100
+ if not args.api:
101
+ print("model", model_name_or_path)
102
+ tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
103
+ print(
104
+ "load tokenizer {} from {} over.".format(
105
+ tokenizer.__class__, model_name_or_path
106
+ )
107
+ )
108
+ llm_args = {
109
+ "model": model_name_or_path,
110
+ "gpu_memory_utilization": 0.95,
111
+ "trust_remote_code": True,
112
+ "tensor_parallel_size": args.gpus_num,
113
+ "dtype": "half",
114
+ "max_model_len": 8192,
115
+ "enforce_eager": True,
116
+ }
117
+
118
+ llm = LLM(**llm_args)
119
+ sampling_params = SamplingParams(
120
+ temperature=0,
121
+ max_tokens=1024,
122
+ top_p=0.95,
123
+ stop_token_ids=[tokenizer.eos_token_id],
124
+ )
125
+
126
+ examples = [json.loads(x) for x in open(problem_file) if x.strip()]
127
+ print("Read {} examples for evaluation over.".format(len(examples)))
128
+ messages_list = []
129
+ for example in tqdm(examples, desc="Generating"):
130
+ prompt = build_deepseekcoder_instruction(
131
+ languge_settings[lang]["full_name"], example["prompt"]
132
+ )
133
+ message = [{"role": "user", "content": prompt}]
134
+ if args.api:
135
+ messages_list.append(message)
136
+ else:
137
+ messages_list.append(
138
+ tokenizer.apply_chat_template(
139
+ message, tokenize=False, add_generation_prompt=True
140
+ )
141
+ )
142
+ if args.api:
143
+ from joblib import Parallel, delayed
144
+ examples_ = Parallel(n_jobs=24)(
145
+ delayed(get_client_res)(inp, examples[i], "output",open_ai_key=True)
146
+ for i, inp in enumerate(tqdm(messages_list))
147
+ )
148
+
149
+ # Retry requests that failed
150
+ examples = []
151
+ for example in examples_:
152
+ if example["output"] == None:
153
+ example = get_client_res(
154
+ example["input"], example, "output", open_ai_key=True
155
+ )
156
+ del example["input"]
157
+ examples.append(example)
158
+
159
+ generated_examples = []
160
+ for example in examples:
161
+ example = extract_generation_code(example, lang_code=lang)
162
+ generated_examples.append(example)
163
+ else:
164
+ outputs = llm.generate(messages_list, sampling_params=sampling_params)
165
+ generated_examples = []
166
+ for i, output in enumerate(tqdm(outputs)):
167
+ output = output.outputs[0].text
168
+ example = examples[i]
169
+ example["output"] = output
170
+ example = extract_generation_code(example, lang_code=lang)
171
+ generated_examples.append(example)
172
+
173
+ print("Generate all over!!!")
174
+ # os.makedirs(args.save_dir, exist_ok=True)
175
+ create_dir(args.save_dir)
176
+ saved_path = os.path.join(args.save_dir, "results_humaneval.json")
177
+ with open(saved_path, "w", encoding="utf-8") as fw:
178
+ for ex in generated_examples:
179
+ fw.write(json.dumps(ex) + "\n")
180
+ print(
181
+ "Save {} processed examples into {} over!".format(
182
+ len(generated_examples), saved_path
183
+ )
184
+ )
185
+
186
+ result = evaluate_functional_correctness(
187
+ input_file=saved_path,
188
+ tmp_dir=temp_dir,
189
+ n_workers=8,
190
+ timeout=3.0,
191
+ problem_file=problem_file,
192
+ language=lang,
193
+ out_path=saved_path,
194
+ )
195
+ print(lang, result, model_name_or_path)
196
+
197
+
198
+ if __name__ == "__main__":
199
+ parser = argparse.ArgumentParser()
200
+ parser.add_argument(
201
+ "--model_path",
202
+ type=str,
203
+ help="model name or path",
204
+ default="/data4/sft_output/qwen2-instruct-0709/checkpoint-1400",
205
+ )
206
+ parser.add_argument(
207
+ "--gpus_num", type=int, default=1, help="the number of GPUs you want to use."
208
+ )
209
+ parser.add_argument(
210
+ "--save_dir",
211
+ type=str,
212
+ help="output path of your generation",
213
+ default="output",
214
+ )
215
+ parser.add_argument("--api", action="store_true", help="infer api type")
216
+ parser.add_argument("--language", type=str, help="langauge", default="python")
217
+ parser.add_argument(
218
+ "--temp_dir", type=str, help="temp dir for evaluation", default="output/tmp"
219
+ )
220
+ parser.add_argument("--seed", type=int, help="seed", default=42)
221
+ args = parser.parse_args()
222
+
223
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
224
+ transformers.set_seed(args.seed)
225
+ generate_main(args)
evaluation/general_benchmarks/HumanEval/eval_pal.py ADDED
@@ -0,0 +1,62 @@
1
+ import json
2
+ import os
3
+ import subprocess
4
+ import sys
5
+ from argparse import ArgumentParser
6
+ from pathlib import Path
7
+
8
+ import numpy as np
9
+ import pandas as pd
10
+ import torch
11
+ import torch.distributed as dist
12
+ import torch.nn.functional as F
13
+ from accelerate import Accelerator, DistributedDataParallelKwargs
14
+ from humaneval import HumanEval as evaltor
15
+ from transformers import AutoModelForCausalLM, AutoTokenizer
16
+
17
+ if __name__ == "__main__":
18
+ kwargs_handlers = [DistributedDataParallelKwargs(find_unused_parameters=True)]
19
+ accelerator = Accelerator(mixed_precision="bf16", kwargs_handlers=kwargs_handlers)
20
+
21
+ parser = ArgumentParser()
22
+ parser.add_argument("--logdir", type=str, default="./output")
23
+ parser.add_argument(
24
+ "--model_path",
25
+ type=str,
26
+ default="/data3/models/DeepSeek/deepseek-coder-6.7b-base",
27
+ )
28
+ parser.add_argument("--language", type=str, default="python")
29
+ parser.add_argument("--dataroot", type=str, default="HumanEval/data")
30
+ args = parser.parse_args()
31
+
32
+ logdir = args.logdir
33
+ language = args.language
34
+ model_path = args.model_path
35
+
36
+ if logdir == "":
37
+ logdir = "tmp/"
38
+ tokenizer = dict(
39
+ cls=AutoTokenizer,
40
+ model_path=model_path,
41
+ )
42
+
43
+ dataroot = args.dataroot
44
+
45
+ evaluator = evaltor(
46
+ data_root=dataroot,
47
+ max_seq_len=4096,
48
+ tokenizer_cfg=tokenizer,
49
+ log_dir=logdir,
50
+ n_sample=1,
51
+ batch_size=1,
52
+ language=language,
53
+ max_gen_len=500,
54
+ )
55
+ model = AutoModelForCausalLM.from_pretrained(
56
+ model_path,
57
+ device_map=accelerator.device,
58
+ trust_remote_code=True,
59
+ torch_dtype=torch.bfloat16,
60
+ )
61
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
62
+ evaluator.eval_model(model, accelerator)
evaluation/general_benchmarks/HumanEval/human_eval/__init__.py ADDED
File without changes
evaluation/general_benchmarks/HumanEval/human_eval/data.py ADDED
@@ -0,0 +1,48 @@
1
+ import gzip
2
+ import json
3
+ import os
4
+ from typing import Dict, Iterable
5
+
6
+ ROOT = os.path.dirname(os.path.abspath(__file__))
7
+ HUMAN_EVAL = os.path.join(ROOT, "..", "data", "HumanEval.jsonl.gz")
8
+
9
+
10
+ def read_problems(evalset_file: str = HUMAN_EVAL) -> Dict[str, Dict]:
11
+ return {task["task_id"]: task for task in stream_jsonl(evalset_file)}
12
+
13
+
14
+ def stream_jsonl(filename: str) -> Iterable[Dict]:
15
+ """
16
+ Parses each jsonl line and yields it as a dictionary
17
+ """
18
+ if filename.endswith(".gz"):
19
+ with open(filename, "rb") as gzfp:
20
+ with gzip.open(gzfp, "rt") as fp:
21
+ for line in fp:
22
+ if any(not x.isspace() for x in line):
23
+ yield json.loads(line)
24
+ else:
25
+ with open(filename, "r", encoding="utf-8") as fp:
26
+ for line in fp:
27
+ if any(not x.isspace() for x in line):
28
+ yield json.loads(line)
29
+
30
+
31
+ def write_jsonl(filename: str, data: Iterable[Dict], append: bool = False):
32
+ """
33
+ Writes an iterable of dictionaries to jsonl
34
+ """
35
+ if append:
36
+ mode = "ab"
37
+ else:
38
+ mode = "wb"
39
+ filename = os.path.expanduser(filename)
40
+ if filename.endswith(".gz"):
41
+ with open(filename, mode) as fp:
42
+ with gzip.GzipFile(fileobj=fp, mode="wb") as gzfp:
43
+ for x in data:
44
+ gzfp.write((json.dumps(x) + "\n").encode("utf-8"))
45
+ else:
46
+ with open(filename, mode) as fp:
47
+ for x in data:
48
+ fp.write((json.dumps(x) + "\n").encode("utf-8"))
evaluation/general_benchmarks/HumanEval/human_eval/evaluate_functional_correctness.py ADDED
@@ -0,0 +1,32 @@
1
+ import sys
2
+
3
+ import fire
4
+
5
+ from .data import HUMAN_EVAL
6
+ from .evaluation import evaluate_functional_correctness
7
+
8
+
9
+ def entry_point(
10
+ sample_file: str,
11
+ k: str = "1,10,100",
12
+ n_workers: int = 4,
13
+ timeout: float = 3.0,
14
+ problem_file: str = "",
15
+ is_mbpp: bool = False,
16
+ ):
17
+ """
18
+ Evaluates the functional correctness of generated samples, and writes
19
+ results to f"{sample_file}_results.jsonl.gz"
20
+ """
21
+ k = list(map(int, k.split(",")))
22
+ results = evaluate_functional_correctness(
23
+ sample_file, k, n_workers, timeout, problem_file, is_mbpp
24
+ )
25
+ print(results)
26
+
27
+
28
+ def main():
29
+ fire.Fire(entry_point)
30
+
31
+
32
+ sys.exit(main())
evaluation/general_benchmarks/HumanEval/human_eval/evaluation.py ADDED
@@ -0,0 +1,351 @@
1
+ import gzip
2
+ import itertools
3
+ import json
4
+ import os
5
+ from concurrent.futures import ThreadPoolExecutor, as_completed
6
+ from typing import *
7
+
8
+ import numpy as np
9
+ from tqdm.auto import tqdm
10
+
11
+ from human_eval.data import stream_jsonl
12
+ from human_eval.execution import check_correctness
13
+
14
+ IMPORT_HELPER = {
15
+ "python": [
16
+ "import math",
17
+ "import re",
18
+ "import sys",
19
+ "import copy",
20
+ "import datetime",
21
+ "import itertools",
22
+ "import collections",
23
+ "import heapq",
24
+ "import functools",
25
+ "import hashlib",
26
+ "import numpy",
27
+ "import numpy as np",
28
+ "import string",
29
+ "from typing import *",
30
+ "from collections import *",
31
+ ],
32
+ "go": [
33
+ "math",
34
+ "strings",
35
+ "fmt",
36
+ "strconv",
37
+ "time",
38
+ "bytes",
39
+ "regexp",
40
+ "sort",
41
+ "math/rand",
42
+ "crypto/md5",
43
+ ],
44
+ "cpp": [
45
+ "#include<stdlib.h>",
46
+ "#include<algorithm>",
47
+ "#include<math.h>",
48
+ "#include<stdio.h>",
49
+ "#include<vector>",
50
+ "#include<string>",
51
+ "#include<climits>",
52
+ "#include<cstring>",
53
+ "#include<iostream>",
54
+ "#include<cassert>",
55
+ ],
56
+ "cs": [
57
+ "using System.Numerics;",
58
+ "using System.Diagnostics;",
59
+ "using System.Collections.Generic;",
60
+ "using System.Linq;",
61
+ "using System.Text;",
62
+ "using System.Security.Cryptography;",
63
+ "using System.Collections.Generic;",
64
+ ],
65
+ }
66
+
67
+
68
+ LANGUAGE_NAME = {
69
+ "cpp": "CPP",
70
+ "go": "Go",
71
+ "java": "Java",
72
+ "js": "JavaScript",
73
+ "python": "Python",
74
+ }
75
+
76
+
77
+ def read_dataset(
78
+ data_file: str = None,
79
+ dataset_type: str = "humaneval",
80
+ num_shot=None,
81
+ ) -> Dict:
82
+ """
83
+ Reads a dataset and returns a dictionary of tasks.
84
+ """
85
+ if num_shot is not None:
86
+ print(f"{num_shot}-shot setting...")
87
+ if "humaneval" in dataset_type.lower():
88
+ if data_file is None:
89
+ current_path = os.path.dirname(os.path.abspath(__file__))
90
+ data_file = os.path.join(
91
+ current_path,
92
+ "..",
93
+ "humaneval-x",
94
+ "python",
95
+ "data",
96
+ "humaneval_python.jsonl.gz",
97
+ )
98
+ dataset = {task["task_id"]: task for task in stream_jsonl(data_file)}
99
+ else:
100
+ raise f"Dataset: {dataset_type} not supported."
101
+
102
+ return dataset
103
+
104
+
105
+ def estimate_pass_at_k(
106
+ num_samples: Union[int, List[int], np.ndarray],
107
+ num_correct: Union[List[int], np.ndarray],
108
+ k: int,
109
+ ) -> np.ndarray:
110
+ """
111
+ Estimates pass@k of each problem and returns them in an array.
112
+ """
113
+
114
+ def estimator(n: int, c: int, k: int) -> float:
115
+ """
116
+ Calculates 1 - comb(n - c, k) / comb(n, k).
117
+ """
118
+ if n - c < k:
119
+ return 1.0
120
+ return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
121
+
122
+ if isinstance(num_samples, int):
123
+ num_samples_it = itertools.repeat(num_samples, len(num_correct))
124
+ else:
125
+ assert len(num_samples) == len(num_correct)
126
+ num_samples_it = iter(num_samples)
127
+
128
+ return np.array(
129
+ [estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)]
130
+ )
131
+
132
+
133
+ def process_humaneval_test(
134
+ sample, problems, example_test=False, is_mbpp=False, language="python"
135
+ ):
136
+ """
137
+ Processes a sample for evaluation.
138
+ """
139
+ task_id = sample["task_id"]
140
+ if is_mbpp:
141
+ return sample["generation"] + "\n" + "\n".join(problems[task_id]["test"])
142
+
143
+ prompt = sample["prompt"]
144
+ if (
145
+ example_test
146
+ and "example_test" in problems[task_id]
147
+ and problems[task_id]["example_test"] != ""
148
+ ):
149
+ test = problems[task_id]["example_test"]
150
+ else:
151
+ test = problems[task_id]["test"]
152
+ code = sample["generation"]
153
+
154
+ # Pre-process for different languages
155
+ if language == "python":
156
+ test_setup = "\n".join(IMPORT_HELPER["python"]) + "\n"
157
+ test_string = test_setup + code + "\n" + test + "\n"
158
+ elif language == "cpp":
159
+ test_set_up = ""
160
+ for s in IMPORT_HELPER["cpp"]:
161
+ if s not in prompt:
162
+ test_set_up += s + "\n"
163
+ test_string = test_set_up + "\n" + code + "\n" + test
164
+ elif language == "java":
165
+ test_string = code + "\n" + test
166
+ elif language == "cs":
167
+ test_set_up = ""
168
+ for s in IMPORT_HELPER["cs"]:
169
+ test_set_up += s + "\n"
170
+ test_string = test_set_up + "\n" + code + "\n" + test
171
+ elif language in ["js", "javascript", "ts", "sh", "go"]:
172
+ test_string = code + "\n" + test
173
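+     # NOTE: "go232" never matches the normal "go" tag, so this branch is effectively
+     # disabled; Go samples are handled by the branch above.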
+ elif language == "go232":
174
+ import_string = problems[task_id]["import"]
175
+ prompt = prompt.replace(import_string, "")
176
+ if example_test and "example_test" in problems[task_id]:
177
+ test = problems[task_id]["example_test"]
178
+ else:
179
+ test = problems[task_id]["test"]
180
+ test_setup = problems[task_id]["test_setup"]
181
+ other_pkgs = []
182
+ for pkg in IMPORT_HELPER["go"]:
183
+ if pkg not in test_setup:
184
+ p = pkg.split("/")[-1]
185
+ if p + "." in code:
186
+ other_pkgs.append(f'"{pkg}"')
187
+ if other_pkgs:
188
+ import_other_pkgs = (
189
+ "import (\n" + " ".join([p + "\n" for p in other_pkgs]) + ")"
190
+ )
191
+ test_string = (
192
+ test_setup
193
+ + "\n"
194
+ + import_other_pkgs
195
+ + "\n"
196
+ + prompt
197
+ + code
198
+ + "\n"
199
+ + test
200
+ )
201
+ else:
202
+ test_string = test_setup + "\n" + prompt + code + "\n" + test
203
+ elif language == "rust":
204
+ main = "\nfn main(){ \n } \n"
205
+ declaration = problems[task_id]["declaration"]
206
+ test_string = main + declaration + prompt + code + test
207
+ elif language == "php":
208
+ if code[:5] != "<?php":
209
+ code = "<?php\n" + code
210
+ test_string = code + "\n" + test + "?>"
211
+ return test_string
212
+
213
+
214
+ def stream_jsonl_all(filename: str) -> Iterable[Dict]:
215
+ """
216
+     Reads an entire JSONL file (plain or gzip-compressed) into a list of records.
217
+ """
218
+ results = []
219
+ if filename.endswith(".gz"):
220
+ fp = gzip.open(open(filename, "rb"), "rt")
221
+ else:
222
+ fp = open(filename, "r")
223
+ for line in fp:
224
+ if any(not x.isspace() for x in line):
225
+ results.append(json.loads(line))
226
+ fp.close()
227
+
228
+ return results
229
+
230
+
231
+ def evaluate_functional_correctness(
232
+ input_file: str = None,
233
+ tmp_dir: str = "./",
234
+ n_workers: int = 32,
235
+ timeout: float = 10.0,
236
+ problem_file: str = "../data/humaneval_python.jsonl.gz",
237
+ out_path: str = None,
238
+ k: List[int] = [1, 10, 100],
239
+ test_groundtruth: bool = False,
240
+ example_test: bool = False,
241
+ is_mbpp: bool = False,
242
+ language: str = "python",
243
+ ):
244
+ """
245
+     Evaluates the functional correctness of generated samples and computes pass@k.
246
+ """
247
+ if example_test:
248
+ print("Example test...")
249
+
250
+ problems = read_dataset(problem_file, dataset_type="humaneval")
251
+ sample_jsonl = stream_jsonl_all(input_file)
252
+
253
+ with ThreadPoolExecutor(max_workers=n_workers) as executor:
254
+
255
+ futures = []
256
+ completion_id = Counter()
257
+ n_samples = 0
258
+ # results = defaultdict(list)
259
+ results = {}
260
+
261
+ if test_groundtruth:
262
+ print("Testing ground truth...")
263
+ for sample in tqdm(problems.values()):
264
+ task_id = sample["task_id"]
265
+ lang = task_id.split("/")[0].lower()
266
+ if lang == "javascript":
267
+ lang = "js"
268
+ tmp_dir_ = os.path.join(tmp_dir, lang, "evaluation")
269
+ sample["generation"] = sample["canonical_solution"]
270
+ sample["test_code"] = process_humaneval_test(
271
+                     sample, problems, example_test, language=language
272
+ )
273
+ if sample["test_code"] is None:
274
+ continue
275
+ args = (
276
+ task_id,
277
+ sample,
278
+ lang,
279
+ timeout,
280
+ tmp_dir_,
281
+ completion_id[task_id],
282
+ )
283
+ future = executor.submit(check_correctness, *args)
284
+ futures.append(future)
285
+ completion_id[task_id] += 1
286
+ n_samples += 1
287
+ else:
288
+ print("Reading samples...")
289
+ for sample in tqdm(sample_jsonl):
290
+ task_id = sample["task_id"]
291
+ if not is_mbpp:
292
+ lang = language
293
+ if not is_mbpp and lang == "javascript":
294
+ lang = "js"
295
+ if is_mbpp:
296
+ lang = "python"
297
+ tmp_dir_ = os.path.join(tmp_dir, lang, "evaluation")
298
+ sample["task_id"] = task_id
299
+ sample["test_code"] = process_humaneval_test(
300
+ sample, problems, example_test, is_mbpp, language
301
+ )
302
+ if sample["test_code"] is None:
303
+ continue
304
+ if "completion_id" in sample:
305
+ completion_id_ = sample["completion_id"]
306
+ else:
307
+ completion_id_ = completion_id[task_id]
308
+ args = (task_id, sample, lang, timeout, tmp_dir_, completion_id_)
309
+ future = executor.submit(check_correctness, *args)
310
+ futures.append(future)
311
+ completion_id[task_id] += 1
312
+ n_samples += 1
313
+
314
+ if len(completion_id) == len(problems):
315
+ evaluate_pass_at_k = True
316
+ else:
317
+ evaluate_pass_at_k = False
318
+
319
+ print("Running test suites...")
320
+ for future in tqdm(as_completed(futures), total=len(futures)):
321
+ result = future.result()
322
+ # results[result["task_id"]].append((result["completion_id"], result))
323
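+             # NOTE: unlike the defaultdict-of-lists version commented out above, this keeps
+             # only the most recently completed result per task_id.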
+ results[result["task_id"]] = result
324
+
325
+ # Calculate pass@k.
326
+ total, correct = [], []
327
+ for result in results.values():
328
+ # passed = [r[1]["passed"] for r in result]
329
+ passed = [result["passed"]]
330
+ total.append(len(passed))
331
+ correct.append(sum(passed))
332
+ total = np.array(total)
333
+ correct = np.array(correct)
334
+
335
+ if evaluate_pass_at_k:
336
+ ks = k
337
+ pass_at_k = {
338
+ f"pass@{k}": estimate_pass_at_k(total, correct, k).mean()
339
+ for k in ks
340
+ if (total >= k).all()
341
+ }
342
+ print(pass_at_k)
343
+ else:
344
+ print("Total:", np.sum(total))
345
+ print("Correct:", np.sum(correct))
346
+
347
+ if out_path:
348
+ with open(out_path, "w") as f:
349
+ json.dump(list(results.values()), f, ensure_ascii=False)
350
+
351
+ return pass_at_k
evaluation/general_benchmarks/HumanEval/human_eval/execution.py ADDED
@@ -0,0 +1,817 @@
1
+ import contextlib
2
+ import faulthandler
3
+ import gzip
4
+ import io
5
+ import json
6
+ import multiprocessing
7
+ import os
8
+ import platform
9
+ import random
10
+ import signal
11
+ import subprocess
12
+ import tempfile
13
+ import traceback
14
+ from typing import *
15
+
16
+ java_exec = ""
17
+ node_exec = ""
18
+ tsc_exec = ""
19
+ go_exec = ""
20
+ php_exec = ""
21
+ cs_exec = ""
22
+
23
+
24
+ def check_correctness(
25
+ task_id: str,
26
+ sample: dict,
27
+ language_type: str,
28
+ timeout: float = 3.0,
29
+ tmp_dir: str = None,
30
+ completion_id: Optional[int] = None,
31
+ ) -> Dict:
32
+ """
33
+ Evaluates the functional correctness of a completion by running the test
34
+ suite provided in the problem.
35
+ """
36
+
37
+ def unsafe_execute(tmp_dir):
38
+ random_id = random.randint(1, 100000)
39
+ if "python" in language_type.lower():
40
+ with create_tempdir():
41
+
42
+ # These system calls are needed when cleaning up tempdir.
43
+ import os
44
+ import shutil
45
+
46
+ rmtree = shutil.rmtree
47
+ rmdir = os.rmdir
48
+ chdir = os.chdir
49
+
50
+ # Disable functionalities that can make destructive changes to the test.
51
+ reliability_guard()
52
+
53
+ try:
54
+ exec_globals = {}
55
+ with swallow_io():
56
+ with time_limit(timeout):
57
+ # WARNING
58
+ # This program exists to execute untrusted model-generated code. Although
59
+ # it is highly unlikely that model-generated code will do something overtly
60
+ # malicious in response to this test suite, model-generated code may act
61
+ # destructively due to a lack of model capability or alignment.
62
+ # Users are strongly encouraged to sandbox this evaluation suite so that it
63
+ # does not perform destructive actions on their host or network.
64
+ # Once you have read this disclaimer and taken appropriate precautions,
65
+ # uncomment the following line and proceed at your own risk:
66
+ exec(sample["test_code"], exec_globals)
67
+ result.append("passed")
68
+ except TimeoutException:
69
+ result.append("timed out")
70
+ except AssertionError as e:
71
+ result.append(f"failed: AssertionError")
72
+ except BaseException as e:
73
+ result.append(f"failed: {e}")
74
+ # print(sample["test_code"])
75
+ # print(result)
76
+ # Needed for cleaning up.
77
+ shutil.rmtree = rmtree
78
+ os.rmdir = rmdir
79
+ os.chdir = chdir
80
+
81
+ elif "go" in language_type.lower():
82
+ assert (
83
+ tmp_dir is not None
84
+ ), "Go should be evaluated in a dir where necessary module files installed."
85
+
86
+ import os
87
+ import shutil
88
+
89
+ if "tmp" not in tmp_dir:
90
+ tmp_dir = os.path.join(tmp_dir, "tmp")
91
+ tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}")
92
+ if not os.path.exists(tmp_dir):
93
+ os.makedirs(tmp_dir)
94
+ origin_path = os.getcwd()
95
+ os.chdir(tmp_dir)
96
+ open(f"main_test.go", "w").write(sample["test_code"])
97
+ try:
98
+ exec_result = None
99
+ with time_limit(timeout):
100
+ # WARNING
101
+ # This program exists to execute untrusted model-generated code. Although
102
+ # it is highly unlikely that model-generated code will do something overtly
103
+ # malicious in response to this test suite, model-generated code may act
104
+ # destructively due to a lack of model capability or alignment.
105
+ # Users are strongly encouraged to sandbox this evaluation suite so that it
106
+ # does not perform destructive actions on their host or network.
107
+ # Once you have read this disclaimer and taken appropriate precautions,
108
+ # uncomment the following line and proceed at your own risk:
109
+ exec_result = subprocess.run(
110
+ [
111
+ f"{go_exec}go",
112
+ "test",
113
+ f"-timeout={timeout}s",
114
+ "main_test.go",
115
+ ],
116
+ timeout=timeout,
117
+ capture_output=True,
118
+ )
119
+
120
+ if exec_result.returncode == 0:
121
+ result.append("passed")
122
+ else:
123
+ if exec_result.stderr:
124
+ try:
125
+ err = exec_result.stderr.decode()
126
+ except:
127
+ err = exec_result.stderr
128
+ else:
129
+ try:
130
+ err = exec_result.stdout.decode()
131
+ except:
132
+ err = exec_result.stdout
133
+ result.append(f"failed: {err}")
134
+
135
+ except TimeoutException:
136
+ result.append("timed out")
137
+ os.chdir(origin_path)
138
+ shutil.rmtree(tmp_dir)
139
+ elif "js" in language_type.lower():
140
+ import os
141
+ import shutil
142
+
143
+ if "tmp" not in tmp_dir:
144
+ tmp_dir = os.path.join(tmp_dir, "tmp")
145
+ tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}")
146
+ if not os.path.exists(tmp_dir):
147
+ os.makedirs(tmp_dir)
148
+ origin_path = os.getcwd()
149
+ os.chdir(tmp_dir)
150
+ open(f"test.js", "w").write(sample["test_code"])
151
+ try:
152
+ exec_result = None
153
+ with time_limit(timeout):
154
+ # WARNING
155
+ # This program exists to execute untrusted model-generated code. Although
156
+ # it is highly unlikely that model-generated code will do something overtly
157
+ # malicious in response to this test suite, model-generated code may act
158
+ # destructively due to a lack of model capability or alignment.
159
+ # Users are strongly encouraged to sandbox this evaluation suite so that it
160
+ # does not perform destructive actions on their host or network.
161
+ # Once you have read this disclaimer and taken appropriate precautions,
162
+ # uncomment the following line and proceed at your own risk:
163
+ exec_result = subprocess.run(
164
+ [f"{node_exec}node", "test.js"],
165
+ timeout=timeout,
166
+ capture_output=True,
167
+ )
168
+
169
+ if exec_result.stderr.decode():
170
+ err = exec_result.stderr.decode()
171
+ result.append(f"failed: {err}")
172
+ elif exec_result.stdout.decode():
173
+ err = exec_result.stdout.decode()
174
+ result.append(f"failed: {err}")
175
+ else:
176
+ result.append("passed")
177
+
178
+ except TimeoutException:
179
+ result.append("timed out")
180
+ os.chdir(origin_path)
181
+ shutil.rmtree(tmp_dir)
182
+ elif "cpp" in language_type.lower():
183
+ import os
184
+ import shutil
185
+
186
+ origin_path = os.getcwd()
187
+ if "tmp" not in tmp_dir:
188
+ tmp_dir = os.path.join(tmp_dir, "tmp")
189
+ tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}")
190
+ if not os.path.exists(tmp_dir):
191
+ os.makedirs(tmp_dir)
192
+
193
+ os.chdir(tmp_dir)
194
+ open(f"test.cpp", "w").write(sample["test_code"])
195
+ if "162" in task_id:
196
+ compilation_result = subprocess.run(
197
+ ["/usr/bin/g++", "-std=c++17", "test.cpp", "-lcrypto", "-lssl"],
198
+ timeout=timeout,
199
+ capture_output=True,
200
+ )
201
+ else:
202
+ compilation_result = subprocess.run(
203
+ ["/usr/bin/g++", "-std=c++17", "test.cpp"],
204
+ timeout=timeout,
205
+ capture_output=True,
206
+ )
207
+ if compilation_result.returncode != 0:
208
+ if compilation_result.stderr:
209
+ err = compilation_result.stderr.decode()
210
+ else:
211
+ err = compilation_result.stdout.decode()
212
+ result.append(f"failed: compilation error: {err}")
213
+ else:
214
+ try:
215
+ exec_result = None
216
+ with time_limit(timeout):
217
+ # WARNING
218
+ # This program exists to execute untrusted model-generated code. Although
219
+ # it is highly unlikely that model-generated code will do something overtly
220
+ # malicious in response to this test suite, model-generated code may act
221
+ # destructively due to a lack of model capability or alignment.
222
+ # Users are strongly encouraged to sandbox this evaluation suite so that it
223
+ # does not perform destructive actions on their host or network.
224
+ # Once you have read this disclaimer and taken appropriate precautions,
225
+ # uncomment the following line and proceed at your own risk:
226
+ exec_result = subprocess.run(
227
+ ["./a.out"], timeout=timeout, capture_output=True
228
+ )
229
+
230
+ if exec_result.returncode == 0:
231
+ result.append("passed")
232
+ else:
233
+ if exec_result.stderr:
234
+ try:
235
+ err = exec_result.stderr.decode()
236
+ except:
237
+ err = exec_result.stderr
238
+ else:
239
+ try:
240
+ err = exec_result.stdout.decode()
241
+ except:
242
+ err = exec_result.stdout
243
+ result.append(f"failed: {err}")
244
+ except TimeoutException:
245
+ result.append("timed out")
246
+ # print(result[-1])
247
+ # print(sample["test_code"])
248
+ os.chdir(origin_path)
249
+ shutil.rmtree(tmp_dir)
250
+ elif "php" in language_type.lower():
251
+ import os
252
+ import shutil
253
+
254
+ origin_path = os.getcwd()
255
+ if "tmp" not in tmp_dir:
256
+ tmp_dir = os.path.join(tmp_dir, "tmp")
257
+ tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}")
258
+ if not os.path.exists(tmp_dir):
259
+ os.makedirs(tmp_dir)
260
+
261
+ os.chdir(tmp_dir)
262
+ open(f"test.php", "w").write(sample["test_code"])
263
+ try:
264
+ exec_result = None
265
+ with time_limit(timeout):
266
+ cmd = f"{php_exec}php -f test.php"
267
+ exec_result = subprocess.run(
268
+ cmd, timeout=timeout, capture_output=True, shell=True
269
+ )
270
+
271
+ if exec_result.returncode == 0:
272
+ result.append("passed")
273
+ else:
274
+ if exec_result.stderr:
275
+ try:
276
+ err = exec_result.stderr.decode()
277
+ except:
278
+ err = exec_result.stderr
279
+ else:
280
+ try:
281
+ err = exec_result.stdout.decode()
282
+ except:
283
+ err = exec_result.stdout
284
+ result.append(f"failed: {err}")
285
+ except TimeoutException:
286
+ result.append("timed out")
287
+ print(result[-1])
288
+ print(sample["test_code"])
289
+ os.chdir(origin_path)
290
+ shutil.rmtree(tmp_dir)
291
+ elif "sh" in language_type.lower():
292
+ import os
293
+ import shutil
294
+
295
+ origin_path = os.getcwd()
296
+ if "tmp" not in tmp_dir:
297
+ tmp_dir = os.path.join(tmp_dir, "tmp")
298
+ tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}")
299
+ if not os.path.exists(tmp_dir):
300
+ os.makedirs(tmp_dir)
301
+
302
+ os.chdir(tmp_dir)
303
+ open(f"test.sh", "w").write(sample["test_code"])
304
+ try:
305
+ exec_result = None
306
+ with time_limit(timeout):
307
+ cmd = "/bin/bash test.sh"
308
+ exec_result = subprocess.run(
309
+ cmd, timeout=10, capture_output=True, shell=True
310
+ )
311
+
312
+ if exec_result.returncode == 0:
313
+ result.append("passed")
314
+ else:
315
+ if exec_result.stderr:
316
+ try:
317
+ err = exec_result.stderr.decode()
318
+ except:
319
+ err = exec_result.stderr
320
+ else:
321
+ try:
322
+ err = exec_result.stdout.decode()
323
+ except:
324
+ err = exec_result.stdout
325
+ result.append(f"failed: {err}")
326
+ except TimeoutException:
327
+ result.append("timed out")
328
+ # print(result[-1])
329
+ # print(sample["test_code"])
330
+ os.chdir(origin_path)
331
+ shutil.rmtree(tmp_dir)
332
+ elif "ts" in language_type.lower():
333
+ import os
334
+ import shutil
335
+
336
+ origin_path = os.getcwd()
337
+ if "tmp" not in tmp_dir:
338
+ tmp_dir = os.path.join(tmp_dir, "tmp")
339
+ tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}")
340
+ if not os.path.exists(tmp_dir):
341
+ os.makedirs(tmp_dir)
342
+
343
+ os.chdir(tmp_dir)
344
+ env = {"PATH": f"{node_exec}:" + subprocess.os.environ["PATH"]}
345
+ open(f"test.ts", "w").write(sample["test_code"])
346
+ cmd = f"{tsc_exec}tsc test.ts --target ES2015 --lib ES2015,DOM"
347
+ compilation_result = subprocess.run(
348
+ cmd, timeout=timeout, capture_output=True, env=env, shell=True
349
+ )
350
+ if compilation_result.returncode != 0:
351
+ if compilation_result.stderr:
352
+ err = compilation_result.stderr.decode()
353
+ else:
354
+ err = compilation_result.stdout.decode()
355
+ result.append(f"failed: compilation error: {err}")
356
+ else:
357
+ try:
358
+ exec_result = None
359
+ with time_limit(timeout):
360
+ exec_result = subprocess.run(
361
+ [f"{node_exec}node", "test.js"],
362
+ timeout=timeout,
363
+ capture_output=True,
364
+ )
365
+
366
+ if exec_result.returncode == 0:
367
+ result.append("passed")
368
+ else:
369
+ if exec_result.stderr:
370
+ try:
371
+ err = exec_result.stderr.decode()
372
+ except:
373
+ err = exec_result.stderr
374
+ else:
375
+ try:
376
+ err = exec_result.stdout.decode()
377
+ except:
378
+ err = exec_result.stdout
379
+ result.append(f"failed: {err}")
380
+ except TimeoutException:
381
+ result.append("timed out")
382
+ if result[-1] != "passed":
383
+ env = {"PATH": f"{node_exec}:" + subprocess.os.environ["PATH"]}
384
+ cmd = f"{tsc_exec}tsc test.ts"
385
+ compilation_result = subprocess.run(
386
+ cmd, timeout=timeout, capture_output=True, env=env, shell=True
387
+ )
388
+ if compilation_result.returncode != 0:
389
+ if compilation_result.stderr:
390
+ err = compilation_result.stderr.decode()
391
+ else:
392
+ err = compilation_result.stdout.decode()
393
+ result[-1] = f"failed: compilation error: {err}"
394
+ else:
395
+ try:
396
+ exec_result = None
397
+ with time_limit(timeout):
398
+ exec_result = subprocess.run(
399
+ [f"{node_exec}node", "test.js"],
400
+ timeout=timeout,
401
+ capture_output=True,
402
+ )
403
+
404
+ if exec_result.returncode == 0:
405
+ result[-1] = "passed"
406
+ else:
407
+ if exec_result.stderr:
408
+ try:
409
+ err = exec_result.stderr.decode()
410
+ except:
411
+ err = exec_result.stderr
412
+ else:
413
+ try:
414
+ err = exec_result.stdout.decode()
415
+ except:
416
+ err = exec_result.stdout
417
+ result[-1] = f"failed: {err}"
418
+ except TimeoutException:
419
+ result[-1] = "timed out"
420
+
421
+ os.chdir(origin_path)
422
+ shutil.rmtree(tmp_dir)
423
+ elif "cs" in language_type.lower():
424
+ import os
425
+ import shutil
426
+
427
+ origin_path = os.getcwd()
428
+ if "tmp" not in tmp_dir:
429
+ tmp_dir = os.path.join(tmp_dir, "tmp")
430
+ tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}")
431
+ if not os.path.exists(tmp_dir):
432
+ os.makedirs(tmp_dir)
433
+ os.chdir(tmp_dir)
434
+ open(f"Program.cs", "w").write(sample["test_code"])
435
+ cmd = f"{cs_exec}mcs -d:DEBUG Program.cs"
436
+ compilation_result = subprocess.run(cmd, shell=True, capture_output=True)
437
+ if compilation_result.returncode != 0:
438
+ if compilation_result.stderr:
439
+ err = compilation_result.stderr.decode()
440
+ else:
441
+ err = compilation_result.stdout.decode()
442
+ result.append(f"failed: compilation error: {err}")
443
+ else:
444
+ try:
445
+ exec_result = None
446
+ cmd = f"{cs_exec}mono Program.exe"
447
+ env = dict(MONO_TRACE_LISTENER="Console.Error")
448
+ with time_limit(timeout):
449
+ exec_result = subprocess.run(
450
+ cmd,
451
+ timeout=timeout,
452
+ shell=True,
453
+ capture_output=True,
454
+ env=env,
455
+ )
456
+
457
+ if "Fail" not in exec_result.stderr.decode():
458
+ result.append("passed")
459
+ else:
460
+ if exec_result.stderr:
461
+ try:
462
+ err = exec_result.stderr.decode()
463
+ except:
464
+ err = exec_result.stderr
465
+ else:
466
+ try:
467
+ err = exec_result.stdout.decode()
468
+ except:
469
+ err = exec_result.stdout
470
+ result.append(f"failed: {err}")
471
+ except TimeoutException:
472
+ result.append("timed out")
473
+ except Exception as e:
474
+ result.append(f"failed: {e}")
475
+ os.chdir(origin_path)
476
+ shutil.rmtree(tmp_dir)
477
+ elif "rust" in language_type.lower():
478
+ import os
479
+
480
+ WD: str = os.path.dirname(os.path.abspath(__file__))
481
+ RUST_DIR: str = os.path.join(WD, "rust")
482
+ RUST_SRC: str = os.path.join(RUST_DIR, "src")
483
+ RUST_BIN: str = os.path.join(RUST_SRC, "bin")
484
+ RUST_TMP_DIR: str = os.path.join(RUST_DIR, "tmp")
485
+ RUST_LOGS: str = os.path.join(RUST_TMP_DIR, "logs")
486
+ RUST_EXT: str = ".rs"
487
+
488
+ # Create mandatory tmp directories
489
+ os.makedirs(RUST_TMP_DIR, exist_ok=True)
490
+ os.makedirs(RUST_LOGS, exist_ok=True)
491
+ os.makedirs(RUST_SRC, exist_ok=True)
492
+ os.makedirs(RUST_BIN, exist_ok=True)
493
+
494
+ with tempfile.NamedTemporaryFile(dir=RUST_BIN, delete=False) as f:
495
+ # temporal file name
496
+ file_prefix = sample["task_id"].lower().replace("/", "_")
497
+ file_name: str = file_prefix + RUST_EXT
498
+
499
+ os.rename(f.name, os.path.join(RUST_BIN, file_name))
500
+
501
+ # Sample to pure Rust function
502
+ rust_code: str = sample["test_code"]
503
+
504
+ # dump the rust source code in the target temporal file
505
+ f.write(rust_code.encode("utf-8"))
506
+
507
+ # Proceed towards Rust binaries compilation. Therefore move to Rust module root dir.
508
+ os.chdir(RUST_DIR)
509
+
510
+ # Two possible outcomes
511
+ # Pass OR Fail compilation
512
+ log_filename: str = file_prefix + ".jsonl"
513
+ log_path: str = os.path.join(RUST_LOGS, log_filename)
514
+ cargo_check: str = (
515
+ "cargo check --bin "
516
+ + file_prefix
517
+ + " --message-format json >> "
518
+ + log_path
519
+ )
520
+ # Compilation build status
521
+ returned_val_compilation: int
522
+
523
+ # Overwrite file content
524
+ if os.path.exists(log_path):
525
+ if (file_size := os.path.getsize(log_path)) >= 0:
526
+ os.remove(log_path)
527
+ returned_val_compilation = os.system(cargo_check)
528
+
529
+ else:
530
+ returned_val_compilation = os.system(cargo_check)
531
+
532
+ # 0 means success
533
+ if returned_val_compilation == 0:
534
+
535
+ # Execution pipeline
536
+ cargo_test: str = (
537
+ "cargo test --bin "
538
+ + file_prefix
539
+ + " --message-format json >> "
540
+ + log_path
541
+ )
542
+ returned_val_execution = os.system(cargo_test)
543
+
544
+ if returned_val_execution == 0:
545
+ result.append("passed")
546
+ else:
547
+ result.append(f"failed: execution error")
548
+
549
+ else:
550
+ result.append(f"failed: compilation error")
551
+
552
+ elif "java" in language_type.lower():
553
+ assert tmp_dir is not None, "Java should be evaluated in a temporary dir."
554
+
555
+ import os
556
+ import shutil
557
+
558
+ if "tmp" not in tmp_dir:
559
+ tmp_dir = os.path.join(tmp_dir, "tmp")
560
+ tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}")
561
+ if not os.path.exists(tmp_dir):
562
+ os.makedirs(tmp_dir)
563
+ open(os.path.join(tmp_dir, "Problem.java"), "w").write(sample["test_code"])
564
+ origin_path = os.getcwd()
565
+ os.system(f"cp ./javatuples-1.2.jar {tmp_dir}/")
566
+ os.chdir(tmp_dir)
567
+ res = "failed: unknown error"
568
+ compile_returncode = -1
569
+ for _ in range(5):
570
+ try:
571
+ cmd = f"{java_exec}javac -cp javatuples-1.2.jar Problem.java"
572
+ compilation_result = subprocess.run(
573
+ cmd, timeout=60, capture_output=True, shell=True
574
+ )
575
+ compile_returncode = compilation_result.returncode
576
+ break
577
+ except subprocess.TimeoutExpired as e:
578
+ continue
579
+ if compile_returncode != 0:
580
+ res = "failed: compilation error"
581
+ else:
582
+ exec_result = None
583
+ try:
584
+ # WARNING
585
+ # This program exists to execute untrusted model-generated code. Although
586
+ # it is highly unlikely that model-generated code will do something overtly
587
+ # malicious in response to this test suite, model-generated code may act
588
+ # destructively due to a lack of model capability or alignment.
589
+ # Users are strongly encouraged to sandbox this evaluation suite so that it
590
+ # does not perform destructive actions on their host or network.
591
+ # Once you have read this disclaimer and taken appropriate precautions,
592
+ # uncomment the following line and proceed at your own risk:
593
+ cmd = f"{java_exec}java -ea -cp .:javatuples-1.2.jar Problem"
594
+ exec_result = subprocess.run(
595
+ cmd, timeout=timeout, capture_output=True, shell=True
596
+ )
597
+ if exec_result.returncode == 0:
598
+ res = "passed"
599
+ elif exec_result.returncode == 1:
600
+ if "AssertionError" in exec_result.stderr.decode(
601
+ "unicode-escape"
602
+ ):
603
+ res = "failed: wrong answer"
604
+ else:
605
+ res = f"failed: {exec_result.stderr.decode()}"
606
+ except subprocess.TimeoutExpired as e:
607
+ res = "time out"
608
+ except BaseException as e:
609
+ res = f"failed: {e}"
610
+
611
+ result.append(res)
612
+ os.chdir(origin_path)
613
+ shutil.rmtree(tmp_dir)
614
+
615
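+     # Run the untrusted code in a child process; if it reports nothing within timeout + 1
+     # seconds, the process is killed and the sample is recorded as "timed out".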
+ manager = multiprocessing.Manager()
616
+ result = manager.list()
617
+
618
+ p = multiprocessing.Process(target=unsafe_execute, args=(tmp_dir,))
619
+ p.start()
620
+ p.join(timeout=timeout + 1)
621
+ if p.is_alive():
622
+ p.kill()
623
+
624
+ if not result:
625
+ result.append("timed out")
626
+
627
+ return {
628
+ "task_id": task_id,
629
+ "completion_id": completion_id,
630
+ "result": result[0],
631
+ "passed": result[0] == "passed",
632
+ "finish": -1 if "finish" not in sample else sample["finish"],
633
+ "test_code": sample["test_code"],
634
+ "prompt": sample["prompt"],
635
+ # "canonical_solution" : sample["canonical_solution"],
636
+ # "test" : sample["test"],
637
+ # "text" : sample["text"],
638
+ # "output" : sample["output"],
639
+ # "generation" : sample["generation"],
640
+ }
641
+
642
+
643
+ # Copyright (c) OpenAI (https://openai.com)
644
+
645
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
646
+ # of this software and associated documentation files (the "Software"), to deal
647
+ # in the Software without restriction, including without limitation the rights
648
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
649
+ # copies of the Software, and to permit persons to whom the Software is
650
+ # furnished to do so, subject to the following conditions:
651
+
652
+ # The above copyright notice and this permission notice shall be included in
653
+ # all copies or substantial portions of the Software.
654
+
655
+
656
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
657
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
658
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
659
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
660
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
661
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
662
+ # THE SOFTWARE.
663
+ # ============================================================================
664
+ @contextlib.contextmanager
665
+ def time_limit(seconds: float):
666
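+     # Relies on SIGALRM/setitimer, so it only works on Unix and in the main thread.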
+ def signal_handler(signum, frame):
667
+ raise TimeoutException("Timed out!")
668
+
669
+ signal.setitimer(signal.ITIMER_REAL, seconds)
670
+ signal.signal(signal.SIGALRM, signal_handler)
671
+ try:
672
+ yield
673
+ finally:
674
+ signal.setitimer(signal.ITIMER_REAL, 0)
675
+
676
+
677
+ @contextlib.contextmanager
678
+ def swallow_io():
679
+ stream = WriteOnlyStringIO()
680
+ with contextlib.redirect_stdout(stream):
681
+ with contextlib.redirect_stderr(stream):
682
+ with redirect_stdin(stream):
683
+ yield
684
+
685
+
686
+ @contextlib.contextmanager
687
+ def create_tempdir():
688
+ with tempfile.TemporaryDirectory() as dirname:
689
+ with chdir(dirname):
690
+ yield dirname
691
+
692
+
693
+ class TimeoutException(Exception):
694
+ pass
695
+
696
+
697
+ class WriteOnlyStringIO(io.StringIO):
698
+ """StringIO that throws an exception when it's read from"""
699
+
700
+ def read(self, *args, **kwargs):
701
+ raise IOError
702
+
703
+ def readline(self, *args, **kwargs):
704
+ raise IOError
705
+
706
+ def readlines(self, *args, **kwargs):
707
+ raise IOError
708
+
709
+ def readable(self, *args, **kwargs):
710
+ """Returns True if the IO object can be read."""
711
+ return False
712
+
713
+
714
+ class redirect_stdin(contextlib._RedirectStream): # type: ignore
715
+ _stream = "stdin"
716
+
717
+
718
+ @contextlib.contextmanager
719
+ def chdir(root):
720
+ if root == ".":
721
+ yield
722
+ return
723
+ cwd = os.getcwd()
724
+ os.chdir(root)
725
+ try:
726
+ yield
727
+ except BaseException as exc:
728
+ raise exc
729
+ finally:
730
+ os.chdir(cwd)
731
+
732
+
733
+ def reliability_guard(maximum_memory_bytes: Optional[int] = None):
734
+ """
735
+ This disables various destructive functions and prevents the generated code
736
+ from interfering with the test (e.g. fork bomb, killing other processes,
737
+ removing filesystem files, etc.)
738
+
739
+ WARNING
740
+ This function is NOT a security sandbox. Untrusted code, including, model-
741
+     This function is NOT a security sandbox. Untrusted code, including model-
742
+ Codex paper for more information about OpenAI's code sandbox, and proceed
743
+ with caution.
744
+ """
745
+
746
+ if maximum_memory_bytes is not None:
747
+ import resource
748
+
749
+ resource.setrlimit(
750
+ resource.RLIMIT_AS, (maximum_memory_bytes, maximum_memory_bytes)
751
+ )
752
+ resource.setrlimit(
753
+ resource.RLIMIT_DATA, (maximum_memory_bytes, maximum_memory_bytes)
754
+ )
755
+ if not platform.uname().system == "Darwin":
756
+ resource.setrlimit(
757
+ resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes)
758
+ )
759
+
760
+ faulthandler.disable()
761
+
762
+ import builtins
763
+
764
+ builtins.exit = None
765
+ builtins.quit = None
766
+
767
+ import os
768
+
769
+ os.environ["OMP_NUM_THREADS"] = "1"
770
+
771
+ os.kill = None
772
+ os.system = None
773
+ os.putenv = None
774
+ os.remove = None
775
+ os.removedirs = None
776
+ os.rmdir = None
777
+ os.fchdir = None
778
+ os.setuid = None
779
+ os.fork = None
780
+ os.forkpty = None
781
+ os.killpg = None
782
+ os.rename = None
783
+ os.renames = None
784
+ os.truncate = None
785
+ os.replace = None
786
+ os.unlink = None
787
+ os.fchmod = None
788
+ os.fchown = None
789
+ os.chmod = None
790
+ os.chown = None
791
+ os.chroot = None
792
+ os.fchdir = None
793
+ os.lchflags = None
794
+ os.lchmod = None
795
+ os.lchown = None
796
+ os.getcwd = None
797
+ os.chdir = None
798
+
799
+ import shutil
800
+
801
+ shutil.rmtree = None
802
+ shutil.move = None
803
+ shutil.chown = None
804
+
805
+ import subprocess
806
+
807
+ subprocess.Popen = None # type: ignore
808
+
809
+ __builtins__["help"] = None
810
+
811
+ import sys
812
+
813
+ sys.modules["ipdb"] = None
814
+ sys.modules["joblib"] = None
815
+ sys.modules["resource"] = None
816
+ sys.modules["psutil"] = None
817
+ sys.modules["tkinter"] = None
evaluation/general_benchmarks/HumanEval/humaneval.py ADDED
@@ -0,0 +1,217 @@
1
+ import datetime
2
+ import json
3
+ import multiprocessing
4
+ import os
5
+ import re
6
+ import string
7
+ import subprocess
8
+ import time
9
+
10
+ import numpy as np
11
+ import torch
12
+ import torch.distributed as dist
13
+ # from attrdict import AttrDict
14
+ from human_eval.evaluation import evaluate_functional_correctness
15
+ from transformers import AutoTokenizer
16
+ from utils.dataset import HumanEvalDataset
17
+ from utils.utils import cleanup_code
18
+
19
+
20
+ class HumanEval:
21
+ """
22
+ HumanEval evaluation class.
23
+ """
24
+
25
+ def __init__(
26
+ self,
27
+ data_root,
28
+ max_seq_len=2048,
29
+ language="python",
30
+ max_gen_len=200,
31
+ batch_size=512,
32
+ log_dir=None,
33
+ temperature=0,
34
+ issft=False,
35
+ top_p=0.95,
36
+ model_name="",
37
+ inference_increment=True,
38
+ tokenizer_cfg=None,
39
+ n_sample=40,
40
+ k_sample=1,
41
+ ):
42
+ self.data_root = data_root
43
+ self.max_seq_len = max_seq_len
44
+ self.max_gen_len = max_gen_len
45
+ self.batch_size = batch_size
46
+ self.k = k_sample
47
+ self.n_sample = n_sample
48
+ self.language = language
49
+ self.log_dir = log_dir
50
+ self.sft = issft
51
+ self.temperature = temperature
52
+ self.top_p = top_p
53
+ self.model_name = tokenizer_cfg["model_path"].replace("/", "_")
54
+ self.inference_increment = inference_increment
55
+ os.makedirs(self.log_dir, exist_ok=True)
56
+ tokenizer_cls = tokenizer_cfg.pop("cls")
57
+ try:
58
+ self.tokenizer = AutoTokenizer.from_pretrained(
59
+ tokenizer_cfg.pop("model_path"), trust_remote_code=True
60
+ )
61
+ except Exception as e:
62
+ print(e)
63
+             raise
64
+
65
+ @torch.no_grad()
66
+ def eval_model(self, gpt, accelerator):
67
+ """
68
+ Evaluate the model on HumanEval.
69
+ """
70
+ assert (
71
+ self.log_dir is not None
72
+ ), "log_dir should not be None when evaluating humaneval"
73
+ dataset = HumanEvalDataset(
74
+ self.data_root,
75
+ sample_num=self.n_sample,
76
+ language=self.language,
77
+ issft=self.sft,
78
+ )
79
+ nprompt = len(dataset) // self.n_sample
80
+ dp_rank = accelerator.process_index
81
+ dp_size = accelerator.num_processes
82
+ if self.k > 1:
83
+ assert self.n_sample >= 100, "HumanEval PASS@100 needs n_sample >= 100"
84
+ gpt.eval()
85
+ # each process will process a subset of the dataset
86
+ prompt_indices_split = np.array_split(range(nprompt), dp_size)
87
+ prompt_indices = prompt_indices_split[dp_rank]
88
+ indices = [
89
+ x * self.n_sample + j for x in prompt_indices for j in range(self.n_sample)
90
+ ]
91
+ all_num = len(indices)
92
+ processed_num = 0
93
+ log_file = os.path.join(
94
+ self.log_dir,
95
+ f"{self.model_name}_rank{dp_rank}_bs{self.batch_size}_shot_log_{self.language}.json",
96
+ )
97
+ tmpfile = open(log_file, "w")
98
+ start_time = time.time()
99
+ # split the dataset into batches and construct a list of inputs
100
+ for idx in range(0, len(indices), self.batch_size):
101
+ prompt_list = []
102
+ prompt_lens = []
103
+ orriginal_prompt_list = []
104
+ tokenized_prompt_lens = []
105
+ taskid = []
106
+ # get the prompts from the dataset
107
+ for j in indices[idx : idx + self.batch_size]:
108
+ data = dataset[j]
109
+ fprompt = data["prompt"].strip()
110
+ prompt_list.append(fprompt)
111
+ tmp = self.tokenizer.encode(fprompt)
112
+ orriginal_prompt_list.append(data["original_prompt"])
113
+ prompt_lens.append(len(fprompt))
114
+ tokenized_prompt_lens.append(tmp)
115
+ taskid.append(data["task_id"])
116
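+             # NOTE: torch.tensor over a list of token-id lists assumes every prompt in the
+             # batch tokenizes to the same length; ragged batches would require padding.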
+ input_ids = torch.tensor(tokenized_prompt_lens).to(accelerator.device)
117
+ # generate the code
118
+ if self.temperature != 0:
119
+ decoded = gpt.generate(
120
+ input_ids=input_ids,
121
+ max_new_tokens=self.max_gen_len,
122
+ do_sample=True,
123
+ eos_token_id=self.tokenizer.eos_token_id,
124
+ temperature=self.temperature,
125
+ top_p=self.top_p,
126
+ pad_token_id=self.tokenizer.eos_token_id,
127
+ )
128
+ else:
129
+ decoded = gpt.generate(
130
+ input_ids=input_ids,
131
+ max_new_tokens=self.max_gen_len,
132
+ do_sample=False,
133
+ eos_token_id=self.tokenizer.eos_token_id,
134
+ pad_token_id=self.tokenizer.eos_token_id,
135
+ )
136
+ # save the results to a file
137
+ for local_idx, text in enumerate(decoded):
138
+ prediction = decoded[local_idx]
139
+ prediction = self.tokenizer.decode(prediction, skip_special_tokens=True)
140
+ suffixprediction = prediction[prompt_lens[local_idx] :]
141
+ suffixprediction = cleanup_code(
142
+ suffixprediction,
143
+ self.language,
144
+ "humaneval",
145
+ self.sft,
146
+ dataset.stopwords,
147
+ )
148
+ # sft mode does not need original prompt
149
+ if not self.sft:
150
+ suffixprediction = (
151
+ orriginal_prompt_list[local_idx] + "\n" + suffixprediction
152
+ )
153
+ res = {
154
+ "task_id": taskid[local_idx],
155
+ "generation": suffixprediction,
156
+ "prompt": orriginal_prompt_list[local_idx],
157
+ "wholecode": prediction,
158
+ }
159
+ tmpfile.write(json.dumps(res) + "\n")
160
+ tmpfile.flush()
161
+ processed_num += 1
162
+ self.log_score(dp_rank, processed_num, all_num, start_time, self.batch_size)
163
+ tmpfile.close()
164
+ accelerator.wait_for_everyone()
165
+ # calculate the final score of pass@k
166
+ self._calculate_final_score(accelerator)
167
+ accelerator.wait_for_everyone()
168
+ return
169
+
170
+ def log_score(self, dp_rank, processed_num, all_num, start_time, bs):
171
+ """
172
+         Log generation progress, throughput, and memory usage.
173
+ """
174
+ mem = torch.cuda.max_memory_allocated() / (1 << 30)
175
+ avg_time = (time.time() - start_time) / processed_num * bs
176
+ print(
177
+ f"DP RANK:{dp_rank} process_num/all_num:{int(processed_num)}/{all_num} "
178
+ f"avg_time_per_batch:{avg_time:.2f} s "
179
+ f"still_need:{((all_num - processed_num) // bs + 1) * avg_time / 60:.2f} m",
180
+ f"mem:{mem:.3f} GiB bs:{bs}",
181
+ flush=True,
182
+ )
183
+ if processed_num == all_num:
184
+ print(
185
+ f"EVAL DONE! Process time {(time.time() - start_time) / 60:.2f} m",
186
+ flush=True,
187
+ )
188
+
189
+ def _calculate_final_score(self, accelerator):
190
+ """
191
+ Calculate the final score.
192
+ """
193
+ if accelerator.is_local_main_process:
194
+ logfilepath = os.path.join(self.log_dir, f"final_{self.model_name}.jsonl")
195
+ logfile = open(logfilepath, "w")
196
+ for i in range(accelerator.num_processes):
197
+ tmplogfile = os.path.join(
198
+ self.log_dir,
199
+ f"{self.model_name}_rank{i}_bs{self.batch_size}_shot_log_{self.language}.json",
200
+ )
201
+ logfile.write(open(tmplogfile).read().strip() + "\n")
202
+ os.remove(tmplogfile)
203
+ logfile.close()
204
+ timeout = 10
205
+ runlang = self.language
206
+ res = evaluate_functional_correctness(
207
+ input_file=logfilepath,
208
+ problem_file=os.path.join(
209
+ self.data_root, f"humaneval-{self.language}.jsonl"
210
+ ),
211
+ tmp_dir=self.log_dir,
212
+ timeout=timeout,
213
+ language=runlang,
214
+ )
215
+ print("score is", res["pass@%d" % self.k])
216
+ os.remove(logfilepath)
217
+ return
evaluation/general_benchmarks/HumanEval/javatuples-1.2.jar ADDED
Binary file (65.5 kB).
 
evaluation/general_benchmarks/HumanEval/test_config.yaml ADDED
@@ -0,0 +1,15 @@
1
+ compute_environment: LOCAL_MACHINE
2
+ distributed_type: MULTI_GPU
3
+ downcast_bf16: 'no'
4
+ gpu_ids: all
5
+ machine_rank: 0
6
+ main_training_function: main
7
+ mixed_precision: 'no'
8
+ num_machines: 1
9
+ num_processes: 3
10
+ rdzv_backend: static
11
+ same_network: true
12
+ tpu_env: []
13
+ tpu_use_cluster: false
14
+ tpu_use_sudo: false
15
+ use_cpu: false
evaluation/general_benchmarks/HumanEval/utils/dataset.py ADDED
@@ -0,0 +1,72 @@
1
+ import json
2
+ import os
3
+
4
+ import numpy as np
5
+
6
+
7
+ class HumanEvalDataset:
8
+
9
+ def __init__(self, root, sample_num=1, language="python", issft=False):
10
+ """
11
+ root: the path to the HumanEval dataset
12
+ sample_num: the number of samples for each prompt
13
+ language: the language of the HumanEval dataset
14
+ issft: whether to use the SFT setting
15
+ """
16
+ self.root = root
17
+ self.data = open(
18
+ os.path.join(self.root, f"humaneval-{language}.jsonl")
19
+ ).readlines()
20
+
21
+ tmp = self.get_qa_only_data(self.data, issft)
22
+ self.clean_data = []
23
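+         # Duplicate each prompt sample_num times so that n samples are generated per task (for pass@k).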
+ for i in range(len(tmp)):
24
+ for j in range(sample_num):
25
+ self.clean_data.append(tmp[i])
26
+ self.stopwords = self.clean_data[0]["stopwords"]
27
+ np.random.seed(1234)
28
+ print(f"Read HumanEval from {root}, number of samples {len(self.clean_data)}")
29
+
30
+ def get_qa_only_data(self, data_json, sft=False):
31
+ """
32
+ data_json: the jsonl file of HumanEval
33
+ sft: whether to use the SFT setting
34
+ return: a list of dict, each dict contains the prompt, task_id and stopwords
35
+ """
36
+ ans = []
37
+ for line in data_json:
38
+ line = json.loads(line)
39
+ prompt = line["prompt"].strip()
40
+ if "prefix" in line:
41
+ origin_prompt = line["prefix"]
42
+ else:
43
+ origin_prompt = line["prompt"]
44
+
45
+ if sft:
46
+ prompt = f"""Below is an instruction that describes a task, paired with an input that provides further context.\nWrite a response that appropriately completes the request.\n\n### Instruction:\nWrite a program to perform the given task.\n\nInput:\n{prompt}\n\n### Response:\n"""
47
+ if "stop_tokens" in line:
48
+ s = line["stop_tokens"]
49
+ else:
50
+ s = []
51
+ ans.append(
52
+ {
53
+ "prompt": prompt,
54
+ "task_id": line["task_id"],
55
+ "original_prompt": origin_prompt,
56
+ "stopwords": s,
57
+ }
58
+ )
59
+ return ans
60
+
61
+ def __len__(self):
62
+ """
63
+ return the number of samples in the dataset
64
+ """
65
+ return len(self.clean_data)
66
+
67
+ def __getitem__(self, index):
68
+ """
69
+ return the sample at index
70
+ """
71
+ sample = self.clean_data[index]
72
+ return sample
evaluation/general_benchmarks/HumanEval/utils/utils.py ADDED
@@ -0,0 +1,161 @@
1
+ import re
2
+
3
+ languge_settings = {
4
+ "python": {
5
+ "full_name": "Python",
6
+ "indent": 4,
7
+ },
8
+ "cpp": {
9
+ "full_name": "cpp",
10
+ "indent": 0,
11
+ "main": "int main()",
12
+ },
13
+ "java": {
14
+ "full_name": "Java",
15
+ "indent": 4,
16
+ "main": "public static void main",
17
+ },
18
+ "cs": {
19
+ "full_name": "csharp",
20
+ "indent": 0,
21
+ "main": "public static void Main",
22
+ },
23
+ "php": {
24
+ "full_name": "PHP",
25
+ "indent": 0,
26
+ },
27
+ "ts": {
28
+ "full_name": "TypeScript",
29
+ "indent": 0,
30
+ },
31
+ "js": {"full_name": "JavaScript", "indent": 0},
32
+ "sh": {"full_name": "Bash", "indent": 0},
33
+ }
34
+
35
+
36
+ def get_function_name(question: str, lang: str):
37
+ func_lines = [x for x in question.strip().split("\n") if x.strip()]
38
+
39
+ if lang.lower() == "python":
40
+ func_idx = [
41
+ i for i in range(len(func_lines)) if func_lines[i].startswith("def ")
42
+ ][-1]
43
+ func_name = func_lines[func_idx].split("(")[0].strip()
44
+ func_prefix = "\n".join(func_lines[:func_idx])
45
+ return func_name, func_prefix
46
+
47
+ func_name = func_lines[-1].split("{")[0].strip()
48
+ func_prefix = "\n".join(func_lines[:-1])
49
+ return func_name, func_prefix
50
+
51
+
52
+ def extract_generation_code(example: str, lang_code: str, verbose: bool = False):
53
+ task_id = example["task_id"]
54
+ output = example.get("output", example.get("gpt_completion"))
55
+ question = example["prompt"].strip()
56
+ setting = languge_settings[lang_code]
57
+ lang = setting["full_name"]
58
+ indent = setting["indent"]
59
+
60
+ try:
61
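+         # Take the first fenced code block whose language tag matches (case-insensitive).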
+ code_block: str = re.findall(
62
+ f"```{lang.lower()}\n(.*?)```", output, re.DOTALL | re.IGNORECASE
63
+ )[0]
64
+ if verbose:
65
+ print(">>> Task: {}\n{}".format(task_id, code_block))
66
+
67
+ # Remove main
68
+ if setting.get("main", None) and setting["main"] in code_block:
69
+ main_start = code_block.index(setting["main"])
70
+ code_block = code_block[:main_start]
71
+
72
+ func_name, func_prefix = get_function_name(question, lang)
73
+
74
+ try:
75
+ start = code_block.lower().index(func_name.lower())
76
+ indent = 0
77
+ while start - indent >= 0 and code_block[start - indent - 1] == " ":
78
+ indent += 1
79
+
80
+ try:
81
+ end = code_block.rindex("\n" + " " * indent + "}")
82
+ except:
83
+ end = len(code_block)
84
+ except:
85
+ start = 0
86
+ try:
87
+ end = code_block.rindex("\n" + " " * indent + "}")
88
+ except:
89
+ end = len(code_block)
90
+
91
+ body = code_block[start:end]
92
+
93
+ if lang_code.lower() in ["php", "ts", "js"]:
94
+ body += "\n" + " " * indent + "}"
95
+
96
+ generation = func_prefix + "\n" + body + "\n"
97
+ example["generation"] = generation
98
+
99
+ except Exception as ex:
100
+ print(
101
+ "Failed to extract code block with error `{}`:\n>>> Task: {}\n>>> Output:\n{}".format(
102
+ ex, task_id, output
103
+ )
104
+ )
105
+ example["generation"] = example["prompt"] + "\n" + output
106
+
107
+ return example
108
+
109
+
110
+ def cleanup_code(
111
+ code: str,
112
+ language_type: str = None,
113
+ dataset: str = None,
114
+ issft: bool = False,
115
+ stop_words=[],
116
+ ):
117
+ """
118
+ Cleans up the generated code.
119
+ """
120
+
121
+ if language_type.lower() == "python":
122
+ if issft:
123
+ code = _clean_python_code_for_sft(code)
124
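+         # Truncate at the first top-level construct that appears after the completed function body.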
+ stop_words = ["\ndef", "\nclass", "\nif", "\n#", "\nprint"]
125
+ code = _truncate_code_at_stopwords(code, stop_words)
126
+ elif language_type.lower() == "ts":
127
+ code = _truncate_code_at_stopwords(
128
+ code,
129
+ stop_words
130
+ + [
131
+ "\nexport",
132
+ "\nimport",
133
+ "\nexport default",
134
+ "\nimport default",
135
+ "\nconsole.log",
136
+ ],
137
+ )
138
+ else:
139
+ code = _truncate_code_at_stopwords(code, stop_words)
140
+
141
+ return code
142
+
143
+
144
+ def _clean_python_code_for_sft(code):
145
+ code = code.replace("\r", "")
146
+ if "```python" in code:
147
+ code_start_idx = code.index("```python")
148
+ code = code[code_start_idx:].replace("```python", "").strip()
149
+ end_idx = code.find("```") if "```" in code else len(code)
150
+ code = code[:end_idx].strip()
151
+
152
+ return code
153
+
154
+
155
+ def _truncate_code_at_stopwords(code, stop_words):
156
+ min_stop_idx = len(code)
157
+ for stop_word in stop_words:
158
+ stop_index = code.find(stop_word)
159
+ if 0 <= stop_index < min_stop_idx:
160
+ min_stop_idx = stop_index
161
+ return code[:min_stop_idx]
evaluation/general_benchmarks/MATH/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Zhibin Gou
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
evaluation/general_benchmarks/MATH/README.md ADDED
@@ -0,0 +1,52 @@
1
+ ### Requirements
2
+ You can install the required packages with the following commands:
3
+ ```bash
4
+ cd latex2sympy
5
+ pip install -e .
6
+ cd ..
7
+ pip install -r requirements.txt
8
+ pip install vllm==0.5.1 --no-build-isolation
9
+ pip install transformers==4.42.3
10
+ ```
11
+
12
+ ### Evaluation
13
+ You can evaluate the Qwen2.5/Qwen2-Math-Instruct series models with the following commands:
14
+ ```bash
15
+ # Qwen2.5-Math-Instruct Series
16
+ PROMPT_TYPE="qwen25-math-cot"
17
+ # Qwen2.5-Math-1.5B-Instruct
18
+ export CUDA_VISIBLE_DEVICES="0"
19
+ MODEL_NAME_OR_PATH="Qwen/Qwen2.5-Math-1.5B-Instruct"
20
+ bash sh/eval.sh $PROMPT_TYPE $MODEL_NAME_OR_PATH
21
+
22
+ # Qwen2.5-Math-7B-Instruct
23
+ export CUDA_VISIBLE_DEVICES="0"
24
+ MODEL_NAME_OR_PATH="Qwen/Qwen2.5-Math-7B-Instruct"
25
+ bash sh/eval.sh $PROMPT_TYPE $MODEL_NAME_OR_PATH
26
+
27
+ # Qwen2.5-Math-72B-Instruct
28
+ export CUDA_VISIBLE_DEVICES="0,1,2,3"
29
+ MODEL_NAME_OR_PATH="Qwen/Qwen2.5-Math-72B-Instruct"
30
+ bash sh/eval.sh $PROMPT_TYPE $MODEL_NAME_OR_PATH
31
+
32
+
33
+ # Qwen2-Math-Instruct Series
34
+ PROMPT_TYPE="qwen-boxed"
35
+ # Qwen2-Math-1.5B-Instruct
36
+ export CUDA_VISIBLE_DEVICES="0"
37
+ MODEL_NAME_OR_PATH="Qwen/Qwen2-Math-1.5B-Instruct"
38
+ bash sh/eval.sh $PROMPT_TYPE $MODEL_NAME_OR_PATH
39
+
40
+ # Qwen2-Math-7B-Instruct
41
+ export CUDA_VISIBLE_DEVICES="0"
42
+ MODEL_NAME_OR_PATH="Qwen/Qwen2-Math-7B-Instruct"
43
+ bash sh/eval.sh $PROMPT_TYPE $MODEL_NAME_OR_PATH
44
+
45
+ # Qwen2-Math-72B-Instruct
46
+ export CUDA_VISIBLE_DEVICES="0,1,2,3"
47
+ MODEL_NAME_OR_PATH="Qwen/Qwen2-Math-72B-Instruct"
48
+ bash sh/eval.sh $PROMPT_TYPE $MODEL_NAME_OR_PATH
49
+ ```
50
+
51
+ ## Acknowledgement
52
+ The codebase is adapted from [math-evaluation-harness](https://github.com/ZubinGou/math-evaluation-harness).
evaluation/general_benchmarks/MATH/data/aime24/test.jsonl ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af2b8bd2aa911b6333ad0df32f3ca05c7ae8ed10f1731f4372c8ae26990bf7ac
3
+ size 156944