TwT-6 committed on
Commit
256a159
1 Parent(s): 31ac772

Upload 2667 files

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. opencompass-my-api/.codespellrc +5 -0
  2. opencompass-my-api/.gitignore +129 -0
  3. opencompass-my-api/.owners.yml +14 -0
  4. opencompass-my-api/.pre-commit-config-zh-cn.yaml +96 -0
  5. opencompass-my-api/.pre-commit-config.yaml +96 -0
  6. opencompass-my-api/.readthedocs.yml +14 -0
  7. opencompass-my-api/LICENSE +203 -0
  8. opencompass-my-api/README.md +520 -0
  9. opencompass-my-api/README_zh-CN.md +522 -0
  10. opencompass-my-api/build/lib/opencompass/__init__.py +1 -0
  11. opencompass-my-api/build/lib/opencompass/datasets/FinanceIQ.py +39 -0
  12. opencompass-my-api/build/lib/opencompass/datasets/GaokaoBench.py +132 -0
  13. opencompass-my-api/build/lib/opencompass/datasets/NPHardEval/__init__.py +9 -0
  14. opencompass-my-api/build/lib/opencompass/datasets/NPHardEval/cmp_GCP_D.py +161 -0
  15. opencompass-my-api/build/lib/opencompass/datasets/NPHardEval/cmp_KSP.py +183 -0
  16. opencompass-my-api/build/lib/opencompass/datasets/NPHardEval/cmp_TSP_D.py +150 -0
  17. opencompass-my-api/build/lib/opencompass/datasets/NPHardEval/hard_GCP.py +189 -0
  18. opencompass-my-api/build/lib/opencompass/datasets/NPHardEval/hard_MSP.py +203 -0
  19. opencompass-my-api/build/lib/opencompass/datasets/NPHardEval/hard_TSP.py +211 -0
  20. opencompass-my-api/build/lib/opencompass/datasets/NPHardEval/p_BSP.py +124 -0
  21. opencompass-my-api/build/lib/opencompass/datasets/NPHardEval/p_EDP.py +145 -0
  22. opencompass-my-api/build/lib/opencompass/datasets/NPHardEval/p_SPP.py +196 -0
  23. opencompass-my-api/build/lib/opencompass/datasets/NPHardEval/prompts.py +96 -0
  24. opencompass-my-api/build/lib/opencompass/datasets/NPHardEval/utils.py +43 -0
  25. opencompass-my-api/build/lib/opencompass/datasets/OpenFinData.py +47 -0
  26. opencompass-my-api/build/lib/opencompass/datasets/TheoremQA.py +38 -0
  27. opencompass-my-api/build/lib/opencompass/datasets/advglue.py +174 -0
  28. opencompass-my-api/build/lib/opencompass/datasets/afqmcd.py +21 -0
  29. opencompass-my-api/build/lib/opencompass/datasets/agieval/__init__.py +3 -0
  30. opencompass-my-api/build/lib/opencompass/datasets/agieval/agieval.py +99 -0
  31. opencompass-my-api/build/lib/opencompass/datasets/agieval/constructions.py +104 -0
  32. opencompass-my-api/build/lib/opencompass/datasets/agieval/dataset_loader.py +392 -0
  33. opencompass-my-api/build/lib/opencompass/datasets/agieval/evaluation.py +43 -0
  34. opencompass-my-api/build/lib/opencompass/datasets/agieval/math_equivalence.py +161 -0
  35. opencompass-my-api/build/lib/opencompass/datasets/agieval/post_process.py +198 -0
  36. opencompass-my-api/build/lib/opencompass/datasets/agieval/utils.py +43 -0
  37. opencompass-my-api/build/lib/opencompass/datasets/anli.py +18 -0
  38. opencompass-my-api/build/lib/opencompass/datasets/anthropics_evals.py +63 -0
  39. opencompass-my-api/build/lib/opencompass/datasets/arc.py +84 -0
  40. opencompass-my-api/build/lib/opencompass/datasets/ax.py +24 -0
  41. opencompass-my-api/build/lib/opencompass/datasets/base.py +28 -0
  42. opencompass-my-api/build/lib/opencompass/datasets/bbh.py +98 -0
  43. opencompass-my-api/build/lib/opencompass/datasets/boolq.py +56 -0
  44. opencompass-my-api/build/lib/opencompass/datasets/bustum.py +21 -0
  45. opencompass-my-api/build/lib/opencompass/datasets/c3.py +80 -0
  46. opencompass-my-api/build/lib/opencompass/datasets/cb.py +25 -0
  47. opencompass-my-api/build/lib/opencompass/datasets/ceval.py +76 -0
  48. opencompass-my-api/build/lib/opencompass/datasets/chid.py +43 -0
  49. opencompass-my-api/build/lib/opencompass/datasets/cibench.py +511 -0
  50. opencompass-my-api/build/lib/opencompass/datasets/circular.py +373 -0
opencompass-my-api/.codespellrc ADDED
@@ -0,0 +1,5 @@
+ [codespell]
+ skip = *.ipynb
+ count =
+ quiet-level = 3
+ ignore-words-list = nd, ans, ques, rouge, softwares, wit
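When invoked from the repository root, `codespell` should pick this file up automatically (config-file discovery of `.codespellrc` is standard codespell behavior in recent versions; treat the exact version threshold as an assumption). A minimal sketch of a local spell check:

```bash
# Install codespell and run it from the repo root;
# it reads skip/ignore settings from .codespellrc.
pip install codespell
codespell
```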
opencompass-my-api/.gitignore ADDED
@@ -0,0 +1,129 @@
+
+ output_*/
+ outputs/
+ icl_inference_output/
+ .vscode/
+ tmp/
+ configs/eval_subjective_alignbench_test.py
+ configs/openai_key.py
+ configs/secrets.py
+ configs/datasets/log.json
+ configs/eval_debug*.py
+ configs/viz_*.py
+ data
+ work_dirs
+ models/*
+ configs/internal/
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.ipynb
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ .hypothesis/
+ .pytest_cache/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ .idea
+
+ # Auto-generated documentation
+ docs/en/_build/
+ docs/zh_cn/_build/
+
+ # .zip
+ *.zip
+
+ # sft config ignore list
+ configs/sft_cfg/*B_*
+ configs/sft_cfg/1B/*
+ configs/sft_cfg/7B/*
+ configs/sft_cfg/20B/*
+ configs/sft_cfg/60B/*
+ configs/sft_cfg/100B/*
+
+ configs/cky/
+ # in case llama is cloned into opencompass
+ llama/
+
+ # in case ilagent is cloned into opencompass
+ ilagent/
+
+ # ignore the config file for criticbench evaluation
+ configs/sft_cfg/criticbench_eval/*
+
+ # path of turbomind's model after running `lmdeploy.serve.turbomind.deploy`
+ turbomind/
+
+ # cibench output
+ *.db
+ *.pth
+ *.pt
+ *.onnx
+ *.gz
+ *.gz.*
+ *.png
+ *.txt
+ *.jpg
+ *.json
+ *.csv
+ *.npy
+ *.c
+
+ # aliyun
+ core.*
opencompass-my-api/.owners.yml ADDED
@@ -0,0 +1,14 @@
+ assign:
+   issues: enabled
+   pull_requests: disabled
+   strategy:
+     # random
+     daily-shift-based
+   schedule:
+     '*/1 * * * *'
+   assignees:
+     - Leymore
+     - bittersweet1999
+     - yingfhu
+     - kennymckormick
+     - tonysy
opencompass-my-api/.pre-commit-config-zh-cn.yaml ADDED
@@ -0,0 +1,96 @@
+ exclude: |
+     (?x)^(
+         tests/data/|
+         opencompass/models/internal/|
+         opencompass/utils/internal/|
+         opencompass/openicl/icl_evaluator/hf_metrics/|
+         opencompass/datasets/lawbench/utils|
+         opencompass/datasets/lawbench/evaluation_functions/|
+         opencompass/datasets/medbench/|
+         opencompass/datasets/teval/|
+         opencompass/datasets/NPHardEval/|
+         docs/zh_cn/advanced_guides/compassbench_intro.md
+     )
+ repos:
+   - repo: https://gitee.com/openmmlab/mirrors-flake8
+     rev: 5.0.4
+     hooks:
+       - id: flake8
+         exclude: configs/
+   - repo: https://gitee.com/openmmlab/mirrors-isort
+     rev: 5.11.5
+     hooks:
+       - id: isort
+         exclude: configs/
+   - repo: https://gitee.com/openmmlab/mirrors-yapf
+     rev: v0.32.0
+     hooks:
+       - id: yapf
+         exclude: configs/
+   - repo: https://gitee.com/openmmlab/mirrors-codespell
+     rev: v2.2.1
+     hooks:
+       - id: codespell
+         exclude: |
+             (?x)^(
+                 .*\.jsonl|
+                 configs/
+             )
+   - repo: https://gitee.com/openmmlab/mirrors-pre-commit-hooks
+     rev: v4.3.0
+     hooks:
+       - id: trailing-whitespace
+         exclude: |
+             (?x)^(
+                 dicts/|
+                 projects/.*?/dicts/|
+                 configs/
+             )
+       - id: check-yaml
+       - id: end-of-file-fixer
+         exclude: |
+             (?x)^(
+                 dicts/|
+                 projects/.*?/dicts/|
+                 configs/
+             )
+       - id: requirements-txt-fixer
+       - id: double-quote-string-fixer
+         exclude: configs/
+       - id: check-merge-conflict
+       - id: fix-encoding-pragma
+         args: ["--remove"]
+       - id: mixed-line-ending
+         args: ["--fix=lf"]
+       - id: mixed-line-ending
+         args: ["--fix=lf"]
+   - repo: https://gitee.com/openmmlab/mirrors-mdformat
+     rev: 0.7.9
+     hooks:
+       - id: mdformat
+         args: ["--number", "--table-width", "200"]
+         additional_dependencies:
+           - mdformat-openmmlab
+           - mdformat_frontmatter
+           - linkify-it-py
+         exclude: configs/
+   - repo: https://gitee.com/openmmlab/mirrors-docformatter
+     rev: v1.3.1
+     hooks:
+       - id: docformatter
+         args: ["--in-place", "--wrap-descriptions", "79"]
+   - repo: local
+     hooks:
+       - id: update-dataset-suffix
+         name: dataset suffix updater
+         entry: ./tools/update_dataset_suffix.py
+         language: script
+         pass_filenames: true
+         require_serial: true
+         files: ^configs/datasets
+ # - repo: https://github.com/open-mmlab/pre-commit-hooks
+ #   rev: v0.2.0  # Use the ref you want to point at
+ #   hooks:
+ #     - id: check-algo-readme
+ #     - id: check-copyright
+ #       args: ["mmocr", "tests", "tools"]  # these directories will be checked
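This file mirrors `.pre-commit-config.yaml` below but points at Gitee mirrors of the same hooks, presumably for contributors without reliable GitHub access. Since pre-commit only reads `.pre-commit-config.yaml` by default, a sketch of selecting this alternate file explicitly (`--config` is standard pre-commit behavior):

```bash
pip install pre-commit
# Run every hook over the whole tree using the Gitee-mirrored configuration
pre-commit run --all-files --config .pre-commit-config-zh-cn.yaml
```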
opencompass-my-api/.pre-commit-config.yaml ADDED
@@ -0,0 +1,96 @@
+ exclude: |
+     (?x)^(
+         tests/data/|
+         opencompass/models/internal/|
+         opencompass/utils/internal/|
+         opencompass/openicl/icl_evaluator/hf_metrics/|
+         opencompass/datasets/lawbench/utils|
+         opencompass/datasets/lawbench/evaluation_functions/|
+         opencompass/datasets/medbench/|
+         opencompass/datasets/teval/|
+         opencompass/datasets/NPHardEval/|
+         docs/zh_cn/advanced_guides/compassbench_intro.md
+     )
+ repos:
+   - repo: https://github.com/PyCQA/flake8
+     rev: 5.0.4
+     hooks:
+       - id: flake8
+         exclude: configs/
+   - repo: https://github.com/PyCQA/isort
+     rev: 5.11.5
+     hooks:
+       - id: isort
+         exclude: configs/
+   - repo: https://github.com/pre-commit/mirrors-yapf
+     rev: v0.32.0
+     hooks:
+       - id: yapf
+         exclude: configs/
+   - repo: https://github.com/codespell-project/codespell
+     rev: v2.2.1
+     hooks:
+       - id: codespell
+         exclude: |
+             (?x)^(
+                 .*\.jsonl|
+                 configs/
+             )
+   - repo: https://github.com/pre-commit/pre-commit-hooks
+     rev: v4.3.0
+     hooks:
+       - id: trailing-whitespace
+         exclude: |
+             (?x)^(
+                 dicts/|
+                 projects/.*?/dicts/|
+                 configs/
+             )
+       - id: check-yaml
+       - id: end-of-file-fixer
+         exclude: |
+             (?x)^(
+                 dicts/|
+                 projects/.*?/dicts/|
+                 configs/
+             )
+       - id: requirements-txt-fixer
+       - id: double-quote-string-fixer
+         exclude: configs/
+       - id: check-merge-conflict
+       - id: fix-encoding-pragma
+         args: ["--remove"]
+       - id: mixed-line-ending
+         args: ["--fix=lf"]
+       - id: mixed-line-ending
+         args: ["--fix=lf"]
+   - repo: https://github.com/executablebooks/mdformat
+     rev: 0.7.9
+     hooks:
+       - id: mdformat
+         args: ["--number", "--table-width", "200"]
+         additional_dependencies:
+           - mdformat-openmmlab
+           - mdformat_frontmatter
+           - linkify-it-py
+         exclude: configs/
+   - repo: https://github.com/myint/docformatter
+     rev: v1.3.1
+     hooks:
+       - id: docformatter
+         args: ["--in-place", "--wrap-descriptions", "79"]
+   - repo: local
+     hooks:
+       - id: update-dataset-suffix
+         name: dataset suffix updater
+         entry: ./tools/update_dataset_suffix.py
+         language: script
+         pass_filenames: true
+         require_serial: true
+         files: ^configs/datasets
+ # - repo: https://github.com/open-mmlab/pre-commit-hooks
+ #   rev: v0.2.0  # Use the ref you want to point at
+ #   hooks:
+ #     - id: check-algo-readme
+ #     - id: check-copyright
+ #       args: ["mmocr", "tests", "tools"]  # these directories will be checked
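A typical developer workflow with this configuration, as a sketch using standard pre-commit commands:

```bash
pip install pre-commit
pre-commit install          # register the git hook so checks run on every commit
pre-commit run --all-files  # or lint the whole tree once, on demand
```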
opencompass-my-api/.readthedocs.yml ADDED
@@ -0,0 +1,14 @@
+ version: 2
+
+ # Set the version of Python and other tools you might need
+ build:
+   os: ubuntu-22.04
+   tools:
+     python: "3.8"
+
+ formats:
+   - epub
+
+ python:
+   install:
+     - requirements: requirements/docs.txt
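Read the Docs drives the hosted build from this file; the same docs can be built locally with Sphinx. A sketch, assuming a standard Sphinx layout with `conf.py` under `docs/en/` (the output path below matches the `docs/en/_build/` entry in `.gitignore`):

```bash
pip install -r requirements/docs.txt
sphinx-build docs/en docs/en/_build/html
```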
opencompass-my-api/LICENSE ADDED
@@ -0,0 +1,203 @@
+ Copyright 2020 OpenCompass Authors. All rights reserved.
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright 2020 OpenCompass Authors.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
opencompass-my-api/README.md ADDED
@@ -0,0 +1,520 @@
+ <div align="center">
+ <img src="docs/en/_static/image/logo.svg" width="500px"/>
+ <br />
+ <br />
+
+ [![docs](https://readthedocs.org/projects/opencompass/badge)](https://opencompass.readthedocs.io/en)
+ [![license](https://img.shields.io/github/license/InternLM/opencompass.svg)](https://github.com/open-compass/opencompass/blob/main/LICENSE)
+
+ <!-- [![PyPI](https://badge.fury.io/py/opencompass.svg)](https://pypi.org/project/opencompass/) -->
+
+ [🌐Website](https://opencompass.org.cn/) |
+ [📘Documentation](https://opencompass.readthedocs.io/en/latest/) |
+ [🛠️Installation](https://opencompass.readthedocs.io/en/latest/get_started/installation.html) |
+ [🤔Reporting Issues](https://github.com/open-compass/opencompass/issues/new/choose)
+
+ English | [简体中文](README_zh-CN.md)
+
+ </div>
+
+ <p align="center">
+ 👋 join us on <a href="https://discord.gg/KKwfEbFj7U" target="_blank">Discord</a> and <a href="https://r.vansin.top/?r=opencompass" target="_blank">WeChat</a>
+ </p>
+
+ ## 📣 OpenCompass 2023 LLM Annual Leaderboard
+
+ We are honored to have witnessed the tremendous progress of artificial general intelligence together with the community in the past year, and we are very pleased that **OpenCompass** has been able to help numerous developers and users.
+
+ We are announcing the launch of the **OpenCompass 2023 LLM Annual Leaderboard**. We expect to release the annual leaderboard in January 2024, systematically evaluating the performance of LLMs across capabilities such as language, knowledge, reasoning, creation, long text, and agents.
+
+ At that time, we will release rankings for both open-source models and commercial API models, aiming to provide a comprehensive, objective, and neutral reference for the industry and research community.
+
+ We sincerely invite all large models to join OpenCompass and showcase their performance advantages in different fields. At the same time, we welcome researchers and developers to provide valuable suggestions and contributions to jointly promote the development of LLMs. If you have any questions or needs, please feel free to [contact us](mailto:[email protected]). In addition, the relevant evaluation contents, performance statistics, and evaluation methods will be open-sourced along with the leaderboard release.
+
+ More details on CompassBench 2023 are provided in the [documentation](docs/zh_cn/advanced_guides/compassbench_intro.md).
+
+ Let's look forward to the release of the OpenCompass 2023 LLM Annual Leaderboard!
+
+ ## 🧭 Welcome
+
+ to **OpenCompass**!
+
+ Just like a compass guides us on our journey, OpenCompass will guide you through the complex landscape of evaluating large language models. With its powerful algorithms and intuitive interface, OpenCompass makes it easy to assess the quality and effectiveness of your NLP models.
+
+ 🚩🚩🚩 Explore opportunities at OpenCompass! We're currently **hiring full-time researchers/engineers and interns**. If you're passionate about LLMs and OpenCompass, don't hesitate to reach out to us via [email](mailto:[email protected]). We'd love to hear from you!
+
+ 🔥🔥🔥 We are delighted to announce that **OpenCompass has been recommended by Meta AI**; see the [Get Started](https://ai.meta.com/llama/get-started/#validation) page of Llama for more information.
+
+ > **Attention**<br />
+ > We have launched the OpenCompass Collaboration project; contributions of diverse evaluation benchmarks to OpenCompass are welcome!
+ > Click this [Issue](https://github.com/open-compass/opencompass/issues/248) for more information.
+ > Let's work together to build a more powerful OpenCompass toolkit!
+
+ ## 🚀 What's New <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>
+
+ - **\[2024.01.17\]** We now support the evaluation of [InternLM2](https://github.com/open-compass/opencompass/blob/main/configs/eval_internlm2_keyset.py) and [InternLM2-Chat](https://github.com/open-compass/opencompass/blob/main/configs/eval_internlm2_chat_keyset.py); InternLM2 showed extremely strong performance in these tests, welcome to try it! 🔥🔥🔥
+ - **\[2024.01.17\]** We now support the needle-in-a-haystack test with multiple needles; more information can be found [here](https://opencompass.readthedocs.io/en/latest/advanced_guides/needleinahaystack_eval.html#id8). 🔥🔥🔥
+ - **\[2023.12.28\]** We have enabled seamless evaluation of all models developed using [LLaMA2-Accessory](https://github.com/Alpha-VLLM/LLaMA2-Accessory), a powerful toolkit for comprehensive LLM development. 🔥🔥🔥
+ - **\[2023.12.22\]** We have released [T-Eval](https://github.com/open-compass/T-Eval), a step-by-step evaluation benchmark to gauge your LLMs on tool utilization. Welcome to our [Leaderboard](https://open-compass.github.io/T-Eval/leaderboard.html) for more details! 🔥🔥🔥
+ - **\[2023.12.10\]** We have released [VLMEvalKit](https://github.com/open-compass/VLMEvalKit), a toolkit for evaluating vision-language models (VLMs), currently supporting 20+ VLMs and 7 multi-modal benchmarks (including the MMBench series).
+ - **\[2023.12.10\]** We now support Mistral AI's MoE LLM **Mixtral-8x7B-32K**. Welcome to [MixtralKit](https://github.com/open-compass/MixtralKit) for more details about inference and evaluation.
+
+ > [More](docs/en/notes/news.md)
+
+ ## ✨ Introduction
+
+ ![image](https://github.com/open-compass/opencompass/assets/22607038/f45fe125-4aed-4f8c-8fe8-df4efb41a8ea)
+
+ OpenCompass is a one-stop platform for large model evaluation, aiming to provide a fair, open, and reproducible benchmark for large model evaluation. Its main features include:
+
+ - **Comprehensive support for models and datasets**: Out-of-the-box support for 20+ HuggingFace and API models, and an evaluation scheme covering 70+ datasets with about 400,000 questions, comprehensively evaluating model capabilities along five dimensions.
+
+ - **Efficient distributed evaluation**: A single command implements task division and distributed evaluation, completing the full evaluation of billion-scale models in just a few hours.
+
+ - **Diversified evaluation paradigms**: Support for zero-shot, few-shot, and chain-of-thought evaluations, combined with standard or dialogue-type prompt templates, to easily elicit the maximum performance of various models.
+
+ - **Modular design with high extensibility**: Want to add new models or datasets, customize an advanced task division strategy, or even support a new cluster management system? Everything about OpenCompass can be easily expanded!
+
+ - **Experiment management and reporting mechanism**: Use config files to fully record each experiment, with support for real-time reporting of results.
+
+ ## 📊 Leaderboard
+
+ We provide the [OpenCompass Leaderboard](https://opencompass.org.cn/rank) for the community to rank all public models and API models. If you would like to join the evaluation, please provide the model repository URL or a standard API interface to the email address `[email protected]`.
+
+ <p align="right"><a href="#top">🔝Back to top</a></p>
+
+ ## 🛠️ Installation
+
+ Below are the steps for quick installation and dataset preparation.
+
+ ### 💻 Environment Setup
+
+ #### Open-source Models with GPU
+
+ ```bash
+ conda create --name opencompass python=3.10 pytorch torchvision pytorch-cuda -c nvidia -c pytorch -y
+ conda activate opencompass
+ git clone https://github.com/open-compass/opencompass opencompass
+ cd opencompass
+ pip install -e .
+ ```
+
+ #### API Models with CPU-only
+
+ ```bash
+ conda create -n opencompass python=3.10 pytorch torchvision torchaudio cpuonly -c pytorch -y
+ conda activate opencompass
+ git clone https://github.com/open-compass/opencompass opencompass
+ cd opencompass
+ pip install -e .
+ # if needed, also install the API model dependencies via `pip install -r requirements/api.txt`
+ ```
+
+ ### 📂 Data Preparation
+
+ ```bash
+ # Download the dataset to the data/ folder
+ wget https://github.com/open-compass/opencompass/releases/download/0.2.2.rc1/OpenCompassData-core-20240207.zip
+ unzip OpenCompassData-core-20240207.zip
+ ```
+
+ Some third-party features, like HumanEval and Llama, may require additional steps to work properly; for detailed steps please refer to the [Installation Guide](https://opencompass.readthedocs.io/en/latest/get_started/installation.html).
+
+ <p align="right"><a href="#top">🔝Back to top</a></p>
+
+ ## 🏗️ Evaluation
+
+ After ensuring that OpenCompass is installed correctly according to the above steps and the datasets are prepared, you can evaluate the performance of the LLaMA-7b model on the MMLU and C-Eval datasets using the following command:
+
+ ```bash
+ python run.py --models hf_llama_7b --datasets mmlu_ppl ceval_ppl
+ ```
+
+ OpenCompass has predefined configurations for many models and datasets. You can list all available model and dataset configurations using the [tools](./docs/en/tools.md#list-configs).
+
+ ```bash
+ # List all configurations
+ python tools/list_configs.py
+ # List all configurations related to llama and mmlu
+ python tools/list_configs.py llama mmlu
+ ```
+
+ You can also evaluate other HuggingFace models via the command line. Taking LLaMA-7b as an example:
+
+ ```bash
+ python run.py --datasets ceval_ppl mmlu_ppl \
+ --hf-path huggyllama/llama-7b \  # HuggingFace model path
+ --model-kwargs device_map='auto' \  # Arguments for model construction
+ --tokenizer-kwargs padding_side='left' truncation='left' use_fast=False \  # Arguments for tokenizer construction
+ --max-out-len 100 \  # Maximum number of tokens generated
+ --max-seq-len 2048 \  # Maximum sequence length the model can accept
+ --batch-size 8 \  # Batch size
+ --no-batch-padding \  # Don't enable batch padding; infer through a for loop to avoid performance loss
+ --num-gpus 1  # Minimum number of required GPUs
+ ```
+
+ > **Note**<br />
+ > To run the command above, you will need to remove the comments starting with `# ` first.
+
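For convenience, here is the same command with the inline comments stripped, ready to paste into a shell (flags and values exactly as above):

```bash
python run.py --datasets ceval_ppl mmlu_ppl \
    --hf-path huggyllama/llama-7b \
    --model-kwargs device_map='auto' \
    --tokenizer-kwargs padding_side='left' truncation='left' use_fast=False \
    --max-out-len 100 \
    --max-seq-len 2048 \
    --batch-size 8 \
    --no-batch-padding \
    --num-gpus 1
```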
+ Through the command line or configuration files, OpenCompass also supports evaluating APIs or custom models, as well as more diversified evaluation strategies. Please read the [Quick Start](https://opencompass.readthedocs.io/en/latest/get_started/quick_start.html) to learn how to run an evaluation task.
+
+ <p align="right"><a href="#top">🔝Back to top</a></p>
+
+ ## 📖 Dataset Support
+
+ <table align="center">
+ <tbody>
+ <tr align="center" valign="bottom">
+ <td>
+ <b>Language</b>
+ </td>
+ <td>
+ <b>Knowledge</b>
+ </td>
+ <td>
+ <b>Reasoning</b>
+ </td>
+ <td>
+ <b>Examination</b>
+ </td>
+ </tr>
+ <tr valign="top">
+ <td>
+ <details open>
+ <summary><b>Word Definition</b></summary>
+
+ - WiC
+ - SummEdits
+
+ </details>
+
+ <details open>
+ <summary><b>Idiom Learning</b></summary>
+
+ - CHID
+
+ </details>
+
+ <details open>
+ <summary><b>Semantic Similarity</b></summary>
+
+ - AFQMC
+ - BUSTM
+
+ </details>
+
+ <details open>
+ <summary><b>Coreference Resolution</b></summary>
+
+ - CLUEWSC
+ - WSC
+ - WinoGrande
+
+ </details>
+
+ <details open>
+ <summary><b>Translation</b></summary>
+
+ - Flores
+ - IWSLT2017
+
+ </details>
+
+ <details open>
+ <summary><b>Multi-language Question Answering</b></summary>
+
+ - TyDi-QA
+ - XCOPA
+
+ </details>
+
+ <details open>
+ <summary><b>Multi-language Summary</b></summary>
+
+ - XLSum
+
+ </details>
+ </td>
+ <td>
+ <details open>
+ <summary><b>Knowledge Question Answering</b></summary>
+
+ - BoolQ
+ - CommonSenseQA
+ - NaturalQuestions
+ - TriviaQA
+
+ </details>
+ </td>
+ <td>
+ <details open>
+ <summary><b>Textual Entailment</b></summary>
+
+ - CMNLI
+ - OCNLI
+ - OCNLI_FC
+ - AX-b
+ - AX-g
+ - CB
+ - RTE
+ - ANLI
+
+ </details>
+
+ <details open>
+ <summary><b>Commonsense Reasoning</b></summary>
+
+ - StoryCloze
+ - COPA
+ - ReCoRD
+ - HellaSwag
+ - PIQA
+ - SIQA
+
+ </details>
+
+ <details open>
+ <summary><b>Mathematical Reasoning</b></summary>
+
+ - MATH
+ - GSM8K
+
+ </details>
+
+ <details open>
+ <summary><b>Theorem Application</b></summary>
+
+ - TheoremQA
+ - StrategyQA
+ - SciBench
+
+ </details>
+
+ <details open>
+ <summary><b>Comprehensive Reasoning</b></summary>
+
+ - BBH
+
+ </details>
+ </td>
+ <td>
+ <details open>
+ <summary><b>Junior High, High School, University, Professional Examinations</b></summary>
+
+ - C-Eval
+ - AGIEval
+ - MMLU
+ - GAOKAO-Bench
+ - CMMLU
+ - ARC
+ - Xiezhi
+
+ </details>
+
+ <details open>
+ <summary><b>Medical Examinations</b></summary>
+
+ - CMB
+
+ </details>
+ </td>
+ </tr>
+ </tbody>
+ <tbody>
+ <tr align="center" valign="bottom">
+ <td>
+ <b>Understanding</b>
+ </td>
+ <td>
+ <b>Long Context</b>
+ </td>
+ <td>
+ <b>Safety</b>
+ </td>
+ <td>
+ <b>Code</b>
+ </td>
+ </tr>
+ <tr valign="top">
+ <td>
+ <details open>
+ <summary><b>Reading Comprehension</b></summary>
+
+ - C3
+ - CMRC
+ - DRCD
+ - MultiRC
+ - RACE
+ - DROP
+ - OpenBookQA
+ - SQuAD2.0
+
+ </details>
+
+ <details open>
+ <summary><b>Content Summary</b></summary>
+
+ - CSL
+ - LCSTS
+ - XSum
+ - SummScreen
+
+ </details>
+
+ <details open>
+ <summary><b>Content Analysis</b></summary>
+
+ - EPRSTMT
+ - LAMBADA
+ - TNEWS
+
+ </details>
+ </td>
+ <td>
+ <details open>
+ <summary><b>Long Context Understanding</b></summary>
+
+ - LEval
+ - LongBench
+ - GovReports
+ - NarrativeQA
+ - Qasper
+
+ </details>
+ </td>
+ <td>
+ <details open>
+ <summary><b>Safety</b></summary>
+
+ - CivilComments
+ - CrowsPairs
+ - CValues
+ - JigsawMultilingual
+ - TruthfulQA
+
+ </details>
+ <details open>
+ <summary><b>Robustness</b></summary>
+
+ - AdvGLUE
+
+ </details>
+ </td>
+ <td>
+ <details open>
+ <summary><b>Code</b></summary>
+
+ - HumanEval
+ - HumanEvalX
+ - MBPP
+ - APPs
+ - DS1000
+
+ </details>
+ </td>
+ </tr>
+ </tbody>
+ </table>
+
+ ## OpenCompass Ecosystem
+
+ <p align="right"><a href="#top">🔝Back to top</a></p>
+
+ ## 📖 Model Support
+
+ <table align="center">
+ <tbody>
+ <tr align="center" valign="bottom">
+ <td>
+ <b>Open-source Models</b>
+ </td>
+ <td>
+ <b>API Models</b>
+ </td>
+ <!-- <td>
+ <b>Custom Models</b>
+ </td> -->
+ </tr>
+ <tr valign="top">
+ <td>
+
+ - [InternLM](https://github.com/InternLM/InternLM)
+ - [LLaMA](https://github.com/facebookresearch/llama)
+ - [Vicuna](https://github.com/lm-sys/FastChat)
+ - [Alpaca](https://github.com/tatsu-lab/stanford_alpaca)
+ - [Baichuan](https://github.com/baichuan-inc)
+ - [WizardLM](https://github.com/nlpxucan/WizardLM)
+ - [ChatGLM2](https://github.com/THUDM/ChatGLM2-6B)
+ - [ChatGLM3](https://github.com/THUDM/ChatGLM3-6B)
+ - [TigerBot](https://github.com/TigerResearch/TigerBot)
+ - [Qwen](https://github.com/QwenLM/Qwen)
+ - [BlueLM](https://github.com/vivo-ai-lab/BlueLM)
+ - ...
+
+ </td>
+ <td>
+
+ - OpenAI
+ - Claude
+ - ZhipuAI(ChatGLM)
+ - Baichuan
+ - ByteDance(YunQue)
+ - Huawei(PanGu)
+ - 360
+ - Baidu(ERNIEBot)
+ - MiniMax(ABAB-Chat)
+ - SenseTime(nova)
+ - Xunfei(Spark)
+ - ...
+
+ </td>
+
+ </tr>
+ </tbody>
+ </table>
+
+ <p align="right"><a href="#top">🔝Back to top</a></p>
+
+ ## 🔜 Roadmap
+
+ - [ ] Subjective Evaluation
+   - [ ] Release CompassArena
+   - [ ] Subjective evaluation dataset
+ - [x] Long-context
+   - [ ] Long-context evaluation with extensive datasets
+   - [ ] Long-context leaderboard
+ - [ ] Coding
+   - [ ] Coding evaluation leaderboard
+   - [x] Non-python language evaluation service
+ - [ ] Agent
+   - [ ] Support various agent frameworks
+   - [ ] Evaluation of LLM tool use
+ - [x] Robustness
+   - [x] Support various attack methods
+
+ ## 👷‍♂️ Contributing
+
+ We appreciate all contributions to improving OpenCompass. Please refer to the [contributing guideline](https://opencompass.readthedocs.io/en/latest/notes/contribution_guide.html) for best practices.
+
+ ## 🤝 Acknowledgements
+
+ Some code in this project is adapted from [OpenICL](https://github.com/Shark-NLP/OpenICL).
+
+ Some datasets and prompt implementations are modified from [chain-of-thought-hub](https://github.com/FranxYao/chain-of-thought-hub) and [instruct-eval](https://github.com/declare-lab/instruct-eval).
+
+ ## 🖊️ Citation
+
+ ```bibtex
+ @misc{2023opencompass,
+     title={OpenCompass: A Universal Evaluation Platform for Foundation Models},
+     author={OpenCompass Contributors},
+     howpublished = {\url{https://github.com/open-compass/opencompass}},
+     year={2023}
+ }
+ ```
+
+ <p align="right"><a href="#top">🔝Back to top</a></p>
opencompass-my-api/README_zh-CN.md ADDED
@@ -0,0 +1,522 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <div align="center">
2
+ <img src="docs/zh_cn/_static/image/logo.svg" width="500px"/>
3
+ <br />
4
+ <br />
5
+
6
+ [![docs](https://readthedocs.org/projects/opencompass/badge)](https://opencompass.readthedocs.io/zh_CN)
7
+ [![license](https://img.shields.io/github/license/InternLM/opencompass.svg)](https://github.com/open-compass/opencompass/blob/main/LICENSE)
8
+
9
+ <!-- [![PyPI](https://badge.fury.io/py/opencompass.svg)](https://pypi.org/project/opencompass/) -->
10
+
11
+ [🌐Website](https://opencompass.org.cn/) |
12
+ [📘Documentation](https://opencompass.readthedocs.io/zh_CN/latest/index.html) |
13
+ [🛠️Installation](https://opencompass.readthedocs.io/zh_CN/latest/get_started/installation.html) |
14
+ [🤔Reporting Issues](https://github.com/open-compass/opencompass/issues/new/choose)
15
+
16
+ [English](/README.md) | 简体中文
17
+
18
+ </div>
19
+
20
+ <p align="center">
21
+ 👋 加入我们的 <a href="https://discord.gg/KKwfEbFj7U" target="_blank">Discord</a> 和 <a href="https://r.vansin.top/?r=opencompass" target="_blank">微信社区</a>
22
+ </p>
23
+
24
+ ## 📣 2023 年度榜单计划
25
+
26
+ 我们有幸与社区共同见证了通用人工智能在过去一年里的巨大进展,也非常高兴OpenCompass能够帮助广大大模型开发者和使用者。
27
+
28
+ 我们宣布将启动**OpenCompass 2023年度大模型榜单**发布计划。我们预计将于2024年1月发布大模型年度榜单,系统性评估大模型在语言、知识、推理、创作、长文本和智能体等多个能力维度的表现。
29
+
30
+ 届时,我们将发布开源模型和商业API模型能力榜单,以期为业界提供一份**全面、客观、中立**的参考。
31
+
32
+ 我们诚挚邀请各类大模型接入OpenCompass评测体系,以展示其在各个领域的性能优势。同时,也欢迎广大研究者、开发者向我们提供宝贵的意见和建议,共同推动大模型领域的发展。如有任何问题或需求,请随时[联系我们](mailto:[email protected])。此外,相关评测内容,性能数据,评测方法也将随榜单发布一并开源。
33
+
34
+ 我们提供了本次评测所使用的部分题目示例,详情请见[CompassBench 2023](docs/zh_cn/advanced_guides/compassbench_intro.md).
35
+
36
+ <p>让我们共同期待OpenCompass 2023年度大模型榜单的发布,期待各大模型在榜单上的精彩表现!</p>
37
+
38
+ ## 🧭 欢迎
39
+
40
+ 来到**OpenCompass**!
41
+
42
+ 就像指南针在我们的旅程中为我们导航一样,我们希望OpenCompass能够帮助你穿越评估大型语言模型的重重迷雾。OpenCompass提供丰富的算法和功能支持,期待OpenCompass能够帮助社区更便捷地对NLP模型的性能进行公平全面的评估。
43
+
44
+ 🚩🚩🚩 欢迎加入 OpenCompass!我们目前**招聘全职研究人员/工程师和实习生**。如果您对 LLM 和 OpenCompass 充满热情,请随时通过[电子邮件](mailto:[email protected])与我们联系。我们非常期待与您交流!
45
+
46
+ 🔥🔥🔥 祝贺 **OpenCompass 作为大模型标准测试工具被Meta AI官方推荐**, 点击 Llama 的 [入门文档](https://ai.meta.com/llama/get-started/#validation) 获取更多信息.
47
+
48
+ > **注意**<br />
49
+ > 我们正式启动 OpenCompass 共建计划,诚邀社区用户为 OpenCompass 提供更具代表性和可信度的客观评测数据集!
50
+ > 点击 [Issue](https://github.com/open-compass/opencompass/issues/248) 获取更多数据集.
51
+ > 让我们携手共进,打造功能强大易用的大模型评测平台!
52
+
53
+ ## 🚀 最新进展 <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>
54
+
55
+ - **\[2024.01.17\]** 我们支持了 [InternLM2](https://github.com/open-compass/opencompass/blob/main/configs/eval_internlm2_chat_keyset.py) 和 [InternLM2-Chat](https://github.com/open-compass/opencompass/blob/main/configs/eval_internlm2_chat_keyset.py) 的相关评测,InternLM2 在这些测试中表现出非常强劲的性能,欢迎试用!🔥🔥🔥.
56
+ - **\[2024.01.17\]** 我们支持了多根针版本的大海捞针测试,更多信息见[这里](https://opencompass.readthedocs.io/zh-cn/latest/advanced_guides/needleinahaystack_eval.html#id8)🔥🔥🔥.
57
+ - **\[2023.12.28\]** 我们支持了对使用[LLaMA2-Accessory](https://github.com/Alpha-VLLM/LLaMA2-Accessory)(一款强大的LLM开发工具箱)开发的所有模型的无缝评估! 🔥🔥🔥.
58
+ - **\[2023.12.22\]** 我们开源了[T-Eval](https://github.com/open-compass/T-Eval)用于评测大语言模型工具调用能力。欢迎访问T-Eval的官方[Leaderboard](https://open-compass.github.io/T-Eval/leaderboard.html)获取更多信息! 🔥🔥🔥.
59
+ - **\[2023.12.10\]** 我们开源了多模评测框架 [VLMEvalKit](https://github.com/open-compass/VLMEvalKit),目前已支持 20+ 个多模态大模型与包括 MMBench 系列在内的 7 个多模态评测集.
60
+ - **\[2023.12.10\]** 我们已经支持了Mistral AI的MoE模型 **Mixtral-8x7B-32K**。欢迎查阅[MixtralKit](https://github.com/open-compass/MixtralKit)以获取更多关于推理和评测的详细信息.
61
+
62
+ > [更多](docs/zh_cn/notes/news.md)
63
+
64
+ ## ✨ 介绍
65
+
66
+ ![image](https://github.com/open-compass/opencompass/assets/22607038/30bcb2e2-3969-4ac5-9f29-ad3f4abb4f3b)
67
+
68
+ OpenCompass 是面向大模型评测的一站式平台。其主要特点如下:
69
+
70
+ - **开源可复现**:提供公平、公开、可复现的大模型评测方案
71
+
72
+ - **全面的能力维度**:五大维度设计,提供 70+ 个数据集约 40 万题的的模型评测方案,全面评估模型能力
73
+
74
+ - **丰富的模型支持**:已支持 20+ HuggingFace 及 API 模型
75
+
76
+ - **分布式高效评测**:一行命令实现任务分割和分布式评测,数小时即可完成千亿模型全量评测
77
+
78
+ - **多样化评测范式**:支持零样本、小样本及思维链评测,结合标准型或对话型提示词模板,轻松激发各种模型最大性能
79
+
80
+ - **灵活化拓展**:想增加新模型或数据集?想要自定义更高级的任务分割策略,甚至接入新的集群管理系统?OpenCompass 的一切均可轻松扩展!
81
+
82
+ ## 📊 性能榜单
83
+
84
+ 我们将陆续提供开源模型和API模型的具体性能榜单,请见 [OpenCompass Leaderboard](https://opencompass.org.cn/rank) 。如需加入评测,请提供模型仓库地址或标准的 API 接口至邮箱 `[email protected]`.
85
+
86
+ <p align="right"><a href="#top">🔝返回顶部</a></p>
87
+
88
+ ## 🛠️ 安装
89
+
90
+ 下面展示了快速安装以及准备数据集的步骤。
91
+
92
+ ### 💻 环境配置
93
+
94
+ #### 面向开源模型的GPU环境
95
+
96
+ ```bash
97
+ conda create --name opencompass python=3.10 pytorch torchvision pytorch-cuda -c nvidia -c pytorch -y
98
+ conda activate opencompass
99
+ git clone https://github.com/open-compass/opencompass opencompass
100
+ cd opencompass
101
+ pip install -e .
102
+ ```
103
+
104
+ #### 面向API模型测试的CPU环境
105
+
106
+ ```bash
107
+ conda create -n opencompass python=3.10 pytorch torchvision torchaudio cpuonly -c pytorch -y
108
+ conda activate opencompass
109
+ git clone https://github.com/open-compass/opencompass opencompass
110
+ cd opencompass
111
+ pip install -e .
112
+ # 如果需要使用各个API模型,请 `pip install -r requirements/api.txt` 安装API模型的相关依赖
113
+ ```
114
+
115
+ ### 📂 数据准备
116
+
117
+ ```bash
118
+ # 下载数据集到 data/ 处
119
+ wget https://github.com/open-compass/opencompass/releases/download/0.2.2.rc1/OpenCompassData-core-20240207.zip
120
+ unzip OpenCompassData-core-20240207.zip
121
+ ```
122
+
123
+ 有部分第三方功能,如 Humaneval 以及 Llama,可能需要额外步骤才能正常运行,详细步骤请参考[安装指南](https://opencompass.readthedocs.io/zh_CN/latest/get_started/installation.html)。
124
+
125
+ <p align="right"><a href="#top">🔝返回顶部</a></p>
126
+
127
+ ## 🏗️ ️评测
128
+
129
+ 确保按照上述步骤正确安装 OpenCompass 并准备好数据集后,可以通过以下命令评测 LLaMA-7b 模型在 MMLU 和 C-Eval 数据集上的性能:
130
+
131
+ ```bash
132
+ python run.py --models hf_llama_7b --datasets mmlu_ppl ceval_ppl
133
+ ```
134
+
135
+ OpenCompass 预定义了许多模型和数据集的配置,你可以通过 [工具](./docs/zh_cn/tools.md#ListConfigs) 列出所有可用的模型和数据集配置。
136
+
137
+ ```bash
138
+ # 列出所有配置
139
+ python tools/list_configs.py
140
+ # 列出所有跟 llama 及 mmlu 相关的配置
141
+ python tools/list_configs.py llama mmlu
142
+ ```
143
+
144
+ 你也可以通过命令行去评测其它 HuggingFace 模型。同样以 LLaMA-7b 为例:
145
+
146
+ ```bash
147
+ python run.py --datasets ceval_ppl mmlu_ppl \
148
+ --hf-path huggyllama/llama-7b \ # HuggingFace 模型地址
149
+ --model-kwargs device_map='auto' \ # 构造 model 的参数
150
+ --tokenizer-kwargs padding_side='left' truncation='left' use_fast=False \ # 构造 tokenizer 的参数
151
+ --max-out-len 100 \ # 最长生成 token 数
152
+ --max-seq-len 2048 \ # 模型能接受的最大序列长度
153
+ --batch-size 8 \ # 批次大小
154
+ --no-batch-padding \ # 不打开 batch padding,通过 for loop 推理,避免精度损失
155
+ --num-gpus 1 # 运行该模型所需的最少 gpu 数
156
+ ```
157
+
158
+ > **注意**<br />
159
+ > 若需要运行上述命令,你需要删除所有从 `# ` 开始的注释。
160
+
161
+ 通过命令行或配置文件,OpenCompass 还支持评测 API 或自定义模型,以及更多样化的评测策略。请阅读[快速开始](https://opencompass.readthedocs.io/zh_CN/latest/get_started/quick_start.html)了解如何运行一个评测任务。
162
+
163
+ 更多教程请查看我们的[文档](https://opencompass.readthedocs.io/zh_CN/latest/index.html)。
164
+
165
+ <p align="right"><a href="#top">🔝返回顶部</a></p>
166
+
167
+ ## 📖 数据集支持
168
+
169
+ <table align="center">
170
+ <tbody>
171
+ <tr align="center" valign="bottom">
172
+ <td>
173
+ <b>语言</b>
174
+ </td>
175
+ <td>
176
+ <b>知识</b>
177
+ </td>
178
+ <td>
179
+ <b>推理</b>
180
+ </td>
181
+ <td>
182
+ <b>考试</b>
183
+ </td>
184
+ </tr>
185
+ <tr valign="top">
186
+ <td>
187
+ <details open>
188
+ <summary><b>字词释义</b></summary>
189
+
190
+ - WiC
191
+ - SummEdits
192
+
193
+ </details>
194
+
195
+ <details open>
196
+ <summary><b>成语习语</b></summary>
197
+
198
+ - CHID
199
+
200
+ </details>
201
+
202
+ <details open>
203
+ <summary><b>语义相似度</b></summary>
204
+
205
+ - AFQMC
206
+ - BUSTM
207
+
208
+ </details>
209
+
210
+ <details open>
211
+ <summary><b>指代消解</b></summary>
212
+
213
+ - CLUEWSC
214
+ - WSC
215
+ - WinoGrande
216
+
217
+ </details>
218
+
219
+ <details open>
220
+ <summary><b>翻译</b></summary>
221
+
222
+ - Flores
223
+ - IWSLT2017
224
+
225
+ </details>
226
+
227
+ <details open>
228
+ <summary><b>多语种问答</b></summary>
229
+
230
+ - TyDi-QA
231
+ - XCOPA
232
+
233
+ </details>
234
+
235
+ <details open>
236
+ <summary><b>多语种总结</b></summary>
237
+
238
+ - XLSum
239
+
240
+ </details>
241
+ </td>
242
+ <td>
243
+ <details open>
244
+ <summary><b>知识问答</b></summary>
245
+
246
+ - BoolQ
247
+ - CommonSenseQA
248
+ - NaturalQuestions
249
+ - TriviaQA
250
+
251
+ </details>
252
+ </td>
253
+ <td>
254
+ <details open>
255
+ <summary><b>文本蕴含</b></summary>
256
+
257
+ - CMNLI
258
+ - OCNLI
259
+ - OCNLI_FC
260
+ - AX-b
261
+ - AX-g
262
+ - CB
263
+ - RTE
264
+ - ANLI
265
+
266
+ </details>
267
+
268
+ <details open>
269
+ <summary><b>常识推理</b></summary>
270
+
271
+ - StoryCloze
272
+ - COPA
273
+ - ReCoRD
274
+ - HellaSwag
275
+ - PIQA
276
+ - SIQA
277
+
278
+ </details>
279
+
280
+ <details open>
281
+ <summary><b>数学推理</b></summary>
282
+
283
+ - MATH
284
+ - GSM8K
285
+
286
+ </details>
287
+
288
+ <details open>
289
+ <summary><b>定理应用</b></summary>
290
+
291
+ - TheoremQA
292
+ - StrategyQA
293
+ - SciBench
294
+
295
+ </details>
296
+
297
+ <details open>
298
+ <summary><b>综合推理</b></summary>
299
+
300
+ - BBH
301
+
302
+ </details>
303
+ </td>
304
+ <td>
305
+ <details open>
306
+ <summary><b>初中/高中/大学/职业考试</b></summary>
307
+
308
+ - C-Eval
309
+ - AGIEval
310
+ - MMLU
311
+ - GAOKAO-Bench
312
+ - CMMLU
313
+ - ARC
314
+ - Xiezhi
315
+
316
+ </details>
317
+
318
+ <details open>
319
+ <summary><b>医学考试</b></summary>
320
+
321
+ - CMB
322
+
323
+ </details>
324
+ </td>
325
+ </tr>
326
+ </td>
327
+ </tr>
328
+ </tbody>
329
+ <tbody>
330
+ <tr align="center" valign="bottom">
331
+ <td>
332
+ <b>理解</b>
333
+ </td>
334
+ <td>
335
+ <b>长文本</b>
336
+ </td>
337
+ <td>
338
+ <b>安全</b>
339
+ </td>
340
+ <td>
341
+ <b>代码</b>
342
+ </td>
343
+ </tr>
344
+ <tr valign="top">
345
+ <td>
346
+ <details open>
347
+ <summary><b>阅读理解</b></summary>
348
+
349
+ - C3
350
+ - CMRC
351
+ - DRCD
352
+ - MultiRC
353
+ - RACE
354
+ - DROP
355
+ - OpenBookQA
356
+ - SQuAD2.0
357
+
358
+ </details>
359
+
360
+ <details open>
361
+ <summary><b>内容总结</b></summary>
362
+
363
+ - CSL
364
+ - LCSTS
365
+ - XSum
366
+ - SummScreen
367
+
368
+ </details>
369
+
370
+ <details open>
371
+ <summary><b>内容分析</b></summary>
372
+
373
+ - EPRSTMT
374
+ - LAMBADA
375
+ - TNEWS
376
+
377
+ </details>
378
+ </td>
379
+ <td>
380
+ <details open>
381
+ <summary><b>长文本理解</b></summary>
382
+
383
+ - LEval
384
+ - LongBench
385
+ - GovReports
386
+ - NarrativeQA
387
+ - Qasper
388
+
389
+ </details>
390
+ </td>
391
+ <td>
392
+ <details open>
393
+ <summary><b>安全</b></summary>
394
+
395
+ - CivilComments
396
+ - CrowsPairs
397
+ - CValues
398
+ - JigsawMultilingual
399
+ - TruthfulQA
400
+
401
+ </details>
402
+ <details open>
403
+ <summary><b>健壮性</b></summary>
404
+
405
+ - AdvGLUE
406
+
407
+ </details>
408
+ </td>
409
+ <td>
410
+ <details open>
411
+ <summary><b>代码</b></summary>
412
+
413
+ - HumanEval
414
+ - HumanEvalX
415
+ - MBPP
416
+ - APPs
417
+ - DS1000
418
+
419
+ </details>
420
+ </td>
421
+ </tr>
422
+ </td>
423
+ </tr>
424
+ </tbody>
425
+ </table>
426
+
427
+ <p align="right"><a href="#top">🔝返回顶部</a></p>
428
+
429
+ ## 📖 模型支持
430
+
431
+ <table align="center">
432
+ <tbody>
433
+ <tr align="center" valign="bottom">
434
+ <td>
435
+ <b>开源模型</b>
436
+ </td>
437
+ <td>
438
+ <b>API 模型</b>
439
+ </td>
440
+ <!-- <td>
441
+ <b>自定义模型</b>
442
+ </td> -->
443
+ </tr>
444
+ <tr valign="top">
445
+ <td>
446
+
447
+ - [InternLM](https://github.com/InternLM/InternLM)
448
+ - [LLaMA](https://github.com/facebookresearch/llama)
449
+ - [Vicuna](https://github.com/lm-sys/FastChat)
450
+ - [Alpaca](https://github.com/tatsu-lab/stanford_alpaca)
451
+ - [Baichuan](https://github.com/baichuan-inc)
452
+ - [WizardLM](https://github.com/nlpxucan/WizardLM)
453
+ - [ChatGLM2](https://github.com/THUDM/ChatGLM2-6B)
454
+ - [ChatGLM3](https://github.com/THUDM/ChatGLM3-6B)
455
+ - [TigerBot](https://github.com/TigerResearch/TigerBot)
456
+ - [Qwen](https://github.com/QwenLM/Qwen)
457
+ - [BlueLM](https://github.com/vivo-ai-lab/BlueLM)
458
+ - ……
459
+
460
+ </td>
461
+ <td>
462
+
463
+ - OpenAI
464
+ - Claude
465
+ - ZhipuAI(ChatGLM)
466
+ - Baichuan
467
+ - ByteDance(YunQue)
468
+ - Huawei(PanGu)
469
+ - 360
470
+ - Baidu(ERNIEBot)
471
+ - MiniMax(ABAB-Chat)
472
+ - SenseTime(nova)
473
+ - Xunfei(Spark)
474
+ - ……
475
+
476
+ </td>
477
+
478
+ </tr>
479
+ </tbody>
480
+ </table>
481
+
482
+ <p align="right"><a href="#top">🔝 Back to Top</a></p>
483
+
484
+ ## 🔜 Roadmap
485
+
486
+ - [ ] Subjective evaluation
487
+   - [ ] Release a subjective evaluation leaderboard
488
+   - [ ] Release subjective evaluation datasets
489
+ - [x] Long context
490
+   - [ ] Support a broad set of long-context benchmarks
491
+   - [ ] Release a long-context evaluation leaderboard
492
+ - [ ] Coding
493
+   - [ ] Release a code evaluation leaderboard
494
+   - [x] Provide evaluation services for non-Python languages
495
+ - [ ] Agents
496
+   - [ ] Support a rich set of agent frameworks
497
+   - [ ] Provide an agent evaluation leaderboard
498
+ - [x] Robustness
499
+   - [x] Support various attack methods
500
+
501
+ ## 👷‍♂️ Contributing
502
+
503
+ We appreciate all contributors for their efforts to improve and enhance OpenCompass. Please refer to the [Contribution Guide](https://opencompass.readthedocs.io/zh_CN/latest/notes/contribution_guide.html) for guidance on how to contribute to the project.
504
+
505
+ ## 🤝 Acknowledgements
506
+
507
+ Parts of this project's code are borrowed and adapted from [OpenICL](https://github.com/Shark-NLP/OpenICL).
508
+
509
+ Some of the datasets and prompt implementations are adapted from [chain-of-thought-hub](https://github.com/FranxYao/chain-of-thought-hub) and [instruct-eval](https://github.com/declare-lab/instruct-eval).
510
+
511
+ ## 🖊️ Citation
512
+
513
+ ```bibtex
514
+ @misc{2023opencompass,
515
+ title={OpenCompass: A Universal Evaluation Platform for Foundation Models},
516
+ author={OpenCompass Contributors},
517
+ howpublished = {\url{https://github.com/open-compass/opencompass}},
518
+ year={2023}
519
+ }
520
+ ```
521
+
522
+ <p align="right"><a href="#top">🔝 Back to Top</a></p>
opencompass-my-api/build/lib/opencompass/__init__.py ADDED
@@ -0,0 +1 @@
1
+ __version__ = '0.2.2'
opencompass-my-api/build/lib/opencompass/datasets/FinanceIQ.py ADDED
@@ -0,0 +1,39 @@
1
+ import csv
2
+ import os.path as osp
3
+
4
+ from datasets import Dataset, DatasetDict
5
+
6
+ from opencompass.registry import LOAD_DATASET
7
+
8
+ from .base import BaseDataset
9
+
10
+
11
+ @LOAD_DATASET.register_module()
12
+ class FinanceIQDataset(BaseDataset):
13
+
14
+ # @staticmethod
15
+ # def load(path: str):
16
+ # from datasets import load_dataset
17
+ # return load_dataset('csv', data_files={'test': path})
18
+
19
+ @staticmethod
20
+ def load(path: str, name: str):
21
+ dataset = DatasetDict()
22
+ for split in ['dev', 'test']:
23
+ raw_data = []
24
+ filename = osp.join(path, split, f'{name}.csv')
25
+ with open(filename, encoding='utf-8') as f:
26
+ reader = csv.reader(f)
27
+ _ = next(reader) # skip the header
28
+ for row in reader:
29
+ assert len(row) == 7
30
+ raw_data.append({
31
+ 'question': row[1],
32
+ 'A': row[2],
33
+ 'B': row[3],
34
+ 'C': row[4],
35
+ 'D': row[5],
36
+ 'answer': row[6],
37
+ })
38
+ dataset[split] = Dataset.from_list(raw_data)
39
+ return dataset
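
For reference, a minimal sketch of the seven-column CSV layout the loader above appears to assume (an index, the question, options A-D, and the answer, with the header row skipped). The header names and row contents here are hypothetical:

```python
import csv
import io

# Hypothetical row in the dev/test CSV layout FinanceIQDataset.load expects:
# column 0 is an index, then question, A, B, C, D, answer (7 fields total).
sample = io.StringIO(
    'id,question,A,B,C,D,answer\n'
    '0,Which of these is a central-bank policy tool?,'
    'Open market operations,Subsidies,Tariffs,Price controls,A\n'
)
reader = csv.reader(sample)
_ = next(reader)  # skip the header, as the loader does
row = next(reader)
assert len(row) == 7
print({'question': row[1], 'A': row[2], 'B': row[3],
       'C': row[4], 'D': row[5], 'answer': row[6]})
```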
opencompass-my-api/build/lib/opencompass/datasets/GaokaoBench.py ADDED
@@ -0,0 +1,132 @@
1
+ import json
2
+ import re
3
+
4
+ from datasets import Dataset
5
+
6
+ from opencompass.openicl.icl_evaluator import BaseEvaluator
7
+ from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
8
+
9
+ from .base import BaseDataset
10
+
11
+
12
+ @LOAD_DATASET.register_module()
13
+ class GaokaoBenchDataset(BaseDataset):
14
+
15
+ @staticmethod
16
+ def load(path: str):
17
+ with open(path, encoding='utf-8') as f:
18
+ data = json.load(f)
19
+ return Dataset.from_list(data['example'])
20
+
21
+
22
+ valid_gaokao_bench_question_types = [
23
+ 'single_choice', 'multi_choice', 'multi_question_choice',
24
+ 'five_out_of_seven', 'cloze', 'subjective', 'correction'
25
+ ]
26
+
27
+
28
+ class GaokaoBenchEvaluator(BaseEvaluator):
29
+
30
+ def __init__(self, question_type) -> None:
31
+ super().__init__()
32
+ assert question_type in valid_gaokao_bench_question_types
33
+ self.question_type = question_type
34
+
35
+ def do_predictions_postprocess(self, model_output, answer_length=None):
36
+ if self.question_type == 'single_choice':
37
+ model_answer = []
38
+ temp = re.findall(r'[A-D]', model_output[::-1])
39
+ if len(temp) != 0:
40
+ model_answer.append(temp[0])
41
+
42
+ elif self.question_type == 'multi_question_choice':
43
+ model_answer = []
44
+ temp = re.findall(r'【答案】\s*[::]*\s*[A-Z]', model_output)
45
+
46
+ if len(temp) == answer_length:
47
+ for t in temp:
48
+ model_answer.append(re.findall(r'[A-Z]', t)[0])
49
+ else:
50
+ temp = re.findall(r'[A-Z]', model_output)
51
+ if len(temp) > 0:
52
+ for k in range(min(len(temp), answer_length)):
53
+ model_answer.append(temp[k])
54
+
55
+ elif self.question_type == 'multi_choice':
56
+ model_answer = []
57
+ answer = ''
58
+ content = re.sub(r'\s+', '', model_output)
59
+ answer_index = content.find('【答案】')
60
+ if answer_index > 0:
61
+ temp = content[answer_index:]
62
+ if len(re.findall(r'[A-D]', temp)) > 0:
63
+ for t in re.findall(r'[A-D]', temp):
64
+ answer += t
65
+ else:
66
+ temp = content[-10:]
67
+ if len(re.findall(r'[A-D]', temp)) > 0:
68
+ for t in re.findall(r'[A-D]', temp):
69
+ answer += t
70
+ if len(answer) != 0:
71
+ model_answer.append(answer)
72
+
73
+ elif self.question_type == 'five_out_of_seven':
74
+ model_answer = []
75
+ temp = re.findall(r'[A-G]', model_output)
76
+ if len(temp) > 0:
77
+ for k in range(min(5, len(temp))):
78
+ model_answer.append(temp[k])
79
+
80
+ return model_answer
81
+
82
+ def ensure_same_length(self, pred, refr):
83
+ if len(pred) == len(refr):
84
+ return pred
85
+ return ['Z'] * len(refr)
86
+
87
+ def score(self, predictions, references):
88
+ if self.question_type not in [
89
+ 'single_choice', 'multi_choice', 'multi_question_choice',
90
+ 'five_out_of_seven'
91
+ ]:
92
+ return {'score': 0}
93
+ elif self.question_type == 'multi_choice':
94
+ correct_score, total_score = 0, 0
95
+ for pred, refr in zip(predictions, references):
96
+ pred = self.do_predictions_postprocess(pred)
97
+ pred = self.ensure_same_length(pred, refr)
98
+ for p, r in zip(pred, refr):
99
+ if p == r:
100
+ correct_score += 2
101
+ else:
102
+ for i in p:
103
+ if i not in r:
104
+ break
105
+ else:
106
+ correct_score += 1
107
+ total_score += 2
108
+ return {'score': correct_score / total_score * 100}
109
+ else:
110
+ correct_score, total_score = 0, 0
111
+ for pred, refr in zip(predictions, references):
112
+ if self.question_type == 'multi_question_choice':
113
+ pred = self.do_predictions_postprocess(pred, len(refr))
114
+ else:
115
+ pred = self.do_predictions_postprocess(pred)
116
+ pred = self.ensure_same_length(pred, refr)
117
+ for p, r in zip(pred, refr):
118
+ if p == r:
119
+ correct_score += 1
120
+ total_score += 1
121
+ return {'score': correct_score / total_score * 100}
122
+
123
+
124
+ for question_type in valid_gaokao_bench_question_types:
125
+ # wrap registration in a helper so each lambda captures its own question_type (avoids the late-binding closure pitfall)
126
+ def _gaokao_register(question_type):
127
+ ICL_EVALUATORS.register_module(
128
+ name='GaokaoBenchEvaluator' + '_' + question_type,
129
+ module=lambda *args, **kwargs: GaokaoBenchEvaluator(
130
+ question_type=question_type, *args, **kwargs))
131
+
132
+ _gaokao_register(question_type)
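
To make the multi_choice partial-credit rule above concrete, here is a standalone sketch (not part of the evaluator's API): an exact match earns 2 of 2 points, a prediction whose letters all appear in the reference earns 1, and any wrong letter forfeits the question:

```python
def multi_choice_score(pred: str, refr: str) -> int:
    """Mirrors the per-question scoring in GaokaoBenchEvaluator.score."""
    if pred == refr:
        return 2  # full credit for an exact match
    # half credit only when no predicted letter falls outside the reference
    return 1 if all(ch in refr for ch in pred) else 0

assert multi_choice_score('BD', 'BD') == 2
assert multi_choice_score('B', 'BD') == 1
assert multi_choice_score('BC', 'BD') == 0
```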
opencompass-my-api/build/lib/opencompass/datasets/NPHardEval/__init__.py ADDED
@@ -0,0 +1,9 @@
1
+ from .cmp_GCP_D import * # noqa: F401, F403
2
+ from .cmp_KSP import * # noqa: F401, F403
3
+ from .cmp_TSP_D import * # noqa: F401, F403
4
+ from .hard_GCP import * # noqa: F401, F403
5
+ from .hard_MSP import * # noqa: F401, F403
6
+ from .hard_TSP import * # noqa: F401, F403
7
+ from .p_BSP import * # noqa: F401, F403
8
+ from .p_EDP import * # noqa: F401, F403
9
+ from .p_SPP import * # noqa: F401, F403
opencompass-my-api/build/lib/opencompass/datasets/NPHardEval/cmp_GCP_D.py ADDED
@@ -0,0 +1,161 @@
1
+ import ast
2
+
3
+ import networkx as nx
4
+ from datasets import Dataset
5
+
6
+ from opencompass.openicl.icl_evaluator import BaseEvaluator
7
+ from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
8
+
9
+ from ..base import BaseDataset
10
+ from .prompts import gcp_dPrompts
11
+
12
+
13
+ def q2text(q, p=gcp_dPrompts):
14
+ number_of_colors = q.split('\n')[0].split()[-2] # second-to-last token of the first line
15
+ number_of_vertices = q.split('\n')[1].split(' ')[2] # third word of the second line
16
+ prompt_text = p['Intro'] + '\n' + \
17
+ p['Initial_question'].format(total_vertices=number_of_vertices, number_of_colors=number_of_colors) + '\n' + \
18
+ p['Output_content'] + '\n' + \
19
+ p['Output_format'] + '\n' + \
20
+ '\n The graph is below: \n'
21
+ for line in q.split('\n')[2:]:
22
+ vertex_list = line.split(' ')
23
+ this_line = 'Vertex {} is connected to vertex {}.'.format(
24
+ vertex_list[1], vertex_list[2])
25
+ prompt_text += this_line + '\n'
26
+ return prompt_text
27
+
28
+
29
+ @LOAD_DATASET.register_module(force=True)
30
+ class cmp_GCP_D_Dataset(BaseDataset):
31
+
32
+ @staticmethod
33
+ def load(path: str):
34
+ raw_data = []
35
+ data_path = path
36
+ all_data = []
37
+ for file_num in range(10):
38
+ with open(data_path + 'decision_data_GCP_{}.txt'.format(file_num)) as f:
39
+ data = f.read()
40
+ sample = data.split('\n\n')[:-1]
41
+ all_data += zip([file_num + 1] * len(sample), sample)
42
+ for (level, q) in all_data:
43
+ prompt = q2text(q)
44
+ raw_data.append({
45
+ 'prompt': prompt,
46
+ 'q': str(level) + '####\n' + q,
47
+ 'level': level
48
+ })
49
+ dataset = Dataset.from_list(raw_data)
50
+ return dataset
51
+
52
+
53
+ @ICL_EVALUATORS.register_module(force=True)
54
+ class cmp_GCP_D_Evaluator(BaseEvaluator):
55
+
56
+ def score(self, predictions, references):
57
+ assert len(predictions) == len(references)
58
+
59
+ result = {'pass': 0, 'fail': 0}
60
+ details = {}
61
+ for index, (q, output) in enumerate(zip(references, predictions)):
62
+ output_dict = {}
63
+ level = int(q.split('####\n')[0])
64
+ q = q.split('####\n')[-1]
65
+ reasoning = ''  # keep 'reasoning' defined even if parsing below raises
+ try:
66
+ number_of_colors = int(q.split('\n')[0].split()[-2])
67
+ output, reasoning = self.parse_xml_to_dict(output)
68
+ output_dict['output'] = output
69
+ output_dict['correctness'], _ = self.gcp_decision_check(q, output, number_of_colors)
70
+ except Exception as e:
71
+ print(f'Attempt failed: {e}')
72
+ output_dict['correctness'] = False
73
+ output_dict['reasoning'] = reasoning
74
+
75
+ if output_dict['correctness']:
76
+ r = 'pass'
77
+ else:
78
+ r = 'fail'
79
+ result[r] += level
80
+ details[str(index)] = {'q': q, 'output': output, 'result': r}
81
+
82
+ result['score'] = result['pass'] / (result['pass'] + result['fail']) * 100
83
+ result['details'] = details
84
+ final_result = {'Weighted Accuracy': result['score']}
85
+ return final_result
86
+
87
+ def parse_xml_to_dict(self, xml_string):
88
+ try:
89
+ assert '<final_answer>' in xml_string
90
+ assert '</final_answer>' in xml_string
91
+ assert '<reasoning>' in xml_string
92
+ assert '</reasoning>' in xml_string
93
+ final_answer_start = xml_string.index('<final_answer>') + len('<final_answer>')
94
+ final_answer_end = xml_string.index('</final_answer>')
95
+ reasoning_start = xml_string.index('<reasoning>') + len('<reasoning>')
96
+ reasoning_end = xml_string.index('</reasoning>')
97
+ final_answer_element = xml_string[final_answer_start:final_answer_end].strip()
98
+ reasoning_element = xml_string[reasoning_start:reasoning_end].strip()
99
+ try:
100
+ final_answer_element = ast.literal_eval(final_answer_element)
101
+ except Exception:
102
+ final_answer_element = ''
103
+ except Exception:
104
+ final_answer_element = ''
105
+ reasoning_element = ''
106
+
107
+ return final_answer_element, reasoning_element
108
+
109
+ def read_dimacs_format(self, dimacs_str):
110
+ lines = dimacs_str.strip().split('\n')
111
+ p_line = next(line for line in lines if line.startswith('p'))
112
+ _, _, num_vertices, num_edges = p_line.split()
113
+ num_vertices, num_edges = int(num_vertices), int(num_edges)
114
+
115
+ adjacency_list = {i: set() for i in range(1, num_vertices + 1)}
116
+ for line in lines:
117
+ if line.startswith('e'):
118
+ _, vertex1, vertex2 = line.split()
119
+ vertex1, vertex2 = int(vertex1), int(vertex2)
120
+ if vertex1 in adjacency_list and vertex2 in adjacency_list:
121
+ adjacency_list[vertex1].add(vertex2)
122
+ adjacency_list[vertex2].add(vertex1)
123
+
124
+ return num_vertices, adjacency_list
125
+
126
+ def gcp_greedy_solution(self, adjacency_list):
127
+ """Provides a greedy solution to the GCP problem.
128
+
129
+ :param adjacency_list: A dictionary of the adjacency list.
130
+ :return: A tuple of (num_colors, coloring).
131
+ """
132
+ G = nx.Graph()
133
+ G.add_nodes_from(adjacency_list.keys())
134
+ for vertex, neighbors in adjacency_list.items():
135
+ for neighbor in neighbors:
136
+ G.add_edge(vertex, neighbor)
137
+ coloring = nx.coloring.greedy_color(G, strategy='largest_first')
138
+ num_colors = max(coloring.values()) + 1
139
+ return num_colors, coloring
140
+
141
+ def gcp_decision_check(self, dimacs_str, answer, k_colors):
142
+ """Check if the given GCP instance is feasible with k_colors.
143
+
144
+ :param dimacs_str: The DIMACS format string of the GCP instance.
145
+ :param answer: The answer returned by the model.
146
+ :param k_colors: The target number of colors.
147
+ :return: A tuple of (is_correct, message).
148
+ """
149
+ num_vertices, adjacency_list = self.read_dimacs_format(dimacs_str)
150
+ try:
151
+ is_feasible = answer.get('Feasible', 'no').lower() == 'yes'
152
+ except Exception:
153
+ return False, 'Feasible key not found'
154
+ num_colors, coloring = self.gcp_greedy_solution(adjacency_list)
155
+ exist_optimal = num_colors <= k_colors
156
+ if is_feasible != exist_optimal:
157
+ if exist_optimal:
158
+ return False, f'Feasibility mismatch: {coloring}'
159
+ else:
160
+ return False, f'Feasibility mismatch: {is_feasible} vs {exist_optimal}'
161
+ return True, 'Feasible' if is_feasible else 'Infeasible'
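
As a sanity check on the greedy baseline behind `gcp_decision_check`, consider a toy instance (a sketch, not part of the evaluator): a triangle needs three colors, so an answer of "Feasible: yes" for k = 2 should be rejected:

```python
import networkx as nx

G = nx.Graph([(1, 2), (2, 3), (1, 3)])  # a triangle
coloring = nx.coloring.greedy_color(G, strategy='largest_first')
num_colors = max(coloring.values()) + 1
print(num_colors)       # 3
print(num_colors <= 2)  # False -> 'yes' for k = 2 would be marked incorrect
```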
opencompass-my-api/build/lib/opencompass/datasets/NPHardEval/cmp_KSP.py ADDED
@@ -0,0 +1,183 @@
1
+ import ast
2
+ import json
3
+
4
+ from datasets import Dataset
5
+
6
+ from opencompass.openicl.icl_evaluator import BaseEvaluator
7
+ from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
8
+
9
+ from ..base import BaseDataset
10
+ from .prompts import kspPrompts
11
+
12
+
13
+ def q2text(q, p=kspPrompts):
14
+ knapsack_capacity = q['knapsack_capacity']
15
+ items = q['items']
16
+ prompt_text = p['Intro'] + '\n' + \
17
+ p['Initial_question'].format(knapsack_capacity=knapsack_capacity) + '\n' + \
18
+ p['Output_content'] + '\n' + \
19
+ p['Output_format'] + \
20
+ '\n The items details are as below: \n'
21
+ for item in items:
22
+ this_line = f"Item {item['id']} has weight {item['weight']} and value {item['value']}."
23
+ prompt_text += this_line + '\n'
24
+ return prompt_text
25
+
26
+
27
+ @LOAD_DATASET.register_module(force=True)
28
+ class cmp_KSP_Dataset(BaseDataset):
29
+
30
+ @staticmethod
31
+ def load(path: str):
32
+ raw_data = []
33
+ data_path = path
34
+ all_data = []
35
+ with open(data_path + 'ksp_instances.json', 'r') as f:
36
+ data = json.load(f)
37
+ for sample in data:
38
+ level = len(sample['items']) - 3
39
+ all_data.append((level, sample))
40
+ for (level, q) in all_data:
41
+ prompt = q2text(q)
42
+ raw_data.append({
43
+ 'prompt': prompt,
44
+ 'q': str(level) + '####\n' + json.dumps(q),
45
+ 'level': level
46
+ })
47
+ dataset = Dataset.from_list(raw_data)
48
+ return dataset
49
+
50
+
51
+ @ICL_EVALUATORS.register_module(force=True)
52
+ class cmp_KSP_Evaluator(BaseEvaluator):
53
+
54
+ def score(self, predictions, references):
55
+ assert len(predictions) == len(references)
56
+
57
+ result = {'pass': 0, 'fail': 0}
58
+ details = {}
59
+ for index, (q, output) in enumerate(zip(references, predictions)):
60
+ output_dict = {}
61
+ level = int(q.split('####\n')[0])
62
+ q = json.loads(q.split('####\n')[-1])
63
+ try:
64
+ llm_string = output  # parse the model prediction, not the reference
65
+ output, reasoning = self.parse_xml_to_dict(llm_string)
66
+ output_dict['output'] = output
67
+ output_dict['correctness'], _ = self.kspCheck(q, output)
68
+ output_dict['reasoning'] = reasoning
69
+ output_dict['level'] = level
70
+ except Exception as e:
71
+ print(f'Attempt failed: {e}')
72
+ if output_dict:
73
+ if output_dict.get('correctness'):  # may be absent if the check above raised
74
+ r = 'pass'
75
+ else:
76
+ r = 'fail'
77
+ else:
78
+ print(f'Failed to run {q}')
79
+ r = 'fail'
80
+
81
+ result[r] += level
82
+ details[str(index)] = {'q': q, 'output': output, 'result': r}
83
+
84
+ result['score'] = result['pass'] / (result['pass'] + result['fail']) * 100
85
+ result['details'] = details
86
+ final_result = {'Weighted Accuracy': result['score']}
87
+ return final_result
88
+
89
+ def parse_xml_to_dict(self, xml_string):
90
+ try:
91
+ assert '<final_answer>' in xml_string
92
+ assert '</final_answer>' in xml_string
93
+ assert '<reasoning>' in xml_string
94
+ assert '</reasoning>' in xml_string
95
+ final_answer_start = xml_string.index('<final_answer>') + len('<final_answer>')
96
+ final_answer_end = xml_string.index('</final_answer>')
97
+ reasoning_start = xml_string.index('<reasoning>') + len('<reasoning>')
98
+ reasoning_end = xml_string.index('</reasoning>')
99
+ final_answer_element = xml_string[final_answer_start:final_answer_end].strip()
100
+ reasoning_element = xml_string[reasoning_start:reasoning_end].strip()
101
+ try:
102
+ final_answer_element = ast.literal_eval(final_answer_element)
103
+ except Exception:
104
+ final_answer_element = ''
105
+ except Exception:
106
+ final_answer_element = ''
107
+ reasoning_element = ''
108
+
109
+ return final_answer_element, reasoning_element
110
+
111
+ def ksp_optimal_solution(self, knapsacks, capacity):
112
+ """Provides the optimal solution for the KSP instance with dynamic
113
+ programming.
114
+
115
+ :param knapsacks: A dictionary of the knapsacks.
116
+ :param capacity: The capacity of the knapsack.
117
+ :return: The optimal value.
118
+ """
119
+ # num_knapsacks = len(knapsacks)
120
+
121
+ # Create a one-dimensional array to store intermediate solutions
122
+ dp = [0] * (capacity + 1)
123
+
124
+ for itemId, (weight, value) in knapsacks.items():
125
+ for w in range(capacity, weight - 1, -1):
126
+ dp[w] = max(dp[w], value + dp[w - weight])
127
+
128
+ return dp[capacity]
129
+
130
+ # KSP
131
+ def kspCheck(self, instance, solution):
132
+ """Validates the solution for the KSP instance.
133
+
134
+ :param instance: A dictionary of the KSP instance.
135
+ :param solution: A dictionary of the solution.
136
+ :return: A tuple of (is_correct, message).
137
+ """
138
+ # Build an item-id -> (weight, value) lookup from the instance
139
+ items = instance.get('items', [])
140
+ knapsacks = {
141
+ item['id']: (item['weight'], item['value'])
142
+ for item in items
143
+ }
144
+
145
+ ksp_optimal_value = self.ksp_optimal_solution(
146
+ knapsacks, instance['knapsack_capacity'])
147
+
148
+ try:
149
+ is_feasible = (solution.get('Feasible', '').lower() == 'yes')
150
+ except Exception:
151
+ return False, 'Output format is incorrect.'
152
+ if is_feasible != (ksp_optimal_value > 0):
153
+ return False, f'The solution is {is_feasible} but the optimal solution is {ksp_optimal_value > 0}.'
154
+
155
+ total_value = int(solution.get('TotalValue', -1))
156
+ selectedItems = list(map(int, solution.get('SelectedItemIds', [])))
157
+
158
+ if len(set(selectedItems)) != len(selectedItems):
159
+ return False, 'Duplicate items are selected.'
160
+
161
+ total_weight = 0
162
+ cum_value = 0
163
+
164
+ # Calculate total weight and value of selected items
165
+ for item in selectedItems:
166
+ if knapsacks.get(item, False):
167
+ weight, value = knapsacks[item]
168
+ total_weight += weight
169
+ cum_value += value
170
+ else:
171
+ return False, f'Item {item} does not exist.'
172
+
173
+ # Check if the item weight exceeds the knapsack capacity
174
+ if total_weight > instance['knapsack_capacity']:
175
+ return False, f"Total weight {total_weight} exceeds knapsack capacity {instance['knapsack_capacity']}."
176
+
177
+ if total_value != cum_value:
178
+ return False, f'The total value {total_value} does not match the cumulative value {cum_value} of the selected items.'
179
+
180
+ if total_value != ksp_optimal_value:
181
+ return False, f'The total value {total_value} does not match the optimal value {ksp_optimal_value}.'
182
+
183
+ return True, f'The solution is valid with total weight {total_weight} and total value {total_value}.'
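
The dynamic program in `ksp_optimal_solution` is the textbook 0/1-knapsack recurrence over a one-dimensional table; iterating capacities downward ensures each item is taken at most once. A self-contained sketch with made-up items:

```python
def knapsack_value(items, capacity):
    """items: iterable of (weight, value) pairs; returns the optimal value."""
    dp = [0] * (capacity + 1)
    for weight, value in items:
        for w in range(capacity, weight - 1, -1):  # downward: 0/1, not unbounded
            dp[w] = max(dp[w], value + dp[w - weight])
    return dp[capacity]

# Within capacity 7 the best picks are (2,3)+(3,4) or (3,4)+(4,3): value 7.
print(knapsack_value([(2, 3), (3, 4), (4, 3)], 7))  # 7
```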
opencompass-my-api/build/lib/opencompass/datasets/NPHardEval/cmp_TSP_D.py ADDED
@@ -0,0 +1,150 @@
1
+ import ast
2
+ import json
3
+
4
+ import networkx as nx
5
+ import pandas as pd
6
+ from datasets import Dataset
7
+
8
+ from opencompass.openicl.icl_evaluator import BaseEvaluator
9
+ from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
10
+
11
+ from ..base import BaseDataset
12
+ from .prompts import tsp_dPrompts
13
+
14
+
15
+ def q2text(adj_matrix, distance_limit, p=tsp_dPrompts):
16
+ total_cities = adj_matrix.shape[0]  # the threshold row was already stripped by the caller
17
+ prompt_text = p['Intro'] + '\n' + \
18
+ p['Initial_question'].format(total_cities=total_cities, distance_limit=distance_limit) + '\n' + \
19
+ p['Output_content'] + '\n' + \
20
+ p['Output_format'] + '\n' + \
21
+ 'The distances between cities are below: \n'
22
+
23
+ for i in range(adj_matrix.shape[0]):
24
+ for j in range(adj_matrix.shape[1]):
25
+ if i < j: # only use the upper triangle
26
+ this_line = 'The distance between City {} and City {} is {}.'.format(i, j, adj_matrix[i, j])
27
+ prompt_text += this_line + '\n'
28
+ return prompt_text
29
+
30
+
31
+ @LOAD_DATASET.register_module(force=True)
32
+ class cmp_TSP_D_Dataset(BaseDataset):
33
+
34
+ @staticmethod
35
+ def load(path: str):
36
+ raw_data = []
37
+ data_path = path
38
+ all_data = []
39
+ for level in range(10):
40
+ for file_num in range(10):
41
+ df = pd.read_csv(data_path + 'decision_data_TSP_level_{}_instance_{}.csv'.format(level, file_num + 1),
42
+ header=None,
43
+ index_col=False)
44
+ all_data.append((level + 1, df))
45
+
46
+ for (level, q) in all_data:
47
+ threshold = q.iloc[-1, 0] # the threshold is stored in the last row
48
+ distance_matrix = q.iloc[:-1].values  # the remaining rows form the distance matrix
50
+ prompt = q2text(distance_matrix, threshold)
51
+ raw_data.append({
52
+ 'prompt': prompt,
53
+ 'q': str(level) + '####\n' + json.dumps(q.to_json()),
54
+ 'level': level
55
+ })
56
+ dataset = Dataset.from_list(raw_data)
57
+ return dataset
58
+
59
+
60
+ @ICL_EVALUATORS.register_module(force=True)
61
+ class cmp_TSP_D_Evaluator(BaseEvaluator):
62
+
63
+ def score(self, predictions, references):
64
+ assert len(predictions) == len(references)
65
+
66
+ result = {'pass': 0, 'fail': 0}
67
+ details = {}
68
+ tsp_d_Results = []
69
+ for index, (q, llm_string) in enumerate(zip(references, predictions)):
70
+ output_dict = {}
71
+ output, reasoning = self.parse_xml_to_dict(llm_string)
72
+ level = int(q.split('####\n')[0])
73
+ q = json.loads(q.split('####\n')[-1])
74
+ q = pd.DataFrame(json.loads(q))  # the frame was serialized via json.dumps(df.to_json())
75
+ threshold = q.iloc[-1, 0]  # the threshold is stored in the last row
76
+ distance_matrix = q.iloc[:-1].values  # the remaining rows form the distance matrix
77
+ output_dict['output'] = output
78
+ try:
79
+ output_dict['correctness'], _ = self.tsp_decision_check(distance_matrix, threshold, output)
80
+ except Exception as e:
81
+ print(f'Check failed: {e}')
82
+ output_dict['correctness'] = False
83
+ output_dict['reasoning'] = reasoning
84
+ output_dict['level'] = level
85
+ if output_dict:
86
+ tsp_d_Results.append(output_dict)
87
+ if output_dict['correctness']:
88
+ r = 'pass'
89
+ else:
90
+ r = 'fail'
91
+
92
+ result[r] += level
93
+ details[str(index)] = {'q': q, 'output': output, 'result': r}
94
+
95
+ result['score'] = result['pass'] / (result['pass'] + result['fail']) * 100
96
+ result['details'] = details
97
+ final_result = {'Weighted Accuracy': result['score']}
98
+ return final_result
99
+
100
+ def parse_xml_to_dict(self, xml_string):
101
+ try:
102
+ assert '<final_answer>' in xml_string
103
+ assert '</final_answer>' in xml_string
104
+ assert '<reasoning>' in xml_string
105
+ assert '</reasoning>' in xml_string
106
+ final_answer_start = xml_string.index('<final_answer>') + len('<final_answer>')
107
+ final_answer_end = xml_string.index('</final_answer>')
108
+ reasoning_start = xml_string.index('<reasoning>') + len('<reasoning>')
109
+ reasoning_end = xml_string.index('</reasoning>')
110
+ final_answer_element = xml_string[final_answer_start:final_answer_end].strip()
111
+ reasoning_element = xml_string[reasoning_start:reasoning_end].strip()
112
+ try:
113
+ final_answer_element = ast.literal_eval(final_answer_element)
114
+ except Exception:
115
+ final_answer_element = ''
116
+ except Exception:
117
+ final_answer_element = ''
118
+ reasoning_element = ''
119
+
120
+ return final_answer_element, reasoning_element
121
+
122
+ def tsp_approx(self, distance_matrix):
123
+ """Returns an approximate solution to the TSP problem.
124
+
125
+ :param distance_matrix: A 2D numpy array representing the distance matrix.
126
+ :return: A list of the cities in the order they were visited.
127
+ """
128
+ G = nx.from_numpy_array(distance_matrix)
129
+ return nx.approximation.traveling_salesman_problem(G)
130
+
131
+ def tsp_decision_check(self, distance_matrix, threshold, tour):
132
+ """Checks if a given TSP tour is valid and within the threshold
133
+ distance.
134
+
135
+ :param distance_matrix: A 2D numpy array representing the distance matrix.
136
+ :param threshold: The maximum distance allowed.
137
+ :param tour: the parsed answer dict; only its 'Feasible' field is checked.
138
+ """
139
+ try:
140
+ is_feasible = tour.get('Feasible', 'no').lower() == 'yes'
141
+ except Exception:
142
+ return False, 'Output format incorrect'
143
+
144
+ # Compute the length of an approximate (baseline) tour
145
+ tours = self.tsp_approx(distance_matrix)
146
+ tour_distance = sum(distance_matrix[tours[i], tours[i + 1]] for i in range(len(tours) - 1)) + distance_matrix[tours[-1], tours[0]]
147
+
148
+ if is_feasible != (tour_distance <= threshold):
149
+ return False, f'Feasibility mismatch: {is_feasible} vs {tour_distance} > {threshold}'
150
+ return True, 'Feasible: {} <= {}'.format(tour_distance, threshold)
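
A minimal sketch of the decision logic in `tsp_decision_check`: approximate a tour with networkx, then compare its length against the instance threshold. The matrix and threshold below are invented:

```python
import networkx as nx
import numpy as np

distance_matrix = np.array([[0, 1, 2],
                            [1, 0, 2],
                            [2, 2, 0]])
threshold = 6
G = nx.from_numpy_array(distance_matrix)
tour = nx.approximation.traveling_salesman_problem(G)  # closed tour over all cities
length = sum(distance_matrix[tour[i], tour[i + 1]] for i in range(len(tour) - 1))
print(length, length <= threshold)  # 5 True -> a 'yes' answer is accepted
```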
opencompass-my-api/build/lib/opencompass/datasets/NPHardEval/hard_GCP.py ADDED
@@ -0,0 +1,189 @@
1
+ import ast
2
+ import xml.etree.ElementTree as ET
3
+
4
+ from datasets import Dataset
5
+
6
+ from opencompass.openicl.icl_evaluator import BaseEvaluator
7
+ from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
8
+
9
+ from ..base import BaseDataset
10
+ from .prompts import gcpPrompts
11
+
12
+
13
+ def q2text(q, p=gcpPrompts): # q is the data for the NP-hard question, p is the prompt
14
+ # print(q)
15
+ chromatic_number = q.split('\n')[0][-1] # last character of the first line
16
+ number_of_vertices = q.split('\n')[1].split(' ')[2] # third word of the second line
17
+ prompt_text = p['Intro'] + '\n' \
18
+ + p['Initial_question'].format(max_vertices=number_of_vertices,max_colors=chromatic_number) + '\n' \
19
+ + p['Output_content'] + '\n' \
20
+ + p['Output_format'] + \
21
+ '\n The graph is below: \n'
22
+ for line in q.split('\n')[2:]:
23
+ vertex_list = line.split(' ')
24
+ this_line = 'Vertex {} is connected to vertex {}.'.format(vertex_list[1], vertex_list[2])
25
+ prompt_text += this_line + '\n'
26
+
27
+ return prompt_text
28
+
29
+
30
+ @LOAD_DATASET.register_module(force=True)
31
+ class hard_GCP_Dataset(BaseDataset):
32
+
33
+ @staticmethod
34
+ def load(path: str):
35
+ raw_data = []
36
+ data_path = path
37
+ all_data = []
38
+ for file_num in range(10):
39
+ with open(data_path + 'synthesized_data_GCP_{}.txt'.format(file_num)) as f:
40
+ data = f.read()
41
+ sample = data.split('\n\n')[:-1]
42
+ all_data += zip([file_num + 1] * len(sample), sample)
43
+ for (level, q) in all_data:
44
+ prompt = q2text(q)
45
+ raw_data.append({
46
+ 'prompt': prompt,
47
+ 'q': str(level) + '####\n' + q,
48
+ 'level': level
49
+ })
50
+ dataset = Dataset.from_list(raw_data)
51
+ return dataset
52
+
53
+
54
+ @ICL_EVALUATORS.register_module(force=True)
55
+ class hard_GCP_Evaluator(BaseEvaluator):
56
+
57
+ def score(self, predictions, references):
58
+ assert len(predictions) == len(references)
59
+
60
+ result = {'pass': 0, 'fail': 0}
61
+ details = {}
62
+ for index, (q, output) in enumerate(zip(references, predictions)):
63
+ output_dict = {}
64
+ level = int(q.split('####\n')[0])
65
+ q = q.split('####\n')[-1]
66
+
67
+ output_dict['output'] = output
68
+ try:
69
+ output_dict['correctness'] = self.gcpCheck(q, output)
70
+ except Exception as e:
71
+ print(f'Check failed: {e}')
72
+ output_dict['correctness'] = False
73
+ output_dict['level'] = level
74
+
75
+ if output_dict['correctness']:
76
+ r = 'pass'
77
+ else:
78
+ r = 'fail'
79
+ result[r] += level
80
+ details[str(index)] = {'q': q, 'output': output, 'result': r}
81
+
82
+ result['score'] = result['pass'] / (result['pass'] + result['fail']) * 100
83
+ result['details'] = details
84
+ final_result = {'Weighted Accuracy': result['score']}
85
+ return final_result
86
+
87
+ def parse_xml_to_dict(self, xml_string):
88
+ try:
89
+ # Parse the XML string
90
+ root = ET.fromstring(xml_string)
91
+
92
+ # Find the 'final_answer' tag
93
+ final_answer_element = root.find('final_answer')
94
+
95
+ # Find the 'reasoning' tag
96
+ reasoning_element = root.find('reasoning')
97
+ except Exception:
98
+ try:
99
+ assert '<final_answer>' in xml_string
100
+ assert '</final_answer>' in xml_string
101
+ assert '<reasoning>' in xml_string
102
+ assert '</reasoning>' in xml_string
103
+ final_answer_start = xml_string.index('<final_answer>') + len('<final_answer>')
104
+ final_answer_end = xml_string.index('</final_answer>')
105
+ reasoning_start = xml_string.index('<reasoning>') + len('<reasoning>')
106
+ reasoning_end = xml_string.index('</reasoning>')
107
+ final_answer_element = xml_string[final_answer_start:final_answer_end]
108
+ reasoning_element = xml_string[reasoning_start:reasoning_end]
109
+ except Exception:
110
+ final_answer_element = ''
111
+ reasoning_element = ''
112
+
113
+ return final_answer_element, reasoning_element
114
+
115
+ def gcpCheck(self, dimacs_str, answer_str):
116
+ num_vertices, adjacency_list = self.read_dimacs_format(dimacs_str)
117
+ answer_colors = self.parse_answer(answer_str)
118
+ # print(adjacency_list)
119
+ # print(answer_colors)
120
+
121
+ # Check if all colors in the answer are valid
122
+ for vertex, neighbors in adjacency_list.items():
123
+ for neighbor in neighbors:
124
+ try:
125
+ if answer_colors[vertex] == answer_colors[neighbor]:
126
+ print(f'Invalid coloring: Vertex {vertex} and {neighbor} have the same color.')
127
+ return False
128
+ except:
129
+ print(f'Invalid input.') # dealing with hullucination
130
+ return False
131
+
132
+ print(f'Valid coloring found with {len(set(answer_colors.values()))} colors: {answer_colors}')
133
+ return True
134
+
135
+ def read_dimacs_format(self, dimacs_str):
136
+ lines = dimacs_str.strip().split('\n')
137
+ # Read the number of vertices and edges
138
+ p_line = next(line for line in lines if line.startswith('p'))
139
+ _, _, num_vertices, num_edges = p_line.split()
140
+ num_vertices, num_edges = int(num_vertices), int(num_edges)
141
+
142
+ # Create adjacency list
143
+ adjacency_list = {i: set() for i in range(1, num_vertices + 1)}
144
+
145
+ # Read the edges and ignore those that reference non-existing vertices
146
+ for line in lines:
147
+ if line.startswith('e'):
148
+ _, vertex1, vertex2 = line.split()
149
+ vertex1, vertex2 = int(vertex1), int(vertex2)
150
+ if vertex1 in adjacency_list and vertex2 in adjacency_list:
151
+ adjacency_list[vertex1].add(vertex2)
152
+ adjacency_list[vertex2].add(vertex1)
153
+
154
+ return num_vertices, adjacency_list
155
+
156
+ def parse_answer(self, llm_string):
157
+ # # Convert the answer string to a dictionary
158
+ # answer_dict = {}
159
+ # # Remove the braces and split the string by commas
160
+ # entries = answer_str.strip("}{").split(', ')
161
+ # for entry in entries:
162
+ # vertex, color = entry.split(':')
163
+ # answer_dict[int(vertex)] = color
164
+ # return answer_dict
165
+
166
+ all_answers, reasoning_element = self.parse_xml_to_dict(llm_string)
167
+
168
+ if all_answers == '':
169
+ return {}
170
+ elif all_answers is None:
171
+ return {}
172
+ else:
173
+ if isinstance(all_answers, str):
174
+ try:
175
+ all_answers = ast.literal_eval(all_answers)
176
+ except Exception:
177
+ try:
178
+ all_answers = ast.literal_eval('{' + all_answers + '}')
179
+ except Exception:
180
+ return {}
181
+ else:
182
+ all_answers = ast.literal_eval(all_answers.text)
183
+ # answer_dict = {}
184
+ # for pair in all_answers:
185
+ # vertex, color = pair.split(":")
186
+ # answer_dict[int(vertex)] = color
187
+ # convert key type to int
188
+ all_answers = {int(k): v for k, v in all_answers.items()}
189
+ return all_answers # answer_dict
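
The core test in `gcpCheck` is simply that no edge joins two same-colored vertices. A standalone sketch, using a hypothetical answer dict of the shape `parse_answer` returns:

```python
adjacency_list = {1: {2, 3}, 2: {1, 3}, 3: {1, 2}}     # a triangle
coloring = {1: 'Color_1', 2: 'Color_2', 3: 'Color_3'}  # hypothetical model answer

valid = all(coloring[u] != coloring[v]
            for u, nbrs in adjacency_list.items()
            for v in nbrs)
print(valid)  # True; reusing any color on a triangle would print False
```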
opencompass-my-api/build/lib/opencompass/datasets/NPHardEval/hard_MSP.py ADDED
@@ -0,0 +1,203 @@
1
+ import ast
2
+ import json
3
+ import xml.etree.ElementTree as ET
4
+
5
+ from datasets import Dataset
6
+
7
+ from opencompass.openicl.icl_evaluator import BaseEvaluator
8
+ from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
9
+
10
+ from ..base import BaseDataset
11
+ from .prompts import mspPrompts
12
+
13
+
14
+ def q2text(q, p=mspPrompts): # q is the data for the NP-hard question, p is the prompt
15
+ total_participants = len(q['participants'])  # the prompt expects a participant count, not the dict itself
16
+ total_timeslots = q['time_slots']
17
+ prompt_text = p['Intro'] + '\n' \
18
+ + p['Initial_question'].format(total_participants=total_participants,total_timeslots=total_timeslots) + '\n' \
19
+ + p['Output_content'] + '\n' \
20
+ + p['Output_format'] + \
21
+ '\n The meetings and participants details are as below: \n'
22
+ meetings = q['meetings']
23
+ participants = q['participants']
24
+ for meeting in meetings:
25
+ this_line = 'Meeting {} is with duration {}.'.format(meeting['id'], meeting['duration'])
26
+ prompt_text += this_line + '\n'
27
+ for j in participants.keys():
28
+ this_line = 'Participant {} is available at time slots {} and has meetings {}.'.format(j, participants[j]['available_slots'], participants[j]['meetings'])
29
+ prompt_text += this_line + '\n'
30
+ return prompt_text
31
+
32
+
33
+ @LOAD_DATASET.register_module(force=True)
34
+ class hard_MSP_Dataset(BaseDataset):
35
+
36
+ @staticmethod
37
+ def load(path: str):
38
+ raw_data = []
39
+ data_path = path
40
+ all_data = []
41
+ with open(data_path + 'msp_instances.json', 'r') as f:
42
+ data = json.load(f)
43
+ all_data = zip([int(d['complexity_level']) for d in data], data)
44
+
45
+ for (level, q) in all_data:
46
+ prompt = q2text(q)
47
+ raw_data.append({
48
+ 'prompt': prompt,
49
+ 'q': str(level) + '####\n' + json.dumps(q),
50
+ 'level': level
51
+ })
52
+ dataset = Dataset.from_list(raw_data)
53
+ return dataset
54
+
55
+
56
+ @ICL_EVALUATORS.register_module(force=True)
57
+ class hard_MSP_Evaluator(BaseEvaluator):
58
+
59
+ def score(self, predictions, references):
60
+ assert len(predictions) == len(references)
61
+
62
+ result = {'pass': 0, 'fail': 0}
63
+ for index, (q, output) in enumerate(zip(references, predictions)):
64
+ output_dict = {}
65
+ level = int(q.split('####\n')[0])
66
+ q = json.loads(q.split('####\n')[-1])
67
+
68
+ output_dict['output'] = output
69
+ output_dict['level'] = level
70
+ try:
71
+ output_dict['correctness'], _ = self.mspCheck(q, output)
72
+ except Exception as e:
73
+ print(f'Check failed: {e}')
74
+ output_dict['correctness'] = False
75
+
76
+ if output_dict['correctness']:
77
+ r = 'pass'
78
+ else:
79
+ r = 'fail'
80
+ result[r] += level
81
+
82
+ result['score'] = result['pass'] / (result['pass'] + result['fail']) * 100
83
+ final_result = {'Weighted Accuracy': result['score']}
84
+ return final_result
85
+
86
+ def mspCheck(self, instance, llm_string):
87
+ """Validate the MSP solution.
88
+
89
+ Parameters:
90
+ - instance: The MSP instance as a dictionary.
91
+ - solution: A dictionary with meeting ids as keys and lists of scheduled time slots as values.
92
+
93
+ Returns:
94
+ - A tuple (is_valid, message). is_valid is True if the solution is valid, False otherwise.
95
+ message contains information about the validity of the solution.
96
+ """
97
+ # print(llm_string)
98
+ solution, reasoning_element = self.parse_xml_to_dict(llm_string)
99
+ # print(solution.text)
100
+
101
+ # convert solution to dictionary
102
+ if solution == '':
103
+ return False, None
104
+ elif solution is None:
105
+ return False, None
106
+ else:
107
+ if isinstance(solution, str):
108
+ try:
109
+ solution = ast.literal_eval(solution)
110
+ if solution is None:
111
+ return False, None
112
+ except Exception:
113
+ try:
114
+ solution = ast.literal_eval('{' + solution + '}')
115
+ if solution is None:
116
+ return False, None
117
+ except Exception:
118
+ return False, None
119
+ else:
120
+ try:
121
+ solution = ast.literal_eval(solution.text)
122
+ if solution is None:
123
+ return False, None
124
+ except Exception:
125
+ return False, None
126
+ # convert key type to int
127
+ if isinstance(solution, dict):
128
+ print(solution)
129
+ solution = {int(k): v for k, v in solution.items()}
130
+ else:
131
+ return False, None
132
+
133
+ # Check if all meetings are scheduled within the available time slots
134
+ for meeting in instance['meetings']:
135
+ m_id = meeting['id']
136
+ duration = meeting['duration']
137
+ scheduled_slots = solution.get(m_id, None)
138
+
139
+ # Check if the meeting is scheduled
140
+ if scheduled_slots is None:
141
+ return False, f'Meeting {m_id} is not scheduled.'
142
+
143
+ # Check if the meeting fits within the number of total time slots
144
+ if any(slot >= instance['time_slots'] for slot in scheduled_slots):
145
+ return False, f'Meeting {m_id} does not fit within the available time slots.'
146
+
147
+ # Check if the scheduled slots are contiguous and fit the meeting duration
148
+ if len(scheduled_slots) != duration or not all(scheduled_slots[i] + 1 == scheduled_slots[i + 1]
149
+ for i in range(len(scheduled_slots) - 1)):
150
+ return False, f'Meeting {m_id} is not scheduled in contiguous time slots fitting its duration.'
151
+
152
+ # Check if all participants are available at the scheduled time
153
+ for p_id, participant in instance['participants'].items():
154
+ if m_id in participant['meetings']:
155
+ if not all(slot in participant['available_slots'] for slot in scheduled_slots):
156
+ return False, f'Participant {p_id} is not available for meeting {m_id} at the scheduled time.'
157
+
158
+ # Check if any participant is double-booked
159
+ participants_schedule = {p_id: [] for p_id in instance['participants']}
160
+ for m_id, time_slots in solution.items():
161
+ try:
162
+ duration = next(meeting['duration'] for meeting in instance['meetings'] if meeting['id'] == m_id)
163
+ if len(time_slots) != duration:
164
+ return False, f'Meeting {m_id} duration does not match the number of scheduled time slots.'
165
+ for p_id, participant in instance['participants'].items():
166
+ if m_id in participant['meetings']:
167
+ participants_schedule[p_id].extend(time_slots)
168
+ except Exception:
169
+ return False, f'Meeting {m_id} is not in the instance or program error.'
170
+
171
+ for p_id, slots in participants_schedule.items():
172
+ if len(slots) != len(set(slots)):
173
+ return False, f'Participant {p_id} is double-booked.'
174
+
175
+ return True, 'The solution is valid.'
176
+
177
+ def parse_xml_to_dict(self, xml_string):
178
+ try:
179
+ # Parse the XML string
180
+ root = ET.fromstring(xml_string)
181
+
182
+ # Find the 'final_answer' tag
183
+ final_answer_element = root.find('final_answer')
184
+
185
+ # Find the 'reasoning' tag
186
+ reasoning_element = root.find('reasoning')
187
+ except Exception:
188
+ try:
189
+ assert '<final_answer>' in xml_string
190
+ assert '</final_answer>' in xml_string
191
+ assert '<reasoning>' in xml_string
192
+ assert '</reasoning>' in xml_string
193
+ final_answer_start = xml_string.index('<final_answer>') + len('<final_answer>')
194
+ final_answer_end = xml_string.index('</final_answer>')
195
+ reasoning_start = xml_string.index('<reasoning>') + len('<reasoning>')
196
+ reasoning_end = xml_string.index('</reasoning>')
197
+ final_answer_element = xml_string[final_answer_start:final_answer_end]
198
+ reasoning_element = xml_string[reasoning_start:reasoning_end]
199
+ except Exception:
200
+ final_answer_element = ''
201
+ reasoning_element = ''
202
+
203
+ return final_answer_element, reasoning_element
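
For orientation, a hypothetical MSP instance and solution in the shape `mspCheck` validates: each meeting id maps to a contiguous run of slots matching its duration, every listed participant must be available for all of those slots, and no participant may be double-booked:

```python
instance = {
    'time_slots': 4,
    'meetings': [{'id': 0, 'duration': 2}, {'id': 1, 'duration': 1}],
    'participants': {
        '0': {'available_slots': [0, 1, 2, 3], 'meetings': [0, 1]},
        '1': {'available_slots': [2, 3], 'meetings': [1]},
    },
}
# Meeting 0 in slots 0-1, meeting 1 in slot 2: contiguous, inside the
# horizon, everyone available, nobody double-booked. mspCheck's constraint
# walk would accept this once extracted from the model's <final_answer>.
solution = {0: [0, 1], 1: [2]}
```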
opencompass-my-api/build/lib/opencompass/datasets/NPHardEval/hard_TSP.py ADDED
@@ -0,0 +1,211 @@
1
+ import ast
2
+ import json
3
+ import xml.etree.ElementTree as ET
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+ from datasets import Dataset
8
+
9
+ from opencompass.openicl.icl_evaluator import BaseEvaluator
10
+ from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
11
+
12
+ from ..base import BaseDataset
13
+ from .prompts import tspPrompts
14
+
15
+
16
+ def q2text(q, p=tspPrompts): # q is the data for the NP-hard question, p is the prompt
17
+ total_cities = q.shape[0]
18
+ prompt_text = p['Intro'] + '\n' \
19
+ + p['Initial_question'].format(total_cities=total_cities) + '\n' \
20
+ + p['Output_content'] + '\n' \
21
+ + p['Output_format'] + \
22
+ '\n The distances between cities are below: \n'
23
+ for i in range(q.shape[0]):
24
+ for j in range(q.shape[1]):
25
+ if i < j: # only use the upper triangle
26
+ this_line = 'The path between City {} and City {} is with distance {}.'.format(i, j, q.iloc[i, j])
27
+ prompt_text += this_line + '\n'
28
+ return prompt_text
29
+
30
+
31
+ @LOAD_DATASET.register_module(force=True)
32
+ class hard_TSP_Dataset(BaseDataset):
33
+
34
+ @staticmethod
35
+ def load(path: str):
36
+ raw_data = []
37
+ data_path = path
38
+ all_data = []
39
+ for level in range(10):
40
+ for file_num in range(10):
41
+ # read the distance-matrix CSV into a dataframe
42
+ df = pd.read_csv(data_path + 'synthesized_data_TSP_level_{}_instance_{}.csv'.format(level, file_num + 1),
43
+ header=None,
44
+ index_col=False)
45
+ # keep (difficulty level, dataframe) pairs
46
+ all_data.append((level + 1, df))
47
+ for (level, q) in all_data:
48
+ prompt = q2text(q)
49
+ raw_data.append({
50
+ 'prompt': prompt,
51
+ 'q': str(level) + '####\n' + json.dumps(q.to_json()),
52
+ 'level': level
53
+ })
54
+ dataset = Dataset.from_list(raw_data)
55
+ return dataset
56
+
57
+
58
+ @ICL_EVALUATORS.register_module(force=True)
59
+ class hard_TSP_Evaluator(BaseEvaluator):
60
+
61
+ def score(self, predictions, references):
62
+ assert len(predictions) == len(references)
63
+
64
+ result = {'pass': 0, 'fail': 0}
65
+ for index, (q, output) in enumerate(zip(references, predictions)):
66
+ output_dict = {}
67
+ level = int(q.split('####\n')[0])
68
+ q = json.loads(q.split('####\n')[-1])
69
+ q = pd.DataFrame(json.loads(q))  # the frame was serialized via json.dumps(df.to_json())
70
+
71
+ output_dict['output'] = output
72
+ try:
73
+ output_dict['correctness'], _ = self.tspCheck(q, output)
74
+ except Exception as e:
75
+ print(f'Check failed: {e}')
76
+ output_dict['correctness'] = False
77
+ output_dict['level'] = level
78
+
79
+ if output_dict['correctness']:
80
+ r = 'pass'
81
+ else:
82
+ r = 'fail'
83
+ result[r] += level
84
+
85
+ result['score'] = result['pass'] / (result['pass'] + result['fail']) * 100
86
+ final_result = {'Weighted Accuracy': result['score']}
87
+ return final_result
88
+
89
+ def parse_xml_to_dict(self, xml_string):
90
+ try:
91
+ # Parse the XML string
92
+ root = ET.fromstring(xml_string)
93
+
94
+ # Find the 'final_answer' tag
95
+ final_answer_element = root.find('final_answer')
96
+
97
+ # Find the 'reasoning' tag
98
+ reasoning_element = root.find('reasoning')
99
+ except Exception:
100
+ try:
101
+ assert '<final_answer>' in xml_string
102
+ assert '</final_answer>' in xml_string
103
+ assert '<reasoning>' in xml_string
104
+ assert '</reasoning>' in xml_string
105
+ final_answer_start = xml_string.index('<final_answer>') + len('<final_answer>')
106
+ final_answer_end = xml_string.index('</final_answer>')
107
+ reasoning_start = xml_string.index('<reasoning>') + len('<reasoning>')
108
+ reasoning_end = xml_string.index('</reasoning>')
109
+ final_answer_element = xml_string[final_answer_start:final_answer_end]
110
+ reasoning_element = xml_string[reasoning_start:reasoning_end]
111
+ except Exception:
112
+ final_answer_element = ''
113
+ reasoning_element = ''
114
+
115
+ return final_answer_element, reasoning_element
116
+
117
+ def tspCheck(self, distance_matrix, llm_string):
118
+ """Check if the TSP solution is complete and if the distance matches
119
+ the greedy solution.
120
+
121
+ :param distance_matrix: 2D numpy array representing the distances between cities
122
+ :param llm_string: raw model output whose final answer holds the tour as "0->1->2->...->N->0"
123
+ :return: A tuple (is_correct, message); correct iff the tour is complete and matches the greedy distance
124
+ """
125
+ # convert distance_matrix to numpy array
126
+ distance_matrix = np.array(distance_matrix)
127
+
128
+ # Convert the tour string to a list of integers
129
+ # print(llm_string)
130
+ final_answer_element, reasoning_element = self.parse_xml_to_dict(llm_string)
131
+ # convert solution to dictionary
132
+ if final_answer_element == '':
133
+ return False, ''
134
+ elif final_answer_element is None:
135
+ return False, ''
136
+ else:
137
+ if isinstance(final_answer_element, str):
138
+ try:
139
+ tour_string = ast.literal_eval(final_answer_element)['Path']
140
+ if tour_string is None:
141
+ return False, ''
142
+ except Exception:
143
+ try:
144
+ tour_string = ast.literal_eval('{' + final_answer_element + '}')['Path']
145
+ if tour_string is None:
146
+ return False, ''
147
+ except Exception:
148
+ return False, ''
149
+ else:
150
+ try:
151
+ tour_string = ast.literal_eval(final_answer_element.text)['Path']
152
+ if tour_string is None:
153
+ return False, ''
154
+ except Exception:
155
+ return False, ''
156
+ try:
157
+ tour = list(map(int, tour_string.split('->')))
158
+ except Exception:
159
+ return False, ''
160
+ # we could also print `reasoning_element` to see the reasoning behind the answer
161
+ # we could also print the final distance of the tour by `final_answer_element['Distance']`
162
+
163
+ # Check if tour is a cycle
164
+ if tour[0] != tour[-1]:
165
+ return False, 'The tour must start and end at the same city.'
166
+
167
+ # Check if all cities are visited
168
+ if len(tour) != len(distance_matrix) + 1:
169
+ return False, 'The tour does not visit all cities exactly once.'
170
+
171
+ # Calculate the distance of the provided tour
172
+ tour_distance = sum(distance_matrix[tour[i]][tour[i + 1]]
173
+ for i in range(len(tour) - 1))
174
+
175
+ # Find the greedy tour distance for comparison
176
+ greedy_tour, greedy_distance = self.greedy_tsp(distance_matrix)
177
+
178
+ # Check if the provided tour distance is equal to the greedy tour distance
179
+ if tour_distance != greedy_distance:
180
+ return False, f'The tour distance ({tour_distance}) does not match the greedy solution ({greedy_distance}).'
181
+
182
+ return True, 'The solution is complete and matches the greedy solution distance.'
183
+
184
+ def greedy_tsp(self, distance_matrix):
185
+ """Solve the Traveling Salesman Problem using a greedy algorithm.
186
+
187
+ :param distance_matrix: 2D numpy array where the element at [i, j] is the distance between city i and j
188
+ :return: A tuple containing a list of the cities in the order they were visited and the total distance
189
+ """
190
+ num_cities = distance_matrix.shape[0]
191
+ unvisited_cities = set(range(num_cities))
192
+ current_city = np.random.choice(list(unvisited_cities))
193
+ tour = [current_city]
194
+ total_distance = 0
195
+
196
+ while unvisited_cities:
197
+ unvisited_cities.remove(current_city)
198
+ if unvisited_cities:
199
+ # Find the nearest unvisited city
200
+ distances_to_unvisited = distance_matrix[current_city][list(unvisited_cities)]
201
+ nearest_city = list(unvisited_cities)[np.argmin(distances_to_unvisited)]
202
+ tour.append(nearest_city)
203
+ # Update the total distance
204
+ total_distance += distance_matrix[current_city, nearest_city]
205
+ current_city = nearest_city
206
+
207
+ # Return to start
208
+ total_distance += distance_matrix[current_city, tour[0]]
209
+ tour.append(tour[0])
210
+
211
+ return tour, total_distance
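
A deterministic walk-through of the nearest-neighbour logic in `greedy_tsp` (the method above starts from a random city; this sketch fixes city 0 for reproducibility):

```python
import numpy as np

d = np.array([[0, 2, 9],
              [2, 0, 6],
              [9, 6, 0]])
tour, total, current = [0], 0, 0
unvisited = {1, 2}
while unvisited:
    nearest = min(unvisited, key=lambda c: d[current, c])  # closest unvisited city
    total += d[current, nearest]
    tour.append(nearest)
    unvisited.remove(nearest)
    current = nearest
total += d[current, tour[0]]  # close the cycle
tour.append(tour[0])
print(tour, total)  # [0, 1, 2, 0] 17
```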
opencompass-my-api/build/lib/opencompass/datasets/NPHardEval/p_BSP.py ADDED
@@ -0,0 +1,124 @@
1
+ import ast
2
+ import json
3
+
4
+ from datasets import Dataset
5
+
6
+ from opencompass.openicl.icl_evaluator import BaseEvaluator
7
+ from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
8
+
9
+ from ..base import BaseDataset
10
+ from .prompts import bspPrompts
11
+
12
+
13
+ def q2text(q, p=bspPrompts):
14
+ target_value = q['target']
15
+ # TODO: the source data is not pre-sorted, so sort it here
16
+ array = sorted(q['array'])
17
+ prompt_text = p['Intro'] + '\n' + \
18
+ p['Initial_question'].format(target_value=target_value) + '\n' + \
19
+ p['Output_content'] + '\n' + \
20
+ p['Output_format'] + \
21
+ '\n The sorted array elements are: ' + ', '.join(map(str, array)) + '\n'
22
+
23
+ return prompt_text
24
+
25
+
26
+ @LOAD_DATASET.register_module(force=True)
27
+ class p_BSP_Dataset(BaseDataset):
28
+
29
+ @staticmethod
30
+ def load(path: str):
31
+ raw_data = []
32
+ data_path = path
33
+ all_data = []
34
+ with open(data_path + 'bsp_instances.json', 'r') as f:
35
+ data = json.load(f)
36
+ for sample in data:
37
+ level = len(sample['array']) - 2
38
+ all_data.append((level, sample))
39
+
40
+ for level, q in all_data:
41
+ prompt = q2text(q)
42
+ raw_data.append({
43
+ 'prompt': prompt,
44
+ 'q': str(level) + '####\n' + json.dumps(q),
45
+ 'level': level
46
+ })
47
+ dataset = Dataset.from_list(raw_data)
48
+ return dataset
49
+
50
+
51
+ @ICL_EVALUATORS.register_module(force=True)
52
+ class p_BSP_Evaluator(BaseEvaluator):
53
+
54
+ def score(self, predictions, references):
55
+ assert len(predictions) == len(references)
56
+
57
+ result = {'pass': 0, 'fail': 0}
58
+ for index, (q, output) in enumerate(zip(references, predictions)):
59
+ output_dict = {}
60
+ level = int(q.split('####\n')[0])
61
+ q = json.loads(q.split('####\n')[-1])
62
+ output, reasoning = self.parse_xml_to_dict(output)
63
+ output_dict['output'] = output
64
+ try:
65
+ output_dict['correctness'], _ = self.bsp_check(q, output)
66
+ except Exception as e:
67
+ print(f'Check failed: {e}')
68
+ output_dict['correctness'] = False
69
+ output_dict['reasoning'] = reasoning
70
+ output_dict['level'] = level
71
+
72
+ if output_dict['correctness']:
73
+ r = 'pass'
74
+ else:
75
+ r = 'fail'
76
+ result[r] += level
77
+
78
+ result['score'] = result['pass'] / (result['pass'] + result['fail']) * 100
79
+ final_result = {'Weighted Accuracy': result['score']}
80
+ return final_result
81
+
82
+ def parse_xml_to_dict(self, xml_string):
83
+ try:
84
+ assert '<final_answer>' in xml_string
85
+ assert '</final_answer>' in xml_string
86
+ assert '<reasoning>' in xml_string
87
+ assert '</reasoning>' in xml_string
88
+ final_answer_start = xml_string.index('<final_answer>') + len('<final_answer>')
89
+ final_answer_end = xml_string.index('</final_answer>')
90
+ reasoning_start = xml_string.index('<reasoning>') + len('<reasoning>')
91
+ reasoning_end = xml_string.index('</reasoning>')
92
+ final_answer_element = xml_string[final_answer_start:final_answer_end].strip()
93
+ reasoning_element = xml_string[reasoning_start:reasoning_end].strip()
94
+ try:
95
+ final_answer_element = ast.literal_eval(final_answer_element)
96
+ except Exception:
97
+ final_answer_element = ''
98
+ except Exception:
99
+ final_answer_element = ''
100
+ reasoning_element = ''
101
+
102
+ return final_answer_element, reasoning_element
103
+
104
+ def bsp_check(self, instance, solution):
105
+ """Check if the binary search solution is valid.
106
+
107
+ :param instance: The instance dictionary with array and target value.
108
+ :param solution: The solution dictionary with the position of the target value.
109
+ :return: A tuple of (is_correct, message).
110
+ """
111
+ array = sorted(instance['array'])
112
+ target_value = instance['target']
113
+ if isinstance(solution, str):  # accept raw XML as well as the dict already parsed in score()
+     solution, _ = self.parse_xml_to_dict(solution)
114
+ if isinstance(solution, str):
115
+ return False, 'The solution is invalid.'
116
+ try:
117
+ position = int(solution['Position'])
118
+ except Exception:
119
+ return False, 'The solution is invalid.'
120
+ if position < 0 or position >= len(array):
121
+ return False, 'The reported position is out of range.'
122
+ elif array[position] != target_value:
123
+ return False, 'The target index is incorrect.'
124
+ return True, 'The solution is valid.'
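
To illustrate what `bsp_check` accepts: the reported `Position` must index the target value in the sorted array. A toy example with a hypothetical parsed answer:

```python
array = sorted([7, 1, 5, 3])  # -> [1, 3, 5, 7]
target = 5
solution = {'Position': 2}    # hypothetical model output after parsing

pos = int(solution['Position'])
ok = 0 <= pos < len(array) and array[pos] == target
print(ok)  # True; any other index (or -1) would be rejected
```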
opencompass-my-api/build/lib/opencompass/datasets/NPHardEval/p_EDP.py ADDED
@@ -0,0 +1,145 @@
1
+ import ast
2
+ import json
3
+
4
+ from datasets import Dataset
5
+
6
+ from opencompass.openicl.icl_evaluator import BaseEvaluator
7
+ from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
8
+
9
+ from ..base import BaseDataset
10
+ from .prompts import edpPrompts
11
+
12
+
13
+ def q2text(q, p=edpPrompts):
14
+ string_a = q['string_a']
15
+ string_b = q['string_b']
16
+ prompt_text = p['Intro'] + '\n' + \
17
+ p['Initial_question'].format(string_a=string_a, string_b=string_b) + '\n' + \
18
+ p['Output_content'] + '\n' + \
19
+ p['Output_format']
20
+ return prompt_text
21
+
22
+
23
+ @LOAD_DATASET.register_module(force=True)
24
+ class p_EDP_Dataset(BaseDataset):
25
+
26
+ @staticmethod
27
+ def load(path: str):
28
+ raw_data = []
29
+ data_path = path
30
+ all_data = []
31
+ with open(data_path + 'edp_instances.json', 'r') as f:
32
+ data = json.load(f)
33
+ for sample in data:
34
+ level = len(sample['string_a']) - 2
35
+ all_data.append((level, sample))
36
+
37
+ for level, q in all_data:
38
+ prompt = q2text(q)
39
+ raw_data.append({
40
+ 'prompt': prompt,
41
+ 'q': str(level) + '####\n' + json.dumps(q),
42
+ 'level': level
43
+ })
44
+ dataset = Dataset.from_list(raw_data)
45
+ return dataset
46
+
47
+
48
+ @ICL_EVALUATORS.register_module(force=True)
49
+ class p_EDP_Evaluator(BaseEvaluator):
50
+
51
+ def score(self, predictions, references):
52
+ assert len(predictions) == len(references)
53
+
54
+ result = {'pass': 0, 'fail': 0}
55
+ for index, (q, output) in enumerate(zip(references, predictions)):
56
+ output_dict = {}
57
+ level = int(q.split('####\n')[0])
58
+ q = json.loads(q.split('####\n')[-1])
59
+ output, reasoning = self.parse_xml_to_dict(output)
60
+ output_dict['output'] = output
61
+ try:
62
+ output_dict['correctness'], _ = self.edp_check(q, output)
63
+ except Exception as e:
64
+ print(f'Check failed: {e}')
65
+ output_dict['correctness'] = False
66
+ output_dict['reasoning'] = reasoning
67
+ output_dict['level'] = level
68
+
69
+ if output_dict['correctness']:
70
+ r = 'pass'
71
+ else:
72
+ r = 'fail'
73
+ result[r] += level
74
+
75
+ result['score'] = result['pass'] / (result['pass'] + result['fail']) * 100
76
+ final_result = {'Weighted Accuracy': result['score']}
77
+ return final_result
78
+
79
+ def compute_min_edit_distance(self, string_a, string_b):
80
+ """Computes the minimum edit distance between two strings using dynamic
81
+ programming."""
82
+ m, n = len(string_a), len(string_b)
83
+ dp = [[0] * (n + 1) for _ in range(m + 1)]
84
+
85
+ for i in range(m + 1):
86
+ for j in range(n + 1):
87
+ if i == 0:
88
+ dp[i][j] = j
89
+ elif j == 0:
90
+ dp[i][j] = i
91
+ elif string_a[i - 1] == string_b[j - 1]:
92
+ dp[i][j] = dp[i - 1][j - 1]
93
+ else:
94
+ dp[i][j] = 1 + min(dp[i - 1][j], dp[i][j - 1], dp[i - 1][j - 1])
95
+ return dp[m][n]
96
+
97
+ def edp_check(self, instance, solution):
98
+ """Check if the edit distance solution is valid.
99
+
100
+ :param instance: The instance dictionary with 'string_a' and 'string_b'.
101
+ :param solution: The solution dictionary with the reported 'edit_distance'.
102
+ :return: A tuple of (is_correct, message).
103
+ """
104
+ string_a = instance['string_a']
105
+ string_b = instance['string_b']
106
+ try:
107
+ reported_distance = int(solution.get('Operations', -1))
108
+ except Exception:
109
+ reported_distance = -1
110
+
111
+ actual_distance = self.compute_min_edit_distance(string_a, string_b)
112
+
113
+ if reported_distance == -1:
114
+ return False, 'No solution provided.'
115
+ elif reported_distance != actual_distance:
116
+ return False, f'The reported edit distance ({reported_distance}) is incorrect. Actual distance: {actual_distance}.'
117
+ return True, 'The solution is valid.'
118
+
119
+ def parse_xml_to_dict(self, xml_string):
120
+ try:
121
+ assert '<final_answer>' in xml_string
122
+ assert '</final_answer>' in xml_string
123
+ # assert '<reasoning>' in xml_string
124
+ # assert '</reasoning>' in xml_string
125
+ final_answer_start = xml_string.index('<final_answer>') + len('<final_answer>')
126
+ final_answer_end = xml_string.index('</final_answer>')
127
+ # reasoning_start = xml_string.index('<reasoning>') + len('<reasoning>')
128
+ # reasoning_end = xml_string.index('</reasoning>')
129
+ final_answer_element = xml_string[final_answer_start:final_answer_end].rstrip().strip().rstrip()
130
+ assert '{' in final_answer_element
131
+ assert '}' in final_answer_element
132
+ dic_start = final_answer_element.index('{')
133
+ dic_end = final_answer_element.index('}')
134
+ final_answer_element = final_answer_element[dic_start:dic_end + 1].rstrip().strip().rstrip()
135
+ reasoning_element = xml_string
136
+ # reasoning_element = xml_string[reasoning_start:reasoning_end].rstrip().strip().rstrip()
137
+ try:
138
+ final_answer_element = ast.literal_eval(final_answer_element)
139
+ except Exception:
140
+ final_answer_element = ''
141
+ except Exception:
142
+ final_answer_element = ''
143
+ reasoning_element = ''
144
+
145
+ return final_answer_element, reasoning_element
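Note: the DP table above is the classic Levenshtein recurrence; a minimal standalone check of the same logic:

# Standalone copy of the edit-distance DP above, for a quick sanity check.
def min_edit_distance(a, b):
    m, n = len(a), len(b)
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(m + 1):
        for j in range(n + 1):
            if i == 0:
                dp[i][j] = j          # insert all of b
            elif j == 0:
                dp[i][j] = i          # delete all of a
            elif a[i - 1] == b[j - 1]:
                dp[i][j] = dp[i - 1][j - 1]   # characters match, no cost
            else:
                dp[i][j] = 1 + min(dp[i - 1][j],      # deletion
                                   dp[i][j - 1],      # insertion
                                   dp[i - 1][j - 1])  # substitution
    return dp[m][n]

print(min_edit_distance('kitten', 'sitting'))  # -> 3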
opencompass-my-api/build/lib/opencompass/datasets/NPHardEval/p_SPP.py ADDED
@@ -0,0 +1,196 @@
+ import ast
+ import json
+
+ import networkx as nx
+ from datasets import Dataset
+
+ from opencompass.openicl.icl_evaluator import BaseEvaluator
+ from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
+
+ from ..base import BaseDataset
+ from .prompts import sppPrompts
+
+
+ def q2text(q, p=sppPrompts):
+     # start_node = q['start_node']
+     # end_node = q['end_node']
+     # TO-DO: fix later
+     start_node = q['nodes'][0]
+     end_node = q['nodes'][-1]
+     edges = q['edges']
+     prompt_text = p['Intro'] + '\n' + \
+         p['Initial_question'].format(start_node=start_node, end_node=end_node) + '\n' + \
+         p['Output_content'] + '\n' + \
+         p['Output_format'] + \
+         "\n The graph's edges and weights are as follows: \n"
+     for edge in edges:
+         this_line = f"Edge from {edge['from']} to {edge['to']} has a weight of {edge['weight']}."
+         prompt_text += this_line + '\n'
+     return prompt_text
+
+
+ @LOAD_DATASET.register_module(force=True)
+ class p_SPP_Dataset(BaseDataset):
+
+     @staticmethod
+     def load(path: str):
+         raw_data = []
+         data_path = path
+         all_data = []
+         with open(data_path + 'spp_instances.json', 'r') as f:
+             data = json.load(f)
+             all_data = zip([int(d['complexity_level']) for d in data], data)
+         for level, q in all_data:
+             prompt = q2text(q)
+             raw_data.append({
+                 'prompt': prompt,
+                 'q': str(level) + '####\n' + json.dumps(q),
+                 'level': level
+             })
+         dataset = Dataset.from_list(raw_data)
+         return dataset
+
+
+ @ICL_EVALUATORS.register_module(force=True)
+ class p_SPP_Evaluator(BaseEvaluator):
+
+     def score(self, predictions, references):
+         assert len(predictions) == len(references)
+
+         result = {'pass': 0, 'fail': 0}
+         for index, (q, output) in enumerate(zip(references, predictions)):
+             output_dict = {}
+             level = int(q.split('####\n')[0])
+             q = json.loads(q.split('####\n')[-1])
+             output, reasoning = self.parse_xml_to_dict(output)
+             output_dict['output'] = output
+             try:
+                 output_dict['correctness'], _ = self.spp_check(q, output)
+             except Exception as e:
+                 print(f'Check failed: {e}')
+                 output_dict['correctness'] = False
+             output_dict['reasoning'] = reasoning
+             output_dict['level'] = level
+
+             if output_dict['correctness']:
+                 r = 'pass'
+             else:
+                 r = 'fail'
+             result[r] += level
+
+         result['score'] = result['pass'] / (result['pass'] + result['fail']) * 100
+         final_result = {'Weighted Accuracy': result['score']}
+         return final_result
+
+     def parse_xml_to_dict(self, xml_string):
+         try:
+             assert '<final_answer>' in xml_string
+             assert '</final_answer>' in xml_string
+             # assert '<reasoning>' in xml_string
+             # assert '</reasoning>' in xml_string
+             final_answer_start = xml_string.index('<final_answer>') + len('<final_answer>')
+             final_answer_end = xml_string.index('</final_answer>')
+             # reasoning_start = xml_string.index('<reasoning>') + len('<reasoning>')
+             # reasoning_end = xml_string.index('</reasoning>')
+             final_answer_element = xml_string[final_answer_start:final_answer_end].strip()
+             assert '{' in final_answer_element
+             assert '}' in final_answer_element
+             dic_start = final_answer_element.index('{')
+             dic_end = final_answer_element.index('}')
+             final_answer_element = final_answer_element[dic_start:dic_end + 1].strip()
+             # reasoning_element = xml_string[reasoning_start:reasoning_end].strip()
+             try:
+                 final_answer_element = ast.literal_eval(final_answer_element)
+                 reasoning_element = xml_string
+             except Exception:
+                 final_answer_element = ''
+                 reasoning_element = xml_string
+         except Exception:
+             final_answer_element = ''
+             reasoning_element = ''
+
+         return final_answer_element, reasoning_element
+
+     def ssp_optimal_solution(self, instance, source, target):
+         """Provide the optimal solution for the SPP instance.
+
+         :param instance: The SPP instance as a dictionary with 'nodes' and 'edges'.
+         :param source: The source node.
+         :param target: The destination node.
+         :return: The optimal shortest path length and path.
+         """
+         G = nx.Graph()
+         G.add_nodes_from(instance['nodes'])
+         G.add_weighted_edges_from([(edge['from'], edge['to'], edge['weight'])
+                                    for edge in instance['edges']])
+         shortest_path_length = None
+         shortest_path = None
+         if nx.has_path(G, source=source, target=target):
+             shortest_path_length = nx.shortest_path_length(G, source=source, target=target, weight='weight')
+             shortest_path = nx.shortest_path(G, source=source, target=target, weight='weight')
+         return shortest_path_length, shortest_path
+
+     # SPP
+     def spp_check(self, instance, solution, start_node=None, end_node=None):
+         """Validate the solution of the SPP problem.
+
+         :param instance: The instance dictionary with nodes and edges.
+         :param solution: The solution dictionary with the path and total distance.
+         :param start_node: The start node.
+         :param end_node: The end node.
+         :return: A tuple of (is_correct, message).
+         """
+         # Get the start and end nodes
+         # Currently, the start and end nodes are the first and last nodes in the instance
+         if start_node is None:
+             start_node = instance['nodes'][0]
+         if end_node is None:
+             end_node = instance['nodes'][-1]
+
+         # Convert solution to dictionary
+         try:
+             path_string = solution.get('Path', '')
+             cost_string = solution.get('TotalDistance', '')
+         except Exception:
+             return False, 'The solution is not a dictionary.'
+
+         # Calculate the optimal solution
+         ssp_optimal_length, ssp_optimal_path = self.ssp_optimal_solution(
+             instance, start_node, end_node)
+         if ssp_optimal_length is None:
+             if isinstance(cost_string, int) or cost_string.isdigit():
+                 return False, f'No path exists from node {start_node} to node {end_node}.'
+             else:
+                 return True, f'No path found from node {start_node} to node {end_node}.'
+
+         try:
+             path = list(map(int, path_string.split('->')))
+             total_cost = int(cost_string)
+         except Exception:
+             return False, 'The solution is not a valid dictionary.'
+
+         # Check if the path starts and ends with the correct nodes
+         if not path or path[0] != start_node or path[-1] != end_node:
+             return False, 'The path does not start or end at the correct nodes.'
+
+         # Check if the path is continuous and calculate the cost
+         calculated_cost = 0
+
+         def is_in_edge(edge, from_node, to_node):
+             return (edge['from'] == from_node and edge['to'] == to_node) or \
+                 (edge['from'] == to_node and edge['to'] == from_node)
+
+         for i in range(len(path) - 1):
+             from_node, to_node = path[i], path[i + 1]
+             edge = next((edge for edge in instance['edges'] if is_in_edge(edge, from_node, to_node)), None)
+
+             if not edge:
+                 return False, f'No edge found from node {from_node} to node {to_node}.'
+
+             calculated_cost += edge['weight']
+
+         # Check if the calculated cost matches the total cost provided in the solution
+         if calculated_cost != total_cost:
+             return False, f'The calculated cost ({calculated_cost}) does not match the provided total cost ({total_cost}).'
+
+         if calculated_cost != ssp_optimal_length:
+             # spp_optimal_path = '->'.join(map(str, ssp_optimal_path))
+             return False, f'The calculated cost ({calculated_cost}) does not match the optimal solution ({ssp_optimal_length}): {ssp_optimal_path}.'
+
+         return True, 'The solution is valid.'
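Note: a minimal SPP instance in the same schema the loader expects, checked with networkx directly (spp_check above performs the same optimality comparison):

# Tiny instance: the optimal 0 -> 1 -> 2 route costs 5, beating the
# direct 0 -> 2 edge of weight 9.
import networkx as nx

instance = {
    'nodes': [0, 1, 2],
    'edges': [{'from': 0, 'to': 1, 'weight': 4},
              {'from': 1, 'to': 2, 'weight': 1},
              {'from': 0, 'to': 2, 'weight': 9}],
}
G = nx.Graph()
G.add_nodes_from(instance['nodes'])
G.add_weighted_edges_from((e['from'], e['to'], e['weight'])
                          for e in instance['edges'])
print(nx.shortest_path(G, 0, 2, weight='weight'),
      nx.shortest_path_length(G, 0, 2, weight='weight'))  # [0, 1, 2] 5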
opencompass-my-api/build/lib/opencompass/datasets/NPHardEval/prompts.py ADDED
@@ -0,0 +1,96 @@
+ # Overall few-shot prompts
+ FEW_SHOT_SELF = 'Please refer to a few examples of this problem and the corresponding reasoning process. The examples are:'
+ FEW_SHOT_OTHERS = 'Please refer to a few examples of another problem and the corresponding reasoning process. The problem is {initial_question}. {output_content}. The examples are:'
+
+ # P problems
+ sppPrompts = {
+     'Intro': 'The Shortest Path Problem (SPP) involves finding the shortest path between two nodes in a weighted graph.',
+     'Initial_question': "You need to find the shortest path between node {start_node} and node {end_node} in a graph. The graph's edges and their weights are given.",
+     'Output_content': 'Please provide the shortest path from {start_node} to {end_node} and its total distance. Offer a concise step-by-step explanation of your reasoning process. Aim for brevity and clarity in your response.',
+     'Output_format': "Your output should be enclosed within <root></root> tags. Include your reasoning in <reasoning></reasoning> tags and the final path and total distance in <final_answer></final_answer> tags, like <final_answer>{'Path': 'START->...->END', 'TotalDistance': 'INT_TOTAL_DISTANCE'}</final_answer>.",
+     'Few_shot_self': FEW_SHOT_SELF,
+     'Few_shot_others': FEW_SHOT_OTHERS
+ }
+
+ mfpPrompts = {
+     'Intro': 'The Maximum Flow Problem (MFP) seeks to find the maximum possible flow from a source node to a sink node in a flow network, subject to capacity constraints on the edges.',
+     'Initial_question': 'Determine the maximum flow from the source node {source_node} to the sink node {sink_node} in the given flow network. The capacities of the edges are provided.',
+     'Output_content': 'Please indicate the maximum flow value and the flow for each edge. Provide a brief explanation of your methodology. Keep your response concise and focused.',
+     'Output_format': "Enclose your output within <root></root> tags. Present your reasoning in <reasoning></reasoning> tags and the final maximum flow and edge flows in <final_answer></final_answer> tags, like <final_answer>{'MaxFlow': 'MAX_FLOW_VALUE', 'Flows': {'NODE_1->NODE_2': FLOW, ...}}</final_answer>.",
+     'Few_shot_self': FEW_SHOT_SELF,
+     'Few_shot_others': FEW_SHOT_OTHERS
+ }
+
+ bspPrompts = {
+     'Intro': 'The Binary Search Problem (BSP) deals with finding the position of a target value within a sorted array using a binary search algorithm, which efficiently narrows down the search range.',
+     'Initial_question': 'Find the position of the target value {target_value} in the sorted array. The index begins with 0. The array elements are provided.',
+     'Output_content': 'Please identify the position of the target value in the array. Offer a brief, step-by-step account of your search process. Aim for conciseness in your response.',
+     'Output_format': "Your output should be enclosed in <root></root> tags. Include your search process in <reasoning></reasoning> tags and the final position of the target value in <final_answer></final_answer> tags, like <final_answer>{'Position': 'TARGET_POSITION'}</final_answer>.",
+     'Few_shot_self': FEW_SHOT_SELF,
+     'Few_shot_others': FEW_SHOT_OTHERS
+ }
+
+ edpPrompts = {
+     'Intro': 'The Edit Distance Problem (EDP) involves finding the minimum number of operations required to transform one string into another, where each operation is either an insertion, deletion, or substitution of a single character.',
+     'Initial_question': 'Find the minimum number of operations required to transform the first string {string_a} into the second string {string_b}. The operations are insertion, deletion, and substitution of a single character, each requiring 1 edit operation.',
+     'Output_content': 'Please provide the minimum number of operations required to transform the first string into the second string. Offer a brief explanation of your methodology. Keep your response concise and focused.',
+     'Output_format': "Enclose your output within <root></root> tags. Present your reasoning in <reasoning></reasoning> tags and the final minimum number of operations in <final_answer></final_answer> tags, like <final_answer>{'Operations': 'MINIMUM_NUMBER_OF_OPERATIONS'}</final_answer>.",
+     'Few_shot_self': FEW_SHOT_SELF,
+     'Few_shot_others': FEW_SHOT_OTHERS
+ }
+
+ # NP-complete problems
+ tsp_dPrompts = {
+     'Intro': 'The Traveling Salesman Problem (Decision Version, TSP-D) focuses on determining if a salesman can complete a route, visiting each city at least once, with the total travel distance being less than a specified value.',
+     'Initial_question': "Check if it's possible for a salesman to visit each of the {total_cities} cities at least once and return to the starting city with the total distance less than {distance_limit}. The distances between each pair of cities are given.",
+     'Output_content': 'Provide a yes or no answer, with a succinct explanation of your decision process. Focus on clarity and brevity in your response.',
+     'Output_format': "Enclose your output in <root></root> tags. Present your reasoning in <reasoning></reasoning> tags and the final yes/no answer in <final_answer></final_answer> tags, like <final_answer>{'Feasible': 'YES_OR_NO'}</final_answer>.",
+     'Few_shot_self': FEW_SHOT_SELF,
+     'Few_shot_others': FEW_SHOT_OTHERS
+ }
+
+ gcp_dPrompts = {
+     'Intro': 'The Graph Coloring Problem (Decision Version, GCP-D) involves determining if it is possible to color the vertices of a graph using a given number of colors, ensuring no two adjacent vertices have the same color.',
+     'Initial_question': 'Find out if the vertices of a graph with {total_vertices} vertices can be colored using only {number_of_colors} colors, such that no adjacent vertices share the same color.',
+     'Output_content': 'Provide a yes or no answer, along with a concise explanation of your reasoning. Keep your explanation focused and brief.',
+     'Output_format': "Enclose your output in <root></root> tags. Include your reasoning in <reasoning></reasoning> tags and the final yes/no answer in <final_answer></final_answer> tags, like <final_answer>{'Feasible': 'YES_OR_NO'}</final_answer>.",
+     'Few_shot_self': FEW_SHOT_SELF,
+     'Few_shot_others': FEW_SHOT_OTHERS
+ }
+
+ kspPrompts = {
+     'Intro': 'The 0-1 Knapsack Problem (KSP) asks whether a subset of items, each with a given weight and value, can be chosen to fit into a knapsack of fixed capacity, maximizing the total value without exceeding the capacity.',
+     'Initial_question': 'Determine if a subset of items can be selected to fit into a knapsack with a capacity of {knapsack_capacity}, maximizing value without exceeding the capacity. Item weights and values are provided.',
+     'Output_content': 'Indicate if an optimal subset exists and its total value. Offer a concise explanation of your selection process. Aim for clarity and brevity in your response.',
+     'Output_format': "Your output should be enclosed within <root></root> tags. Include your selection process in <reasoning></reasoning> tags and the final decision and total value in <final_answer></final_answer> tags, like <final_answer>{'Feasible': 'YES_OR_NO', 'TotalValue': 'TOTAL_VALUE', 'SelectedItemIds': [0, 1]}</final_answer>.",
+     'Few_shot_self': FEW_SHOT_SELF,
+     'Few_shot_others': FEW_SHOT_OTHERS
+ }
+
+ # NP-hard problems
+ tspPrompts = {
+     'Intro': 'The traveling salesman problem (TSP) is a classic optimization problem that aims to find the shortest possible route that visits a set of cities, with each city being visited exactly once and the route returning to the original city.',
+     'Initial_question': 'You must find the shortest path that visits all {total_cities} cities, labelled from 1 to {total_cities}. The distances between each pair of cities are provided.',
+     'Output_content': 'Please list each city in the order they are visited. Provide the total distance of the trip. You should also provide very short step by step reasoning. Do not use multiple lines and try your best to save output tokens.',
+     'Output_format': "Your output should contain two parts enclosed by <root></root>. First, your step by step reasoning like <reasoning>The reasoning process</reasoning>. Second, the final output of the result path and total distance wrapped by the final_answer tag, like <final_answer>{'Path': '0->1->2->...->N->0', 'TotalDistance': 'INT_TOTAL_DISTANCE'}</final_answer>.",
+     'Few_shot_self': FEW_SHOT_SELF,
+     'Few_shot_others': FEW_SHOT_OTHERS
+ }
+
+ gcpPrompts = {
+     'Intro': 'Graph coloring refers to the problem of coloring vertices of a graph in such a way that no two adjacent vertices have the same color.',
+     'Initial_question': 'There are {max_vertices} vertices 1 to {max_vertices} in a graph. You may use {max_colors} colors with alphabets from A, B, C,... to color the graph.',
+     'Output_content': "Please label every vertex, even if it is disconnected from the rest of the graph. Please provide each vertex's color. Do not skip any vertices. You should also provide very short step by step reasoning. Do not use multiple lines and try your best to save output tokens.",
+     'Output_format': "Your output should contain two parts enclosed by <root></root>. First, your step by step reasoning wrapped by <reasoning></reasoning>. Second, the final output of all vertex numbers and their associated colors, wrapped by the final_answer tag, like <final_answer>{0:'COLOR_1', 1:'COLOR_2', ...}</final_answer>.",
+     'Few_shot_self': FEW_SHOT_SELF,
+     'Few_shot_others': FEW_SHOT_OTHERS
+ }
+
+ mspPrompts = {
+     'Intro': 'The meeting scheduling problem (MSP) is a type of constraint satisfaction problem where the goal is to find a suitable time slot for a meeting that all participants can attend without conflicts in their schedules.',
+     'Initial_question': "There are {total_participants} participants with their available time slots. There are {total_timeslots} consecutive non-overlapping time slots. Let's assume all meetings have a duration of 1.",
+     'Output_content': 'Please provide a time slot where all participants can attend the meeting. You should also provide very short step by step reasoning. Do not use multiple lines and try your best to save output tokens.',
+     'Output_format': 'Your output should contain two parts enclosed by <root></root>. First, your step by step reasoning wrapped by <reasoning></reasoning>. Second, the final output of meeting numbers followed by a list of slots, like <final_answer>{0:[1,2], 1:[4], ...}</final_answer>.',
+     'Few_shot_self': FEW_SHOT_SELF,
+     'Few_shot_others': FEW_SHOT_OTHERS
+ }
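Note: a minimal sketch of how these dicts become a full prompt, mirroring the q2text helper in p_SPP.py (the import path assumes the package layout above):

from opencompass.datasets.NPHardEval.prompts import sppPrompts

# Mirrors q2text: note that the braces inside Output_content are left
# unformatted there, so they appear literally in the prompt.
prompt = (sppPrompts['Intro'] + '\n' +
          sppPrompts['Initial_question'].format(start_node=0, end_node=2) + '\n' +
          sppPrompts['Output_content'] + '\n' +
          sppPrompts['Output_format'] +
          "\n The graph's edges and weights are as follows: \n" +
          'Edge from 0 to 1 has a weight of 4.\n' +
          'Edge from 1 to 2 has a weight of 1.\n')
print(prompt)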
opencompass-my-api/build/lib/opencompass/datasets/NPHardEval/utils.py ADDED
@@ -0,0 +1,43 @@
+ import ast
+ import xml.etree.ElementTree as ET
+
+
+ def append_root_tags(string):
+     if not string.strip().startswith('<root>'):
+         string = '<root>\n' + string
+     if not string.strip().endswith('</root>'):
+         string += '\n</root>'
+     return string
+
+
+ def parse_xml_to_dict(xml_string):
+     final_answer_element = ''
+     reasoning_element = ''
+
+     try:
+         # Parse the XML string
+         root = ET.fromstring(xml_string)
+
+         # Find the 'final_answer' tag
+         final_answer_element = root.find('final_answer').text
+
+         # Find the 'reasoning' tag
+         reasoning_element = root.find('reasoning').text
+     except Exception:
+         try:
+             assert '<final_answer>' in xml_string
+             assert '</final_answer>' in xml_string
+             assert '<reasoning>' in xml_string
+             assert '</reasoning>' in xml_string
+             final_answer_start = xml_string.index('<final_answer>') + len('<final_answer>')
+             final_answer_end = xml_string.index('</final_answer>')
+             reasoning_start = xml_string.index('<reasoning>') + len('<reasoning>')
+             reasoning_end = xml_string.index('</reasoning>')
+             final_answer_element = xml_string[final_answer_start:final_answer_end]
+             reasoning_element = xml_string[reasoning_start:reasoning_end]
+         except Exception:
+             final_answer_element = ''
+             reasoning_element = ''
+
+     # Guard the literal_eval: an empty or malformed answer would otherwise
+     # raise here instead of degrading to an empty result.
+     try:
+         final_answer_element = ast.literal_eval(final_answer_element.strip())
+     except Exception:
+         final_answer_element = ''
+     return final_answer_element, reasoning_element
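Note: an example of the helpers above on a well-formed model completion (the import path assumes the package layout above):

from opencompass.datasets.NPHardEval.utils import (append_root_tags,
                                                   parse_xml_to_dict)

reply = ('<reasoning>Walk 0->1->2.</reasoning>\n'
         "<final_answer>{'Path': '0->1->2', 'TotalDistance': '5'}</final_answer>")
answer, reasoning = parse_xml_to_dict(append_root_tags(reply))
print(answer['Path'], answer['TotalDistance'])  # 0->1->2 5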
opencompass-my-api/build/lib/opencompass/datasets/OpenFinData.py ADDED
@@ -0,0 +1,47 @@
+ import json
+ import os.path as osp
+
+ from datasets import Dataset
+
+ from opencompass.openicl.icl_evaluator import BaseEvaluator
+ from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
+
+ from .base import BaseDataset
+
+
+ @LOAD_DATASET.register_module()
+ class OpenFinDataDataset(BaseDataset):
+
+     @staticmethod
+     def load(path: str, name: str):
+         with open(osp.join(path, f'{name}.json'), 'r') as f:
+             data = json.load(f)
+             return Dataset.from_list(data)
+
+
+ @ICL_EVALUATORS.register_module()
+ class OpenFinDataKWEvaluator(BaseEvaluator):
+
+     def __init__(self):
+         super().__init__()
+
+     def score(self, predictions, references):
+         assert len(predictions) == len(references)
+
+         scores = []
+         results = dict()
+
+         for i in range(len(references)):
+             all_hit = True
+             judgement = references[i].split('、')
+             for item in judgement:
+                 if item not in predictions[i]:
+                     all_hit = False
+                     break
+             scores.append(all_hit)
+
+         results['accuracy'] = round(sum(scores) / len(scores), 4) * 100
+         return results
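Note: the keyword evaluator counts a prediction correct only when every '、'-separated keyword from the reference appears in it; a reduced sketch of the scoring rule on toy strings (not real data):

refs = ['资产、负债', '利率']
preds = ['该公司的资产和负债均上升', '央行调整了存款准备金率']
# First prediction contains both keywords, the second misses '利率'.
scores = [all(kw in p for kw in r.split('、')) for p, r in zip(preds, refs)]
print(round(sum(scores) / len(scores), 4) * 100)  # 50.0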
opencompass-my-api/build/lib/opencompass/datasets/TheoremQA.py ADDED
@@ -0,0 +1,38 @@
+ import re
+
+ from datasets import load_dataset
+
+ from opencompass.registry import LOAD_DATASET, TEXT_POSTPROCESSORS
+
+ from .base import BaseDataset
+
+
+ @LOAD_DATASET.register_module()
+ class TheoremQADataset(BaseDataset):
+
+     @staticmethod
+     def load(path: str):
+         return load_dataset('csv', data_files={'test': path})
+
+
+ @TEXT_POSTPROCESSORS.register_module('TheoremQA')
+ def TheoremQA_postprocess(text: str) -> str:
+     text = text.strip()
+     matches = re.findall(r'answer is ([^\s]+)', text)
+     if len(matches) == 0:
+         return text
+     else:
+         text = matches[0].strip().strip('.,?!\"\';:')
+         return text
+
+
+ def TheoremQA_postprocess_v2(text: str) -> str:
+     prediction = text.strip().strip('\n').split('\n')[-1]
+     tmp = ''
+     for entry in prediction.split(' ')[::-1]:
+         if entry in ('is', 'be', 'are') or entry.endswith(':'):
+             break
+         tmp = entry + ' ' + tmp
+     prediction = tmp.strip().strip('.')
+     return prediction
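Note: a quick sanity check of the two postprocessors, assuming they are importable from the module above:

from opencompass.datasets.TheoremQA import (TheoremQA_postprocess,
                                            TheoremQA_postprocess_v2)

# v1 keeps only the token after the last 'answer is', with punctuation stripped.
print(TheoremQA_postprocess('Thus the answer is 42.'))  # -> 42
# v2 walks the last line backwards until it hits is/be/are or a ':'-suffixed token.
print(TheoremQA_postprocess_v2('Step 1: ...\nThe result is 3.14 meters'))
# -> 3.14 meters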
opencompass-my-api/build/lib/opencompass/datasets/advglue.py ADDED
@@ -0,0 +1,174 @@
+ import json
+ from typing import List, Union
+
+ from datasets import Dataset, concatenate_datasets
+
+ from opencompass.openicl.icl_evaluator import AccEvaluator
+
+ from .base import BaseDataset
+
+
+ class AdvDataset(BaseDataset):
+     """Base adv GLUE dataset. Adv GLUE is built on the GLUE dataset. Its main
+     purpose is to evaluate the accuracy drop between the original set and the
+     adversarial set.
+
+     Args:
+         subset (str): The subset task of the adv GLUE dataset.
+         filter_keys (str or List[str]): The keys to be filtered on to create
+             the original set for comparison.
+     """
+
+     def __init__(
+         self,
+         subset: str,
+         filter_keys: Union[str, List[str]],
+         **kwargs,
+     ):
+         self.subset = subset
+         if isinstance(filter_keys, str):
+             filter_keys = [filter_keys]
+         self.filter_keys = filter_keys
+         super().__init__(**kwargs)
+
+     def aug_with_original_data(self, dataset):
+         """Create the original dataset and concatenate it to the end."""
+         # Remove data without an original reference
+         dataset = dataset.filter(
+             lambda x: any([x[k] for k in self.filter_keys]))
+
+         def ori_preprocess(example):
+             for k in self.filter_keys:
+                 if example[k]:
+                     new_k = k.split('original_')[-1]
+                     example[new_k] = example[k]
+             example['type'] = 'original'
+             return example
+
+         original_dataset = dataset.map(ori_preprocess)
+
+         return concatenate_datasets([dataset, original_dataset])
+
+     def load(self, path):
+         """Load the dataset and augment it with the original data."""
+
+         with open(path, 'r') as f:
+             raw_data = json.load(f)
+             subset = raw_data[self.subset]
+
+         # In case missing keys in the first example cause Dataset
+         # to ignore them in the following examples when building.
+         for k in self.filter_keys:
+             if k not in subset[0]:
+                 subset[0][k] = None
+
+         dataset = Dataset.from_list(subset)
+
+         dataset = self.aug_with_original_data(dataset)
+
+         def choices_process(example):
+             example['label_option'] = chr(ord('A') + example['label'])
+             return example
+
+         dataset = dataset.map(choices_process)
+         return dataset
+
+
+ # label 0 for A. negative
+ # label 1 for B. positive
+ class AdvSst2Dataset(AdvDataset):
+     """Adv GLUE sst2 dataset."""
+
+     def __init__(self, **kwargs):
+         super().__init__(subset='sst2',
+                          filter_keys='original_sentence',
+                          **kwargs)
+
+
+ # label 0 for not_duplicate, A. no
+ # label 1 for duplicate, B. yes
+ class AdvQqpDataset(AdvDataset):
+     """Adv GLUE qqp dataset."""
+
+     def __init__(self, **kwargs):
+         super().__init__(
+             subset='qqp',
+             filter_keys=['original_question1', 'original_question2'],
+             **kwargs)
+
+
+ # label 0 for entailment, A. yes
+ # label 1 for neutral, B. maybe
+ # label 2 for contradiction, C. no
+ class AdvMnliDataset(AdvDataset):
+     """Adv GLUE mnli dataset."""
+
+     def __init__(self, **kwargs):
+         super().__init__(
+             subset='mnli',
+             filter_keys=['original_premise', 'original_hypothesis'],
+             **kwargs)
+
+
+ # label 0 for entailment, A. yes
+ # label 1 for neutral, B. maybe
+ # label 2 for contradiction, C. no
+ class AdvMnliMMDataset(AdvDataset):
+     """Adv GLUE mnli mm dataset."""
+
+     def __init__(self, **kwargs):
+         super().__init__(
+             subset='mnli-mm',
+             filter_keys=['original_premise', 'original_hypothesis'],
+             **kwargs)
+
+
+ # label 0 for entailment, A. yes
+ # label 1 for not entailment, B. no
+ class AdvQnliDataset(AdvDataset):
+     """Adv GLUE qnli dataset."""
+
+     def __init__(self, **kwargs):
+         super().__init__(
+             subset='qnli',
+             filter_keys=['original_question', 'original_sentence'],
+             **kwargs)
+
+
+ # label 0 for entailment, A. yes
+ # label 1 for not entailment, B. no
+ class AdvRteDataset(AdvDataset):
+     """Adv GLUE rte dataset."""
+
+     def __init__(self, **kwargs):
+         super().__init__(
+             subset='rte',
+             filter_keys=['original_sentence1', 'original_sentence2'],
+             **kwargs)
+
+
+ class AccDropEvaluator(AccEvaluator):
+     """Evaluate the accuracy drop."""
+
+     def __init__(self) -> None:
+         super().__init__()
+
+     def score(self, predictions: List, references: List) -> dict:
+         """Calculate scores and accuracy.
+
+         Args:
+             predictions (List): List of predictions for each sample.
+             references (List): List of target labels for each sample.
+
+         Returns:
+             dict: calculated scores.
+         """
+
+         n = len(predictions)
+         assert n % 2 == 0, 'Number of examples should be even.'
+         acc_after = super().score(predictions[:n // 2], references[:n // 2])
+         acc_before = super().score(predictions[n // 2:], references[n // 2:])
+         acc_drop = 1 - acc_after['accuracy'] / acc_before['accuracy']
+         return dict(acc_drop=acc_drop,
+                     acc_after=acc_after['accuracy'],
+                     acc_before=acc_before['accuracy'])
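Note: after aug_with_original_data, the first half of the concatenated set is adversarial and the second half original, so the reported drop is a relative one; the arithmetic, with made-up accuracies:

# acc_after = accuracy on the adversarial half, acc_before = on the originals.
acc_after, acc_before = 62.5, 87.5   # hypothetical values in percent
acc_drop = 1 - acc_after / acc_before
print(f'{acc_drop:.4f}')  # 0.2857 -> roughly a 28.6% relative drop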
opencompass-my-api/build/lib/opencompass/datasets/afqmcd.py ADDED
@@ -0,0 +1,21 @@
+ import json
+
+ from datasets import Dataset
+
+ from opencompass.registry import LOAD_DATASET
+
+ from .base import BaseDataset
+
+
+ @LOAD_DATASET.register_module()
+ class AFQMCDataset_V2(BaseDataset):
+
+     @staticmethod
+     def load(path):
+         data = []
+         with open(path, 'r', encoding='utf-8') as f:
+             for line in f:
+                 line = json.loads(line)
+                 line['label'] = 'AB'[int(line['label'])]
+                 data.append(line)
+         return Dataset.from_list(data)
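Note: the only transformation the loader applies is mapping the JSONL '0'/'1' labels onto option letters:

for raw in ('0', '1'):
    print(raw, '->', 'AB'[int(raw)])  # 0 -> A, 1 -> B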
opencompass-my-api/build/lib/opencompass/datasets/agieval/__init__.py ADDED
@@ -0,0 +1,3 @@
+ # flake8: noqa
+
+ from .agieval import *  # noqa: F401, F403
opencompass-my-api/build/lib/opencompass/datasets/agieval/agieval.py ADDED
@@ -0,0 +1,99 @@
+ import json
+ import os.path as osp
+
+ from datasets import Dataset
+
+ from opencompass.openicl.icl_evaluator import BaseEvaluator
+ from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
+
+ from ..base import BaseDataset
+ from .math_equivalence import is_equiv
+ from .post_process import parse_math_answer
+
+
+ @LOAD_DATASET.register_module()
+ class AGIEvalDataset(BaseDataset):
+
+     @staticmethod
+     def load(path: str, name: str, setting_name: str):
+         from .dataset_loader import load_dataset, load_dataset_as_result_schema
+
+         # `in` would accept any substring of 'zero-shot'; equality is meant.
+         assert setting_name == 'zero-shot', 'only support zero-shot setting'
+         dataset_wo_label = load_dataset(name, setting_name, path)
+         dataset_with_label = load_dataset_as_result_schema(name, path)
+         dataset = []
+         for d1, d2 in zip(dataset_wo_label, dataset_with_label):
+             dataset.append({
+                 'id': d2.index,
+                 'problem_input': d1['context'],
+                 'label': d2.label,
+             })
+         dataset = Dataset.from_list(dataset)
+         return dataset
+
+
+ @LOAD_DATASET.register_module()
+ class AGIEvalDataset_v2(BaseDataset):
+
+     @staticmethod
+     def load(path: str, name: str, setting_name: str):
+         assert setting_name == 'zero-shot', 'only support zero-shot setting'
+         filename = osp.join(path, name + '.jsonl')
+         with open(filename, encoding='utf-8') as f:
+             data = [json.loads(line.strip()) for line in f]
+         dataset = []
+         for item in data:
+             passage = item['passage'] if item['passage'] else ''
+             question = passage + item['question']
+             options = '\n'.join(item['options']) if item['options'] else ''
+             if item['label']:
+                 if isinstance(item['label'], list):
+                     label = ''.join(item['label'])
+                 else:
+                     label = item['label']
+             else:
+                 label = item['answer']
+             d = {'question': question, 'options': options, 'label': label}
+             dataset.append(d)
+         dataset = Dataset.from_list(dataset)
+         return dataset
+
+
+ @ICL_EVALUATORS.register_module()
+ class AGIEvalEvaluator(BaseEvaluator):
+
+     def score(self, predictions, references):
+         predictions = [parse_math_answer('', pred) for pred in predictions]
+         details = []
+         cnt = 0
+         for pred, ref in zip(predictions, references):
+             detail = {'pred': pred, 'answer': ref, 'correct': False}
+             if is_equiv(pred, ref):
+                 cnt += 1
+                 detail['correct'] = True
+             details.append(detail)
+         score = cnt / len(predictions) * 100
+         return {'score': score, 'details': details}
+
+
+ @ICL_EVALUATORS.register_module()
+ class AGIEvalEvaluator_mcq(BaseEvaluator):
+
+     def score(self, predictions, references):
+         if len(predictions) != len(references):
+             return {
+                 'error': 'predictions and references have different length'
+             }
+         details = []
+         cnt = 0
+         for pred, ref in zip(predictions, references):
+             detail = {'pred': pred, 'answer': ref, 'correct': False}
+             if pred == ref:
+                 cnt += 1
+                 detail['correct'] = True
+             details.append(detail)
+
+         score = cnt / len(predictions) * 100
+
+         return {'score': score, 'details': details}
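Note: the mcq evaluator reduces to exact-match accuracy; its core, on toy values:

preds = ['A', 'BC', 'D']
refs = ['A', 'BC', 'C']
print(sum(p == r for p, r in zip(preds, refs)) / len(preds) * 100)  # ~66.67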
opencompass-my-api/build/lib/opencompass/datasets/agieval/constructions.py ADDED
@@ -0,0 +1,104 @@
+ # flake8: noqa
+ import pandas as pd
+
+
+ class TaskSchema(object):
+
+     def __init__(self,
+                  passage=None,
+                  question=None,
+                  options=None,
+                  label=None,
+                  answer=None,
+                  other=None):
+         self.passage = passage
+         self.question = question
+         self.options = options
+         self.label = label
+         self.answer = answer
+         self.other = other
+
+     def to_dict(self):
+         return {
+             'passage': self.passage,
+             'question': self.question,
+             'options': self.options,
+             'label': self.label,
+             'answer': self.answer,
+             'other': self.other
+         }
+
+
+ # define README.json
+ class AgiInstance(object):
+
+     def __init__(self, task_description, data_source, task_schema, output,
+                  evaluation_metric, task_example):
+         self.task_description = task_description
+         self.data_source = data_source
+         self.task_schema = task_schema
+         self.output = output
+         self.evaluation_metric = evaluation_metric
+         self.task_example = task_example
+
+     def to_dict(self):
+         return {
+             'task description': self.task_description,
+             'data source': self.data_source,
+             'task schema': self.task_schema.to_dict(),
+             'output': self.output,
+             'evaluation metric': self.evaluation_metric,
+             'task example': self.task_example
+         }
+
+
+ class ChatGPTSchema(object):
+
+     def __init__(self, context=None, metadata=''):
+         self.context = context
+         self.metadata = metadata
+
+     def to_dict(self):
+         return {'context': self.context, 'metadata': self.metadata}
+
+
+ class ResultsForHumanSchema(object):
+
+     def __init__(self,
+                  index,
+                  problem_input,
+                  label,
+                  model_input='',
+                  model_output='',
+                  parse_result='',
+                  first_stage_output='',
+                  second_stage_input='',
+                  is_correct=False):
+         self.index = index
+         self.problem_input = problem_input
+         self.model_input = model_input
+         self.model_output = model_output
+         self.parse_result = parse_result
+         self.label = label
+         self.first_stage_output = first_stage_output
+         self.second_stage_input = second_stage_input
+         self.is_correct = is_correct
+
+     def to_dict(self):
+         return {
+             'index': self.index,
+             'problem_input': self.problem_input,
+             'model_input': self.model_input,
+             'model_output': self.model_output,
+             'parse_result': self.parse_result,
+             'label': self.label,
+             'is_correct': self.is_correct,
+             'first_stage_output': self.first_stage_output,
+             'second_stage_input': self.second_stage_input,
+         }
+
+     @staticmethod
+     def to_tsv(result_list, path):
+         # Note: despite the name, this writes an Excel workbook via
+         # DataFrame.to_excel, so `path` should point at an .xlsx file.
+         result_json = [item.to_dict() for item in result_list]
+         table = pd.json_normalize(result_json)
+         table.to_excel(path, index=False)
opencompass-my-api/build/lib/opencompass/datasets/agieval/dataset_loader.py ADDED
@@ -0,0 +1,392 @@
+ # flake8: noqa
+ import ast
+ import json
+ import os
+
+ import pandas as pd
+ import tiktoken
+ from tqdm import tqdm
+
+ from .constructions import ChatGPTSchema, ResultsForHumanSchema
+ from .utils import extract_answer, read_jsonl, save_jsonl
+
+ # define the datasets
+ english_qa_datasets = [
+     'lsat-ar', 'lsat-lr', 'lsat-rc', 'logiqa-en', 'sat-math', 'sat-en',
+     'aqua-rat', 'sat-en-without-passage', 'gaokao-english'
+ ]
+ chinese_qa_datasets = [
+     'logiqa-zh', 'jec-qa-kd', 'jec-qa-ca', 'gaokao-chinese',
+     'gaokao-geography', 'gaokao-history', 'gaokao-biology', 'gaokao-chemistry',
+     'gaokao-physics', 'gaokao-mathqa'
+ ]
+ english_cloze_datasets = ['math']
+ chinese_cloze_datasets = ['gaokao-mathcloze']
+
+ multi_choice_datasets = ['jec-qa-kd', 'jec-qa-ca', 'gaokao-physics']
+ math_output_datasets = ['gaokao-mathcloze', 'math']
+
+
+ def convert_zero_shot(line, dataset_name):
+     try:
+         passage = line['passage'] if line['passage'] is not None else ''
+         if dataset_name in english_qa_datasets:
+             option_string = 'ABCDEFG'
+             count = len(line['options'])
+             if count == 1:
+                 count = 5
+             return passage + 'Q: ' + line['question'] + ' ' \
+                 + 'Answer Choices: ' + ' '.join(line['options']) + '\n' + \
+                 'A: Among A through {}, the answer is'.format(option_string[count - 1])
+
+         elif dataset_name in chinese_qa_datasets:
+             option_string = 'ABCDEFG'
+             count = len(line['options'])
+             if count == 1:
+                 count = 4
+             return passage + '问题:' + line['question'] + ' ' \
+                 + '选项:' + ' '.join(line['options']) + '\n' + \
+                 '答案:从A到{}, 我们应选择'.format(option_string[count - 1])
+
+         elif dataset_name in english_cloze_datasets:
+             return passage + 'Q: ' + line['question'] + '\n' \
+                 'A: The answer is'
+
+         elif dataset_name in chinese_cloze_datasets:
+             return passage + '问题:' + line['question'] + '\n' \
+                 '答案:'
+     except NameError:
+         print('Dataset not defined.')
+
+
+ prefix = '该问题为单选题,所有选项中必有一个正确答案,且只有一个正确答案。\n'
+
+
+ def convert_zero_shot_CoT_stage1(line, dataset_name):
+     try:
+         passage = line['passage'] if line['passage'] is not None else ''
+         if dataset_name in english_qa_datasets:
+             return passage + 'Q: ' + line['question'] + ' ' \
+                 + 'Answer Choices: ' + ' '.join(line['options']) + '\n' + \
+                 "Let's think step by step."
+
+         elif dataset_name in chinese_qa_datasets:
+             option_string = 'ABCDEFG'
+             count = len(line['options'])
+             if count == 1:
+                 count = 4
+             return passage + '问题:' + line['question'] + ' ' \
+                 + '选项:' + ' '.join(line['options']) + '\n' + \
+                 '从A到{}, 我们应选择什么?让我们逐步思考:'.format(option_string[count - 1])
+
+         elif dataset_name in english_cloze_datasets:
+             return passage + 'Q: ' + line['question'] + '\n' \
+                 "A: Let's think step by step."
+
+         elif dataset_name in chinese_cloze_datasets:
+             return passage + '问题:' + line['question'] + '\n' \
+                 '答案:让我们逐步思考:'
+     except NameError:
+         print('Dataset not defined.')
+
+
+ # process few-shot raw_prompts
+ def combine_prompt(prompt_path,
+                    dataset_name,
+                    load_explanation=True,
+                    chat_mode=False):
+     skip_passage = False
+     if dataset_name == 'sat-en-without-passage':
+         skip_passage = True
+         dataset_name = 'sat-en'
+     demostrations = []
+     # read the prompts by context and explanation
+     context_row = [0, 1, 3, 5, 7, 9]
+     explanation_row = [0, 2, 4, 6, 8, 10]
+     raw_prompts_context = pd.read_csv(prompt_path,
+                                       header=0,
+                                       skiprows=lambda x: x not in context_row,
+                                       keep_default_na=False)
+     raw_prompts_explanation = pd.read_csv(
+         prompt_path,
+         header=0,
+         skiprows=lambda x: x not in explanation_row,
+         keep_default_na=False).replace(r'\n\n', '\n', regex=True)
+     contexts = []
+     for line in list(raw_prompts_context[dataset_name]):
+         if line:
+             # print(line)
+             contexts.append(ast.literal_eval(line))
+     explanations = [
+         exp for exp in raw_prompts_explanation[dataset_name] if exp
+     ]
+
+     for idx, (con, exp) in enumerate(zip(contexts, explanations)):
+         passage = con['passage'] if con[
+             'passage'] is not None and not skip_passage else ''
+         question = con['question']
+         options = con['options'] if con['options'] is not None else ''
+         label = con['label'] if con['label'] is not None else ''
+         answer = con[
+             'answer'] if 'answer' in con and con['answer'] is not None else ''
+
+         if dataset_name in english_qa_datasets:
+             question_input = 'Problem {}. '.format(idx + 1) + passage + ' ' + question + '\n' \
+                 + 'Choose from the following options: ' + ' '.join(options) + '\n'
+             question_output = (('Explanation for Problem {}: '.format(idx + 1) + exp + '\n') if load_explanation else '') \
+                 + 'The answer is therefore {}'.format(label)
+
+         elif dataset_name in chinese_qa_datasets:
+             question_input = '问题 {}. '.format(idx + 1) + passage + ' ' + question + '\n' \
+                 + '从以下选项中选择: ' + ' '.join(options) + '\n'
+             question_output = (('问题 {}的解析: '.format(idx + 1) + exp + '\n') if load_explanation else '') \
+                 + '答案是 {}'.format(label)
+
+         elif dataset_name in english_cloze_datasets:
+             question_input = 'Problem {}. '.format(idx + 1) + question + '\n'
+             question_output = (('Explanation for Problem {}: '.format(idx + 1) + exp + '\n') if load_explanation else '') \
+                 + 'The answer is therefore {}'.format(answer)
+
+         elif dataset_name in chinese_cloze_datasets:
+             question_input = '问题 {}. '.format(idx + 1) + question + '\n'
+             question_output = (('问题 {}的解析: '.format(idx + 1) + exp + '\n') if load_explanation else '') \
+                 + '答案是 {}'.format(answer)
+         else:
+             raise ValueError(
+                 f'During loading few-shot examples, found unknown dataset: {dataset_name}'
+             )
+         if chat_mode:
+             demostrations.append((question_input, question_output))
+         else:
+             demostrations.append(question_input + question_output + '\n')
+
+     return demostrations
+
+
+ enc = None
+
+
+ def _lazy_load_enc():
+     global enc
+     if enc is None:
+         enc = tiktoken.encoding_for_model('gpt-4')
+
+
+ # cut the prompt if it reaches the max token length
+ def concat_prompt(demos,
+                   dataset_name,
+                   max_tokens,
+                   end_of_example='\n',
+                   verbose=False):
+     _lazy_load_enc()
+     demostration_en = 'Here are the answers for the problems in the exam.\n'
+     demostration_zh = '以下是考试中各个问题的答案。\n'
+     # Initialise the outputs so that an immediate break on the first demo
+     # does not leave `output`/`prompt_num` unbound.
+     if dataset_name in english_qa_datasets or dataset_name in english_cloze_datasets:
+         output = demostration_en
+     else:
+         output = demostration_zh
+     prompt_num = 0
+
+     for i in range(len(demos)):
+         # print(len(enc.encode(demostration_en)), len(enc.encode(demostration_zh)))
+         if dataset_name in english_qa_datasets:
+             demostration_en = demostration_en + demos[i] + end_of_example
+         elif dataset_name in chinese_qa_datasets:
+             demostration_zh = demostration_zh + demos[i] + end_of_example
+         elif dataset_name in english_cloze_datasets:
+             demostration_en = demostration_en + demos[i] + end_of_example
+         elif dataset_name in chinese_cloze_datasets:
+             demostration_zh = demostration_zh + demos[i] + end_of_example
+         # break if reach max token limit
+         if len(enc.encode(demostration_en)) < max_tokens and len(
+                 enc.encode(demostration_zh)) < max_tokens:
+             output = demostration_en if len(demostration_en) > len(
+                 demostration_zh) else demostration_zh
+             prompt_num = i + 1
+         else:
+             break
+     if verbose:
+         print('max_tokens set as ', max_tokens, 'actual_tokens is',
+               len(enc.encode(output)), 'num_shot is', prompt_num)
+     return output, prompt_num
+
+
+ def concat_prompt_chat_mode(demos,
+                             dataset_name,
+                             max_tokens,
+                             end_of_example='\n',
+                             verbose=False):
+     _lazy_load_enc()
+     answers = []
+     sentences = ''
+     for i in range(len(demos)):
+         answers += [
+             {
+                 'role': 'user',
+                 'content': demos[i][0]
+             },
+             {
+                 'role': 'assistant',
+                 'content': demos[i][1]
+             },
+         ]
+         sentences += json.dumps(answers[-1])
+         # break if reach max token limit
+         if len(enc.encode(sentences)) > max_tokens:
+             answers.pop()
+             answers.pop()
+             break
+     if verbose:
+         print('max_tokens set as ', max_tokens, 'actual_tokens is',
+               len(enc.encode(sentences)), 'num_shot is',
+               len(answers) // 2)
+     return answers, len(answers) // 2
+
+
+ def convert_few_shot(line, dataset_name, demo, n_shot, chat_mode=False):
+     passage = line['passage'] if line['passage'] is not None else ''
+     question = line['question']
+     options = line['options'] if line['options'] is not None else ''
+
+     if dataset_name in english_qa_datasets:
+         question_input = 'Problem {}. '.format(n_shot + 1) + passage + ' ' + question + '\n' \
+             + 'Choose from the following options: ' + ' '.join(options) + '\n'
+         # + "Explanation for Problem {}: ".format(n_shot + 1)
+
+     if dataset_name in chinese_qa_datasets:
+         question_input = '问题 {}. '.format(n_shot + 1) + passage + ' ' + question + '\n' \
+             + '从以下选项中选择: ' + ' '.join(options) + '\n'
+         # + "问题 {}的解析: ".format(n_shot + 1)
+
+     if dataset_name in english_cloze_datasets:
+         question_input = 'Problem {}. '.format(n_shot + 1) + question + '\n'
+         # + "Explanation for Problem {}: ".format(n_shot + 1)
+
+     if dataset_name in chinese_cloze_datasets:
+         question_input = '问题 {}. '.format(n_shot + 1) + question + '\n'
+         # + "问题 {}的解析: ".format(n_shot + 1)
+     if chat_mode:
+         return demo + [
+             {
+                 'role': 'user',
+                 'content': question_input
+             },
+         ]
+     else:
+         return demo + question_input
+
+
+ def load_dataset(dataset_name,
+                  setting_name,
+                  parent_path,
+                  prompt_path=None,
+                  max_tokens=None,
+                  end_of_example='\n',
+                  chat_mode=False,
+                  verbose=False):
+     test_path = os.path.join(parent_path, dataset_name + '.jsonl')
+     loaded_jsonl = read_jsonl(test_path)
+     processed = []
+     if setting_name == 'few-shot-CoT' or setting_name == 'few-shot':
+         # process the demo once if it is few-shot-CoT
+         processed_demos = combine_prompt(
+             prompt_path,
+             dataset_name,
+             load_explanation=setting_name == 'few-shot-CoT',
+             chat_mode=chat_mode)
+         if chat_mode:
+             chosen_prompt, n_shot = concat_prompt_chat_mode(processed_demos,
+                                                             dataset_name,
+                                                             max_tokens,
+                                                             end_of_example,
+                                                             verbose=verbose)
+         else:
+             chosen_prompt, n_shot = concat_prompt(processed_demos,
+                                                   dataset_name,
+                                                   max_tokens,
+                                                   end_of_example,
+                                                   verbose=verbose)
+     if verbose:
+         loaded_jsonl = tqdm(loaded_jsonl)
+     for meta_idx, line in enumerate(loaded_jsonl):
+         if setting_name == 'zero-shot':
+             ctxt = convert_zero_shot(line, dataset_name)
+         elif setting_name == 'zero-shot-CoT':
+             ctxt = convert_zero_shot_CoT_stage1(line, dataset_name)
+         elif setting_name == 'few-shot-CoT' or setting_name == 'few-shot':
+             ctxt = convert_few_shot(line, dataset_name, chosen_prompt, n_shot,
+                                     chat_mode)
+         try:
+             new_instance = ChatGPTSchema(context=ctxt, metadata=meta_idx)
+             processed.append(new_instance.to_dict())
+         except NameError:
+             print('Dataset not defined.')
+     return processed
+
+
+ def generate_second_stage_input(dataset_name,
+                                 input_list,
+                                 output_list,
+                                 with_format_prompt=False):
+     try:
+         english_format_prompt = 'Based on the previous results, your task is to extract the final answer and provide the output enclosed in brackets【】, such as 【0】 or 【A】.'
+         chinese_format_prompt = '根据以上内容,你的任务是把最终的答案提取出来并填在【】中,例如【0】或者【A】。'
+         if dataset_name in english_qa_datasets:
+             prompt_suffix = 'Therefore, among A through E, the answer is'
+             if with_format_prompt:
+                 prompt_suffix = english_format_prompt + prompt_suffix
+         elif dataset_name in chinese_qa_datasets:
+             prompt_suffix = '因此,从A到D, 我们应选择'
+             if with_format_prompt:
+                 prompt_suffix = chinese_format_prompt + prompt_suffix
+         elif dataset_name in english_cloze_datasets:
+             prompt_suffix = 'Therefore, the answer is'
+             if with_format_prompt:
+                 prompt_suffix = english_format_prompt + prompt_suffix
+         elif dataset_name in chinese_cloze_datasets:
+             prompt_suffix = '因此,答案是'
+             if with_format_prompt:
+                 prompt_suffix = chinese_format_prompt + prompt_suffix
+     except NameError:
+         print('Dataset not defined.')
+     processed = []
+     for i in range(len(input_list)):
+         ctxt = '{0}\n{1}\n{2}'.format(input_list[i]['context'],
+                                       extract_answer(output_list[i]),
+                                       prompt_suffix)
+         new_instance = ChatGPTSchema(context=ctxt,
+                                      metadata=input_list[i]['metadata'])
+         processed.append(new_instance.to_dict())
+     return processed
+
+
+ def load_dataset_as_result_schema(dataset_name, parent_path):
+     test_path = os.path.join(parent_path, dataset_name + '.jsonl')
+     loaded_jsonl = read_jsonl(test_path)
+
+     processed = []
+     for i, line in enumerate(loaded_jsonl):
+         problem_input = convert_zero_shot(line, dataset_name)
+         processed.append(
+             ResultsForHumanSchema(
+                 index=i,
+                 problem_input=problem_input,
+                 label=line['label'] if line['label'] else line['answer'],
+             ))
+     return processed
+
+
+ if __name__ == '__main__':
+
+     # set variables
+     parent_dir = '../../data/V1_1/'
+     raw_prompt_path = '../data/few_shot_prompts.csv'
+
+     # set dataset name to process
+     setting_name = 'few-shot-CoT'  # setting_name can be chosen from ["zero-shot", "zero-shot-CoT", "few-shot-CoT"]
+     data_name = 'jec-qa-kd'
+     save_dir = '../../experiment_input/{}/'.format(setting_name)
+     if not os.path.exists(save_dir):
+         os.makedirs(save_dir)
+     processed_data = load_dataset(data_name,
+                                   setting_name,
+                                   parent_dir,
+                                   prompt_path=raw_prompt_path,
+                                   max_tokens=2048)
+     save_jsonl(processed_data,
+                os.path.join(save_dir, '{}.jsonl'.format(data_name)))
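Note: the shape of a zero-shot English QA prompt produced by convert_zero_shot above, shown on a hypothetical sample line (the construction below repeats the same string assembly):

line = {'passage': None,
        'question': 'What is 2 + 3?',
        'options': ['(A)4', '(B)5', '(C)6']}
passage = line['passage'] or ''
# Three options, so the closing hint spans A through C ('ABCDEFG'[2]).
prompt = (passage + 'Q: ' + line['question'] + ' '
          + 'Answer Choices: ' + ' '.join(line['options']) + '\n'
          + 'A: Among A through C, the answer is')
print(prompt)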
opencompass-my-api/build/lib/opencompass/datasets/agieval/evaluation.py ADDED
@@ -0,0 +1,43 @@
+# flake8: noqa
+from . import dataset_loader, utils
+from .math_equivalence import is_equiv
+
+
+def convert_to_set(item):
+    if isinstance(item, list):
+        return set(item)
+    if isinstance(item, str):
+        return {item}
+    if item is None:
+        return set()  # an empty set, not an empty dict literal
+    raise ValueError("Input can't parse:", item)
+
+
+def evaluate_single_sample(dataset_name, prediction, label):
+    if dataset_name in dataset_loader.multi_choice_datasets:
+        p = convert_to_set(prediction)
+        l = convert_to_set(label)
+        return p == l
+    elif dataset_name in dataset_loader.math_output_datasets:
+        return is_equiv(prediction, label)
+    else:
+        return prediction == label
+
+
+# def evaluate(dataset_name, prediction_list, label_list):
+#     correct = 0
+#     if dataset_name in multi_choice_datasets:
+#         for prediction, label in zip(prediction_list, label_list):
+#             p = convert_to_set(prediction)
+#             l = convert_to_set(label)
+#             if p == l:
+#                 correct += 1
+#     elif dataset_name in math_output_datasets:
+#         for prediction, label in zip(prediction_list, label_list):
+#             if is_equiv(prediction, label):
+#                 correct += 1
+#     else:
+#         for prediction, label in zip(prediction_list, label_list):
+#             if prediction == label:
+#                 correct += 1
+#     return "{0:.2%}".format(correct / len(label_list))
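`evaluate_single_sample` compares multi-choice answers order-insensitively via `convert_to_set`. A small sketch, assuming 'jec-qa-kd' is listed in `dataset_loader.multi_choice_datasets`:

```python
from opencompass.datasets.agieval.evaluation import evaluate_single_sample

# Set equality ignores option order but not option membership.
print(evaluate_single_sample('jec-qa-kd', ['A', 'C'], ['C', 'A']))  # True
print(evaluate_single_sample('jec-qa-kd', ['A'], ['C', 'A']))       # False
```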
opencompass-my-api/build/lib/opencompass/datasets/agieval/math_equivalence.py ADDED
@@ -0,0 +1,161 @@
+# flake8: noqa
+
+
+# code from https://github.com/hendrycks/math/blob/main/modeling/math_equivalence.py
+def _fix_fracs(string):
+    substrs = string.split('\\frac')
+    new_str = substrs[0]
+    if len(substrs) > 1:
+        substrs = substrs[1:]
+        for substr in substrs:
+            new_str += '\\frac'
+            if substr[0] == '{':
+                new_str += substr
+            else:
+                try:
+                    assert len(substr) >= 2
+                except:
+                    return string
+                a = substr[0]
+                b = substr[1]
+                if b != '{':
+                    if len(substr) > 2:
+                        post_substr = substr[2:]
+                        new_str += '{' + a + '}{' + b + '}' + post_substr
+                    else:
+                        new_str += '{' + a + '}{' + b + '}'
+                else:
+                    if len(substr) > 2:
+                        post_substr = substr[2:]
+                        new_str += '{' + a + '}' + b + post_substr
+                    else:
+                        new_str += '{' + a + '}' + b
+    string = new_str
+    return string
+
+
+def _fix_a_slash_b(string):
+    if len(string.split('/')) != 2:
+        return string
+    a = string.split('/')[0]
+    b = string.split('/')[1]
+    try:
+        a = int(a)
+        b = int(b)
+        assert string == '{}/{}'.format(a, b)
+        new_string = '\\frac{' + str(a) + '}{' + str(b) + '}'
+        return new_string
+    except:
+        return string
+
+
+def _remove_right_units(string):
+    # "\\text{ " only ever occurs (at least in the val set) when describing units
+    if '\\text{ ' in string:
+        splits = string.split('\\text{ ')
+        assert len(splits) == 2
+        return splits[0]
+    else:
+        return string
+
+
+def _fix_sqrt(string):
+    if '\\sqrt' not in string:
+        return string
+    splits = string.split('\\sqrt')
+    new_string = splits[0]
+    for split in splits[1:]:
+        if split[0] != '{':
+            a = split[0]
+            new_substr = '\\sqrt{' + a + '}' + split[1:]
+        else:
+            new_substr = '\\sqrt' + split
+        new_string += new_substr
+    return new_string
+
+
+def _strip_string(string):
+    # linebreaks
+    string = string.replace('\n', '')
+    # print(string)
+
+    # remove inverse spaces
+    string = string.replace('\\!', '')
+    # print(string)
+
+    # replace \\ with \
+    string = string.replace('\\\\', '\\')
+    # print(string)
+
+    # replace tfrac and dfrac with frac
+    string = string.replace('tfrac', 'frac')
+    string = string.replace('dfrac', 'frac')
+    # print(string)
+
+    # remove \left and \right
+    string = string.replace('\\left', '')
+    string = string.replace('\\right', '')
+    # print(string)
+
+    # Remove circ (degrees)
+    string = string.replace('^{\\circ}', '')
+    string = string.replace('^\\circ', '')
+
+    # remove dollar signs
+    string = string.replace('\\$', '')
+
+    # remove units (on the right)
+    string = _remove_right_units(string)
+
+    # remove percentage
+    string = string.replace('\\%', '')
+    string = string.replace('\%', '')
+
+    # " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string
+    string = string.replace(' .', ' 0.')
+    string = string.replace('{.', '{0.')
+    # if empty, return empty string
+    if len(string) == 0:
+        return string
+    if string[0] == '.':
+        string = '0' + string
+
+    # to consider: get rid of e.g. "k = " or "q = " at beginning
+    if len(string.split('=')) == 2:
+        if len(string.split('=')[0]) <= 2:
+            string = string.split('=')[1]
+
+    # fix sqrt3 --> sqrt{3}
+    string = _fix_sqrt(string)
+
+    # remove spaces
+    string = string.replace(' ', '')
+
+    # \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2}, etc. Even works with \frac1{72} (but not \frac{72}1). Also does a/b --> \\frac{a}{b}
+    string = _fix_fracs(string)
+
+    # manually change 0.5 --> \frac{1}{2}
+    if string == '0.5':
+        string = '\\frac{1}{2}'
+
+    # NOTE: X/Y changed to \frac{X}{Y} in dataset, but in simple cases fix in case the model output is X/Y
+    string = _fix_a_slash_b(string)
+
+    return string
+
+
+def is_equiv(str1, str2, verbose=False):
+    if str1 is None and str2 is None:
+        print('WARNING: Both None')
+        return True
+    if str1 is None or str2 is None:
+        return False
+
+    try:
+        ss1 = _strip_string(str1)
+        ss2 = _strip_string(str2)
+        if verbose:
+            print(ss1, ss2)
+        return ss1 == ss2
+    except:
+        return str1 == str2
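`is_equiv` judges equality after `_strip_string` normalization, so several surface forms collapse to the same canonical string:

```python
from opencompass.datasets.agieval.math_equivalence import is_equiv

print(is_equiv('\\frac{1}{2}', '0.5'))        # True: 0.5 is rewritten to \frac{1}{2}
print(is_equiv('\\tfrac12', '\\frac{1}{2}'))  # True: tfrac -> frac, braces added
print(is_equiv('50\\%', '50'))                # True: percent signs are stripped
```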
opencompass-my-api/build/lib/opencompass/datasets/agieval/post_process.py ADDED
@@ -0,0 +1,198 @@
+# flake8: noqa
+import json
+import re
+
+from . import dataset_loader
+
+
+def extract_last_line(string):
+    lines = string.split('\n')
+    for item in lines[::-1]:
+        if item.strip() != '':
+            string = item
+            break
+    return string
+
+
+def remove_few_shot_prefix(string: str):
+    prefix_list = ['The answer is therefore', '答案是']
+    for prefix in prefix_list:
+        if string.startswith(prefix):
+            string = string[len(prefix):].strip()
+        elif prefix in string:
+            index = string.rfind(prefix)
+            if index >= 0:
+                string = string[index + len(prefix):].strip()
+    return string
+
+
+def try_parse_few_shot_qa_single_answer(string, setting_name, language='en'):
+    if setting_name == 'few-shot-CoT':
+        string = extract_last_line(string)
+    if language == 'en':
+        pattern = 'answer is .*?([A-G])'
+        match = re.search(pattern, string)
+    elif language == 'zh':
+        pattern = '答案是.*?([A-G])'
+        match = re.search(pattern, string)
+    else:
+        raise ValueError('Unknown language {0}'.format(language))
+    if match:
+        return match.group(1)
+    else:
+        return None
+
+
+def try_parse_few_shot_pattern(string: str, dataset_name, setting_name):
+    if setting_name == 'few-shot-CoT':
+        string = extract_last_line(string)
+    if dataset_name in dataset_loader.chinese_cloze_datasets:
+        return string.startswith('答案是')
+    elif dataset_name in dataset_loader.english_cloze_datasets:
+        return string.startswith('The answer is therefore')
+    elif dataset_name in dataset_loader.chinese_qa_datasets:
+        pattern = '答案是.*?([A-G])'
+        match = re.search(pattern, string)
+        return match is not None
+    elif dataset_name in dataset_loader.english_qa_datasets:
+        pattern = 'answer is .*?([A-G])'
+        match = re.search(pattern, string)
+        return match is not None
+    return False
+
+
+def parse_few_shot_qa_single_answer(string, setting_name, language='en'):
+    answer = try_parse_few_shot_qa_single_answer(string, setting_name,
+                                                 language)
+    if answer is None:
+        return find_first_capital_letter(string)
+    else:
+        return answer
+
+
+def find_first_capital_letter(answer):
+    letter_set = {'A', 'B', 'C', 'D', 'E', 'F'}
+    for c in answer:
+        if c in letter_set:
+            return c
+    # print("Can't find capital letter in:", answer)
+    return ''
+
+
+def extract_answer_in_bracket(answer, prefix='【', suffix='】'):
+    if prefix not in answer and suffix not in answer:
+        # print("doesn't found special tokens in:", answer)
+        return ''
+    s = answer.index(prefix) + len(prefix)
+    t = answer.index(suffix)
+    ret = answer[s:t]
+    return ret
+
+
+def parse_math_answer(setting_name, raw_string):
+    if setting_name == 'few-shot-CoT':
+        raw_string = extract_last_line(raw_string)
+    if setting_name == 'few-shot-CoT' or setting_name == 'few-shot':
+        raw_string = remove_few_shot_prefix(raw_string)
+        return raw_string
+
+    def remove_boxed(s):
+        left = '\\boxed{'
+        try:
+            assert s[:len(left)] == left
+            assert s[-1] == '}'
+            answer = s[len(left):-1]
+            if '=' in answer:
+                answer = answer.split('=')[-1].lstrip(' ')
+            return answer
+        except:
+            return None
+
+    def last_boxed_only_string(string):
+        idx = string.rfind('\\boxed')
+        if idx < 0:
+            idx = string.rfind('\\fbox')
+            if idx < 0:
+                return None
+        i = idx
+        right_brace_idx = None
+        num_left_braces_open = 0
+        while i < len(string):
+            if string[i] == '{':
+                num_left_braces_open += 1
+            if string[i] == '}':
+                num_left_braces_open -= 1
+                if num_left_braces_open == 0:
+                    right_brace_idx = i
+                    break
+            i += 1
+
+        if right_brace_idx is None:
+            retval = None
+        else:
+            retval = string[idx:right_brace_idx + 1]
+
+        return retval
+
+    def get_answer_with_dollar_sign(s):
+        first_pattern = r'\$(.*)\$'
+        last_match = None
+        matches = re.findall(first_pattern, s)
+        if matches:
+            last_match = matches[-1]
+            if '=' in last_match:
+                last_match = last_match.split('=')[-1].lstrip(' ')
+        return last_match
+
+    def get_answer_without_dollar_sign(s):
+        last_match = None
+        if '=' in s:
+            last_match = s.split('=')[-1].lstrip(' ').rstrip('.')
+            if '\\n' in last_match:
+                last_match = last_match.split('\\n')[0]
+        else:
+            pattern = r'(?:\$)?\d+(?:\.\d+)?(?![\w\d])'
+            matches = re.findall(pattern, s)
+            if matches:
+                last_match = matches[-1]
+        return last_match
+
+    raw_string = remove_few_shot_prefix(raw_string)
+    if '\\boxed' in raw_string:
+        answer = remove_boxed(last_boxed_only_string(raw_string))
+    else:
+        answer = get_answer_with_dollar_sign(raw_string)
+        if not answer:
+            answer = get_answer_without_dollar_sign(raw_string)
+    return answer
+
+
+def parse_qa_multiple_answer(string, setting_name):
+    if setting_name == 'few-shot-CoT':
+        string = extract_last_line(string)
+    pattern = r'\(*([A-Z])\)*'
+    match = re.findall(pattern, string)
+    if match:
+        return match
+    return []
+
+
+def post_process(dataset_name, setting_name, prediction):
+    if dataset_name in dataset_loader.english_cloze_datasets or dataset_name in dataset_loader.chinese_cloze_datasets:
+        return parse_math_answer(setting_name, prediction)
+
+    if dataset_name in ['jec-qa-kd', 'jec-qa-ca', 'gaokao-physics']:
+        return parse_qa_multiple_answer(prediction, setting_name)
+
+    # all other datasets are QA problems with a single answer
+    if 'zero-shot' in setting_name:
+        answer = find_first_capital_letter(prediction)
+        return answer
+
+    # all other datasets are QA problems with a single answer under few-shot settings
+    language = 'en' if dataset_name in dataset_loader.english_qa_datasets else 'zh'
+    if dataset_name in dataset_loader.english_qa_datasets or dataset_name in dataset_loader.chinese_qa_datasets:
+        return parse_few_shot_qa_single_answer(prediction, setting_name,
+                                               language)
+    else:
+        raise ValueError(f'Unsupported dataset name {dataset_name}')
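A sketch of what `parse_math_answer` returns for the two setting families: few-shot completions are stripped of their answer prefix, while zero-shot completions fall through to the dollar-sign extractor:

```python
from opencompass.datasets.agieval.post_process import parse_math_answer

print(parse_math_answer('few-shot', 'The answer is therefore 42'))  # '42'
print(parse_math_answer('zero-shot', 'So we get $x = 7$.'))         # '7'
```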
opencompass-my-api/build/lib/opencompass/datasets/agieval/utils.py ADDED
@@ -0,0 +1,43 @@
+# flake8: noqa
+import json
+
+
+def read_jsonl(path):
+    with open(path, encoding='utf8') as fh:
+        results = []
+        for line in fh:
+            if line is None:
+                continue
+            try:
+                results.append(json.loads(line) if line != 'null' else line)
+            except Exception as e:
+                print(e)
+                print(path)
+                print(line)
+                raise e
+    return results
+
+
+def save_jsonl(lines, directory):
+    with open(directory, 'w', encoding='utf8') as f:
+        for line in lines:
+            f.write(json.dumps(line, ensure_ascii=False) + '\n')
+
+
+def extract_answer(js):
+    try:
+        if js is None or js == 'null':
+            return ''
+        answer = ''
+        if isinstance(js, str):
+            answer = js
+        elif 'text' in js['choices'][0]:
+            answer = js['choices'][0]['text']
+        else:
+            answer = js['choices'][0]['message']['content']
+        # answer = js['']
+        return answer
+    except Exception:
+        # print(e)
+        # print(js)
+        return ''
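`extract_answer` accepts both completion-style and chat-style response payloads; a quick sketch with hand-built dicts:

```python
from opencompass.datasets.agieval.utils import extract_answer

print(extract_answer({'choices': [{'text': 'A'}]}))                  # 'A'  (completion style)
print(extract_answer({'choices': [{'message': {'content': 'B'}}]}))  # 'B'  (chat style)
print(extract_answer(None))                                          # ''   (missing response)
```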
opencompass-my-api/build/lib/opencompass/datasets/anli.py ADDED
@@ -0,0 +1,18 @@
+import json
+
+from datasets import Dataset
+
+from .base import BaseDataset
+
+
+class AnliDataset(BaseDataset):
+
+    @staticmethod
+    def load(path: str):
+        dataset = []
+        with open(path, 'r') as f:
+            for line in f:
+                line = json.loads(line)
+                line['label'] = {'c': 'A', 'e': 'B', 'n': 'C'}[line['label']]
+                dataset.append(line)
+        return Dataset.from_list(dataset)
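The label remapping in `load` turns ANLI's c/e/n tags into option letters; a one-line sketch on a hand-written record (field names illustrative):

```python
import json

raw = json.loads('{"context": "c", "hypothesis": "h", "label": "n"}')
raw['label'] = {'c': 'A', 'e': 'B', 'n': 'C'}[raw['label']]
print(raw['label'])  # 'C'
```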
opencompass-my-api/build/lib/opencompass/datasets/anthropics_evals.py ADDED
@@ -0,0 +1,63 @@
+from datasets import load_dataset
+
+from .base import BaseDataset
+
+
+class AiRiskDataset(BaseDataset):
+
+    @staticmethod
+    def load(path: str):
+        """Load dataset."""
+
+        dataset = load_dataset('json', data_files=path)
+
+        def choices_process(example):
+            # the original answer format is ` (A)`, etc.
+            for i in 'ABCDEFGH':
+                if i in example['answer_matching_behavior']:
+                    example['answer_matching_behavior'] = i
+                    break
+            return example
+
+        dataset = dataset.map(choices_process)
+        return dataset
+
+
+class PersonaDataset(BaseDataset):
+
+    @staticmethod
+    def load(path: str):
+        """Load dataset."""
+
+        dataset = load_dataset('json', data_files=path)
+
+        def choices_process(example):
+            # the original answer format is ` No` or ` Yes`.
+            if example['answer_matching_behavior'] == ' Yes':
+                example['answer_matching_behavior'] = 'A'
+            else:
+                example['answer_matching_behavior'] = 'B'
+            return example
+
+        dataset = dataset.map(choices_process)
+        return dataset
+
+
+class SycophancyDataset(BaseDataset):
+
+    @staticmethod
+    def load(path: str):
+        """Load dataset."""
+
+        dataset = load_dataset('json', data_files=path)
+
+        def choices_process(example):
+            # the original answer format is ` (A)`, etc.
+            for i in 'ABCDEFG':
+                if i in example['answer_matching_behavior']:
+                    example['answer_matching_behavior'] = i
+                    break
+            return example
+
+        dataset = dataset.map(choices_process)
+        return dataset
opencompass-my-api/build/lib/opencompass/datasets/arc.py ADDED
@@ -0,0 +1,84 @@
+import json
+import os.path as osp
+
+from datasets import Dataset
+
+from opencompass.registry import LOAD_DATASET
+
+from .base import BaseDataset
+
+
+@LOAD_DATASET.register_module()
+class ARCDataset(BaseDataset):
+
+    @staticmethod
+    def load(path: str):
+        with open(path, 'r', errors='ignore') as in_f:
+            rows = []
+            for line in in_f:
+                item = json.loads(line.strip())
+                question = item['question']
+                if len(question['choices']) != 4:
+                    continue
+                labels = [c['label'] for c in question['choices']]
+                answerKey = 'ABCD'[labels.index(item['answerKey'])]
+                rows.append({
+                    'question': question['stem'],
+                    'answerKey': answerKey,
+                    'textA': question['choices'][0]['text'],
+                    'textB': question['choices'][1]['text'],
+                    'textC': question['choices'][2]['text'],
+                    'textD': question['choices'][3]['text'],
+                })
+            return Dataset.from_list(rows)
+
+
+class ARCDatasetClean(BaseDataset):
+
+    # load the contamination annotations of ARC from
+    # https://github.com/liyucheng09/Contamination_Detector
+    @staticmethod
+    def load_contamination_annotations(path, split='val'):
+        import requests
+
+        assert split == 'test', 'We only have test set annotation for ARC'
+        annotation_cache_path = osp.join(
+            path, f'ARC_c_{split}_contamination_annotations.json')
+        if osp.exists(annotation_cache_path):
+            with open(annotation_cache_path, 'r') as f:
+                annotations = json.load(f)
+            return annotations
+        link_of_annotations = 'https://github.com/liyucheng09/Contamination_Detector/releases/download/v0.1.1rc/ARC_annotations.json'  # noqa
+        annotations = json.loads(requests.get(link_of_annotations).text)
+        with open(annotation_cache_path, 'w') as f:
+            json.dump(annotations, f)
+        return annotations
+
+    @staticmethod
+    def load(path: str):
+        annotations = ARCDatasetClean.load_contamination_annotations(
+            osp.dirname(path), 'test')
+        with open(path, 'r', errors='ignore') as in_f:
+            rows = []
+            for line in in_f:
+                item = json.loads(line.strip())
+                id_ = item['id']
+                question = item['question']
+                if id_ in annotations:
+                    is_clean = annotations[id_][0]
+                else:
+                    is_clean = 'not labeled'
+                if len(question['choices']) != 4:
+                    continue
+                labels = [c['label'] for c in question['choices']]
+                answerKey = 'ABCD'[labels.index(item['answerKey'])]
+                rows.append({
+                    'question': question['stem'],
+                    'answerKey': answerKey,
+                    'textA': question['choices'][0]['text'],
+                    'textB': question['choices'][1]['text'],
+                    'textC': question['choices'][2]['text'],
+                    'textD': question['choices'][3]['text'],
+                    'is_clean': is_clean,
+                })
+            return Dataset.from_list(rows)
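Indexing `'ABCD'` by the position of `answerKey` among the choice labels normalizes numeric and letter keys alike; a sketch with an illustrative label list:

```python
labels = ['1', '2', '3', '4']          # choice labels as they appear in the file
answerKey = 'ABCD'[labels.index('3')]  # the same line works when labels are A-D
print(answerKey)                       # 'C'
```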
opencompass-my-api/build/lib/opencompass/datasets/ax.py ADDED
@@ -0,0 +1,24 @@
+import json
+
+from datasets import Dataset
+
+from opencompass.registry import LOAD_DATASET
+
+from .base import BaseDataset
+
+
+@LOAD_DATASET.register_module()
+class AXDataset_V2(BaseDataset):
+
+    @staticmethod
+    def load(path: str):
+        dataset = []
+        with open(path, 'r') as f:
+            for line in f:
+                line = json.loads(line)
+                line['label'] = {
+                    'entailment': 'A',
+                    'not_entailment': 'B'
+                }[line['label']]
+                dataset.append(line)
+        return Dataset.from_list(dataset)
opencompass-my-api/build/lib/opencompass/datasets/base.py ADDED
@@ -0,0 +1,28 @@
+from abc import abstractstaticmethod
+from typing import Dict, Optional, Union
+
+from datasets import Dataset, DatasetDict
+
+from opencompass.openicl import DatasetReader
+
+
+class BaseDataset:
+
+    def __init__(self, reader_cfg: Optional[Dict] = {}, **kwargs):
+        self.dataset = self.load(**kwargs)
+        self._init_reader(**reader_cfg)
+
+    def _init_reader(self, **kwargs):
+        self.reader = DatasetReader(self.dataset, **kwargs)
+
+    @property
+    def train(self):
+        return self.reader.dataset['train']
+
+    @property
+    def test(self):
+        return self.reader.dataset['test']
+
+    @abstractstaticmethod
+    def load(**kwargs) -> Union[Dataset, DatasetDict]:
+        pass
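A minimal sketch of the subclass contract `BaseDataset` expects: a static `load` that returns a `Dataset` (the class name and jsonl format here are illustrative, not from the repo):

```python
import json

from datasets import Dataset

from opencompass.datasets.base import BaseDataset


class MyJsonlDataset(BaseDataset):

    @staticmethod
    def load(path: str):
        # One JSON object per line; __init__ passes kwargs straight to load().
        with open(path, 'r', encoding='utf-8') as f:
            return Dataset.from_list([json.loads(line) for line in f])
```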
opencompass-my-api/build/lib/opencompass/datasets/bbh.py ADDED
@@ -0,0 +1,98 @@
+import json
+import os.path as osp
+import re
+
+from datasets import Dataset
+
+from opencompass.openicl.icl_evaluator import BaseEvaluator
+from opencompass.registry import (ICL_EVALUATORS, LOAD_DATASET,
+                                  TEXT_POSTPROCESSORS)
+
+from .base import BaseDataset
+
+
+@LOAD_DATASET.register_module()
+class BBHDataset(BaseDataset):
+
+    @staticmethod
+    def load(path: str, name: str):
+        with open(osp.join(path, f'{name}.json'), 'r') as f:
+            data = json.load(f)['examples']
+        dataset = Dataset.from_list(data)
+        return dataset
+
+
+@TEXT_POSTPROCESSORS.register_module('bbh-mcq')
+def bbh_mcq_postprocess(text: str) -> str:
+    ans = text
+    ans_line = ans.split('answer is ')
+    if len(ans_line) != 1:
+        ans = ans_line[1].strip()
+    match = re.search(r'\(([A-Z])\)*', ans)
+    if match:
+        return match.group(1)
+    match = re.search(r'([A-Z])', ans)
+    if match:
+        return match.group(1)
+    return ans
+
+
+@TEXT_POSTPROCESSORS.register_module('bbh-freeform')
+def bbh_freeform_postprocess(text: str) -> str:
+    ans = text
+    ans_line = ans.split('answer is ')
+    if len(ans_line) != 1:
+        ans = ans_line[1].strip()
+    ans = ans.split('\n')[0]
+    if ans.endswith('.'):
+        ans = ans[:-1]
+    return ans
+
+
+@ICL_EVALUATORS.register_module()
+class BBHEvaluator(BaseEvaluator):
+
+    def score(self, predictions, references):
+        if len(predictions) != len(references):
+            return {
+                'error': 'predictions and references have different '
+                'length'
+            }
+
+        predictions = [bbh_freeform_postprocess(pred) for pred in predictions]
+
+        details = []
+        cnt = 0
+        for pred, ref in zip(predictions, references):
+            detail = {'pred': pred, 'answer': ref, 'correct': False}
+            if pred == ref:
+                cnt += 1
+                detail['correct'] = True
+            details.append(detail)
+
+        score = cnt / len(predictions) * 100
+
+        return {'score': score, 'details': details}
+
+
+@ICL_EVALUATORS.register_module()
+class BBHEvaluator_mcq(BaseEvaluator):
+
+    def score(self, predictions, references):
+        if len(predictions) != len(references):
+            return {
+                'error': 'predictions and references have different '
+                'length'
+            }
+        details = []
+        cnt = 0
+        for pred, ref in zip(predictions, references):
+            detail = {'pred': pred, 'answer': ref, 'correct': False}
+            if pred == ref:
+                cnt += 1
+                detail['correct'] = True
+            details.append(detail)
+
+        score = cnt / len(predictions) * 100
+
+        return {'score': score, 'details': details}
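A sketch of the two registered postprocessors on typical completions: both split on `'answer is '`, then the MCQ variant pulls the bracketed letter while the freeform variant keeps the first line minus a trailing period:

```python
from opencompass.datasets.bbh import bbh_freeform_postprocess, bbh_mcq_postprocess

print(bbh_mcq_postprocess('... so the answer is (C).'))         # 'C'
print(bbh_freeform_postprocess('The answer is valid.\nextra'))  # 'valid'
```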
opencompass-my-api/build/lib/opencompass/datasets/boolq.py ADDED
@@ -0,0 +1,56 @@
+import json
+
+from datasets import Dataset, load_dataset
+
+from opencompass.registry import LOAD_DATASET
+
+from .base import BaseDataset
+
+
+@LOAD_DATASET.register_module()
+class BoolQDataset(BaseDataset):
+
+    @staticmethod
+    def load(**kwargs):
+        dataset = load_dataset(**kwargs)
+
+        def preprocess(example):
+            if example['label'] == 'true':
+                example['answer'] = 1
+            else:
+                example['answer'] = 0
+            return example
+
+        dataset = dataset.map(preprocess)
+        return dataset
+
+
+@LOAD_DATASET.register_module()
+class BoolQDataset_V2(BaseDataset):
+
+    @staticmethod
+    def load(path):
+        dataset = []
+        with open(path, 'r') as f:
+            for line in f:
+                line = json.loads(line)
+                line['label'] = {'true': 'A', 'false': 'B'}[line['label']]
+                dataset.append(line)
+        return Dataset.from_list(dataset)
+
+
+@LOAD_DATASET.register_module()
+class BoolQDataset_V3(BaseDataset):
+
+    @staticmethod
+    def load(path):
+        dataset = []
+        with open(path, 'r') as f:
+            for line in f:
+                line = json.loads(line)
+                line['passage'] = ' -- '.join(
+                    line['passage'].split(' -- ')[1:])
+                line['question'] = line['question'][0].upper(
+                ) + line['question'][1:]
+                dataset.append(line)
+        return Dataset.from_list(dataset)
opencompass-my-api/build/lib/opencompass/datasets/bustum.py ADDED
@@ -0,0 +1,21 @@
+import json
+
+from datasets import Dataset
+
+from opencompass.registry import LOAD_DATASET
+
+from .base import BaseDataset
+
+
+@LOAD_DATASET.register_module()
+class bustumDataset_V2(BaseDataset):
+
+    @staticmethod
+    def load(path):
+        data = []
+        with open(path, 'r', encoding='utf-8') as f:
+            for line in f:
+                line = json.loads(line)
+                line['label'] = 'AB'[int(line['label'])]
+                data.append(line)
+        return Dataset.from_list(data)
opencompass-my-api/build/lib/opencompass/datasets/c3.py ADDED
@@ -0,0 +1,80 @@
+import json
+
+from datasets import Dataset
+
+from opencompass.registry import LOAD_DATASET
+
+from .base import BaseDataset
+
+
+@LOAD_DATASET.register_module()
+class C3Dataset(BaseDataset):
+
+    @staticmethod
+    def load(path: str):
+
+        with open(path, 'r', encoding='utf-8') as f:
+            data = json.load(f)
+            rows = []
+            for _, row in enumerate(data):
+                content = row[0]
+                content_str = ' '.join(
+                    [''.join(paragraph) for paragraph in content])
+
+                for question in row[1]:
+                    label = question['choice'].index(question['answer'])
+                    length = len(question['choice'])
+                    if length < 4:
+                        fill_value = question['choice'][0]  # pad with the first choice
+                        fill_count = 4 - length  # number of slots to fill
+                        question['choice'] += [fill_value] * fill_count  # pad
+
+                    rows.append({
+                        'content': content_str,
+                        'question': question['question'],
+                        'choices': question['choice'],
+                        'choice0': question['choice'][0],
+                        'choice1': question['choice'][1],
+                        'choice2': question['choice'][2],
+                        'choice3': question['choice'][3],
+                        'label': label
+                    })
+
+            dataset = Dataset.from_dict({
+                'content': [row['content'] for row in rows],
+                'question': [row['question'] for row in rows],
+                'choice0': [row['choice0'] for row in rows],
+                'choice1': [row['choice1'] for row in rows],
+                'choice2': [row['choice2'] for row in rows],
+                'choice3': [row['choice3'] for row in rows],
+                'choices': [row['choices'] for row in rows],
+                'label': [row['label'] for row in rows]
+            })
+            return dataset
+
+
+@LOAD_DATASET.register_module()
+class C3Dataset_V2(BaseDataset):
+
+    @staticmethod
+    def load(path: str):
+        with open(path, 'r', encoding='utf-8') as f:
+            raw = json.load(f)
+        data = []
+        for line in raw:
+            content = ''.join([''.join(paragraph) for paragraph in line[0]])
+            for question in line[1]:
+                label = question['choice'].index(question['answer'])
+                label = 'ABCD'[label]
+                while len(question['choice']) < 4:
+                    question['choice'].append('[NULL]')
+                data.append({
+                    'content': content,
+                    'question': question['question'],
+                    'choice0': question['choice'][0],
+                    'choice1': question['choice'][1],
+                    'choice2': question['choice'][2],
+                    'choice3': question['choice'][3],
+                    'label': label
+                })
+        return Dataset.from_list(data)
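The two loaders pad short choice lists differently; a sketch of each rule on a two-option question:

```python
choice = ['对', '错']
choice += [choice[0]] * (4 - len(choice))  # C3Dataset:    ['对', '错', '对', '对']

choice_v2 = ['对', '错']
while len(choice_v2) < 4:
    choice_v2.append('[NULL]')             # C3Dataset_V2: ['对', '错', '[NULL]', '[NULL]']
```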
opencompass-my-api/build/lib/opencompass/datasets/cb.py ADDED
@@ -0,0 +1,25 @@
+import json
+
+from datasets import Dataset
+
+from opencompass.registry import LOAD_DATASET
+
+from .base import BaseDataset
+
+
+@LOAD_DATASET.register_module()
+class CBDataset_V2(BaseDataset):
+
+    @staticmethod
+    def load(path):
+        dataset = []
+        with open(path, 'r') as f:
+            for line in f:
+                line = json.loads(line)
+                line['label'] = {
+                    'contradiction': 'A',
+                    'entailment': 'B',
+                    'neutral': 'C'
+                }[line['label']]
+                dataset.append(line)
+        return Dataset.from_list(dataset)
opencompass-my-api/build/lib/opencompass/datasets/ceval.py ADDED
@@ -0,0 +1,76 @@
+import csv
+import json
+import os.path as osp
+
+from datasets import Dataset, DatasetDict
+
+from opencompass.registry import LOAD_DATASET
+
+from .base import BaseDataset
+
+
+@LOAD_DATASET.register_module()
+class CEvalDataset(BaseDataset):
+
+    @staticmethod
+    def load(path: str, name: str):
+        dataset = {}
+        for split in ['dev', 'val', 'test']:
+            filename = osp.join(path, split, f'{name}_{split}.csv')
+            with open(filename, encoding='utf-8') as f:
+                reader = csv.reader(f)
+                header = next(reader)
+                for row in reader:
+                    item = dict(zip(header, row))
+                    item.setdefault('explanation', '')
+                    item.setdefault('answer', '')
+                    dataset.setdefault(split, []).append(item)
+        dataset = {i: Dataset.from_list(dataset[i]) for i in dataset}
+        return DatasetDict(dataset)
+
+
+class CEvalDatasetClean(BaseDataset):
+
+    # load the contamination annotations of CEval from
+    # https://github.com/liyucheng09/Contamination_Detector
+    @staticmethod
+    def load_contamination_annotations(path, split='val'):
+        import requests
+
+        assert split == 'val', 'Now we only have annotations for val set'
+        annotation_cache_path = osp.join(
+            path, split, 'ceval_contamination_annotations.json')
+        if osp.exists(annotation_cache_path):
+            with open(annotation_cache_path, 'r') as f:
+                annotations = json.load(f)
+            return annotations
+        link_of_annotations = 'https://github.com/liyucheng09/Contamination_Detector/releases/download/v0.1.1rc/ceval_annotations.json'  # noqa
+        annotations = json.loads(requests.get(link_of_annotations).text)
+        with open(annotation_cache_path, 'w') as f:
+            json.dump(annotations, f)
+        return annotations
+
+    @staticmethod
+    def load(path: str, name: str):
+        dataset = {}
+        for split in ['dev', 'val', 'test']:
+            if split == 'val':
+                annotations = CEvalDatasetClean.load_contamination_annotations(
+                    path, split)
+            filename = osp.join(path, split, f'{name}_{split}.csv')
+            with open(filename, encoding='utf-8') as f:
+                reader = csv.reader(f)
+                header = next(reader)
+                for row_index, row in enumerate(reader):
+                    item = dict(zip(header, row))
+                    item.setdefault('explanation', '')
+                    item.setdefault('answer', '')
+                    if split == 'val':
+                        row_id = f'{name}-{row_index}'
+                        if row_id in annotations:
+                            item['is_clean'] = annotations[row_id][0]
+                        else:
+                            item['is_clean'] = 'not labeled'
+                    dataset.setdefault(split, []).append(item)
+        dataset = {i: Dataset.from_list(dataset[i]) for i in dataset}
+        return DatasetDict(dataset)
opencompass-my-api/build/lib/opencompass/datasets/chid.py ADDED
@@ -0,0 +1,43 @@
+import json
+
+from datasets import Dataset, load_dataset
+
+from opencompass.registry import LOAD_DATASET
+
+from .base import BaseDataset
+
+
+@LOAD_DATASET.register_module()
+class CHIDDataset(BaseDataset):
+
+    @staticmethod
+    def load(**kwargs):
+
+        dataset = load_dataset(**kwargs)
+
+        def preprocess(example):
+            content = example['content']
+            for i, c in enumerate(example['candidates']):
+                example[f'content{i}'] = content.replace('#idiom#', c)
+            return example
+
+        dataset = dataset.map(preprocess)
+        return dataset
+
+
+@LOAD_DATASET.register_module()
+class CHIDDataset_V2(BaseDataset):
+
+    @staticmethod
+    def load(path):
+        data = []
+        with open(path, 'r', encoding='utf-8') as f:
+            for line in f:
+                line = json.loads(line)
+                item = {}
+                item['content'] = line['content'].replace('#idiom#', '______')
+                for i, c in enumerate(line['candidates']):
+                    item[chr(ord('A') + i)] = c
+                item['answer'] = 'ABCDEFG'[line['answer']]
+                data.append(item)
+        return Dataset.from_list(data)
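A sketch of `CHIDDataset_V2`'s candidate lettering on a hand-written line (the content and candidates are illustrative):

```python
line = {'content': '他#idiom#地答了题', 'candidates': ['胸有成竹', '手忙脚乱'], 'answer': 0}
item = {'content': line['content'].replace('#idiom#', '______')}
for i, c in enumerate(line['candidates']):
    item[chr(ord('A') + i)] = c             # item['A'] = '胸有成竹', item['B'] = '手忙脚乱'
item['answer'] = 'ABCDEFG'[line['answer']]  # 'A'
```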
opencompass-my-api/build/lib/opencompass/datasets/cibench.py ADDED
@@ -0,0 +1,511 @@
+import json
+import os
+import os.path as osp
+import re
+import subprocess
+from collections import defaultdict
+from typing import List, Optional
+
+import numpy as np
+from datasets import Dataset
+
+from opencompass.openicl.icl_evaluator import BaseEvaluator
+from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
+
+from .base import BaseDataset
+
+
+def load_experiment(file: str) -> dict:
+    """Load single experiment file with solutions."""
+    with open(file, 'r') as f:
+        notebook = json.load(f)
+        example = notebook['cells']
+        metadata = notebook['metadata']
+        modules = metadata.get('modules', [])
+        if modules:
+            # these two annotations should be the same
+            assert len(modules) == len(metadata.get('step_types'))
+            # reformat annotations
+            modules = [[_m.strip() for _m in _modules.split('&')]
+                       for _modules in modules]
+    questions = []
+    source_codes = []
+    outputs = []
+    tags = []
+    for cell in example:
+        if cell['cell_type'] == 'markdown':
+            text = ''.join(cell['source']).strip()
+            if modules:
+                _modules = modules.pop(0)
+                text += f"Please use {' and '.join(_modules)} modules."
+            text = text.strip() + '\n'
+            # append the formatted text
+            questions.append(text)
+        elif cell['cell_type'] == 'code':
+            source_codes.append(''.join(cell['source']))
+            if cell['outputs'] and 'data' in cell['outputs'][-1]:
+                if 'image/png' in cell['outputs'][-1]['data']:
+                    # skip vis temporarily due to lack of evaluation
+                    tags.append('vis')
+                    outputs.append(
+                        cell['outputs'][-1]['data']['image/png'])
+                elif 'text/plain' in cell['outputs'][-1]['data']:
+                    tags.append('general')
+                    outputs.append(''.join(
+                        cell['outputs'][-1]['data']['text/plain']))
+            else:
+                tags.append('exec')
+                outputs.append(None)
+    return dict(
+        experiment=file,
+        questions=sum(([
+            dict(role='user', content=question),
+            dict(role='assistant', content=source_code)
+        ] for question, source_code in zip(questions, source_codes)), []),
+        references=dict(outputs=outputs,
+                        tags=tags,
+                        metadata=metadata,
+                        experiment=file),
+    )
+
+
+def load_experiment_template(file: str) -> dict:
+    """Load single experiment file with solutions for template experiment."""
+    with open(file, 'r') as f:
+        notebook = json.load(f)
+        example = notebook['cells']
+        metadata = notebook['metadata']
+        modules = metadata.get('modules', [])
+        if modules:
+            # these two annotations should be the same
+            assert len(modules) == len(metadata.get('step_types'))
+            # reformat annotations
+            modules = [[_m.strip() for _m in _modules.split('&')]
+                       for _modules in modules]
+    questions = []
+    source_codes = []
+    outputs = []
+    tags = []
+    for cell in example:
+        if cell['cell_type'] == 'markdown':
+            text = ''.join(cell['source']).strip()
+            if modules:
+                _modules = modules.pop(0)
+                if 'chinese' not in file:
+                    text += f"Please use {' and '.join(_modules)} modules."
+                else:
+                    text += f"请用 {' 和 '.join(_modules)} 模块."
+            text = text.strip() + '\n'
+            # append the formatted text
+            questions.append(text)
+        elif cell['cell_type'] == 'code':
+            source_codes.append(''.join(cell['source']))
+            output_flag = False
+            if cell['outputs']:
+                for _output in cell['outputs']:
+                    if _output['output_type'] == 'display_data':
+                        assert not output_flag
+                        if 'image/png' in _output['data']:
+                            output_flag = True
+                            tags.append('vis')
+                            outputs.append(_output['data']['image/png'])
+                for _output in cell['outputs'][::-1]:
+                    if output_flag:
+                        break
+                    if _output['output_type'] == 'stream' and _output[
+                            'name'] == 'stdout':
+                        assert not output_flag
+                        output_flag = True
+                        tags.append('general')
+                        outputs.append(''.join(_output['text']))
+                    elif _output['output_type'] == 'execute_result':
+                        assert not output_flag
+                        output_flag = True
+                        tags.append('general')
+                        outputs.append(''.join(
+                            _output['data']['text/plain']))
+            if not output_flag:
+                # no output fallback to exec
+                tags.append('exec')
+                outputs.append(None)
+    return dict(
+        experiment=file,
+        questions=sum(([
+            dict(role='user', content=question),
+            dict(role='assistant', content=source_code)
+        ] for question, source_code in zip(questions, source_codes)), []),
+        references=dict(outputs=outputs,
+                        tags=tags,
+                        metadata=metadata,
+                        experiment=file),
+    )
+
+
+def check_internet():
+    """A tricky way to check internet."""
+    import socket
+
+    import nltk
+    socket.setdefaulttimeout(10)
+    ret = nltk.download('stopwords', quiet=True)
+    socket.setdefaulttimeout(None)
+    if not ret:
+        raise ConnectionError('CIBench needs internet to get response. '
+                              'Please check your internet and proxy.')
+
+
+@LOAD_DATASET.register_module()
+class CIBenchDataset(BaseDataset):
+    """Code Interpreter dataset."""
+
+    @staticmethod
+    def load(path: str, internet_check: bool = False):
+        """Load whole dataset.
+
+        Args:
+            path(str): Path of cibench dataset.
+            internet_check(bool): Whether to check internet.
+                Defaults to False.
+        """
+        if internet_check:
+            check_internet()
+        assert os.path.exists(path), f'Path {path} does not exist.'
+        data_list = []
+        for cwd, dirs, files in os.walk(path):
+            dirs.sort()
+            files.sort()
+            for f in files:
+                if '.ipynb' in f:
+                    data = load_experiment(os.path.join(cwd, f))
+                    data_list.append(data)
+
+        dataset = Dataset.from_list(data_list)
+        return dataset
+
+
+@LOAD_DATASET.register_module()
+class CIBenchTemplateDataset(BaseDataset):
+    """Code Interpreter dataset for template dataset."""
+
+    @staticmethod
+    def load(path: str, internet_check: bool = False):
+        """Load whole dataset.
+
+        Args:
+            path(str): Path of cibench dataset.
+            internet_check(bool): Whether to check internet.
+                Defaults to False.
+        """
+        if internet_check:
+            check_internet()
+        assert os.path.exists(path), f'Path {path} does not exist.'
+        data_list = []
+        for cwd, dirs, files in os.walk(path):
+            dirs.sort()
+            files.sort()
+            for f in files:
+                if '.ipynb' in f:
+                    data = load_experiment_template(os.path.join(cwd, f))
+                    data_list.append(data)
+
+        dataset = Dataset.from_list(data_list)
+        return dataset
+
+
+class CIBenchEvaluator(BaseEvaluator):
+    """Evaluator for CI dataset.
+
+    Args:
+        text_evaluator (optional, dict): The text evaluator for text result
+            comparison. Defaults to None, which uses Rouge as the default.
+            Please notice that an extra key `metric_name` should be set
+            to get the exact metric result, such as `rouge1`.
+        output_dir (optional, str): The directory to save experiment
+            files in a markdown or notebook format.
+        with_ipynb (bool): Generate ipynb correspondingly.
+            Defaults to False.
+        user_data_dir (str): The directory to load local files.
+            Defaults to 'ENV', which means use environment variable
+            `USER_DATA_DIR` to get the data dir.
+    """
+
+    def __init__(self,
+                 text_evaluator: Optional[dict] = None,
+                 output_dir: Optional[str] = None,
+                 with_ipynb: bool = False,
+                 user_data_dir: str = 'ENV') -> None:
+        if text_evaluator is None:
+            from opencompass.openicl.icl_evaluator import RougeEvaluator
+            self.text_evaluator = ICL_EVALUATORS.build(
+                dict(type=RougeEvaluator))
+            self.text_eval_metric = 'rouge1'
+        else:
+            self.text_eval_metric = text_evaluator.pop('metric_name')
+            self.text_evaluator = ICL_EVALUATORS.build(text_evaluator)
+        # TODO: should use work dir for this task.
+        self.output_dir = output_dir
+        self.user_data_dir = self.check_user_data_dir(user_data_dir)
+        self.with_ipynb = with_ipynb
+        self.TAG_MAPPING = {
+            'exec': ('executable', self.valid_step),
+            'general': ('general_correct', self.correct_step),
+            'num': ('numeric_correct', self.correct_step),
+            'text': ('text_score', self.text_step),
+            'vis': ('vis_sim', self.vis_similarity_step),
+        }
+
+    def check_user_data_dir(self, user_data_dir):
+        if user_data_dir == 'ENV':
+            default_path = osp.abspath('./data/cibench_dataset/datasources')
+            user_data_dir = os.environ.get('USER_DATA_DIR', default_path)
+        user_data_dir = user_data_dir.rstrip('/')
+        basename = osp.basename(user_data_dir)
+        if basename and basename != 'data':
+            user_data_dir = osp.join(user_data_dir, 'data')
+            assert osp.exists(user_data_dir), \
+                f'a subfolder named `data` should exist under {user_data_dir}.'
+        elif basename:
+            assert osp.exists(user_data_dir), \
+                f'{user_data_dir} does not exist.'
+        return user_data_dir
+
+    @staticmethod
+    def valid_step(step):
+        """Whether the step is executable and valid."""
+        # Find the latest code interpreter action to determine validity
+        for action in step[::-1]:
+            if action['type'] == 'IPythonInterpreter':
+                if action['errmsg']:
+                    return False
+                else:
+                    return True
+        # No code interpreter for this step, reckon as False
+        return False
+
+    @staticmethod
+    def correct_step(step, target):
+        """Whether the step output is correct."""
+        # Find the latest code interpreter action to determine correctness
+        for action in step[::-1]:
+            if action['type'] == 'IPythonInterpreter':
+                if action['result']:
+                    try:
+                        pred = action['result']['text']
+                        match_exec = re.search(
+                            'execute_result:\n\n```\n(.*?)\n```', pred,
+                            re.DOTALL)
+                        match_stdout = re.search('stdout:\n\n```\n(.*?)\n```',
+                                                 pred, re.DOTALL)
+                        # get pred result from execute_result by default
+                        # else stdout
+                        if match_exec and match_stdout:
+                            match = match_exec
+                        elif match_exec:
+                            match = match_exec
+                        elif match_stdout:
+                            match = match_stdout
+                        else:
+                            match = None
+                        if match:
+                            out = match.group(1)
+                            score = (out.strip() == target.strip()
+                                     or target.strip() in out.strip())
+                            return score
+                    except Exception:
+                        return False
+        # Fall back to False
+        return False
+
+    def text_step(self, step, target):
+        """Whether the step output is correct."""
+        # Find the latest code interpreter action to determine correctness
+        for action in step[::-1]:
+            if action['type'] == 'IPythonInterpreter':
+                if action['result']:
+                    try:
+                        pred = action['result']['text']
+                        match = re.search('```\n(.*?)\n```', pred, re.DOTALL)
+                        if match:
+                            out = match.group(1)
+                            score = self.text_evaluator.score([out], [target])
+                            return score[self.text_eval_metric] / 100
+                    except Exception:
+                        return False
+        # Fall back to False
+        return False
+
+    @staticmethod
+    def vis_similarity_step(step, target):
+        """Whether the step output image has the same structural similarity
+        as the given images."""
+        # Find the latest code interpreter action to determine correctness
+        import base64
+
+        import skimage
+
+        for action in step[::-1]:
+            if action['type'] == 'IPythonInterpreter':
+                if action['result']:
+                    try:
+                        pred = action['result']['text']
+                        match = re.search(r'!\[fig-[0-9]*\]\((.*?)\)', pred,
+                                          re.DOTALL)
+                        if match:
+                            img_pred = match.group(1)
+                            img2 = base64.b64decode(target)
+                            img2 = skimage.io.imread(img2, plugin='imageio')
+                            img1 = skimage.io.imread(img_pred, plugin='imageio')
+                            img1 = skimage.transform.resize(img1, img2.shape[:2])
+                            img1 = 255 * img1
+                            # Convert to integer data type pixels.
+                            img1 = img1.astype(np.uint8)
+                            ssim = skimage.metrics.structural_similarity(
+                                img1, img2, channel_axis=-1)
+                            # mse = skimage.metrics.mean_squared_error(img1, img2)
+                            # ssim greater better
+                            # mse smaller better but has no upper bound
+                            return ssim
+                    except Exception:
+                        return 0
+        # Fall back to 0
+        return 0
+
+    def save_results(self, origin_prompt, steps):
+        """Save the prediction result in a markdown and notebook format."""
+
+        def check_jupytext():
+            """Check requirements existence."""
+            from shutil import which
+
+            assert which('jupytext'), (
+                "Please install jupytext using 'pip install jupytext' to "
+                'ensure the conversion processes.')
+
+        check_jupytext()
+        p_list = []
+        from opencompass.lagent.actions.ipython_interpreter import extract_code
+        for idx, (example_origin_prompt,
+                  example_steps) in enumerate(zip(origin_prompt, steps)):
+            markdown_lines = []
+            for prompt, step in zip(example_origin_prompt, example_steps):
+                for action in step[::-1]:
+                    if action['type'] == 'IPythonInterpreter':
+                        valid_action = action
+                        break
+                    # fall back to final action
+                    valid_action = step[-1]
+                markdown_lines.append(prompt)
+                markdown_lines.append('\n')
+                code_text = valid_action['args']['text']
+                code_text = extract_code(code_text)
+                code_text = '```python\n' + code_text + '\n```'
+                markdown_lines.append(code_text)
+                markdown_lines.append('\n')
+
+            md_file = f'experiment{idx}.md'
+            with open(md_file, 'w') as f:
+                f.writelines(markdown_lines)
+
+            # TODO: be careful for this
+            # The result might be different with infer process
+            # please check carefully
+            # convert markdown to ipynb and execute with error tolerance
+            if self.with_ipynb:
+                p = subprocess.Popen(
+                    'jupytext --to ipynb --pipe-fmt ipynb '
+                    "--pipe 'jupyter nbconvert --to ipynb --execute "
+                    f"--allow-errors --stdin --stdout' {md_file}",
+                    shell=True)
+                p_list.append(p)
+        # TODO: async wait
+        for p in p_list:
+            p.wait()
+
+    def set_data_dir(self, work_dir):
+        """Set work directory and link data files for saving notebook
+        results."""
+        if self.user_data_dir:
+            basename = osp.basename(self.user_data_dir)
+
+            if not osp.exists(osp.join(self.output_dir, basename)):
+                os.symlink(self.user_data_dir,
+                           osp.join(self.output_dir, basename))
+        os.chdir(work_dir)
+
+    def unset_data_dir(self, work_dir):
+        """Change work directory and keep the symlink."""
+        os.chdir(work_dir)
+
+    def single_exp(self, gold, steps):
+        tags = gold['tags']
+        outputs = gold['outputs']
+        metadata = gold['metadata']
+        hard_tags = metadata.get('step_types', [])
+        if hard_tags:
+            tags = hard_tags
+
+        # executable: exec succeed
+        # general_correct: general correct
+        # numeric_correct: numerical correct
+        # text_score: text score
+        # vis_sim: visual similarity
+
+        # create empty results
+        result = dict()
+        if hard_tags:
+            check_tags = ['exec', 'num', 'text', 'vis']
+        else:
+            check_tags = ['exec', 'general', 'vis']
+        for tag in check_tags:
+            key = self.TAG_MAPPING[tag][0]
+            result[key] = []
+
+        for tag, step, output in zip(tags, steps, outputs):
+            # check whether this step is valid
+            result['executable'].append(self.valid_step(step))
+            if tag != 'exec':
+                key, func = self.TAG_MAPPING[tag]
+                result[key].append(func(step, output))
+
+        return result
+
+    def get_output_dir(self):
+        """Get output dir from eval task.
+
+        Notice: output dir should be in format xxx/data.
+        All the needed files should be put under the output dir.
+        """
+        # hard hack for get output dir from eval task
+        if hasattr(self, '_out_dir') and self.output_dir is None:
+            self.output_dir = self._out_dir
+
+    def score(self, predictions: List, references: List, steps: List,
+              origin_prompt: List):
+        """Calculate accuracy."""
+        if len(steps) != len(references):
+            return {'error': 'steps and references have different length'}
+        cwd = os.getcwd()
+        self.get_output_dir()
+        if self.output_dir:
+            if not osp.exists(self.output_dir):
+                os.makedirs(self.output_dir)
+            self.set_data_dir(self.output_dir)
+            self.save_results(origin_prompt, steps)
+            self.unset_data_dir(cwd)
+
+        total_results = defaultdict(float)
+        total_scores = defaultdict(float)
+        total_nums = defaultdict(int)
+        for gold, single_steps in zip(references, steps):
+            result = self.single_exp(gold, single_steps)
+
+            for k, v in result.items():
+                total_scores[k] += sum(v)
+                total_nums[k] += len(v)
+
+        for k, v in total_scores.items():
+            if total_nums[k] > 0:
+                total_results[k] = total_scores[k] / total_nums[k] * 100
+            else:
+                total_results[k] = -1
+
+        return total_results
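A sketch of how `score` aggregates the per-step booleans that `single_exp` returns into percentages:

```python
from collections import defaultdict

# Two experiments' 'executable' booleans, as single_exp would produce them.
results = [{'executable': [True, False]}, {'executable': [True]}]
total_scores, total_nums = defaultdict(float), defaultdict(int)
for result in results:
    for k, v in result.items():
        total_scores[k] += sum(v)
        total_nums[k] += len(v)
print({k: total_scores[k] / total_nums[k] * 100 for k in total_nums})
# {'executable': 66.66...}
```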
opencompass-my-api/build/lib/opencompass/datasets/circular.py ADDED
@@ -0,0 +1,373 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import itertools
3
+ from typing import Callable, List, Optional, Union
4
+
5
+ from datasets import Dataset, DatasetDict
6
+
7
+ from opencompass.openicl.icl_evaluator import BaseEvaluator
8
+
9
+ from .arc import ARCDataset
10
+ from .ceval import CEvalDataset
11
+ from .cmmlu import CMMLUDataset
12
+ from .commonsenseqa import commonsenseqaDataset
13
+ from .hellaswag import hellaswagDataset_V2
14
+ from .mmlu import MMLUDataset
15
+ from .obqa import OBQADataset
16
+ from .piqa import piqaDataset_V2
17
+ from .race import RaceDataset
18
+ from .siqa import siqaDataset_V3
19
+ from .xiezhi import XiezhiDataset
20
+
21
+
22
+ def get_origin_patterns(option_keys):
23
+ return [tuple(option_keys)]
24
+
25
+
26
+ def get_circular_patterns(option_keys):
27
+ double_option_keys = option_keys + option_keys
28
+ circular_patterns = [
29
+ tuple(double_option_keys[i:i + len(option_keys)])
30
+ for i in range(len(option_keys))
31
+ ]
32
+ return circular_patterns
33
+
34
+
35
+ def get_all_possible_patterns(option_keys):
36
+ circular_patterns = list(itertools.permutations(option_keys))
37
+ return circular_patterns
38
+
39
+
40
+ class CircularDatasetMeta(type):
41
+ """This Meta Class is designed to transform a class that reads datasets
42
+ into one that supports reading datasets required for CircularEval. It
43
+ overloads an existing load method for the original class.
44
+
45
+ The Meta Class should possess the following attributes:
46
+
47
+ - `dataset_class` (class): The class for reading datasets, such as
48
+ `CEvalDataset`.
49
+ - `default_circular_splits` (list, optional): The default splits of the
50
+ dataset that need to undergo CircularEval, like ['val', 'test']. If a
51
+ `Dataset` is loaded originally, this field will be ignored.
52
+ - `default_option_keys` (list): The keys for options in the dataset, such
53
+ as ['A', 'B', 'C', 'D'].
54
+ - `default_answer_key` (str, optional): The key for answers in the dataset,
55
+ like 'answer'. This is an alternative to
56
+ `default_answer_key_switch_method`.
57
+ - `default_answer_key_switch_method` (function, optional): The method to
58
+ transform the key for answers in the dataset. This is an alternative to
59
+ `default_answer_key`.
60
+ """
61
+
62
+ @staticmethod
63
+ def make_circular_items(
64
+ origin_item,
65
+ circular_patterns,
66
+ option_keys,
67
+ answer_key,
68
+ answer_key_switch_method,
69
+ qid,
70
+ ):
71
+ items = []
72
+ for circular_pattern in circular_patterns:
73
+ item = copy.deepcopy(origin_item)
74
+ for i in range(len(option_keys)):
75
+ item[circular_pattern[i]] = origin_item[option_keys[i]]
76
+ if answer_key_switch_method is None:
77
+ if origin_item[answer_key] in option_keys:
78
+ item[answer_key] = circular_pattern[option_keys.index(
79
+ origin_item[answer_key])]
80
+ else:
81
+ pass
82
+ else:
83
+ item = answer_key_switch_method(item, circular_pattern)
84
+ item['qid'] = qid
85
+ item['circular_pattern'] = tuple(circular_pattern)
86
+ items.append(item)
87
+ return items
88
+
89
+ @staticmethod
90
+ def make_circular_dataset(dataset, circular_patterns, option_keys,
91
+ answer_key, answer_key_switch_method):
92
+ circulated_items = []
93
+ for i, item in enumerate(dataset):
94
+ item = CircularDatasetMeta.make_circular_items(
95
+ item,
96
+ circular_patterns,
97
+ option_keys,
98
+ answer_key,
99
+ answer_key_switch_method,
100
+ i,
101
+ )
102
+ circulated_items.extend(item)
103
+ return Dataset.from_list(circulated_items)
+
+     @staticmethod
+     def make_circular(
+         dataset: Union[Dataset, DatasetDict],
+         circular_splits: Optional[List[str]] = ['test'],
+         circular_patterns: str = 'circular',
+         option_keys: List[str] = ['A', 'B', 'C', 'D'],
+         answer_key: Optional[str] = 'answer',
+         answer_key_switch_method: Optional[Callable] = None,
+     ):
+         """Transform the dataset into one that is compatible with
+         CircularEval. In CircularEval, the original multiple-choice questions
+         with options ABCD are augmented by shuffling the order of options,
+         such as BCDA, CDAB, DABC, etc. A model is considered correct only if
+         it answers all augmented questions correctly. This method effectively
+         prevents models from memorizing answers.
+
+         Args:
+             dataset: The dataset to be augmented.
+             circular_splits: List of splits to make circular. This is only
+                 effective when the dataset is a DatasetDict.
+             circular_patterns: Method for circular processing, can be
+                 'circular' for a single cycle or 'all_possible' for all
+                 permutations, default is 'circular'.
+             option_keys: List of keys for options, default to
+                 ['A', 'B', 'C', 'D'].
+             answer_key: Key for the answer, default to 'answer'. When
+                 specified, ensure that the content of answer_key is among
+                 the option_keys. It is an alternative to specifying
+                 answer_key_switch_method.
+             answer_key_switch_method: Function to modify the answer_key. It
+                 is an alternative to specifying answer_key.
+         """
+
+         if isinstance(circular_patterns, str):
+             if circular_patterns == 'circular':
+                 circular_patterns = get_circular_patterns(option_keys)
+             elif circular_patterns == 'all_possible':
+                 circular_patterns = get_all_possible_patterns(option_keys)
+             else:
+                 raise ValueError(
+                     f'Unknown circular_patterns: {circular_patterns}')
+         else:
+             assert isinstance(circular_patterns, list)
+             assert all([isinstance(i, list) for i in circular_patterns])
+             # TODO: other necessary sanity checks
+             raise NotImplementedError(
+                 'circular_patterns as a list of lists has not been tested '
+                 'yet')
+
+         if answer_key is None and answer_key_switch_method is None:
+             raise ValueError(
+                 'answer_key and answer_key_switch_method cannot both be '
+                 'None')
+         if answer_key is not None and answer_key_switch_method is not None:
+             raise ValueError(
+                 'either answer_key or answer_key_switch_method should be '
+                 'None')
+
+         if isinstance(dataset, Dataset):
+             dataset = CircularDatasetMeta.make_circular_dataset(
+                 dataset,
+                 circular_patterns,
+                 option_keys,
+                 answer_key,
+                 answer_key_switch_method,
+             )
+         else:
+             assert isinstance(dataset, DatasetDict)
+             dataset_dict = {}
+             for split in dataset:
+                 if circular_splits is not None and split in circular_splits:
+                     dataset_dict[
+                         split] = CircularDatasetMeta.make_circular_dataset(
+                             dataset[split],
+                             circular_patterns,
+                             option_keys,
+                             answer_key,
+                             answer_key_switch_method,
+                         )
+                 else:
+                     dataset_dict[split] = dataset[split]
+             dataset = DatasetDict(dataset_dict)
+         return dataset
+
+     def __new__(cls, name, bases, dct):
+         new_cls = super().__new__(cls, name, bases, dct)
+
+         def load(cls, circular_patterns='circular', *args, **kwargs):
+             circular_splits = getattr(cls, 'default_circular_splits', None)
+             option_keys = getattr(cls, 'default_option_keys', None)
+             if 'option_keys' in kwargs:
+                 option_keys = kwargs.pop('option_keys')
+             assert option_keys is not None, 'option_keys cannot be None'
+             answer_key = getattr(cls, 'default_answer_key', None)
+             if 'answer_key' in kwargs:
+                 answer_key = kwargs.pop('answer_key')
+             answer_key_switch_method = getattr(
+                 cls, 'default_answer_key_switch_method', None)
+             dataset = cls.dataset_class.load(*args, **kwargs)
+             return CircularDatasetMeta.make_circular(
+                 dataset,
+                 circular_splits,
+                 circular_patterns,
+                 option_keys,
+                 answer_key,
+                 answer_key_switch_method,
+             )
+
+         setattr(new_cls, 'load', classmethod(load))
+         return new_cls
+
+
+ class CircularCEvalDataset(CEvalDataset, metaclass=CircularDatasetMeta):
+     dataset_class = CEvalDataset
+     default_circular_splits = ['val', 'test']
+     default_option_keys = ['A', 'B', 'C', 'D']
+     default_answer_key = 'answer'
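+
+ # Usage sketch: the argument names forwarded to `CEvalDataset.load` below
+ # are assumptions for illustration; see that class for its real signature.
+ #
+ #     dataset = CircularCEvalDataset.load(path='...', name='...')
+ #     # every 'val'/'test' question now appears once per rotation (ABCD,
+ #     # BCDA, CDAB, DABC), tagged with `qid` and `circular_pattern`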
+
+
+ class CircularMMLUDataset(MMLUDataset, metaclass=CircularDatasetMeta):
+     dataset_class = MMLUDataset
+     default_circular_splits = ['test']
+     default_option_keys = ['A', 'B', 'C', 'D']
+     default_answer_key = 'target'
+
+
+ class CircularCMMLUDataset(CMMLUDataset, metaclass=CircularDatasetMeta):
+     dataset_class = CMMLUDataset
+     default_circular_splits = ['test']
+     default_option_keys = ['A', 'B', 'C', 'D']
+     default_answer_key = 'answer'
+
+
+ class CircularCSQADataset(commonsenseqaDataset, metaclass=CircularDatasetMeta):
+     dataset_class = commonsenseqaDataset
+     default_circular_splits = ['validation']
+     default_option_keys = ['A', 'B', 'C', 'D', 'E']
+     default_answer_key = 'answerKey'
+
+
+ class CircularARCDataset(ARCDataset, metaclass=CircularDatasetMeta):
+     dataset_class = ARCDataset
+     default_circular_splits = None
+     default_option_keys = ['textA', 'textB', 'textC', 'textD']
+
+     @staticmethod
+     def default_answer_key_switch_method(item, circular_pattern):
+         # 'textB' -> 'B', etc., so the pattern can be indexed by letter.
+         circular_pattern = tuple(i[-1] for i in circular_pattern)
+         item['answerKey'] = circular_pattern['ABCD'.index(item['answerKey'])]
+         return item
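+
+ # For example, pattern ('textB', 'textC', 'textD', 'textA') becomes
+ # ('B', 'C', 'D', 'A'), so answerKey 'A' is mapped to 'B': the text that
+ # was under 'textA' has moved to 'textB'.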
+
+
+ class CircularHSWAGDataset(hellaswagDataset_V2, metaclass=CircularDatasetMeta):
+     dataset_class = hellaswagDataset_V2
+     default_circular_splits = None
+     default_option_keys = ['A', 'B', 'C', 'D']
+     default_answer_key = 'label'
+
+
+ class CircularOBQADataset(OBQADataset, metaclass=CircularDatasetMeta):
+     dataset_class = OBQADataset
+     default_circular_splits = None
+     default_option_keys = ['A', 'B', 'C', 'D']
+     default_answer_key = 'answerKey'
+
+
+ class CircularRaceDataset(RaceDataset, metaclass=CircularDatasetMeta):
+     dataset_class = RaceDataset
+     default_circular_splits = ['test']
+     default_option_keys = ['A', 'B', 'C', 'D']
+     default_answer_key = 'answer'
+
+
+ class CircularXiezhiDataset(XiezhiDataset, metaclass=CircularDatasetMeta):
+     dataset_class = XiezhiDataset
+     default_circular_splits = None
+     default_option_keys = ['A', 'B', 'C', 'D']
+     default_answer_key = 'answer'
+
+
+ class CircularsiqaDataset(siqaDataset_V3, metaclass=CircularDatasetMeta):
+     dataset_class = siqaDataset_V3
+     default_circular_splits = ['validation']
+     default_option_keys = ['A', 'B', 'C']
+     default_answer_key = 'answer'
+
+
+ class CircularpiqaDataset(piqaDataset_V2, metaclass=CircularDatasetMeta):
+     dataset_class = piqaDataset_V2
+     default_circular_splits = ['validation']
+     default_option_keys = ['sol1', 'sol2']
+
+     @staticmethod
+     def default_answer_key_switch_method(item, circular_pattern):
+         # 'sol1' -> 0, 'sol2' -> 1, so the pattern can be indexed by
+         # position.
+         circular_pattern = tuple(int(i[-1]) - 1 for i in circular_pattern)
+         item['answer'] = 'AB'[circular_pattern['AB'.index(item['answer'])]]
+         return item
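+
+ # For example, pattern ('sol2', 'sol1') becomes positions (1, 0), so an
+ # original answer 'A' is mapped to 'B': the text that was under 'sol1' has
+ # moved to 'sol2'.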
+
+
+ class CircularEvaluator(BaseEvaluator):
+     """This Evaluator assesses datasets post-Circular processing, generating
+     the following evaluation metrics:
+
+     - `acc_{origin|circular|all_possible}`: Treats each question with
+       shuffled answer options as a separate question and calculates
+       accuracy.
+     - `perf_{origin|circular|all_possible}`: According to Circular logic, a
+       question is considered correct only if all its variations with
+       shuffled options are answered correctly; accuracy is calculated on
+       this basis. perf is short for perfect.
+     - `more_{num}_{origin|circular|all_possible}`: According to Circular
+       logic, a question is considered correct only if the number of its
+       variations answered correctly is greater than or equal to `num`;
+       accuracy is calculated on this basis.
+
+     Note that when the `all_possible` method is used to shuffle option
+     order, it naturally includes the Circular method, and its metrics will
+     also be output.
+
+     Args:
+         circular_pattern: The method of shuffling options, either 'circular'
+             or 'all_possible', defaulting to 'circular'.
+     """
+
+     def __init__(self, circular_pattern='circular'):
+         super().__init__()
+         self.circular_pattern = circular_pattern
+
+     def score(self, predictions, references, test_set):
+         circular_patterns = {}
+         circular_patterns['origin'] = get_origin_patterns(
+             test_set[0]['circular_pattern'])
+         circular_patterns['circular'] = get_circular_patterns(
+             test_set[0]['circular_pattern'])
+         if self.circular_pattern == 'all_possible':
+             circular_patterns['all_possible'] = get_all_possible_patterns(
+                 test_set[0]['circular_pattern'])
+
+         metrics = {}
+         tmp_metrics = {}
+         tmp_metrics.update({f'correct_{k}': 0 for k in circular_patterns})
+         tmp_metrics.update({f'count_{k}': 0 for k in circular_patterns})
+         # calculate the per-variant accuracy
+         for pred, refr, origin_item in zip(predictions, references,
+                                            test_set):
+             circular_pattern = origin_item['circular_pattern']
+             for k in circular_patterns:
+                 if tuple(circular_pattern) in circular_patterns[k]:
+                     tmp_metrics[f'correct_{k}'] += 1 if pred == refr else 0
+                     tmp_metrics[f'count_{k}'] += 1
+
+         for k in circular_patterns:
+             metrics[f'acc_{k}'] = (tmp_metrics[f'correct_{k}'] /
+                                    tmp_metrics[f'count_{k}'] * 100)
+
+         # calculate the circular (grouped-by-question) accuracy
+         _details = {k: {} for k in circular_patterns}
+         for pred, refr, origin_item in zip(predictions, references,
+                                            test_set):
+             index = origin_item['qid']
+             circular_pattern = origin_item['circular_pattern']
+             for k in circular_patterns:
+                 if tuple(circular_pattern) in circular_patterns[k]:
+                     _details[k].setdefault(index, []).append(pred == refr)
+         for k in _details:
+             # number of correctly answered variants per question
+             _details[k] = {
+                 index: sum(_details[k][index])
+                 for index in _details[k]
+             }
+         for k in _details:
+             for j in range(1, len(circular_patterns[k]) + 1):
+                 count = sum(_details[k][index] >= j for index in _details[k])
+                 total = len(_details[k])
+                 if j != len(circular_patterns[k]):
+                     metrics[f'more_{j}_{k}'] = count / total * 100
+                 else:
+                     metrics[f'perf_{k}'] = count / total * 100
+
+         return metrics
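+
+ # Worked example (illustrative numbers): with 2 questions, each asked in 4
+ # circular variants, where the model answers 4/4 variants of the first
+ # question and 3/4 of the second correctly:
+ #     acc_circular    = 7/8 * 100 = 87.5
+ #     more_3_circular = 2/2 * 100 = 100.0  (both questions have >= 3 right)
+ #     perf_circular   = 1/2 * 100 = 50.0   (only the first is fully right)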