sagemaker-train/src/sagemaker/train/evaluate/benchmark_evaluator.py (32 additions, 49 deletions)
@@ -35,10 +35,8 @@ class _Benchmark(str, Enum):
     MATH = "math"
     STRONG_REJECT = "strong_reject"
     IFEVAL = "ifeval"
-    GEN_QA = "gen_qa"
     MMMU = "mmmu"
     LLM_JUDGE = "llm_judge"
-    INFERENCE_ONLY = "inference_only"
 
 
 # Internal benchmark configuration mapping - using plain dictionaries
@@ -138,14 +136,6 @@ class _Benchmark(str, Enum):
         "subtask_available": False,
         "subtasks": None
     },
-    _Benchmark.GEN_QA: {
-        "modality": "Multi-Modal (image)",
-        "description": "Custom Dataset Evaluation – Lets you supply your own dataset for benchmarking, comparing model outputs to reference answers with metrics such as ROUGE and BLEU. gen_qa supports image inference for models which have multimodal support.",
-        "metrics": ["all"],
-        "strategy": "gen_qa",
-        "subtask_available": False,
-        "subtasks": None
-    },
     _Benchmark.MMMU: {
         "modality": "Multi-Modal",
         "description": "Massive Multidiscipline Multimodal Understanding (MMMU) – College-level benchmark comprising multiple-choice and open-ended questions from 30 disciplines.",
@@ -171,14 +161,6 @@ class _Benchmark(str, Enum):
         "subtask_available": False,
         "subtasks": None
     },
-    _Benchmark.INFERENCE_ONLY: {
-        "modality": "Text",
-        "description": "Lets you supply your own dataset to generate inference responses which can be used with the llm_judge task. No metrics are computed for this task.",
-        "metrics": ["N/A"],
-        "strategy": "--",
-        "subtask_available": False,
-        "subtasks": None
-    },
 }
 
 
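Taken together, the two hunks above remove the GEN_QA and INFERENCE_ONLY entries from both the _Benchmark enum and its configuration mapping. As a minimal sketch of the post-change shape (the mapping name _BENCHMARK_CONFIGS and the MMMU "metrics" and "strategy" values are assumptions, since those source lines fall outside the hunks shown):

from enum import Enum


class _Benchmark(str, Enum):
    MATH = "math"
    STRONG_REJECT = "strong_reject"
    IFEVAL = "ifeval"
    MMMU = "mmmu"
    LLM_JUDGE = "llm_judge"


# Internal benchmark configuration mapping - using plain dictionaries.
# The mapping name below is an assumption; only the member names and the
# per-benchmark dict keys are taken from the diff.
_BENCHMARK_CONFIGS = {
    _Benchmark.MMMU: {
        "modality": "Multi-Modal",
        "description": "Massive Multidiscipline Multimodal Understanding "
                       "(MMMU) – College-level benchmark comprising "
                       "multiple-choice and open-ended questions from 30 "
                       "disciplines.",
        "metrics": ["all"],   # assumed; MMMU's metrics line is not in the diff
        "strategy": "mmmu",   # assumed; MMMU's strategy line is not in the diff
        "subtask_available": False,
        "subtasks": None,
    },
    # ...entries for the remaining benchmarks follow the same key layout.
}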
@@ -278,10 +260,6 @@ class BenchMarkEvaluator(BaseEvaluator):
             Optional. If not provided, the system will attempt to resolve it using the default
             MLflow app experience (checks domain match, account default, or creates a new app).
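The docstring lines in the last hunk describe a fallback order for resolving the MLflow app: an explicit value wins, then a domain match, then the account default, then a newly created app. A hedged sketch of that order with stub helpers (every name here is illustrative, not the actual sagemaker.train API):

from typing import Optional


def _find_app_by_domain() -> Optional[str]:
    """Stub: look up an MLflow app matching the current domain (assumption)."""
    return None


def _get_account_default_app() -> Optional[str]:
    """Stub: look up the account-default MLflow app (assumption)."""
    return None


def _create_app() -> str:
    """Stub: create a new MLflow app and return its identifier (assumption)."""
    return "new-mlflow-app"


def _resolve_mlflow_app(explicit_app: Optional[str] = None) -> str:
    # Resolution order per the docstring: explicit value, then domain match,
    # then account default, then create a new app as a last resort.
    if explicit_app is not None:
        return explicit_app
    return _find_app_by_domain() or _get_account_default_app() or _create_app()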