/evals/spicepod-local.yml
1name: spicepod
2version: v1beta1
3kind: Spicepod
4
5# This file is generated from the evals and data in the repo using [evalconverter](https://github.com/spiceai/spiceai/tree/trunk/tools/evalconverter):
6# ```shell
7# evalconverter -i evals/registry/evals -b evals/registry/data
8# ```
9
10datasets:
11- from: file:evals/registry/data/actors-sequence/samples.jsonl
12 name: actors_sequence__dev__match_v1
13- from: file:evals/registry/data/adultery-state-laws/samples.jsonl
14 name: adultery_state_laws__dev__v0
15- from: file:evals/registry/data/proofreader/samples.jsonl
16 name: proofreader__dev__v0
17- from: file:evals/registry/data/rock-climbing/samples.jsonl
18 name: rock_climbing__dev__v0
19- from: file:evals/registry/data/banking77/samples.jsonl
20 name: match_banking77__test__v1
21- from: file:evals/registry/data/ukraine_gec/ukraine_gec_grammar_prep.jsonl
22 name: ukraine_gec_grammar_prep__dev__v0
23- from: file:evals/registry/data/ukraine_gec/ukraine_gec_grammar_case.jsonl
24 name: ukraine_gec_grammar_case__dev__v0
25- from: file:evals/registry/data/ukraine_gec/ukraine_gec_grammar_gender.jsonl
26 name: ukraine_gec_grammar_gender__dev__v0
27- from: file:evals/registry/data/ukraine_gec/ukraine_gec_grammar_partvoice.jsonl
28 name: ukraine_gec_grammar_partvoice__dev__v0
29- from: file:evals/registry/data/ukraine_gec/ukraine_gec_fluency_poorflow.jsonl
30 name: ukraine_gec_fluency_poorflow__dev__v0
31- from: file:evals/registry/data/ukraine_gec/ukraine_gec_grammar_verbvoice.jsonl
32 name: ukraine_gec_grammar_verbvoice__dev__v0
33- from: file:evals/registry/data/ukraine_gec/ukraine_gec_grammar_number.jsonl
34 name: ukraine_gec_grammar_number__dev__v0
35- from: file:evals/registry/data/ukraine_gec/ukraine_gec_fluency_repetition.jsonl
36 name: ukraine_gec_fluency_repetition__dev__v0
37- from: file:evals/registry/data/ukraine_gec/ukraine_gec_fluency_calque.jsonl
38 name: ukraine_gec_fluency_calque__dev__v0
39- from: file:evals/registry/data/ukraine_gec/ukraine_gec_grammar_verbaform.jsonl
40 name: ukraine_gec_grammar_verbaform__dev__v0
41- from: file:evals/registry/data/ukraine_gec/ukraine_gec_grammar_ungrammaticalstructure.jsonl
42 name: ukraine_gec_grammar_ungrammaticalstructure__dev__v0
43- from: file:evals/registry/data/ukraine_gec/ukraine_gec_grammar_other.jsonl
44 name: ukraine_gec_grammar_other__dev__v0
45- from: file:evals/registry/data/ukraine_gec/ukraine_gec_fluency_style.jsonl
46 name: ukraine_gec_fluency_style__dev__v0
47- from: file:evals/registry/data/ukraine_gec/ukraine_gec_fluency_other.jsonl
48 name: ukraine_gec_fluency_other__dev__v0
49- from: file:evals/registry/data/ukraine_gec/ukraine_gec_grammar_conjunction.jsonl
50 name: ukraine_gec_grammar_conjunction__dev__v0
51- from: file:evals/registry/data/ukraine_gec/ukraine_gec_grammar_comparison.jsonl
52 name: ukraine_gec_grammar_comparison__dev__v0
53- from: file:evals/registry/data/ukraine_gec/ukraine_gec_grammar_tense.jsonl
54 name: ukraine_gec_grammar_tense__dev__v0
55- from: file:evals/registry/data/ukraine_gec/ukraine_gec_grammar_aspect.jsonl
56 name: ukraine_gec_grammar_aspect__dev__v0
57- from: file:evals/registry/data/irish_plural_nouns/samples.jsonl
58 name: irish_plural_nouns__dev__v0
59- from: file:evals/registry/data/shape_in_shape/shape_in_shape.jsonl
60 name: shape_in_shape__dev__v1
61- from: file:evals/registry/data/russian_sarcasm/samples.jsonl
62 name: russian_sarcasm__dev__v0
63- from: file:evals/registry/data/syllables_long_words/long_word_samples.jsonl
64 name: syllables__dev__v1
65- from: file:evals/registry/data/crepe/samples.jsonl
66 name: crepe__dev__v2
67- from: file:evals/registry/data/coq-proof-step/match.jsonl
68 name: coq_proof_step_match__dev__v0
69- from: file:evals/registry/data/ukraine_eit/samples.jsonl
70 name: ukraine_eit__val__v0
71- from: file:evals/registry/data/belarusian_proverbs/samples.jsonl
72 name: belarusian_proverbs__dev__v0
73- from: file:evals/registry/data/invoices/match.jsonl
74 name: invoices__dev__v0
75- from: file:evals/registry/data/urdu-lexicon/samples.jsonl
76 name: urdu_lexicon__dev__v0
77- from: file:evals/registry/data/qa/q_and_a.jsonl
78 name: qa__dev__v0
79- from: file:evals/registry/data/french-part-of-speech/samples.jsonl
80 name: french_part_of_speech__dev__v0
81- from: file:evals/registry/data/internal_representations/samples.jsonl
82 name: internal_representations__dev__v0
83- from: file:evals/registry/data/python_list_comprehension/samples.jsonl
84 name: python_list_comprehension__dev__v0
85- from: file:evals/registry/data/nepali_numerals/samples.jsonl
86 name: nepali_numerals__dev__v0
87- from: file:evals/registry/data/belarusian_syllable_count/samples.jsonl
88 name: belarusian_syllable_count__dev__v0
89- from: file:evals/registry/data/mandaliof-table/samples.jsonl
90 name: mandaliof_table__dev__v0
91- from: file:evals/registry/data/test_japanese_english_numerals/samples.jsonl
92 name: test_japanese_english_numerals__dev__v0
93- from: file:evals/registry/data/tracking-shuffled-objects/samples.jsonl
94 name: tracking_shuffled_objects__dev__v0
95- from: file:evals/registry/data/squares-gpt/square-samples.jsonl
96 name: squares_gpt__dev__v0
97- from: file:evals/registry/data/convert-hex-hsl-lightness/samples.jsonl
98 name: convert_hex_hsl_lightness__dev__v0
99- from: file:evals/registry/data/russe/samples.jsonl
100 name: russe__test__v0
101- from: file:evals/registry/data/aba_mrpc_true_false/samples.jsonl
102 name: aba_mrpc_true_false__dev__v0
103- from: file:evals/registry/data/logical_counting/samples.jsonl
104 name: logical_counting__dev__v0
105- from: file:evals/registry/data/vigenere/samples.jsonl
106 name: vigenere__s1__simple_v0
107- from: file:evals/registry/data/map-electronic-component-part-to-fact/samples.jsonl
108 name: map_electronic_component_part_to_fact__dev__v0
109- from: file:evals/registry/data/rare-and-loanwords-dutch-lexicon/samples.jsonl
110 name: rare_and_loanwords_dutch_lexicon__dev__v0
111- from: file:evals/registry/data/product-ie/fewshot/product_ie_one_shot_samples.jsonl
112 name: product_information_extraction_one_shot__dev__v0
113- from: file:evals/registry/data/product-ie/zeroshot/product_ie_zero_shot_samples.jsonl
114 name: product_information_extraction_zero_shot__dev__v0
115- from: file:evals/registry/data/sort_numeric/samples.jsonl
116 name: sort_numbers__s1__simple_v0
117- from: file:evals/registry/data/product-matching/zeroshot/samples.jsonl
118 name: match_product_matching_zeroshot__dev__v1
119- from: file:evals/registry/data/product-matching/fewshot/samples.jsonl
120 name: match_product_matching_fewshot__dev__v1
121- from: file:evals/registry/data/product-matching/rules/samples.jsonl
122 name: match_product_matching_rules__dev__v1
123- from: file:evals/registry/data/russian-lexicon/samples.jsonl
124 name: russian_lexicon__dev__v0
125- from: file:evals/registry/data/dutch-lexicon/samples.jsonl
126 name: dutch_lexicon__dev__v0
127- from: file:evals/registry/data/greek_nt_manuscripts/codes-sigla-centuries.jsonl
128 name: greek_nt_manuscripts__v0
129- from: file:evals/registry/data/matrix_mult_rows/samples.jsonl
130 name: matrix_mult_rows__dev__v0
131- from: file:evals/registry/data/moral_exceptQA/samples.jsonl
132 name: moral_exceptqa__test__v1
133- from: file:evals/registry/data/music-theory/triads-samples.jsonl
134 name: music_theory_triads_identification__dev__v0
135- from: file:evals/registry/data/music-theory/tetrads-samples.jsonl
136 name: music_theory_tetrads_identification__dev__v0
137- from: file:evals/registry/data/find-thirukkural/samples.jsonl
138 name: find_thirukkural__dev__v0
139- from: file:evals/registry/data/building_floorplan/samples.jsonl
140 name: building_floorplan__test__v1
141- from: file:evals/registry/data/japanese-national-medical-exam01/japanese-national-medical-exam01.jsonl
142 name: japanese_national_medical_exam01__dev__v0
143- from: file:evals/registry/data/lat_long_identify/samples.jsonl
144 name: lat_long_identify__dev__v0
145- from: file:evals/registry/data/norwegian-lexicon/samples.jsonl
146 name: norwegian_lexicon__dev__v0
147- from: file:evals/registry/data/german-part-of-speech/samples.jsonl
148 name: german_part_of_speech__dev__v0
149- from: file:evals/registry/data/swedish_sat/samples.jsonl
150 name: swedish_sat__dev__v0
151- from: file:evals/registry/data/utility_price_parsing/samples.jsonl
152 name: utility_price_parsing__dev__v0
153- from: file:evals/registry/data/korean-consonant-vowel-combination/samples.jsonl
154 name: korean_consonant_vowel_combination__dev__v0
155- from: file:evals/registry/data/mate-in-one/samples.jsonl
156 name: mate_in_one__dev__v0
157- from: file:evals/registry/data/french-lexicon/samples.jsonl
158 name: french_lexicon__dev__v0
159- from: file:evals/registry/data/swedish-spelling/samples.jsonl
160 name: swedish_spelling__dev__v0
161- from: file:evals/registry/data/knot-theory/knot-theory-unknotting-numbers.jsonl
162 name: knot_theory_unknotting_number__dev__v0
163- from: file:evals/registry/data/knot-theory/knot-theory-unknotting-problems.jsonl
164 name: knot_theory_unknotting_problem__dev__v0
165- from: file:evals/registry/data/knot-theory/knot-theory-code-conversions.jsonl
166 name: knot_theory_code_conversion__dev__v0
167- from: file:evals/registry/data/hindi_words/samples.jsonl
168 name: hindi_words__dev__v0
169- from: file:evals/registry/data/arithmetical_puzzles/arithmetical_puzzles.jsonl
170 name: arithmetical_puzzles__dev__v0
171- from: file:evals/registry/data/belarusian_antonyms/samples.jsonl
172 name: belarusian_antonyms__dev__v0
173- from: file:evals/registry/data/body_movement/body_movement.jsonl
174 name: body_movement__dev__zero_shot_v0
175- from: file:evals/registry/data/afrikaans-lexicon/samples.jsonl
176 name: afrikaans_lexicon__dev__v0
177- from: file:evals/registry/data/cricket_situations/samples.jsonl
178 name: cricket_situations__dev__v0
179- from: file:evals/registry/data/korean_spelling/samples.jsonl
180 name: korean_spelling__dev__v0
181- from: file:evals/registry/data/rucola/samples.jsonl
182 name: rucola__test__v0
183- from: file:evals/registry/data/logiqa-logical-reasoning-plus/reclor-logical-reasoning-plus.jsonl
184 name: reclor_logical_reasoning_plus__dev__v0
185- from: file:evals/registry/data/logiqa-logical-reasoning-plus/logiqav2-logical-reasoning-plus.jsonl
186 name: logiqav2_logical_reasoning_plus__dev__v0
187- from: file:evals/registry/data/logiqa-logical-reasoning-plus/logiqa-logical-reasoning-plus.jsonl
188 name: logiqa_logical_reasoning_plus__dev__v0
189- from: file:evals/registry/data/medmcqa/samples.jsonl
190 name: medmcqa__dev__v0
191- from: file:evals/registry/data/multi-step-equations/samples.jsonl
192 name: multi_step_equations__dev__v0
193- from: file:evals/registry/data/islands/japanese_remote_island_to_prefecture.jsonl
194 name: japanese_remote_island_to_prefecture__dev__v0
195- from: file:evals/registry/data/chinese_homonym/samples.jsonl
196 name: chinese_homonym__dev__v0
197- from: file:evals/registry/data/norwegian-rhymes/samples.jsonl
198 name: norwegian_rhymes__dev__v0
199- from: file:evals/registry/data/chinese_shi_jing/samples.jsonl
200 name: chinese_shi_jing__test__v1
201- from: file:evals/registry/data/forth_stack_sim/basic_samples.jsonl
202 name: forth_stack_sim_basic__dev__v0
203- from: file:evals/registry/data/forth_stack_sim/samples.jsonl
204 name: forth_stack_sim__dev__v0
205- from: file:evals/registry/data/forth_stack_sim/detailed_samples.jsonl
206 name: forth_stack_sim_detailed__dev__v0
207- from: file:evals/registry/data/japanese_city_name_pronunciation/samples.jsonl
208 name: japanese_city_name_pronunciation__dev__v0
209- from: file:evals/registry/data/escher_sentences/samples.jsonl
210 name: escher_sentences__dev__v0
211- from: file:evals/registry/data/track_objects/samples.jsonl
212 name: track_objects__dev__v0
213- from: file:evals/registry/data/shopping_discount_comparison/samples.jsonl
214 name: shopping_discount_comparison__dev__v0
215- from: file:evals/registry/data/test_comp_sci/questions.jsonl
216 name: computer_science_problems__s1__simple_v0
217- from: file:evals/registry/data/mendelian_inheritance/samples.jsonl
218 name: mendelian_inheritance__dev__v0
219- from: file:evals/registry/data/override-system-instruction/samples.jsonl
220 name: override_system_instruction__dev__v0
221- from: file:evals/registry/data/poker_hand_ranks/full_samples.jsonl
222 name: hand_ranks__test__v1
223- from: file:evals/registry/data/diabetes/samples.jsonl
224 name: diabetes__dev__v0
225- from: file:evals/registry/data/job_listing_title_for_a_caregiver_in_japan/samples.jsonl
226 name: job_listing_title_for_a_caregiver_in_japan__test__v1
227- from: file:evals/registry/data/poker_analysis/samples.jsonl
228 name: poker_analysis__test__v1
229- from: file:evals/registry/data/belarusian_numerals/samples.jsonl
230 name: belarusian_numerals__dev__v0
231- from: file:evals/registry/data/algebra_word_problems/samples.jsonl
232 name: algebra_word_problems__s1__simple_v0
233- from: file:evals/registry/data/belarusian_grammar/samples.jsonl
234 name: belarusian_grammar__dev__v0
235- from: file:evals/registry/data/svg_understanding/samples.jsonl
236 name: svg_understanding__v0
237- from: file:evals/registry/data/cissp-study-questions/samples.jsonl
238 name: cissp_study_questions__test__v1
239- from: file:evals/registry/data/linear_equations/samples.jsonl
240 name: linear_equations__dev__v0
241- from: file:evals/registry/data/japanese_driving_license/samples.jsonl
242 name: japanese_driving_license__s1__simple_v0
243- from: file:evals/registry/data/first-letters/samples.jsonl
244 name: first_letters__dev__v0
245- from: file:evals/registry/data/arc/samples.jsonl
246 name: arc__dev__v0
247- from: file:evals/registry/data/css-selectors/verbal.jsonl
248 name: css_selectors_verbal__dev__v0
249- from: file:evals/registry/data/japanese-itpassport-exam01/japanese-itpassport-exam01.jsonl
250 name: japanese_itpassport_exam01__dev__v0
251- from: file:evals/registry/data/logiqa/logiqa.jsonl
252 name: logiqa__dev__v0
253- from: file:evals/registry/data/chinese_zodiac/samples.jsonl
254 name: chinese_zodiac__dev__v0
255- from: file:evals/registry/data/spanish-lexicon/samples.jsonl
256 name: spanish_lexicon__dev__v0
257- from: file:evals/registry/data/food/samples.jsonl
258 name: food__test__v1
259- from: file:evals/registry/data/countries/samples.jsonl
260 name: countries__dev__v0
261- from: file:evals/registry/data/which_is_heavier/which_is_heavier.jsonl
262 name: which_is_heavier__dev__v0
263- from: file:evals/registry/data/korean_date_counting/samples.jsonl
264 name: korean_date_counting__dev__v0
265- from: file:evals/registry/data/fcc_amateur_extra/samples.jsonl
266 name: fcc_amateur_extra__dev__v0
267- from: file:evals/registry/data/multistep-word-problems/samples.jsonl
268 name: multistep_word_problems__dev__v0
269- from: file:evals/registry/data/list_comparison_missing_name/samples.jsonl
270 name: list_comparison_missing_name__dev__v0
271- from: file:evals/registry/data/newsology/samples.jsonl
272 name: newsology__dev__v0
273- from: file:evals/registry/data/simple-visual-understanding/simple-visual-understanding.jsonl
274 name: simple_visual_understanding__dev__v0
275- from: file:evals/registry/data/portuguese-syllable-count/samples.jsonl
276 name: portuguese_syllable_count__dev__v0
277- from: file:evals/registry/data/south-african-bands/south-african-bands.jsonl
278 name: south_african_bands__dev__v0
279- from: file:evals/registry/data/hebrew_plurals/samples.jsonl
280 name: hebrew_plurals__dev__v0
281- from: file:evals/registry/data/rot13/rot13.jsonl
282 name: rot13__s1__simple_v0
283- from: file:evals/registry/data/korean_dialects/samples.jsonl
284 name: korean_dialects__dev__v0
285- from: file:evals/registry/data/test_time_zone_conversion/samples.v0.jsonl
286 name: test_time_zone_conversion__dev__v0
287- from: file:evals/registry/data/music_theory/music_theory_chord_notes.jsonl
288 name: music_theory_chord_notes__dev__v0
289- from: file:evals/registry/data/russian-english-homonym-context-resolution/samples.jsonl
290 name: russian_english_homonym_context_resolution__dev__v0
291- from: file:evals/registry/data/number_reading/number_reading.jsonl
292 name: number_reading__dev__v0
293- from: file:evals/registry/data/simple-knowledge-mongolian/samples.v0.jsonl
294 name: simple_knowledge_mongolian__dev__v0
295- from: file:evals/registry/data/base64_decode/base64_decode.jsonl
296 name: base64_decode_simple__dev__v0
297- from: file:evals/registry/data/urdu-transliteration/samples.jsonl
298 name: urdu_transliteration__dev__v0
299- from: file:evals/registry/data/reverse-polish-notation/questions.jsonl
300 name: reverse_polish_notation__dev__v0
301- from: file:evals/registry/data/music_theory/music_theory_chord_names.jsonl
302 name: music_theory_chord_names__dev__v0
303- from: file:evals/registry/data/born_first/born_first.jsonl
304 name: born_first__dev__v0
305- from: file:evals/registry/data/tetris/tetris.jsonl
306 name: tetris__dev__v0
307- from: file:evals/registry/data/pure_korean/samples.jsonl
308 name: pure_korean__dev__v0
309- from: file:evals/registry/data/determinant/samples.jsonl
310 name: determinant__test__v1
311- from: file:evals/registry/data/split_chinese_characters/samples.jsonl
312 name: split_chinese_characters__dev__v0
313- from: file:evals/registry/data/syntax-check/samples.jsonl
314 name: syntax_check__dev__v1
315- from: file:evals/registry/data/balance_chemical_equation/samples.jsonl
316 name: balance_chemical_equation__dev__v0
317- from: file:evals/registry/data/emotional-intelligence/samples.jsonl
318 name: emotional_intelligence__dev__v0
319- from: file:evals/registry/data/nutrition/facts.jsonl
320 name: nutrition__dev__v0
321- from: file:evals/registry/data/reverse-sort-words-eng/samples.jsonl
322 name: reverse_sort_words_eng_simple__dev__v0
323- from: file:evals/registry/data/day-of-week-from-date/samples.jsonl
324 name: day_of_week_from_date__dev__v0
325- from: file:evals/registry/data/regex-match/samples.jsonl
326 name: regex__match__dev__v0
327- from: file:evals/registry/data/find-letter/samples.jsonl
328 name: find_letter__dev__v0
329- from: file:evals/registry/data/korean_foreign_words/samples.jsonl
330 name: korean_foreign_words__dev__v0
331- from: file:evals/registry/data/greek_vocabulary/samples.jsonl
332 name: greek_vocabulary__dev__v0
333- from: file:evals/registry/data/rubiks-colors/samples.jsonl
334 name: rubiks_colors__dev__v0
335- from: file:evals/registry/data/decrypt_caesar_cipher/samples.jsonl
336 name: decrypt_caesar_cipher__dev__v0
337- from: file:evals/registry/data/us_tort_law/samples.jsonl
338 name: us_tort_law__dev__v0
339- from: file:evals/registry/data/number_pattern/samples.jsonl
340 name: number_pattern__dev__v0
341- from: file:evals/registry/data/confusing_korean/samples.jsonl
342 name: confusing_korean__dev__v0
343- from: file:evals/registry/data/kanji-idioms/samples.jsonl
344 name: kanji_idioms__test__v0
345- from: file:evals/registry/data/missing_operators/samples.jsonl
346 name: missing_operators__s1__simple_v0
347- from: file:evals/registry/data/unsolvable_questions/samples.jsonl
348 name: unsolvable_questions__dev__v0
349- from: file:evals/registry/data/portuguese-sarcasm/samples.jsonl
350 name: portuguese_sarcasm__dev__v0
351- from: file:evals/registry/data/swap-words/samples.jsonl
352 name: swap_words__dev__v0
353- from: file:evals/registry/data/hebrew_same_noun_gender/samples.jsonl
354 name: hebrew_same_noun_gender__v0
355- from: file:evals/registry/data/heart-disease/samples.jsonl
356 name: heart_disease__v0
357- from: file:evals/registry/data/last_word_nth/samples.jsonl
358 name: last_word_nth__s1__simple_v0
359- from: file:evals/registry/data/ascii_wordart/ascii_wordart.jsonl
360 name: ascii_wordart__dev__v0
361- from: file:evals/registry/data/direct-speech-tag/samples.jsonl
362 name: direct_speech_tag__dev__v0
363- from: file:evals/registry/data/italian-new-words/samples.jsonl
364 name: italian_new_words__dev__v0
365- from: file:evals/registry/data/irony/samples.jsonl
366 name: irony__dev__v0
367- from: file:evals/registry/data/math_polish/samples.jsonl
368 name: math_polish__dev__v0
369- from: file:evals/registry/data/irish-lexicon/samples.jsonl
370 name: irish_lexicon__dev__v0
371- from: file:evals/registry/data/canto_wu_pronunciation/samples_zero.jsonl
372 name: canto_wu_pronunciation__dev__v0
373- from: file:evals/registry/data/irrelevant-negative-diversion/irrelevant-negative-diversion.jsonl
374 name: irrelevant_negative_diversion__dev__v0
375- from: file:evals/registry/data/invert_word_wise/invert.jsonl
376 name: invert_word_wise__dev__v0
377- from: file:evals/registry/data/imperial_date_to_string/samples.jsonl
378 name: imperial_date_to_string__dev__v0
379- from: file:evals/registry/data/gujarati_numerals/samples.jsonl
380 name: gujarati_numerals__dev__v0
381- from: file:evals/registry/data/count_token_freq_dna/samples.jsonl
382 name: count_token_freq_dna__dev__v0
383- from: file:evals/registry/data/french_homonym_and_homograph/samples.jsonl
384 name: french_homonym_and_homograph__dev__v0
385- from: file:evals/registry/data/cube-pack/samples.jsonl
386 name: cube_pack__dev__v0
387- from: file:evals/registry/data/historical-kana-orthography-reading/samples.jsonl
388 name: historical_kana_orthography_reading__dev__v0
389- from: file:evals/registry/data/canto_wu_pronunciation/samples_few.jsonl
390 name: canto_wu_pronunciation_fewshot__dev__v0
391- from: file:evals/registry/data/accounting_audit/samples.jsonl
392 name: accounting_audit__dev__v0
393- from: file:evals/registry/data/brazilian-lexicon/samples.jsonl
394 name: brazilian_lexicon__dev__v0
395- from: file:evals/registry/data/naughty_strings/samples.jsonl
396 name: naughty_strings__test__v1
397- from: file:evals/registry/data/korean-phonetics/samples.jsonl
398 name: korean_phonetics__dev__v0
399- from: file:evals/registry/data/chinese_homophonic/chinese_homophonic.jsonl
400 name: chinese_homophonic__dev__v0
401- from: file:evals/registry/data/count_intersections_polynomial/samples.jsonl
402 name: count_intersections_polynomial__dev__v0
403- from: file:evals/registry/data/coqa/match.jsonl
404 name: coqa_match__dev__v0
405- from: file:evals/registry/data/latin_grammar/samples.jsonl
406 name: latin_grammar__dev__v0
407- from: file:evals/registry/data/bitwise/samples.jsonl
408 name: bitwise__dev__v0
409- from: file:evals/registry/data/shared_border/samples.jsonl
410 name: shared_borders__dev__v0
411- from: file:evals/registry/data/japanese-station/samples.jsonl
412 name: japanese_station__dev__v0
413- from: file:evals/registry/data/atpl_exams/samples.jsonl
414 name: atpl_exams__dev__v0
415- from: file:evals/registry/data/invoice_due_date_leap_day_adjustment/samples.jsonl
416 name: invoice_due_date_leap_day_adjustment__dev__v0
417- from: file:evals/registry/data/romanian_homonyms/samples.jsonl
418 name: romanian_homonyms__dev__v0
419- from: file:evals/registry/data/infiniteloop-match/infiniteloop-match.jsonl
420 name: infiniteloop_match__s1__simple_v0
421- from: file:evals/registry/data/russian-nlp-tasks/samples.jsonl
422 name: russian_nlp_tasks__dev__v0
423- from: file:evals/registry/data/chinese_chu_ci/samples.jsonl
424 name: chinese_chu_ci__dev__v0
425- from: file:evals/registry/data/polish-syllable-count/samples.jsonl
426 name: polish_syllable_count__val__v0
427- from: file:evals/registry/data/korean-postposition/samples.jsonl
428 name: korean_postposition__dev__v0
429- from: file:evals/registry/data/bulgarian-lexicon/samples.jsonl
430 name: bulgarian_lexicon__dev__v0
431- from: file:evals/registry/data/compare-countries-area/samples.jsonl
432 name: compare_countries_area__dev__v0
433- from: file:evals/registry/data/pattern_identification/samples.v0.jsonl
434 name: pattern_identification__dev__v0
435- from: file:evals/registry/data/belarusian_synonyms/samples.jsonl
436 name: belarusian_synonyms__dev__v0
437- from: file:evals/registry/data/spanish_feminine_noun_masculine_article/samples.jsonl
438 name: spanish_feminine_noun_masculine_article__dev__v0
439- from: file:evals/registry/data/sarcasm/samples.jsonl
440 name: sarcasm__test__v1
441- from: file:evals/registry/data/chinese_tang_poetries/sample.jsonl
442 name: chinese_tang_poetries__dev__match_v1
443- from: file:evals/registry/data/japanese_number_reading/japanese_number_reading.jsonl
444 name: japanese_number_reading__dev__v0
445- from: file:evals/registry/data/korean-honorific/samples.jsonl
446 name: korean_honorific__dev__v0
447- from: file:evals/registry/data/complex_replace_characters/samples.jsonl
448 name: complex_replace_characters__dev__v0
449- from: file:evals/registry/data/dice-rotation-sequence/samples.jsonl
450 name: dice_rotation_sequence__dev__v0
451- from: file:evals/registry/data/utah_real_estate/samples.jsonl
452 name: utah_real_estate__dev__v0
453- from: file:evals/registry/data/formal_logic/formal_logic_expressions.jsonl
454 name: formal_logic__dev__v0
455- from: file:evals/registry/data/resistor_ohm_calculator/samples.jsonl
456 name: resistor_ohm_calculator__dev__simple_v0
457- from: file:evals/registry/data/GOL/samples.jsonl
458 name: gol__dev__v1
459- from: file:evals/registry/data/icelandic-sentences-gec/samples.jsonl
460 name: icelandic_sentences_gec__dev__v0
461- from: file:evals/registry/data/chinese_modern_poem_identification/samples.jsonl
462 name: chinese_modern_poem_identification__test__v1
463- from: file:evals/registry/data/reverse_string/reverse_string.jsonl
464 name: reverse_string__s1__simple_v0
465- from: file:evals/registry/data/complex-analogies-en-ru/samples.jsonl
466 name: complex_analogies_en_ru__dev__v0
467- from: file:evals/registry/data/positive-binary-operations/samples.jsonl
468 name: positive_binary_operations__test__v1
469- from: file:evals/registry/data/hindi_shuddha/samples.jsonl
470 name: hindi_shuddha__dev__v0
471- from: file:evals/registry/data/tokyo-station-number/samples.jsonl
472 name: tokyo_station_number__dev__v0
473- from: file:evals/registry/data/chinese_famous_novel/samples.jsonl
474 name: chinese_famous_novel__dev__v0
475- from: file:evals/registry/data/diagrammatic_logic/samples.jsonl
476 name: diagrammatic_logic__dev__v2
477- from: file:evals/registry/data/polish-lexicon/samples.jsonl
478 name: polish_lexicon__dev__v0
479- from: file:evals/registry/data/wkt_understanding/samples.jsonl
480 name: wkt_understanding__dev__v0
481- from: file:evals/registry/data/japanese-national-medical-exam02/japanese-national-medical-exam02.jsonl
482 name: japanese_national_medical_exam02__dev__v0
483- from: file:evals/registry/data/cardinal-directions/samples.jsonl
484 name: cardinal_directions__dev__v0
485- from: file:evals/registry/data/rectangles/samples.jsonl
486 name: rectangles__dev__v0
487- from: file:evals/registry/data/hindi_upsc/samples.jsonl
488 name: hindi_upsc__dev__v0
489- from: file:evals/registry/data/three-pt-mapping/three_pt_mapping.jsonl
490 name: three_pt_mapping__dev__v0
491- from: file:evals/registry/data/polish-proverbs/samples.jsonl
492 name: polish_proverbs__dev__v0
493- from: file:evals/registry/data/indonesian_numbers/indonesian_numbers.jsonl
494 name: indonesian_numbers__dev__v0
495- from: file:evals/registry/data/chinese_song_ci/samples.jsonl
496 name: chinese_song_ci__dev__v0
497- from: file:evals/registry/data/cybersecurity/filepaths.jsonl
498 name: cybersecurity_filepaths__dev__v0
499- from: file:evals/registry/data/taxes/samples.jsonl
500 name: taxes__dev__v0
501- from: file:evals/registry/data/crontab/samples.jsonl
502 name: crontab__dev__v0
503- from: file:evals/registry/data/integer-sequence-predictions/misc-and-recent-sequences.jsonl
504 name: integer_sequence_predictions_misc__dev__v0
505- from: file:evals/registry/data/integer-sequence-predictions/obscure-sequences.jsonl
506 name: integer_sequence_predictions_obscure__dev__v0
507- from: file:evals/registry/data/integer-sequence-predictions/notable-sequences.jsonl
508 name: integer_sequence_predictions_notable__dev__v0
509- from: file:evals/registry/data/integer-sequence-predictions/samples.jsonl
510 name: integer_sequence_predictions__dev__v0
511- from: file:evals/registry/data/belarusian_orthography/samples.jsonl
512 name: belarusian_orthography__dev__v0
513- from: file:evals/registry/data/date-booking/samples.jsonl
514 name: date_booking__dev__v0
515- from: file:evals/registry/data/interlingual-homograph/samples.jsonl
516 name: interlingual_homograph__dev__v0
517- from: file:evals/registry/data/stats-tests/samples.jsonl
518 name: stats_tests__dev__v0
519- from: file:evals/registry/data/belarusian_russian_translation/samples.jsonl
520 name: belarusian_russian_translation__dev__v0
521- from: file:evals/registry/data/date-calculator/samples.jsonl
522 name: date_calculator__test__v1
523- from: file:evals/registry/data/chinese_poem/samples.jsonl
524 name: chinese_poem__dev__v0
525- from: file:evals/registry/data/belarusian_lexicon/samples.jsonl
526 name: belarusian_lexicon__dev__v0
527- from: file:evals/registry/data/test_english_pronunciations/samples.jsonl
528 name: test_english_pronunciations__dev__v0
529- from: file:evals/registry/data/anagrams/samples.jsonl
530 name: anagrams__test__v1
531- from: file:evals/registry/data/guess_the_singer/samples.jsonl
532 name: guess_the_singer__dev__v0
533- from: file:evals/registry/data/illinois-law/samples.jsonl
534 name: illinois_law__v0
535- from: file:evals/registry/data/russian_medical/samples.jsonl
536 name: russian_medical__dev__v0
537- from: file:evals/registry/data/bigrams/samples.jsonl
538 name: bigrams__dev__v0
539- from: file:evals/registry/data/probability_questions/probability_questions.jsonl
540 name: probability_questions__dev__v0
541- from: file:evals/registry/data/vintage_phone_keyboard_decode/samples.jsonl
542 name: vintage_phone_keyboard_decode__dev__v0
543- from: file:evals/registry/data/connect4/samples.jsonl
544 name: connect4__s1__v1
545- from: file:evals/registry/data/stock_options/stock_options_bull_call_spread.jsonl
546 name: stock_options_bull_call_spread__dev__v0
547- from: file:evals/registry/data/stock_options/stock_options_bear_call_spread.jsonl
548 name: stock_options_bear_call_spread__dev__v0
549- from: file:evals/registry/data/stock_options/stock_option_terms_bear_call_spread.jsonl
550 name: stock_option_terms_bear_call_spread__dev__v0
551- from: file:evals/registry/data/stock_options/stock_option_terms_iron_butterfly_spread.jsonl
552 name: stock_option_terms_iron_butterfly_spread__dev__v0
553- from: file:evals/registry/data/stock_options/stock_option_terms_bull_call_spread.jsonl
554 name: stock_option_terms_bull_call_spread__dev__v0
555- from: file:evals/registry/data/stock_options/stock_options_inverse_iron_condor_spread.jsonl
556 name: stock_options_inverse_iron_condor_spread__dev__v0
557- from: file:evals/registry/data/stock_options/stock_options_iron_condor_spread.jsonl
558 name: stock_options_iron_condor_spread__dev__v0
559- from: file:evals/registry/data/stock_options/stock_option_terms_iron_condor_spread.jsonl
560 name: stock_option_terms_iron_condor_spread__dev__v0
561- from: file:evals/registry/data/stock_options/stock_options_inverse_iron_butterfly_spread.jsonl
562 name: stock_options_inverse_iron_butterfly_spread__dev__v0
563- from: file:evals/registry/data/stock_options/stock_option_terms_inverse_iron_condor_spread.jsonl
564 name: stock_option_terms_inverse_iron_condor_spread__dev__v0
565- from: file:evals/registry/data/japanese_romantic_context/samples.jsonl
566 name: japanese_romantic_context__dev__v0
567- from: file:evals/registry/data/phonetics-identify-words-needing-missing-gpcs/samples.jsonl
568 name: phonetics_identify_words_needing_missing_gpcs__s1__simple_v0
569- from: file:evals/registry/data/prompt-injection/samples.jsonl
570 name: prompt_injection__dev__v0
571- from: file:evals/registry/data/word_vector_over_reliance/word_vector_over_reliance_samples.jsonl
572 name: word_vector_over_reliance__dev__simple_v0
573- from: file:evals/registry/data/lunar_calendar/iso_to_lunar_calendar.jsonl
574 name: iso_to_lunar_calendar__dev__v0
575- from: file:evals/registry/data/lunar_calendar/lunar_calendar_to_iso.jsonl
576 name: lunar_calendar_to_iso__dev__v0
577- from: file:evals/registry/data/code_combination/samples.jsonl
578 name: code_combination__dev__v0
579- from: file:evals/registry/data/partially_solved_crossword_clues/samples.jsonl
580 name: partially_solved_crossword_clues__dev__v0
581- from: file:evals/registry/data/quartz/samples.jsonl
582 name: quartz__test__v1
583- from: file:evals/registry/data/physics-interaction/samples.jsonl
584 name: physics__interaction__dev__v0
585- from: file:evals/registry/data/next-val-series/next-val-series.jsonl
586 name: next_val_series__dev__simple_v0
587evals:
588- name: actors-sequence
589 dataset: actors_sequence__dev__match_v1
590 scorers:
591 - match
592- name: adultery_state_laws
593 dataset: adultery_state_laws__dev__v0
594 scorers:
595 - match
596- name: proofreader
597 dataset: proofreader__dev__v0
598 scorers:
599 - match
600- name: rock-climbing
601 dataset: rock_climbing__dev__v0
602 scorers:
603 - match
604- name: match_banking77
605 dataset: match_banking77__test__v1
606 scorers:
607 - match
608- name: ukraine-gec-grammar-prep
609 dataset: ukraine_gec_grammar_prep__dev__v0
610 scorers:
611 - match
612- name: ukraine-gec-grammar-case
613 dataset: ukraine_gec_grammar_case__dev__v0
614 scorers:
615 - match
616- name: ukraine-gec-grammar-gender
617 dataset: ukraine_gec_grammar_gender__dev__v0
618 scorers:
619 - match
620- name: ukraine-gec-grammar-partvoice
621 dataset: ukraine_gec_grammar_partvoice__dev__v0
622 scorers:
623 - match
624- name: ukraine-gec-fluency-poorflow
625 dataset: ukraine_gec_fluency_poorflow__dev__v0
626 scorers:
627 - match
628- name: ukraine-gec-grammar-verbvoice
629 dataset: ukraine_gec_grammar_verbvoice__dev__v0
630 scorers:
631 - match
632- name: ukraine-gec-grammar-number
633 dataset: ukraine_gec_grammar_number__dev__v0
634 scorers:
635 - match
636- name: ukraine-gec-fluency-repetition
637 dataset: ukraine_gec_fluency_repetition__dev__v0
638 scorers:
639 - match
640- name: ukraine-gec-fluency-calque
641 dataset: ukraine_gec_fluency_calque__dev__v0
642 scorers:
643 - match
644- name: ukraine-gec-grammar-verbaform
645 dataset: ukraine_gec_grammar_verbaform__dev__v0
646 scorers:
647 - match
648- name: ukraine-gec-grammar-ungrammaticalstructure
649 dataset: ukraine_gec_grammar_ungrammaticalstructure__dev__v0
650 scorers:
651 - match
652- name: ukraine-gec-grammar-other
653 dataset: ukraine_gec_grammar_other__dev__v0
654 scorers:
655 - match
656- name: ukraine-gec-fluency-style
657 dataset: ukraine_gec_fluency_style__dev__v0
658 scorers:
659 - match
660- name: ukraine-gec-fluency-other
661 dataset: ukraine_gec_fluency_other__dev__v0
662 scorers:
663 - match
664- name: ukraine-gec-grammar-conjunction
665 dataset: ukraine_gec_grammar_conjunction__dev__v0
666 scorers:
667 - match
668- name: ukraine-gec-grammar-comparison
669 dataset: ukraine_gec_grammar_comparison__dev__v0
670 scorers:
671 - match
672- name: ukraine-gec-grammar-tense
673 dataset: ukraine_gec_grammar_tense__dev__v0
674 scorers:
675 - match
676- name: ukraine-gec-grammar-aspect
677 dataset: ukraine_gec_grammar_aspect__dev__v0
678 scorers:
679 - match
680- name: irish-plural-nouns
681 dataset: irish_plural_nouns__dev__v0
682 scorers:
683 - match
684- name: shape-in-shape
685 dataset: shape_in_shape__dev__v1
686 scorers:
687 - match
688- name: russian_sarcasm
689 dataset: russian_sarcasm__dev__v0
690 scorers:
691 - match
692- name: syllables_long_words
693 dataset: syllables__dev__v1
694 scorers:
695 - match
696- name: crepe
697 dataset: crepe__dev__v2
698 scorers:
699 - match
700- name: coq-proof-step-match
701 dataset: coq_proof_step_match__dev__v0
702 scorers:
703 - match
704- name: ukraine-eit
705 dataset: ukraine_eit__val__v0
706 scorers:
707 - match
708- name: belarusian-proverbs
709 dataset: belarusian_proverbs__dev__v0
710 scorers:
711 - match
712- name: invoices
713 dataset: invoices__dev__v0
714 scorers:
715 - match
716- name: urdu-lexicon
717 dataset: urdu_lexicon__dev__v0
718 scorers:
719 - match
720- name: qa
721 dataset: qa__dev__v0
722 scorers:
723 - match
724- name: french-part-of-speech
725 dataset: french_part_of_speech__dev__v0
726 scorers:
727 - match
728- name: internal_representations
729 dataset: internal_representations__dev__v0
730 scorers:
731 - match
732- name: python_list_comprehension
733 dataset: python_list_comprehension__dev__v0
734 scorers:
735 - match
736- name: nepali-numerals
737 dataset: nepali_numerals__dev__v0
738 scorers:
739 - match
740- name: belarusian-syllable-count
741 dataset: belarusian_syllable_count__dev__v0
742 scorers:
743 - match
744- name: mandaliof-table
745 dataset: mandaliof_table__dev__v0
746 scorers:
747 - match
748- name: test_japanese_english_numerals
749 dataset: test_japanese_english_numerals__dev__v0
750 scorers:
751 - match
752- name: tracking-shuffled-objects
753 dataset: tracking_shuffled_objects__dev__v0
754 scorers:
755 - match
756- name: squares-gpt
757 dataset: squares_gpt__dev__v0
758 scorers:
759 - match
760- name: convert-hex-hsl-lightness
761 dataset: convert_hex_hsl_lightness__dev__v0
762 scorers:
763 - match
764- name: russe
765 dataset: russe__test__v0
766 scorers:
767 - match
768- name: aba_mrpc_true_false
769 dataset: aba_mrpc_true_false__dev__v0
770 scorers:
771 - match
772- name: logical_counting
773 dataset: logical_counting__dev__v0
774 scorers:
775 - match
776- name: vigenere
777 dataset: vigenere__s1__simple_v0
778 scorers:
779 - match
780- name: map-electronic-component-part-to-fact
781 dataset: map_electronic_component_part_to_fact__dev__v0
782 scorers:
783 - match
784- name: rare-and-loanwords-dutch-lexicon
785 dataset: rare_and_loanwords_dutch_lexicon__dev__v0
786 scorers:
787 - match
788- name: product_information_extraction_one_shot
789 dataset: product_information_extraction_one_shot__dev__v0
790 scorers:
791 - match
792- name: product_information_extraction_zero_shot
793 dataset: product_information_extraction_zero_shot__dev__v0
794 scorers:
795 - match
796- name: sort-numbers
797 dataset: sort_numbers__s1__simple_v0
798 scorers:
799 - match
800- name: match_product-matching_zeroshot
801 dataset: match_product_matching_zeroshot__dev__v1
802 scorers:
803 - match
804- name: match_product-matching_fewshot
805 dataset: match_product_matching_fewshot__dev__v1
806 scorers:
807 - match
808- name: match_product-matching_rules
809 dataset: match_product_matching_rules__dev__v1
810 scorers:
811 - match
812- name: russian-lexicon
813 dataset: russian_lexicon__dev__v0
814 scorers:
815 - match
816- name: dutch-lexicon
817 dataset: dutch_lexicon__dev__v0
818 scorers:
819 - match
820- name: greek-nt-manuscripts
821 dataset: greek_nt_manuscripts__v0
822 scorers:
823 - match
824- name: matrix_mult_rows
825 dataset: matrix_mult_rows__dev__v0
826 scorers:
827 - match
828- name: moral_exceptQA
829 dataset: moral_exceptqa__test__v1
830 scorers:
831 - match
832- name: music-theory-triads-identification
833 dataset: music_theory_triads_identification__dev__v0
834 scorers:
835 - match
836- name: music-theory-tetrads-identification
837 dataset: music_theory_tetrads_identification__dev__v0
838 scorers:
839 - match
840- name: find-thirukkural
841 dataset: find_thirukkural__dev__v0
842 scorers:
843 - match
844- name: building_floorplan
845 dataset: building_floorplan__test__v1
846 scorers:
847 - match
848- name: japanese-national-medical-exam01
849 dataset: japanese_national_medical_exam01__dev__v0
850 scorers:
851 - match
852- name: lat_long_identify
853 dataset: lat_long_identify__dev__v0
854 scorers:
855 - match
856- name: norwegian-lexicon
857 dataset: norwegian_lexicon__dev__v0
858 scorers:
859 - match
860- name: german-part-of-speech
861 dataset: german_part_of_speech__dev__v0
862 scorers:
863 - match
864- name: swedish_sat
865 dataset: swedish_sat__dev__v0
866 scorers:
867 - match
868- name: utility_price_parsing
869 dataset: utility_price_parsing__dev__v0
870 scorers:
871 - match
872- name: korean-consonant-vowel-combination
873 dataset: korean_consonant_vowel_combination__dev__v0
874 scorers:
875 - match
876- name: mate-in-one
877 dataset: mate_in_one__dev__v0
878 scorers:
879 - match
880- name: french-lexicon
881 dataset: french_lexicon__dev__v0
882 scorers:
883 - match
884- name: swedish-spelling
885 dataset: swedish_spelling__dev__v0
886 scorers:
887 - match
888- name: knot-theory-unknotting-number
889 dataset: knot_theory_unknotting_number__dev__v0
890 scorers:
891 - match
892- name: knot-theory-unknotting-problem
893 dataset: knot_theory_unknotting_problem__dev__v0
894 scorers:
895 - match
896- name: knot-theory-code-conversion
897 dataset: knot_theory_code_conversion__dev__v0
898 scorers:
899 - match
900- name: hindi_words
901 dataset: hindi_words__dev__v0
902 scorers:
903 - match
904- name: arithmetical_puzzles
905 dataset: arithmetical_puzzles__dev__v0
906 scorers:
907 - match
908- name: belarusian-antonyms
909 dataset: belarusian_antonyms__dev__v0
910 scorers:
911 - match
912- name: body-movement
913 dataset: body_movement__dev__zero_shot_v0
914 scorers:
915 - match
916- name: afrikaans-lexicon
917 dataset: afrikaans_lexicon__dev__v0
918 scorers:
919 - match
920- name: cricket_situations
921 dataset: cricket_situations__dev__v0
922 scorers:
923 - match
924- name: korean_spelling
925 dataset: korean_spelling__dev__v0
926 scorers:
927 - match
928- name: rucola
929 dataset: rucola__test__v0
930 scorers:
931 - match
932- name: reclor-logical-reasoning-plus
933 dataset: reclor_logical_reasoning_plus__dev__v0
934 scorers:
935 - match
936- name: logiqav2-logical-reasoning-plus
937 dataset: logiqav2_logical_reasoning_plus__dev__v0
938 scorers:
939 - match
940- name: logiqa-logical-reasoning-plus
941 dataset: logiqa_logical_reasoning_plus__dev__v0
942 scorers:
943 - match
944- name: medmcqa
945 dataset: medmcqa__dev__v0
946 scorers:
947 - match
948- name: multi-step-equations
949 dataset: multi_step_equations__dev__v0
950 scorers:
951 - match
952- name: japanese-remote-island-to-prefecture
953 dataset: japanese_remote_island_to_prefecture__dev__v0
954 scorers:
955 - match
956- name: chinese_homonym
957 dataset: chinese_homonym__dev__v0
958 scorers:
959 - match
960- name: norwegian-rhymes
961 dataset: norwegian_rhymes__dev__v0
962 scorers:
963 - match
964- name: chinese_shi_jing
965 dataset: chinese_shi_jing__test__v1
966 scorers:
967 - match
968- name: forth-stack-sim-basic
969 dataset: forth_stack_sim_basic__dev__v0
970 scorers:
971 - match
972- name: forth-stack-sim
973 dataset: forth_stack_sim__dev__v0
974 scorers:
975 - match
976- name: forth-stack-sim-detailed
977 dataset: forth_stack_sim_detailed__dev__v0
978 scorers:
979 - match
980- name: japanese_city_name_pronunciation
981 dataset: japanese_city_name_pronunciation__dev__v0
982 scorers:
983 - match
984- name: escher-sentences
985 dataset: escher_sentences__dev__v0
986 scorers:
987 - match
988- name: track_objects
989 dataset: track_objects__dev__v0
990 scorers:
991 - match
992- name: shopping_discount_comparison
993 dataset: shopping_discount_comparison__dev__v0
994 scorers:
995 - match
996- name: computer-science-problems
997 dataset: computer_science_problems__s1__simple_v0
998 scorers:
999 - match
1000- name: mendelian_inheritance
1001 dataset: mendelian_inheritance__dev__v0
1002 scorers:
1003 - match
1004- name: override-system-instruction
1005 dataset: override_system_instruction__dev__v0
1006 scorers:
1007 - match
1008- name: hand_ranks-match
1009 dataset: hand_ranks__test__v1
1010 scorers:
1011 - match
1012- name: diabetes
1013 dataset: diabetes__dev__v0
1014 scorers:
1015 - match
1016- name: job_listing_title_for_a_caregiver_in_japan
1017 dataset: job_listing_title_for_a_caregiver_in_japan__test__v1
1018 scorers:
1019 - match
1020- name: poker_analysis
1021 dataset: poker_analysis__test__v1
1022 scorers:
1023 - match
1024- name: belarusian-numerals
1025 dataset: belarusian_numerals__dev__v0
1026 scorers:
1027 - match
1028- name: algebra-word-problems
1029 dataset: algebra_word_problems__s1__simple_v0
1030 scorers:
1031 - match
1032- name: belarusian-grammar
1033 dataset: belarusian_grammar__dev__v0
1034 scorers:
1035 - match
1036- name: svg_understanding
1037 dataset: svg_understanding__v0
1038 scorers:
1039 - match
1040- name: cissp-study-questions
1041 dataset: cissp_study_questions__test__v1
1042 scorers:
1043 - match
1044- name: linear-equations
1045 dataset: linear_equations__dev__v0
1046 scorers:
1047 - match
1048- name: japanese_driving_license
1049 dataset: japanese_driving_license__s1__simple_v0
1050 scorers:
1051 - match
1052- name: first-letters
1053 dataset: first_letters__dev__v0
1054 scorers:
1055 - match
1056- name: arc
1057 dataset: arc__dev__v0
1058 scorers:
1059 - match
1060- name: css-selectors-verbal
1061 dataset: css_selectors_verbal__dev__v0
1062 scorers:
1063 - match
1064- name: japanese-itpassport-exam01
1065 dataset: japanese_itpassport_exam01__dev__v0
1066 scorers:
1067 - match
1068- name: logiqa
1069 dataset: logiqa__dev__v0
1070 scorers:
1071 - match
1072- name: chinese_zodiac
1073 dataset: chinese_zodiac__dev__v0
1074 scorers:
1075 - match
1076- name: spanish-lexicon
1077 dataset: spanish_lexicon__dev__v0
1078 scorers:
1079 - match
1080- name: food
1081 dataset: food__test__v1
1082 scorers:
1083 - match
1084- name: countries
1085 dataset: countries__dev__v0
1086 scorers:
1087 - match
1088- name: which-is-heavier
1089 dataset: which_is_heavier__dev__v0
1090 scorers:
1091 - match
1092- name: korean_date_counting
1093 dataset: korean_date_counting__dev__v0
1094 scorers:
1095 - match
1096- name: fcc_amateur_extra
1097 dataset: fcc_amateur_extra__dev__v0
1098 scorers:
1099 - match
1100- name: multistep-word-problems
1101 dataset: multistep_word_problems__dev__v0
1102 scorers:
1103 - match
1104- name: list_comparison_missing_name
1105 dataset: list_comparison_missing_name__dev__v0
1106 scorers:
1107 - match
1108- name: newsology
1109 dataset: newsology__dev__v0
1110 scorers:
1111 - match
1112- name: simple-visual-understanding
1113 dataset: simple_visual_understanding__dev__v0
1114 scorers:
1115 - match
1116- name: portuguese-syllable-count
1117 dataset: portuguese_syllable_count__dev__v0
1118 scorers:
1119 - match
1120- name: south-african-bands
1121 dataset: south_african_bands__dev__v0
1122 scorers:
1123 - match
1124- name: hebrew-plurals
1125 dataset: hebrew_plurals__dev__v0
1126 scorers:
1127 - match
1128- name: rot13
1129 dataset: rot13__s1__simple_v0
1130 scorers:
1131 - match
1132- name: korean_dialects
1133 dataset: korean_dialects__dev__v0
1134 scorers:
1135 - match
1136- name: test-time-zone-conversion
1137 dataset: test_time_zone_conversion__dev__v0
1138 scorers:
1139 - match
1140- name: music-theory-chord-notes
1141 dataset: music_theory_chord_notes__dev__v0
1142 scorers:
1143 - match
1144- name: russian-english-homonym-context-resolution
1145 dataset: russian_english_homonym_context_resolution__dev__v0
1146 scorers:
1147 - match
1148- name: number-reading
1149 dataset: number_reading__dev__v0
1150 scorers:
1151 - match
1152- name: simple-knowledge-mongolian
1153 dataset: simple_knowledge_mongolian__dev__v0
1154 scorers:
1155 - match
1156- name: base64-decode
1157 dataset: base64_decode_simple__dev__v0
1158 scorers:
1159 - match
1160- name: urdu-transliteration
1161 dataset: urdu_transliteration__dev__v0
1162 scorers:
1163 - match
1164- name: reverse-polish-notation
1165 dataset: reverse_polish_notation__dev__v0
1166 scorers:
1167 - match
1168- name: music-theory-chord-names
1169 dataset: music_theory_chord_names__dev__v0
1170 scorers:
1171 - match
1172- name: born-first
1173 dataset: born_first__dev__v0
1174 scorers:
1175 - match
1176- name: tetris
1177 dataset: tetris__dev__v0
1178 scorers:
1179 - match
1180- name: pure_korean
1181 dataset: pure_korean__dev__v0
1182 scorers:
1183 - match
1184- name: determinant
1185 dataset: determinant__test__v1
1186 scorers:
1187 - match
1188- name: split_chinese_characters
1189 dataset: split_chinese_characters__dev__v0
1190 scorers:
1191 - match
1192- name: syntax-check
1193 dataset: syntax_check__dev__v1
1194 scorers:
1195 - match
1196- name: balance-chemical-equation
1197 dataset: balance_chemical_equation__dev__v0
1198 scorers:
1199 - match
1200- name: emotional-intelligence
1201 dataset: emotional_intelligence__dev__v0
1202 scorers:
1203 - match
1204- name: nutrition
1205 dataset: nutrition__dev__v0
1206 scorers:
1207 - match
1208- name: reverse-sort-words-eng
1209 dataset: reverse_sort_words_eng_simple__dev__v0
1210 scorers:
1211 - match
1212- name: day-of-week-from-date
1213 dataset: day_of_week_from_date__dev__v0
1214 scorers:
1215 - match
1216- name: regex-match
1217 dataset: regex__match__dev__v0
1218 scorers:
1219 - match
1220- name: find-letter
1221 dataset: find_letter__dev__v0
1222 scorers:
1223 - match
1224- name: korean_foreign_words
1225 dataset: korean_foreign_words__dev__v0
1226 scorers:
1227 - match
1228- name: greek-vocabulary
1229 dataset: greek_vocabulary__dev__v0
1230 scorers:
1231 - match
1232- name: rubiks-colors
1233 dataset: rubiks_colors__dev__v0
1234 scorers:
1235 - match
1236- name: decrypt-caesar-cipher
1237 dataset: decrypt_caesar_cipher__dev__v0
1238 scorers:
1239 - match
1240- name: us-tort-law
1241 dataset: us_tort_law__dev__v0
1242 scorers:
1243 - match
1244- name: number-pattern
1245 dataset: number_pattern__dev__v0
1246 scorers:
1247 - match
1248- name: confusing_korean
1249 dataset: confusing_korean__dev__v0
1250 scorers:
1251 - match
1252- name: kanji-idioms
1253 dataset: kanji_idioms__test__v0
1254 scorers:
1255 - match
1256- name: missing-operators
1257 dataset: missing_operators__s1__simple_v0
1258 scorers:
1259 - match
1260- name: unsolvable_questions
1261 dataset: unsolvable_questions__dev__v0
1262 scorers:
1263 - match
1264- name: portuguese-sarcasm
1265 dataset: portuguese_sarcasm__dev__v0
1266 scorers:
1267 - match
1268- name: swap-words
1269 dataset: swap_words__dev__v0
1270 scorers:
1271 - match
1272- name: hebrew-same-noun-gender
1273 dataset: hebrew_same_noun_gender__v0
1274 scorers:
1275 - match
1276- name: heart-disease
1277 dataset: heart_disease__v0
1278 scorers:
1279 - match
1280- name: last-word-nth
1281 dataset: last_word_nth__s1__simple_v0
1282 scorers:
1283 - match
1284- name: ascii-wordart
1285 dataset: ascii_wordart__dev__v0
1286 scorers:
1287 - match
1288- name: direct-speech-tag
1289 dataset: direct_speech_tag__dev__v0
1290 scorers:
1291 - match
1292- name: italian-new-words
1293 dataset: italian_new_words__dev__v0
1294 scorers:
1295 - match
1296- name: irony
1297 dataset: irony__dev__v0
1298 scorers:
1299 - match
1300- name: math_polish
1301 dataset: math_polish__dev__v0
1302 scorers:
1303 - match
1304- name: irish-lexicon
1305 dataset: irish_lexicon__dev__v0
1306 scorers:
1307 - match
1308- name: canto_wu_pronunciation
1309 dataset: canto_wu_pronunciation__dev__v0
1310 scorers:
1311 - match
1312- name: irrelevant-negative-diversion
1313 dataset: irrelevant_negative_diversion__dev__v0
1314 scorers:
1315 - match
1316- name: invert_word_wise
1317 dataset: invert_word_wise__dev__v0
1318 scorers:
1319 - match
1320- name: imperial_date_to_string
1321 dataset: imperial_date_to_string__dev__v0
1322 scorers:
1323 - match
1324- name: gujarati-numerals
1325 dataset: gujarati_numerals__dev__v0
1326 scorers:
1327 - match
1328- name: count_token_freq_dna
1329 dataset: count_token_freq_dna__dev__v0
1330 scorers:
1331 - match
1332- name: french_homonym_and_homograph
1333 dataset: french_homonym_and_homograph__dev__v0
1334 scorers:
1335 - match
1336- name: cube-pack
1337 dataset: cube_pack__dev__v0
1338 scorers:
1339 - match
1340- name: historical-kana-orthography-reading
1341 dataset: historical_kana_orthography_reading__dev__v0
1342 scorers:
1343 - match
1344- name: canto_wu_pronunciation_fewshot
1345 dataset: canto_wu_pronunciation_fewshot__dev__v0
1346 scorers:
1347 - match
1348- name: accounting_audit
1349 dataset: accounting_audit__dev__v0
1350 scorers:
1351 - match
1352- name: brazilian-lexicon
1353 dataset: brazilian_lexicon__dev__v0
1354 scorers:
1355 - match
1356- name: naughty_strings
1357 dataset: naughty_strings__test__v1
1358 scorers:
1359 - match
1360- name: korean-phonetics
1361 dataset: korean_phonetics__dev__v0
1362 scorers:
1363 - match
1364- name: chinese-homo
1365 dataset: chinese_homophonic__dev__v0
1366 scorers:
1367 - match
1368- name: count_intersections_polynomial
1369 dataset: count_intersections_polynomial__dev__v0
1370 scorers:
1371 - match
1372- name: coqa-match
1373 dataset: coqa_match__dev__v0
1374 scorers:
1375 - match
1376- name: latin-grammar
1377 dataset: latin_grammar__dev__v0
1378 scorers:
1379 - match
1380- name: bitwise
1381 dataset: bitwise__dev__v0
1382 scorers:
1383 - match
1384- name: shared-borders
1385 dataset: shared_borders__dev__v0
1386 scorers:
1387 - match
1388- name: japanese-station
1389 dataset: japanese_station__dev__v0
1390 scorers:
1391 - match
1392- name: atpl_exams
1393 dataset: atpl_exams__dev__v0
1394 scorers:
1395 - match
1396- name: invoice_due_date_leap_day_adjustment
1397 dataset: invoice_due_date_leap_day_adjustment__dev__v0
1398 scorers:
1399 - match
1400- name: romanian_homonyms
1401 dataset: romanian_homonyms__dev__v0
1402 scorers:
1403 - match
1404- name: infiniteloop-match
1405 dataset: infiniteloop_match__s1__simple_v0
1406 scorers:
1407 - match
1408- name: russian-nlp-tasks
1409 dataset: russian_nlp_tasks__dev__v0
1410 scorers:
1411 - match
1412- name: chinese_chu_ci
1413 dataset: chinese_chu_ci__dev__v0
1414 scorers:
1415 - match
1416- name: polish-syllable-count
1417 dataset: polish_syllable_count__val__v0
1418 scorers:
1419 - match
1420- name: korean-postposition
1421 dataset: korean_postposition__dev__v0
1422 scorers:
1423 - match
1424- name: bulgarian-lexicon
1425 dataset: bulgarian_lexicon__dev__v0
1426 scorers:
1427 - match
1428- name: compare-countries-area
1429 dataset: compare_countries_area__dev__v0
1430 scorers:
1431 - match
1432- name: pattern_identification
1433 dataset: pattern_identification__dev__v0
1434 scorers:
1435 - match
1436- name: belarusian-synonyms
1437 dataset: belarusian_synonyms__dev__v0
1438 scorers:
1439 - match
1440- name: spanish_feminine_noun_masculine_article
1441 dataset: spanish_feminine_noun_masculine_article__dev__v0
1442 scorers:
1443 - match
1444- name: sarcasm
1445 dataset: sarcasm__test__v1
1446 scorers:
1447 - match
1448- name: chinese_tang_poetries
1449 dataset: chinese_tang_poetries__dev__match_v1
1450 scorers:
1451 - match
1452- name: japanese-number-reading
1453 dataset: japanese_number_reading__dev__v0
1454 scorers:
1455 - match
1456- name: korean-honorific
1457 dataset: korean_honorific__dev__v0
1458 scorers:
1459 - match
1460- name: complex-replace-characters
1461 dataset: complex_replace_characters__dev__v0
1462 scorers:
1463 - match
1464- name: dice-rotation-sequence
1465 dataset: dice_rotation_sequence__dev__v0
1466 scorers:
1467 - match
1468- name: utah_real_estateh
1469 dataset: utah_real_estate__dev__v0
1470 scorers:
1471 - match
1472- name: formal-logic
1473 dataset: formal_logic__dev__v0
1474 scorers:
1475 - match
1476- name: resistor-ohm-calculator
1477 dataset: resistor_ohm_calculator__dev__simple_v0
1478 scorers:
1479 - match
1480- name: gol
1481 dataset: gol__dev__v1
1482 scorers:
1483 - match
1484- name: icelandic-sentences-gec
1485 dataset: icelandic_sentences_gec__dev__v0
1486 scorers:
1487 - match
1488- name: chinese_modern_poem_identification
1489 dataset: chinese_modern_poem_identification__test__v1
1490 scorers:
1491 - match
1492- name: reverse-string
1493 dataset: reverse_string__s1__simple_v0
1494 scorers:
1495 - match
1496- name: complex-analogies-en-ru
1497 dataset: complex_analogies_en_ru__dev__v0
1498 scorers:
1499 - match
1500- name: positive-binary-operations
1501 dataset: positive_binary_operations__test__v1
1502 scorers:
1503 - match
1504- name: hindi_shuddha
1505 dataset: hindi_shuddha__dev__v0
1506 scorers:
1507 - match
1508- name: tokyo-station-number
1509 dataset: tokyo_station_number__dev__v0
1510 scorers:
1511 - match
1512- name: chinese_famous_novel
1513 dataset: chinese_famous_novel__dev__v0
1514 scorers:
1515 - match
1516- name: diagrammatic_logic
1517 dataset: diagrammatic_logic__dev__v2
1518 scorers:
1519 - match
1520- name: polish-lexicon
1521 dataset: polish_lexicon__dev__v0
1522 scorers:
1523 - match
1524- name: wkt_understanding
1525 dataset: wkt_understanding__dev__v0
1526 scorers:
1527 - match
1528- name: japanese-national-medical-exam02
1529 dataset: japanese_national_medical_exam02__dev__v0
1530 scorers:
1531 - match
1532- name: cardinal-directions
1533 dataset: cardinal_directions__dev__v0
1534 scorers:
1535 - match
1536- name: rectangles
1537 dataset: rectangles__dev__v0
1538 scorers:
1539 - match
1540- name: hindi_upsc
1541 dataset: hindi_upsc__dev__v0
1542 scorers:
1543 - match
1544- name: three-pt-mapping
1545 dataset: three_pt_mapping__dev__v0
1546 scorers:
1547 - match
1548- name: polish-proverbs
1549 dataset: polish_proverbs__dev__v0
1550 scorers:
1551 - match
1552- name: indonesian_numbers
1553 dataset: indonesian_numbers__dev__v0
1554 scorers:
1555 - match
1556- name: chinese_song_ci
1557 dataset: chinese_song_ci__dev__v0
1558 scorers:
1559 - match
1560- name: cybersecurity-filepaths
1561 dataset: cybersecurity_filepaths__dev__v0
1562 scorers:
1563 - match
1564- name: taxes
1565 dataset: taxes__dev__v0
1566 scorers:
1567 - match
1568- name: crontab
1569 dataset: crontab__dev__v0
1570 scorers:
1571 - match
1572- name: integer-sequence-predictions-misc
1573 dataset: integer_sequence_predictions_misc__dev__v0
1574 scorers:
1575 - match
1576- name: integer-sequence-predictions-obscure
1577 dataset: integer_sequence_predictions_obscure__dev__v0
1578 scorers:
1579 - match
1580- name: integer-sequence-predictions-notable
1581 dataset: integer_sequence_predictions_notable__dev__v0
1582 scorers:
1583 - match
1584- name: integer-sequence-predictions
1585 dataset: integer_sequence_predictions__dev__v0
1586 scorers:
1587 - match
1588- name: belarusian-orthography
1589 dataset: belarusian_orthography__dev__v0
1590 scorers:
1591 - match
1592- name: date-booking
1593 dataset: date_booking__dev__v0
1594 scorers:
1595 - match
1596- name: interlingual-homograph
1597 dataset: interlingual_homograph__dev__v0
1598 scorers:
1599 - match
1600- name: stats-tests
1601 dataset: stats_tests__dev__v0
1602 scorers:
1603 - match
1604- name: belarusian-russian-translation
1605 dataset: belarusian_russian_translation__dev__v0
1606 scorers:
1607 - match
1608- name: date-calculator
1609 dataset: date_calculator__test__v1
1610 scorers:
1611 - match
1612- name: chinese_poem
1613 dataset: chinese_poem__dev__v0
1614 scorers:
1615 - match
1616- name: belarusian-lexicon
1617 dataset: belarusian_lexicon__dev__v0
1618 scorers:
1619 - match
1620- name: test_english_pronunciations
1621 dataset: test_english_pronunciations__dev__v0
1622 scorers:
1623 - match
1624- name: anagrams
1625 dataset: anagrams__test__v1
1626 scorers:
1627 - match
1628- name: guess-the-singer
1629 dataset: guess_the_singer__dev__v0
1630 scorers:
1631 - match
1632- name: illinois-law
1633 dataset: illinois_law__v0
1634 scorers:
1635 - match
1636- name: russian_medical
1637 dataset: russian_medical__dev__v0
1638 scorers:
1639 - match
1640- name: bigrams
1641 dataset: bigrams__dev__v0
1642 scorers:
1643 - match
1644- name: probability-questions
1645 dataset: probability_questions__dev__v0
1646 scorers:
1647 - match
1648- name: vintage_phone_keyboard_decode
1649 dataset: vintage_phone_keyboard_decode__dev__v0
1650 scorers:
1651 - match
1652- name: connect4
1653 dataset: connect4__s1__v1
1654 scorers:
1655 - match
1656- name: stock-options-bull-call-spread
1657 dataset: stock_options_bull_call_spread__dev__v0
1658 scorers:
1659 - match
1660- name: stock-options-bear-call-spread
1661 dataset: stock_options_bear_call_spread__dev__v0
1662 scorers:
1663 - match
1664- name: stock-option-terms-bear-call-spread
1665 dataset: stock_option_terms_bear_call_spread__dev__v0
1666 scorers:
1667 - match
1668- name: stock-option-terms-iron-butteryfly-spread
1669 dataset: stock_option_terms_iron_butterfly_spread__dev__v0
1670 scorers:
1671 - match
1672- name: stock-option-terms-bull-call-spread
1673 dataset: stock_option_terms_bull_call_spread__dev__v0
1674 scorers:
1675 - match
1676- name: stock-options-inverse-iron-condor-spread
1677 dataset: stock_options_inverse_iron_condor_spread__dev__v0
1678 scorers:
1679 - match
1680- name: stock-options-iron-condor-spread
1681 dataset: stock_options_iron_condor_spread__dev__v0
1682 scorers:
1683 - match
1684- name: stock-option-terms-iron-condor-spread
1685 dataset: stock_option_terms_iron_condor_spread__dev__v0
1686 scorers:
1687 - match
1688- name: stock-options-inverse-iron-butterfly-spread
1689 dataset: stock_options_inverse_iron_butterfly_spread__dev__v0
1690 scorers:
1691 - match
1692- name: stock-option-terms-inverse-iron-condor-spread
1693 dataset: stock_option_terms_inverse_iron_condor_spread__dev__v0
1694 scorers:
1695 - match
1696- name: japanese_romantic_context
1697 dataset: japanese_romantic_context__dev__v0
1698 scorers:
1699 - match
1700- name: phonetics-identify-words-needing-missing-gpcs
1701 dataset: phonetics_identify_words_needing_missing_gpcs__s1__simple_v0
1702 scorers:
1703 - match
1704- name: prompt-injection
1705 dataset: prompt_injection__dev__v0
1706 scorers:
1707 - match
1708- name: word_vector_over_reliance
1709 dataset: word_vector_over_reliance__dev__simple_v0
1710 scorers:
1711 - match
1712- name: iso-to-lunar-calendar
1713 dataset: iso_to_lunar_calendar__dev__v0
1714 scorers:
1715 - match
1716- name: lunar-calendar-to-iso
1717 dataset: lunar_calendar_to_iso__dev__v0
1718 scorers:
1719 - match
1720- name: code_combination
1721 dataset: code_combination__dev__v0
1722 scorers:
1723 - match
1724- name: partially_solved_crossword_clues
1725 dataset: partially_solved_crossword_clues__dev__v0
1726 scorers:
1727 - match
1728- name: quartz
1729 dataset: quartz__test__v1
1730 scorers:
1731 - match
1732- name: physics-interaction
1733 dataset: physics__interaction__dev__v0
1734 scorers:
1735 - match
1736- name: next-val-series
1737 dataset: next_val_series__dev__simple_v0
1738 scorers:
1739 - match
1740
1name: spicepod
2version: v1beta1
3kind: Spicepod
4
5# This file is generated from the evals & data in the repo, specifically using [evalconverter](https://github.com/spiceai/spiceai/tree/trunk/tools/evalconverter).
6# ```shell
7# evalconverter -i evals/registry/evals -b evals/registry/data
8# ```
9
10datasets:
11- from: file:evals/registry/data/actors-sequence/samples.jsonl
12 name: actors_sequence__dev__match_v1
13- from: file:evals/registry/data/adultery-state-laws/samples.jsonl
14 name: adultery_state_laws__dev__v0
15- from: file:evals/registry/data/proofreader/samples.jsonl
16 name: proofreader__dev__v0
17- from: file:evals/registry/data/rock-climbing/samples.jsonl
18 name: rock_climbing__dev__v0
19- from: file:evals/registry/data/banking77/samples.jsonl
20 name: match_banking77__test__v1
21- from: file:evals/registry/data/ukraine_gec/ukraine_gec_grammar_prep.jsonl
22 name: ukraine_gec_grammar_prep__dev__v0
23- from: file:evals/registry/data/ukraine_gec/ukraine_gec_grammar_case.jsonl
24 name: ukraine_gec_grammar_case__dev__v0
25- from: file:evals/registry/data/ukraine_gec/ukraine_gec_grammar_gender.jsonl
26 name: ukraine_gec_grammar_gender__dev__v0
27- from: file:evals/registry/data/ukraine_gec/ukraine_gec_grammar_partvoice.jsonl
28 name: ukraine_gec_grammar_partvoice__dev__v0
29- from: file:evals/registry/data/ukraine_gec/ukraine_gec_fluency_poorflow.jsonl
30 name: ukraine_gec_fluency_poorflow__dev__v0
31- from: file:evals/registry/data/ukraine_gec/ukraine_gec_grammar_verbvoice.jsonl
32 name: ukraine_gec_grammar_verbvoice__dev__v0
33- from: file:evals/registry/data/ukraine_gec/ukraine_gec_grammar_number.jsonl
34 name: ukraine_gec_grammar_number__dev__v0
35- from: file:evals/registry/data/ukraine_gec/ukraine_gec_fluency_repetition.jsonl
36 name: ukraine_gec_fluency_repetition__dev__v0
37- from: file:evals/registry/data/ukraine_gec/ukraine_gec_fluency_calque.jsonl
38 name: ukraine_gec_fluency_calque__dev__v0
39- from: file:evals/registry/data/ukraine_gec/ukraine_gec_grammar_verbaform.jsonl
40 name: ukraine_gec_grammar_verbaform__dev__v0
41- from: file:evals/registry/data/ukraine_gec/ukraine_gec_grammar_ungrammaticalstructure.jsonl
42 name: ukraine_gec_grammar_ungrammaticalstructure__dev__v0
43- from: file:evals/registry/data/ukraine_gec/ukraine_gec_grammar_other.jsonl
44 name: ukraine_gec_grammar_other__dev__v0
45- from: file:evals/registry/data/ukraine_gec/ukraine_gec_fluency_style.jsonl
46 name: ukraine_gec_fluency_style__dev__v0
47- from: file:evals/registry/data/ukraine_gec/ukraine_gec_fluency_other.jsonl
48 name: ukraine_gec_fluency_other__dev__v0
49- from: file:evals/registry/data/ukraine_gec/ukraine_gec_grammar_conjunction.jsonl
50 name: ukraine_gec_grammar_conjunction__dev__v0
51- from: file:evals/registry/data/ukraine_gec/ukraine_gec_grammar_comparison.jsonl
52 name: ukraine_gec_grammar_comparison__dev__v0
53- from: file:evals/registry/data/ukraine_gec/ukraine_gec_grammar_tense.jsonl
54 name: ukraine_gec_grammar_tense__dev__v0
55- from: file:evals/registry/data/ukraine_gec/ukraine_gec_grammar_aspect.jsonl
56 name: ukraine_gec_grammar_aspect__dev__v0
57- from: file:evals/registry/data/irish_plural_nouns/samples.jsonl
58 name: irish_plural_nouns__dev__v0
59- from: file:evals/registry/data/shape_in_shape/shape_in_shape.jsonl
60 name: shape_in_shape__dev__v1
61- from: file:evals/registry/data/russian_sarcasm/samples.jsonl
62 name: russian_sarcasm__dev__v0
63- from: file:evals/registry/data/syllables_long_words/long_word_samples.jsonl
64 name: syllables__dev__v1
65- from: file:evals/registry/data/crepe/samples.jsonl
66 name: crepe__dev__v2
67- from: file:evals/registry/data/coq-proof-step/match.jsonl
68 name: coq_proof_step_match__dev__v0
69- from: file:evals/registry/data/ukraine_eit/samples.jsonl
70 name: ukraine_eit__val__v0
71- from: file:evals/registry/data/belarusian_proverbs/samples.jsonl
72 name: belarusian_proverbs__dev__v0
73- from: file:evals/registry/data/invoices/match.jsonl
74 name: invoices__dev__v0
75- from: file:evals/registry/data/urdu-lexicon/samples.jsonl
76 name: urdu_lexicon__dev__v0
77- from: file:evals/registry/data/qa/q_and_a.jsonl
78 name: qa__dev__v0
79- from: file:evals/registry/data/french-part-of-speech/samples.jsonl
80 name: french_part_of_speech__dev__v0
81- from: file:evals/registry/data/internal_representations/samples.jsonl
82 name: internal_representations__dev__v0
83- from: file:evals/registry/data/python_list_comprehension/samples.jsonl
84 name: python_list_comprehension__dev__v0
85- from: file:evals/registry/data/nepali_numerals/samples.jsonl
86 name: nepali_numerals__dev__v0
87- from: file:evals/registry/data/belarusian_syllable_count/samples.jsonl
88 name: belarusian_syllable_count__dev__v0
89- from: file:evals/registry/data/mandaliof-table/samples.jsonl
90 name: mandaliof_table__dev__v0
91- from: file:evals/registry/data/test_japanese_english_numerals/samples.jsonl
92 name: test_japanese_english_numerals__dev__v0
93- from: file:evals/registry/data/tracking-shuffled-objects/samples.jsonl
94 name: tracking_shuffled_objects__dev__v0
95- from: file:evals/registry/data/squares-gpt/square-samples.jsonl
96 name: squares_gpt__dev__v0
97- from: file:evals/registry/data/convert-hex-hsl-lightness/samples.jsonl
98 name: convert_hex_hsl_lightness__dev__v0
99- from: file:evals/registry/data/russe/samples.jsonl
100 name: russe__test__v0
101- from: file:evals/registry/data/aba_mrpc_true_false/samples.jsonl
102 name: aba_mrpc_true_false__dev__v0
103- from: file:evals/registry/data/logical_counting/samples.jsonl
104 name: logical_counting__dev__v0
105- from: file:evals/registry/data/vigenere/samples.jsonl
106 name: vigenere__s1__simple_v0
107- from: file:evals/registry/data/map-electronic-component-part-to-fact/samples.jsonl
108 name: map_electronic_component_part_to_fact__dev__v0
109- from: file:evals/registry/data/rare-and-loanwords-dutch-lexicon/samples.jsonl
110 name: rare_and_loanwords_dutch_lexicon__dev__v0
111- from: file:evals/registry/data/product-ie/fewshot/product_ie_one_shot_samples.jsonl
112 name: product_information_extraction_one_shot__dev__v0
113- from: file:evals/registry/data/product-ie/zeroshot/product_ie_zero_shot_samples.jsonl
114 name: product_information_extraction_zero_shot__dev__v0
115- from: file:evals/registry/data/sort_numeric/samples.jsonl
116 name: sort_numbers__s1__simple_v0
117- from: file:evals/registry/data/product-matching/zeroshot/samples.jsonl
118 name: match_product_matching_zeroshot__dev__v1
119- from: file:evals/registry/data/product-matching/fewshot/samples.jsonl
120 name: match_product_matching_fewshot__dev__v1
121- from: file:evals/registry/data/product-matching/rules/samples.jsonl
122 name: match_product_matching_rules__dev__v1
123- from: file:evals/registry/data/russian-lexicon/samples.jsonl
124 name: russian_lexicon__dev__v0
125- from: file:evals/registry/data/dutch-lexicon/samples.jsonl
126 name: dutch_lexicon__dev__v0
127- from: file:evals/registry/data/greek_nt_manuscripts/codes-sigla-centuries.jsonl
128 name: greek_nt_manuscripts__v0
129- from: file:evals/registry/data/matrix_mult_rows/samples.jsonl
130 name: matrix_mult_rows__dev__v0
131- from: file:evals/registry/data/moral_exceptQA/samples.jsonl
132 name: moral_exceptqa__test__v1
133- from: file:evals/registry/data/music-theory/triads-samples.jsonl
134 name: music_theory_triads_identification__dev__v0
135- from: file:evals/registry/data/music-theory/tetrads-samples.jsonl
136 name: music_theory_tetrads_identification__dev__v0
137- from: file:evals/registry/data/find-thirukkural/samples.jsonl
138 name: find_thirukkural__dev__v0
139- from: file:evals/registry/data/building_floorplan/samples.jsonl
140 name: building_floorplan__test__v1
141- from: file:evals/registry/data/japanese-national-medical-exam01/japanese-national-medical-exam01.jsonl
142 name: japanese_national_medical_exam01__dev__v0
143- from: file:evals/registry/data/lat_long_identify/samples.jsonl
144 name: lat_long_identify__dev__v0
145- from: file:evals/registry/data/norwegian-lexicon/samples.jsonl
146 name: norwegian_lexicon__dev__v0
147- from: file:evals/registry/data/german-part-of-speech/samples.jsonl
148 name: german_part_of_speech__dev__v0
149- from: file:evals/registry/data/swedish_sat/samples.jsonl
150 name: swedish_sat__dev__v0
151- from: file:evals/registry/data/utility_price_parsing/samples.jsonl
152 name: utility_price_parsing__dev__v0
153- from: file:evals/registry/data/korean-consonant-vowel-combination/samples.jsonl
154 name: korean_consonant_vowel_combination__dev__v0
155- from: file:evals/registry/data/mate-in-one/samples.jsonl
156 name: mate_in_one__dev__v0
157- from: file:evals/registry/data/french-lexicon/samples.jsonl
158 name: french_lexicon__dev__v0
159- from: file:evals/registry/data/swedish-spelling/samples.jsonl
160 name: swedish_spelling__dev__v0
161- from: file:evals/registry/data/knot-theory/knot-theory-unknotting-numbers.jsonl
162 name: knot_theory_unknotting_number__dev__v0
163- from: file:evals/registry/data/knot-theory/knot-theory-unknotting-problems.jsonl
164 name: knot_theory_unknotting_problem__dev__v0
165- from: file:evals/registry/data/knot-theory/knot-theory-code-conversions.jsonl
166 name: knot_theory_code_conversion__dev__v0
167- from: file:evals/registry/data/hindi_words/samples.jsonl
168 name: hindi_words__dev__v0
169- from: file:evals/registry/data/arithmetical_puzzles/arithmetical_puzzles.jsonl
170 name: arithmetical_puzzles__dev__v0
171- from: file:evals/registry/data/belarusian_antonyms/samples.jsonl
172 name: belarusian_antonyms__dev__v0
173- from: file:evals/registry/data/body_movement/body_movement.jsonl
174 name: body_movement__dev__zero_shot_v0
175- from: file:evals/registry/data/afrikaans-lexicon/samples.jsonl
176 name: afrikaans_lexicon__dev__v0
177- from: file:evals/registry/data/cricket_situations/samples.jsonl
178 name: cricket_situations__dev__v0
179- from: file:evals/registry/data/korean_spelling/samples.jsonl
180 name: korean_spelling__dev__v0
181- from: file:evals/registry/data/rucola/samples.jsonl
182 name: rucola__test__v0
183- from: file:evals/registry/data/logiqa-logical-reasoning-plus/reclor-logical-reasoning-plus.jsonl
184 name: reclor_logical_reasoning_plus__dev__v0
185- from: file:evals/registry/data/logiqa-logical-reasoning-plus/logiqav2-logical-reasoning-plus.jsonl
186 name: logiqav2_logical_reasoning_plus__dev__v0
187- from: file:evals/registry/data/logiqa-logical-reasoning-plus/logiqa-logical-reasoning-plus.jsonl
188 name: logiqa_logical_reasoning_plus__dev__v0
189- from: file:evals/registry/data/medmcqa/samples.jsonl
190 name: medmcqa__dev__v0
191- from: file:evals/registry/data/multi-step-equations/samples.jsonl
192 name: multi_step_equations__dev__v0
193- from: file:evals/registry/data/islands/japanese_remote_island_to_prefecture.jsonl
194 name: japanese_remote_island_to_prefecture__dev__v0
195- from: file:evals/registry/data/chinese_homonym/samples.jsonl
196 name: chinese_homonym__dev__v0
197- from: file:evals/registry/data/norwegian-rhymes/samples.jsonl
198 name: norwegian_rhymes__dev__v0
199- from: file:evals/registry/data/chinese_shi_jing/samples.jsonl
200 name: chinese_shi_jing__test__v1
201- from: file:evals/registry/data/forth_stack_sim/basic_samples.jsonl
202 name: forth_stack_sim_basic__dev__v0
203- from: file:evals/registry/data/forth_stack_sim/samples.jsonl
204 name: forth_stack_sim__dev__v0
205- from: file:evals/registry/data/forth_stack_sim/detailed_samples.jsonl
206 name: forth_stack_sim_detailed__dev__v0
207- from: file:evals/registry/data/japanese_city_name_pronunciation/samples.jsonl
208 name: japanese_city_name_pronunciation__dev__v0
209- from: file:evals/registry/data/escher_sentences/samples.jsonl
210 name: escher_sentences__dev__v0
211- from: file:evals/registry/data/track_objects/samples.jsonl
212 name: track_objects__dev__v0
213- from: file:evals/registry/data/shopping_discount_comparison/samples.jsonl
214 name: shopping_discount_comparison__dev__v0
215- from: file:evals/registry/data/test_comp_sci/questions.jsonl
216 name: computer_science_problems__s1__simple_v0
217- from: file:evals/registry/data/mendelian_inheritance/samples.jsonl
218 name: mendelian_inheritance__dev__v0
219- from: file:evals/registry/data/override-system-instruction/samples.jsonl
220 name: override_system_instruction__dev__v0
221- from: file:evals/registry/data/poker_hand_ranks/full_samples.jsonl
222 name: hand_ranks__test__v1
223- from: file:evals/registry/data/diabetes/samples.jsonl
224 name: diabetes__dev__v0
225- from: file:evals/registry/data/job_listing_title_for_a_caregiver_in_japan/samples.jsonl
226 name: job_listing_title_for_a_caregiver_in_japan__test__v1
227- from: file:evals/registry/data/poker_analysis/samples.jsonl
228 name: poker_analysis__test__v1
229- from: file:evals/registry/data/belarusian_numerals/samples.jsonl
230 name: belarusian_numerals__dev__v0
231- from: file:evals/registry/data/algebra_word_problems/samples.jsonl
232 name: algebra_word_problems__s1__simple_v0
233- from: file:evals/registry/data/belarusian_grammar/samples.jsonl
234 name: belarusian_grammar__dev__v0
235- from: file:evals/registry/data/svg_understanding/samples.jsonl
236 name: svg_understanding__v0
237- from: file:evals/registry/data/cissp-study-questions/samples.jsonl
238 name: cissp_study_questions__test__v1
239- from: file:evals/registry/data/linear_equations/samples.jsonl
240 name: linear_equations__dev__v0
241- from: file:evals/registry/data/japanese_driving_license/samples.jsonl
242 name: japanese_driving_license__s1__simple_v0
243- from: file:evals/registry/data/first-letters/samples.jsonl
244 name: first_letters__dev__v0
245- from: file:evals/registry/data/arc/samples.jsonl
246 name: arc__dev__v0
247- from: file:evals/registry/data/css-selectors/verbal.jsonl
248 name: css_selectors_verbal__dev__v0
249- from: file:evals/registry/data/japanese-itpassport-exam01/japanese-itpassport-exam01.jsonl
250 name: japanese_itpassport_exam01__dev__v0
251- from: file:evals/registry/data/logiqa/logiqa.jsonl
252 name: logiqa__dev__v0
253- from: file:evals/registry/data/chinese_zodiac/samples.jsonl
254 name: chinese_zodiac__dev__v0
255- from: file:evals/registry/data/spanish-lexicon/samples.jsonl
256 name: spanish_lexicon__dev__v0
257- from: file:evals/registry/data/food/samples.jsonl
258 name: food__test__v1
259- from: file:evals/registry/data/countries/samples.jsonl
260 name: countries__dev__v0
261- from: file:evals/registry/data/which_is_heavier/which_is_heavier.jsonl
262 name: which_is_heavier__dev__v0
263- from: file:evals/registry/data/korean_date_counting/samples.jsonl
264 name: korean_date_counting__dev__v0
265- from: file:evals/registry/data/fcc_amateur_extra/samples.jsonl
266 name: fcc_amateur_extra__dev__v0
267- from: file:evals/registry/data/multistep-word-problems/samples.jsonl
268 name: multistep_word_problems__dev__v0
269- from: file:evals/registry/data/list_comparison_missing_name/samples.jsonl
270 name: list_comparison_missing_name__dev__v0
271- from: file:evals/registry/data/newsology/samples.jsonl
272 name: newsology__dev__v0
273- from: file:evals/registry/data/simple-visual-understanding/simple-visual-understanding.jsonl
274 name: simple_visual_understanding__dev__v0
275- from: file:evals/registry/data/portuguese-syllable-count/samples.jsonl
276 name: portuguese_syllable_count__dev__v0
277- from: file:evals/registry/data/south-african-bands/south-african-bands.jsonl
278 name: south_african_bands__dev__v0
279- from: file:evals/registry/data/hebrew_plurals/samples.jsonl
280 name: hebrew_plurals__dev__v0
281- from: file:evals/registry/data/rot13/rot13.jsonl
282 name: rot13__s1__simple_v0
283- from: file:evals/registry/data/korean_dialects/samples.jsonl
284 name: korean_dialects__dev__v0
285- from: file:evals/registry/data/test_time_zone_conversion/samples.v0.jsonl
286 name: test_time_zone_conversion__dev__v0
287- from: file:evals/registry/data/music_theory/music_theory_chord_notes.jsonl
288 name: music_theory_chord_notes__dev__v0
289- from: file:evals/registry/data/russian-english-homonym-context-resolution/samples.jsonl
290 name: russian_english_homonym_context_resolution__dev__v0
291- from: file:evals/registry/data/number_reading/number_reading.jsonl
292 name: number_reading__dev__v0
293- from: file:evals/registry/data/simple-knowledge-mongolian/samples.v0.jsonl
294 name: simple_knowledge_mongolian__dev__v0
295- from: file:evals/registry/data/base64_decode/base64_decode.jsonl
296 name: base64_decode_simple__dev__v0
297- from: file:evals/registry/data/urdu-transliteration/samples.jsonl
298 name: urdu_transliteration__dev__v0
299- from: file:evals/registry/data/reverse-polish-notation/questions.jsonl
300 name: reverse_polish_notation__dev__v0
301- from: file:evals/registry/data/music_theory/music_theory_chord_names.jsonl
302 name: music_theory_chord_names__dev__v0
303- from: file:evals/registry/data/born_first/born_first.jsonl
304 name: born_first__dev__v0
305- from: file:evals/registry/data/tetris/tetris.jsonl
306 name: tetris__dev__v0
307- from: file:evals/registry/data/pure_korean/samples.jsonl
308 name: pure_korean__dev__v0
309- from: file:evals/registry/data/determinant/samples.jsonl
310 name: determinant__test__v1
311- from: file:evals/registry/data/split_chinese_characters/samples.jsonl
312 name: split_chinese_characters__dev__v0
313- from: file:evals/registry/data/syntax-check/samples.jsonl
314 name: syntax_check__dev__v1
315- from: file:evals/registry/data/balance_chemical_equation/samples.jsonl
316 name: balance_chemical_equation__dev__v0
317- from: file:evals/registry/data/emotional-intelligence/samples.jsonl
318 name: emotional_intelligence__dev__v0
319- from: file:evals/registry/data/nutrition/facts.jsonl
320 name: nutrition__dev__v0
321- from: file:evals/registry/data/reverse-sort-words-eng/samples.jsonl
322 name: reverse_sort_words_eng_simple__dev__v0
323- from: file:evals/registry/data/day-of-week-from-date/samples.jsonl
324 name: day_of_week_from_date__dev__v0
325- from: file:evals/registry/data/regex-match/samples.jsonl
326 name: regex__match__dev__v0
327- from: file:evals/registry/data/find-letter/samples.jsonl
328 name: find_letter__dev__v0
329- from: file:evals/registry/data/korean_foreign_words/samples.jsonl
330 name: korean_foreign_words__dev__v0
331- from: file:evals/registry/data/greek_vocabulary/samples.jsonl
332 name: greek_vocabulary__dev__v0
333- from: file:evals/registry/data/rubiks-colors/samples.jsonl
334 name: rubiks_colors__dev__v0
335- from: file:evals/registry/data/decrypt_caesar_cipher/samples.jsonl
336 name: decrypt_caesar_cipher__dev__v0
337- from: file:evals/registry/data/us_tort_law/samples.jsonl
338 name: us_tort_law__dev__v0
339- from: file:evals/registry/data/number_pattern/samples.jsonl
340 name: number_pattern__dev__v0
341- from: file:evals/registry/data/confusing_korean/samples.jsonl
342 name: confusing_korean__dev__v0
343- from: file:evals/registry/data/kanji-idioms/samples.jsonl
344 name: kanji_idioms__test__v0
345- from: file:evals/registry/data/missing_operators/samples.jsonl
346 name: missing_operators__s1__simple_v0
347- from: file:evals/registry/data/unsolvable_questions/samples.jsonl
348 name: unsolvable_questions__dev__v0
349- from: file:evals/registry/data/portuguese-sarcasm/samples.jsonl
350 name: portuguese_sarcasm__dev__v0
351- from: file:evals/registry/data/swap-words/samples.jsonl
352 name: swap_words__dev__v0
353- from: file:evals/registry/data/hebrew_same_noun_gender/samples.jsonl
354 name: hebrew_same_noun_gender__v0
355- from: file:evals/registry/data/heart-disease/samples.jsonl
356 name: heart_disease__v0
357- from: file:evals/registry/data/last_word_nth/samples.jsonl
358 name: last_word_nth__s1__simple_v0
359- from: file:evals/registry/data/ascii_wordart/ascii_wordart.jsonl
360 name: ascii_wordart__dev__v0
361- from: file:evals/registry/data/direct-speech-tag/samples.jsonl
362 name: direct_speech_tag__dev__v0
363- from: file:evals/registry/data/italian-new-words/samples.jsonl
364 name: italian_new_words__dev__v0
365- from: file:evals/registry/data/irony/samples.jsonl
366 name: irony__dev__v0
367- from: file:evals/registry/data/math_polish/samples.jsonl
368 name: math_polish__dev__v0
369- from: file:evals/registry/data/irish-lexicon/samples.jsonl
370 name: irish_lexicon__dev__v0
371- from: file:evals/registry/data/canto_wu_pronunciation/samples_zero.jsonl
372 name: canto_wu_pronunciation__dev__v0
373- from: file:evals/registry/data/irrelevant-negative-diversion/irrelevant-negative-diversion.jsonl
374 name: irrelevant_negative_diversion__dev__v0
375- from: file:evals/registry/data/invert_word_wise/invert.jsonl
376 name: invert_word_wise__dev__v0
377- from: file:evals/registry/data/imperial_date_to_string/samples.jsonl
378 name: imperial_date_to_string__dev__v0
379- from: file:evals/registry/data/gujarati_numerals/samples.jsonl
380 name: gujarati_numerals__dev__v0
381- from: file:evals/registry/data/count_token_freq_dna/samples.jsonl
382 name: count_token_freq_dna__dev__v0
383- from: file:evals/registry/data/french_homonym_and_homograph/samples.jsonl
384 name: french_homonym_and_homograph__dev__v0
385- from: file:evals/registry/data/cube-pack/samples.jsonl
386 name: cube_pack__dev__v0
387- from: file:evals/registry/data/historical-kana-orthography-reading/samples.jsonl
388 name: historical_kana_orthography_reading__dev__v0
389- from: file:evals/registry/data/canto_wu_pronunciation/samples_few.jsonl
390 name: canto_wu_pronunciation_fewshot__dev__v0
391- from: file:evals/registry/data/accounting_audit/samples.jsonl
392 name: accounting_audit__dev__v0
393- from: file:evals/registry/data/brazilian-lexicon/samples.jsonl
394 name: brazilian_lexicon__dev__v0
395- from: file:evals/registry/data/naughty_strings/samples.jsonl
396 name: naughty_strings__test__v1
397- from: file:evals/registry/data/korean-phonetics/samples.jsonl
398 name: korean_phonetics__dev__v0
399- from: file:evals/registry/data/chinese_homophonic/chinese_homophonic.jsonl
400 name: chinese_homophonic__dev__v0
401- from: file:evals/registry/data/count_intersections_polynomial/samples.jsonl
402 name: count_intersections_polynomial__dev__v0
403- from: file:evals/registry/data/coqa/match.jsonl
404 name: coqa_match__dev__v0
405- from: file:evals/registry/data/latin_grammar/samples.jsonl
406 name: latin_grammar__dev__v0
407- from: file:evals/registry/data/bitwise/samples.jsonl
408 name: bitwise__dev__v0
409- from: file:evals/registry/data/shared_border/samples.jsonl
410 name: shared_borders__dev__v0
411- from: file:evals/registry/data/japanese-station/samples.jsonl
412 name: japanese_station__dev__v0
413- from: file:evals/registry/data/atpl_exams/samples.jsonl
414 name: atpl_exams__dev__v0
415- from: file:evals/registry/data/invoice_due_date_leap_day_adjustment/samples.jsonl
416 name: invoice_due_date_leap_day_adjustment__dev__v0
417- from: file:evals/registry/data/romanian_homonyms/samples.jsonl
418 name: romanian_homonyms__dev__v0
419- from: file:evals/registry/data/infiniteloop-match/infiniteloop-match.jsonl
420 name: infiniteloop_match__s1__simple_v0
421- from: file:evals/registry/data/russian-nlp-tasks/samples.jsonl
422 name: russian_nlp_tasks__dev__v0
423- from: file:evals/registry/data/chinese_chu_ci/samples.jsonl
424 name: chinese_chu_ci__dev__v0
425- from: file:evals/registry/data/polish-syllable-count/samples.jsonl
426 name: polish_syllable_count__val__v0
427- from: file:evals/registry/data/korean-postposition/samples.jsonl
428 name: korean_postposition__dev__v0
429- from: file:evals/registry/data/bulgarian-lexicon/samples.jsonl
430 name: bulgarian_lexicon__dev__v0
431- from: file:evals/registry/data/compare-countries-area/samples.jsonl
432 name: compare_countries_area__dev__v0
433- from: file:evals/registry/data/pattern_identification/samples.v0.jsonl
434 name: pattern_identification__dev__v0
435- from: file:evals/registry/data/belarusian_synonyms/samples.jsonl
436 name: belarusian_synonyms__dev__v0
437- from: file:evals/registry/data/spanish_feminine_noun_masculine_article/samples.jsonl
438 name: spanish_feminine_noun_masculine_article__dev__v0
439- from: file:evals/registry/data/sarcasm/samples.jsonl
440 name: sarcasm__test__v1
441- from: file:evals/registry/data/chinese_tang_poetries/sample.jsonl
442 name: chinese_tang_poetries__dev__match_v1
443- from: file:evals/registry/data/japanese_number_reading/japanese_number_reading.jsonl
444 name: japanese_number_reading__dev__v0
445- from: file:evals/registry/data/korean-honorific/samples.jsonl
446 name: korean_honorific__dev__v0
447- from: file:evals/registry/data/complex_replace_characters/samples.jsonl
448 name: complex_replace_characters__dev__v0
449- from: file:evals/registry/data/dice-rotation-sequence/samples.jsonl
450 name: dice_rotation_sequence__dev__v0
451- from: file:evals/registry/data/utah_real_estate/samples.jsonl
452 name: utah_real_estate__dev__v0
453- from: file:evals/registry/data/formal_logic/formal_logic_expressions.jsonl
454 name: formal_logic__dev__v0
455- from: file:evals/registry/data/resistor_ohm_calculator/samples.jsonl
456 name: resistor_ohm_calculator__dev__simple_v0
457- from: file:evals/registry/data/GOL/samples.jsonl
458 name: gol__dev__v1
459- from: file:evals/registry/data/icelandic-sentences-gec/samples.jsonl
460 name: icelandic_sentences_gec__dev__v0
461- from: file:evals/registry/data/chinese_modern_poem_identification/samples.jsonl
462 name: chinese_modern_poem_identification__test__v1
463- from: file:evals/registry/data/reverse_string/reverse_string.jsonl
464 name: reverse_string__s1__simple_v0
465- from: file:evals/registry/data/complex-analogies-en-ru/samples.jsonl
466 name: complex_analogies_en_ru__dev__v0
467- from: file:evals/registry/data/positive-binary-operations/samples.jsonl
468 name: positive_binary_operations__test__v1
469- from: file:evals/registry/data/hindi_shuddha/samples.jsonl
470 name: hindi_shuddha__dev__v0
471- from: file:evals/registry/data/tokyo-station-number/samples.jsonl
472 name: tokyo_station_number__dev__v0
473- from: file:evals/registry/data/chinese_famous_novel/samples.jsonl
474 name: chinese_famous_novel__dev__v0
475- from: file:evals/registry/data/diagrammatic_logic/samples.jsonl
476 name: diagrammatic_logic__dev__v2
477- from: file:evals/registry/data/polish-lexicon/samples.jsonl
478 name: polish_lexicon__dev__v0
479- from: file:evals/registry/data/wkt_understanding/samples.jsonl
480 name: wkt_understanding__dev__v0
481- from: file:evals/registry/data/japanese-national-medical-exam02/japanese-national-medical-exam02.jsonl
482 name: japanese_national_medical_exam02__dev__v0
483- from: file:evals/registry/data/cardinal-directions/samples.jsonl
484 name: cardinal_directions__dev__v0
485- from: file:evals/registry/data/rectangles/samples.jsonl
486 name: rectangles__dev__v0
487- from: file:evals/registry/data/hindi_upsc/samples.jsonl
488 name: hindi_upsc__dev__v0
489- from: file:evals/registry/data/three-pt-mapping/three_pt_mapping.jsonl
490 name: three_pt_mapping__dev__v0
491- from: file:evals/registry/data/polish-proverbs/samples.jsonl
492 name: polish_proverbs__dev__v0
493- from: file:evals/registry/data/indonesian_numbers/indonesian_numbers.jsonl
494 name: indonesian_numbers__dev__v0
495- from: file:evals/registry/data/chinese_song_ci/samples.jsonl
496 name: chinese_song_ci__dev__v0
497- from: file:evals/registry/data/cybersecurity/filepaths.jsonl
498 name: cybersecurity_filepaths__dev__v0
499- from: file:evals/registry/data/taxes/samples.jsonl
500 name: taxes__dev__v0
501- from: file:evals/registry/data/crontab/samples.jsonl
502 name: crontab__dev__v0
503- from: file:evals/registry/data/integer-sequence-predictions/misc-and-recent-sequences.jsonl
504 name: integer_sequence_predictions_misc__dev__v0
505- from: file:evals/registry/data/integer-sequence-predictions/obscure-sequences.jsonl
506 name: integer_sequence_predictions_obscure__dev__v0
507- from: file:evals/registry/data/integer-sequence-predictions/notable-sequences.jsonl
508 name: integer_sequence_predictions_notable__dev__v0
509- from: file:evals/registry/data/integer-sequence-predictions/samples.jsonl
510 name: integer_sequence_predictions__dev__v0
511- from: file:evals/registry/data/belarusian_orthography/samples.jsonl
512 name: belarusian_orthography__dev__v0
513- from: file:evals/registry/data/date-booking/samples.jsonl
514 name: date_booking__dev__v0
515- from: file:evals/registry/data/interlingual-homograph/samples.jsonl
516 name: interlingual_homograph__dev__v0
517- from: file:evals/registry/data/stats-tests/samples.jsonl
518 name: stats_tests__dev__v0
519- from: file:evals/registry/data/belarusian_russian_translation/samples.jsonl
520 name: belarusian_russian_translation__dev__v0
521- from: file:evals/registry/data/date-calculator/samples.jsonl
522 name: date_calculator__test__v1
523- from: file:evals/registry/data/chinese_poem/samples.jsonl
524 name: chinese_poem__dev__v0
525- from: file:evals/registry/data/belarusian_lexicon/samples.jsonl
526 name: belarusian_lexicon__dev__v0
527- from: file:evals/registry/data/test_english_pronunciations/samples.jsonl
528 name: test_english_pronunciations__dev__v0
529- from: file:evals/registry/data/anagrams/samples.jsonl
530 name: anagrams__test__v1
531- from: file:evals/registry/data/guess_the_singer/samples.jsonl
532 name: guess_the_singer__dev__v0
533- from: file:evals/registry/data/illinois-law/samples.jsonl
534 name: illinois_law__v0
535- from: file:evals/registry/data/russian_medical/samples.jsonl
536 name: russian_medical__dev__v0
537- from: file:evals/registry/data/bigrams/samples.jsonl
538 name: bigrams__dev__v0
539- from: file:evals/registry/data/probability_questions/probability_questions.jsonl
540 name: probability_questions__dev__v0
541- from: file:evals/registry/data/vintage_phone_keyboard_decode/samples.jsonl
542 name: vintage_phone_keyboard_decode__dev__v0
543- from: file:evals/registry/data/connect4/samples.jsonl
544 name: connect4__s1__v1
545- from: file:evals/registry/data/stock_options/stock_options_bull_call_spread.jsonl
546 name: stock_options_bull_call_spread__dev__v0
547- from: file:evals/registry/data/stock_options/stock_options_bear_call_spread.jsonl
548 name: stock_options_bear_call_spread__dev__v0
549- from: file:evals/registry/data/stock_options/stock_option_terms_bear_call_spread.jsonl
550 name: stock_option_terms_bear_call_spread__dev__v0
551- from: file:evals/registry/data/stock_options/stock_option_terms_iron_butterfly_spread.jsonl
552 name: stock_option_terms_iron_butterfly_spread__dev__v0
553- from: file:evals/registry/data/stock_options/stock_option_terms_bull_call_spread.jsonl
554 name: stock_option_terms_bull_call_spread__dev__v0
555- from: file:evals/registry/data/stock_options/stock_options_inverse_iron_condor_spread.jsonl
556 name: stock_options_inverse_iron_condor_spread__dev__v0
557- from: file:evals/registry/data/stock_options/stock_options_iron_condor_spread.jsonl
558 name: stock_options_iron_condor_spread__dev__v0
559- from: file:evals/registry/data/stock_options/stock_option_terms_iron_condor_spread.jsonl
560 name: stock_option_terms_iron_condor_spread__dev__v0
561- from: file:evals/registry/data/stock_options/stock_options_inverse_iron_butterfly_spread.jsonl
562 name: stock_options_inverse_iron_butterfly_spread__dev__v0
563- from: file:evals/registry/data/stock_options/stock_option_terms_inverse_iron_condor_spread.jsonl
564 name: stock_option_terms_inverse_iron_condor_spread__dev__v0
565- from: file:evals/registry/data/japanese_romantic_context/samples.jsonl
566 name: japanese_romantic_context__dev__v0
567- from: file:evals/registry/data/phonetics-identify-words-needing-missing-gpcs/samples.jsonl
568 name: phonetics_identify_words_needing_missing_gpcs__s1__simple_v0
569- from: file:evals/registry/data/prompt-injection/samples.jsonl
570 name: prompt_injection__dev__v0
571- from: file:evals/registry/data/word_vector_over_reliance/word_vector_over_reliance_samples.jsonl
572 name: word_vector_over_reliance__dev__simple_v0
573- from: file:evals/registry/data/lunar_calendar/iso_to_lunar_calendar.jsonl
574 name: iso_to_lunar_calendar__dev__v0
575- from: file:evals/registry/data/lunar_calendar/lunar_calendar_to_iso.jsonl
576 name: lunar_calendar_to_iso__dev__v0
577- from: file:evals/registry/data/code_combination/samples.jsonl
578 name: code_combination__dev__v0
579- from: file:evals/registry/data/partially_solved_crossword_clues/samples.jsonl
580 name: partially_solved_crossword_clues__dev__v0
581- from: file:evals/registry/data/quartz/samples.jsonl
582 name: quartz__test__v1
583- from: file:evals/registry/data/physics-interaction/samples.jsonl
584 name: physics__interaction__dev__v0
585- from: file:evals/registry/data/next-val-series/next-val-series.jsonl
586 name: next_val_series__dev__simple_v0
587evals:
588- name: actors-sequence
589 dataset: actors_sequence__dev__match_v1
590 scorers:
591 - match
592- name: adultery_state_laws
593 dataset: adultery_state_laws__dev__v0
594 scorers:
595 - match
596- name: proofreader
597 dataset: proofreader__dev__v0
598 scorers:
599 - match
600- name: rock-climbing
601 dataset: rock_climbing__dev__v0
602 scorers:
603 - match
604- name: match_banking77
605 dataset: match_banking77__test__v1
606 scorers:
607 - match
608- name: ukraine-gec-grammar-prep
609 dataset: ukraine_gec_grammar_prep__dev__v0
610 scorers:
611 - match
612- name: ukraine-gec-grammar-case
613 dataset: ukraine_gec_grammar_case__dev__v0
614 scorers:
615 - match
616- name: ukraine-gec-grammar-gender
617 dataset: ukraine_gec_grammar_gender__dev__v0
618 scorers:
619 - match
620- name: ukraine-gec-grammar-partvoice
621 dataset: ukraine_gec_grammar_partvoice__dev__v0
622 scorers:
623 - match
624- name: ukraine-gec-fluency-poorflow
625 dataset: ukraine_gec_fluency_poorflow__dev__v0
626 scorers:
627 - match
628- name: ukraine-gec-grammar-verbvoice
629 dataset: ukraine_gec_grammar_verbvoice__dev__v0
630 scorers:
631 - match
632- name: ukraine-gec-grammar-number
633 dataset: ukraine_gec_grammar_number__dev__v0
634 scorers:
635 - match
636- name: ukraine-gec-fluency-repetition
637 dataset: ukraine_gec_fluency_repetition__dev__v0
638 scorers:
639 - match
640- name: ukraine-gec-fluency-calque
641 dataset: ukraine_gec_fluency_calque__dev__v0
642 scorers:
643 - match
644- name: ukraine-gec-grammar-verbaform
645 dataset: ukraine_gec_grammar_verbaform__dev__v0
646 scorers:
647 - match
648- name: ukraine-gec-grammar-ungrammaticalstructure
649 dataset: ukraine_gec_grammar_ungrammaticalstructure__dev__v0
650 scorers:
651 - match
652- name: ukraine-gec-grammar-other
653 dataset: ukraine_gec_grammar_other__dev__v0
654 scorers:
655 - match
656- name: ukraine-gec-fluency-style
657 dataset: ukraine_gec_fluency_style__dev__v0
658 scorers:
659 - match
660- name: ukraine-gec-fluency-other
661 dataset: ukraine_gec_fluency_other__dev__v0
662 scorers:
663 - match
664- name: ukraine-gec-grammar-conjunction
665 dataset: ukraine_gec_grammar_conjunction__dev__v0
666 scorers:
667 - match
668- name: ukraine-gec-grammar-comparison
669 dataset: ukraine_gec_grammar_comparison__dev__v0
670 scorers:
671 - match
672- name: ukraine-gec-grammar-tense
673 dataset: ukraine_gec_grammar_tense__dev__v0
674 scorers:
675 - match
676- name: ukraine-gec-grammar-aspect
677 dataset: ukraine_gec_grammar_aspect__dev__v0
678 scorers:
679 - match
680- name: irish-plural-nouns
681 dataset: irish_plural_nouns__dev__v0
682 scorers:
683 - match
684- name: shape-in-shape
685 dataset: shape_in_shape__dev__v1
686 scorers:
687 - match
688- name: russian_sarcasm
689 dataset: russian_sarcasm__dev__v0
690 scorers:
691 - match
692- name: syllables_long_words
693 dataset: syllables__dev__v1
694 scorers:
695 - match
696- name: crepe
697 dataset: crepe__dev__v2
698 scorers:
699 - match
700- name: coq-proof-step-match
701 dataset: coq_proof_step_match__dev__v0
702 scorers:
703 - match
704- name: ukraine-eit
705 dataset: ukraine_eit__val__v0
706 scorers:
707 - match
708- name: belarusian-proverbs
709 dataset: belarusian_proverbs__dev__v0
710 scorers:
711 - match
712- name: invoices
713 dataset: invoices__dev__v0
714 scorers:
715 - match
716- name: urdu-lexicon
717 dataset: urdu_lexicon__dev__v0
718 scorers:
719 - match
720- name: qa
721 dataset: qa__dev__v0
722 scorers:
723 - match
724- name: french-part-of-speech
725 dataset: french_part_of_speech__dev__v0
726 scorers:
727 - match
728- name: internal_representations
729 dataset: internal_representations__dev__v0
730 scorers:
731 - match
732- name: python_list_comprehension
733 dataset: python_list_comprehension__dev__v0
734 scorers:
735 - match
736- name: nepali-numerals
737 dataset: nepali_numerals__dev__v0
738 scorers:
739 - match
740- name: belarusian-syllable-count
741 dataset: belarusian_syllable_count__dev__v0
742 scorers:
743 - match
744- name: mandaliof-table
745 dataset: mandaliof_table__dev__v0
746 scorers:
747 - match
748- name: test_japanese_english_numerals
749 dataset: test_japanese_english_numerals__dev__v0
750 scorers:
751 - match
752- name: tracking-shuffled-objects
753 dataset: tracking_shuffled_objects__dev__v0
754 scorers:
755 - match
756- name: squares-gpt
757 dataset: squares_gpt__dev__v0
758 scorers:
759 - match
760- name: convert-hex-hsl-lightness
761 dataset: convert_hex_hsl_lightness__dev__v0
762 scorers:
763 - match
764- name: russe
765 dataset: russe__test__v0
766 scorers:
767 - match
768- name: aba_mrpc_true_false
769 dataset: aba_mrpc_true_false__dev__v0
770 scorers:
771 - match
772- name: logical_counting
773 dataset: logical_counting__dev__v0
774 scorers:
775 - match
776- name: vigenere
777 dataset: vigenere__s1__simple_v0
778 scorers:
779 - match
780- name: map-electronic-component-part-to-fact
781 dataset: map_electronic_component_part_to_fact__dev__v0
782 scorers:
783 - match
784- name: rare-and-loanwords-dutch-lexicon
785 dataset: rare_and_loanwords_dutch_lexicon__dev__v0
786 scorers:
787 - match
788- name: product_information_extraction_one_shot
789 dataset: product_information_extraction_one_shot__dev__v0
790 scorers:
791 - match
792- name: product_information_extraction_zero_shot
793 dataset: product_information_extraction_zero_shot__dev__v0
794 scorers:
795 - match
796- name: sort-numbers
797 dataset: sort_numbers__s1__simple_v0
798 scorers:
799 - match
800- name: match_product-matching_zeroshot
801 dataset: match_product_matching_zeroshot__dev__v1
802 scorers:
803 - match
804- name: match_product-matching_fewshot
805 dataset: match_product_matching_fewshot__dev__v1
806 scorers:
807 - match
808- name: match_product-matching_rules
809 dataset: match_product_matching_rules__dev__v1
810 scorers:
811 - match
812- name: russian-lexicon
813 dataset: russian_lexicon__dev__v0
814 scorers:
815 - match
816- name: dutch-lexicon
817 dataset: dutch_lexicon__dev__v0
818 scorers:
819 - match
820- name: greek-nt-manuscripts
821 dataset: greek_nt_manuscripts__v0
822 scorers:
823 - match
824- name: matrix_mult_rows
825 dataset: matrix_mult_rows__dev__v0
826 scorers:
827 - match
828- name: moral_exceptQA
829 dataset: moral_exceptqa__test__v1
830 scorers:
831 - match
832- name: music-theory-triads-identification
833 dataset: music_theory_triads_identification__dev__v0
834 scorers:
835 - match
836- name: music-theory-tetrads-identification
837 dataset: music_theory_tetrads_identification__dev__v0
838 scorers:
839 - match
840- name: find-thirukkural
841 dataset: find_thirukkural__dev__v0
842 scorers:
843 - match
844- name: building_floorplan
845 dataset: building_floorplan__test__v1
846 scorers:
847 - match
848- name: japanese-national-medical-exam01
849 dataset: japanese_national_medical_exam01__dev__v0
850 scorers:
851 - match
852- name: lat_long_identify
853 dataset: lat_long_identify__dev__v0
854 scorers:
855 - match
856- name: norwegian-lexicon
857 dataset: norwegian_lexicon__dev__v0
858 scorers:
859 - match
860- name: german-part-of-speech
861 dataset: german_part_of_speech__dev__v0
862 scorers:
863 - match
864- name: swedish_sat
865 dataset: swedish_sat__dev__v0
866 scorers:
867 - match
868- name: utility_price_parsing
869 dataset: utility_price_parsing__dev__v0
870 scorers:
871 - match
872- name: korean-consonant-vowel-combination
873 dataset: korean_consonant_vowel_combination__dev__v0
874 scorers:
875 - match
876- name: mate-in-one
877 dataset: mate_in_one__dev__v0
878 scorers:
879 - match
880- name: french-lexicon
881 dataset: french_lexicon__dev__v0
882 scorers:
883 - match
884- name: swedish-spelling
885 dataset: swedish_spelling__dev__v0
886 scorers:
887 - match
888- name: knot-theory-unknotting-number
889 dataset: knot_theory_unknotting_number__dev__v0
890 scorers:
891 - match
892- name: knot-theory-unknotting-problem
893 dataset: knot_theory_unknotting_problem__dev__v0
894 scorers:
895 - match
896- name: knot-theory-code-conversion
897 dataset: knot_theory_code_conversion__dev__v0
898 scorers:
899 - match
900- name: hindi_words
901 dataset: hindi_words__dev__v0
902 scorers:
903 - match
904- name: arithmetical_puzzles
905 dataset: arithmetical_puzzles__dev__v0
906 scorers:
907 - match
908- name: belarusian-antonyms
909 dataset: belarusian_antonyms__dev__v0
910 scorers:
911 - match
912- name: body-movement
913 dataset: body_movement__dev__zero_shot_v0
914 scorers:
915 - match
916- name: afrikaans-lexicon
917 dataset: afrikaans_lexicon__dev__v0
918 scorers:
919 - match
920- name: cricket_situations
921 dataset: cricket_situations__dev__v0
922 scorers:
923 - match
924- name: korean_spelling
925 dataset: korean_spelling__dev__v0
926 scorers:
927 - match
928- name: rucola
929 dataset: rucola__test__v0
930 scorers:
931 - match
932- name: reclor-logical-reasoning-plus
933 dataset: reclor_logical_reasoning_plus__dev__v0
934 scorers:
935 - match
936- name: logiqav2-logical-reasoning-plus
937 dataset: logiqav2_logical_reasoning_plus__dev__v0
938 scorers:
939 - match
940- name: logiqa-logical-reasoning-plus
941 dataset: logiqa_logical_reasoning_plus__dev__v0
942 scorers:
943 - match
944- name: medmcqa
945 dataset: medmcqa__dev__v0
946 scorers:
947 - match
948- name: multi-step-equations
949 dataset: multi_step_equations__dev__v0
950 scorers:
951 - match
952- name: japanese-remote-island-to-prefecture
953 dataset: japanese_remote_island_to_prefecture__dev__v0
954 scorers:
955 - match
956- name: chinese_homonym
957 dataset: chinese_homonym__dev__v0
958 scorers:
959 - match
960- name: norwegian-rhymes
961 dataset: norwegian_rhymes__dev__v0
962 scorers:
963 - match
964- name: chinese_shi_jing
965 dataset: chinese_shi_jing__test__v1
966 scorers:
967 - match
968- name: forth-stack-sim-basic
969 dataset: forth_stack_sim_basic__dev__v0
970 scorers:
971 - match
972- name: forth-stack-sim
973 dataset: forth_stack_sim__dev__v0
974 scorers:
975 - match
976- name: forth-stack-sim-detailed
977 dataset: forth_stack_sim_detailed__dev__v0
978 scorers:
979 - match
980- name: japanese_city_name_pronunciation
981 dataset: japanese_city_name_pronunciation__dev__v0
982 scorers:
983 - match
984- name: escher-sentences
985 dataset: escher_sentences__dev__v0
986 scorers:
987 - match
988- name: track_objects
989 dataset: track_objects__dev__v0
990 scorers:
991 - match
992- name: shopping_discount_comparison
993 dataset: shopping_discount_comparison__dev__v0
994 scorers:
995 - match
996- name: computer-science-problems
997 dataset: computer_science_problems__s1__simple_v0
998 scorers:
999 - match
1000- name: mendelian_inheritance
1001 dataset: mendelian_inheritance__dev__v0
1002 scorers:
1003 - match
1004- name: override-system-instruction
1005 dataset: override_system_instruction__dev__v0
1006 scorers:
1007 - match
1008- name: hand_ranks-match
1009 dataset: hand_ranks__test__v1
1010 scorers:
1011 - match
1012- name: diabetes
1013 dataset: diabetes__dev__v0
1014 scorers:
1015 - match
1016- name: job_listing_title_for_a_caregiver_in_japan
1017 dataset: job_listing_title_for_a_caregiver_in_japan__test__v1
1018 scorers:
1019 - match
1020- name: poker_analysis
1021 dataset: poker_analysis__test__v1
1022 scorers:
1023 - match
1024- name: belarusian-numerals
1025 dataset: belarusian_numerals__dev__v0
1026 scorers:
1027 - match
1028- name: algebra-word-problems
1029 dataset: algebra_word_problems__s1__simple_v0
1030 scorers:
1031 - match
1032- name: belarusian-grammar
1033 dataset: belarusian_grammar__dev__v0
1034 scorers:
1035 - match
1036- name: svg_understanding
1037 dataset: svg_understanding__v0
1038 scorers:
1039 - match
1040- name: cissp-study-questions
1041 dataset: cissp_study_questions__test__v1
1042 scorers:
1043 - match
1044- name: linear-equations
1045 dataset: linear_equations__dev__v0
1046 scorers:
1047 - match
1048- name: japanese_driving_license
1049 dataset: japanese_driving_license__s1__simple_v0
1050 scorers:
1051 - match
1052- name: first-letters
1053 dataset: first_letters__dev__v0
1054 scorers:
1055 - match
1056- name: arc
1057 dataset: arc__dev__v0
1058 scorers:
1059 - match
1060- name: css-selectors-verbal
1061 dataset: css_selectors_verbal__dev__v0
1062 scorers:
1063 - match
1064- name: japanese-itpassport-exam01
1065 dataset: japanese_itpassport_exam01__dev__v0
1066 scorers:
1067 - match
1068- name: logiqa
1069 dataset: logiqa__dev__v0
1070 scorers:
1071 - match
1072- name: chinese_zodiac
1073 dataset: chinese_zodiac__dev__v0
1074 scorers:
1075 - match
1076- name: spanish-lexicon
1077 dataset: spanish_lexicon__dev__v0
1078 scorers:
1079 - match
1080- name: food
1081 dataset: food__test__v1
1082 scorers:
1083 - match
1084- name: countries
1085 dataset: countries__dev__v0
1086 scorers:
1087 - match
1088- name: which-is-heavier
1089 dataset: which_is_heavier__dev__v0
1090 scorers:
1091 - match
1092- name: korean_date_counting
1093 dataset: korean_date_counting__dev__v0
1094 scorers:
1095 - match
1096- name: fcc_amateur_extra
1097 dataset: fcc_amateur_extra__dev__v0
1098 scorers:
1099 - match
1100- name: multistep-word-problems
1101 dataset: multistep_word_problems__dev__v0
1102 scorers:
1103 - match
1104- name: list_comparison_missing_name
1105 dataset: list_comparison_missing_name__dev__v0
1106 scorers:
1107 - match
1108- name: newsology
1109 dataset: newsology__dev__v0
1110 scorers:
1111 - match
1112- name: simple-visual-understanding
1113 dataset: simple_visual_understanding__dev__v0
1114 scorers:
1115 - match
1116- name: portuguese-syllable-count
1117 dataset: portuguese_syllable_count__dev__v0
1118 scorers:
1119 - match
1120- name: south-african-bands
1121 dataset: south_african_bands__dev__v0
1122 scorers:
1123 - match
1124- name: hebrew-plurals
1125 dataset: hebrew_plurals__dev__v0
1126 scorers:
1127 - match
1128- name: rot13
1129 dataset: rot13__s1__simple_v0
1130 scorers:
1131 - match
1132- name: korean_dialects
1133 dataset: korean_dialects__dev__v0
1134 scorers:
1135 - match
1136- name: test-time-zone-conversion
1137 dataset: test_time_zone_conversion__dev__v0
1138 scorers:
1139 - match
1140- name: music-theory-chord-notes
1141 dataset: music_theory_chord_notes__dev__v0
1142 scorers:
1143 - match
1144- name: russian-english-homonym-context-resolution
1145 dataset: russian_english_homonym_context_resolution__dev__v0
1146 scorers:
1147 - match
1148- name: number-reading
1149 dataset: number_reading__dev__v0
1150 scorers:
1151 - match
1152- name: simple-knowledge-mongolian
1153 dataset: simple_knowledge_mongolian__dev__v0
1154 scorers:
1155 - match
1156- name: base64-decode
1157 dataset: base64_decode_simple__dev__v0
1158 scorers:
1159 - match
1160- name: urdu-transliteration
1161 dataset: urdu_transliteration__dev__v0
1162 scorers:
1163 - match
1164- name: reverse-polish-notation
1165 dataset: reverse_polish_notation__dev__v0
1166 scorers:
1167 - match
1168- name: music-theory-chord-names
1169 dataset: music_theory_chord_names__dev__v0
1170 scorers:
1171 - match
1172- name: born-first
1173 dataset: born_first__dev__v0
1174 scorers:
1175 - match
1176- name: tetris
1177 dataset: tetris__dev__v0
1178 scorers:
1179 - match
1180- name: pure_korean
1181 dataset: pure_korean__dev__v0
1182 scorers:
1183 - match
1184- name: determinant
1185 dataset: determinant__test__v1
1186 scorers:
1187 - match
1188- name: split_chinese_characters
1189 dataset: split_chinese_characters__dev__v0
1190 scorers:
1191 - match
1192- name: syntax-check
1193 dataset: syntax_check__dev__v1
1194 scorers:
1195 - match
1196- name: balance-chemical-equation
1197 dataset: balance_chemical_equation__dev__v0
1198 scorers:
1199 - match
1200- name: emotional-intelligence
1201 dataset: emotional_intelligence__dev__v0
1202 scorers:
1203 - match
1204- name: nutrition
1205 dataset: nutrition__dev__v0
1206 scorers:
1207 - match
1208- name: reverse-sort-words-eng
1209 dataset: reverse_sort_words_eng_simple__dev__v0
1210 scorers:
1211 - match
1212- name: day-of-week-from-date
1213 dataset: day_of_week_from_date__dev__v0
1214 scorers:
1215 - match
1216- name: regex-match
1217 dataset: regex__match__dev__v0
1218 scorers:
1219 - match
1220- name: find-letter
1221 dataset: find_letter__dev__v0
1222 scorers:
1223 - match
1224- name: korean_foreign_words
1225 dataset: korean_foreign_words__dev__v0
1226 scorers:
1227 - match
1228- name: greek-vocabulary
1229 dataset: greek_vocabulary__dev__v0
1230 scorers:
1231 - match
1232- name: rubiks-colors
1233 dataset: rubiks_colors__dev__v0
1234 scorers:
1235 - match
1236- name: decrypt-caesar-cipher
1237 dataset: decrypt_caesar_cipher__dev__v0
1238 scorers:
1239 - match
1240- name: us-tort-law
1241 dataset: us_tort_law__dev__v0
1242 scorers:
1243 - match
1244- name: number-pattern
1245 dataset: number_pattern__dev__v0
1246 scorers:
1247 - match
1248- name: confusing_korean
1249 dataset: confusing_korean__dev__v0
1250 scorers:
1251 - match
1252- name: kanji-idioms
1253 dataset: kanji_idioms__test__v0
1254 scorers:
1255 - match
1256- name: missing-operators
1257 dataset: missing_operators__s1__simple_v0
1258 scorers:
1259 - match
1260- name: unsolvable_questions
1261 dataset: unsolvable_questions__dev__v0
1262 scorers:
1263 - match
1264- name: portuguese-sarcasm
1265 dataset: portuguese_sarcasm__dev__v0
1266 scorers:
1267 - match
1268- name: swap-words
1269 dataset: swap_words__dev__v0
1270 scorers:
1271 - match
1272- name: hebrew-same-noun-gender
1273 dataset: hebrew_same_noun_gender__v0
1274 scorers:
1275 - match
1276- name: heart-disease
1277 dataset: heart_disease__v0
1278 scorers:
1279 - match
1280- name: last-word-nth
1281 dataset: last_word_nth__s1__simple_v0
1282 scorers:
1283 - match
1284- name: ascii-wordart
1285 dataset: ascii_wordart__dev__v0
1286 scorers:
1287 - match
1288- name: direct-speech-tag
1289 dataset: direct_speech_tag__dev__v0
1290 scorers:
1291 - match
1292- name: italian-new-words
1293 dataset: italian_new_words__dev__v0
1294 scorers:
1295 - match
1296- name: irony
1297 dataset: irony__dev__v0
1298 scorers:
1299 - match
1300- name: math_polish
1301 dataset: math_polish__dev__v0
1302 scorers:
1303 - match
1304- name: irish-lexicon
1305 dataset: irish_lexicon__dev__v0
1306 scorers:
1307 - match
1308- name: canto_wu_pronunciation
1309 dataset: canto_wu_pronunciation__dev__v0
1310 scorers:
1311 - match
1312- name: irrelevant-negative-diversion
1313 dataset: irrelevant_negative_diversion__dev__v0
1314 scorers:
1315 - match
1316- name: invert_word_wise
1317 dataset: invert_word_wise__dev__v0
1318 scorers:
1319 - match
1320- name: imperial_date_to_string
1321 dataset: imperial_date_to_string__dev__v0
1322 scorers:
1323 - match
1324- name: gujarati-numerals
1325 dataset: gujarati_numerals__dev__v0
1326 scorers:
1327 - match
1328- name: count_token_freq_dna
1329 dataset: count_token_freq_dna__dev__v0
1330 scorers:
1331 - match
1332- name: french_homonym_and_homograph
1333 dataset: french_homonym_and_homograph__dev__v0
1334 scorers:
1335 - match
1336- name: cube-pack
1337 dataset: cube_pack__dev__v0
1338 scorers:
1339 - match
1340- name: historical-kana-orthography-reading
1341 dataset: historical_kana_orthography_reading__dev__v0
1342 scorers:
1343 - match
1344- name: canto_wu_pronunciation_fewshot
1345 dataset: canto_wu_pronunciation_fewshot__dev__v0
1346 scorers:
1347 - match
1348- name: accounting_audit
1349 dataset: accounting_audit__dev__v0
1350 scorers:
1351 - match
1352- name: brazilian-lexicon
1353 dataset: brazilian_lexicon__dev__v0
1354 scorers:
1355 - match
1356- name: naughty_strings
1357 dataset: naughty_strings__test__v1
1358 scorers:
1359 - match
1360- name: korean-phonetics
1361 dataset: korean_phonetics__dev__v0
1362 scorers:
1363 - match
1364- name: chinese-homo
1365 dataset: chinese_homophonic__dev__v0
1366 scorers:
1367 - match
1368- name: count_intersections_polynomial
1369 dataset: count_intersections_polynomial__dev__v0
1370 scorers:
1371 - match
1372- name: coqa-match
1373 dataset: coqa_match__dev__v0
1374 scorers:
1375 - match
1376- name: latin-grammar
1377 dataset: latin_grammar__dev__v0
1378 scorers:
1379 - match
1380- name: bitwise
1381 dataset: bitwise__dev__v0
1382 scorers:
1383 - match
1384- name: shared-borders
1385 dataset: shared_borders__dev__v0
1386 scorers:
1387 - match
1388- name: japanese-station
1389 dataset: japanese_station__dev__v0
1390 scorers:
1391 - match
1392- name: atpl_exams
1393 dataset: atpl_exams__dev__v0
1394 scorers:
1395 - match
1396- name: invoice_due_date_leap_day_adjustment
1397 dataset: invoice_due_date_leap_day_adjustment__dev__v0
1398 scorers:
1399 - match
1400- name: romanian_homonyms
1401 dataset: romanian_homonyms__dev__v0
1402 scorers:
1403 - match
1404- name: infiniteloop-match
1405 dataset: infiniteloop_match__s1__simple_v0
1406 scorers:
1407 - match
1408- name: russian-nlp-tasks
1409 dataset: russian_nlp_tasks__dev__v0
1410 scorers:
1411 - match
1412- name: chinese_chu_ci
1413 dataset: chinese_chu_ci__dev__v0
1414 scorers:
1415 - match
1416- name: polish-syllable-count
1417 dataset: polish_syllable_count__val__v0
1418 scorers:
1419 - match
1420- name: korean-postposition
1421 dataset: korean_postposition__dev__v0
1422 scorers:
1423 - match
1424- name: bulgarian-lexicon
1425 dataset: bulgarian_lexicon__dev__v0
1426 scorers:
1427 - match
1428- name: compare-countries-area
1429 dataset: compare_countries_area__dev__v0
1430 scorers:
1431 - match
1432- name: pattern_identification
1433 dataset: pattern_identification__dev__v0
1434 scorers:
1435 - match
1436- name: belarusian-synonyms
1437 dataset: belarusian_synonyms__dev__v0
1438 scorers:
1439 - match
1440- name: spanish_feminine_noun_masculine_article
1441 dataset: spanish_feminine_noun_masculine_article__dev__v0
1442 scorers:
1443 - match
1444- name: sarcasm
1445 dataset: sarcasm__test__v1
1446 scorers:
1447 - match
1448- name: chinese_tang_poetries
1449 dataset: chinese_tang_poetries__dev__match_v1
1450 scorers:
1451 - match
1452- name: japanese-number-reading
1453 dataset: japanese_number_reading__dev__v0
1454 scorers:
1455 - match
1456- name: korean-honorific
1457 dataset: korean_honorific__dev__v0
1458 scorers:
1459 - match
1460- name: complex-replace-characters
1461 dataset: complex_replace_characters__dev__v0
1462 scorers:
1463 - match
1464- name: dice-rotation-sequence
1465 dataset: dice_rotation_sequence__dev__v0
1466 scorers:
1467 - match
1468- name: utah_real_estateh
1469 dataset: utah_real_estate__dev__v0
1470 scorers:
1471 - match
1472- name: formal-logic
1473 dataset: formal_logic__dev__v0
1474 scorers:
1475 - match
1476- name: resistor-ohm-calculator
1477 dataset: resistor_ohm_calculator__dev__simple_v0
1478 scorers:
1479 - match
1480- name: gol
1481 dataset: gol__dev__v1
1482 scorers:
1483 - match
1484- name: icelandic-sentences-gec
1485 dataset: icelandic_sentences_gec__dev__v0
1486 scorers:
1487 - match
1488- name: chinese_modern_poem_identification
1489 dataset: chinese_modern_poem_identification__test__v1
1490 scorers:
1491 - match
1492- name: reverse-string
1493 dataset: reverse_string__s1__simple_v0
1494 scorers:
1495 - match
1496- name: complex-analogies-en-ru
1497 dataset: complex_analogies_en_ru__dev__v0
1498 scorers:
1499 - match
1500- name: positive-binary-operations
1501 dataset: positive_binary_operations__test__v1
1502 scorers:
1503 - match
1504- name: hindi_shuddha
1505 dataset: hindi_shuddha__dev__v0
1506 scorers:
1507 - match
1508- name: tokyo-station-number
1509 dataset: tokyo_station_number__dev__v0
1510 scorers:
1511 - match
1512- name: chinese_famous_novel
1513 dataset: chinese_famous_novel__dev__v0
1514 scorers:
1515 - match
1516- name: diagrammatic_logic
1517 dataset: diagrammatic_logic__dev__v2
1518 scorers:
1519 - match
1520- name: polish-lexicon
1521 dataset: polish_lexicon__dev__v0
1522 scorers:
1523 - match
1524- name: wkt_understanding
1525 dataset: wkt_understanding__dev__v0
1526 scorers:
1527 - match
1528- name: japanese-national-medical-exam02
1529 dataset: japanese_national_medical_exam02__dev__v0
1530 scorers:
1531 - match
1532- name: cardinal-directions
1533 dataset: cardinal_directions__dev__v0
1534 scorers:
1535 - match
1536- name: rectangles
1537 dataset: rectangles__dev__v0
1538 scorers:
1539 - match
1540- name: hindi_upsc
1541 dataset: hindi_upsc__dev__v0
1542 scorers:
1543 - match
1544- name: three-pt-mapping
1545 dataset: three_pt_mapping__dev__v0
1546 scorers:
1547 - match
1548- name: polish-proverbs
1549 dataset: polish_proverbs__dev__v0
1550 scorers:
1551 - match
1552- name: indonesian_numbers
1553 dataset: indonesian_numbers__dev__v0
1554 scorers:
1555 - match
1556- name: chinese_song_ci
1557 dataset: chinese_song_ci__dev__v0
1558 scorers:
1559 - match
1560- name: cybersecurity-filepaths
1561 dataset: cybersecurity_filepaths__dev__v0
1562 scorers:
1563 - match
1564- name: taxes
1565 dataset: taxes__dev__v0
1566 scorers:
1567 - match
1568- name: crontab
1569 dataset: crontab__dev__v0
1570 scorers:
1571 - match
1572- name: integer-sequence-predictions-misc
1573 dataset: integer_sequence_predictions_misc__dev__v0
1574 scorers:
1575 - match
1576- name: integer-sequence-predictions-obscure
1577 dataset: integer_sequence_predictions_obscure__dev__v0
1578 scorers:
1579 - match
1580- name: integer-sequence-predictions-notable
1581 dataset: integer_sequence_predictions_notable__dev__v0
1582 scorers:
1583 - match
1584- name: integer-sequence-predictions
1585 dataset: integer_sequence_predictions__dev__v0
1586 scorers:
1587 - match
1588- name: belarusian-orthography
1589 dataset: belarusian_orthography__dev__v0
1590 scorers:
1591 - match
1592- name: date-booking
1593 dataset: date_booking__dev__v0
1594 scorers:
1595 - match
1596- name: interlingual-homograph
1597 dataset: interlingual_homograph__dev__v0
1598 scorers:
1599 - match
1600- name: stats-tests
1601 dataset: stats_tests__dev__v0
1602 scorers:
1603 - match
1604- name: belarusian-russian-translation
1605 dataset: belarusian_russian_translation__dev__v0
1606 scorers:
1607 - match
1608- name: date-calculator
1609 dataset: date_calculator__test__v1
1610 scorers:
1611 - match
1612- name: chinese_poem
1613 dataset: chinese_poem__dev__v0
1614 scorers:
1615 - match
1616- name: belarusian-lexicon
1617 dataset: belarusian_lexicon__dev__v0
1618 scorers:
1619 - match
1620- name: test_english_pronunciations
1621 dataset: test_english_pronunciations__dev__v0
1622 scorers:
1623 - match
1624- name: anagrams
1625 dataset: anagrams__test__v1
1626 scorers:
1627 - match
1628- name: guess-the-singer
1629 dataset: guess_the_singer__dev__v0
1630 scorers:
1631 - match
1632- name: illinois-law
1633 dataset: illinois_law__v0
1634 scorers:
1635 - match
1636- name: russian_medical
1637 dataset: russian_medical__dev__v0
1638 scorers:
1639 - match
1640- name: bigrams
1641 dataset: bigrams__dev__v0
1642 scorers:
1643 - match
1644- name: probability-questions
1645 dataset: probability_questions__dev__v0
1646 scorers:
1647 - match
1648- name: vintage_phone_keyboard_decode
1649 dataset: vintage_phone_keyboard_decode__dev__v0
1650 scorers:
1651 - match
1652- name: connect4
1653 dataset: connect4__s1__v1
1654 scorers:
1655 - match
1656- name: stock-options-bull-call-spread
1657 dataset: stock_options_bull_call_spread__dev__v0
1658 scorers:
1659 - match
1660- name: stock-options-bear-call-spread
1661 dataset: stock_options_bear_call_spread__dev__v0
1662 scorers:
1663 - match
1664- name: stock-option-terms-bear-call-spread
1665 dataset: stock_option_terms_bear_call_spread__dev__v0
1666 scorers:
1667 - match
1668- name: stock-option-terms-iron-butteryfly-spread
1669 dataset: stock_option_terms_iron_butterfly_spread__dev__v0
1670 scorers:
1671 - match
1672- name: stock-option-terms-bull-call-spread
1673 dataset: stock_option_terms_bull_call_spread__dev__v0
1674 scorers:
1675 - match
1676- name: stock-options-inverse-iron-condor-spread
1677 dataset: stock_options_inverse_iron_condor_spread__dev__v0
1678 scorers:
1679 - match
1680- name: stock-options-iron-condor-spread
1681 dataset: stock_options_iron_condor_spread__dev__v0
1682 scorers:
1683 - match
1684- name: stock-option-terms-iron-condor-spread
1685 dataset: stock_option_terms_iron_condor_spread__dev__v0
1686 scorers:
1687 - match
1688- name: stock-options-inverse-iron-butterfly-spread
1689 dataset: stock_options_inverse_iron_butterfly_spread__dev__v0
1690 scorers:
1691 - match
1692- name: stock-option-terms-inverse-iron-condor-spread
1693 dataset: stock_option_terms_inverse_iron_condor_spread__dev__v0
1694 scorers:
1695 - match
1696- name: japanese_romantic_context
1697 dataset: japanese_romantic_context__dev__v0
1698 scorers:
1699 - match
1700- name: phonetics-identify-words-needing-missing-gpcs
1701 dataset: phonetics_identify_words_needing_missing_gpcs__s1__simple_v0
1702 scorers:
1703 - match
1704- name: prompt-injection
1705 dataset: prompt_injection__dev__v0
1706 scorers:
1707 - match
1708- name: word_vector_over_reliance
1709 dataset: word_vector_over_reliance__dev__simple_v0
1710 scorers:
1711 - match
1712- name: iso-to-lunar-calendar
1713 dataset: iso_to_lunar_calendar__dev__v0
1714 scorers:
1715 - match
1716- name: lunar-calendar-to-iso
1717 dataset: lunar_calendar_to_iso__dev__v0
1718 scorers:
1719 - match
1720- name: code_combination
1721 dataset: code_combination__dev__v0
1722 scorers:
1723 - match
1724- name: partially_solved_crossword_clues
1725 dataset: partially_solved_crossword_clues__dev__v0
1726 scorers:
1727 - match
1728- name: quartz
1729 dataset: quartz__test__v1
1730 scorers:
1731 - match
1732- name: physics-interaction
1733 dataset: physics__interaction__dev__v0
1734 scorers:
1735 - match
1736- name: next-val-series
1737 dataset: next_val_series__dev__simple_v0
1738 scorers:
1739 - match
1740