{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Building an MMLU Eval\n",
    "\n",
    "This notebook shows how to:\n",
    "- Build and run an eval\n",
    "- Load the results into a Pandas DataFrame\n",
    "\n",
    "We use the `evals.elsuite.basic.match:Match` Eval class here to check whether new completions match the correct answer. Under the hood, it will generate a completion with the chosen model for each prompt, check whether the completion matches the true answer, and then log a result."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Install the evals package and download MMLU, if you haven't already\n",
    "%pip install -e ../.\n",
    "\n",
    "!curl -O https://people.eecs.berkeley.edu/~hendrycks/data.tar\n",
    "!tar -xf data.tar\n",
    "data_path = \"data\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import os\n",
    "\n",
    "# Assuming this notebook is in examples/\n",
    "registry_path = os.path.join(os.getcwd(), \"../evals/registry\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the prompts using the chat format. We support converting chat conversations to text for non-chat models.\n",
    "\n",
    "choices = [\"A\", \"B\", \"C\", \"D\"]\n",
    "sys_msg = \"The following are multiple choice questions (with answers) about {}.\"\n",
    "\n",
    "def create_chat_prompt(sys_msg, question, answers, subject):\n",
    "    user_prompt = f\"{question}\\n\" + \"\\n\".join([f\"{choice}. {answer}\" for choice, answer in zip(choices, answers)]) + \"\\nAnswer:\"\n",
    "    return [\n",
    "        {\"role\": \"system\", \"content\": sys_msg.format(subject)},\n",
    "        {\"role\": \"user\", \"content\": user_prompt},\n",
    "    ]\n",
    "\n",
    "def create_chat_example(question, answers, correct_answer):\n",
    "    \"\"\"\n",
    "    Form few-shot prompts in the recommended format: https://github.com/openai/openai-python/blob/main/chatml.md#few-shot-prompting\n",
    "    \"\"\"\n",
    "    user_prompt = f\"{question}\\n\" + \"\\n\".join([f\"{choice}. {answer}\" for choice, answer in zip(choices, answers)]) + \"\\nAnswer:\"\n",
    "    return [\n",
    "        {\"role\": \"system\", \"content\": user_prompt, \"name\": \"example_user\"},\n",
    "        {\"role\": \"system\", \"content\": correct_answer, \"name\": \"example_assistant\"},\n",
    "    ]"
   ]
  },
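  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Quick sanity check of the prompt structure. The question and answer options below are\n",
    "# made-up examples (not from MMLU); they just illustrate what create_chat_prompt returns.\n",
    "create_chat_prompt(\n",
    "    sys_msg,\n",
    "    \"What is the longest bone in the human body?\",\n",
    "    [\"Femur\", \"Tibia\", \"Skull\", \"Humerus\"],\n",
    "    \"anatomy\",\n",
    ")"
   ]
  },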
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import yaml\n",
    "\n",
    "subjects = sorted([f.split(\"_test.csv\")[0] for f in os.listdir(os.path.join(data_path, \"test\")) if \"_test.csv\" in f])\n",
    "\n",
    "registry_yaml = {}\n",
    "\n",
    "for subject in subjects:\n",
    "    subject_path = os.path.join(registry_path, \"data\", \"mmlu\", subject)\n",
    "    os.makedirs(subject_path, exist_ok=True)\n",
    "\n",
    "    # Create few-shot prompts\n",
    "    dev_df = pd.read_csv(os.path.join(data_path, \"dev\", subject + \"_dev.csv\"), names=(\"Question\", \"A\", \"B\", \"C\", \"D\", \"Answer\"))\n",
    "    dev_df[\"sample\"] = dev_df.apply(lambda x: create_chat_example(x[\"Question\"], x[[\"A\", \"B\", \"C\", \"D\"]], x[\"Answer\"]), axis=1)\n",
    "    few_shot_path = os.path.join(subject_path, \"few_shot.jsonl\")\n",
    "    dev_df[[\"sample\"]].to_json(few_shot_path, lines=True, orient=\"records\")\n",
    "\n",
    "    # Create test prompts and ideal completions\n",
    "    test_df = pd.read_csv(os.path.join(data_path, \"test\", subject + \"_test.csv\"), names=(\"Question\", \"A\", \"B\", \"C\", \"D\", \"Answer\"))\n",
    "    test_df[\"input\"] = test_df.apply(lambda x: create_chat_prompt(sys_msg, x[\"Question\"], x[[\"A\", \"B\", \"C\", \"D\"]], subject), axis=1)\n",
    "    test_df[\"ideal\"] = test_df.Answer\n",
    "    samples_path = os.path.join(subject_path, \"samples.jsonl\")\n",
    "    test_df[[\"input\", \"ideal\"]].to_json(samples_path, lines=True, orient=\"records\")\n",
    "\n",
    "    eval_id = f\"match_mmlu_{subject}\"\n",
    "\n",
    "    registry_yaml[eval_id] = {\n",
    "        \"id\": f\"{eval_id}.test.v1\",\n",
    "        \"metrics\": [\"accuracy\"],\n",
    "    }\n",
    "    registry_yaml[f\"{eval_id}.test.v1\"] = {\n",
    "        \"class\": \"evals.elsuite.basic.match:Match\",\n",
    "        \"args\": {\n",
    "            \"samples_jsonl\": samples_path,\n",
    "            \"few_shot_jsonl\": few_shot_path,\n",
    "            \"num_few_shot\": 4,\n",
    "        },\n",
    "    }\n",
    "\n",
    "with open(os.path.join(registry_path, \"evals\", \"mmlu.yaml\"), \"w\") as f:\n",
    "    yaml.dump(registry_yaml, f)"
   ]
  },
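  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional sanity check: peek at the first sample written for the last subject processed above,\n",
    "# to confirm each line has the expected \"input\" (chat prompt) and \"ideal\" (answer letter) fields\n",
    "import json\n",
    "\n",
    "with open(samples_path) as f:\n",
    "    print(json.dumps(json.loads(f.readline()), indent=2))"
   ]
  },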
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# This will generate a JSONL log recording the samples and results, and store it in /tmp/evallogs\n",
    "!oaieval gpt-3.5-turbo match_mmlu_anatomy"
   ]
  },
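  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The run above stores its log under /tmp/evallogs; list the directory to find the log file\n",
    "# name to plug into the next cell\n",
    "sorted(os.listdir(\"/tmp/evallogs\"))"
   ]
  },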
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Process the log events generated by oaieval\n",
    "# Replace {log_name} below with the name of the log file written to /tmp/evallogs by the run above\n",
    "events = \"/tmp/evallogs/{log_name}\"\n",
    "\n",
    "with open(events, \"r\") as f:\n",
    "    events_df = pd.read_json(f, lines=True)\n",
    "\n",
    "matches_df = events_df[events_df.type == \"match\"].reset_index(drop=True)\n",
    "matches_df = matches_df.join(pd.json_normalize(matches_df.data))\n",
    "matches_df.correct.value_counts().plot.bar(title=\"Correctness of generated answers\", xlabel=\"Correctness\", ylabel=\"Count\")"
   ]
  },
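  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Overall accuracy for the run, computed directly from the match events\n",
    "# (assumes matches_df from the cell above)\n",
    "print(f\"Accuracy: {matches_df.correct.mean():.3f}\")"
   ]
  },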
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Inspect samples\n",
    "for i, r in pd.json_normalize(events_df[events_df.type == \"sampling\"].data).iterrows():\n",
    "    print(f\"Prompt: {r.prompt}\")\n",
    "    print(f\"Sampled: {r.sampled}\")\n",
    "    print(\"-\" * 25)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "oss_evals",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}