{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Building an MMLU Eval\n",
    "\n",
    "This notebook shows how to:\n",
    "- Build and run an eval\n",
    "- Load the results into a Pandas DataFrame\n",
    "\n",
    "We use the `evals.elsuite.basic.match:Match` Eval class here to check whether new completions match the correct answer. Under the hood, it will generate a completion with the chosen model for each prompt, check whether the completion matches the true answer, and then log a result."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Install the evals package and download MMLU, if you haven't already\n",
    "%pip install -e ../.\n",
    "\n",
    "!curl -O https://people.eecs.berkeley.edu/~hendrycks/data.tar\n",
    "!tar -xf data.tar\n",
    "data_path = \"data\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import os\n",
    "\n",
    "# Assuming this notebook is in examples/\n",
    "registry_path = os.path.join(os.getcwd(), \"../evals/registry\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the prompts using the chat format. We support converting chat conversations to text for non-chat models.\n",
    "\n",
    "choices = [\"A\", \"B\", \"C\", \"D\"]\n",
    "sys_msg = \"The following are multiple choice questions (with answers) about {}.\"\n",
    "\n",
    "def create_chat_prompt(sys_msg, question, answers, subject):\n",
    "    user_prompt = f\"{question}\\n\" + \"\\n\".join([f\"{choice}. {answer}\" for choice, answer in zip(choices, answers)]) + \"\\nAnswer:\"\n",
    "    return [\n",
    "        {\"role\": \"system\", \"content\": sys_msg.format(subject)},\n",
    "        {\"role\": \"user\", \"content\": user_prompt},\n",
    "    ]\n",
    "\n",
    "def create_chat_example(question, answers, correct_answer):\n",
    "    \"\"\"\n",
    "    Form few-shot prompts in the recommended format: https://github.com/openai/openai-python/blob/main/chatml.md#few-shot-prompting\n",
    "    \"\"\"\n",
    "    user_prompt = f\"{question}\\n\" + \"\\n\".join([f\"{choice}. {answer}\" for choice, answer in zip(choices, answers)]) + \"\\nAnswer:\"\n",
    "    return [\n",
    "        {\"role\": \"system\", \"content\": user_prompt, \"name\": \"example_user\"},\n",
    "        {\"role\": \"system\", \"content\": correct_answer, \"name\": \"example_assistant\"},\n",
    "    ]"
   ]
  },
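  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Quick sanity check of the prompt structure. The question and answer options below are\n",
    "# made-up examples (not from MMLU); they just illustrate what create_chat_prompt returns.\n",
    "create_chat_prompt(\n",
    "    sys_msg,\n",
    "    \"What is the longest bone in the human body?\",\n",
    "    [\"Femur\", \"Tibia\", \"Skull\", \"Humerus\"],\n",
    "    \"anatomy\",\n",
    ")"
   ]
  },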
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import yaml\n",
    "\n",
    "subjects = sorted([f.split(\"_test.csv\")[0] for f in os.listdir(os.path.join(data_path, \"test\")) if \"_test.csv\" in f])\n",
    "\n",
    "registry_yaml = {}\n",
    "\n",
    "for subject in subjects:\n",
    "    subject_path = os.path.join(registry_path, \"data\", \"mmlu\", subject)\n",
    "    os.makedirs(subject_path, exist_ok=True)\n",
    "\n",
    "    # Create few-shot prompts\n",
    "    dev_df = pd.read_csv(os.path.join(data_path, \"dev\", subject + \"_dev.csv\"), names=(\"Question\", \"A\", \"B\", \"C\", \"D\", \"Answer\"))\n",
    "    dev_df[\"sample\"] = dev_df.apply(lambda x: create_chat_example(x[\"Question\"], x[[\"A\", \"B\", \"C\", \"D\"]], x[\"Answer\"]), axis=1)\n",
    "    few_shot_path = os.path.join(subject_path, \"few_shot.jsonl\")\n",
    "    dev_df[[\"sample\"]].to_json(few_shot_path, lines=True, orient=\"records\")\n",
    "\n",
    "    # Create test prompts and ideal completions\n",
    "    test_df = pd.read_csv(os.path.join(data_path, \"test\", subject + \"_test.csv\"), names=(\"Question\", \"A\", \"B\", \"C\", \"D\", \"Answer\"))\n",
    "    test_df[\"input\"] = test_df.apply(lambda x: create_chat_prompt(sys_msg, x[\"Question\"], x[[\"A\", \"B\", \"C\", \"D\"]], subject), axis=1)\n",
    "    test_df[\"ideal\"] = test_df.Answer\n",
    "    samples_path = os.path.join(subject_path, \"samples.jsonl\")\n",
    "    test_df[[\"input\", \"ideal\"]].to_json(samples_path, lines=True, orient=\"records\")\n",
    "\n",
    "    eval_id = f\"match_mmlu_{subject}\"\n",
    "\n",
    "    registry_yaml[eval_id] = {\n",
    "        \"id\": f\"{eval_id}.test.v1\",\n",
    "        \"metrics\": [\"accuracy\"],\n",
    "    }\n",
    "    registry_yaml[f\"{eval_id}.test.v1\"] = {\n",
    "        \"class\": \"evals.elsuite.basic.match:Match\",\n",
    "        \"args\": {\n",
    "            \"samples_jsonl\": samples_path,\n",
    "            \"few_shot_jsonl\": few_shot_path,\n",
    "            \"num_few_shot\": 4,\n",
    "        },\n",
    "    }\n",
    "\n",
    "with open(os.path.join(registry_path, \"evals\", \"mmlu.yaml\"), \"w\") as f:\n",
    "    yaml.dump(registry_yaml, f)"
   ]
  },
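  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional sanity check: peek at the first sample written for the last subject processed above,\n",
    "# to confirm each line has the expected \"input\" (chat prompt) and \"ideal\" (answer letter) fields\n",
    "import json\n",
    "\n",
    "with open(samples_path) as f:\n",
    "    print(json.dumps(json.loads(f.readline()), indent=2))"
   ]
  },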
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# This will generate a JSONL log recording the samples and results, and store it in /tmp/evallogs\n",
    "!oaieval gpt-3.5-turbo match_mmlu_anatomy"
   ]
  },
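  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The run above stores its log under /tmp/evallogs; list the directory to find the log file\n",
    "# name to plug into the next cell\n",
    "sorted(os.listdir(\"/tmp/evallogs\"))"
   ]
  },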
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Process the log events generated by oaieval\n",
    "# Replace {log_name} below with the name of the log file written to /tmp/evallogs by the run above\n",
    "events = \"/tmp/evallogs/{log_name}\"\n",
    "\n",
    "with open(events, \"r\") as f:\n",
    "    events_df = pd.read_json(f, lines=True)\n",
    "\n",
    "matches_df = events_df[events_df.type == \"match\"].reset_index(drop=True)\n",
    "matches_df = matches_df.join(pd.json_normalize(matches_df.data))\n",
    "matches_df.correct.value_counts().plot.bar(title=\"Correctness of generated answers\", xlabel=\"Correctness\", ylabel=\"Count\")"
   ]
  },
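  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Overall accuracy for the run, computed directly from the match events\n",
    "# (assumes matches_df from the cell above)\n",
    "print(f\"Accuracy: {matches_df.correct.mean():.3f}\")"
   ]
  },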
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Inspect samples\n",
    "for i, r in pd.json_normalize(events_df[events_df.type == \"sampling\"].data).iterrows():\n",
    "    print(f\"Prompt: {r.prompt}\")\n",
    "    print(f\"Sampled: {r.sampled}\")\n",
    "    print(\"-\" * 25)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "oss_evals",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}