{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Building a MAFAND Eval\n",
    "\n",
    "This notebook shows how to:\n",
    "- Build and run an eval using the [MAFAND dataset](https://github.com/masakhane-io/lafand-mt)\n",
12 "- Load the results and into a Pandas Dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "import os\n",
    "import requests\n",
    "import pandas as pd\n",
    "\n",
    "\n",
    "# Install Evals if you haven't already\n",
    "# %pip install -e ../."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download the MAFAND dataset\n",
    "\n",
    "lang_pairs = [\n",
    "    \"en-amh\", \"en-hau\", \"en-ibo\", \"en-kin\", \"en-lug\", \"en-nya\", \"en-pcm\", \"en-sna\", \"en-swa\", \"en-tsn\",\n",
    "    \"en-twi\", \"en-xho\", \"en-yor\", \"en-zul\", \"fr-bam\", \"fr-bbj\", \"fr-ewe\", \"fr-fon\", \"fr-mos\", \"fr-wol\"\n",
    "]\n",
    "\n",
    "# Assuming this notebook is in examples/\n",
    "registry_path = os.path.join(os.getcwd(), \"..\", \"evals\", \"registry\")\n",
    "data_path = os.path.join(registry_path, \"data\", \"lafand-mt\")\n",
    "os.makedirs(data_path, exist_ok=True)\n",
    "\n",
    "for pair in lang_pairs:\n",
    "    os.makedirs(os.path.join(data_path, pair), exist_ok=True)\n",
    "    for dev_test in ['dev', 'test']:\n",
    "        raw_tsv_file = f'https://raw.githubusercontent.com/masakhane-io/lafand-mt/main/data/tsv_files/{pair}/{dev_test}.tsv'\n",
    "        with open(os.path.join(data_path, pair, f\"{dev_test}.tsv\"), \"w\", encoding=\"utf-8\") as f:\n",
    "            f.write(requests.get(raw_tsv_file).text)"
   ]
  },
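  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional sanity check (an addition to this walkthrough, not required by the eval):\n",
    "# preview one of the downloaded TSVs. This assumes the en-ibo pair downloaded successfully\n",
    "# above; the build step below relies on the columns being named after the language codes\n",
    "# (here \"en\" and \"ibo\").\n",
    "preview_df = pd.read_csv(os.path.join(data_path, \"en-ibo\", \"dev.tsv\"), sep=\"\\t\")\n",
    "print(preview_df.columns.tolist())\n",
    "preview_df.head()"
   ]
  },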
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the prompts using Chat format. We support converting Chat conversations to text for non-Chat models\n",
    "\n",
    "sys_msg = \"Translate the text from {} to {}.\"\n",
    "def create_chat_prompt(sys_msg, input_lang, output_lang, input_text):\n",
    "    return [\n",
    "        {\"role\": \"system\", \"content\": sys_msg.format(input_lang, output_lang)},\n",
    "        {\"role\": \"user\", \"content\": input_text}\n",
    "    ]\n",
    "\n",
    "def create_chat_example(input_text, correct_translation):\n",
    "    \"\"\"\n",
    "    Form few-shot prompts in the recommended format: https://github.com/openai/openai-python/blob/main/chatml.md#few-shot-prompting\n",
    "    \"\"\"\n",
    "    return [\n",
    "        {\"role\": \"system\", \"content\": input_text, \"name\": \"example_user\"},\n",
    "        {\"role\": \"system\", \"content\": correct_translation, \"name\": \"example_assistant\"},\n",
    "    ]"
   ]
  },
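  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Quick illustration (added for this walkthrough): print what the helpers above produce for a\n",
    "# toy input so the chat-format structure is concrete. The strings are placeholders, not MAFAND data.\n",
    "print(create_chat_prompt(sys_msg, \"en\", \"ibo\", \"Good morning.\"))\n",
    "print(create_chat_example(\"Good morning.\", \"<ibo translation goes here>\"))"
   ]
  },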
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import yaml\n",
    "import os\n",
    "\n",
    "translation_paths = sorted([os.path.join(data_path, d) for d in os.listdir(data_path)])\n",
    "\n",
    "# Assuming this notebook is in examples/\n",
    "registry_path = os.path.join(os.getcwd(), \"..\", \"evals\", \"registry\")\n",
    "output_path = os.path.join(registry_path, \"data\", \"lafand-mt\")\n",
    "\n",
    "registry_yaml = {}\n",
    "\n",
    "for input_path in translation_paths:\n",
102 " langs = input_path.split(\"/\")[-1]\n",
103 " input_lang, output_lang = langs.split('-')\n",
104 " pair_path = os.path.join(output_path, f\"{input_lang}-{output_lang}\")\n",
105 " os.makedirs(pair_path, exist_ok=True)\n",
106 "\n",
107 " # Create few-shot prompts\n",
108 " dev_df = pd.read_csv(os.path.join(input_path, \"dev.tsv\"), sep=\"\\t\")\n",
109 " dev_df[\"sample\"] = dev_df.apply(lambda x: create_chat_example(x[input_lang], x[output_lang]), axis=1)\n",
110 " few_shot_path = os.path.join(pair_path, f\"{input_lang}-{output_lang}_few_shot.jsonl\")\n",
111 " dev_df[[\"sample\"]].to_json(few_shot_path, lines=True, orient=\"records\")\n",
112 "\n",
113 " # Create test prompts and ideal completions\n",
114 " test_df = pd.read_csv(os.path.join(input_path, \"test.tsv\"), sep=\"\\t\")\n",
115 " test_df[\"input\"] = test_df[input_lang].apply(lambda x: create_chat_prompt(sys_msg, input_lang, output_lang, x))\n",
116 " test_df[\"ideal\"] = test_df[output_lang]\n",
117 " \n",
118 " samples_path = os.path.join(pair_path, f\"{input_lang}-{output_lang}_samples.jsonl\")\n",
119 " test_df[[\"input\", \"ideal\"]].to_json(samples_path, lines=True, orient=\"records\")\n",
120 " eval_id = f\"mafand_translation_{input_lang}-{output_lang}\"\n",
121 "\n",
122 " registry_yaml[eval_id] = {\n",
123 " \"id\": f\"{eval_id}.test.v1\",\n",
124 " \"metrics\": [\"accuracy\"]\n",
125 " }\n",
126 " registry_yaml[f\"{eval_id}.test.v1\"] = {\n",
127 " \"class\": \"evals.elsuite.translate:Translate\",\n",
128 " \"args\": {\n",
129 " \"samples_jsonl\": samples_path,\n",
130 " \"few_shot_jsonl\": few_shot_path,\n",
131 " \"num_few_shot\": 4,\n",
132 " }\n",
133 " }\n",
134 "\n",
135 "os.makedirs(os.path.join(registry_path, \"evals\"), exist_ok=True)\n",
136 "with open(os.path.join(registry_path, \"evals\", \"mafand.yaml\"), \"w\") as f:\n",
137 " yaml.dump(registry_yaml, f)"
138 ]
139 },
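  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sanity check (an addition, not part of the registry build): print the first generated sample\n",
    "# for the en-ibo pair and its registry entry. The paths and keys follow the naming used in the\n",
    "# loop above.\n",
    "with open(os.path.join(output_path, \"en-ibo\", \"en-ibo_samples.jsonl\"), \"r\", encoding=\"utf-8\") as f:\n",
    "    print(f.readline().strip())\n",
    "print(registry_yaml[\"mafand_translation_en-ibo\"])"
   ]
  },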
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
146 "# This will generate a JSONL which will record samples and logs and store it in /tmp/evallogs\n",
147 "!oaieval gpt-3.5-turbo mafand_translation_en-ibo --max_samples 20"
148 ]
149 },
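  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convenience cell (an addition): list the log files oaieval has written so far, newest last,\n",
    "# so the file name can be copied into log_name below. This assumes logs land in /tmp/evallogs,\n",
    "# as noted above.\n",
    "import glob\n",
    "\n",
    "for path in sorted(glob.glob(\"/tmp/evallogs/*\"), key=os.path.getmtime):\n",
    "    print(os.path.basename(path))"
   ]
  },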
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# How to process the log events generated by oaieval\n",
    "\n",
    "log_name = \"EDIT THIS\"  # copy from above\n",
    "events = f\"/tmp/evallogs/{log_name}\"\n",
    "\n",
    "with open(events, \"r\") as f:\n",
    "    events_df = pd.read_json(f, lines=True)\n",
    "\n",
    "matches_df = events_df[events_df.type == \"match\"].reset_index(drop=True)\n",
    "matches_df = matches_df.join(pd.json_normalize(matches_df.data))\n",
    "matches_df.correct.value_counts().plot.bar(title=\"Correctness of generated answers\", xlabel=\"sacrebleu score >30\", ylabel=\"Count\")"
   ]
  },
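  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# A small numeric summary to go with the bar plot above (an addition): the fraction of samples\n",
    "# counted as correct and the distribution of the sentence-level sacreBLEU scores recorded in the\n",
    "# match events.\n",
    "print(f\"Fraction correct: {matches_df['correct'].mean():.2f}\")\n",
    "matches_df[\"sacrebleu_sentence_score\"].describe()"
   ]
  },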
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "\n",
177 "# your list of scores\n",
178 "scores = matches_df['sacrebleu_sentence_score']\n",
179 "\n",
180 "# define the threshold scores as a range from the minimum to the maximum score, in increments of 5\n",
181 "thresholds = range(int(min(scores)), int(max(scores)) + 5, 5)\n",
182 "\n",
183 "# count the number of scores above and below each threshold\n",
184 "above_counts = [len([score for score in scores if score >= threshold]) for threshold in thresholds]\n",
185 "\n",
186 "# plot the counts as a step function\n",
187 "plt.step(thresholds, above_counts, label='number of samples withabove')\n",
188 "\n",
189 "# set the x and y labels\n",
190 "plt.xlabel('sacrebleu threshold')\n",
191 "plt.ylabel('number of samples w/ score > threshold')\n",
192 "\n",
193 "# show the plot\n",
194 "plt.show()"
195 ]
196 },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Inspect samples\n",
    "for i, r in pd.json_normalize(events_df[events_df.type == \"sampling\"].data).iterrows():\n",
    "    print(f\"Prompt: {r.prompt}\")\n",
    "    print(f\"Sampled: {r.sampled}\")\n",
    "    print(\"-\" * 25)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.9"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  },
  "vscode": {
   "interpreter": {
    "hash": "fdbe172e46cfba2329a5e8d5b64cdf2d12f4dfd7d9bcea153ecef62d1d51933b"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}