{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Building a MAFAND Eval\n",
    "\n",
    "This notebook shows how to:\n",
    "- Build and run an eval using the [MAFAND dataset](https://github.com/masakhane-io/lafand-mt)\n",
12 "- Load the results and into a Pandas Dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%load_ext autoreload\n",
    "%autoreload 2\n",
    "\n",
    "import os\n",
    "import requests\n",
    "import pandas as pd\n",
    "\n",
    "\n",
    "# Install Evals if you haven't already\n",
    "# %pip install -e ../."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download the MAFAND dataset\n",
    "\n",
    "lang_pairs = [\n",
    "    \"en-amh\", \"en-hau\", \"en-ibo\", \"en-kin\", \"en-lug\", \"en-nya\", \"en-pcm\", \"en-sna\", \"en-swa\", \"en-tsn\",\n",
    "    \"en-twi\", \"en-xho\", \"en-yor\", \"en-zul\", \"fr-bam\", \"fr-bbj\", \"fr-ewe\", \"fr-fon\", \"fr-mos\", \"fr-wol\"\n",
    "]\n",
    "\n",
    "# Assuming this notebook is in examples/\n",
    "registry_path = os.path.join(os.getcwd(), \"..\", \"evals\", \"registry\")\n",
    "data_path = os.path.join(registry_path, \"data\", \"lafand-mt\")\n",
    "os.makedirs(data_path, exist_ok=True)\n",
    "\n",
    "for pair in lang_pairs:\n",
    "    os.makedirs(os.path.join(data_path, pair), exist_ok=True)\n",
    "    for dev_test in ['dev', 'test']:\n",
    "        raw_tsv_file = f'https://raw.githubusercontent.com/masakhane-io/lafand-mt/main/data/tsv_files/{pair}/{dev_test}.tsv'\n",
    "        with open(os.path.join(data_path, pair, f\"{dev_test}.tsv\"), \"w\", encoding=\"utf-8\") as f:\n",
    "            f.write(requests.get(raw_tsv_file).text)"
   ]
  },
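  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional sanity check (an addition to this walkthrough, not required by the eval):\n",
    "# preview one of the downloaded TSVs. This assumes the en-ibo pair downloaded successfully\n",
    "# above; the build step below relies on the columns being named after the language codes\n",
    "# (here \"en\" and \"ibo\").\n",
    "preview_df = pd.read_csv(os.path.join(data_path, \"en-ibo\", \"dev.tsv\"), sep=\"\\t\")\n",
    "print(preview_df.columns.tolist())\n",
    "preview_df.head()"
   ]
  },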
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the prompts using Chat format. We support converting Chat conversations to text for non-Chat models\n",
    "\n",
    "sys_msg = \"Translate the text from {} to {}.\"\n",
    "def create_chat_prompt(sys_msg, input_lang, output_lang, input_text):\n",
    "    return [\n",
    "        {\"role\": \"system\", \"content\": sys_msg.format(input_lang, output_lang)},\n",
    "        {\"role\": \"user\", \"content\": input_text}\n",
    "    ]\n",
    "\n",
    "def create_chat_example(input_text, correct_translation):\n",
    "    \"\"\"\n",
    "    Form few-shot prompts in the recommended format: https://github.com/openai/openai-python/blob/main/chatml.md#few-shot-prompting\n",
    "    \"\"\"\n",
    "    return [\n",
    "        {\"role\": \"system\", \"content\": input_text, \"name\": \"example_user\"},\n",
    "        {\"role\": \"system\", \"content\": correct_translation, \"name\": \"example_assistant\"},\n",
    "    ]"
   ]
  },
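  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Quick illustration (added for this walkthrough): print what the helpers above produce for a\n",
    "# toy input so the chat-format structure is concrete. The strings are placeholders, not MAFAND data.\n",
    "print(create_chat_prompt(sys_msg, \"en\", \"ibo\", \"Good morning.\"))\n",
    "print(create_chat_example(\"Good morning.\", \"<ibo translation goes here>\"))"
   ]
  },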
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import yaml\n",
    "import os\n",
    "\n",
    "translation_paths = sorted([os.path.join(data_path, d) for d in os.listdir(data_path)])\n",
    "\n",
    "# Assuming this notebook is in examples/\n",
    "registry_path = os.path.join(os.getcwd(), \"..\", \"evals\", \"registry\")\n",
    "output_path = os.path.join(registry_path, \"data\", \"lafand-mt\")\n",
    "\n",
    "registry_yaml = {}\n",
    "\n",
    "for input_path in translation_paths:\n",
102 " langs = input_path.split(\"/\")[-1]\n",
103 " input_lang, output_lang = langs.split('-')\n",
104 " pair_path = os.path.join(output_path, f\"{input_lang}-{output_lang}\")\n",
105 " os.makedirs(pair_path, exist_ok=True)\n",
106 "\n",
107 " # Create few-shot prompts\n",
108 " dev_df = pd.read_csv(os.path.join(input_path, \"dev.tsv\"), sep=\"\\t\")\n",
109 " dev_df[\"sample\"] = dev_df.apply(lambda x: create_chat_example(x[input_lang], x[output_lang]), axis=1)\n",
110 " few_shot_path = os.path.join(pair_path, f\"{input_lang}-{output_lang}_few_shot.jsonl\")\n",
111 " dev_df[[\"sample\"]].to_json(few_shot_path, lines=True, orient=\"records\")\n",
112 "\n",
113 " # Create test prompts and ideal completions\n",
114 " test_df = pd.read_csv(os.path.join(input_path, \"test.tsv\"), sep=\"\\t\")\n",
115 " test_df[\"input\"] = test_df[input_lang].apply(lambda x: create_chat_prompt(sys_msg, input_lang, output_lang, x))\n",
116 " test_df[\"ideal\"] = test_df[output_lang]\n",
117 " \n",
118 " samples_path = os.path.join(pair_path, f\"{input_lang}-{output_lang}_samples.jsonl\")\n",
119 " test_df[[\"input\", \"ideal\"]].to_json(samples_path, lines=True, orient=\"records\")\n",
120 " eval_id = f\"mafand_translation_{input_lang}-{output_lang}\"\n",
121 "\n",
122 " registry_yaml[eval_id] = {\n",
123 " \"id\": f\"{eval_id}.test.v1\",\n",
124 " \"metrics\": [\"accuracy\"]\n",
125 " }\n",
126 " registry_yaml[f\"{eval_id}.test.v1\"] = {\n",
127 " \"class\": \"evals.elsuite.translate:Translate\",\n",
128 " \"args\": {\n",
129 " \"samples_jsonl\": samples_path,\n",
130 " \"few_shot_jsonl\": few_shot_path,\n",
131 " \"num_few_shot\": 4,\n",
132 " }\n",
133 " }\n",
134 "\n",
135 "os.makedirs(os.path.join(registry_path, \"evals\"), exist_ok=True)\n",
136 "with open(os.path.join(registry_path, \"evals\", \"mafand.yaml\"), \"w\") as f:\n",
137 " yaml.dump(registry_yaml, f)"
138 ]
139 },
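  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sanity check (an addition, not part of the registry build): print the first generated sample\n",
    "# for the en-ibo pair and its registry entry. The paths and keys follow the naming used in the\n",
    "# loop above.\n",
    "with open(os.path.join(output_path, \"en-ibo\", \"en-ibo_samples.jsonl\"), \"r\", encoding=\"utf-8\") as f:\n",
    "    print(f.readline().strip())\n",
    "print(registry_yaml[\"mafand_translation_en-ibo\"])"
   ]
  },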
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
146 "# This will generate a JSONL which will record samples and logs and store it in /tmp/evallogs\n",
147 "!oaieval gpt-3.5-turbo mafand_translation_en-ibo --max_samples 20"
148 ]
149 },
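  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convenience cell (an addition): list the log files oaieval has written so far, newest last,\n",
    "# so the file name can be copied into log_name below. This assumes logs land in /tmp/evallogs,\n",
    "# as noted above.\n",
    "import glob\n",
    "\n",
    "for path in sorted(glob.glob(\"/tmp/evallogs/*\"), key=os.path.getmtime):\n",
    "    print(os.path.basename(path))"
   ]
  },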
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# How to process the log events generated by oaieval\n",
    "\n",
    "log_name = \"EDIT THIS\"  # copy from above\n",
    "events = f\"/tmp/evallogs/{log_name}\"\n",
    "\n",
    "with open(events, \"r\") as f:\n",
    "    events_df = pd.read_json(f, lines=True)\n",
    "\n",
    "matches_df = events_df[events_df.type == \"match\"].reset_index(drop=True)\n",
    "matches_df = matches_df.join(pd.json_normalize(matches_df.data))\n",
    "matches_df.correct.value_counts().plot.bar(title=\"Correctness of generated answers\", xlabel=\"sacrebleu score >30\", ylabel=\"Count\")"
   ]
  },
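  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# A small numeric summary to go with the bar plot above (an addition): the fraction of samples\n",
    "# counted as correct and the distribution of the sentence-level sacreBLEU scores recorded in the\n",
    "# match events.\n",
    "print(f\"Fraction correct: {matches_df['correct'].mean():.2f}\")\n",
    "matches_df[\"sacrebleu_sentence_score\"].describe()"
   ]
  },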
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "\n",
177 "# your list of scores\n",
178 "scores = matches_df['sacrebleu_sentence_score']\n",
179 "\n",
180 "# define the threshold scores as a range from the minimum to the maximum score, in increments of 5\n",
181 "thresholds = range(int(min(scores)), int(max(scores)) + 5, 5)\n",
182 "\n",
183 "# count the number of scores above and below each threshold\n",
184 "above_counts = [len([score for score in scores if score >= threshold]) for threshold in thresholds]\n",
185 "\n",
186 "# plot the counts as a step function\n",
187 "plt.step(thresholds, above_counts, label='number of samples withabove')\n",
188 "\n",
189 "# set the x and y labels\n",
190 "plt.xlabel('sacrebleu threshold')\n",
191 "plt.ylabel('number of samples w/ score > threshold')\n",
192 "\n",
193 "# show the plot\n",
194 "plt.show()"
195 ]
196 },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Inspect samples\n",
    "for i, r in pd.json_normalize(events_df[events_df.type == \"sampling\"].data).iterrows():\n",
    "    print(f\"Prompt: {r.prompt}\")\n",
    "    print(f\"Sampled: {r.sampled}\")\n",
    "    print(\"-\" * 25)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.9"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  },
  "vscode": {
   "interpreter": {
    "hash": "fdbe172e46cfba2329a5e8d5b64cdf2d12f4dfd7d9bcea153ecef62d1d51933b"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}