lukekim / evals

main

1{
2 "cells": [
3  {
4   "attachments": {},
5   "cell_type": "markdown",
6   "metadata": {},
7   "source": [
8    "## Building an eval for LAMBADA\n",
9    "\n",
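10    "This notebook shows how to build an eval for the LAMBADA dataset: it downloads the data, converts each passage into a chat-format sample whose ideal answer is the passage's final word, registers the eval, runs it with `oaieval`, and inspects the logged samples."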
11   ]
12  },
13  {
14   "cell_type": "code",
15   "execution_count": null,
16   "metadata": {},
17   "outputs": [],
18   "source": [
19    "# Download the LAMBADA archive from https://zenodo.org/record/2630551 and extract it to examples/lambada-dataset (the commands below do this when run from examples/)\n",
20    "!curl -O https://zenodo.org/record/2630551/files/lambada-dataset.tar.gz\n",
21    "!tar -xzf lambada-dataset.tar.gz --one-top-level\n",
22    "!ls lambada-dataset\n",
23    "import os\n",
24    "import pandas as pd\n",
25    "\n",
26    "registry_path = os.path.join(\"..\", \"evals\", \"registry\")\n",
27    "os.makedirs(os.path.join(registry_path, \"data\", \"lambada\"), exist_ok=True)\n",
28    "\n",
29    "def create_chat_prompt(text):\n",
30    "    return [\n",
31    "        {\"role\": \"system\", \"content\": \"Please complete the passages with the correct next word.\"}, \n",
32    "        {\"role\": \"user\", \"content\": text}\n",
33    "    ]\n",
34    "\n",
35    "df = pd.read_csv('lambada-dataset/lambada_test_plain_text.txt', sep=\"\\t\", names=[\"text\"])\n",
36    "df[\"text\"] = df[\"text\"].str.split(\" \")\n",
37    "df[\"input\"], df[\"ideal\"] = df[\"text\"].str[:-1].str.join(\" \").apply(create_chat_prompt), df[\"text\"].str[-1]\n",
38    "df = df[[\"input\", \"ideal\"]]\n",
39    "df.to_json(os.path.join(registry_path, \"data/lambada/samples.jsonl\"), orient=\"records\", lines=True)\n",
40    "display(df.head())\n",
41    "\n",
42    "eval_yaml = \"\"\"\n",
43    "lambada:\n",
44    "  id: lambada.test.v1\n",
45    "  metrics: [accuracy]\n",
46    "lambada.test.v1:\n",
47    "  class: evals.elsuite.basic.match:Match\n",
48    "  args:\n",
49    "    samples_jsonl: lambada/samples.jsonl\n",
50    "\"\"\".strip()\n",
51    "with open(os.path.join(registry_path, \"evals\", \"lambada.yaml\"), \"w\") as f:\n",
52    "    f.write(eval_yaml)"
53   ]
54  },
55  {
56   "cell_type": "code",
57   "execution_count": null,
58   "metadata": {},
59   "outputs": [],
60   "source": [
61    "!oaieval gpt-3.5-turbo lambada --max_samples 20"
62   ]
63  },
64  {
65   "cell_type": "code",
66   "execution_count": null,
67   "metadata": {},
68   "outputs": [],
69   "source": [
70    "# Inspect samples\n",
71    "log_path = None # Set to the .jsonl log file written by oaieval, relative to /tmp/evallogs/\n",
72    "events = f\"/tmp/evallogs/{log_path}\"\n",
73    "with open(events, \"r\") as f:\n",
74    "    events_df = pd.read_json(f, lines=True)\n",
75    "for i, r in pd.json_normalize(events_df[events_df.type == \"sampling\"].data).iterrows():\n",
76    "    print(r)\n",
77    "    print(f\"Prompt: {r.prompt}\")\n",
78    "    print(f\"Sampled: {r.sampled}\")\n",
79    "    print(\"-\" * 25)"
80   ]
81  }
82 ],
83 "metadata": {
84  "kernelspec": {
85   "display_name": "base",
86   "language": "python",
87   "name": "python3"
88  },
89  "language_info": {
90   "codemirror_mode": {
91    "name": "ipython",
92    "version": 3
93   },
94   "file_extension": ".py",
95   "mimetype": "text/x-python",
96   "name": "python",
97   "nbconvert_exporter": "python",
98   "pygments_lexer": "ipython3",
99   "version": "3.10.9"
100  },
101  "orig_nbformat": 4
102 },
103 "nbformat": 4,
104 "nbformat_minor": 2
105}
106

1{
2 "cells": [
3  {
4   "attachments": {},
5   "cell_type": "markdown",
6   "metadata": {},
7   "source": [
8    "## Building an eval for LAMBADA\n",
9    "\n",
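10    "This notebook shows how to build an eval for the LAMBADA dataset: it downloads the data, converts each passage into a chat-format sample whose ideal answer is the passage's final word, registers the eval, runs it with `oaieval`, and inspects the logged samples."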
11   ]
12  },
13  {
14   "cell_type": "code",
15   "execution_count": null,
16   "metadata": {},
17   "outputs": [],
18   "source": [
19    "# Download the LAMBADA archive from https://zenodo.org/record/2630551 and extract it to examples/lambada-dataset (the commands below do this when run from examples/)\n",
20    "!curl -O https://zenodo.org/record/2630551/files/lambada-dataset.tar.gz\n",
21    "!tar -xzf lambada-dataset.tar.gz --one-top-level\n",
22    "!ls lambada-dataset\n",
23    "import os\n",
24    "import pandas as pd\n",
25    "\n",
26    "registry_path = os.path.join(\"..\", \"evals\", \"registry\")\n",
27    "os.makedirs(os.path.join(registry_path, \"data\", \"lambada\"), exist_ok=True)\n",
28    "\n",
29    "def create_chat_prompt(text):\n",
30    "    return [\n",
31    "        {\"role\": \"system\", \"content\": \"Please complete the passages with the correct next word.\"}, \n",
32    "        {\"role\": \"user\", \"content\": text}\n",
33    "    ]\n",
34    "\n",
35    "df = pd.read_csv('lambada-dataset/lambada_test_plain_text.txt', sep=\"\\t\", names=[\"text\"])\n",
36    "df[\"text\"] = df[\"text\"].str.split(\" \")\n",
37    "df[\"input\"], df[\"ideal\"] = df[\"text\"].str[:-1].str.join(\" \").apply(create_chat_prompt), df[\"text\"].str[-1]\n",
38    "df = df[[\"input\", \"ideal\"]]\n",
39    "df.to_json(os.path.join(registry_path, \"data/lambada/samples.jsonl\"), orient=\"records\", lines=True)\n",
40    "display(df.head())\n",
41    "\n",
42    "eval_yaml = \"\"\"\n",
43    "lambada:\n",
44    "  id: lambada.test.v1\n",
45    "  metrics: [accuracy]\n",
46    "lambada.test.v1:\n",
47    "  class: evals.elsuite.basic.match:Match\n",
48    "  args:\n",
49    "    samples_jsonl: lambada/samples.jsonl\n",
50    "\"\"\".strip()\n",
51    "with open(os.path.join(registry_path, \"evals\", \"lambada.yaml\"), \"w\") as f:\n",
52    "    f.write(eval_yaml)"
53   ]
54  },
55  {
56   "cell_type": "code",
57   "execution_count": null,
58   "metadata": {},
59   "outputs": [],
60   "source": [
61    "!oaieval gpt-3.5-turbo lambada --max_samples 20"
62   ]
63  },
64  {
65   "cell_type": "code",
66   "execution_count": null,
67   "metadata": {},
68   "outputs": [],
69   "source": [
70    "# Inspect samples\n",
71    "log_path = None # Set to the .jsonl log file written by oaieval, relative to /tmp/evallogs/\n",
72    "events = f\"/tmp/evallogs/{log_path}\"\n",
73    "with open(events, \"r\") as f:\n",
74    "    events_df = pd.read_json(f, lines=True)\n",
75    "for i, r in pd.json_normalize(events_df[events_df.type == \"sampling\"].data).iterrows():\n",
76    "    print(r)\n",
77    "    print(f\"Prompt: {r.prompt}\")\n",
78    "    print(f\"Sampled: {r.sampled}\")\n",
79    "    print(\"-\" * 25)"
80   ]
81  }
82 ],
83 "metadata": {
84  "kernelspec": {
85   "display_name": "base",
86   "language": "python",
87   "name": "python3"
88  },
89  "language_info": {
90   "codemirror_mode": {
91    "name": "ipython",
92    "version": 3
93   },
94   "file_extension": ".py",
95   "mimetype": "text/x-python",
96   "name": "python",
97   "nbconvert_exporter": "python",
98   "pygments_lexer": "ipython3",
99   "version": "3.10.9"
100  },
101  "orig_nbformat": 4
102 },
103 "nbformat": 4,
104 "nbformat_minor": 2
105}
106