1{
2 "cells": [
3 {
4 "attachments": {},
5 "cell_type": "markdown",
6 "metadata": {},
7 "source": [
8 "## Building an eval for LAMBADA\n",
9 "\n",
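10 "This notebook shows how to build an eval for the LAMBADA dataset: it downloads the test set, turns each passage into a chat prompt whose expected answer is the passage's final word, registers the eval, and runs it on a small sample."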
11 ]
12 },
13 {
14 "cell_type": "code",
15 "execution_count": null,
16 "metadata": {},
17 "outputs": [],
18 "source": [
19 "# Download LAMBADA from https://zenodo.org/record/2630551 and unpack it into lambada-dataset/ in the working directory (examples/lambada-dataset); -L follows redirects and -f makes curl fail on HTTP errors instead of saving an error page as the archive\n",
20 "!curl -fL -O https://zenodo.org/record/2630551/files/lambada-dataset.tar.gz\n",
21 "!tar -xzf lambada-dataset.tar.gz --one-top-level\n",
22 "!ls lambada-dataset\n",
23 "import os\n",
24 "import pandas as pd\n",
25 "\n",
26 "registry_path = os.path.join(\"..\", \"evals\", \"registry\")  # registry root, relative to the working directory\n",
27 "os.makedirs(os.path.join(registry_path, \"data\", \"lambada\"), exist_ok=True)  # exist_ok keeps the cell re-runnable\n",
28 "\n",
29 "def create_chat_prompt(text):\n",
30 "    \"\"\"Return a two-message chat prompt: the fixed completion instruction, then `text` as the user turn.\"\"\"\n",
31 "    instruction = {\"role\": \"system\", \"content\": \"Please complete the passages with the correct next word.\"}\n",
32 "    passage = {\"role\": \"user\", \"content\": text}\n",
33 "    return [instruction, passage]\n",
34 "\n",
35 "passages = pd.read_csv('lambada-dataset/lambada_test_plain_text.txt', sep=\"\\t\", names=[\"text\"])[\"text\"].str.split(\" \")\n",
36 "# Every word but the last becomes the chat prompt; the last word is the expected completion.\n",
37 "df = pd.DataFrame({\"input\": passages.str[:-1].str.join(\" \").apply(create_chat_prompt), \"ideal\": passages.str[-1]})\n",
38 "samples_path = os.path.join(registry_path, \"data/lambada/samples.jsonl\")\n",
39 "df.to_json(samples_path, orient=\"records\", lines=True)\n",
40 "display(df.head())\n",
41 "\n",
42 "eval_yaml = \"\"\"\n",
43 "lambada:\n",
44 "  id: lambada.test.v1\n",
45 "  metrics: [accuracy]\n",
46 "lambada.test.v1:\n",
47 "  class: evals.elsuite.basic.match:Match\n",
48 "  args:\n",
49 "    samples_jsonl: lambada/samples.jsonl\n",
50 "\"\"\".strip()  # nesting matters: samples_jsonl must sit under args, not beside it\n",
51 "with open(os.path.join(registry_path, \"evals\", \"lambada.yaml\"), \"w\") as f:  # write the eval definition into the registry\n",
52 "    f.write(eval_yaml)"
53 ]
54 },
55 {
56 "cell_type": "code",
57 "execution_count": null,
58 "metadata": {},
59 "outputs": [],
60 "source": [
61 "!oaieval gpt-3.5-turbo lambada --max_samples 20"
62 ]
63 },
64 {
65 "cell_type": "code",
66 "execution_count": null,
67 "metadata": {},
68 "outputs": [],
69 "source": [
70 "# Inspect samples\n",
71 "log_path = None  # Set to jsonl path to logs from oaieval (a file name under /tmp/evallogs, or an absolute path)\n",
72 "assert log_path is not None, \"Set log_path to the jsonl log written by oaieval before running this cell\"\n",
73 "events = os.path.join(\"/tmp/evallogs\", log_path)  # an absolute log_path replaces the /tmp/evallogs prefix\n",
74 "with open(events, \"r\") as f:\n",
75 "    events_df = pd.read_json(f, lines=True)\n",
76 "for _, r in pd.json_normalize(events_df[events_df.type == \"sampling\"].data).iterrows():  # one row per sampling event\n",
77 "    print(r)\n",
78 "    print(f\"Prompt: {r.prompt}\\nSampled: {r.sampled}\")\n",
79 "    print(\"-\" * 25)"
80 ]
81 }
82 ],
83 "metadata": {
84 "kernelspec": {
85 "display_name": "base",
86 "language": "python",
87 "name": "python3"
88 },
89 "language_info": {
90 "codemirror_mode": {
91 "name": "ipython",
92 "version": 3
93 },
94 "file_extension": ".py",
95 "mimetype": "text/x-python",
96 "name": "python",
97 "nbconvert_exporter": "python",
98 "pygments_lexer": "ipython3",
99 "version": "3.10.9"
100 },
101 "orig_nbformat": 4
102 },
103 "nbformat": 4,
104 "nbformat_minor": 2
105}
106
1{
2 "cells": [
3 {
4 "attachments": {},
5 "cell_type": "markdown",
6 "metadata": {},
7 "source": [
8 "## Building an eval for LAMBADA\n",
9 "\n",
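10 "This notebook shows how to build an eval for the LAMBADA dataset: it downloads the test set, turns each passage into a chat prompt whose expected answer is the passage's final word, registers the eval, and runs it on a small sample."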
11 ]
12 },
13 {
14 "cell_type": "code",
15 "execution_count": null,
16 "metadata": {},
17 "outputs": [],
18 "source": [
19 "# Download LAMBADA from https://zenodo.org/record/2630551 and unpack it into lambada-dataset/ in the working directory (examples/lambada-dataset); -L follows redirects and -f makes curl fail on HTTP errors instead of saving an error page as the archive\n",
20 "!curl -fL -O https://zenodo.org/record/2630551/files/lambada-dataset.tar.gz\n",
21 "!tar -xzf lambada-dataset.tar.gz --one-top-level\n",
22 "!ls lambada-dataset\n",
23 "import os\n",
24 "import pandas as pd\n",
25 "\n",
26 "registry_path = os.path.join(\"..\", \"evals\", \"registry\")  # registry root, relative to the working directory\n",
27 "os.makedirs(os.path.join(registry_path, \"data\", \"lambada\"), exist_ok=True)  # exist_ok keeps the cell re-runnable\n",
28 "\n",
29 "def create_chat_prompt(text):\n",
30 "    \"\"\"Return a two-message chat prompt: the fixed completion instruction, then `text` as the user turn.\"\"\"\n",
31 "    instruction = {\"role\": \"system\", \"content\": \"Please complete the passages with the correct next word.\"}\n",
32 "    passage = {\"role\": \"user\", \"content\": text}\n",
33 "    return [instruction, passage]\n",
34 "\n",
35 "passages = pd.read_csv('lambada-dataset/lambada_test_plain_text.txt', sep=\"\\t\", names=[\"text\"])[\"text\"].str.split(\" \")\n",
36 "# Every word but the last becomes the chat prompt; the last word is the expected completion.\n",
37 "df = pd.DataFrame({\"input\": passages.str[:-1].str.join(\" \").apply(create_chat_prompt), \"ideal\": passages.str[-1]})\n",
38 "samples_path = os.path.join(registry_path, \"data/lambada/samples.jsonl\")\n",
39 "df.to_json(samples_path, orient=\"records\", lines=True)\n",
40 "display(df.head())\n",
41 "\n",
42 "eval_yaml = \"\"\"\n",
43 "lambada:\n",
44 "  id: lambada.test.v1\n",
45 "  metrics: [accuracy]\n",
46 "lambada.test.v1:\n",
47 "  class: evals.elsuite.basic.match:Match\n",
48 "  args:\n",
49 "    samples_jsonl: lambada/samples.jsonl\n",
50 "\"\"\".strip()  # nesting matters: samples_jsonl must sit under args, not beside it\n",
51 "with open(os.path.join(registry_path, \"evals\", \"lambada.yaml\"), \"w\") as f:  # write the eval definition into the registry\n",
52 "    f.write(eval_yaml)"
53 ]
54 },
55 {
56 "cell_type": "code",
57 "execution_count": null,
58 "metadata": {},
59 "outputs": [],
60 "source": [
61 "!oaieval gpt-3.5-turbo lambada --max_samples 20"
62 ]
63 },
64 {
65 "cell_type": "code",
66 "execution_count": null,
67 "metadata": {},
68 "outputs": [],
69 "source": [
70 "# Inspect samples\n",
71 "log_path = None  # Set to jsonl path to logs from oaieval (a file name under /tmp/evallogs, or an absolute path)\n",
72 "assert log_path is not None, \"Set log_path to the jsonl log written by oaieval before running this cell\"\n",
73 "events = os.path.join(\"/tmp/evallogs\", log_path)  # an absolute log_path replaces the /tmp/evallogs prefix\n",
74 "with open(events, \"r\") as f:\n",
75 "    events_df = pd.read_json(f, lines=True)\n",
76 "for _, r in pd.json_normalize(events_df[events_df.type == \"sampling\"].data).iterrows():  # one row per sampling event\n",
77 "    print(r)\n",
78 "    print(f\"Prompt: {r.prompt}\\nSampled: {r.sampled}\")\n",
79 "    print(\"-\" * 25)"
80 ]
81 }
82 ],
83 "metadata": {
84 "kernelspec": {
85 "display_name": "base",
86 "language": "python",
87 "name": "python3"
88 },
89 "language_info": {
90 "codemirror_mode": {
91 "name": "ipython",
92 "version": 3
93 },
94 "file_extension": ".py",
95 "mimetype": "text/x-python",
96 "name": "python",
97 "nbconvert_exporter": "python",
98 "pygments_lexer": "ipython3",
99 "version": "3.10.9"
100 },
101 "orig_nbformat": 4
102 },
103 "nbformat": 4,
104 "nbformat_minor": 2
105}
106