From a095cb6285e6cc4e84a01dc2c04591e6429116ea Mon Sep 17 00:00:00 2001 From: Andrew Maguire Date: Mon, 4 Dec 2023 16:09:43 +0000 Subject: [PATCH] Add split_dataset.py and generate_oasst2.ipynb files --- oasst-data/examples/split_dataset.py | 9 +++++++ oasst-data/oasst2/generate_oasst2.ipynb | 34 ++++++++++++++++++++++--- 2 files changed, 40 insertions(+), 3 deletions(-) diff --git a/oasst-data/examples/split_dataset.py b/oasst-data/examples/split_dataset.py index 0a47a7ca0c..618185bb24 100644 --- a/oasst-data/examples/split_dataset.py +++ b/oasst-data/examples/split_dataset.py @@ -1,3 +1,12 @@ +r""" +Example usage: + + python split_dataset.py \ + "2023-11-05_oasst_all.messages.jsonl" \ + --val_output "2023-11-05_oasst_all.messages.validation.jsonl" \ + --train_output "2023-11-05_oasst_all.messages.train.jsonl" +""" + import argparse import random diff --git a/oasst-data/oasst2/generate_oasst2.ipynb b/oasst-data/oasst2/generate_oasst2.ipynb index 48a706bfef..7ee968bc10 100644 --- a/oasst-data/oasst2/generate_oasst2.ipynb +++ b/oasst-data/oasst2/generate_oasst2.ipynb @@ -23,6 +23,8 @@ "instructions_path = f\"{data_dir}/instructions.xlsx\"\n", "trees_filename = f\"2023-11-05_oasst_all.trees.jsonl\"\n", "messages_filename = f\"2023-11-05_oasst_all.messages.jsonl\"\n", + "messages_train_filename = f\"2023-11-05_oasst_all.messages.train.jsonl\"\n", + "messages_validation_filename = f\"2023-11-05_oasst_all.messages.validation.jsonl\"\n", "\n", "# make data_out_dir if it doesn't exist\n", "if not os.path.exists(data_out_dir):\n", @@ -811,7 +813,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -858,16 +860,42 @@ "cell_type": "code", "execution_count": 7, "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Reading: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.messages.jsonl\n", + "Found 208584 matching messages.\n", + "Writing train 198293 messages: 
C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.messages.train.jsonl\n", + "Writing valid 10291 messages: C:/Users/andre/Downloads/oasst2/2023-11-05_oasst_all.messages.validation.jsonl\n" + ] + } + ], + "source": [ + "# split messages into train and validation\n", + "!python ../examples/split_dataset.py \\\n", + " \"{data_out_dir}/{messages_filename}\" \\\n", + " --train_output \"{data_out_dir}/{messages_train_filename}\" \\\n", + " --val_output \"{data_out_dir}/{messages_validation_filename}\"" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, "outputs": [], "source": [ "# make .gz files, keeping the original files\n", "!gzip -c \"{data_out_dir}/{trees_filename}\" > \"{data_out_dir}/{trees_filename}.gz\"\n", - "!gzip -c \"{data_out_dir}/{messages_filename}\" > \"{data_out_dir}/{messages_filename}.gz\"" + "!gzip -c \"{data_out_dir}/{messages_filename}\" > \"{data_out_dir}/{messages_filename}.gz\"\n", + "!gzip -c \"{data_out_dir}/{messages_train_filename}\" > \"{data_out_dir}/{messages_train_filename}.gz\"\n", + "!gzip -c \"{data_out_dir}/{messages_validation_filename}\" > \"{data_out_dir}/{messages_validation_filename}.gz\"" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [