From 4882c8a317cc0f59c96624ce14c8c10d05fa6dbc Mon Sep 17 00:00:00 2001
From: Hongbin Sun <hongbin306@gmail.com>
Date: Tue, 1 Jun 2021 21:59:40 +0800
Subject: [PATCH] dataset preparation docs (#255)

---
 docs/datasets.md                        | 50 ++++++++++++++++++++-----
 tools/data/textdet/ctw1500_converter.py |  2 +-
 tools/data/textdet/icdar_converter.py   |  5 ++-
 3 files changed, 44 insertions(+), 13 deletions(-)

diff --git a/docs/datasets.md b/docs/datasets.md
index 91fa998e6..b54038141 100644
--- a/docs/datasets.md
+++ b/docs/datasets.md
@@ -44,23 +44,53 @@ The structure of the text detection dataset directory is organized as follows.
 | Synthtext | [homepage](https://www.robots.ox.ac.uk/~vgg/data/scenetext/)  |                                                                                      | [instances_training.lmdb](https://download.openmmlab.com/mmocr/data/synthtext/instances_training.lmdb) |                    -                    |
 
 - For `icdar2015`:
-  - Step1: Download `ch4_training_images.zip` and `ch4_test_images.zip` from [homepage](https://rrc.cvc.uab.es/?ch=4&com=downloads)
-  - Step2: Download [instances_training.json](https://download.openmmlab.com/mmocr/data/icdar2015/instances_training.json) and [instances_test.json](https://download.openmmlab.com/mmocr/data/icdar2015/instances_test.json)
-  - Step3:
-
+  - Step1: Download `ch4_training_images.zip`, `ch4_test_images.zip`, `ch4_training_localization_transcription_gt.zip`, `Challenge4_Test_Task1_GT.zip` from [homepage](https://rrc.cvc.uab.es/?ch=4&com=downloads)
+  - Step2:
   ```bash
   mkdir icdar2015 && cd icdar2015
-  mv /path/to/instances_training.json .
-  mv /path/to/instances_test.json .
-
-  mkdir imgs && cd imgs
-  ln -s /path/to/ch4_training_images training
-  ln -s /path/to/ch4_test_images test
+  mkdir imgs && mkdir annotations
+  # For images,
+  mv /path/to/ch4_training_images imgs/training
+  mv /path/to/ch4_test_images imgs/test
+  # For annotations,
+  mv /path/to/ch4_training_localization_transcription_gt annotations/training
+  mv /path/to/Challenge4_Test_Task1_GT annotations/test
+  ```
+  - Step3: Download [instances_training.json](https://download.openmmlab.com/mmocr/data/icdar2015/instances_training.json) and [instances_test.json](https://download.openmmlab.com/mmocr/data/icdar2015/instances_test.json) and move them to `icdar2015`
+  - Or, generate `instances_training.json` and `instances_test.json` with the following command:
+  ```bash
+  python tools/data/textdet/icdar_converter.py /path/to/icdar2015 -o /path/to/icdar2015 -d icdar2015 --split-list training test
   ```
 
 - For `icdar2017`:
   - To avoid the effect of rotation when load `jpg` with opencv, We provide re-saved `png` format image in [renamed_images](https://download.openmmlab.com/mmocr/data/icdar2017/renamed_imgs.tar). You can copy these images to `imgs`.
 
+- For `ctw1500`:
+  - Step1: Download `train_images.zip`, `test_images.zip`, `train_labels.zip`, `test_labels.zip` from [github](https://github.com/Yuliang-Liu/Curve-Text-Detector)
+  ```bash
+  mkdir ctw1500 && cd ctw1500
+  mkdir imgs && mkdir annotations
+
+  # For annotations
+  cd annotations
+  wget -O train_labels.zip https://universityofadelaide.box.com/shared/static/jikuazluzyj4lq6umzei7m2ppmt3afyw.zip
+  wget -O test_labels.zip https://cloudstor.aarnet.edu.au/plus/s/uoeFl0pCN9BOCN5/download
+  unzip train_labels.zip && mv ctw1500_train_labels training
+  unzip test_labels.zip -d test
+  cd ..
+  # For images
+  cd imgs
+  wget -O train_images.zip https://universityofadelaide.box.com/shared/static/py5uwlfyyytbb2pxzq9czvu6fuqbjdh8.zip
+  wget -O test_images.zip https://universityofadelaide.box.com/shared/static/t4w48ofnqkdw7jyc4t11nsukoeqk9c3d.zip
+  unzip train_images.zip && mv train_images training
+  unzip test_images.zip && mv test_images test
+  ```
+  - Step2: Download [instances_training.json](https://download.openmmlab.com/mmocr/data/ctw1500/instances_training.json) and [instances_test.json](https://download.openmmlab.com/mmocr/data/ctw1500/instances_test.json), and move them to the `ctw1500` folder.
+  - Or, generate `instances_training.json` and `instances_test.json` with the following command:
+  ```bash
+  python tools/data/textdet/ctw1500_converter.py /path/to/ctw1500 -o /path/to/ctw1500 --split-list training test
+  ```
+
 ## Text Recognition
 
 **The structure of the text recognition dataset directory is organized as follows.**
diff --git a/tools/data/textdet/ctw1500_converter.py b/tools/data/textdet/ctw1500_converter.py
index 18b6f7666..ee42cf3fb 100644
--- a/tools/data/textdet/ctw1500_converter.py
+++ b/tools/data/textdet/ctw1500_converter.py
@@ -201,7 +201,7 @@ def parse_args():
     parser.add_argument(
         '--split-list',
         nargs='+',
-        help='a list of splits. e.g., "--split_list training test"')
+        help='a list of splits. e.g., "--split-list training test"')
 
     parser.add_argument(
         '--nproc', default=1, type=int, help='number of process')
diff --git a/tools/data/textdet/icdar_converter.py b/tools/data/textdet/icdar_converter.py
index 2c6b3f6ec..584eb4f90 100644
--- a/tools/data/textdet/icdar_converter.py
+++ b/tools/data/textdet/icdar_converter.py
@@ -150,11 +150,12 @@ def parse_args():
     )
     parser.add_argument('icdar_path', help='icdar root path')
     parser.add_argument('-o', '--out-dir', help='output path')
-    parser.add_argument('-d', '--dataset', help='icdar2017 or icdar2015')
+    parser.add_argument(
+        '-d', '--dataset', required=True, help='icdar2017 or icdar2015')
     parser.add_argument(
         '--split-list',
         nargs='+',
-        help='a list of splits. e.g., "--split-list training validation test"')
+        help='a list of splits. e.g., "--split-list training test"')
 
     parser.add_argument(
         '--nproc', default=1, type=int, help='number of process')