Understanding clouds challenge #38

Open · wants to merge 6 commits into base: master
1 change: 1 addition & 0 deletions .gitignore
@@ -1,2 +1,3 @@
*.ipynb_checkpoints/
*.DS_Store
challenges/2023_cloud_classification/src/example_code/mlruns/
29 changes: 29 additions & 0 deletions challenges/2023_cloud_classification/README.md
@@ -0,0 +1,29 @@
# understanding-clouds-kaggle
This repository hosts the Data Science Community of Practice Understanding Clouds challenge. This challenge was built
upon the [Kaggle Understanding Clouds from Satellite Images](https://www.kaggle.com/competitions/understanding_cloud_organization/)
challenge hosted by the Max Planck Institute for Meteorology.

The original challenge required users to segment regions belonging to each of four classes - Fish, Flower, Gravel, and
Sugar. However, the challenge was adapted into an image classification task for the Data Science Community of Practice.
A set of 224x224 images that each contain a single class has been extracted and randomly separated into training and
test sets. These are used to train image classification algorithms that can then be evaluated against the test images
using a provided script.

__Environment__

The environment file describing the environment used to execute all the code in this subdirectory can be found here:

/data_science_cop/env/requirements_cloud_class.yml
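
To create the environment with conda (a minimal sketch, assuming conda is available and the command is run from the repository root; the environment name is taken from the yml file):

    conda env create -f env/requirements_cloud_class.yml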

This repository contains the following code:

__src/produce_test_train__

Scripts used to produce train / test image sets and labels

__src/example_code__

Examples of image classification algorithms that may be used to develop classifiers

__TODO:__
- Add links to SharePoint, kick-off meeting, monthly catch-ups, etc.
@@ -0,0 +1,225 @@
"""
Raw data is stored as a series of images and run-length-encoded labels.
This script converts each run-length-encoded label into a 2D array that matches the corresponding image. That array is
then searched for rectangles that satisfy certain criteria, producing a series of images that each contain a single
label.
"""

import argparse
import os
import warnings

import numpy as np
import pandas as pd
from PIL import Image

class LabelImages:
def __init__(self, input_fp, output_fp, labels_fn='train.csv', images_fp='train_images', image_shape=None,
labels_suffix=None):
self.input_fp = input_fp
self.output_fp = output_fp
self.labels_fn = labels_fn
self.images_fp = images_fp
self.image_shape = image_shape
self.labels_suffix = labels_suffix

self.images_fn = os.listdir(os.path.join(input_fp, images_fp))

self.label_codes = None
self.labels_rle = None
self.labels_2d = {}

def read_labels(self):
"""Read and process labels rle file"""
labels_rle = pd.read_csv(os.path.join(self.input_fp, self.labels_fn))

# labels stored with image name. Separate and remove original column
labels_rle['Image'] = labels_rle['Image_Label'].apply(lambda img_lbl: self.split_img_label(img_lbl)[0])
labels_rle['Label'] = labels_rle['Image_Label'].apply(lambda img_lbl: self.split_img_label(img_lbl)[1])
del labels_rle['Image_Label']

        # set label codes (sorted so codes are deterministic across runs/processes)
        self.label_codes = {k: v for v, k in enumerate(sorted(set(labels_rle['Label'])))}

self.labels_rle = labels_rle

def split_img_label(self, img_lbl):
"""Return image and label from file name like '0011165.jpg_Flower'"""
s = img_lbl.split("_")
assert len(s) == 2
return s[0], s[1]

def read_image(self, fn):
"""read image into numpy array"""
return Image.open(os.path.join(self.input_fp, self.images_fp, fn))

def rle_decode(self, rle, shape, value=1):
"""
Decodes an RLE-encoded string.

        Parameters
        ----------
        rle
            RLE-encoded mask string.
        shape
            Mask shape; callers in this script pass PIL's (width, height) image size,
            which matches the column-major pixel ordering of the Kaggle RLE.
        value
            Value to fill in the mask.

        Returns
        -------
        mask
            The decoded mask as a 2D array of the given shape, with NaN elsewhere.
"""

s = rle.split()
starts, lengths = [np.asarray(x, dtype=int) for x in (s[0:][::2], s[1:][::2])]
starts -= 1
ends = starts + lengths
        img = np.full(shape[0] * shape[1], np.nan)
for lo, hi in zip(starts, ends):
img[lo:hi] = value
return img.reshape(shape)
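    # Worked example (illustrative values, not from the dataset): decoding the RLE string
    # "1 3 10 2" with shape=(4, 4) and value=2.0 marks 1-indexed pixels 1-3 and 10-11,
    # giving, after the reshape:
    #   [[ 2.,  2.,  2., nan],
    #    [nan, nan, nan, nan],
    #    [nan,  2.,  2., nan],
    #    [nan, nan, nan, nan]]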

def produce_2d_labels(self):
"""converts rle labels into 2d np arrays of labels"""

if self.labels_rle is None:
self.read_labels()

images = self.labels_rle['Image']

for image_i in set(images):
print(f"Decoding {image_i}...")
shape = self.read_image(fn=image_i).size
self.labels_2d[image_i] = np.zeros(shape)
image_l_rle = self.labels_rle[self.labels_rle['Image'] == image_i].copy()
image_l_rle.dropna(inplace=True)

            for label_i in set(image_l_rle['Label']):
                v = image_l_rle['EncodedPixels'][image_l_rle['Label'] == label_i].values[0]
                decoded = self.rle_decode(v, shape=shape, value=self.label_codes[label_i])
                # rle_decode fills unlabelled pixels with NaN; treat those as zero so codes accumulate
                self.labels_2d[image_i] += np.nan_to_num(decoded)

    def sample_rectangles_idx(self, arr, value=np.nan, rectangle_size=(224, 224), num_samples=np.inf):
        arr = arr.copy()
        if arr.dtype != 'float64':
            warnings.warn('Converting array to float64')
            arr = arr.astype(float)

        sample_rectangles = []
        failed_to_sample = False
        while not failed_to_sample and len(sample_rectangles) < num_samples:
            print(f"Sampling rectangles: {len(sample_rectangles) + 1} of {num_samples}")

            valid_indices = np.argwhere(arr == value)
            np.random.shuffle(valid_indices)

            # iterate through random valid indices until the criteria are satisfied, or the loop ends
            for idx in valid_indices:
                x0, y0 = idx
                x1, y1 = x0 + rectangle_size[0], y0 + rectangle_size[1]

                sample_arr = arr[x0:x1, y0:y1]

                # if the criteria are satisfied, save the indices, set the sampled region to NaN
                # so sampling is without replacement, and start the next cycle
                if (sample_arr != value).sum() == 0 and sample_arr.shape == rectangle_size:
                    sample_rectangles.append((x0, y0))
                    arr[x0:x1, y0:y1] = np.nan
                    break
            else:
                # no candidate produced a suitable rectangle (including the case of no valid
                # indices at all), so end the while loop
                failed_to_sample = True

        return sample_rectangles
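    # Illustrative behaviour (hypothetical input): for a 500x500 array filled entirely with
    # the label code 3.0, sample_rectangles_idx(arr, value=3.0) returns the (row, col)
    # top-left corners of randomly placed, non-overlapping 224x224 rectangles that contain
    # only that value; sampled regions are set to NaN so they cannot be drawn again.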

    def plot_2d_labels(self):
        pass

def uniquify(self, path):
"""
return path with suffixed numbers if path already exists
"""
filename, extension = os.path.splitext(path)
counter = 1

while os.path.exists(path):
path = filename + " (" + str(counter) + ")" + extension
counter += 1

return path
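    # e.g. if 'out/labels.csv' already exists, uniquify('out/labels.csv') returns
    # 'out/labels (1).csv', then 'out/labels (2).csv', and so on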

def extract_labelled_image(self, img_idx_to_load=None):
"""saves images and labels where images are subsets of those provided that satisfy certain criteria
:param img_idx_to_load: list of images to load. Used for parallelisation. If None, does all
"""
if self.labels_rle is None:
self.read_labels()

        images = self.labels_rle['Image']
        labels = []

        # the output directory is constant; create it once up front so it exists even if no rectangles are found
        fp = os.path.join(self.output_fp, 'single_labels', '224s')
        os.makedirs(fp, exist_ok=True)

        if img_idx_to_load is None:
            img_idx_to_load = list(range(len(images.unique())))

        # remove indices outside the valid range (these can occur when called from bash/sbatch scripts)
        img_idx_to_load = [x for x in img_idx_to_load if x < len(images.unique())]

for image_i in images.unique()[img_idx_to_load]:
print(f"Analysing {image_i}...")

img = self.read_image(fn=image_i)

# labels and rle pixels
image_l_rle = self.labels_rle[self.labels_rle['Image'] == image_i].copy()
image_l_rle.dropna(inplace=True)

for label_i in image_l_rle['Label']:
# first decode the rle into a 2d array
v = image_l_rle['EncodedPixels'][image_l_rle['Label'] == label_i].values[0]
labels_2d = self.rle_decode(v, shape=img.size, value=self.label_codes[label_i])

# exhaustively search the rle for rectangles of predefined shape that satisfy criteria
idxs = self.sample_rectangles_idx(arr=labels_2d, value=self.label_codes[label_i], rectangle_size=(224, 224))

# output images and labels
for i, idx in enumerate(idxs):
cropped_img = img.crop((idx[0], idx[1],
idx[0] + 224, idx[1] + 224))

fn = f"{image_i.split('.')[0]}_{i}.jpg"
fp_fn = self.uniquify(os.path.join(fp, fn))

cropped_img.save(fp_fn)
                    labels.append(pd.DataFrame({
                        "Image": [os.path.basename(fp_fn)],
                        "Label": [label_i]
                    }))

        if labels:
            labels = pd.concat(labels)
            labels.to_csv(self.uniquify(os.path.join(fp, f"labels_{self.labels_suffix}.csv")))


if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--step', nargs='?', help='If running in steps, which step this iteration is', type=int)
parser.add_argument('--step_len', nargs='?', help='If running in steps, length of step', type=int)

args = parser.parse_args()
step = args.step
step_len = args.step_len

    labels_suffix = None  # set a suffix when processing slices so label files are not overwritten or written in parallel
    idxs = None  # None means process all images
    if step is not None:
        idxs = list(range(step * step_len, step * step_len + step_len))
        labels_suffix = f"{min(idxs)}-{max(idxs)}"

    label_images_class = LabelImages(input_fp="/data/users/meastman/understanding_clouds_kaggle/input",
                                     output_fp="/data/users/meastman/understanding_clouds_kaggle/input",
                                     labels_suffix=labels_suffix)
label_images_class.extract_labelled_image(idxs)
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/bin/bash
#SBATCH --array=0-221
#SBATCH --mem=8000
#SBATCH --output=src/sbatch/o/extract_lab_img_%a.out
#SBATCH --error=src/sbatch/e/extract_lab_img_%a.err
#SBATCH --time=00:04:00

module load scitools

# 5546 unique images; process 25 per array task (indices beyond the range are filtered out in the script)
python src/functions/extract_labelled_images.py --step $SLURM_ARRAY_TASK_ID --step_len 25
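
# e.g. array task 3 runs: python src/functions/extract_labelled_images.py --step 3 --step_len 25,
# which processes image indices 75-99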
@@ -0,0 +1,81 @@
"""
Takes the output from extract_labelled_images.sbatch, separates the images into train/test sets, and produces
train/test label files.
"""

import os
import glob
import shutil
import pandas as pd
from sklearn.model_selection import train_test_split

class PPLabelledImages:

def __init__(self, parent_dir):
self.parent_dir = parent_dir

self.fps = None
self.labels = None
self.image_names = None
self.train_images = None
self.test_images = None

def extract_labels(self):
"""Read label files"""
self.fps = glob.glob(os.path.join(self.parent_dir, '*.csv'))

dfs = [pd.read_csv(fp, index_col=0) for fp in self.fps]
self.labels = pd.concat(dfs, ignore_index=True)

def move_test_train_files(self):
        if self.train_images is None or self.test_images is None:
            raise ValueError("No images in self.train_images or self.test_images to move")

os.makedirs(os.path.join(self.parent_dir, 'train'), exist_ok=True)
os.makedirs(os.path.join(self.parent_dir, 'test'), exist_ok=True)

train_fps = [glob.glob(os.path.join(self.parent_dir, img + '*')) for img in self.train_images]
test_fps = [glob.glob(os.path.join(self.parent_dir, img + '*')) for img in self.test_images]

        # flatten the lists of matched paths
        train_fps = [fp for fps in train_fps for fp in fps]
        test_fps = [fp for fps in test_fps for fp in fps]

        # move files into the train/test directories
        for fp in train_fps:
            shutil.move(fp, os.path.join(self.parent_dir, 'train', os.path.basename(fp)))
        for fp in test_fps:
            shutil.move(fp, os.path.join(self.parent_dir, 'test', os.path.basename(fp)))

def delete_files(self):
pass

def process_images(self):
# read all label files into pd.DataFrame
self.extract_labels()

# separate entire images into test/train
image_names = [img.split('.')[0].split('_')[0] for img in self.labels['Image']]
self.image_names = list(set(image_names))
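        # note: splitting on the source image name (the prefix before '_') keeps crops from the
        # same scene in only one of the two sets, avoiding leakage between train and test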

self.train_images, self.test_images = train_test_split(self.image_names, test_size=.2, random_state=11)

# move files into new directories with train/test specific label files
self.move_test_train_files()
train_labels = self.labels[[x in self.train_images for x in image_names]]
test_labels = self.labels[[x in self.test_images for x in image_names]]

train_labels.to_csv(os.path.join(self.parent_dir, 'train', 'train_labels.csv'))
test_labels.to_csv(os.path.join(self.parent_dir, 'test', 'test_labels.csv'))

# delete old label files
for fn in glob.glob(os.path.join(self.parent_dir, '*.csv')):
os.remove(fn)


if __name__ == '__main__':
lab_img_processor = PPLabelledImages(
parent_dir='/data/users/meastman/understanding_clouds_kaggle/input/single_labels/224s')
lab_img_processor.process_images()
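
For orientation, a minimal sketch of loading the resulting training set (the directory layout and CSV columns follow from process_images above; parent_dir is the example path hard-coded in this script, and head(3) is arbitrary):

import os
import pandas as pd
from PIL import Image

parent_dir = '/data/users/meastman/understanding_clouds_kaggle/input/single_labels/224s'
train_labels = pd.read_csv(os.path.join(parent_dir, 'train', 'train_labels.csv'), index_col=0)

# each row pairs a cropped 224x224 image file with its single cloud-class label
for _, row in train_labels.head(3).iterrows():
    img = Image.open(os.path.join(parent_dir, 'train', row['Image']))
    print(row['Image'], row['Label'], img.size)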
1 change: 1 addition & 0 deletions env/README.md
@@ -51,6 +51,7 @@ Each environment has a focus on a particular technology, below lists the existing
+ `requirements_tensorflow.yml`: [Tensorflow](https://www.tensorflow.org/)
+ `requirements_xgboost.yml`: [XGBoost](https://xgboost.ai/)
+ `requirements_pymc.yml`: [PyMC](https://www.pymc.io/welcome.html) and [ArviZ](https://python.arviz.org/en/stable/)
+ `requirements_cloud_class.yml`: [Cloud Classification Challenge](https://github.com/MetOffice/data_science_cop/tree/understanding-clouds-challenge/challenges/2023_cloud_classification)

## Use on Met Office systems
