Understanding clouds challenge #38

Open · wants to merge 6 commits into base: master
1 change: 1 addition & 0 deletions .gitignore
@@ -1,2 +1,3 @@
*.ipynb_checkpoints/
*.DS_Store
challenges/2023_cloud_classification/src/example_code/mlruns/
29 changes: 29 additions & 0 deletions challenges/2023_cloud_classification/README.md
@@ -0,0 +1,29 @@
# understanding-clouds-kaggle
This repository hosts the Data Science Community of Practice Understanding Clouds challenge. This challenge was built
upon the [Kaggle Understanding Clouds from Satellite Images](https://www.kaggle.com/competitions/understanding_cloud_organization/)
challenge hosted by the Max Planck Institute for Meteorology.

The original challenge required users to segment regions belonging to each of four classes - Fish, Flower, Gravel, and
Sugar. However, the challenge was adapted into an image classification task for the Data Science Community of Practice.
A set of 224x224 images that each contain a single class has been extracted and randomly separated into training and
test sets. These are used to train image classification algorithms that can then be evaluated against the test images
using a provided script.

__Environment__

The environment file describing the environment used to execute all the code in this subdirectory can be found here:

/data_science_cop/env/requirements_cloud_class.yml
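
To create the environment with conda (a minimal sketch, assuming conda is available and the command is run from the repository root; the environment name is taken from the yml file):

    conda env create -f env/requirements_cloud_class.yml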

This repository contains the following code:

__src/produce_test_train__

Scripts used to produce train / test image sets and labels

__src/example_code__

Examples of image classification algorithms that may be used to develop classifiers

__TODO:__
- Add links to SharePoint, kick-off meeting, monthly catch-ups, etc.
@@ -0,0 +1,225 @@
"""
Raw data is stored as a series of images and run-length-encoded labels.
This script converts each run-length-encoded label into a 2D array that matches the corresponding image. That array is
then searched for rectangles that satisfy certain criteria, producing a series of images that each contain a single
label.
"""

import argparse
import os
import warnings

import numpy as np
import pandas as pd
from PIL import Image

class LabelImages:
def __init__(self, input_fp, output_fp, labels_fn='train.csv', images_fp='train_images', image_shape=None,
labels_suffix=None):
self.input_fp = input_fp
self.output_fp = output_fp
self.labels_fn = labels_fn
self.images_fp = images_fp
self.image_shape = image_shape
self.labels_suffix = labels_suffix

self.images_fn = os.listdir(os.path.join(input_fp, images_fp))

self.label_codes = None
self.labels_rle = None
self.labels_2d = {}

def read_labels(self):
"""Read and process labels rle file"""
labels_rle = pd.read_csv(os.path.join(self.input_fp, self.labels_fn))

# labels stored with image name. Separate and remove original column
labels_rle['Image'] = labels_rle['Image_Label'].apply(lambda img_lbl: self.split_img_label(img_lbl)[0])
labels_rle['Label'] = labels_rle['Image_Label'].apply(lambda img_lbl: self.split_img_label(img_lbl)[1])
del labels_rle['Image_Label']

        # set label codes (sorted so codes are deterministic across runs/processes)
        self.label_codes = {k: v for v, k in enumerate(sorted(set(labels_rle['Label'])))}

self.labels_rle = labels_rle

def split_img_label(self, img_lbl):
"""Return image and label from file name like '0011165.jpg_Flower'"""
s = img_lbl.split("_")
assert len(s) == 2
return s[0], s[1]

def read_image(self, fn):
"""read image into numpy array"""
return Image.open(os.path.join(self.input_fp, self.images_fp, fn))

def rle_decode(self, rle, shape, value=1):
"""
Decodes an RLE-encoded string.

        Parameters
        ----------
        rle
            RLE-encoded mask string.
        shape
            Mask shape; callers in this script pass PIL's (width, height) image size,
            which matches the column-major pixel ordering of the Kaggle RLE.
        value
            Value to fill in the mask.

        Returns
        -------
        mask
            The decoded mask as a 2D array of the given shape, with NaN elsewhere.
"""

s = rle.split()
starts, lengths = [np.asarray(x, dtype=int) for x in (s[0:][::2], s[1:][::2])]
starts -= 1
ends = starts + lengths
        img = np.full(shape[0] * shape[1], np.nan)
for lo, hi in zip(starts, ends):
img[lo:hi] = value
return img.reshape(shape)
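    # Worked example (illustrative values, not from the dataset): decoding the RLE string
    # "1 3 10 2" with shape=(4, 4) and value=2.0 marks 1-indexed pixels 1-3 and 10-11,
    # giving, after the reshape:
    #   [[ 2.,  2.,  2., nan],
    #    [nan, nan, nan, nan],
    #    [nan,  2.,  2., nan],
    #    [nan, nan, nan, nan]]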

def produce_2d_labels(self):
"""converts rle labels into 2d np arrays of labels"""

if self.labels_rle is None:
self.read_labels()

images = self.labels_rle['Image']

for image_i in set(images):
print(f"Decoding {image_i}...")
shape = self.read_image(fn=image_i).size
self.labels_2d[image_i] = np.zeros(shape)
image_l_rle = self.labels_rle[self.labels_rle['Image'] == image_i].copy()
image_l_rle.dropna(inplace=True)

            for label_i in set(image_l_rle['Label']):
                v = image_l_rle['EncodedPixels'][image_l_rle['Label'] == label_i].values[0]
                decoded = self.rle_decode(v, shape=shape, value=self.label_codes[label_i])
                # rle_decode fills unlabelled pixels with NaN; treat those as zero so codes accumulate
                self.labels_2d[image_i] += np.nan_to_num(decoded)

    def sample_rectangles_idx(self, arr, value=np.nan, rectangle_size=(224, 224), num_samples=np.inf):
        arr = arr.copy()
        if arr.dtype != 'float64':
            warnings.warn('Converting array to float64')
            arr = arr.astype(float)

        sample_rectangles = []
        failed_to_sample = False
        while not failed_to_sample and len(sample_rectangles) < num_samples:
            print(f"Sampling rectangles: {len(sample_rectangles) + 1} of {num_samples}")

            valid_indices = np.argwhere(arr == value)
            np.random.shuffle(valid_indices)

            # iterate through random valid indices until the criteria are satisfied, or the loop ends
            for idx in valid_indices:
                x0, y0 = idx
                x1, y1 = x0 + rectangle_size[0], y0 + rectangle_size[1]

                sample_arr = arr[x0:x1, y0:y1]

                # if the criteria are satisfied, save the indices, set the sampled region to NaN
                # so sampling is without replacement, and start the next cycle
                if (sample_arr != value).sum() == 0 and sample_arr.shape == rectangle_size:
                    sample_rectangles.append((x0, y0))
                    arr[x0:x1, y0:y1] = np.nan
                    break
            else:
                # no candidate produced a suitable rectangle (including the case of no valid
                # indices at all), so end the while loop
                failed_to_sample = True

        return sample_rectangles
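    # Illustrative behaviour (hypothetical input): for a 500x500 array filled entirely with
    # the label code 3.0, sample_rectangles_idx(arr, value=3.0) returns the (row, col)
    # top-left corners of randomly placed, non-overlapping 224x224 rectangles that contain
    # only that value; sampled regions are set to NaN so they cannot be drawn again.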

    def plot_2d_labels(self):
        pass

def uniquify(self, path):
"""
return path with suffixed numbers if path already exists
"""
filename, extension = os.path.splitext(path)
counter = 1

while os.path.exists(path):
path = filename + " (" + str(counter) + ")" + extension
counter += 1

return path
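    # e.g. if 'out/labels.csv' already exists, uniquify('out/labels.csv') returns
    # 'out/labels (1).csv', then 'out/labels (2).csv', and so on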

def extract_labelled_image(self, img_idx_to_load=None):
"""saves images and labels where images are subsets of those provided that satisfy certain criteria
:param img_idx_to_load: list of images to load. Used for parallelisation. If None, does all
"""
if self.labels_rle is None:
self.read_labels()

        images = self.labels_rle['Image']
        labels = []

        # the output directory is constant; create it once up front so it exists even if no rectangles are found
        fp = os.path.join(self.output_fp, 'single_labels', '224s')
        os.makedirs(fp, exist_ok=True)

        if img_idx_to_load is None:
            img_idx_to_load = list(range(len(images.unique())))

        # remove indices outside the valid range (these can occur when called from bash/sbatch scripts)
        img_idx_to_load = [x for x in img_idx_to_load if x < len(images.unique())]

for image_i in images.unique()[img_idx_to_load]:
print(f"Analysing {image_i}...")

img = self.read_image(fn=image_i)

# labels and rle pixels
image_l_rle = self.labels_rle[self.labels_rle['Image'] == image_i].copy()
image_l_rle.dropna(inplace=True)

for label_i in image_l_rle['Label']:
# first decode the rle into a 2d array
v = image_l_rle['EncodedPixels'][image_l_rle['Label'] == label_i].values[0]
labels_2d = self.rle_decode(v, shape=img.size, value=self.label_codes[label_i])

# exhaustively search the rle for rectangles of predefined shape that satisfy criteria
idxs = self.sample_rectangles_idx(arr=labels_2d, value=self.label_codes[label_i], rectangle_size=(224, 224))

# output images and labels
for i, idx in enumerate(idxs):
cropped_img = img.crop((idx[0], idx[1],
idx[0] + 224, idx[1] + 224))

fn = f"{image_i.split('.')[0]}_{i}.jpg"
fp_fn = self.uniquify(os.path.join(fp, fn))

cropped_img.save(fp_fn)
                    labels.append(pd.DataFrame({
                        "Image": [os.path.basename(fp_fn)],
                        "Label": [label_i]
                    }))

        if labels:
            labels = pd.concat(labels)
            labels.to_csv(self.uniquify(os.path.join(fp, f"labels_{self.labels_suffix}.csv")))


if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--step', nargs='?', help='If running in steps, which step this iteration is', type=int)
parser.add_argument('--step_len', nargs='?', help='If running in steps, length of step', type=int)

args = parser.parse_args()
step = args.step
step_len = args.step_len

    labels_suffix = None  # set a suffix when processing slices so label files are not overwritten or written in parallel
    idxs = None  # None means process all images
    if step is not None:
        idxs = list(range(step * step_len, step * step_len + step_len))
        labels_suffix = f"{min(idxs)}-{max(idxs)}"

    label_images_class = LabelImages(input_fp="/data/users/meastman/understanding_clouds_kaggle/input",
                                     output_fp="/data/users/meastman/understanding_clouds_kaggle/input",
                                     labels_suffix=labels_suffix)
label_images_class.extract_labelled_image(idxs)
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/bin/bash
#SBATCH --array=0-221
#SBATCH --mem=8000
#SBATCH --output=src/sbatch/o/extract_lab_img_%a.out
#SBATCH --error=src/sbatch/e/extract_lab_img_%a.err
#SBATCH --time=00:04:00

module load scitools

# 5546 unique images; process 25 per array task (indices beyond the range are filtered out in the script)
python src/functions/extract_labelled_images.py --step $SLURM_ARRAY_TASK_ID --step_len 25
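
# e.g. array task 3 runs: python src/functions/extract_labelled_images.py --step 3 --step_len 25,
# which processes image indices 75-99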
@@ -0,0 +1,81 @@
"""
Takes the output from extract_labelled_images.sbatch, separates the images into train/test sets, and produces
train/test label files.
"""

import os
import glob
import shutil
import pandas as pd
from sklearn.model_selection import train_test_split

class PPLabelledImages:

def __init__(self, parent_dir):
self.parent_dir = parent_dir

self.fps = None
self.labels = None
self.image_names = None
self.train_images = None
self.test_images = None

def extract_labels(self):
"""Read label files"""
self.fps = glob.glob(os.path.join(self.parent_dir, '*.csv'))

dfs = [pd.read_csv(fp, index_col=0) for fp in self.fps]
self.labels = pd.concat(dfs, ignore_index=True)

def move_test_train_files(self):
        if self.train_images is None or self.test_images is None:
            raise ValueError("No images in self.train_images or self.test_images to move")

os.makedirs(os.path.join(self.parent_dir, 'train'), exist_ok=True)
os.makedirs(os.path.join(self.parent_dir, 'test'), exist_ok=True)

train_fps = [glob.glob(os.path.join(self.parent_dir, img + '*')) for img in self.train_images]
test_fps = [glob.glob(os.path.join(self.parent_dir, img + '*')) for img in self.test_images]

        # flatten the lists of matched paths
        train_fps = [fp for fps in train_fps for fp in fps]
        test_fps = [fp for fps in test_fps for fp in fps]

        # move files into the train/test directories
        for fp in train_fps:
            shutil.move(fp, os.path.join(self.parent_dir, 'train', os.path.basename(fp)))
        for fp in test_fps:
            shutil.move(fp, os.path.join(self.parent_dir, 'test', os.path.basename(fp)))

def delete_files(self):
pass

def process_images(self):
# read all label files into pd.DataFrame
self.extract_labels()

# separate entire images into test/train
image_names = [img.split('.')[0].split('_')[0] for img in self.labels['Image']]
self.image_names = list(set(image_names))
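        # note: splitting on the source image name (the prefix before '_') keeps crops from the
        # same scene in only one of the two sets, avoiding leakage between train and test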

self.train_images, self.test_images = train_test_split(self.image_names, test_size=.2, random_state=11)

# move files into new directories with train/test specific label files
self.move_test_train_files()
train_labels = self.labels[[x in self.train_images for x in image_names]]
test_labels = self.labels[[x in self.test_images for x in image_names]]

train_labels.to_csv(os.path.join(self.parent_dir, 'train', 'train_labels.csv'))
test_labels.to_csv(os.path.join(self.parent_dir, 'test', 'test_labels.csv'))

# delete old label files
for fn in glob.glob(os.path.join(self.parent_dir, '*.csv')):
os.remove(fn)


if __name__ == '__main__':
lab_img_processor = PPLabelledImages(
parent_dir='/data/users/meastman/understanding_clouds_kaggle/input/single_labels/224s')
lab_img_processor.process_images()
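
For orientation, a minimal sketch of loading the resulting training set (the directory layout and CSV columns follow from process_images above; parent_dir is the example path hard-coded in this script, and head(3) is arbitrary):

import os
import pandas as pd
from PIL import Image

parent_dir = '/data/users/meastman/understanding_clouds_kaggle/input/single_labels/224s'
train_labels = pd.read_csv(os.path.join(parent_dir, 'train', 'train_labels.csv'), index_col=0)

# each row pairs a cropped 224x224 image file with its single cloud-class label
for _, row in train_labels.head(3).iterrows():
    img = Image.open(os.path.join(parent_dir, 'train', row['Image']))
    print(row['Image'], row['Label'], img.size)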
1 change: 1 addition & 0 deletions env/README.md
@@ -51,6 +51,7 @@ Each environment has a focus on a particular technology, below lists the existing
+ `requirements_tensorflow.yml`: [Tensorflow](https://www.tensorflow.org/)
+ `requirements_xgboost.yml`: [XGBoost](https://xgboost.ai/)
+ `requirements_pymc.yml`: [PyMC](https://www.pymc.io/welcome.html) and [ArviZ](https://python.arviz.org/en/stable/)
+ `requirements_cloud_class.yml`: [Cloud Classification Challenge](https://github.com/MetOffice/data_science_cop/tree/understanding-clouds-challenge/challenges/2023_cloud_classification)

## Use on Met Office systems
