MNIST データセットの分割

概要

MNISTデータセットを同じサイズで任意の数に分割する.

各ファイルの先頭にはデータ数などを記録したヘッダがあるため,単純にバイト列を分割するだけでは正しいファイルにならない.そこで,分割後のデータ数に合わせてヘッダを書き換えながらデータセットを分割する.

環境

スクリプト

今回はシェルスクリプト(とperl)を用いて実装する

詳しいコードの説明は気が向いたら追記する.

#!/bin/bash
#
# Split the MNIST dataset (IDX files) into N equally sized parts.
#
# Usage: <this script> N
#
# Inputs (uncompressed IDX files in the current directory):
#   train-mnist.image  train-mnist.label
#   test-mnist.image   test-mnist.label
#
# Output:
#   splited-N/dataset<i>/train-image.gz
#   splited-N/dataset<i>/train-label.gz
#   splited-N/dataset<i>/test-image.gz
#   splited-N/dataset<i>/test-label.gz      for i = 1..N
#
# Every IDX file begins with a header that stores the number of items, so
# the payload cannot simply be cut into pieces: each part gets a fresh
# header whose item count is the per-part count.  The count is read from
# each file's own header, so train and test parts get correct (different)
# counts.  If the item count is not divisible by N, the leftover items at
# the end are dropped (with a warning) so that all parts have the same size.
#
# Requires: bash, GNU dd (iflag=skip_bytes,count_bytes), perl, gzip.

set -euo pipefail

# Print a message to stderr and abort with status 1.
die() {
    printf '%s\n' "$*" >&2
    exit 1
}

# Print the big-endian unsigned 32-bit integer stored in a file.
# Arguments: $1 - file path, $2 - byte offset of the field
# Outputs:   the decimal value on stdout
read_be32() {
    local file=$1 field_offset=$2
    perl -e '
        my ($path, $off) = @ARGV;
        open(my $fh, "<:raw", $path) or die "cannot open $path: $!\n";
        seek($fh, $off, 0) or die "cannot seek in $path: $!\n";
        read($fh, my $buf, 4) == 4 or die "$path: truncated header\n";
        print unpack("N", $buf), "\n";
    ' "$file" "$field_offset"
}

# Split one IDX file into equally sized, gzip-compressed parts.
# Arguments:
#   $1 - source IDX file
#   $2 - header length in bytes (16 for images, 8 for labels)
#   $3 - number of parts
#   $4 - output root directory
#   $5 - output base name (written as <root>/dataset<i>/<name>.gz)
# Assumes one byte per element, as in the MNIST files.
split_idx() {
    local src=$1 header_len=$2 parts=$3 out_dir=$4 out_name=$5
    local total dim dim_offset record_bytes=1
    local per_part part_bytes expected_bytes actual_bytes
    local i data_offset part_dir

    # Bytes 4-7 hold the item count; any further 4-byte header fields are
    # the per-item dimensions (rows, cols for images; none for labels).
    total=$(read_be32 "$src" 4)
    for (( dim_offset = 8; dim_offset < header_len; dim_offset += 4 )); do
        dim=$(read_be32 "$src" "$dim_offset")
        record_bytes=$(( record_bytes * dim ))
    done

    expected_bytes=$(( header_len + total * record_bytes ))
    actual_bytes=$(wc -c < "$src")
    (( actual_bytes >= expected_bytes )) \
        || die "$src: file is shorter than its header claims"

    per_part=$(( total / parts ))
    (( per_part > 0 )) \
        || die "$src: only $total items, cannot split into $parts parts"
    if (( total % parts != 0 )); then
        printf 'warning: %s: %d items not divisible by %d; dropping the last %d\n' \
            "$src" "$total" "$parts" "$(( total % parts ))" >&2
    fi
    part_bytes=$(( per_part * record_bytes ))

    for (( i = 1; i <= parts; i++ )); do
        data_offset=$(( header_len + (i - 1) * part_bytes ))
        part_dir="$out_dir/dataset$i"
        mkdir -p -- "$part_dir"
        {
            # Magic number, copied unchanged.
            dd if="$src" bs=1 count=4 status=none
            # New item count for this part (big-endian 32-bit).
            perl -e 'binmode(STDOUT); print pack("N", $ARGV[0])' "$per_part"
            # Remaining header fields (dimensions), copied unchanged.
            if (( header_len > 8 )); then
                dd if="$src" bs=1 skip=8 count=$(( header_len - 8 )) status=none
            fi
            # Payload slice belonging to part i.
            dd if="$src" bs=1M iflag=skip_bytes,count_bytes \
                skip="$data_offset" count="$part_bytes" status=none
        } | gzip -c > "$part_dir/$out_name.gz"
    done
}

main() {
    if (( $# != 1 )); then
        printf 'Usage: %s N\n' "${0##*/}" >&2
        exit 1
    fi

    local parts=$1
    [[ "$parts" =~ ^[1-9][0-9]*$ ]] \
        || die "N must be a positive integer, got: $parts"

    # Check every input up front so a missing file does not leave a
    # half-written output directory behind.
    local input
    for input in train-mnist.image train-mnist.label \
                 test-mnist.image test-mnist.label; do
        [[ -r "$input" ]] || die "cannot read input file: $input"
    done

    local out_dir="splited-$parts"
    mkdir -p -- "$out_dir"

    split_idx train-mnist.image 16 "$parts" "$out_dir" train-image
    split_idx train-mnist.label 8  "$parts" "$out_dir" train-label
    split_idx test-mnist.image  16 "$parts" "$out_dir" test-image
    split_idx test-mnist.label  8  "$parts" "$out_dir" test-label
}

main "$@"