MNIST データセットの分割
概要
MNISTデータセットを同じサイズで任意の数に分割する.
各ファイルの先頭には件数などを記録したヘッダがあるため,単純には分割できない.そこで,ヘッダの件数を書き換えながらデータセットを分割する.
環境
スクリプト
詳しいコードの説明は,気が向いたら追記する.
#!/bin/bash
#
# Split the MNIST dataset into N sub-datasets of equal size.
#
# Usage:
#   <this script> N
#
# Inputs (uncompressed files in the current directory):
#   train-mnist.image  train-mnist.label
#   test-mnist.image   test-mnist.label
#
# Outputs, for i = 1..N:
#   splited-N/dataset<i>/train-image.gz   splited-N/dataset<i>/train-label.gz
#   splited-N/dataset<i>/test-image.gz    splited-N/dataset<i>/test-label.gz
#
# Every input file starts with a header that records the number of items it
# holds, so the payload cannot simply be cut into pieces: each piece receives
# a copy of the original header whose item count is rewritten to the size of
# the piece. The train and test files get their own recomputed counts.
#
# If N does not divide the item count of a file, every part still receives
# floor(count / N) items and the trailing remainder is dropped (a warning is
# printed on stderr).
#
# Exit status: 0 on success, 1 on a usage error or when any step fails.

set -euo pipefail

# Header sizes in bytes of the two kinds of input files.
readonly IMAGE_HEADER_BYTES=16
readonly LABEL_HEADER_BYTES=8

# Set by main(): output directory and scratch directory for intermediates.
OUT_DIR=
WORK_DIR=

# Print an error message on stderr and abort with status 1.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

# Print the usage text on stderr and abort with status 1.
usage() {
  printf 'Usage: %s N\n' "${0##*/}" >&2
  printf '  N  number of sub-datasets to create (positive integer)\n' >&2
  exit 1
}

# Remove the scratch directory; installed as the EXIT trap by main().
cleanup() {
  if [[ -n "$WORK_DIR" ]]; then
    rm -rf -- "$WORK_DIR"
  fi
}

#######################################
# Print the item count stored in a file header (bytes 4-7, big-endian).
# Arguments: $1 - input file
# Outputs:   the count in decimal on stdout
#######################################
read_item_count() {
  local file=$1
  local hex
  hex=$(dd if="$file" bs=1 skip=4 count=4 2>/dev/null \
    | od -A n -v -t x1 \
    | tr -d '[:space:]')
  [[ "$hex" =~ ^[0-9a-f]{8}$ ]] || die "$file: cannot read the item count"
  printf '%d\n' "$((16#$hex))"
}

#######################################
# Write a copy of a file's header with the item count replaced.
# Arguments: $1 - input file whose header is copied
#            $2 - header size in bytes
#            $3 - new item count
# Outputs:   the binary header on stdout
#######################################
write_header() {
  local file=$1 header_bytes=$2 count=$3
  local hex

  # Bytes 0-3 are copied unchanged. dd's transfer statistics are silenced;
  # a real failure still aborts the script through the exit status.
  dd if="$file" bs=1 count=4 2>/dev/null

  # Bytes 4-7: the new item count, most significant byte first. Zero padding
  # to eight hex digits keeps the field width fixed for small counts.
  printf -v hex '%08x' "$count"
  # The format string consists of generated hex escapes only.
  # shellcheck disable=SC2059
  printf "\\x${hex:0:2}\\x${hex:2:2}\\x${hex:4:2}\\x${hex:6:2}"

  # Bytes 8 onward: remaining header fields (only image headers have them).
  if ((header_bytes > 8)); then
    dd if="$file" bs=1 skip=8 count="$((header_bytes - 8))" 2>/dev/null
  fi
}

#######################################
# Split one input file into equally sized, gzip-compressed parts.
# Globals:   OUT_DIR (read), WORK_DIR (read)
# Arguments: $1 - input file
#            $2 - header size in bytes
#            $3 - number of parts
#            $4 - output base name (e.g. train-image)
# Outputs:   writes OUT_DIR/dataset<i>/<base name>.gz for i = 1..parts
#######################################
split_file() {
  local file=$1 header_bytes=$2 parts=$3 out_name=$4
  local total_bytes count payload_bytes item_bytes per_part raw i

  [[ -f "$file" && -r "$file" ]] || die "cannot read input file: $file"

  # The arithmetic expansion also strips the padding some wc versions emit.
  total_bytes=$(($(wc -c <"$file")))
  ((total_bytes > header_bytes)) || die "$file: no data after the header"

  count=$(read_item_count "$file")
  ((count > 0)) || die "$file: header reports zero items"

  # Derive the size of one item from the file itself so that parts are always
  # cut on item boundaries.
  payload_bytes=$((total_bytes - header_bytes))
  ((payload_bytes % count == 0)) \
    || die "$file: payload size does not match the item count in the header"
  item_bytes=$((payload_bytes / count))

  per_part=$((count / parts))
  ((per_part > 0)) || die "$file: cannot split $count items into $parts parts"
  if ((count % parts != 0)); then
    printf 'warning: %s: %d trailing items are dropped to keep parts equal\n' \
      "$file" "$((count % parts))" >&2
  fi

  # Strip the header once; the parts are then cut out of the raw payload.
  raw="$WORK_DIR/$out_name.raw"
  tail -c "+$((header_bytes + 1))" "$file" >"$raw"

  for ((i = 1; i <= parts; i++)); do
    mkdir -p -- "$OUT_DIR/dataset$i"
    {
      write_header "$file" "$header_bytes" "$per_part"
      dd if="$raw" bs="$item_bytes" skip="$(((i - 1) * per_part))" \
        count="$per_part" 2>/dev/null
    } | gzip -c >"$OUT_DIR/dataset$i/$out_name.gz"
  done

  rm -f -- "$raw"
}

#######################################
# Entry point: validate the argument and split all four input files.
# Globals:   OUT_DIR (written), WORK_DIR (written)
# Arguments: $1 - number of sub-datasets
#######################################
main() {
  (($# == 1)) || usage
  [[ "$1" =~ ^[1-9][0-9]*$ ]] || usage
  local -r parts=$1

  OUT_DIR="splited-$parts"
  mkdir -p -- "$OUT_DIR"

  # Intermediates live in a private scratch directory that is removed on
  # every exit path, so an aborted run leaves no stray files behind.
  WORK_DIR=$(mktemp -d "$OUT_DIR/tmp.XXXXXX")
  trap cleanup EXIT

  split_file train-mnist.image "$IMAGE_HEADER_BYTES" "$parts" train-image
  split_file train-mnist.label "$LABEL_HEADER_BYTES" "$parts" train-label
  split_file test-mnist.image "$IMAGE_HEADER_BYTES" "$parts" test-image
  split_file test-mnist.label "$LABEL_HEADER_BYTES" "$parts" test-label
}

main "$@"