Image Data Processing
Show, save, and load a grayscale image and a color image from <np.ndarray>.
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
gray_image_array = np.random.randint(0, 256, size=(100, 100), dtype=np.uint8)
color_image_array = np.random.randint(0, 256, size=(100, 100, 3), dtype=np.uint8)
# Show gray and color images
plt.subplot(1, 2, 1)
plt.axis('off')
plt.imshow(gray_image_array, cmap='gray')
plt.subplot(1, 2, 2)
plt.axis('off')
plt.imshow(color_image_array)
plt.show()
# Save images
image = Image.fromarray(gray_image_array)
image.save('gray_image.png')
image = Image.fromarray(color_image_array)
image.save('color_image.png')
# Load images
loaded_gray_image = Image.open('gray_image.png')
loaded_gray_image_array = np.array(loaded_gray_image)
print(np.array_equal(loaded_gray_image_array, gray_image_array))
loaded_color_image = Image.open('color_image.png')
loaded_color_image_array = np.array(loaded_color_image)
print(np.array_equal(loaded_color_image_array, color_image_array))
features_transform
defined below is a common transform for <PIL Image>.
from torchvision import transforms
features_transform = transforms.Compose([
transforms.Resize((256, 256)), # resize the shape of every image
transforms.ToTensor(),
transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])])
transforms.Resize()
only accepts a <PIL Image>, so it should always be placed at the beginning of the pipeline.
transforms.ToTensor()
’s input format is shape (height, width, channel), and the output format is shape (channel, height, width). What’s more, if the input is a <PIL Image> or an <np.ndarray> with a dtype of <np.uint8>, its values will be scaled to the range [0, 1].
transforms.Normalize()
is applied to every channel: output[channel] = (input[channel] - mean[channel]) / std[channel].
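For example, a minimal sketch applying this pipeline to the random color image created above:
example_image = Image.fromarray(color_image_array)  # PIL Image of size 100x100, RGB
tensor = features_transform(example_image)
print(tensor.shape)  # torch.Size([3, 256, 256]) -- channel-first after ToTensor()
print(tensor.min().item(), tensor.max().item())  # roughly within [-1, 1] after Normalize()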
LeNet
LeNet structure code
# The input image's shape is (num_channels: 1, height: 28, width: 28)
# The number of output classes is 10
import torch
from torch import nn
model = nn.Sequential(nn.Conv2d(1, 6, kernel_size=5, padding=2), nn.Sigmoid(),
nn.AvgPool2d(kernel_size=2, stride=2),
nn.Conv2d(6, 16, kernel_size=5), nn.Sigmoid(),
nn.AvgPool2d(kernel_size=2, stride=2),
nn.Flatten(),
nn.Linear(16 * 5 * 5, 120), nn.Sigmoid(),
nn.Linear(120, 84), nn.Sigmoid(),
nn.Linear(84, 10))
nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, dilation, groups, bias=True, padding_mode='zeros')
in_channels: the number of feature maps (channels) in the input layer;
out_channels: the number of channels in the output layer; it is also the number of convolutional kernels;
kernel_size=5: each kernel is 5×5 pixels.
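As a quick sanity check (the same dataflow trick used for ResNet below), a dummy Fashion-MNIST-sized input can be pushed through the model to print each layer's output shape:
X = torch.randn(1, 1, 28, 28)
for layer in model:
    X = layer(X)
    print(layer.__class__.__name__, 'output shape:\t', X.shape)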
Fashion-MNIST Data Loading code
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
import numpy as np
transform = transforms.Compose(
[transforms.Resize((28, 28)),
transforms.ToTensor(),
transforms.Normalize([0.5], [0.5])])
train_dataset = torchvision.datasets.FashionMNIST(root='./data', train=True, download=True, transform=transform)
val_dataset = torchvision.datasets.FashionMNIST(root='./data', train=False, download=True, transform=transform)
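A quick look at one transformed sample confirms the shape and label format:
x0, y0 = train_dataset[0]
print(x0.shape, y0)  # torch.Size([1, 28, 28]) and an integer class label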
LeNet training code
from torch.utils.data import DataLoader
device = "cuda" if torch.cuda.is_available() else "cpu"
def init_weights(m):
if type(m) == nn.Linear or type(m) == nn.Conv2d:
nn.init.xavier_uniform_(m.weight)
model.apply(init_weights)
num_epochs = 30
batch_size = 128
lr = 0.3
model.to(device)
criterion = nn.CrossEntropyLoss(reduction='mean')
optimizer = torch.optim.SGD(model.parameters(), lr=lr, weight_decay=0.05)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=2)
for epoch in range(num_epochs):
model.train()
train_loss = 0
for batch_features, batch_labels in train_loader:
optimizer.zero_grad()
batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)
train_outputs = model(batch_features)
loss = criterion(train_outputs, batch_labels)
train_loss += loss.item()
loss.backward()
optimizer.step()
train_loss /= len(train_loader)
if (epoch + 1) % 2 == 0:
model.eval()
true_label, pred_label = [], []
val_loss = 0
with torch.no_grad():
for batch_features, batch_labels in val_loader:
batch_features, batch_labels = batch_features.to(device), batch_labels.to(device)
val_outputs = model(batch_features)
loss = criterion(val_outputs, batch_labels)
true_label.extend(batch_labels.cpu().numpy().reshape(-1))
pred_label.extend(np.argmax(val_outputs.cpu().numpy(), axis=1))
val_loss += loss.item()
val_accuracy = np.mean(np.array(true_label) == np.array(pred_label))
val_loss /= len(val_loader)
print('epoch {}/{} | train loss: {:.4f}, val loss: {:.4f} | Accuracy: {:.2f}'.format(epoch+1, num_epochs, train_loss, val_loss, val_accuracy))
The training result can reach an accuracy of \(87\%\).
LeNet prediction code
test_dataset = MyDataset(test_data, None, features_transform)  # MyDataset and test_data come from elsewhere; a sketch of MyDataset follows below
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
pred_label = []
model.eval()
with torch.no_grad():
for batch_features, _ in test_loader:
batch_features = batch_features.to(device)
test_outputs = model(batch_features)
pred_label.extend(torch.argmax(test_outputs, dim=1).cpu().numpy())
# np.save('save/path', pred_label)
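MyDataset is not defined in this section. A minimal sketch of the kind of custom Dataset assumed above (the field names and the dummy-label behavior are my assumptions, not the original definition):
from torch.utils.data import Dataset

class MyDataset(Dataset):
    def __init__(self, features, labels=None, transform=None):
        self.features = features
        self.labels = labels
        self.transform = transform
    def __len__(self):
        return len(self.features)
    def __getitem__(self, idx):
        x = Image.fromarray(self.features[idx])  # raw image array -> PIL Image
        if self.transform is not None:
            x = self.transform(x)
        y = self.labels[idx] if self.labels is not None else 0  # dummy label at prediction time
        return x, y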
AlexNet
AlexNet structure code
# The input image's shape is (num_channels: 1, height: 224, width: 224)
# The number of output classes is 10
model = nn.Sequential(
nn.Conv2d(1, 96, kernel_size=11, stride=4, padding=1), nn.ReLU(),
nn.MaxPool2d(kernel_size=3, stride=2),
nn.Conv2d(96, 256, kernel_size=5, padding=2), nn.ReLU(),
nn.MaxPool2d(kernel_size=3, stride=2),
nn.Conv2d(256, 384, kernel_size=3, padding=1), nn.ReLU(),
nn.Conv2d(384, 384, kernel_size=3, padding=1), nn.ReLU(),
nn.Conv2d(384, 256, kernel_size=3, padding=1), nn.ReLU(),
nn.MaxPool2d(kernel_size=3, stride=2),
nn.Flatten(),
nn.Linear(6400, 4096), nn.ReLU(),
nn.Dropout(0.5),
nn.Linear(4096, 4096), nn.ReLU(),
nn.Dropout(0.5),
nn.Linear(4096, 10)
)
The in_channels of the first layer is 1 instead of 3 in order to train on the Fashion-MNIST dataset, which consists of grayscale images.
The model training code is the same as the LeNet training code, and the final accuracy can reach \(92\%\). Because this is a deeper convolutional network, perhaps I should try a smaller learning rate.
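One option (the specific value is my guess, not from the source):
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, weight_decay=0.05)  # smaller than the lr=0.3 used for LeNet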
Batch normalization
batch normalization
A batch normalization layer constrains the batch statistics in the network’s dataflow, alleviates the vanishing-gradient problem, and makes deep networks converge more easily. It is applied between the affine transformation and the activation function.
The normalization formula is slightly different from the common one.
\[ BN(\boldsymbol{x})=\gamma\cdot\frac{\boldsymbol{x}-E(\boldsymbol{x})}{\sqrt{Var(\boldsymbol{x})}}+\beta\\ E(\boldsymbol{x})=\frac{1}{n}\sum_{i=1}^n\boldsymbol{x}_i\\ Var(\boldsymbol{x})=\frac{1}{n}\sum_{i=1}^n\big(\boldsymbol{x}_i-E(\boldsymbol{x})\big)^2+\epsilon \]
\(\gamma\) and \(\beta\) are parameters that need to be learned by the model.
For example, suppose I have an input of dimension \((batch\_size=2, num\_channels=3, height=2, width=2)\).
\[ \begin{bmatrix}\begin{bmatrix}\color{red}{x^{(1)}_{11}}&\color{red}{x^{(1)}_{12}}\\\color{red}{x^{(1)}_{21}}&\color{red}{x^{(1)}_{22}}\end{bmatrix}&\begin{bmatrix}\color{green}{x^{(1)}_{11}}&\color{green}{x^{(1)}_{12}}\\\color{green}{x^{(1)}_{21}}&\color{green}{x^{(1)}_{22}}\end{bmatrix}&\begin{bmatrix}\color{blue}{x^{(1)}_{11}}&\color{blue}{x^{(1)}_{12}}\\\color{blue}{x^{(1)}_{21}}&\color{blue}{x^{(1)}_{22}}\end{bmatrix}\end{bmatrix}\\ \begin{bmatrix}\begin{bmatrix}\color{red}{x^{(2)}_{11}}&\color{red}{x^{(2)}_{12}}\\\color{red}{x^{(2)}_{21}}&\color{red}{x^{(2)}_{22}}\end{bmatrix}&\begin{bmatrix}\color{green}{x^{(2)}_{11}}&\color{green}{x^{(2)}_{12}}\\\color{green}{x^{(2)}_{21}}&\color{green}{x^{(2)}_{22}}\end{bmatrix}&\begin{bmatrix}\color{blue}{x^{(2)}_{11}}&\color{blue}{x^{(2)}_{12}}\\\color{blue}{x^{(2)}_{21}}&\color{blue}{x^{(2)}_{22}}\end{bmatrix}\end{bmatrix} \]
\[ \color{red}{mean}=\frac{1}{2\times2\times2}\sum_k\sum_i\sum_j\color{red}{x^{(k)}_{ij}}\\ \color{red}{var}=\frac{1}{2\times2\times2}\sum_k\sum_i\sum_j\big{(}\color{red}{x^{(k)}_{ij}}-\color{red}{mean}\big{)}^2 \]
bn = nn.BatchNorm2d(3)
# input_2d shape: (2, 3, 2, 2)
input_2d = torch.tensor([
[[[1, 2], [3, 4]],
[[5, 6], [7, 8]],
[[1, 3], [4, 5]]],
[[[1, 2], [1, 2]],
[[5, 6], [5, 6]],
[[1, 3], [2, 4]]]
], dtype=torch.float32)
with torch.no_grad():
output_2d = bn(input_2d)
print(output_2d)
## tensor([[[[-1.0000, 0.0000],
## [ 1.0000, 2.0000]],
##
## [[-1.0000, 0.0000],
## [ 1.0000, 2.0000]],
##
## [[-1.3750, 0.0917],
## [ 0.8250, 1.5584]]],
##
##
## [[[-1.0000, 0.0000],
## [-1.0000, 0.0000]],
##
## [[-1.0000, 0.0000],
## [-1.0000, 0.0000]],
##
## [[-1.3750, 0.0917],
## [-0.6417, 0.8250]]]])
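To confirm the per-channel computation, a minimal manual check against the formula above (nn.BatchNorm2d adds its eps, default 1e-5, to the variance before the square root, and its gamma and beta are initialized to 1 and 0):
mean = input_2d.mean(dim=(0, 2, 3), keepdim=True)                 # per-channel mean over batch, H, W
var = input_2d.var(dim=(0, 2, 3), unbiased=False, keepdim=True)   # biased per-channel variance
manual = (input_2d - mean) / torch.sqrt(var + 1e-5)
print(torch.allclose(manual, output_2d, atol=1e-4))  # True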
AlexNet using batch normalization
# AlexNet using batch normalization
model = nn.Sequential(
nn.Conv2d(1, 96, kernel_size=11, stride=4, padding=1), nn.BatchNorm2d(96), nn.ReLU(),
nn.MaxPool2d(kernel_size=3, stride=2),
nn.Conv2d(96, 256, kernel_size=5, padding=2), nn.BatchNorm2d(256), nn.ReLU(),
nn.MaxPool2d(kernel_size=3, stride=2),
nn.Conv2d(256, 384, kernel_size=3, padding=1), nn.BatchNorm2d(384), nn.ReLU(),
nn.Conv2d(384, 384, kernel_size=3, padding=1), nn.BatchNorm2d(384), nn.ReLU(),
nn.Conv2d(384, 256, kernel_size=3, padding=1), nn.BatchNorm2d(256), nn.ReLU(),
nn.MaxPool2d(kernel_size=3, stride=2),
nn.Flatten(),
nn.Linear(6400, 4096), nn.BatchNorm1d(4096), nn.ReLU(),
nn.Dropout(0.5),
nn.Linear(4096, 4096), nn.BatchNorm1d(4096), nn.ReLU(),
nn.Dropout(0.5),
nn.Linear(4096, 10)
)
ResNet
A ResNet block
import torch.nn as nn
import torch.nn.functional as F
import torch
class Residual(nn.Module):
def __init__(self, input_channels, num_channels, use_1x1conv=False, strides=1):
super().__init__()
self.conv1 = nn.Conv2d(input_channels, num_channels, kernel_size=3, padding=1, stride=strides)
self.conv2 = nn.Conv2d(num_channels, num_channels, kernel_size=3, padding=1)
if use_1x1conv:
self.conv3 = nn.Conv2d(input_channels, num_channels, kernel_size=1, stride=strides)
else:
self.conv3 = None
self.bn1 = nn.BatchNorm2d(num_channels)
self.bn2 = nn.BatchNorm2d(num_channels)
def forward(self, X):
Y = F.relu(self.bn1(self.conv1(X)))
Y = self.bn2(self.conv2(Y))
if self.conv3 is not None:
X = self.conv3(X)
Y += X
return F.relu(Y)
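A quick shape check of the block (my own example input):
blk = Residual(3, 6, use_1x1conv=True, strides=2)
X = torch.randn(4, 3, 32, 32)
print(blk(X).shape)  # torch.Size([4, 6, 16, 16]) -- channels increased, resolution halved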
ResNet-18
# The input image's shape is (num_channels: 1, height: 224, width: 224)
# The number of output classes is 10
def resnet_block(input_channels, num_channels, num_residuals, first_block=False):
blk = []
for i in range(num_residuals):
if i == 0 and not first_block:
            '''
            The first residual block of the later stages (b3, b4, b5) increases the channel
            count (use_1x1conv=True) and halves the spatial resolution (strides=2).
            b2 is the first stage (first_block=True), so it does neither.
            '''
blk.append(Residual(input_channels, num_channels, use_1x1conv=True, strides=2))
else:
blk.append(Residual(num_channels, num_channels))
return blk
b1 = nn.Sequential(
nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3),
nn.BatchNorm2d(64), nn.ReLU(),
nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
)
b2 = nn.Sequential(*resnet_block(64, 64, 2, first_block=True))
b3 = nn.Sequential(*resnet_block(64, 128, 2))
b4 = nn.Sequential(*resnet_block(128, 256, 2))
b5 = nn.Sequential(*resnet_block(256, 512, 2))
model = nn.Sequential(
b1, b2, b3, b4, b5,
nn.AdaptiveAvgPool2d((1, 1)),
nn.Flatten(), nn.Linear(512, 10)
)
The training result can reach an accuracy of \(91\%\).
Dataflow
X = torch.randn(1, 1, 224, 224)
for layer in model:
X = layer(X)
print(layer.__class__.__name__, 'output shape: \t', X.shape)
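For reference, the printed shapes should be:
## Sequential output shape:      torch.Size([1, 64, 56, 56])
## Sequential output shape:      torch.Size([1, 64, 56, 56])
## Sequential output shape:      torch.Size([1, 128, 28, 28])
## Sequential output shape:      torch.Size([1, 256, 14, 14])
## Sequential output shape:      torch.Size([1, 512, 7, 7])
## AdaptiveAvgPool2d output shape:       torch.Size([1, 512, 1, 1])
## Flatten output shape:         torch.Size([1, 512])
## Linear output shape:          torch.Size([1, 10])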
Conv3d
Self-designed model
import torch
import torch.nn as nn
# The input image's shape is (num_channels: 3, frames: 4, height: 224, width: 224)
# The number of output classes is 10
class VideoClassifier(nn.Module):
def __init__(self, num_classes):
super(VideoClassifier, self).__init__()
self.conv_layers = nn.Sequential(
nn.Conv3d(3, 32, kernel_size=3, stride=1, padding=1),
nn.ReLU(),
nn.MaxPool3d(kernel_size=2, stride=2),
nn.Conv3d(32, 64, kernel_size=3, stride=1, padding=1),
nn.ReLU(),
nn.MaxPool3d(kernel_size=2, stride=2),
)
self.fc_layers = nn.Sequential(
nn.Flatten(),
nn.Linear(64 * 1 * 56 * 56, 1024),
nn.ReLU(),
nn.Dropout(0.5),
nn.Linear(1024, 512),
nn.ReLU(),
nn.Dropout(0.5),
nn.Linear(512, num_classes),
)
    def forward(self, x):
        x = self.conv_layers(x)
        # fc_layers begins with nn.Flatten(), so no manual reshape is needed here
        x = self.fc_layers(x)
        return x
model = VideoClassifier(num_classes=15)
If I have a frame count larger than 4, for example 16, I can change
nn.Linear(64 * 1 * 56 * 56, 1024)
to
nn.Linear(64 * 4 * 56 * 56, 1024)
(the two MaxPool3d layers halve the frame dimension twice: 16 → 8 → 4), or I can add two more blocks in self.conv_layers.
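A sanity check for the 16-frame case (my own example; it only runs the convolutional part):
x = torch.rand(1, 3, 16, 224, 224)
feat = model.conv_layers(x)
print(feat.shape)  # torch.Size([1, 64, 4, 56, 56]) -- two poolings: 16 -> 8 -> 4 frames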
ResNet18-3d
from torchvision.models.video import r3d_18
num_classes = 10
model = r3d_18(pretrained=True)
model.fc = nn.Linear(512, num_classes)
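A minimal smoke test (the clip size is my own choice; r3d_18 expects input shaped (batch, channels, frames, height, width)):
clip = torch.rand(2, 3, 16, 112, 112)
print(model(clip).shape)  # torch.Size([2, 10])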
CNN & RNN
The code below shows a ResNet-18 + LSTM model.
import torch
import torch.nn as nn
import torch.nn.functional as F
class Residual(nn.Module):
def __init__(self, input_channels, num_channels, use_1x1conv=False, strides=1):
super().__init__()
self.conv1 = nn.Conv2d(input_channels, num_channels, kernel_size=3, padding=1, stride=strides)
self.conv2 = nn.Conv2d(num_channels, num_channels, kernel_size=3, padding=1)
if use_1x1conv:
self.conv3 = nn.Conv2d(input_channels, num_channels, kernel_size=1, stride=strides)
else:
self.conv3 = None
self.bn1 = nn.BatchNorm2d(num_channels)
self.bn2 = nn.BatchNorm2d(num_channels)
def forward(self, X):
Y = F.relu(self.bn1(self.conv1(X)))
Y = self.bn2(self.conv2(Y))
if self.conv3 is not None:
X = self.conv3(X)
Y += X
return F.relu(Y)
def resnet_block(input_channels, num_channels, num_residuals, first_block=False):
blk = []
for i in range(num_residuals):
if i == 0 and not first_block:
blk.append(Residual(input_channels, num_channels, use_1x1conv=True, strides=2))
else:
blk.append(Residual(num_channels, num_channels))
return blk
b1 = nn.Sequential(
nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3),
nn.BatchNorm2d(64), nn.ReLU(),
nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
)
b2 = nn.Sequential(*resnet_block(64, 64, 2, first_block=True))
b3 = nn.Sequential(*resnet_block(64, 128, 2))
b4 = nn.Sequential(*resnet_block(128, 256, 2))
b5 = nn.Sequential(*resnet_block(256, 512, 2))
CNN_model = nn.Sequential(
b1, b2, b3, b4, b5,
nn.AdaptiveAvgPool2d((1, 1)),
nn.Flatten()
)
class CNNtoRNN(nn.Module):
def __init__(self, num_classes, rnn_hidden_size, num_layers=1):
super(CNNtoRNN, self).__init__()
self.cnn = CNN_model
cnn_output_features = 512
self.rnn_hidden_size = rnn_hidden_size
self.num_layers = num_layers
self.rnn = nn.LSTM(input_size=cnn_output_features,
hidden_size=self.rnn_hidden_size,
num_layers=self.num_layers,
batch_first=True)
self.fc = nn.Linear(self.rnn_hidden_size, num_classes)
def forward(self, x):
# x's shape is (batch_size, win_len, channels, height, width)
batch_size, win_len, channels, height, width = x.shape
'''
CNN:
c_in's shape is (batch_size*win_len, channels, height, width)
c_out's shape is (batch_size*win_len, cnn_output_features)
'''
c_in = x.view(batch_size * win_len, channels, height, width)
c_out = self.cnn(c_in)
        '''
        RNN:
        r_in's shape is (batch_size, win_len, cnn_output_features)
        r_out (batch_size, win_len, num_hiddens) holds all of the hidden states
        hn (num_layers, batch_size, num_hiddens) is the last hidden state
        r_out[:, -1, :] equals hn.squeeze(0) when num_layers == 1
        cn (num_layers, batch_size, num_hiddens) is the LSTM's memory cell
        '''
r_in = c_out.view(batch_size, win_len, -1)
r_out, (hn, cn) = self.rnn(r_in)
r_out = r_out[:, -1, :]
output = self.fc(r_out)
return output
model = CNNtoRNN(num_classes=15, rnn_hidden_size=32)
input_tensor = torch.rand(16, 10, 3, 224, 224) # (batch_size, frames, channels, height, width)
output = model(input_tensor)
print(output.shape) # output's shape is (batch_size, num_classes)
For the video classification task, the CNN & RNN model performs worse than the Conv3d models.