import os

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
# 假设 mountains 是加载的单通道图像数组
# mountains = np.load(os.path.join(figure_path, 'mountains.npy'))
# Dimensions of the example "Mountain at Dusk" image (single channel).
H = 60
W = 100
print(f'Mountain at Dusk is H = {H} and W = {W} pixels.')
这个图像的高度为 H=60,宽度为 W=100。我们将设置 P=20,因为它能够均匀地整除 H 和 W。
# Patch side length; chosen because it evenly divides both H and W.
P = 20
# Number of non-overlapping P x P patches tiling the H x W image.
# Integer floor division avoids the float round-trip of int((H*W)/(P**2)).
N = (H * W) // (P ** 2)
print(f'There will be {N} patches, each {P} by {P}.')
def get_sinusoid_encoding(num_tokens, token_len):
    """ Make Sinusoid Encoding Table

        Args:
            num_tokens (int): number of tokens
            token_len (int): length of a token

        Returns:
            (torch.FloatTensor) sinusoidal position encoding table,
            shape (1, num_tokens, token_len)
    """

    def get_position_angle_vec(i):
        # Dimension pair (2j, 2j+1) shares the frequency 1 / 10000^(2j / token_len).
        return [i / np.power(10000, 2 * (j // 2) / token_len) for j in range(token_len)]

    sinusoid_table = np.array([get_position_angle_vec(i) for i in range(num_tokens)])
    # Even indices get sine, odd indices cosine ("Attention Is All You Need", Eq. 5).
    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])
    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])
    # Add a leading batch dimension so the table broadcasts over a token batch.
    return torch.FloatTensor(sinusoid_table).unsqueeze(0)
# NOTE(review): `num_tokens`, `token_len`, and the token batch `x` are assumed
# to be defined in an earlier (not shown) part of this script — confirm.
# The +1 reserves a position for the prepended class/prediction token.
PE = get_sinusoid_encoding(num_tokens+1, token_len)
print('Position embedding dimensions are\n\tnumber of tokens:', PE.shape[1], '\n\ttoken length:', PE.shape[2])
# Broadcast-add the (1, num_tokens+1, token_len) table onto the token batch.
x = x + PE
print('Dimensions with Position Embedding are\n\tbatchsize:', x.shape[0], '\n\tnumber of tokens:', x.shape[1], '\n\ttoken length:', x.shape[2])
现在,我们的 Token 已经准备好进入编码块。
编码块
编码块是模型实际从图像标记中学习的地方。编码块的数量是用户设置的超参数。
编码块的代码如下。
from typing import Optional

import torch.nn.functional as F
class Encoding(nn.Module):
    def __init__(self,
                 dim: int,
                 num_heads: int = 1,
                 hidden_chan_mul: float = 4.,
                 qkv_bias: bool = False,
                 qk_scale: Optional[float] = None,
                 act_layer=nn.GELU,
                 norm_layer=nn.LayerNorm):
        """ Encoding Block

            Args:
                dim (int): size of a single token
                num_heads (int): number of attention heads in MSA
                hidden_chan_mul (float): multiplier to determine the number of hidden
                    channels (features) in the NeuralNet component
                qkv_bias (bool): determines if the qkv layer learns an additive bias
                qk_scale (Optional[float]): value to scale the queries and keys by;
                    if None, queries and keys are scaled by head_dim ** -0.5
                act_layer (nn.modules.activation): torch neural network layer class to use as activation
                norm_layer (nn.modules.normalization): torch neural network layer class to use as normalization
        """
        super().__init__()

        ## Define Layers
        self.norm1 = norm_layer(dim)
        # NOTE(review): Attention is assumed to be defined elsewhere in this file/project.
        self.attn = Attention(dim=dim,
                              chan=dim,
                              num_heads=num_heads,
                              qkv_bias=qkv_bias,
                              qk_scale=qk_scale)
        self.norm2 = norm_layer(dim)
        self.neuralnet = NeuralNet(in_chan=dim,
                                   hidden_chan=int(dim * hidden_chan_mul),
                                   out_chan=dim,
                                   act_layer=act_layer)

    def forward(self, x):
        # Pre-norm transformer block: residual attention, then residual MLP.
        x = x + self.attn(self.norm1(x))
        x = x + self.neuralnet(self.norm2(x))
        return x
class NeuralNet(nn.Module):
    def __init__(self,
                 in_chan: int,
                 hidden_chan: Optional[int] = None,
                 out_chan: Optional[int] = None,
                 act_layer=nn.GELU):
        """ Neural Network Module (two-layer MLP with one activation)

            Args:
                in_chan (int): number of channels (features) at input
                hidden_chan (Optional[int]): number of channels (features) in the hidden layer;
                    if None, number of channels in hidden layer is the same as the number of input channels
                out_chan (Optional[int]): number of channels (features) at output;
                    if None, number of output channels is same as the number of input channels
                act_layer (nn.modules.activation): torch neural network layer class to use as activation
        """
        super().__init__()

        ## Define Number of Channels
        # Fall back to the input width when hidden/output widths are not given.
        hidden_chan = hidden_chan or in_chan
        out_chan = out_chan or in_chan

        self.fc1 = nn.Linear(in_chan, hidden_chan)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_chan, out_chan)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.fc2(x)
        return x
# The first token (index 0) is the prepended class token; it serves as the
# aggregate representation used for prediction.
pred_token = x[:, 0]
print('Length of prediction token:', pred_token.shape[-1])
最后,将预测 Token 传递到头部以进行预测。头部通常是某种类型的神经网络,根据模型的不同而变化。在 An Image is Worth 16x16 Words 中,他们在预训练期间使用具有一个隐藏层的 MLP(多层感知器),在微调期间使用单个线性层。在 Tokens-to-Token ViT 中,他们使用单个线性层作为头部。此示例将使用输出形状为 1,以表示单个估计回归值。
# Regression head: a single linear layer mapping the class token to one value.
# NOTE(review): assumes `token_len` (defined earlier, not shown) matches the
# feature length of pred_token — confirm.
head = nn.Linear(token_len, 1)
pred = head(pred_token)
print('Length of prediction:', (pred.shape[0], pred.shape[1]))
print('Prediction:', float(pred))
这就是全部内容!模型已经进行了预测!
完整代码
为了创建完整的 ViT 模块,我们使用上面定义的 Patch Tokenization 模块和 ViT Backbone 模块。ViT Backbone 如下所定义,包含了 Token 处理、编码块和预测处理组件。
class ViT_Backbone(nn.Module):
    def __init__(self,
                 preds: int = 1,
                 token_len: int = 768,
                 num_heads: int = 1,
                 Encoding_hidden_chan_mul: float = 4.,
                 depth: int = 12,
                 qkv_bias=False,
                 qk_scale=None,
                 act_layer=nn.GELU,
                 norm_layer=nn.LayerNorm,
                 num_tokens: int = 500):
        """ VisTransformer Backbone

            Args:
                preds (int): number of predictions to output
                token_len (int): length of a token
                num_heads (int): number of attention heads in MSA
                Encoding_hidden_chan_mul (float): multiplier to determine the number of hidden
                    channels (features) in the NeuralNet component of the Encoding Module
                depth (int): number of encoding blocks in the model
                qkv_bias (bool): determines if the qkv layer learns an additive bias
                qk_scale (Optional[float]): value to scale the queries and keys by;
                    if None, queries and keys are scaled by head_dim ** -0.5
                act_layer (nn.modules.activation): torch neural network layer class to use as activation
                norm_layer (nn.modules.normalization): torch neural network layer class to use as normalization
                num_tokens (int): number of patch tokens in the input sequence; must match
                    the output of the patch tokenizer (added with a default to stay
                    backward-compatible — the original read self.num_tokens without setting it)
        """
        super().__init__()

        ## Defining Parameters
        self.num_heads = num_heads
        self.Encoding_hidden_chan_mul = Encoding_hidden_chan_mul
        self.depth = depth
        # Bug fix: token_len and num_tokens were read as attributes below but never stored,
        # which raised AttributeError on construction.
        self.token_len = token_len
        self.num_tokens = num_tokens

        ## Defining Token Processing Components
        self.cls_token = nn.Parameter(torch.zeros(1, 1, self.token_len))
        # Fixed (non-learned) sinusoidal position table; +1 covers the class token slot.
        self.pos_embed = nn.Parameter(data=get_sinusoid_encoding(num_tokens=self.num_tokens + 1,
                                                                 token_len=self.token_len),
                                      requires_grad=False)

        ## Defining Encoding blocks
        self.blocks = nn.ModuleList([Encoding(dim=self.token_len,
                                              num_heads=self.num_heads,
                                              hidden_chan_mul=self.Encoding_hidden_chan_mul,
                                              qkv_bias=qkv_bias,
                                              qk_scale=qk_scale,
                                              act_layer=act_layer,
                                              norm_layer=norm_layer)
                                     for i in range(self.depth)])

        ## Defining Prediction Processing
        self.norm = norm_layer(self.token_len)
        self.head = nn.Linear(self.token_len, preds)

        ## Make the class token sampled from a truncated normal distribution
        # timm.layers.trunc_normal_(self.cls_token, std=.02)

    def forward(self, x):
        ## Assumes x is already tokenized: (batch, num_tokens, token_len)
        ## Get Batch Size
        B = x.shape[0]
        ## Concatenate Class Token
        x = torch.cat((self.cls_token.expand(B, -1, -1), x), dim=1)
        ## Add Positional Embedding
        x = x + self.pos_embed
        ## Run Through Encoding Blocks
        for blk in self.blocks:
            x = blk(x)
        ## Take Norm
        x = self.norm(x)
        ## Make Prediction on Class Token
        x = self.head(x[:, 0])
        return x
通过 ViT Backbone 模块,我们可以定义完整的 ViT 模型。
class ViT_Model(nn.Module):
    def __init__(self,
                 img_size: tuple[int, int, int] = (1, 400, 100),
                 patch_size: int = 50,
                 token_len: int = 768,
                 preds: int = 1,
                 num_heads: int = 1,
                 Encoding_hidden_chan_mul: float = 4.,
                 depth: int = 12,
                 qkv_bias=False,
                 qk_scale=None,
                 act_layer=nn.GELU,
                 norm_layer=nn.LayerNorm):
        """ VisTransformer Model

            Args:
                img_size (tuple[int, int, int]): size of input (channels, height, width)
                patch_size (int): the side length of a square patch
                token_len (int): desired length of an output token
                preds (int): number of predictions to output
                num_heads (int): number of attention heads in MSA
                Encoding_hidden_chan_mul (float): multiplier to determine the number of hidden
                    channels (features) in the NeuralNet component of the Encoding Module
                depth (int): number of encoding blocks in the model
                qkv_bias (bool): determines if the qkv layer learns an additive bias
                qk_scale (Optional[float]): value to scale the queries and keys by;
                    if None, queries and keys are scaled by head_dim ** -0.5
                act_layer (nn.modules.activation): torch neural network layer class to use as activation
                norm_layer (nn.modules.normalization): torch neural network layer class to use as normalization
        """
        super().__init__()

        ## Defining Parameters
        self.img_size = img_size
        C, H, W = self.img_size
        self.patch_size = patch_size
        self.token_len = token_len
        self.num_heads = num_heads
        self.Encoding_hidden_chan_mul = Encoding_hidden_chan_mul
        self.depth = depth

        ## Defining Patch Embedding Module
        # NOTE(review): Patch_Tokenization is assumed to be defined elsewhere in this project.
        self.patch_tokens = Patch_Tokenization(img_size,
                                               patch_size,
                                               token_len)

        ## Defining ViT Backbone
        # NOTE(review): the backbone's position table must match the tokenizer's
        # patch count ((H // patch_size) * (W // patch_size)) — confirm against ViT_Backbone.
        self.backbone = ViT_Backbone(preds,
                                     self.token_len,
                                     self.num_heads,
                                     self.Encoding_hidden_chan_mul,
                                     self.depth,
                                     qkv_bias,
                                     qk_scale,
                                     act_layer,
                                     norm_layer)

        ## Initialize the Weights
        self.apply(self._init_weights)

    def _init_weights(self, m):
        """ Initialize the weights of the linear layers & the layernorms """
        ## For Linear Layers
        if isinstance(m, nn.Linear):
            ## Weights are initialized from a truncated normal distribution
            # timm.layers.trunc_normal_(m.weight, std=.02)
            if m.bias is not None:
                ## If bias is present, bias is initialized at zero
                nn.init.constant_(m.bias, 0)
        ## For Layernorm Layers
        elif isinstance(m, nn.LayerNorm):
            ## Weights are initialized at one
            nn.init.constant_(m.weight, 1.0)
            ## Bias is initialized at zero
            nn.init.constant_(m.bias, 0)

    @torch.jit.ignore  ## Tell pytorch to not compile as TorchScript
    def no_weight_decay(self):
        """ Used in Optimizer to ignore weight decay in the class token """
        return {'cls_token'}

    def forward(self, x):
        x = self.patch_tokens(x)
        x = self.backbone(x)
        return x