Tan_pytorch_segmentation/pytorch_segmentation/PV_Model/FuseDisNet (2).py

377 lines
19 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange, repeat
from timm.models.layers import DropPath, to_2tuple, trunc_normal_
import timm
from timm.models import MobileNetV3
class ConvBNReLU(nn.Sequential):
    """Convolution followed by a normalisation layer and ReLU6.

    Args:
        in_channels: Number of channels of the input feature map.
        out_channels: Number of channels produced by the convolution.
        kernel_size: Size of the (square) convolution kernel.
        dilation: Dilation of the convolution.
        stride: Stride of the convolution.
        norm_layer: Callable that builds the normalisation layer from a
            channel count.
        bias: Whether the convolution carries a bias term.
    """

    def __init__(self, in_channels, out_channels, kernel_size=3, dilation=1, stride=1,
                 norm_layer=nn.BatchNorm2d, bias=False):
        # Padding derived from the kernel geometry; at stride 1 with an odd
        # kernel it keeps the spatial size unchanged.
        pad = ((stride - 1) + dilation * (kernel_size - 1)) // 2
        conv = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, bias=bias,
                         dilation=dilation, stride=stride, padding=pad)
        norm = norm_layer(out_channels)
        super().__init__(conv, norm, nn.ReLU6())
class ConvBN(nn.Sequential):
    """Convolution followed by a normalisation layer (no activation).

    Args:
        in_channels: Number of channels of the input feature map.
        out_channels: Number of channels produced by the convolution.
        kernel_size: Size of the (square) convolution kernel.
        dilation: Dilation of the convolution.
        stride: Stride of the convolution.
        norm_layer: Callable that builds the normalisation layer from a
            channel count.
        bias: Whether the convolution carries a bias term.
    """

    def __init__(self, in_channels, out_channels, kernel_size=3, dilation=1, stride=1,
                 norm_layer=nn.BatchNorm2d, bias=False):
        # Same padding rule as the other conv helpers in this file.
        pad = ((stride - 1) + dilation * (kernel_size - 1)) // 2
        conv = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, bias=bias,
                         dilation=dilation, stride=stride, padding=pad)
        super().__init__(conv, norm_layer(out_channels))
class Conv(nn.Sequential):
    """A single convolution wrapped in ``nn.Sequential``.

    Args:
        in_channels: Number of channels of the input feature map.
        out_channels: Number of channels produced by the convolution.
        kernel_size: Size of the (square) convolution kernel.
        dilation: Dilation of the convolution.
        stride: Stride of the convolution.
        bias: Whether the convolution carries a bias term.
    """

    def __init__(self, in_channels, out_channels, kernel_size=3, dilation=1, stride=1, bias=False):
        # Padding derived from stride, dilation and kernel size.
        pad = ((stride - 1) + dilation * (kernel_size - 1)) // 2
        super().__init__(
            nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, bias=bias,
                      dilation=dilation, stride=stride, padding=pad)
        )
class SeparableConvBNReLU(nn.Sequential):
    """Depthwise conv -> normalisation -> pointwise conv -> ReLU6.

    The layer order (and therefore the ``state_dict`` key layout) is kept
    as it was; only the channel count handed to the normalisation layer
    has been corrected.

    Args:
        in_channels: Number of channels of the input feature map.
        out_channels: Number of channels produced by the pointwise conv.
        kernel_size: Size of the (square) depthwise kernel.
        stride: Stride of the depthwise convolution.
        dilation: Dilation of the depthwise convolution.
        norm_layer: Callable that builds the normalisation layer from a
            channel count.
    """

    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, dilation=1,
                 norm_layer=nn.BatchNorm2d):
        pad = ((stride - 1) + dilation * (kernel_size - 1)) // 2
        super().__init__(
            # Depthwise convolution: one filter per input channel.
            nn.Conv2d(in_channels, in_channels, kernel_size, stride=stride, dilation=dilation,
                      padding=pad, groups=in_channels, bias=False),
            # The norm sees the depthwise output, which still has
            # ``in_channels`` maps. Sizing it with ``out_channels`` made the
            # block fail whenever the two counts differed.
            norm_layer(in_channels),
            # Pointwise convolution: mixes channels and sets the output width.
            nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False),
            nn.ReLU6(),
        )
class SeparableConvBN(nn.Sequential):
    """Depthwise conv -> normalisation -> pointwise conv (no activation).

    The layer order (and therefore the ``state_dict`` key layout) is kept
    as it was; only the channel count handed to the normalisation layer
    has been corrected.

    Args:
        in_channels: Number of channels of the input feature map.
        out_channels: Number of channels produced by the pointwise conv.
        kernel_size: Size of the (square) depthwise kernel.
        stride: Stride of the depthwise convolution.
        dilation: Dilation of the depthwise convolution.
        norm_layer: Callable that builds the normalisation layer from a
            channel count.
    """

    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, dilation=1,
                 norm_layer=nn.BatchNorm2d):
        pad = ((stride - 1) + dilation * (kernel_size - 1)) // 2
        super().__init__(
            # Depthwise convolution: one filter per input channel.
            nn.Conv2d(in_channels, in_channels, kernel_size, stride=stride, dilation=dilation,
                      padding=pad, groups=in_channels, bias=False),
            # The norm operates on the depthwise output (``in_channels``
            # maps). Building it with ``out_channels`` broke every use with
            # in_channels != out_channels.
            norm_layer(in_channels),
            # Pointwise convolution: mixes channels and sets the output width.
            nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False),
        )
class SeparableConv(nn.Sequential):
    """Depthwise convolution followed by a 1x1 pointwise convolution.

    Args:
        in_channels: Number of channels of the input feature map.
        out_channels: Number of channels produced by the pointwise conv.
        kernel_size: Size of the (square) depthwise kernel.
        stride: Stride of the depthwise convolution.
        dilation: Dilation of the depthwise convolution.
    """

    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, dilation=1):
        pad = ((stride - 1) + dilation * (kernel_size - 1)) // 2
        # groups == in_channels makes the first conv depthwise.
        depthwise = nn.Conv2d(in_channels, in_channels, kernel_size, stride=stride,
                              dilation=dilation, padding=pad, groups=in_channels, bias=False)
        pointwise = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False)
        super().__init__(depthwise, pointwise)
class Mlp(nn.Module):
    """Two-layer channel MLP built from 1x1 convolutions.

    Args:
        in_features: Number of input channels.
        hidden_features: Width of the hidden layer; falls back to
            ``in_features`` when omitted (or falsy).
        out_features: Number of output channels; falls back to
            ``in_features`` when omitted (or falsy).
        act_layer: Activation class instantiated without arguments.
        drop: Dropout probability applied after the activation and after
            the second convolution.
    """

    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.ReLU6, drop=0.):
        super().__init__()
        # Any width that was not supplied defaults to the input width.
        if not out_features:
            out_features = in_features
        if not hidden_features:
            hidden_features = in_features
        self.fc1 = nn.Conv2d(in_features, hidden_features, kernel_size=1, stride=1, padding=0, bias=True)
        self.act = act_layer()
        self.fc2 = nn.Conv2d(hidden_features, out_features, kernel_size=1, stride=1, padding=0, bias=True)
        # A single in-place dropout module is reused at both positions.
        self.drop = nn.Dropout(drop, inplace=True)

    def forward(self, x):
        """Apply fc1 -> activation -> dropout -> fc2 -> dropout to ``x``."""
        hidden = self.drop(self.act(self.fc1(x)))
        return self.drop(self.fc2(hidden))
class GlobalLocalAttention(nn.Module):
    """Window-based multi-head self-attention combined with a conv branch.

    The input is split into non-overlapping ``window_size`` x ``window_size``
    windows; attention is computed inside each window, optionally with a
    learned relative position bias. The attention output is average-pooled
    along each spatial axis, added to a convolutional "local" branch and
    projected by a separable convolution.

    Args:
        dim: Number of input (and output) channels.
        num_heads: Number of attention heads; ``dim // num_heads`` is the
            per-head width.
        qkv_bias: Whether the 1x1 query/key/value convolution has a bias.
        window_size: Side length of the attention window.
        relative_pos_embedding: Whether to add a learned relative position
            bias to the attention logits.

    NOTE(review): the pooling and projection padding arithmetic appears to
    assume an even ``window_size`` -- confirm before using odd sizes.
    """

    def __init__(self,
                 dim=256,
                 num_heads=16,
                 qkv_bias=False,
                 window_size=8,
                 relative_pos_embedding=True
                 ):
        super().__init__()
        self.num_heads = num_heads
        # Scaling factor for the dot product: 1 / sqrt(per-head width).
        self.scale = (dim // self.num_heads) ** -0.5
        self.ws = window_size

        # 1x1 conv producing query, key and value stacked along channels.
        self.qkv = Conv(dim, 3 * dim, kernel_size=1, bias=qkv_bias)
        # Two conv+norm paths whose sum forms the local branch.
        self.local1 = ConvBN(dim, dim, kernel_size=3)
        self.local2 = ConvBN(dim, dim, kernel_size=1)
        # Output projection.
        self.proj = SeparableConvBN(dim, dim, kernel_size=window_size)

        # attn_x averages over a window of rows (height axis), attn_y over a
        # window of columns (width axis).
        edge = window_size // 2 - 1
        self.attn_x = nn.AvgPool2d(kernel_size=(window_size, 1), stride=1, padding=(edge, 0))
        self.attn_y = nn.AvgPool2d(kernel_size=(1, window_size), stride=1, padding=(0, edge))

        self.relative_pos_embedding = relative_pos_embedding
        if self.relative_pos_embedding:
            span = 2 * window_size - 1
            # One learnable bias per head for every possible (dh, dw) offset
            # between two positions of a window: (2*ws-1)*(2*ws-1) rows.
            self.relative_position_bias_table = nn.Parameter(torch.zeros(span * span, num_heads))

            # Row/column coordinates of every position inside a window,
            # flattened to shape (2, ws*ws).
            axis = torch.arange(self.ws)
            grid = torch.stack(torch.meshgrid([axis, axis])).flatten(1)
            # Pairwise offsets, shifted by ws-1 so they start at 0:
            # shape (2, ws*ws, ws*ws), values in [0, 2*ws-2].
            shifted = grid[:, :, None] - grid[:, None, :] + (self.ws - 1)
            # Fold the (row, col) offset pair into one row index of the table.
            relative_position_index = shifted[0] * (2 * self.ws - 1) + shifted[1]
            self.register_buffer("relative_position_index", relative_position_index)

            trunc_normal_(self.relative_position_bias_table, std=.02)

    def pad(self, x, ps):
        """Reflect-pad ``x`` on the right/bottom to multiples of ``ps``."""
        _, _, height, width = x.size()
        extra_w = width % ps
        if extra_w != 0:
            x = F.pad(x, (0, ps - extra_w), mode='reflect')
        extra_h = height % ps
        if extra_h != 0:
            x = F.pad(x, (0, 0, 0, ps - extra_h), mode='reflect')
        return x

    def pad_out(self, x):
        """Reflect-pad ``x`` by one pixel on the right and on the bottom."""
        return F.pad(x, pad=(0, 1, 0, 1), mode='reflect')

    def forward(self, x):
        """Return the attended feature map, same spatial size as ``x``."""
        _, channels, height, width = x.shape

        # Local branch is computed on the unpadded input.
        local = self.local2(x) + self.local1(x)

        # Pad so that height and width are multiples of the window size.
        x = self.pad(x, self.ws)
        _, _, padded_h, padded_w = x.shape
        layout = dict(h=self.num_heads, d=channels // self.num_heads,
                      hh=padded_h // self.ws, ww=padded_w // self.ws,
                      ws1=self.ws, ws2=self.ws)

        # Split into windows and heads: each of q, k, v has shape
        # (batch * windows, heads, ws*ws, head_dim).
        q, k, v = rearrange(self.qkv(x),
                            'b (qkv h d) (hh ws1) (ww ws2) -> qkv (b hh ww) h (ws1 ws2) d',
                            qkv=3, **layout)

        dots = (q @ k.transpose(-2, -1)) * self.scale

        if self.relative_pos_embedding:
            tokens = self.ws * self.ws
            # Look up the bias for every position pair: (ws*ws, ws*ws, heads).
            bias = self.relative_position_bias_table[self.relative_position_index.view(-1)]
            bias = bias.view(tokens, tokens, -1)
            # Reorder to (heads, ws*ws, ws*ws) so it broadcasts over windows.
            bias = bias.permute(2, 0, 1).contiguous()
            dots += bias.unsqueeze(0)

        attn = dots.softmax(dim=-1) @ v

        # Merge heads and windows back into a feature map, then drop padding.
        attn = rearrange(attn, '(b hh ww) h (ws1 ws2) d -> b (h d) (hh ws1) (ww ws2)', **layout)
        attn = attn[:, :, :height, :width]

        # One extra reflected row/column before each pooling.
        pooled_h = self.attn_x(F.pad(attn, pad=(0, 0, 0, 1), mode='reflect'))
        pooled_w = self.attn_y(F.pad(attn, pad=(0, 1, 0, 0), mode='reflect'))
        out = pooled_h + pooled_w
        out = out + local

        out = self.proj(self.pad_out(out))
        return out[:, :, :height, :width]
class Block(nn.Module):
    """Residual block: norm -> attention, then norm -> MLP.

    Args:
        dim: Number of channels of the feature map.
        num_heads: Number of attention heads.
        mlp_ratio: Hidden width of the MLP as a multiple of ``dim``.
        qkv_bias: Whether the attention's qkv convolution has a bias.
        drop: Dropout probability inside the MLP.
        attn_drop: Accepted for interface compatibility; not used by this
            block.
        drop_path: Stochastic-depth rate; ``0`` disables it.
        act_layer: Activation class used by the MLP.
        norm_layer: Callable building a normalisation layer from ``dim``.
        window_size: Attention window size.
    """

    def __init__(self, dim=256, num_heads=16, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0.,
                 drop_path=0., act_layer=nn.ReLU6, norm_layer=nn.BatchNorm2d, window_size=8):
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = GlobalLocalAttention(dim, num_heads=num_heads, qkv_bias=qkv_bias,
                                         window_size=window_size)

        # Stochastic depth only when a positive rate is requested.
        if drop_path > 0.:
            self.drop_path = DropPath(drop_path)
        else:
            self.drop_path = nn.Identity()

        self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), out_features=dim,
                       act_layer=act_layer, drop=drop)
        self.norm2 = norm_layer(dim)

    def forward(self, x):
        """Apply the attention and MLP sub-layers, each with a residual."""
        attended = self.attn(self.norm1(x))
        x = x + self.drop_path(attended)
        transformed = self.mlp(self.norm2(x))
        return x + self.drop_path(transformed)
class WF(nn.Module):
    """Weighted fusion of an upsampled feature map with a skip feature.

    Args:
        in_channels: Channels of the skip feature ``res``.
        decode_channels: Channels of ``x`` and of the output.
        eps: Small constant added to the weight sum before normalising.
    """

    def __init__(self, in_channels=128, decode_channels=128, eps=1e-8):
        super().__init__()
        # 1x1 conv bringing the skip feature to the decoder width.
        self.pre_conv = Conv(in_channels, decode_channels, kernel_size=1)
        # Two learnable fusion weights, initialised to one.
        self.weights = nn.Parameter(torch.ones(2, dtype=torch.float32), requires_grad=True)
        self.eps = eps
        self.post_conv = ConvBNReLU(decode_channels, decode_channels, kernel_size=3)

    def forward(self, x, res):
        """Upsample ``x`` by 2, blend it with ``res`` and refine the result."""
        upsampled = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=False)
        # Clamp the weights to be non-negative, then normalise them.
        positive = F.relu(self.weights)
        fuse_weights = positive / (positive.sum(dim=0) + self.eps)
        fused = fuse_weights[0] * self.pre_conv(res) + fuse_weights[1] * upsampled
        return self.post_conv(fused)
class FeatureRefinementHead(nn.Module):
    """Weighted skip fusion followed by spatial and channel gating.

    Args:
        in_channels: Channels of the skip feature ``res``.
        decode_channels: Channels of ``x`` and of the output.
    """

    def __init__(self, in_channels=64, decode_channels=64):
        super().__init__()
        # 1x1 conv bringing the skip feature to the decoder width.
        self.pre_conv = Conv(in_channels, decode_channels, kernel_size=1)
        # Two learnable fusion weights, initialised to one.
        self.weights = nn.Parameter(torch.ones(2, dtype=torch.float32), requires_grad=True)
        self.eps = 1e-8
        self.post_conv = ConvBNReLU(decode_channels, decode_channels, kernel_size=3)

        # Per-pixel gate: depthwise 3x3 conv squashed by a sigmoid.
        self.pa = nn.Sequential(
            nn.Conv2d(decode_channels, decode_channels, kernel_size=3, padding=1, groups=decode_channels),
            nn.Sigmoid(),
        )
        # Per-channel gate: global average pool, bottleneck, sigmoid.
        # NOTE(review): the bottleneck width ``decode_channels // 16`` is 0
        # when decode_channels < 16.
        squeezed = decode_channels // 16
        self.ca = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            Conv(decode_channels, squeezed, kernel_size=1),
            nn.ReLU6(),
            Conv(squeezed, decode_channels, kernel_size=1),
            nn.Sigmoid(),
        )

        self.shortcut = ConvBN(decode_channels, decode_channels, kernel_size=1)
        self.proj = SeparableConvBN(decode_channels, decode_channels, kernel_size=3)
        self.act = nn.ReLU6()

    def forward(self, x, res):
        """Upsample ``x`` by 2, fuse it with ``res`` and apply both gates."""
        upsampled = F.interpolate(x, scale_factor=2, mode='bilinear', align_corners=False)
        # Clamp the weights to be non-negative, then normalise them.
        positive = F.relu(self.weights)
        fuse_weights = positive / (positive.sum(dim=0) + self.eps)
        fused = fuse_weights[0] * self.pre_conv(res) + fuse_weights[1] * upsampled
        fused = self.post_conv(fused)

        shortcut = self.shortcut(fused)
        spatially_gated = self.pa(fused) * fused
        channel_gated = self.ca(fused) * fused
        refined = self.proj(spatially_gated + channel_gated) + shortcut
        return self.act(refined)
class AuxHead(nn.Module):
    """Auxiliary classification head resized to a requested output size.

    Args:
        in_channels: Channels of the incoming feature map.
        num_classes: Number of output channels (one per class).
    """

    def __init__(self, in_channels=64, num_classes=8):
        super().__init__()
        self.conv = ConvBNReLU(in_channels, in_channels)
        self.drop = nn.Dropout(0.1)
        self.conv_out = Conv(in_channels, num_classes, kernel_size=1)

    def forward(self, x, h, w):
        """Return class logits for ``x`` bilinearly resized to ``(h, w)``."""
        logits = self.conv_out(self.drop(self.conv(x)))
        return F.interpolate(logits, size=(h, w), mode='bilinear', align_corners=False)
class Decoder(nn.Module):
    """Decoder that fuses four encoder features into segmentation logits.

    Starting from ``res4``, each stage applies an attention ``Block``, then
    upsamples and fuses with the next encoder feature (``res3``, ``res2``,
    ``res1``). In training mode an auxiliary prediction built from the
    three intermediate decoder features is returned as well.

    Args:
        encoder_channels: Channel counts of the four encoder features, in
            the order ``(res1, res2, res3, res4)``.
        decode_channels: Channel width used throughout the decoder.
        dropout: Dropout probability in the segmentation head.
        window_size: Attention window size of the blocks.
        num_classes: Number of output classes.
    """

    def __init__(self,
                 encoder_channels=(64, 128, 256, 512),
                 decode_channels=64,
                 dropout=0.1,
                 window_size=8,
                 num_classes=6):
        super(Decoder, self).__init__()
        self.pre_conv = ConvBN(encoder_channels[-1], decode_channels, kernel_size=1)
        self.b4 = Block(dim=decode_channels, num_heads=8, window_size=window_size)

        self.b3 = Block(dim=decode_channels, num_heads=8, window_size=window_size)
        self.p3 = WF(encoder_channels[-2], decode_channels)

        self.b2 = Block(dim=decode_channels, num_heads=8, window_size=window_size)
        self.p2 = WF(encoder_channels[-3], decode_channels)

        # The auxiliary branch is built unconditionally. The former
        # ``if self.training:`` guard was always true here, because a module
        # is in training mode while it is being constructed.
        self.up4 = nn.UpsamplingBilinear2d(scale_factor=4)
        self.up3 = nn.UpsamplingBilinear2d(scale_factor=2)
        self.aux_head = AuxHead(decode_channels, num_classes)

        self.p1 = FeatureRefinementHead(encoder_channels[-4], decode_channels)

        self.segmentation_head = nn.Sequential(ConvBNReLU(decode_channels, decode_channels),
                                               nn.Dropout2d(p=dropout, inplace=True),
                                               Conv(decode_channels, num_classes, kernel_size=1))
        self.init_weight()

    def forward(self, res1, res2, res3, res4, h, w):
        """Decode the encoder features into logits of size ``(h, w)``.

        Returns:
            In training mode a tuple ``(logits, aux_logits)``; otherwise
            only ``logits``.
        """
        with_aux = self.training
        aux_feats = []

        x = self.b4(self.pre_conv(res4))
        if with_aux:
            # Upsampled x4 so it can be summed with the last decoder stage.
            aux_feats.append(self.up4(x))

        x = self.b3(self.p3(x, res3))
        if with_aux:
            aux_feats.append(self.up3(x))

        x = self.b2(self.p2(x, res2))
        if with_aux:
            aux_feats.append(x)

        x = self.p1(x, res1)
        x = self.segmentation_head(x)
        x = F.interpolate(x, size=(h, w), mode='bilinear', align_corners=False)

        if not with_aux:
            return x

        h4, h3, h2 = aux_feats
        ah = self.aux_head(h4 + h3 + h2, h, w)
        return x, ah

    def init_weight(self):
        """Kaiming-initialise every convolution contained in the decoder.

        The walk uses ``self.modules()`` so that convolutions nested inside
        the composite sub-modules are reached. Iterating ``self.children()``
        only visits the direct children, none of which is an ``nn.Conv2d``,
        which made this method a silent no-op.
        """
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, a=1)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
class FuseDisNet(nn.Module):
    """Segmentation network: a timm feature backbone plus ``Decoder``.

    Args:
        decode_channels: Channel width used by the decoder.
        dropout: Dropout probability in the decoder's segmentation head.
        backbone_name: Model name passed to ``timm.create_model``.
        pretrained: Whether timm should load pretrained backbone weights.
        window_size: Attention window size used by the decoder.
        num_classes: Number of output classes.
    """

    def __init__(self,
                 decode_channels=64,
                 dropout=0.1,
                 backbone_name='swsl_resnet18',
                 pretrained=True,
                 window_size=8,
                 num_classes=6
                 ):
        super().__init__()
        # Feature-extraction backbone returning four feature maps.
        self.backbone = timm.create_model(backbone_name, features_only=True, output_stride=32,
                                          out_indices=(1, 2, 3, 4), pretrained=pretrained)
        # The decoder is sized from the channel counts the backbone reports.
        encoder_channels = self.backbone.feature_info.channels()
        self.decoder = Decoder(encoder_channels, decode_channels, dropout, window_size, num_classes)

    def forward(self, x):
        """Run backbone and decoder on ``x``.

        Returns:
            In training mode a tuple ``(logits, aux_logits)``; otherwise
            only ``logits``. Outputs are resized to the spatial size of
            ``x``.
        """
        height, width = x.size()[-2:]
        res1, res2, res3, res4 = self.backbone(x)
        decoded = self.decoder(res1, res2, res3, res4, height, width)
        if self.training:
            logits, aux_logits = decoded
            return logits, aux_logits
        return decoded