import torch
from torch import nn
from torch.nn import functional as F


class CoTAttention(nn.Module):
    """Contextual Transformer (CoT) style attention.

    Combines a static context (a grouped k x k convolution over the input, acting
    as the keys) with a dynamic context (values reweighted by attention maps
    predicted from the concatenation of the static context and the input).
    """

    def __init__(self, dim=512, kernel_size=3):
        super().__init__()
        self.dim = dim
        self.kernel_size = kernel_size

        # Static context: grouped k x k convolution over the keys.
        self.key_embed = nn.Sequential(
            nn.Conv2d(dim, dim, kernel_size=kernel_size, padding=kernel_size // 2, groups=4, bias=False),
            nn.BatchNorm2d(dim),
            nn.ReLU()
        )
        # Value embedding: 1x1 convolution.
        self.value_embed = nn.Sequential(
            nn.Conv2d(dim, dim, 1, bias=False),
            nn.BatchNorm2d(dim)
        )

        # Attention embedding: predicts k*k attention weights per channel
        # from the concatenated [static context, input].
        factor = 4
        self.attention_embed = nn.Sequential(
            nn.Conv2d(2 * dim, 2 * dim // factor, 1, bias=False),
            nn.BatchNorm2d(2 * dim // factor),
            nn.ReLU(),
            nn.Conv2d(2 * dim // factor, kernel_size * kernel_size * dim, 1)
        )

    def forward(self, x):
        bs, c, h, w = x.shape
        k1 = self.key_embed(x)                    # static context: bs, c, h, w
        v = self.value_embed(x).view(bs, c, -1)   # values: bs, c, h*w

        y = torch.cat([k1, x], dim=1)             # bs, 2c, h, w
        att = self.attention_embed(y)             # bs, c*k*k, h, w
        att = att.reshape(bs, c, self.kernel_size * self.kernel_size, h, w)
        att = att.mean(2, keepdim=False).view(bs, c, -1)  # bs, c, h*w
        k2 = F.softmax(att, dim=-1) * v           # dynamic context
        k2 = k2.view(bs, c, h, w)

        return k1 + k2  # input: N C H W, output: N C H W


if __name__ == '__main__':
    block = CoTAttention(64)
    x = torch.rand(1, 64, 64, 64)
    y = block(x)
    print(x.size(), y.size())
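

# A minimal sketch (not part of the original source) of how CoTAttention might be
# dropped into a residual-style convolutional block. The block name, the channel
# count, and the surrounding conv/BN layers are assumptions for illustration only.
class ToyCoTBlock(nn.Module):
    def __init__(self, channels=64):
        super().__init__()
        self.conv = nn.Conv2d(channels, channels, 3, padding=1, bias=False)
        self.bn = nn.BatchNorm2d(channels)
        self.attn = CoTAttention(dim=channels, kernel_size=3)

    def forward(self, x):
        out = F.relu(self.bn(self.conv(x)))
        out = self.attn(out)        # spatial size and channel count are preserved
        return F.relu(out + x)      # residual connection

# Hypothetical usage:
#   toy = ToyCoTBlock(64)
#   y = toy(torch.rand(2, 64, 32, 32))   # -> torch.Size([2, 64, 32, 32])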