#!/usr/bin/env python 
# -*- coding: utf-8 -*-
"""
@project: 
@File    : farsegloss
@Author  : qiqq
@create_time    : 2022/11/25 14:54
farseg论文里的F-A模块对于loss的优化
"""

import torch
import torch.nn.functional as F
import math
from torch import  nn


def softmax_focalloss(y_pred, y_true, ignore_index=255, gamma=2.0, normalize=False):
    """Softmax focal loss for dense (per-pixel) classification.

    Each pixel's cross-entropy is weighted by ``(1 - p_t) ** gamma`` where
    ``p_t`` is the predicted probability of the ground-truth class, so
    well-classified pixels contribute less.

    Args:
        y_pred: logits, [N, #class, H, W].
        y_true: integer labels, [N, H, W]; pixels equal to ``ignore_index``
            are excluded.
        ignore_index: label value to ignore.
        gamma: focusing exponent of the modulating factor.
        normalize: if True, rescale the weighted loss so that its sum equals
            the sum of the unweighted cross-entropy.

    Returns:
        0-dim tensor: weighted loss sum divided by
        ``(#valid pixels + batch size)``.
    """
    losses = F.cross_entropy(y_pred, y_true, ignore_index=ignore_index, reduction='none')
    # The weights are computed without gradient tracking: they act as constant
    # per-pixel coefficients, and only the cross-entropy term is differentiated.
    with torch.no_grad():
        probs = y_pred.softmax(dim=1)
        valid_mask = y_true.ne(ignore_index)
        # gather() needs an in-range class index at every pixel, so ignored
        # pixels are temporarily pointed at class 0. Their cross-entropy is
        # already 0 (ignore_index), so the weight picked here has no effect.
        gather_index = torch.where(valid_mask, y_true, torch.zeros_like(y_true))
        # Pick, for every pixel, the weight of its ground-truth class.
        modulating_factor = (1 - probs).pow(gamma).gather(1, gather_index.unsqueeze(1)).squeeze(1)
        scale = 1.
        if normalize:
            weighted_total = (losses * modulating_factor).sum()
            # Fall back to scale 1 when the weighted sum is 0 (e.g. every pixel
            # ignored); a plain division would give 0/0 = NaN.
            scale = torch.where(weighted_total > 0,
                                losses.sum() / weighted_total,
                                torch.ones_like(weighted_total))
    # Adding the batch size keeps the denominator positive even when no pixel
    # is valid.
    losses = scale * (losses * modulating_factor).sum() / (valid_mask.sum() + probs.size(0))

    return losses


def cosine_annealing(lower_bound, upper_bound, _t, _t_max):
    """Cosine interpolation between two values.

    Returns ``lower_bound`` at ``_t == 0`` and ``upper_bound`` at
    ``_t == _t_max``, following half a cosine period in between.
    """
    span = lower_bound - upper_bound
    cosine_term = math.cos(math.pi * _t / _t_max) + 1
    return upper_bound + 0.5 * span * cosine_term


def poly_annealing(lower_bound, upper_bound, _t, _t_max):
    """Polynomial (power 0.9) interpolation between two values.

    Returns ``lower_bound`` at ``_t == 0`` and ``upper_bound`` at
    ``_t >= _t_max``.
    """
    # Clamp at 0: for _t > _t_max the base would be negative, and a negative
    # float raised to a fractional power yields a complex number in Python.
    remaining = max(1 - _t / _t_max, 0.0)
    factor = remaining ** 0.9
    return upper_bound + factor * (lower_bound - upper_bound)


def linear_annealing(lower_bound, upper_bound, _t, _t_max):
    """Linear interpolation: ``lower_bound`` at ``_t == 0``, ``upper_bound`` at ``_t == _t_max``."""
    remaining_fraction = 1 - _t / _t_max
    offset = remaining_fraction * (lower_bound - upper_bound)
    return upper_bound + offset


def annealing_softmax_focalloss(y_pred, y_true, t, t_max, ignore_index=255, gamma=2.0,
                                annealing_function=cosine_annealing):
    """Focal loss whose per-pixel weight is annealed from 1 to the focal weight.

    The per-pixel weight is ``(1 / Z) * (1 - p_t) ** gamma``, where ``Z``
    rescales the weights so the weighted loss sum equals the unweighted
    cross-entropy sum. For ``t <= t_max`` the weight actually applied is
    ``annealing_function(1, weight, t, t_max)``; for ``t > t_max`` the full
    weight is used.

    Args:
        y_pred: logits, [N, #class, H, W].
        y_true: integer labels, [N, H, W].
        t: current training step.
        t_max: number of steps over which the weight is annealed.
        ignore_index: label value excluded from the loss.
        gamma: focusing exponent.
        annealing_function: callable ``(lower_bound, upper_bound, t, t_max)``.

    Returns:
        0-dim tensor: weighted loss sum divided by
        ``(#valid pixels + batch size)``.
    """
    losses = F.cross_entropy(y_pred, y_true, ignore_index=ignore_index, reduction='none')
    # Weights are treated as constants; only the cross-entropy is differentiated.
    with torch.no_grad():
        probs = y_pred.softmax(dim=1)
        valid_mask = y_true.ne(ignore_index)
        # Ignored pixels get a dummy class 0 so gather() has a valid index;
        # their cross-entropy is already 0.
        gather_index = torch.where(valid_mask, y_true, torch.zeros_like(y_true))
        modulating_factor = (1 - probs).pow(gamma).gather(1, gather_index.unsqueeze(1)).squeeze(1)
        weighted_total = (losses * modulating_factor).sum()
        # Use 1 when the weighted sum is 0 (e.g. every pixel ignored) instead
        # of letting 0/0 turn the whole loss into NaN.
        normalizer = torch.where(weighted_total > 0,
                                 losses.sum() / weighted_total,
                                 torch.ones_like(weighted_total))
        scales = modulating_factor * normalizer
    if t > t_max:
        # Annealing finished: apply the full normalized focal weight.
        scale = scales
    else:
        # Blend from uniform weight 1 towards the focal weight.
        scale = annealing_function(1, scales, t, t_max)
    # The batch-size term keeps the denominator positive with no valid pixels.
    losses = (losses * scale).sum() / (valid_mask.sum() + probs.size(0))
    return losses


class OhemCELoss(nn.Module):
    """Online hard example mining (OHEM) cross-entropy loss.

    Per-pixel losses are sorted in descending order. If the loss at rank
    ``n_min`` still exceeds the threshold, every loss above the threshold is
    averaged; otherwise the ``n_min`` largest losses are averaged.

    Args:
        thresh: probability threshold; converted to a loss threshold
            ``-log(thresh)``.
        n_min: minimum number of pixels that contribute to the loss.
        ignore_lb: label value ignored by the cross-entropy.
    """

    def __init__(self, thresh, n_min, ignore_lb=255, *args, **kwargs):
        super(OhemCELoss, self).__init__()
        # Stored on the CPU; forward() moves it to the device of the losses,
        # so the module can be constructed on machines without CUDA.
        self.thresh = -torch.log(torch.tensor(thresh, dtype=torch.float))
        self.n_min = n_min
        self.ignore_lb = ignore_lb
        self.criteria = nn.CrossEntropyLoss(ignore_index=ignore_lb, reduction='none')

    def forward(self, logits, labels):
        """Return the mean of the selected hard-example losses (0-dim tensor)."""
        loss = self.criteria(logits, labels).view(-1)
        loss, _ = torch.sort(loss, descending=True)
        thresh = self.thresh.to(loss.device)
        # Guard the index: with fewer than n_min + 1 pixels there is no element
        # at rank n_min, so fall through and average everything available.
        if self.n_min < loss.numel() and loss[self.n_min] > thresh:
            loss = loss[loss > thresh]
        else:
            loss = loss[:self.n_min]
        return torch.mean(loss)





def setup_seed(seed=0):
    """Seed the Python, NumPy and PyTorch random number generators with ``seed``."""
    import random

    import numpy as np
    import torch

    random.seed(seed)        # Python's `random` module
    np.random.seed(seed)     # NumPy global generator
    torch.manual_seed(seed)  # PyTorch generator
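############################################################################
# 2022-12-08: I consider the experiments below abandoned.



######################### The code below is obsolete and is going to be rewritten.


#### From here down to `main`, everything related to my own loss-function optimization is obsolete.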
def sampleloss(y_pred, y_true, ignore_index=255, gamma=2.0, normalize=False):
    """Placeholder for a combined foreground / hard-background loss.

    Not implemented: the function ignores its arguments and returns ``None``.

    Design notes left by the author:
        * The background class must be labelled 0.
        * The loss has two parts: the loss of all foreground pixels plus the
          loss of a subset of hard background pixels, with the background
          weighting modelled on FarSeg.
        * A background pixel is "hard" when its predicted probability is
          below 0.5 (a provisional hyper-parameter).
        * Step 1: select all foreground pixels.
          Step 2: select the background pixels that satisfy the condition.

    Args:
        y_pred: [N, #class, H, W]
        y_true: [N, H, W] from 0 to #class
    """
    return None


def sample_foreground(y_pred, y_true, ignore_index=255, backgroud=0, gamma=2.0, normalize=False):
    """Mean cross-entropy over foreground pixels only.

    A pixel is foreground when its label is neither ``ignore_index`` nor
    ``backgroud``.

    Args:
        y_pred: logits, [N, #class, H, W].
        y_true: integer labels, [N, H, W].
        ignore_index: label value excluded from the loss.
        backgroud: label value of the background class.
        gamma: unused; kept for signature compatibility.
        normalize: unused; kept for signature compatibility.

    Returns:
        0-dim tensor: sum of foreground losses divided by the number of
        foreground pixels, or 0 when there is no foreground pixel.
    """
    # Cross-entropy of every pixel; ignored pixels come back as 0.
    losses = F.cross_entropy(y_pred, y_true, ignore_index=ignore_index, reduction='none')
    foreground = y_true.ne(ignore_index) & y_true.ne(backgroud)
    # Clamp the count so an image without foreground yields 0 instead of
    # 0/0 = NaN.
    foreground_count = foreground.sum().clamp(min=1)
    # Multiplying by the 0/1 mask keeps only the foreground contributions.
    return (losses * foreground.to(losses.dtype)).sum() / foreground_count

def sample_hardbackgroundversion1(y_pred, y_true, ignore_index=255, threshold=0.5, backgroud=0, gamma=2.0, normalize=False):
    """Mean cross-entropy over hard background pixels, with a fixed threshold.

    A background pixel is "hard" when the predicted probability of the
    background class is below ``threshold``.

    Args:
        y_pred: logits, [N, #class, H, W].
        y_true: integer labels, [N, H, W].
        ignore_index: label value excluded from the loss.
        threshold: probability below which a background pixel counts as hard.
        backgroud: label value (and channel index) of the background class.
        gamma: unused; kept for signature compatibility.
        normalize: unused; kept for signature compatibility.

    Returns:
        0-dim tensor: sum of the hard-background losses divided by their
        count, or 0 when no pixel qualifies.
    """
    # Cross-entropy of every pixel, foreground and background alike.
    losses = F.cross_entropy(y_pred, y_true, ignore_index=ignore_index, reduction='none')

    # Only the pixel selection is excluded from autograd.
    with torch.no_grad():
        # Probability assigned to the background class (not a hard-coded
        # channel 0, so a non-default `backgroud` is honoured).
        background_prob = y_pred.softmax(dim=1)[:, backgroud]
        is_background = y_true.ne(ignore_index) & y_true.eq(backgroud)
        hard_background = is_background & (background_prob < threshold)
        # Clamp so that "no hard background" gives 0 rather than 0/0 = NaN.
        count = hard_background.sum().clamp(min=1)

    # The masked loss is computed outside no_grad so that the result carries
    # gradients and can be back-propagated.
    finalloss = (losses * hard_background.to(losses.dtype)).sum() / count

    return finalloss

    


def sample_hardbackgroundversion2(y_pred, y_true, t, t_max, ignore_index=255, backgroud=0, annealing_function=cosine_annealing):
    """Mean cross-entropy over hard background pixels, with an annealed threshold.

    A background pixel is "hard" when the predicted background probability is
    below the current threshold. For ``t <= t_max`` the threshold is
    ``annealing_function(0.8, 0.5, t, t_max)``; for ``t > t_max`` it is fixed
    at 0.5 (a dynamic threshold modelled on FarSeg's annealing).

    Args:
        y_pred: logits, [N, #class, H, W].
        y_true: integer labels, [N, H, W].
        t: current training step.
        t_max: number of steps over which the threshold is annealed.
        ignore_index: label value excluded from the loss.
        backgroud: label value (and channel index) of the background class.
        annealing_function: callable ``(lower_bound, upper_bound, t, t_max)``.

    Returns:
        0-dim tensor: sum of the hard-background losses divided by their
        count, or 0 when no pixel qualifies.
    """
    # Cross-entropy of every pixel, foreground and background alike.
    losses = F.cross_entropy(y_pred, y_true, ignore_index=ignore_index, reduction='none')

    # Only the pixel selection is excluded from autograd.
    with torch.no_grad():
        if t > t_max:
            threshold = 0.5
        else:
            threshold = annealing_function(0.8, 0.5, t, t_max)
        # Probability assigned to the background class (not a hard-coded
        # channel 0, so a non-default `backgroud` is honoured).
        background_prob = y_pred.softmax(dim=1)[:, backgroud]
        is_background = y_true.ne(ignore_index) & y_true.eq(backgroud)
        hard_background = is_background & (background_prob < threshold)
        # Clamp so that "no hard background" gives 0 rather than 0/0 = NaN.
        count = hard_background.sum().clamp(min=1)

    # Computed outside no_grad so the returned loss can be back-propagated.
    finalloss = (losses * hard_background.to(losses.dtype)).sum() / count

    return finalloss




def foreground_sample_hardbackgroundversion1(y_pred, y_true, ignore_index=255, threshold=0.5, backgroud=0, gamma=2.0, normalize=False):
    """Mean cross-entropy over all foreground pixels plus hard background pixels.

    Foreground and qualifying background pixels are selected in a single
    mask. A background pixel is "hard" when the predicted background
    probability is below the fixed ``threshold``.

    Args:
        y_pred: logits, [N, #class, H, W].
        y_true: integer labels, [N, H, W].
        ignore_index: label value excluded from the loss.
        threshold: probability below which a background pixel counts as hard.
        backgroud: label value (and channel index) of the background class.
        gamma: unused; kept for signature compatibility.
        normalize: unused; kept for signature compatibility.

    Returns:
        0-dim tensor: sum of the selected losses divided by the number of
        selected pixels, or 0 when no pixel is selected.
    """
    # Cross-entropy of every pixel, foreground and background alike.
    losses = F.cross_entropy(y_pred, y_true, ignore_index=ignore_index, reduction='none')
    with torch.no_grad():
        valid = y_true.ne(ignore_index)
        # 1. Foreground: neither ignored nor background.
        foreground = valid & y_true.ne(backgroud)
        # 2. Hard background: background pixels predicted with low confidence.
        #    The background channel follows `backgroud` instead of a
        #    hard-coded channel 0.
        background_prob = y_pred.softmax(dim=1)[:, backgroud]
        hard_background = valid & y_true.eq(backgroud) & (background_prob < threshold)
        # 3. Union of both selections.
        selected = foreground | hard_background
        # Clamp so that an empty selection gives 0 rather than 0/0 = NaN.
        valcount = selected.sum().clamp(min=1)

    finaloss = (losses * selected.to(losses.dtype)).sum() / valcount

    return finaloss



def foreground_sample_hardbackgroundversion2(y_pred, y_true, t, t_max, ignore_index=255, backgroud=0, annealing_function=cosine_annealing):
    """Mean cross-entropy over foreground plus hard background, annealed threshold.

    Foreground and qualifying background pixels are selected in a single
    mask. A background pixel is "hard" when the predicted background
    probability is below the current threshold. For ``t <= t_max`` the
    threshold is ``annealing_function(0.99, 0.5, t, t_max)``; for
    ``t > t_max`` it is fixed at 0.5.

    Args:
        y_pred: logits, [N, #class, H, W].
        y_true: integer labels, [N, H, W].
        t: current training step.
        t_max: number of steps over which the threshold is annealed.
        ignore_index: label value excluded from the loss.
        backgroud: label value (and channel index) of the background class.
        annealing_function: callable ``(lower_bound, upper_bound, t, t_max)``.

    Returns:
        0-dim tensor: sum of the selected losses divided by the number of
        selected pixels, or 0 when no pixel is selected.
    """
    # Cross-entropy of every pixel, foreground and background alike.
    losses = F.cross_entropy(y_pred, y_true, ignore_index=ignore_index, reduction='none')
    with torch.no_grad():
        valid = y_true.ne(ignore_index)
        # 1. Foreground: neither ignored nor background.
        foreground = valid & y_true.ne(backgroud)

        # 2. Hard background under the current (annealed) threshold.
        if t > t_max:
            threshold = 0.5
        else:
            threshold = annealing_function(0.99, 0.5, t, t_max)
        # The background channel follows `backgroud` instead of a hard-coded
        # channel 0.
        background_prob = y_pred.softmax(dim=1)[:, backgroud]
        hard_background = valid & y_true.eq(backgroud) & (background_prob < threshold)

        # 3. Union of both selections.
        selected = foreground | hard_background
        # Clamp so that an empty selection gives 0 rather than 0/0 = NaN.
        valcount = selected.sum().clamp(min=1)

    finaloss = (losses * selected.to(losses.dtype)).sum() / valcount

    return finaloss


###########################################################





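## Loss-function optimization ########### rewrite
'''
1. Idea behind version 1
To address foreground/background imbalance:
all foreground pixels take part in the loss, while only hard background pixels do.
Hard background selection: set a threshold (e.g. 0.5); a background pixel whose
predicted background probability is below the threshold counts as hard.
The threshold is dynamic and follows a cosine schedule: it goes from 0.99 to 0.5
over the first x epochs and stays fixed at 0.5 afterwards (in the spirit of FarSeg).

2. Idea behind version 2
On hard-example mining:
essentially the same as focal loss.
1. First select the foreground pixels and the background pixels that satisfy the threshold, as in version 1.
2. Then weight all of them with the focal-loss hard-example factor (1 - p)^gamma; this weighting can also be annealed dynamically.
'''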


def fb_loss_version1(y_pred, y_true, t, t_max, ignore_index=255, backgroud=0, gamma=2.0, annealing_function=cosine_annealing):
    """Foreground/background balanced cross-entropy (version 1).

    All foreground pixels contribute; background pixels contribute only when
    they are "hard", i.e. their predicted background probability is below the
    current threshold. For ``t <= t_max`` the threshold is
    ``annealing_function(0.99, 0.5, t, t_max)``; for ``t > t_max`` it is
    fixed at 0.5. If every background pixel is hard, this reduces to the
    ordinary mean cross-entropy over valid pixels.

    Args:
        y_pred: logits, [N, #class, H, W].
        y_true: integer labels, [N, H, W].
        t: current training step.
        t_max: number of steps over which the threshold is annealed.
        ignore_index: label value excluded from the loss.
        backgroud: label value (and channel index) of the background class.
        gamma: unused; kept for signature compatibility.
        annealing_function: callable ``(lower_bound, upper_bound, t, t_max)``.

    Returns:
        0-dim tensor: sum of the selected losses divided by the number of
        selected pixels, or 0 when no pixel is selected.
    """
    # Cross-entropy of every pixel, foreground and background alike.
    losses = F.cross_entropy(y_pred, y_true, ignore_index=ignore_index, reduction='none')
    with torch.no_grad():
        valid = y_true.ne(ignore_index)
        # 1. Foreground: neither ignored nor background.
        foreground = valid & y_true.ne(backgroud)

        # 2. Hard background under the current (annealed) threshold.
        if t > t_max:
            threshold = 0.5
        else:
            threshold = annealing_function(0.99, 0.5, t, t_max)
        # The background channel follows `backgroud` instead of a hard-coded
        # channel 0.
        background_prob = y_pred.softmax(dim=1)[:, backgroud]
        hard_background = valid & y_true.eq(backgroud) & (background_prob < threshold)

        # 3. Foreground or qualifying background.
        selected = foreground | hard_background
        # Clamp so that an empty selection gives 0 rather than 0/0 = NaN.
        valcount = selected.sum().clamp(min=1)

    finaloss = (losses * selected.to(losses.dtype)).sum() / valcount

    return finaloss


def fb_loss_version2(y_pred, y_true, t, t_max, ignore_index=255, backgroud=0, gamma=2.0, annealing_function=cosine_annealing):
    """Foreground/background balanced focal loss (version 2).

    Pixels are selected as in version 1 (all foreground plus hard
    background under an annealed threshold), and each selected pixel's
    cross-entropy is additionally weighted by the focal factor
    ``(1 - p_t) ** gamma``, where ``p_t`` is the predicted probability of the
    ground-truth class.

    Args:
        y_pred: logits, [N, #class, H, W].
        y_true: integer labels, [N, H, W].
        t: current training step.
        t_max: number of steps over which the threshold is annealed.
        ignore_index: label value excluded from the loss.
        backgroud: label value (and channel index) of the background class.
        gamma: focusing exponent of the focal factor.
        annealing_function: callable ``(lower_bound, upper_bound, t, t_max)``.

    Returns:
        0-dim tensor: sum of the weighted, selected losses divided by the
        number of selected pixels, or 0 when no pixel is selected.
    """
    # Cross-entropy of every pixel, foreground and background alike.
    losses = F.cross_entropy(y_pred, y_true, ignore_index=ignore_index, reduction='none')
    with torch.no_grad():
        probs = y_pred.softmax(dim=1)
        valid = y_true.ne(ignore_index)

        # 1. Foreground: neither ignored nor background.
        foreground = valid & y_true.ne(backgroud)

        # 2. Hard background under the current (annealed) threshold.
        if t > t_max:
            threshold = 0.5
        else:
            threshold = annealing_function(0.99, 0.5, t, t_max)
        # The background channel follows `backgroud` instead of a hard-coded
        # channel 0.
        hard_background = valid & y_true.eq(backgroud) & (probs[:, backgroud] < threshold)

        # 3. Foreground or qualifying background.
        selected = foreground | hard_background
        # Clamp so that an empty selection gives 0 rather than 0/0 = NaN.
        valcount = selected.sum().clamp(min=1)

        # Focal weighting on top of version 1. (1 - p) ** gamma has one channel
        # per class; gather() picks, per pixel, the channel of its ground-truth
        # label, giving a single weight per pixel. Ignored pixels are pointed
        # at class 0 only so that gather() has an in-range index; their
        # cross-entropy is already 0.
        gather_index = torch.where(valid, y_true, torch.zeros_like(y_true))
        modulating_factor = (1 - probs).pow(gamma).gather(1, gather_index.unsqueeze(1)).squeeze(1)

    finaloss = (modulating_factor * (losses * selected.to(losses.dtype))).sum() / valcount

    return finaloss






if __name__ == '__main__':
    # Manual smoke test: print a random prediction, a fixed target, the
    # softmax / log-softmax of the prediction, and the resulting focal loss.
    setup_seed()
    logits = torch.randn(1, 3, 4, 4, dtype=torch.float32)
    labels = torch.tensor([[[2, 1, 1, 2],
                            [2, 255, 2, 2],
                            [2, 0, 255, 2],
                            [1, 1, 0, 0]]])
    probabilities = logits.softmax(dim=1)
    log_probabilities = probabilities.log()

    sections = (
        ("---------pred----------", logits),
        ("---------target----------", labels),
        ("__________softmax___________", probabilities),
        ("__________log___________", log_probabilities),
    )
    for title, tensor in sections:
        print(title)
        print(tensor)

    print(softmax_focalloss(logits, labels))
#

#
#
# if __name__ == '__main__':
#
#     setup_seed()
#
#     input = torch.randn(1, 3, 4, 4, dtype=torch.float32)
#     target = torch.randint(3, (1, 4, 4), dtype=torch.int64)
#
#     print("---------pred----------")
#     print(input)
#
#     print("---------target----------")
#     print(target)
#
#     print("__________softmax___________")
#     soft = F.softmax(input, dim=1)
#     print(soft)
#     print("__________log___________")
#     log = torch.log(soft)
#     print(log)
#
#     print("-----------------------------")
#     # softmax_focalloss=softmax_focalloss(input,target)
#
#
#     # tmx = 10000
#     # for i in range(tmx):
#     #     loss = annealing_softmax_focalloss(input, target, t=i, t_max=tmx)
#     # # softmax_focalloss=softmax_focalloss(input,target)
#
#
#     # import matplotlib.pyplot as plt
#     #
#     # _t_max=300
#     # ylist= []
#     # xlist=[]
#     # for i in range(_t_max):
#     #      y=cosine_annealing(0.1,0.5,_t=i,_t_max=_t_max)
#     #      ylist.append(y)
#     #      xlist.append(i)
#     #
#     # plt.figure()
#     # plt.plot(xlist,ylist)
#     # plt.show()
#     #
#




#
# if __name__ == '__main__':
#
#     target = torch.tensor(([[[2, 1, 1, 2],
#                              [2, 255, 2, 2],
#                              [2, 0, 255, 2],
#                              [1, 1, 0, 0]]]))
#
#     epoch =300
#     for i in range(epoch):
#         # input = torch.randn(2, 3, 4, 4, dtype=torch.float32)
#         input = torch.randn(1, 3, 4, 4, dtype=torch.float32)
#
#
#         print("---------pred----------")
#         print(input)
#         print("---------target----------")
#         print(target)
#         print("__________softmax___________")
#         soft = F.softmax(input, dim=1)
#         print(soft)
#         print("__________log___________")
#         log = torch.log(soft)
#         print(log)
#         backloss =sample_hardbackgroundversion2(input,target,t=i+1, t_max=50)
#
#         print(backloss)
# #
#
#