import numpy as np import torch import torch.nn as nn import torchvision import torchvision.models as models from typing import Union, List, Tuple import os import video3d.utils.misc as misc import torch.nn.functional as F from siren_pytorch import SirenNet from video3d.triplane_texture.lift_architecture import Lift_Encoder from video3d.triplane_texture.triplane_transformer import Triplane_Transformer EPS = 1e-7 def get_activation(name, inplace=True, lrelu_param=0.2): if name == 'tanh': return nn.Tanh() elif name == 'sigmoid': return nn.Sigmoid() elif name == 'relu': return nn.ReLU(inplace=inplace) elif name == 'lrelu': return nn.LeakyReLU(lrelu_param, inplace=inplace) else: raise NotImplementedError class MLPWithPositionalEncoding(nn.Module): def __init__(self, cin, cout, num_layers, nf=256, dropout=0, activation=None, n_harmonic_functions=10, omega0=1, extra_dim=0, embed_concat_pts=True, symmetrize=False): super().__init__() self.extra_dim = extra_dim if n_harmonic_functions > 0: self.embedder = HarmonicEmbedding(n_harmonic_functions, omega0) dim_in = cin * 2 * n_harmonic_functions self.embed_concat_pts = embed_concat_pts if embed_concat_pts: dim_in += cin else: self.embedder = None dim_in = cin self.in_layer = nn.Linear(dim_in, nf) self.relu = nn.ReLU(inplace=True) self.mlp = MLP(nf + extra_dim, cout, num_layers, nf, dropout, activation) self.symmetrize = symmetrize def forward(self, x, feat=None): assert (feat is None and self.extra_dim == 0) or feat.shape[-1] == self.extra_dim if self.symmetrize: xs, ys, zs = x.unbind(-1) x = torch.stack([xs.abs(), ys, zs], -1) # mirror -x to +x if self.embedder is not None: x_in = self.embedder(x) if self.embed_concat_pts: x_in = torch.cat([x, x_in], -1) else: x_in = x x_in = self.relu(self.in_layer(x_in)) if feat is not None: # if len(feat.shape) == 1: # for _ in range(len(x_in.shape) - 1): # feat = feat.unsqueeze(0) # feat = feat.repeat(*x_in.shape[:-1], 1) x_in = torch.concat([x_in, feat], dim=-1) return self.mlp(x_in) class MLPWithPositionalEncoding_Style(nn.Module): def __init__(self, cin, cout, num_layers, nf=256, dropout=0, activation=None, n_harmonic_functions=10, omega0=1, extra_dim=0, embed_concat_pts=True, symmetrize=False, style_choice='film'): super().__init__() self.extra_dim = extra_dim if n_harmonic_functions > 0: self.embedder = HarmonicEmbedding(n_harmonic_functions, omega0) dim_in = cin * 2 * n_harmonic_functions self.embed_concat_pts = embed_concat_pts if embed_concat_pts: dim_in += cin else: self.embedder = None dim_in = cin self.in_layer = nn.Linear(dim_in, nf) self.relu = nn.ReLU(inplace=True) if extra_dim == 0: self.mlp = MLP(nf + extra_dim, cout, num_layers, nf, dropout, activation) else: if style_choice == 'film': self.mlp = MLP_FiLM(nf, cout, num_layers, nf, dropout, activation) self.style_mlp = MLP(extra_dim, nf*2, 2, nf, dropout, None) elif style_choice == 'mod': self.mlp = MLP_Mod(nf, cout, num_layers, nf, dropout, activation) self.style_mlp = MLP(extra_dim, nf, 2, nf, dropout, None) else: raise NotImplementedError self.style_choice = style_choice self.symmetrize = symmetrize def forward(self, x, feat=None): assert (feat is None and self.extra_dim == 0) or feat.shape[-1] == self.extra_dim if self.symmetrize: xs, ys, zs = x.unbind(-1) x = torch.stack([xs.abs(), ys, zs], -1) # mirror -x to +x if self.embedder is not None: x_in = self.embedder(x) if self.embed_concat_pts: x_in = torch.cat([x, x_in], -1) else: x_in = x x_in = self.relu(self.in_layer(x_in)) if feat is not None: style = self.style_mlp(feat) if 
self.style_choice == 'film': style = style.reshape(style.shape[:-1] + (-1, 2)) out = self.mlp(x_in, style) else: out = self.mlp(x_in) return out class MLP_FiLM(nn.Module): def __init__(self, cin, cout, num_layers, nf=256, dropout=0, activation=None): # default no dropout super().__init__() assert num_layers >= 1 self.num_layers = num_layers if num_layers == 1: self.network = Linear_FiLM(cin, cout, bias=False) else: self.relu = nn.ReLU(inplace=True) for i in range(num_layers): if i == 0: setattr(self, f'linear_{i}', Linear_FiLM(cin, nf, bias=False)) elif i == (num_layers-1): setattr(self, f'linear_{i}', Linear_FiLM(nf, cout, bias=False)) else: setattr(self, f'linear_{i}', Linear_FiLM(nf, nf, bias=False)) def forward(self, input, style): if self.num_layers == 1: out = self.network(input, style) else: x = input for i in range(self.num_layers): linear_layer = getattr(self, f'linear_{i}') if i == (self.num_layers - 1): x = linear_layer(x, style) else: x = linear_layer(x, style) x = self.relu(x) out = x return out class MLP_Mod(nn.Module): def __init__(self, cin, cout, num_layers, nf=256, dropout=0, activation=None): # default no dropout super().__init__() assert num_layers >= 1 self.num_layers = num_layers if num_layers == 1: self.network = Linear_Mod(cin, cout, bias=False) else: self.relu = nn.ReLU(inplace=True) for i in range(num_layers): if i == 0: setattr(self, f'linear_{i}', Linear_Mod(cin, nf, bias=False)) elif i == (num_layers-1): setattr(self, f'linear_{i}', Linear_Mod(nf, cout, bias=False)) else: setattr(self, f'linear_{i}', Linear_Mod(nf, nf, bias=False)) def forward(self, input, style): if self.num_layers == 1: out = self.network(input, style) else: x = input for i in range(self.num_layers): linear_layer = getattr(self, f'linear_{i}') if i == (self.num_layers - 1): x = linear_layer(x, style) else: x = linear_layer(x, style) x = self.relu(x) out = x return out import math class Linear_FiLM(nn.Module): def __init__(self, in_features: int, out_features: int, bias: bool = True, device=None, dtype=None) -> None: factory_kwargs = {'device': device, 'dtype': dtype} super().__init__() self.in_features = in_features self.out_features = out_features self.weight = nn.Parameter(torch.empty((out_features, in_features), **factory_kwargs)) if bias: self.bias = nn.Parameter(torch.empty(out_features, **factory_kwargs)) else: self.register_parameter('bias', None) self.reset_parameters() def reset_parameters(self) -> None: nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5)) if self.bias is not None: fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weight) bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 nn.init.uniform_(self.bias, -bound, bound) def forward(self, input, style): # if input is [..., D], style should be [..., D, 2] x = input * style[..., 0] + style[..., 1] return torch.nn.functional.linear(x, self.weight, self.bias) def extra_repr(self) -> str: return 'in_features={}, out_features={}, bias={}'.format( self.in_features, self.out_features, self.bias is not None ) class Linear_Mod(nn.Module): def __init__(self, in_features: int, out_features: int, bias: bool = True, device=None, dtype=None) -> None: factory_kwargs = {'device': device, 'dtype': dtype} super().__init__() self.in_features = in_features self.out_features = out_features self.weight = nn.Parameter(torch.empty((out_features, in_features), **factory_kwargs)) if bias: self.bias = nn.Parameter(torch.empty(out_features, **factory_kwargs)) else: self.register_parameter('bias', None) self.reset_parameters() def 
reset_parameters(self) -> None: nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5)) if self.bias is not None: fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weight) bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 nn.init.uniform_(self.bias, -bound, bound) def forward(self, input, style): # weight: [out_features, in_features] # style: [..., in_features] if len(style.shape) > 1: style = style.reshape(-1, style.shape[-1]) style = style[0] weight = self.weight * style.unsqueeze(0) decoefs = ((weight * weight).sum(dim=-1, keepdim=True) + 1e-5).sqrt() weight = weight / decoefs return torch.nn.functional.linear(input, weight, self.bias) def extra_repr(self) -> str: return 'in_features={}, out_features={}, bias={}'.format( self.in_features, self.out_features, self.bias is not None ) class MLPTextureSimple(nn.Module): def __init__(self, cin, cout, num_layers, nf=256, dropout=0, activation=None, min_max=None, n_harmonic_functions=10, omega0=1, extra_dim=0, embed_concat_pts=True, perturb_normal=False, symmetrize=False, texture_act='relu', linear_bias=False): super().__init__() self.extra_dim = extra_dim if n_harmonic_functions > 0: self.embedder = HarmonicEmbedding(n_harmonic_functions, omega0) dim_in = cin * 2 * n_harmonic_functions self.embed_concat_pts = embed_concat_pts if embed_concat_pts: dim_in += cin else: self.embedder = None dim_in = cin self.in_layer = nn.Linear(dim_in, nf) self.relu = nn.ReLU(inplace=True) if texture_act == 'sin': print('using siren network for texture mlp here') self.mlp = SirenNet( dim_in=(nf + extra_dim), dim_hidden=nf, dim_out=cout, num_layers=num_layers, final_activation=get_activation(activation), w0_initial=30, use_bias=linear_bias, dropout=dropout ) else: self.mlp = MLP(nf + extra_dim, cout, num_layers, nf, dropout, activation, inner_act=texture_act, linear_bias=linear_bias) self.perturb_normal = perturb_normal self.symmetrize = symmetrize if min_max is not None: self.register_buffer('min_max', min_max) else: self.min_max = None self.bsdf = None def sample(self, x, feat=None): assert (feat is None and self.extra_dim == 0) or (feat.shape[-1] == self.extra_dim) b, h, w, c = x.shape if self.symmetrize: xs, ys, zs = x.unbind(-1) x = torch.stack([xs.abs(), ys, zs], -1) # mirror -x to +x x = x.view(-1, c) if self.embedder is not None: x_in = self.embedder(x) if self.embed_concat_pts: x_in = torch.cat([x, x_in], -1) else: x_in = x x_in = self.in_layer(x_in) if feat is not None: feat = feat[:,None,None].expand(b, h, w, -1).reshape(b*h*w, -1) x_in = torch.concat([x_in, feat], dim=-1) out = self.mlp(self.relu(x_in)) if self.min_max is not None: out = out * (self.min_max[1][None, :] - self.min_max[0][None, :]) + self.min_max[0][None, :] return out.view(b, h, w, -1) class MLPTextureTriplane(nn.Module): def __init__(self, cin, cout, num_layers, nf=256, dropout=0, activation=None, min_max=None, n_harmonic_functions=10, omega0=1, extra_dim=0, embed_concat_pts=True, perturb_normal=False, symmetrize=False, texture_act='relu', linear_bias=False, cam_pos_z_offset=10., grid_scale=7,): super().__init__() self.extra_dim = extra_dim if n_harmonic_functions > 0: self.embedder = HarmonicEmbedding(n_harmonic_functions, omega0) dim_in = cin * 2 * n_harmonic_functions self.embed_concat_pts = embed_concat_pts if embed_concat_pts: dim_in += cin else: self.embedder = None dim_in = cin self.in_layer = nn.Linear(dim_in, nf) self.relu = nn.ReLU(inplace=True) self.feat_net = Triplane_Transformer( emb_dim=256, num_layers=8, triplane_dim=80, triplane_scale=grid_scale ) self.extra_dim -= 
extra_dim self.extra_dim += (self.feat_net.triplane_dim * 3) if texture_act == 'sin': print('using siren network for texture mlp here') self.mlp = SirenNet( dim_in=(nf + self.extra_dim), dim_hidden=nf, dim_out=cout, num_layers=num_layers, final_activation=get_activation(activation), w0_initial=30, use_bias=linear_bias, dropout=dropout ) else: self.mlp = MLP(nf + self.extra_dim, cout, num_layers, nf, dropout, activation, inner_act=texture_act, linear_bias=linear_bias) self.perturb_normal = perturb_normal self.symmetrize = symmetrize if min_max is not None: self.register_buffer('min_max', min_max) else: self.min_max = None self.bsdf = None def sample(self, x, feat=None, feat_map=None, mvp=None, w2c=None, deform_xyz=None): # assert (feat is None and self.extra_dim == 0) or (feat.shape[-1] == self.extra_dim) b, h, w, c = x.shape if self.symmetrize: xs, ys, zs = x.unbind(-1) x = torch.stack([xs.abs(), ys, zs], -1) # mirror -x to +x if isinstance(feat_map, dict): feat_map = feat_map["im_features_map"] feat_map = feat_map.permute(0, 2, 3, 1) _, ph, pw, _ = feat_map.shape feat_map = feat_map.reshape(feat_map.shape[0], ph*pw, feat_map.shape[-1]) pts_feat = self.feat_net(feat_map, x.reshape(b, -1, 3)) pts_c = pts_feat.shape[-1] pts_feat = pts_feat.reshape(-1, pts_c) x = x.view(-1, c) if self.embedder is not None: x_in = self.embedder(x) if self.embed_concat_pts: x_in = torch.cat([x, x_in], -1) else: x_in = x x_in = self.in_layer(x_in) x_in = torch.concat([x_in, pts_feat], dim=-1) out = self.mlp(self.relu(x_in)) if self.min_max is not None: out = out * (self.min_max[1][None, :] - self.min_max[0][None, :]) + self.min_max[0][None, :] return out.view(b, h, w, -1) class LocalFeatureBlock(nn.Module): def __init__(self, local_feat_dim, input_dim=384, output_dim=384, upscale_num=3): super().__init__() self.local_feat_dim = local_feat_dim self.conv_list = nn.ModuleList([]) self.upscale_list = nn.ModuleList([]) for i in range(upscale_num): if i == 0: self.conv_list.append(nn.Conv2d(input_dim, 4 * local_feat_dim, 3, stride=1, padding=1, dilation=1)) else: self.conv_list.append(nn.Conv2d(local_feat_dim, 4 * local_feat_dim, 3, stride=1, padding=1, dilation=1)) self.upscale_list.append(nn.PixelShuffle(2)) self.conv_head = nn.Conv2d(local_feat_dim, output_dim, 3, stride=1, padding=1, dilation=1) def forward(self, x): for idx, conv in enumerate(self.conv_list): x = conv(x) x = self.upscale_list[idx](x) out = self.conv_head(x) return out class MLPTextureLocal(nn.Module): def __init__(self, cin, cout, num_layers, nf=256, dropout=0, activation=None, min_max=None, n_harmonic_functions=10, omega0=1, extra_dim=0, embed_concat_pts=True, perturb_normal=False, symmetrize=False, texture_way=None, larger_tex_dim=False, cam_pos_z_offset=10., grid_scale=7.): super().__init__() self.extra_dim = extra_dim self.cam_pos_z_offset = cam_pos_z_offset self.grid_scale = grid_scale local_feat_dim = 64 assert texture_way is not None self.texture_way = texture_way if 'local' in texture_way and 'global' in texture_way: # self.extra_dim = extra_dim + local_feat_dim self.extra_dim = extra_dim elif 'local' in texture_way and 'global' not in texture_way: # self.extra_dim = local_feat_dim self.extra_dim = extra_dim elif 'local' not in texture_way and 'global' in texture_way: self.extra_dim = extra_dim if n_harmonic_functions > 0: self.embedder = HarmonicEmbedding(n_harmonic_functions, omega0) dim_in = cin * 2 * n_harmonic_functions self.embed_concat_pts = embed_concat_pts if embed_concat_pts: dim_in += cin else: self.embedder = None dim_in = cin 
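        # Note (descriptive comment, added for clarity): the local image features (the 384-dim
        # feature map passed in at sample time) are mapped to nf by a bias-free linear layer
        # below, so points that receive an all-zero (occluded) feature stay zero after the
        # projection; also, every texture_way branch above leaves self.extra_dim == extra_dim,
        # i.e. the MLP input width only accounts for the global feature.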
# self.local_feature_block = LocalFeatureBlock(local_feat_dim=local_feat_dim, input_dim=384, output_dim=256) self.local_feature_block = nn.Linear(384, nf, bias=False) self.in_layer = nn.Linear(dim_in, nf) self.relu = nn.ReLU(inplace=True) self.mlp = MLP(nf + self.extra_dim, cout, num_layers, nf, dropout, activation) self.perturb_normal = perturb_normal self.symmetrize = symmetrize if min_max is not None: self.register_buffer('min_max', min_max) else: self.min_max = None self.bsdf = None def get_uv_depth(self, xyz, mvp): # xyz: [b, k, 3] # mvp: [b, 4, 4] cam4 = torch.matmul(torch.nn.functional.pad(xyz, pad=(0,1), mode='constant', value=1.0), torch.transpose(mvp, 1, 2)) cam3 = cam4[..., :3] / cam4[..., 3:4] cam_uv = cam3[..., :2] # cam_uv = cam_uv.detach() cam_depth = cam3 + torch.FloatTensor([0, 0, self.cam_pos_z_offset]).to(xyz.device).view(1, 1, 3) cam_depth = cam_depth / self.grid_scale * 2 cam_depth = cam_depth[..., 2:3] # cam_depth = cam_depth.detach() return cam_uv, cam_depth def proj_sample_deform(self, xyz, feat_map, mvp, w2c, img_h, img_w): # here the xyz is deformed points # and we don't cast any symmtery here b, k, c = xyz.shape THRESHOLD = 1e-4 if isinstance(feat_map, torch.Tensor): coordinates = xyz # use pre-symmetry points to get feature and record depth cam_uv, cam_depth = self.get_uv_depth(coordinates, mvp) cam_uv = cam_uv.detach() cam_depth = cam_depth.detach() # get local feature feature = F.grid_sample(feat_map, cam_uv.view(b, 1, k, 2), mode='bilinear').squeeze(dim=-2).permute(0, 2, 1) # [b, k, c] self.input_depth = cam_depth.reshape(b, 256, 256, 1) # [B, 256, 256, 1] self.input_pts = coordinates.detach() elif isinstance(feat_map, dict): original_mvp = feat_map['original_mvp'] local_feat_map = feat_map['im_features_map'] original_depth = self.input_depth[0:b] coordinates = xyz cam_uv, cam_depth = self.get_uv_depth(coordinates, original_mvp) cam_uv = cam_uv.detach() cam_depth = cam_depth.detach() project_feature = F.grid_sample(local_feat_map, cam_uv.view(b, 1, k, 2), mode='bilinear').squeeze(dim=-2).permute(0, 2, 1) # [b, k, c] project_depth = F.grid_sample(original_depth.permute(0, 3, 1, 2), cam_uv.view(b, 1, k, 2), mode='bilinear').squeeze(dim=-2).permute(0, 2, 1) # [b, k, 1] use_mask = cam_depth <= project_depth + THRESHOLD feature = project_feature * use_mask.repeat(1, 1, project_feature.shape[-1]) ret_feature = self.local_feature_block(feature.reshape(b*k, -1)) # the linear is without bias, so 0 value feature will still get 0 value return ret_feature def proj_sample(self, xyz, feat_map, mvp, w2c, img_h, img_w, xyz_before_sym=None): # the new one with no input feature map upsampling # feat_map: [B, C, H, W] b, k, c = xyz.shape if isinstance(feat_map, torch.Tensor): if xyz_before_sym is None: coordinates = xyz else: coordinates = xyz_before_sym # use pre-symmetry points to get feature and record depth cam_uv, cam_depth = self.get_uv_depth(coordinates, mvp) cam_uv = cam_uv.detach() cam_depth = cam_depth.detach() # get local feature feature = F.grid_sample(feat_map, cam_uv.view(b, 1, k, 2), mode='bilinear').squeeze(dim=-2).permute(0, 2, 1) # [b, k, c] self.input_depth = cam_depth.reshape(b, 256, 256, 1) # [B, 256, 256, 1] self.input_pts = coordinates.detach() elif isinstance(feat_map, dict): original_mvp = feat_map['original_mvp'] local_feat_map = feat_map['im_features_map'] THRESHOLD = 1e-4 original_depth = self.input_depth[0:b] # if b == 1: # from pdb import set_trace; set_trace() # tmp_mask = xyz[0].reshape(256, 256, 3).sum(dim=-1) != 0 # tmp_mask = 
tmp_mask.cpu().numpy() # tmp_mask = tmp_mask * 255 # src_dp = self.input_depth[0,:,:,0].cpu().numpy() # input_pts = self.input_pts[0].cpu().numpy() # input_mask = self.input_pts[0].reshape(256, 256, 3).sum(dim=-1) != 0 # input_mask = input_mask.int().cpu().numpy() # input_mask = input_mask * 255 # np.save('./tmp_save/src_dp.npy', src_dp) # np.save('./tmp_save/input_pts.npy', input_pts) # import cv2 # cv2.imwrite('./tmp_save/input_mask.png', input_mask) # cv2.imwrite('./tmp_save/mask.png', tmp_mask) # test_pts_pos = xyz[0].cpu().numpy() # np.save('./tmp_save/test_pts_pos.npy', test_pts_pos) # test_pts_raw = xyz_before_sym[0].cpu().numpy() # np.save('./tmp_save/test_pts_raw.npy', test_pts_raw) # mvp_now = mvp[0].detach().cpu().numpy() # mvp_original = original_mvp[0].detach().cpu().numpy() # np.save('./tmp_save/mvp_now.npy', mvp_now) # np.save('./tmp_save/mvp_original.npy', mvp_original) if xyz_before_sym is None: # just check the project depth of xyz coordinates = xyz cam_uv, cam_depth = self.get_uv_depth(coordinates, original_mvp) cam_uv = cam_uv.detach() cam_depth = cam_depth.detach() project_feature = F.grid_sample(local_feat_map, cam_uv.view(b, 1, k, 2), mode='bilinear').squeeze(dim=-2).permute(0, 2, 1) # [b, k, c] project_depth = F.grid_sample(original_depth.permute(0, 3, 1, 2), cam_uv.view(b, 1, k, 2), mode='bilinear').squeeze(dim=-2).permute(0, 2, 1) # [b, k, 1] use_mask = cam_depth <= project_depth + THRESHOLD feature = project_feature * use_mask.repeat(1, 1, project_feature.shape[-1]) else: # need to double check, but now we are still use symmetry! Even if the two points are all visible in input view coords_inp = xyz x_check, y_check, z_check = xyz.unbind(-1) xyz_check = torch.stack([-1 * x_check, y_check, z_check], -1) coords_rev = xyz_check # we directly use neg-x to get the points of another side uv_inp, dp_inp = self.get_uv_depth(coords_inp, original_mvp) uv_rev, dp_rev = self.get_uv_depth(coords_rev, original_mvp) uv_inp = uv_inp.detach() uv_rev = uv_rev.detach() dp_inp = dp_inp.detach() dp_rev = dp_rev.detach() proj_feat_inp = F.grid_sample(local_feat_map, uv_inp.view(b, 1, k, 2), mode='bilinear').squeeze(dim=-2).permute(0, 2, 1) # [b, k, c] proj_feat_rev = F.grid_sample(local_feat_map, uv_rev.view(b, 1, k, 2), mode='bilinear').squeeze(dim=-2).permute(0, 2, 1) # [b, k, c] proj_dp_inp = F.grid_sample(original_depth.permute(0, 3, 1, 2), uv_inp.view(b, 1, k, 2), mode='bilinear').squeeze(dim=-2).permute(0, 2, 1) # [b, k, 1] proj_dp_rev = F.grid_sample(original_depth.permute(0, 3, 1, 2), uv_rev.view(b, 1, k, 2), mode='bilinear').squeeze(dim=-2).permute(0, 2, 1) # [b, k, 1] use_mask_inp = dp_inp <= proj_dp_inp + THRESHOLD use_mask_rev = dp_rev <= proj_dp_rev + THRESHOLD # for those points we can see in two sides, we use average use_mask_inp = use_mask_inp.int() use_mask_rev = use_mask_rev.int() both_vis = (use_mask_inp == 1) & (use_mask_rev == 1) use_mask_inp[both_vis] = 0.5 use_mask_rev[both_vis] = 0.5 feature = proj_feat_inp * use_mask_inp.repeat(1, 1, proj_feat_inp.shape[-1]) + proj_feat_rev * use_mask_rev.repeat(1, 1, proj_feat_rev.shape[-1]) else: raise NotImplementedError ret_feature = self.local_feature_block(feature.reshape(b*k, -1)) # the linear is without bias, so 0 value feature will still get 0 value return ret_feature def sample(self, x, feat=None, feat_map=None, mvp=None, w2c=None, deform_xyz=None): # assert (feat is None and self.extra_dim == 0) or (feat.shape[-1] <= self.extra_dim) b, h, w, c = x.shape xyz_before_sym = None if self.symmetrize: xyz_before_sym = 
x.reshape(b, -1, c) xs, ys, zs = x.unbind(-1) x = torch.stack([xs.abs(), ys, zs], -1) # mirror -x to +x mvp = mvp.detach() # [b, 4, 4] w2c = w2c.detach() # [b, 4, 4] pts_xyz = x.reshape(b, -1, c) deform_xyz = deform_xyz.reshape(b, -1, c) if 'global' in self.texture_way and 'local' in self.texture_way: global_feat = feat[:,None,None].expand(b, h, w, -1).reshape(b*h*w, -1) # local_feat = self.proj_sample(pts_xyz, feat_map, mvp, w2c, h, w, xyz_before_sym=xyz_before_sym) local_feat = self.proj_sample_deform(deform_xyz, feat_map, mvp, w2c, h, w) # feature_rep = torch.concat([global_feat, local_feat], dim=-1) feature_rep = global_feat + local_feat elif 'global' not in self.texture_way and 'local' in self.texture_way: # local_feat = self.proj_sample(pts_xyz, feat_map, mvp, w2c, h, w, xyz_before_sym=xyz_before_sym) local_feat = self.proj_sample_deform(deform_xyz, feat_map, mvp, w2c, h, w) feature_rep = local_feat elif 'global' in self.texture_way and 'local' not in self.texture_way: global_feat = feat[:,None,None].expand(b, h, w, -1).reshape(b*h*w, -1) feature_rep = global_feat else: global_feat = feat[:,None,None].expand(b, h, w, -1).reshape(b*h*w, -1) feature_rep = global_feat x = x.view(-1, c) if self.embedder is not None: x_in = self.embedder(x) if self.embed_concat_pts: x_in = torch.cat([x, x_in], -1) else: x_in = x x_in = self.in_layer(x_in) # if feat is not None: # feat = feat[:,None,None].expand(b, h, w, -1).reshape(b*h*w, -1) # x_in = torch.concat([x_in, feat], dim=-1) x_in = torch.concat([x_in, feature_rep], dim=-1) out = self.mlp(self.relu(x_in)) if self.min_max is not None: out = out * (self.min_max[1][None, :] - self.min_max[0][None, :]) + self.min_max[0][None, :] return out.view(b, h, w, -1) class LiftTexture(nn.Module): def __init__(self, cin, cout, num_layers, nf=256, dropout=0, activation=None, min_max=None, n_harmonic_functions=10, omega0=1, extra_dim=0, embed_concat_pts=True, perturb_normal=False, symmetrize=False, texture_way=None, cam_pos_z_offset=10., grid_scale=7., local_feat_dim=128, grid_size=32, optim_latent=False): super().__init__() self.extra_dim = extra_dim self.cam_pos_z_offset = cam_pos_z_offset self.grid_scale = grid_scale assert texture_way is not None self.extra_dim = local_feat_dim + extra_dim if n_harmonic_functions > 0: self.embedder = HarmonicEmbedding(n_harmonic_functions, omega0) dim_in = cin * 2 * n_harmonic_functions self.embed_concat_pts = embed_concat_pts if embed_concat_pts: dim_in += cin else: self.embedder = None dim_in = cin self.encoder = Lift_Encoder( cin=384, feat_dim=local_feat_dim, grid_scale=grid_scale / 2, # the dmtet is initialized in (-0.5, 0.5) grid_size=grid_size, optim_latent=optim_latent, with_z_feature=True, cam_pos_z_offset=cam_pos_z_offset ) self.in_layer = nn.Linear(dim_in, nf) self.relu = nn.ReLU(inplace=True) self.mlp = MLP(nf + self.extra_dim, cout, num_layers, nf, dropout, activation) self.perturb_normal = perturb_normal self.symmetrize = symmetrize if min_max is not None: self.register_buffer('min_max', min_max) else: self.min_max = None self.bsdf = None def get_uv_depth(self, xyz, mvp): # xyz: [b, k, 3] # mvp: [b, 4, 4] cam4 = torch.matmul(torch.nn.functional.pad(xyz, pad=(0,1), mode='constant', value=1.0), torch.transpose(mvp, 1, 2)) cam3 = cam4[..., :3] / cam4[..., 3:4] cam_uv = cam3[..., :2] # cam_uv = cam_uv.detach() cam_depth = cam3 + torch.FloatTensor([0, 0, self.cam_pos_z_offset]).to(xyz.device).view(1, 1, 3) cam_depth = cam_depth / self.grid_scale * 2 cam_depth = cam_depth[..., 2:3] # cam_depth = cam_depth.detach() 
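        # Shape note (illustrative): with xyz [b, k, 3] and mvp [b, 4, 4],
        #   cam4 = pad(xyz, 1) @ mvp^T            -> [b, k, 4] homogeneous clip coordinates
        #   cam3 = cam4[..., :3] / cam4[..., 3:4] -> coordinates after the perspective divide
        #   cam_uv    = cam3[..., :2]             -> [b, k, 2], directly usable by F.grid_sample
        #   cam_depth                             -> [b, k, 1], z shifted by cam_pos_z_offset and
        #                                            rescaled by 2 / grid_scale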
return cam_uv, cam_depth def proj_sample_deform(self, xyz, feat_map, mvp, w2c, img_h, img_w): # here the xyz is deformed points # and we don't cast any symmtery here if isinstance(feat_map, torch.Tensor): feature = self.encoder(feat_map, mvp, xyz, inference="unproject") elif isinstance(feat_map, dict): feature = self.encoder(feat_map['im_features_map'], mvp, xyz, inference="sample") C = feature.shape[-1] feature = feature.reshape(-1, C) return feature def sample(self, x, feat=None, feat_map=None, mvp=None, w2c=None, deform_xyz=None): # assert (feat is None and self.extra_dim == 0) or (feat.shape[-1] <= self.extra_dim) b, h, w, c = x.shape xyz_before_sym = None if self.symmetrize: xyz_before_sym = x.reshape(b, -1, c) xs, ys, zs = x.unbind(-1) x = torch.stack([xs.abs(), ys, zs], -1) # mirror -x to +x mvp = mvp.detach() # [b, 4, 4] w2c = w2c.detach() # [b, 4, 4] pts_xyz = x.reshape(b, -1, c) deform_xyz = deform_xyz.reshape(b, -1, c) global_feat = feat[:,None,None].expand(b, h, w, -1).reshape(b*h*w, -1) local_feat = self.proj_sample_deform(deform_xyz, feat_map, mvp, w2c, h, w) feature_rep = torch.concat([global_feat, local_feat], dim=-1) x = x.view(-1, c) if self.embedder is not None: x_in = self.embedder(x) if self.embed_concat_pts: x_in = torch.cat([x, x_in], -1) else: x_in = x x_in = self.in_layer(x_in) # if feat is not None: # feat = feat[:,None,None].expand(b, h, w, -1).reshape(b*h*w, -1) # x_in = torch.concat([x_in, feat], dim=-1) x_in = torch.concat([x_in, feature_rep], dim=-1) out = self.mlp(self.relu(x_in)) if self.min_max is not None: out = out * (self.min_max[1][None, :] - self.min_max[0][None, :]) + self.min_max[0][None, :] return out.view(b, h, w, -1) class HarmonicEmbedding(nn.Module): def __init__(self, n_harmonic_functions=10, omega0=1): """ Positional Embedding implementation (adapted from Pytorch3D). Given an input tensor `x` of shape [minibatch, ... , dim], the harmonic embedding layer converts each feature in `x` into a series of harmonic features `embedding` as follows: embedding[..., i*dim:(i+1)*dim] = [ sin(x[..., i]), sin(2*x[..., i]), sin(4*x[..., i]), ... sin(2**self.n_harmonic_functions * x[..., i]), cos(x[..., i]), cos(2*x[..., i]), cos(4*x[..., i]), ... cos(2**self.n_harmonic_functions * x[..., i]) ] Note that `x` is also premultiplied by `omega0` before evaluting the harmonic functions. 
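        For example, with the default n_harmonic_functions=10 and a 3-D input
        (x, y, z), the embedding has 3 * 10 * 2 = 60 channels; in general the
        output dimension is `dim * n_harmonic_functions * 2`.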
""" super().__init__() self.frequencies = omega0 * (2.0 ** torch.arange(n_harmonic_functions)) def forward(self, x): """ Args: x: tensor of shape [..., dim] Returns: embedding: a harmonic embedding of `x` of shape [..., n_harmonic_functions * dim * 2] """ embed = (x[..., None] * self.frequencies.to(x.device)).view(*x.shape[:-1], -1) return torch.cat((embed.sin(), embed.cos()), dim=-1) class VGGEncoder(nn.Module): def __init__(self, cout, pretrained=False): super().__init__() if pretrained: raise NotImplementedError vgg = models.vgg16() self.vgg_encoder = nn.Sequential(vgg.features, vgg.avgpool) self.linear1 = nn.Linear(25088, 4096) self.linear2 = nn.Linear(4096, cout) self.relu = nn.ReLU(inplace=True) def forward(self, x): batch_size, _, _, _ = x.shape out = self.relu(self.linear1(self.vgg_encoder(x).view(batch_size, -1))) return self.linear2(out) class ResnetEncoder(nn.Module): def __init__(self, cout, pretrained=False): super().__init__() self.resnet = nn.Sequential(list(models.resnet18(weights="DEFAULT" if pretrained else None).modules())[:-1]) self.final_linear = nn.Linear(512, cout) def forward(self, x): return self.final_linear(self.resnet(x)) class Encoder(nn.Module): def __init__(self, cin, cout, in_size=128, zdim=None, nf=64, activation=None): super().__init__() network = [ nn.Conv2d(cin, nf, kernel_size=4, stride=2, padding=1, bias=False), # 128x128 -> 64x64 nn.GroupNorm(16, nf), # nn.ReLU(inplace=True), nn.LeakyReLU(0.2, inplace=True), nn.Conv2d(nf, nf*2, kernel_size=4, stride=2, padding=1, bias=False), # 64x64 -> 32x32 nn.GroupNorm(16*2, nf*2), # nn.ReLU(inplace=True), nn.LeakyReLU(0.2, inplace=True), nn.Conv2d(nf*2, nf*4, kernel_size=4, stride=2, padding=1, bias=False), # 32x32 -> 16x16 nn.GroupNorm(16*4, nf*4), # nn.ReLU(inplace=True), nn.LeakyReLU(0.2, inplace=True), nn.Conv2d(nf*4, nf*8, kernel_size=4, stride=2, padding=1, bias=False), # 16x16 -> 8x8 # nn.GroupNorm(16*8, nf*8), # nn.ReLU(inplace=True), nn.LeakyReLU(0.2, inplace=True), ] add_downsample = int(np.log2(in_size//128)) if add_downsample > 0: for _ in range(add_downsample): network += [ nn.Conv2d(nf*8, nf*8, kernel_size=4, stride=2, padding=1, bias=False), # 16x16 -> 8x8 # nn.GroupNorm(16*8, nf*8), # nn.ReLU(inplace=True), nn.LeakyReLU(0.2, inplace=True), ] network += [ nn.Conv2d(nf*8, nf*8, kernel_size=4, stride=2, padding=1, bias=False), # 8x8 -> 4x4 nn.LeakyReLU(0.2, inplace=True), ] if zdim is None: network += [ nn.Conv2d(nf*8, cout, kernel_size=4, stride=1, padding=0, bias=False), # 4x4 -> 1x1 ] else: network += [ nn.Conv2d(nf*8, zdim, kernel_size=4, stride=1, padding=0, bias=False), # 4x4 -> 1x1 # nn.ReLU(inplace=True), nn.LeakyReLU(0.2, inplace=True), nn.Conv2d(zdim, cout, kernel_size=1, stride=1, padding=0, bias=False), ] if activation is not None: network += [get_activation(activation)] self.network = nn.Sequential(*network) def forward(self, input): return self.network(input).reshape(input.size(0), -1) class EncoderWithDINO(nn.Module): def __init__(self, cin_rgb, cin_dino, cout, in_size=128, zdim=None, nf=64, activation=None): super().__init__() network_rgb_in = [ nn.Conv2d(cin_rgb, nf, kernel_size=4, stride=2, padding=1, bias=False), # 128x128 -> 64x64 nn.GroupNorm(16, nf), # nn.ReLU(inplace=True), nn.LeakyReLU(0.2, inplace=True), nn.Conv2d(nf, nf*2, kernel_size=4, stride=2, padding=1, bias=False), # 64x64 -> 32x32 nn.GroupNorm(16*2, nf*2), # nn.ReLU(inplace=True), nn.LeakyReLU(0.2, inplace=True), nn.Conv2d(nf*2, nf*4, kernel_size=4, stride=2, padding=1, bias=False), # 32x32 -> 16x16 nn.GroupNorm(16*4, 
nf*4), # nn.ReLU(inplace=True), nn.LeakyReLU(0.2, inplace=True), ] self.network_rgb_in = nn.Sequential(*network_rgb_in) network_dino_in = [ nn.Conv2d(cin_dino, nf, kernel_size=4, stride=2, padding=1, bias=False), # 128x128 -> 64x64 nn.GroupNorm(16, nf), # nn.ReLU(inplace=True), nn.LeakyReLU(0.2, inplace=True), nn.Conv2d(nf, nf*2, kernel_size=4, stride=2, padding=1, bias=False), # 64x64 -> 32x32 nn.GroupNorm(16*2, nf*2), # nn.ReLU(inplace=True), nn.LeakyReLU(0.2, inplace=True), nn.Conv2d(nf*2, nf*4, kernel_size=4, stride=2, padding=1, bias=False), # 32x32 -> 16x16 nn.GroupNorm(16*4, nf*4), # nn.ReLU(inplace=True), nn.LeakyReLU(0.2, inplace=True), ] self.network_dino_in = nn.Sequential(*network_dino_in) network_fusion = [ nn.Conv2d(nf*4*2, nf*8, kernel_size=4, stride=2, padding=1, bias=False), # 16x16 -> 8x8 # nn.GroupNorm(16*8, nf*8), # nn.ReLU(inplace=True), nn.LeakyReLU(0.2, inplace=True), ] add_downsample = int(np.log2(in_size//128)) if add_downsample > 0: for _ in range(add_downsample): network_fusion += [ nn.Conv2d(nf*8, nf*8, kernel_size=4, stride=2, padding=1, bias=False), # 16x16 -> 8x8 # nn.GroupNorm(16*8, nf*8), # nn.ReLU(inplace=True), nn.LeakyReLU(0.2, inplace=True), ] network_fusion += [ nn.Conv2d(nf*8, nf*8, kernel_size=4, stride=2, padding=1, bias=False), # 8x8 -> 4x4 nn.LeakyReLU(0.2, inplace=True), ] if zdim is None: network_fusion += [ nn.Conv2d(nf*8, cout, kernel_size=4, stride=1, padding=0, bias=False), # 4x4 -> 1x1 ] else: network_fusion += [ nn.Conv2d(nf*8, zdim, kernel_size=4, stride=1, padding=0, bias=False), # 4x4 -> 1x1 # nn.ReLU(inplace=True), nn.LeakyReLU(0.2, inplace=True), nn.Conv2d(zdim, cout, kernel_size=1, stride=1, padding=0, bias=False), ] if activation is not None: network_fusion += [get_activation(activation)] self.network_fusion = nn.Sequential(*network_fusion) def forward(self, rgb_image, dino_image): rgb_feat = self.network_rgb_in(rgb_image) dino_feat = self.network_dino_in(dino_image) out = self.network_fusion(torch.cat([rgb_feat, dino_feat], dim=1)) return out.reshape(rgb_image.size(0), -1) class Encoder32(nn.Module): def __init__(self, cin, cout, nf=256, activation=None): super().__init__() network = [ nn.Conv2d(cin, nf, kernel_size=4, stride=2, padding=1, bias=False), # 32x32 -> 16x16 nn.GroupNorm(nf//4, nf), nn.LeakyReLU(0.2, inplace=True), nn.Conv2d(nf, nf, kernel_size=4, stride=2, padding=1, bias=False), # 16x16 -> 8x8 nn.GroupNorm(nf//4, nf), nn.LeakyReLU(0.2, inplace=True), nn.Conv2d(nf, nf, kernel_size=4, stride=2, padding=1, bias=False), # 8x8 -> 4x4 nn.GroupNorm(nf//4, nf), nn.LeakyReLU(0.2, inplace=True), nn.Conv2d(nf, cout, kernel_size=4, stride=1, padding=0, bias=False), # 4x4 -> 1x1 ] if activation is not None: network += [get_activation(activation)] self.network = nn.Sequential(*network) def forward(self, input): return self.network(input).reshape(input.size(0), -1) class MLP(nn.Module): def __init__(self, cin, cout, num_layers, nf=256, dropout=0, activation=None, inner_act='relu', linear_bias=False): super().__init__() assert num_layers >= 1 layer_act = get_activation(inner_act) if num_layers == 1: network = [nn.Linear(cin, cout, bias=linear_bias)] else: # network = [nn.Linear(cin, nf, bias=False)] # for _ in range(num_layers-2): # network += [ # nn.ReLU(inplace=True), # nn.Linear(nf, nf, bias=False)] # if dropout: # network += [nn.Dropout(dropout)] # network += [ # nn.ReLU(inplace=True), # nn.Linear(nf, cout, bias=False)] network = [nn.Linear(cin, nf, bias=linear_bias)] for _ in range(num_layers-2): network += [ layer_act, 
nn.Linear(nf, nf, bias=linear_bias)] if dropout: network += [nn.Dropout(dropout)] network += [ layer_act, nn.Linear(nf, cout, bias=linear_bias)] if activation is not None: network += [get_activation(activation)] self.network = nn.Sequential(*network) def forward(self, input): return self.network(input) class Embedding(nn.Module): def __init__(self, cin, cout, zdim=128, nf=64, activation=None): super().__init__() network = [ nn.Linear(cin, nf, bias=False), nn.ReLU(inplace=True), nn.Linear(nf, zdim, bias=False), nn.ReLU(inplace=True), nn.Linear(zdim, cout, bias=False)] if activation is not None: network += [get_activation(activation)] self.network = nn.Sequential(*network) def forward(self, input): return self.network(input.reshape(input.size(0), -1)).reshape(input.size(0), -1) class PerceptualLoss(nn.Module): def __init__(self, requires_grad=False): super(PerceptualLoss, self).__init__() mean_rgb = torch.FloatTensor([0.485, 0.456, 0.406]) std_rgb = torch.FloatTensor([0.229, 0.224, 0.225]) self.register_buffer('mean_rgb', mean_rgb) self.register_buffer('std_rgb', std_rgb) vgg_pretrained_features = torchvision.models.vgg16(pretrained=True).features self.slice1 = nn.Sequential() self.slice2 = nn.Sequential() self.slice3 = nn.Sequential() self.slice4 = nn.Sequential() for x in range(4): self.slice1.add_module(str(x), vgg_pretrained_features[x]) for x in range(4, 9): self.slice2.add_module(str(x), vgg_pretrained_features[x]) for x in range(9, 16): self.slice3.add_module(str(x), vgg_pretrained_features[x]) for x in range(16, 23): self.slice4.add_module(str(x), vgg_pretrained_features[x]) if not requires_grad: for param in self.parameters(): param.requires_grad = False def normalize(self, x): out = x/2 + 0.5 out = (out - self.mean_rgb.view(1,3,1,1)) / self.std_rgb.view(1,3,1,1) return out def __call__(self, im1, im2, mask=None, conf_sigma=None): im = torch.cat([im1,im2], 0) im = self.normalize(im) # normalize input ## compute features feats = [] f = self.slice1(im) feats += [torch.chunk(f, 2, dim=0)] f = self.slice2(f) feats += [torch.chunk(f, 2, dim=0)] f = self.slice3(f) feats += [torch.chunk(f, 2, dim=0)] f = self.slice4(f) feats += [torch.chunk(f, 2, dim=0)] losses = [] for f1, f2 in feats[2:3]: # use relu3_3 features only loss = (f1-f2)**2 if conf_sigma is not None: loss = loss / (2*conf_sigma**2 +EPS) + (conf_sigma +EPS).log() if mask is not None: b, c, h, w = loss.shape _, _, hm, wm = mask.shape sh, sw = hm//h, wm//w mask0 = nn.functional.avg_pool2d(mask, kernel_size=(sh,sw), stride=(sh,sw)).expand_as(loss) loss = (loss * mask0).sum() / mask0.sum() else: loss = loss.mean() losses += [loss] return sum(losses) ## from: https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1): """3x3 convolution with padding""" return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=dilation, groups=groups, bias=False, dilation=dilation) def conv1x1(in_planes, out_planes, stride=1): """1x1 convolution""" return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False) class BasicBlock(nn.Module): expansion = 1 def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1, base_width=64, dilation=1, norm_layer=None): super(BasicBlock, self).__init__() if groups != 1 or base_width != 64: raise ValueError('BasicBlock only supports groups=1 and base_width=64') if dilation > 1: raise NotImplementedError("Dilation > 1 not supported in BasicBlock") # Both self.conv1 and 
self.downsample layers downsample the input when stride != 1 self.conv1 = conv3x3(inplanes, planes, stride) self.relu = nn.ReLU(inplace=True) self.conv2 = conv3x3(planes, planes) self.norm_layer = norm_layer if norm_layer is not None: self.bn1 = norm_layer(planes) self.bn2 = norm_layer(planes) if inplanes != planes: self.downsample = nn.Sequential( conv1x1(inplanes, planes, stride), norm_layer(planes), ) else: self.downsample = None self.stride = stride def forward(self, x): identity = x out = self.conv1(x) if self.norm_layer is not None: out = self.bn1(out) out = self.relu(out) out = self.conv2(out) if self.norm_layer is not None: out = self.bn2(out) if self.downsample is not None: identity = self.downsample(x) out += identity out = self.relu(out) return out class ResEncoder(nn.Module): def __init__(self, cin, cout, in_size=128, zdim=None, nf=64, activation=None): super().__init__() network = [ nn.Conv2d(cin, nf, kernel_size=4, stride=2, padding=1, bias=False), # 128x128 -> 64x64 # nn.GroupNorm(16, nf), # nn.ReLU(inplace=True), nn.LeakyReLU(0.2, inplace=True), nn.Conv2d(nf, nf*2, kernel_size=4, stride=2, padding=1, bias=False), # 64x64 -> 32x32 # nn.GroupNorm(16*2, nf*2), # nn.ReLU(inplace=True), nn.LeakyReLU(0.2, inplace=True), BasicBlock(nf*2, nf*2, norm_layer=None), BasicBlock(nf*2, nf*2, norm_layer=None), nn.Conv2d(nf*2, nf*4, kernel_size=4, stride=2, padding=1, bias=False), # 32x32 -> 16x16 # nn.GroupNorm(16*4, nf*4), # nn.ReLU(inplace=True), nn.LeakyReLU(0.2, inplace=True), BasicBlock(nf*4, nf*4, norm_layer=None), BasicBlock(nf*4, nf*4, norm_layer=None), nn.Conv2d(nf*4, nf*8, kernel_size=4, stride=2, padding=1, bias=False), # 16x16 -> 8x8 # nn.ReLU(inplace=True), nn.LeakyReLU(0.2, inplace=True), BasicBlock(nf*8, nf*8, norm_layer=None), BasicBlock(nf*8, nf*8, norm_layer=None), ] add_downsample = int(np.log2(in_size//64)) if add_downsample > 0: for _ in range(add_downsample): network += [ nn.Conv2d(nf*8, nf*8, kernel_size=4, stride=2, padding=1, bias=False), # 8x8 -> 4x4 # nn.ReLU(inplace=True), nn.LeakyReLU(0.2, inplace=True), BasicBlock(nf*8, nf*8, norm_layer=None), BasicBlock(nf*8, nf*8, norm_layer=None), ] if zdim is None: network += [ nn.Conv2d(nf*8, cout, kernel_size=4, stride=1, padding=0, bias=False), # 4x4 -> 1x1 ] else: network += [ nn.Conv2d(nf*8, zdim, kernel_size=4, stride=1, padding=0, bias=False), # 4x4 -> 1x1 # nn.ReLU(inplace=True), nn.LeakyReLU(0.2, inplace=True), nn.Conv2d(zdim, cout, kernel_size=1, stride=1, padding=0, bias=False), ] if activation is not None: network += [get_activation(activation)] self.network = nn.Sequential(*network) def forward(self, input): return self.network(input).reshape(input.size(0), -1) class Attention(nn.Module): def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.): super().__init__() self.num_heads = num_heads head_dim = dim // num_heads self.scale = qk_scale or head_dim ** -0.5 self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) self.attn_drop = nn.Dropout(attn_drop) self.proj = nn.Linear(dim, dim) self.proj_drop = nn.Dropout(proj_drop) def forward(self, x): B, N, C = x.shape qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) q, k, v = qkv[0], qkv[1], qkv[2] attn = (q @ k.transpose(-2, -1)) * self.scale attn = attn.softmax(dim=-1) attn = self.attn_drop(attn) x = (attn @ v).transpose(1, 2).reshape(B, N, C) x = self.proj(x) x = self.proj_drop(x) return x, attn class ViTEncoder(nn.Module): def __init__(self, cout, which_vit='dino_vits8', 
pretrained=False, frozen=False, in_size=256, final_layer_type='none', root='/root'): super().__init__() if misc.is_main_process(): force_reload = not os.path.exists(os.path.join(root, ".cache/torch/hub/checkpoints/")) else: force_reload = False if "dinov2" in which_vit: self.ViT = torch.hub.load('facebookresearch/dinov2:main', which_vit, pretrained=pretrained, force_reload=force_reload) else: self.ViT = torch.hub.load('facebookresearch/dino:main', which_vit, pretrained=pretrained, force_reload=force_reload) if frozen: for p in self.ViT.parameters(): p.requires_grad = False if which_vit == 'dino_vits8': self.vit_feat_dim = 384 self.patch_size = 8 elif which_vit == 'dinov2_vits14': self.vit_feat_dim = 384 self.patch_size = 14 elif which_vit == 'dino_vitb8': self.vit_feat_dim = 768 self.patch_size = 8 self._feats = [] self.hook_handlers = [] if final_layer_type == 'none': pass elif final_layer_type == 'conv': self.final_layer_patch_out = Encoder32(self.vit_feat_dim, cout, nf=256, activation=None) self.final_layer_patch_key = Encoder32(self.vit_feat_dim, cout, nf=256, activation=None) elif final_layer_type == 'attention': raise NotImplementedError self.final_layer = Attention( dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) self.fc = nn.Linear(self.vit_feat_dim, cout) else: raise NotImplementedError self.final_layer_type = final_layer_type def _get_hook(self, facet: str): """ generate a hook method for a specific block and facet. """ if facet in ['attn', 'token']: def _hook(model, input, output): self._feats.append(output) return _hook if facet == 'query': facet_idx = 0 elif facet == 'key': facet_idx = 1 elif facet == 'value': facet_idx = 2 else: raise TypeError(f"{facet} is not a supported facet.") def _inner_hook(module, input, output): input = input[0] B, N, C = input.shape qkv = module.qkv(input).reshape(B, N, 3, module.num_heads, C // module.num_heads).permute(2, 0, 3, 1, 4) self._feats.append(qkv[facet_idx]) #Bxhxtxd return _inner_hook def _register_hooks(self, layers: List[int], facet: str) -> None: """ register hook to extract features. :param layers: layers from which to extract features. :param facet: facet to extract. One of the following options: ['key' | 'query' | 'value' | 'token' | 'attn'] """ for block_idx, block in enumerate(self.ViT.blocks): if block_idx in layers: if facet == 'token': self.hook_handlers.append(block.register_forward_hook(self._get_hook(facet))) elif facet == 'attn': self.hook_handlers.append(block.attn.attn_drop.register_forward_hook(self._get_hook(facet))) elif facet in ['key', 'query', 'value']: self.hook_handlers.append(block.attn.register_forward_hook(self._get_hook(facet))) else: raise TypeError(f"{facet} is not a supported facet.") def _unregister_hooks(self) -> None: """ unregisters the hooks. should be called after feature extraction. 
""" for handle in self.hook_handlers: handle.remove() self.hook_handlers = [] def forward(self, x, return_patches=False): b, c, h, w = x.shape self._feats = [] self._register_hooks([11], 'key') #self._register_hooks([11], 'token') x = self.ViT.prepare_tokens(x) #x = self.ViT.prepare_tokens_with_masks(x) for blk in self.ViT.blocks: x = blk(x) out = self.ViT.norm(x) self._unregister_hooks() ph, pw = h // self.patch_size, w // self.patch_size patch_out = out[:, 1:] # first is class token patch_out = patch_out.reshape(b, ph, pw, self.vit_feat_dim).permute(0, 3, 1, 2) patch_key = self._feats[0][:,:,1:] # B, num_heads, num_patches, dim patch_key = patch_key.permute(0, 1, 3, 2).reshape(b, self.vit_feat_dim, ph, pw) if self.final_layer_type == 'none': global_feat_out = out[:, 0].reshape(b, -1) # first is class token global_feat_key = self._feats[0][:, :, 0].reshape(b, -1) # first is class token elif self.final_layer_type == 'conv': global_feat_out = self.final_layer_patch_out(patch_out).view(b, -1) global_feat_key = self.final_layer_patch_key(patch_key).view(b, -1) elif self.final_layer_type == 'attention': raise NotImplementedError else: raise NotImplementedError if not return_patches: patch_out = patch_key = None return global_feat_out, global_feat_key, patch_out, patch_key class ArticulationNetwork(nn.Module): def __init__(self, net_type, feat_dim, pos_dim, num_layers, nf, n_harmonic_functions=0, omega0=1, activation=None, enable_articulation_idadd=False): super().__init__() if n_harmonic_functions > 0: self.posenc = HarmonicEmbedding(n_harmonic_functions=n_harmonic_functions, omega0=omega0) pos_dim = pos_dim * (n_harmonic_functions * 2 + 1) else: self.posenc = None pos_dim = 4 cout = 3 if net_type == 'mlp': self.network = MLP( feat_dim + pos_dim, # + bone xyz pos and index cout, # We represent the rotation of each bone by its Euler angles ψ, θ, and φ num_layers, nf=nf, dropout=0, activation=activation ) elif net_type == 'attention': self.in_layer = nn.Sequential( nn.Linear(feat_dim + pos_dim, nf), nn.GELU(), nn.LayerNorm(nf), ) self.blocks = nn.ModuleList([ Block( dim=nf, num_heads=8, mlp_ratio=2., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., drop_path=0., norm_layer=nn.LayerNorm) for i in range(num_layers)]) out_layer = [nn.Linear(nf, cout)] if activation: out_layer += [get_activation(activation)] self.out_layer = nn.Sequential(*out_layer) else: raise NotImplementedError self.net_type = net_type self.enable_articulation_idadd = enable_articulation_idadd def forward(self, x, pos): pos_inp = pos if self.posenc is not None: pos = torch.cat([pos, self.posenc(pos)], dim=-1) x = torch.cat([x, pos], dim=-1) if self.enable_articulation_idadd: articulation_id = pos_inp[..., -1:] x = x + articulation_id if self.net_type == 'mlp': out = self.network(x) elif self.net_type == 'attention': x = self.in_layer(x) for blk in self.blocks: x = blk(x) out = self.out_layer(x) else: raise NotImplementedError return out ## Attention block from ViT (https://github.com/facebookresearch/dino/blob/main/vision_transformer.py) class Attention(nn.Module): def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.): super().__init__() self.num_heads = num_heads head_dim = dim // num_heads self.scale = qk_scale or head_dim ** -0.5 self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) self.attn_drop = nn.Dropout(attn_drop) self.proj = nn.Linear(dim, dim) self.proj_drop = nn.Dropout(proj_drop) def forward(self, x): B, N, C = x.shape qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C 
// self.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv[0], qkv[1], qkv[2]

        attn = (q @ k.transpose(-2, -1)) * self.scale
        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x, attn


class Mlp(nn.Module):
    def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x


def drop_path(x, drop_prob: float = 0., training: bool = False):
    """Stochastic-depth helper used by DropPath (standard implementation from the DINO
    codebase; added here because it was referenced but not defined in this file)."""
    if drop_prob == 0. or not training:
        return x
    keep_prob = 1 - drop_prob
    shape = (x.shape[0],) + (1,) * (x.ndim - 1)  # broadcast over all but the batch dimension
    random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
    random_tensor.floor_()  # binarize
    return x.div(keep_prob) * random_tensor


class DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
    """
    def __init__(self, drop_prob=None):
        super(DropPath, self).__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        return drop_path(x, self.drop_prob, self.training)


class Block(nn.Module):
    def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
                 drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm):
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Attention(
            dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
        self.norm2 = norm_layer(dim)
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)

    def forward(self, x, return_attention=False):
        y, attn = self.attn(self.norm1(x))
        if return_attention:
            return attn
        x = x + self.drop_path(y)
        x = x + self.drop_path(self.mlp(self.norm2(x)))
        return x


class FeatureAttention(nn.Module):
    def __init__(self, vit_type, pos_dim, embedder_freq=0, zdim=128, img_size=256, activation=None):
        super().__init__()
        self.zdim = zdim
        if embedder_freq > 0:
            self.posenc = HarmonicEmbedding(n_harmonic_functions=embedder_freq, omega0=1)
            pos_dim = pos_dim * (embedder_freq * 2 + 1)
        else:
            self.posenc = None
        self.pos_dim = pos_dim

        if vit_type == 'dino_vits8':
            self.vit_feat_dim = 384
            patch_size = 8
        elif vit_type == 'dinov2_vits14':  # fixed: was `which_vit` (undefined here) and `self.patch_size`, leaving `patch_size` unset
            self.vit_feat_dim = 384
            patch_size = 14
        elif vit_type == 'dino_vitb8':
            self.vit_feat_dim = 768
            patch_size = 8
        else:
            raise NotImplementedError
        self.num_patches_per_dim = img_size // patch_size

        self.kv = nn.Sequential(
            nn.Linear(self.vit_feat_dim, zdim),
            nn.ReLU(inplace=True),
            nn.LayerNorm(zdim),
            nn.Linear(zdim, zdim*2),
        )
        self.q = nn.Sequential(
            nn.Linear(pos_dim, zdim),
            nn.ReLU(inplace=True),
            nn.LayerNorm(zdim),
            nn.Linear(zdim, zdim),
        )
        final_mlp = [
            nn.Linear(zdim, zdim),
            nn.ReLU(inplace=True),
            nn.LayerNorm(zdim),
            nn.Linear(zdim, self.vit_feat_dim)
        ]
        if activation is not None:
            final_mlp += [get_activation(activation)]
        self.final_ln = nn.Sequential(*final_mlp)

    def forward(self, x, feat):
        _, vit_feat_dim, ph, pw = feat.shape
        assert ph == pw and ph == self.num_patches_per_dim and vit_feat_dim == self.vit_feat_dim
        if self.posenc is not None:
            x = torch.cat([x, self.posenc(x)], dim=-1)
        bxf, k, c = x.shape
        assert c == self.pos_dim
        query = self.q(x)
        feat_in = feat.view(bxf, vit_feat_dim, ph*pw).permute(0, 2, 1)  # N, K, C
        k, v = self.kv(feat_in).chunk(2, dim=-1)
        attn = torch.einsum('bnd,bpd->bnp', query, k).softmax(dim=-1)
        out = torch.einsum('bnp,bpd->bnd', attn, v)
        out = self.final_ln(out)
        return out
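

# ----------------------------------------------------------------------------
# Minimal smoke test (illustrative only): shows how the coordinate-based MLPs
# above are typically called. Shapes and hyper-parameters below are arbitrary
# example values, not the ones used in training.
# ----------------------------------------------------------------------------
if __name__ == "__main__":
    B, N = 2, 1024                              # batch of 2, 1024 query points each
    pts = torch.rand(B, N, 3) * 2 - 1           # xyz coordinates in [-1, 1]
    global_feat = torch.rand(B, N, 16)          # per-point conditioning feature

    # Positional-encoding MLP conditioned on an extra 16-dim feature.
    net = MLPWithPositionalEncoding(cin=3, cout=4, num_layers=3, nf=64, extra_dim=16, symmetrize=True)
    out = net(pts, global_feat)
    print(out.shape)  # torch.Size([2, 1024, 4])

    # FiLM-conditioned variant: the style vector is produced internally from `feat`.
    net_style = MLPWithPositionalEncoding_Style(cin=3, cout=4, num_layers=3, nf=64, extra_dim=16,
                                                style_choice='film')
    out = net_style(pts, global_feat)
    print(out.shape)  # torch.Size([2, 1024, 4])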