menghanxia committed
Commit b3640b9
1 Parent(s): 4636eb6

upload whole project

LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2022 Menghan Xia
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
app.py CHANGED
@@ -1,7 +1,92 @@
  import gradio as gr
- 
- def greet(name):
-     return "Hello " + name + "!!"
- 
- iface = gr.Interface(fn=greet, inputs="text", outputs="text")
- iface.launch()
+ import os, requests
+ from inference import setup_model, colorize_grayscale, predict_anchors
+ 
+ ## download checkpoint
+ def download_file_from_google_drive(id, destination):
+     def get_confirm_token(response):
+         for key, value in response.cookies.items():
+             if key.startswith('download_warning'):
+                 return value
+         return None
+ 
+     def save_response_content(response, destination):
+         CHUNK_SIZE = 32768
+         with open(destination, "wb") as f:
+             for chunk in response.iter_content(CHUNK_SIZE):
+                 if chunk:  # filter out keep-alive new chunks
+                     f.write(chunk)
+ 
+     URL = "https://docs.google.com/uc?export=download"
+     session = requests.Session()
+     response = session.get(URL, params={'id': id}, stream=True)
+     token = get_confirm_token(response)
+ 
+     if token:
+         params = {'id': id, 'confirm': token}
+         response = session.get(URL, params=params, stream=True)
+     save_response_content(response, destination)
+ 
+ id = "1J4vB6kG4xBLUUKpXr5IhnSSa4maXgRvQ"
+ destination = "disco-beta.pth.tar"
+ download_file_from_google_drive(id, destination)
+ os.rename("disco-beta.pth.tar", "./checkpoints/disco-beta.pth.tar")
+ 
+ ## step 1: set up model
+ device = "cuda"
+ checkpt_path = "./checkpoints/disco-beta.pth.tar"
+ assert os.path.exists(checkpt_path), "No checkpoint found!"
+ colorizer, colorLabeler = setup_model(checkpt_path, device=device)
+ 
+ def click_colorize(rgb_img, hint_img, n_anchors, is_high_res, is_editable):
+     if hint_img is None:
+         hint_img = rgb_img
+     output = colorize_grayscale(colorizer, colorLabeler, rgb_img, hint_img, n_anchors, is_high_res, is_editable, device)
+     return output
+ 
+ def click_predanchors(rgb_img, n_anchors, is_high_res, is_editable):
+     output = predict_anchors(colorizer, colorLabeler, rgb_img, n_anchors, is_high_res, is_editable, device)
+     return output
+ 
+ ## step 2: configure interface
+ def switch_states(is_checked):
+     if is_checked:
+         return gr.Image.update(visible=True), gr.Button.update(visible=True)
+     else:
+         return gr.Image.update(visible=False), gr.Button.update(visible=False)
+ 
+ demo = gr.Blocks(title="DISCO: Image Colorization")
+ with demo:
+     gr.Markdown(value="""**DISCO: image colorization that disentangles color multimodality and spatial affinity via global anchors**.""")
+     with gr.Row():
+         with gr.Column(scale=1):
+             Image_input = gr.Image(type="numpy", label="Input", interactive=True)
+             Image_anchor = gr.Image(type="numpy", label="Anchor", tool="color-sketch", interactive=True, visible=False)
+             with gr.Row():
+                 Num_anchor = gr.Number(type="int", value=8, label="Num. of anchors (3~14)")
+                 Radio_resolution = gr.Radio(type="index", choices=["Low (256x256)", "High (512x512)"], \
+                                             label="Colorization resolution", value="Low (256x256)")
+             Ckeckbox_editable = gr.Checkbox(default=False, label='Show editable anchors')
+             with gr.Row():
+                 Button_show_anchor = gr.Button(value="Predict anchors", visible=False)
+                 Button_run = gr.Button(value="Colorize")
+         with gr.Column(scale=1):
+             Image_output = gr.Image(type="numpy", label="Output", shape=[100,100])
+ 
+     Ckeckbox_editable.change(fn=switch_states, inputs=Ckeckbox_editable, outputs=[Image_anchor, Button_show_anchor])
+     Button_show_anchor.click(fn=click_predanchors, inputs=[Image_input, Num_anchor, Radio_resolution, Ckeckbox_editable], outputs=Image_anchor)
+     Button_run.click(fn=click_colorize, inputs=[Image_input, Image_anchor, Num_anchor, Radio_resolution, Ckeckbox_editable], \
+                      outputs=Image_output)
+     ## guideline
+     gr.Markdown(value="""
+     **Guideline**
+     1. Upload your image;
+     2. Set up the arguments: "Num. of anchors" and "Colorization resolution";
+     3. Two modes are supported:
+         - **Editable**: check "Show editable anchors" and click "Predict anchors". Then, modify the colors of the predicted anchors (the anchor mask will be applied afterward). Finally, click "Colorize" to get the result.
+         - **Automatic**: click "Colorize" to get the automatically colorized output.
+ 
+     *To know more about the method, please refer to our project page: [https://menghanxia.github.io/projects/disco.html](https://menghanxia.github.io/projects/disco.html)*
+     """)
+ 
+ demo.launch(server_name='9.134.253.83', server_port=7788)
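For reference, the two entry points wired to the buttons above can also be driven without the Gradio UI. Below is a minimal sketch under stated assumptions: the checkpoint already sits at `./checkpoints/disco-beta.pth.tar`, a CUDA device is available, and `photo.jpg` is a hypothetical input file.

```python
# Minimal sketch: run DISCO inference without the Gradio UI (assumptions:
# checkpoint in place, CUDA available, "photo.jpg" is a hypothetical file).
import numpy as np
from PIL import Image
from inference import setup_model, colorize_grayscale

colorizer, colorLabeler = setup_model("./checkpoints/disco-beta.pth.tar", device="cuda")
rgb_img = np.array(Image.open("photo.jpg").convert("RGB"))

# Automatic mode: the input doubles as its own (unused) hint image,
# mirroring what click_colorize does when no anchor image is drawn.
result = colorize_grayscale(colorizer, colorLabeler, rgb_img, rgb_img,
                            n_anchors=8, is_high_res=False, is_editable=False,
                            device="cuda")
Image.fromarray(result).save("colorized.png")
```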
checkpoints/disco_download.sh ADDED
@@ -0,0 +1 @@
+ wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1J4vB6kG4xBLUUKpXr5IhnSSa4maXgRvQ' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1J4vB6kG4xBLUUKpXr5IhnSSa4maXgRvQ" -O disco-beta.pth.tar && rm -rf /tmp/cookies.txt
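The one-liner above performs Google Drive's confirm-token handshake for large files: the first `wget` saves the session cookie and scrapes the `confirm=` token with `sed`, then the second `wget` downloads the checkpoint itself. A simpler sketch, assuming the third-party `gdown` package is acceptable (not a dependency of this repo; recent versions expose an `id` argument):

```python
# Hypothetical alternative to the wget one-liner, using the gdown package
# (pip install gdown). Not part of this commit.
import gdown

gdown.download(id="1J4vB6kG4xBLUUKpXr5IhnSSa4maXgRvQ",
               output="disco-beta.pth.tar", quiet=False)
```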
cog.yaml ADDED
@@ -0,0 +1,41 @@
+ # Configuration for Cog ⚙️
+ # Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md
+ 
+ build:
+   # set to true if your model requires a GPU
+   cuda: "10.2"
+   gpu: true
+ 
+   # a list of ubuntu apt packages to install
+   system_packages:
+     # - "libgl1-mesa-glx"
+     # - "libglib2.0-0"
+     - "libgl1-mesa-dev"
+ 
+   # python version in the form '3.8' or '3.8.12'
+   python_version: "3.8"
+ 
+   # a list of packages in the format <package-name>==<version>
+   python_packages:
+     # - "numpy==1.19.4"
+     # - "torch==1.8.0"
+     # - "torchvision==0.9.0"
+     - "numpy==1.23.1"
+     - "torch==1.8.0"
+     - "torchvision==0.9.0"
+     - "opencv-python==4.6.0.66"
+     - "pandas==1.4.3"
+     - "pillow==9.2.0"
+     - "tqdm==4.64.0"
+     - "scikit-image==0.19.3"
+     - "scikit-learn==1.1.2"
+     - "scipy==1.9.1"
+ 
+   # commands run after the environment is setup
+   # run:
+   #   - "echo env is ready!"
+   #   - "echo another command if needed"
+ 
+ # predict.py defines how predictions are run on your model
+ predict: "predict.py:Predictor"
+ # image: "r8.im/menghanxia/disco"
environment.yml ADDED
@@ -0,0 +1,121 @@
+ name: DISCO
+ channels:
+   - pytorch
+   - defaults
+   - conda-forge
+ dependencies:
+   - blas=1.0=mkl
+   - bzip2=1.0.8=h7b6447c_0
+   - ca-certificates=2022.07.19=h06a4308_0
+   - certifi=2022.6.15=py38h06a4308_0
+   - cudatoolkit=10.2.89=hfd86e86_1
+   - freetype=2.11.0=h70c0345_0
+   - giflib=5.2.1=h7b6447c_0
+   - gmp=6.2.1=h295c915_3
+   - gnutls=3.6.15=he1e5248_0
+   - intel-openmp=2021.4.0=h06a4308_3561
+   - jpeg=9b=h024ee3a_2
+   - lame=3.100=h7b6447c_0
+   - lcms2=2.12=h3be6417_0
+   - ld_impl_linux-64=2.38=h1181459_1
+   - libffi=3.3=he6710b0_2
+   - libgcc-ng=11.2.0=h1234567_1
+   - libiconv=1.16=h7f8727e_2
+   - libidn2=2.3.2=h7f8727e_0
+   - libpng=1.6.37=hbc83047_0
+   - libstdcxx-ng=11.2.0=h1234567_1
+   - libtasn1=4.16.0=h27cfd23_0
+   - libtiff=4.1.0=h2733197_1
+   - libunistring=0.9.10=h27cfd23_0
+   - libuv=1.40.0=h7b6447c_0
+   - libwebp=1.2.0=h89dd481_0
+   - lz4-c=1.9.3=h295c915_1
+   - mkl=2021.4.0=h06a4308_640
+   - mkl-service=2.4.0=py38h7f8727e_0
+   - mkl_fft=1.3.1=py38hd3c417c_0
+   - mkl_random=1.2.2=py38h51133e4_0
+   - ncurses=6.3=h5eee18b_3
+   - nettle=3.7.3=hbbd107a_1
+   - ninja=1.10.2=h06a4308_5
+   - ninja-base=1.10.2=hd09550d_5
+   - numpy=1.23.1=py38h6c91a56_0
+   - numpy-base=1.23.1=py38ha15fc14_0
+   - openh264=2.1.1=h4ff587b_0
+   - openssl=1.1.1q=h7f8727e_0
+   - pillow=9.2.0=py38hace64e9_1
+   - pip=22.1.2=py38h06a4308_0
+   - python=3.8.13=h12debd9_0
+   - readline=8.1.2=h7f8727e_1
+   - setuptools=63.4.1=py38h06a4308_0
+   - six=1.16.0=pyhd3eb1b0_1
+   - sqlite=3.39.2=h5082296_0
+   - tk=8.6.12=h1ccaba5_0
+   - typing_extensions=4.3.0=py38h06a4308_0
+   - wheel=0.37.1=pyhd3eb1b0_0
+   - xz=5.2.5=h7f8727e_1
+   - zlib=1.2.12=h7f8727e_2
+   - zstd=1.4.9=haebb681_0
+   - ffmpeg=4.3=hf484d3e_0
+   - pytorch=1.8.0=py3.8_cuda10.2_cudnn7.6.5_0
+   - torchaudio=0.8.0=py38
+   - torchvision=0.9.0=py38_cu102
+   - pip:
+     - addict==2.4.0
+     - astunparse==1.6.3
+     - cachetools==4.2.4
+     - charset-normalizer==2.0.7
+     - clang==5.0
+     - cycler==0.11.0
+     - flatbuffers==1.12
+     - fonttools==4.37.1
+     - future==0.18.2
+     - gast==0.4.0
+     - google-auth==2.3.2
+     - google-auth-oauthlib==0.4.6
+     - google-pasta==0.2.0
+     - grpcio==1.41.1
+     - h5py==3.1.0
+     - idna==3.3
+     - imageio==2.21.1
+     - joblib==1.1.0
+     - keras==2.6.0
+     - keras-preprocessing==1.1.2
+     - kiwisolver==1.4.4
+     - lpips==0.1.4
+     - markdown==3.3.4
+     - matplotlib==3.5.3
+     - networkx==2.8.6
+     - oauthlib==3.1.1
+     - opencv-python==4.6.0.66
+     - opt-einsum==3.3.0
+     - packaging==21.3
+     - pandas==1.4.3
+     - protobuf==3.19.0
+     - pyasn1==0.4.8
+     - pyasn1-modules==0.2.8
+     - pyparsing==3.0.9
+     - python-dateutil==2.8.2
+     - pytz==2022.2.1
+     - pywavelets==1.3.0
+     - pyyaml==6.0
+     - requests==2.26.0
+     - requests-oauthlib==1.3.0
+     - rsa==4.7.2
+     - scikit-image==0.19.3
+     - scikit-learn==1.1.2
+     - scipy==1.9.1
+     - tensorboard-data-server==0.6.1
+     - tensorboard-plugin-wit==1.8.0
+     - tensorflow-estimator==2.6.0
+     - tensorflow-gpu==2.6.0
+     - termcolor==1.1.0
+     - threadpoolctl==3.1.0
+     - tifffile==2022.8.12
+     - torch==1.8.0
+     - tqdm==4.64.0
+     - urllib3==1.26.7
+     - werkzeug==2.0.2
+     - wrapt==1.12.1
+     - yapf==0.32.0
+ prefix: /root/data/programs/anaconda3/envs/DISCO
+
inference.py ADDED
@@ -0,0 +1,109 @@
+ import os, glob, sys, logging
+ import argparse, datetime, time
+ import numpy as np
+ import cv2
+ from PIL import Image
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from models import model, basic
+ from utils import util
+ 
+ 
+ def setup_model(checkpt_path, device="cuda"):
+     """Load the model into memory to make running multiple predictions efficient"""
+     seed = 130
+     np.random.seed(seed)
+     torch.manual_seed(seed)
+     torch.cuda.manual_seed(seed)
+     #print('--------------', torch.cuda.is_available())
+     colorLabeler = basic.ColorLabel(device=device)
+     colorizer = model.AnchorColorProb(inChannel=1, outChannel=313, enhanced=True, colorLabeler=colorLabeler)
+     colorizer = colorizer.to(device)
+     #checkpt_path = "./checkpoints/disco-beta.pth.rar"
+     assert os.path.exists(checkpt_path)
+     data_dict = torch.load(checkpt_path, map_location=torch.device('cpu'))
+     colorizer.load_state_dict(data_dict['state_dict'])
+     colorizer.eval()
+     return colorizer, colorLabeler
+ 
+ 
+ def resize_ab2l(gray_img, lab_imgs, vis=False):
+     H, W = gray_img.shape[:2]
+     resized_ab = cv2.resize(lab_imgs[:,:,1:], (W,H), interpolation=cv2.INTER_LINEAR)
+     if vis:
+         gray_img = cv2.resize(lab_imgs[:,:,:1], (W,H), interpolation=cv2.INTER_LINEAR)
+         return np.concatenate((gray_img[:,:,np.newaxis], resized_ab), axis=2)
+     else:
+         return np.concatenate((gray_img, resized_ab), axis=2)
+ 
+ def prepare_data(rgb_img, target_res):
+     rgb_img = np.array(rgb_img / 255., np.float32)
+     lab_img = cv2.cvtColor(rgb_img, cv2.COLOR_RGB2LAB)
+     org_grays = (lab_img[:,:,[0]]-50.) / 50.
+     lab_img = cv2.resize(lab_img, target_res, interpolation=cv2.INTER_LINEAR)
+ 
+     lab_img = torch.from_numpy(lab_img.transpose((2, 0, 1)))
+     gray_img = (lab_img[0:1,:,:]-50.) / 50.
+     ab_chans = lab_img[1:3,:,:] / 110.
+     input_grays = gray_img.unsqueeze(0)
+     input_colors = ab_chans.unsqueeze(0)
+     return input_grays, input_colors, org_grays
+ 
+ 
+ def colorize_grayscale(colorizer, color_class, rgb_img, hint_img, n_anchors, is_high_res, is_editable, device="cuda"):
+     n_anchors = int(n_anchors)
+     n_anchors = max(n_anchors, 3)
+     n_anchors = min(n_anchors, 14)
+     target_res = (512,512) if is_high_res else (256,256)
+     input_grays, input_colors, org_grays = prepare_data(rgb_img, target_res)
+     input_grays = input_grays.to(device)
+     input_colors = input_colors.to(device)
+ 
+     if is_editable:
+         print('>>>:editable mode')
+         sampled_T = -1
+         _, input_colors, _ = prepare_data(hint_img, target_res)
+         input_colors = input_colors.to(device)
+         pal_logit, ref_logit, enhanced_ab, affinity_map, spix_colors, hint_mask = colorizer(input_grays, \
+                                 input_colors, n_anchors, sampled_T)
+     else:
+         print('>>>:automatic mode')
+         sampled_T = 0
+         pal_logit, ref_logit, enhanced_ab, affinity_map, spix_colors, hint_mask = colorizer(input_grays, \
+                                 input_colors, n_anchors, sampled_T)
+ 
+     pred_labs = torch.cat((input_grays,enhanced_ab), dim=1)
+     lab_imgs = basic.tensor2array(pred_labs).squeeze(axis=0)
+     lab_imgs = resize_ab2l(org_grays, lab_imgs)
+ 
+     lab_imgs[:,:,0] = lab_imgs[:,:,0] * 50.0 + 50.0
+     lab_imgs[:,:,1:3] = lab_imgs[:,:,1:3] * 110.0
+     rgb_output = cv2.cvtColor(lab_imgs[:,:,:], cv2.COLOR_LAB2RGB)
+     return (rgb_output*255.0).astype(np.uint8)
+ 
+ 
+ def predict_anchors(colorizer, color_class, rgb_img, n_anchors, is_high_res, is_editable, device="cuda"):
+     n_anchors = int(n_anchors)
+     n_anchors = max(n_anchors, 3)
+     n_anchors = min(n_anchors, 14)
+     target_res = (512,512) if is_high_res else (256,256)
+     input_grays, input_colors, org_grays = prepare_data(rgb_img, target_res)
+     input_grays = input_grays.to(device)
+     input_colors = input_colors.to(device)
+ 
+     sampled_T, sp_size = 0, 16
+     pal_logit, ref_logit, enhanced_ab, affinity_map, spix_colors, hint_mask = colorizer(input_grays, \
+                             input_colors, n_anchors, sampled_T)
+     pred_probs = pal_logit
+     guided_colors = color_class.decode_ind2ab(ref_logit, T=0)
+     guided_colors = basic.upfeat(guided_colors, affinity_map, sp_size, sp_size)
+     anchor_masks = basic.upfeat(hint_mask, affinity_map, sp_size, sp_size)
+     marked_labs = basic.mark_color_hints(input_grays, guided_colors, anchor_masks, base_ABs=None)
+     lab_imgs = basic.tensor2array(marked_labs).squeeze(axis=0)
+     lab_imgs = resize_ab2l(org_grays, lab_imgs, vis=True)
+ 
+     lab_imgs[:,:,0] = lab_imgs[:,:,0] * 50.0 + 50.0
+     lab_imgs[:,:,1:3] = lab_imgs[:,:,1:3] * 110.0
+     rgb_output = cv2.cvtColor(lab_imgs[:,:,:], cv2.COLOR_LAB2RGB)
+     return (rgb_output*255.0).astype(np.uint8)
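Both functions above share one Lab convention: `prepare_data` maps L from [0,100] to [-1,1] via (L-50)/50 and the ab channels to roughly [-1,1] via division by 110, and the tail of each function inverts that mapping before `cv2.COLOR_LAB2RGB`. A standalone sanity check of the round trip (illustrative mid-gray input, not from the repo):

```python
# Sanity-check sketch of the Lab normalization used by prepare_data and
# inverted in colorize_grayscale/predict_anchors (mid-gray chosen arbitrarily).
import numpy as np
import cv2

rgb = np.full((2, 2, 3), 0.5, np.float32)       # float RGB in [0,1]
lab = cv2.cvtColor(rgb, cv2.COLOR_RGB2LAB)      # L in [0,100], ab in ~[-110,110]
gray_norm = (lab[:, :, 0] - 50.) / 50.          # -> [-1,1], as in prepare_data
ab_norm = lab[:, :, 1:] / 110.                  # -> ~[-1,1]
# The inverse applied before converting back to RGB:
assert np.allclose(gray_norm * 50. + 50., lab[:, :, 0], atol=1e-4)
assert np.allclose(ab_norm * 110., lab[:, :, 1:], atol=1e-4)
```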
models/__init__.py ADDED
File without changes
models/anchor_gen.py ADDED
@@ -0,0 +1,107 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from torch.autograd import Function
+ from models import basic, clusterkit
+ import pdb
+ 
+ class AnchorAnalysis:
+     def __init__(self, mode, colorLabeler):
+         ## anchor generating mode: 1.random; 2.clustering
+         self.mode = mode
+         self.colorLabeler = colorLabeler
+ 
+     def _detect_correlation(self, data_tensors, color_probs, hint_masks, thres=0.1):
+         N,C,H,W = data_tensors.shape
+         ## (N,C,HW)
+         data_vecs = data_tensors.flatten(2)
+         prob_vecs = color_probs.flatten(2)
+         mask_vecs = hint_masks.flatten(2)
+         #anchor_data = torch.masked_select(data_vecs, mask_vecs.bool()).view(N,C,-1)
+         #anchor_prob = torch.masked_select(prob_vecs, mask_vecs.bool()).view(N,313,-1)
+         #_,_,K = anchor_data.shape
+         anchor_mask = torch.matmul(mask_vecs.permute(0,2,1), mask_vecs)
+         cosine_sim = True
+         ## non-similarity matrix
+         if cosine_sim:
+             norm_data = F.normalize(data_vecs, p=2, dim=1)
+             ## (N,HW,HW) = (N,HW,C) X (N,C,HW)
+             corr_matrix = torch.matmul(norm_data.permute(0,2,1), norm_data)
+             ## remapping: [-1.0,1.0] to [0.0,1.0], and convert into dis-similarity
+             dist_matrix = 1.0 - 0.5*(corr_matrix + 1.0)
+         else:
+             ## (N,HW,HW) = (N,HW,C) X (N,C,HW)
+             XtX = torch.matmul(data_vecs.permute(0,2,1), data_vecs)
+             diag_vec = torch.diagonal(XtX, dim1=-2, dim2=-1)
+             A = diag_vec.unsqueeze(1).repeat(1,H*W,1)
+             At = diag_vec.unsqueeze(2).repeat(1,1,H*W)
+             dist_matrix = A - 2*XtX + At
+             #dist_matrix = dist_matrix + 1e7*torch.eye(K).to(data_tensors.device).repeat(N,1,1)
+         ## for debug use
+         K = 8
+         anchor_adj_matrix = torch.masked_select(dist_matrix, anchor_mask.bool()).view(N,K,K)
+         ## detect connected nodes
+         adj_matrix = torch.where((dist_matrix < thres) & (anchor_mask > 0), torch.ones_like(dist_matrix), torch.zeros_like(dist_matrix))
+         adj_matrix = torch.matmul(adj_matrix, adj_matrix)
+         adj_matrix = adj_matrix / (1e-7+adj_matrix)
+         ## merge nodes
+         ## (N,K,C) = (N,K,K) X (N,K,C)
+         anchor_prob = torch.matmul(adj_matrix, prob_vecs.permute(0,2,1)) / torch.sum(adj_matrix, dim=2, keepdim=True)
+         updated_prob_vecs = anchor_prob.permute(0,2,1) * mask_vecs + (1-mask_vecs) * prob_vecs
+         color_probs = updated_prob_vecs.view(N,313,H,W)
+         return color_probs, anchor_adj_matrix
+ 
+     def _sample_anchor_colors(self, pred_prob, hint_mask, T=0):
+         N,C,H,W = pred_prob.shape
+         topk = 10
+         assert T < topk
+         sorted_probs, batch_indexs = torch.sort(pred_prob, dim=1, descending=True)
+         ## (N,topk,H,W,1)
+         topk_probs = torch.softmax(sorted_probs[:,:topk,:,:], dim=1).unsqueeze(4)
+         topk_indexs = batch_indexs[:,:topk,:,:]
+         topk_ABs = torch.stack([self.colorLabeler.q_to_ab.index_select(0, q_i.flatten()).reshape(topk,H,W,2)
+                                 for q_i in topk_indexs])
+         ## (N,topk,H,W,2)
+         topk_ABs = topk_ABs / 110.0
+         ## choose the most distinctive 3 colors for each anchor
+         if T == 0:
+             sampled_ABs = topk_ABs[:,0,:,:,:]
+         elif T == 1:
+             sampled_AB0 = topk_ABs[:,[0],:,:,:]
+             internal_diff = torch.norm(topk_ABs-sampled_AB0, p=2, dim=4, keepdim=True)
+             _, batch_indexs = torch.sort(internal_diff, dim=1, descending=True)
+             ## (N,1,H,W,2)
+             selected_index = batch_indexs[:,[0],:,:,:].expand([-1,-1,-1,-1,2])
+             sampled_ABs = torch.gather(topk_ABs, 1, selected_index)
+             sampled_ABs = sampled_ABs.squeeze(1)
+         else:
+             sampled_AB0 = topk_ABs[:,[0],:,:,:]
+             internal_diff = torch.norm(topk_ABs-sampled_AB0, p=2, dim=4, keepdim=True)
+             _, batch_indexs = torch.sort(internal_diff, dim=1, descending=True)
+             selected_index = batch_indexs[:,[0],:,:,:].expand([-1,-1,-1,-1,2])
+             sampled_AB1 = torch.gather(topk_ABs, 1, selected_index)
+             internal_diff2 = torch.norm(topk_ABs-sampled_AB1, p=2, dim=4, keepdim=True)
+             _, batch_indexs = torch.sort(internal_diff+internal_diff2, dim=1, descending=True)
+             ## (N,1,H,W,2)
+             selected_index = batch_indexs[:,[T-2],:,:,:].expand([-1,-1,-1,-1,2])
+             sampled_ABs = torch.gather(topk_ABs, 1, selected_index)
+             sampled_ABs = sampled_ABs.squeeze(1)
+ 
+         return sampled_ABs.permute(0,3,1,2)
+ 
+     def __call__(self, data_tensors, n_anchors, spixel_sizes, use_sklearn_kmeans=False):
+         N,C,H,W = data_tensors.shape
+         if self.mode == 'clustering':
+             ## clusters map: (N,K,H,W)
+             cluster_mask = clusterkit.batch_kmeans_pytorch(data_tensors, n_anchors, 'euclidean', use_sklearn_kmeans)
+             #noises = torch.rand(N,1,H,W).to(cluster_mask.device)
+             perturb_factors = spixel_sizes
+             cluster_prob = cluster_mask + perturb_factors * 0.01
+             hint_mask_layers = F.one_hot(torch.argmax(cluster_prob.flatten(2), dim=-1), num_classes=H*W).float()
+             hint_mask = torch.sum(hint_mask_layers, dim=1, keepdim=True).view(N,1,H,W)
+         else:
+             #print('----------hello, random!')
+             cluster_mask = torch.zeros(N,n_anchors,H,W).to(data_tensors.device)
+             binary_mask = basic.get_random_mask(N, H, W, minNum=n_anchors, maxNum=n_anchors)
+             hint_mask = torch.from_numpy(binary_mask).to(data_tensors.device)
+         return hint_mask, cluster_mask
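In the clustering branch of `__call__` above, each of the `n_anchors` cluster channels contributes exactly one anchor pixel: the spatial argmax of its (slightly perturbed) soft assignment map. A standalone sketch of that trick on random data:

```python
# Sketch of the hint-mask construction from AnchorAnalysis.__call__:
# one anchor pixel per cluster channel, picked by spatial argmax (random data).
import torch
import torch.nn.functional as F

N, K, H, W = 1, 4, 8, 8
cluster_prob = torch.rand(N, K, H, W)
# For each of the K channels, one-hot encode its strongest spatial location ...
hint_layers = F.one_hot(torch.argmax(cluster_prob.flatten(2), dim=-1),
                        num_classes=H * W).float()
# ... then collapse the K one-hot layers into a single (N,1,H,W) mask.
hint_mask = torch.sum(hint_layers, dim=1, keepdim=True).view(N, 1, H, W)
assert int((hint_mask > 0).sum()) <= K          # at most one anchor per cluster
```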
models/basic.py ADDED
@@ -0,0 +1,504 @@
+ import numpy as np
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import torch.nn.utils.spectral_norm as spectral_norm
+ from torch.autograd import Function
+ from utils import util, cielab
+ import cv2, math, random
+ 
+ def tensor2array(tensors):
+     arrays = tensors.detach().to("cpu").numpy()
+     return np.transpose(arrays, (0, 2, 3, 1))
+ 
+ 
+ def rgb2gray(color_batch):
+     #! gray = 0.299*R+0.587*G+0.114*B
+     gray_batch = color_batch[:, 0, ...] * 0.299 + color_batch[:, 1, ...] * 0.587 + color_batch[:, 2, ...] * 0.114
+     gray_batch = gray_batch.unsqueeze_(1)
+     return gray_batch
+ 
+ 
+ def getParamsAmount(model):
+     params = list(model.parameters())
+     count = 0
+     for var in params:
+         l = 1
+         for j in var.size():
+             l *= j
+         count += l
+     return count
+ 
+ 
+ def checkAverageGradient(model):
+     meanGrad, cnt = 0.0, 0
+     for name, parms in model.named_parameters():
+         if parms.requires_grad:
+             meanGrad += torch.mean(torch.abs(parms.grad))
+             cnt += 1
+     return meanGrad.item() / cnt
+ 
+ 
+ def get_random_mask(N, H, W, minNum, maxNum):
+     binary_maps = np.zeros((N, H*W), np.float32)
+     for i in range(N):
+         locs = random.sample(range(0, H*W), random.randint(minNum,maxNum))
+         binary_maps[i, locs] = 1
+     return binary_maps.reshape(N,1,H,W)
+ 
+ 
+ def io_user_control(hint_mask, spix_colors, output=True):
+     cache_dir = '/apdcephfs/private_richardxia'
+     if output:
+         print('--- data saving')
+         mask_imgs = tensor2array(hint_mask) * 2.0 - 1.0
+         util.save_images_from_batch(mask_imgs, cache_dir, ['mask.png'], -1)
+         fake_gray = torch.zeros_like(spix_colors[:,[0],:,:])
+         spix_labs = torch.cat((fake_gray,spix_colors), dim=1)
+         spix_imgs = tensor2array(spix_labs)
+         util.save_normLabs_from_batch(spix_imgs, cache_dir, ['color.png'], -1)
+         return hint_mask, spix_colors
+     else:
+         print('--- data loading')
+         mask_img = cv2.imread(cache_dir+'/mask.png', cv2.IMREAD_GRAYSCALE)
+         mask_img = np.expand_dims(mask_img, axis=2) / 255.
+         hint_mask = torch.from_numpy(mask_img.transpose((2, 0, 1)))
+         hint_mask = hint_mask.unsqueeze(0).cuda()
+         bgr_img = cv2.imread(cache_dir+'/color.png', cv2.IMREAD_COLOR)
+         rgb_img = cv2.cvtColor(bgr_img, cv2.COLOR_BGR2RGB)
+         rgb_img = np.array(rgb_img / 255., np.float32)
+         lab_img = cv2.cvtColor(rgb_img, cv2.COLOR_RGB2LAB)
+         lab_img = torch.from_numpy(lab_img.transpose((2, 0, 1)))
+         ab_chans = lab_img[1:3,:,:] / 110.
+         spix_colors = ab_chans.unsqueeze(0).cuda()
+         return hint_mask.float(), spix_colors.float()
+ 
+ 
+ class Quantize(Function):
+     @staticmethod
+     def forward(ctx, x):
+         ctx.save_for_backward(x)
+         y = x.round()
+         return y
+ 
+     @staticmethod
+     def backward(ctx, grad_output):
+         """
+         In the backward pass we receive a Tensor containing the gradient of the loss
+         with respect to the output, and we need to compute the gradient of the loss
+         with respect to the input.
+         """
+         inputX = ctx.saved_tensors
+         return grad_output
+ 
+ 
+ def mark_color_hints(input_grays, target_ABs, gate_maps, kernel_size=3, base_ABs=None):
+     ## to highlight the seeds with 1-pixel margin
+     binary_map = torch.where(gate_maps>0.7, torch.ones_like(gate_maps), torch.zeros_like(gate_maps))
+     center_mask = dilate_seeds(binary_map, kernel_size=kernel_size)
+     margin_mask = dilate_seeds(binary_map, kernel_size=kernel_size+2) - center_mask
+     ## drop colors
+     dilated_seeds = dilate_seeds(gate_maps, kernel_size=kernel_size+2)
+     marked_grays = torch.where(margin_mask > 1e-5, torch.ones_like(gate_maps), input_grays)
+     if base_ABs is None:
+         marked_ABs = torch.where(center_mask < 1e-5, torch.zeros_like(target_ABs), target_ABs)
+     else:
+         marked_ABs = torch.where(margin_mask > 1e-5, torch.zeros_like(base_ABs), base_ABs)
+         marked_ABs = torch.where(center_mask > 1e-5, target_ABs, marked_ABs)
+     return torch.cat((marked_grays,marked_ABs), dim=1)
+ 
+ def dilate_seeds(gate_maps, kernel_size=3):
+     N,C,H,W = gate_maps.shape
+     input_unf = F.unfold(gate_maps, kernel_size, padding=kernel_size//2)
+     #! Notice: differentiable? just like max pooling?
+     dilated_seeds, _ = torch.max(input_unf, dim=1, keepdim=True)
+     output = F.fold(dilated_seeds, output_size=(H,W), kernel_size=1)
+     #print('-------', input_unf.shape)
+     return output
+ 
+ 
+ class RebalanceLoss(Function):
+     @staticmethod
+     def forward(ctx, data_input, weights):
+         ctx.save_for_backward(weights)
+         return data_input.clone()
+ 
+     @staticmethod
+     def backward(ctx, grad_output):
+         weights, = ctx.saved_tensors
+         # reweigh gradient pixelwise so that rare colors get a chance to
+         # contribute
+         grad_input = grad_output * weights
+         # second return value is None since we are not interested in the
+         # gradient with respect to the weights
+         return grad_input, None
+ 
+ 
+ class GetClassWeights:
+     def __init__(self, cielab, lambda_=0.5, device='cuda'):
+         prior = torch.from_numpy(cielab.gamut.prior).cuda()
+         uniform = torch.zeros_like(prior)
+         uniform[prior > 0] = 1 / (prior > 0).sum().type_as(uniform)
+         self.weights = 1 / ((1 - lambda_) * prior + lambda_ * uniform)
+         self.weights /= torch.sum(prior * self.weights)
+ 
+     def __call__(self, ab_actual):
+         return self.weights[ab_actual.argmax(dim=1, keepdim=True)]
+ 
+ 
+ class ColorLabel:
+     def __init__(self, lambda_=0.5, device='cuda'):
+         self.cielab = cielab.CIELAB()
+         self.q_to_ab = torch.from_numpy(self.cielab.q_to_ab).to(device)
+         prior = torch.from_numpy(self.cielab.gamut.prior).to(device)
+         uniform = torch.zeros_like(prior)
+         uniform[prior>0] = 1 / (prior>0).sum().type_as(uniform)
+         self.weights = 1 / ((1-lambda_) * prior + lambda_ * uniform)
+         self.weights /= torch.sum(prior * self.weights)
+ 
+     def visualize_label(self, step=3):
+         height, width = 200, 313*step
+         label_lab = np.ones((height,width,3), np.float32)
+         for x in range(313):
+             ab = self.cielab.q_to_ab[x,:]
+             label_lab[:,step*x:step*(x+1),1:] = ab / 110.
+         label_lab[:,:,0] = np.zeros((height,width), np.float32)
+         return label_lab
+ 
+     @staticmethod
+     def _gauss_eval(x, mu, sigma):
+         norm = 1 / (2 * math.pi * sigma)
+         return norm * torch.exp(-torch.sum((x - mu)**2, dim=0) / (2 * sigma**2))
+ 
+     def get_classweights(self, batch_gt_indx):
+         #return self.weights[batch_gt_q.argmax(dim=1, keepdim=True)]
+         return self.weights[batch_gt_indx]
+ 
+     def encode_ab2ind(self, batch_ab, neighbours=5, sigma=5.0):
+         batch_ab = batch_ab * 110.
+         n, _, h, w = batch_ab.shape
+         m = n * h * w
+         # find nearest neighbours
+         ab_ = batch_ab.permute(1, 0, 2, 3).reshape(2, -1)  # (2, n*h*w)
+         cdist = torch.cdist(self.q_to_ab, ab_.t())
+         nns = cdist.argsort(dim=0)[:neighbours, :]
+         # gaussian weighting
+         nn_gauss = batch_ab.new_zeros(neighbours, m)
+         for i in range(neighbours):
+             nn_gauss[i, :] = self._gauss_eval(self.q_to_ab[nns[i, :], :].t(), ab_, sigma)
+         nn_gauss /= nn_gauss.sum(dim=0, keepdim=True)
+         # expand
+         bins = self.cielab.gamut.EXPECTED_SIZE
+         q = batch_ab.new_zeros(bins, m)
+         q[nns, torch.arange(m).repeat(neighbours, 1)] = nn_gauss
+         return q.reshape(bins, n, h, w).permute(1, 0, 2, 3)
+ 
+     def decode_ind2ab(self, batch_q, T=0.38):
+         _, _, h, w = batch_q.shape
+         batch_q = F.softmax(batch_q, dim=1)
+         if T%1 == 0:
+             # take the T-st probable index
+             sorted_probs, batch_indexs = torch.sort(batch_q, dim=1, descending=True)
+             #print('checking [index]', batch_indexs[:,0:5,5,5])
+             #print('checking [probs]', sorted_probs[:,0:5,5,5])
+             batch_indexs = batch_indexs[:,T:T+1,:,:]
+             #batch_indexs = torch.where(sorted_probs[:,T:T+1,:,:] > 0.25, batch_indexs[:,T:T+1,:,:], batch_indexs[:,0:1,:,:])
+             ab = torch.stack([
+                 self.q_to_ab.index_select(0, q_i.flatten()).reshape(h,w,2).permute(2,0,1)
+                 for q_i in batch_indexs])
+         else:
+             batch_q = torch.exp(batch_q / T)
+             batch_q /= batch_q.sum(dim=1, keepdim=True)
+             a = torch.tensordot(batch_q, self.q_to_ab[:,0], dims=((1,), (0,)))
+             a = a.unsqueeze(dim=1)
+             b = torch.tensordot(batch_q, self.q_to_ab[:,1], dims=((1,), (0,)))
+             b = b.unsqueeze(dim=1)
+             ab = torch.cat((a, b), dim=1)
+         ab = ab / 110.
+         return ab.type(batch_q.dtype)
+ 
+ 
+ def init_spixel_grid(img_height, img_width, spixel_size=16):
+     # get spixel id for the final assignment
+     n_spixl_h = int(np.floor(img_height/spixel_size))
+     n_spixl_w = int(np.floor(img_width/spixel_size))
+     spixel_height = int(img_height / (1. * n_spixl_h))
+     spixel_width = int(img_width / (1. * n_spixl_w))
+     spix_values = np.int32(np.arange(0, n_spixl_w * n_spixl_h).reshape((n_spixl_h, n_spixl_w)))
+ 
+     def shift9pos(input, h_shift_unit=1, w_shift_unit=1):
+         # input should be padding as (c, 1+ height+1, 1+width+1)
+         input_pd = np.pad(input, ((h_shift_unit, h_shift_unit), (w_shift_unit, w_shift_unit)), mode='edge')
+         input_pd = np.expand_dims(input_pd, axis=0)
+         # assign to ...
+         top = input_pd[:, :-2 * h_shift_unit, w_shift_unit:-w_shift_unit]
+         bottom = input_pd[:, 2 * h_shift_unit:, w_shift_unit:-w_shift_unit]
+         left = input_pd[:, h_shift_unit:-h_shift_unit, :-2 * w_shift_unit]
+         right = input_pd[:, h_shift_unit:-h_shift_unit, 2 * w_shift_unit:]
+         center = input_pd[:,h_shift_unit:-h_shift_unit,w_shift_unit:-w_shift_unit]
+         bottom_right = input_pd[:, 2 * h_shift_unit:, 2 * w_shift_unit:]
+         bottom_left = input_pd[:, 2 * h_shift_unit:, :-2 * w_shift_unit]
+         top_right = input_pd[:, :-2 * h_shift_unit, 2 * w_shift_unit:]
+         top_left = input_pd[:, :-2 * h_shift_unit, :-2 * w_shift_unit]
+         shift_tensor = np.concatenate([top_left, top, top_right,
+                                        left, center, right,
+                                        bottom_left, bottom, bottom_right], axis=0)
+         return shift_tensor
+ 
+     spix_idx_tensor_ = shift9pos(spix_values)
+     spix_idx_tensor = np.repeat(
+         np.repeat(spix_idx_tensor_, spixel_height, axis=1), spixel_width, axis=2)
+     spixel_id_tensor = torch.from_numpy(spix_idx_tensor).type(torch.float)
+ 
+     #! pixel coord feature maps
+     all_h_coords = np.arange(0, img_height, 1)
+     all_w_coords = np.arange(0, img_width, 1)
+     curr_pxl_coord = np.array(np.meshgrid(all_h_coords, all_w_coords, indexing='ij'))
+     coord_feat_tensor = np.concatenate([curr_pxl_coord[1:2, :, :], curr_pxl_coord[:1, :, :]])
+     coord_feat_tensor = torch.from_numpy(coord_feat_tensor).type(torch.float)
+ 
+     return spixel_id_tensor, coord_feat_tensor
+ 
+ 
+ def split_spixels(assign_map, spixel_ids):
+     N,C,H,W = assign_map.shape
+     spixel_id_map = spixel_ids.expand(N,-1,-1,-1)
+     assig_max,_ = torch.max(assign_map, dim=1, keepdim=True)
+     assignment_ = torch.where(assign_map == assig_max, torch.ones(assign_map.shape).cuda(), torch.zeros(assign_map.shape).cuda())
+     ## winner take all
+     new_spixl_map_ = spixel_id_map * assignment_
+     new_spixl_map = torch.sum(new_spixl_map_,dim=1,keepdim=True).type(torch.int)
+     return new_spixl_map
+ 
+ 
+ def poolfeat(input, prob, sp_h=2, sp_w=2, need_entry_prob=False):
+     def feat_prob_sum(feat_sum, prob_sum, shift_feat):
+         feat_sum += shift_feat[:, :-1, :, :]
+         prob_sum += shift_feat[:, -1:, :, :]
+         return feat_sum, prob_sum
+ 
+     b, _, h, w = input.shape
+     h_shift_unit = 1
+     w_shift_unit = 1
+     p2d = (w_shift_unit, w_shift_unit, h_shift_unit, h_shift_unit)
+     feat_ = torch.cat([input, torch.ones([b, 1, h, w], device=input.device)], dim=1)  # b* (n+1) *h*w
+     prob_feat = F.avg_pool2d(feat_ * prob.narrow(1, 0, 1), kernel_size=(sp_h, sp_w), stride=(sp_h, sp_w))  # b * (n+1) * h* w
+     send_to_top_left = F.pad(prob_feat, p2d, mode='constant', value=0)[:, :, 2 * h_shift_unit:, 2 * w_shift_unit:]
+     feat_sum = send_to_top_left[:, :-1, :, :].clone()
+     prob_sum = send_to_top_left[:, -1:, :, :].clone()
+ 
+     prob_feat = F.avg_pool2d(feat_ * prob.narrow(1, 1, 1), kernel_size=(sp_h, sp_w), stride=(sp_h, sp_w))
+     top = F.pad(prob_feat, p2d, mode='constant', value=0)[:, :, 2 * h_shift_unit:, w_shift_unit:-w_shift_unit]
+     feat_sum, prob_sum = feat_prob_sum(feat_sum, prob_sum, top)
+ 
+     prob_feat = F.avg_pool2d(feat_ * prob.narrow(1, 2, 1), kernel_size=(sp_h, sp_w), stride=(sp_h, sp_w))
+     top_right = F.pad(prob_feat, p2d, mode='constant', value=0)[:, :, 2 * h_shift_unit:, :-2 * w_shift_unit]
+     feat_sum, prob_sum = feat_prob_sum(feat_sum, prob_sum, top_right)
+ 
+     prob_feat = F.avg_pool2d(feat_ * prob.narrow(1, 3, 1), kernel_size=(sp_h, sp_w), stride=(sp_h, sp_w))
+     left = F.pad(prob_feat, p2d, mode='constant', value=0)[:, :, h_shift_unit:-h_shift_unit, 2 * w_shift_unit:]
+     feat_sum, prob_sum = feat_prob_sum(feat_sum, prob_sum, left)
+ 
+     prob_feat = F.avg_pool2d(feat_ * prob.narrow(1, 4, 1), kernel_size=(sp_h, sp_w), stride=(sp_h, sp_w))
+     center = F.pad(prob_feat, p2d, mode='constant', value=0)[:, :, h_shift_unit:-h_shift_unit, w_shift_unit:-w_shift_unit]
+     feat_sum, prob_sum = feat_prob_sum(feat_sum, prob_sum, center)
+ 
+     prob_feat = F.avg_pool2d(feat_ * prob.narrow(1, 5, 1), kernel_size=(sp_h, sp_w), stride=(sp_h, sp_w))
+     right = F.pad(prob_feat, p2d, mode='constant', value=0)[:, :, h_shift_unit:-h_shift_unit, :-2 * w_shift_unit]
+     feat_sum, prob_sum = feat_prob_sum(feat_sum, prob_sum, right)
+ 
+     prob_feat = F.avg_pool2d(feat_ * prob.narrow(1, 6, 1), kernel_size=(sp_h, sp_w), stride=(sp_h, sp_w))
+     bottom_left = F.pad(prob_feat, p2d, mode='constant', value=0)[:, :, :-2 * h_shift_unit, 2 * w_shift_unit:]
+     feat_sum, prob_sum = feat_prob_sum(feat_sum, prob_sum, bottom_left)
+ 
+     prob_feat = F.avg_pool2d(feat_ * prob.narrow(1, 7, 1), kernel_size=(sp_h, sp_w), stride=(sp_h, sp_w))
+     bottom = F.pad(prob_feat, p2d, mode='constant', value=0)[:, :, :-2 * h_shift_unit, w_shift_unit:-w_shift_unit]
+     feat_sum, prob_sum = feat_prob_sum(feat_sum, prob_sum, bottom)
+ 
+     prob_feat = F.avg_pool2d(feat_ * prob.narrow(1, 8, 1), kernel_size=(sp_h, sp_w), stride=(sp_h, sp_w))
+     bottom_right = F.pad(prob_feat, p2d, mode='constant', value=0)[:, :, :-2 * h_shift_unit, :-2 * w_shift_unit]
+     feat_sum, prob_sum = feat_prob_sum(feat_sum, prob_sum, bottom_right)
+     pooled_feat = feat_sum / (prob_sum + 1e-8)
+     if need_entry_prob:
+         return pooled_feat, prob_sum
+     return pooled_feat
+ 
+ 
+ def get_spixel_size(affinity_map, sp_h=2, sp_w=2, elem_thres=25):
+     N,C,H,W = affinity_map.shape
+     device = affinity_map.device
+     assign_max,_ = torch.max(affinity_map, dim=1, keepdim=True)
+     assign_map = torch.where(affinity_map==assign_max, torch.ones(affinity_map.shape, device=device), torch.zeros(affinity_map.shape, device=device))
+     ## one_map = (N,1,H,W)
+     _, elem_num_maps = poolfeat(torch.ones(assign_max.shape, device=device), assign_map, sp_h, sp_w, True)
+     #all_one_map = torch.ones(elem_num_maps.shape).cuda()
+     #empty_mask = torch.where(elem_num_maps < elem_thres/256, all_one_map, 1-all_one_map)
+     return elem_num_maps
+ 
+ 
+ def upfeat(input, prob, up_h=2, up_w=2):
+     # input b*n*H*W downsampled
+     # prob b*9*h*w
+     b, c, h, w = input.shape
+ 
+     h_shift = 1
+     w_shift = 1
+ 
+     p2d = (w_shift, w_shift, h_shift, h_shift)
+     feat_pd = F.pad(input, p2d, mode='constant', value=0)
+ 
+     gt_frm_top_left = F.interpolate(feat_pd[:, :, :-2 * h_shift, :-2 * w_shift], size=(h * up_h, w * up_w), mode='nearest')
+     feat_sum = gt_frm_top_left * prob.narrow(1,0,1)
+ 
+     top = F.interpolate(feat_pd[:, :, :-2 * h_shift, w_shift:-w_shift], size=(h * up_h, w * up_w), mode='nearest')
+     feat_sum += top * prob.narrow(1, 1, 1)
+ 
+     top_right = F.interpolate(feat_pd[:, :, :-2 * h_shift, 2 * w_shift:], size=(h * up_h, w * up_w), mode='nearest')
+     feat_sum += top_right * prob.narrow(1,2,1)
+ 
+     left = F.interpolate(feat_pd[:, :, h_shift:-w_shift, :-2 * w_shift], size=(h * up_h, w * up_w), mode='nearest')
+     feat_sum += left * prob.narrow(1, 3, 1)
+ 
+     center = F.interpolate(input, (h * up_h, w * up_w), mode='nearest')
+     feat_sum += center * prob.narrow(1, 4, 1)
+ 
+     right = F.interpolate(feat_pd[:, :, h_shift:-w_shift, 2 * w_shift:], size=(h * up_h, w * up_w), mode='nearest')
+     feat_sum += right * prob.narrow(1, 5, 1)
+ 
+     bottom_left = F.interpolate(feat_pd[:, :, 2 * h_shift:, :-2 * w_shift], size=(h * up_h, w * up_w), mode='nearest')
+     feat_sum += bottom_left * prob.narrow(1, 6, 1)
+ 
+     bottom = F.interpolate(feat_pd[:, :, 2 * h_shift:, w_shift:-w_shift], size=(h * up_h, w * up_w), mode='nearest')
+     feat_sum += bottom * prob.narrow(1, 7, 1)
+ 
+     bottom_right = F.interpolate(feat_pd[:, :, 2 * h_shift:, 2 * w_shift:], size=(h * up_h, w * up_w), mode='nearest')
+     feat_sum += bottom_right * prob.narrow(1, 8, 1)
+ 
+     return feat_sum
+ 
+ 
+ def suck_and_spread(self, base_maps, seg_layers):
+     N,S,H,W = seg_layers.shape
+     base_maps = base_maps.unsqueeze(1)
+     seg_layers = seg_layers.unsqueeze(2)
+     ## (N,S,C,1,1) = (N,1,C,H,W) * (N,S,1,H,W)
+     mean_val_layers = (base_maps * seg_layers).sum(dim=(3,4), keepdim=True) / (1e-5 + seg_layers.sum(dim=(3,4), keepdim=True))
+     ## normalized to be sum one
+     weight_layers = seg_layers / (1e-5 + torch.sum(seg_layers, dim=1, keepdim=True))
+     ## (N,S,C,H,W) = (N,S,C,1,1) * (N,S,1,H,W)
+     recon_maps = mean_val_layers * weight_layers
+     return recon_maps.sum(dim=1)
+ 
+ 
+ #! copy from Richard Zhang [SIGGRAPH2017]
+ # RGB grid points map to Lab range: L[0,100], a[-86.183,98.233], b[-107.857,94.478]
+ #------------------------------------------------------------------------------
+ def rgb2xyz(rgb):  # rgb from [0,1]
+     # xyz_from_rgb = np.array([[0.412453, 0.357580, 0.180423],
+     #                          [0.212671, 0.715160, 0.072169],
+     #                          [0.019334, 0.119193, 0.950227]])
+     mask = (rgb > .04045).type(torch.FloatTensor)
+     if(rgb.is_cuda):
+         mask = mask.cuda()
+     rgb = (((rgb+.055)/1.055)**2.4)*mask + rgb/12.92*(1-mask)
+     x = .412453*rgb[:,0,:,:]+.357580*rgb[:,1,:,:]+.180423*rgb[:,2,:,:]
+     y = .212671*rgb[:,0,:,:]+.715160*rgb[:,1,:,:]+.072169*rgb[:,2,:,:]
+     z = .019334*rgb[:,0,:,:]+.119193*rgb[:,1,:,:]+.950227*rgb[:,2,:,:]
+     out = torch.cat((x[:,None,:,:],y[:,None,:,:],z[:,None,:,:]),dim=1)
+     return out
+ 
+ def xyz2rgb(xyz):
+     # array([[ 3.24048134, -1.53715152, -0.49853633],
+     #        [-0.96925495,  1.87599   ,  0.04155593],
+     #        [ 0.05564664, -0.20404134,  1.05731107]])
+     r = 3.24048134*xyz[:,0,:,:]-1.53715152*xyz[:,1,:,:]-0.49853633*xyz[:,2,:,:]
+     g = -0.96925495*xyz[:,0,:,:]+1.87599*xyz[:,1,:,:]+.04155593*xyz[:,2,:,:]
+     b = .05564664*xyz[:,0,:,:]-.20404134*xyz[:,1,:,:]+1.05731107*xyz[:,2,:,:]
+     rgb = torch.cat((r[:,None,:,:],g[:,None,:,:],b[:,None,:,:]),dim=1)
+     #! sometimes reaches a small negative number, which causes NaNs
+     rgb = torch.max(rgb,torch.zeros_like(rgb))
+     mask = (rgb > .0031308).type(torch.FloatTensor)
+     if(rgb.is_cuda):
+         mask = mask.cuda()
+     rgb = (1.055*(rgb**(1./2.4)) - 0.055)*mask + 12.92*rgb*(1-mask)
+     return rgb
+ 
+ def xyz2lab(xyz):
+     # 0.95047, 1., 1.08883 # white
+     sc = torch.Tensor((0.95047, 1., 1.08883))[None,:,None,None]
+     if(xyz.is_cuda):
+         sc = sc.cuda()
+     xyz_scale = xyz/sc
+     mask = (xyz_scale > .008856).type(torch.FloatTensor)
+     if(xyz_scale.is_cuda):
+         mask = mask.cuda()
+     xyz_int = xyz_scale**(1/3.)*mask + (7.787*xyz_scale + 16./116.)*(1-mask)
+     L = 116.*xyz_int[:,1,:,:]-16.
+     a = 500.*(xyz_int[:,0,:,:]-xyz_int[:,1,:,:])
+     b = 200.*(xyz_int[:,1,:,:]-xyz_int[:,2,:,:])
+     out = torch.cat((L[:,None,:,:],a[:,None,:,:],b[:,None,:,:]),dim=1)
+     return out
+ 
+ def lab2xyz(lab):
+     y_int = (lab[:,0,:,:]+16.)/116.
+     x_int = (lab[:,1,:,:]/500.) + y_int
+     z_int = y_int - (lab[:,2,:,:]/200.)
+     if(z_int.is_cuda):
+         z_int = torch.max(torch.Tensor((0,)).cuda(), z_int)
+     else:
+         z_int = torch.max(torch.Tensor((0,)), z_int)
+     out = torch.cat((x_int[:,None,:,:],y_int[:,None,:,:],z_int[:,None,:,:]),dim=1)
+     mask = (out > .2068966).type(torch.FloatTensor)
+     if(out.is_cuda):
+         mask = mask.cuda()
+     out = (out**3.)*mask + (out - 16./116.)/7.787*(1-mask)
+     sc = torch.Tensor((0.95047, 1., 1.08883))[None,:,None,None]
+     sc = sc.to(out.device)
+     out = out*sc
+     return out
+ 
+ def rgb2lab(rgb, l_mean=50, l_norm=50, ab_norm=110):
+     #! input rgb: [0,1]
+     #! output lab: [-1,1]
+     lab = xyz2lab(rgb2xyz(rgb))
+     l_rs = (lab[:,[0],:,:]-l_mean) / l_norm
+     ab_rs = lab[:,1:,:,:] / ab_norm
+     out = torch.cat((l_rs,ab_rs),dim=1)
+     return out
+ 
+ def lab2rgb(lab_rs, l_mean=50, l_norm=50, ab_norm=110):
+     #! input lab: [-1,1]
+     #! output rgb: [0,1]
+     l_ = lab_rs[:,[0],:,:] * l_norm + l_mean
+     ab = lab_rs[:,1:,:,:] * ab_norm
+     lab = torch.cat((l_,ab), dim=1)
+     out = xyz2rgb(lab2xyz(lab))
+     return out
+ 
+ 
+ if __name__ == '__main__':
+     minL, minA, minB = 999., 999., 999.
+     maxL, maxA, maxB = 0., 0., 0.
+     for r in range(256):
+         print('h', r)
+         for g in range(256):
+             for b in range(256):
+                 rgb = np.array([r,g,b], np.float32).reshape(1,1,-1) / 255.0
+                 #lab_img = cv2.cvtColor(rgb, cv2.COLOR_RGB2LAB)
+                 rgb = torch.from_numpy(rgb.transpose((2, 0, 1)))
+                 rgb = rgb.reshape(1,3,1,1)
+                 lab = rgb2lab(rgb)
+                 lab[:,[0],:,:] = lab[:,[0],:,:] * 50 + 50
+                 lab[:,1:,:,:] = lab[:,1:,:,:] * 110
+                 lab = lab.squeeze()
+                 lab_float = lab.numpy()
+                 #print('zhang vs. cv2:', lab_float, lab_img.squeeze())
+                 minL = min(lab_float[0], minL)
+                 minA = min(lab_float[1], minA)
+                 minB = min(lab_float[2], minB)
+                 maxL = max(lab_float[0], maxL)
+                 maxA = max(lab_float[1], maxA)
+                 maxB = max(lab_float[2], maxB)
+     print('L:', minL, maxL)
+     print('A:', minA, maxA)
+     print('B:', minB, maxB)
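`poolfeat` and `upfeat` above form a superpixel pooling/unpooling pair over a 9-channel association map (each pixel's soft assignment to the 3x3 neighborhood of candidate superpixels); `SPixelLoss` in models/loss.py uses exactly this round trip. A shape-level sketch with random inputs, assuming it is run from the repo root so `models` and `utils` are importable:

```python
# Shape-level sketch of the poolfeat/upfeat round trip used by SPixelLoss
# (random inputs; sp=16 matches the sp_size used in inference.py).
import torch
import torch.nn.functional as F
from models import basic

N, C, H, W, sp = 1, 5, 64, 64, 16
feat = torch.rand(N, C, H, W)
# 9-way soft assignment of every pixel to its neighboring superpixels.
prob = F.softmax(torch.rand(N, 9, H, W), dim=1)

pooled = basic.poolfeat(feat, prob, sp, sp)   # (1, 5, 4, 4): per-superpixel means
recon = basic.upfeat(pooled, prob, sp, sp)    # (1, 5, 64, 64): spread back to pixels
print(pooled.shape, recon.shape)
```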
models/clusterkit.py ADDED
@@ -0,0 +1,290 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+
5
+ from functools import partial
6
+ import numpy as np
7
+ import torch
8
+ from tqdm import tqdm
9
+ import math, random
10
+ #from sklearn.cluster import KMeans, kmeans_plusplus, MeanShift, estimate_bandwidth
11
+
12
+
13
+ def tensor_kmeans_sklearn(data_vecs, n_clusters=7, metric='euclidean', need_layer_masks=False, max_iters=20):
14
+ N,C,H,W = data_vecs.shape
15
+ assert N == 1, 'only support singe image tensor'
16
+ ## (1,C,H,W) -> (HW,C)
17
+ data_vecs = data_vecs.permute(0,2,3,1).view(-1,C)
18
+ ## convert tensor to array
19
+ data_vecs_np = data_vecs.squeeze().detach().to("cpu").numpy()
20
+ km = KMeans(n_clusters=n_clusters, init='k-means++', n_init=10, max_iter=300)
21
+ pred = km.fit_predict(data_vecs_np)
22
+ cluster_ids_x = torch.from_numpy(km.labels_).to(data_vecs.device)
23
+ id_maps = cluster_ids_x.reshape(1,1,H,W).long()
24
+ if need_layer_masks:
25
+ one_hot_labels = F.one_hot(id_maps.squeeze(1), num_classes=n_clusters).float()
26
+ cluster_mask = one_hot_labels.permute(0,3,1,2)
27
+ return cluster_mask
28
+ return id_maps
29
+
30
+
31
+ def tensor_kmeans_pytorch(data_vecs, n_clusters=7, metric='euclidean', need_layer_masks=False, max_iters=20):
32
+ N,C,H,W = data_vecs.shape
33
+ assert N == 1, 'only support singe image tensor'
34
+
35
+ ## (1,C,H,W) -> (HW,C)
36
+ data_vecs = data_vecs.permute(0,2,3,1).view(-1,C)
37
+ ## cosine | euclidean
38
+ #cluster_ids_x, cluster_centers = kmeans(X=data_vecs, num_clusters=n_clusters, distance=metric, device=data_vecs.device)
39
+ cluster_ids_x, cluster_centers = kmeans(X=data_vecs, num_clusters=n_clusters, distance=metric,\
40
+ tqdm_flag=False, iter_limit=max_iters, device=data_vecs.device)
41
+ id_maps = cluster_ids_x.reshape(1,1,H,W)
42
+ if need_layer_masks:
43
+ one_hot_labels = F.one_hot(id_maps.squeeze(1), num_classes=n_clusters).float()
44
+ cluster_mask = one_hot_labels.permute(0,3,1,2)
45
+ return cluster_mask
46
+ return id_maps
47
+
48
+
49
+ def batch_kmeans_pytorch(data_vecs, n_clusters=7, metric='euclidean', use_sklearn_kmeans=False):
50
+ N,C,H,W = data_vecs.shape
51
+ sample_list = []
52
+ for idx in range(N):
53
+ if use_sklearn_kmeans:
54
+ cluster_mask = tensor_kmeans_sklearn(data_vecs[idx:idx+1,:,:,:], n_clusters, metric, True)
55
+ else:
56
+ cluster_mask = tensor_kmeans_pytorch(data_vecs[idx:idx+1,:,:,:], n_clusters, metric, True)
57
+ sample_list.append(cluster_mask)
58
+ return torch.cat(sample_list, dim=0)
59
+
60
+
61
+ def get_centroid_candidates(data_vecs, n_clusters=7, metric='euclidean', max_iters=20):
62
+ N,C,H,W = data_vecs.shape
63
+ data_vecs = data_vecs.permute(0,2,3,1).view(-1,C)
64
+ cluster_ids_x, cluster_centers = kmeans(X=data_vecs, num_clusters=n_clusters, distance=metric,\
65
+ tqdm_flag=False, iter_limit=max_iters, device=data_vecs.device)
66
+ return cluster_centers
67
+
68
+
69
+ def find_distinctive_elements(data_tensor, n_clusters=7, topk=3, metric='euclidean'):
70
+ N,C,H,W = data_tensor.shape
71
+ centroid_list = []
72
+ for idx in range(N):
73
+ cluster_centers = get_centroid_candidates(data_tensor[idx:idx+1,:,:,:], n_clusters, metric)
74
+ centroid_list.append(cluster_centers)
75
+
76
+ batch_centroids = torch.stack(centroid_list, dim=0)
77
+ data_vecs = data_tensor.flatten(2)
78
+ ## distance matrix: (N,K,HW) = (N,K,C) x (N,C,HW)
79
+ AtB = torch.matmul(batch_centroids, data_vecs)
80
+ AtA = torch.matmul(batch_centroids, batch_centroids.permute(0,2,1))
81
+ BtB = torch.matmul(data_vecs.permute(0,2,1), data_vecs)
82
+ diag_A = torch.diagonal(AtA, dim1=-2, dim2=-1)
83
+ diag_B = torch.diagonal(BtB, dim1=-2, dim2=-1)
84
+ A2 = diag_A.unsqueeze(2).repeat(1,1,H*W)
85
+ B2 = diag_B.unsqueeze(1).repeat(1,n_clusters,1)
86
+ distance_map = A2 - 2*AtB + B2
87
+ values, indices = distance_map.topk(topk, dim=2, largest=False, sorted=True)
88
+ cluster_mask = torch.where(distance_map <= values[:,:,topk-1:], torch.ones_like(distance_map), torch.zeros_like(distance_map))
89
+ cluster_mask = cluster_mask.view(N,n_clusters,H,W)
90
+ return cluster_mask
91
+
92
+
93
+ ##---------------------------------------------------------------------------------
94
+ '''
95
+ resource from github: https://github.com/subhadarship/kmeans_pytorch
96
+ '''
97
+ ##---------------------------------------------------------------------------------
98
+
99
+ def initialize(X, num_clusters):
100
+ """
101
+ initialize cluster centers
102
+ :param X: (torch.tensor) matrix
103
+ :param num_clusters: (int) number of clusters
104
+ :return: (np.array) initial state
105
+ """
106
+ num_samples = len(X)
107
+ indices = np.random.choice(num_samples, num_clusters, replace=False)
108
+ initial_state = X[indices]
109
+ return initial_state
110
+
111
+
112
+ def kmeans(
113
+ X,
114
+ num_clusters,
115
+ distance='euclidean',
116
+ cluster_centers=[],
117
+ tol=1e-4,
118
+ tqdm_flag=True,
119
+ iter_limit=0,
120
+ device=torch.device('cpu'),
121
+ gamma_for_soft_dtw=0.001
122
+ ):
123
+ """
124
+ perform kmeans
125
+ :param X: (torch.tensor) matrix
126
+ :param num_clusters: (int) number of clusters
127
+ :param distance: (str) distance [options: 'euclidean', 'cosine'] [default: 'euclidean']
128
+ :param tol: (float) threshold [default: 0.0001]
129
+ :param device: (torch.device) device [default: cpu]
130
+ :param tqdm_flag: Allows to turn logs on and off
131
+ :param iter_limit: hard limit for max number of iterations
132
+ :param gamma_for_soft_dtw: approaches to (hard) DTW as gamma -> 0
133
+ :return: (torch.tensor, torch.tensor) cluster ids, cluster centers
134
+ """
135
+ if tqdm_flag:
136
+ print(f'running k-means on {device}..')
137
+
138
+ if distance == 'euclidean':
139
+ pairwise_distance_function = partial(pairwise_distance, device=device, tqdm_flag=tqdm_flag)
140
+ elif distance == 'cosine':
141
+ pairwise_distance_function = partial(pairwise_cosine, device=device)
142
+ else:
143
+ raise NotImplementedError
144
+
145
+ # convert to float
146
+ X = X.float()
147
+
148
+ # transfer to device
149
+ X = X.to(device)
150
+
151
+ # initialize
152
+ if type(cluster_centers) == list: # ToDo: make this less annoyingly weird
153
+ initial_state = initialize(X, num_clusters)
154
+ else:
155
+ if tqdm_flag:
156
+ print('resuming')
157
+ # find data point closest to the initial cluster center
158
+ initial_state = cluster_centers
159
+ dis = pairwise_distance_function(X, initial_state)
160
+ choice_points = torch.argmin(dis, dim=0)
161
+ initial_state = X[choice_points]
162
+ initial_state = initial_state.to(device)
163
+
164
+ iteration = 0
165
+ if tqdm_flag:
166
+ tqdm_meter = tqdm(desc='[running kmeans]')
167
+ while True:
168
+
169
+ dis = pairwise_distance_function(X, initial_state)
170
+
171
+         choice_cluster = torch.argmin(dis, dim=1)
+
+         initial_state_pre = initial_state.clone()
+
+         for index in range(num_clusters):
+             selected = torch.nonzero(choice_cluster == index).squeeze().to(device)
+
+             selected = torch.index_select(X, 0, selected)
+
+             # https://github.com/subhadarship/kmeans_pytorch/issues/16
+             if selected.shape[0] == 0:
+                 selected = X[torch.randint(len(X), (1,))]
+
+             initial_state[index] = selected.mean(dim=0)
+
+         center_shift = torch.sum(
+             torch.sqrt(
+                 torch.sum((initial_state - initial_state_pre) ** 2, dim=1)
+             ))
+
+         # increment iteration
+         iteration = iteration + 1
+
+         # update tqdm meter
+         if tqdm_flag:
+             tqdm_meter.set_postfix(
+                 iteration=f'{iteration}',
+                 center_shift=f'{center_shift ** 2:0.6f}',
+                 tol=f'{tol:0.6f}'
+             )
+             tqdm_meter.update()
+         if center_shift ** 2 < tol:
+             break
+         if iter_limit != 0 and iteration >= iter_limit:
+             break
+
+     return choice_cluster.to(device), initial_state.to(device)
+
+
+ def kmeans_predict(
+         X,
+         cluster_centers,
+         distance='euclidean',
+         device=torch.device('cpu'),
+         gamma_for_soft_dtw=0.001,
+         tqdm_flag=True
+ ):
+     """
+     predict using cluster centers
+     :param X: (torch.tensor) matrix
+     :param cluster_centers: (torch.tensor) cluster centers
+     :param distance: (str) distance [options: 'euclidean', 'cosine'] [default: 'euclidean']
+     :param device: (torch.device) device [default: 'cpu']
+     :param gamma_for_soft_dtw: approaches to (hard) DTW as gamma -> 0
+     :return: (torch.tensor) cluster ids
+     """
+     if tqdm_flag:
+         print(f'predicting on {device}..')
+
+     if distance == 'euclidean':
+         pairwise_distance_function = partial(pairwise_distance, device=device, tqdm_flag=tqdm_flag)
+     elif distance == 'cosine':
+         pairwise_distance_function = partial(pairwise_cosine, device=device)
+     elif distance == 'soft_dtw':
+         sdtw = SoftDTW(use_cuda=device.type == 'cuda', gamma=gamma_for_soft_dtw)
+         pairwise_distance_function = partial(pairwise_soft_dtw, sdtw=sdtw, device=device)
+     else:
+         raise NotImplementedError
+
+     # convert to float
+     X = X.float()
+
+     # transfer to device
+     X = X.to(device)
+
+     dis = pairwise_distance_function(X, cluster_centers)
+     choice_cluster = torch.argmin(dis, dim=1)
+
+     return choice_cluster.cpu()
+
+
+ def pairwise_distance(data1, data2, device=torch.device('cpu'), tqdm_flag=True):
+     if tqdm_flag:
+         print(f'device is :{device}')
+
+     # transfer to device
+     data1, data2 = data1.to(device), data2.to(device)
+
+     # N1*1*M
+     A = data1.unsqueeze(dim=1)
+
+     # 1*N2*M
+     B = data2.unsqueeze(dim=0)
+
+     dis = (A - B) ** 2.0
+     # sum over the feature axis -> N1*N2 matrix of pairwise squared distances
+     dis = dis.sum(dim=-1).squeeze()
+     return dis
+
+
+ def pairwise_cosine(data1, data2, device=torch.device('cpu')):
+     # transfer to device
+     data1, data2 = data1.to(device), data2.to(device)
+
+     # N1*1*M
+     A = data1.unsqueeze(dim=1)
+
+     # 1*N2*M
+     B = data2.unsqueeze(dim=0)
+
+     # normalize the points | [0.3, 0.4] -> [0.3/sqrt(0.09 + 0.16), 0.4/sqrt(0.09 + 0.16)] = [0.3/0.5, 0.4/0.5]
+     A_normalized = A / A.norm(dim=-1, keepdim=True)
+     B_normalized = B / B.norm(dim=-1, keepdim=True)
+
+     cosine = A_normalized * B_normalized
+
+     # sum over the feature axis -> N1*N2 matrix of pairwise cosine distances
+     cosine_dis = 1 - cosine.sum(dim=-1).squeeze()
+     return cosine_dis
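
Note: a minimal usage sketch of the clustering helpers above (illustration only, not part of the committed diff). It assumes the repo root is on PYTHONPATH, and that the `kmeans` fitting routine whose loop ends above follows the upstream kmeans_pytorch signature `kmeans(X, num_clusters, distance, device)` — an assumption, since the function head sits outside this excerpt:

    import torch
    from models import clusterkit

    feats = torch.randn(500, 64)               # 500 feature vectors of dim 64
    # fit: per-point cluster ids plus the fitted centers (assumed upstream API)
    ids, centers = clusterkit.kmeans(X=feats, num_clusters=8,
                                     distance='euclidean', device=torch.device('cpu'))
    # assign unseen points to the fitted centers
    new_ids = clusterkit.kmeans_predict(torch.randn(10, 64), centers,
                                        distance='euclidean', device=torch.device('cpu'))
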
models/loss.py ADDED
@@ -0,0 +1,222 @@
+ from __future__ import division
+ import os, glob, shutil, math, random, json
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import torchvision
+ from models import basic   # import as a package module, matching models/model.py
+ from utils import util
+
+ eps = 0.0000001
+
+ class SPixelLoss:
+     def __init__(self, psize=8, mpdist=False, gpu_no=0):
+         self.mpdist = mpdist
+         self.gpu_no = gpu_no
+         self.sp_size = psize
+
+     def __call__(self, data, epoch_no):
+         kernel_size = self.sp_size
+         #pos_weight = 0.003
+         prob = data['pred_prob']
+         labxy_feat = data['target_feat']
+         N,C,H,W = labxy_feat.shape
+         pooled_labxy = basic.poolfeat(labxy_feat, prob, kernel_size, kernel_size)
+         reconstr_feat = basic.upfeat(pooled_labxy, prob, kernel_size, kernel_size)
+         loss_map = reconstr_feat[:,:,:,:] - labxy_feat[:,:,:,:]
+         featLoss_idx = torch.norm(loss_map[:,:-2,:,:], p=2, dim=1).mean()
+         posLoss_idx = torch.norm(loss_map[:,-2:,:,:], p=2, dim=1).mean() / kernel_size
+         totalLoss_idx = 10*featLoss_idx + 0.003*posLoss_idx
+         return {'totalLoss':totalLoss_idx, 'featLoss':featLoss_idx, 'posLoss':posLoss_idx}
+
+
+ class AnchorColorProbLoss:
+     def __init__(self, hint2regress=False, enhanced=False, with_grad=False, mpdist=False, gpu_no=0):
+         self.mpdist = mpdist
+         self.gpu_no = gpu_no
+         self.hint2regress = hint2regress
+         self.enhanced = enhanced
+         self.with_grad = with_grad
+         self.rebalance_gradient = basic.RebalanceLoss.apply
+         self.entropy_loss = nn.CrossEntropyLoss(ignore_index=-1)
+         if self.enhanced:
+             self.VGGLoss = VGG19Loss(gpu_no=gpu_no, is_ddp=mpdist)
+
+     def _perceptual_loss(self, input_grays, input_colors, pred_colors):
+         input_RGBs = basic.lab2rgb(torch.cat([input_grays,input_colors], dim=1))
+         pred_RGBs = basic.lab2rgb(torch.cat([input_grays,pred_colors], dim=1))
+         ## the output of "lab2rgb" just matches the input of "VGGLoss": [0,1]
+         return self.VGGLoss(input_RGBs, pred_RGBs)
+
+     def _laplace_gradient(self, pred_AB, target_AB):
+         N,C,H,W = pred_AB.shape
+         kernel = torch.tensor([[1, 1, 1], [1, -8, 1], [1, 1, 1]], device=pred_AB.get_device()).float()
+         kernel = kernel.view(1, 1, *kernel.size()).repeat(C,1,1,1)
+         grad_pred = F.conv2d(pred_AB, kernel, groups=C)
+         grad_trg = F.conv2d(target_AB, kernel, groups=C)
+         return l1_loss(grad_trg, grad_pred)
+
+     def __call__(self, data, epoch_no):
+         N,C,H,W = data['target_label'].shape
+         pal_probs = self.rebalance_gradient(data['pal_prob'], data['class_weight'])
+         #ref_probs = data['ref_prob']
+         pal_probs = pal_probs.permute(0,2,3,1).contiguous().view(N*H*W, -1)
+         gt_labels = data['target_label'].permute(0,2,3,1).contiguous().view(N*H*W, -1)
+         '''
+         ignored_mask = data['empty_entries'].permute(0,2,3,1).contiguous().view(N*H*W, -1)
+         gt_labels[ignored_mask] = -1
+         gt_labels = gt_probs.squeeze()
+         '''
+         palLoss_idx = self.entropy_loss(pal_probs, gt_labels.squeeze(dim=1))
+         if self.hint2regress:
+             ref_probs = data['ref_prob']
+             refLoss_idx = 50 * l2_loss(data['spix_color'], ref_probs)
+         else:
+             ref_probs = self.rebalance_gradient(data['ref_prob'], data['class_weight'])
+             ref_probs = ref_probs.permute(0,2,3,1).contiguous().view(N*H*W, -1)
+             refLoss_idx = self.entropy_loss(ref_probs, gt_labels.squeeze(dim=1))
+         reconLoss_idx = torch.zeros_like(palLoss_idx)
+         if self.enhanced:
+             scalar = 1.0 if self.hint2regress else 5.0
+             reconLoss_idx = scalar * self._perceptual_loss(data['input_gray'], data['pred_color'], data['input_color'])
+             if self.with_grad:
+                 gradient_loss = self._laplace_gradient(data['pred_color'], data['input_color'])
+                 reconLoss_idx += gradient_loss
+         totalLoss_idx = palLoss_idx + refLoss_idx + reconLoss_idx
+         #print("loss terms:", palLoss_idx.item(), refLoss_idx.item(), reconLoss_idx.item())
+         return {'totalLoss':totalLoss_idx, 'palLoss':palLoss_idx, 'refLoss':refLoss_idx, 'recLoss':reconLoss_idx}
+
+
+ def compute_affinity_pos_loss(prob_in, labxy_feat, pos_weight=0.003, kernel_size=16):
+     S = kernel_size
+     m = pos_weight
+     prob = prob_in.clone()
+     N,C,H,W = labxy_feat.shape
+     pooled_labxy = basic.poolfeat(labxy_feat, prob, kernel_size, kernel_size)
+     reconstr_feat = basic.upfeat(pooled_labxy, prob, kernel_size, kernel_size)
+     loss_map = reconstr_feat[:,:,:,:] - labxy_feat[:,:,:,:]
+     loss_feat = torch.norm(loss_map[:,:-2,:,:], p=2, dim=1).mean()
+     loss_pos = torch.norm(loss_map[:,-2:,:,:], p=2, dim=1).mean() * m / S
+     loss_affinity = loss_feat + loss_pos
+     return loss_affinity
+
+
+ def l2_loss(y_input, y_target, weight_map=None):
+     if weight_map is None:
+         return F.mse_loss(y_input, y_target)
+     else:
+         diff_map = torch.mean(torch.abs(y_input-y_target), dim=1, keepdim=True)
+         batch_dev = torch.sum(diff_map*diff_map*weight_map, dim=(1,2,3)) / (eps+torch.sum(weight_map, dim=(1,2,3)))
+         return batch_dev.mean()
+
+
+ def l1_loss(y_input, y_target, weight_map=None):
+     if weight_map is None:
+         return F.l1_loss(y_input, y_target)
+     else:
+         diff_map = torch.mean(torch.abs(y_input-y_target), dim=1, keepdim=True)
+         batch_dev = torch.sum(diff_map*weight_map, dim=(1,2,3)) / (eps+torch.sum(weight_map, dim=(1,2,3)))
+         return batch_dev.mean()
+
+
+ def masked_l1_loss(y_input, y_target, outlier_mask):
+     one = torch.tensor([1.0]).cuda(y_input.get_device())
+     weight_map = torch.where(outlier_mask, one * 0.0, one * 1.0)
+     return l1_loss(y_input, y_target, weight_map)
+
+
+ def huber_loss(y_input, y_target, delta=0.01):
+     mask = torch.zeros_like(y_input)
+     mann = torch.abs(y_input - y_target)
+     eucl = 0.5 * (mann**2)
+     mask[...] = mann < delta
+     loss = eucl * mask / delta + (mann - 0.5 * delta) * (1 - mask)
+     return torch.mean(loss)
+
+
+ ## Perceptual loss that uses a pretrained VGG network
+ class VGG19Loss(nn.Module):
+     def __init__(self, feat_type='liu', gpu_no=0, is_ddp=False, requires_grad=False):
+         super(VGG19Loss, self).__init__()
+         os.environ['TORCH_HOME'] = '/apdcephfs/share_1290939/richardxia/Saved/Checkpoints/VGG19'
+         ## data requirement: (N,C,H,W) in RGB format, [0,1] range, and resolution >= 224x224
+         self.mean = [0.485, 0.456, 0.406]
+         self.std = [0.229, 0.224, 0.225]
+         self.feat_type = feat_type
+
+         vgg_model = torchvision.models.vgg19(pretrained=True)
+         ## AssertionError: DistributedDataParallel is not needed when a module doesn't have any parameter that requires a gradient
+         '''
+         if is_ddp:
+             vgg_model = vgg_model.cuda(gpu_no)
+             vgg_model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(vgg_model)
+             vgg_model = torch.nn.parallel.DistributedDataParallel(vgg_model, device_ids=[gpu_no], find_unused_parameters=True)
+         else:
+             vgg_model = vgg_model.cuda(gpu_no)
+         '''
+         vgg_model = vgg_model.cuda(gpu_no)
+         if self.feat_type == 'liu':
+             ## conv1_1, conv2_1, conv3_1, conv4_1, conv5_1
+             self.slice1 = nn.Sequential(*list(vgg_model.features)[:2]).eval()
+             self.slice2 = nn.Sequential(*list(vgg_model.features)[2:7]).eval()
+             self.slice3 = nn.Sequential(*list(vgg_model.features)[7:12]).eval()
+             self.slice4 = nn.Sequential(*list(vgg_model.features)[12:21]).eval()
+             self.slice5 = nn.Sequential(*list(vgg_model.features)[21:30]).eval()
+             self.weights = [1.0/32, 1.0/16, 1.0/8, 1.0/4, 1.0]
+         elif self.feat_type == 'lei':
+             ## conv1_2, conv2_2, conv3_2, conv4_2, conv5_2
+             self.slice1 = nn.Sequential(*list(vgg_model.features)[:4]).eval()
+             self.slice2 = nn.Sequential(*list(vgg_model.features)[4:9]).eval()
+             self.slice3 = nn.Sequential(*list(vgg_model.features)[9:14]).eval()
+             self.slice4 = nn.Sequential(*list(vgg_model.features)[14:23]).eval()
+             self.slice5 = nn.Sequential(*list(vgg_model.features)[23:32]).eval()
+             self.weights = [1.0/2.6, 1.0/4.8, 1.0/3.7, 1.0/5.6, 10.0/1.5]
+         else:
+             ## maxpool after conv4_4
+             self.featureExactor = nn.Sequential(*list(vgg_model.features)[:28]).eval()
+         '''
+         for x in range(2):
+             self.slice1.add_module(str(x), pretrained_features[x])
+         for x in range(2, 7):
+             self.slice2.add_module(str(x), pretrained_features[x])
+         for x in range(7, 12):
+             self.slice3.add_module(str(x), pretrained_features[x])
+         for x in range(12, 21):
+             self.slice4.add_module(str(x), pretrained_features[x])
+         for x in range(21, 30):
+             self.slice5.add_module(str(x), pretrained_features[x])
+         '''
+         self.criterion = nn.L1Loss()
+
+         ## fixed parameters
+         if not requires_grad:
+             for param in self.parameters():
+                 param.requires_grad = False
+             self.eval()
+         print('[*] VGG19Loss init!')
+
+     def normalize(self, tensor):
+         tensor = tensor.clone()
+         mean = torch.as_tensor(self.mean, dtype=torch.float32, device=tensor.device)
+         std = torch.as_tensor(self.std, dtype=torch.float32, device=tensor.device)
+         tensor.sub_(mean[None, :, None, None]).div_(std[None, :, None, None])
+         return tensor
+
+     def forward(self, x, y):
+         norm_x, norm_y = self.normalize(x), self.normalize(y)
+         ## feature extract
+         if self.feat_type == 'liu' or self.feat_type == 'lei':
+             x_relu1, y_relu1 = self.slice1(norm_x), self.slice1(norm_y)
+             x_relu2, y_relu2 = self.slice2(x_relu1), self.slice2(y_relu1)
+             x_relu3, y_relu3 = self.slice3(x_relu2), self.slice3(y_relu2)
+             x_relu4, y_relu4 = self.slice4(x_relu3), self.slice4(y_relu3)
+             x_relu5, y_relu5 = self.slice5(x_relu4), self.slice5(y_relu4)
+             x_vgg = [x_relu1, x_relu2, x_relu3, x_relu4, x_relu5]
+             y_vgg = [y_relu1, y_relu2, y_relu3, y_relu4, y_relu5]
+             loss = 0
+             for i in range(len(x_vgg)):
+                 loss += self.weights[i] * self.criterion(x_vgg[i], y_vgg[i].detach())
+         else:
+             x_vgg, y_vgg = self.featureExactor(norm_x), self.featureExactor(norm_y)
+             loss = self.criterion(x_vgg, y_vgg.detach())
+         return loss
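
Note: a minimal sketch of the standalone VGG19Loss (illustration only). It needs a CUDA device and downloads the torchvision VGG19 weights on first use — mind the hard-coded TORCH_HOME above, which you may need to adjust. Inputs must be RGB in [0,1], per the comment in __init__:

    import torch
    from models.loss import VGG19Loss

    vgg_loss = VGG19Loss(feat_type='liu', gpu_no=0)
    pred = torch.rand(1, 3, 256, 256).cuda()     # RGB in [0,1]
    target = torch.rand(1, 3, 256, 256).cuda()
    loss = vgg_loss(pred, target)                # scalar perceptual distance
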
models/model.py ADDED
@@ -0,0 +1,196 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from models.network import HourGlass2, SpixelNet, ColorProbNet
+ from models.transformer2d import EncoderLayer, DecoderLayer, TransformerEncoder, TransformerDecoder
+ from models.position_encoding import build_position_encoding
+ from models import basic, clusterkit, anchor_gen
+ from collections import OrderedDict
+ from utils import util, cielab
+
+
+ class SpixelSeg(nn.Module):
+     def __init__(self, inChannel=1, outChannel=9, batchNorm=True):
+         super(SpixelSeg, self).__init__()
+         self.net = SpixelNet(inChannel=inChannel, outChannel=outChannel, batchNorm=batchNorm)
+
+     def get_trainable_params(self, lr=1.0):
+         #print('=> [optimizer] finetune backbone with smaller lr')
+         params = []
+         for name, param in self.named_parameters():
+             if 'xxx' in name:
+                 params.append({'params': param, 'lr': lr})
+             else:
+                 params.append({'params': param})
+         return params
+
+     def forward(self, input_grays):
+         pred_probs = self.net(input_grays)
+         return pred_probs
+
+
+ class AnchorColorProb(nn.Module):
+     def __init__(self, inChannel=1, outChannel=313, sp_size=16, d_model=64, use_dense_pos=True, spix_pos=False, learning_pos=False, \
+                  random_hint=False, hint2regress=False, enhanced=False, use_mask=False, rank=0, colorLabeler=None):
+         super(AnchorColorProb, self).__init__()
+         self.sp_size = sp_size
+         self.spix_pos = spix_pos
+         self.use_token_mask = use_mask
+         self.hint2regress = hint2regress
+         self.segnet = SpixelSeg(inChannel=1, outChannel=9, batchNorm=True)
+         self.repnet = ColorProbNet(inChannel=inChannel, outChannel=64)
+         self.enhanced = enhanced
+         if self.enhanced:
+             self.enhanceNet = HourGlass2(inChannel=64+1, outChannel=2, resNum=3, normLayer=nn.BatchNorm2d)
+
+         ## transformer architecture
+         self.n_vocab = 313
+         d_model, dim_feedforward, nhead = d_model, 4*d_model, 8
+         dropout, activation = 0.1, "relu"
+         n_enc_layers, n_dec_layers = 6, 6
+         enc_layer = EncoderLayer(d_model, nhead, dim_feedforward, dropout, activation, use_dense_pos)
+         self.wildpath = TransformerEncoder(enc_layer, n_enc_layers, use_dense_pos)
+         self.hintpath = TransformerEncoder(enc_layer, n_enc_layers, use_dense_pos)
+         if self.spix_pos:
+             n_pos_x, n_pos_y = 256, 256
+         else:
+             n_pos_x, n_pos_y = 256//sp_size, 16//sp_size
+         self.pos_enc = build_position_encoding(d_model//2, n_pos_x, n_pos_y, is_learned=False)
+
+         self.mid_word_prj = nn.Linear(d_model, self.n_vocab, bias=False)
+         if self.hint2regress:
+             self.trg_word_emb = nn.Linear(d_model+2+1, d_model, bias=False)
+             self.trg_word_prj = nn.Linear(d_model, 2, bias=False)
+         else:
+             self.trg_word_emb = nn.Linear(d_model+self.n_vocab+1, d_model, bias=False)
+             self.trg_word_prj = nn.Linear(d_model, self.n_vocab, bias=False)
+
+         self.colorLabeler = colorLabeler
+         anchor_mode = 'random' if random_hint else 'clustering'
+         self.anchorGen = anchor_gen.AnchorAnalysis(mode=anchor_mode, colorLabeler=self.colorLabeler)
+         self._reset_parameters()
+
+     def _reset_parameters(self):
+         for p in self.parameters():
+             if p.dim() > 1:
+                 nn.init.xavier_uniform_(p)
+
+     def load_and_froze_weight(self, checkpt_path):
+         data_dict = torch.load(checkpt_path, map_location=torch.device('cpu'))
+         '''
+         for param_tensor in data_dict['state_dict']:
+             print(param_tensor,'\t',data_dict['state_dict'][param_tensor].size())
+         '''
+         self.segnet.load_state_dict(data_dict['state_dict'])
+         for name, param in self.segnet.named_parameters():
+             param.requires_grad = False
+         self.segnet.eval()
+
+     def set_train(self):
+         ## running mode only affects certain modules, e.g. Dropout, BN, etc.
+         self.repnet.train()
+         self.wildpath.train()
+         self.hintpath.train()
+         if self.enhanced:
+             self.enhanceNet.train()
+
+     def get_entry_mask(self, mask_tensor):
+         if mask_tensor is None:
+             return None
+         ## flatten (N,1,H,W) to (N,HW)
+         return mask_tensor.flatten(1)
+
+     def forward(self, input_grays, input_colors, n_anchors=8, sampled_T=0):
+         '''
+         Notice: this function was customized for inference only
+         '''
+         affinity_map = self.segnet(input_grays)
+         pred_feats = self.repnet(input_grays)
+         if self.spix_pos:
+             full_pos_feats = self.pos_enc(pred_feats)
+             proxy_feats = torch.cat([pred_feats, input_colors, full_pos_feats], dim=1)
+             pooled_proxy_feats, conf_sum = basic.poolfeat(proxy_feats, affinity_map, self.sp_size, self.sp_size, True)
+             feat_tokens = pooled_proxy_feats[:,:64,:,:]
+             spix_colors = pooled_proxy_feats[:,64:66,:,:]
+             pos_feats = pooled_proxy_feats[:,66:,:,:]
+         else:
+             proxy_feats = torch.cat([pred_feats, input_colors], dim=1)
+             pooled_proxy_feats, conf_sum = basic.poolfeat(proxy_feats, affinity_map, self.sp_size, self.sp_size, True)
+             feat_tokens = pooled_proxy_feats[:,:64,:,:]
+             spix_colors = pooled_proxy_feats[:,64:,:,:]
+             pos_feats = self.pos_enc(feat_tokens)
+
+         token_labels = torch.max(self.colorLabeler.encode_ab2ind(spix_colors), dim=1, keepdim=True)[1]
+         spixel_sizes = basic.get_spixel_size(affinity_map, self.sp_size, self.sp_size)
+         all_one_map = torch.ones(spixel_sizes.shape, device=input_grays.device)
+         empty_entries = torch.where(spixel_sizes < 25/(self.sp_size**2), all_one_map, 1-all_one_map)
+         src_pad_mask = self.get_entry_mask(empty_entries) if self.use_token_mask else None
+         trg_pad_mask = src_pad_mask
+
+         ## parallel prob
+         N,C,H,W = feat_tokens.shape
+         ## (N,C,H,W) -> (HW,N,C)
+         src_pos_seq = pos_feats.flatten(2).permute(2, 0, 1)
+         src_seq = feat_tokens.flatten(2).permute(2, 0, 1)
+         ## color prob branch
+         enc_out, _ = self.wildpath(src_seq, src_pos_seq, src_pad_mask)
+         pal_logit = self.mid_word_prj(enc_out)
+         pal_logit = pal_logit.permute(1, 2, 0).view(N,self.n_vocab,H,W)
+
+         ## seed prob branch
+         ## mask(N,1,H,W): sample anchors at clustering layers
+         color_feat = enc_out.permute(1, 2, 0).view(N,C,H,W)
+         hint_mask, cluster_mask = self.anchorGen(color_feat, n_anchors, spixel_sizes, use_sklearn_kmeans=False)
+         pred_prob = torch.softmax(pal_logit, dim=1)
+         color_feat2 = src_seq.permute(1, 2, 0).view(N,C,H,W)
+         #pred_prob, adj_matrix = self.anchorGen._detect_correlation(color_feat, pred_prob, hint_mask, thres=0.1)
+         if sampled_T < 0:
+             ## GT anchor colors
+             sampled_spix_colors = spix_colors
+         elif sampled_T > 0:
+             top1_spix_colors = self.anchorGen._sample_anchor_colors(pred_prob, hint_mask, T=0)
+             top2_spix_colors = self.anchorGen._sample_anchor_colors(pred_prob, hint_mask, T=1)
+             top3_spix_colors = self.anchorGen._sample_anchor_colors(pred_prob, hint_mask, T=2)
+             ## duplicate meta tensors
+             sampled_spix_colors = torch.cat((top1_spix_colors,top2_spix_colors,top3_spix_colors), dim=0)
+             N = 3*N
+             input_grays = input_grays.expand(N,-1,-1,-1)
+             hint_mask = hint_mask.expand(N,-1,-1,-1)
+             affinity_map = affinity_map.expand(N,-1,-1,-1)
+             src_seq = src_seq.expand(-1, N,-1)
+             src_pos_seq = src_pos_seq.expand(-1, N,-1)
+         else:
+             sampled_spix_colors = self.anchorGen._sample_anchor_colors(pred_prob, hint_mask, T=sampled_T)
+         ## debug: controllable
+         if False:
+             hint_mask, sampled_spix_colors = basic.io_user_control(hint_mask, spix_colors, output=False)
+
+         sampled_token_labels = torch.max(self.colorLabeler.encode_ab2ind(sampled_spix_colors), dim=1, keepdim=True)[1]
+
+         ## hint based prediction
+         ## (N,C,H,W) -> (HW,N,C)
+         mask_seq = hint_mask.flatten(2).permute(2, 0, 1)
+         if self.hint2regress:
+             spix_colors_ = sampled_spix_colors
+             gt_seq = spix_colors_.flatten(2).permute(2, 0, 1)
+             hint_seq = self.trg_word_emb(torch.cat([src_seq, mask_seq * gt_seq, mask_seq], dim=2))
+             dec_out, _ = self.hintpath(hint_seq, src_pos_seq, src_pad_mask)
+         else:
+             token_labels_ = sampled_token_labels
+             label_map = F.one_hot(token_labels_, num_classes=313).squeeze(1).float()
+             label_seq = label_map.permute(0, 3, 1, 2).flatten(2).permute(2, 0, 1)
+             hint_seq = self.trg_word_emb(torch.cat([src_seq, mask_seq * label_seq, mask_seq], dim=2))
+             dec_out, _ = self.hintpath(hint_seq, src_pos_seq, src_pad_mask)
+         ref_logit = self.trg_word_prj(dec_out)
+         Ct = 2 if self.hint2regress else self.n_vocab
+         ref_logit = ref_logit.permute(1, 2, 0).view(N,Ct,H,W)
+
+         ## pixelwise enhancement
+         pred_colors = None
+         if self.enhanced:
+             proc_feats = dec_out.permute(1, 2, 0).view(N,64,H,W)
+             full_feats = basic.upfeat(proc_feats, affinity_map, self.sp_size, self.sp_size)
+             pred_colors = self.enhanceNet(torch.cat((input_grays,full_feats), dim=1))
+             pred_colors = torch.tanh(pred_colors)
+
+         return pal_logit, ref_logit, pred_colors, affinity_map, spix_colors, hint_mask
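
Note: a minimal inference sketch for AnchorColorProb (illustration only). It assumes a CUDA device and that `basic.ColorLabel` — constructed the same way in predict.py below — is the labeler providing the `encode_ab2ind`/`decode_ind2ab` interface this model expects:

    import torch
    from models import model, basic

    labeler = basic.ColorLabel(lambda_=0.5, device='cuda')
    net = model.AnchorColorProb(inChannel=1, outChannel=313, enhanced=True,
                                colorLabeler=labeler).cuda().eval()
    grays = torch.zeros(1, 1, 256, 256).cuda()    # normalized L channel
    colors = torch.zeros(1, 2, 256, 256).cuda()   # normalized ab channels
    with torch.no_grad():
        pal_logit, ref_logit, pred_ab, affinity, spix_colors, hint_mask = \
            net(grays, colors, n_anchors=8, sampled_T=0)
    # pal_logit/ref_logit: (1, 313, 16, 16) token logits; pred_ab: (1, 2, 256, 256)
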
models/network.py ADDED
@@ -0,0 +1,352 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from torch.nn import init
+ import torchvision
+ import torch.nn.utils.spectral_norm as spectral_norm
+ import math
+
+
+ class ConvBlock(nn.Module):
+     def __init__(self, inChannels, outChannels, convNum, normLayer=None):
+         super(ConvBlock, self).__init__()
+         self.inConv = nn.Sequential(
+             nn.Conv2d(inChannels, outChannels, kernel_size=3, padding=1),
+             nn.ReLU(inplace=True)
+         )
+         layers = []
+         for _ in range(convNum - 1):
+             layers.append(nn.Conv2d(outChannels, outChannels, kernel_size=3, padding=1))
+             layers.append(nn.ReLU(inplace=True))
+         if not (normLayer is None):
+             layers.append(normLayer(outChannels))
+         self.conv = nn.Sequential(*layers)
+
+     def forward(self, x):
+         x = self.inConv(x)
+         x = self.conv(x)
+         return x
+
+
+ class ResidualBlock(nn.Module):
+     def __init__(self, channels, normLayer=None):
+         super(ResidualBlock, self).__init__()
+         layers = []
+         layers.append(nn.Conv2d(channels, channels, kernel_size=3, padding=1))
+         layers.append(spectral_norm(nn.Conv2d(channels, channels, kernel_size=3, padding=1)))
+         if not (normLayer is None):
+             layers.append(normLayer(channels))
+         layers.append(nn.ReLU(inplace=True))
+         layers.append(nn.Conv2d(channels, channels, kernel_size=3, padding=1))
+         if not (normLayer is None):
+             layers.append(normLayer(channels))
+         self.conv = nn.Sequential(*layers)
+
+     def forward(self, x):
+         residual = self.conv(x)
+         return F.relu(x + residual, inplace=True)
+
+
+ class ResidualBlockSN(nn.Module):
+     def __init__(self, channels, normLayer=None):
+         super(ResidualBlockSN, self).__init__()
+         layers = []
+         layers.append(spectral_norm(nn.Conv2d(channels, channels, kernel_size=3, padding=1)))
+         layers.append(nn.LeakyReLU(0.2, True))
+         layers.append(spectral_norm(nn.Conv2d(channels, channels, kernel_size=3, padding=1)))
+         if not (normLayer is None):
+             layers.append(normLayer(channels))
+         self.conv = nn.Sequential(*layers)
+
+     def forward(self, x):
+         residual = self.conv(x)
+         return F.leaky_relu(x + residual, 2e-1, inplace=True)
+
+
+ class DownsampleBlock(nn.Module):
+     def __init__(self, inChannels, outChannels, convNum=2, normLayer=None):
+         super(DownsampleBlock, self).__init__()
+         layers = []
+         layers.append(nn.Conv2d(inChannels, outChannels, kernel_size=3, padding=1, stride=2))
+         layers.append(nn.ReLU(inplace=True))
+         for _ in range(convNum - 1):
+             layers.append(nn.Conv2d(outChannels, outChannels, kernel_size=3, padding=1))
+             layers.append(nn.ReLU(inplace=True))
+         if not (normLayer is None):
+             layers.append(normLayer(outChannels))
+         self.conv = nn.Sequential(*layers)
+
+     def forward(self, x):
+         return self.conv(x)
+
+
+ class UpsampleBlock(nn.Module):
+     def __init__(self, inChannels, outChannels, convNum=2, normLayer=None):
+         super(UpsampleBlock, self).__init__()
+         self.conv1 = nn.Conv2d(inChannels, outChannels, kernel_size=3, padding=1, stride=1)
+         self.combine = nn.Conv2d(2 * outChannels, outChannels, kernel_size=3, padding=1)
+         layers = []
+         for _ in range(convNum - 1):
+             layers.append(nn.Conv2d(outChannels, outChannels, kernel_size=3, padding=1))
+             layers.append(nn.ReLU(inplace=True))
+         if not (normLayer is None):
+             layers.append(normLayer(outChannels))
+         self.conv2 = nn.Sequential(*layers)
+
+     def forward(self, x, x0):
+         x = self.conv1(x)
+         x = F.interpolate(x, scale_factor=2, mode='nearest')
+         x = self.combine(torch.cat((x, x0), 1))
+         x = F.relu(x)
+         return self.conv2(x)
+
+
+ class UpsampleBlockSN(nn.Module):
+     def __init__(self, inChannels, outChannels, convNum=2, normLayer=None):
+         super(UpsampleBlockSN, self).__init__()
+         self.conv1 = spectral_norm(nn.Conv2d(inChannels, outChannels, kernel_size=3, stride=1, padding=1))
+         self.shortcut = spectral_norm(nn.Conv2d(outChannels, outChannels, kernel_size=3, stride=1, padding=1))
+         layers = []
+         for _ in range(convNum - 1):
+             layers.append(spectral_norm(nn.Conv2d(outChannels, outChannels, kernel_size=3, padding=1)))
+             layers.append(nn.LeakyReLU(0.2, True))
+         if not (normLayer is None):
+             layers.append(normLayer(outChannels))
+         self.conv2 = nn.Sequential(*layers)
+
+     def forward(self, x, x0):
+         x = self.conv1(x)
+         x = F.interpolate(x, scale_factor=2, mode='nearest')
+         x = x + self.shortcut(x0)
+         x = F.leaky_relu(x, 2e-1)
+         return self.conv2(x)
+
+
+ class HourGlass2(nn.Module):
+     def __init__(self, inChannel=3, outChannel=1, resNum=3, normLayer=None):
+         super(HourGlass2, self).__init__()
+         self.inConv = ConvBlock(inChannel, 64, convNum=2, normLayer=normLayer)
+         self.down1 = DownsampleBlock(64, 128, convNum=2, normLayer=normLayer)
+         self.down2 = DownsampleBlock(128, 256, convNum=2, normLayer=normLayer)
+         self.residual = nn.Sequential(*[ResidualBlock(256) for _ in range(resNum)])
+         self.up2 = UpsampleBlock(256, 128, convNum=3, normLayer=normLayer)
+         self.up1 = UpsampleBlock(128, 64, convNum=3, normLayer=normLayer)
+         self.outConv = nn.Conv2d(64, outChannel, kernel_size=3, padding=1)
+
+     def forward(self, x):
+         f1 = self.inConv(x)
+         f2 = self.down1(f1)
+         f3 = self.down2(f2)
+         r3 = self.residual(f3)
+         r2 = self.up2(r3, f2)
+         r1 = self.up1(r2, f1)
+         y = self.outConv(r1)
+         return y
+
+
+ class ColorProbNet(nn.Module):
+     def __init__(self, inChannel=1, outChannel=2, with_SA=False):
+         super(ColorProbNet, self).__init__()
+         BNFunc = nn.BatchNorm2d
+         # conv1: 256
+         conv1_2 = [spectral_norm(nn.Conv2d(inChannel, 64, 3, stride=1, padding=1)), nn.LeakyReLU(0.2, True),]
+         conv1_2 += [spectral_norm(nn.Conv2d(64, 64, 3, stride=1, padding=1)), nn.LeakyReLU(0.2, True),]
+         conv1_2 += [BNFunc(64, affine=True)]
+         # conv2: 128
+         conv2_3 = [spectral_norm(nn.Conv2d(64, 128, 3, stride=2, padding=1)), nn.LeakyReLU(0.2, True),]
+         conv2_3 += [spectral_norm(nn.Conv2d(128, 128, 3, stride=1, padding=1)), nn.LeakyReLU(0.2, True),]
+         conv2_3 += [spectral_norm(nn.Conv2d(128, 128, 3, stride=1, padding=1)), nn.LeakyReLU(0.2, True),]
+         conv2_3 += [BNFunc(128, affine=True)]
+         # conv3: 64
+         conv3_3 = [spectral_norm(nn.Conv2d(128, 256, 3, stride=2, padding=1)), nn.LeakyReLU(0.2, True),]
+         conv3_3 += [spectral_norm(nn.Conv2d(256, 256, 3, stride=1, padding=1)), nn.LeakyReLU(0.2, True),]
+         conv3_3 += [spectral_norm(nn.Conv2d(256, 256, 3, stride=1, padding=1)), nn.LeakyReLU(0.2, True),]
+         conv3_3 += [BNFunc(256, affine=True)]
+         # conv4: 32
+         conv4_3 = [spectral_norm(nn.Conv2d(256, 512, 3, stride=2, padding=1)), nn.LeakyReLU(0.2, True),]
+         conv4_3 += [spectral_norm(nn.Conv2d(512, 512, 3, stride=1, padding=1)), nn.LeakyReLU(0.2, True),]
+         conv4_3 += [spectral_norm(nn.Conv2d(512, 512, 3, stride=1, padding=1)), nn.LeakyReLU(0.2, True),]
+         conv4_3 += [BNFunc(512, affine=True)]
+         # conv5: 32
+         conv5_3 = [spectral_norm(nn.Conv2d(512, 512, 3, stride=1, padding=1)), nn.LeakyReLU(0.2, True),]
+         conv5_3 += [spectral_norm(nn.Conv2d(512, 512, 3, stride=1, padding=1)), nn.LeakyReLU(0.2, True),]
+         conv5_3 += [spectral_norm(nn.Conv2d(512, 512, 3, stride=1, padding=1)), nn.LeakyReLU(0.2, True),]
+         conv5_3 += [BNFunc(512, affine=True)]
+         # conv6: 32
+         conv6_3 = [spectral_norm(nn.Conv2d(512, 512, 3, stride=1, padding=1)), nn.LeakyReLU(0.2, True),]
+         conv6_3 += [spectral_norm(nn.Conv2d(512, 512, 3, stride=1, padding=1)), nn.LeakyReLU(0.2, True),]
+         conv6_3 += [spectral_norm(nn.Conv2d(512, 512, 3, stride=1, padding=1)), nn.LeakyReLU(0.2, True),]
+         conv6_3 += [BNFunc(512, affine=True),]
+         if with_SA:
+             conv6_3 += [Self_Attn(512)]   # note: Self_Attn must be provided elsewhere when with_SA=True
+         # conv7: 32
+         conv7_3 = [spectral_norm(nn.Conv2d(512, 512, 3, stride=1, padding=1)), nn.LeakyReLU(0.2, True),]
+         conv7_3 += [spectral_norm(nn.Conv2d(512, 512, 3, stride=1, padding=1)), nn.LeakyReLU(0.2, True),]
+         conv7_3 += [spectral_norm(nn.Conv2d(512, 512, 3, stride=1, padding=1)), nn.LeakyReLU(0.2, True),]
+         conv7_3 += [BNFunc(512, affine=True)]
+         # conv8: 64
+         conv8up = [nn.Upsample(scale_factor=2, mode='nearest'), nn.Conv2d(512, 256, 3, stride=1, padding=1),]
+         conv3short8 = [nn.Conv2d(256, 256, 3, stride=1, padding=1),]
+         conv8_3 = [nn.ReLU(True),]
+         conv8_3 += [nn.Conv2d(256, 256, 3, stride=1, padding=1), nn.ReLU(True),]
+         conv8_3 += [nn.Conv2d(256, 256, 3, stride=1, padding=1), nn.ReLU(True),]
+         conv8_3 += [BNFunc(256, affine=True),]
+         # conv9: 128
+         conv9up = [nn.Upsample(scale_factor=2, mode='nearest'), nn.Conv2d(256, 128, 3, stride=1, padding=1),]
+         conv9_2 = [nn.Conv2d(128, 128, 3, stride=1, padding=1), nn.ReLU(True),]
+         conv9_2 += [BNFunc(128, affine=True)]
+         # conv10: 64
+         conv10up = [nn.Upsample(scale_factor=2, mode='nearest'), nn.Conv2d(128, 64, 3, stride=1, padding=1),]
+         conv10_2 = [nn.ReLU(True),]
+         conv10_2 += [nn.Conv2d(64, outChannel, 3, stride=1, padding=1), nn.ReLU(True),]
+
+         self.conv1_2 = nn.Sequential(*conv1_2)
+         self.conv2_3 = nn.Sequential(*conv2_3)
+         self.conv3_3 = nn.Sequential(*conv3_3)
+         self.conv4_3 = nn.Sequential(*conv4_3)
+         self.conv5_3 = nn.Sequential(*conv5_3)
+         self.conv6_3 = nn.Sequential(*conv6_3)
+         self.conv7_3 = nn.Sequential(*conv7_3)
+         self.conv8up = nn.Sequential(*conv8up)
+         self.conv3short8 = nn.Sequential(*conv3short8)
+         self.conv8_3 = nn.Sequential(*conv8_3)
+         self.conv9up = nn.Sequential(*conv9up)
+         self.conv9_2 = nn.Sequential(*conv9_2)
+         self.conv10up = nn.Sequential(*conv10up)
+         self.conv10_2 = nn.Sequential(*conv10_2)
+         # classification output
+         #self.model_class = nn.Sequential(*[nn.Conv2d(256, 313, kernel_size=1, padding=0, stride=1),])
+
+     def forward(self, input_grays):
+         f1_2 = self.conv1_2(input_grays)
+         f2_3 = self.conv2_3(f1_2)
+         f3_3 = self.conv3_3(f2_3)
+         f4_3 = self.conv4_3(f3_3)
+         f5_3 = self.conv5_3(f4_3)
+         f6_3 = self.conv6_3(f5_3)
+         f7_3 = self.conv7_3(f6_3)
+         f8_up = self.conv8up(f7_3) + self.conv3short8(f3_3)
+         f8_3 = self.conv8_3(f8_up)
+         f9_up = self.conv9up(f8_3)
+         f9_2 = self.conv9_2(f9_up)
+         f10_up = self.conv10up(f9_2)
+         f10_2 = self.conv10_2(f10_up)
+         out_feats = f10_2
+         #out_probs = self.model_class(f8_3)
+         return out_feats
+
+
+ def conv(batchNorm, in_planes, out_planes, kernel_size=3, stride=1):
+     if batchNorm:
+         return nn.Sequential(
+             nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=stride, padding=(kernel_size-1)//2, bias=False),
+             nn.BatchNorm2d(out_planes),
+             nn.LeakyReLU(0.1)
+         )
+     else:
+         return nn.Sequential(
+             nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=stride, padding=(kernel_size-1)//2, bias=True),
+             nn.LeakyReLU(0.1)
+         )
+
+
+ def deconv(in_planes, out_planes):
+     return nn.Sequential(
+         nn.ConvTranspose2d(in_planes, out_planes, kernel_size=4, stride=2, padding=1, bias=True),
+         nn.LeakyReLU(0.1)
+     )
+
+
+ class SpixelNet(nn.Module):
+     def __init__(self, inChannel=3, outChannel=9, batchNorm=True):
+         super(SpixelNet,self).__init__()
+         self.batchNorm = batchNorm
+         self.conv0a = conv(self.batchNorm, inChannel, 16, kernel_size=3)
+         self.conv0b = conv(self.batchNorm, 16, 16, kernel_size=3)
+         self.conv1a = conv(self.batchNorm, 16, 32, kernel_size=3, stride=2)
+         self.conv1b = conv(self.batchNorm, 32, 32, kernel_size=3)
+         self.conv2a = conv(self.batchNorm, 32, 64, kernel_size=3, stride=2)
+         self.conv2b = conv(self.batchNorm, 64, 64, kernel_size=3)
+         self.conv3a = conv(self.batchNorm, 64, 128, kernel_size=3, stride=2)
+         self.conv3b = conv(self.batchNorm, 128, 128, kernel_size=3)
+         self.conv4a = conv(self.batchNorm, 128, 256, kernel_size=3, stride=2)
+         self.conv4b = conv(self.batchNorm, 256, 256, kernel_size=3)
+         self.deconv3 = deconv(256, 128)
+         self.conv3_1 = conv(self.batchNorm, 256, 128)
+         self.deconv2 = deconv(128, 64)
+         self.conv2_1 = conv(self.batchNorm, 128, 64)
+         self.deconv1 = deconv(64, 32)
+         self.conv1_1 = conv(self.batchNorm, 64, 32)
+         self.deconv0 = deconv(32, 16)
+         self.conv0_1 = conv(self.batchNorm, 32, 16)
+         self.pred_mask0 = nn.Conv2d(16, outChannel, kernel_size=3, stride=1, padding=1, bias=True)
+         self.softmax = nn.Softmax(1)
+         for m in self.modules():
+             if isinstance(m, nn.Conv2d) or isinstance(m, nn.ConvTranspose2d):
+                 init.kaiming_normal_(m.weight, 0.1)
+                 if m.bias is not None:
+                     init.constant_(m.bias, 0)
+             elif isinstance(m, nn.BatchNorm2d):
+                 init.constant_(m.weight, 1)
+                 init.constant_(m.bias, 0)
+
+     def forward(self, x):
+         out1 = self.conv0b(self.conv0a(x))     #5*5
+         out2 = self.conv1b(self.conv1a(out1))  #11*11
+         out3 = self.conv2b(self.conv2a(out2))  #23*23
+         out4 = self.conv3b(self.conv3a(out3))  #47*47
+         out5 = self.conv4b(self.conv4a(out4))  #95*95
+         out_deconv3 = self.deconv3(out5)
+         concat3 = torch.cat((out4, out_deconv3), 1)
+         out_conv3_1 = self.conv3_1(concat3)
+         out_deconv2 = self.deconv2(out_conv3_1)
+         concat2 = torch.cat((out3, out_deconv2), 1)
+         out_conv2_1 = self.conv2_1(concat2)
+         out_deconv1 = self.deconv1(out_conv2_1)
+         concat1 = torch.cat((out2, out_deconv1), 1)
+         out_conv1_1 = self.conv1_1(concat1)
+         out_deconv0 = self.deconv0(out_conv1_1)
+         concat0 = torch.cat((out1, out_deconv0), 1)
+         out_conv0_1 = self.conv0_1(concat0)
+         mask0 = self.pred_mask0(out_conv0_1)
+         prob0 = self.softmax(mask0)
+         return prob0
+
+
+ ## VGG architecture, used for the perceptual loss with a pretrained VGG network
+ class VGG19(torch.nn.Module):
+     def __init__(self, requires_grad=False, local_pretrained_path='checkpoints/vgg19.pth'):
+         super().__init__()
+         #vgg_pretrained_features = torchvision.models.vgg19(pretrained=True).features
+         model = torchvision.models.vgg19()
+         model.load_state_dict(torch.load(local_pretrained_path))
+         vgg_pretrained_features = model.features
+
+         self.slice1 = torch.nn.Sequential()
+         self.slice2 = torch.nn.Sequential()
+         self.slice3 = torch.nn.Sequential()
+         self.slice4 = torch.nn.Sequential()
+         self.slice5 = torch.nn.Sequential()
+         for x in range(2):
+             self.slice1.add_module(str(x), vgg_pretrained_features[x])
+         for x in range(2, 7):
+             self.slice2.add_module(str(x), vgg_pretrained_features[x])
+         for x in range(7, 12):
+             self.slice3.add_module(str(x), vgg_pretrained_features[x])
+         for x in range(12, 21):
+             self.slice4.add_module(str(x), vgg_pretrained_features[x])
+         for x in range(21, 30):
+             self.slice5.add_module(str(x), vgg_pretrained_features[x])
+         if not requires_grad:
+             for param in self.parameters():
+                 param.requires_grad = False
+
+     def forward(self, X):
+         h_relu1 = self.slice1(X)
+         h_relu2 = self.slice2(h_relu1)
+         h_relu3 = self.slice3(h_relu2)
+         h_relu4 = self.slice4(h_relu3)
+         h_relu5 = self.slice5(h_relu4)
+         out = [h_relu1, h_relu2, h_relu3, h_relu4, h_relu5]
+         return out
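
Note: a quick shape check for SpixelNet (illustration only). The encoder halves the resolution four times and the decoder restores it, so the output matches the input size; the 9 softmax channels are consumed by `basic.poolfeat`/`basic.upfeat` elsewhere in this commit as per-pixel affinities — by the usual superpixel-FCN convention, to the 3x3 neighboring superpixel cells (an interpretation, not stated in this file):

    import torch
    from models.network import SpixelNet

    net = SpixelNet(inChannel=1, outChannel=9, batchNorm=True).eval()
    gray = torch.zeros(1, 1, 256, 256)
    with torch.no_grad():
        prob = net(gray)                   # (1, 9, 256, 256)
    print(prob.sum(dim=1).mean().item())   # ~1.0: softmax over the 9 channels
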
models/position_encoding.py ADDED
@@ -0,0 +1,86 @@
+ # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+ """
+ Various positional encodings for the transformer.
+ """
+ import math
+ import torch
+ from torch import nn
+
+
+ class PositionEmbeddingSine(nn.Module):
+     """
+     This is a more standard version of the position embedding, very similar to the one
+     used by the Attention is all you need paper, generalized to work on images.
+     """
+     def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None):
+         super().__init__()
+         self.num_pos_feats = num_pos_feats
+         self.temperature = temperature
+         self.normalize = normalize
+         if scale is not None and normalize is False:
+             raise ValueError("normalize should be True if scale is passed")
+         if scale is None:
+             scale = 2 * math.pi
+         self.scale = scale
+
+     def forward(self, token_tensors):
+         ## input: (B,C,H,W)
+         x = token_tensors
+         h, w = x.shape[-2:]
+         identity_map = torch.ones((h, w), device=x.device)
+         y_embed = identity_map.cumsum(0, dtype=torch.float32)
+         x_embed = identity_map.cumsum(1, dtype=torch.float32)
+         if self.normalize:
+             eps = 1e-6
+             y_embed = y_embed / (y_embed[-1:, :] + eps) * self.scale
+             x_embed = x_embed / (x_embed[:, -1:] + eps) * self.scale
+
+         dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
+         dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)
+
+         pos_x = x_embed[:, :, None] / dim_t
+         pos_y = y_embed[:, :, None] / dim_t
+         pos_x = torch.stack((pos_x[:, :, 0::2].sin(), pos_x[:, :, 1::2].cos()), dim=3).flatten(2)
+         pos_y = torch.stack((pos_y[:, :, 0::2].sin(), pos_y[:, :, 1::2].cos()), dim=3).flatten(2)
+         pos = torch.cat((pos_y, pos_x), dim=2).permute(2, 0, 1)
+         batch_pos = pos.unsqueeze(0).repeat(x.shape[0], 1, 1, 1)
+         return batch_pos
+
+
+ class PositionEmbeddingLearned(nn.Module):
+     """
+     Absolute pos embedding, learned.
+     """
+     def __init__(self, n_pos_x=16, n_pos_y=16, num_pos_feats=64):
+         super().__init__()
+         self.row_embed = nn.Embedding(n_pos_y, num_pos_feats)
+         self.col_embed = nn.Embedding(n_pos_x, num_pos_feats)
+         self.reset_parameters()
+
+     def reset_parameters(self):
+         nn.init.uniform_(self.row_embed.weight)
+         nn.init.uniform_(self.col_embed.weight)
+
+     def forward(self, token_tensors):
+         ## input: (B,C,H,W)
+         x = token_tensors
+         h, w = x.shape[-2:]
+         i = torch.arange(w, device=x.device)
+         j = torch.arange(h, device=x.device)
+         x_emb = self.col_embed(i)
+         y_emb = self.row_embed(j)
+         pos = torch.cat([
+             x_emb.unsqueeze(0).repeat(h, 1, 1),
+             y_emb.unsqueeze(1).repeat(1, w, 1),
+         ], dim=-1).permute(2, 0, 1)
+         batch_pos = pos.unsqueeze(0).repeat(x.shape[0], 1, 1, 1)
+         return batch_pos
+
+
+ def build_position_encoding(num_pos_feats=64, n_pos_x=16, n_pos_y=16, is_learned=False):
+     if is_learned:
+         position_embedding = PositionEmbeddingLearned(n_pos_x, n_pos_y, num_pos_feats)
+     else:
+         position_embedding = PositionEmbeddingSine(num_pos_feats, normalize=True)
+
+     return position_embedding
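
Note: the sine variant concatenates a y-encoding and an x-encoding of `num_pos_feats` channels each, so the returned map has 2*num_pos_feats channels — which is why model.py above builds it with `d_model//2`. A small sketch (illustration only):

    import torch
    from models.position_encoding import build_position_encoding

    pos_enc = build_position_encoding(num_pos_feats=32, is_learned=False)
    tokens = torch.zeros(2, 64, 16, 16)    # (B, C, H, W) token grid
    pos = pos_enc(tokens)
    print(pos.shape)                       # torch.Size([2, 64, 16, 16])
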
models/transformer2d.py ADDED
@@ -0,0 +1,229 @@
+ import torch
+ import torch.nn.functional as F
+ from torch import nn
+ import copy, math
+ from models.position_encoding import build_position_encoding
+
+
+ class TransformerEncoder(nn.Module):
+
+     def __init__(self, enc_layer, num_layers, use_dense_pos=False):
+         super().__init__()
+         self.layers = nn.ModuleList([copy.deepcopy(enc_layer) for i in range(num_layers)])
+         self.num_layers = num_layers
+         self.use_dense_pos = use_dense_pos
+
+     def forward(self, src, pos, padding_mask=None):
+         if self.use_dense_pos:
+             ## pos encoding at each MH-Attention block (q,k)
+             output, pos_enc = src, pos
+             for layer in self.layers:
+                 output, att_map = layer(output, pos_enc, padding_mask)
+         else:
+             ## pos encoding at input only (q,k,v)
+             output, pos_enc = src + pos, None
+             for layer in self.layers:
+                 output, att_map = layer(output, pos_enc, padding_mask)
+         return output, att_map
+
+
+ class EncoderLayer(nn.Module):
+
+     def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu",
+                  use_dense_pos=False):
+         super().__init__()
+         self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
+         # Implementation of Feedforward model
+         self.linear1 = nn.Linear(d_model, dim_feedforward)
+         self.dropout = nn.Dropout(dropout)
+         self.linear2 = nn.Linear(dim_feedforward, d_model)
+
+         self.norm1 = nn.LayerNorm(d_model)
+         self.norm2 = nn.LayerNorm(d_model)
+         self.dropout1 = nn.Dropout(dropout)
+         self.dropout2 = nn.Dropout(dropout)
+
+         self.activation = _get_activation_fn(activation)
+
+     def with_pos_embed(self, tensor, pos):
+         return tensor if pos is None else tensor + pos
+
+     def forward(self, src, pos, padding_mask):
+         q = k = self.with_pos_embed(src, pos)
+         src2, attn = self.self_attn(q, k, value=src, key_padding_mask=padding_mask)
+         src = src + self.dropout1(src2)
+         src = self.norm1(src)
+         src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
+         src = src + self.dropout2(src2)
+         src = self.norm2(src)
+         return src, attn
+
+
+ class TransformerDecoder(nn.Module):
+
+     def __init__(self, dec_layer, num_layers, use_dense_pos=False, return_intermediate=False):
+         super().__init__()
+         self.layers = nn.ModuleList([copy.deepcopy(dec_layer) for i in range(num_layers)])
+         self.num_layers = num_layers
+         self.use_dense_pos = use_dense_pos
+         self.return_intermediate = return_intermediate
+
+     def forward(self, tgt, tgt_pos, memory, memory_pos,
+                 tgt_padding_mask, src_padding_mask, tgt_attn_mask=None):
+         intermediate = []
+         if self.use_dense_pos:
+             ## pos encoding at each MH-Attention block (q,k)
+             output = tgt
+             tgt_pos_enc, memory_pos_enc = tgt_pos, memory_pos
+             for layer in self.layers:
+                 output, att_map = layer(output, tgt_pos_enc, memory, memory_pos_enc,
+                                         tgt_padding_mask, src_padding_mask, tgt_attn_mask)
+                 if self.return_intermediate:
+                     intermediate.append(output)
+         else:
+             ## pos encoding at input only (q,k,v)
+             output = tgt + tgt_pos
+             tgt_pos_enc, memory_pos_enc = None, None
+             for layer in self.layers:
+                 output, att_map = layer(output, tgt_pos_enc, memory, memory_pos_enc,
+                                         tgt_padding_mask, src_padding_mask, tgt_attn_mask)
+                 if self.return_intermediate:
+                     intermediate.append(output)
+
+         if self.return_intermediate:
+             return torch.stack(intermediate)
+         return output, att_map
+
+
+ class DecoderLayer(nn.Module):
+
+     def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, activation="relu",
+                  use_dense_pos=False):
+         super().__init__()
+         self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
+         self.corr_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
+         # Implementation of Feedforward model
+         self.linear1 = nn.Linear(d_model, dim_feedforward)
+         self.dropout = nn.Dropout(dropout)
+         self.linear2 = nn.Linear(dim_feedforward, d_model)
+
+         self.norm1 = nn.LayerNorm(d_model)
+         self.norm2 = nn.LayerNorm(d_model)
+         self.norm3 = nn.LayerNorm(d_model)
+         self.dropout1 = nn.Dropout(dropout)
+         self.dropout2 = nn.Dropout(dropout)
+         self.dropout3 = nn.Dropout(dropout)
+
+         self.activation = _get_activation_fn(activation)
+
+     def with_pos_embed(self, tensor, pos):
+         return tensor if pos is None else tensor + pos
+
+     def forward(self, tgt, tgt_pos, memory, memory_pos,
+                 tgt_padding_mask, memory_padding_mask, tgt_attn_mask):
+         q = k = self.with_pos_embed(tgt, tgt_pos)
+         tgt2, attn = self.self_attn(q, k, value=tgt, key_padding_mask=tgt_padding_mask,
+                                     attn_mask=tgt_attn_mask)
+         tgt = tgt + self.dropout1(tgt2)
+         tgt = self.norm1(tgt)
+         tgt2, attn = self.corr_attn(query=self.with_pos_embed(tgt, tgt_pos),
+                                     key=self.with_pos_embed(memory, memory_pos),
+                                     value=memory, key_padding_mask=memory_padding_mask)
+         tgt = tgt + self.dropout2(tgt2)
+         tgt = self.norm2(tgt)
+         tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
+         tgt = tgt + self.dropout3(tgt2)
+         tgt = self.norm3(tgt)
+         return tgt, attn
+
+
+ def _get_activation_fn(activation):
+     """Return an activation function given a string"""
+     if activation == "relu":
+         return F.relu
+     if activation == "gelu":
+         return F.gelu
+     if activation == "glu":
+         return F.glu
+     raise RuntimeError(F"activation should be relu/gelu/glu, not {activation}.")
+
+
+ #-----------------------------------------------------------------------------------
+ '''
+ copied from the implementation of "attention-is-all-you-need-pytorch-master" by Yu-Hsiang Huang
+ '''
+
+ class MultiHeadAttention(nn.Module):
+     ''' Multi-Head Attention module '''
+
+     def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
+         super().__init__()
+
+         self.n_head = n_head
+         self.d_k = d_k
+         self.d_v = d_v
+
+         self.w_qs = nn.Linear(d_model, n_head * d_k, bias=False)
+         self.w_ks = nn.Linear(d_model, n_head * d_k, bias=False)
+         self.w_vs = nn.Linear(d_model, n_head * d_v, bias=False)
+         self.fc = nn.Linear(n_head * d_v, d_model, bias=False)
+
+         self.attention = ScaledDotProductAttention(temperature=d_k ** 0.5)
+
+         self.dropout = nn.Dropout(dropout)
+         self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
+
+     def forward(self, q, k, v, mask=None):
+
+         d_k, d_v, n_head = self.d_k, self.d_v, self.n_head
+         sz_b, len_q, len_k, len_v = q.size(0), q.size(1), k.size(1), v.size(1)
+
+         residual = q
+
+         # Pass through the pre-attention projection: b x lq x (n*dv)
+         # Separate different heads: b x lq x n x dv
+         q = self.w_qs(q).view(sz_b, len_q, n_head, d_k)
+         k = self.w_ks(k).view(sz_b, len_k, n_head, d_k)
+         v = self.w_vs(v).view(sz_b, len_v, n_head, d_v)
+
+         # Transpose for attention dot product: b x n x lq x dv
+         q, k, v = q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2)
+
+         if mask is not None:
+             mask = mask.unsqueeze(1)   # For head axis broadcasting.
+
+         q, attn = self.attention(q, k, v, mask=mask)
+
+         # Transpose to move the head dimension back: b x lq x n x dv
+         # Combine the last two dimensions to concatenate all the heads together: b x lq x (n*dv)
+         q = q.transpose(1, 2).contiguous().view(sz_b, len_q, -1)
+         q = self.dropout(self.fc(q))
+         q += residual
+
+         q = self.layer_norm(q)
+
+         return q, attn
+
+
+ class ScaledDotProductAttention(nn.Module):
+     ''' Scaled Dot-Product Attention '''
+
+     def __init__(self, temperature, attn_dropout=0.1):
+         super().__init__()
+         self.temperature = temperature
+         self.dropout = nn.Dropout(attn_dropout)
+
+     def forward(self, q, k, v, mask=None):
+
+         attn = torch.matmul(q / self.temperature, k.transpose(2, 3))
+
+         if mask is not None:
+             attn = attn.masked_fill(mask == 0, -1e9)
+
+         attn = self.dropout(F.softmax(attn, dim=-1))
+         output = torch.matmul(attn, v)
+
+         return output, attn
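
Note: both encoder paths in model.py consume sequences in (HW, N, C) layout, matching nn.MultiheadAttention's default. A minimal sketch of the encoder stack (illustration only):

    import torch
    from models.transformer2d import EncoderLayer, TransformerEncoder

    d_model = 64
    layer = EncoderLayer(d_model, nhead=8, dim_feedforward=4*d_model, use_dense_pos=True)
    encoder = TransformerEncoder(layer, num_layers=2, use_dense_pos=True)
    src = torch.zeros(256, 1, d_model)     # a flattened 16x16 token grid
    pos = torch.zeros(256, 1, d_model)
    out, attn = encoder(src, pos, padding_mask=None)
    print(out.shape)                       # torch.Size([256, 1, 64])
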
predict.py ADDED
@@ -0,0 +1,104 @@
+ # Prediction interface for Cog ⚙️
+ # https://github.com/replicate/cog/blob/main/docs/python.md
+
+ from cog import BasePredictor, Input, Path
+ import tempfile
+ import os, glob
+ import numpy as np
+ import cv2
+ from PIL import Image
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from models import model, basic
+ from utils import util
+
+ class Predictor(BasePredictor):
+     def setup(self):
+         """Load the model into memory to make running multiple predictions efficient"""
+         seed = 130
+         np.random.seed(seed)
+         torch.manual_seed(seed)
+         torch.cuda.manual_seed(seed)
+         #print('--------------', torch.cuda.is_available())
+         self.color_class = basic.ColorLabel(lambda_=0.5, device='cuda')
+         ## pass the color labeler required by AnchorColorProb.forward (see models/model.py)
+         self.colorizer = model.AnchorColorProb(inChannel=1, outChannel=313, enhanced=True,
+                                                colorLabeler=self.color_class)
+         self.colorizer = self.colorizer.cuda()
+         checkpt_path = "./checkpoints/disco-beta.pth.rar"
+         assert os.path.exists(checkpt_path)
+         data_dict = torch.load(checkpt_path, map_location=torch.device('cpu'))
+         self.colorizer.load_state_dict(data_dict['state_dict'])
+         self.colorizer.eval()
+
+     def resize_ab2l(self, gray_img, lab_imgs):
+         H, W = gray_img.shape[:2]
+         resized_ab = cv2.resize(lab_imgs[:,:,1:], (W,H), interpolation=cv2.INTER_LINEAR)
+         return np.concatenate((gray_img, resized_ab), axis=2)
+
+     def predict(
+         self,
+         image: Path = Input(description="input image. Output will be one or multiple colorized images."),
+         n_anchors: int = Input(
+             description="number of color anchors", ge=3, le=14, default=8
+         ),
+         multi_result: bool = Input(
+             description="to generate diverse results", default=False
+         ),
+         vis_anchors: bool = Input(
+             description="to visualize the anchor locations", default=False
+         )
+     ) -> Path:
+         """Run a single prediction on the model"""
+         bgr_img = cv2.imread(str(image), cv2.IMREAD_COLOR)
+         rgb_img = cv2.cvtColor(bgr_img, cv2.COLOR_BGR2RGB)
+         rgb_img = np.array(rgb_img / 255., np.float32)
+         lab_img = cv2.cvtColor(rgb_img, cv2.COLOR_RGB2LAB)
+         org_grays = (lab_img[:,:,[0]]-50.) / 50.
+         lab_img = cv2.resize(lab_img, (256,256), interpolation=cv2.INTER_LINEAR)
+
+         lab_img = torch.from_numpy(lab_img.transpose((2, 0, 1)))
+         gray_img = (lab_img[0:1,:,:]-50.) / 50.
+         ab_chans = lab_img[1:3,:,:] / 110.
+         input_grays = gray_img.unsqueeze(0)
+         input_colors = ab_chans.unsqueeze(0)
+         input_grays = input_grays.cuda(non_blocking=True)
+         input_colors = input_colors.cuda(non_blocking=True)
+
+         sampled_T = 2 if multi_result else 0
+         ## call matches AnchorColorProb.forward(input_grays, input_colors, n_anchors, sampled_T)
+         pal_logit, ref_logit, enhanced_ab, affinity_map, spix_colors, hint_mask = self.colorizer(input_grays, \
+             input_colors, n_anchors, sampled_T)
+         pred_probs = pal_logit
+         guided_colors = self.color_class.decode_ind2ab(ref_logit, T=0)
+         sp_size = 16
+         guided_colors = basic.upfeat(guided_colors, affinity_map, sp_size, sp_size)
+         res_list = []
+         if multi_result:
+             for no in range(3):
+                 pred_labs = torch.cat((input_grays,enhanced_ab[no:no+1,:,:,:]), dim=1)
+                 lab_imgs = basic.tensor2array(pred_labs).squeeze(axis=0)
+                 lab_imgs = self.resize_ab2l(org_grays, lab_imgs)
+                 #util.save_normLabs_from_batch(lab_imgs, save_dir, [file_name], -1, suffix='c%d'%no)
+                 res_list.append(lab_imgs)
+         else:
+             pred_labs = torch.cat((input_grays,enhanced_ab), dim=1)
+             lab_imgs = basic.tensor2array(pred_labs).squeeze(axis=0)
+             lab_imgs = self.resize_ab2l(org_grays, lab_imgs)
+             #util.save_normLabs_from_batch(lab_imgs, save_dir, [file_name], -1)#, suffix='enhanced')
+             res_list.append(lab_imgs)
+
+         if vis_anchors:
+             ## visualize anchor locations
+             anchor_masks = basic.upfeat(hint_mask, affinity_map, sp_size, sp_size)
+             marked_labs = basic.mark_color_hints(input_grays, enhanced_ab, anchor_masks, base_ABs=enhanced_ab)
+             hint_imgs = basic.tensor2array(marked_labs).squeeze(axis=0)
+             hint_imgs = self.resize_ab2l(org_grays, hint_imgs)
+             #util.save_normLabs_from_batch(hint_imgs, save_dir, [file_name], -1, suffix='anchors')
+             res_list.append(hint_imgs)
+
+         output = cv2.vconcat(res_list)
+         output[:,:,0] = output[:,:,0] * 50.0 + 50.0
+         output[:,:,1:3] = output[:,:,1:3] * 110.0
+         bgr_output = cv2.cvtColor(output[:,:,:], cv2.COLOR_LAB2BGR)
+         out_path = Path(tempfile.mkdtemp()) / "out.png"
+         cv2.imwrite(str(out_path), (bgr_output*255.0).astype(np.uint8))
+         return out_path
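
Note: a direct-invocation sketch for local testing (illustration only — this bypasses the Cog runtime, which normally calls setup()/predict() itself; a CUDA device and the checkpoint under ./checkpoints are required, and input.jpg is a hypothetical test image):

    from pathlib import Path
    from predict import Predictor

    p = Predictor()
    p.setup()                                    # loads the checkpoint onto CUDA
    out = p.predict(image=Path('input.jpg'), n_anchors=8,
                    multi_result=False, vis_anchors=False)
    print(out)                                   # path to the colorized PNG
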
requirements.txt ADDED
@@ -0,0 +1,17 @@
+ addict
+ future
+ numpy
+ opencv-python
+ pandas
+ Pillow
+ pyyaml
+ requests
+ scikit-image
+ scikit-learn
+ scipy
+ torch>=1.8.0
+ torchvision
+ tensorboardx>=2.4
+ tqdm
+ yapf
+ lpips
utils/__init__.py ADDED
File without changes
utils/cielab.py ADDED
@@ -0,0 +1,71 @@
+ from functools import partial
+ import numpy as np
+
+ class ABGamut:
+     RESOURCE_POINTS = "./utils/gamut_pts.npy"
+     RESOURCE_PRIOR = "./utils/gamut_probs.npy"
+     DTYPE = np.float32
+     EXPECTED_SIZE = 313
+
+     def __init__(self):
+         self.points = np.load(self.RESOURCE_POINTS).astype(self.DTYPE)
+         self.prior = np.load(self.RESOURCE_PRIOR).astype(self.DTYPE)
+         assert self.points.shape == (self.EXPECTED_SIZE, 2)
+         assert self.prior.shape == (self.EXPECTED_SIZE,)
+
+
+ class CIELAB:
+     L_MEAN = 50
+     AB_BINSIZE = 10
+     AB_RANGE = [-110 - AB_BINSIZE // 2, 110 + AB_BINSIZE // 2, AB_BINSIZE]
+     AB_DTYPE = np.float32
+     Q_DTYPE = np.int64
+
+     RGB_RESOLUTION = 101
+     RGB_RANGE = [0, 1, RGB_RESOLUTION]
+     RGB_DTYPE = np.float64
+
+     def __init__(self, gamut=None):
+         self.gamut = gamut if gamut is not None else ABGamut()
+         a, b, self.ab = self._get_ab()
+         self.ab_gamut_mask = self._get_ab_gamut_mask(
+             a, b, self.ab, self.gamut)
+
+         self.ab_to_q = self._get_ab_to_q(self.ab_gamut_mask)
+         self.q_to_ab = self._get_q_to_ab(self.ab, self.ab_gamut_mask)
+
+     @classmethod
+     def _get_ab(cls):
+         a = np.arange(*cls.AB_RANGE, dtype=cls.AB_DTYPE)
+         b = np.arange(*cls.AB_RANGE, dtype=cls.AB_DTYPE)
+         b_, a_ = np.meshgrid(a, b)
+         ab = np.dstack((a_, b_))
+         return a, b, ab
+
+     @classmethod
+     def _get_ab_gamut_mask(cls, a, b, ab, gamut):
+         ab_gamut_mask = np.full(ab.shape[:-1], False, dtype=bool)
+         a = np.digitize(gamut.points[:, 0], a) - 1
+         b = np.digitize(gamut.points[:, 1], b) - 1
+         for a_, b_ in zip(a, b):
+             ab_gamut_mask[a_, b_] = True
+
+         return ab_gamut_mask
+
+     @classmethod
+     def _get_ab_to_q(cls, ab_gamut_mask):
+         ab_to_q = np.full(ab_gamut_mask.shape, -1, dtype=cls.Q_DTYPE)
+         ab_to_q[ab_gamut_mask] = np.arange(np.count_nonzero(ab_gamut_mask))
+
+         return ab_to_q
+
+     @classmethod
+     def _get_q_to_ab(cls, ab, ab_gamut_mask):
+         return ab[ab_gamut_mask] + cls.AB_BINSIZE / 2
+
+     def bin_ab(self, ab):
+         ab_discrete = ((ab + 110) / self.AB_RANGE[2]).astype(int)
+
+         a, b = np.hsplit(ab_discrete.reshape(-1, 2), 2)
+
+         return self.ab_to_q[a, b].reshape(*ab.shape[:2])
+
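
Note: a small worked example of the ab-space quantization above (illustration only; the two .npy gamut files below must be present). Bins are 10 units wide, and ab pairs falling outside the 313-bin gamut map to -1:

    import numpy as np
    from utils.cielab import CIELAB

    cielab = CIELAB()
    ab = np.array([[[25.0, -40.0]]], dtype=np.float32)   # one (a,b) pixel, (H,W,2)
    q = cielab.bin_ab(ab)
    print(q.shape, int(q[0, 0]))                         # (1, 1) -> bin id in [0, 313)
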
utils/dataset_lab.py ADDED
@@ -0,0 +1,37 @@
+ from __future__ import print_function, division
+ import torch, os, glob
+ from torch.utils.data import Dataset, DataLoader
+ import numpy as np
+ from PIL import Image
+ import cv2
+
+
+ class LabDataset(Dataset):
+
+     def __init__(self, rootdir=None, filelist=None, resize=None):
+         if filelist:
+             self.file_list = filelist
+         else:
+             assert os.path.exists(rootdir), "@dir:'%s' does NOT exist ..." % rootdir
+             self.file_list = glob.glob(os.path.join(rootdir, '*.*'))
+             self.file_list.sort()
+         self.resize = resize
+
+     def __len__(self):
+         return len(self.file_list)
+
+     def __getitem__(self, idx):
+         bgr_img = cv2.imread(self.file_list[idx], cv2.IMREAD_COLOR)
+         if self.resize:
+             bgr_img = cv2.resize(bgr_img, (self.resize, self.resize), interpolation=cv2.INTER_CUBIC)
+         bgr_img = np.array(bgr_img / 255., np.float32)
+         lab_img = cv2.cvtColor(bgr_img, cv2.COLOR_BGR2LAB)
+         # float32 Lab from OpenCV: L in [0,100], ab roughly in [-110,110]
+         lab_img = torch.from_numpy(lab_img.transpose((2, 0, 1)))
+         bgr_img = torch.from_numpy(bgr_img.transpose((2, 0, 1)))
+         gray_img = (lab_img[0:1, :, :] - 50.) / 50.   # L channel -> [-1,1]
+         color_map = lab_img[1:3, :, :] / 110.         # ab channels -> ~[-1,1]
+         bgr_img = bgr_img * 2. - 1.                   # BGR -> [-1,1]
+         return {'gray': gray_img, 'color': color_map, 'BGR': bgr_img}
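A minimal loading sketch for this dataset (the image directory './examples' is a placeholder):

    from torch.utils.data import DataLoader
    from utils.dataset_lab import LabDataset

    dataset = LabDataset(rootdir='./examples', resize=256)
    loader = DataLoader(dataset, batch_size=4, shuffle=True)
    batch = next(iter(loader))
    print(batch['gray'].shape, batch['color'].shape)  # [4,1,256,256], [4,2,256,256]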
utils/gamut_probs.npy ADDED
Binary file (2.58 kB).
 
utils/gamut_pts.npy ADDED
Binary file (5.09 kB).
 
utils/util.py ADDED
@@ -0,0 +1,178 @@
+ from __future__ import division
+ from __future__ import print_function
+ import os, glob, shutil, math, json
+ from queue import Queue
+ from threading import Thread
+ from skimage.segmentation import mark_boundaries
+ import matplotlib.pyplot as plt   # needed by batchGray2Colormap
+ import numpy as np
+ from PIL import Image
+ import cv2, torch
+
+ def get_gauss_kernel(size, sigma):
+     '''Function to mimic the 'fspecial' gaussian MATLAB function'''
+     x, y = np.mgrid[-size//2 + 1:size//2 + 1, -size//2 + 1:size//2 + 1]
+     g = np.exp(-((x**2 + y**2) / (2.0*sigma**2)))
+     return g / g.sum()
+
+
+ def batchGray2Colormap(gray_batch):
+     colormap = plt.get_cmap('viridis')
+     heatmap_batch = []
+     for i in range(gray_batch.shape[0]):
+         # map each gray map through the colormap, then rescale to [-1,1]
+         gray_map = gray_batch[i, :, :, 0]
+         heatmap = (colormap(gray_map)[:, :, :3] * 255.).astype(np.uint8)
+         heatmap_batch.append(heatmap / 127.5 - 1.0)
+     return np.array(heatmap_batch)
+
+
+ class PlotterThread():
+     '''log tensorboard data in a background thread to save time'''
+     def __init__(self, writer):
+         self.writer = writer
+         self.task_queue = Queue(maxsize=0)
+         worker = Thread(target=self.do_work, args=(self.task_queue,))
+         worker.daemon = True   # setDaemon() is deprecated
+         worker.start()
+
+     def do_work(self, q):
+         while True:
+             content = q.get()
+             if content[-1] == 'image':
+                 self.writer.add_image(*content[:-1])
+             elif content[-1] == 'scalar':
+                 self.writer.add_scalar(*content[:-1])
+             else:
+                 raise ValueError('unsupported data type: %s' % content[-1])
+             q.task_done()
+
+     def add_data(self, name, value, step, data_type='scalar'):
+         self.task_queue.put([name, value, step, data_type])
+
+     def __len__(self):
+         return self.task_queue.qsize()
+
+
+ def save_images_from_batch(img_batch, save_dir, filename_list, batch_no=-1, suffix=None):
+     N, H, W, C = img_batch.shape
+     if C == 3:
+         #! rgb color image
+         for i in range(N):
+             # [-1,1] >>> [0,255]
+             image = Image.fromarray((127.5*(img_batch[i,:,:,:]+1.)).astype(np.uint8))
+             save_name = filename_list[i] if batch_no == -1 else '%05d.png' % (batch_no*N+i)
+             save_name = save_name.replace('.png', '-%s.png' % suffix) if suffix else save_name
+             image.save(os.path.join(save_dir, save_name), 'PNG')
+     elif C == 1:
+         #! single-channel gray image
+         for i in range(N):
+             # [-1,1] >>> [0,255]
+             image = Image.fromarray((127.5*(img_batch[i,:,:,0]+1.)).astype(np.uint8))
+             save_name = filename_list[i] if batch_no == -1 else '%05d.png' % (batch_no*N+i)
+             save_name = save_name.replace('.png', '-%s.png' % suffix) if suffix else save_name
+             image.save(os.path.join(save_dir, save_name), 'PNG')
+     else:
+         #! multi-channel: save each channel as a single image
+         for i in range(N):
+             # [-1,1] >>> [0,255]
+             for j in range(C):
+                 image = Image.fromarray((127.5*(img_batch[i,:,:,j]+1.)).astype(np.uint8))
+                 if batch_no == -1:
+                     _, file_name = os.path.split(filename_list[i])
+                     name_only, _ = os.path.splitext(file_name)
+                     save_name = name_only + '_c%d.png' % j
+                 else:
+                     save_name = '%05d_c%d.png' % (batch_no*N+i, j)
+                 save_name = save_name.replace('.png', '-%s.png' % suffix) if suffix else save_name
+                 image.save(os.path.join(save_dir, save_name), 'PNG')
+     return None
+
+
+ def save_normLabs_from_batch(img_batch, save_dir, filename_list, batch_no=-1, suffix=None):
+     N, H, W, C = img_batch.shape
+     if C != 3:
+         print('@Warning: the Lab images are NOT in 3 channels!')
+         return None
+     # denormalization: L: (L+1.0)*50.0 | a: a*110.0 | b: b*110.0
+     img_batch[:,:,:,0] = img_batch[:,:,:,0] * 50.0 + 50.0
+     img_batch[:,:,:,1:3] = img_batch[:,:,:,1:3] * 110.0
+     #! convert into RGB color image
+     for i in range(N):
+         rgb_img = cv2.cvtColor(img_batch[i,:,:,:], cv2.COLOR_LAB2RGB)
+         image = Image.fromarray((rgb_img*255.0).astype(np.uint8))
+         save_name = filename_list[i] if batch_no == -1 else '%05d.png' % (batch_no*N+i)
+         save_name = save_name.replace('.png', '-%s.png' % suffix) if suffix else save_name
+         image.save(os.path.join(save_dir, save_name), 'PNG')
+     return None
+
+
+ def save_markedSP_from_batch(img_batch, spix_batch, save_dir, filename_list, batch_no=-1, suffix=None):
+     N, H, W, C = img_batch.shape
+     #! img_batch: BGR nd-array (range:0~1)
+     #! spix_batch: single-channel superpixel map
+     for i in range(N):
+         norm_image = img_batch[i,:,:,:]*0.5 + 0.5
+         spixel_bd_image = mark_boundaries(norm_image, spix_batch[i,:,:,0].astype(int), color=(1,1,1))
+         image = Image.fromarray((spixel_bd_image*255.0).astype(np.uint8))
+         save_name = filename_list[i] if batch_no == -1 else '%05d.png' % (batch_no*N+i)
+         save_name = save_name.replace('.png', '-%s.png' % suffix) if suffix else save_name
+         image.save(os.path.join(save_dir, save_name), 'PNG')
+     return None
+
+
+ def get_filelist(data_dir):
+     file_list = glob.glob(os.path.join(data_dir, '*.*'))
+     file_list.sort()
+     return file_list
+
+
+ def collect_filenames(data_dir):
+     file_list = get_filelist(data_dir)
+     name_list = []
+     for file_path in file_list:
+         _, file_name = os.path.split(file_path)
+         name_list.append(file_name)
+     name_list.sort()
+     return name_list
+
+
+ def exists_or_mkdir(path, need_remove=False):
+     if not os.path.exists(path):
+         os.makedirs(path)
+     elif need_remove:
+         shutil.rmtree(path)
+         os.makedirs(path)
+     return None
+
+
+ def save_list(save_path, data_list, append_mode=False):
+     n = len(data_list)
+     if append_mode:
+         # append only the latest entry
+         with open(save_path, 'a') as f:
+             f.writelines([str(data_list[i]) + '\n' for i in range(n-1, n)])
+     else:
+         with open(save_path, 'w') as f:
+             f.writelines([str(data_list[i]) + '\n' for i in range(n)])
+     return None
+
+
+ def save_dict(save_path, data_dict):
+     # json.dump() writes to a file object (json.dumps() only returns a string)
+     with open(save_path, 'w') as f:
+         json.dump(data_dict, f)
+     return None
+
+
+ if __name__ == '__main__':
+     # quick visual check of the quantized ab gamut (uses CIELAB from cielab.py)
+     from cielab import CIELAB
+     clbar = CIELAB()
+     ab2q = clbar.ab_to_q
+     q2ab = clbar.q_to_ab
+     maps = clbar.ab_gamut_mask * 255.0
+     image = Image.fromarray(maps.astype(np.uint8))
+     image.save('gamut.png', 'PNG')
+     print(ab2q.shape)
+     print(q2ab.shape)
+     print('label range:', np.min(ab2q), np.max(ab2q))
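Typical use of PlotterThread with a tensorboardX writer (a hedged sketch; the log directory is illustrative):

    from tensorboardX import SummaryWriter
    from utils.util import PlotterThread

    writer = SummaryWriter('./logs')   # './logs' is a placeholder
    plotter = PlotterThread(writer)
    for step in range(10):
        plotter.add_data('train/loss', 1.0 / (step + 1), step)   # queued, written off-thread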