Pytorch3d: camera_position_optimization.py does not converge with multiple runs

Created on 4 Mar 2021 · 3Comments · Source: facebookresearch/pytorch3d

If you do not know the root cause of the problem / bug, and wish someone to help you, please
post according to this template:

🐛 Bugs / Unexpected behaviors

camera_position_optimization_with_differentiable_rendering.py does not converge reliably across multiple runs. I tried to reproduce the tutorial output with the teapot.obj file over several runs, but the optimization does not converge every time. Sometimes self.camera_position becomes NaN. I also tried seeding everything, but this does not remove the erratic behavior.

This code is borrowed from the Pytorch3D tutorial

Instructions To Reproduce the Issue:

The code

import os
import sys
import torch
# Notebook bootstrap: make sure PyTorch3D is importable before the rest of the script runs.
# NOTE(review): `get_ipython` is not imported anywhere in this script; it is presumably the
# helper injected by IPython/Jupyter, so these calls would raise NameError under a plain
# `python` run — confirm before relying on this section outside a notebook.
if torch.__version__=='1.6.0+cu101' and sys.platform.startswith('linux'):
    # Exactly torch 1.6.0+cu101 on Linux: install the `pytorch3d` package with pip.
    get_ipython().system('pip install pytorch3d')
else:
    # Any other setup: only install when importing pytorch3d fails.
    need_pytorch3d=False
    try:
        import pytorch3d
    except ModuleNotFoundError:
        need_pytorch3d=True
    if need_pytorch3d:
        # Download and unpack the NVIDIA CUB 1.10.0 archive into the working directory and
        # point CUB_HOME at it, then pip-install PyTorch3D from the `stable` ref on GitHub.
        get_ipython().system('curl -LO https://github.com/NVIDIA/cub/archive/1.10.0.tar.gz')
        get_ipython().system('tar xzf 1.10.0.tar.gz')
        os.environ["CUB_HOME"] = os.getcwd() + "/cub-1.10.0"
        get_ipython().system("pip install 'git+https://github.com/facebookresearch/pytorch3d.git@stable'")





import os
import torch
import numpy as np
from tqdm.notebook import tqdm
import imageio
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt
from skimage import img_as_ubyte
import random

# io utils
from pytorch3d.io import load_obj

# datastructures
from pytorch3d.structures import Meshes

# 3D transformations functions
from pytorch3d.transforms import Rotate, Translate

# rendering components
from pytorch3d.renderer import (
    FoVPerspectiveCameras, look_at_view_transform, look_at_rotation, 
    RasterizationSettings, MeshRenderer, MeshRasterizer, BlendParams,
    SoftSilhouetteShader, HardPhongShader, PointLights, TexturesVertex,
)

import argparse

# Command-line interface: input mesh, RNG seed, optimizer learning rate, the spherical
# coordinates of the reference viewpoint, and the iteration / logging cadence.
parser = argparse.ArgumentParser("Calculate ego motion of the camera")

# Input and reproducibility.
parser.add_argument(
    "--obj_file_path",
    type=str,
    default="/home/abhinav/Desktop/data_3d/teapot.obj",
    help='input obj file',
)
parser.add_argument("--seed", type=int, default=0, help='seed')

# Optimizer step size.
parser.add_argument("--lr", type=float, default=0.05)

# Reference viewpoint (the numeric defaults are kept exactly as the original literals).
parser.add_argument("--distance", type=float, default=3, help='distance')
parser.add_argument("--elevation", type=float, default=50, help='elevation')
parser.add_argument("--azimuth", type=float, default=0, help='azimuth')

# Loop length and how often an intermediate result is emitted.
parser.add_argument("--num_iter", type=int, default=200, help='num_iter')
parser.add_argument("--print_frequency", type=int, default=20)

args = parser.parse_args()

# Unpack the parsed command-line options into the module-level names used by the rest
# of the script.
obj_file_path = args.obj_file_path
seed = args.seed
lr = args.lr

# The world coordinate system is defined as +Y up, +X left and +Z in. The teapot in world coordinates has the spout pointing to the left.
# We defined a camera which is positioned on the positive z axis hence sees the spout to the right.
# Select the viewpoint using spherical angles
distance = args.distance     # distance from camera to the object
elevation = args.elevation   # angle of elevation in degrees
azimuth = args.azimuth       # angle of azimuth/yaw; the default of 0 means no rotation about the up axis

num_iter = args.num_iter
print_frequency = args.print_frequency

# Fixed scene constants: point-light location, the camera position the optimization
# starts from, and whether the start/target comparison figure is shown interactively.
light_location = (2.0, 2.0, -2.0)
camera_init_position = np.array([3.0, 6.9, +2.5], dtype=np.float32)
show_current_target = False

obj_file_basename = os.path.basename(obj_file_path)
# Strip only the final extension: splitting on the first "." would turn a name such as
# "my.teapot.obj" into "my" and produce a misleading output filename.
obj_file_basename_no_ext = os.path.splitext(obj_file_basename)[0]
# We will save images periodically and compose them into a GIF.
filename_output = os.path.join(os.getcwd(), obj_file_basename_no_ext + "_demo.gif")

# Seed every random number generator in play (torch CPU, NumPy, Python `random`,
# all CUDA devices) with the same value, in that order.
for _seed_fn in (torch.manual_seed, np.random.seed, random.seed, torch.cuda.manual_seed_all):
    _seed_fn(seed)

# Ask cuDNN for deterministic kernels and disable its autotuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
# Stricter switches the author left disabled:
#   torch.backends.cudnn.enabled = False
#   torch.set_deterministic(True)

# Pick the compute device: the first CUDA GPU when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    torch.cuda.set_device(device)

# Only Wavefront .obj input is handled. Fail here with a clear message instead of
# skipping the load and crashing later with a NameError on `verts` / `faces`.
if not obj_file_path.endswith(".obj"):
    raise ValueError(
        "Unsupported mesh file {!r}: only .obj files are supported".format(obj_file_path)
    )

# Load the obj and ignore the textures and materials.
verts, faces_idx, _ = load_obj(obj_file_path)
faces = faces_idx.verts_idx

# Summarize the mesh: counts plus the per-axis extent of the vertex coordinates.
print("Input file        = {}".format(obj_file_path))
print("Number of vertices= {}".format(verts.shape[0]))
print("Number of faces   = {}".format(faces.shape[0]))
print("min x= {: .2f}, max x= {: .2f}".format(torch.min(verts[:, 0]).item(), torch.max(verts[:, 0]).item()))
print("min y= {: .2f}, max y= {: .2f}".format(torch.min(verts[:, 1]).item(), torch.max(verts[:, 1]).item()))
print("min z= {: .2f}, max z= {: .2f}".format(torch.min(verts[:, 2]).item(), torch.max(verts[:, 2]).item()))

# Echo the run configuration so logs from different runs can be compared.
print("Distance = {}".format(distance))
print("Elevation= {}".format(elevation))
print("Azimuth  = {}".format(azimuth))
print("Seed     = {}".format(seed))
print("lr       = {}".format(lr))
print("Num iter = {}".format(num_iter))

# Give every vertex the color white: an all-ones tensor shaped like `verts`, with a
# leading batch axis added.
verts_rgb = torch.ones_like(verts).unsqueeze(0)  # (1, V, 3)
textures = TexturesVertex(verts_features=verts_rgb.to(device))

# Wrap the geometry and textures in a Meshes object on the chosen device; the batch
# contains exactly one mesh.
teapot_mesh = Meshes(verts=[verts.to(device)], faces=[faces.to(device)], textures=textures)


# ## 2. Optimization setup
# ### Create a renderer
# A **renderer** in PyTorch3D is composed of a **rasterizer** and a **shader**, each of which has
# a number of subcomponents such as a **camera** (orthographic/perspective). Here we initialize
# some of these components and use default values for the rest.
# For optimizing the camera position we use a renderer that produces only a **silhouette** of the
# object and does not apply any **lighting** or **shading**. We also initialize a second renderer
# that applies full **Phong shading**; it is used only for visualizing the outputs.

# Initialize a perspective camera. The same camera object is shared by both renderers below.
cameras = FoVPerspectiveCameras(device=device)

# Blending parameters for the soft silhouette; they control the opacity and the sharpness of
# edges. Refer to blending.py in PyTorch3D for more details.
blend_params = BlendParams(sigma=1e-4, gamma=1e-4)

# Rasterization settings for the silhouette renderer: a 256x256 output image with up to 100
# faces blended per pixel. The blur radius is derived from the blend sigma.
# `bin_size` and `max_faces_per_bin` are not passed here, so the library defaults apply; see
# rasterize_meshes.py and docs/notes/renderer.md in PyTorch3D for the difference between naive
# and coarse-to-fine rasterization.
raster_settings = RasterizationSettings(
    image_size=256, 
    blur_radius=np.log(1. / 1e-4 - 1.) * blend_params.sigma, 
    faces_per_pixel=100, 
)

# Create a silhouette mesh renderer by composing a rasterizer and a shader.
silhouette_renderer = MeshRenderer(
    rasterizer=MeshRasterizer(
        cameras=cameras, 
        raster_settings=raster_settings
    ),
    shader=SoftSilhouetteShader(blend_params=blend_params)
)

# We will also create a Phong renderer. This is simpler and only needs to render one face per
# pixel, with no blur. Note that `raster_settings` is rebound here; the silhouette renderer above
# was already constructed with the previous settings object.
raster_settings = RasterizationSettings(
    image_size=256, 
    blur_radius=0.0, 
    faces_per_pixel=1, 
)
# Add a single point light positioned at `light_location`.
lights = PointLights(device=device, location=(light_location,))
phong_renderer = MeshRenderer(
    rasterizer=MeshRasterizer(
        cameras=cameras, 
        raster_settings=raster_settings
    ),
    shader=HardPhongShader(device=device, cameras=cameras, lights=lights)
)


# ### Create a reference image
# Render the teapot from the viewpoint requested on the command line. Both renderers are
# used: the soft silhouette and the fully shaded Phong image.

# Rotation and translation of the camera for the given spherical coordinates.
R, T = look_at_view_transform(distance, elevation, azimuth, device=device)

# Render with the shared pose and move each result to host memory as a NumPy array.
_reference_view = dict(meshes_world=teapot_mesh, R=R, T=T)
silhouete = silhouette_renderer(**_reference_view).cpu().numpy()
image_ref = phong_renderer(**_reference_view).cpu().numpy()

# plt.figure(figsize=(10, 10))
# plt.subplot(1, 2, 1)
# plt.imshow(silhouete.squeeze()[..., 3])  # only plot the alpha channel of the RGBA image
# plt.grid(False)
# plt.subplot(1, 2, 2)
# plt.imshow(image_ref.squeeze())
# plt.grid(False)


# ### Set up a basic model 
class Model(nn.Module):
    """Fit a camera position so the rendered silhouette matches a reference image.

    The only learnable parameter is ``camera_position`` (x, y, z). Each forward pass
    builds a look-at rotation from that position, renders the mesh with ``renderer``
    and compares the alpha channel of the result against a binary reference silhouette.

    Args:
        meshes: mesh batch to render; must expose ``.device`` and ``.clone()``.
        renderer: callable invoked as ``renderer(meshes_world=..., R=..., T=...)``;
            channel 3 of its output is used as the silhouette.
        image_ref: NumPy array holding the reference rendering; its first three
            channels are thresholded into a silhouette mask.
    """

    def __init__(self, meshes, renderer, image_ref):
        super().__init__()
        self.meshes = meshes
        self.device = meshes.device
        self.renderer = renderer

        # Get the silhouette of the reference RGB image by finding all non-white pixel values. 
        image_ref = torch.from_numpy((image_ref[..., :3].max(-1) != 1).astype(np.float32))
        self.register_buffer('image_ref', image_ref)

        # Create an optimizable parameter for the x, y, z position of the camera.
        # torch.tensor() copies the data. torch.from_numpy() would share memory with the
        # module-level `camera_init_position` array when running on CPU, so every
        # optimizer step would silently overwrite the configured starting position.
        self.camera_position = nn.Parameter(
            torch.tensor(camera_init_position, device=meshes.device))
        self.previous_camera_position = None

    def forward(self):
        """Render from the current camera position and score it against the reference.

        Returns:
            A ``(loss, image)`` tuple: the summed squared difference between the
            rendered alpha channel and the reference silhouette, and the rendered image.

        Raises:
            RuntimeError: if the look-at rotation contains NaN values.
        """
        # Render the image using the updated camera position. Based on the new position of the
        # camera we calculate the rotation and translation matrices
        R = look_at_rotation(self.camera_position[None, :], device=self.device)  # (1, 3, 3)
        if torch.isnan(R).any():
            # Fail loudly. The previous `sys.exit(0)` reported success to the caller even
            # though the optimization had diverged.
            # NOTE(review): the discussion this script was posted in reports that the NaNs
            # disappear with the fix from pytorch3d issue #561; that fix is not applied here.
            raise RuntimeError(
                "NaN in camera rotation; camera_position={}, R={}".format(
                    self.camera_position.detach().cpu().tolist(),
                    R[0].detach().cpu().tolist(),
                )
            )

        T = -torch.bmm(R.transpose(1, 2), self.camera_position[None, :, None])[:, :, 0]   # (1, 3)

        image = self.renderer(meshes_world=self.meshes.clone(), R=R, T=T)

        # Calculate the silhouette loss
        loss = torch.sum((image[..., 3] - self.image_ref) ** 2)
        return loss, image


# Build the model from the mesh, the silhouette renderer and the reference rendering,
# and move it to the selected device.
model = Model(meshes=teapot_mesh, renderer=silhouette_renderer, image_ref=image_ref).to(device)

# Adam over all model parameters, with the learning rate given on the command line.
optimizer = torch.optim.Adam(model.parameters(), lr=lr)


# ### Visualize the starting position and the reference position
# The GIF writer is opened here and fed during the optimization loop.
writer = imageio.get_writer(filename_output, mode='I', duration=0.3)

# One forward pass from the initial camera position; only the rendered image is needed.
_, image_init = model()
_start_alpha = image_init.detach().squeeze().cpu().numpy()[..., 3]
_target_mask = model.image_ref.cpu().numpy().squeeze()

plt.figure(figsize=(10, 10))

# Left panel: silhouette rendered from the starting camera position.
plt.subplot(1, 2, 1)
plt.imshow(_start_alpha)
plt.grid(False)
plt.xticks([])
plt.yticks([])
plt.title("Starting position")
plt.axis("off")

# Right panel: the silhouette the optimization should reproduce.
plt.subplot(1, 2, 2)
plt.imshow(_target_mask)
plt.grid(False)
plt.title("Reference silhouette")
plt.axis("off")

if show_current_target:
    plt.show()

# ## 4. Run the optimization
#
# Each iteration runs one forward/backward pass followed by an optimizer step. Every
# `print_frequency` iterations the current camera pose is rendered with the Phong renderer,
# appended to the GIF at `filename_output`, plotted and logged. The loop stops early once
# the silhouette loss drops below 200.
try:
    for i in range(num_iter):
        model.train()
        optimizer.zero_grad()
        loss, _ = model()
        loss.backward()
        optimizer.step()

        # Read the scalar once; it is reused for the stopping test and for logging.
        loss_value = loss.item()
        if loss_value < 200:
            break

        # Save outputs to create a GIF.
        if (i+1) % print_frequency == 0:
            model.eval()
            # Visualization only: no autograd graph is needed for this render.
            with torch.no_grad():
                R = look_at_rotation(model.camera_position[None, :], device=model.device)
                T = -torch.bmm(R.transpose(1, 2), model.camera_position[None, :, None])[:, :, 0]   # (1, 3)
                image = phong_renderer(meshes_world=model.meshes.clone(), R=R, T=T)
            image = image[0, ..., :3].squeeze().cpu().numpy()
            image = img_as_ubyte(image)
            writer.append_data(image)

            plt.figure()
            plt.imshow(image[..., :3])
            plt.title("iter: {:5d}, loss: {:.4f}".format(i+1, loss_value))
            # Pass a real boolean: the string "off" is truthy and is not a reliable way to
            # disable the grid.
            plt.grid(False)
            plt.axis("off")

            print("Optimizing iter: {:5d}, loss {:.4f}".format(i+1, loss_value))
finally:
    # Always finalize the GIF, even if an iteration raised, so frames already written
    # are flushed and the file handle is released.
    writer.close()
print("=> Saving to {}".format(filename_output))

Environment:
a. Ubuntu 18.04
b. Pytorch 1.7.1
c. Torchvision 0.8.2
d. Cuda 10.2

The exact command(s) I ran:

python camera_position_optimization.py
python camera_position_optimization.py

Observed (including the full logs):

nan

potential-bug

Source

abhi1kumar

All 3 comments

@abhi1kumar can you try the fix suggested in https://github.com/facebookresearch/pytorch3d/issues/561?

nikhilaravi on 4 Mar 2021

🎉1

@nikhilaravi Thank you for suggesting this fix. The nan errors are gone now.

abhi1kumar on 4 Mar 2021

👍1

@abhi1kumar great! We'll update the codebase with a fix for this!

nikhilaravi on 6 Mar 2021

Was this page helpful?

0 / 5 - 0 ratings

Related issues

[Documentation] DensePose documentation IO error

OmriKaduri · 3Comments

Undersampled images in documentation

eliemichel · 3Comments

ModuleNotFoundError: No module named 'pytorch3d.visualization'

OOF-dura · 3Comments

How can I do a faster rendering with a big mesh?

TSKongLingwei · 3Comments

Error encountered when trying to load a .ply model using load_ply

farhanrw · 3Comments