<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<html>

<script type="text/javascript" charset="utf-8" src="https://ajax.googleapis.com/ajax/libs/jquery/1.3.2/jquery.min.js"></script>
<script id="MathJax-script" async src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>

<style type="text/css">
body {
    font-family: "Titillium Web", "HelveticaNeue-Light", "Helvetica Neue Light", "Helvetica Neue", Helvetica, Arial, "Lucida Grande", sans-serif;
    font-weight: 300;
    font-size: 17px;
    margin-left: auto;
    margin-right: auto;
}

@media screen and (min-width: 980px) {
    body {
        width: 980px;
    }
}

h1 {
    font-size: 40px;
    font-weight: 500;
    line-height: 1.15em;
    text-align: center;
}

h2 {
    font-size: 1.75em;
    font-weight: 400;
    margin: 16px 0px 4px 0px;
}

h3 {
    font-weight: 600;
    margin: 16px 0px 4px 0px;
}

h2, h3 {
    text-align: left;
}

a:link, a:visited {
    color: #5364cc;
    text-decoration: none;
}

a:hover {
    color: #208799;
}

.paper-title {
    padding: 1px 0px 1px 0px;
}

section {
    margin: 32px 0px 32px 0px;
    text-align: justify;
    clear: both;
}

.col-5 {
    width: 20%;
    float: left;
}

.col-4 {
    width: 25%;
    float: left;
}

.col-3 {
    width: 33%;
    float: left;
}

.col-2 {
    width: 50%;
    float: left;
}

.col-1 {
    width: 100%;
    float: left;
}

.author-row, .affil-row {
    font-size: 26px;
}

.author-row-new {
    text-align: center;
}

.author-row-new a {
    display: inline-block;
    font-size: 20px;
    padding: 4px;
}

.author-row-new sup {
    color: #313436;
    font-size: 12px;
}

.affiliations-new {
    font-size: 18px;
    text-align: center;
    width: 80%;
    margin: 0 auto;
    margin-bottom: 20px;
}

.row {
    margin: 16px 0px 16px 0px;
}

.authors {
    font-size: 26px;
}
.affiliations {
    font-size: 18px;
}
.affil-row {
    margin-top: 18px;
}

.teaser {
    max-width: 100%;
}

.text-center {
    text-align: center;
}

.screenshot {
    width: 256px;
    border: 1px solid #ddd;
}

.screenshot-el {
    margin-bottom: 16px;
}

hr {
    height: 1px;
    border: 0;
    border-top: 1px solid #ddd;
    margin: 0;
}

.material-icons {
    vertical-align: -6px;
}

p {
    line-height: 1.25em;
}

.caption {
    font-size: 16px;
    color: #666;
    margin-top: 4px;
    margin-bottom: 10px;
}

video {
    display: block;
    margin: auto;
}

figure {
    display: block;
    margin: auto;
    margin-top: 10px;
    margin-bottom: 10px;
}

#bibtex pre {
    font-size: 14px;
    background-color: #eee;
    padding: 16px;
}

.blue {
    color: #2c82c9;
    font-weight: bold;
}

.orange {
    color: #d35400;
    font-weight: bold;
}

.flex-row {
    display: flex;
    flex-flow: row wrap;
    padding: 0;
    margin: 0;
    list-style: none;
}

.paper-btn-coming-soon {
    position: relative;
    top: 0;
    left: 0;
}

.coming-soon {
    position: absolute;
    top: -15px;
    right: -15px;
}

.paper-btn {
    position: relative;
    text-align: center;

    display: inline-block;
    margin: 8px;
    padding: 8px 8px;

    border-width: 0;
    outline: none;
    border-radius: 2px;

    background-color: #5364cc;
    color: white !important;
    font-size: 20px;
    width: 100px;
    font-weight: 600;
}

.paper-btn-parent {
    display: flex;
    justify-content: center;
    margin: 16px 0px;
}

.paper-btn:hover {
    opacity: 0.85;
}

.container {
    margin-left: auto;
    margin-right: auto;
    padding-left: 16px;
    padding-right: 16px;
}

.venue {
    font-size: 23px;
}

.topnav {
    background-color: #EEEEEE;
    overflow: hidden;
}

.topnav div {
    max-width: 1070px;
    margin: 0 auto;
}

.topnav a {
    display: inline-block;
    color: black;
    text-align: center;
    vertical-align: middle;
    padding: 16px 16px;
    text-decoration: none;
    font-size: 18px;
}

.topnav img {
    padding: 2px 0px;
    width: 100%;
    margin: 0.2em 0px 0.3em 0px;
    vertical-align: middle;
}

pre {
    font-size: 0.9em;
    padding: 3px 7px;
    border-radius: 3px;
    background-color: rgb(235, 235, 235);
    overflow-x: auto;
}

.download-thumb {
    display: flex;
}

@media only screen and (max-width: 620px) {
    .download-thumb {
        display: none;
    }
}

.paper-stuff {
    width: 50%;
    font-size: 20px;
}

@media only screen and (max-width: 620px) {
    .paper-stuff {
        width: 100%;
    }
}

* {
    box-sizing: border-box;
}

.column {
    text-align: center;
    float: left;
    width: 16.666%;
    padding: 5px;
}

.column3 {
    text-align: center;
    float: left;
    width: 33.333%;
    padding: 5px;
}

/* Clearfix (clear floats) */
.row::after {
    content: "";
    clear: both;
    display: table;
}

/* Responsive layout - stack the columns on narrow screens */
@media screen and (max-width: 500px) {
    .column, .column3 {
        width: 100%;
    }
}

</style>

<script type="text/javascript" src="../js/hidebib.js"></script>
<link href='https://fonts.googleapis.com/css?family=Titillium+Web:400,600,400italic,600italic,300,300italic' rel='stylesheet' type='text/css'>

<head>
    <title>LION: Latent Point Diffusion Models for 3D Shape Generation</title>
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <link href="https://fonts.googleapis.com/css2?family=Material+Icons" rel="stylesheet">
</head>

<body>
<div class="topnav" id="myTopnav">
    <div>
        <a href="https://www.nvidia.com/"><img width="100%" src="assets/nvidia.svg"></a>
        <a href="https://nv-tlabs.github.io/"><strong>Toronto AI Lab</strong></a>
    </div>
</div>
<div class="container">
    <div class="paper-title">
        <h1>
            <font color="#5364cc">LION</font>:
            <font color="#5364cc">L</font>atent Po<font color="#5364cc">i</font>nt Diffusi<font color="#5364cc">on</font> Models <br> for 3D Shape Generation</h1>
    </div>

    <div id="authors">
        <center>
            <div class="author-row-new">
                <a href="https://www.cs.utoronto.ca/~xiaohui/">Xiaohui Zeng<sup>1,2,3</sup></a>,
                <a href="http://latentspace.cc/">Arash Vahdat<sup>1</sup></a>,
                <a href="https://www.fwilliams.info/">Francis Williams<sup>1</sup></a>,
                <a href="https://zgojcic.github.io/">Zan Gojcic<sup>1</sup></a>,
                <a href="https://orlitany.github.io/">Or Litany<sup>1</sup></a>,
                <a href="https://www.cs.utoronto.ca/~fidler/">Sanja Fidler<sup>1,2,3</sup></a>,
                <a href="https://karstenkreis.github.io/">Karsten Kreis<sup>1</sup></a>
            </div>
        </center>
        <center>
            <div class="affiliations">
                <span><sup>1</sup> NVIDIA</span>
                <span><sup>2</sup> University of Toronto</span>
                <span><sup>3</sup> Vector Institute</span> <br/>
            </div>
            <div class="affil-row">
                <div class="venue text-center"><b>NeurIPS 2022</b></div>
            </div>
        </center>

        <div style="clear: both">
            <div class="paper-btn-parent">
                <a class="paper-btn" href="https://nv-tlabs.github.io/LION">
                    <span class="material-icons"> description </span>
                    Paper
                </a>
                <div class="paper-btn-coming-soon">
                    <a class="paper-btn" href="https://nv-tlabs.github.io/LION">
                        <span class="material-icons"> code </span>
                        Code
                    </a>
                </div>
            </div>
        </div>
    </div>

    <section id="teaser-image">
        <center>
            <figure>
                <video class="centered video-background" width="80%" controls autoplay loop muted playsinline>
                    <source src="assets/LION_video_v9.mp4#t=0.001" type="video/mp4">
                    Your browser does not support the video tag.
                </video>
            </figure>
        </center>
    </section>

    <section id="news">
        <hr>
        <h2>News</h2>
        <div class="row">
            <div><span class="material-icons"> event </span> [Oct 2022] <a href="https://nv-tlabs.github.io/LION">Project page</a> released!</div>
            <div><span class="material-icons"> event </span> [Oct 2022] Paper released on <a href="https://nv-tlabs.github.io/LION">arXiv</a>!</div>
            <div><span class="material-icons"> event </span> [Aug 2022] LION was accepted to <b>Advances in Neural Information Processing Systems (NeurIPS)</b>!</div>
        </div>
    </section>

    <section id="abstract">
        <hr>
        <h2>Abstract</h2>
        <div class="flex-row">
            <p>
            Denoising diffusion models (DDMs) have shown promising results in 3D point cloud synthesis. To advance 3D DDMs and make them useful for digital artists, we require <i>(i)</i> high generation quality, <i>(ii)</i> flexibility for manipulation and applications such as conditional synthesis and shape interpolation, and <i>(iii)</i> the ability to output smooth surfaces or meshes. To this end, we introduce the hierarchical Latent Point Diffusion Model (LION) for 3D shape generation. LION is set up as a variational autoencoder (VAE) with a hierarchical latent space that combines a global shape latent representation with a point-structured latent space. For generation, we train two hierarchical DDMs in these latent spaces. The hierarchical VAE approach boosts performance compared to DDMs that operate on point clouds directly, while the point-structured latents are still ideally suited for DDM-based modeling. Experimentally, LION achieves state-of-the-art generation performance on multiple ShapeNet benchmarks. Furthermore, our VAE framework allows us to easily use LION for different relevant tasks: LION excels at multimodal shape denoising and voxel-conditioned synthesis, and it can be adapted for text- and image-driven 3D generation. We also demonstrate shape autoencoding and latent shape interpolation, and we augment LION with modern surface reconstruction techniques to generate smooth 3D meshes. We hope that LION provides a powerful tool for artists working with 3D shapes due to its high-quality generation, flexibility, and surface reconstruction.
            </p>
        </div>
    </section>
    <section id="method">
        <hr>
        <h2>Method</h2>
        <div class="flex-row">
            <p>
            We introduce the Latent Point Diffusion Model (LION), a DDM for 3D shape generation. LION focuses on learning a 3D generative model directly from geometry data, without image-based training. Like previous 3D DDMs in this setting, LION operates on point clouds. However, it is constructed as a VAE with DDMs in latent space. LION comprises a hierarchical latent space with a vector-valued global shape latent and another point-structured latent space. The latent representations are predicted with point cloud processing encoders, and two latent DDMs are trained in these latent spaces. Synthesis in LION proceeds by drawing novel latent samples from the hierarchical latent DDMs and decoding back to the original point cloud space. Importantly, we also demonstrate how to augment LION with modern surface reconstruction methods to synthesize smooth shapes as desired by artists. LION has multiple advantages:
            </p>
            <p>
            <b>Expressivity:</b> By mapping point clouds into regularized latent spaces, the DDMs in latent space are effectively tasked with learning a smoothed distribution. This is easier than training on potentially complex point clouds directly, thereby improving expressivity. However, point clouds are, in principle, an ideal representation for DDMs. Because of that, we use latent points, that is, we keep a point cloud structure for our main latent representation. Augmenting the model with an additional global shape latent variable in a hierarchical manner further boosts expressivity.
            </p>
            <p>
            <b>Varying Output Types:</b> Extending LION with Shape As Points (SAP) geometry reconstruction allows us to also output smooth meshes. Fine-tuning SAP on data generated by LION's autoencoder reduces synthesis noise and enables us to generate high-quality geometry. LION combines (latent) point cloud-based modeling, ideal for DDMs, with surface reconstruction, desired by artists.
            </p>
            <p>
            <b>Flexibility:</b> Since LION is set up as a VAE, it can be easily adapted for different tasks without retraining the latent DDMs: We can efficiently fine-tune LION's encoders on voxelized or noisy inputs, which a user can provide for guidance. This enables multimodal voxel-guided synthesis and shape denoising. We also leverage LION's latent spaces for shape interpolation and autoencoding. Optionally training the DDMs conditioned on CLIP embeddings enables image- and text-driven 3D generation.
            </p>
        </div>
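        <p>
        As a summary of the two-stage synthesis procedure described above, the following minimal sketch shows how a trained LION model could be sampled. The module names (<code>global_prior</code>, <code>point_prior</code>, <code>decoder</code>) and the latent dimensions are illustrative assumptions, not LION's actual API.
        </p>
        <pre><code># Minimal sketch of LION's two-stage latent-diffusion synthesis.
# All module names and dimensions are hypothetical placeholders.
import torch

@torch.no_grad()
def sample_lion(global_prior, point_prior, decoder,
                batch=1, d_global=128, n_points=2048, d_point=4):
    # Stage 1: the first latent DDM's reverse diffusion samples the
    # vector-valued global shape latent.
    z_global = global_prior.sample((batch, d_global))

    # Stage 2: the second latent DDM samples the point-structured
    # latent, conditioned on the global shape latent.
    z_points = point_prior.sample((batch, n_points, d_point), cond=z_global)

    # The VAE decoder maps the hierarchical latents back to a point
    # cloud; Shape As Points can then optionally reconstruct a mesh.
    return decoder(z_points, z_global)  # (batch, n_points, 3)
</code></pre>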
        <center>
            <figure style="width: 100%;">
                <a>
                    <img width="80%" src="assets/pipeline.jpg">
                </a>
                <p class="caption" style="margin-bottom: 24px;"><br>
                LION is set up as a hierarchical point cloud VAE with denoising diffusion models over the shape latent and latent point distributions. Point-Voxel CNNs (PVCNN) with adaptive Group Normalization (Ada. GN) are used as neural networks. The latent points can be interpreted as a smoothed version of the input point cloud. Shape As Points (SAP) is optionally used for mesh reconstruction.
                </p>
            </figure>
        </center>
    </section>

    <section id="novelties">
        <hr>
        <h2>Main Contributions</h2>
        <div class="flex-row">
            <ul style="list-style-type:disc;">
                <li>We introduce LION, a novel generative model for 3D shape synthesis. We explore the training of multiple hierarchical denoising diffusion models in latent space.</li>
                <li>We extensively validate LION's high synthesis quality and reach state-of-the-art performance on widely used ShapeNet benchmarks.</li>
                <li>We demonstrate that LION scales to extremely diverse shape datasets. For instance, LION can model 13 or even 55 ShapeNet categories jointly without any class-conditioning. At the other extreme, we also verify that LION can be successfully trained on small datasets with fewer than 100 shapes.</li>
                <li>We propose to combine LION with Shape As Points-based surface reconstruction to directly extract practically useful meshes.</li>
                <li>We show our model's flexibility by demonstrating how LION can be adapted to various relevant tasks, such as multimodal shape denoising, voxel-guided synthesis, text- and image-driven shape generation, and more.</li>
            </ul>
        </div>
    </section>

    <section id="results">
        <hr>
        <h2>Generation Results (Single Category Models)</h2>
        <center>
            <figure>
                <video class="centered video-background" width="100%" controls autoplay muted playsinline>
                    <source src="assets/gen_airplane.mp4#t=0.001" type="video/mp4">
                    Your browser does not support the video tag.
                </video>
                <p class="caption">
                Generated point clouds and reconstructed meshes of airplanes.
                </p> <br>
            </figure>
            <figure>
                <video class="centered video-background" width="100%" controls autoplay muted playsinline>
                    <source src="assets/gen_chair.mp4#t=0.001" type="video/mp4">
                    Your browser does not support the video tag.
                </video>
                <p class="caption">
                Generated point clouds and reconstructed meshes of chairs.
                </p> <br>
            </figure>
            <figure>
                <video class="centered video-background" width="100%" controls autoplay muted playsinline>
                    <source src="assets/gen_car.mp4#t=0.001" type="video/mp4">
                    Your browser does not support the video tag.
                </video>
                <p class="caption">
                Generated point clouds and reconstructed meshes of cars.
                </p> <br>
            </figure>
            <figure style="width: 100%;">
                <video class="centered video-background" width="100%" controls autoplay muted playsinline>
                    <source src="assets/gen_animal553_v2.mp4#t=0.001" type="video/mp4">
                    Your browser does not support the video tag.
                </video>
                <p class="caption" style="margin-bottom: 24px;">
                Generated point clouds and reconstructed meshes of animals (model trained on only 553 shapes).
                </p> <br>
            </figure>
        </center>
        <center>
            <figure>
                <video class="centered video-background" width="100%" controls muted autoplay playsinline>
                    <source src="assets/gen_bottle.mp4#t=11" type="video/mp4">
                    Your browser does not support the video tag.
                </video>
                <p class="caption">
                Generated point clouds and reconstructed meshes of bottles (model trained on only 340 shapes).
                </p> <br>
            </figure>
        </center>
        <center>
            <figure>
                <video class="centered video-background" width="100%" controls autoplay muted playsinline>
                    <source src="assets/gen_mug.mp4#t=11" type="video/mp4">
                    Your browser does not support the video tag.
                </video>
                <p class="caption">
                Generated point clouds and reconstructed meshes of mugs (model trained on only 149 shapes).
                </p> <br>
            </figure>
        </center>

        <hr>
        <h2>Generation Results (Multi-Class)</h2>
        <div class="flex-row">
            <p>Below we show samples from LION models that were trained on shapes from multiple ShapeNet categories, <i>without any class-conditioning</i>. We deliberately did not use conditioning, in order to explore LION's scalability to diverse and multimodal datasets in the unconditional setting.</p>
        </div>
        <center>
            <figure>
                <video class="centered video-background" width="100%" controls autoplay muted playsinline>
                    <source src="assets/gen_all_v13.mp4#t=0.001" type="video/mp4">
                    Your browser does not support the video tag.
                </video>
                <p class="caption">
                Generated point clouds and reconstructed meshes. The LION model is trained on 13 ShapeNet categories jointly without conditioning.
                </p>
                <br>
            </figure>
        </center>
        <center>
            <figure>
                <video class="centered video-background" width="100%" controls muted playsinline>
                    <source src="assets/gen_all_55.mp4#t=0.001" type="video/mp4">
                    Your browser does not support the video tag.
                </video>
                <p class="caption">
                Generated point clouds from a LION model that was trained on all 55 ShapeNet categories jointly without conditioning.
                </p>
                <br>
            </figure>
        </center>
    </section>

    <section id="more_results">
        <hr>
        <h2>More Results and Applications</h2>
        <p>Our main goal was to introduce a high-performance 3D shape generative model. Here, we qualitatively demonstrate how LION can be used for a variety of interesting applications.</p>
        <h3>Shape Interpolation</h3>
        <div class="flex-row">
            <p>LION can interpolate shapes by traversing the latent space (interpolation is performed in the latent diffusion models' prior space, using the <i>Probability Flow ODE</i> for deterministic DDM generation); see the sketch below. The generated shapes are clean and semantically plausible along the entire interpolation path.</p>
        </div>
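        <p>
        A minimal sketch of this interpolation, assuming hypothetical helpers <code>ddim_encode</code> and <code>ddim_decode</code> that integrate the probability flow ODE forwards and backwards, giving a deterministic mapping between latents and Gaussian noise:
        </p>
        <pre><code># Sketch of latent shape interpolation (hypothetical helper functions).
import torch

def slerp(a, b, t):
    # Spherical interpolation between two noise tensors, which stays
    # close to the typical set of the Gaussian prior.
    a_n, b_n = a / a.norm(), b / b.norm()
    omega = torch.arccos((a_n * b_n).sum().clamp(-1.0, 1.0))
    return (torch.sin((1 - t) * omega) * a + torch.sin(t * omega) * b) / torch.sin(omega)

@torch.no_grad()
def interpolate_latents(prior, z0, z1, steps=8):
    e0 = ddim_encode(prior, z0)  # latent to prior noise (PF-ODE forwards)
    e1 = ddim_encode(prior, z1)
    return [ddim_decode(prior, slerp(e0, e1, t))  # noise back to latent
            for t in torch.linspace(0, 1, steps)]
</code></pre>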
        <figure>
            <video class="centered video-background" width="100%" controls muted playsinline>
                <source src="assets/LION_interp.mp4#t=0.001" type="video/mp4">
                Your browser does not support the video tag.
            </video>
            <p class="caption">
            Leftmost shape: the source shape. Rightmost shape: the target shape. The shapes in the middle are interpolated between the source and target shapes.
            </p>
        </figure>
        <center>
            <figure>
                <video class="centered video-background" width="50%" controls loop autoplay muted playsinline>
                    <source src="assets/LION_interp_seq.mp4#t=0.001" type="video/mp4">
                    Your browser does not support the video tag.
                </video>
                <p class="caption">
                LION traverses the latent space and interpolates many different shapes.
                </p>
            </figure>
        </center>
        <br>
        <h3>Fast Sampling with DDIM</h3>
        <div class="flex-row">
            <p>LION's sampling time can be reduced by using fast DDM samplers, such as the DDIM sampler. DDIM sampling with 25 steps already generates high-quality shapes and takes less than one second, enabling real-time and interactive applications. A sketch of the deterministic DDIM update follows below.</p>
        </div>
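        <p>
        For reference, this is a generic deterministic DDIM update step (eta = 0), written against any noise-prediction network <code>eps_model</code>; it is a standard DDIM sketch, not LION-specific code. The speed-up comes from evaluating it on a short subsequence of timesteps, e.g. 25 out of 1000.
        </p>
        <pre><code># Generic deterministic DDIM step; alpha_bar[t] is the cumulative
# product of (1 - beta) from the diffusion noise schedule.
import numpy as np

def ddim_step(x_t, t, t_prev, eps_model, alpha_bar):
    eps = eps_model(x_t, t)  # predicted noise at step t
    a_t, a_prev = alpha_bar[t], alpha_bar[t_prev]
    # Predict the clean sample, then jump directly to step t_prev.
    x0_pred = (x_t - np.sqrt(1.0 - a_t) * eps) / np.sqrt(a_t)
    return np.sqrt(a_prev) * x0_pred + np.sqrt(1.0 - a_prev) * eps

# Fast sampling visits only a few timesteps, e.g. 25 of 1000:
# taus = np.linspace(999, 0, 26).astype(int)
# for t, t_prev in zip(taus[:-1], taus[1:]):
#     x = ddim_step(x, t, t_prev, eps_model, alpha_bar)
</code></pre>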
        <center>
            <figure style="width: 100%;">
                <a>
                    <img width="100%" src="assets/ddim_sample.png">
                </a>
                <p class="caption" style="margin-bottom: 24px;">
                DDIM samples from LION trained on different data. The top two rows show the number of steps and the wall-clock time required to draw one sample. With DDIM sampling, we can reduce the sampling time for one object from 27.09 sec (1000 steps) to less than 1 sec (25 steps).
                </p>
            </figure>
        </center>

        <br>
        <h3>Multimodal Generation</h3>
        <div class="flex-row">
            <p>
            LION can synthesize different variations of a given shape, enabling multimodal generation in a controlled manner. This is achieved through a diffuse-denoise procedure, where shapes are diffused for only a few steps in the latent DDMs and then denoised again; see the sketch below.
            </p>
        </div>
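        <p>
        A minimal sketch of diffuse-denoise, assuming a hypothetical <code>prior.denoise_from</code> handle on the latent DDM: the closed-form forward diffusion perturbs a latent up to step k, and the reverse process then resamples local details while the overall shape is preserved.
        </p>
        <pre><code># Sketch of the diffuse-denoise procedure (hypothetical DDM handle).
import torch

@torch.no_grad()
def diffuse_denoise(prior, z0, k, alpha_bar):
    # Forward diffusion to step k in closed form: q(z_k | z_0).
    noise = torch.randn_like(z0)
    z_k = torch.sqrt(alpha_bar[k]) * z0 + torch.sqrt(1.0 - alpha_bar[k]) * noise
    # Reverse denoising from step k back to 0 with the latent DDM.
    # Small k keeps the global shape; larger k allows more variation.
    return prior.denoise_from(z_k, start_step=k)
</code></pre>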
        <center>
            <figure style="width: 30%;">
                <a>
                    <img width="30%" src="assets/multi_modal/airplane/recon_313.jpg">
                    <img width="30%" src="assets/multi_modal/airplane/airplane_D200/sap_0000/recon_0.png">
                    <img width="30%" src="assets/multi_modal/airplane/airplane_D200/sap_0000/recon_2.png">
                </a>
                <p class="caption" style="margin-bottom: 24px;">
                Multimodal generation of airplanes.
                </p>
            </figure>
        </center>
        <center>
            <figure style="width: 30%;">
                <a>
                    <img width="30%" src="assets/multi_modal/chair/recon_137.jpg">
                    <img width="30%" src="assets/multi_modal/chair/chair_D160/sap_0000/recon_4.png">
                    <img width="30%" src="assets/multi_modal/chair/chair_D160/sap_0000/recon_9.png">
                </a>
                <p class="caption" style="margin-bottom: 24px;">
                Multimodal generation of chairs.
                </p>
            </figure>
        </center>
        <center>
            <figure style="width: 30%;">
                <a>
                    <img width="30%" src="assets/multi_modal/car/recon_46.jpg">
                    <img width="30%" src="assets/multi_modal/car/recon_0.png">
                    <img width="30%" src="assets/multi_modal/car/recon_5.png">
                </a>
                <p class="caption" style="margin-bottom: 24px;">
                Multimodal generation of cars.
                </p>
            </figure>
        </center>

        <h3>Voxel-Conditioned Synthesis</h3>
        <div class="flex-row">
            <p>Given a coarse voxel grid, LION can generate different plausible detailed shapes. In practice, an artist using a 3D generative model may have a rough idea of the desired shape. For instance, they may be able to quickly construct a coarse voxelized shape, to which the generative model then adds realistic details. We achieve this by fine-tuning our encoder networks on voxelized shapes and performing a few steps of diffuse-denoise in latent space to generate various plausible detailed shapes; a sketch follows below.</p>
        </div>
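        <p>
        A minimal sketch of voxel-guided synthesis under the same assumptions as above: <code>encoder_ft</code> stands for the VAE encoder fine-tuned on voxelized inputs, and <code>diffuse_denoise</code> is the routine from the multimodal-generation sketch. All names are hypothetical.
        </p>
        <pre><code># Sketch of voxel-guided synthesis (hypothetical components).
import torch

@torch.no_grad()
def voxel_guided_sample(encoder_ft, prior, decoder, voxels, k, alpha_bar):
    z = encoder_ft(voxels)                           # coarse voxels to latent
    z_var = diffuse_denoise(prior, z, k, alpha_bar)  # resample local details
    return decoder(z_var)                            # plausible detailed shape
</code></pre>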
        <center>
            <figure style="width: 80%;">
                <video class="centered video-background" width="80%" controls muted playsinline>
                    <source src="assets/airplane_voxel.mp4#t=14.8" type="video/mp4">
                    Your browser does not support the video tag.
                </video>
                <p class="caption" style="margin-bottom: 24px;">
                Left: Input voxel grid. Right: Two point clouds generated by LION and the reconstructed meshes.
                </p>
            </figure>
        </center>
        <br>

        <h3>Single View Reconstruction</h3>
        <div class="flex-row">
            <p>
            We qualitatively demonstrate how LION can be extended to also allow for single view reconstruction (SVR) from RGB data, using the approach from CLIP-Forge. We render 2D images from the 3D ShapeNet shapes, extract the images' CLIP image embeddings, and train LION's latent diffusion models while conditioning on the shapes' CLIP image embeddings. At test time, we then take a single view 2D image, extract the CLIP image embedding, and generate corresponding 3D shapes, thereby effectively performing SVR (see the sketch below). We show SVR results from real RGB data.
            </p>
        </div>
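        <p>
        A sketch of the CLIP-conditioned test-time path. <code>clip.load</code>, <code>encode_image</code>, and <code>tokenize</code> are from OpenAI's CLIP package; the conditional latent priors and the decoder are hypothetical placeholders for LION's CLIP-conditioned components, and the file name is illustrative.
        </p>
        <pre><code># Sketch: single view reconstruction via CLIP image embeddings.
import clip
import torch
from PIL import Image

model, preprocess = clip.load("ViT-B/32")
image = preprocess(Image.open("chair.jpg")).unsqueeze(0)
with torch.no_grad():
    emb = model.encode_image(image)  # CLIP image embedding

# The embedding conditions both latent DDMs, which are then decoded
# as in the unconditional case (hypothetical calls):
# z_global = global_prior.sample(cond=emb)
# z_points = point_prior.sample(cond=(z_global, emb))
# shape = decoder(z_points, z_global)
</code></pre>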
        <center>
            <figure style="width: 100%;">
                <a>
                    <img width="49%" src="assets/svr/img2shape_mitsuba_full.jpg">
                    <img width="49%" src="assets/svr/img2shape_cari2s_mm_mitsuba_full.jpg">
                </a>
                <p class="caption" style="margin-bottom: 24px;">
                Single view reconstruction from RGB images of chairs. For each input image, LION can generate multimodal outputs.
                </p>
            </figure>
        </center>
        <center>
            <figure style="width: 100%;">
                <a>
                    <img width="100%" src="assets/svr/img2shape_cari2s_mitsuba_full.jpg">
                </a>
                <p class="caption" style="margin-bottom: 24px;">
                More single view reconstructions from RGB images of cars.
                </p>
            </figure>
        </center>
        <br>
        <h3>Text-Guided Generation</h3>
        <div class="flex-row">
            <p>
            Using CLIP's text encoder, our method additionally allows for text-guided generation: the text embedding simply replaces the image embedding in the conditioning path sketched above.
            </p>
        </div>
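        <p>
        The corresponding change to the sketch above is one line, again assuming the same hypothetical conditional priors:
        </p>
        <pre><code># Sketch: swap in CLIP's text encoder for text-guided generation.
import clip
import torch

model, _ = clip.load("ViT-B/32")
with torch.no_grad():
    emb = model.encode_text(clip.tokenize(["a chair made of wood"]))
# emb then conditions the latent DDMs exactly like the image embedding.
</code></pre>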
        <center>
            <figure style="width: 100%;">
                <a>
                    <img width="35%" src="assets/clipforge_chair.png">
                    <img width="60%" src="assets/clipforge_car.png">
                </a>
                <p class="caption" style="margin-bottom: 24px;">
                Text-driven shape generation with LION. The bottom row shows the text inputs.
                </p>
            </figure>
        </center>

        <h3>Per-Sample Text-Driven Texture Synthesis</h3>
        <div class="flex-row">
            <p>
            We apply Text2Mesh to meshes generated by LION to additionally synthesize textures in a text-driven manner, leveraging CLIP. This is only possible because LION can output practically useful meshes with its SAP-based surface reconstruction component (even though the backbone generative modeling component is point cloud-based).
            </p>
        </div>
        <div class="row">
            <div class="column">
                <img width="100%" src="assets/text2mesh/strawberries_airplane-rec_3.jpg">
                <figcaption align="center">an airplane made of strawberry</figcaption>
            </div>
            <div class="column">
                <img width="100%" src="assets/text2mesh/fabric_leather_airplane-rec_3.jpg">
                <figcaption align="center">an airplane made of fabric leather</figcaption>
            </div>
            <div class="column">
                <img width="100%" src="assets/text2mesh/wood_chair-rec_421_norm1.jpg">
                <figcaption align="center">a chair made of wood</figcaption>
            </div>
            <div class="column">
                <img width="100%" src="assets/text2mesh/wrong_copied1-rec_293_norm0.jpg">
                <figcaption align="center">a car made of rusty metal</figcaption>
            </div>
            <div class="column">
                <img width="100%" src="assets/text2mesh/brick_car-rec_67_norm1.jpg">
                <figcaption align="center">a car made of brick</figcaption>
            </div>
            <div class="column">
                <img width="100%" src="assets/text2mesh/wrong_copied1-rec_12_norm1.jpg">
                <figcaption align="center">a denim fabric animal</figcaption>
            </div>
        </div>
        <br>
    </section>

    <section id="paper">
        <hr>
        <h2>Paper</h2>
        <div class="flex-row">
            <div class="download-thumb">
                <div style="box-sizing: border-box; padding: 16px; margin: auto;">
                    <a href="https://nv-tlabs.github.io/LION"><img class="screenshot" src="assets/cld_paper_preview.png"></a>
                </div>
            </div>
            <div class="paper-stuff">
                <p><b>LION: Latent Point Diffusion Models for 3D Shape Generation</b></p>
                <p>Xiaohui Zeng, Arash Vahdat, Francis Williams, Zan Gojcic, Or Litany, Sanja Fidler, Karsten Kreis</p>
                <p><i>Advances in Neural Information Processing Systems (NeurIPS), 2022</i></p>
                <div><span class="material-icons"> description </span><a href="https://nv-tlabs.github.io/LION"> arXiv version</a></div>
                <div><span class="material-icons"> insert_comment </span><a href="assets/zeng2022lion.bib"> BibTeX</a></div>
                <div><span class="material-icons"> integration_instructions </span><a href="https://nv-tlabs.github.io/LION"> Code</a></div>
            </div>
        </div>
    </section>

    <section id="bibtex">
        <hr>
        <h2>Citation</h2>
        <pre><code>@inproceedings{zeng2022lion,
    title={LION: Latent Point Diffusion Models for 3D Shape Generation},
    author={Xiaohui Zeng and Arash Vahdat and Francis Williams and Zan Gojcic and Or Litany and Sanja Fidler and Karsten Kreis},
    booktitle={Advances in Neural Information Processing Systems (NeurIPS)},
    year={2022}
}</code></pre>
    </section>
</div>
</body>
</html>