<!doctype html>
<html lang="en">
<!-- External libraries: jQuery (legacy 1.3.2 — TODO: confirm hidebib.js still needs it) and MathJax 3 for TeX rendering -->
<script charset="utf-8" src="https://ajax.googleapis.com/ajax/libs/jquery/1.3.2/jquery.min.js"></script>
<script id="MathJax-script" async src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>
<style>
/* Page-wide typography; body is centered at a fixed width on wide screens */
body {
  font-family: "Titillium Web", "HelveticaNeue-Light", "Helvetica Neue Light", "Helvetica Neue", Helvetica, Arial, "Lucida Grande", sans-serif;
  font-weight: 300;
  font-size: 17px;
  margin-left: auto;
  margin-right: auto;
}
@media screen and (min-width: 980px) {
  body {
    width: 980px;
  }
}
/* Headings — merged from previously duplicated rules; final computed values preserved */
h1 {
  font-weight: 500;
  font-size: 40px;
  line-height: 1.15em;
  text-align: center;
}
h2 {
  font-size: 1.75em;
  font-weight: 400;
  text-align: left;
  margin: 16px 0px 4px 0px;
}
h3 {
  font-weight: 600;
  text-align: left;
  margin: 16px 0px 4px 0px;
}
a:link, a:visited {
  color: #5364cc;
  text-decoration: none;
}
a:hover {
  color: #208799;
}
.paper-title {
  padding: 1px 0px 1px 0px;
}
section {
  margin: 32px 0px 32px 0px;
  text-align: justify;
  clear: both;
}
/* Simple float-based column helpers */
.col-5 {
  width: 20%;
  float: left;
}
.col-4 {
  width: 25%;
  float: left;
}
.col-3 {
  width: 33%;
  float: left;
}
.col-2 {
  width: 50%;
  float: left;
}
.col-1 {
  width: 100%;
  float: left;
}
.author-row, .affil-row {
  font-size: 26px;
}
.author-row-new {
  text-align: center;
}
.author-row-new a {
  display: inline-block;
  font-size: 20px;
  padding: 4px;
}
.author-row-new sup {
  color: #313436;
  font-size: 12px;
}
.affiliations-new {
  font-size: 18px;
  text-align: center;
  width: 80%;
  margin: 0 auto;
  margin-bottom: 20px;
}
.row {
  margin: 16px 0px 16px 0px;
}
.authors {
  font-size: 26px;
}
/* Fixed selector typo: was ".affiliatons", which never matched class="affiliations" in the markup */
.affiliations {
  font-size: 18px;
}
.affil-row {
  margin-top: 18px;
}
.teaser {
  max-width: 100%;
}
.text-center {
  text-align: center;
}
.screenshot {
  width: 256px;
  border: 1px solid #ddd;
}
.screenshot-el {
  margin-bottom: 16px;
}
hr {
  height: 1px;
  border: 0;
  border-top: 1px solid #ddd;
  margin: 0;
}
.material-icons {
  vertical-align: -6px;
}
p {
  line-height: 1.25em;
}
.caption {
  font-size: 16px;
  color: #666;
  margin-top: 4px;
  margin-bottom: 10px;
}
/* Videos and figures are block-level and horizontally centered */
video {
  display: block;
  margin: auto;
}
figure {
  display: block;
  margin: auto;
  margin-top: 10px;
  margin-bottom: 10px;
}
#bibtex pre {
  font-size: 14px;
  background-color: #eee;
  padding: 16px;
}
.blue {
  color: #2c82c9;
  font-weight: bold;
}
.orange {
  color: #d35400;
  font-weight: bold;
}
.flex-row {
  display: flex;
  flex-flow: row wrap;
  padding: 0;
  margin: 0;
  list-style: none;
}
.paper-btn-coming-soon {
  position: relative;
  top: 0;
  left: 0;
}
.coming-soon {
  position: absolute;
  top: -15px;
  right: -15px;
}
.paper-btn {
  position: relative;
  text-align: center;
  display: inline-block;
  margin: 8px;
  padding: 8px 8px;
  border-width: 0;
  outline: none;
  border-radius: 2px;
  background-color: #5364cc;
  color: white !important;
  font-size: 20px;
  width: 100px;
  font-weight: 600;
}
.paper-btn-parent {
  display: flex;
  justify-content: center;
  margin: 16px 0px;
}
.paper-btn:hover {
  opacity: 0.85;
}
.container {
  margin-left: auto;
  margin-right: auto;
  padding-left: 16px;
  padding-right: 16px;
}
.venue {
  font-size: 23px;
}
/* Top navigation bar with NVIDIA / lab links */
.topnav {
  background-color: #EEEEEE;
  overflow: hidden;
}
.topnav div {
  max-width: 1070px;
  margin: 0 auto;
}
.topnav a {
  display: inline-block;
  color: black;
  text-align: center;
  vertical-align: middle;
  padding: 16px 16px;
  text-decoration: none;
  font-size: 18px;
}
.topnav img {
  padding: 2px 0px;
  width: 100%;
  margin: 0.2em 0px 0.3em 0px;
  vertical-align: middle;
}
pre {
  font-size: 0.9em;
  padding-left: 7px;
  padding-right: 7px;
  padding-top: 3px;
  padding-bottom: 3px;
  border-radius: 3px;
  background-color: rgb(235, 235, 235);
  overflow-x: auto;
}
.download-thumb {
  display: flex;
}
@media only screen and (max-width: 620px) {
  .download-thumb {
    display: none;
  }
}
.paper-stuff {
  width: 50%;
  font-size: 20px;
}
@media only screen and (max-width: 620px) {
  .paper-stuff {
    width: 100%;
  }
}
* {
  box-sizing: border-box;
}
/* Image grids: 6-wide and 3-wide float columns */
.column {
  text-align: center;
  float: left;
  width: 16.666%;
  padding: 5px;
}
.column3 {
  text-align: center;
  float: left;
  width: 33.333%;
  padding: 5px;
}
/* Clearfix (clear floats) */
.row::after {
  content: "";
  clear: both;
  display: table;
}
/* Responsive layout - stack the grid columns on narrow screens */
@media screen and (max-width: 500px) {
  .column {
    width: 100%;
  }
}
@media screen and (max-width: 500px) {
  .column3 {
    width: 100%;
  }
}
</style>
<head>
  <meta charset="utf-8">
  <title>LION: Latent Point Diffusion Models for 3D Shape Generation</title>
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <!-- Web fonts used by the stylesheet above -->
  <link href="https://fonts.googleapis.com/css?family=Titillium+Web:400,600,400italic,600italic,300,300italic" rel="stylesheet">
  <link href="https://fonts.googleapis.com/css2?family=Material+Icons" rel="stylesheet">
  <!-- Toggles bibliography entries; needs the DOM, hence defer -->
  <script src="../js/hidebib.js" defer></script>
</head>
<body>
<div class="topnav" id="myTopnav">
  <div>
    <a href="https://www.nvidia.com/"><img src="assets/nvidia.svg" alt="NVIDIA"></a>
    <a href="https://nv-tlabs.github.io/"><strong>Toronto AI Lab</strong></a>
  </div>
</div>
<div class="container">
<div class="paper-title">
  <h1>
    <span style="color:#5364cc">LION</span>:
    <span style="color:#5364cc">L</span>atent Po<span style="color:#5364cc">i</span>nt Diffusi<span style="color:#5364cc">on</span> Models <br> for 3D Shape Generation
  </h1>
</div>
<div id="authors">
  <div class="author-row-new">
    <a href="https://www.cs.utoronto.ca/~xiaohui/">Xiaohui Zeng<sup>1,2,3</sup></a>,
    <a href="http://latentspace.cc/">Arash Vahdat<sup>1</sup></a>,
    <a href="https://www.fwilliams.info/">Francis Williams<sup>1</sup></a>,
    <a href="https://zgojcic.github.io/">Zan Gojcic<sup>1</sup></a>,
    <a href="https://orlitany.github.io/">Or Litany<sup>1</sup></a>,
    <a href="https://www.cs.utoronto.ca/~fidler/">Sanja Fidler<sup>1,2,3</sup></a>,
    <a href="https://karstenkreis.github.io/">Karsten Kreis<sup>1</sup></a>
  </div>
  <div class="affiliations" style="text-align: center;">
    <span><sup>1</sup> NVIDIA</span>
    <span><sup>2</sup> University of Toronto</span>
    <span><sup>3</sup> Vector Institute</span> <br>
  </div>
  <div class="affil-row" style="text-align: center;">
    <div class="venue text-center"><b>NeurIPS 2022</b></div>
  </div>
  <div style="clear: both">
    <div class="paper-btn-parent">
      <a class="paper-btn" href="https://nv-tlabs.github.io/LION">
        <span class="material-icons"> description </span>
        Paper
      </a>
      <div class="paper-btn-coming-soon">
        <a class="paper-btn" href="https://nv-tlabs.github.io/LION">
          <span class="material-icons"> code </span>
          Code
        </a>
      </div>
    </div>
  </div>
</div>
<section id="teaser-image">
  <!-- Teaser video: LION's generation process -->
  <figure style="text-align: center;">
    <video class="centered video-background" style="width: 80%;" controls autoplay loop muted playsinline>
      <source src="assets/LION_video_v9.mp4#t=0.001" type="video/mp4">
      Your browser does not support the video tag.
    </video>
  </figure>
</section>
<section id="news">
  <hr>
  <h2>News</h2>
  <div class="row">
    <div><span class="material-icons"> event </span> [Oct 2022] <a href="https://nv-tlabs.github.io/LION">Project page</a> released!</div>
    <div><span class="material-icons"> event </span> [Oct 2022] Paper released on <a href="https://nv-tlabs.github.io/LION">arXiv</a>!</div>
    <div><span class="material-icons"> event </span> [Aug 2022] LION got accepted to <b>Advances in Neural Information Processing Systems (NeurIPS)</b>!</div>
  </div>
</section>
<section id="abstract">
  <hr>
  <h2>Abstract</h2>
  <div class="flex-row">
    <p>
      Denoising diffusion models (DDMs) have shown promising results in 3D point cloud synthesis. To advance 3D DDMs and make them useful
      for digital artists, we require <i>(i)</i> high generation quality, <i>(ii)</i> flexibility for manipulation and applications such as conditional
      synthesis and shape interpolation, and <i>(iii)</i> the ability to output smooth surfaces or meshes. To this end, we introduce the
      hierarchical Latent Point Diffusion Model (LION) for 3D shape generation. LION is set up as a variational autoencoder (VAE) with
      a hierarchical latent space that combines a global shape latent representation with a point-structured latent space. For generation,
      we train two hierarchical DDMs in these latent spaces. The hierarchical VAE approach boosts performance compared to DDMs that operate
      on point clouds directly, while the point-structured latents are still ideally suited for DDM-based modeling. Experimentally, LION
      achieves state-of-the-art generation performance on multiple ShapeNet benchmarks. Furthermore, our VAE framework allows us to easily
      use LION for different relevant tasks: LION excels at multimodal shape denoising and voxel-conditioned synthesis, and it can be adapted
      for text- and image-driven 3D generation. We also demonstrate shape autoencoding and latent shape interpolation, and we augment LION with modern
      surface reconstruction techniques to generate smooth 3D meshes. We hope that LION provides a powerful tool for artists working with
      3D shapes due to its high-quality generation, flexibility, and surface reconstruction.
    </p>
  </div>
</section>
<section id="method">
  <hr>
  <h2>Method</h2>
  <div class="flex-row">
    <p>
      We introduce the Latent Point Diffusion Model (LION), a DDM for 3D shape generation.
      LION focuses on learning a 3D generative model directly from geometry data without image-based training.
      Similar to previous 3D DDMs in this setting, LION operates on point clouds. However, it is constructed as a VAE with DDMs in latent
      space. LION comprises a hierarchical latent space with a vector-valued global shape latent and another
      point-structured latent space. The latent representations are predicted with point cloud processing
      encoders, and two latent DDMs are trained in these latent spaces. Synthesis in LION proceeds by drawing
      novel latent samples from the hierarchical latent DDMs and decoding back to the original point
      cloud space. Importantly, we also demonstrate how to augment LION with modern surface reconstruction methods to
      synthesize smooth shapes as desired by artists. LION has multiple advantages:
    </p>
    <p>
      <b>Expressivity:</b> By mapping point clouds into regularized latent spaces, the DDMs in latent space are
      effectively tasked with learning a smoothed distribution. This is easier than training on potentially
      complex point clouds directly, thereby improving expressivity. However, point clouds are, in
      principle, an ideal representation for DDMs. Because of that, we use latent points, that is, we keep a
      point cloud structure for our main latent representation. Augmenting the model with an additional
      global shape latent variable in a hierarchical manner further boosts expressivity.
    </p>
    <p>
      <b>Varying Output Types:</b> Extending LION with Shape As Points (SAP) geometry reconstruction
      allows us to also output smooth meshes. Fine-tuning SAP on data generated by LION's autoencoder
      reduces synthesis noise and enables us to generate high-quality geometry. LION combines (latent)
      point cloud-based modeling, ideal for DDMs, with surface reconstruction, desired by artists.
    </p>
    <p>
      <b>Flexibility:</b> Since LION is set up as a VAE, it can be easily adapted for different tasks without
      retraining the latent DDMs: We can efficiently fine-tune LION's encoders on voxelized or noisy inputs,
      which a user can provide for guidance. This enables multimodal voxel-guided synthesis and shape
      denoising. We also leverage LION's latent spaces for shape interpolation and autoencoding. Optionally
      training the DDMs conditioned on CLIP embeddings enables image- and text-driven 3D generation.
    </p>
  </div>
  <figure style="width: 100%; text-align: center;">
    <img src="assets/pipeline.jpg" alt="Overview of the LION pipeline: point cloud VAE with latent diffusion models" style="width: 80%;">
    <p class="caption" style="margin-bottom: 24px;"><br>
      LION is set up as a hierarchical point cloud VAE with denoising diffusion models over the shape latent and latent point distributions.
      Point-Voxel CNNs (PVCNN) with adaptive Group Normalization (Ada. GN) are used as neural networks.
      The latent points can be interpreted as a smoothed version of the input point cloud.
      Shape As Points (SAP) is optionally used for mesh reconstruction.
    </p>
  </figure>
</section>
<!-- Removed: commented-out legacy teaser-video section (superseded by the teaser video above) -->
<section id="novelties">
  <hr>
  <h2>Main Contributions</h2>
  <div class="flex-row">
    <ul style="list-style-type:disc;">
      <li>We introduce LION, a novel generative model for 3D shape synthesis. We explore the training of multiple hierarchical denoising diffusion models in latent space.</li>
      <li>We extensively validate LION's high synthesis quality and reach state-of-the-art performance on widely used ShapeNet benchmarks.</li>
      <li>We demonstrate that LION scales to extremely diverse shape datasets. For instance, LION can model 13 or even 55 ShapeNet categories jointly without any class-conditioning. In the other extreme, we also verify that LION can be successfully trained on small datasets with fewer than 100 shapes.</li>
      <li>We propose to combine LION with Shape As Points-based surface reconstruction to directly extract practically useful meshes.</li>
      <li>We show our model's flexibility by demonstrating how LION can be adapted to various relevant tasks, such as multimodal shape denoising, voxel-guided synthesis, text- and image-driven shape generation, and more.</li>
    </ul>
  </div>
</section>
<section id="results">
  <hr>
  <h2>Generation Results (Single Category Models)</h2>
  <div style="text-align: center;">
    <figure>
      <video class="centered video-background" style="width: 100%;" controls autoplay muted playsinline>
        <source src="assets/gen_airplane.mp4#t=0.001" type="video/mp4">
        Your browser does not support the video tag.
      </video>
      <p class="caption">
        Generated point clouds and reconstructed meshes of airplanes.
      </p> <br>
    </figure>
    <figure>
      <video class="centered video-background" style="width: 100%;" controls autoplay muted playsinline>
        <source src="assets/gen_chair.mp4#t=0.001" type="video/mp4">
        Your browser does not support the video tag.
      </video>
      <p class="caption">
        Generated point clouds and reconstructed meshes of chairs.
      </p> <br>
    </figure>
    <figure>
      <video class="centered video-background" style="width: 100%;" controls autoplay muted playsinline>
        <source src="assets/gen_car.mp4#t=0.001" type="video/mp4">
        Your browser does not support the video tag.
      </video>
      <p class="caption">
        Generated point clouds and reconstructed meshes of cars.
      </p> <br>
    </figure>
    <figure style="width: 100%;">
      <video class="centered video-background" style="width: 100%;" controls autoplay muted playsinline>
        <source src="assets/gen_animal553_v2.mp4#t=0.001" type="video/mp4">
        Your browser does not support the video tag.
      </video>
      <p class="caption" style="margin-bottom: 24px;">
        Generated point clouds and reconstructed meshes of animals (model trained on only 553 shapes).
      </p> <br>
    </figure>
    <figure>
      <video class="centered video-background" style="width: 100%;" controls muted autoplay playsinline>
        <source src="assets/gen_bottle.mp4#t=11" type="video/mp4">
        Your browser does not support the video tag.
      </video>
      <p class="caption">
        Generated point clouds and reconstructed meshes of bottles (model trained on only 340 shapes).
      </p> <br>
    </figure>
    <figure>
      <video class="centered video-background" style="width: 100%;" controls autoplay muted playsinline>
        <source src="assets/gen_mug.mp4#t=11" type="video/mp4">
        Your browser does not support the video tag.
      </video>
      <p class="caption">
        Generated point clouds and reconstructed meshes of mugs (model trained on only 149 shapes).
      </p> <br>
    </figure>
  </div>
  <hr>
  <h2>Generation Results (Multi-Class)</h2>
  <div class="flex-row">
    <p>Below we show samples from LION models that were trained on shapes from multiple ShapeNet categories, <em>without any class-conditioning</em>. We on purpose did not use conditioning to explore LION's scalability to diverse and multimodal datasets in the unconditional setting.</p>
  </div>
  <div style="text-align: center;">
    <figure>
      <video class="centered video-background" style="width: 100%;" controls autoplay muted playsinline>
        <source src="assets/gen_all_v13.mp4#t=0.001" type="video/mp4">
        Your browser does not support the video tag.
      </video>
      <p class="caption">
        Generated point clouds and reconstructed meshes. The LION model is trained on 13 ShapeNet categories jointly without conditioning.
      </p>
      <br>
    </figure>
    <figure>
      <video class="centered video-background" style="width: 100%;" controls muted playsinline>
        <source src="assets/gen_all_55.mp4#t=0.001" type="video/mp4">
        Your browser does not support the video tag.
      </video>
      <p class="caption">
        Generated point clouds from a LION model that was trained on all 55 ShapeNet categories jointly without conditioning.
      </p>
      <br>
    </figure>
  </div>
</section>
<section id="more_results">
  <hr>
  <h2>More Results and Applications</h2>
  <p>Our main goal was to introduce a high-performance 3D shape generative model. Here, we qualitatively demonstrate how LION can be used for a variety of interesting applications.</p>
  <h3>Shape Interpolation</h3>
  <div class="flex-row">
    <p>LION can interpolate shapes by traversing the latent space (interpolation is performed in the latent diffusion models' prior space, using the <i>Probability Flow ODE</i> for deterministic DDM-generation). The generated shapes are clean and semantically plausible along the entire interpolation path.</p>
  </div>
  <figure>
    <video class="centered video-background" style="width: 100%;" controls muted playsinline>
      <source src="assets/LION_interp.mp4#t=0.001" type="video/mp4">
      Your browser does not support the video tag.
    </video>
    <p class="caption">
      Leftmost shape: the source shape. Rightmost shape: the target shape. The shapes in the middle are interpolated between source and target shape.
    </p>
  </figure>
  <div style="text-align: center;">
    <figure>
      <video class="centered video-background" style="width: 50%;" controls loop autoplay muted playsinline>
        <source src="assets/LION_interp_seq.mp4#t=0.001" type="video/mp4">
        Your browser does not support the video tag.
      </video>
      <p class="caption">
        LION traverses the latent space and interpolates many different shapes.
      </p>
    </figure>
  </div>
  <br>
  <h3>Fast Sampling with DDIM</h3>
  <div class="flex-row">
    <p>LION's sampling time can be reduced by using a fast DDM sampler, such as the DDIM sampler. DDIM sampling with 25 steps can already generate high-quality shapes, which takes less than 1 sec. This enables real-time and interactive applications.</p>
  </div>
  <div style="text-align: center;">
    <figure style="width: 100%;">
      <img src="assets/ddim_sample.png" alt="Grids of DDIM samples from LION at different step counts" style="width: 100%;">
      <p class="caption" style="margin-bottom: 24px;">
        DDIM samples from LION trained on different data. The top two rows show the number of steps and the wall-clock time required when drawing one sample.
        With DDIM sampling, we can reduce the sampling time from 27.09 sec (1000 steps) to less than 1 sec (25 steps) to generate an object.
      </p>
    </figure>
  </div>
  <br>
2022-10-08 09:53:39 +00:00
<h3> Multimodal Generation</h3>
2022-10-08 02:25:20 +00:00
<div class="flex-row">
<p>
2022-10-08 09:53:39 +00:00
LION can synthesize different variations of a given shape, enabling multimodal generation in a controlled manner. This is achieved through a diffuse-denoise procedure, where shapes a diffused for only a few steps in the latent DDMs and then denoised again.
2022-10-08 02:25:20 +00:00
</p>
</div>
<!--
<div class="row">
<div class="column3">
<figure style="width: 100%;">
<a>
<img width="30%" src="assets/multi_modal/airplane/recon_313.jpg">
<img width="30%" src="assets/multi_modal/airplane/airplane_D200/sap_0000/recon_0.jpg">
<img width="30%" src="assets/multi_modal/airplane/airplane_D200/sap_0000/recon_2.jpg">
</a>
<figcaption align = "center">
Multimodal generation of airplane.
</figcaption>
</figure>
</div>
<div class="column3">
<figure style="width: 100%;">
<a>
<img width="30%" src="assets/multi_modal/chair/recon_137.jpg">
<img width="30%" src="assets/multi_modal/chair/chair_D160/sap_0000/recon_4.png">
<img width="30%" src="assets/multi_modal/chair/chair_D160/sap_0000/recon_9.png">
</a>
<figcaption align = "center">
Multimodal generation of chair.
</figcaption>
</figure>
</div>
<div class="column3">
<figure style="width: 100%;">
<a>
<img width="30%" src="assets/multi_modal/car/recon_46.jpg">
<img width="30%" src="assets/multi_modal/car/recon_0.png">
<img width="30%" src="assets/multi_modal/car/recon_5.png">
</a>
<figcaption align = "center">
Multimodal generation of car.
</figcaption>
</figure>
2022-09-21 07:13:49 +00:00
2022-10-08 02:25:20 +00:00
</div>
</div>
-->
<center>
<figure style="width: 30%;">
<a>
<img width="30%" src="assets/multi_modal/airplane/recon_313.jpg">
<img width="30%" src="assets/multi_modal/airplane/airplane_D200/sap_0000/recon_0.png">
<img width="30%" src="assets/multi_modal/airplane/airplane_D200/sap_0000/recon_2.png">
</a>
<p class="caption" style="margin-bottom: 24px;">
2022-10-08 09:53:39 +00:00
Multimodal generation of airplanes.
2022-10-08 02:25:20 +00:00
</p>
</figure>
</center>
<center>
<figure style="width: 30%;">
<a>
<img width="30%" src="assets/multi_modal/chair/recon_137.jpg">
<img width="30%" src="assets/multi_modal/chair/chair_D160/sap_0000/recon_4.png">
<img width="30%" src="assets/multi_modal/chair/chair_D160/sap_0000/recon_9.png">
</a>
<p class="caption" style="margin-bottom: 24px;">
2022-10-08 09:53:39 +00:00
Multimodal generation of chairs.
2022-10-08 02:25:20 +00:00
</p>
</figure>
</center>
<center>
<figure style="width: 30%;">
<a>
<img width="30%" src="assets/multi_modal/car/recon_46.jpg">
<img width="30%" src="assets/multi_modal/car/recon_0.png">
<img width="30%" src="assets/multi_modal/car/recon_5.png">
</a>
<p class="caption" style="margin-bottom: 24px;">
2022-10-08 09:53:39 +00:00
Multimodal generation of cars.
2022-10-08 02:25:20 +00:00
</p>
</figure>
</center>
2022-09-21 07:13:49 +00:00
2022-09-15 21:49:52 +00:00
<h3>Voxel-Conditioned Synthesis </h3>
<div class="flex-row">
2022-10-08 09:53:39 +00:00
<p>Given a coarse voxel grid, LION can generate different plausible detailed shapes. In practice, an artist using a 3D generative model may have a rough idea of the desired shape. For instance, they may be able to quickly construct a coarse voxelized shape, to which the generative model then adds realistic details. We achieve this by fine-tuning our encoder networks on voxelized shapes, and performing a few steps of diffuse-denoise in latent space to generate various plausible detailed shapes.</p>
2022-09-15 21:49:52 +00:00
</div>
<center>
2022-09-16 21:58:03 +00:00
<figure style="width: 80%;">
2022-09-15 21:49:52 +00:00
2022-09-16 21:58:03 +00:00
<video class="centered" width="80%" controls muted playsinline class="video-background " >
2022-09-23 07:10:02 +00:00
<source src="assets/airplane_voxel.mp4#t=14.8" type="video/mp4">
2022-09-15 21:49:52 +00:00
Your browser does not support the video tag.
</video>
<p class="caption" style="margin-bottom: 24px;" width="30%">
2022-09-16 21:58:03 +00:00
Left: Input voxel grid. Right: two point clouds generated by LION and the reconstructed mesh.
2022-09-15 21:49:52 +00:00
<!-- Voxel-guided synthesis experiments, on different categories. We run diffuse-denoise in latent space to generate diverse plausible clean shapes (first row, left plane: 250 diffuse-denoise steps; first row, right plane: 200 steps;) -->
</p>
</figure>
</center>
<br>
<h3> Single View Reconstruction </h3>
<div class="flex-row">
<p>
2022-10-08 09:53:39 +00:00
We qualitatively demonstrate how LION can be extended to also allow for single view reconstruction (SVR) from RGB data, using the approach from CLIP-Forge.
We render 2D images from the 3D ShapeNet shapes, extract the images CLIP image embeddings, and
train LIONs latent diffusion models while conditioning on the shapes CLIP image embeddings.
2022-09-15 21:49:52 +00:00
At test time, we then take a single view 2D image, extract the CLIP image embedding, and generate
corresponding 3D shapes, thereby effectively performing SVR. We show SVR results from real
2022-10-08 09:53:39 +00:00
RGB data.
2022-09-15 21:49:52 +00:00
</p>
</div>
<center>
<figure style="width: 100%;">
2022-09-15 21:49:52 +00:00
<a>
<img width="49%" src="assets/svr/img2shape_mitsuba_full.jpg">
<img width="49%" src="assets/svr/img2shape_cari2s_mm_mitsuba_full.jpg">
2022-09-15 21:49:52 +00:00
</a>
<p class="caption" style="margin-bottom: 24px;">
2022-10-08 09:53:39 +00:00
Single view reconstruction from RGB images of chairs. For each input image, LION can generate multi-modal outputs.
2022-09-15 21:49:52 +00:00
</p>
</figure>
</center>
<!--
2022-09-15 21:49:52 +00:00
<figure style="width: 50%;">
<a>
2022-09-16 21:58:03 +00:00
<img width="100%" src="assets/svr/img2shape_cari2s_mm_mitsuba_full.jpg">
2022-09-15 21:49:52 +00:00
</a>
<p class="caption" style="margin-bottom: 24px;">
Single view reconstruction from RGB images of car. For each input image, LION can generate multi-modal outputs.
</p>
</figure>
-->
<center>
2022-09-15 21:49:52 +00:00
<figure style="width: 100%;">
<a>
2022-09-16 21:58:03 +00:00
<img width="100%" src="assets/svr/img2shape_cari2s_mitsuba_full.jpg">
2022-09-15 21:49:52 +00:00
</a>
<p class="caption" style="margin-bottom: 24px;">
2022-10-08 09:53:39 +00:00
More single view reconstructions from RGB images of cars.
2022-09-15 21:49:52 +00:00
</p>
</figure>
</center>
2022-09-15 21:49:52 +00:00
<br>
<h3> Text-Guided Generation </h3>
<div class="flex-row">
<p>
Using CLIPs text encoder, our method additionally allows for text-guided generation.
</p>
</div>
<center>
<figure style="width: 100%;">
2022-09-15 21:49:52 +00:00
<a>
<img width="35%" src="assets/clipforge_chair.png">
<img width="60%" src="assets/clipforge_car.png">
2022-09-15 21:49:52 +00:00
</a>
<p class="caption" style="margin-bottom: 24px;">
2022-10-08 09:53:39 +00:00
Text-driven shape generation of chairs with LION. Bottom row is the text inputs.
2022-09-15 21:49:52 +00:00
</p>
</figure>
</center>
2022-10-08 09:53:39 +00:00
<h3> Per-sample Text-driven Texture Synthesis</h3>
2022-09-16 23:02:46 +00:00
<div class="flex-row">
<p>
2022-10-08 09:53:39 +00:00
We apply Text2mesh on some generated meshes from LION to additionally synthesize textures in a text-driven manner, leveraging CLIP. The original input meshes are generated by LION.
This is only possible because LION can output practically useful meshes with its SAP-based surface reconstruction component (even though the backbone generative modeling component is point cloud-based).
2022-09-16 23:02:46 +00:00
</p>
</div>
<div class="row">
<div class="column">
<img width="100%" src="assets/text2mesh/strawberries_airplane-rec_3.jpg">
2022-10-08 02:25:20 +00:00
<figcaption align = "center">an airplane made of strawberry</figcaption>
2022-09-16 23:02:46 +00:00
</div>
<div class="column">
<img width="100%" src="assets/text2mesh/fabric_leather_airplane-rec_3.jpg">
2022-10-08 02:25:20 +00:00
<figcaption align = "center">an airplane made of fabric leather </figcaption>
</div>
2022-09-16 23:02:46 +00:00
<div class="column">
<img width="100%" src="assets/text2mesh/wood_chair-rec_421_norm1.jpg">
2022-10-08 02:25:20 +00:00
<figcaption align = "center">a chair made of wood</figcaption>
2022-09-16 23:02:46 +00:00
</div>
<div class="column">
<img width="100%" src="assets/text2mesh/wrong_copied1-rec_293_norm0.jpg">
2022-10-08 02:25:20 +00:00
<figcaption align = "center">a car made of rusty metal</figcaption>
2022-09-16 23:02:46 +00:00
</div>
<div class="column">
<img width="100%" src="assets/text2mesh/brick_car-rec_67_norm1.jpg">
2022-10-08 02:25:20 +00:00
<figcaption align = "center">a car made of brick</figcaption>
</div>
2022-09-16 23:02:46 +00:00
<div class="column">
<img width="100%" src="assets/text2mesh/wrong_copied1-rec_12_norm1.jpg">
2022-10-08 02:25:20 +00:00
<figcaption align = "center">a denim fabric animal</figcaption>
</div>
</div>
2022-09-16 23:02:46 +00:00
<br>
2022-09-15 21:49:52 +00:00
2022-09-08 05:02:10 +00:00
</section>
<section id="paper">
  <h2>Paper</h2>
  <hr>
  <div class="flex-row">
    <div class="download-thumb">
      <div style="box-sizing: border-box; padding: 16px; margin: auto;">
        <a href="https://nv-tlabs.github.io/LION"><img class="screenshot" src="assets/cld_paper_preview.png" alt="Preview of the LION paper"></a>
      </div>
    </div>
    <div class="paper-stuff">
      <p><b>LION: Latent Point Diffusion Models for 3D Shape Generation</b></p>
      <p>Xiaohui Zeng, Arash Vahdat, Francis Williams, Zan Gojcic, Or Litany, Sanja Fidler, Karsten Kreis</p>
      <p><i>Advances in Neural Information Processing Systems (NeurIPS), 2022</i></p>
      <div><span class="material-icons"> description </span><a href="https://nv-tlabs.github.io/LION"> arXiv version</a></div>
      <div><span class="material-icons"> insert_comment </span><a href="assets/zeng2022lion.bib"> BibTeX</a></div>
      <div><span class="material-icons"> integration_instructions </span><a href="https://nv-tlabs.github.io/LION"> Code</a></div>
    </div>
  </div>
</section>
<section id="bibtex">
  <h2>Citation</h2>
  <hr>
  <pre><code>@inproceedings{zeng2022lion,
  title={LION: Latent Point Diffusion Models for 3D Shape Generation},
  author={Xiaohui Zeng and Arash Vahdat and Francis Williams and Zan Gojcic and Or Litany and Sanja Fidler and Karsten Kreis},
  booktitle={Advances in Neural Information Processing Systems (NeurIPS)},
  year={2022}
}</code></pre>
</section>
</div>
</body>
</html>