@misc{goodfellow_generative_2014, title = {Generative {Adversarial} {Networks}}, url = {http://arxiv.org/abs/1406.2661}, doi = {10.48550/arXiv.1406.2661}, abstract = {We propose a new framework for estimating generative models via an adversarial process, in which we simultaneously train two models: a generative model G that captures the data distribution, and a discriminative model D that estimates the probability that a sample came from the training data rather than G. The training procedure for G is to maximize the probability of D making a mistake. This framework corresponds to a minimax two-player game. In the space of arbitrary functions G and D, a unique solution exists, with G recovering the training data distribution and D equal to 1/2 everywhere. In the case where G and D are defined by multilayer perceptrons, the entire system can be trained with backpropagation. There is no need for any Markov chains or unrolled approximate inference networks during either training or generation of samples. Experiments demonstrate the potential of the framework through qualitative and quantitative evaluation of the generated samples.}, urldate = {2023-01-29}, publisher = {arXiv}, author = {Goodfellow, Ian J. and Pouget-Abadie, Jean and Mirza, Mehdi and Xu, Bing and Warde-Farley, David and Ozair, Sherjil and Courville, Aaron and Bengio, Yoshua}, month = jun, year = {2014}, note = {arXiv:1406.2661 [cs, stat]}, keywords = {Computer Science - Machine Learning, Statistics - Machine Learning}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/5STMX2XJ/Goodfellow et al. - 2014 - Generative Adversarial Networks.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/MYEGE7IK/1406.html:text/html}, } @misc{salimans_improved_2016, title = {Improved {Techniques} for {Training} {GANs}}, url = {http://arxiv.org/abs/1606.03498}, doi = {10.48550/arXiv.1606.03498}, abstract = {We present a variety of new architectural features and training procedures that we apply to the generative adversarial networks (GANs) framework. We focus on two applications of GANs: semi-supervised learning, and the generation of images that humans find visually realistic. Unlike most work on generative models, our primary goal is not to train a model that assigns high likelihood to test data, nor do we require the model to be able to learn well without using any labels. Using our new techniques, we achieve state-of-the-art results in semi-supervised classification on MNIST, CIFAR-10 and SVHN. The generated images are of high quality as confirmed by a visual Turing test: our model generates MNIST samples that humans cannot distinguish from real data, and CIFAR-10 samples that yield a human error rate of 21.3\%. We also present ImageNet samples with unprecedented resolution and show that our methods enable the model to learn recognizable features of ImageNet classes.}, urldate = {2023-01-29}, publisher = {arXiv}, author = {Salimans, Tim and Goodfellow, Ian and Zaremba, Wojciech and Cheung, Vicki and Radford, Alec and Chen, Xi}, month = jun, year = {2016}, note = {arXiv:1606.03498 [cs]}, keywords = {Computer Science - Computer Vision and Pattern Recognition, Computer Science - Machine Learning, Computer Science - Neural and Evolutionary Computing}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/PEYM38ZW/Salimans et al. 
- 2016 - Improved Techniques for Training GANs.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/5XMXB7WV/1606.html:text/html}, } @misc{arjovsky_towards_2017, title = {Towards {Principled} {Methods} for {Training} {Generative} {Adversarial} {Networks}}, url = {http://arxiv.org/abs/1701.04862}, doi = {10.48550/arXiv.1701.04862}, abstract = {The goal of this paper is not to introduce a single algorithm or method, but to make theoretical steps towards fully understanding the training dynamics of generative adversarial networks. In order to substantiate our theoretical analysis, we perform targeted experiments to verify our assumptions, illustrate our claims, and quantify the phenomena. This paper is divided into three sections. The first section introduces the problem at hand. The second section is dedicated to studying and proving rigorously the problems including instability and saturation that arise when training generative adversarial networks. The third section examines a practical and theoretically grounded direction towards solving these problems, while introducing new tools to study them.}, urldate = {2023-01-29}, publisher = {arXiv}, author = {Arjovsky, Martin and Bottou, Léon}, month = jan, year = {2017}, note = {arXiv:1701.04862 [cs, stat]}, keywords = {Computer Science - Machine Learning, Statistics - Machine Learning}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/AEE2NPN4/Arjovsky and Bottou - 2017 - Towards Principled Methods for Training Generative.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/QEE7N7KP/1701.html:text/html}, } @misc{arjovsky_wasserstein_2017, title = {Wasserstein {GAN}}, url = {http://arxiv.org/abs/1701.07875}, doi = {10.48550/arXiv.1701.07875}, abstract = {We introduce a new algorithm named WGAN, an alternative to traditional GAN training. In this new model, we show that we can improve the stability of learning, get rid of problems like mode collapse, and provide meaningful learning curves useful for debugging and hyperparameter searches. Furthermore, we show that the corresponding optimization problem is sound, and provide extensive theoretical work highlighting the deep connections to other distances between distributions.}, urldate = {2023-01-29}, publisher = {arXiv}, author = {Arjovsky, Martin and Chintala, Soumith and Bottou, Léon}, month = dec, year = {2017}, note = {arXiv:1701.07875 [cs, stat]}, keywords = {Computer Science - Machine Learning, Statistics - Machine Learning}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/KW83LJBX/Arjovsky et al. - 2017 - Wasserstein GAN.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/YA9DTUVB/1701.html:text/html}, } @misc{song_generative_2020, title = {Generative {Modeling} by {Estimating} {Gradients} of the {Data} {Distribution}}, url = {http://arxiv.org/abs/1907.05600}, doi = {10.48550/arXiv.1907.05600}, abstract = {We introduce a new generative model where samples are produced via Langevin dynamics using gradients of the data distribution estimated with score matching. Because gradients can be ill-defined and hard to estimate when the data resides on low-dimensional manifolds, we perturb the data with different levels of Gaussian noise, and jointly estimate the corresponding scores, i.e., the vector fields of gradients of the perturbed data distribution for all noise levels.
For sampling, we propose an annealed Langevin dynamics where we use gradients corresponding to gradually decreasing noise levels as the sampling process gets closer to the data manifold. Our framework allows flexible model architectures, requires no sampling during training or the use of adversarial methods, and provides a learning objective that can be used for principled model comparisons. Our models produce samples comparable to GANs on MNIST, CelebA and CIFAR-10 datasets, achieving a new state-of-the-art inception score of 8.87 on CIFAR-10. Additionally, we demonstrate that our models learn effective representations via image inpainting experiments.}, urldate = {2023-01-29}, publisher = {arXiv}, author = {Song, Yang and Ermon, Stefano}, month = oct, year = {2020}, note = {arXiv:1907.05600 [cs, stat]}, keywords = {Computer Science - Machine Learning, Statistics - Machine Learning}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/NDB8ZJRC/Song and Ermon - 2020 - Generative Modeling by Estimating Gradients of the.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/KG2SAQFI/1907.html:text/html}, } @article{yacoby_failure_2021, title = {Failure {Modes} of {Variational} {Autoencoders} and {Their} {Effects} on {Downstream} {Tasks}}, url = {https://openreview.net/forum?id=5Spjp0zDYt}, abstract = {Variational Auto-encoders (VAEs) are deep generative latent variable models that are widely used for a number of downstream tasks. While it has been demonstrated that VAE training can suffer from a number of pathologies, existing literature lacks characterizations of exactly when these pathologies occur and how they impact down-stream task performance. In this paper we concretely characterize conditions under which VAE training exhibits pathologies and connect these failure modes to undesirable effects on specific downstream tasks, such as learning compressed and disentangled representations, adversarial robustness and semi-supervised learning.}, language = {en}, urldate = {2023-01-29}, author = {Yacoby, Yaniv and Pan, Weiwei and Doshi-Velez, Finale}, month = mar, year = {2021}, file = {Full Text PDF:/home/laurent/Zotero/storage/J37MD8SR/Yacoby et al. - 2021 - Failure Modes of Variational Autoencoders and Thei.pdf:application/pdf}, } @inproceedings{higgins_beta-vae_2022, title = {beta-{VAE}: {Learning} {Basic} {Visual} {Concepts} with a {Constrained} {Variational} {Framework}}, shorttitle = {beta-{VAE}}, url = {https://openreview.net/forum?id=Sy2fzU9gl}, abstract = {Learning an interpretable factorised representation of the independent data generative factors of the world without supervision is an important precursor for the development of artificial intelligence that is able to learn and reason in the same way that humans do. We introduce beta-VAE, a new state-of-the-art framework for automated discovery of interpretable factorised latent representations from raw image data in a completely unsupervised manner. Our approach is a modification of the variational autoencoder (VAE) framework. We introduce an adjustable hyperparameter beta that balances latent channel capacity and independence constraints with reconstruction accuracy. We demonstrate that beta-VAE with appropriately tuned beta {\textgreater} 1 qualitatively outperforms VAE (beta = 1), as well as state of the art unsupervised (InfoGAN) and semi-supervised (DC-IGN) approaches to disentangled factor learning on a variety of datasets (celebA, faces and chairs). 
Furthermore, we devise a protocol to quantitatively compare the degree of disentanglement learnt by different models, and show that our approach also significantly outperforms all baselines quantitatively. Unlike InfoGAN, beta-VAE is stable to train, makes few assumptions about the data and relies on tuning a single hyperparameter, which can be directly optimised through a hyper parameter search using weakly labelled data or through heuristic visual inspection for purely unsupervised data.}, language = {en}, urldate = {2023-01-29}, author = {Higgins, Irina and Matthey, Loic and Pal, Arka and Burgess, Christopher and Glorot, Xavier and Botvinick, Matthew and Mohamed, Shakir and Lerchner, Alexander}, month = jul, year = {2022}, file = {Full Text PDF:/home/laurent/Zotero/storage/FD5Q6H4B/Higgins et al. - 2022 - beta-VAE Learning Basic Visual Concepts with a Co.pdf:application/pdf}, } @misc{kingma_auto-encoding_2022, title = {Auto-{Encoding} {Variational} {Bayes}}, url = {http://arxiv.org/abs/1312.6114}, doi = {10.48550/arXiv.1312.6114}, abstract = {How can we perform efficient inference and learning in directed probabilistic models, in the presence of continuous latent variables with intractable posterior distributions, and large datasets? We introduce a stochastic variational inference and learning algorithm that scales to large datasets and, under some mild differentiability conditions, even works in the intractable case. Our contributions are two-fold. First, we show that a reparameterization of the variational lower bound yields a lower bound estimator that can be straightforwardly optimized using standard stochastic gradient methods. Second, we show that for i.i.d. datasets with continuous latent variables per datapoint, posterior inference can be made especially efficient by fitting an approximate inference model (also called a recognition model) to the intractable posterior using the proposed lower bound estimator. Theoretical advantages are reflected in experimental results.}, urldate = {2023-01-29}, publisher = {arXiv}, author = {Kingma, Diederik P. and Welling, Max}, month = dec, year = {2022}, note = {arXiv:1312.6114 [cs, stat]}, keywords = {Computer Science - Machine Learning, Statistics - Machine Learning}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/8MXMAC2E/Kingma and Welling - 2022 - Auto-Encoding Variational Bayes.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/TDNMVVSS/1312.html:text/html}, } @misc{zeng_lion_2022, title = {{LION}: {Latent} {Point} {Diffusion} {Models} for {3D} {Shape} {Generation}}, shorttitle = {{LION}}, url = {http://arxiv.org/abs/2210.06978}, doi = {10.48550/arXiv.2210.06978}, abstract = {Denoising diffusion models (DDMs) have shown promising results in 3D point cloud synthesis. To advance 3D DDMs and make them useful for digital artists, we require (i) high generation quality, (ii) flexibility for manipulation and applications such as conditional synthesis and shape interpolation, and (iii) the ability to output smooth surfaces or meshes. To this end, we introduce the hierarchical Latent Point Diffusion Model (LION) for 3D shape generation. LION is set up as a variational autoencoder (VAE) with a hierarchical latent space that combines a global shape latent representation with a point-structured latent space. For generation, we train two hierarchical DDMs in these latent spaces. 
The hierarchical VAE approach boosts performance compared to DDMs that operate on point clouds directly, while the point-structured latents are still ideally suited for DDM-based modeling. Experimentally, LION achieves state-of-the-art generation performance on multiple ShapeNet benchmarks. Furthermore, our VAE framework allows us to easily use LION for different relevant tasks: LION excels at multimodal shape denoising and voxel-conditioned synthesis, and it can be adapted for text- and image-driven 3D generation. We also demonstrate shape autoencoding and latent shape interpolation, and we augment LION with modern surface reconstruction techniques to generate smooth 3D meshes. We hope that LION provides a powerful tool for artists working with 3D shapes due to its high-quality generation, flexibility, and surface reconstruction. Project page and code: https://nv-tlabs.github.io/LION.}, urldate = {2023-01-29}, publisher = {arXiv}, author = {Zeng, Xiaohui and Vahdat, Arash and Williams, Francis and Gojcic, Zan and Litany, Or and Fidler, Sanja and Kreis, Karsten}, month = oct, year = {2022}, note = {arXiv:2210.06978 [cs, stat]}, keywords = {Computer Science - Computer Vision and Pattern Recognition, Computer Science - Machine Learning, Statistics - Machine Learning}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/FACF8TI9/Zeng et al. - 2022 - LION Latent Point Diffusion Models for 3D Shape G.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/X57XTJQR/2210.html:text/html}, } @misc{nichol_point-e_2022, title = {Point-{E}: {A} {System} for {Generating} {3D} {Point} {Clouds} from {Complex} {Prompts}}, shorttitle = {Point-{E}}, url = {http://arxiv.org/abs/2212.08751}, doi = {10.48550/arXiv.2212.08751}, abstract = {While recent work on text-conditional 3D object generation has shown promising results, the state-of-the-art methods typically require multiple GPU-hours to produce a single sample. This is in stark contrast to state-of-the-art generative image models, which produce samples in a number of seconds or minutes. In this paper, we explore an alternative method for 3D object generation which produces 3D models in only 1-2 minutes on a single GPU. Our method first generates a single synthetic view using a text-to-image diffusion model, and then produces a 3D point cloud using a second diffusion model which conditions on the generated image. While our method still falls short of the state-of-the-art in terms of sample quality, it is one to two orders of magnitude faster to sample from, offering a practical trade-off for some use cases. We release our pre-trained point cloud diffusion models, as well as evaluation code and models, at https://github.com/openai/point-e.}, urldate = {2023-01-29}, publisher = {arXiv}, author = {Nichol, Alex and Jun, Heewoo and Dhariwal, Prafulla and Mishkin, Pamela and Chen, Mark}, month = dec, year = {2022}, note = {arXiv:2212.08751 [cs]}, keywords = {Computer Science - Computer Vision and Pattern Recognition, Computer Science - Machine Learning}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/8IW28GBH/Nichol et al. 
- 2022 - Point-E A System for Generating 3D Point Clouds f.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/LMQF9Q55/2212.html:text/html}, } @misc{kim_setvae_2021, title = {{SetVAE}: {Learning} {Hierarchical} {Composition} for {Generative} {Modeling} of {Set}-{Structured} {Data}}, shorttitle = {{SetVAE}}, url = {http://arxiv.org/abs/2103.15619}, doi = {10.48550/arXiv.2103.15619}, abstract = {Generative modeling of set-structured data, such as point clouds, requires reasoning over local and global structures at various scales. However, adopting multi-scale frameworks for ordinary sequential data to a set-structured data is nontrivial as it should be invariant to the permutation of its elements. In this paper, we propose SetVAE, a hierarchical variational autoencoder for sets. Motivated by recent progress in set encoding, we build SetVAE upon attentive modules that first partition the set and project the partition back to the original cardinality. Exploiting this module, our hierarchical VAE learns latent variables at multiple scales, capturing coarse-to-fine dependency of the set elements while achieving permutation invariance. We evaluate our model on point cloud generation task and achieve competitive performance to the prior arts with substantially smaller model capacity. We qualitatively demonstrate that our model generalizes to unseen set sizes and learns interesting subset relations without supervision. Our implementation is available at https://github.com/jw9730/setvae.}, urldate = {2023-03-31}, publisher = {arXiv}, author = {Kim, Jinwoo and Yoo, Jaehoon and Lee, Juho and Hong, Seunghoon}, month = mar, year = {2021}, note = {arXiv:2103.15619 [cs]}, keywords = {Computer Science - Computer Vision and Pattern Recognition, Computer Science - Machine Learning}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/WUTNGI56/Kim et al. - 2021 - SetVAE Learning Hierarchical Composition for Gene.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/25K7W3C4/2103.html:text/html}, } @misc{takikawa_neural_2021, title = {Neural {Geometric} {Level} of {Detail}: {Real}-time {Rendering} with {Implicit} {3D} {Shapes}}, shorttitle = {Neural {Geometric} {Level} of {Detail}}, url = {http://arxiv.org/abs/2101.10994}, doi = {10.48550/arXiv.2101.10994}, abstract = {Neural signed distance functions (SDFs) are emerging as an effective representation for 3D shapes. State-of-the-art methods typically encode the SDF with a large, fixed-size neural network to approximate complex shapes with implicit surfaces. Rendering with these large networks is, however, computationally expensive since it requires many forward passes through the network for every pixel, making these representations impractical for real-time graphics. We introduce an efficient neural representation that, for the first time, enables real-time rendering of high-fidelity neural SDFs, while achieving state-of-the-art geometry reconstruction quality. We represent implicit surfaces using an octree-based feature volume which adaptively fits shapes with multiple discrete levels of detail (LODs), and enables continuous LOD with SDF interpolation. We further develop an efficient algorithm to directly render our novel neural SDF representation in real-time by querying only the necessary LODs with sparse octree traversal. We show that our representation is 2-3 orders of magnitude more efficient in terms of rendering speed compared to previous works. 
Furthermore, it produces state-of-the-art reconstruction quality for complex shapes under both 3D geometric and 2D image-space metrics.}, urldate = {2023-03-28}, publisher = {arXiv}, author = {Takikawa, Towaki and Litalien, Joey and Yin, Kangxue and Kreis, Karsten and Loop, Charles and Nowrouzezahrai, Derek and Jacobson, Alec and McGuire, Morgan and Fidler, Sanja}, month = jan, year = {2021}, note = {arXiv:2101.10994 [cs]}, keywords = {Computer Science - Computer Vision and Pattern Recognition, Computer Science - Graphics}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/KJMJR4CB/Takikawa et al. - 2021 - Neural Geometric Level of Detail Real-time Render.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/ST444B55/2101.html:text/html}, } @misc{nash_polygen_2020, title = {{PolyGen}: {An} {Autoregressive} {Generative} {Model} of {3D} {Meshes}}, shorttitle = {{PolyGen}}, url = {http://arxiv.org/abs/2002.10880}, doi = {10.48550/arXiv.2002.10880}, abstract = {Polygon meshes are an efficient representation of 3D geometry, and are of central importance in computer graphics, robotics and games development. Existing learning-based approaches have avoided the challenges of working with 3D meshes, instead using alternative object representations that are more compatible with neural architectures and training approaches. We present an approach which models the mesh directly, predicting mesh vertices and faces sequentially using a Transformer-based architecture. Our model can condition on a range of inputs, including object classes, voxels, and images, and because the model is probabilistic it can produce samples that capture uncertainty in ambiguous scenarios. We show that the model is capable of producing high-quality, usable meshes, and establish log-likelihood benchmarks for the mesh-modelling task. We also evaluate the conditional models on surface reconstruction metrics against alternative methods, and demonstrate competitive performance despite not training directly on this task.}, urldate = {2023-03-28}, publisher = {arXiv}, author = {Nash, Charlie and Ganin, Yaroslav and Eslami, S. M. Ali and Battaglia, Peter W.}, month = feb, year = {2020}, note = {arXiv:2002.10880 [cs, stat]}, keywords = {Computer Science - Computer Vision and Pattern Recognition, Computer Science - Machine Learning, Statistics - Machine Learning, Computer Science - Graphics}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/JE5MEK9K/Nash et al. - 2020 - PolyGen An Autoregressive Generative Model of 3D .pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/7Y3HEDRQ/2002.html:text/html}, } @misc{zhang_3dshape2vecset_2023, title = {{3DShape2VecSet}: {A} {3D} {Shape} {Representation} for {Neural} {Fields} and {Generative} {Diffusion} {Models}}, shorttitle = {{3DShape2VecSet}}, url = {http://arxiv.org/abs/2301.11445}, doi = {10.48550/arXiv.2301.11445}, abstract = {We introduce 3DShape2VecSet, a novel shape representation for neural fields designed for generative diffusion models. Our shape representation can encode 3D shapes given as surface models or point clouds, and represents them as neural fields. The concept of neural fields has previously been combined with a global latent vector, a regular grid of latent vectors, or an irregular grid of latent vectors. Our new representation encodes neural fields on top of a set of vectors. 
We draw from multiple concepts, such as the radial basis function representation and the cross attention and self-attention function, to design a learnable representation that is especially suitable for processing with transformers. Our results show improved performance in 3D shape encoding and 3D shape generative modeling tasks. We demonstrate a wide variety of generative applications: unconditioned generation, category-conditioned generation, text-conditioned generation, point-cloud completion, and image-conditioned generation.}, urldate = {2023-03-28}, publisher = {arXiv}, author = {Zhang, Biao and Tang, Jiapeng and Niessner, Matthias and Wonka, Peter}, month = feb, year = {2023}, note = {arXiv:2301.11445 [cs]}, keywords = {Computer Science - Computer Vision and Pattern Recognition, Computer Science - Graphics}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/T8R7H6N4/Zhang et al. - 2023 - 3DShape2VecSet A 3D Shape Representation for Neur.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/6GNICSIQ/2301.html:text/html}, } @misc{yang_pointflow_2019, title = {{PointFlow}: {3D} {Point} {Cloud} {Generation} with {Continuous} {Normalizing} {Flows}}, shorttitle = {{PointFlow}}, url = {http://arxiv.org/abs/1906.12320}, doi = {10.48550/arXiv.1906.12320}, abstract = {As 3D point clouds become the representation of choice for multiple vision and graphics applications, the ability to synthesize or reconstruct high-resolution, high-fidelity point clouds becomes crucial. Despite the recent success of deep learning models in discriminative tasks of point clouds, generating point clouds remains challenging. This paper proposes a principled probabilistic framework to generate 3D point clouds by modeling them as a distribution of distributions. Specifically, we learn a two-level hierarchy of distributions where the first level is the distribution of shapes and the second level is the distribution of points given a shape. This formulation allows us to both sample shapes and sample an arbitrary number of points from a shape. Our generative model, named PointFlow, learns each level of the distribution with a continuous normalizing flow. The invertibility of normalizing flows enables the computation of the likelihood during training and allows us to train our model in the variational inference framework. Empirically, we demonstrate that PointFlow achieves state-of-the-art performance in point cloud generation. We additionally show that our model can faithfully reconstruct point clouds and learn useful representations in an unsupervised manner. The code will be available at https://github.com/stevenygd/PointFlow.}, urldate = {2023-03-28}, publisher = {arXiv}, author = {Yang, Guandao and Huang, Xun and Hao, Zekun and Liu, Ming-Yu and Belongie, Serge and Hariharan, Bharath}, month = sep, year = {2019}, note = {arXiv:1906.12320 [cs]}, keywords = {Computer Science - Computer Vision and Pattern Recognition, Computer Science - Machine Learning}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/V87MQMLC/Yang et al. 
- 2019 - PointFlow 3D Point Cloud Generation with Continuo.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/KEHU85VD/1906.html:text/html}, } @misc{fan_generative_2023, title = {Generative {Diffusion} {Models} on {Graphs}: {Methods} and {Applications}}, shorttitle = {Generative {Diffusion} {Models} on {Graphs}}, url = {http://arxiv.org/abs/2302.02591}, doi = {10.48550/arXiv.2302.02591}, abstract = {Diffusion models, as a novel generative paradigm, have achieved remarkable success in various image generation tasks such as image inpainting, image-to-text translation, and video generation. Graph generation is a crucial computational task on graphs with numerous real-world applications. It aims to learn the distribution of given graphs and then generate new graphs. Given the great success of diffusion models in image generation, increasing efforts have been made to leverage these techniques to advance graph generation in recent years. In this paper, we first provide a comprehensive overview of generative diffusion models on graphs, In particular, we review representative algorithms for three variants of graph diffusion models, i.e., Score Matching with Langevin Dynamics (SMLD), Denoising Diffusion Probabilistic Model (DDPM), and Score-based Generative Model (SGM). Then, we summarize the major applications of generative diffusion models on graphs with a specific focus on molecule and protein modeling. Finally, we discuss promising directions in generative diffusion models on graph-structured data.}, urldate = {2023-03-27}, publisher = {arXiv}, author = {Fan, Wenqi and Liu, Chengyi and Liu, Yunqing and Li, Jiatong and Li, Hang and Liu, Hui and Tang, Jiliang and Li, Qing}, month = feb, year = {2023}, note = {arXiv:2302.02591 [cs]}, keywords = {Computer Science - Machine Learning, Computer Science - Artificial Intelligence, Computer Science - Social and Information Networks}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/3M3G2JY5/Fan et al. - 2023 - Generative Diffusion Models on Graphs Methods and.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/8YV9HJ3W/2302.html:text/html}, } @misc{zhu_survey_2022, title = {A {Survey} on {Deep} {Graph} {Generation}: {Methods} and {Applications}}, shorttitle = {A {Survey} on {Deep} {Graph} {Generation}}, url = {http://arxiv.org/abs/2203.06714}, doi = {10.48550/arXiv.2203.06714}, abstract = {Graphs are ubiquitous in encoding relational information of real-world objects in many domains. Graph generation, whose purpose is to generate new graphs from a distribution similar to the observed graphs, has received increasing attention thanks to the recent advances of deep learning models. In this paper, we conduct a comprehensive review on the existing literature of deep graph generation from a variety of emerging methods to its wide application areas. Specifically, we first formulate the problem of deep graph generation and discuss its difference with several related graph learning tasks. Secondly, we divide the state-of-the-art methods into three categories based on model architectures and summarize their generation strategies. Thirdly, we introduce three key application areas of deep graph generation. Lastly, we highlight challenges and opportunities in the future study of deep graph generation. 
We hope that our survey will be useful for researchers and practitioners who are interested in this exciting and rapidly-developing field.}, urldate = {2023-03-27}, publisher = {arXiv}, author = {Zhu, Yanqiao and Du, Yuanqi and Wang, Yinkai and Xu, Yichen and Zhang, Jieyu and Liu, Qiang and Wu, Shu}, month = dec, year = {2022}, note = {arXiv:2203.06714 [cs, q-bio]}, keywords = {Computer Science - Machine Learning, Computer Science - Social and Information Networks, Quantitative Biology - Molecular Networks}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/SQWM9VTD/Zhu et al. - 2022 - A Survey on Deep Graph Generation Methods and App.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/IWAETBS6/2203.html:text/html}, } @misc{shah_auto-decoding_2020, title = {Auto-decoding {Graphs}}, url = {http://arxiv.org/abs/2006.02879}, doi = {10.48550/arXiv.2006.02879}, abstract = {We present an approach to synthesizing new graph structures from empirically specified distributions. The generative model is an auto-decoder that learns to synthesize graphs from latent codes. The graph synthesis model is learned jointly with an empirical distribution over the latent codes. Graphs are synthesized using self-attention modules that are trained to identify likely connectivity patterns. Graph-based normalizing flows are used to sample latent codes from the distribution learned by the auto-decoder. The resulting model combines accuracy and scalability. On benchmark datasets of large graphs, the presented model outperforms the state of the art by a factor of 1.5 in mean accuracy and average rank across at least three different graph statistics, with a 2x speedup during inference.}, urldate = {2023-03-27}, publisher = {arXiv}, author = {Shah, Sohil Atul and Koltun, Vladlen}, month = jun, year = {2020}, note = {arXiv:2006.02879 [cs, stat]}, keywords = {Computer Science - Machine Learning, Statistics - Machine Learning}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/M7JSQ9YK/Shah et Koltun - 2020 - Auto-decoding Graphs.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/WLZYXR33/2006.html:text/html}, } @misc{faez_deep_2020, title = {Deep {Graph} {Generators}: {A} {Survey}}, shorttitle = {Deep {Graph} {Generators}}, url = {http://arxiv.org/abs/2012.15544}, doi = {10.48550/arXiv.2012.15544}, abstract = {Deep generative models have achieved great success in areas such as image, speech, and natural language processing in the past few years. Thanks to the advances in graph-based deep learning, and in particular graph representation learning, deep graph generation methods have recently emerged with new applications ranging from discovering novel molecular structures to modeling social networks. This paper conducts a comprehensive survey on deep learning-based graph generation approaches and classifies them into five broad categories, namely, autoregressive, autoencoder-based, RL-based, adversarial, and flow-based graph generators, providing the readers a detailed description of the methods in each class. We also present publicly available source codes, commonly used datasets, and the most widely utilized evaluation metrics. 
Finally, we highlight the existing challenges and discuss future research directions.}, urldate = {2023-03-27}, publisher = {arXiv}, author = {Faez, Faezeh and Ommi, Yassaman and Baghshah, Mahdieh Soleymani and Rabiee, Hamid R.}, month = dec, year = {2020}, note = {arXiv:2012.15544 [cs]}, keywords = {Computer Science - Machine Learning, Computer Science - Artificial Intelligence, Computer Science - Social and Information Networks}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/G3J3B658/Faez et al. - 2020 - Deep Graph Generators A Survey.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/NSQQDIEH/2012.html:text/html}, } @misc{shayestehfard_aligngraph_2023, title = {{AlignGraph}: {A} {Group} of {Generative} {Models} for {Graphs}}, shorttitle = {{AlignGraph}}, url = {http://arxiv.org/abs/2301.11273}, doi = {10.48550/arXiv.2301.11273}, abstract = {It is challenging for generative models to learn a distribution over graphs because of the lack of permutation invariance: nodes may be ordered arbitrarily across graphs, and standard graph alignment is combinatorial and notoriously expensive. We propose AlignGraph, a group of generative models that combine fast and efficient graph alignment methods with a family of deep generative models that are invariant to node permutations. Our experiments demonstrate that our framework successfully learns graph distributions, outperforming competitors by 25\% -560\% in relevant performance scores.}, urldate = {2023-03-27}, publisher = {arXiv}, author = {Shayestehfard, Kimia and Brooks, Dana and Ioannidis, Stratis}, month = jan, year = {2023}, note = {arXiv:2301.11273 [cs]}, keywords = {Computer Science - Machine Learning, Computer Science - Social and Information Networks}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/I69NJXUI/Shayestehfard et al. - 2023 - AlignGraph A Group of Generative Models for Graph.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/LPF9DVAW/2301.html:text/html}, } @article{kipf_graph_2020, title = {Graph {Neural} {Networks} for {Modeling} {Small} {Molecules}}, language = {en}, author = {Kipf, Thomas and Veličković, Petar and Li, Yujia}, month = mar, year = {2020}, file = {Kipf et al. - Graph Neural Networks for Modeling Small Molecules.pdf:/home/laurent/Zotero/storage/6WZAZFX8/Kipf et al. - Graph Neural Networks for Modeling Small Molecules.pdf:application/pdf}, } @misc{simonovsky_graphvae_2018, title = {{GraphVAE}: {Towards} {Generation} of {Small} {Graphs} {Using} {Variational} {Autoencoders}}, shorttitle = {{GraphVAE}}, url = {http://arxiv.org/abs/1802.03480}, doi = {10.48550/arXiv.1802.03480}, abstract = {Deep learning on graphs has become a popular research topic with many applications. However, past work has concentrated on learning graph embedding tasks, which is in contrast with advances in generative models for images and text. Is it possible to transfer this progress to the domain of graphs? We propose to sidestep hurdles associated with linearization of such discrete structures by having a decoder output a probabilistic fully-connected graph of a predefined maximum size directly at once. Our method is formulated as a variational autoencoder. 
We evaluate on the challenging task of molecule generation.}, urldate = {2023-03-27}, publisher = {arXiv}, author = {Simonovsky, Martin and Komodakis, Nikos}, month = feb, year = {2018}, note = {arXiv:1802.03480 [cs]}, keywords = {Computer Science - Computer Vision and Pattern Recognition, Computer Science - Machine Learning, Computer Science - Neural and Evolutionary Computing}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/IWG2JIVU/Simonovsky et Komodakis - 2018 - GraphVAE Towards Generation of Small Graphs Using.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/PW5ZG5WH/1802.html:text/html}, } @misc{liao_efficient_2020, title = {Efficient {Graph} {Generation} with {Graph} {Recurrent} {Attention} {Networks}}, url = {http://arxiv.org/abs/1910.00760}, abstract = {We propose a new family of efficient and expressive deep generative models of graphs, called Graph Recurrent Attention Networks (GRANs). Our model generates graphs one block of nodes and associated edges at a time. The block size and sampling stride allow us to trade off sample quality for efficiency. Compared to previous RNN-based graph generative models, our framework better captures the auto-regressive conditioning between the already-generated and to-be-generated parts of the graph using Graph Neural Networks (GNNs) with attention. This not only reduces the dependency on node ordering but also bypasses the long-term bottleneck caused by the sequential nature of RNNs. Moreover, we parameterize the output distribution per block using a mixture of Bernoulli, which captures the correlations among generated edges within the block. Finally, we propose to handle node orderings in generation by marginalizing over a family of canonical orderings. On standard benchmarks, we achieve state-of-the-art time efficiency and sample quality compared to previous models. Additionally, we show our model is capable of generating large graphs of up to 5K nodes with good quality. To the best of our knowledge, GRAN is the first deep graph generative model that can scale to this size. Our code is released at: https://github.com/lrjconan/GRAN.}, urldate = {2023-03-27}, publisher = {arXiv}, author = {Liao, Renjie and Li, Yujia and Song, Yang and Wang, Shenlong and Nash, Charlie and Hamilton, William L. and Duvenaud, David and Urtasun, Raquel and Zemel, Richard S.}, month = jul, year = {2020}, note = {arXiv:1910.00760 [cs, stat]}, keywords = {Computer Science - Machine Learning, Statistics - Machine Learning}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/YB44QN2I/Liao et al. - 2020 - Efficient Graph Generation with Graph Recurrent At.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/XXCHWITE/1910.html:text/html}, } @misc{guo_systematic_2022, title = {A {Systematic} {Survey} on {Deep} {Generative} {Models} for {Graph} {Generation}}, url = {http://arxiv.org/abs/2007.06686}, doi = {10.48550/arXiv.2007.06686}, abstract = {Graphs are important data representations for describing objects and their relationships, which appear in a wide diversity of real-world scenarios. As one of a critical problem in this area, graph generation considers learning the distributions of given graphs and generating more novel graphs. Owing to their wide range of applications, generative models for graphs, which have a rich history, however, are traditionally hand-crafted and only capable of modeling a few statistical properties of graphs. 
Recent advances in deep generative models for graph generation is an important step towards improving the fidelity of generated graphs and paves the way for new kinds of applications. This article provides an extensive overview of the literature in the field of deep generative models for graph generation. Firstly, the formal definition of deep generative models for the graph generation and the preliminary knowledge are provided. Secondly, taxonomies of deep generative models for both unconditional and conditional graph generation are proposed respectively; the existing works of each are compared and analyzed. After that, an overview of the evaluation metrics in this specific domain is provided. Finally, the applications that deep graph generation enables are summarized and five promising future research directions are highlighted.}, urldate = {2023-03-24}, publisher = {arXiv}, author = {Guo, Xiaojie and Zhao, Liang}, month = oct, year = {2022}, note = {arXiv:2007.06686 [cs, stat]}, keywords = {Computer Science - Machine Learning, Statistics - Machine Learning}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/M6I3YJP8/Guo et Zhao - 2022 - A Systematic Survey on Deep Generative Models for .pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/8N8L3XCF/2007.html:text/html}, } @misc{doersch_tutorial_2021, title = {Tutorial on {Variational} {Autoencoders}}, url = {http://arxiv.org/abs/1606.05908}, doi = {10.48550/arXiv.1606.05908}, abstract = {In just three years, Variational Autoencoders (VAEs) have emerged as one of the most popular approaches to unsupervised learning of complicated distributions. VAEs are appealing because they are built on top of standard function approximators (neural networks), and can be trained with stochastic gradient descent. VAEs have already shown promise in generating many kinds of complicated data, including handwritten digits, faces, house numbers, CIFAR images, physical models of scenes, segmentation, and predicting the future from static images. This tutorial introduces the intuitions behind VAEs, explains the mathematics behind them, and describes some empirical behavior. No prior knowledge of variational Bayesian methods is assumed.}, urldate = {2023-03-24}, publisher = {arXiv}, author = {Doersch, Carl}, month = jan, year = {2021}, note = {arXiv:1606.05908 [cs, stat]}, keywords = {Computer Science - Machine Learning, Statistics - Machine Learning}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/84J4LNV2/Doersch - 2021 - Tutorial on Variational Autoencoders.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/EWJRB7BM/1606.html:text/html}, } @misc{salha-galvan_contributions_2022, title = {Contributions to {Representation} {Learning} with {Graph} {Autoencoders} and {Applications} to {Music} {Recommendation}}, url = {http://arxiv.org/abs/2205.14651}, doi = {10.48550/arXiv.2205.14651}, abstract = {Graph autoencoders (GAE) and variational graph autoencoders (VGAE) emerged as two powerful groups of unsupervised node embedding methods, with various applications to graph-based machine learning problems such as link prediction and community detection. Nonetheless, at the beginning of this Ph.D. project, GAE and VGAE models were also suffering from key limitations, preventing them from being adopted in the industry. In this thesis, we present several contributions to improve these models, with the general aim of facilitating their use to address industrial-level problems involving graph representations. 
Firstly, we propose two strategies to overcome the scalability issues of previous GAE and VGAE models, permitting to effectively train these models on large graphs with millions of nodes and edges. These strategies leverage graph degeneracy and stochastic subgraph decoding techniques, respectively. Besides, we introduce Gravity-Inspired GAE and VGAE, providing the first extensions of these models for directed graphs, that are ubiquitous in industrial applications. We also consider extensions of GAE and VGAE models for dynamic graphs. Furthermore, we argue that GAE and VGAE models are often unnecessarily complex, and we propose to simplify them by leveraging linear encoders. Lastly, we introduce Modularity-Aware GAE and VGAE to improve community detection on graphs, while jointly preserving good performances on link prediction. In the last part of this thesis, we evaluate our methods on several graphs extracted from the music streaming service Deezer. We put the emphasis on graph-based music recommendation problems. In particular, we show that our methods can improve the detection of communities of similar musical items to recommend to users, that they can effectively rank similar artists in a cold start setting, and that they permit modeling the music genre perception across cultures.}, urldate = {2023-03-24}, publisher = {arXiv}, author = {Salha-Galvan, Guillaume}, month = may, year = {2022}, note = {arXiv:2205.14651 [cs] version: 1}, keywords = {Computer Science - Machine Learning, Computer Science - Social and Information Networks, Computer Science - Information Retrieval}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/4R2Z87LG/Salha-Galvan - 2022 - Contributions to Representation Learning with Grap.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/AMRY4RUI/2205.html:text/html}, } @misc{su_f-vaes_2018, title = {f-{VAEs}: {Improve} {VAEs} with {Conditional} {Flows}}, shorttitle = {f-{VAEs}}, url = {http://arxiv.org/abs/1809.05861}, doi = {10.48550/arXiv.1809.05861}, abstract = {In this paper, we integrate VAEs and flow-based generative models successfully and get f-VAEs. Compared with VAEs, f-VAEs generate more vivid images, solved the blurred-image problem of VAEs. Compared with flow-based models such as Glow, f-VAE is more lightweight and converges faster, achieving the same performance under smaller-size architecture.}, urldate = {2023-03-24}, publisher = {arXiv}, author = {Su, Jianlin and Wu, Guang}, month = sep, year = {2018}, note = {arXiv:1809.05861 [cs, stat]}, keywords = {Computer Science - Computer Vision and Pattern Recognition, Computer Science - Machine Learning, Statistics - Machine Learning}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/2YPANJ73/Su et Wu - 2018 - f-VAEs Improve VAEs with Conditional Flows.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/9M6GUZEX/1809.html:text/html}, } @misc{burgess_understanding_2018, title = {Understanding disentangling in $\beta$-{VAE}}, url = {http://arxiv.org/abs/1804.03599}, doi = {10.48550/arXiv.1804.03599}, abstract = {We present new intuitions and theoretical assessments of the emergence of disentangled representation in variational autoencoders. Taking a rate-distortion theory perspective, we show the circumstances under which representations aligned with the underlying generative factors of variation of data emerge when optimising the modified ELBO bound in $\beta$-VAE, as training progresses.
From these insights, we propose a modification to the training regime of $\beta$-VAE, that progressively increases the information capacity of the latent code during training. This modification facilitates the robust learning of disentangled representations in $\beta$-VAE, without the previous trade-off in reconstruction accuracy.}, urldate = {2023-03-23}, publisher = {arXiv}, author = {Burgess, Christopher P. and Higgins, Irina and Pal, Arka and Matthey, Loic and Watters, Nick and Desjardins, Guillaume and Lerchner, Alexander}, month = apr, year = {2018}, note = {arXiv:1804.03599 [cs, stat]}, keywords = {Computer Science - Machine Learning, Statistics - Machine Learning, Computer Science - Artificial Intelligence}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/I7FNXM4I/Burgess et al. - 2018 - Understanding disentangling in \$beta\$-VAE.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/4JPKDD7F/1804.html:text/html}, } @misc{brody_how_2022, title = {How {Attentive} are {Graph} {Attention} {Networks}?}, url = {http://arxiv.org/abs/2105.14491}, doi = {10.48550/arXiv.2105.14491}, abstract = {Graph Attention Networks (GATs) are one of the most popular GNN architectures and are considered as the state-of-the-art architecture for representation learning with graphs. In GAT, every node attends to its neighbors given its own representation as the query. However, in this paper we show that GAT computes a very limited kind of attention: the ranking of the attention scores is unconditioned on the query node. We formally define this restricted kind of attention as static attention and distinguish it from a strictly more expressive dynamic attention. Because GATs use a static attention mechanism, there are simple graph problems that GAT cannot express: in a controlled problem, we show that static attention hinders GAT from even fitting the training data. To remove this limitation, we introduce a simple fix by modifying the order of operations and propose GATv2: a dynamic graph attention variant that is strictly more expressive than GAT. We perform an extensive evaluation and show that GATv2 outperforms GAT across 11 OGB and other benchmarks while we match their parametric costs. Our code is available at https://github.com/tech-srl/how\_attentive\_are\_gats . GATv2 is available as part of the PyTorch Geometric library, the Deep Graph Library, and the TensorFlow GNN library.}, urldate = {2023-03-22}, publisher = {arXiv}, author = {Brody, Shaked and Alon, Uri and Yahav, Eran}, month = jan, year = {2022}, note = {arXiv:2105.14491 [cs]}, keywords = {Computer Science - Machine Learning}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/C5CY9B82/Brody et al. - 2022 - How Attentive are Graph Attention Networks.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/RWEJ8RAY/2105.html:text/html}, } @misc{velickovic_graph_2018, title = {Graph {Attention} {Networks}}, url = {http://arxiv.org/abs/1710.10903}, doi = {10.48550/arXiv.1710.10903}, abstract = {We present graph attention networks (GATs), novel neural network architectures that operate on graph-structured data, leveraging masked self-attentional layers to address the shortcomings of prior methods based on graph convolutions or their approximations.
By stacking layers in which nodes are able to attend over their neighborhoods' features, we enable (implicitly) specifying different weights to different nodes in a neighborhood, without requiring any kind of costly matrix operation (such as inversion) or depending on knowing the graph structure upfront. In this way, we address several key challenges of spectral-based graph neural networks simultaneously, and make our model readily applicable to inductive as well as transductive problems. Our GAT models have achieved or matched state-of-the-art results across four established transductive and inductive graph benchmarks: the Cora, Citeseer and Pubmed citation network datasets, as well as a protein-protein interaction dataset (wherein test graphs remain unseen during training).}, urldate = {2023-03-22}, publisher = {arXiv}, author = {Veličković, Petar and Cucurull, Guillem and Casanova, Arantxa and Romero, Adriana and Liò, Pietro and Bengio, Yoshua}, month = feb, year = {2018}, note = {arXiv:1710.10903 [cs, stat]}, keywords = {Computer Science - Machine Learning, Statistics - Machine Learning, Computer Science - Artificial Intelligence, Computer Science - Social and Information Networks}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/3X4HALUD/Veličković et al. - 2018 - Graph Attention Networks.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/JGM27EQ6/1710.html:text/html}, } @misc{kipf_semi-supervised_2017, title = {Semi-{Supervised} {Classification} with {Graph} {Convolutional} {Networks}}, url = {http://arxiv.org/abs/1609.02907}, doi = {10.48550/arXiv.1609.02907}, abstract = {We present a scalable approach for semi-supervised learning on graph-structured data that is based on an efficient variant of convolutional neural networks which operate directly on graphs. We motivate the choice of our convolutional architecture via a localized first-order approximation of spectral graph convolutions. Our model scales linearly in the number of graph edges and learns hidden layer representations that encode both local graph structure and features of nodes. In a number of experiments on citation networks and on a knowledge graph dataset we demonstrate that our approach outperforms related methods by a significant margin.}, urldate = {2023-03-22}, publisher = {arXiv}, author = {Kipf, Thomas N. and Welling, Max}, month = feb, year = {2017}, note = {arXiv:1609.02907 [cs, stat]}, keywords = {Computer Science - Machine Learning, Statistics - Machine Learning}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/N2GXN6ZZ/Kipf et Welling - 2017 - Semi-Supervised Classification with Graph Convolut.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/WMTNID7V/1609.html:text/html}, } @misc{gao_graph_2019, title = {Graph {U}-{Nets}}, url = {http://arxiv.org/abs/1905.05178}, doi = {10.48550/arXiv.1905.05178}, abstract = {We consider the problem of representation learning for graph data. Convolutional neural networks can naturally operate on images, but have significant challenges in dealing with graph data. Given images are special cases of graphs with nodes lie on 2D lattices, graph embedding tasks have a natural correspondence with image pixel-wise prediction tasks such as segmentation. While encoder-decoder architectures like U-Nets have been successfully applied on many image pixel-wise prediction tasks, similar methods are lacking for graph data. This is due to the fact that pooling and up-sampling operations are not natural on graph data. 
To address these challenges, we propose novel graph pooling (gPool) and unpooling (gUnpool) operations in this work. The gPool layer adaptively selects some nodes to form a smaller graph based on their scalar projection values on a trainable projection vector. We further propose the gUnpool layer as the inverse operation of the gPool layer. The gUnpool layer restores the graph into its original structure using the position information of nodes selected in the corresponding gPool layer. Based on our proposed gPool and gUnpool layers, we develop an encoder-decoder model on graph, known as the graph U-Nets. Our experimental results on node classification and graph classification tasks demonstrate that our methods achieve consistently better performance than previous models.}, urldate = {2023-03-21}, publisher = {arXiv}, author = {Gao, Hongyang and Ji, Shuiwang}, month = may, year = {2019}, note = {arXiv:1905.05178 [cs, stat]}, keywords = {Computer Science - Machine Learning, Statistics - Machine Learning}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/QIVY2Z39/Gao et Ji - 2019 - Graph U-Nets.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/YHWGK3H7/1905.html:text/html}, } @misc{kipf_variational_2016, title = {Variational {Graph} {Auto}-{Encoders}}, url = {http://arxiv.org/abs/1611.07308}, doi = {10.48550/arXiv.1611.07308}, abstract = {We introduce the variational graph auto-encoder (VGAE), a framework for unsupervised learning on graph-structured data based on the variational auto-encoder (VAE). This model makes use of latent variables and is capable of learning interpretable latent representations for undirected graphs. We demonstrate this model using a graph convolutional network (GCN) encoder and a simple inner product decoder. Our model achieves competitive results on a link prediction task in citation networks. In contrast to most existing models for unsupervised learning on graph-structured data and link prediction, our model can naturally incorporate node features, which significantly improves predictive performance on a number of benchmark datasets.}, urldate = {2023-03-21}, publisher = {arXiv}, author = {Kipf, Thomas N. and Welling, Max}, month = nov, year = {2016}, note = {arXiv:1611.07308 [cs, stat]}, keywords = {Computer Science - Machine Learning, Statistics - Machine Learning}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/8LYSMTVS/Kipf et Welling - 2016 - Variational Graph Auto-Encoders.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/KCLQX6TX/1611.html:text/html}, } @misc{alemi_deep_2019, title = {Deep {Variational} {Information} {Bottleneck}}, url = {http://arxiv.org/abs/1612.00410}, doi = {10.48550/arXiv.1612.00410}, abstract = {We present a variational approximation to the information bottleneck of Tishby et al. (1999). This variational approach allows us to parameterize the information bottleneck model using a neural network and leverage the reparameterization trick for efficient training. We call this method "Deep Variational Information Bottleneck", or Deep VIB. We show that models trained with the VIB objective outperform those that are trained with other forms of regularization, in terms of generalization performance and robustness to adversarial attack.}, urldate = {2023-03-21}, publisher = {arXiv}, author = {Alemi, Alexander A. and Fischer, Ian and Dillon, Joshua V. 
and Murphy, Kevin}, month = oct, year = {2019}, note = {arXiv:1612.00410 [cs, math]}, keywords = {Computer Science - Machine Learning, Computer Science - Information Theory}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/LMPVVWG5/Alemi et al. - 2019 - Deep Variational Information Bottleneck.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/VBXN4EUZ/1612.html:text/html}, } @misc{thomas_kpconv_2019, title = {{KPConv}: {Flexible} and {Deformable} {Convolution} for {Point} {Clouds}}, shorttitle = {{KPConv}}, url = {http://arxiv.org/abs/1904.08889}, abstract = {We present Kernel Point Convolution (KPConv), a new design of point convolution, i.e. that operates on point clouds without any intermediate representation. The convolution weights of KPConv are located in Euclidean space by kernel points, and applied to the input points close to them. Its capacity to use any number of kernel points gives KPConv more flexibility than fixed grid convolutions. Furthermore, these locations are continuous in space and can be learned by the network. Therefore, KPConv can be extended to deformable convolutions that learn to adapt kernel points to local geometry. Thanks to a regular subsampling strategy, KPConv is also efficient and robust to varying densities. Whether they use deformable KPConv for complex tasks, or rigid KPconv for simpler tasks, our networks outperform state-of-the-art classification and segmentation approaches on several datasets. We also offer ablation studies and visualizations to provide understanding of what has been learned by KPConv and to validate the descriptive power of deformable KPConv.}, urldate = {2023-05-15}, publisher = {arXiv}, author = {Thomas, Hugues and Qi, Charles R. and Deschaud, Jean-Emmanuel and Marcotegui, Beatriz and Goulette, François and Guibas, Leonidas J.}, month = aug, year = {2019}, note = {arXiv:1904.08889 [cs]}, keywords = {Computer Science - Computer Vision and Pattern Recognition}, file = {arXiv.org Snapshot:/home/laurent/Zotero/storage/5CY645DK/1904.html:text/html;Full Text PDF:/home/laurent/Zotero/storage/782FKEML/Thomas et al. - 2019 - KPConv Flexible and Deformable Convolution for Po.pdf:application/pdf}, } @misc{tang_searching_2020, title = {Searching {Efficient} {3D} {Architectures} with {Sparse} {Point}-{Voxel} {Convolution}}, url = {http://arxiv.org/abs/2007.16100}, doi = {10.48550/arXiv.2007.16100}, abstract = {Self-driving cars need to understand 3D scenes efficiently and accurately in order to drive safely. Given the limited hardware resources, existing 3D perception models are not able to recognize small instances (e.g., pedestrians, cyclists) very well due to the low-resolution voxelization and aggressive downsampling. To this end, we propose Sparse Point-Voxel Convolution (SPVConv), a lightweight 3D module that equips the vanilla Sparse Convolution with the high-resolution point-based branch. With negligible overhead, this point-based branch is able to preserve the fine details even from large outdoor scenes. To explore the spectrum of efficient 3D models, we first define a flexible architecture design space based on SPVConv, and we then present 3D Neural Architecture Search (3D-NAS) to search the optimal network architecture over this diverse design space efficiently and effectively. Experimental results validate that the resulting SPVNAS model is fast and accurate: it outperforms the state-of-the-art MinkowskiNet by 3.3\%, ranking 1st on the competitive SemanticKITTI leaderboard. 
It also achieves 8x computation reduction and 3x measured speedup over MinkowskiNet with higher accuracy. Finally, we transfer our method to 3D object detection, and it achieves consistent improvements over the one-stage detection baseline on KITTI.}, urldate = {2023-04-26}, publisher = {arXiv}, author = {Tang, Haotian and Liu, Zhijian and Zhao, Shengyu and Lin, Yujun and Lin, Ji and Wang, Hanrui and Han, Song}, month = aug, year = {2020}, note = {arXiv:2007.16100 [cs]}, keywords = {Computer Science - Computer Vision and Pattern Recognition}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/A2S9RZVE/Tang et al. - 2020 - Searching Efficient 3D Architectures with Sparse P.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/QK6WTDZH/2007.html:text/html}, } @misc{nguyen_point-set_2021, title = {Point-set {Distances} for {Learning} {Representations} of {3D} {Point} {Clouds}}, url = {http://arxiv.org/abs/2102.04014}, doi = {10.48550/arXiv.2102.04014}, abstract = {Learning an effective representation of 3D point clouds requires a good metric to measure the discrepancy between two 3D point sets, which is non-trivial due to their irregularity. Most of the previous works resort to using the Chamfer discrepancy or Earth Mover's distance, but those metrics are either ineffective in measuring the differences between point clouds or computationally expensive. In this paper, we conduct a systematic study with extensive experiments on distance metrics for 3D point clouds. From this study, we propose to use sliced Wasserstein distance and its variants for learning representations of 3D point clouds. In addition, we introduce a new algorithm to estimate sliced Wasserstein distance that guarantees that the estimated value is close enough to the true one. Experiments show that the sliced Wasserstein distance and its variants allow the neural network to learn a more efficient representation compared to the Chamfer discrepancy. We demonstrate the efficiency of the sliced Wasserstein metric and its variants on several tasks in 3D computer vision including training a point cloud autoencoder, generative modeling, transfer learning, and point cloud registration.}, urldate = {2023-04-21}, publisher = {arXiv}, author = {Nguyen, Trung and Pham, Quang-Hieu and Le, Tam and Pham, Tung and Ho, Nhat and Hua, Binh-Son}, month = sep, year = {2021}, note = {arXiv:2102.04014 [cs]}, keywords = {Computer Science - Computer Vision and Pattern Recognition}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/ZND8758D/Nguyen et al. - 2021 - Point-set Distances for Learning Representations o.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/IUDCHXC2/2102.html:text/html}, } @misc{peng_shape_2021, title = {Shape {As} {Points}: {A} {Differentiable} {Poisson} {Solver}}, shorttitle = {Shape {As} {Points}}, url = {http://arxiv.org/abs/2106.03452}, doi = {10.48550/arXiv.2106.03452}, abstract = {In recent years, neural implicit representations gained popularity in 3D reconstruction due to their expressiveness and flexibility. However, the implicit nature of neural implicit representations results in slow inference time and requires careful initialization. In this paper, we revisit the classic yet ubiquitous point cloud representation and introduce a differentiable point-to-mesh layer using a differentiable formulation of Poisson Surface Reconstruction (PSR) that allows for a GPU-accelerated fast solution of the indicator function given an oriented point cloud. 
The differentiable PSR layer allows us to efficiently and differentiably bridge the explicit 3D point representation with the 3D mesh via the implicit indicator field, enabling end-to-end optimization of surface reconstruction metrics such as Chamfer distance. This duality between points and meshes hence allows us to represent shapes as oriented point clouds, which are explicit, lightweight and expressive. Compared to neural implicit representations, our Shape-As-Points (SAP) model is more interpretable, lightweight, and accelerates inference time by one order of magnitude. Compared to other explicit representations such as points, patches, and meshes, SAP produces topology-agnostic, watertight manifold surfaces. We demonstrate the effectiveness of SAP on the task of surface reconstruction from unoriented point clouds and learning-based reconstruction.}, urldate = {2023-04-17}, publisher = {arXiv}, author = {Peng, Songyou and Jiang, Chiyu "Max" and Liao, Yiyi and Niemeyer, Michael and Pollefeys, Marc and Geiger, Andreas}, month = nov, year = {2021}, note = {arXiv:2106.03452 [cs]}, keywords = {Computer Science - Computer Vision and Pattern Recognition, Computer Science - Graphics}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/V5BVL34J/Peng et al. - 2021 - Shape As Points A Differentiable Poisson Solver.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/7J3IDAKQ/2106.html:text/html}, } @misc{sulzer_deep_2022, title = {Deep {Surface} {Reconstruction} from {Point} {Clouds} with {Visibility} {Information}}, url = {http://arxiv.org/abs/2202.01810}, doi = {10.48550/arXiv.2202.01810}, abstract = {Most current neural networks for reconstructing surfaces from point clouds ignore sensor poses and only operate on raw point locations. Sensor visibility, however, holds meaningful information regarding space occupancy and surface orientation. In this paper, we present two simple ways to augment raw point clouds with visibility information, so it can directly be leveraged by surface reconstruction networks with minimal adaptation. Our proposed modifications consistently improve the accuracy of generated surfaces as well as the generalization ability of the networks to unseen shape domains. Our code and data is available at https://github.com/raphaelsulzer/dsrv-data.}, urldate = {2023-04-17}, publisher = {arXiv}, author = {Sulzer, Raphael and Landrieu, Loic and Boulch, Alexandre and Marlet, Renaud and Vallet, Bruno}, month = feb, year = {2022}, note = {arXiv:2202.01810 [cs]}, keywords = {Computer Science - Computer Vision and Pattern Recognition}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/ZDTHHW9H/Sulzer et al. - 2022 - Deep Surface Reconstruction from Point Clouds with.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/X84KMGRU/2202.html:text/html}, } @misc{nam_3d-ldm_2022, title = {{3D}-{LDM}: {Neural} {Implicit} {3D} {Shape} {Generation} with {Latent} {Diffusion} {Models}}, shorttitle = {{3D}-{LDM}}, url = {http://arxiv.org/abs/2212.00842}, doi = {10.48550/arXiv.2212.00842}, abstract = {Diffusion models have shown great promise for image generation, beating GANs in terms of generation diversity, with comparable image quality. However, their application to 3D shapes has been limited to point or voxel representations that can in practice not accurately represent a 3D surface. We propose a diffusion model for neural implicit representations of 3D shapes that operates in the latent space of an auto-decoder. 
This allows us to generate diverse and high quality 3D surfaces. We additionally show that we can condition our model on images or text to enable image-to-3D generation and text-to-3D generation using CLIP embeddings. Furthermore, adding noise to the latent codes of existing shapes allows us to explore shape variations.}, urldate = {2023-04-11}, publisher = {arXiv}, author = {Nam, Gimin and Khlifi, Mariem and Rodriguez, Andrew and Tono, Alberto and Zhou, Linqi and Guerrero, Paul}, month = dec, year = {2022}, note = {arXiv:2212.00842 [cs]}, keywords = {Computer Science - Computer Vision and Pattern Recognition}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/8DKDU5WY/Nam et al. - 2022 - 3D-LDM Neural Implicit 3D Shape Generation with L.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/DMM8K287/2212.html:text/html}, } @misc{zhou_3d_2021, title = {{3D} {Shape} {Generation} and {Completion} through {Point}-{Voxel} {Diffusion}}, url = {http://arxiv.org/abs/2104.03670}, doi = {10.48550/arXiv.2104.03670}, abstract = {We propose a novel approach for probabilistic generative modeling of 3D shapes. Unlike most existing models that learn to deterministically translate a latent vector to a shape, our model, Point-Voxel Diffusion (PVD), is a unified, probabilistic formulation for unconditional shape generation and conditional, multi-modal shape completion. PVD marries denoising diffusion models with the hybrid, point-voxel representation of 3D shapes. It can be viewed as a series of denoising steps, reversing the diffusion process from observed point cloud data to Gaussian noise, and is trained by optimizing a variational lower bound to the (conditional) likelihood function. Experiments demonstrate that PVD is capable of synthesizing high-fidelity shapes, completing partial point clouds, and generating multiple completion results from single-view depth scans of real objects.}, urldate = {2023-04-04}, publisher = {arXiv}, author = {Zhou, Linqi and Du, Yilun and Wu, Jiajun}, month = aug, year = {2021}, note = {arXiv:2104.03670 [cs]}, keywords = {Computer Science - Computer Vision and Pattern Recognition}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/WGECL3FJ/Zhou et al. - 2021 - 3D Shape Generation and Completion through Point-V.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/C3AEKFNE/2104.html:text/html}, } @misc{liu_point-voxel_2019, title = {Point-{Voxel} {CNN} for {Efficient} {3D} {Deep} {Learning}}, url = {http://arxiv.org/abs/1907.03739}, doi = {10.48550/arXiv.1907.03739}, abstract = {We present Point-Voxel CNN (PVCNN) for efficient, fast 3D deep learning. Previous work processes 3D data using either voxel-based or point-based NN models. However, both approaches are computationally inefficient. The computation cost and memory footprints of the voxel-based models grow cubically with the input resolution, making it memory-prohibitive to scale up the resolution. As for point-based networks, up to 80\% of the time is wasted on structuring the sparse data which have rather poor memory locality, not on the actual feature extraction. In this paper, we propose PVCNN that represents the 3D input data in points to reduce the memory consumption, while performing the convolutions in voxels to reduce the irregular, sparse data access and improve the locality. Our PVCNN model is both memory and computation efficient. 
Evaluated on semantic and part segmentation datasets, it achieves much higher accuracy than the voxel-based baseline with 10x GPU memory reduction; it also outperforms the state-of-the-art point-based models with 7x measured speedup on average. Remarkably, the narrower version of PVCNN achieves 2x speedup over PointNet (an extremely efficient model) on part and scene segmentation benchmarks with much higher accuracy. We validate the general effectiveness of PVCNN on 3D object detection: by replacing the primitives in Frustrum PointNet with PVConv, it outperforms Frustrum PointNet++ by 2.4\% mAP on average with 1.5x measured speedup and GPU memory reduction.}, urldate = {2023-04-04}, publisher = {arXiv}, author = {Liu, Zhijian and Tang, Haotian and Lin, Yujun and Han, Song}, month = dec, year = {2019}, note = {arXiv:1907.03739 [cs]}, keywords = {Computer Science - Computer Vision and Pattern Recognition}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/A2XJARYA/Liu et al. - 2019 - Point-Voxel CNN for Efficient 3D Deep Learning.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/LF7RPTGF/1907.html:text/html}, } @misc{qi_pointnet_2017, title = {{PointNet}: {Deep} {Learning} on {Point} {Sets} for {3D} {Classification} and {Segmentation}}, shorttitle = {{PointNet}}, url = {http://arxiv.org/abs/1612.00593}, doi = {10.48550/arXiv.1612.00593}, abstract = {Point cloud is an important type of geometric data structure. Due to its irregular format, most researchers transform such data to regular 3D voxel grids or collections of images. This, however, renders data unnecessarily voluminous and causes issues. In this paper, we design a novel type of neural network that directly consumes point clouds and well respects the permutation invariance of points in the input. Our network, named PointNet, provides a unified architecture for applications ranging from object classification, part segmentation, to scene semantic parsing. Though simple, PointNet is highly efficient and effective. Empirically, it shows strong performance on par or even better than state of the art. Theoretically, we provide analysis towards understanding of what the network has learnt and why the network is robust with respect to input perturbation and corruption.}, urldate = {2023-04-04}, publisher = {arXiv}, author = {Qi, Charles R. and Su, Hao and Mo, Kaichun and Guibas, Leonidas J.}, month = apr, year = {2017}, note = {arXiv:1612.00593 [cs]}, keywords = {Computer Science - Computer Vision and Pattern Recognition}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/SV6H7XA9/Qi et al. - 2017 - PointNet Deep Learning on Point Sets for 3D Class.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/YF79EZLH/1612.html:text/html}, } @misc{qi_pointnet_2017-1, title = {{PointNet}++: {Deep} {Hierarchical} {Feature} {Learning} on {Point} {Sets} in a {Metric} {Space}}, shorttitle = {{PointNet}++}, url = {http://arxiv.org/abs/1706.02413}, doi = {10.48550/arXiv.1706.02413}, abstract = {Few prior works study deep learning on point sets. PointNet by Qi et al. is a pioneer in this direction. However, by design PointNet does not capture local structures induced by the metric space points live in, limiting its ability to recognize fine-grained patterns and generalizability to complex scenes. In this work, we introduce a hierarchical neural network that applies PointNet recursively on a nested partitioning of the input point set. 
By exploiting metric space distances, our network is able to learn local features with increasing contextual scales. With further observation that point sets are usually sampled with varying densities, which results in greatly decreased performance for networks trained on uniform densities, we propose novel set learning layers to adaptively combine features from multiple scales. Experiments show that our network called PointNet++ is able to learn deep point set features efficiently and robustly. In particular, results significantly better than state-of-the-art have been obtained on challenging benchmarks of 3D point clouds.}, urldate = {2023-04-03}, publisher = {arXiv}, author = {Qi, Charles R. and Yi, Li and Su, Hao and Guibas, Leonidas J.}, month = jun, year = {2017}, note = {arXiv:1706.02413 [cs]}, keywords = {Computer Science - Computer Vision and Pattern Recognition}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/4FPME54R/Qi et al. - 2017 - PointNet++ Deep Hierarchical Feature Learning on .pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/SXSSFMBW/1706.html:text/html}, } @misc{hang_efficient_2023, title = {Efficient {Diffusion} {Training} via {Min}-{SNR} {Weighting} {Strategy}}, url = {http://arxiv.org/abs/2303.09556}, doi = {10.48550/arXiv.2303.09556}, abstract = {Denoising diffusion models have been a mainstream approach for image generation, however, training these models often suffers from slow convergence. In this paper, we discovered that the slow convergence is partly due to conflicting optimization directions between timesteps. To address this issue, we treat the diffusion training as a multi-task learning problem, and introduce a simple yet effective approach referred to as Min-SNR-$\gamma$. This method adapts loss weights of timesteps based on clamped signal-to-noise ratios, which effectively balances the conflicts among timesteps. Our results demonstrate a significant improvement in converging speed, 3.4$\times$ faster than previous weighting strategies. It is also more effective, achieving a new record FID score of 2.06 on the ImageNet $256\times256$ benchmark using smaller architectures than that employed in previous state-of-the-art. The code is available at https://github.com/TiankaiHang/Min-SNR-Diffusion-Training.}, urldate = {2023-06-15}, publisher = {arXiv}, author = {Hang, Tiankai and Gu, Shuyang and Li, Chen and Bao, Jianmin and Chen, Dong and Hu, Han and Geng, Xin and Guo, Baining}, month = mar, year = {2023}, note = {arXiv:2303.09556 [cs]}, keywords = {Computer Science - Computer Vision and Pattern Recognition}, file = {arXiv.org Snapshot:/home/laurent/Zotero/storage/EQPT236P/Hang et al. - 2023 - Efficient Diffusion Training via Min-SNR Weighting.html:text/html}, } @misc{rombach_high-resolution_2022, title = {High-{Resolution} {Image} {Synthesis} with {Latent} {Diffusion} {Models}}, url = {http://arxiv.org/abs/2112.10752}, abstract = {By decomposing the image formation process into a sequential application of denoising autoencoders, diffusion models (DMs) achieve state-of-the-art synthesis results on image data and beyond. Additionally, their formulation allows for a guiding mechanism to control the image generation process without retraining. However, since these models typically operate directly in pixel space, optimization of powerful DMs often consumes hundreds of GPU days and inference is expensive due to sequential evaluations.
To enable DM training on limited computational resources while retaining their quality and flexibility, we apply them in the latent space of powerful pretrained autoencoders. In contrast to previous work, training diffusion models on such a representation allows for the first time to reach a near-optimal point between complexity reduction and detail preservation, greatly boosting visual fidelity. By introducing cross-attention layers into the model architecture, we turn diffusion models into powerful and flexible generators for general conditioning inputs such as text or bounding boxes and high-resolution synthesis becomes possible in a convolutional manner. Our latent diffusion models (LDMs) achieve a new state of the art for image inpainting and highly competitive performance on various tasks, including unconditional image generation, semantic scene synthesis, and super-resolution, while significantly reducing computational requirements compared to pixel-based DMs. Code is available at https://github.com/CompVis/latent-diffusion .}, urldate = {2023-06-13}, publisher = {arXiv}, author = {Rombach, Robin and Blattmann, Andreas and Lorenz, Dominik and Esser, Patrick and Ommer, Björn}, month = apr, year = {2022}, note = {arXiv:2112.10752 [cs]}, keywords = {Computer Science - Computer Vision and Pattern Recognition}, file = {arXiv.org Snapshot:/home/laurent/Zotero/storage/7AQALVMG/2112.html:text/html;Full Text PDF:/home/laurent/Zotero/storage/NSX4PSPP/Rombach et al. - 2022 - High-Resolution Image Synthesis with Latent Diffus.pdf:application/pdf}, } @misc{luo_understanding_2022, title = {Understanding {Diffusion} {Models}: {A} {Unified} {Perspective}}, shorttitle = {Understanding {Diffusion} {Models}}, url = {http://arxiv.org/abs/2208.11970}, abstract = {Diffusion models have shown incredible capabilities as generative models; indeed, they power the current state-of-the-art models on text-conditioned image generation such as Imagen and DALL-E 2. In this work we review, demystify, and unify the understanding of diffusion models across both variational and score-based perspectives. We first derive Variational Diffusion Models (VDM) as a special case of a Markovian Hierarchical Variational Autoencoder, where three key assumptions enable tractable computation and scalable optimization of the ELBO. We then prove that optimizing a VDM boils down to learning a neural network to predict one of three potential objectives: the original source input from any arbitrary noisification of it, the original source noise from any arbitrarily noisified input, or the score function of a noisified input at any arbitrary noise level. We then dive deeper into what it means to learn the score function, and connect the variational perspective of a diffusion model explicitly with the Score-based Generative Modeling perspective through Tweedie's Formula. 
Lastly, we cover how to learn a conditional distribution using diffusion models via guidance.}, urldate = {2023-06-12}, publisher = {arXiv}, author = {Luo, Calvin}, month = aug, year = {2022}, note = {arXiv:2208.11970 [cs]}, keywords = {Computer Science - Computer Vision and Pattern Recognition, Computer Science - Machine Learning}, file = {arXiv.org Snapshot:/home/laurent/Zotero/storage/YBCUMCLB/2208.html:text/html;Full Text PDF:/home/laurent/Zotero/storage/6C9BARLG/Luo - 2022 - Understanding Diffusion Models A Unified Perspect.pdf:application/pdf}, } @misc{zhu_unpaired_2020, title = {Unpaired {Image}-to-{Image} {Translation} using {Cycle}-{Consistent} {Adversarial} {Networks}}, url = {http://arxiv.org/abs/1703.10593}, doi = {10.48550/arXiv.1703.10593}, abstract = {Image-to-image translation is a class of vision and graphics problems where the goal is to learn the mapping between an input image and an output image using a training set of aligned image pairs. However, for many tasks, paired training data will not be available. We present an approach for learning to translate an image from a source domain $X$ to a target domain $Y$ in the absence of paired examples. Our goal is to learn a mapping $G: X \rightarrow Y$ such that the distribution of images from $G(X)$ is indistinguishable from the distribution $Y$ using an adversarial loss. Because this mapping is highly under-constrained, we couple it with an inverse mapping $F: Y \rightarrow X$ and introduce a cycle consistency loss to push $F(G(X)) \approx X$ (and vice versa). Qualitative results are presented on several tasks where paired training data does not exist, including collection style transfer, object transfiguration, season transfer, photo enhancement, etc. Quantitative comparisons against several prior methods demonstrate the superiority of our approach.}, urldate = {2023-07-06}, publisher = {arXiv}, author = {Zhu, Jun-Yan and Park, Taesung and Isola, Phillip and Efros, Alexei A.}, month = aug, year = {2020}, note = {arXiv:1703.10593 [cs]}, keywords = {Computer Science - Computer Vision and Pattern Recognition}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/Y6SLL26A/Zhu et al. - 2020 - Unpaired Image-to-Image Translation using Cycle-Co.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/EWW8TRT2/1703.html:text/html}, } @misc{odena_semi-supervised_2016, title = {Semi-{Supervised} {Learning} with {Generative} {Adversarial} {Networks}}, url = {http://arxiv.org/abs/1606.01583}, doi = {10.48550/arXiv.1606.01583}, abstract = {We extend Generative Adversarial Networks (GANs) to the semi-supervised context by forcing the discriminator network to output class labels. We train a generative model G and a discriminator D on a dataset with inputs belonging to one of N classes. At training time, D is made to predict which of N+1 classes the input belongs to, where an extra class is added to correspond to the outputs of G.
We show that this method can be used to create a more data-efficient classifier and that it allows for generating higher quality samples than a regular GAN.}, urldate = {2023-07-06}, publisher = {arXiv}, author = {Odena, Augustus}, month = oct, year = {2016}, note = {arXiv:1606.01583 [cs, stat]}, keywords = {Computer Science - Machine Learning, Statistics - Machine Learning}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/XM4QQ2FW/Odena - 2016 - Semi-Supervised Learning with Generative Adversari.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/TXCYUE77/1606.html:text/html}, } @misc{kim_learning_2017, title = {Learning to {Discover} {Cross}-{Domain} {Relations} with {Generative} {Adversarial} {Networks}}, url = {http://arxiv.org/abs/1703.05192}, doi = {10.48550/arXiv.1703.05192}, abstract = {While humans easily recognize relations between data from different domains without any supervision, learning to automatically discover them is in general very challenging and needs many ground-truth pairs that illustrate the relations. To avoid costly pairing, we address the task of discovering cross-domain relations given unpaired data. We propose a method based on generative adversarial networks that learns to discover relations between different domains (DiscoGAN). Using the discovered relations, our proposed network successfully transfers style from one domain to another while preserving key attributes such as orientation and face identity. Source code for official implementation is publicly available https://github.com/SKTBrain/DiscoGAN}, urldate = {2023-07-06}, publisher = {arXiv}, author = {Kim, Taeksoo and Cha, Moonsu and Kim, Hyunsoo and Lee, Jung Kwon and Kim, Jiwon}, month = may, year = {2017}, note = {arXiv:1703.05192 [cs]}, keywords = {Computer Science - Computer Vision and Pattern Recognition}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/Q6LM7LUP/Kim et al. - 2017 - Learning to Discover Cross-Domain Relations with G.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/VWK3IQCR/1703.html:text/html}, } @misc{ledig_photo-realistic_2017, title = {Photo-{Realistic} {Single} {Image} {Super}-{Resolution} {Using} a {Generative} {Adversarial} {Network}}, url = {http://arxiv.org/abs/1609.04802}, doi = {10.48550/arXiv.1609.04802}, abstract = {Despite the breakthroughs in accuracy and speed of single image super-resolution using faster and deeper convolutional neural networks, one central problem remains largely unsolved: how do we recover the finer texture details when we super-resolve at large upscaling factors? The behavior of optimization-based super-resolution methods is principally driven by the choice of the objective function. Recent work has largely focused on minimizing the mean squared reconstruction error. The resulting estimates have high peak signal-to-noise ratios, but they are often lacking high-frequency details and are perceptually unsatisfying in the sense that they fail to match the fidelity expected at the higher resolution. In this paper, we present SRGAN, a generative adversarial network (GAN) for image super-resolution (SR). To our knowledge, it is the first framework capable of inferring photo-realistic natural images for 4x upscaling factors. To achieve this, we propose a perceptual loss function which consists of an adversarial loss and a content loss. 
The adversarial loss pushes our solution to the natural image manifold using a discriminator network that is trained to differentiate between the super-resolved images and original photo-realistic images. In addition, we use a content loss motivated by perceptual similarity instead of similarity in pixel space. Our deep residual network is able to recover photo-realistic textures from heavily downsampled images on public benchmarks. An extensive mean-opinion-score (MOS) test shows hugely significant gains in perceptual quality using SRGAN. The MOS scores obtained with SRGAN are closer to those of the original high-resolution images than to those obtained with any state-of-the-art method.}, urldate = {2023-07-06}, publisher = {arXiv}, author = {Ledig, Christian and Theis, Lucas and Huszar, Ferenc and Caballero, Jose and Cunningham, Andrew and Acosta, Alejandro and Aitken, Andrew and Tejani, Alykhan and Totz, Johannes and Wang, Zehan and Shi, Wenzhe}, month = may, year = {2017}, note = {arXiv:1609.04802 [cs, stat]}, keywords = {Computer Science - Computer Vision and Pattern Recognition, Statistics - Machine Learning}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/VN4Z76ZB/Ledig et al. - 2017 - Photo-Realistic Single Image Super-Resolution Usin.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/RN7MPPTH/1609.html:text/html}, } @article{ma_comprehensive_2021, title = {A {Comprehensive} {Survey} on {Graph} {Anomaly} {Detection} with {Deep} {Learning}}, issn = {1041-4347, 1558-2191, 2326-3865}, url = {http://arxiv.org/abs/2106.07178}, doi = {10.1109/TKDE.2021.3118815}, abstract = {Anomalies represent rare observations (e.g., data records or events) that deviate significantly from others. Over several decades, research on anomaly mining has received increasing interests due to the implications of these occurrences in a wide range of disciplines. Anomaly detection, which aims to identify rare observations, is among the most vital tasks in the world, and has shown its power in preventing detrimental events, such as financial fraud, network intrusion, and social spam. The detection task is typically solved by identifying outlying data points in the feature space and inherently overlooks the relational information in real-world data. Graphs have been prevalently used to represent the structural information, which raises the graph anomaly detection problem - identifying anomalous graph objects (i.e., nodes, edges and sub-graphs) in a single graph, or anomalous graphs in a database/set of graphs. However, conventional anomaly detection techniques cannot tackle this problem well because of the complexity of graph data. For the advent of deep learning, graph anomaly detection with deep learning has received a growing attention recently. In this survey, we aim to provide a systematic and comprehensive review of the contemporary deep learning techniques for graph anomaly detection. We compile open-sourced implementations, public datasets, and commonly-used evaluation metrics to provide affluent resources for future studies. More importantly, we highlight twelve extensive future research directions according to our survey results covering unsolved and emerging research problems and real-world applications. 
With this survey, our goal is to create a "one-stop-shop" that provides a unified understanding of the problem categories and existing approaches, publicly available hands-on resources, and high-impact open challenges for graph anomaly detection using deep learning.}, urldate = {2023-07-06}, journal = {IEEE Transactions on Knowledge and Data Engineering}, author = {Ma, Xiaoxiao and Wu, Jia and Xue, Shan and Yang, Jian and Zhou, Chuan and Sheng, Quan Z. and Xiong, Hui and Akoglu, Leman}, year = {2021}, note = {arXiv:2106.07178 [cs]}, keywords = {Computer Science - Machine Learning}, pages = {1--1}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/HLLPJA3X/Ma et al. - 2021 - A Comprehensive Survey on Graph Anomaly Detection .pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/RK3I3FR2/2106.html:text/html}, } @misc{gao_survey_2023, title = {A {Survey} of {Graph} {Neural} {Networks} for {Recommender} {Systems}: {Challenges}, {Methods}, and {Directions}}, shorttitle = {A {Survey} of {Graph} {Neural} {Networks} for {Recommender} {Systems}}, url = {http://arxiv.org/abs/2109.12843}, doi = {10.48550/arXiv.2109.12843}, abstract = {Recommender system is one of the most important information services on today's Internet. Recently, graph neural networks have become the new state-of-the-art approach to recommender systems. In this survey, we conduct a comprehensive review of the literature on graph neural network-based recommender systems. We first introduce the background and the history of the development of both recommender systems and graph neural networks. For recommender systems, in general, there are four aspects for categorizing existing works: stage, scenario, objective, and application. For graph neural networks, the existing methods consist of two categories, spectral models and spatial ones. We then discuss the motivation of applying graph neural networks into recommender systems, mainly consisting of the high-order connectivity, the structural property of data, and the enhanced supervision signal. We then systematically analyze the challenges in graph construction, embedding propagation/aggregation, model optimization, and computation efficiency. Afterward and primarily, we provide a comprehensive overview of a multitude of existing works of graph neural network-based recommender systems, following the taxonomy above. Finally, we raise discussions on the open problems and promising future directions in this area. We summarize the representative papers along with their code repositories in \url{https://github.com/tsinghua-fib-lab/GNN-Recommender-Systems}.}, urldate = {2023-07-06}, publisher = {arXiv}, author = {Gao, Chen and Zheng, Yu and Li, Nian and Li, Yinfeng and Qin, Yingrong and Piao, Jinghua and Quan, Yuhan and Chang, Jianxin and Jin, Depeng and He, Xiangnan and Li, Yong}, month = jan, year = {2023}, note = {arXiv:2109.12843 [cs]}, keywords = {Computer Science - Information Retrieval}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/WUJ2Y5V4/Gao et al. - 2023 - A Survey of Graph Neural Networks for Recommender .pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/MADG65MH/2109.html:text/html}, } @misc{li_gated_2017, title = {Gated {Graph} {Sequence} {Neural} {Networks}}, url = {http://arxiv.org/abs/1511.05493}, doi = {10.48550/arXiv.1511.05493}, abstract = {Graph-structured data appears frequently in domains including chemistry, natural language semantics, social networks, and knowledge bases.
In this work, we study feature learning techniques for graph-structured inputs. Our starting point is previous work on Graph Neural Networks (Scarselli et al., 2009), which we modify to use gated recurrent units and modern optimization techniques and then extend to output sequences. The result is a flexible and broadly useful class of neural network models that has favorable inductive biases relative to purely sequence-based models (e.g., LSTMs) when the problem is graph-structured. We demonstrate the capabilities on some simple AI (bAbI) and graph algorithm learning tasks. We then show it achieves state-of-the-art performance on a problem from program verification, in which subgraphs need to be matched to abstract data structures.}, urldate = {2023-07-06}, publisher = {arXiv}, author = {Li, Yujia and Tarlow, Daniel and Brockschmidt, Marc and Zemel, Richard}, month = sep, year = {2017}, note = {arXiv:1511.05493 [cs, stat]}, keywords = {Computer Science - Machine Learning, Computer Science - Neural and Evolutionary Computing, Statistics - Machine Learning, Computer Science - Artificial Intelligence}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/L3VNMV2A/Li et al. - 2017 - Gated Graph Sequence Neural Networks.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/5LW4NDAB/1511.html:text/html}, } @misc{kingma_variational_2023, title = {Variational {Diffusion} {Models}}, url = {http://arxiv.org/abs/2107.00630}, doi = {10.48550/arXiv.2107.00630}, abstract = {Diffusion-based generative models have demonstrated a capacity for perceptually impressive synthesis, but can they also be great likelihood-based models? We answer this in the affirmative, and introduce a family of diffusion-based generative models that obtain state-of-the-art likelihoods on standard image density estimation benchmarks. Unlike other diffusion-based models, our method allows for efficient optimization of the noise schedule jointly with the rest of the model. We show that the variational lower bound (VLB) simplifies to a remarkably short expression in terms of the signal-to-noise ratio of the diffused data, thereby improving our theoretical understanding of this model class. Using this insight, we prove an equivalence between several models proposed in the literature. In addition, we show that the continuous-time VLB is invariant to the noise schedule, except for the signal-to-noise ratio at its endpoints. This enables us to learn a noise schedule that minimizes the variance of the resulting VLB estimator, leading to faster optimization. Combining these advances with architectural improvements, we obtain state-of-the-art likelihoods on image density estimation benchmarks, outperforming autoregressive models that have dominated these benchmarks for many years, with often significantly faster optimization. In addition, we show how to use the model as part of a bits-back compression scheme, and demonstrate lossless compression rates close to the theoretical optimum. Code is available at https://github.com/google-research/vdm .}, urldate = {2023-07-07}, publisher = {arXiv}, author = {Kingma, Diederik P. and Salimans, Tim and Poole, Ben and Ho, Jonathan}, month = apr, year = {2023}, note = {arXiv:2107.00630 [cs, stat]}, keywords = {Computer Science - Machine Learning, Statistics - Machine Learning}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/YKZMA3CJ/Kingma et al. 
- 2023 - Variational Diffusion Models.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/QR227N9K/2107.html:text/html}, } @misc{dhariwal_diffusion_2021, title = {Diffusion {Models} {Beat} {GANs} on {Image} {Synthesis}}, url = {http://arxiv.org/abs/2105.05233}, doi = {10.48550/arXiv.2105.05233}, abstract = {We show that diffusion models can achieve image sample quality superior to the current state-of-the-art generative models. We achieve this on unconditional image synthesis by finding a better architecture through a series of ablations. For conditional image synthesis, we further improve sample quality with classifier guidance: a simple, compute-efficient method for trading off diversity for fidelity using gradients from a classifier. We achieve an FID of 2.97 on ImageNet 128$\times$128, 4.59 on ImageNet 256$\times$256, and 7.72 on ImageNet 512$\times$512, and we match BigGAN-deep even with as few as 25 forward passes per sample, all while maintaining better coverage of the distribution. Finally, we find that classifier guidance combines well with upsampling diffusion models, further improving FID to 3.94 on ImageNet 256$\times$256 and 3.85 on ImageNet 512$\times$512. We release our code at https://github.com/openai/guided-diffusion}, urldate = {2023-07-07}, publisher = {arXiv}, author = {Dhariwal, Prafulla and Nichol, Alex}, month = jun, year = {2021}, note = {arXiv:2105.05233 [cs, stat]}, keywords = {Computer Science - Computer Vision and Pattern Recognition, Computer Science - Machine Learning, Statistics - Machine Learning, Computer Science - Artificial Intelligence}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/PWT54DE7/Dhariwal and Nichol - 2021 - Diffusion Models Beat GANs on Image Synthesis.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/8J2YBIJV/2105.html:text/html}, } @misc{ho_classifier-free_2022, title = {Classifier-{Free} {Diffusion} {Guidance}}, url = {http://arxiv.org/abs/2207.12598}, doi = {10.48550/arXiv.2207.12598}, abstract = {Classifier guidance is a recently introduced method to trade off mode coverage and sample fidelity in conditional diffusion models post training, in the same spirit as low temperature sampling or truncation in other types of generative models. Classifier guidance combines the score estimate of a diffusion model with the gradient of an image classifier and thereby requires training an image classifier separate from the diffusion model. It also raises the question of whether guidance can be performed without a classifier.
We show that guidance can be indeed performed by a pure generative model without such a classifier: in what we call classifier-free guidance, we jointly train a conditional and an unconditional diffusion model, and we combine the resulting conditional and unconditional score estimates to attain a trade-off between sample quality and diversity similar to that obtained using classifier guidance.}, urldate = {2023-07-07}, publisher = {arXiv}, author = {Ho, Jonathan and Salimans, Tim}, month = jul, year = {2022}, note = {arXiv:2207.12598 [cs]}, keywords = {Computer Science - Machine Learning, Computer Science - Artificial Intelligence}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/TBVUH8VL/Ho and Salimans - 2022 - Classifier-Free Diffusion Guidance.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/ZLTAMX75/2207.html:text/html}, } @misc{song_score-based_2021, title = {Score-{Based} {Generative} {Modeling} through {Stochastic} {Differential} {Equations}}, url = {http://arxiv.org/abs/2011.13456}, doi = {10.48550/arXiv.2011.13456}, abstract = {Creating noise from data is easy; creating data from noise is generative modeling. We present a stochastic differential equation (SDE) that smoothly transforms a complex data distribution to a known prior distribution by slowly injecting noise, and a corresponding reverse-time SDE that transforms the prior distribution back into the data distribution by slowly removing the noise. Crucially, the reverse-time SDE depends only on the time-dependent gradient field (a.k.a., score) of the perturbed data distribution. By leveraging advances in score-based generative modeling, we can accurately estimate these scores with neural networks, and use numerical SDE solvers to generate samples. We show that this framework encapsulates previous approaches in score-based generative modeling and diffusion probabilistic modeling, allowing for new sampling procedures and new modeling capabilities. In particular, we introduce a predictor-corrector framework to correct errors in the evolution of the discretized reverse-time SDE. We also derive an equivalent neural ODE that samples from the same distribution as the SDE, but additionally enables exact likelihood computation, and improved sampling efficiency. In addition, we provide a new way to solve inverse problems with score-based models, as demonstrated with experiments on class-conditional generation, image inpainting, and colorization. Combined with multiple architectural improvements, we achieve record-breaking performance for unconditional image generation on CIFAR-10 with an Inception score of 9.89 and FID of 2.20, a competitive likelihood of 2.99 bits/dim, and demonstrate high fidelity generation of 1024 x 1024 images for the first time from a score-based generative model.}, urldate = {2023-07-07}, publisher = {arXiv}, author = {Song, Yang and Sohl-Dickstein, Jascha and Kingma, Diederik P. and Kumar, Abhishek and Ermon, Stefano and Poole, Ben}, month = feb, year = {2021}, note = {arXiv:2011.13456 [cs, stat]}, keywords = {Computer Science - Machine Learning, Statistics - Machine Learning}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/TBFPXY9C/Song et al.
- 2021 - Score-Based Generative Modeling through Stochastic.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/DCVZ9TA3/2011.html:text/html}, } @misc{nichol_glide_2022, title = {{GLIDE}: {Towards} {Photorealistic} {Image} {Generation} and {Editing} with {Text}-{Guided} {Diffusion} {Models}}, shorttitle = {{GLIDE}}, url = {http://arxiv.org/abs/2112.10741}, doi = {10.48550/arXiv.2112.10741}, abstract = {Diffusion models have recently been shown to generate high-quality synthetic images, especially when paired with a guidance technique to trade off diversity for fidelity. We explore diffusion models for the problem of text-conditional image synthesis and compare two different guidance strategies: CLIP guidance and classifier-free guidance. We find that the latter is preferred by human evaluators for both photorealism and caption similarity, and often produces photorealistic samples. Samples from a 3.5 billion parameter text-conditional diffusion model using classifier-free guidance are favored by human evaluators to those from DALL-E, even when the latter uses expensive CLIP reranking. Additionally, we find that our models can be fine-tuned to perform image inpainting, enabling powerful text-driven image editing. We train a smaller model on a filtered dataset and release the code and weights at https://github.com/openai/glide-text2im.}, urldate = {2023-07-07}, publisher = {arXiv}, author = {Nichol, Alex and Dhariwal, Prafulla and Ramesh, Aditya and Shyam, Pranav and Mishkin, Pamela and McGrew, Bob and Sutskever, Ilya and Chen, Mark}, month = mar, year = {2022}, note = {arXiv:2112.10741 [cs]}, keywords = {Computer Science - Computer Vision and Pattern Recognition, Computer Science - Machine Learning, Computer Science - Graphics}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/7A2X7CTA/Nichol et al. - 2022 - GLIDE Towards Photorealistic Image Generation and.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/9AD6NH2U/2112.html:text/html}, } @misc{jun_shap-e_2023, title = {Shap-{E}: {Generating} {Conditional} {3D} {Implicit} {Functions}}, shorttitle = {Shap-{E}}, url = {http://arxiv.org/abs/2305.02463}, doi = {10.48550/arXiv.2305.02463}, abstract = {We present Shap-E, a conditional generative model for 3D assets. Unlike recent work on 3D generative models which produce a single output representation, Shap-E directly generates the parameters of implicit functions that can be rendered as both textured meshes and neural radiance fields. We train Shap-E in two stages: first, we train an encoder that deterministically maps 3D assets into the parameters of an implicit function; second, we train a conditional diffusion model on outputs of the encoder. When trained on a large dataset of paired 3D and text data, our resulting models are capable of generating complex and diverse 3D assets in a matter of seconds. When compared to Point-E, an explicit generative model over point clouds, Shap-E converges faster and reaches comparable or better sample quality despite modeling a higher-dimensional, multi-representation output space. 
We release model weights, inference code, and samples at https://github.com/openai/shap-e.}, urldate = {2023-07-13}, publisher = {arXiv}, author = {Jun, Heewoo and Nichol, Alex}, month = may, year = {2023}, note = {arXiv:2305.02463 [cs]}, keywords = {Computer Science - Computer Vision and Pattern Recognition, Computer Science - Machine Learning}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/855RN3FY/Jun and Nichol - 2023 - Shap-E Generating Conditional 3D Implicit Functio.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/QTTJPPH2/2305.html:text/html}, } @misc{ho_denoising_2020, title = {Denoising {Diffusion} {Probabilistic} {Models}}, url = {http://arxiv.org/abs/2006.11239}, doi = {10.48550/arXiv.2006.11239}, abstract = {We present high quality image synthesis results using diffusion probabilistic models, a class of latent variable models inspired by considerations from nonequilibrium thermodynamics. Our best results are obtained by training on a weighted variational bound designed according to a novel connection between diffusion probabilistic models and denoising score matching with Langevin dynamics, and our models naturally admit a progressive lossy decompression scheme that can be interpreted as a generalization of autoregressive decoding. On the unconditional CIFAR10 dataset, we obtain an Inception score of 9.46 and a state-of-the-art FID score of 3.17. On 256x256 LSUN, we obtain sample quality similar to ProgressiveGAN. Our implementation is available at https://github.com/hojonathanho/diffusion}, urldate = {2023-07-17}, publisher = {arXiv}, author = {Ho, Jonathan and Jain, Ajay and Abbeel, Pieter}, month = dec, year = {2020}, note = {arXiv:2006.11239 [cs, stat]}, keywords = {Computer Science - Machine Learning, Statistics - Machine Learning}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/X4CDD6W7/Ho et al. - 2020 - Denoising Diffusion Probabilistic Models.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/Q2SYF624/2006.html:text/html}, } @misc{pan_drag_2023, title = {Drag {Your} {GAN}: {Interactive} {Point}-based {Manipulation} on the {Generative} {Image} {Manifold}}, shorttitle = {Drag {Your} {GAN}}, url = {http://arxiv.org/abs/2305.10973}, doi = {10.48550/arXiv.2305.10973}, abstract = {Synthesizing visual content that meets users' needs often requires flexible and precise controllability of the pose, shape, expression, and layout of the generated objects. Existing approaches gain controllability of generative adversarial networks (GANs) via manually annotated training data or a prior 3D model, which often lack flexibility, precision, and generality. In this work, we study a powerful yet much less explored way of controlling GANs, that is, to "drag" any points of the image to precisely reach target points in a user-interactive manner, as shown in Fig.1. To achieve this, we propose DragGAN, which consists of two main components: 1) a feature-based motion supervision that drives the handle point to move towards the target position, and 2) a new point tracking approach that leverages the discriminative generator features to keep localizing the position of the handle points. Through DragGAN, anyone can deform an image with precise control over where pixels go, thus manipulating the pose, shape, expression, and layout of diverse categories such as animals, cars, humans, landscapes, etc. 
As these manipulations are performed on the learned generative image manifold of a GAN, they tend to produce realistic outputs even for challenging scenarios such as hallucinating occluded content and deforming shapes that consistently follow the object's rigidity. Both qualitative and quantitative comparisons demonstrate the advantage of DragGAN over prior approaches in the tasks of image manipulation and point tracking. We also showcase the manipulation of real images through GAN inversion.}, urldate = {2023-07-17}, publisher = {arXiv}, author = {Pan, Xingang and Tewari, Ayush and Leimkühler, Thomas and Liu, Lingjie and Meka, Abhimitra and Theobalt, Christian}, month = may, year = {2023}, note = {arXiv:2305.10973 [cs]}, keywords = {Computer Science - Computer Vision and Pattern Recognition, Computer Science - Graphics}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/8ITIWG9Q/Pan et al. - 2023 - Drag Your GAN Interactive Point-based Manipulatio.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/4G4JFKJV/2305.html:text/html}, } @misc{satorras_en_2022, title = {E(n) {Equivariant} {Graph} {Neural} {Networks}}, url = {http://arxiv.org/abs/2102.09844}, doi = {10.48550/arXiv.2102.09844}, abstract = {This paper introduces a new model to learn graph neural networks equivariant to rotations, translations, reflections and permutations called E(n)-Equivariant Graph Neural Networks (EGNNs). In contrast with existing methods, our work does not require computationally expensive higher-order representations in intermediate layers while it still achieves competitive or better performance. In addition, whereas existing methods are limited to equivariance on 3 dimensional spaces, our model is easily scaled to higher-dimensional spaces. We demonstrate the effectiveness of our method on dynamical systems modelling, representation learning in graph autoencoders and predicting molecular properties.}, urldate = {2023-08-09}, publisher = {arXiv}, author = {Satorras, Victor Garcia and Hoogeboom, Emiel and Welling, Max}, month = feb, year = {2022}, note = {arXiv:2102.09844 [cs, stat]}, keywords = {Computer Science - Machine Learning, Statistics - Machine Learning}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/5KIWXDVC/Satorras et al. - 2022 - E(n) Equivariant Graph Neural Networks.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/9ZFR6NAN/2102.html:text/html}, } @misc{yu_pu-net_2018, title = {{PU}-{Net}: {Point} {Cloud} {Upsampling} {Network}}, shorttitle = {{PU}-{Net}}, url = {http://arxiv.org/abs/1801.06761}, doi = {10.48550/arXiv.1801.06761}, abstract = {Learning and analyzing 3D point clouds with deep networks is challenging due to the sparseness and irregularity of the data. In this paper, we present a data-driven point cloud upsampling technique. The key idea is to learn multi-level features per point and expand the point set via a multi-branch convolution unit implicitly in feature space. The expanded feature is then split to a multitude of features, which are then reconstructed to an upsampled point set. Our network is applied at a patch-level, with a joint loss function that encourages the upsampled points to remain on the underlying surface with a uniform distribution. We conduct various experiments using synthesis and scan data to evaluate our method and demonstrate its superiority over some baseline methods and an optimization-based method. 
Results show that our upsampled points have better uniformity and are located closer to the underlying surfaces.}, urldate = {2023-08-04}, publisher = {arXiv}, author = {Yu, Lequan and Li, Xianzhi and Fu, Chi-Wing and Cohen-Or, Daniel and Heng, Pheng-Ann}, month = mar, year = {2018}, note = {arXiv:1801.06761 [cs]}, keywords = {Computer Science - Computer Vision and Pattern Recognition, Computer Science - Graphics}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/HSLYQ9SV/Yu et al. - 2018 - PU-Net Point Cloud Upsampling Network.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/IN75N9XP/1801.html:text/html}, } @article{zhang_point_2022, title = {Point {Cloud} {Upsampling} {Algorithm}: {A} {Systematic} {Review}}, volume = {15}, copyright = {http://creativecommons.org/licenses/by/3.0/}, issn = {1999-4893}, shorttitle = {Point {Cloud} {Upsampling} {Algorithm}}, url = {https://www.mdpi.com/1999-4893/15/4/124}, doi = {10.3390/a15040124}, abstract = {Point cloud upsampling algorithms can improve the resolution of point clouds and generate dense and uniform point clouds, and are an important image processing technology. Significant progress has been made in point cloud upsampling research in recent years. This paper provides a comprehensive survey of point cloud upsampling algorithms. We classify existing point cloud upsampling algorithms into optimization-based methods and deep learning-based methods, and analyze the advantages and limitations of different algorithms from a modular perspective. In addition, we cover some other important issues such as public datasets and performance evaluation metrics. Finally, we conclude this survey by highlighting several future research directions and open issues that should be further addressed.}, language = {en}, number = {4}, urldate = {2023-08-04}, journal = {Algorithms}, author = {Zhang, Yan and Zhao, Wenhan and Sun, Bo and Zhang, Ying and Wen, Wen}, month = apr, year = {2022}, note = {Number: 4 Publisher: Multidisciplinary Digital Publishing Institute}, keywords = {deep learning, generative adversarial network (GAN), graph convolutional network (GCN), point cloud upsampling, unsupervised learning}, pages = {124}, file = {Full Text PDF:/home/laurent/Zotero/storage/PHKK549T/Zhang et al. - 2022 - Point Cloud Upsampling Algorithm A Systematic Rev.pdf:application/pdf}, } @misc{ma_rethinking_2022, title = {Rethinking {Network} {Design} and {Local} {Geometry} in {Point} {Cloud}: {A} {Simple} {Residual} {MLP} {Framework}}, shorttitle = {Rethinking {Network} {Design} and {Local} {Geometry} in {Point} {Cloud}}, url = {http://arxiv.org/abs/2202.07123}, doi = {10.48550/arXiv.2202.07123}, abstract = {Point cloud analysis is challenging due to irregularity and unordered data structure. To capture the 3D geometries, prior works mainly rely on exploring sophisticated local geometric extractors using convolution, graph, or attention mechanisms. These methods, however, incur unfavorable latency during inference, and the performance saturates over the past few years. In this paper, we present a novel perspective on this task. We notice that detailed local geometrical information probably is not the key to point cloud analysis -- we introduce a pure residual MLP network, called PointMLP, which integrates no sophisticated local geometrical extractors but still performs very competitively. Equipped with a proposed lightweight geometric affine module, PointMLP delivers the new state-of-the-art on multiple datasets. 
On the real-world ScanObjectNN dataset, our method even surpasses the prior best method by 3.3\% accuracy. We emphasize that PointMLP achieves this strong performance without any sophisticated operations, hence leading to a superior inference speed. Compared to most recent CurveNet, PointMLP trains 2x faster, tests 7x faster, and is more accurate on ModelNet40 benchmark. We hope our PointMLP may help the community towards a better understanding of point cloud analysis. The code is available at https://github.com/ma-xu/pointMLP-pytorch.}, urldate = {2023-08-03}, publisher = {arXiv}, author = {Ma, Xu and Qin, Can and You, Haoxuan and Ran, Haoxi and Fu, Yun}, month = nov, year = {2022}, note = {arXiv:2202.07123 [cs]}, keywords = {Computer Science - Computer Vision and Pattern Recognition, Computer Science - Artificial Intelligence}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/ZZGSSLLN/Ma et al. - 2022 - Rethinking Network Design and Local Geometry in Po.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/23QXY3QK/2202.html:text/html}, } @misc{oord_neural_2018, title = {Neural {Discrete} {Representation} {Learning}}, url = {http://arxiv.org/abs/1711.00937}, doi = {10.48550/arXiv.1711.00937}, abstract = {Learning useful representations without supervision remains a key challenge in machine learning. In this paper, we propose a simple yet powerful generative model that learns such discrete representations. Our model, the Vector Quantised-Variational AutoEncoder (VQ-VAE), differs from VAEs in two key ways: the encoder network outputs discrete, rather than continuous, codes; and the prior is learnt rather than static. In order to learn a discrete latent representation, we incorporate ideas from vector quantisation (VQ). Using the VQ method allows the model to circumvent issues of "posterior collapse" -- where the latents are ignored when they are paired with a powerful autoregressive decoder -- typically observed in the VAE framework. Pairing these representations with an autoregressive prior, the model can generate high quality images, videos, and speech as well as doing high quality speaker conversion and unsupervised learning of phonemes, providing further evidence of the utility of the learnt representations.}, urldate = {2023-07-19}, publisher = {arXiv}, author = {Oord, Aaron van den and Vinyals, Oriol and Kavukcuoglu, Koray}, month = may, year = {2018}, note = {arXiv:1711.00937 [cs]}, keywords = {Computer Science - Machine Learning}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/ZHFTV3KY/Oord et al. - 2018 - Neural Discrete Representation Learning.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/I24NXN5D/1711.html:text/html}, } @misc{esser_taming_2021, title = {Taming {Transformers} for {High}-{Resolution} {Image} {Synthesis}}, url = {http://arxiv.org/abs/2012.09841}, doi = {10.48550/arXiv.2012.09841}, abstract = {Designed to learn long-range interactions on sequential data, transformers continue to show state-of-the-art results on a wide variety of tasks. In contrast to CNNs, they contain no inductive bias that prioritizes local interactions. This makes them expressive, but also computationally infeasible for long sequences, such as high-resolution images. We demonstrate how combining the effectiveness of the inductive bias of CNNs with the expressivity of transformers enables them to model and thereby synthesize high-resolution images. 
We show how to (i) use CNNs to learn a context-rich vocabulary of image constituents, and in turn (ii) utilize transformers to efficiently model their composition within high-resolution images. Our approach is readily applied to conditional synthesis tasks, where both non-spatial information, such as object classes, and spatial information, such as segmentations, can control the generated image. In particular, we present the first results on semantically-guided synthesis of megapixel images with transformers and obtain the state of the art among autoregressive models on class-conditional ImageNet. Code and pretrained models can be found at https://github.com/CompVis/taming-transformers .}, urldate = {2023-07-19}, publisher = {arXiv}, author = {Esser, Patrick and Rombach, Robin and Ommer, Björn}, month = jun, year = {2021}, note = {arXiv:2012.09841 [cs]}, keywords = {Computer Science - Computer Vision and Pattern Recognition}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/P65UBNHY/Esser et al. - 2021 - Taming Transformers for High-Resolution Image Synt.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/N2ALIH7W/2012.html:text/html}, } @misc{isola_image--image_2018, title = {Image-to-{Image} {Translation} with {Conditional} {Adversarial} {Networks}}, url = {http://arxiv.org/abs/1611.07004}, doi = {10.48550/arXiv.1611.07004}, abstract = {We investigate conditional adversarial networks as a general-purpose solution to image-to-image translation problems. These networks not only learn the mapping from input image to output image, but also learn a loss function to train this mapping. This makes it possible to apply the same generic approach to problems that traditionally would require very different loss formulations. We demonstrate that this approach is effective at synthesizing photos from label maps, reconstructing objects from edge maps, and colorizing images, among other tasks. Indeed, since the release of the pix2pix software associated with this paper, a large number of internet users (many of them artists) have posted their own experiments with our system, further demonstrating its wide applicability and ease of adoption without the need for parameter tweaking. As a community, we no longer hand-engineer our mapping functions, and this work suggests we can achieve reasonable results without hand-engineering our loss functions either.}, urldate = {2023-07-19}, publisher = {arXiv}, author = {Isola, Phillip and Zhu, Jun-Yan and Zhou, Tinghui and Efros, Alexei A.}, month = nov, year = {2018}, note = {arXiv:1611.07004 [cs]}, keywords = {Computer Science - Computer Vision and Pattern Recognition}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/3JNXIC89/Isola et al. - 2018 - Image-to-Image Translation with Conditional Advers.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/9SW285N5/1611.html:text/html}, } @misc{zhang_unreasonable_2018, title = {The {Unreasonable} {Effectiveness} of {Deep} {Features} as a {Perceptual} {Metric}}, url = {http://arxiv.org/abs/1801.03924}, doi = {10.48550/arXiv.1801.03924}, abstract = {While it is nearly effortless for humans to quickly assess the perceptual similarity between two images, the underlying processes are thought to be quite complex. Despite this, the most widely used perceptual metrics today, such as PSNR and SSIM, are simple, shallow functions, and fail to account for many nuances of human perception. 
Recently, the deep learning community has found that features of the VGG network trained on ImageNet classification have been remarkably useful as a training loss for image synthesis. But how perceptual are these so-called "perceptual losses"? What elements are critical for their success? To answer these questions, we introduce a new dataset of human perceptual similarity judgments. We systematically evaluate deep features across different architectures and tasks and compare them with classic metrics. We find that deep features outperform all previous metrics by large margins on our dataset. More surprisingly, this result is not restricted to ImageNet-trained VGG features, but holds across different deep architectures and levels of supervision (supervised, self-supervised, or even unsupervised). Our results suggest that perceptual similarity is an emergent property shared across deep visual representations.}, urldate = {2023-07-19}, publisher = {arXiv}, author = {Zhang, Richard and Isola, Phillip and Efros, Alexei A. and Shechtman, Eli and Wang, Oliver}, month = apr, year = {2018}, note = {arXiv:1801.03924 [cs]}, keywords = {Computer Science - Computer Vision and Pattern Recognition, Computer Science - Graphics}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/37ALS5ZY/Zhang et al. - 2018 - The Unreasonable Effectiveness of Deep Features as.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/AYX4F5NM/1801.html:text/html}, } @misc{dettmers_qlora_2023, title = {{QLoRA}: {Efficient} {Finetuning} of {Quantized} {LLMs}}, shorttitle = {{QLoRA}}, url = {http://arxiv.org/abs/2305.14314}, doi = {10.48550/arXiv.2305.14314}, abstract = {We present QLoRA, an efficient finetuning approach that reduces memory usage enough to finetune a 65B parameter model on a single 48GB GPU while preserving full 16-bit finetuning task performance. QLoRA backpropagates gradients through a frozen, 4-bit quantized pretrained language model into Low Rank Adapters (LoRA). Our best model family, which we name Guanaco, outperforms all previous openly released models on the Vicuna benchmark, reaching 99.3\% of the performance level of ChatGPT while only requiring 24 hours of finetuning on a single GPU. QLoRA introduces a number of innovations to save memory without sacrificing performance: (a) 4-bit NormalFloat (NF4), a new data type that is information theoretically optimal for normally distributed weights, (b) double quantization to reduce the average memory footprint by quantizing the quantization constants, and (c) paged optimizers to manage memory spikes. We use QLoRA to finetune more than 1,000 models, providing a detailed analysis of instruction following and chatbot performance across 8 instruction datasets, multiple model types (LLaMA, T5), and model scales that would be infeasible to run with regular finetuning (e.g. 33B and 65B parameter models). Our results show that QLoRA finetuning on a small high-quality dataset leads to state-of-the-art results, even when using smaller models than the previous SoTA. We provide a detailed analysis of chatbot performance based on both human and GPT-4 evaluations showing that GPT-4 evaluations are a cheap and reasonable alternative to human evaluation. Furthermore, we find that current chatbot benchmarks are not trustworthy to accurately evaluate the performance levels of chatbots. A lemon-picked analysis demonstrates where Guanaco fails compared to ChatGPT. 
We release all of our models and code, including CUDA kernels for 4-bit training.}, urldate = {2023-07-19}, publisher = {arXiv}, author = {Dettmers, Tim and Pagnoni, Artidoro and Holtzman, Ari and Zettlemoyer, Luke}, month = may, year = {2023}, note = {arXiv:2305.14314 [cs]}, keywords = {Computer Science - Machine Learning}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/4KD7F73X/Dettmers et al. - 2023 - QLoRA Efficient Finetuning of Quantized LLMs.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/EI35JVE7/2305.html:text/html}, } @misc{hu_lora_2021, title = {{LoRA}: {Low}-{Rank} {Adaptation} of {Large} {Language} {Models}}, shorttitle = {{LoRA}}, url = {http://arxiv.org/abs/2106.09685}, doi = {10.48550/arXiv.2106.09685}, abstract = {An important paradigm of natural language processing consists of large-scale pre-training on general domain data and adaptation to particular tasks or domains. As we pre-train larger models, full fine-tuning, which retrains all model parameters, becomes less feasible. Using GPT-3 175B as an example -- deploying independent instances of fine-tuned models, each with 175B parameters, is prohibitively expensive. We propose Low-Rank Adaptation, or LoRA, which freezes the pre-trained model weights and injects trainable rank decomposition matrices into each layer of the Transformer architecture, greatly reducing the number of trainable parameters for downstream tasks. Compared to GPT-3 175B fine-tuned with Adam, LoRA can reduce the number of trainable parameters by 10,000 times and the GPU memory requirement by 3 times. LoRA performs on-par or better than fine-tuning in model quality on RoBERTa, DeBERTa, GPT-2, and GPT-3, despite having fewer trainable parameters, a higher training throughput, and, unlike adapters, no additional inference latency. We also provide an empirical investigation into rank-deficiency in language model adaptation, which sheds light on the efficacy of LoRA. We release a package that facilitates the integration of LoRA with PyTorch models and provide our implementations and model checkpoints for RoBERTa, DeBERTa, and GPT-2 at https://github.com/microsoft/LoRA.}, urldate = {2023-07-19}, publisher = {arXiv}, author = {Hu, Edward J. and Shen, Yelong and Wallis, Phillip and Allen-Zhu, Zeyuan and Li, Yuanzhi and Wang, Shean and Wang, Lu and Chen, Weizhu}, month = oct, year = {2021}, note = {arXiv:2106.09685 [cs]}, keywords = {Computer Science - Machine Learning, Computer Science - Artificial Intelligence, Computer Science - Computation and Language}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/ZHA2VLNH/Hu et al. - 2021 - LoRA Low-Rank Adaptation of Large Language Models.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/QD7PM945/2106.html:text/html}, } @misc{mou_dragondiffusion_2023, title = {{DragonDiffusion}: {Enabling} {Drag}-style {Manipulation} on {Diffusion} {Models}}, shorttitle = {{DragonDiffusion}}, url = {http://arxiv.org/abs/2307.02421}, doi = {10.48550/arXiv.2307.02421}, abstract = {Despite the ability of existing large-scale text-to-image (T2I) models to generate high-quality images from detailed textual descriptions, they often lack the ability to precisely edit the generated or real images. In this paper, we propose a novel image editing method, DragonDiffusion, enabling Drag-style manipulation on Diffusion models. Specifically, we construct classifier guidance based on the strong correspondence of intermediate features in the diffusion model. 
It can transform the editing signals into gradients via feature correspondence loss to modify the intermediate representation of the diffusion model. Based on this guidance strategy, we also build a multi-scale guidance to consider both semantic and geometric alignment. Moreover, a cross-branch self-attention is added to maintain the consistency between the original image and the editing result. Our method, through an efficient design, achieves various editing modes for the generated or real images, such as object moving, object resizing, object appearance replacement, and content dragging. It is worth noting that all editing and content preservation signals come from the image itself, and the model does not require fine-tuning or additional modules. Our source code will be available at https://github.com/MC-E/DragonDiffusion.}, urldate = {2023-07-17}, publisher = {arXiv}, author = {Mou, Chong and Wang, Xintao and Song, Jiechong and Shan, Ying and Zhang, Jian}, month = jul, year = {2023}, note = {arXiv:2307.02421 [cs]}, keywords = {Computer Science - Computer Vision and Pattern Recognition}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/XJS9MTT9/Mou et al. - 2023 - DragonDiffusion Enabling Drag-style Manipulation .pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/5MU7EYIM/2307.html:text/html}, } @misc{shi_dragdiffusion_2023, title = {{DragDiffusion}: {Harnessing} {Diffusion} {Models} for {Interactive} {Point}-based {Image} {Editing}}, shorttitle = {{DragDiffusion}}, url = {http://arxiv.org/abs/2306.14435}, doi = {10.48550/arXiv.2306.14435}, abstract = {Precise and controllable image editing is a challenging task that has attracted significant attention. Recently, DragGAN enables an interactive point-based image editing framework and achieves impressive editing results with pixel-level precision. However, since this method is based on generative adversarial networks (GAN), its generality is upper-bounded by the capacity of the pre-trained GAN models. In this work, we extend such an editing framework to diffusion models and propose DragDiffusion. By leveraging large-scale pretrained diffusion models, we greatly improve the applicability of interactive point-based editing in real world scenarios. While most existing diffusion-based image editing methods work on text embeddings, DragDiffusion optimizes the diffusion latent to achieve precise spatial control. Although diffusion models generate images in an iterative manner, we empirically show that optimizing diffusion latent at one single step suffices to generate coherent results, enabling DragDiffusion to complete high-quality editing efficiently. Extensive experiments across a wide range of challenging cases (e.g., multi-objects, diverse object categories, various styles, etc.) demonstrate the versatility and generality of DragDiffusion. Code: https://github.com/Yujun-Shi/DragDiffusion.}, urldate = {2023-07-17}, publisher = {arXiv}, author = {Shi, Yujun and Xue, Chuhui and Pan, Jiachun and Zhang, Wenqing and Tan, Vincent Y. F. and Bai, Song}, month = jul, year = {2023}, note = {arXiv:2306.14435 [cs]}, keywords = {Computer Science - Computer Vision and Pattern Recognition, Computer Science - Machine Learning}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/HMBGIU3W/Shi et al. 
- 2023 - DragDiffusion Harnessing Diffusion Models for Int.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/AGS4JE2X/2306.html:text/html}, } @misc{zhao_bias_2018, title = {Bias and {Generalization} in {Deep} {Generative} {Models}: {An} {Empirical} {Study}}, shorttitle = {Bias and {Generalization} in {Deep} {Generative} {Models}}, url = {http://arxiv.org/abs/1811.03259}, doi = {10.48550/arXiv.1811.03259}, abstract = {In high dimensional settings, density estimation algorithms rely crucially on their inductive bias. Despite recent empirical success, the inductive bias of deep generative models is not well understood. In this paper we propose a framework to systematically investigate bias and generalization in deep generative models of images. Inspired by experimental methods from cognitive psychology, we probe each learning algorithm with carefully designed training datasets to characterize when and how existing models generate novel attributes and their combinations. We identify similarities to human psychology and verify that these patterns are consistent across commonly used models and architectures.}, urldate = {2023-08-16}, publisher = {arXiv}, author = {Zhao, Shengjia and Ren, Hongyu and Yuan, Arianna and Song, Jiaming and Goodman, Noah and Ermon, Stefano}, month = nov, year = {2018}, note = {arXiv:1811.03259 [cs, stat]}, keywords = {Computer Science - Machine Learning, Statistics - Machine Learning}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/38D6FI5W/Zhao et al. - 2018 - Bias and Generalization in Deep Generative Models.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/5KNUNRMG/1811.html:text/html}, } @article{kobyzev_normalizing_2021, title = {Normalizing {Flows}: {An} {Introduction} and {Review} of {Current} {Methods}}, volume = {43}, issn = {0162-8828, 2160-9292, 1939-3539}, shorttitle = {Normalizing {Flows}}, url = {http://arxiv.org/abs/1908.09257}, doi = {10.1109/TPAMI.2020.2992934}, abstract = {Normalizing Flows are generative models which produce tractable distributions where both sampling and density evaluation can be efficient and exact. The goal of this survey article is to give a coherent and comprehensive review of the literature around the construction and use of Normalizing Flows for distribution learning. We aim to provide context and explanation of the models, review current state-of-the-art literature, and identify open questions and promising future directions.}, number = {11}, urldate = {2023-08-16}, journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence}, author = {Kobyzev, Ivan and Prince, Simon J. D. and Brubaker, Marcus A.}, month = nov, year = {2021}, note = {arXiv:1908.09257 [cs, stat]}, keywords = {Computer Science - Machine Learning, Statistics - Machine Learning}, pages = {3964--3979}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/6J6C8IB8/Kobyzev et al. - 2021 - Normalizing Flows An Introduction and Review of C.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/U8YTGRBX/1908.html:text/html}, } @misc{mildenhall_nerf_2020, title = {{NeRF}: {Representing} {Scenes} as {Neural} {Radiance} {Fields} for {View} {Synthesis}}, shorttitle = {{NeRF}}, url = {http://arxiv.org/abs/2003.08934}, doi = {10.48550/arXiv.2003.08934}, abstract = {We present a method that achieves state-of-the-art results for synthesizing novel views of complex scenes by optimizing an underlying continuous volumetric scene function using a sparse set of input views. 
Our algorithm represents a scene using a fully-connected (non-convolutional) deep network, whose input is a single continuous 5D coordinate (spatial location $(x,y,z)$ and viewing direction $(\theta, \phi)$) and whose output is the volume density and view-dependent emitted radiance at that spatial location. We synthesize views by querying 5D coordinates along camera rays and use classic volume rendering techniques to project the output colors and densities into an image. Because volume rendering is naturally differentiable, the only input required to optimize our representation is a set of images with known camera poses. We describe how to effectively optimize neural radiance fields to render photorealistic novel views of scenes with complicated geometry and appearance, and demonstrate results that outperform prior work on neural rendering and view synthesis. View synthesis results are best viewed as videos, so we urge readers to view our supplementary video for convincing comparisons.}, urldate = {2023-08-16}, publisher = {arXiv}, author = {Mildenhall, Ben and Srinivasan, Pratul P. and Tancik, Matthew and Barron, Jonathan T. and Ramamoorthi, Ravi and Ng, Ren}, month = aug, year = {2020}, note = {arXiv:2003.08934 [cs]}, keywords = {Computer Science - Computer Vision and Pattern Recognition, Computer Science - Graphics}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/8ANFVRTK/Mildenhall et al. - 2020 - NeRF Representing Scenes as Neural Radiance Field.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/RNKQHNTD/2003.html:text/html}, } @misc{andrade-loarca_poissonnet_2023, title = {{PoissonNet}: {Resolution}-{Agnostic} {3D} {Shape} {Reconstruction} using {Fourier} {Neural} {Operators}}, shorttitle = {{PoissonNet}}, url = {http://arxiv.org/abs/2308.01766}, doi = {10.48550/arXiv.2308.01766}, abstract = {We introduce PoissonNet, an architecture for shape reconstruction that addresses the challenge of recovering 3D shapes from points. Traditional deep neural networks face challenges with common 3D shape discretization techniques due to their computational complexity at higher resolutions. To overcome this, we leverage Fourier Neural Operators (FNOs) to solve the Poisson equation and reconstruct a mesh from oriented point cloud measurements. PoissonNet exhibits two main advantages. First, it enables efficient training on low-resolution data while achieving comparable performance at high-resolution evaluation, thanks to the resolution-agnostic nature of FNOs. This feature allows for one-shot super-resolution. Second, our method surpasses existing approaches in reconstruction quality while being differentiable. Overall, our proposed method not only improves upon the limitations of classical deep neural networks in shape reconstruction but also achieves superior results in terms of reconstruction quality, running time, and resolution flexibility. Furthermore, we demonstrate that the Poisson surface reconstruction problem is well-posed in the limit case by showing a universal approximation theorem for the solution operator of the Poisson equation with distributional data utilizing the Fourier Neural Operator, which provides a theoretical foundation for our numerical results. 
The code to reproduce the experiments is available at: https://github.com/arsenal9971/PoissonNet.}, urldate = {2023-08-21}, publisher = {arXiv}, author = {Andrade-Loarca, Hector and Hege, Julius and Bacho, Aras and Kutyniok, Gitta}, month = aug, year = {2023}, note = {arXiv:2308.01766 [cs, math]}, keywords = {Computer Science - Computer Vision and Pattern Recognition, Mathematics - Analysis of PDEs}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/WBRQ6JS8/Andrade-Loarca et al. - 2023 - PoissonNet Resolution-Agnostic 3D Shape Reconstru.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/XLVSY3VP/2308.html:text/html}, } @misc{wu_3d_2015, title = {{3D} {ShapeNets}: {A} {Deep} {Representation} for {Volumetric} {Shapes}}, shorttitle = {{3D} {ShapeNets}}, url = {http://arxiv.org/abs/1406.5670}, doi = {10.48550/arXiv.1406.5670}, abstract = {3D shape is a crucial but heavily underutilized cue in today's computer vision systems, mostly due to the lack of a good generic shape representation. With the recent availability of inexpensive 2.5D depth sensors (e.g. Microsoft Kinect), it is becoming increasingly important to have a powerful 3D shape representation in the loop. Apart from category recognition, recovering full 3D shapes from view-based 2.5D depth maps is also a critical part of visual understanding. To this end, we propose to represent a geometric 3D shape as a probability distribution of binary variables on a 3D voxel grid, using a Convolutional Deep Belief Network. Our model, 3D ShapeNets, learns the distribution of complex 3D shapes across different object categories and arbitrary poses from raw CAD data, and discovers hierarchical compositional part representations automatically. It naturally supports joint object recognition and shape completion from 2.5D depth maps, and it enables active object recognition through view planning. To train our 3D deep learning model, we construct ModelNet -- a large-scale 3D CAD model dataset. Extensive experiments show that our 3D deep representation enables significant performance improvement over the state of the art in a variety of tasks.}, urldate = {2023-08-21}, publisher = {arXiv}, author = {Wu, Zhirong and Song, Shuran and Khosla, Aditya and Yu, Fisher and Zhang, Linguang and Tang, Xiaoou and Xiao, Jianxiong}, month = apr, year = {2015}, note = {arXiv:1406.5670 [cs]}, keywords = {Computer Science - Computer Vision and Pattern Recognition}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/VWP2VLGH/Wu et al. - 2015 - 3D ShapeNets A Deep Representation for Volumetric.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/58SJMCSB/1406.html:text/html}, } @misc{liao_kitti-360_2022, title = {{KITTI}-360: {A} {Novel} {Dataset} and {Benchmarks} for {Urban} {Scene} {Understanding} in {2D} and {3D}}, shorttitle = {{KITTI}-360}, url = {http://arxiv.org/abs/2109.13410}, doi = {10.48550/arXiv.2109.13410}, abstract = {For the last few decades, several major subfields of artificial intelligence including computer vision, graphics, and robotics have progressed largely independently from each other. Recently, however, the community has realized that progress towards robust intelligent systems such as self-driving cars requires a concerted effort across the different fields. This motivated us to develop KITTI-360, the successor of the popular KITTI dataset. 
KITTI-360 is a suburban driving dataset which comprises richer input modalities, comprehensive semantic instance annotations and accurate localization to facilitate research at the intersection of vision, graphics and robotics. For efficient annotation, we created a tool to label 3D scenes with bounding primitives and developed a model that transfers this information into the 2D image domain, resulting in over 150k images and 1B 3D points with coherent semantic instance annotations across 2D and 3D. Moreover, we established benchmarks and baselines for several tasks relevant to mobile perception, encompassing problems from computer vision, graphics, and robotics on the same dataset, e.g., semantic scene understanding, novel view synthesis and semantic SLAM. KITTI-360 will enable progress at the intersection of these research areas and thus contribute towards solving one of today's grand challenges: the development of fully autonomous self-driving systems.}, urldate = {2023-08-21}, publisher = {arXiv}, author = {Liao, Yiyi and Xie, Jun and Geiger, Andreas}, month = jun, year = {2022}, note = {arXiv:2109.13410 [cs]}, keywords = {Computer Science - Computer Vision and Pattern Recognition}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/NV46RRNN/Liao et al. - 2022 - KITTI-360 A Novel Dataset and Benchmarks for Urba.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/GSUST75L/2109.html:text/html}, } @misc{deitke_objaverse-xl_2023, title = {Objaverse-{XL}: {A} {Universe} of {10M}+ {3D} {Objects}}, shorttitle = {Objaverse-{XL}}, url = {http://arxiv.org/abs/2307.05663}, doi = {10.48550/arXiv.2307.05663}, abstract = {Natural language processing and 2D vision models have attained remarkable proficiency on many tasks primarily by escalating the scale of training data. However, 3D vision tasks have not seen the same progress, in part due to the challenges of acquiring high-quality 3D data. In this work, we present Objaverse-XL, a dataset of over 10 million 3D objects. Our dataset comprises deduplicated 3D objects from a diverse set of sources, including manually designed objects, photogrammetry scans of landmarks and everyday items, and professional scans of historic and antique artifacts. Representing the largest scale and diversity in the realm of 3D datasets, Objaverse-XL enables significant new possibilities for 3D vision. Our experiments demonstrate the improvements enabled with the scale provided by Objaverse-XL. We show that by training Zero123 on novel view synthesis, utilizing over 100 million multi-view rendered images, we achieve strong zero-shot generalization abilities. We hope that releasing Objaverse-XL will enable further innovations in the field of 3D vision at scale.}, urldate = {2023-08-21}, publisher = {arXiv}, author = {Deitke, Matt and Liu, Ruoshi and Wallingford, Matthew and Ngo, Huong and Michel, Oscar and Kusupati, Aditya and Fan, Alan and Laforte, Christian and Voleti, Vikram and Gadre, Samir Yitzhak and VanderBilt, Eli and Kembhavi, Aniruddha and Vondrick, Carl and Gkioxari, Georgia and Ehsani, Kiana and Schmidt, Ludwig and Farhadi, Ali}, month = jul, year = {2023}, note = {arXiv:2307.05663 [cs]}, keywords = {Computer Science - Computer Vision and Pattern Recognition, Computer Science - Artificial Intelligence}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/3CPCAZDV/Deitke et al. 
- 2023 - Objaverse-XL A Universe of 10M+ 3D Objects.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/JTXUJPK5/2307.html:text/html}, } @misc{deitke_objaverse_2022, title = {Objaverse: {A} {Universe} of {Annotated} {3D} {Objects}}, shorttitle = {Objaverse}, url = {http://arxiv.org/abs/2212.08051}, doi = {10.48550/arXiv.2212.08051}, abstract = {Massive data corpora like WebText, Wikipedia, Conceptual Captions, WebImageText, and LAION have propelled recent dramatic progress in AI. Large neural models trained on such datasets produce impressive results and top many of today's benchmarks. A notable omission within this family of large-scale datasets is 3D data. Despite considerable interest and potential applications in 3D vision, datasets of high-fidelity 3D models continue to be mid-sized with limited diversity of object categories. Addressing this gap, we present Objaverse 1.0, a large dataset of objects with 800K+ (and growing) 3D models with descriptive captions, tags, and animations. Objaverse improves upon present-day 3D repositories in terms of scale, number of categories, and in the visual diversity of instances within a category. We demonstrate the large potential of Objaverse via four diverse applications: training generative 3D models, improving tail category segmentation on the LVIS benchmark, training open-vocabulary object-navigation models for Embodied AI, and creating a new benchmark for robustness analysis of vision models. Objaverse can open new directions for research and enable new applications across the field of AI.}, urldate = {2023-08-21}, publisher = {arXiv}, author = {Deitke, Matt and Schwenk, Dustin and Salvador, Jordi and Weihs, Luca and Michel, Oscar and VanderBilt, Eli and Schmidt, Ludwig and Ehsani, Kiana and Kembhavi, Aniruddha and Farhadi, Ali}, month = dec, year = {2022}, note = {arXiv:2212.08051 [cs]}, keywords = {Computer Science - Computer Vision and Pattern Recognition, Computer Science - Graphics, Computer Science - Artificial Intelligence, Computer Science - Robotics}, file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/3C3TW2HB/Deitke et al. - 2022 - Objaverse A Universe of Annotated 3D Objects.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/Q9MANWHX/2212.html:text/html}, } @incollection{mouriaux_nasa_2021, address = {Cham}, series = {Notes on {Numerical} {Fluid} {Mechanics} and {Multidisciplinary} {Design}}, title = {{NASA} {Rotor} 37}, isbn = {978-3-030-62048-6}, url = {https://doi.org/10.1007/978-3-030-62048-6_20}, abstract = {The NASA Rotor 37 is an isolated transonic axial compressor rotor. This case was initially included in a wider research program to cover a range of design parameters typical of the high-pressure compressor inlet stage of aircraft engines. Most numerical studies fail to accurately predict the overall performance, e.g., the adiabatic efficiency and the loss distribution downstream of the blade. This case indeed presents several phenomena that are challenging to capture: laminar-to-turbulent transition on the blade, interaction of the boundary layer with the shock, secondary and tip-leakage flows. While LES appears to be a more adequate tool than RANS for predicting such inherently unsteady phenomena, it remains delicate, especially because wall modeling is required. 
This section presents WMLES results obtained by Safran and UniBG using the Discontinuous Galerkin approach.}, language = {en}, urldate = {2023-08-24}, booktitle = {{TILDA}: {Towards} {Industrial} {LES}/{DNS} in {Aeronautics}: {Paving} the {Way} for {Future} {Accurate} {CFD} - {Results} of the {H2020} {Research} {Project} {TILDA}, {Funded} by the {European} {Union}, 2015--2018}, publisher = {Springer International Publishing}, author = {Mouriaux, S. and Bassi, F. and Colombo, A. and Ghidoni, A.}, editor = {Hirsch, Charles and Hillewaert, Koen and Hartmann, Ralf and Couaillier, Vincent and Boussuge, Jean-Francois and Chalot, Frederic and Bosniakov, Sergey and Haase, Werner}, year = {2021}, doi = {10.1007/978-3-030-62048-6_20}, pages = {533--544}, }