projet-fin-etude-rapport/src/zotero.bib


% Goodfellow et al. 2014 - Generative Adversarial Networks (arXiv preprint 1406.2661).
@misc{goodfellow_generative_2014,
	author = {Goodfellow, Ian J. and Pouget-Abadie, Jean and Mirza, Mehdi and Xu, Bing and Warde-Farley, David and Ozair, Sherjil and Courville, Aaron and Bengio, Yoshua},
	title = {Generative {Adversarial} {Networks}},
	year = {2014},
	month = jun,
	publisher = {arXiv},
	doi = {10.48550/arXiv.1406.2661},
	url = {http://arxiv.org/abs/1406.2661},
	urldate = {2023-01-29},
	note = {arXiv:1406.2661 [cs, stat]},
	keywords = {Computer Science - Machine Learning, Statistics - Machine Learning},
	abstract = {We propose a new framework for estimating generative models via an adversarial process, in which we simultaneously train two models: a generative model G that captures the data distribution, and a discriminative model D that estimates the probability that a sample came from the training data rather than G. The training procedure for G is to maximize the probability of D making a mistake. This framework corresponds to a minimax two-player game. In the space of arbitrary functions G and D, a unique solution exists, with G recovering the training data distribution and D equal to 1/2 everywhere. In the case where G and D are defined by multilayer perceptrons, the entire system can be trained with backpropagation. There is no need for any Markov chains or unrolled approximate inference networks during either training or generation of samples. Experiments demonstrate the potential of the framework through qualitative and quantitative evaluation of the generated samples.},
	file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/5STMX2XJ/Goodfellow et al. - 2014 - Generative Adversarial Networks.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/MYEGE7IK/1406.html:text/html},
}

% Salimans et al. 2016 - Improved Techniques for Training GANs (arXiv preprint 1606.03498).
@misc{salimans_improved_2016,
	author = {Salimans, Tim and Goodfellow, Ian and Zaremba, Wojciech and Cheung, Vicki and Radford, Alec and Chen, Xi},
	title = {Improved {Techniques} for {Training} {GANs}},
	year = {2016},
	month = jun,
	publisher = {arXiv},
	doi = {10.48550/arXiv.1606.03498},
	url = {http://arxiv.org/abs/1606.03498},
	urldate = {2023-01-29},
	note = {arXiv:1606.03498 [cs]},
	keywords = {Computer Science - Computer Vision and Pattern Recognition, Computer Science - Machine Learning, Computer Science - Neural and Evolutionary Computing},
	abstract = {We present a variety of new architectural features and training procedures that we apply to the generative adversarial networks (GANs) framework. We focus on two applications of GANs: semi-supervised learning, and the generation of images that humans find visually realistic. Unlike most work on generative models, our primary goal is not to train a model that assigns high likelihood to test data, nor do we require the model to be able to learn well without using any labels. Using our new techniques, we achieve state-of-the-art results in semi-supervised classification on MNIST, CIFAR-10 and SVHN. The generated images are of high quality as confirmed by a visual Turing test: our model generates MNIST samples that humans cannot distinguish from real data, and CIFAR-10 samples that yield a human error rate of 21.3\%. We also present ImageNet samples with unprecedented resolution and show that our methods enable the model to learn recognizable features of ImageNet classes.},
	file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/PEYM38ZW/Salimans et al. - 2016 - Improved Techniques for Training GANs.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/5XMXB7WV/1606.html:text/html},
}

% Arjovsky and Bottou 2017 - theory of GAN training dynamics (arXiv preprint 1701.04862).
@misc{arjovsky_towards_2017,
	author = {Arjovsky, Martin and Bottou, Léon},
	title = {Towards {Principled} {Methods} for {Training} {Generative} {Adversarial} {Networks}},
	year = {2017},
	month = jan,
	publisher = {arXiv},
	doi = {10.48550/arXiv.1701.04862},
	url = {http://arxiv.org/abs/1701.04862},
	urldate = {2023-01-29},
	note = {arXiv:1701.04862 [cs, stat]},
	keywords = {Computer Science - Machine Learning, Statistics - Machine Learning},
	abstract = {The goal of this paper is not to introduce a single algorithm or method, but to make theoretical steps towards fully understanding the training dynamics of generative adversarial networks. In order to substantiate our theoretical analysis, we perform targeted experiments to verify our assumptions, illustrate our claims, and quantify the phenomena. This paper is divided into three sections. The first section introduces the problem at hand. The second section is dedicated to studying and proving rigorously the problems including instability and saturation that arize when training generative adversarial networks. The third section examines a practical and theoretically grounded direction towards solving these problems, while introducing new tools to study them.},
	file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/AEE2NPN4/Arjovsky and Bottou - 2017 - Towards Principled Methods for Training Generative.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/QEE7N7KP/1701.html:text/html},
}

% Arjovsky, Chintala and Bottou 2017 - Wasserstein GAN (arXiv preprint 1701.07875).
@misc{arjovsky_wasserstein_2017,
	author = {Arjovsky, Martin and Chintala, Soumith and Bottou, Léon},
	title = {Wasserstein {GAN}},
	year = {2017},
	month = dec,
	publisher = {arXiv},
	doi = {10.48550/arXiv.1701.07875},
	url = {http://arxiv.org/abs/1701.07875},
	urldate = {2023-01-29},
	note = {arXiv:1701.07875 [cs, stat]},
	keywords = {Computer Science - Machine Learning, Statistics - Machine Learning},
	abstract = {We introduce a new algorithm named WGAN, an alternative to traditional GAN training. In this new model, we show that we can improve the stability of learning, get rid of problems like mode collapse, and provide meaningful learning curves useful for debugging and hyperparameter searches. Furthermore, we show that the corresponding optimization problem is sound, and provide extensive theoretical work highlighting the deep connections to other distances between distributions.},
	file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/KW83LJBX/Arjovsky et al. - 2017 - Wasserstein GAN.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/YA9DTUVB/1701.html:text/html},
}

% Song and Ermon - score-based generative modeling (arXiv preprint 1907.05600).
% Date is the first arXiv submission (identifier 1907 = July 2019); the export
% had stored a later revision date (Oct 2020). The citation key still ends in
% 2020 so that existing cite commands keep resolving.
@misc{song_generative_2020,
	author = {Song, Yang and Ermon, Stefano},
	title = {Generative {Modeling} by {Estimating} {Gradients} of the {Data} {Distribution}},
	year = {2019},
	month = jul,
	publisher = {arXiv},
	doi = {10.48550/arXiv.1907.05600},
	url = {http://arxiv.org/abs/1907.05600},
	urldate = {2023-01-29},
	note = {arXiv:1907.05600 [cs, stat]},
	keywords = {Computer Science - Machine Learning, Statistics - Machine Learning},
	abstract = {We introduce a new generative model where samples are produced via Langevin dynamics using gradients of the data distribution estimated with score matching. Because gradients can be ill-defined and hard to estimate when the data resides on low-dimensional manifolds, we perturb the data with different levels of Gaussian noise, and jointly estimate the corresponding scores, i.e., the vector fields of gradients of the perturbed data distribution for all noise levels. For sampling, we propose an annealed Langevin dynamics where we use gradients corresponding to gradually decreasing noise levels as the sampling process gets closer to the data manifold. Our framework allows flexible model architectures, requires no sampling during training or the use of adversarial methods, and provides a learning objective that can be used for principled model comparisons. Our models produce samples comparable to GANs on MNIST, CelebA and CIFAR-10 datasets, achieving a new state-of-the-art inception score of 8.87 on CIFAR-10. Additionally, we demonstrate that our models learn effective representations via image inpainting experiments.},
	file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/NDB8ZJRC/Song and Ermon - 2020 - Generative Modeling by Estimating Gradients of the.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/KG2SAQFI/1907.html:text/html},
}

% Yacoby, Pan and Doshi-Velez 2021 - failure modes of VAEs (OpenReview forum 5Spjp0zDYt).
% Typed as misc because no journal is recorded: an article entry without a
% journal field triggers a missing-field warning. NOTE(review): if the accepted
% venue is known, switch to inproceedings/article and add it.
@misc{yacoby_failure_2021,
	author = {Yacoby, Yaniv and Pan, Weiwei and Doshi-Velez, Finale},
	title = {Failure {Modes} of {Variational} {Autoencoders} and {Their} {Effects} on {Downstream} {Tasks}},
	year = {2021},
	month = mar,
	howpublished = {OpenReview},
	url = {https://openreview.net/forum?id=5Spjp0zDYt},
	urldate = {2023-01-29},
	language = {en},
	abstract = {Variational Auto-encoders (VAEs) are deep generative latent variable models that are widely used for a number of downstream tasks. While it has been demonstrated that VAE training can suffer from a number of pathologies, existing literature lacks characterizations of exactly when these pathologies occur and how they impact down-stream task performance. In this paper we concretely characterize conditions under which VAE training exhibits pathologies and connect these failure modes to undesirable effects on specific downstream tasks, such as learning compressed and disentangled representations, adversarial robustness and semi-supervised learning.},
	file = {Full Text PDF:/home/laurent/Zotero/storage/J37MD8SR/Yacoby et al. - 2021 - Failure Modes of Variational Autoencoders and Thei.pdf:application/pdf},
}

% Higgins et al. - beta-VAE (OpenReview forum Sy2fzU9gl).
% Added the booktitle that inproceedings requires and replaced the exported
% 2022 date (an OpenReview page modification date) with the conference date.
% NOTE(review): venue taken to be ICLR 2017 (April 2017) - confirm against the
% forum page. The citation key still ends in 2022 so existing cites resolve.
@inproceedings{higgins_beta-vae_2022,
	author = {Higgins, Irina and Matthey, Loic and Pal, Arka and Burgess, Christopher and Glorot, Xavier and Botvinick, Matthew and Mohamed, Shakir and Lerchner, Alexander},
	title = {beta-{VAE}: {Learning} {Basic} {Visual} {Concepts} with a {Constrained} {Variational} {Framework}},
	shorttitle = {beta-{VAE}},
	booktitle = {International {Conference} on {Learning} {Representations}},
	year = {2017},
	month = apr,
	url = {https://openreview.net/forum?id=Sy2fzU9gl},
	urldate = {2023-01-29},
	language = {en},
	abstract = {Learning an interpretable factorised representation of the independent data generative factors of the world without supervision is an important precursor for the development of artificial intelligence that is able to learn and reason in the same way that humans do. We introduce beta-VAE, a new state-of-the-art framework for automated discovery of interpretable factorised latent representations from raw image data in a completely unsupervised manner. Our approach is a modification of the variational autoencoder (VAE) framework. We introduce an adjustable hyperparameter beta that balances latent channel capacity and independence constraints with reconstruction accuracy. We demonstrate that beta-VAE with appropriately tuned beta {\textgreater} 1 qualitatively outperforms VAE (beta = 1), as well as state of the art unsupervised (InfoGAN) and semi-supervised (DC-IGN) approaches to disentangled factor learning on a variety of datasets (celebA, faces and chairs). Furthermore, we devise a protocol to quantitatively compare the degree of disentanglement learnt by different models, and show that our approach also significantly outperforms all baselines quantitatively. Unlike InfoGAN, beta-VAE is stable to train, makes few assumptions about the data and relies on tuning a single hyperparameter, which can be directly optimised through a hyper parameter search using weakly labelled data or through heuristic visual inspection for purely unsupervised data.},
	file = {Full Text PDF:/home/laurent/Zotero/storage/FD5Q6H4B/Higgins et al. - 2022 - beta-VAE Learning Basic Visual Concepts with a Co.pdf:application/pdf},
}

% Kingma and Welling - Auto-Encoding Variational Bayes (arXiv preprint 1312.6114).
% Year is the first arXiv submission (identifier 1312 = December 2013); the
% export had stored the year of the latest revision (2022). The citation key
% still ends in 2022 so that existing cite commands keep resolving.
@misc{kingma_auto-encoding_2022,
	author = {Kingma, Diederik P. and Welling, Max},
	title = {Auto-{Encoding} {Variational} {Bayes}},
	year = {2013},
	month = dec,
	publisher = {arXiv},
	doi = {10.48550/arXiv.1312.6114},
	url = {http://arxiv.org/abs/1312.6114},
	urldate = {2023-01-29},
	note = {arXiv:1312.6114 [cs, stat]},
	keywords = {Computer Science - Machine Learning, Statistics - Machine Learning},
	abstract = {How can we perform efficient inference and learning in directed probabilistic models, in the presence of continuous latent variables with intractable posterior distributions, and large datasets? We introduce a stochastic variational inference and learning algorithm that scales to large datasets and, under some mild differentiability conditions, even works in the intractable case. Our contributions are two-fold. First, we show that a reparameterization of the variational lower bound yields a lower bound estimator that can be straightforwardly optimized using standard stochastic gradient methods. Second, we show that for i.i.d. datasets with continuous latent variables per datapoint, posterior inference can be made especially efficient by fitting an approximate inference model (also called a recognition model) to the intractable posterior using the proposed lower bound estimator. Theoretical advantages are reflected in experimental results.},
	file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/8MXMAC2E/Kingma and Welling - 2022 - Auto-Encoding Variational Bayes.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/TDNMVVSS/1312.html:text/html},
}

% Zeng et al. 2022 - LION, latent point diffusion for 3D shapes (arXiv preprint 2210.06978).
@misc{zeng_lion_2022,
	author = {Zeng, Xiaohui and Vahdat, Arash and Williams, Francis and Gojcic, Zan and Litany, Or and Fidler, Sanja and Kreis, Karsten},
	title = {{LION}: {Latent} {Point} {Diffusion} {Models} for {3D} {Shape} {Generation}},
	shorttitle = {{LION}},
	year = {2022},
	month = oct,
	publisher = {arXiv},
	doi = {10.48550/arXiv.2210.06978},
	url = {http://arxiv.org/abs/2210.06978},
	urldate = {2023-01-29},
	note = {arXiv:2210.06978 [cs, stat]},
	keywords = {Computer Science - Computer Vision and Pattern Recognition, Computer Science - Machine Learning, Statistics - Machine Learning},
	abstract = {Denoising diffusion models (DDMs) have shown promising results in 3D point cloud synthesis. To advance 3D DDMs and make them useful for digital artists, we require (i) high generation quality, (ii) flexibility for manipulation and applications such as conditional synthesis and shape interpolation, and (iii) the ability to output smooth surfaces or meshes. To this end, we introduce the hierarchical Latent Point Diffusion Model (LION) for 3D shape generation. LION is set up as a variational autoencoder (VAE) with a hierarchical latent space that combines a global shape latent representation with a point-structured latent space. For generation, we train two hierarchical DDMs in these latent spaces. The hierarchical VAE approach boosts performance compared to DDMs that operate on point clouds directly, while the point-structured latents are still ideally suited for DDM-based modeling. Experimentally, LION achieves state-of-the-art generation performance on multiple ShapeNet benchmarks. Furthermore, our VAE framework allows us to easily use LION for different relevant tasks: LION excels at multimodal shape denoising and voxel-conditioned synthesis, and it can be adapted for text- and image-driven 3D generation. We also demonstrate shape autoencoding and latent shape interpolation, and we augment LION with modern surface reconstruction techniques to generate smooth 3D meshes. We hope that LION provides a powerful tool for artists working with 3D shapes due to its high-quality generation, flexibility, and surface reconstruction. Project page and code: https://nv-tlabs.github.io/LION.},
	file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/FACF8TI9/Zeng et al. - 2022 - LION Latent Point Diffusion Models for 3D Shape G.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/X57XTJQR/2210.html:text/html},
}

% Nichol et al. 2022 - Point-E, 3D point clouds from text prompts (arXiv preprint 2212.08751).
@misc{nichol_point-e_2022,
	author = {Nichol, Alex and Jun, Heewoo and Dhariwal, Prafulla and Mishkin, Pamela and Chen, Mark},
	title = {Point-{E}: {A} {System} for {Generating} {3D} {Point} {Clouds} from {Complex} {Prompts}},
	shorttitle = {Point-{E}},
	year = {2022},
	month = dec,
	publisher = {arXiv},
	doi = {10.48550/arXiv.2212.08751},
	url = {http://arxiv.org/abs/2212.08751},
	urldate = {2023-01-29},
	note = {arXiv:2212.08751 [cs]},
	keywords = {Computer Science - Computer Vision and Pattern Recognition, Computer Science - Machine Learning},
	abstract = {While recent work on text-conditional 3D object generation has shown promising results, the state-of-the-art methods typically require multiple GPU-hours to produce a single sample. This is in stark contrast to state-of-the-art generative image models, which produce samples in a number of seconds or minutes. In this paper, we explore an alternative method for 3D object generation which produces 3D models in only 1-2 minutes on a single GPU. Our method first generates a single synthetic view using a text-to-image diffusion model, and then produces a 3D point cloud using a second diffusion model which conditions on the generated image. While our method still falls short of the state-of-the-art in terms of sample quality, it is one to two orders of magnitude faster to sample from, offering a practical trade-off for some use cases. We release our pre-trained point cloud diffusion models, as well as evaluation code and models, at https://github.com/openai/point-e.},
	file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/8IW28GBH/Nichol et al. - 2022 - Point-E A System for Generating 3D Point Clouds f.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/LMQF9Q55/2212.html:text/html},
}

% Kim et al. 2021 - SetVAE, hierarchical VAE for set-structured data (arXiv preprint 2103.15619).
@misc{kim_setvae_2021,
	author = {Kim, Jinwoo and Yoo, Jaehoon and Lee, Juho and Hong, Seunghoon},
	title = {{SetVAE}: {Learning} {Hierarchical} {Composition} for {Generative} {Modeling} of {Set}-{Structured} {Data}},
	shorttitle = {{SetVAE}},
	year = {2021},
	month = mar,
	publisher = {arXiv},
	doi = {10.48550/arXiv.2103.15619},
	url = {http://arxiv.org/abs/2103.15619},
	urldate = {2023-03-31},
	note = {arXiv:2103.15619 [cs]},
	keywords = {Computer Science - Computer Vision and Pattern Recognition, Computer Science - Machine Learning},
	abstract = {Generative modeling of set-structured data, such as point clouds, requires reasoning over local and global structures at various scales. However, adopting multi-scale frameworks for ordinary sequential data to a set-structured data is nontrivial as it should be invariant to the permutation of its elements. In this paper, we propose SetVAE, a hierarchical variational autoencoder for sets. Motivated by recent progress in set encoding, we build SetVAE upon attentive modules that first partition the set and project the partition back to the original cardinality. Exploiting this module, our hierarchical VAE learns latent variables at multiple scales, capturing coarse-to-fine dependency of the set elements while achieving permutation invariance. We evaluate our model on point cloud generation task and achieve competitive performance to the prior arts with substantially smaller model capacity. We qualitatively demonstrate that our model generalizes to unseen set sizes and learns interesting subset relations without supervision. Our implementation is available at https://github.com/jw9730/setvae.},
	file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/WUTNGI56/Kim et al. - 2021 - SetVAE Learning Hierarchical Composition for Gene.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/25K7W3C4/2103.html:text/html},
}

% Takikawa et al. 2021 - neural SDFs with levels of detail (arXiv preprint 2101.10994).
@misc{takikawa_neural_2021,
	author = {Takikawa, Towaki and Litalien, Joey and Yin, Kangxue and Kreis, Karsten and Loop, Charles and Nowrouzezahrai, Derek and Jacobson, Alec and McGuire, Morgan and Fidler, Sanja},
	title = {Neural {Geometric} {Level} of {Detail}: {Real}-time {Rendering} with {Implicit} {3D} {Shapes}},
	shorttitle = {Neural {Geometric} {Level} of {Detail}},
	year = {2021},
	month = jan,
	publisher = {arXiv},
	doi = {10.48550/arXiv.2101.10994},
	url = {http://arxiv.org/abs/2101.10994},
	urldate = {2023-03-28},
	note = {arXiv:2101.10994 [cs]},
	keywords = {Computer Science - Computer Vision and Pattern Recognition, Computer Science - Graphics},
	abstract = {Neural signed distance functions (SDFs) are emerging as an effective representation for 3D shapes. State-of-the-art methods typically encode the SDF with a large, fixed-size neural network to approximate complex shapes with implicit surfaces. Rendering with these large networks is, however, computationally expensive since it requires many forward passes through the network for every pixel, making these representations impractical for real-time graphics. We introduce an efficient neural representation that, for the first time, enables real-time rendering of high-fidelity neural SDFs, while achieving state-of-the-art geometry reconstruction quality. We represent implicit surfaces using an octree-based feature volume which adaptively fits shapes with multiple discrete levels of detail (LODs), and enables continuous LOD with SDF interpolation. We further develop an efficient algorithm to directly render our novel neural SDF representation in real-time by querying only the necessary LODs with sparse octree traversal. We show that our representation is 2-3 orders of magnitude more efficient in terms of rendering speed compared to previous works. Furthermore, it produces state-of-the-art reconstruction quality for complex shapes under both 3D geometric and 2D image-space metrics.},
	file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/KJMJR4CB/Takikawa et al. - 2021 - Neural Geometric Level of Detail Real-time Render.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/ST444B55/2101.html:text/html},
}

% Nash et al. 2020 - PolyGen, autoregressive model of 3D meshes (arXiv preprint 2002.10880).
@misc{nash_polygen_2020,
	author = {Nash, Charlie and Ganin, Yaroslav and Eslami, S. M. Ali and Battaglia, Peter W.},
	title = {{PolyGen}: {An} {Autoregressive} {Generative} {Model} of {3D} {Meshes}},
	shorttitle = {{PolyGen}},
	year = {2020},
	month = feb,
	publisher = {arXiv},
	doi = {10.48550/arXiv.2002.10880},
	url = {http://arxiv.org/abs/2002.10880},
	urldate = {2023-03-28},
	note = {arXiv:2002.10880 [cs, stat]},
	keywords = {Computer Science - Computer Vision and Pattern Recognition, Computer Science - Machine Learning, Statistics - Machine Learning, Computer Science - Graphics},
	abstract = {Polygon meshes are an efficient representation of 3D geometry, and are of central importance in computer graphics, robotics and games development. Existing learning-based approaches have avoided the challenges of working with 3D meshes, instead using alternative object representations that are more compatible with neural architectures and training approaches. We present an approach which models the mesh directly, predicting mesh vertices and faces sequentially using a Transformer-based architecture. Our model can condition on a range of inputs, including object classes, voxels, and images, and because the model is probabilistic it can produce samples that capture uncertainty in ambiguous scenarios. We show that the model is capable of producing high-quality, usable meshes, and establish log-likelihood benchmarks for the mesh-modelling task. We also evaluate the conditional models on surface reconstruction metrics against alternative methods, and demonstrate competitive performance despite not training directly on this task.},
	file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/JE5MEK9K/Nash et al. - 2020 - PolyGen An Autoregressive Generative Model of 3D .pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/7Y3HEDRQ/2002.html:text/html},
}

% Zhang et al. 2023 - 3DShape2VecSet shape representation (arXiv preprint 2301.11445).
@misc{zhang_3dshape2vecset_2023,
	author = {Zhang, Biao and Tang, Jiapeng and Niessner, Matthias and Wonka, Peter},
	title = {{3DShape2VecSet}: {A} {3D} {Shape} {Representation} for {Neural} {Fields} and {Generative} {Diffusion} {Models}},
	shorttitle = {{3DShape2VecSet}},
	year = {2023},
	month = feb,
	publisher = {arXiv},
	doi = {10.48550/arXiv.2301.11445},
	url = {http://arxiv.org/abs/2301.11445},
	urldate = {2023-03-28},
	note = {arXiv:2301.11445 [cs]},
	keywords = {Computer Science - Computer Vision and Pattern Recognition, Computer Science - Graphics},
	abstract = {We introduce 3DShape2VecSet, a novel shape representation for neural fields designed for generative diffusion models. Our shape representation can encode 3D shapes given as surface models or point clouds, and represents them as neural fields. The concept of neural fields has previously been combined with a global latent vector, a regular grid of latent vectors, or an irregular grid of latent vectors. Our new representation encodes neural fields on top of a set of vectors. We draw from multiple concepts, such as the radial basis function representation and the cross attention and self-attention function, to design a learnable representation that is especially suitable for processing with transformers. Our results show improved performance in 3D shape encoding and 3D shape generative modeling tasks. We demonstrate a wide variety of generative applications: unconditioned generation, category-conditioned generation, text-conditioned generation, point-cloud completion, and image-conditioned generation.},
	file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/T8R7H6N4/Zhang et al. - 2023 - 3DShape2VecSet A 3D Shape Representation for Neur.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/6GNICSIQ/2301.html:text/html},
}

% Yang et al. 2019 - PointFlow, point clouds via continuous normalizing flows (arXiv preprint 1906.12320).
@misc{yang_pointflow_2019,
	author = {Yang, Guandao and Huang, Xun and Hao, Zekun and Liu, Ming-Yu and Belongie, Serge and Hariharan, Bharath},
	title = {{PointFlow}: {3D} {Point} {Cloud} {Generation} with {Continuous} {Normalizing} {Flows}},
	shorttitle = {{PointFlow}},
	year = {2019},
	month = sep,
	publisher = {arXiv},
	doi = {10.48550/arXiv.1906.12320},
	url = {http://arxiv.org/abs/1906.12320},
	urldate = {2023-03-28},
	note = {arXiv:1906.12320 [cs]},
	keywords = {Computer Science - Computer Vision and Pattern Recognition, Computer Science - Machine Learning},
	abstract = {As 3D point clouds become the representation of choice for multiple vision and graphics applications, the ability to synthesize or reconstruct high-resolution, high-fidelity point clouds becomes crucial. Despite the recent success of deep learning models in discriminative tasks of point clouds, generating point clouds remains challenging. This paper proposes a principled probabilistic framework to generate 3D point clouds by modeling them as a distribution of distributions. Specifically, we learn a two-level hierarchy of distributions where the first level is the distribution of shapes and the second level is the distribution of points given a shape. This formulation allows us to both sample shapes and sample an arbitrary number of points from a shape. Our generative model, named PointFlow, learns each level of the distribution with a continuous normalizing flow. The invertibility of normalizing flows enables the computation of the likelihood during training and allows us to train our model in the variational inference framework. Empirically, we demonstrate that PointFlow achieves state-of-the-art performance in point cloud generation. We additionally show that our model can faithfully reconstruct point clouds and learn useful representations in an unsupervised manner. The code will be available at https://github.com/stevenygd/PointFlow.},
	file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/V87MQMLC/Yang et al. - 2019 - PointFlow 3D Point Cloud Generation with Continuo.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/KEHU85VD/1906.html:text/html},
}

% Fan et al. 2023 - survey of generative diffusion models on graphs (arXiv preprint 2302.02591).
@misc{fan_generative_2023,
	author = {Fan, Wenqi and Liu, Chengyi and Liu, Yunqing and Li, Jiatong and Li, Hang and Liu, Hui and Tang, Jiliang and Li, Qing},
	title = {Generative {Diffusion} {Models} on {Graphs}: {Methods} and {Applications}},
	shorttitle = {Generative {Diffusion} {Models} on {Graphs}},
	year = {2023},
	month = feb,
	publisher = {arXiv},
	doi = {10.48550/arXiv.2302.02591},
	url = {http://arxiv.org/abs/2302.02591},
	urldate = {2023-03-27},
	note = {arXiv:2302.02591 [cs]},
	keywords = {Computer Science - Machine Learning, Computer Science - Artificial Intelligence, Computer Science - Social and Information Networks},
	abstract = {Diffusion models, as a novel generative paradigm, have achieved remarkable success in various image generation tasks such as image inpainting, image-to-text translation, and video generation. Graph generation is a crucial computational task on graphs with numerous real-world applications. It aims to learn the distribution of given graphs and then generate new graphs. Given the great success of diffusion models in image generation, increasing efforts have been made to leverage these techniques to advance graph generation in recent years. In this paper, we first provide a comprehensive overview of generative diffusion models on graphs, In particular, we review representative algorithms for three variants of graph diffusion models, i.e., Score Matching with Langevin Dynamics (SMLD), Denoising Diffusion Probabilistic Model (DDPM), and Score-based Generative Model (SGM). Then, we summarize the major applications of generative diffusion models on graphs with a specific focus on molecule and protein modeling. Finally, we discuss promising directions in generative diffusion models on graph-structured data.},
	file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/3M3G2JY5/Fan et al. - 2023 - Generative Diffusion Models on Graphs Methods and.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/8YV9HJ3W/2302.html:text/html},
}

% Zhu et al. 2022 - survey on deep graph generation (arXiv preprint 2203.06714).
@misc{zhu_survey_2022,
	author = {Zhu, Yanqiao and Du, Yuanqi and Wang, Yinkai and Xu, Yichen and Zhang, Jieyu and Liu, Qiang and Wu, Shu},
	title = {A {Survey} on {Deep} {Graph} {Generation}: {Methods} and {Applications}},
	shorttitle = {A {Survey} on {Deep} {Graph} {Generation}},
	year = {2022},
	month = dec,
	publisher = {arXiv},
	doi = {10.48550/arXiv.2203.06714},
	url = {http://arxiv.org/abs/2203.06714},
	urldate = {2023-03-27},
	note = {arXiv:2203.06714 [cs, q-bio]},
	keywords = {Computer Science - Machine Learning, Computer Science - Social and Information Networks, Quantitative Biology - Molecular Networks},
	abstract = {Graphs are ubiquitous in encoding relational information of real-world objects in many domains. Graph generation, whose purpose is to generate new graphs from a distribution similar to the observed graphs, has received increasing attention thanks to the recent advances of deep learning models. In this paper, we conduct a comprehensive review on the existing literature of deep graph generation from a variety of emerging methods to its wide application areas. Specifically, we first formulate the problem of deep graph generation and discuss its difference with several related graph learning tasks. Secondly, we divide the state-of-the-art methods into three categories based on model architectures and summarize their generation strategies. Thirdly, we introduce three key application areas of deep graph generation. Lastly, we highlight challenges and opportunities in the future study of deep graph generation. We hope that our survey will be useful for researchers and practitioners who are interested in this exciting and rapidly-developing field.},
	file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/SQWM9VTD/Zhu et al. - 2022 - A Survey on Deep Graph Generation Methods and App.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/IWAETBS6/2203.html:text/html},
}

@misc{shah_auto-decoding_2020,
	author = {Shah, Sohil Atul and Koltun, Vladlen},
	title = {Auto-decoding {Graphs}},
	year = {2020},
	month = jun,
	publisher = {arXiv},
	doi = {10.48550/arXiv.2006.02879},
	url = {http://arxiv.org/abs/2006.02879},
	urldate = {2023-03-27},
	note = {arXiv:2006.02879 [cs, stat]},
	keywords = {Computer Science - Machine Learning, Statistics - Machine Learning},
	abstract = {We present an approach to synthesizing new graph structures from empirically specified distributions. The generative model is an auto-decoder that learns to synthesize graphs from latent codes. The graph synthesis model is learned jointly with an empirical distribution over the latent codes. Graphs are synthesized using self-attention modules that are trained to identify likely connectivity patterns. Graph-based normalizing flows are used to sample latent codes from the distribution learned by the auto-decoder. The resulting model combines accuracy and scalability. On benchmark datasets of large graphs, the presented model outperforms the state of the art by a factor of 1.5 in mean accuracy and average rank across at least three different graph statistics, with a 2x speedup during inference.},
	file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/M7JSQ9YK/Shah et Koltun - 2020 - Auto-decoding Graphs.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/WLZYXR33/2006.html:text/html},
}

@misc{faez_deep_2020,
	author = {Faez, Faezeh and Ommi, Yassaman and Baghshah, Mahdieh Soleymani and Rabiee, Hamid R.},
	title = {Deep {Graph} {Generators}: {A} {Survey}},
	shorttitle = {Deep {Graph} {Generators}},
	year = {2020},
	month = dec,
	publisher = {arXiv},
	doi = {10.48550/arXiv.2012.15544},
	url = {http://arxiv.org/abs/2012.15544},
	urldate = {2023-03-27},
	note = {arXiv:2012.15544 [cs]},
	keywords = {Computer Science - Machine Learning, Computer Science - Artificial Intelligence, Computer Science - Social and Information Networks},
	abstract = {Deep generative models have achieved great success in areas such as image, speech, and natural language processing in the past few years. Thanks to the advances in graph-based deep learning, and in particular graph representation learning, deep graph generation methods have recently emerged with new applications ranging from discovering novel molecular structures to modeling social networks. This paper conducts a comprehensive survey on deep learning-based graph generation approaches and classifies them into five broad categories, namely, autoregressive, autoencoder-based, RL-based, adversarial, and flow-based graph generators, providing the readers a detailed description of the methods in each class. We also present publicly available source codes, commonly used datasets, and the most widely utilized evaluation metrics. Finally, we highlight the existing challenges and discuss future research directions.},
	file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/G3J3B658/Faez et al. - 2020 - Deep Graph Generators A Survey.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/NSQQDIEH/2012.html:text/html},
}

@misc{shayestehfard_aligngraph_2023,
	author = {Shayestehfard, Kimia and Brooks, Dana and Ioannidis, Stratis},
	title = {{AlignGraph}: {A} {Group} of {Generative} {Models} for {Graphs}},
	shorttitle = {{AlignGraph}},
	year = {2023},
	month = jan,
	publisher = {arXiv},
	doi = {10.48550/arXiv.2301.11273},
	url = {http://arxiv.org/abs/2301.11273},
	urldate = {2023-03-27},
	note = {arXiv:2301.11273 [cs]},
	keywords = {Computer Science - Machine Learning, Computer Science - Social and Information Networks},
	abstract = {It is challenging for generative models to learn a distribution over graphs because of the lack of permutation invariance: nodes may be ordered arbitrarily across graphs, and standard graph alignment is combinatorial and notoriously expensive. We propose AlignGraph, a group of generative models that combine fast and efficient graph alignment methods with a family of deep generative models that are invariant to node permutations. Our experiments demonstrate that our framework successfully learns graph distributions, outperforming competitors by 25\% -560\% in relevant performance scores.},
	file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/I69NJXUI/Shayestehfard et al. - 2023 - AlignGraph A Group of Generative Models for Graph.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/LPF9DVAW/2301.html:text/html},
}

% kipf_graph_2020: typed as misc rather than article because the record has no
% journal, which the article type requires (classic styles warn and print a
% broken reference without it). NOTE(review): the venue is not recorded here;
% if this is a talk or a journal paper, add the venue and retype accordingly.
@misc{kipf_graph_2020,
	title = {Graph {Neural} {Networks} for {Modeling} {Small} {Molecules}},
	language = {en},
	author = {Kipf, Thomas and Veličković, Petar and Li, Yujia},
	month = mar,
	year = {2020},
	file = {Kipf et al. - Graph Neural Networks for Modeling Small Molecules.pdf:/home/laurent/Zotero/storage/6WZAZFX8/Kipf et al. - Graph Neural Networks for Modeling Small Molecules.pdf:application/pdf},
}

@misc{simonovsky_graphvae_2018,
	author = {Simonovsky, Martin and Komodakis, Nikos},
	title = {{GraphVAE}: {Towards} {Generation} of {Small} {Graphs} {Using} {Variational} {Autoencoders}},
	shorttitle = {{GraphVAE}},
	year = {2018},
	month = feb,
	publisher = {arXiv},
	doi = {10.48550/arXiv.1802.03480},
	url = {http://arxiv.org/abs/1802.03480},
	urldate = {2023-03-27},
	note = {arXiv:1802.03480 [cs]},
	keywords = {Computer Science - Computer Vision and Pattern Recognition, Computer Science - Machine Learning, Computer Science - Neural and Evolutionary Computing},
	abstract = {Deep learning on graphs has become a popular research topic with many applications. However, past work has concentrated on learning graph embedding tasks, which is in contrast with advances in generative models for images and text. Is it possible to transfer this progress to the domain of graphs? We propose to sidestep hurdles associated with linearization of such discrete structures by having a decoder output a probabilistic fully-connected graph of a predefined maximum size directly at once. Our method is formulated as a variational autoencoder. We evaluate on the challenging task of molecule generation.},
	file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/IWG2JIVU/Simonovsky et Komodakis - 2018 - GraphVAE Towards Generation of Small Graphs Using.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/PW5ZG5WH/1802.html:text/html},
}

% liao_efficient_2020: doi added so the record has a stable identifier; it is
% the arXiv-issued DOI built from the arXiv id already present in url and note.
@misc{liao_efficient_2020,
	title = {Efficient {Graph} {Generation} with {Graph} {Recurrent} {Attention} {Networks}},
	url = {http://arxiv.org/abs/1910.00760},
	doi = {10.48550/arXiv.1910.00760},
	abstract = {We propose a new family of efficient and expressive deep generative models of graphs, called Graph Recurrent Attention Networks (GRANs). Our model generates graphs one block of nodes and associated edges at a time. The block size and sampling stride allow us to trade off sample quality for efficiency. Compared to previous RNN-based graph generative models, our framework better captures the auto-regressive conditioning between the already-generated and to-be-generated parts of the graph using Graph Neural Networks (GNNs) with attention. This not only reduces the dependency on node ordering but also bypasses the long-term bottleneck caused by the sequential nature of RNNs. Moreover, we parameterize the output distribution per block using a mixture of Bernoulli, which captures the correlations among generated edges within the block. Finally, we propose to handle node orderings in generation by marginalizing over a family of canonical orderings. On standard benchmarks, we achieve state-of-the-art time efficiency and sample quality compared to previous models. Additionally, we show our model is capable of generating large graphs of up to 5K nodes with good quality. To the best of our knowledge, GRAN is the first deep graph generative model that can scale to this size. Our code is released at: https://github.com/lrjconan/GRAN.},
	urldate = {2023-03-27},
	publisher = {arXiv},
	author = {Liao, Renjie and Li, Yujia and Song, Yang and Wang, Shenlong and Nash, Charlie and Hamilton, William L. and Duvenaud, David and Urtasun, Raquel and Zemel, Richard S.},
	month = jul,
	year = {2020},
	note = {arXiv:1910.00760 [cs, stat]},
	keywords = {Computer Science - Machine Learning, Statistics - Machine Learning},
	file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/YB44QN2I/Liao et al. - 2020 - Efficient Graph Generation with Graph Recurrent At.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/XXCHWITE/1910.html:text/html},
}

@misc{guo_systematic_2022,
	author = {Guo, Xiaojie and Zhao, Liang},
	title = {A {Systematic} {Survey} on {Deep} {Generative} {Models} for {Graph} {Generation}},
	year = {2022},
	month = oct,
	publisher = {arXiv},
	doi = {10.48550/arXiv.2007.06686},
	url = {http://arxiv.org/abs/2007.06686},
	urldate = {2023-03-24},
	note = {arXiv:2007.06686 [cs, stat]},
	keywords = {Computer Science - Machine Learning, Statistics - Machine Learning},
	abstract = {Graphs are important data representations for describing objects and their relationships, which appear in a wide diversity of real-world scenarios. As one of a critical problem in this area, graph generation considers learning the distributions of given graphs and generating more novel graphs. Owing to their wide range of applications, generative models for graphs, which have a rich history, however, are traditionally hand-crafted and only capable of modeling a few statistical properties of graphs. Recent advances in deep generative models for graph generation is an important step towards improving the fidelity of generated graphs and paves the way for new kinds of applications. This article provides an extensive overview of the literature in the field of deep generative models for graph generation. Firstly, the formal definition of deep generative models for the graph generation and the preliminary knowledge are provided. Secondly, taxonomies of deep generative models for both unconditional and conditional graph generation are proposed respectively; the existing works of each are compared and analyzed. After that, an overview of the evaluation metrics in this specific domain is provided. Finally, the applications that deep graph generation enables are summarized and five promising future research directions are highlighted.},
	file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/M6I3YJP8/Guo et Zhao - 2022 - A Systematic Survey on Deep Generative Models for .pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/8N8L3XCF/2007.html:text/html},
}

@misc{doersch_tutorial_2021,
	author = {Doersch, Carl},
	title = {Tutorial on {Variational} {Autoencoders}},
	year = {2021},
	month = jan,
	publisher = {arXiv},
	doi = {10.48550/arXiv.1606.05908},
	url = {http://arxiv.org/abs/1606.05908},
	urldate = {2023-03-24},
	note = {arXiv:1606.05908 [cs, stat]},
	keywords = {Computer Science - Machine Learning, Statistics - Machine Learning},
	abstract = {In just three years, Variational Autoencoders (VAEs) have emerged as one of the most popular approaches to unsupervised learning of complicated distributions. VAEs are appealing because they are built on top of standard function approximators (neural networks), and can be trained with stochastic gradient descent. VAEs have already shown promise in generating many kinds of complicated data, including handwritten digits, faces, house numbers, CIFAR images, physical models of scenes, segmentation, and predicting the future from static images. This tutorial introduces the intuitions behind VAEs, explains the mathematics behind them, and describes some empirical behavior. No prior knowledge of variational Bayesian methods is assumed.},
	file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/84J4LNV2/Doersch - 2021 - Tutorial on Variational Autoencoders.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/EWJRB7BM/1606.html:text/html},
}

% salha-galvan_contributions_2022: the note used to span two lines, with a stray
% "version: 1" that would be printed verbatim; the version is now carried in the
% usual arXiv form (v1 suffix) on a single line.
% NOTE(review): the abstract describes a Ph.D. thesis; consider the phdthesis
% type once the awarding school has been confirmed.
@misc{salha-galvan_contributions_2022,
	title = {Contributions to {Representation} {Learning} with {Graph} {Autoencoders} and {Applications} to {Music} {Recommendation}},
	url = {http://arxiv.org/abs/2205.14651},
	doi = {10.48550/arXiv.2205.14651},
	abstract = {Graph autoencoders (GAE) and variational graph autoencoders (VGAE) emerged as two powerful groups of unsupervised node embedding methods, with various applications to graph-based machine learning problems such as link prediction and community detection. Nonetheless, at the beginning of this Ph.D. project, GAE and VGAE models were also suffering from key limitations, preventing them from being adopted in the industry. In this thesis, we present several contributions to improve these models, with the general aim of facilitating their use to address industrial-level problems involving graph representations. Firstly, we propose two strategies to overcome the scalability issues of previous GAE and VGAE models, permitting to effectively train these models on large graphs with millions of nodes and edges. These strategies leverage graph degeneracy and stochastic subgraph decoding techniques, respectively. Besides, we introduce Gravity-Inspired GAE and VGAE, providing the first extensions of these models for directed graphs, that are ubiquitous in industrial applications. We also consider extensions of GAE and VGAE models for dynamic graphs. Furthermore, we argue that GAE and VGAE models are often unnecessarily complex, and we propose to simplify them by leveraging linear encoders. Lastly, we introduce Modularity-Aware GAE and VGAE to improve community detection on graphs, while jointly preserving good performances on link prediction. In the last part of this thesis, we evaluate our methods on several graphs extracted from the music streaming service Deezer. We put the emphasis on graph-based music recommendation problems. In particular, we show that our methods can improve the detection of communities of similar musical items to recommend to users, that they can effectively rank similar artists in a cold start setting, and that they permit modeling the music genre perception across cultures.},
	urldate = {2023-03-24},
	publisher = {arXiv},
	author = {Salha-Galvan, Guillaume},
	month = may,
	year = {2022},
	note = {arXiv:2205.14651v1 [cs]},
	keywords = {Computer Science - Machine Learning, Computer Science - Social and Information Networks, Computer Science - Information Retrieval},
	file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/4R2Z87LG/Salha-Galvan - 2022 - Contributions to Representation Learning with Grap.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/AMRY4RUI/2205.html:text/html},
}

@misc{su_f-vaes_2018,
	author = {Su, Jianlin and Wu, Guang},
	title = {f-{VAEs}: {Improve} {VAEs} with {Conditional} {Flows}},
	shorttitle = {f-{VAEs}},
	year = {2018},
	month = sep,
	publisher = {arXiv},
	doi = {10.48550/arXiv.1809.05861},
	url = {http://arxiv.org/abs/1809.05861},
	urldate = {2023-03-24},
	note = {arXiv:1809.05861 [cs, stat]},
	keywords = {Computer Science - Computer Vision and Pattern Recognition, Computer Science - Machine Learning, Statistics - Machine Learning},
	abstract = {In this paper, we integrate VAEs and flow-based generative models successfully and get f-VAEs. Compared with VAEs, f-VAEs generate more vivid images, solved the blurred-image problem of VAEs. Compared with flow-based models such as Glow, f-VAE is more lightweight and converges faster, achieving the same performance under smaller-size architecture.},
	file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/2YPANJ73/Su et Wu - 2018 - f-VAEs Improve VAEs with Conditional Flows.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/9M6GUZEX/1809.html:text/html},
}

% burgess_understanding_2018: beta is written in real math mode in the title
% (braced so styles do not recase it) and in the abstract. The exported form
% escaped the dollar signs and the backslash, so they were typeset literally.
% The attachment path in the file field is left exactly as exported.
@misc{burgess_understanding_2018,
	title = {Understanding disentangling in {$\beta$}-{VAE}},
	url = {http://arxiv.org/abs/1804.03599},
	doi = {10.48550/arXiv.1804.03599},
	abstract = {We present new intuitions and theoretical assessments of the emergence of disentangled representation in variational autoencoders. Taking a rate-distortion theory perspective, we show the circumstances under which representations aligned with the underlying generative factors of variation of data emerge when optimising the modified ELBO bound in $\beta$-VAE, as training progresses. From these insights, we propose a modification to the training regime of $\beta$-VAE, that progressively increases the information capacity of the latent code during training. This modification facilitates the robust learning of disentangled representations in $\beta$-VAE, without the previous trade-off in reconstruction accuracy.},
	urldate = {2023-03-23},
	publisher = {arXiv},
	author = {Burgess, Christopher P. and Higgins, Irina and Pal, Arka and Matthey, Loic and Watters, Nick and Desjardins, Guillaume and Lerchner, Alexander},
	month = apr,
	year = {2018},
	note = {arXiv:1804.03599 [cs, stat]},
	keywords = {Computer Science - Machine Learning, Statistics - Machine Learning, Computer Science - Artificial Intelligence},
	file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/I7FNXM4I/Burgess et al. - 2018 - Understanding disentangling in \$beta\$-VAE.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/4JPKDD7F/1804.html:text/html},
}

@misc{brody_how_2022,
	author = {Brody, Shaked and Alon, Uri and Yahav, Eran},
	title = {How {Attentive} are {Graph} {Attention} {Networks}?},
	year = {2022},
	month = jan,
	publisher = {arXiv},
	doi = {10.48550/arXiv.2105.14491},
	url = {http://arxiv.org/abs/2105.14491},
	urldate = {2023-03-22},
	note = {arXiv:2105.14491 [cs]},
	keywords = {Computer Science - Machine Learning},
	abstract = {Graph Attention Networks (GATs) are one of the most popular GNN architectures and are considered as the state-of-the-art architecture for representation learning with graphs. In GAT, every node attends to its neighbors given its own representation as the query. However, in this paper we show that GAT computes a very limited kind of attention: the ranking of the attention scores is unconditioned on the query node. We formally define this restricted kind of attention as static attention and distinguish it from a strictly more expressive dynamic attention. Because GATs use a static attention mechanism, there are simple graph problems that GAT cannot express: in a controlled problem, we show that static attention hinders GAT from even fitting the training data. To remove this limitation, we introduce a simple fix by modifying the order of operations and propose GATv2: a dynamic graph attention variant that is strictly more expressive than GAT. We perform an extensive evaluation and show that GATv2 outperforms GAT across 11 OGB and other benchmarks while we match their parametric costs. Our code is available at https://github.com/tech-srl/how\_attentive\_are\_gats . GATv2 is available as part of the PyTorch Geometric library, the Deep Graph Library, and the TensorFlow GNN library.},
	file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/C5CY9B82/Brody et al. - 2022 - How Attentive are Graph Attention Networks.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/RWEJ8RAY/2105.html:text/html},
}

@misc{velickovic_graph_2018,
	author = {Veličković, Petar and Cucurull, Guillem and Casanova, Arantxa and Romero, Adriana and Liò, Pietro and Bengio, Yoshua},
	title = {Graph {Attention} {Networks}},
	year = {2018},
	month = feb,
	publisher = {arXiv},
	doi = {10.48550/arXiv.1710.10903},
	url = {http://arxiv.org/abs/1710.10903},
	urldate = {2023-03-22},
	note = {arXiv:1710.10903 [cs, stat]},
	keywords = {Computer Science - Machine Learning, Statistics - Machine Learning, Computer Science - Artificial Intelligence, Computer Science - Social and Information Networks},
	abstract = {We present graph attention networks (GATs), novel neural network architectures that operate on graph-structured data, leveraging masked self-attentional layers to address the shortcomings of prior methods based on graph convolutions or their approximations. By stacking layers in which nodes are able to attend over their neighborhoods' features, we enable (implicitly) specifying different weights to different nodes in a neighborhood, without requiring any kind of costly matrix operation (such as inversion) or depending on knowing the graph structure upfront. In this way, we address several key challenges of spectral-based graph neural networks simultaneously, and make our model readily applicable to inductive as well as transductive problems. Our GAT models have achieved or matched state-of-the-art results across four established transductive and inductive graph benchmarks: the Cora, Citeseer and Pubmed citation network datasets, as well as a protein-protein interaction dataset (wherein test graphs remain unseen during training).},
	file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/3X4HALUD/Veličković et al. - 2018 - Graph Attention Networks.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/JGM27EQ6/1710.html:text/html},
}

@misc{kipf_semi-supervised_2017,
	author = {Kipf, Thomas N. and Welling, Max},
	title = {Semi-{Supervised} {Classification} with {Graph} {Convolutional} {Networks}},
	year = {2017},
	month = feb,
	publisher = {arXiv},
	doi = {10.48550/arXiv.1609.02907},
	url = {http://arxiv.org/abs/1609.02907},
	urldate = {2023-03-22},
	note = {arXiv:1609.02907 [cs, stat]},
	keywords = {Computer Science - Machine Learning, Statistics - Machine Learning},
	abstract = {We present a scalable approach for semi-supervised learning on graph-structured data that is based on an efficient variant of convolutional neural networks which operate directly on graphs. We motivate the choice of our convolutional architecture via a localized first-order approximation of spectral graph convolutions. Our model scales linearly in the number of graph edges and learns hidden layer representations that encode both local graph structure and features of nodes. In a number of experiments on citation networks and on a knowledge graph dataset we demonstrate that our approach outperforms related methods by a significant margin.},
	file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/N2GXN6ZZ/Kipf et Welling - 2017 - Semi-Supervised Classification with Graph Convolut.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/WMTNID7V/1609.html:text/html},
}

@misc{gao_graph_2019,
	author = {Gao, Hongyang and Ji, Shuiwang},
	title = {Graph {U}-{Nets}},
	year = {2019},
	month = may,
	publisher = {arXiv},
	doi = {10.48550/arXiv.1905.05178},
	url = {http://arxiv.org/abs/1905.05178},
	urldate = {2023-03-21},
	note = {arXiv:1905.05178 [cs, stat]},
	keywords = {Computer Science - Machine Learning, Statistics - Machine Learning},
	abstract = {We consider the problem of representation learning for graph data. Convolutional neural networks can naturally operate on images, but have significant challenges in dealing with graph data. Given images are special cases of graphs with nodes lie on 2D lattices, graph embedding tasks have a natural correspondence with image pixel-wise prediction tasks such as segmentation. While encoder-decoder architectures like U-Nets have been successfully applied on many image pixel-wise prediction tasks, similar methods are lacking for graph data. This is due to the fact that pooling and up-sampling operations are not natural on graph data. To address these challenges, we propose novel graph pooling (gPool) and unpooling (gUnpool) operations in this work. The gPool layer adaptively selects some nodes to form a smaller graph based on their scalar projection values on a trainable projection vector. We further propose the gUnpool layer as the inverse operation of the gPool layer. The gUnpool layer restores the graph into its original structure using the position information of nodes selected in the corresponding gPool layer. Based on our proposed gPool and gUnpool layers, we develop an encoder-decoder model on graph, known as the graph U-Nets. Our experimental results on node classification and graph classification tasks demonstrate that our methods achieve consistently better performance than previous models.},
	file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/QIVY2Z39/Gao et Ji - 2019 - Graph U-Nets.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/YHWGK3H7/1905.html:text/html},
}

@misc{kipf_variational_2016,
	author = {Kipf, Thomas N. and Welling, Max},
	title = {Variational {Graph} {Auto}-{Encoders}},
	year = {2016},
	month = nov,
	publisher = {arXiv},
	doi = {10.48550/arXiv.1611.07308},
	url = {http://arxiv.org/abs/1611.07308},
	urldate = {2023-03-21},
	note = {arXiv:1611.07308 [cs, stat]},
	keywords = {Computer Science - Machine Learning, Statistics - Machine Learning},
	abstract = {We introduce the variational graph auto-encoder (VGAE), a framework for unsupervised learning on graph-structured data based on the variational auto-encoder (VAE). This model makes use of latent variables and is capable of learning interpretable latent representations for undirected graphs. We demonstrate this model using a graph convolutional network (GCN) encoder and a simple inner product decoder. Our model achieves competitive results on a link prediction task in citation networks. In contrast to most existing models for unsupervised learning on graph-structured data and link prediction, our model can naturally incorporate node features, which significantly improves predictive performance on a number of benchmark datasets.},
	file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/8LYSMTVS/Kipf et Welling - 2016 - Variational Graph Auto-Encoders.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/KCLQX6TX/1611.html:text/html},
}

@misc{alemi_deep_2019,
	author = {Alemi, Alexander A. and Fischer, Ian and Dillon, Joshua V. and Murphy, Kevin},
	title = {Deep {Variational} {Information} {Bottleneck}},
	year = {2019},
	month = oct,
	publisher = {arXiv},
	doi = {10.48550/arXiv.1612.00410},
	url = {http://arxiv.org/abs/1612.00410},
	urldate = {2023-03-21},
	note = {arXiv:1612.00410 [cs, math]},
	keywords = {Computer Science - Machine Learning, Computer Science - Information Theory},
	abstract = {We present a variational approximation to the information bottleneck of Tishby et al. (1999). This variational approach allows us to parameterize the information bottleneck model using a neural network and leverage the reparameterization trick for efficient training. We call this method "Deep Variational Information Bottleneck", or Deep VIB. We show that models trained with the VIB objective outperform those that are trained with other forms of regularization, in terms of generalization performance and robustness to adversarial attack.},
	file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/LMPVVWG5/Alemi et al. - 2019 - Deep Variational Information Bottleneck.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/VBXN4EUZ/1612.html:text/html},
}

% thomas_kpconv_2019: doi added so the record has a stable identifier; it is
% the arXiv-issued DOI built from the arXiv id already present in url and note.
@misc{thomas_kpconv_2019,
	title = {{KPConv}: {Flexible} and {Deformable} {Convolution} for {Point} {Clouds}},
	shorttitle = {{KPConv}},
	url = {http://arxiv.org/abs/1904.08889},
	doi = {10.48550/arXiv.1904.08889},
	abstract = {We present Kernel Point Convolution (KPConv), a new design of point convolution, i.e. that operates on point clouds without any intermediate representation. The convolution weights of KPConv are located in Euclidean space by kernel points, and applied to the input points close to them. Its capacity to use any number of kernel points gives KPConv more flexibility than fixed grid convolutions. Furthermore, these locations are continuous in space and can be learned by the network. Therefore, KPConv can be extended to deformable convolutions that learn to adapt kernel points to local geometry. Thanks to a regular subsampling strategy, KPConv is also efficient and robust to varying densities. Whether they use deformable KPConv for complex tasks, or rigid KPconv for simpler tasks, our networks outperform state-of-the-art classification and segmentation approaches on several datasets. We also offer ablation studies and visualizations to provide understanding of what has been learned by KPConv and to validate the descriptive power of deformable KPConv.},
	urldate = {2023-05-15},
	publisher = {arXiv},
	author = {Thomas, Hugues and Qi, Charles R. and Deschaud, Jean-Emmanuel and Marcotegui, Beatriz and Goulette, François and Guibas, Leonidas J.},
	month = aug,
	year = {2019},
	note = {arXiv:1904.08889 [cs]},
	keywords = {Computer Science - Computer Vision and Pattern Recognition},
	file = {arXiv.org Snapshot:/home/laurent/Zotero/storage/5CY645DK/1904.html:text/html;Full Text PDF:/home/laurent/Zotero/storage/782FKEML/Thomas et al. - 2019 - KPConv Flexible and Deformable Convolution for Po.pdf:application/pdf},
}

@misc{tang_searching_2020,
	author = {Tang, Haotian and Liu, Zhijian and Zhao, Shengyu and Lin, Yujun and Lin, Ji and Wang, Hanrui and Han, Song},
	title = {Searching {Efficient} {3D} {Architectures} with {Sparse} {Point}-{Voxel} {Convolution}},
	year = {2020},
	month = aug,
	publisher = {arXiv},
	doi = {10.48550/arXiv.2007.16100},
	url = {http://arxiv.org/abs/2007.16100},
	urldate = {2023-04-26},
	note = {arXiv:2007.16100 [cs]},
	keywords = {Computer Science - Computer Vision and Pattern Recognition},
	abstract = {Self-driving cars need to understand 3D scenes efficiently and accurately in order to drive safely. Given the limited hardware resources, existing 3D perception models are not able to recognize small instances (e.g., pedestrians, cyclists) very well due to the low-resolution voxelization and aggressive downsampling. To this end, we propose Sparse Point-Voxel Convolution (SPVConv), a lightweight 3D module that equips the vanilla Sparse Convolution with the high-resolution point-based branch. With negligible overhead, this point-based branch is able to preserve the fine details even from large outdoor scenes. To explore the spectrum of efficient 3D models, we first define a flexible architecture design space based on SPVConv, and we then present 3D Neural Architecture Search (3D-NAS) to search the optimal network architecture over this diverse design space efficiently and effectively. Experimental results validate that the resulting SPVNAS model is fast and accurate: it outperforms the state-of-the-art MinkowskiNet by 3.3\%, ranking 1st on the competitive SemanticKITTI leaderboard. It also achieves 8x computation reduction and 3x measured speedup over MinkowskiNet with higher accuracy. Finally, we transfer our method to 3D object detection, and it achieves consistent improvements over the one-stage detection baseline on KITTI.},
	file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/A2S9RZVE/Tang et al. - 2020 - Searching Efficient 3D Architectures with Sparse P.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/QK6WTDZH/2007.html:text/html},
}

@misc{nguyen_point-set_2021,
	author = {Nguyen, Trung and Pham, Quang-Hieu and Le, Tam and Pham, Tung and Ho, Nhat and Hua, Binh-Son},
	title = {Point-set {Distances} for {Learning} {Representations} of {3D} {Point} {Clouds}},
	year = {2021},
	month = sep,
	publisher = {arXiv},
	doi = {10.48550/arXiv.2102.04014},
	url = {http://arxiv.org/abs/2102.04014},
	urldate = {2023-04-21},
	note = {arXiv:2102.04014 [cs]},
	keywords = {Computer Science - Computer Vision and Pattern Recognition},
	abstract = {Learning an effective representation of 3D point clouds requires a good metric to measure the discrepancy between two 3D point sets, which is non-trivial due to their irregularity. Most of the previous works resort to using the Chamfer discrepancy or Earth Mover's distance, but those metrics are either ineffective in measuring the differences between point clouds or computationally expensive. In this paper, we conduct a systematic study with extensive experiments on distance metrics for 3D point clouds. From this study, we propose to use sliced Wasserstein distance and its variants for learning representations of 3D point clouds. In addition, we introduce a new algorithm to estimate sliced Wasserstein distance that guarantees that the estimated value is close enough to the true one. Experiments show that the sliced Wasserstein distance and its variants allow the neural network to learn a more efficient representation compared to the Chamfer discrepancy. We demonstrate the efficiency of the sliced Wasserstein metric and its variants on several tasks in 3D computer vision including training a point cloud autoencoder, generative modeling, transfer learning, and point cloud registration.},
	file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/ZND8758D/Nguyen et al. - 2021 - Point-set Distances for Learning Representations o.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/IUDCHXC2/2102.html:text/html},
}

@misc{peng_shape_2021,
	title = {Shape {As} {Points}: {A} {Differentiable} {Poisson} {Solver}},
	shorttitle = {Shape {As} {Points}},
	url = {http://arxiv.org/abs/2106.03452},
	doi = {10.48550/arXiv.2106.03452},
	abstract = {In recent years, neural implicit representations gained popularity in 3D reconstruction due to their expressiveness and flexibility. However, the implicit nature of neural implicit representations results in slow inference time and requires careful initialization. In this paper, we revisit the classic yet ubiquitous point cloud representation and introduce a differentiable point-to-mesh layer using a differentiable formulation of Poisson Surface Reconstruction (PSR) that allows for a GPU-accelerated fast solution of the indicator function given an oriented point cloud. The differentiable PSR layer allows us to efficiently and differentiably bridge the explicit 3D point representation with the 3D mesh via the implicit indicator field, enabling end-to-end optimization of surface reconstruction metrics such as Chamfer distance. This duality between points and meshes hence allows us to represent shapes as oriented point clouds, which are explicit, lightweight and expressive. Compared to neural implicit representations, our Shape-As-Points (SAP) model is more interpretable, lightweight, and accelerates inference time by one order of magnitude. Compared to other explicit representations such as points, patches, and meshes, SAP produces topology-agnostic, watertight manifold surfaces. We demonstrate the effectiveness of SAP on the task of surface reconstruction from unoriented point clouds and learning-based reconstruction.},
	urldate = {2023-04-17},
	publisher = {arXiv},
	author = {Peng, Songyou and Jiang, Chiyu ``Max'' and Liao, Yiyi and Niemeyer, Michael and Pollefeys, Marc and Geiger, Andreas},
	month = nov,
	year = {2021},
	note = {arXiv:2106.03452 [cs]},
	keywords = {Computer Science - Computer Vision and Pattern Recognition, Computer Science - Graphics},
	file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/V5BVL34J/Peng et al. - 2021 - Shape As Points A Differentiable Poisson Solver.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/7J3IDAKQ/2106.html:text/html},
}

@misc{sulzer_deep_2022,
	author = {Sulzer, Raphael and Landrieu, Loic and Boulch, Alexandre and Marlet, Renaud and Vallet, Bruno},
	title = {Deep {Surface} {Reconstruction} from {Point} {Clouds} with {Visibility} {Information}},
	year = {2022},
	month = feb,
	publisher = {arXiv},
	doi = {10.48550/arXiv.2202.01810},
	url = {http://arxiv.org/abs/2202.01810},
	urldate = {2023-04-17},
	note = {arXiv:2202.01810 [cs]},
	keywords = {Computer Science - Computer Vision and Pattern Recognition},
	abstract = {Most current neural networks for reconstructing surfaces from point clouds ignore sensor poses and only operate on raw point locations. Sensor visibility, however, holds meaningful information regarding space occupancy and surface orientation. In this paper, we present two simple ways to augment raw point clouds with visibility information, so it can directly be leveraged by surface reconstruction networks with minimal adaptation. Our proposed modifications consistently improve the accuracy of generated surfaces as well as the generalization ability of the networks to unseen shape domains. Our code and data is available at https://github.com/raphaelsulzer/dsrv-data.},
	file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/ZDTHHW9H/Sulzer et al. - 2022 - Deep Surface Reconstruction from Point Clouds with.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/X84KMGRU/2202.html:text/html},
}

@misc{nam_3d-ldm_2022,
	author = {Nam, Gimin and Khlifi, Mariem and Rodriguez, Andrew and Tono, Alberto and Zhou, Linqi and Guerrero, Paul},
	title = {{3D}-{LDM}: {Neural} {Implicit} {3D} {Shape} {Generation} with {Latent} {Diffusion} {Models}},
	shorttitle = {{3D}-{LDM}},
	year = {2022},
	month = dec,
	publisher = {arXiv},
	doi = {10.48550/arXiv.2212.00842},
	url = {http://arxiv.org/abs/2212.00842},
	urldate = {2023-04-11},
	note = {arXiv:2212.00842 [cs]},
	keywords = {Computer Science - Computer Vision and Pattern Recognition},
	abstract = {Diffusion models have shown great promise for image generation, beating GANs in terms of generation diversity, with comparable image quality. However, their application to 3D shapes has been limited to point or voxel representations that can in practice not accurately represent a 3D surface. We propose a diffusion model for neural implicit representations of 3D shapes that operates in the latent space of an auto-decoder. This allows us to generate diverse and high quality 3D surfaces. We additionally show that we can condition our model on images or text to enable image-to-3D generation and text-to-3D generation using CLIP embeddings. Furthermore, adding noise to the latent codes of existing shapes allows us to explore shape variations.},
	file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/8DKDU5WY/Nam et al. - 2022 - 3D-LDM Neural Implicit 3D Shape Generation with L.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/DMM8K287/2212.html:text/html},
}

@misc{zhou_3d_2021,
	author = {Zhou, Linqi and Du, Yilun and Wu, Jiajun},
	title = {{3D} {Shape} {Generation} and {Completion} through {Point}-{Voxel} {Diffusion}},
	year = {2021},
	month = aug,
	publisher = {arXiv},
	doi = {10.48550/arXiv.2104.03670},
	url = {http://arxiv.org/abs/2104.03670},
	urldate = {2023-04-04},
	note = {arXiv:2104.03670 [cs]},
	keywords = {Computer Science - Computer Vision and Pattern Recognition},
	abstract = {We propose a novel approach for probabilistic generative modeling of 3D shapes. Unlike most existing models that learn to deterministically translate a latent vector to a shape, our model, Point-Voxel Diffusion (PVD), is a unified, probabilistic formulation for unconditional shape generation and conditional, multi-modal shape completion. PVD marries denoising diffusion models with the hybrid, point-voxel representation of 3D shapes. It can be viewed as a series of denoising steps, reversing the diffusion process from observed point cloud data to Gaussian noise, and is trained by optimizing a variational lower bound to the (conditional) likelihood function. Experiments demonstrate that PVD is capable of synthesizing high-fidelity shapes, completing partial point clouds, and generating multiple completion results from single-view depth scans of real objects.},
	file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/WGECL3FJ/Zhou et al. - 2021 - 3D Shape Generation and Completion through Point-V.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/C3AEKFNE/2104.html:text/html},
}

@misc{liu_point-voxel_2019,
	author = {Liu, Zhijian and Tang, Haotian and Lin, Yujun and Han, Song},
	title = {Point-{Voxel} {CNN} for {Efficient} {3D} {Deep} {Learning}},
	year = {2019},
	month = dec,
	publisher = {arXiv},
	doi = {10.48550/arXiv.1907.03739},
	url = {http://arxiv.org/abs/1907.03739},
	urldate = {2023-04-04},
	note = {arXiv:1907.03739 [cs]},
	keywords = {Computer Science - Computer Vision and Pattern Recognition},
	abstract = {We present Point-Voxel CNN (PVCNN) for efficient, fast 3D deep learning. Previous work processes 3D data using either voxel-based or point-based NN models. However, both approaches are computationally inefficient. The computation cost and memory footprints of the voxel-based models grow cubically with the input resolution, making it memory-prohibitive to scale up the resolution. As for point-based networks, up to 80\% of the time is wasted on structuring the sparse data which have rather poor memory locality, not on the actual feature extraction. In this paper, we propose PVCNN that represents the 3D input data in points to reduce the memory consumption, while performing the convolutions in voxels to reduce the irregular, sparse data access and improve the locality. Our PVCNN model is both memory and computation efficient. Evaluated on semantic and part segmentation datasets, it achieves much higher accuracy than the voxel-based baseline with 10x GPU memory reduction; it also outperforms the state-of-the-art point-based models with 7x measured speedup on average. Remarkably, the narrower version of PVCNN achieves 2x speedup over PointNet (an extremely efficient model) on part and scene segmentation benchmarks with much higher accuracy. We validate the general effectiveness of PVCNN on 3D object detection: by replacing the primitives in Frustrum PointNet with PVConv, it outperforms Frustrum PointNet++ by 2.4\% mAP on average with 1.5x measured speedup and GPU memory reduction.},
	file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/A2XJARYA/Liu et al. - 2019 - Point-Voxel CNN for Efficient 3D Deep Learning.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/LF7RPTGF/1907.html:text/html},
}

@misc{qi_pointnet_2017,
	author = {Qi, Charles R. and Su, Hao and Mo, Kaichun and Guibas, Leonidas J.},
	title = {{PointNet}: {Deep} {Learning} on {Point} {Sets} for {3D} {Classification} and {Segmentation}},
	shorttitle = {{PointNet}},
	year = {2017},
	month = apr,
	publisher = {arXiv},
	doi = {10.48550/arXiv.1612.00593},
	url = {http://arxiv.org/abs/1612.00593},
	urldate = {2023-04-04},
	note = {arXiv:1612.00593 [cs]},
	keywords = {Computer Science - Computer Vision and Pattern Recognition},
	abstract = {Point cloud is an important type of geometric data structure. Due to its irregular format, most researchers transform such data to regular 3D voxel grids or collections of images. This, however, renders data unnecessarily voluminous and causes issues. In this paper, we design a novel type of neural network that directly consumes point clouds and well respects the permutation invariance of points in the input. Our network, named PointNet, provides a unified architecture for applications ranging from object classification, part segmentation, to scene semantic parsing. Though simple, PointNet is highly efficient and effective. Empirically, it shows strong performance on par or even better than state of the art. Theoretically, we provide analysis towards understanding of what the network has learnt and why the network is robust with respect to input perturbation and corruption.},
	file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/SV6H7XA9/Qi et al. - 2017 - PointNet Deep Learning on Point Sets for 3D Class.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/YF79EZLH/1612.html:text/html},
}

@misc{qi_pointnet_2017-1,
	author = {Qi, Charles R. and Yi, Li and Su, Hao and Guibas, Leonidas J.},
	title = {{PointNet}++: {Deep} {Hierarchical} {Feature} {Learning} on {Point} {Sets} in a {Metric} {Space}},
	shorttitle = {{PointNet}++},
	year = {2017},
	month = jun,
	publisher = {arXiv},
	doi = {10.48550/arXiv.1706.02413},
	url = {http://arxiv.org/abs/1706.02413},
	urldate = {2023-04-03},
	note = {arXiv:1706.02413 [cs]},
	keywords = {Computer Science - Computer Vision and Pattern Recognition},
	abstract = {Few prior works study deep learning on point sets. PointNet by Qi et al. is a pioneer in this direction. However, by design PointNet does not capture local structures induced by the metric space points live in, limiting its ability to recognize fine-grained patterns and generalizability to complex scenes. In this work, we introduce a hierarchical neural network that applies PointNet recursively on a nested partitioning of the input point set. By exploiting metric space distances, our network is able to learn local features with increasing contextual scales. With further observation that point sets are usually sampled with varying densities, which results in greatly decreased performance for networks trained on uniform densities, we propose novel set learning layers to adaptively combine features from multiple scales. Experiments show that our network called PointNet++ is able to learn deep point set features efficiently and robustly. In particular, results significantly better than state-of-the-art have been obtained on challenging benchmarks of 3D point clouds.},
	file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/4FPME54R/Qi et al. - 2017 - PointNet++ Deep Hierarchical Feature Learning on .pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/SXSSFMBW/1706.html:text/html},
}

@misc{hang_efficient_2023,
	title = {Efficient {Diffusion} {Training} via {Min}-{SNR} {Weighting} {Strategy}},
	url = {http://arxiv.org/abs/2303.09556},
	doi = {10.48550/arXiv.2303.09556},
	abstract = {Denoising diffusion models have been a mainstream approach for image generation, however, training these models often suffers from slow convergence. In this paper, we discovered that the slow convergence is partly due to conflicting optimization directions between timesteps. To address this issue, we treat the diffusion training as a multi-task learning problem, and introduce a simple yet effective approach referred to as Min-SNR-$\gamma$. This method adapts loss weights of timesteps based on clamped signal-to-noise ratios, which effectively balances the conflicts among timesteps. Our results demonstrate a significant improvement in converging speed, 3.4$\times$ faster than previous weighting strategies. It is also more effective, achieving a new record FID score of 2.06 on the ImageNet $256\times256$ benchmark using smaller architectures than that employed in previous state-of-the-art. The code is available at https://github.com/TiankaiHang/Min-SNR-Diffusion-Training.},
	urldate = {2023-06-15},
	publisher = {arXiv},
	author = {Hang, Tiankai and Gu, Shuyang and Li, Chen and Bao, Jianmin and Chen, Dong and Hu, Han and Geng, Xin and Guo, Baining},
	month = mar,
	year = {2023},
	note = {arXiv:2303.09556 [cs]},
	keywords = {Computer Science - Computer Vision and Pattern Recognition},
	file = {arXiv.org Snapshot:/home/laurent/Zotero/storage/EQPT236P/Hang et al. - 2023 - Efficient Diffusion Training via Min-SNR Weighting.html:text/html},
}

@misc{rombach_high-resolution_2022,
	title = {High-{Resolution} {Image} {Synthesis} with {Latent} {Diffusion} {Models}},
	url = {http://arxiv.org/abs/2112.10752},
	doi = {10.48550/arXiv.2112.10752},
	abstract = {By decomposing the image formation process into a sequential application of denoising autoencoders, diffusion models (DMs) achieve state-of-the-art synthesis results on image data and beyond. Additionally, their formulation allows for a guiding mechanism to control the image generation process without retraining. However, since these models typically operate directly in pixel space, optimization of powerful DMs often consumes hundreds of GPU days and inference is expensive due to sequential evaluations. To enable DM training on limited computational resources while retaining their quality and flexibility, we apply them in the latent space of powerful pretrained autoencoders. In contrast to previous work, training diffusion models on such a representation allows for the first time to reach a near-optimal point between complexity reduction and detail preservation, greatly boosting visual fidelity. By introducing cross-attention layers into the model architecture, we turn diffusion models into powerful and flexible generators for general conditioning inputs such as text or bounding boxes and high-resolution synthesis becomes possible in a convolutional manner. Our latent diffusion models (LDMs) achieve a new state of the art for image inpainting and highly competitive performance on various tasks, including unconditional image generation, semantic scene synthesis, and super-resolution, while significantly reducing computational requirements compared to pixel-based DMs. Code is available at https://github.com/CompVis/latent-diffusion .},
	urldate = {2023-06-13},
	publisher = {arXiv},
	author = {Rombach, Robin and Blattmann, Andreas and Lorenz, Dominik and Esser, Patrick and Ommer, Bj{\"o}rn},
	month = apr,
	year = {2022},
	note = {arXiv:2112.10752 [cs]},
	keywords = {Computer Science - Computer Vision and Pattern Recognition},
	file = {arXiv.org Snapshot:/home/laurent/Zotero/storage/7AQALVMG/2112.html:text/html;Full Text PDF:/home/laurent/Zotero/storage/NSX4PSPP/Rombach et al. - 2022 - High-Resolution Image Synthesis with Latent Diffus.pdf:application/pdf},
}

@misc{luo_understanding_2022,
	title = {Understanding {Diffusion} {Models}: {A} {Unified} {Perspective}},
	shorttitle = {Understanding {Diffusion} {Models}},
	url = {http://arxiv.org/abs/2208.11970},
	doi = {10.48550/arXiv.2208.11970},
	abstract = {Diffusion models have shown incredible capabilities as generative models; indeed, they power the current state-of-the-art models on text-conditioned image generation such as Imagen and DALL-E 2. In this work we review, demystify, and unify the understanding of diffusion models across both variational and score-based perspectives. We first derive Variational Diffusion Models (VDM) as a special case of a Markovian Hierarchical Variational Autoencoder, where three key assumptions enable tractable computation and scalable optimization of the ELBO. We then prove that optimizing a VDM boils down to learning a neural network to predict one of three potential objectives: the original source input from any arbitrary noisification of it, the original source noise from any arbitrarily noisified input, or the score function of a noisified input at any arbitrary noise level. We then dive deeper into what it means to learn the score function, and connect the variational perspective of a diffusion model explicitly with the Score-based Generative Modeling perspective through Tweedie's Formula. Lastly, we cover how to learn a conditional distribution using diffusion models via guidance.},
	urldate = {2023-06-12},
	publisher = {arXiv},
	author = {Luo, Calvin},
	month = aug,
	year = {2022},
	note = {arXiv:2208.11970 [cs]},
	keywords = {Computer Science - Computer Vision and Pattern Recognition, Computer Science - Machine Learning},
	file = {arXiv.org Snapshot:/home/laurent/Zotero/storage/YBCUMCLB/2208.html:text/html;Full Text PDF:/home/laurent/Zotero/storage/6C9BARLG/Luo - 2022 - Understanding Diffusion Models A Unified Perspect.pdf:application/pdf},
}

@misc{zhu_unpaired_2020,
	title = {Unpaired {Image}-to-{Image} {Translation} using {Cycle}-{Consistent} {Adversarial} {Networks}},
	url = {http://arxiv.org/abs/1703.10593},
	doi = {10.48550/arXiv.1703.10593},
	abstract = {Image-to-image translation is a class of vision and graphics problems where the goal is to learn the mapping between an input image and an output image using a training set of aligned image pairs. However, for many tasks, paired training data will not be available. We present an approach for learning to translate an image from a source domain $X$ to a target domain $Y$ in the absence of paired examples. Our goal is to learn a mapping $G: X \rightarrow Y$ such that the distribution of images from $G(X)$ is indistinguishable from the distribution $Y$ using an adversarial loss. Because this mapping is highly under-constrained, we couple it with an inverse mapping $F: Y \rightarrow X$ and introduce a cycle consistency loss to push $F(G(X)) \approx X$ (and vice versa). Qualitative results are presented on several tasks where paired training data does not exist, including collection style transfer, object transfiguration, season transfer, photo enhancement, etc. Quantitative comparisons against several prior methods demonstrate the superiority of our approach.},
	urldate = {2023-07-06},
	publisher = {arXiv},
	author = {Zhu, Jun-Yan and Park, Taesung and Isola, Phillip and Efros, Alexei A.},
	month = aug,
	year = {2020},
	note = {arXiv:1703.10593 [cs]},
	keywords = {Computer Science - Computer Vision and Pattern Recognition},
	file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/Y6SLL26A/Zhu et al. - 2020 - Unpaired Image-to-Image Translation using Cycle-Co.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/EWW8TRT2/1703.html:text/html},
}

@misc{odena_semi-supervised_2016,
	author = {Odena, Augustus},
	title = {Semi-{Supervised} {Learning} with {Generative} {Adversarial} {Networks}},
	year = {2016},
	month = oct,
	publisher = {arXiv},
	doi = {10.48550/arXiv.1606.01583},
	url = {http://arxiv.org/abs/1606.01583},
	urldate = {2023-07-06},
	note = {arXiv:1606.01583 [cs, stat]},
	keywords = {Computer Science - Machine Learning, Statistics - Machine Learning},
	abstract = {We extend Generative Adversarial Networks (GANs) to the semi-supervised context by forcing the discriminator network to output class labels. We train a generative model G and a discriminator D on a dataset with inputs belonging to one of N classes. At training time, D is made to predict which of N+1 classes the input belongs to, where an extra class is added to correspond to the outputs of G. We show that this method can be used to create a more data-efficient classifier and that it allows for generating higher quality samples than a regular GAN.},
	file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/XM4QQ2FW/Odena - 2016 - Semi-Supervised Learning with Generative Adversari.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/TXCYUE77/1606.html:text/html},
}

@misc{kim_learning_2017,
	author = {Kim, Taeksoo and Cha, Moonsu and Kim, Hyunsoo and Lee, Jung Kwon and Kim, Jiwon},
	title = {Learning to {Discover} {Cross}-{Domain} {Relations} with {Generative} {Adversarial} {Networks}},
	year = {2017},
	month = may,
	publisher = {arXiv},
	doi = {10.48550/arXiv.1703.05192},
	url = {http://arxiv.org/abs/1703.05192},
	urldate = {2023-07-06},
	note = {arXiv:1703.05192 [cs]},
	keywords = {Computer Science - Computer Vision and Pattern Recognition},
	abstract = {While humans easily recognize relations between data from different domains without any supervision, learning to automatically discover them is in general very challenging and needs many ground-truth pairs that illustrate the relations. To avoid costly pairing, we address the task of discovering cross-domain relations given unpaired data. We propose a method based on generative adversarial networks that learns to discover relations between different domains (DiscoGAN). Using the discovered relations, our proposed network successfully transfers style from one domain to another while preserving key attributes such as orientation and face identity. Source code for official implementation is publicly available https://github.com/SKTBrain/DiscoGAN},
	file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/Q6LM7LUP/Kim et al. - 2017 - Learning to Discover Cross-Domain Relations with G.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/VWK3IQCR/1703.html:text/html},
}

@misc{ledig_photo-realistic_2017,
	author = {Ledig, Christian and Theis, Lucas and Huszar, Ferenc and Caballero, Jose and Cunningham, Andrew and Acosta, Alejandro and Aitken, Andrew and Tejani, Alykhan and Totz, Johannes and Wang, Zehan and Shi, Wenzhe},
	title = {Photo-{Realistic} {Single} {Image} {Super}-{Resolution} {Using} a {Generative} {Adversarial} {Network}},
	year = {2017},
	month = may,
	publisher = {arXiv},
	doi = {10.48550/arXiv.1609.04802},
	url = {http://arxiv.org/abs/1609.04802},
	urldate = {2023-07-06},
	note = {arXiv:1609.04802 [cs, stat]},
	keywords = {Computer Science - Computer Vision and Pattern Recognition, Statistics - Machine Learning},
	abstract = {Despite the breakthroughs in accuracy and speed of single image super-resolution using faster and deeper convolutional neural networks, one central problem remains largely unsolved: how do we recover the finer texture details when we super-resolve at large upscaling factors? The behavior of optimization-based super-resolution methods is principally driven by the choice of the objective function. Recent work has largely focused on minimizing the mean squared reconstruction error. The resulting estimates have high peak signal-to-noise ratios, but they are often lacking high-frequency details and are perceptually unsatisfying in the sense that they fail to match the fidelity expected at the higher resolution. In this paper, we present SRGAN, a generative adversarial network (GAN) for image super-resolution (SR). To our knowledge, it is the first framework capable of inferring photo-realistic natural images for 4x upscaling factors. To achieve this, we propose a perceptual loss function which consists of an adversarial loss and a content loss. The adversarial loss pushes our solution to the natural image manifold using a discriminator network that is trained to differentiate between the super-resolved images and original photo-realistic images. In addition, we use a content loss motivated by perceptual similarity instead of similarity in pixel space. Our deep residual network is able to recover photo-realistic textures from heavily downsampled images on public benchmarks. An extensive mean-opinion-score (MOS) test shows hugely significant gains in perceptual quality using SRGAN. The MOS scores obtained with SRGAN are closer to those of the original high-resolution images than to those obtained with any state-of-the-art method.},
	file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/VN4Z76ZB/Ledig et al. - 2017 - Photo-Realistic Single Image Super-Resolution Usin.pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/RN7MPPTH/1609.html:text/html},
}

@article{ma_comprehensive_2021,
	title = {A {Comprehensive} {Survey} on {Graph} {Anomaly} {Detection} with {Deep} {Learning}},
	issn = {1041-4347, 1558-2191, 2326-3865},
	url = {http://arxiv.org/abs/2106.07178},
	doi = {10.1109/TKDE.2021.3118815},
	abstract = {Anomalies represent rare observations (e.g., data records or events) that deviate significantly from others. Over several decades, research on anomaly mining has received increasing interests due to the implications of these occurrences in a wide range of disciplines. Anomaly detection, which aims to identify rare observations, is among the most vital tasks in the world, and has shown its power in preventing detrimental events, such as financial fraud, network intrusion, and social spam. The detection task is typically solved by identifying outlying data points in the feature space and inherently overlooks the relational information in real-world data. Graphs have been prevalently used to represent the structural information, which raises the graph anomaly detection problem - identifying anomalous graph objects (i.e., nodes, edges and sub-graphs) in a single graph, or anomalous graphs in a database/set of graphs. However, conventional anomaly detection techniques cannot tackle this problem well because of the complexity of graph data. For the advent of deep learning, graph anomaly detection with deep learning has received a growing attention recently. In this survey, we aim to provide a systematic and comprehensive review of the contemporary deep learning techniques for graph anomaly detection. We compile open-sourced implementations, public datasets, and commonly-used evaluation metrics to provide affluent resources for future studies. More importantly, we highlight twelve extensive future research directions according to our survey results covering unsolved and emerging research problems and real-world applications. With this survey, our goal is to create a "one-stop-shop" that provides a unified understanding of the problem categories and existing approaches, publicly available hands-on resources, and high-impact open challenges for graph anomaly detection using deep learning.},
	urldate = {2023-07-06},
	journal = {IEEE Transactions on Knowledge and Data Engineering},
	author = {Ma, Xiaoxiao and Wu, Jia and Xue, Shan and Yang, Jian and Zhou, Chuan and Sheng, Quan Z. and Xiong, Hui and Akoglu, Leman},
	year = {2021},
	note = {arXiv:2106.07178 [cs]},
	keywords = {Computer Science - Machine Learning},
	file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/HLLPJA3X/Ma et al. - 2021 - A Comprehensive Survey on Graph Anomaly Detection .pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/RK3I3FR2/2106.html:text/html},
}

% arXiv preprint 2109.12843: survey of graph neural networks for recommender systems.
% The abstract carries the companion repository as a plain URL, with no LaTeX
% markup, so it typesets cleanly in styles that print abstracts.
@misc{gao_survey_2023,
	title = {A {Survey} of {Graph} {Neural} {Networks} for {Recommender} {Systems}: {Challenges}, {Methods}, and {Directions}},
	shorttitle = {A {Survey} of {Graph} {Neural} {Networks} for {Recommender} {Systems}},
	url = {http://arxiv.org/abs/2109.12843},
	doi = {10.48550/arXiv.2109.12843},
	abstract = {Recommender system is one of the most important information services on today's Internet. Recently, graph neural networks have become the new state-of-the-art approach to recommender systems. In this survey, we conduct a comprehensive review of the literature on graph neural network-based recommender systems. We first introduce the background and the history of the development of both recommender systems and graph neural networks. For recommender systems, in general, there are four aspects for categorizing existing works: stage, scenario, objective, and application. For graph neural networks, the existing methods consist of two categories, spectral models and spatial ones. We then discuss the motivation of applying graph neural networks into recommender systems, mainly consisting of the high-order connectivity, the structural property of data, and the enhanced supervision signal. We then systematically analyze the challenges in graph construction, embedding propagation/aggregation, model optimization, and computation efficiency. Afterward and primarily, we provide a comprehensive overview of a multitude of existing works of graph neural network-based recommender systems, following the taxonomy above. Finally, we raise discussions on the open problems and promising future directions in this area. We summarize the representative papers along with their code repositories in https://github.com/tsinghua-fib-lab/GNN-Recommender-Systems.},
	urldate = {2023-07-06},
	publisher = {arXiv},
	author = {Gao, Chen and Zheng, Yu and Li, Nian and Li, Yinfeng and Qin, Yingrong and Piao, Jinghua and Quan, Yuhan and Chang, Jianxin and Jin, Depeng and He, Xiangnan and Li, Yong},
	month = jan,
	year = {2023},
	note = {arXiv:2109.12843 [cs]},
	keywords = {Computer Science - Information Retrieval},
	file = {arXiv Fulltext PDF:/home/laurent/Zotero/storage/WUJ2Y5V4/Gao et al. - 2023 - A Survey of Graph Neural Networks for Recommender .pdf:application/pdf;arXiv.org Snapshot:/home/laurent/Zotero/storage/MADG65MH/2109.html:text/html},
}

% arXiv preprint 1511.05493: Gated Graph Sequence Neural Networks.
% Fields are grouped as: who / what / when / where to find it / descriptive metadata.
@misc{li_gated_2017,
	author = {Li, Yujia and Tarlow, Daniel and Brockschmidt, Marc and Zemel, Richard},
	title = {Gated {Graph} {Sequence} {Neural} {Networks}},
	year = {2017},
	month = sep,
	publisher = {arXiv},
	doi = {10.48550/arXiv.1511.05493},
	url = {http://arxiv.org/abs/1511.05493},
	urldate = {2023-07-06},
	note = {arXiv:1511.05493 [cs, stat]},
	keywords = {Computer Science - Artificial Intelligence, Computer Science - Machine Learning, Computer Science - Neural and Evolutionary Computing, Statistics - Machine Learning},
	abstract = {Graph-structured data appears frequently in domains including chemistry, natural language semantics, social networks, and knowledge bases. In this work, we study feature learning techniques for graph-structured inputs. Our starting point is previous work on Graph Neural Networks (Scarselli et al., 2009), which we modify to use gated recurrent units and modern optimization techniques and then extend to output sequences. The result is a flexible and broadly useful class of neural network models that has favorable inductive biases relative to purely sequence-based models (e.g., LSTMs) when the problem is graph-structured. We demonstrate the capabilities on some simple AI (bAbI) and graph algorithm learning tasks. We then show it achieves state-of-the-art performance on a problem from program verification, in which subgraphs need to be matched to abstract data structures.},
}