# disentangled-image-editing-final-project/ContraCLIP/models/genforce/converters/stylegan2_official/projector.py
# Copyright (c) 2019, NVIDIA Corporation. All rights reserved.
#
# This work is made available under the Nvidia Source Code License-NC.
# To view a copy of this license, visit
# https://nvlabs.github.io/stylegan2/license.html
import numpy as np | |
import tensorflow as tf | |
import dnnlib | |
import dnnlib.tflib as tflib | |
from training import misc | |
#---------------------------------------------------------------------------- | |
class Projector:
    """Optimise a dlatent and the synthesis noise maps to reproduce a target image.

    Typical use: configure the public hyperparameters, call ``set_network(Gs)``
    once to build the TensorFlow graph, then either ``run(target_images)`` or
    ``start(target_images)`` followed by repeated ``step()`` calls. Results are
    read back with ``get_dlatents()``, ``get_noises()`` and ``get_images()``.
    """

    def __init__(self):
        # Public hyperparameters; read by set_network() / step().
        self.num_steps = 1000                # Number of optimisation steps per projection.
        self.dlatent_avg_samples = 10000     # Mapping-network samples used for the dlatent statistics.
        self.initial_learning_rate = 0.1     # Peak learning rate (before ramp-up / ramp-down scaling).
        self.initial_noise_factor = 0.05     # Initial dlatent noise, as a multiple of the dlatent std.
        self.lr_rampdown_length = 0.25       # Fraction of the run used for the final cosine ramp-down.
        self.lr_rampup_length = 0.05         # Fraction of the run used for the initial linear ramp-up.
        self.noise_ramp_length = 0.75        # Fraction of the run over which dlatent noise decays to zero.
        self.regularize_noise_weight = 1e5   # Weight of the noise regularisation term in the loss.
        self.verbose = False                 # When True, _info() prints progress messages.
        self.clone_net = True                # When True, set_network() works on a clone of Gs.

        # Internal state; populated by set_network() and start().
        self._Gs = None
        self._minibatch_size = None
        self._dlatent_avg = None
        self._dlatent_std = None
        self._noise_vars = None
        self._noise_init_op = None
        self._noise_normalize_op = None
        self._dlatents_var = None
        self._noise_in = None
        self._dlatents_expr = None
        self._images_expr = None
        self._target_images_var = None
        self._lpips = None
        self._dist = None
        self._loss = None
        self._reg_sizes = None
        self._lrate_in = None
        self._opt = None
        self._opt_step = None
        self._cur_step = None

    def _info(self, *args):
        """Print a progress message prefixed with 'Projector:' when verbose is set."""
        if self.verbose:
            print('Projector:', *args)

    def set_network(self, Gs, minibatch_size=1):
        """Attach a generator and build the projection graph.

        Args:
            Gs: Generator network object exposing ``clone()``, ``vars``,
                ``input_shapes`` and ``components.mapping`` / ``components.synthesis``.
                Passing ``None`` only records the (empty) network and returns.
            minibatch_size: Must be 1; any other value fails the assertion.
        """
        assert minibatch_size == 1
        self._Gs = Gs
        self._minibatch_size = minibatch_size
        if self._Gs is None:
            return
        if self.clone_net:
            # Work on a copy so that assigning the noise variables below does
            # not touch the caller's network.
            self._Gs = self._Gs.clone()

        # Find dlatent stats.
        self._info('Finding W midpoint and stddev using %d samples...' % self.dlatent_avg_samples)
        latent_samples = np.random.RandomState(123).randn(self.dlatent_avg_samples, *self._Gs.input_shapes[0][1:])
        dlatent_samples = self._Gs.components.mapping.run(latent_samples, None)[:, :1, :]  # [N, 1, 512]
        self._dlatent_avg = np.mean(dlatent_samples, axis=0, keepdims=True)  # [1, 1, 512]
        # Root of the mean (over samples) of the summed squared deviation, i.e.
        # a single scalar spread for the whole dlatent vector.
        self._dlatent_std = (np.sum((dlatent_samples - self._dlatent_avg) ** 2) / self.dlatent_avg_samples) ** 0.5
        self._info('std = %g' % self._dlatent_std)

        # Find noise inputs.
        self._info('Setting up noise inputs...')
        self._noise_vars = []
        noise_init_ops = []
        noise_normalize_ops = []
        while True:
            # Noise variables are looked up by consecutive index until one is missing.
            n = 'G_synthesis/noise%d' % len(self._noise_vars)
            if n not in self._Gs.vars:
                break
            v = self._Gs.vars[n]
            self._noise_vars.append(v)
            noise_init_ops.append(tf.assign(v, tf.random_normal(tf.shape(v), dtype=tf.float32)))
            noise_mean = tf.reduce_mean(v)
            noise_std = tf.reduce_mean((v - noise_mean)**2)**0.5
            noise_normalize_ops.append(tf.assign(v, (v - noise_mean) / noise_std))
            self._info(n, v)
        self._noise_init_op = tf.group(*noise_init_ops)
        self._noise_normalize_op = tf.group(*noise_normalize_ops)

        # Image output graph.
        self._info('Building image output graph...')
        self._dlatents_var = tf.Variable(tf.zeros([self._minibatch_size] + list(self._dlatent_avg.shape[1:])), name='dlatents_var')
        self._noise_in = tf.placeholder(tf.float32, [], name='noise_in')
        dlatents_noise = tf.random.normal(shape=self._dlatents_var.shape) * self._noise_in
        # The single optimised dlatent is tiled along axis 1 to the length the
        # synthesis network expects.
        self._dlatents_expr = tf.tile(self._dlatents_var + dlatents_noise, [1, self._Gs.components.synthesis.input_shape[1], 1])
        self._images_expr = self._Gs.components.synthesis.get_output_for(self._dlatents_expr, randomize_noise=False)

        # Downsample image to 256x256 if it's larger than that. VGG was built for 224x224 images.
        proc_images_expr = (self._images_expr + 1) * (255 / 2)
        sh = proc_images_expr.shape.as_list()
        if sh[2] > 256:
            factor = sh[2] // 256
            # Average-pool by `factor`; the width uses sh[3] so this matches the
            # target-image downsampling performed in start().
            proc_images_expr = tf.reduce_mean(tf.reshape(proc_images_expr, [-1, sh[1], sh[2] // factor, factor, sh[3] // factor, factor]), axis=[3, 5])

        # Loss graph.
        self._info('Building loss graph...')
        self._target_images_var = tf.Variable(tf.zeros(proc_images_expr.shape), name='target_images_var')
        if self._lpips is None:
            # NOTE(security): this fetches a pickle over plain HTTP; if
            # misc.load_pkl unpickles it (as the name suggests), a tampered
            # download could execute arbitrary code. Prefer a trusted/local
            # copy where possible -- TODO confirm and pin the source.
            self._lpips = misc.load_pkl('http://d36zk2xti64re0.cloudfront.net/stylegan1/networks/metrics/vgg16_zhang_perceptual.pkl')
        self._dist = self._lpips.get_output_for(proc_images_expr, self._target_images_var)
        self._loss = tf.reduce_sum(self._dist)

        # Noise regularization graph.
        self._info('Building noise regularization graph...')
        reg_loss = 0.0
        for v in self._noise_vars:
            sz = v.shape[2]
            while True:
                # Penalise correlation between each noise map and a copy of
                # itself shifted by one element along axis 3 and along axis 2.
                reg_loss += tf.reduce_mean(v * tf.roll(v, shift=1, axis=3))**2 + tf.reduce_mean(v * tf.roll(v, shift=1, axis=2))**2
                if sz <= 8:
                    break  # Small enough already
                v = tf.reshape(v, [1, 1, sz//2, 2, sz//2, 2])  # Downscale
                v = tf.reduce_mean(v, axis=[3, 5])
                sz = sz // 2
        self._loss += reg_loss * self.regularize_noise_weight

        # Optimizer.
        self._info('Setting up optimizer...')
        self._lrate_in = tf.placeholder(tf.float32, [], name='lrate_in')
        self._opt = dnnlib.tflib.Optimizer(learning_rate=self._lrate_in)
        self._opt.register_gradients(self._loss, [self._dlatents_var] + self._noise_vars)
        self._opt_step = self._opt.apply_updates()

    def run(self, target_images):
        """Project ``target_images`` to completion and return the results.

        Returns:
            A ``dnnlib.EasyDict`` with ``dlatents``, ``noises`` and ``images``.
        """
        # Run to completion.
        self.start(target_images)
        while self._cur_step < self.num_steps:
            self.step()

        # Collect results.
        pres = dnnlib.EasyDict()
        pres.dlatents = self.get_dlatents()
        pres.noises = self.get_noises()
        pres.images = self.get_images()
        return pres

    def start(self, target_images):
        """Load the target images and reset the optimisation state.

        Args:
            target_images: Array-like converted to float32. Values are mapped
                with ``(x + 1) * 255 / 2``; the first axis must equal the
                minibatch size given to ``set_network``.
        """
        assert self._Gs is not None

        # Prepare target images.
        self._info('Preparing target images...')
        target_images = np.asarray(target_images, dtype='float32')
        target_images = (target_images + 1) * (255 / 2)
        sh = target_images.shape
        assert sh[0] == self._minibatch_size
        if sh[2] > self._target_images_var.shape[2]:
            # Average-pool the target down to the resolution of the loss input.
            factor = sh[2] // self._target_images_var.shape[2]
            target_images = np.reshape(target_images, [-1, sh[1], sh[2] // factor, factor, sh[3] // factor, factor]).mean((3, 5))

        # Initialize optimization state.
        self._info('Initializing optimization state...')
        tflib.set_vars({self._target_images_var: target_images, self._dlatents_var: np.tile(self._dlatent_avg, [self._minibatch_size, 1, 1])})
        tflib.run(self._noise_init_op)
        self._opt.reset_optimizer_state()
        self._cur_step = 0

    def _schedule(self, t):
        """Return ``(noise_strength, learning_rate)`` for progress ``t`` = cur_step / num_steps.

        A ramp length of exactly 0 disables that ramp instead of dividing by
        zero: no dlatent noise, no learning-rate ramp-down, no ramp-up.
        Non-zero lengths produce the same values as the inline formulas did.
        """
        if self.noise_ramp_length != 0:
            noise_ramp = max(0.0, 1.0 - t / self.noise_ramp_length) ** 2
        else:
            noise_ramp = 0.0
        noise_strength = self._dlatent_std * self.initial_noise_factor * noise_ramp

        if self.lr_rampdown_length != 0:
            lr_ramp = min(1.0, (1.0 - t) / self.lr_rampdown_length)
        else:
            lr_ramp = 1.0
        # Cosine shaping of the ramp-down portion.
        lr_ramp = 0.5 - 0.5 * np.cos(lr_ramp * np.pi)
        if self.lr_rampup_length != 0:
            lr_ramp = lr_ramp * min(1.0, t / self.lr_rampup_length)
        learning_rate = self.initial_learning_rate * lr_ramp
        return noise_strength, learning_rate

    def step(self):
        """Run one optimisation step; does nothing once ``num_steps`` is reached.

        ``start()`` must have been called first (asserted via ``_cur_step``).
        """
        assert self._cur_step is not None
        if self._cur_step >= self.num_steps:
            return
        if self._cur_step == 0:
            self._info('Running...')

        # Hyperparameters.
        t = self._cur_step / self.num_steps
        noise_strength, learning_rate = self._schedule(t)

        # Train.
        feed_dict = {self._noise_in: noise_strength, self._lrate_in: learning_rate}
        _, dist_value, loss_value = tflib.run([self._opt_step, self._dist, self._loss], feed_dict)
        # Re-normalise every noise map after the update.
        tflib.run(self._noise_normalize_op)

        # Print status.
        self._cur_step += 1
        if self._cur_step == self.num_steps or self._cur_step % 10 == 0:
            self._info('%-8d%-12g%-12g' % (self._cur_step, dist_value, loss_value))
        if self._cur_step == self.num_steps:
            self._info('Done.')

    def get_cur_step(self):
        """Return the current step counter (None before start() is called)."""
        return self._cur_step

    def get_dlatents(self):
        """Evaluate the tiled dlatents with the dlatent noise strength fed as 0."""
        return tflib.run(self._dlatents_expr, {self._noise_in: 0})

    def get_noises(self):
        """Evaluate and return the current values of all noise variables."""
        return tflib.run(self._noise_vars)

    def get_images(self):
        """Evaluate the synthesis output with the dlatent noise strength fed as 0."""
        return tflib.run(self._images_expr, {self._noise_in: 0})
#---------------------------------------------------------------------------- | |