# -*- coding: utf-8 -*-
"""
Created on Mon May 1 07:55:45 2023

@author: Bernd Ebenhoch

Streamlit app: simulate A/B ad optimization with policy-gradient
reinforcement learning.  A one-parameter sigmoid network chooses between
ad A and ad B; simulated clicks (Bernoulli rewards) steer the policy.
"""

import copy

import matplotlib.pyplot as plt
import numpy as np
import streamlit as st
import tensorflow as tf
from tensorflow import keras

plt.style.use('mystyle.mplstyle')

# Defining the neural network as the agent to choose ad scheme A (0) or B (1):
# a single sigmoid unit, so its output is directly the probability of B.
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(1, activation="sigmoid", input_shape=(1,)))
model.summary()


@tf.function()
def action_selection(model):
    """Sample an action from the policy and return its gradients.

    Returns a tuple (output, action, loss, grads):
      output -- model output in [0, 1], interpreted as P(action = B)
      action -- boolean (1, 1) tensor, sampled action (False = A, True = B)
      loss   -- binary cross-entropy between sampled action and output
      grads  -- [dloss/dw, dloss/db] w.r.t. the trainable parameters
    """
    # Using GradientTape to automatically build gradients with TensorFlow.
    with tf.GradientTape() as tape:
        # As we have no information about the user viewing the ad,
        # the input to the neural network is always the same: 0.
        output = model(np.array([[0.0]]))  # [0 ... 1]

        # The output of the neural network is considered the probability of
        # taking action B (1).  We compare it with a uniform random variable:
        # for example, if the output is 0.8, there is an 80% chance the
        # random variable is smaller (take action B) and a 20% chance it is
        # larger (take action A).
        action = (tf.random.uniform((1, 1)) < output)  # [0 or 1]

        # The loss value measures the difference between output and action.
        loss = tf.reduce_mean(
            tf.keras.losses.binary_crossentropy(action, output))

    # Gradients [dloss/dw, dloss/db] of the loss w.r.t. the parameters.
    grads = tape.gradient(loss, model.trainable_variables)
    return output, action, loss, grads


st.markdown(
    'Simulate A/B optimization with policy gradient reinforcement learning')

lr = float(st.text_input('Learning rate', value=0.5))
prob_A = float(st.text_input('Click probability of ad A', value=0.3))
prob_B = float(st.text_input('Click probability of ad B', value=0.4))
steps = int(st.text_input('Number of ad impressions (steps)', value=1000))

# Per-step log columns: output, action, loss, grads (w, b), reward,
# adjusted grads (w, b), model parameters (w, b).
information_for_plotting = np.zeros((steps, 10))

if st.button('Run the ad campaign and display the results'):
    with st.spinner('Simulating the ad campaign may take a few seconds ...'):
        for step in range(steps):
            # The neural network chooses the action; to display the learning
            # progress we also record the model output, loss and gradients.
            output, action, loss, grads = action_selection(model)

            # Apply the action by displaying ad A or B.  As we do not want
            # to wait for a real user to click, we simulate a click with the
            # configured probability of the chosen ad (prob_A / prob_B) and
            # treat the click as the reward for training.
            if action:  # Action B
                reward = float(np.random.random() < prob_B)
            else:  # Action A
                reward = float(np.random.random() < prob_A)

            # Scale the gradients by the reward mapped to {-1, +1}:
            # gradients for actions that led to clicks are kept unchanged,
            # gradients for actions that did not are reversed.
            grads_adjusted = [(reward - 0.5) * 2 * g for g in grads]

            # Standard gradient-descent update of the model parameters.
            for var, grad in zip(model.trainable_variables, grads_adjusted):
                var.assign(var - lr * grad)

            information_for_plotting[step, 0] = output.numpy()[0]
            information_for_plotting[step, 1] = action.numpy()[0].astype(int)
            information_for_plotting[step, 2] = loss
            information_for_plotting[step, 3] = grads[0]
            information_for_plotting[step, 4] = grads[1]
            information_for_plotting[step, 5] = reward
            information_for_plotting[step, 6] = grads_adjusted[0]
            information_for_plotting[step, 7] = grads_adjusted[1]
            # Read the current parameter values (weight, bias) as numbers.
            information_for_plotting[step, 8] = model.trainable_variables[0].numpy()
            information_for_plotting[step, 9] = model.trainable_variables[1].numpy()

    # Plot the results: one stacked subplot per recorded quantity.
    titles = ['Model Output', 'Action', 'Loss', 'Gradients',
              'Rewards', 'Adjusted Gradients', 'Model Parameters']
    # Column offset of each panel's data within information_for_plotting.
    plus = [0, 0, 0, 0, 1, 1, 2]
    fig = plt.figure(figsize=(12, 26))
    fig.subplots(7, 1, sharex=True)
    for i in range(7):
        plt.subplot(7, 1, i + 1)
        plt.subplots_adjust(hspace=.0)
        if i in [0, 1, 2, 4]:
            # Single-curve panels.
            plt.plot(information_for_plotting[:, i + plus[i]])
            plt.gca().yaxis.set_major_formatter(plt.FormatStrFormatter('%.2f'))
        else:
            # Two-curve panels: weight and bias.
            plt.plot(information_for_plotting[:, i + 1 + plus[i]], label='Bias')
            plt.plot(information_for_plotting[:, i + plus[i]], label='Weight')
            plt.legend(loc="upper left")
            plt.gca().yaxis.set_major_formatter(plt.FormatStrFormatter('%.2f'))
        plt.ylabel(titles[i])
    plt.xlabel('Step')
    plt.show()

    # Sum of the total clicks obtained.
    st.markdown('Your ad campaign received **' +
                str(int(information_for_plotting[:, 5].sum())) +
                '** clicks in total.')
    st.pyplot(fig)