Supports max_exploration_probability_hint in `FalconRewardPredictionPolicy`

This new parameter lets users configure the behavior of the policy by expressing their tolerance for the amount of exploration, which may be more tangible and hence easier to determine than `exploitation_coefficient`. It is an optional float, representing a hint on the maximum exploration probability, internally clipped to [0, 1]. When it is set, `exploitation_coefficient` is ignored and the policy attempts to choose non-greedy actions with at most this probability. When such an upper bound cannot be achieved, e.g. due to insufficient training data, the policy attempts to minimize the probability of choosing non-greedy actions on a best-effort basis.

PiperOrigin-RevId: 516896185
Change-Id: I0a3c27ce23dec426d0293d846fdf305b7caa247b
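
For orientation, a minimal construction sketch is shown below. The argument names mirror the constructor signature in this change; the specs, the QNetwork reward model, and the 0.05 hint value are illustrative assumptions, not part of the commit:

import tensorflow as tf
from tf_agents.bandits.policies import falcon_reward_prediction_policy
from tf_agents.networks import q_network
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts

# Hypothetical specs: 4-dimensional float observations and 3 arms.
observation_spec = tensor_spec.TensorSpec([4], tf.float32)
time_step_spec = ts.time_step_spec(observation_spec)
action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, minimum=0, maximum=2)

# Any reward-predicting network works; a small QNetwork keeps the sketch short.
reward_net = q_network.QNetwork(
    observation_spec, action_spec, fc_layer_params=(16,))

policy = falcon_reward_prediction_policy.FalconRewardPredictionPolicy(
    time_step_spec=time_step_spec,
    action_spec=action_spec,
    reward_network=reward_net,
    # With the hint set, `exploitation_coefficient` is ignored and the policy
    # attempts to keep the probability of non-greedy actions at or below 5%.
    max_exploration_probability_hint=0.05,
)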
TF-Agents Team authored and Copybara-Service committed Mar 15, 2023
1 parent dd0c0f8 commit a8cef4c
Showing 4 changed files with 454 additions and 98 deletions.
9 changes: 6 additions & 3 deletions tf_agents/bandits/agents/neural_falcon_agent.py
@@ -154,11 +154,14 @@ def __init__(
time_step_spec,
action_spec,
reward_network,
exploitation_coefficient,
observation_and_action_constraint_splitter,
exploitation_coefficient=exploitation_coefficient,
observation_and_action_constraint_splitter=(
observation_and_action_constraint_splitter
),
constraints=constraints,
accepts_per_arm_features=accepts_per_arm_features,
emit_policy_info=emit_policy_info,
num_samples_list=num_samples_list)
num_samples_list=num_samples_list,
)

self._collect_policy = self._policy
4 changes: 2 additions & 2 deletions tf_agents/bandits/agents/neural_falcon_agent_test.py
@@ -262,7 +262,7 @@ def testTrainedPolicy(self, accepts_per_arm_features):
# Train the policy.
# Initialize all variables
self.evaluate(tf.compat.v1.global_variables_initializer())
for _ in range(20):
for _ in range(80):
self.evaluate(
agent.train(
self._generate_training_experience(accepts_per_arm_features),
@@ -278,7 +278,7 @@ def testTrainedPolicy(self, accepts_per_arm_features):
actions = self.evaluate(action_step.action)
p_info = self.evaluate(action_step.info)
# Check the log probabilities in the policy info are near greedy.
self.assertAllClose(p_info.log_probability, [0.0] * batch_size, atol=5e-3)
self.assertAllClose(p_info.log_probability, [0.0] * batch_size, atol=1e-2)
# Check the chosen arms are greedy.
self.assertAllEqual(actions,
np.argmax(p_info.predicted_rewards_mean, axis=1))
247 changes: 211 additions & 36 deletions tf_agents/bandits/policies/falcon_reward_prediction_policy.py
@@ -33,6 +33,19 @@
from tf_agents.typing import types


# An upper bound of the gamma parameter. Without an upper bound, the probability
# of choosing non-greedy actions vanishes as the training data size increases,
# even when non-greedy actions have almost the same predicted rewards as the
# greedy action.
_MAX_GAMMA = 50000.0


# When trying to satisfy the constraint on the maximum exploration probability,
# the policy searches for the most suitable `exploitation_coefficient` on a grid
# in log2 scale between 0 and this value inclusively.
_MAX_LOG2_EXPLOITATION_COEF = 14


def get_number_of_trainable_elements(network: types.Network) -> types.Float:
"""Gets the total # of elements in the network's trainable variables.
@@ -53,22 +66,117 @@ def get_number_of_trainable_elements(network: types.Network) -> types.Float:
return sum(num_elements_list)


def _find_action_probabilities(
greedy_action_prob: types.Tensor,
other_actions_probs: types.Tensor,
max_exploration_prob: float,
):
"""Finds action probabilities satisfying `max_exploration_prob`.
Given action probabilities calculated by different values of the gamma
parameter, this function attempts to find action probabilities at a specific
gamma value such that non-greedy actions are chosen with at most
`max_exploration_prob` probability. If such an upper bound can be achieved,
the return maximizes the exploration probability subject to the upper bound.
Otherwise, it minimizes the exploration probability on a best-effort basis.
Args:
greedy_action_prob: A tensor shaped as [batch_size, d], the probabilities of
choosing the greedy action under `d` different values of gamma.
other_actions_probs: A tensor shaped as [batch_size, num_actions, d], all
non-greedy action probabilities under `d` different values of gamma. The
last dimension is assumed to be aligned with that of `greedy_action_prob`.
max_exploration_prob: A float, the maximum probability of choosing
non-greedy actions.
Returns:
A tuple of two tensors for the greedy action probability and non-greedy
actions probabilities, shaped as [batch_size, 1] and
[batch_size, num_actions], respectively.
"""
if greedy_action_prob.shape.rank != 2:
raise ValueError(
'`greedy_action_prob` is expected to be rank-2, but found otherwise:'
f' {greedy_action_prob}'
)
if other_actions_probs.shape.rank != 3:
raise ValueError(
'`other_actions_probs` is expected to be rank-3, but found otherwise:'
f' {other_actions_probs}'
)
if greedy_action_prob.shape[-1] != other_actions_probs.shape[-1]:
raise ValueError(
'`greedy_action_prob` and `other_actions_probs` are '
'expected to have the same last dimension, but found '
f'otherwise. `greedy_action_prob`: {greedy_action_prob}'
f', `other_actions_probs`: {other_actions_probs}'
)

# A [batch_size, d] bool tensor indicating which elements of
# `greedy_action_prob` satisfy the `max_exploration_prob` constraint.
valid_gamma_mask = tf.greater_equal(
greedy_action_prob, 1.0 - max_exploration_prob
)
# A [batch_size] bool tensor indicating the batch members that have at least
# one valid entry in `greedy_action_prob`.
feasible = tf.greater(
tf.reduce_sum(tf.cast(valid_gamma_mask, tf.float32), axis=1), 0.0
)
# We mask the probability entries corresponding to invalid gamma values by 2.0
# so that they will not be selected as minimizers. See further details in the
# comment below.
greedy_action_prob_masked = tf.where(
valid_gamma_mask,
greedy_action_prob,
2.0 * tf.ones_like(greedy_action_prob),
)
# For batch members where the `max_exploration_prob` constraint is feasible,
# we maximize the exploration probability (or equivalently, minimize the
# greedy action probability) subject to the constraint via masking.
# For batch members where the `max_exploration_prob` constraint is infeasible,
# we simply minimize the exploration probability (or equivalently, maximize
# the greedy action probability).
gamma_indices = tf.where(
feasible,
tf.argmin(greedy_action_prob_masked, axis=1),
tf.argmax(greedy_action_prob, axis=1),
)
gamma_indices = tf.expand_dims(gamma_indices, axis=-1)
greedy_action_prob = tf.gather(
greedy_action_prob, gamma_indices, axis=1, batch_dims=1
)
num_actions = tf.shape(other_actions_probs)[1]
other_actions_probs = tf.gather(
other_actions_probs,
tf.tile(gamma_indices, [1, num_actions]),
axis=2,
batch_dims=2,
)
return greedy_action_prob, other_actions_probs


class FalconRewardPredictionPolicy(
reward_prediction_base_policy.RewardPredictionBasePolicy):
"""Policy that samples actions based on the FALCON algorithm."""

def __init__(self,
time_step_spec: types.TimeStep,
action_spec: types.NestedTensorSpec,
reward_network: types.Network,
exploitation_coefficient: types.FloatOrReturningFloat = 1.0,
observation_and_action_constraint_splitter: Optional[
types.Splitter] = None,
accepts_per_arm_features: bool = False,
constraints: Iterable[constr.BaseConstraint] = (),
emit_policy_info: Tuple[Text, ...] = (),
num_samples_list: Sequence[tf.Variable] = (),
name: Optional[Text] = None):
def __init__(
self,
time_step_spec: types.TimeStep,
action_spec: types.NestedTensorSpec,
reward_network: types.Network,
exploitation_coefficient: Optional[types.FloatOrReturningFloat] = 1.0,
max_exploration_probability_hint: Optional[
types.FloatOrReturningFloat
] = None,
observation_and_action_constraint_splitter: Optional[
types.Splitter
] = None,
accepts_per_arm_features: bool = False,
constraints: Iterable[constr.BaseConstraint] = (),
emit_policy_info: Tuple[Text, ...] = (),
num_samples_list: Sequence[tf.Variable] = (),
name: Optional[Text] = None,
):
"""Builds a FalconRewardPredictionPolicy given a reward network.
This policy takes a tf_agents.Network predicting rewards and samples an
@@ -90,6 +198,16 @@ def __init__(self,
exploitative the policy behaves w.r.t the predicted rewards: A larger
value makes the policy sample the greedy action (one with the best
predicted reward) with a higher probability.
max_exploration_probability_hint: An optional float, representing a hint
on the maximum exploration probability, internally clipped to [0, 1].
When this argument is set, `exploitation_coefficient` is ignored and the
policy attempts to choose non-greedy actions with at most this
probability. When such an upper bound cannot be achieved, e.g. due to
insufficient training data, the policy attempts to minimize the
probability of choosing non-greedy actions on a best-effort basis. For a
demonstration of how it affects the policy behavior, see the unit test
`testMaxExplorationProbabilityHint` in
`falcon_reward_prediction_policy_test`.
observation_and_action_constraint_splitter: A function used for masking
valid/invalid actions with each state of the environment. The function
takes in a full observation and returns a tuple consisting of 1) the
@@ -123,6 +241,7 @@ def __init__(self,
emit_policy_info, name)

self._exploitation_coefficient = exploitation_coefficient
self._max_exploration_probability_hint = max_exploration_probability_hint
if num_samples_list:
self._num_samples_list = num_samples_list
else:
@@ -171,37 +290,56 @@ def _get_number_of_allowed_actions(
if mask is None else tf.reduce_sum(
tf.cast(tf.cast(mask, tf.bool), tf.float32), axis=1))

def _compute_gamma(self, mask: Optional[types.Tensor],
dtype: tf.DType) -> types.Float:
"""Computes the gamma parameter in the sampling probability.
def _compute_gamma(
self, mask: Optional[types.Tensor], dtype: tf.DType, batch_size: int
) -> types.Float:
"""Computes the gamma parameter(s) in the sampling probability.
This helper method implements a simple heuristic for computing the
gamma parameter in Step 2 of Algorithm 1 in the paper
https://arxiv.org/pdf/2003.12699.pdf. A higher gamma makes the action
sampling distribution concentrate more on the greedy action.
Args:
mask: An optional mask represented by a tensor shaped as
[batch_size, num_actions].
mask: An optional mask represented by a tensor shaped as [batch_size,
num_actions].
dtype: Type of the returned value, expected to be a float type.
batch_size: The batch size.
Returns:
The gamma parameter.
The gamma parameter shaped as [batch_size, d], where d = 1 if
self._max_exploration_probability_hint is unset, and d > 1 otherwise. In
the latter case, the second dimension gives gamma parameters calculated on
a 1-D grid of `exploitation_coefficient` in log2 scale between 0 and
`_MAX_LOG2_EXPLOITATION_COEF` inclusively, and `d` corresponds to the grid
size.
"""
num_samples_list_float = tf.maximum(
[tf.cast(x.read_value(), tf.float32) for x in self.num_samples_list],
0.0)
num_trainable_elements_float = tf.cast(
tf.math.maximum(self.num_trainable_elements, 1), tf.float32)
num_allowed_actions = self._get_number_of_allowed_actions(mask)
return self._get_exploitation_coefficient() * tf.sqrt(
num_allowed_actions * tf.reduce_sum(num_samples_list_float) /
num_trainable_elements_float)
exploitation_coefficient = (
self._get_exploitation_coefficient()
if self._max_exploration_probability_hint is None
else tf.pow(2.0, range(_MAX_LOG2_EXPLOITATION_COEF + 1))
)
gamma = tf.sqrt(
num_allowed_actions
* tf.reduce_sum(num_samples_list_float)
/ num_trainable_elements_float
)
return tf.minimum(
_MAX_GAMMA,
tf.reshape(gamma, [-1, 1])
* tf.ones(shape=[batch_size, 1], dtype=dtype)
* tf.reshape(exploitation_coefficient, [1, -1]),
)

def _action_distribution(self, mask, predicted_rewards):
gamma = tf.expand_dims(
self._compute_gamma(mask, predicted_rewards.dtype), axis=-1)
batch_size = tf.shape(predicted_rewards)[0]
gamma = self._compute_gamma(mask, predicted_rewards.dtype, batch_size)
# Replace predicted rewards of masked actions with -inf.
predictions = predicted_rewards if mask is None else tf.where(
tf.cast(mask, tf.bool), predicted_rewards, -float('Inf') *
@@ -211,30 +349,67 @@ def _action_distribution(self, mask, predicted_rewards):
greedy_action_predictions = tf.reshape(
tf.reduce_max(predictions, axis=-1), shape=[-1, 1])

# `other_actions_probs` is a tensor shaped as [batch_size, num_actions] that
# contains valid sampling probabilities for all non-greedy actions.
num_allowed_actions = tf.expand_dims(
self._get_number_of_allowed_actions(mask), axis=-1)
# `other_actions_probs` is a tensor shaped as [batch_size, num_actions, d]
# that contains valid sampling probabilities for all non-greedy actions.
# The last dimension corresponds to different gamma parameters.
if mask is not None:
num_allowed_actions = tf.reshape(
self._get_number_of_allowed_actions(mask), [batch_size, 1, 1]
)
else:
num_allowed_actions = self._get_number_of_allowed_actions(mask)
prediction_delta = greedy_action_predictions - predictions
other_actions_probs = tf.math.divide_no_nan(
1.0,
num_allowed_actions + gamma * (greedy_action_predictions - predictions))
num_allowed_actions
+ tf.matmul(
tf.expand_dims(prediction_delta, axis=-1),
tf.expand_dims(gamma, axis=1),
),
)
# Although `predictions` has accounted for the action mask, we still need
# to mask the action probabilities in the case of zero gamma.
other_actions_probs = (
other_actions_probs if mask is None else tf.where(
tf.cast(mask, tf.bool), other_actions_probs,
tf.zeros_like(other_actions_probs)))
if mask is not None:
other_actions_probs = tf.where(
tf.repeat(
input=tf.expand_dims(tf.cast(mask, tf.bool), axis=-1),
repeats=[tf.shape(other_actions_probs)[-1]],
axis=2,
),
other_actions_probs,
tf.zeros_like(other_actions_probs),
)

# Get the greedy action.
greedy_actions = tf.reshape(
tf.argmax(predictions, axis=-1, output_type=self.action_spec.dtype),
[-1, 1])

# Compute the probabilities of sampling the greedy actions, which is
# 1 - (the total probability of sampling other actions).
greedy_action_prob = 1.0 - tf.reshape(
tf.reduce_sum(other_actions_probs, axis=1), [-1, 1]) + tf.gather(
other_actions_probs, greedy_actions, axis=1, batch_dims=1)
# 1 - (the total probability of sampling other actions),
# shaped [batch_size, d].
greedy_action_prob = (
1.0
- tf.reduce_sum(other_actions_probs, axis=1)
+ tf.squeeze(
tf.gather(
other_actions_probs, greedy_actions, axis=1, batch_dims=1
),
axis=1,
)
)

if self._max_exploration_probability_hint is not None:
max_exploration_prob = tf.clip_by_value(
self._max_exploration_probability_hint,
clip_value_min=0.0,
clip_value_max=1.0,
)
greedy_action_prob, other_actions_probs = _find_action_probabilities(
greedy_action_prob, other_actions_probs, max_exploration_prob
)
else:
other_actions_probs = tf.squeeze(other_actions_probs, axis=2)

# Compute the sampling probabilities for all actions by combining
# `greedy_action_prob` and `other_actions_probs`.
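
To make the new selection logic concrete, the following standalone sketch mimics what `_compute_gamma` and `_find_action_probabilities` do for a single example: the FALCON rule gives each non-greedy arm probability 1 / (K + gamma * (greedy prediction - its prediction)), gamma is evaluated on a grid of exploitation coefficients 2^0 .. 2^14 capped at `_MAX_GAMMA`, and the most exploratory feasible candidate is kept (or the least exploratory one if none is feasible). The predicted rewards, the data-dependent scale factor, and the hint value are made up for illustration; the real code operates on batched tensors.

import numpy as np

# Illustrative inputs. `data_scale` stands in for
# sqrt(num_allowed_actions * num_samples / num_trainable_elements).
predicted_rewards = np.array([0.9, 0.7, 0.2])
data_scale = 4.0
max_exploration_prob = 0.1

greedy = np.argmax(predicted_rewards)
delta = predicted_rewards[greedy] - predicted_rewards  # zero for the greedy arm
num_actions = len(predicted_rewards)

def action_probs(gamma):
  # FALCON sampling rule: each non-greedy arm gets 1 / (K + gamma * delta);
  # the greedy arm receives the remaining probability mass.
  probs = 1.0 / (num_actions + gamma * delta)
  probs[greedy] = 1.0 - (probs.sum() - probs[greedy])
  return probs

# Gamma candidates from exploitation coefficients 2^0 .. 2^14, capped at 50000.
gammas = np.minimum(50000.0, data_scale * np.power(2.0, np.arange(15)))
candidates = [action_probs(g) for g in gammas]
exploration = np.array([1.0 - p[greedy] for p in candidates])

# Keep the most exploratory candidate that satisfies the hint; if none does,
# fall back to the least exploratory one (best effort).
feasible = exploration <= max_exploration_prob
if feasible.any():
  chosen = int(np.argmax(np.where(feasible, exploration, -np.inf)))
else:
  chosen = int(np.argmin(exploration))
print('gamma:', gammas[chosen], 'action probabilities:', candidates[chosen])

With these illustrative numbers the selected grid point is the exploitation coefficient 2^4 (gamma = 64), which explores with probability roughly 0.084, just under the 0.1 hint.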
