Supports max_exploration_probability_hint in NeuralFalconAgent
PiperOrigin-RevId: 518823254
Change-Id: I7517c12e274821bfd4330d77d8228e7484fcb227
TF-Agents Team authored and Copybara-Service committed Mar 23, 2023
1 parent c5c2ccd commit 444eedb
Showing 2 changed files with 90 additions and 11 deletions.
14 changes: 14 additions & 0 deletions tf_agents/bandits/agents/neural_falcon_agent.py
@@ -51,6 +51,9 @@ def __init__(
optimizer: types.Optimizer,
num_samples_list: Sequence[tf.Variable],
exploitation_coefficient: types.FloatOrReturningFloat = 1.0,
max_exploration_probability_hint: Optional[
types.FloatOrReturningFloat
] = None,
observation_and_action_constraint_splitter: Optional[
types.Splitter] = None,
accepts_per_arm_features: bool = False,
@@ -88,6 +91,16 @@ def __init__(
exploitative the policy behaves with respect to the predicted rewards: A
larger value makes the policy sample the greedy action (one with the
best predicted reward) with a higher probability.
max_exploration_probability_hint: An optional float, representing a hint
on the maximum exploration probability, internally clipped to [0, 1].
When this argument is set, `exploitation_coefficient` is ignored and the
policy attempts to choose non-greedy actions with at most this
probability. When such an upper bound cannot be achieved, e.g. due to
insufficient training data, the policy attempts to minimize the
probability of choosing non-greedy actions on a best-effort basis. For a
demonstration of how it affects the policy behavior, see the unit test
`testTrainedPolicyWithMaxExplorationProbabilityHint` in
`neural_falcon_agent_test`.
observation_and_action_constraint_splitter: A function used for masking
valid/invalid actions with each state of the environment. The function
takes in a full observation and returns a tuple consisting of 1) the
@@ -155,6 +168,7 @@ def __init__(
action_spec,
reward_network,
exploitation_coefficient=exploitation_coefficient,
max_exploration_probability_hint=max_exploration_probability_hint,
observation_and_action_constraint_splitter=(
observation_and_action_constraint_splitter
),
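For readers outside the diff, here is a minimal construction sketch showing the new argument in use. It is not part of this commit: it assumes a simple 3-action, non-per-arm setup, and the specs, QNetwork reward network, layer sizes, and variable names are illustrative placeholders chosen to mirror the constructor arguments visible in the unit test below.

import tensorflow as tf
from tf_agents.bandits.agents import neural_falcon_agent
from tf_agents.networks import q_network
from tf_agents.specs import tensor_spec
from tf_agents.trajectories import time_step as ts

num_actions = 3
observation_spec = tensor_spec.TensorSpec([4], tf.float32)
time_step_spec = ts.time_step_spec(observation_spec)
action_spec = tensor_spec.BoundedTensorSpec(
    dtype=tf.int32, shape=(), minimum=0, maximum=num_actions - 1)

# Assumed reward network: any network emitting one predicted reward per action.
reward_net = q_network.QNetwork(
    observation_spec, action_spec, fc_layer_params=(16,))

agent = neural_falcon_agent.NeuralFalconAgent(
    time_step_spec,
    action_spec,
    reward_net,
    optimizer=tf.compat.v1.train.GradientDescentOptimizer(learning_rate=0.01),
    # One sample counter per action, as in the unit test below.
    num_samples_list=[
        tf.compat.v2.Variable(0, dtype=tf.int64, name='num_samples_%d' % k)
        for k in range(num_actions)
    ],
    # With the hint set, `exploitation_coefficient` is ignored and, given
    # enough training data, non-greedy actions are chosen at most ~5% of
    # the time.
    max_exploration_probability_hint=0.05,
)

After training, agent.policy.action(...) then samples the greedy action with probability at least roughly 0.95, which is the behavior exercised by testTrainedPolicyWithMaxExplorationProbabilityHint below.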
87 changes: 76 additions & 11 deletions tf_agents/bandits/agents/neural_falcon_agent_test.py
@@ -15,8 +15,9 @@

"""Tests for neural_falcon_agent."""

from absl.testing import parameterized
from typing import Optional

from absl.testing import parameterized
import numpy as np
import tensorflow as tf # pylint: disable=g-explicit-tensorflow-version-import
from tf_agents.bandits.agents import neural_falcon_agent
@@ -92,7 +93,10 @@ def _check_uniform_actions(self, actions: np.ndarray,

def _create_agent(
self,
accepts_per_arm_features: bool) -> neural_falcon_agent.NeuralFalconAgent:
accepts_per_arm_features: bool,
exploitation_coefficient: Optional[float] = 10000.0,
max_exploration_probability_hint: Optional[float] = None,
) -> neural_falcon_agent.NeuralFalconAgent:
if accepts_per_arm_features:
optimizer = tf.compat.v1.train.GradientDescentOptimizer(
learning_rate=1e-2)
@@ -117,10 +121,14 @@ def _create_agent(
num_samples_list=[
tf.compat.v2.Variable(0, dtype=tf.int64, name='num_samples')
],
exploitation_coefficient=10000.0,
emit_policy_info=(policy_utils.InfoFields.LOG_PROBABILITY,
policy_utils.InfoFields.PREDICTED_REWARDS_MEAN),
optimizer=optimizer)
exploitation_coefficient=exploitation_coefficient,
max_exploration_probability_hint=max_exploration_probability_hint,
emit_policy_info=(
policy_utils.InfoFields.LOG_PROBABILITY,
policy_utils.InfoFields.PREDICTED_REWARDS_MEAN,
),
optimizer=optimizer,
)
else:
optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=1.0)
reward_net = DummyNet(self._observation_spec, self._action_spec)
@@ -131,12 +139,16 @@ def _create_agent(
num_samples_list=[
tf.compat.v2.Variable(0, dtype=tf.int64, name='num_samples_0'),
tf.compat.v2.Variable(0, dtype=tf.int64, name='num_samples_1'),
tf.compat.v2.Variable(0, dtype=tf.int64, name='num_samples_2')
tf.compat.v2.Variable(0, dtype=tf.int64, name='num_samples_2'),
],
exploitation_coefficient=10000.0,
emit_policy_info=(policy_utils.InfoFields.LOG_PROBABILITY,
policy_utils.InfoFields.PREDICTED_REWARDS_MEAN),
optimizer=optimizer)
exploitation_coefficient=exploitation_coefficient,
max_exploration_probability_hint=max_exploration_probability_hint,
emit_policy_info=(
policy_utils.InfoFields.LOG_PROBABILITY,
policy_utils.InfoFields.PREDICTED_REWARDS_MEAN,
),
optimizer=optimizer,
)
return agent

def _generate_observations(
@@ -283,5 +295,58 @@ def testTrainedPolicy(self, accepts_per_arm_features):
self.assertAllEqual(actions,
np.argmax(p_info.predicted_rewards_mean, axis=1))

@parameterized.named_parameters(
{
'testcase_name': 'accepts_per_arm_features',
'accepts_per_arm_features': True,
},
{'testcase_name': 'simple_action', 'accepts_per_arm_features': False},
)
def testTrainedPolicyWithMaxExplorationProbabilityHint(
self, accepts_per_arm_features
):
# Creates two agents, with and without `max_exploration_probability_hint`.
agent_with_limited_exploration = self._create_agent(
accepts_per_arm_features,
exploitation_coefficient=0.1,
max_exploration_probability_hint=0.05,
)
agent = self._create_agent(
accepts_per_arm_features, exploitation_coefficient=0.1
)
# Initialize all variables and train the agents over one batch.
self.evaluate(tf.compat.v1.global_variables_initializer())
experience = self._generate_training_experience(accepts_per_arm_features)
self.evaluate(agent_with_limited_exploration.train(experience, None).loss)
self.evaluate(agent.train(experience, None).loss)

batch_size = 1000
observations = self._generate_observations(
batch_size, accepts_per_arm_features
)
time_step = ts.restart(observations, batch_size)

# With `max_exploration_probability_hint` set to 5%, the trained agent is
# expected to choose greedy actions at least 95% of the time.
action_step = agent_with_limited_exploration.policy.action(
time_step, seed=1
)
actions = self.evaluate(action_step.action)
p_info = self.evaluate(action_step.info)
greedy_actions = np.argmax(p_info.predicted_rewards_mean, axis=1)
self.assertNotAllEqual(actions, greedy_actions)
# Sets the threshold to be smaller than 95% to be robust against random
# sampling in the test.
self.assertGreater(np.sum(actions == greedy_actions), batch_size * 0.9)

# Without `max_exploration_probability_hint`, the agent is expected to
# explore heavily because it has seen very little training data.
action_step = agent.policy.action(time_step, seed=1)
actions = self.evaluate(action_step.action)
p_info = self.evaluate(action_step.info)
greedy_actions = np.argmax(p_info.predicted_rewards_mean, axis=1)
self.assertLess(np.sum(actions == greedy_actions), batch_size * 0.6)


if __name__ == '__main__':
tf.test.main()
