neural_epsilon_greedy_agent.py
# coding=utf-8
# Copyright 2020 The TF-Agents Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""A neural network based agent that implements epsilon greedy exploration.
Implements an agent based on a neural network that predicts arm rewards.
The policy adds epsilon greedy exploration.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from typing import Iterable, Optional, Sequence, Text, Tuple

import gin
import tensorflow as tf

from tf_agents.bandits.agents import greedy_reward_prediction_agent
from tf_agents.bandits.policies import constraints as constr
from tf_agents.policies import epsilon_greedy_policy
from tf_agents.typing import types


@gin.configurable
class NeuralEpsilonGreedyAgent(
    greedy_reward_prediction_agent.GreedyRewardPredictionAgent
):
"""A neural network based epsilon greedy agent.
This agent receives a neural network that it trains to predict rewards. The
action is chosen greedily with respect to the prediction with probability
`1 - epsilon`, and uniformly randomly with probability `epsilon`.
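
  Example:
    A minimal construction sketch; the network, specs, and hyperparameter
    values below are illustrative placeholders, not defaults prescribed by
    this module:

      from tf_agents.networks import q_network
      from tf_agents.specs import tensor_spec
      from tf_agents.trajectories import time_step as ts

      observation_spec = tensor_spec.TensorSpec([4], tf.float32)
      action_spec = tensor_spec.BoundedTensorSpec(
          (), tf.int32, minimum=0, maximum=2)
      reward_net = q_network.QNetwork(
          observation_spec, action_spec, fc_layer_params=(50,))
      agent = NeuralEpsilonGreedyAgent(
          time_step_spec=ts.time_step_spec(observation_spec),
          action_spec=action_spec,
          reward_network=reward_net,
          optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=0.01),
          epsilon=0.1)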
"""

  def __init__(
      self,
      time_step_spec: types.TimeStep,
      action_spec: types.BoundedTensorSpec,
      reward_network: types.Network,
      optimizer: types.Optimizer,
      epsilon: float,
      observation_and_action_constraint_splitter: Optional[
          types.Splitter
      ] = None,
      accepts_per_arm_features: bool = False,
      constraints: Iterable[constr.NeuralConstraint] = (),
      # Params for training.
      error_loss_fn: types.LossFn = tf.compat.v1.losses.mean_squared_error,
      gradient_clipping: Optional[float] = None,
      # Params for debugging.
      debug_summaries: bool = False,
      summarize_grads_and_vars: bool = False,
      enable_summaries: bool = True,
      emit_policy_info: Tuple[Text, ...] = (),
      train_step_counter: Optional[tf.Variable] = None,
      laplacian_matrix: Optional[types.Float] = None,
      laplacian_smoothing_weight: float = 0.001,
      info_fields_to_inherit_from_greedy: Sequence[Text] = (),
      name: Optional[Text] = None,
  ):
"""Creates a Neural Epsilon Greedy Agent.
For more details about the Laplacian smoothing regularization, please see
the documentation of the `GreedyRewardPredictionAgent`.
Args:
time_step_spec: A `TimeStep` spec of the expected time_steps.
action_spec: A nest of `BoundedTensorSpec` representing the actions.
reward_network: A `tf_agents.network.Network` to be used by the agent. The
network will be called with call(observation, step_type) and it is
expected to provide a reward prediction for all actions. *Note*: when
using `observation_and_action_constraint_splitter`, make sure the
`reward_network` is compatible with the network-specific half of the
output of the `observation_and_action_constraint_splitter`. In
particular, `observation_and_action_constraint_splitter` will be called
on the observation before passing to the network.
optimizer: The optimizer to use for training.
epsilon: A float representing the probability of choosing a random action
instead of the greedy action.
observation_and_action_constraint_splitter: A function used for masking
valid/invalid actions with each state of the environment. The function
takes in a full observation and returns a tuple consisting of 1) the
part of the observation intended as input to the bandit agent and
policy, and 2) the boolean mask. This function should also work with a
`TensorSpec` as input, and should output `TensorSpec` objects for the
observation and mask.
accepts_per_arm_features: (bool) Whether the policy accepts per-arm
features.
constraints: iterable of constraints objects that are instances of
`tf_agents.bandits.agents.NeuralConstraint`. WARNING: only the greedy
actions respect the constraints. Uniform random actions may not.
error_loss_fn: A function for computing the error loss, taking parameters
labels, predictions, and weights (any function from tf.losses would
work). The default is `tf.losses.mean_squared_error`.
gradient_clipping: A float representing the norm length to clip gradients
(or None for no clipping.)
debug_summaries: A Python bool, default False. When True, debug summaries
are gathered.
summarize_grads_and_vars: A Python bool, default False. When True,
gradients and network variable summaries are written during training.
enable_summaries: A Python bool, default True. When False, all summaries
(debug or otherwise) should not be written.
emit_policy_info: (tuple of strings) what side information we want to get
as part of the policy info. Allowed values can be found in
`policy_utilities.PolicyInfo`.
train_step_counter: An optional `tf.Variable` to increment every time the
train op is run. Defaults to the `global_step`.
laplacian_matrix: A float `Tensor` shaped `[num_actions, num_actions]`.
This holds the Laplacian matrix used to regularize the smoothness of the
estimated expected reward function. This only applies to problems where
the actions have a graph structure. If `None`, the regularization is not
applied.
laplacian_smoothing_weight: A float that determines the weight of the
regularization term. Note that this has no effect if `laplacian_matrix`
above is `None`.
info_fields_to_inherit_from_greedy: List of info fields that are reported
from the greedy policy even when exploratory action is taken.
name: Python str name of this agent. All variables in this module will
fall under that name. Defaults to the class name.
Raises:
ValueError: If the action spec contains more than one action or or it is
not a bounded scalar int32 spec with minimum 0.
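
    Example:
      A sketch of an `observation_and_action_constraint_splitter` for
      observations stored as a dict with hypothetical `'observation'` and
      `'mask'` keys (this layout is an illustrative assumption, not a
      requirement of this module):

        def splitter(obs):
          # First element feeds the network; second is the boolean mask.
          return obs['observation'], obs['mask']

      A `laplacian_matrix` for three actions arranged in a path graph
      0 - 1 - 2 (degree matrix minus adjacency matrix), illustrating the
      expected `[num_actions, num_actions]` layout:

        laplacian_matrix = tf.constant(
            [[1.0, -1.0, 0.0],
             [-1.0, 2.0, -1.0],
             [0.0, -1.0, 1.0]])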
"""
    super(NeuralEpsilonGreedyAgent, self).__init__(
        time_step_spec=time_step_spec,
        action_spec=action_spec,
        reward_network=reward_network,
        optimizer=optimizer,
        observation_and_action_constraint_splitter=(
            observation_and_action_constraint_splitter
        ),
        accepts_per_arm_features=accepts_per_arm_features,
        constraints=constraints,
        error_loss_fn=error_loss_fn,
        gradient_clipping=gradient_clipping,
        debug_summaries=debug_summaries,
        summarize_grads_and_vars=summarize_grads_and_vars,
        enable_summaries=enable_summaries,
        emit_policy_info=emit_policy_info,
        train_step_counter=train_step_counter,
        laplacian_matrix=laplacian_matrix,
        laplacian_smoothing_weight=laplacian_smoothing_weight,
        name=name,
    )
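    # Wrap the greedy policy built by the base class in an epsilon greedy
    # policy, so that a uniformly random action is taken with probability
    # `epsilon`.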
    self._policy = epsilon_greedy_policy.EpsilonGreedyPolicy(
        self._policy,
        epsilon=epsilon,
        info_fields_to_inherit_from_greedy=info_fields_to_inherit_from_greedy,
    )
    self._collect_policy = self._policy