[go: nahoru, domu]

Skip to content
This repository has been archived by the owner on Nov 16, 2023. It is now read-only.

Commit

Permalink
[BREAKING CHANGE]: Refine AnnotationKey, LabelKey and EnvName (#6)
Browse files Browse the repository at this point in the history
1. Change "POD_NAMESPACE" to "FRAMEWORK_NAMESPACE"
2. Prefix "FC_" for all FrameworkController Predefined AnnotationKeys, LabelKeys and EnvNames
3. Prefix "FB_" and uppercase TaskRoleName for all FrameworkBarrier EnvNames
  • Loading branch information
yqwang-ms committed Jan 17, 2019
1 parent c06876a commit 3420ae0
Show file tree
Hide file tree
Showing 9 changed files with 105 additions and 100 deletions.
13 changes: 7 additions & 6 deletions example/framework/basic/batchstatefulfailed.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -27,18 +27,19 @@ spec:
# To locate a specific Task during its whole lifecycle regardless of
# any retry:
# Consistent Identity:
# PodName = {FrameworkName}-{TaskRoleName}-{TaskIndex}
# PodNamespace = {FrameworkNamespace}
# PodName = {FrameworkName}-{TaskRoleName}-{TaskIndex}
# Consistent Environment Variable Value:
# ${FRAMEWORK_NAME}, ${TASKROLE_NAME}, ${TASK_INDEX}
# ${CONFIGMAP_NAME}, ${POD_NAME}, ${POD_NAMESPACE}
# ${FC_FRAMEWORK_NAMESPACE},
# ${FC_FRAMEWORK_NAME}, ${FC_TASKROLE_NAME}, ${FC_TASK_INDEX},
# ${FC_CONFIGMAP_NAME}, ${FC_POD_NAME}
#
# To locate a specific execution attempt of a specific Task:
# Attempt Specific Environment Variable Value:
# ${FRAMEWORK_ATTEMPT_ID}, ${TASK_ATTEMPT_ID}
# ${FC_FRAMEWORK_ATTEMPT_ID}, ${FC_TASK_ATTEMPT_ID}
#
# To locate a specific execution attempt instance of a specific Task:
# Attempt Instance Specific Environment Variable Value:
# ${FRAMEWORK_ATTEMPT_INSTANCE_UID}, ${CONFIGMAP_UID}
# ${TASK_ATTEMPT_INSTANCE_UID}, ${POD_UID}
# ${FC_FRAMEWORK_ATTEMPT_INSTANCE_UID}, ${FC_CONFIGMAP_UID}
# ${FC_TASK_ATTEMPT_INSTANCE_UID}, ${FC_POD_UID}
command: ["sh", "-c", "printenv && sleep 60 && exit 1"]
4 changes: 2 additions & 2 deletions example/framework/basic/service.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,8 @@ metadata:
spec:
selector:
# Using predefined labels
FRAMEWORK_NAME: service
TASKROLE_NAME: server
FC_FRAMEWORK_NAME: service
FC_TASKROLE_NAME: server
# Customized labels can also be used
#app: server
ports:
Expand Down
4 changes: 2 additions & 2 deletions example/framework/basic/servicestateful.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,8 @@ metadata:
spec:
selector:
# See comments in service.yaml
FRAMEWORK_NAME: servicestateful
TASKROLE_NAME: serverstateful
FC_FRAMEWORK_NAME: servicestateful
FC_TASKROLE_NAME: serverstateful
ports:
- port: 80
type: NodePort
22 changes: 11 additions & 11 deletions example/framework/extension/frameworkbarrier.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,15 +29,15 @@ spec:
containers:
- name: ubuntu
image: ubuntu:trusty
# Using /mnt/frameworkbarrier/injector.sh to inject environment
# variables, such as:
# {TaskRoleName}_ips=
# Using /mnt/frameworkbarrier/injector.sh to inject environment variables,
# such as:
# FB_{UpperCase({TaskRoleName})}_IPS=
# {Task[0].PodIP},...,
# {Task[TaskRole.TaskNumber-1].PodIP}
# {TaskRoleName}_addresses=
# {Task[0].PodIP}:${{TaskRoleName}_port},...,
# {Task[TaskRole.TaskNumber-1].PodIP}:${{TaskRoleName}_port}
# Note, the environment variable {TaskRoleName}_port should be
# FB_{UpperCase({TaskRoleName})}_ADDRESSES=
# {Task[0].PodIP}:${FB_{UpperCase({TaskRoleName})}_PORT},...,
# {Task[TaskRole.TaskNumber-1].PodIP}:${FB_{UpperCase({TaskRoleName})}_PORT}
# Note, the environment variable FB_{UpperCase({TaskRoleName})}_PORT should be
# provided by the caller in advance.
#
# User may need to tweak these environment variables to their own
Expand All @@ -48,8 +48,8 @@ spec:
# /mnt/frameworkbarrier/framework.json.
command: [
"sh", "-c",
"server_port=4001 worker_port=5001 . /mnt/frameworkbarrier/injector.sh && printenv &&
server_port=4002 worker_port=5002 . /mnt/frameworkbarrier/injector.sh && printenv &&
"FB_SERVER_PORT=4001 FB_WORKER_PORT=5001 . /mnt/frameworkbarrier/injector.sh && printenv &&
FB_SERVER_PORT=4002 FB_WORKER_PORT=5002 . /mnt/frameworkbarrier/injector.sh && printenv &&
sleep 60"]
ports:
- containerPort: 4001
Expand Down Expand Up @@ -104,8 +104,8 @@ spec:
image: ubuntu:trusty
command: [
"sh", "-c",
"server_port=4001 worker_port=5001 . /mnt/frameworkbarrier/injector.sh && printenv &&
server_port=4002 worker_port=5002 . /mnt/frameworkbarrier/injector.sh && printenv &&
"FB_SERVER_PORT=4001 FB_WORKER_PORT=5001 . /mnt/frameworkbarrier/injector.sh && printenv &&
FB_SERVER_PORT=4002 FB_WORKER_PORT=5002 . /mnt/frameworkbarrier/injector.sh && printenv &&
sleep 60"]
ports:
- containerPort: 5001
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,22 +43,22 @@ spec:
# For the tf_cnn_benchmarks usage, see
# https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks
workingDir: /tensorflow/benchmarks/scripts/tf_cnn_benchmarks
# Using /mnt/frameworkbarrier/injector.sh to inject environment
# variables without the need for image invasion and k8s DNS:
# {TaskRoleName}_addresses=
# {Task[0].PodIP}:${{TaskRoleName}_port},...,
# {Task[TaskRole.TaskNumber-1].PodIP}:${{TaskRoleName}_port}
# Using /mnt/frameworkbarrier/injector.sh to inject environment variables
# without the need for image invasion and k8s DNS:
# FB_{UpperCase({TaskRoleName})}_ADDRESSES=
# {Task[0].PodIP}:${FB_{UpperCase({TaskRoleName})}_PORT},...,
# {Task[TaskRole.TaskNumber-1].PodIP}:${FB_{UpperCase({TaskRoleName})}_PORT}
# See more in ./example/framework/extension/frameworkbarrier.yaml
command: [
"sh", "-c",
"ps_port=4001 worker_port=5001 . /mnt/frameworkbarrier/injector.sh &&
python tf_cnn_benchmarks.py --job_name=ps --task_index=${TASK_INDEX}
--ps_hosts=${ps_addresses} --worker_hosts=${worker_addresses}
"FB_PS_PORT=4001 FB_WORKER_PORT=5001 . /mnt/frameworkbarrier/injector.sh &&
python tf_cnn_benchmarks.py --job_name=ps --task_index=${FC_TASK_INDEX}
--ps_hosts=${FB_PS_ADDRESSES} --worker_hosts=${FB_WORKER_ADDRESSES}
--variable_update=parameter_server --cross_replica_sync=false
--model=alexnet --batch_size=8 --num_batches=10
--device=cpu --local_parameter_device=cpu --data_format=NHWC
--data_name=cifar10 --data_dir=/mnt/data/cifar-10-batches-py
--train_dir=/mnt/data/${FRAMEWORK_NAME}/output"]
--train_dir=/mnt/data/${FC_FRAMEWORK_NAME}/output"]
ports:
- containerPort: 4001
volumeMounts:
Expand Down Expand Up @@ -129,14 +129,14 @@ spec:
workingDir: /tensorflow/benchmarks/scripts/tf_cnn_benchmarks
command: [
"sh", "-c",
"ps_port=4001 worker_port=5001 . /mnt/frameworkbarrier/injector.sh &&
python tf_cnn_benchmarks.py --job_name=worker --task_index=${TASK_INDEX}
--ps_hosts=${ps_addresses} --worker_hosts=${worker_addresses}
"FB_PS_PORT=4001 FB_WORKER_PORT=5001 . /mnt/frameworkbarrier/injector.sh &&
python tf_cnn_benchmarks.py --job_name=worker --task_index=${FC_TASK_INDEX}
--ps_hosts=${FB_PS_ADDRESSES} --worker_hosts=${FB_WORKER_ADDRESSES}
--variable_update=parameter_server --cross_replica_sync=false
--model=alexnet --batch_size=8 --num_batches=10
--device=cpu --local_parameter_device=cpu --data_format=NHWC
--data_name=cifar10 --data_dir=/mnt/data/cifar-10-batches-py
--train_dir=/mnt/data/${FRAMEWORK_NAME}/output"]
--train_dir=/mnt/data/${FC_FRAMEWORK_NAME}/output"]
ports:
- containerPort: 5001
volumeMounts:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,22 +43,22 @@ spec:
# For the tf_cnn_benchmarks usage, see
# https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks
workingDir: /tensorflow/benchmarks/scripts/tf_cnn_benchmarks
# Using /mnt/frameworkbarrier/injector.sh to inject environment
# variables without the need for image invasion and k8s DNS:
# {TaskRoleName}_addresses=
# {Task[0].PodIP}:${{TaskRoleName}_port},...,
# {Task[TaskRole.TaskNumber-1].PodIP}:${{TaskRoleName}_port}
# Using /mnt/frameworkbarrier/injector.sh to inject environment variables
# without the need for image invasion and k8s DNS:
# FB_{UpperCase({TaskRoleName})}_ADDRESSES=
# {Task[0].PodIP}:${FB_{UpperCase({TaskRoleName})}_PORT},...,
# {Task[TaskRole.TaskNumber-1].PodIP}:${FB_{UpperCase({TaskRoleName})}_PORT}
# See more in ./example/framework/extension/frameworkbarrier.yaml
command: [
"sh", "-c",
"ps_port=4001 worker_port=5001 . /mnt/frameworkbarrier/injector.sh &&
python tf_cnn_benchmarks.py --job_name=ps --task_index=${TASK_INDEX}
--ps_hosts=${ps_addresses} --worker_hosts=${worker_addresses}
"FB_PS_PORT=4001 FB_WORKER_PORT=5001 . /mnt/frameworkbarrier/injector.sh &&
python tf_cnn_benchmarks.py --job_name=ps --task_index=${FC_TASK_INDEX}
--ps_hosts=${FB_PS_ADDRESSES} --worker_hosts=${FB_WORKER_ADDRESSES}
--variable_update=parameter_server --cross_replica_sync=false
--model=alexnet --batch_size=8 --num_batches=10
--device=gpu --local_parameter_device=gpu --num_gpus=1 --data_format=NCHW
--data_name=cifar10 --data_dir=/mnt/data/cifar-10-batches-py
--train_dir=/mnt/data/${FRAMEWORK_NAME}/output"]
--train_dir=/mnt/data/${FC_FRAMEWORK_NAME}/output"]
ports:
- containerPort: 4001
resources:
Expand Down Expand Up @@ -135,14 +135,14 @@ spec:
workingDir: /tensorflow/benchmarks/scripts/tf_cnn_benchmarks
command: [
"sh", "-c",
"ps_port=4001 worker_port=5001 . /mnt/frameworkbarrier/injector.sh &&
python tf_cnn_benchmarks.py --job_name=worker --task_index=${TASK_INDEX}
--ps_hosts=${ps_addresses} --worker_hosts=${worker_addresses}
"FB_PS_PORT=4001 FB_WORKER_PORT=5001 . /mnt/frameworkbarrier/injector.sh &&
python tf_cnn_benchmarks.py --job_name=worker --task_index=${FC_TASK_INDEX}
--ps_hosts=${FB_PS_ADDRESSES} --worker_hosts=${FB_WORKER_ADDRESSES}
--variable_update=parameter_server --cross_replica_sync=false
--model=alexnet --batch_size=8 --num_batches=10
--device=gpu --local_parameter_device=gpu --num_gpus=1 --data_format=NCHW
--data_name=cifar10 --data_dir=/mnt/data/cifar-10-batches-py
--train_dir=/mnt/data/${FRAMEWORK_NAME}/output"]
--train_dir=/mnt/data/${FC_FRAMEWORK_NAME}/output"]
ports:
- containerPort: 5001
resources:
Expand Down
45 changes: 25 additions & 20 deletions pkg/apis/frameworkcontroller/v1/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,35 +46,40 @@ const (
ExtendedUnlimitedValue = -2

// For all managed objects
AnnotationKeyFrameworkName = "FRAMEWORK_NAME"
AnnotationKeyTaskRoleName = "TASKROLE_NAME"
AnnotationKeyTaskIndex = "TASK_INDEX"
AnnotationKeyConfigMapName = "CONFIGMAP_NAME"
AnnotationKeyPodName = "POD_NAME"
AnnotationKeyPodNamespace = "POD_NAMESPACE"

AnnotationKeyFrameworkAttemptID = "FRAMEWORK_ATTEMPT_ID"
AnnotationKeyFrameworkAttemptInstanceUID = "FRAMEWORK_ATTEMPT_INSTANCE_UID"
AnnotationKeyConfigMapUID = "CONFIGMAP_UID"
AnnotationKeyTaskAttemptID = "TASK_ATTEMPT_ID"

// Predefined Annotations
AnnotationKeyFrameworkNamespace = "FC_FRAMEWORK_NAMESPACE"
AnnotationKeyFrameworkName = "FC_FRAMEWORK_NAME"
AnnotationKeyTaskRoleName = "FC_TASKROLE_NAME"
AnnotationKeyTaskIndex = "FC_TASK_INDEX"
AnnotationKeyConfigMapName = "FC_CONFIGMAP_NAME"
AnnotationKeyPodName = "FC_POD_NAME"

AnnotationKeyFrameworkAttemptID = "FC_FRAMEWORK_ATTEMPT_ID"
AnnotationKeyFrameworkAttemptInstanceUID = "FC_FRAMEWORK_ATTEMPT_INSTANCE_UID"
AnnotationKeyConfigMapUID = "FC_CONFIGMAP_UID"
AnnotationKeyTaskAttemptID = "FC_TASK_ATTEMPT_ID"

// Predefined Labels
LabelKeyFrameworkName = AnnotationKeyFrameworkName
LabelKeyTaskRoleName = AnnotationKeyTaskRoleName

// For all managed containers
EnvNameFrameworkName = AnnotationKeyFrameworkName
EnvNameTaskRoleName = AnnotationKeyTaskRoleName
EnvNameTaskIndex = AnnotationKeyTaskIndex
EnvNameConfigMapName = AnnotationKeyConfigMapName
EnvNamePodName = AnnotationKeyPodName
EnvNamePodNamespace = AnnotationKeyPodNamespace
// Predefined Environment Variables
// They can be referenced by the environment variables specified in the spec,
// i.e. by including "$(AnyPredefinedEnvName)" in the environment variable value.
EnvNameFrameworkNamespace = AnnotationKeyFrameworkNamespace
EnvNameFrameworkName = AnnotationKeyFrameworkName
EnvNameTaskRoleName = AnnotationKeyTaskRoleName
EnvNameTaskIndex = AnnotationKeyTaskIndex
EnvNameConfigMapName = AnnotationKeyConfigMapName
EnvNamePodName = AnnotationKeyPodName

EnvNameFrameworkAttemptID = AnnotationKeyFrameworkAttemptID
EnvNameFrameworkAttemptInstanceUID = AnnotationKeyFrameworkAttemptInstanceUID
EnvNameConfigMapUID = AnnotationKeyConfigMapUID
EnvNameTaskAttemptID = AnnotationKeyTaskAttemptID
EnvNameTaskAttemptInstanceUID = "TASK_ATTEMPT_INSTANCE_UID"
EnvNamePodUID = "POD_UID"
EnvNameTaskAttemptInstanceUID = "FC_TASK_ATTEMPT_INSTANCE_UID"
EnvNamePodUID = "FC_POD_UID"
)

var FrameworkGroupVersionKind = SchemeGroupVersion.WithKind(FrameworkKind)
Expand Down
17 changes: 8 additions & 9 deletions pkg/apis/frameworkcontroller/v1/funcs.go
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,7 @@ func (f *Framework) NewConfigMap() *core.ConfigMap {
cm.Finalizers = []string{meta.FinalizerDeleteDependents}

cm.Annotations = map[string]string{}
cm.Annotations[AnnotationKeyFrameworkNamespace] = f.Namespace
cm.Annotations[AnnotationKeyFrameworkName] = f.Name
cm.Annotations[AnnotationKeyConfigMapName] = cm.Name
cm.Annotations[AnnotationKeyFrameworkAttemptID] = frameworkAttemptIDStr
Expand Down Expand Up @@ -305,12 +306,12 @@ func (f *Framework) NewPod(cm *core.ConfigMap, taskRoleName string, taskIndex in
if pod.Annotations == nil {
pod.Annotations = map[string]string{}
}
pod.Annotations[AnnotationKeyFrameworkNamespace] = f.Namespace
pod.Annotations[AnnotationKeyFrameworkName] = f.Name
pod.Annotations[AnnotationKeyTaskRoleName] = taskRoleName
pod.Annotations[AnnotationKeyTaskIndex] = taskIndexStr
pod.Annotations[AnnotationKeyConfigMapName] = f.ConfigMapName()
pod.Annotations[AnnotationKeyPodName] = pod.Name
pod.Annotations[AnnotationKeyPodNamespace] = pod.Namespace
pod.Annotations[AnnotationKeyFrameworkAttemptID] = frameworkAttemptIDStr
pod.Annotations[AnnotationKeyFrameworkAttemptInstanceUID] = frameworkAttemptInstanceUIDStr
pod.Annotations[AnnotationKeyConfigMapUID] = configMapUIDStr
Expand All @@ -322,13 +323,13 @@ func (f *Framework) NewPod(cm *core.ConfigMap, taskRoleName string, taskIndex in
pod.Labels[LabelKeyFrameworkName] = f.Name
pod.Labels[LabelKeyTaskRoleName] = taskRoleName

exEnvs := []core.EnvVar{
predefinedEnvs := []core.EnvVar{
{Name: EnvNameFrameworkNamespace, Value: f.Namespace},
{Name: EnvNameFrameworkName, Value: f.Name},
{Name: EnvNameTaskRoleName, Value: taskRoleName},
{Name: EnvNameTaskIndex, Value: taskIndexStr},
{Name: EnvNameConfigMapName, Value: f.ConfigMapName()},
{Name: EnvNamePodName, Value: pod.Name},
{Name: EnvNamePodNamespace, Value: pod.Namespace},
{Name: EnvNameFrameworkAttemptID, Value: frameworkAttemptIDStr},
{Name: EnvNameFrameworkAttemptInstanceUID, Value: frameworkAttemptInstanceUIDStr},
{Name: EnvNameConfigMapUID, Value: configMapUIDStr},
Expand All @@ -337,24 +338,22 @@ func (f *Framework) NewPod(cm *core.ConfigMap, taskRoleName string, taskIndex in
{Name: EnvNameTaskAttemptInstanceUID, Value: taskAttemptInstanceUIDReferStr},
}

// Prepend predefinedEnvs so that they can be referenced by the environment
// variables specified in the spec.
// Change the default TerminationMessagePolicy to TerminationMessageFallbackToLogsOnError
// in case cluster-level logging has not been set up for the cluster.
// See https://kubernetes.io/docs/concepts/cluster-administration/logging
// It is safe to do so, since it will only fall back to the tail of the log if the
// container has failed and the termination message file specified by the
// terminationMessagePath is not found or is empty.
for i := range pod.Spec.Containers {
for _, exEnv := range exEnvs {
pod.Spec.Containers[i].Env = append(pod.Spec.Containers[i].Env, exEnv)
}
pod.Spec.Containers[i].Env = append(predefinedEnvs, pod.Spec.Containers[i].Env...)
if len(pod.Spec.Containers[i].TerminationMessagePolicy) == 0 {
pod.Spec.Containers[i].TerminationMessagePolicy = core.TerminationMessageFallbackToLogsOnError
}
}
for i := range pod.Spec.InitContainers {
for _, exEnv := range exEnvs {
pod.Spec.InitContainers[i].Env = append(pod.Spec.InitContainers[i].Env, exEnv)
}
pod.Spec.InitContainers[i].Env = append(predefinedEnvs, pod.Spec.InitContainers[i].Env...)
if len(pod.Spec.InitContainers[i].TerminationMessagePolicy) == 0 {
pod.Spec.InitContainers[i].TerminationMessagePolicy = core.TerminationMessageFallbackToLogsOnError
}
Expand Down
Loading

0 comments on commit 3420ae0

Please sign in to comment.