Fix loss and capsule
zfz authored and zfz committed May 1, 2019
1 parent 66ed83a commit 1071962
Showing 8 changed files with 324 additions and 303 deletions.
53 changes: 53 additions & 0 deletions common/loss.py
@@ -0,0 +1,53 @@
import tensorflow as tf
import numpy as np

def get_default_value(kwargs, key, value):
    if key in kwargs:
        return kwargs[key]
    else:
        return value

def get_loss(logits, labels, type='cross', labels_sparse=False, **kwargs):
    """Dispatch to one of the supported losses: 'focal_loss', 'sigmoid_loss',
    'softmax_loss' or 'margin_loss'; any other type raises ValueError."""
    if labels_sparse:
        # convert sparse label ids to one-hot vectors matching the logits width
        num = logits.shape.as_list()[-1]
        labels = tf.one_hot(labels, num)

    if type == 'focal_loss':
        gamma = get_default_value(kwargs, 'gamma', 2.0)
        alpha = get_default_value(kwargs, 'alpha', 0.25)
        epsilon = get_default_value(kwargs, 'epsilon', 1e-8)
        return focal_loss(logits, labels, gamma, alpha, epsilon)
    elif type == 'sigmoid_loss':
        return sigmoid_cross_entropy(logits, labels)
    elif type == 'softmax_loss':
        return softmax_cross_entropy(logits, labels)
    elif type == 'margin_loss':
        return margin_loss(logits, labels)
    else:
        raise ValueError("unknown loss type")

def focal_loss(logits, labels, gamma=2.0, alpha=0.25, epsilon=1e-8):
    # note: treats `logits` as probabilities (e.g. softmax output);
    # epsilon keeps the log away from zero
    logits = tf.cast(logits, tf.float32)
    model_out = tf.add(logits, epsilon)
    ce = tf.multiply(tf.cast(labels, tf.float32), -tf.log(model_out))
    weights = tf.multiply(tf.cast(labels, tf.float32), tf.pow(tf.subtract(1.0, model_out), gamma))
    return tf.reduce_mean(tf.multiply(alpha, tf.multiply(weights, ce)))

def sigmoid_cross_entropy(logits, labels):
    loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits,
                                                   labels=tf.cast(labels, tf.float32))
    loss = tf.reduce_mean(loss)
    return loss

def softmax_cross_entropy(logits, labels):
    loss = tf.nn.softmax_cross_entropy_with_logits(logits=logits,
                                                   labels=tf.cast(labels, tf.float32))
    loss = tf.reduce_mean(loss)
    return loss

def margin_loss(logits, labels):
    # capsule-network margin loss: m+ = 0.9, m- = 0.1, absent classes down-weighted by 0.25
    labels = tf.cast(labels, tf.float32)
    loss = labels * tf.square(tf.maximum(0., 0.9 - logits)) + \
        0.25 * (1.0 - labels) * tf.square(tf.maximum(0., logits - 0.1))
    loss = tf.reduce_mean(tf.reduce_sum(loss, axis=1))
    return loss
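
A quick sanity check of how the new loss module is meant to be called (a minimal sketch; the tensor values, shapes and the TF1 session setup below are illustrative assumptions, not part of the commit):

import tensorflow as tf
from common.loss import get_loss  # module added in this commit

# hypothetical batch of 2 examples over 3 classes; capsule activations lie in [0, 1]
activations = tf.constant([[0.9, 0.1, 0.2], [0.2, 0.8, 0.1]])
labels = tf.constant([0, 1])  # sparse class ids

# labels_sparse=True lets get_loss one-hot encode the ids before margin_loss
loss = get_loss(activations, labels, type='margin_loss', labels_sparse=True)

with tf.Session() as sess:
    print(sess.run(loss))  # small scalar, since the predictions match the labels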
2 changes: 2 additions & 0 deletions encoder/__init__.py
@@ -16,6 +16,7 @@
from fasttext import FastText
from fast_attention_text import FastAttentionText
from han import HAN
from capsule import Capsule

encoder["cnn"] = CNN
encoder["dcnn"] = DCNN
@@ -29,6 +30,7 @@
encoder["fasttext"] = FastText
encoder["fast_attention_text"] = FastAttentionText
encoder["han"] = HAN
encoder["capsule"] = Capsule


#pair sentence encoder
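
For context on how this registration is consumed: the `encoder` dict maps a config string to an encoder class, so the new entry makes the capsule encoder selectable by name. A rough sketch (the hyper-parameter values are made up, and the exact config plumbing in this repo may differ):

from encoder import encoder  # the registry dict extended above

params = {'maxlen': 100, 'embedding_size': 300, 'keep_prob': 0.9, 'num_output': 2}
CapsuleEncoder = encoder['capsule']   # -> Capsule
enc = CapsuleEncoder(**params)
# enc(embed) expects embed of shape [batch, maxlen, embedding_size]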
267 changes: 255 additions & 12 deletions encoder/capsule.py
@@ -1,13 +1,245 @@
import tensorflow as tf
import keras
from keras import backend as K
from utils import _conv2d_wrapper
import tensorflow.contrib.slim as slim
from tensorflow.contrib.layers.python.layers import initializers
import pdb
# refer: https://github.com/andyweizhao/capsule_text_classification/blob/master/network.py

epsilon = 1e-9

def softmax(x, axis=-1):
    ex = K.exp(x - K.max(x, axis=axis, keepdims=True))
    return ex / K.sum(ex, axis=axis, keepdims=True)

def squash_v1(x, axis=-1):
    s_squared_norm = K.sum(K.square(x), axis, keepdims=True) + K.epsilon()
    scale = K.sqrt(s_squared_norm) / (0.5 + s_squared_norm)
    return scale * x

def squash_v0(s, axis=-1, epsilon=1e-7, name=None):
    s_squared_norm = K.sum(K.square(s), axis, keepdims=True) + K.epsilon()
    safe_norm = K.sqrt(s_squared_norm)
    scale = 1 - tf.exp(-safe_norm)
    return scale * s / safe_norm

def routing(u_hat_vecs, beta_a, iterations, output_capsule_num, i_activations):
    b = keras.backend.zeros_like(u_hat_vecs[:, :, :, 0])
    if i_activations is not None:
        i_activations = i_activations[..., tf.newaxis]
    for i in range(iterations):
        if False:
            leak = tf.zeros_like(b, optimize=True)
            leak = tf.reduce_sum(leak, axis=1, keep_dims=True)
            leaky_logits = tf.concat([leak, b], axis=1)
            leaky_routing = tf.nn.softmax(leaky_logits, dim=1)
            c = tf.split(leaky_routing, [1, output_capsule_num], axis=1)[1]
        else:
            c = softmax(b, 1)
        # if i_activations is not None:
        #     tf.transpose(tf.transpose(c, perm=[0,2,1]) * i_activations, perm=[0,2,1])
        outputs = squash_v1(K.batch_dot(c, u_hat_vecs, [2, 2]))
        if i < iterations - 1:
            b = b + K.batch_dot(outputs, u_hat_vecs, [2, 3])
    poses = outputs
    activations = K.sqrt(K.sum(K.square(poses), 2))
    return poses, activations

def _matmul_broadcast(x, y, name):
    """Compute x @ y, broadcasting over the first `N - 2` ranks.
    """
    with tf.variable_scope(name) as scope:
        return tf.reduce_sum(
            tf.nn.dropout(x[..., tf.newaxis] * y[..., tf.newaxis, :, :], 1), axis=-2
        )


def _get_variable_wrapper(
    name, shape=None, dtype=None, initializer=None,
    regularizer=None,
    trainable=True,
    collections=None,
    caching_device=None,
    partitioner=None,
    validate_shape=True,
    custom_getter=None
):
    """Wrapper over tf.get_variable().
    """

    with tf.device('/cpu:0'):
        var = tf.get_variable(
            name, shape=shape, dtype=dtype, initializer=initializer,
            regularizer=regularizer, trainable=trainable,
            collections=collections, caching_device=caching_device,
            partitioner=partitioner, validate_shape=validate_shape,
            custom_getter=custom_getter
        )
    return var


def _get_weights_wrapper(
    name, shape, dtype=tf.float32, initializer=initializers.xavier_initializer(),
    weights_decay_factor=None
):
    """Wrapper over _get_variable_wrapper() to get weights, with weights decay factor in loss.
    """

    weights = _get_variable_wrapper(
        name=name, shape=shape, dtype=dtype, initializer=initializer
    )

    if weights_decay_factor is not None and weights_decay_factor > 0.0:
        weights_wd = tf.multiply(
            tf.nn.l2_loss(weights), weights_decay_factor, name=name + '/l2loss'
        )
        tf.add_to_collection('losses', weights_wd)

    return weights


def _get_biases_wrapper(
    name, shape, dtype=tf.float32, initializer=tf.constant_initializer(0.0)
):
    """Wrapper over _get_variable_wrapper() to get bias.
    """

    biases = _get_variable_wrapper(
        name=name, shape=shape, dtype=dtype, initializer=initializer
    )

    return biases


def _conv2d_wrapper(inputs, shape, strides, padding, add_bias, activation_fn, name, stddev=0.1):
    """Wrapper over tf.nn.conv2d().
    """

    with tf.variable_scope(name) as scope:
        kernel = _get_weights_wrapper(
            name='weights', shape=shape, weights_decay_factor=0.0,  # initializer=tf.truncated_normal_initializer(stddev=stddev, dtype=tf.float32)
        )
        output = tf.nn.conv2d(
            inputs, filter=kernel, strides=strides, padding=padding, name='conv'
        )
        if add_bias:
            biases = _get_biases_wrapper(
                name='biases', shape=[shape[-1]]
            )
            output = tf.add(
                output, biases, name='biasAdd'
            )
        if activation_fn is not None:
            output = activation_fn(
                output, name='activation'
            )

    return output


def _separable_conv2d_wrapper(inputs, depthwise_shape, pointwise_shape, strides, padding, add_bias, activation_fn, name):
    """Wrapper over tf.nn.separable_conv2d().
    """

    with tf.variable_scope(name) as scope:
        dkernel = _get_weights_wrapper(
            name='depthwise_weights', shape=depthwise_shape, weights_decay_factor=0.0
        )
        pkernel = _get_weights_wrapper(
            name='pointwise_weights', shape=pointwise_shape, weights_decay_factor=0.0
        )
        output = tf.nn.separable_conv2d(
            input=inputs, depthwise_filter=dkernel, pointwise_filter=pkernel,
            strides=strides, padding=padding, name='conv'
        )
        if add_bias:
            biases = _get_biases_wrapper(
                name='biases', shape=[pointwise_shape[-1]]
            )
            output = tf.add(
                output, biases, name='biasAdd'
            )
        if activation_fn is not None:
            output = activation_fn(
                output, name='activation'
            )

    return output


def _depthwise_conv2d_wrapper(inputs, shape, strides, padding, add_bias, activation_fn, name):
    """Wrapper over tf.nn.depthwise_conv2d().
    """

    with tf.variable_scope(name) as scope:
        dkernel = _get_weights_wrapper(
            name='depthwise_weights', shape=shape, weights_decay_factor=0.0
        )
        output = tf.nn.depthwise_conv2d(
            inputs, filter=dkernel, strides=strides, padding=padding, name='conv'
        )
        if add_bias:
            d_ = output.get_shape()[-1].value
            biases = _get_biases_wrapper(
                name='biases', shape=[d_]
            )
            output = tf.add(
                output, biases, name='biasAdd'
            )
        if activation_fn is not None:
            output = activation_fn(
                output, name='activation'
            )

    return output


def vec_transformationByConv(poses, input_capsule_dim, input_capsule_num, output_capsule_dim, output_capsule_num):
    kernel = _get_weights_wrapper(
        name='weights', shape=[1, input_capsule_dim, output_capsule_dim * output_capsule_num], weights_decay_factor=0.0
    )
    tf.logging.info('poses: {}'.format(poses.get_shape()))
    tf.logging.info('kernel: {}'.format(kernel.get_shape()))
    u_hat_vecs = keras.backend.conv1d(poses, kernel)
    u_hat_vecs = keras.backend.reshape(u_hat_vecs, (-1, input_capsule_num, output_capsule_num, output_capsule_dim))
    u_hat_vecs = keras.backend.permute_dimensions(u_hat_vecs, (0, 2, 1, 3))
    return u_hat_vecs


def vec_transformationByMat(poses, input_capsule_dim, input_capsule_num, output_capsule_dim, output_capsule_num, shared=True):
    inputs_poses_shape = poses.get_shape().as_list()
    poses = poses[..., tf.newaxis, :]
    poses = tf.tile(
        poses, [1, 1, output_capsule_num, 1]
    )
    if shared:
        kernel = _get_weights_wrapper(
            name='weights', shape=[1, 1, output_capsule_num, output_capsule_dim, input_capsule_dim], weights_decay_factor=0.0
        )
        kernel = tf.tile(
            kernel, [inputs_poses_shape[0], input_capsule_num, 1, 1, 1]
        )
    else:
        kernel = _get_weights_wrapper(
            name='weights', shape=[1, input_capsule_num, output_capsule_num, output_capsule_dim, input_capsule_dim], weights_decay_factor=0.0
        )
        kernel = tf.tile(
            kernel, [inputs_poses_shape[0], 1, 1, 1, 1]
        )
    tf.logging.info('poses: {}'.format(poses[..., tf.newaxis].get_shape()))
    tf.logging.info('kernel: {}'.format(kernel.get_shape()))
    u_hat_vecs = tf.squeeze(tf.matmul(kernel, poses[..., tf.newaxis]), axis=-1)
    u_hat_vecs = keras.backend.permute_dimensions(u_hat_vecs, (0, 2, 1, 3))
    return u_hat_vecs


class Capsule():
    def __init__(self, **kwargs):
        self.output_size = 128
        self.seq_length = kwargs['maxlen']
        self.embedding_size = kwargs['embedding_size']
        self.keep_prob = kwargs['keep_prob']
        self.num_output = kwargs['num_output']

    def capsules_init(self, inputs, shape, strides, padding, pose_shape, add_bias, name):
        with tf.variable_scope(name):
@@ -90,12 +322,14 @@ def capsule_conv_layer(self, nets, shape, strides, iterations, name):
            )
            poses, activations = routing(u_hat_vecs, beta_a, iterations, shape[3], i_activations_patches)
            poses = tf.reshape(poses, [
-               inputs_poses_shape[0], inputs_poses_shape[1],
+               #inputs_poses_shape[0], inputs_poses_shape[1],
+               -1, inputs_poses_shape[1],
                inputs_poses_shape[2], shape[3],
                inputs_poses_shape[-1]]
            )
            activations = tf.reshape(activations, [
-               inputs_poses_shape[0],inputs_poses_shape[1],
+               #inputs_poses_shape[0],inputs_poses_shape[1],
+               -1,inputs_poses_shape[1],
                inputs_poses_shape[2],shape[3]]
            )
            nets = poses, activations
@@ -125,7 +359,7 @@ def capsule_model_B(self, X):
        for _, ngram in enumerate([3,4,5]):
            with tf.variable_scope('capsule_'+str(ngram)):
                nets = _conv2d_wrapper(
-                   X, shape=[ngram, 300, 1, 32], strides=[1, 2, 1, 1], padding='VALID',
+                   X, shape=[ngram, self.embedding_size, 1, 32], strides=[1, 2, 1, 1], padding='VALID',
                    add_bias=True, activation_fn=tf.nn.relu, name='conv1'
                )
                tf.logging.info('output shape: {}'.format(nets.get_shape()))
@@ -135,16 +369,16 @@
                nets = self.capsule_conv_layer(nets, shape=[3, 1, 16, 16], strides=[1, 1, 1, 1], iterations=3, name='conv2')
                nets = self.capsule_flatten(nets)
                poses, activations = self.capsule_fc_layer(nets,
-                   self.output_size, 3, 'fc2')
+                   self.num_output, 3, 'fc2')
            poses_list.append(poses)
        poses = tf.reduce_mean(tf.convert_to_tensor(poses_list), axis=0)
        activations = K.sqrt(K.sum(K.square(poses), 2))
-       return poses
+       return activations

    def capsule_model_A(self, X):
        with tf.variable_scope('capsule_'+str(3)):
            nets = _conv2d_wrapper(
-               X, shape=[3, 300, 1, 32], strides=[1, 2, 1, 1], padding='VALID',
+               X, shape=[3, self.embedding_size, 1, 32], strides=[1, 2, 1, 1], padding='VALID',
                add_bias=True, activation_fn=tf.nn.relu, name='conv1'
            )
            tf.logging.info('output shape: {}'.format(nets.get_shape()))
@@ -153,8 +387,17 @@ def capsule_model_A(self, X):
            name='primary')
            nets = self.capsule_conv_layer(nets, shape=[3, 1, 16, 16], strides=[1, 1, 1, 1], iterations=3, name='conv2')
            nets = self.capsule_flatten(nets)
-           poses, activations = self.capsule_fc_layer(nets, self.output_size, 3, 'fc2')
-           return poses
+           poses, activations = self.capsule_fc_layer(nets, self.num_output, 3, 'fc2')
+           return activations

    def feed_dict(self, **kwargs):
        feed_dict = {}
        return feed_dict

    def pb_feed_dict(self, graph, **kwargs):
        feed_dict = {}
        return feed_dict

    def __call__(self, embed, reuse = tf.AUTO_REUSE):
-       return capsule_model_A(embed)
+       embed = tf.expand_dims(embed, -1)
+       return self.capsule_model_A(embed)
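
To see how the reworked __call__ and the new margin loss fit together, here is a minimal end-to-end sketch (placeholder shapes, hyper-parameter values, and the optimizer are assumptions for illustration; the training loop and feeding are omitted):

import tensorflow as tf
from encoder.capsule import Capsule
from common.loss import get_loss

maxlen, embedding_size, num_classes = 100, 300, 2  # assumed values
embed = tf.placeholder(tf.float32, [None, maxlen, embedding_size])
labels = tf.placeholder(tf.int64, [None])

capsule = Capsule(maxlen=maxlen, embedding_size=embedding_size,
                  keep_prob=0.9, num_output=num_classes)
activations = capsule(embed)  # __call__ adds the channel dim and runs capsule_model_A

# the activations are per-class capsule lengths, so margin loss is the intended pairing
loss = get_loss(activations, labels, type='margin_loss', labels_sparse=True)
train_op = tf.train.AdamOptimizer(1e-3).minimize(loss)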