From d5ca984c5314d2e683eca87539d03ee4e35e3ee6 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Date: Wed, 17 Jun 2020 13:05:59 -0700
Subject: [PATCH] Allow the compiler to vectorize the loop.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

```
name old time/op new time/op delta
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in8x112x112x64_f2x2x64_s2x2_SAME 18.6ms ± 5% 18.5ms ±13% ~ (p=0.912 n=10+10)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in8x56x56x128_f2x2x128_s2x2_SAME 12.7ms ±12% 12.7ms ±17% ~ (p=0.684 n=10+10)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in8x28x28x256_f2x2x256_s2x2_SAME 12.2ms ± 8% 11.2ms ± 4% -8.21% (p=0.001 n=10+8)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in8x112x112x64_f2x2x64_s2x2_VALID 18.7ms ±20% 18.6ms ±23% ~ (p=0.278 n=9+10)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in8x56x56x128_f2x2x128_s2x2_VALID 12.5ms ±15% 11.4ms ± 2% -8.98% (p=0.000 n=10+9)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in8x28x28x256_f2x2x256_s2x2_VALID 11.6ms ± 8% 11.1ms ± 2% -4.22% (p=0.011 n=9+9)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in8x56x56x64_f1x1x64_s1x1_SAME 4.57ms ± 3% 4.34ms ± 1% -5.04% (p=0.000 n=8+9)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in8x56x56x64_f1x1x256_s1x1_SAME 12.0ms ± 4% 11.5ms ± 2% -4.32% (p=0.000 n=8+9)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in8x56x56x256_f1x1x64_s1x1_SAME 20.0ms ±31% 20.6ms ±17% ~ (p=0.912 n=10+10)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in8x56x56x64_f3x3x64_s1x1_SAME 36.5ms ±21% 32.0ms ± 1% -12.30% (p=0.000 n=10+9)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in8x28x28x128_f1x1x128_s1x1_SAME 3.71ms ±17% 3.33ms ± 1% -10.47% (p=0.000 n=10+9)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in8x28x28x128_f1x1x512_s1x1_SAME 11.8ms ±16% 10.5ms ± 1% -11.37% (p=0.000 n=10+8)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in8x28x28x512_f1x1x128_s1x1_SAME 13.1ms ±13% 11.4ms ± 2% -13.36% (p=0.000 n=10+8)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in8x28x28x512_f3x3x128_s1x1_SAME 142ms ±12% 124ms ± 1% -13.22% (p=0.000 n=10+8)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in8x14x14x256_f1x1x256_s1x1_SAME 3.51ms ±14% 3.18ms ±20% -9.43% (p=0.009 n=10+10)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in8x14x14x256_f1x1x1024_s1x1_SAME 14.0ms ±18% 12.0ms ± 4% -13.80% (p=0.012 n=10+8)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in8x14x14x1024_f1x1x256_s1x1_SAME 12.8ms ±18% 11.1ms ± 2% -13.57% (p=0.001 n=10+8)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in8x14x14x256_f3x3x256_s1x1_SAME 23.0ms ±18% 19.9ms ± 4% -13.38% (p=0.004 n=10+8)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in16x112x112x64_f2x2x64_s2x2_SAME 45.3ms ± 9% 40.5ms ± 4% -10.74% (p=0.000 n=10+9)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in16x56x56x128_f2x2x128_s2x2_SAME 33.2ms ±13% 28.8ms ± 2% -13.11% (p=0.001 n=10+8)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in16x28x28x256_f2x2x256_s2x2_SAME 31.5ms ±15% 26.7ms ± 2% -15.13% (p=0.000 n=10+8)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in16x112x112x64_f2x2x64_s2x2_VALID 45.5ms ± 8% 41.3ms ± 9% -9.31% (p=0.000 n=10+10)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in16x56x56x128_f2x2x128_s2x2_VALID 33.2ms ±12% 28.8ms ± 2% -13.38% (p=0.000 n=10+9)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in16x28x28x256_f2x2x256_s2x2_VALID 31.6ms ±14% 26.7ms ± 1% -15.53% (p=0.000 n=10+8)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in16x56x56x64_f1x1x64_s1x1_SAME 11.1ms ±15% 9.4ms ± 3% -15.29% (p=0.000 n=10+10)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in16x56x56x64_f1x1x256_s1x1_SAME 27.3ms ±13% 23.5ms ± 2% -13.97% (p=0.000 n=10+10)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in16x56x56x256_f1x1x64_s1x1_SAME 46.2ms ± 8% 40.9ms ± 5% -11.54% (p=0.000 n=10+10)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in16x56x56x64_f3x3x64_s1x1_SAME 81.5ms ±17% 64.2ms ± 1% -21.17% (p=0.000 n=10+10)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in16x28x28x128_f1x1x128_s1x1_SAME 8.30ms ±16% 6.80ms ± 1% -18.02% (p=0.000 n=10+10)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in16x28x28x128_f1x1x512_s1x1_SAME 27.7ms ±14% 23.3ms ± 2% -15.87% (p=0.000 n=10+10)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in16x28x28x512_f1x1x128_s1x1_SAME 34.9ms ±14% 28.8ms ± 2% -17.41% (p=0.000 n=10+10)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in16x28x28x512_f3x3x128_s1x1_SAME 300ms ±12% 250ms ± 1% -16.80% (p=0.000 n=10+9)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in16x14x14x256_f1x1x256_s1x1_SAME 7.91ms ±15% 7.17ms ±21% -9.36% (p=0.029 n=10+10)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in16x14x14x256_f1x1x1024_s1x1_SAME 35.6ms ±15% 33.4ms ±13% -6.21% (p=0.023 n=10+10)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in16x14x14x1024_f1x1x256_s1x1_SAME 28.0ms ± 3% 26.7ms ± 1% -4.81% (p=0.000 n=8+8)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in16x14x14x256_f3x3x256_s1x1_SAME 41.4ms ± 3% 39.6ms ± 4% -4.43% (p=0.001 n=9+9)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in32x112x112x64_f2x2x64_s2x2_SAME 90.0ms ± 3% 85.5ms ±11% -5.00% (p=0.010 n=9+10)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in32x56x56x128_f2x2x128_s2x2_SAME 69.0ms ± 1% 65.8ms ± 3% -4.68% (p=0.000 n=9+9)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in32x28x28x256_f2x2x256_s2x2_SAME 57.8ms ± 2% 55.8ms ± 2% -3.54% (p=0.000 n=9+9)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in32x112x112x64_f2x2x64_s2x2_VALID 89.3ms ± 2% 86.1ms ± 5% -3.57% (p=0.006 n=9+10)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in32x56x56x128_f2x2x128_s2x2_VALID 68.7ms ± 2% 65.7ms ± 3% -4.31% (p=0.001 n=8+9)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in32x28x28x256_f2x2x256_s2x2_VALID 57.1ms ± 1% 55.6ms ± 1% -2.58% (p=0.000 n=8+10)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in32x56x56x64_f1x1x64_s1x1_SAME 23.6ms ± 9% 21.9ms ± 1% -7.27% (p=0.000 n=10+9)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in32x56x56x64_f1x1x256_s1x1_SAME 51.0ms ± 3% 48.4ms ± 5% -5.13% (p=0.001 n=8+10)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in32x56x56x256_f1x1x64_s1x1_SAME 90.2ms ± 3% 85.4ms ± 5% -5.29% (p=0.000 n=10+9)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in32x56x56x64_f3x3x64_s1x1_SAME 143ms ± 7% 133ms ± 3% -6.48% (p=0.000 n=9+9)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in32x28x28x128_f1x1x128_s1x1_SAME 14.5ms ± 4% 14.0ms ± 2% -3.82% (p=0.000 n=10+8)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in32x28x28x128_f1x1x512_s1x1_SAME 53.8ms ± 3% 51.5ms ± 2% -4.43% (p=0.000 n=10+9)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in32x28x28x512_f1x1x128_s1x1_SAME 69.8ms ± 5% 66.9ms ± 8% -4.15% (p=0.010 n=10+9)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in32x28x28x512_f3x3x128_s1x1_SAME 533ms ± 3% 508ms ± 0% -4.65% (p=0.000 n=10+8)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in32x14x14x256_f1x1x256_s1x1_SAME 14.0ms ± 6% 14.9ms ±21% ~ (p=0.481 n=10+10)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in32x14x14x256_f1x1x1024_s1x1_SAME 67.2ms ± 3% 71.0ms ±22% ~ (p=0.278 n=9+10)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in32x14x14x1024_f1x1x256_s1x1_SAME 57.5ms ± 4% 56.6ms ± 5% ~ (p=0.055 n=10+8)
BM_Conv2DBackpropInput_fp32_NHWC_cpu_in32x14x14x256_f3x3x256_s1x1_SAME 78.1ms ± 5% 75.5ms ± 3% -3.32% (p=0.011 n=9+8)
```

PiperOrigin-RevId: 316949219
Change-Id: I1a1fb517a5c28d489da9762b650577b61bf4e0de
---
 tensorflow/core/kernels/conv_grad_input_ops.cc | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tensorflow/core/kernels/conv_grad_input_ops.cc b/tensorflow/core/kernels/conv_grad_input_ops.cc
index fd2f569a8b82f2..2dd63d1f4d05b7 100644
--- a/tensorflow/core/kernels/conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/conv_grad_input_ops.cc
@@ -76,7 +76,7 @@ template <typename T>
 void Col2im(const T* col_data, const int depth, const int height,
             const int width, const int filter_h, const int filter_w,
             const int pad_t, const int pad_l, const int pad_b, const int pad_r,
-            const int stride_h, const int stride_w, T* im_data) {
+            const int stride_h, const int stride_w, T* __restrict im_data) {
   int height_col = (height + pad_t + pad_b - filter_h) / stride_h + 1;
   int width_col = (width + pad_l + pad_r - filter_w) / stride_w + 1;
   int h_pad = -pad_t;
@@ -87,7 +87,6 @@ void Col2im(const T* col_data, const int depth, const int height,
       for (int ih = h_pad; ih < h_pad + filter_h; ++ih) {
         for (int iw = w_pad; iw < w_pad + filter_w; ++iw) {
           if (ih >= 0 && ih < height && iw >= 0 && iw < width) {
-            // TODO(andydavis) Vectorize this loop (if compiler does not).
             for (int i = 0; i < depth; ++i) {
               im_patch_data[i] += col_data[i];
             }