1/* 2 * Copyright (C) 2012 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17#include <sys/mman.h> 18#include <unistd.h> 19 20#include "rsCpuIntrinsic.h" 21#include "rsCpuIntrinsicInlines.h" 22 23#include <sys/mman.h> 24#include <stddef.h> 25#include <stdint.h> 26#include <stdlib.h> 27//#include <utils/StopWatch.h> 28 29 30/* uint kernel 31 * Q0 D0: Load slot for R 32 * D1: Load slot for G 33 * Q1 D2: Load slot for B 34 * D3: Load slot for A 35 * Q2 D4: Matrix 36 * D5: = 37 * Q3 D6: = 38 * D7: = 39 * Q4 D8: Add R 40 * D9: 41 * Q5 D10: Add G 42 * D11: 43 * Q6 D12: Add B 44 * D13: 45 * Q7 D14: Add A 46 * D15: 47 * Q8 D16: I32: R Sum 48 * D17: 49 * Q9 D18: I32: G Sum 50 * D19: 51 * Q10 D20: I32: B Sum 52 * D21: 53 * Q11 D22: I32: A Sum 54 * D23: 55 * Q12 D24: U16: expanded R 56 * D25: 57 * Q13 D26: U16: expanded G 58 * D27: 59 * Q14 D28: U16: expanded B 60 * D29: 61 * Q15 D30: U16: expanded A 62 * D31: 63 * 64 */ 65 66/* float kernel 67 * Q0 D0: Load slot for R 68 * D1: = 69 * Q1 D2: Load slot for G 70 * D3: = 71 * Q2 D4: Load slot for B 72 * D5: = 73 * Q3 D6: Load slot for A 74 * D7: = 75 * Q4 D8: Matrix 76 * D9: = 77 * Q5 D10: = 78 * D11: = 79 * Q6 D12: = 80 * D13: = 81 * Q7 D14: = 82 * D15: = 83 * Q8 D16: Add R 84 * D17: = 85 * Q9 D18: Add G 86 * D19: = 87 * Q10 D20: Add B 88 * D21: = 89 * Q11 D22: Add A 90 * D23: = 91 * Q12 D24: Sum R 92 * D25: = 93 * Q13 D26: Sum G 94 * D27: = 95 * Q14 D28: Sum B 96 * D29: = 97 * Q15 D30: Sum A 98 * D31: = 99 * 100 */ 101 102 103 104using namespace android; 105using namespace android::renderscript; 106 107namespace android { 108namespace renderscript { 109 110typedef union { 111 uint64_t key; 112 struct { 113 uint32_t inVecSize :2; // [0 - 1] 114 uint32_t outVecSize :2; // [2 - 3] 115 uint32_t inType :4; // [4 - 7] 116 uint32_t outType :4; // [8 - 11] 117 uint32_t dot :1; // [12] 118 uint32_t _unused1 :1; // [13] 119 uint32_t copyAlpha :1; // [14] 120 uint32_t _unused2 :1; // [15] 121 uint32_t coeffMask :16; // [16-31] 122 uint32_t addMask :4; // [32-35] 123 } u; 124} Key_t; 125 126//Re-enable when intrinsic is fixed 127#if defined(ARCH_ARM64_USE_INTRINSICS) 128typedef struct { 129 void (*column[4])(void); 130 void (*store)(void); 131 void (*load)(void); 132 void (*store_end)(void); 133 void (*load_end)(void); 134} FunctionTab_t; 135 136extern "C" void rsdIntrinsicColorMatrix_int_K( 137 void *out, void const *in, size_t count, 138 FunctionTab_t const *fns, 139 int16_t const *mult, int32_t const *add); 140 141extern "C" void rsdIntrinsicColorMatrix_float_K( 142 void *out, void const *in, size_t count, 143 FunctionTab_t const *fns, 144 float const *mult, float const *add); 145 146/* The setup functions fill in function tables to be used by above functions; 147 * this code also eliminates jump-to-another-jump cases by short-circuiting 148 * empty functions. While it's not performance critical, it works out easier 149 * to write the set-up code in assembly than to try to expose the same symbols 150 * and write the code in C. 151 */ 152extern "C" void rsdIntrinsicColorMatrixSetup_int_K( 153 FunctionTab_t *fns, 154 uint32_t mask, int dt, int st); 155 156extern "C" void rsdIntrinsicColorMatrixSetup_float_K( 157 FunctionTab_t *fns, 158 uint32_t mask, int dt, int st); 159#endif 160 161class RsdCpuScriptIntrinsicColorMatrix : public RsdCpuScriptIntrinsic { 162public: 163 void populateScript(Script *) override; 164 165 void setGlobalVar(uint32_t slot, const void *data, size_t dataLength) override; 166 167 ~RsdCpuScriptIntrinsicColorMatrix() override; 168 RsdCpuScriptIntrinsicColorMatrix(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e); 169 170 void preLaunch(uint32_t slot, const Allocation ** ains, 171 uint32_t inLen, Allocation * aout, const void * usr, 172 uint32_t usrLen, const RsScriptCall *sc) override; 173 174protected: 175 float fp[16]; 176 float fpa[4]; 177 178 // The following four fields are read as constants 179 // by the SIMD assembly code. 180 short ip[16]; 181 int ipa[4]; 182 float tmpFp[16]; 183 float tmpFpa[4]; 184#if defined(ARCH_ARM64_USE_INTRINSICS) 185 FunctionTab_t mFnTab; 186#endif 187 188 static void kernel(const RsExpandKernelDriverInfo *info, 189 uint32_t xstart, uint32_t xend, 190 uint32_t outstep); 191 void updateCoeffCache(float fpMul, float addMul); 192 193 Key_t mLastKey; 194 unsigned char *mBuf; 195 size_t mBufSize; 196 197 Key_t computeKey(const Element *ein, const Element *eout); 198 199 bool build(Key_t key); 200 201 void (*mOptKernel)(void *dst, const void *src, const short *coef, uint32_t count); 202 203}; 204 205} 206} 207 208 209Key_t RsdCpuScriptIntrinsicColorMatrix::computeKey( 210 const Element *ein, const Element *eout) { 211 212 Key_t key; 213 key.key = 0; 214 215 // Compute a unique code key for this operation 216 217 // Add to the key the input and output types 218 bool hasFloat = false; 219 if (ein->getType() == RS_TYPE_FLOAT_32) { 220 hasFloat = true; 221 key.u.inType = RS_TYPE_FLOAT_32; 222 rsAssert(key.u.inType == RS_TYPE_FLOAT_32); 223 } 224 if (eout->getType() == RS_TYPE_FLOAT_32) { 225 hasFloat = true; 226 key.u.outType = RS_TYPE_FLOAT_32; 227 rsAssert(key.u.outType == RS_TYPE_FLOAT_32); 228 } 229 230 // Mask in the bits indicating which coefficients in the 231 // color matrix are needed. 232 if (hasFloat) { 233 for (uint32_t i=0; i < 16; i++) { 234 if (fabs(fp[i]) != 0.f) { 235 key.u.coeffMask |= 1 << i; 236 } 237 } 238 if (fabs(fpa[0]) != 0.f) key.u.addMask |= 0x1; 239 if (fabs(fpa[1]) != 0.f) key.u.addMask |= 0x2; 240 if (fabs(fpa[2]) != 0.f) key.u.addMask |= 0x4; 241 if (fabs(fpa[3]) != 0.f) key.u.addMask |= 0x8; 242 243 } else { 244 for (uint32_t i=0; i < 16; i++) { 245 if (ip[i] != 0) { 246 key.u.coeffMask |= 1 << i; 247 } 248 } 249 if (ipa[0] != 0) key.u.addMask |= 0x1; 250 if (ipa[1] != 0) key.u.addMask |= 0x2; 251 if (ipa[2] != 0) key.u.addMask |= 0x4; 252 if (ipa[3] != 0) key.u.addMask |= 0x8; 253 } 254 255 // Look for a dot product where the r,g,b colums are the same 256 if ((ip[0] == ip[1]) && (ip[0] == ip[2]) && 257 (ip[4] == ip[5]) && (ip[4] == ip[6]) && 258 (ip[8] == ip[9]) && (ip[8] == ip[10]) && 259 (ip[12] == ip[13]) && (ip[12] == ip[14])) { 260 261 if (!key.u.addMask) key.u.dot = 1; 262 } 263 264 // Is alpha a simple copy 265 if (!(key.u.coeffMask & 0x0888) && (ip[15] == 256) && !(key.u.addMask & 0x8)) { 266 key.u.copyAlpha = !(key.u.inType || key.u.outType); 267 } 268 269 //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key); 270 271 switch (ein->getVectorSize()) { 272 case 4: 273 key.u.inVecSize = 3; 274 break; 275 case 3: 276 key.u.inVecSize = 2; 277 key.u.coeffMask &= ~0xF000; 278 break; 279 case 2: 280 key.u.inVecSize = 1; 281 key.u.coeffMask &= ~0xFF00; 282 break; 283 default: 284 key.u.coeffMask &= ~0xFFF0; 285 break; 286 } 287 288 switch (eout->getVectorSize()) { 289 case 4: 290 key.u.outVecSize = 3; 291 break; 292 case 3: 293 key.u.outVecSize = 2; 294 key.u.coeffMask &= ~0x8888; 295 key.u.addMask &= 7; 296 break; 297 case 2: 298 key.u.outVecSize = 1; 299 key.u.coeffMask &= ~0xCCCC; 300 key.u.addMask &= 3; 301 break; 302 default: 303 key.u.coeffMask &= ~0xEEEE; 304 key.u.addMask &= 1; 305 break; 306 } 307 308 if (key.u.inType && !key.u.outType) { 309 key.u.addMask |= 1; 310 if (key.u.outVecSize > 0) key.u.addMask |= 2; 311 if (key.u.outVecSize > 1) key.u.addMask |= 4; 312 if (key.u.outVecSize > 2) key.u.addMask |= 8; 313 } 314 315 //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key); 316 return key; 317} 318 319#if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS) 320 321#define DEF_SYM(x) \ 322 extern "C" uint32_t _N_ColorMatrix_##x; \ 323 extern "C" uint32_t _N_ColorMatrix_##x##_end; \ 324 extern "C" uint32_t _N_ColorMatrix_##x##_len; 325 326DEF_SYM(prefix_i) 327DEF_SYM(prefix_f) 328DEF_SYM(postfix1) 329DEF_SYM(postfix2) 330 331DEF_SYM(load_u8_4) 332DEF_SYM(load_u8_3) 333DEF_SYM(load_u8_2) 334DEF_SYM(load_u8_1) 335DEF_SYM(load_u8f_4) 336DEF_SYM(load_u8f_3) 337DEF_SYM(load_u8f_2) 338DEF_SYM(load_u8f_1) 339DEF_SYM(load_f32_4) 340DEF_SYM(load_f32_3) 341DEF_SYM(load_f32_2) 342DEF_SYM(load_f32_1) 343 344DEF_SYM(store_u8_4) 345DEF_SYM(store_u8_2) 346DEF_SYM(store_u8_1) 347DEF_SYM(store_f32_4) 348DEF_SYM(store_f32_3) 349DEF_SYM(store_f32_2) 350DEF_SYM(store_f32_1) 351DEF_SYM(store_f32u_4) 352DEF_SYM(store_f32u_2) 353DEF_SYM(store_f32u_1) 354 355DEF_SYM(unpack_u8_4) 356DEF_SYM(unpack_u8_3) 357DEF_SYM(unpack_u8_2) 358DEF_SYM(unpack_u8_1) 359DEF_SYM(pack_u8_4) 360DEF_SYM(pack_u8_3) 361DEF_SYM(pack_u8_2) 362DEF_SYM(pack_u8_1) 363DEF_SYM(dot) 364DEF_SYM(add_0_u8) 365DEF_SYM(add_1_u8) 366DEF_SYM(add_2_u8) 367DEF_SYM(add_3_u8) 368 369#define ADD_CHUNK(x) \ 370 memcpy(buf, &_N_ColorMatrix_##x, _N_ColorMatrix_##x##_len); \ 371 buf += _N_ColorMatrix_##x##_len 372 373 374static uint8_t * addBranch(uint8_t *buf, const uint8_t *target, uint32_t condition) { 375 size_t off = (target - buf - 8) >> 2; 376 rsAssert(((off & 0xff000000) == 0) || 377 ((off & 0xff000000) == 0xff000000)); 378 379 uint32_t op = (condition << 28); 380 op |= 0xa << 24; // branch 381 op |= 0xffffff & off; 382 ((uint32_t *)buf)[0] = op; 383 return buf + 4; 384} 385 386static uint32_t encodeSIMDRegs(uint32_t vd, uint32_t vn, uint32_t vm) { 387 rsAssert(vd < 32); 388 rsAssert(vm < 32); 389 rsAssert(vn < 32); 390 391 uint32_t op = ((vd & 0xf) << 12) | (((vd & 0x10) >> 4) << 22); 392 op |= (vm & 0xf) | (((vm & 0x10) >> 4) << 5); 393 op |= ((vn & 0xf) << 16) | (((vn & 0x10) >> 4) << 7); 394 return op; 395} 396 397static uint8_t * addVMLAL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) { 398 //vmlal.s16 Q#1, D#1, D#2[#] 399 uint32_t op = 0xf2900240 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3)); 400 ((uint32_t *)buf)[0] = op; 401 return buf + 4; 402} 403 404static uint8_t * addVMULL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) { 405 //vmull.s16 Q#1, D#1, D#2[#] 406 uint32_t op = 0xf2900A40 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3)); 407 ((uint32_t *)buf)[0] = op; 408 return buf + 4; 409} 410 411static uint8_t * addVQADD_S32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) { 412 //vqadd.s32 Q#1, Q#1, Q#2 413 uint32_t op = 0xf2200050 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1); 414 ((uint32_t *)buf)[0] = op; 415 return buf + 4; 416} 417 418static uint8_t * addVMLAL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) { 419 //vmlal.f32 Q#1, D#1, D#2[#] 420 uint32_t op = 0xf3a00140 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4)); 421 ((uint32_t *)buf)[0] = op; 422 return buf + 4; 423} 424 425static uint8_t * addVMULL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) { 426 //vmull.f32 Q#1, D#1, D#2[#] 427 uint32_t op = 0xf3a00940 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4)); 428 ((uint32_t *)buf)[0] = op; 429 return buf + 4; 430} 431 432static uint8_t * addVORR_32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) { 433 //vadd.f32 Q#1, D#1, D#2 434 uint32_t op = 0xf2200150 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1); 435 ((uint32_t *)buf)[0] = op; 436 return buf + 4; 437} 438 439static uint8_t * addVMOV_32(uint8_t *buf, uint32_t dest_q, uint32_t imm) { 440 //vmov.32 Q#1, #imm 441 rsAssert(imm == 0); 442 uint32_t op = 0xf2800050 | encodeSIMDRegs(dest_q << 1, 0, 0); 443 ((uint32_t *)buf)[0] = op; 444 return buf + 4; 445} 446 447static uint8_t * addVADD_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) { 448 //vadd.f32 Q#1, D#1, D#2 449 uint32_t op = 0xf2000d40 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1); 450 ((uint32_t *)buf)[0] = op; 451 return buf + 4; 452} 453#endif 454 455#if defined(ARCH_X86_HAVE_SSSE3) 456extern void rsdIntrinsicColorMatrixDot_K(void *dst, const void *src, 457 const short *coef, uint32_t count); 458extern void rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src, 459 const short *coef, uint32_t count); 460extern void rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src, 461 const short *coef, uint32_t count); 462 463void * selectKernel(Key_t key) 464{ 465 void * kernel = nullptr; 466 467 // inType, outType float if nonzero 468 if (!(key.u.inType || key.u.outType)) { 469 if (key.u.dot) 470 kernel = (void *)rsdIntrinsicColorMatrixDot_K; 471 else if (key.u.copyAlpha) 472 kernel = (void *)rsdIntrinsicColorMatrix3x3_K; 473 else 474 kernel = (void *)rsdIntrinsicColorMatrix4x4_K; 475 } 476 477 return kernel; 478} 479#endif 480 481bool RsdCpuScriptIntrinsicColorMatrix::build(Key_t key) { 482#if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS) 483 mBufSize = 4096; 484 //StopWatch build_time("rs cm: build time"); 485 mBuf = (uint8_t *)mmap(0, mBufSize, PROT_READ | PROT_WRITE, 486 MAP_PRIVATE | MAP_ANON, -1, 0); 487 if (mBuf == MAP_FAILED) { 488 mBuf = NULL; 489 return false; 490 } 491 492 uint8_t *buf = mBuf; 493 uint8_t *buf2 = nullptr; 494 495 int ops[5][4]; // 0=unused, 1 = set, 2 = accumulate, 3 = final 496 int opInit[4] = {0, 0, 0, 0}; 497 498 memset(ops, 0, sizeof(ops)); 499 for (int i=0; i < 4; i++) { 500 if (key.u.coeffMask & (1 << (i*4))) { 501 ops[i][0] = 0x2 | opInit[0]; 502 opInit[0] = 1; 503 } 504 if (!key.u.dot) { 505 if (key.u.coeffMask & (1 << (1 + i*4))) { 506 ops[i][1] = 0x2 | opInit[1]; 507 opInit[1] = 1; 508 } 509 if (key.u.coeffMask & (1 << (2 + i*4))) { 510 ops[i][2] = 0x2 | opInit[2]; 511 opInit[2] = 1; 512 } 513 } 514 if (!key.u.copyAlpha) { 515 if (key.u.coeffMask & (1 << (3 + i*4))) { 516 ops[i][3] = 0x2 | opInit[3]; 517 opInit[3] = 1; 518 } 519 } 520 } 521 522 if (key.u.inType || key.u.outType) { 523 key.u.copyAlpha = 0; 524 ADD_CHUNK(prefix_f); 525 buf2 = buf; 526 527 // Load the incoming r,g,b,a as needed 528 if (key.u.inType) { 529 switch(key.u.inVecSize) { 530 case 3: 531 ADD_CHUNK(load_f32_4); 532 break; 533 case 2: 534 ADD_CHUNK(load_f32_3); 535 break; 536 case 1: 537 ADD_CHUNK(load_f32_2); 538 break; 539 case 0: 540 ADD_CHUNK(load_f32_1); 541 break; 542 } 543 } else { 544 switch(key.u.inVecSize) { 545 case 3: 546 ADD_CHUNK(load_u8f_4); 547 break; 548 case 2: 549 ADD_CHUNK(load_u8f_3); 550 break; 551 case 1: 552 ADD_CHUNK(load_u8f_2); 553 break; 554 case 0: 555 ADD_CHUNK(load_u8f_1); 556 break; 557 } 558 } 559 560 for (int i=0; i < 4; i++) { 561 for (int j=0; j < 4; j++) { 562 switch(ops[i][j]) { 563 case 0: 564 break; 565 case 2: 566 buf = addVMULL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1); 567 break; 568 case 3: 569 buf = addVMLAL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1); 570 break; 571 } 572 } 573 } 574 for (int j=0; j < 4; j++) { 575 if (opInit[j]) { 576 if (key.u.addMask & (1 << j)) { 577 buf = addVADD_F32(buf, j, 12+j, 8+j); 578 } else { 579 buf = addVORR_32(buf, j, 12+j, 12+j); 580 } 581 } else { 582 if (key.u.addMask & (1 << j)) { 583 buf = addVORR_32(buf, j, 8+j, 8+j); 584 } else { 585 buf = addVMOV_32(buf, j, 0); 586 } 587 } 588 } 589 590 if (key.u.outType) { 591 switch(key.u.outVecSize) { 592 case 3: 593 ADD_CHUNK(store_f32_4); 594 break; 595 case 2: 596 ADD_CHUNK(store_f32_3); 597 break; 598 case 1: 599 ADD_CHUNK(store_f32_2); 600 break; 601 case 0: 602 ADD_CHUNK(store_f32_1); 603 break; 604 } 605 } else { 606 switch(key.u.outVecSize) { 607 case 3: 608 case 2: 609 ADD_CHUNK(store_f32u_4); 610 break; 611 case 1: 612 ADD_CHUNK(store_f32u_2); 613 break; 614 case 0: 615 ADD_CHUNK(store_f32u_1); 616 break; 617 } 618 } 619 620 621 } else { 622 // Add the function prefix 623 // Store the address for the loop return 624 ADD_CHUNK(prefix_i); 625 buf2 = buf; 626 627 // Load the incoming r,g,b,a as needed 628 switch(key.u.inVecSize) { 629 case 3: 630 ADD_CHUNK(load_u8_4); 631 if (key.u.copyAlpha) { 632 ADD_CHUNK(unpack_u8_3); 633 } else { 634 ADD_CHUNK(unpack_u8_4); 635 } 636 break; 637 case 2: 638 ADD_CHUNK(load_u8_3); 639 ADD_CHUNK(unpack_u8_3); 640 break; 641 case 1: 642 ADD_CHUNK(load_u8_2); 643 ADD_CHUNK(unpack_u8_2); 644 break; 645 case 0: 646 ADD_CHUNK(load_u8_1); 647 ADD_CHUNK(unpack_u8_1); 648 break; 649 } 650 651 // Add multiply and accumulate 652 // use MULL to init the output register, 653 // use MLAL from there 654 for (int i=0; i < 4; i++) { 655 for (int j=0; j < 4; j++) { 656 switch(ops[i][j]) { 657 case 0: 658 break; 659 case 2: 660 buf = addVMULL_S16(buf, 8+j, 24+i*2, 4+i, j); 661 break; 662 case 3: 663 buf = addVMLAL_S16(buf, 8+j, 24+i*2, 4+i, j); 664 break; 665 } 666 } 667 } 668 for (int j=0; j < 4; j++) { 669 if (opInit[j]) { 670 if (key.u.addMask & (1 << j)) { 671 buf = addVQADD_S32(buf, 8+j, 8+j, 4+j); 672 } 673 } else { 674 if (key.u.addMask & (1 << j)) { 675 buf = addVORR_32(buf, 8+j, 4+j, 4+j); 676 } 677 } 678 } 679 680 // If we have a dot product, perform the special pack. 681 if (key.u.dot) { 682 ADD_CHUNK(pack_u8_1); 683 ADD_CHUNK(dot); 684 } else { 685 switch(key.u.outVecSize) { 686 case 3: 687 if (key.u.copyAlpha) { 688 ADD_CHUNK(pack_u8_3); 689 } else { 690 ADD_CHUNK(pack_u8_4); 691 } 692 break; 693 case 2: 694 ADD_CHUNK(pack_u8_3); 695 break; 696 case 1: 697 ADD_CHUNK(pack_u8_2); 698 break; 699 case 0: 700 ADD_CHUNK(pack_u8_1); 701 break; 702 } 703 } 704 705 // Write out result 706 switch(key.u.outVecSize) { 707 case 3: 708 case 2: 709 ADD_CHUNK(store_u8_4); 710 break; 711 case 1: 712 ADD_CHUNK(store_u8_2); 713 break; 714 case 0: 715 ADD_CHUNK(store_u8_1); 716 break; 717 } 718 } 719 720 if (key.u.inType != key.u.outType) { 721 key.u.copyAlpha = 0; 722 key.u.dot = 0; 723 } 724 725 // Loop, branch, and cleanup 726 ADD_CHUNK(postfix1); 727 buf = addBranch(buf, buf2, 0x01); 728 ADD_CHUNK(postfix2); 729 730 int ret = mprotect(mBuf, mBufSize, PROT_READ | PROT_EXEC); 731 if (ret == -1) { 732 ALOGE("mprotect error %i", ret); 733 return false; 734 } 735 736 __builtin___clear_cache((char *) mBuf, (char*) mBuf + mBufSize); 737 return true; 738#else 739 return false; 740#endif 741} 742 743void RsdCpuScriptIntrinsicColorMatrix::updateCoeffCache(float fpMul, float addMul) { 744 for(int ct=0; ct < 16; ct++) { 745 ip[ct] = (short)(fp[ct] * 256.f + 0.5f); 746 tmpFp[ct] = fp[ct] * fpMul; 747 //ALOGE("mat %i %f %f", ct, fp[ct], tmpFp[ct]); 748 } 749 750 float add = 0.f; 751 if (fpMul > 254.f) add = 0.5f; 752 for(int ct=0; ct < 4; ct++) { 753 tmpFpa[ct] = fpa[ct] * addMul + add; 754 //ALOGE("fpa %i %f %f", ct, fpa[ct], tmpFpa[ct * 4 + 0]); 755 } 756 757 for(int ct=0; ct < 4; ct++) { 758 ipa[ct] = (int)(fpa[ct] * 65536.f + 0.5f); 759 } 760} 761 762void RsdCpuScriptIntrinsicColorMatrix::setGlobalVar(uint32_t slot, const void *data, 763 size_t dataLength) { 764 switch(slot) { 765 case 0: 766 memcpy (fp, data, sizeof(fp)); 767 break; 768 case 1: 769 memcpy (fpa, data, sizeof(fpa)); 770 break; 771 default: 772 rsAssert(0); 773 break; 774 } 775 mRootPtr = &kernel; 776} 777 778 779static void One(const RsExpandKernelDriverInfo *info, void *out, 780 const void *py, const float* coeff, const float *add, 781 uint32_t vsin, uint32_t vsout, bool fin, bool fout) { 782 783 float4 f = 0.f; 784 if (fin) { 785 switch(vsin) { 786 case 3: 787 f = ((const float4 *)py)[0]; 788 break; 789 case 2: 790 f = ((const float4 *)py)[0]; 791 f.w = 0.f; 792 break; 793 case 1: 794 f.xy = ((const float2 *)py)[0]; 795 break; 796 case 0: 797 f.x = ((const float *)py)[0]; 798 break; 799 } 800 } else { 801 switch(vsin) { 802 case 3: 803 f = convert_float4(((const uchar4 *)py)[0]); 804 break; 805 case 2: 806 f = convert_float4(((const uchar4 *)py)[0]); 807 f.w = 0.f; 808 break; 809 case 1: 810 f.xy = convert_float2(((const uchar2 *)py)[0]); 811 break; 812 case 0: 813 f.x = (float)(((const uchar *)py)[0]); 814 break; 815 } 816 } 817 //ALOGE("f1 %f %f %f %f", f.x, f.y, f.z, f.w); 818 819 float4 sum; 820 sum.x = f.x * coeff[0] + 821 f.y * coeff[4] + 822 f.z * coeff[8] + 823 f.w * coeff[12]; 824 sum.y = f.x * coeff[1] + 825 f.y * coeff[5] + 826 f.z * coeff[9] + 827 f.w * coeff[13]; 828 sum.z = f.x * coeff[2] + 829 f.y * coeff[6] + 830 f.z * coeff[10] + 831 f.w * coeff[14]; 832 sum.w = f.x * coeff[3] + 833 f.y * coeff[7] + 834 f.z * coeff[11] + 835 f.w * coeff[15]; 836 //ALOGE("f2 %f %f %f %f", sum.x, sum.y, sum.z, sum.w); 837 838 sum.x += add[0]; 839 sum.y += add[1]; 840 sum.z += add[2]; 841 sum.w += add[3]; 842 843 844 //ALOGE("fout %i vs %i, sum %f %f %f %f", fout, vsout, sum.x, sum.y, sum.z, sum.w); 845 if (fout) { 846 switch(vsout) { 847 case 3: 848 case 2: 849 ((float4 *)out)[0] = sum; 850 break; 851 case 1: 852 ((float2 *)out)[0] = sum.xy; 853 break; 854 case 0: 855 ((float *)out)[0] = sum.x; 856 break; 857 } 858 } else { 859 sum.x = sum.x < 0 ? 0 : (sum.x > 255.5 ? 255.5 : sum.x); 860 sum.y = sum.y < 0 ? 0 : (sum.y > 255.5 ? 255.5 : sum.y); 861 sum.z = sum.z < 0 ? 0 : (sum.z > 255.5 ? 255.5 : sum.z); 862 sum.w = sum.w < 0 ? 0 : (sum.w > 255.5 ? 255.5 : sum.w); 863 864 switch(vsout) { 865 case 3: 866 case 2: 867 ((uchar4 *)out)[0] = convert_uchar4(sum); 868 break; 869 case 1: 870 ((uchar2 *)out)[0] = convert_uchar2(sum.xy); 871 break; 872 case 0: 873 ((uchar *)out)[0] = sum.x; 874 break; 875 } 876 } 877 //ALOGE("out %p %f %f %f %f", out, ((float *)out)[0], ((float *)out)[1], ((float *)out)[2], ((float *)out)[3]); 878} 879 880void RsdCpuScriptIntrinsicColorMatrix::kernel(const RsExpandKernelDriverInfo *info, 881 uint32_t xstart, uint32_t xend, 882 uint32_t outstep) { 883 RsdCpuScriptIntrinsicColorMatrix *cp = (RsdCpuScriptIntrinsicColorMatrix *)info->usr; 884 885 uint32_t instep = info->inStride[0]; 886 887 uchar *out = (uchar *)info->outPtr[0]; 888 uchar *in = (uchar *)info->inPtr[0]; 889 uint32_t x1 = xstart; 890 uint32_t x2 = xend; 891 892 uint32_t vsin = cp->mLastKey.u.inVecSize; 893 uint32_t vsout = cp->mLastKey.u.outVecSize; 894 bool floatIn = !!cp->mLastKey.u.inType; 895 bool floatOut = !!cp->mLastKey.u.outType; 896 897 //if (!info->current.y) ALOGE("steps %i %i %i %i", instep, outstep, vsin, vsout); 898 899 if(x2 > x1) { 900 int32_t len = x2 - x1; 901 if (gArchUseSIMD) { 902 if((cp->mOptKernel != nullptr) && (len >= 4)) { 903 // The optimized kernel processes 4 pixels at once 904 // and requires a minimum of 1 chunk of 4 905 cp->mOptKernel(out, in, cp->ip, len >> 2); 906 // Update the len and pointers so the generic code can 907 // finish any leftover pixels 908 len &= ~3; 909 x1 += len; 910 out += outstep * len; 911 in += instep * len; 912 } 913#if defined(ARCH_ARM64_USE_INTRINSICS) 914 else { 915 if (cp->mLastKey.u.inType == RS_TYPE_FLOAT_32 || cp->mLastKey.u.outType == RS_TYPE_FLOAT_32) { 916 // Currently this generates off by one errors. 917 //rsdIntrinsicColorMatrix_float_K(out, in, len, &cp->mFnTab, cp->tmpFp, cp->tmpFpa); 918 //x1 += len; 919 //out += outstep * len; 920 //in += instep * len; 921 } else { 922 rsdIntrinsicColorMatrix_int_K(out, in, len, &cp->mFnTab, cp->ip, cp->ipa); 923 x1 += len; 924 out += outstep * len; 925 in += instep * len; 926 } 927 } 928#endif 929 } 930 931 while(x1 != x2) { 932 One(info, out, in, cp->tmpFp, cp->tmpFpa, vsin, vsout, floatIn, floatOut); 933 out += outstep; 934 in += instep; 935 x1++; 936 } 937 } 938} 939 940void RsdCpuScriptIntrinsicColorMatrix::preLaunch(uint32_t slot, 941 const Allocation ** ains, 942 uint32_t inLen, 943 Allocation * aout, 944 const void * usr, 945 uint32_t usrLen, 946 const RsScriptCall *sc) { 947 948 const Element *ein = ains[0]->mHal.state.type->getElement(); 949 const Element *eout = aout->mHal.state.type->getElement(); 950 951 if (ein->getType() == eout->getType()) { 952 if (eout->getType() == RS_TYPE_UNSIGNED_8) { 953 updateCoeffCache(1.f, 255.f); 954 } else { 955 updateCoeffCache(1.f, 1.f); 956 } 957 } else { 958 if (eout->getType() == RS_TYPE_UNSIGNED_8) { 959 updateCoeffCache(255.f, 255.f); 960 } else { 961 updateCoeffCache(1.f / 255.f, 1.f); 962 } 963 } 964 965 Key_t key = computeKey(ein, eout); 966 967#if defined(ARCH_X86_HAVE_SSSE3) 968 if ((mOptKernel == nullptr) || (mLastKey.key != key.key)) { 969 // FIXME: Disable mOptKernel to pass RS color matrix CTS cases 970 // mOptKernel = (void (*)(void *, const void *, const short *, uint32_t)) selectKernel(key); 971 mLastKey = key; 972 } 973 974#else //if !defined(ARCH_X86_HAVE_SSSE3) 975 if ((mOptKernel == nullptr) || (mLastKey.key != key.key)) { 976 if (mBuf) munmap(mBuf, mBufSize); 977 mBuf = nullptr; 978 mOptKernel = nullptr; 979 if (build(key)) { 980 mOptKernel = (void (*)(void *, const void *, const short *, uint32_t)) mBuf; 981 } 982#if defined(ARCH_ARM64_USE_INTRINSICS) 983 else { 984 int dt = key.u.outVecSize + (key.u.outType == RS_TYPE_FLOAT_32 ? 4 : 0); 985 int st = key.u.inVecSize + (key.u.inType == RS_TYPE_FLOAT_32 ? 4 : 0); 986 uint32_t mm = 0; 987 int i; 988 for (i = 0; i < 4; i++) 989 { 990 uint32_t m = (key.u.coeffMask >> i) & 0x1111; 991 m = ((m * 0x249) >> 9) & 15; 992 m |= ((key.u.addMask >> i) & 1) << 4; 993 mm |= m << (i * 5); 994 } 995 996 if (key.u.inType == RS_TYPE_FLOAT_32 || key.u.outType == RS_TYPE_FLOAT_32) { 997 rsdIntrinsicColorMatrixSetup_float_K(&mFnTab, mm, dt, st); 998 } else { 999 rsdIntrinsicColorMatrixSetup_int_K(&mFnTab, mm, dt, st); 1000 } 1001 } 1002#endif 1003 mLastKey = key; 1004 } 1005#endif //if !defined(ARCH_X86_HAVE_SSSE3) 1006} 1007 1008RsdCpuScriptIntrinsicColorMatrix::RsdCpuScriptIntrinsicColorMatrix( 1009 RsdCpuReferenceImpl *ctx, const Script *s, const Element *e) 1010 : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_COLOR_MATRIX) { 1011 1012 mLastKey.key = 0; 1013 mBuf = nullptr; 1014 mBufSize = 0; 1015 mOptKernel = nullptr; 1016 const static float defaultMatrix[] = { 1017 1.f, 0.f, 0.f, 0.f, 1018 0.f, 1.f, 0.f, 0.f, 1019 0.f, 0.f, 1.f, 0.f, 1020 0.f, 0.f, 0.f, 1.f 1021 }; 1022 const static float defaultAdd[] = {0.f, 0.f, 0.f, 0.f}; 1023 setGlobalVar(0, defaultMatrix, sizeof(defaultMatrix)); 1024 setGlobalVar(1, defaultAdd, sizeof(defaultAdd)); 1025} 1026 1027RsdCpuScriptIntrinsicColorMatrix::~RsdCpuScriptIntrinsicColorMatrix() { 1028 if (mBuf) munmap(mBuf, mBufSize); 1029 mBuf = nullptr; 1030 mOptKernel = nullptr; 1031} 1032 1033void RsdCpuScriptIntrinsicColorMatrix::populateScript(Script *s) { 1034 s->mHal.info.exportedVariableCount = 2; 1035} 1036 1037RsdCpuScriptImpl * rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl *ctx, 1038 const Script *s, const Element *e) { 1039 1040 return new RsdCpuScriptIntrinsicColorMatrix(ctx, s, e); 1041} 1042