1/* ------------------------------------------------------------------ 2 * Copyright (C) 1998-2009 PacketVideo 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 13 * express or implied. 14 * See the License for the specific language governing permissions 15 * and limitations under the License. 16 * ------------------------------------------------------------------- 17 */ 18#include "avcenc_lib.h" 19/* 3/29/01 fast half-pel search based on neighboring guess */ 20/* value ranging from 0 to 4, high complexity (more accurate) to 21 low complexity (less accurate) */ 22#define HP_DISTANCE_TH 5 // 2 /* half-pel distance threshold */ 23 24#define PREF_16_VEC 129 /* 1MV bias versus 4MVs*/ 25 26#define CLIP_RESULT(x) if((uint)x > 0xFF){ \ 27 x = 0xFF & (~(x>>31));} 28 29#define CLIP_UPPER16(x) if((uint)x >= 0x20000000){ \ 30 x = 0xFF0000 & (~(x>>31));} \ 31 else { \ 32 x = (x>>5)&0xFF0000; \ 33 } 34 35/*===================================================================== 36 Function: AVCFindHalfPelMB 37 Date: 10/31/2007 38 Purpose: Find half pel resolution MV surrounding the full-pel MV 39=====================================================================*/ 40 41int AVCFindHalfPelMB(AVCEncObject *encvid, uint8 *cur, AVCMV *mot, uint8 *ncand, 42 int xpos, int ypos, int hp_guess, int cmvx, int cmvy) 43{ 44 AVCPictureData *currPic = encvid->common->currPic; 45 int lx = currPic->pitch; 46 int d, dmin, satd_min; 47 uint8* cand; 48 int lambda_motion = encvid->lambda_motion; 49 uint8 *mvbits = encvid->mvbits; 50 int mvcost; 51 /* list of candidate to go through for half-pel search*/ 52 uint8 *subpel_pred = (uint8*) encvid->subpel_pred; // all 16 sub-pel positions 53 uint8 **hpel_cand = (uint8**) encvid->hpel_cand; /* half-pel position */ 54 55 int xh[9] = {0, 0, 2, 2, 2, 0, -2, -2, -2}; 56 int yh[9] = {0, -2, -2, 0, 2, 2, 2, 0, -2}; 57 int xq[8] = {0, 1, 1, 1, 0, -1, -1, -1}; 58 int yq[8] = { -1, -1, 0, 1, 1, 1, 0, -1}; 59 int h, hmin, q, qmin; 60 61 OSCL_UNUSED_ARG(xpos); 62 OSCL_UNUSED_ARG(ypos); 63 OSCL_UNUSED_ARG(hp_guess); 64 65 GenerateHalfPelPred(subpel_pred, ncand, lx); 66 67 cur = encvid->currYMB; // pre-load current original MB 68 69 cand = hpel_cand[0]; 70 71 // find cost for the current full-pel position 72 dmin = SATD_MB(cand, cur, 65535); // get Hadamaard transform SAD 73 mvcost = MV_COST_S(lambda_motion, mot->x, mot->y, cmvx, cmvy); 74 satd_min = dmin; 75 dmin += mvcost; 76 hmin = 0; 77 78 /* find half-pel */ 79 for (h = 1; h < 9; h++) 80 { 81 d = SATD_MB(hpel_cand[h], cur, dmin); 82 mvcost = MV_COST_S(lambda_motion, mot->x + xh[h], mot->y + yh[h], cmvx, cmvy); 83 d += mvcost; 84 85 if (d < dmin) 86 { 87 dmin = d; 88 hmin = h; 89 satd_min = d - mvcost; 90 } 91 } 92 93 mot->sad = dmin; 94 mot->x += xh[hmin]; 95 mot->y += yh[hmin]; 96 encvid->best_hpel_pos = hmin; 97 98 /*** search for quarter-pel ****/ 99 GenerateQuartPelPred(encvid->bilin_base[hmin], &(encvid->qpel_cand[0][0]), hmin); 100 101 encvid->best_qpel_pos = qmin = -1; 102 103 for (q = 0; q < 8; q++) 104 { 105 d = SATD_MB(encvid->qpel_cand[q], cur, dmin); 106 mvcost = MV_COST_S(lambda_motion, mot->x + xq[q], mot->y + yq[q], cmvx, cmvy); 107 d += mvcost; 108 if (d < dmin) 109 { 110 dmin = d; 111 qmin = q; 112 satd_min = d - mvcost; 113 } 114 } 115 116 if (qmin != -1) 117 { 118 mot->sad = dmin; 119 mot->x += xq[qmin]; 120 mot->y += yq[qmin]; 121 encvid->best_qpel_pos = qmin; 122 } 123 124 return satd_min; 125} 126 127 128 129/** This function generates sub-pel prediction around the full-pel candidate. 130Each sub-pel position array is 20 pixel wide (for word-alignment) and 17 pixel tall. */ 131/** The sub-pel position is labeled in spiral manner from the center. */ 132 133void GenerateHalfPelPred(uint8* subpel_pred, uint8 *ncand, int lx) 134{ 135 /* let's do straightforward way first */ 136 uint8 *ref; 137 uint8 *dst; 138 uint8 tmp8; 139 int32 tmp32; 140 int16 tmp_horz[18*22], *dst_16, *src_16; 141 int a = 0, b = 0, c = 0, d = 0, e = 0, f = 0; // temp 142 int i, j; 143 144 /* first copy full-pel to the first array */ 145 /* to be optimized later based on byte-offset load */ 146 ref = ncand - 3 - lx - (lx << 1); /* move back (-3,-3) */ 147 dst = subpel_pred; 148 149 dst -= 4; /* offset */ 150 for (j = 0; j < 22; j++) /* 24x22 */ 151 { 152 i = 6; 153 while (i > 0) 154 { 155 tmp32 = *ref++; 156 tmp8 = *ref++; 157 tmp32 |= (tmp8 << 8); 158 tmp8 = *ref++; 159 tmp32 |= (tmp8 << 16); 160 tmp8 = *ref++; 161 tmp32 |= (tmp8 << 24); 162 *((uint32*)(dst += 4)) = tmp32; 163 i--; 164 } 165 ref += (lx - 24); 166 } 167 168 /* from the first array, we do horizontal interp */ 169 ref = subpel_pred + 2; 170 dst_16 = tmp_horz; /* 17 x 22 */ 171 172 for (j = 4; j > 0; j--) 173 { 174 for (i = 16; i > 0; i -= 4) 175 { 176 a = ref[-2]; 177 b = ref[-1]; 178 c = ref[0]; 179 d = ref[1]; 180 e = ref[2]; 181 f = ref[3]; 182 *dst_16++ = a + f - 5 * (b + e) + 20 * (c + d); 183 a = ref[4]; 184 *dst_16++ = b + a - 5 * (c + f) + 20 * (d + e); 185 b = ref[5]; 186 *dst_16++ = c + b - 5 * (d + a) + 20 * (e + f); 187 c = ref[6]; 188 *dst_16++ = d + c - 5 * (e + b) + 20 * (f + a); 189 190 ref += 4; 191 } 192 /* do the 17th column here */ 193 d = ref[3]; 194 *dst_16 = e + d - 5 * (f + c) + 20 * (a + b); 195 dst_16 += 2; /* stride for tmp_horz is 18 */ 196 ref += 8; /* stride for ref is 24 */ 197 if (j == 3) // move 18 lines down 198 { 199 dst_16 += 324;//18*18; 200 ref += 432;//18*24; 201 } 202 } 203 204 ref -= 480;//20*24; 205 dst_16 -= 360;//20*18; 206 dst = subpel_pred + V0Q_H2Q * SUBPEL_PRED_BLK_SIZE; /* go to the 14th array 17x18*/ 207 208 for (j = 18; j > 0; j--) 209 { 210 for (i = 16; i > 0; i -= 4) 211 { 212 a = ref[-2]; 213 b = ref[-1]; 214 c = ref[0]; 215 d = ref[1]; 216 e = ref[2]; 217 f = ref[3]; 218 tmp32 = a + f - 5 * (b + e) + 20 * (c + d); 219 *dst_16++ = tmp32; 220 tmp32 = (tmp32 + 16) >> 5; 221 CLIP_RESULT(tmp32) 222 *dst++ = tmp32; 223 224 a = ref[4]; 225 tmp32 = b + a - 5 * (c + f) + 20 * (d + e); 226 *dst_16++ = tmp32; 227 tmp32 = (tmp32 + 16) >> 5; 228 CLIP_RESULT(tmp32) 229 *dst++ = tmp32; 230 231 b = ref[5]; 232 tmp32 = c + b - 5 * (d + a) + 20 * (e + f); 233 *dst_16++ = tmp32; 234 tmp32 = (tmp32 + 16) >> 5; 235 CLIP_RESULT(tmp32) 236 *dst++ = tmp32; 237 238 c = ref[6]; 239 tmp32 = d + c - 5 * (e + b) + 20 * (f + a); 240 *dst_16++ = tmp32; 241 tmp32 = (tmp32 + 16) >> 5; 242 CLIP_RESULT(tmp32) 243 *dst++ = tmp32; 244 245 ref += 4; 246 } 247 /* do the 17th column here */ 248 d = ref[3]; 249 tmp32 = e + d - 5 * (f + c) + 20 * (a + b); 250 *dst_16 = tmp32; 251 tmp32 = (tmp32 + 16) >> 5; 252 CLIP_RESULT(tmp32) 253 *dst = tmp32; 254 255 dst += 8; /* stride for dst is 24 */ 256 dst_16 += 2; /* stride for tmp_horz is 18 */ 257 ref += 8; /* stride for ref is 24 */ 258 } 259 260 261 /* Do middle point filtering*/ 262 src_16 = tmp_horz; /* 17 x 22 */ 263 dst = subpel_pred + V2Q_H2Q * SUBPEL_PRED_BLK_SIZE; /* 12th array 17x17*/ 264 dst -= 24; // offset 265 for (i = 0; i < 17; i++) 266 { 267 for (j = 16; j > 0; j -= 4) 268 { 269 a = *src_16; 270 b = *(src_16 += 18); 271 c = *(src_16 += 18); 272 d = *(src_16 += 18); 273 e = *(src_16 += 18); 274 f = *(src_16 += 18); 275 276 tmp32 = a + f - 5 * (b + e) + 20 * (c + d); 277 tmp32 = (tmp32 + 512) >> 10; 278 CLIP_RESULT(tmp32) 279 *(dst += 24) = tmp32; 280 281 a = *(src_16 += 18); 282 tmp32 = b + a - 5 * (c + f) + 20 * (d + e); 283 tmp32 = (tmp32 + 512) >> 10; 284 CLIP_RESULT(tmp32) 285 *(dst += 24) = tmp32; 286 287 b = *(src_16 += 18); 288 tmp32 = c + b - 5 * (d + a) + 20 * (e + f); 289 tmp32 = (tmp32 + 512) >> 10; 290 CLIP_RESULT(tmp32) 291 *(dst += 24) = tmp32; 292 293 c = *(src_16 += 18); 294 tmp32 = d + c - 5 * (e + b) + 20 * (f + a); 295 tmp32 = (tmp32 + 512) >> 10; 296 CLIP_RESULT(tmp32) 297 *(dst += 24) = tmp32; 298 299 src_16 -= (18 << 2); 300 } 301 302 d = src_16[90]; // 18*5 303 tmp32 = e + d - 5 * (f + c) + 20 * (a + b); 304 tmp32 = (tmp32 + 512) >> 10; 305 CLIP_RESULT(tmp32) 306 dst[24] = tmp32; 307 308 src_16 -= ((18 << 4) - 1); 309 dst -= ((24 << 4) - 1); 310 } 311 312 /* do vertical interpolation */ 313 ref = subpel_pred + 2; 314 dst = subpel_pred + V2Q_H0Q * SUBPEL_PRED_BLK_SIZE; /* 10th array 18x17 */ 315 dst -= 24; // offset 316 317 for (i = 2; i > 0; i--) 318 { 319 for (j = 16; j > 0; j -= 4) 320 { 321 a = *ref; 322 b = *(ref += 24); 323 c = *(ref += 24); 324 d = *(ref += 24); 325 e = *(ref += 24); 326 f = *(ref += 24); 327 328 tmp32 = a + f - 5 * (b + e) + 20 * (c + d); 329 tmp32 = (tmp32 + 16) >> 5; 330 CLIP_RESULT(tmp32) 331 *(dst += 24) = tmp32; // 10th 332 333 a = *(ref += 24); 334 tmp32 = b + a - 5 * (c + f) + 20 * (d + e); 335 tmp32 = (tmp32 + 16) >> 5; 336 CLIP_RESULT(tmp32) 337 *(dst += 24) = tmp32; // 10th 338 339 b = *(ref += 24); 340 tmp32 = c + b - 5 * (d + a) + 20 * (e + f); 341 tmp32 = (tmp32 + 16) >> 5; 342 CLIP_RESULT(tmp32) 343 *(dst += 24) = tmp32; // 10th 344 345 c = *(ref += 24); 346 tmp32 = d + c - 5 * (e + b) + 20 * (f + a); 347 tmp32 = (tmp32 + 16) >> 5; 348 CLIP_RESULT(tmp32) 349 *(dst += 24) = tmp32; // 10th 350 351 ref -= (24 << 2); 352 } 353 354 d = ref[120]; // 24*5 355 tmp32 = e + d - 5 * (f + c) + 20 * (a + b); 356 tmp32 = (tmp32 + 16) >> 5; 357 CLIP_RESULT(tmp32) 358 dst[24] = tmp32; // 10th 359 360 dst -= ((24 << 4) - 1); 361 ref -= ((24 << 4) - 1); 362 } 363 364 // note that using SIMD here doesn't help much, the cycle almost stays the same 365 // one can just use the above code and change the for(i=2 to for(i=18 366 for (i = 16; i > 0; i -= 4) 367 { 368 for (j = 17; j > 0; j--) 369 { 370 a = *((uint32*)ref); /* load 4 bytes */ 371 b = (a >> 8) & 0xFF00FF; /* second and fourth byte */ 372 a &= 0xFF00FF; 373 374 c = *((uint32*)(ref + 120)); 375 d = (c >> 8) & 0xFF00FF; 376 c &= 0xFF00FF; 377 378 a += c; 379 b += d; 380 381 e = *((uint32*)(ref + 72)); /* e, f */ 382 f = (e >> 8) & 0xFF00FF; 383 e &= 0xFF00FF; 384 385 c = *((uint32*)(ref + 48)); /* c, d */ 386 d = (c >> 8) & 0xFF00FF; 387 c &= 0xFF00FF; 388 389 c += e; 390 d += f; 391 392 a += 20 * c; 393 b += 20 * d; 394 a += 0x100010; 395 b += 0x100010; 396 397 e = *((uint32*)(ref += 24)); /* e, f */ 398 f = (e >> 8) & 0xFF00FF; 399 e &= 0xFF00FF; 400 401 c = *((uint32*)(ref + 72)); /* c, d */ 402 d = (c >> 8) & 0xFF00FF; 403 c &= 0xFF00FF; 404 405 c += e; 406 d += f; 407 408 a -= 5 * c; 409 b -= 5 * d; 410 411 c = a << 16; 412 d = b << 16; 413 CLIP_UPPER16(a) 414 CLIP_UPPER16(c) 415 CLIP_UPPER16(b) 416 CLIP_UPPER16(d) 417 418 a |= (c >> 16); 419 b |= (d >> 16); 420 // a>>=5; 421 // b>>=5; 422 /* clip */ 423 // msk |= b; msk|=a; 424 // a &= 0xFF00FF; 425 // b &= 0xFF00FF; 426 a |= (b << 8); /* pack it back */ 427 428 *((uint16*)(dst += 24)) = a & 0xFFFF; //dst is not word-aligned. 429 *((uint16*)(dst + 2)) = a >> 16; 430 431 } 432 dst -= 404; // 24*17-4 433 ref -= 404; 434 /* if(msk & 0xFF00FF00) // need clipping 435 { 436 VertInterpWClip(dst,ref); // re-do 4 column with clip 437 }*/ 438 } 439 440 return ; 441} 442 443void VertInterpWClip(uint8 *dst, uint8 *ref) 444{ 445 int i, j; 446 int a, b, c, d, e, f; 447 int32 tmp32; 448 449 dst -= 4; 450 ref -= 4; 451 452 for (i = 4; i > 0; i--) 453 { 454 for (j = 16; j > 0; j -= 4) 455 { 456 a = *ref; 457 b = *(ref += 24); 458 c = *(ref += 24); 459 d = *(ref += 24); 460 e = *(ref += 24); 461 f = *(ref += 24); 462 463 tmp32 = a + f - 5 * (b + e) + 20 * (c + d); 464 tmp32 = (tmp32 + 16) >> 5; 465 CLIP_RESULT(tmp32) 466 *(dst += 24) = tmp32; // 10th 467 468 a = *(ref += 24); 469 tmp32 = b + a - 5 * (c + f) + 20 * (d + e); 470 tmp32 = (tmp32 + 16) >> 5; 471 CLIP_RESULT(tmp32) 472 *(dst += 24) = tmp32; // 10th 473 474 b = *(ref += 24); 475 tmp32 = c + b - 5 * (d + a) + 20 * (e + f); 476 tmp32 = (tmp32 + 16) >> 5; 477 CLIP_RESULT(tmp32) 478 *(dst += 24) = tmp32; // 10th 479 480 c = *(ref += 24); 481 tmp32 = d + c - 5 * (e + b) + 20 * (f + a); 482 tmp32 = (tmp32 + 16) >> 5; 483 CLIP_RESULT(tmp32) 484 *(dst += 24) = tmp32; // 10th 485 486 ref -= (24 << 2); 487 } 488 489 d = ref[120]; // 24*5 490 tmp32 = e + d - 5 * (f + c) + 20 * (a + b); 491 tmp32 = (tmp32 + 16) >> 5; 492 CLIP_RESULT(tmp32) 493 dst[24] = tmp32; // 10th 494 495 dst -= ((24 << 4) - 1); 496 ref -= ((24 << 4) - 1); 497 } 498 499 return ; 500} 501 502 503void GenerateQuartPelPred(uint8 **bilin_base, uint8 *qpel_cand, int hpel_pos) 504{ 505 // for even value of hpel_pos, start with pattern 1, otherwise, start with pattern 2 506 int i, j; 507 508 uint8 *c1 = qpel_cand; 509 uint8 *tl = bilin_base[0]; 510 uint8 *tr = bilin_base[1]; 511 uint8 *bl = bilin_base[2]; 512 uint8 *br = bilin_base[3]; 513 int a, b, c, d; 514 int offset = 1 - (384 * 7); 515 516 if (!(hpel_pos&1)) // diamond pattern 517 { 518 j = 16; 519 while (j--) 520 { 521 i = 16; 522 while (i--) 523 { 524 d = tr[24]; 525 a = *tr++; 526 b = bl[1]; 527 c = *br++; 528 529 *c1 = (c + a + 1) >> 1; 530 *(c1 += 384) = (b + a + 1) >> 1; /* c2 */ 531 *(c1 += 384) = (b + c + 1) >> 1; /* c3 */ 532 *(c1 += 384) = (b + d + 1) >> 1; /* c4 */ 533 534 b = *bl++; 535 536 *(c1 += 384) = (c + d + 1) >> 1; /* c5 */ 537 *(c1 += 384) = (b + d + 1) >> 1; /* c6 */ 538 *(c1 += 384) = (b + c + 1) >> 1; /* c7 */ 539 *(c1 += 384) = (b + a + 1) >> 1; /* c8 */ 540 541 c1 += offset; 542 } 543 // advance to the next line, pitch is 24 544 tl += 8; 545 tr += 8; 546 bl += 8; 547 br += 8; 548 c1 += 8; 549 } 550 } 551 else // star pattern 552 { 553 j = 16; 554 while (j--) 555 { 556 i = 16; 557 while (i--) 558 { 559 a = *br++; 560 b = *tr++; 561 c = tl[1]; 562 *c1 = (a + b + 1) >> 1; 563 b = bl[1]; 564 *(c1 += 384) = (a + c + 1) >> 1; /* c2 */ 565 c = tl[25]; 566 *(c1 += 384) = (a + b + 1) >> 1; /* c3 */ 567 b = tr[23]; 568 *(c1 += 384) = (a + c + 1) >> 1; /* c4 */ 569 c = tl[24]; 570 *(c1 += 384) = (a + b + 1) >> 1; /* c5 */ 571 b = *bl++; 572 *(c1 += 384) = (a + c + 1) >> 1; /* c6 */ 573 c = *tl++; 574 *(c1 += 384) = (a + b + 1) >> 1; /* c7 */ 575 *(c1 += 384) = (a + c + 1) >> 1; /* c8 */ 576 577 c1 += offset; 578 } 579 // advance to the next line, pitch is 24 580 tl += 8; 581 tr += 8; 582 bl += 8; 583 br += 8; 584 c1 += 8; 585 } 586 } 587 588 return ; 589} 590 591 592/* assuming cand always has a pitch of 24 */ 593int SATD_MB(uint8 *cand, uint8 *cur, int dmin) 594{ 595 int cost; 596 597 598 dmin = (dmin << 16) | 24; 599 cost = AVCSAD_Macroblock_C(cand, cur, dmin, NULL); 600 601 return cost; 602} 603 604 605 606 607 608