 |
Changeset 4006
- Timestamp:
- 10/15/08 02:57:18
(2 months ago)
- Author:
- Don Clugston
- Message:
Rewrote the MMX shift-left routine, since the previous version was broken for the relatively common case where the destination and source overlap.
Fixed a latent bug in the division code, which was exposed by the new MMX code.
-
Files:
-
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
| r3954 |
r4006 |
|
| 34 | 34 | * +,- 2.25 1.52 15.6 2.25 |
|---|
| 35 | 35 | * <<,>> 2.0 5.0 6.6 2.0 |
|---|
| 36 | | * (<< MMX) 1.75 1.2 |
|---|
| | 36 | * (<< MMX) 1.73 1.2 |
|---|
| 37 | 37 | * * 5.0 4.3 15 |
|---|
| 38 | 38 | * mulAdd 5.4 4.9 19 |
|---|
| … | … | |
| 301 | 301 | push EDI; |
|---|
| 302 | 302 | push EBX; |
|---|
| | 303 | align 16; |
|---|
| 303 | 304 | mov EDI, [ESP + LASTPARAM + 4*3]; //dest.ptr; |
|---|
| 304 | 305 | mov EBX, [ESP + LASTPARAM + 4*2]; //dest.length; |
|---|
| 305 | | align 16; |
|---|
| 306 | 306 | mov ESI, [ESP + LASTPARAM + 4*1]; //src.ptr; |
|---|
| 307 | | lea EDI, [EDI + 4*EBX]; // EDI = end of dest |
|---|
| 308 | | lea ESI, [ESI + 4*EBX]; // ESI = end of src |
|---|
| 309 | | neg EBX; // count UP to zero. |
|---|
| | 307 | |
|---|
| 310 | 308 | // Register usage |
|---|
| 311 | 309 | // MM0 : scratch |
|---|
| … | … | |
| 314 | 312 | // MM4 = ECX = 64-numbits = bits to shift right |
|---|
| 315 | 313 | // EAX = 32-numbits = bits to shift single int right. |
|---|
| 316 | | |
|---|
| 317 | 314 | movd MM3, EAX; // numbits |
|---|
| 318 | 315 | xor EAX, 63; |
|---|
| … | … | |
| 321 | 318 | |
|---|
| 322 | 319 | pxor MM1, MM1; // input carry is zero |
|---|
| 323 | | test EBX, 1; |
|---|
| 324 | | jz not_odd; |
|---|
| 325 | | |
|---|
| 326 | | // Deal with the first int |
|---|
| | 320 | |
|---|
| | 321 | // Get the carry |
|---|
| | 322 | |
|---|
| 327 | 323 | and EAX, 31; // EAX = 32-numbits |
|---|
| 328 | 324 | movd MM2, EAX; // 32-numbits |
|---|
| 329 | | movd MM1, [ESI+4*EBX]; |
|---|
| 330 | | movd MM0, [ESI+4*EBX]; |
|---|
| 331 | | psllq MM0, MM3; |
|---|
| | 325 | movd MM1, [ESI+4*EBX-4]; |
|---|
| 332 | 326 | psrlq MM1, MM2; |
|---|
| 333 | | movd [ESI+4*EBX], MM0; |
|---|
| 334 | | add EBX, 1; |
|---|
| 335 | | movd EAX, MM1; // carry, in case length was 1 |
|---|
| 336 | | jz L_last; |
|---|
| 337 | | // EBX is now even. Carry is in MM1 |
|---|
| 338 | | not_odd: |
|---|
| 339 | | test EBX, 2; |
|---|
| 340 | | jnz L_onceeven; |
|---|
| 341 | | movq MM2, MM1; |
|---|
| 342 | | // EBX is now doubleeven |
|---|
| 343 | | add EBX, 2; // TRICK |
|---|
| 344 | | |
|---|
| 345 | | L_twiceeven: // here MM2 is the carry |
|---|
| | 327 | movd EAX, MM1; // final carry |
|---|
| | 328 | test EBX, 1; |
|---|
| | 329 | jnz L_odd; |
|---|
| | 330 | |
|---|
| | 331 | movq MM2, [ESI+4*EBX-8]; |
|---|
| | 332 | psllq MM2, MM3; |
|---|
| | 333 | sub EBX, 2; |
|---|
| | 334 | jle L_last; |
|---|
| | 335 | jmp L_even; |
|---|
| | 336 | L_odd: |
|---|
| | 337 | // deal with odd lengths |
|---|
| | 338 | movd MM1, [ESI+4*EBX-4]; |
|---|
| | 339 | movd MM0, [ESI+4*EBX-8]; |
|---|
| | 340 | psllq MM1, MM3; |
|---|
| | 341 | psrlq MM0, MM2; |
|---|
| | 342 | por MM1, MM0; |
|---|
| | 343 | movd [EDI+4*EBX-4], MM1; |
|---|
| | 344 | sub EBX, 1; |
|---|
| | 345 | movq MM0, MM1; |
|---|
| | 346 | |
|---|
| | 347 | movq MM1, [ESI + 4*EBX-8]; |
|---|
| | 348 | movq MM2, [ESI + 4*EBX-8]; |
|---|
| | 349 | psrlq MM1, MM4; |
|---|
| | 350 | por MM0, MM1; |
|---|
| | 351 | movd [EDI +4*EBX], MM0; |
|---|
| | 352 | |
|---|
| | 353 | psllq MM2, MM3; |
|---|
| | 354 | sub EBX, 2; |
|---|
| | 355 | jle L_last; |
|---|
| | 356 | L_even: // It's either singly or doubly even |
|---|
| | 357 | movq MM1, MM2; |
|---|
| | 358 | add EBX, 2; |
|---|
| | 359 | test EBX, 2; |
|---|
| | 360 | jz L_onceeven; |
|---|
| | 361 | sub EBX, 2; |
|---|
| | 362 | |
|---|
| | 363 | L_twiceeven: // here MM2 is the carry |
|---|
| 346 | 364 | movq MM0, [ESI + 4*EBX-8]; |
|---|
| 347 | | psllq MM0, MM3; |
|---|
| 348 | | movq MM1, [ESI + 4*EBX-8]; |
|---|
| 349 | | psrlq MM1, MM4; |
|---|
| 350 | | por MM0, MM2; |
|---|
| 351 | | movq [EDI +4*EBX-8], MM0; |
|---|
| | 365 | psrlq MM0, MM4; |
|---|
| | 366 | movq MM1, [ESI + 4*EBX-8]; |
|---|
| | 367 | psllq MM1, MM3; |
|---|
| | 368 | por MM2, MM0; |
|---|
| | 369 | movq [EDI +4*EBX], MM2; |
|---|
| 352 | 370 | L_onceeven: // here MM1 is the carry |
|---|
| 353 | | movq MM0, [ESI + 4*EBX]; |
|---|
| 354 | | psllq MM0, MM3; |
|---|
| 355 | | movq MM2, [ESI + 4*EBX]; |
|---|
| 356 | | por MM0, MM1; |
|---|
| 357 | | movq [EDI + 4*EBX], MM0; |
|---|
| 358 | | psrlq MM2, MM4; |
|---|
| 359 | | add EBX, 4; |
|---|
| 360 | | jl L_twiceeven; |
|---|
| 361 | | |
|---|
| 362 | | movd EAX, MM2; // MM2 is final carry |
|---|
| 363 | | L_last: |
|---|
| | 371 | movq MM0, [ESI + 4*EBX-16]; |
|---|
| | 372 | psrlq MM0, MM4; |
|---|
| | 373 | movq MM2, [ESI + 4*EBX-16]; |
|---|
| | 374 | por MM1, MM0; |
|---|
| | 375 | movq [EDI +4*EBX-8], MM1; |
|---|
| | 376 | psllq MM2, MM3; |
|---|
| | 377 | sub EBX, 4; |
|---|
| | 378 | jg L_twiceeven; |
|---|
| | 379 | L_last: |
|---|
| | 380 | movq [EDI +4*EBX], MM2; |
|---|
| 364 | 381 | emms; // NOTE: costs 6 cycles on Intel CPUs |
|---|
| 365 | 382 | pop EBX; |
|---|
| … | … | |
| 435 | 452 | |
|---|
| 436 | 453 | aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE]; |
|---|
| 437 | | // printf("%x %x %x %x %x\n", aa[0], aa[1], aa[2], aa[3], aa[4]); |
|---|
| 438 | | uint r = multibyteShl(aa[1..4], aa[1..$], 4); |
|---|
| 439 | | // printf("%x %x %x %x %x\n", aa[0], aa[1], aa[2], aa[3], aa[4]); |
|---|
| | 454 | uint r = multibyteShl(aa[2..4], aa[2..4], 4); |
|---|
| | 455 | assert(aa[0] == 0xF0FF_FFFF && aa[1]==0x1222_2223 |
|---|
| | 456 | && aa[2]==0x5555_5560 && aa[3]==0x9999_99A4 && aa[4]==0xBCCC_CCCD); |
|---|
| | 457 | assert(r==8); |
|---|
| | 458 | |
|---|
| | 459 | aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE]; |
|---|
| | 460 | r = multibyteShl(aa[1..4], aa[1..4], 4); |
|---|
| 440 | 461 | assert(aa[0] == 0xF0FF_FFFF |
|---|
| 441 | 462 | && aa[2]==0x5555_5561); |
|---|
| … | … | |
| 443 | 464 | assert(r==8); |
|---|
| 444 | 465 | assert(aa[1]==0x2222_2230); |
|---|
| | 466 | |
|---|
| | 467 | aa = [0xF0FF_FFFF, 0x1222_2223, 0x4555_5556, 0x8999_999A, 0xBCCC_CCCD, 0xEEEE_EEEE]; |
|---|
| | 468 | r = multibyteShl(aa[0..4], aa[1..5], 31); |
|---|
| 445 | 469 | } |
|---|
| 446 | 470 | |
|---|
| r3945 |
r4006 |
|
| 891 | 891 | // How much to left shift v, so that its MSB is set. |
|---|
| 892 | 892 | uint s = 31 - bsr(v[$-1]); |
|---|
| 893 | | multibyteShl(vn, v, s); |
|---|
| 894 | | un[$-1] = multibyteShl(un[0..$-1], u, s); |
|---|
| | 893 | if (s!=0) { |
|---|
| | 894 | multibyteShl(vn, v, s); |
|---|
| | 895 | un[$-1] = multibyteShl(un[0..$-1], u, s); |
|---|
| | 896 | } else { |
|---|
| | 897 | vn[] = v[]; |
|---|
| | 898 | un[0..$-1] = u[]; |
|---|
| | 899 | un[$-1] = 0; |
|---|
| | 900 | } |
|---|
| | 901 | |
|---|
| 895 | 902 | for (int j = u.length - v.length; j >= 0; j--) { |
|---|
| 896 | 903 | // Compute estimate qhat of quotient[j]. |
|---|
| … | … | |
| 922 | 929 | // Unnormalize remainder, if required. |
|---|
| 923 | 930 | if (remainder != null) { |
|---|
| 924 | | multibyteShr(remainder, un, s); |
|---|
| 925 | | } |
|---|
| | 931 | if (s == 0) remainder[] = un[0..$-1]; |
|---|
| | 932 | else multibyteShr(remainder, un, s); |
|---|
| | 933 | } |
|---|
| | 934 | delete un; |
|---|
| | 935 | delete vn; |
|---|
| 926 | 936 | } |
|---|
| 927 | 937 | |
|---|
Download in other formats:
|
 |