Changeset 3112
- Timestamp:
- 01/21/08 20:07:03 (11 months ago)
- Files:
-
- trunk/tango/text/Regex.d (modified) (20 diffs, 1 prop)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
trunk/tango/text/Regex.d
- Property svn:eol-style set to native
r3111 r3112 37 37 $(TR $(TD X*?) $(TD see above) ) 38 38 $(TR $(TD X+?) $(TD see above) ) 39 $(TR $(TD X{n,m}?) $(TD see above) 39 $(TR $(TD X{n,m}?) $(TD see above) ) 40 40 </table> 41 41 … … 2139 2139 { 2140 2140 // if index is < 0 it is a temporary index 2141 // used only to distinguish the state from existing ones.2142 // the previous index can be reused instead.2141 // used only to distinguish the state from existing ones. 2142 // the previous index can be reused instead. 2143 2143 if ( index < 0 ) 2144 2144 index = -index-1; … … 2832 2832 class RegExpT(char_t) 2833 2833 { 2834 alias char_t[] string_t;2835 2834 alias TDFA!(dchar) tdfa_t; 2836 alias TNFA!(dchar) tnfa_t; 2835 alias TNFA!(dchar) tnfa_t; 2837 2836 alias CharClass!(dchar) charclass_t; 2838 2837 … … 2854 2853 } 2855 2854 2856 this( string_tpattern, bool swapMBS, bool unanchored)2855 this(char_t[] pattern, bool swapMBS, bool unanchored) 2857 2856 { 2858 2857 static if ( is(char_t == dchar) ) { … … 2905 2904 --- 2906 2905 **********************************************************************************************/ 2907 public RegExpT!(char_t) search( string_tinput)2906 public RegExpT!(char_t) search(char_t[] input) 2908 2907 { 2909 2908 this.input = input; … … 2926 2925 Returns: false for no match, true for match 2927 2926 **********************************************************************************************/ 2928 bool test( string_tinput)2927 bool test(char_t[] input) 2929 2928 { 2930 2929 this.input = input; … … 3031 3030 Slice of input for the requested submatch, or null if no such submatch exists. 3032 3031 **********************************************************************************************/ 3033 string_tmatch(uint index)3032 char_t[] match(uint index) 3034 3033 { 3035 3034 if ( index > tdfa.num_tags ) … … 3045 3044 Return the slice of the input that precedes the matched substring. 3046 3045 **********************************************************************************************/ 3047 string_tpre()3046 char_t[] pre() 3048 3047 { 3049 3048 int start = last_start+registers[0]; … … 3056 3055 Return the slice of the input that follows the matched substring. 3057 3056 **********************************************************************************************/ 3058 string_tpost()3057 char_t[] post() 3059 3058 { 3060 3059 int end = last_start+registers[1]; … … 3282 3281 tnfa_t tnfa; 3283 3282 tdfa_t tdfa; 3284 string_tinput;3283 char_t[] input; 3285 3284 size_t next_start, 3286 3285 last_start; 3287 3286 3288 string compileCommand(Layout!(char) layout, bool is_lookahead, tdfa_t.Command cmd, string_tindent)3287 string compileCommand(Layout!(char) layout, bool is_lookahead, tdfa_t.Command cmd, char_t[] indent) 3289 3288 { 3290 3289 string code, … … 3366 3365 class UtfException : Exception 3367 3366 { 3368 size_t idx; /// index in string of where error occurred3367 size_t idx; /// index in string of where error occurred 3369 3368 3370 3369 this(char[] s, size_t i) 3371 3370 { 3372 idx = i;3373 super(s);3371 idx = i; 3372 super(s); 3374 3373 } 3375 3374 } … … 3384 3383 3385 3384 return c < 0xD800 || 3386 (c > 0xDFFF && c <= 0x10FFFF /*&& c != 0xFFFE && c != 0xFFFF*/);3385 (c > 0xDFFF && c <= 0x10FFFF /*&& c != 0xFFFE && c != 0xFFFF*/); 3387 3386 } 3388 3387 … … 3395 3394 dchar decode(in char[] s, inout size_t idx) 3396 3395 { 3397 size_t len = s.length;3398 dchar V;3399 size_t i = idx;3400 char u = s[i];3401 3402 if (u & 0x80)3403 { uint n;3404 char u2;3405 3406 /* The following encodings are valid, except for the 5 and 6 byte3407 * combinations:3408 *0xxxxxxx3409 *110xxxxx 10xxxxxx3410 *1110xxxx 10xxxxxx 10xxxxxx3411 *11110xxx 10xxxxxx 10xxxxxx 10xxxxxx3412 *111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx3413 *1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx3414 */3415 for (n = 1; ; n++)3416 {3417 if (n > 4)3418 goto Lerr;// only do the first 4 of 6 encodings3419 if (((u << n) & 0x80) == 0)3420 {3421 if (n == 1)3422 goto Lerr;3423 break;3424 }3425 }3426 3427 // Pick off (7 - n) significant bits of B from first byte of octet3428 V = cast(dchar)(u & ((1 << (7 - n)) - 1));3429 3430 if (i + (n - 1) >= len)3431 goto Lerr;// off end of string3432 3433 /* The following combinations are overlong, and illegal:3434 *1100000x (10xxxxxx)3435 *11100000 100xxxxx (10xxxxxx)3436 *11110000 1000xxxx (10xxxxxx 10xxxxxx)3437 *11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx)3438 *11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx)3439 */3440 u2 = s[i + 1];3441 if ((u & 0xFE) == 0xC0 ||3442 (u == 0xE0 && (u2 & 0xE0) == 0x80) ||3443 (u == 0xF0 && (u2 & 0xF0) == 0x80) ||3444 (u == 0xF8 && (u2 & 0xF8) == 0x80) ||3445 (u == 0xFC && (u2 & 0xFC) == 0x80))3446 goto Lerr;// overlong combination3447 3448 for (uint j = 1; j != n; j++)3449 {3450 u = s[i + j];3451 if ((u & 0xC0) != 0x80)3452 goto Lerr;// trailing bytes are 10xxxxxx3453 V = (V << 6) | (u & 0x3F);3454 }3455 if (!isValidDchar(V))3456 goto Lerr;3457 i += n;3458 }3459 else3460 {3461 V = cast(dchar) u;3462 i++;3463 }3464 3465 idx = i;3466 return V;3396 size_t len = s.length; 3397 dchar V; 3398 size_t i = idx; 3399 char u = s[i]; 3400 3401 if (u & 0x80) 3402 { uint n; 3403 char u2; 3404 3405 /* The following encodings are valid, except for the 5 and 6 byte 3406 * combinations: 3407 * 0xxxxxxx 3408 * 110xxxxx 10xxxxxx 3409 * 1110xxxx 10xxxxxx 10xxxxxx 3410 * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 3411 * 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 3412 * 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 3413 */ 3414 for (n = 1; ; n++) 3415 { 3416 if (n > 4) 3417 goto Lerr; // only do the first 4 of 6 encodings 3418 if (((u << n) & 0x80) == 0) 3419 { 3420 if (n == 1) 3421 goto Lerr; 3422 break; 3423 } 3424 } 3425 3426 // Pick off (7 - n) significant bits of B from first byte of octet 3427 V = cast(dchar)(u & ((1 << (7 - n)) - 1)); 3428 3429 if (i + (n - 1) >= len) 3430 goto Lerr; // off end of string 3431 3432 /* The following combinations are overlong, and illegal: 3433 * 1100000x (10xxxxxx) 3434 * 11100000 100xxxxx (10xxxxxx) 3435 * 11110000 1000xxxx (10xxxxxx 10xxxxxx) 3436 * 11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx) 3437 * 11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx) 3438 */ 3439 u2 = s[i + 1]; 3440 if ((u & 0xFE) == 0xC0 || 3441 (u == 0xE0 && (u2 & 0xE0) == 0x80) || 3442 (u == 0xF0 && (u2 & 0xF0) == 0x80) || 3443 (u == 0xF8 && (u2 & 0xF8) == 0x80) || 3444 (u == 0xFC && (u2 & 0xFC) == 0x80)) 3445 goto Lerr; // overlong combination 3446 3447 for (uint j = 1; j != n; j++) 3448 { 3449 u = s[i + j]; 3450 if ((u & 0xC0) != 0x80) 3451 goto Lerr; // trailing bytes are 10xxxxxx 3452 V = (V << 6) | (u & 0x3F); 3453 } 3454 if (!isValidDchar(V)) 3455 goto Lerr; 3456 i += n; 3457 } 3458 else 3459 { 3460 V = cast(dchar) u; 3461 i++; 3462 } 3463 3464 idx = i; 3465 return V; 3467 3466 3468 3467 Lerr: 3469 throw new Exception("4invalid UTF-8 sequence");3468 throw new Exception("4invalid UTF-8 sequence"); 3470 3469 } 3471 3470 … … 3498 3497 3499 3498 static char[][] s4 = 3500 [ "\xE2\x89",// too short3501 "\xC0\x8A",3502 "\xE0\x80\x8A",3503 "\xF0\x80\x80\x8A",3504 "\xF8\x80\x80\x80\x8A",3505 "\xFC\x80\x80\x80\x80\x8A",3499 [ "\xE2\x89", // too short 3500 "\xC0\x8A", 3501 "\xE0\x80\x8A", 3502 "\xF0\x80\x80\x8A", 3503 "\xF8\x80\x80\x80\x8A", 3504 "\xFC\x80\x80\x80\x80\x8A", 3506 3505 ]; 3507 3506 3508 3507 for (int j = 0; j < s4.length; j++) 3509 3508 { 3510 try3511 {3512 i = 0;3513 c = decode(s4[j], i);3514 assert(0);3515 }3516 catch (UtfException u)3517 {3518 i = 23;3519 delete u;3520 }3521 assert(i == 23);3509 try 3510 { 3511 i = 0; 3512 c = decode(s4[j], i); 3513 assert(0); 3514 } 3515 catch (UtfException u) 3516 { 3517 i = 23; 3518 delete u; 3519 } 3520 assert(i == 23); 3522 3521 } 3523 3522 } … … 3528 3527 in 3529 3528 { 3530 assert(idx >= 0 && idx < s.length);3529 assert(idx >= 0 && idx < s.length); 3531 3530 } 3532 3531 out (result) 3533 3532 { 3534 assert(isValidDchar(result));3533 assert(isValidDchar(result)); 3535 3534 } 3536 3535 body 3537 3536 { 3538 char[] msg;3539 dchar V;3540 size_t i = idx;3541 uint u = s[i];3542 3543 if (u & ~0x7F)3544 { if (u >= 0xD800 && u <= 0xDBFF)3545 { uint u2;3546 3547 if (i + 1 == s.length)3548 { msg = "surrogate UTF-16 high value past end of string";3549 goto Lerr;3550 }3551 u2 = s[i + 1];3552 if (u2 < 0xDC00 || u2 > 0xDFFF)3553 { msg = "surrogate UTF-16 low value out of range";3554 goto Lerr;3555 }3556 u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00);3557 i += 2;3558 }3559 else if (u >= 0xDC00 && u <= 0xDFFF)3560 { msg = "unpaired surrogate UTF-16 value";3561 goto Lerr;3562 }3563 else if (u == 0xFFFE || u == 0xFFFF)3564 { msg = "illegal UTF-16 value";3565 goto Lerr;3566 }3567 else3568 i++;3569 }3570 else3571 {3572 i++;3573 }3574 3575 idx = i;3576 return cast(dchar)u;3537 char[] msg; 3538 dchar V; 3539 size_t i = idx; 3540 uint u = s[i]; 3541 3542 if (u & ~0x7F) 3543 { if (u >= 0xD800 && u <= 0xDBFF) 3544 { uint u2; 3545 3546 if (i + 1 == s.length) 3547 { msg = "surrogate UTF-16 high value past end of string"; 3548 goto Lerr; 3549 } 3550 u2 = s[i + 1]; 3551 if (u2 < 0xDC00 || u2 > 0xDFFF) 3552 { msg = "surrogate UTF-16 low value out of range"; 3553 goto Lerr; 3554 } 3555 u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00); 3556 i += 2; 3557 } 3558 else if (u >= 0xDC00 && u <= 0xDFFF) 3559 { msg = "unpaired surrogate UTF-16 value"; 3560 goto Lerr; 3561 } 3562 else if (u == 0xFFFE || u == 0xFFFF) 3563 { msg = "illegal UTF-16 value"; 3564 goto Lerr; 3565 } 3566 else 3567 i++; 3568 } 3569 else 3570 { 3571 i++; 3572 } 3573 3574 idx = i; 3575 return cast(dchar)u; 3577 3576 3578 3577 Lerr: 3579 throw new UtfException(msg, i);3578 throw new UtfException(msg, i); 3580 3579 } 3581 3580 … … 3585 3584 in 3586 3585 { 3587 assert(idx >= 0 && idx < s.length);3586 assert(idx >= 0 && idx < s.length); 3588 3587 } 3589 3588 body 3590 3589 { 3591 size_t i = idx;3592 dchar c = s[i];3593 3594 if (!isValidDchar(c))3595 goto Lerr;3596 idx = i + 1;3597 return c;3590 size_t i = idx; 3591 dchar c = s[i]; 3592 3593 if (!isValidDchar(c)) 3594 goto Lerr; 3595 idx = i + 1; 3596 return c; 3598 3597 3599 3598 Lerr: 3600 throw new UtfException("5invalid UTF-32 value", i);3599 throw new UtfException("5invalid UTF-32 value", i); 3601 3600 } 3602 3601 … … 3612 3611 in 3613 3612 { 3614 assert(isValidDchar(c));3613 assert(isValidDchar(c)); 3615 3614 } 3616 3615 body 3617 3616 { 3618 char[] r = s;3619 3620 if (c <= 0x7F)3621 {3622 r ~= cast(char) c;3623 }3624 else3625 {3626 char[4] buf;3627 uint L;3628 3629 if (c <= 0x7FF)3630 {3631 buf[0] = cast(char)(0xC0 | (c >> 6));3632 buf[1] = cast(char)(0x80 | (c & 0x3F));3633 L = 2;3634 }3635 else if (c <= 0xFFFF)3636 {3637 buf[0] = cast(char)(0xE0 | (c >> 12));3638 buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));3639 buf[2] = cast(char)(0x80 | (c & 0x3F));3640 L = 3;3641 }3642 else if (c <= 0x10FFFF)3643 {3644 buf[0] = cast(char)(0xF0 | (c >> 18));3645 buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));3646 buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));3647 buf[3] = cast(char)(0x80 | (c & 0x3F));3648 L = 4;3649 }3650 else3651 {3652 assert(0);3653 }3654 r ~= buf[0 .. L];3655 }3656 s = r;3617 char[] r = s; 3618 3619 if (c <= 0x7F) 3620 { 3621 r ~= cast(char) c; 3622 } 3623 else 3624 { 3625 char[4] buf; 3626 uint L; 3627 3628 if (c <= 0x7FF) 3629 { 3630 buf[0] = cast(char)(0xC0 | (c >> 6)); 3631 buf[1] = cast(char)(0x80 | (c & 0x3F)); 3632 L = 2; 3633 } 3634 else if (c <= 0xFFFF) 3635 { 3636 buf[0] = cast(char)(0xE0 | (c >> 12)); 3637 buf[1] = cast(char)(0x80 | ((c >> 6) & 0x3F)); 3638 buf[2] = cast(char)(0x80 | (c & 0x3F)); 3639 L = 3; 3640 } 3641 else if (c <= 0x10FFFF) 3642 { 3643 buf[0] = cast(char)(0xF0 | (c >> 18)); 3644 buf[1] = cast(char)(0x80 | ((c >> 12) & 0x3F)); 3645 buf[2] = cast(char)(0x80 | ((c >> 6) & 0x3F)); 3646 buf[3] = cast(char)(0x80 | (c & 0x3F)); 3647 L = 4; 3648 } 3649 else 3650 { 3651 assert(0); 3652 } 3653 r ~= buf[0 .. L]; 3654 } 3655 s = r; 3657 3656 } 3658 3657 … … 3669 3668 assert(s.length == 7); 3670 3669 assert(s == "abcda\xC2\xA9"); 3671 //assert(s == "abcda\u00A9"); // BUG: fix compiler3670 //assert(s == "abcda\u00A9"); // BUG: fix compiler 3672 3671 3673 3672 encode(s, cast(dchar)'\u2260'); … … 3681 3680 in 3682 3681 { 3683 assert(isValidDchar(c));3682 assert(isValidDchar(c)); 3684 3683 } 3685 3684 body 3686 3685 { 3687 wchar[] r = s;3688 3689 if (c <= 0xFFFF)3690 {3691 r ~= cast(wchar) c;3692 }3693 else3694 {3695 wchar[2] buf;3696 3697 buf[0] = cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);3698 buf[1] = cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00);3699 r ~= buf;3700 }3701 s = r;3686 wchar[] r = s; 3687 3688 if (c <= 0xFFFF) 3689 { 3690 r ~= cast(wchar) c; 3691 } 3692 else 3693 { 3694 wchar[2] buf; 3695 3696 buf[0] = cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800); 3697 buf[1] = cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00); 3698 r ~= buf; 3699 } 3700 s = r; 3702 3701 } 3703 3702 … … 3707 3706 in 3708 3707 { 3709 assert(isValidDchar(c));3708 assert(isValidDchar(c)); 3710 3709 } 3711 3710 body 3712 3711 { 3713 s ~= c;3714 } 3712 s ~= c; 3713 }












