 |
Changeset 3201
- Timestamp:
- 02/16/08 08:50:14
(10 months ago)
- Author:
- jascha
- Message:
- optimized DFA generation
- took care of unreachable code warnings (#913)
- fixed problem with lookahead at start of expression
- brought D code generation up-to-date
- minor changes
-
Files:
-
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
| r3134 |
r3201 |
|
| 11 | 11 | This is a regular expression compiler and interpreter based on the Tagged NFA/DFA method. |
|---|
| 12 | 12 | |
|---|
| 13 | | This implies, that the expressions it can handle are <i>regular</i>, in the way language theory defines it, |
|---|
| | 13 | See <a href="http://en.wikipedia.org/wiki/Regular_expression">Wikpedia's article on regular expressions</a> |
|---|
| | 14 | for details on regular expressions in general. |
|---|
| | 15 | |
|---|
| | 16 | The used method implies, that the expressions are <i>regular</i>, in the way language theory defines it, |
|---|
| 14 | 17 | as opposed to what "regular expression" means in most implementations |
|---|
| 15 | 18 | (e.g. PCRE or those from the standard libraries of Perl, Java or Python). |
|---|
| … | … | |
| 19 | 22 | for details on the differences. |
|---|
| 20 | 23 | |
|---|
| 21 | | See <a href="http://en.wikipedia.org/wiki/Regular_expression">Wikpedia's article on regular expressions</a> |
|---|
| 22 | | for details on regular expressions in general. |
|---|
| | 24 | The time for matching a regular expression against an input string of length N is in O(M*N), where M depends on the |
|---|
| | 25 | number of matching brackets and the complexity of the expression. That is, M is constant wrt. the input |
|---|
| | 26 | and therefore matching is a linear-time process. |
|---|
| 23 | 27 | |
|---|
| 24 | 28 | The syntax of a regular expressions is as follows. |
|---|
| … | … | |
| 466 | 470 | } |
|---|
| 467 | 471 | } |
|---|
| | 472 | |
|---|
| | 473 | /* ************************************************************************************************ |
|---|
| | 474 | |
|---|
| | 475 | **************************************************************************************************/ |
|---|
| | 476 | void quickSort(T)(T[] a) |
|---|
| | 477 | { |
|---|
| | 478 | quickSort(a,cast(size_t)0,a.length); |
|---|
| | 479 | } |
|---|
| | 480 | |
|---|
| | 481 | void quickSort(T)(T[] a, size_t l, size_t r) |
|---|
| | 482 | { |
|---|
| | 483 | T t; |
|---|
| | 484 | auto i = r-l; |
|---|
| | 485 | if ( i < 3 ) |
|---|
| | 486 | { |
|---|
| | 487 | if ( i < 2 ) |
|---|
| | 488 | return; |
|---|
| | 489 | if ( a[l] < a[l+1] ) |
|---|
| | 490 | return; |
|---|
| | 491 | t = a[l]; |
|---|
| | 492 | a[l] = a[l+1]; |
|---|
| | 493 | a[l+1] = t; |
|---|
| | 494 | return; |
|---|
| | 495 | } |
|---|
| | 496 | |
|---|
| | 497 | auto p = a[l]; |
|---|
| | 498 | i = l; |
|---|
| | 499 | auto j = r; |
|---|
| | 500 | |
|---|
| | 501 | while ( true ) |
|---|
| | 502 | { |
|---|
| | 503 | ++i; |
|---|
| | 504 | for ( ; i < j && a[i] < p; ++i ) {} |
|---|
| | 505 | --j; |
|---|
| | 506 | for ( ; i < j && a[j] >= p; --j ) {} |
|---|
| | 507 | if ( i >= j ) |
|---|
| | 508 | break; |
|---|
| | 509 | t = a[i]; |
|---|
| | 510 | a[i] = a[j]; |
|---|
| | 511 | a[j] = t; |
|---|
| | 512 | } |
|---|
| | 513 | --i; |
|---|
| | 514 | a[l] = a[i]; |
|---|
| | 515 | a[i] = p; |
|---|
| | 516 | |
|---|
| | 517 | quickSort(a, l, i); |
|---|
| | 518 | quickSort(a, i+1, r); |
|---|
| | 519 | } |
|---|
| 468 | 520 | import tango.math.Math; |
|---|
| 469 | 521 | |
|---|
| … | … | |
| 473 | 525 | struct CharRange(char_t) |
|---|
| 474 | 526 | { |
|---|
| 475 | | char_t l, r; |
|---|
| | 527 | char_t l_, r_; |
|---|
| 476 | 528 | |
|---|
| 477 | 529 | static CharRange opCall(char_t c) |
|---|
| 478 | 530 | { |
|---|
| 479 | | CharRange r; |
|---|
| 480 | | r.l = c; |
|---|
| 481 | | r.r = c; |
|---|
| 482 | | return r; |
|---|
| | 531 | CharRange cr; |
|---|
| | 532 | cr.l_ = c; |
|---|
| | 533 | cr.r_ = c; |
|---|
| | 534 | return cr; |
|---|
| 483 | 535 | } |
|---|
| 484 | 536 | |
|---|
| 485 | 537 | static CharRange opCall(char_t a, char_t b) |
|---|
| 486 | 538 | { |
|---|
| 487 | | CharRange r; |
|---|
| 488 | | r.l = min(a,b); |
|---|
| 489 | | r.r = max(a,b); |
|---|
| 490 | | return r; |
|---|
| 491 | | } |
|---|
| 492 | | |
|---|
| | 539 | CharRange cr; |
|---|
| | 540 | cr.l_ = min(a,b); |
|---|
| | 541 | cr.r_ = max(a,b); |
|---|
| | 542 | return cr; |
|---|
| | 543 | } |
|---|
| | 544 | |
|---|
| | 545 | char_t l() |
|---|
| | 546 | { |
|---|
| | 547 | return l_; |
|---|
| | 548 | } |
|---|
| | 549 | |
|---|
| | 550 | char_t r() |
|---|
| | 551 | { |
|---|
| | 552 | return r_; |
|---|
| | 553 | } |
|---|
| | 554 | |
|---|
| | 555 | /* ******************************************************************************************** |
|---|
| | 556 | Compares the ranges according to their beginning. |
|---|
| | 557 | **********************************************************************************************/ |
|---|
| 493 | 558 | int opCmp(CharRange cr) |
|---|
| 494 | 559 | { |
|---|
| 495 | | if ( l == cr.l ) |
|---|
| | 560 | if ( l_ == cr.l_ ) |
|---|
| 496 | 561 | return 0; |
|---|
| 497 | | if ( l < cr.l ) |
|---|
| | 562 | if ( l_ < cr.l_ ) |
|---|
| 498 | 563 | return -1; |
|---|
| 499 | 564 | return 1; |
|---|
| … | … | |
| 502 | 567 | bool contains(char_t c) |
|---|
| 503 | 568 | { |
|---|
| 504 | | return c >= l && c <= r; |
|---|
| | 569 | return c >= l_ && c <= r_; |
|---|
| 505 | 570 | } |
|---|
| 506 | 571 | |
|---|
| 507 | 572 | bool contains(CharRange cr) |
|---|
| 508 | 573 | { |
|---|
| 509 | | return l <= cr.l && r >= cr.r; |
|---|
| | 574 | return l_ <= cr.l_ && r_ >= cr.r_; |
|---|
| 510 | 575 | } |
|---|
| 511 | 576 | |
|---|
| 512 | 577 | bool intersects(CharRange cr) |
|---|
| 513 | 578 | { |
|---|
| 514 | | return r >= cr.l && l <= cr.r; |
|---|
| | 579 | return r_ >= cr.l_ && l_ <= cr.r_; |
|---|
| 515 | 580 | } |
|---|
| 516 | 581 | |
|---|
| … | … | |
| 519 | 584 | assert(intersects(cr)); |
|---|
| 520 | 585 | CharRange ir; |
|---|
| 521 | | ir.l = max(l, cr.l); |
|---|
| 522 | | ir.r = min(r, cr.r); |
|---|
| | 586 | ir.l_ = max(l_, cr.l_); |
|---|
| | 587 | ir.r_ = min(r_, cr.r_); |
|---|
| | 588 | if ( ir.l_ > ir.r_ ) |
|---|
| | 589 | ir.l_ = ir.r_ = char_t.min; |
|---|
| 523 | 590 | return ir; |
|---|
| 524 | 591 | } |
|---|
| … | … | |
| 536 | 603 | if ( contains(cr) ) |
|---|
| 537 | 604 | { |
|---|
| 538 | | d.l = l; |
|---|
| 539 | | d.r = cr.l-1; |
|---|
| 540 | | if ( d.l <= d.r ) |
|---|
| | 605 | d.l_ = l_; |
|---|
| | 606 | d.r_ = cr.l_-1; |
|---|
| | 607 | if ( d.l_ <= d.r_ ) |
|---|
| 541 | 608 | sr ~= d; |
|---|
| 542 | | d.l = cr.r+1; |
|---|
| 543 | | d.r = r; |
|---|
| 544 | | if ( d.l <= d.r ) |
|---|
| | 609 | d.l_ = cr.r_+1; |
|---|
| | 610 | d.r_ = r_; |
|---|
| | 611 | if ( d.l_ <= d.r_ ) |
|---|
| 545 | 612 | sr ~= d; |
|---|
| 546 | 613 | } |
|---|
| 547 | | else if ( cr.r > l ) |
|---|
| 548 | | { |
|---|
| 549 | | d.l = cr.r+1; |
|---|
| 550 | | d.r = r; |
|---|
| 551 | | if ( d.l <= d.r ) |
|---|
| | 614 | else if ( cr.r_ > l_ ) |
|---|
| | 615 | { |
|---|
| | 616 | d.l_ = cr.r_+1; |
|---|
| | 617 | d.r_ = r_; |
|---|
| | 618 | if ( d.l_ <= d.r_ ) |
|---|
| 552 | 619 | sr ~= d; |
|---|
| 553 | 620 | } |
|---|
| 554 | | else if ( cr.l < r ) |
|---|
| 555 | | { |
|---|
| 556 | | d.l = l; |
|---|
| 557 | | d.r = cr.l-1; |
|---|
| 558 | | if ( d.l <= d.r ) |
|---|
| | 621 | else if ( cr.l_ < r_ ) |
|---|
| | 622 | { |
|---|
| | 623 | d.l_ = l_; |
|---|
| | 624 | d.r_ = cr.l_-1; |
|---|
| | 625 | if ( d.l_ <= d.r_ ) |
|---|
| 559 | 626 | sr ~= d; |
|---|
| 560 | 627 | } |
|---|
| … | … | |
| 567 | 634 | string str; |
|---|
| 568 | 635 | auto layout = new Layout!(char); |
|---|
| 569 | | if ( l == r ) |
|---|
| 570 | | { |
|---|
| 571 | | if ( l > 0x20 && l < 0x7f ) |
|---|
| 572 | | str = layout.convert("'{}'", l); |
|---|
| | 636 | if ( l_ == r_ ) |
|---|
| | 637 | { |
|---|
| | 638 | if ( l_ > 0x20 && l_ < 0x7f ) |
|---|
| | 639 | str = layout.convert("'{}'", l_); |
|---|
| 573 | 640 | else |
|---|
| 574 | | str = layout.convert("({:x})", cast(int)l); |
|---|
| | 641 | str = layout.convert("({:x})", cast(int)l_); |
|---|
| 575 | 642 | } |
|---|
| 576 | 643 | else |
|---|
| 577 | 644 | { |
|---|
| 578 | | if ( l > 0x20 && l < 0x7f ) |
|---|
| 579 | | str = layout.convert("'{}'", l); |
|---|
| | 645 | if ( l_ > 0x20 && l_ < 0x7f ) |
|---|
| | 646 | str = layout.convert("'{}'", l_); |
|---|
| 580 | 647 | else |
|---|
| 581 | | str = layout.convert("({:x})", cast(int)l); |
|---|
| | 648 | str = layout.convert("({:x})", cast(int)l_); |
|---|
| 582 | 649 | str ~= "-"; |
|---|
| 583 | | if ( r > 0x20 && r < 0x7f ) |
|---|
| 584 | | str ~= layout.convert("'{}'", r); |
|---|
| | 650 | if ( r_ > 0x20 && r_ < 0x7f ) |
|---|
| | 651 | str ~= layout.convert("'{}'", r_); |
|---|
| 585 | 652 | else |
|---|
| 586 | | str ~= layout.convert("({:x})", cast(int)r); |
|---|
| | 653 | str ~= layout.convert("({:x})", cast(int)r_); |
|---|
| 587 | 654 | } |
|---|
| 588 | 655 | return str; |
|---|
| … | … | |
| 601 | 668 | static const CharClass!(char_t) |
|---|
| 602 | 669 | line_startend = {parts: [ |
|---|
| 603 | | {l:0x00, r:0x00}, |
|---|
| 604 | | {l:0x0a, r:0x0a}, |
|---|
| 605 | | {l:0x13, r:0x13} |
|---|
| | 670 | {l_:0x00, r_:0x00}, |
|---|
| | 671 | {l_:0x0a, r_:0x0a}, |
|---|
| | 672 | {l_:0x13, r_:0x13} |
|---|
| 606 | 673 | ]}, |
|---|
| 607 | 674 | digit = {parts: [ |
|---|
| 608 | | {l:0x30, r:0x39} |
|---|
| | 675 | {l_:0x30, r_:0x39} |
|---|
| 609 | 676 | ]}, |
|---|
| 610 | 677 | whitespace = {parts: [ |
|---|
| 611 | | {l:0x00, r:0x00}, |
|---|
| 612 | | {l:0x09, r:0x09}, |
|---|
| 613 | | {l:0x0a, r:0x0a}, |
|---|
| 614 | | {l:0x0b, r:0x0b}, |
|---|
| 615 | | {l:0x13, r:0x13}, |
|---|
| 616 | | {l:0x14, r:0x14}, |
|---|
| 617 | | {l:0x20, r:0x20} |
|---|
| | 678 | {l_:0x09, r_:0x09}, |
|---|
| | 679 | {l_:0x0a, r_:0x0a}, |
|---|
| | 680 | {l_:0x0b, r_:0x0b}, |
|---|
| | 681 | {l_:0x13, r_:0x13}, |
|---|
| | 682 | {l_:0x14, r_:0x14}, |
|---|
| | 683 | {l_:0x20, r_:0x20} |
|---|
| 618 | 684 | ]}; |
|---|
| 619 | 685 | |
|---|
| … | … | |
| 623 | 689 | static const CharClass!(char_t) |
|---|
| 624 | 690 | any_char = {parts: [ |
|---|
| 625 | | {l:0x00, r:0x00}, |
|---|
| 626 | | {l:0x09, r:0x13}, // basic control chars |
|---|
| 627 | | {l:0x20, r:0x7e}, // basic latin |
|---|
| 628 | | {l:0xa0, r:0xff} // latin-1 supplement |
|---|
| | 691 | {l_:0x01, r_:0xff} |
|---|
| 629 | 692 | ]}, |
|---|
| 630 | 693 | dot_oper = {parts: [ |
|---|
| 631 | | {l:0x09, r:0x13}, // basic control chars |
|---|
| 632 | | {l:0x20, r:0x7e}, // basic latin |
|---|
| 633 | | {l:0xa0, r:0xff} // latin-1 supplement |
|---|
| | 694 | {l_:0x09, r_:0x13}, // basic control chars |
|---|
| | 695 | {l_:0x20, r_:0x7e}, // basic latin |
|---|
| | 696 | {l_:0xa0, r_:0xff} // latin-1 supplement |
|---|
| 634 | 697 | ]}, |
|---|
| 635 | 698 | alphanum_ = {parts: [ |
|---|
| 636 | | {l:0x30, r:0x39}, |
|---|
| 637 | | {l:0x41, r:0x5a}, |
|---|
| 638 | | {l:0x5f, r:0x5f}, |
|---|
| 639 | | {l:0x61, r:0x7a} |
|---|
| | 699 | {l_:0x30, r_:0x39}, |
|---|
| | 700 | {l_:0x41, r_:0x5a}, |
|---|
| | 701 | {l_:0x5f, r_:0x5f}, |
|---|
| | 702 | {l_:0x61, r_:0x7a} |
|---|
| 640 | 703 | ]}; |
|---|
| 641 | 704 | } |
|---|
| … | … | |
| 645 | 708 | static const CharClass!(char_t) |
|---|
| 646 | 709 | any_char = {parts: [ |
|---|
| 647 | | {l:0x00, r:0x00}, |
|---|
| 648 | | {l:0x09,r:0x13},{l:0x20, r:0x7e},{l:0xa0, r:0xff}, |
|---|
| 649 | | {l:0x0100, r:0x017f}, // latin extended a |
|---|
| 650 | | {l:0x0180, r:0x024f}, // latin extended b |
|---|
| 651 | | {l:0x20a3, r:0x20b5}, // currency symbols |
|---|
| | 710 | {l_:0x0001, r_:0xffff} |
|---|
| 652 | 711 | ]}, |
|---|
| 653 | 712 | dot_oper = {parts: [ |
|---|
| 654 | | {l:0x09,r:0x13},{l:0x20, r:0x7e},{l:0xa0, r:0xff}, |
|---|
| 655 | | {l:0x0100, r:0x017f}, // latin extended a |
|---|
| 656 | | {l:0x0180, r:0x024f}, // latin extended b |
|---|
| 657 | | {l:0x20a3, r:0x20b5}, // currency symbols |
|---|
| | 713 | {l_:0x09,r_:0x13},{l_:0x20, r_:0x7e},{l_:0xa0, r_:0xff}, |
|---|
| | 714 | {l_:0x0100, r_:0x017f}, // latin extended a |
|---|
| | 715 | {l_:0x0180, r_:0x024f}, // latin extended b |
|---|
| | 716 | {l_:0x20a3, r_:0x20b5}, // currency symbols |
|---|
| 658 | 717 | ]}, |
|---|
| 659 | 718 | alphanum_ = {parts: [ |
|---|
| 660 | | {l:0x30, r:0x39}, |
|---|
| 661 | | {l:0x41, r:0x5a}, |
|---|
| 662 | | {l:0x5f, r:0x5f}, |
|---|
| 663 | | {l:0x61, r:0x7a} |
|---|
| | 719 | {l_:0x30, r_:0x39}, |
|---|
| | 720 | {l_:0x41, r_:0x5a}, |
|---|
| | 721 | {l_:0x5f, r_:0x5f}, |
|---|
| | 722 | {l_:0x61, r_:0x7a} |
|---|
| 664 | 723 | ]}; |
|---|
| 665 | 724 | } |
|---|
| … | … | |
| 667 | 726 | //--------------------------------------------------------------------------------------------- |
|---|
| 668 | 727 | range_t[] parts; |
|---|
| | 728 | |
|---|
| | 729 | invariant() |
|---|
| | 730 | { |
|---|
| | 731 | // foreach ( i, p; parts ) |
|---|
| | 732 | // assert(p.l_<=p.r_, Int.toString(i)~": "~Int.toString(p.l_)~" > "~Int.toString(p.r_)); |
|---|
| | 733 | } |
|---|
| | 734 | |
|---|
| | 735 | static CharClass opCall(CharClass cc) |
|---|
| | 736 | { |
|---|
| | 737 | CharClass ncc; |
|---|
| | 738 | ncc.parts = cc.parts.dup; |
|---|
| | 739 | return ncc; |
|---|
| | 740 | } |
|---|
| 669 | 741 | |
|---|
| 670 | 742 | int opCmp(CharClass cc) |
|---|
| … | … | |
| 676 | 748 | foreach ( i, p; cc.parts ) |
|---|
| 677 | 749 | { |
|---|
| 678 | | if ( p.l != parts[i].l || p.r != parts[i].r ) |
|---|
| | 750 | if ( p.l_ != parts[i].l_ || p.r_ != parts[i].r_ ) |
|---|
| 679 | 751 | return 1; |
|---|
| 680 | 752 | } |
|---|
| … | … | |
| 708 | 780 | } |
|---|
| 709 | 781 | } |
|---|
| 710 | | ic.optimize; |
|---|
| 711 | 782 | return ic; |
|---|
| 712 | 783 | } |
|---|
| … | … | |
| 717 | 788 | add(cc); |
|---|
| 718 | 789 | negate; |
|---|
| | 790 | } |
|---|
| | 791 | |
|---|
| | 792 | void add(CharClass cc) |
|---|
| | 793 | { |
|---|
| | 794 | parts ~= cc.parts; |
|---|
| | 795 | } |
|---|
| | 796 | |
|---|
| | 797 | void add(char_t c) |
|---|
| | 798 | { |
|---|
| | 799 | parts ~= CharRange!(char_t)(c); |
|---|
| | 800 | } |
|---|
| | 801 | |
|---|
| | 802 | /* ******************************************************************************************** |
|---|
| | 803 | Requires the CharClass to be optimized. |
|---|
| | 804 | **********************************************************************************************/ |
|---|
| | 805 | void negate() |
|---|
| | 806 | { |
|---|
| 719 | 807 | optimize; |
|---|
| 720 | | } |
|---|
| 721 | | |
|---|
| 722 | | void add(CharClass cc) |
|---|
| 723 | | { |
|---|
| 724 | | parts ~= cc.parts; |
|---|
| 725 | | optimize; |
|---|
| 726 | | } |
|---|
| 727 | | |
|---|
| 728 | | void add(char_t c) |
|---|
| 729 | | { |
|---|
| 730 | | parts ~= CharRange!(char_t)(c); |
|---|
| 731 | | optimize; |
|---|
| 732 | | } |
|---|
| 733 | | |
|---|
| 734 | | void negate() |
|---|
| 735 | | { |
|---|
| 736 | | if ( empty ) { |
|---|
| 737 | | parts ~= any_char.parts; |
|---|
| | 808 | char_t start = char_t.min; |
|---|
| | 809 | |
|---|
| | 810 | // first part touches left boundary of value range |
|---|
| | 811 | if ( parts.length > 0 && parts[0].l_ == start ) |
|---|
| | 812 | { |
|---|
| | 813 | start = parts[0].r_; |
|---|
| | 814 | if ( start < char_t.max ) |
|---|
| | 815 | ++start; |
|---|
| | 816 | |
|---|
| | 817 | foreach ( i, ref cr; parts[0 .. $-1] ) |
|---|
| | 818 | { |
|---|
| | 819 | cr.l_ = start; |
|---|
| | 820 | cr.r_ = parts[i+1].l_-1; |
|---|
| | 821 | start = parts[i+1].r_; |
|---|
| | 822 | if ( start < char_t.max ) |
|---|
| | 823 | ++start; |
|---|
| | 824 | } |
|---|
| | 825 | if ( start != char_t.max ) { |
|---|
| | 826 | parts[$-1].l_ = start; |
|---|
| | 827 | parts[$-1].r_ = char_t.max; |
|---|
| | 828 | } |
|---|
| | 829 | else |
|---|
| | 830 | parts.length = parts.length-1; |
|---|
| 738 | 831 | return; |
|---|
| 739 | 832 | } |
|---|
| 740 | | |
|---|
| 741 | | optimize; |
|---|
| 742 | | range_t[] oldparts = parts; |
|---|
| 743 | | parts = null; |
|---|
| 744 | | |
|---|
| 745 | | Stack!(range_t) stack; |
|---|
| 746 | | foreach_reverse ( p; any_char.parts ) |
|---|
| 747 | | stack ~= p; |
|---|
| 748 | | outerLoop: while ( !stack.empty ) |
|---|
| 749 | | { |
|---|
| 750 | | range_t r = stack.top; |
|---|
| 751 | | stack.pop; |
|---|
| 752 | | |
|---|
| 753 | | foreach ( op; oldparts ) |
|---|
| 754 | | { |
|---|
| 755 | | range_t[] sr = r.subtract(op); |
|---|
| 756 | | |
|---|
| 757 | | if ( sr.length == 0 ) |
|---|
| 758 | | continue outerLoop; |
|---|
| 759 | | if ( sr.length == 2 ) |
|---|
| 760 | | stack ~= sr[1]; |
|---|
| 761 | | r = sr[0]; |
|---|
| 762 | | } |
|---|
| 763 | | |
|---|
| 764 | | parts ~= r; |
|---|
| 765 | | } |
|---|
| | 833 | |
|---|
| | 834 | foreach ( i, ref cr; parts ) |
|---|
| | 835 | { |
|---|
| | 836 | char_t tmp = cr.l_-1; |
|---|
| | 837 | cr.l_ = start; |
|---|
| | 838 | start = cr.r_; |
|---|
| | 839 | if ( start < char_t.max ) |
|---|
| | 840 | ++start; |
|---|
| | 841 | cr.r_ = tmp; |
|---|
| | 842 | } |
|---|
| | 843 | |
|---|
| | 844 | // last part does not touch right boundary |
|---|
| | 845 | if ( start != char_t.max ) |
|---|
| | 846 | parts ~= range_t(start, char_t.max); |
|---|
| 766 | 847 | } |
|---|
| 767 | 848 | |
|---|
| … | … | |
| 771 | 852 | return; |
|---|
| 772 | 853 | |
|---|
| 773 | | range_t[] oldparts = parts; |
|---|
| 774 | | oldparts.sort; |
|---|
| 775 | | parts = null; |
|---|
| 776 | | |
|---|
| 777 | | range_t cp = oldparts[0]; |
|---|
| 778 | | foreach ( ocp; oldparts[1..$] ) |
|---|
| 779 | | { |
|---|
| 780 | | if ( ocp.l > cp.r+1 ) { |
|---|
| 781 | | parts ~= cp; |
|---|
| 782 | | cp = ocp; |
|---|
| 783 | | } |
|---|
| 784 | | else if ( ocp.l <= cp.r+1 ) |
|---|
| 785 | | cp.r = max(ocp.r, cp.r); |
|---|
| 786 | | } |
|---|
| 787 | | parts ~= cp; |
|---|
| | 854 | parts.sort; |
|---|
| | 855 | |
|---|
| | 856 | size_t i = 0; |
|---|
| | 857 | foreach ( p; parts[1 .. $] ) |
|---|
| | 858 | { |
|---|
| | 859 | if ( p.l_ > parts[i].r_+1 ) { |
|---|
| | 860 | ++i; |
|---|
| | 861 | parts[i].l_ = p.l_; |
|---|
| | 862 | parts[i].r_ = p.r_; |
|---|
| | 863 | continue; |
|---|
| | 864 | } |
|---|
| | 865 | parts[i].r_ = max(p.r_, parts[i].r_); |
|---|
| | 866 | if ( parts[i].r_ >= char_t.max ) |
|---|
| | 867 | break; |
|---|
| | 868 | } |
|---|
| | 869 | parts.length = i+1; |
|---|
| 788 | 870 | } |
|---|
| 789 | 871 | |
|---|
| … | … | |
| 799 | 881 | } |
|---|
| 800 | 882 | |
|---|
| | 883 | import tango.io.Stdout; |
|---|
| | 884 | unittest |
|---|
| | 885 | { |
|---|
| | 886 | Stdout.formatln("CharClass unittest"); |
|---|
| | 887 | static CharClass!(char) cc = { parts: [{l_:0,r_:10},{l_:0,r_:6},{l_:5,r_:12},{l_:12,r_:17},{l_:20,r_:100}] }; |
|---|
| | 888 | Stdout.formatln("{}", cc.toString); |
|---|
| | 889 | cc.optimize; |
|---|
| | 890 | Stdout.formatln("optimized: {}", cc.toString); |
|---|
| | 891 | cc.negate; |
|---|
| | 892 | Stdout.formatln("negated: {}", cc.toString); |
|---|
| | 893 | cc.optimize; |
|---|
| | 894 | Stdout.formatln("optimized: {}", cc.toString); |
|---|
| | 895 | cc.negate; |
|---|
| | 896 | Stdout.formatln("negated: {}", cc.toString); |
|---|
| | 897 | |
|---|
| | 898 | static CharClass!(char) cc2 = { parts: [] }; |
|---|
| | 899 | Stdout.formatln("{}", cc2.toString); |
|---|
| | 900 | cc2.optimize; |
|---|
| | 901 | Stdout.formatln("optimized: {}", cc2.toString); |
|---|
| | 902 | cc2.negate; |
|---|
| | 903 | Stdout.formatln("negated: {}", cc2.toString); |
|---|
| | 904 | cc2.optimize; |
|---|
| | 905 | Stdout.formatln("optimized: {}", cc2.toString); |
|---|
| | 906 | cc2.negate; |
|---|
| | 907 | Stdout.formatln("negated: {}", cc2.toString); |
|---|
| | 908 | |
|---|
| | 909 | static CharClass!(char) cc3 = { parts: [{l_:0,r_:100},{l_:200,r_:0xff},] }; |
|---|
| | 910 | Stdout.formatln("{}", cc3.toString); |
|---|
| | 911 | cc3.negate; |
|---|
| | 912 | Stdout.formatln("negated: {}", cc3.toString); |
|---|
| | 913 | cc3.negate; |
|---|
| | 914 | Stdout.formatln("negated: {}", cc3.toString); |
|---|
| | 915 | |
|---|
| | 916 | static CharClass!(char) cc4 = { parts: [{l_:0,r_:200},{l_:100,r_:0xff},] }; |
|---|
| | 917 | Stdout.formatln("{}", cc4.toString); |
|---|
| | 918 | cc4.optimize; |
|---|
| | 919 | Stdout.formatln("optimized: {}", cc4.toString); |
|---|
| | 920 | |
|---|
| | 921 | static CharClass!(dchar) cc5 = { parts: [{l_:0x9,r_:0x13},{0x20,r_:'~'},{l_:0xa0,r_:0xff},{l_:0x100,r_:0x17f},{l_:0x180,r_:0x24f},{l_:0x20a3,r_:0x20b5}] }; |
|---|
| | 922 | Stdout.formatln("{}", cc5.toString); |
|---|
| | 923 | cc5.optimize; |
|---|
| | 924 | Stdout.formatln("optimized: {}", cc5.toString); |
|---|
| | 925 | cc5.negate; |
|---|
| | 926 | Stdout.formatln("negated: {}", cc5.toString); |
|---|
| | 927 | cc5.optimize; |
|---|
| | 928 | Stdout.formatln("optimized: {}", cc5.toString); |
|---|
| | 929 | cc5.negate; |
|---|
| | 930 | Stdout.formatln("negated: {}", cc5.toString); |
|---|
| | 931 | assert(0, "control the results for yourself, i'm too lazy to type all those asserts ;)"); |
|---|
| | 932 | } |
|---|
| | 933 | |
|---|
| 801 | 934 | |
|---|
| 802 | 935 | /* ************************************************************************************************ |
|---|
| … | … | |
| 818 | 951 | |
|---|
| 819 | 952 | // data for compiled predicates |
|---|
| 820 | | const uint MAX_BITMAP_LENGTH = 256*8, |
|---|
| | 953 | const uint MAX_BITMAP_LENGTH = 256, |
|---|
| 821 | 954 | MAX_SEARCH_LENGTH = 256; |
|---|
| 822 | 955 | enum MatchMode { |
|---|
| … | … | |
| 839 | 972 | |
|---|
| 840 | 973 | // single char? |
|---|
| 841 | | if ( input.parts.length == 1 && input.parts[0].l == input.parts[0].r ) |
|---|
| | 974 | if ( input.parts.length == 1 && input.parts[0].l_ == input.parts[0].r_ ) |
|---|
| 842 | 975 | { |
|---|
| 843 | 976 | mode = type==Type.consume ? MatchMode.single_char : MatchMode.single_char_l; |
|---|
| 844 | | data_chr = input.parts[0].l; |
|---|
| | 977 | data_chr = input.parts[0].l_; |
|---|
| 845 | 978 | return; |
|---|
| 846 | 979 | } |
|---|
| 847 | | |
|---|
| 848 | 980 | // check whether we can use a bitmap |
|---|
| 849 | 981 | foreach ( p; input.parts ) |
|---|
| 850 | 982 | { |
|---|
| 851 | | if ( p.l > MAX_BITMAP_LENGTH || p.r > MAX_BITMAP_LENGTH ) |
|---|
| | 983 | if ( p.l_ > MAX_BITMAP_LENGTH || p.r_ > MAX_BITMAP_LENGTH ) |
|---|
| 852 | 984 | goto LnoBitmap; |
|---|
| 853 | 985 | } |
|---|
| … | … | |
| 857 | 989 | foreach ( p; input.parts ) |
|---|
| 858 | 990 | { |
|---|
| 859 | | for ( char_t c = p.l; c <= p.r; ++c ) |
|---|
| | 991 | for ( char_t c = p.l_; c <= p.r_; ++c ) |
|---|
| 860 | 992 | data_bmp[c/8] |= 1 << (c&7); |
|---|
| 861 | 993 | } |
|---|
| … | … | |
| 864 | 996 | |
|---|
| 865 | 997 | LnoBitmap: |
|---|
| | 998 | /* |
|---|
| 866 | 999 | // check whether the class is small enough to justify a string-search |
|---|
| 867 | 1000 | // TODO: consider inverse class for 8bit chars? |
|---|
| 868 | 1001 | uint class_size; |
|---|
| 869 | 1002 | foreach ( p; input.parts ) |
|---|
| 870 | | class_size += cast(uint)p.r+1-p.l; |
|---|
| | 1003 | class_size += cast(uint)p.r_+1-p.l_; |
|---|
| 871 | 1004 | if ( class_size > MAX_SEARCH_LENGTH ) |
|---|
| 872 | 1005 | goto Lgeneric; |
|---|
| … | … | |
| 875 | 1008 | foreach ( p; input.parts ) |
|---|
| 876 | 1009 | { |
|---|
| 877 | | for ( char_t c = p.l; c <= p.r; ++c ) |
|---|
| | 1010 | for ( char_t c = p.l_; c <= p.r_; ++c ) |
|---|
| 878 | 1011 | data_str[ind++] = c; |
|---|
| 879 | 1012 | } |
|---|
| 880 | 1013 | mode = type==Type.consume ? MatchMode.string_search : MatchMode.string_search_l; |
|---|
| 881 | 1014 | return; |
|---|
| 882 | | |
|---|
| | 1015 | */ |
|---|
| 883 | 1016 | Lgeneric: |
|---|
| 884 | 1017 | data_str = cast(char_t[])input.parts; |
|---|
| … | … | |
| 1078 | 1211 | alias char_t[] string_t; |
|---|
| 1079 | 1212 | alias CharRange!(char_t) range_t; |
|---|
| | 1213 | alias CharClass!(char_t) cc_t; |
|---|
| 1080 | 1214 | |
|---|
| 1081 | 1215 | string_t pattern; |
|---|
| … | … | |
| 1218 | 1352 | // and matching bracket for total match group |
|---|
| 1219 | 1353 | if ( unanchored ) { |
|---|
| 1220 | | frags ~= constructChars(CharClass!(char_t).dot_oper, predicate_t.Type.consume); |
|---|
| | 1354 | frags ~= constructChars(cc_t.dot_oper, predicate_t.Type.consume); |
|---|
| 1221 | 1355 | perform(Operator.zero_more_xr, false); |
|---|
| 1222 | 1356 | perform(Operator.concat, false); |
|---|
| … | … | |
| 1298 | 1432 | perform(Operator.concat, false); |
|---|
| 1299 | 1433 | implicit_concat = true; |
|---|
| 1300 | | frags ~= constructChars(CharClass!(char_t).dot_oper, pred_type); |
|---|
| | 1434 | frags ~= constructChars(cc_t.dot_oper, pred_type); |
|---|
| 1301 | 1435 | break; |
|---|
| 1302 | 1436 | case '$': |
|---|
| … | … | |
| 1305 | 1439 | implicit_concat = true; |
|---|
| 1306 | 1440 | |
|---|
| 1307 | | frags ~= constructChars(CharClass!(char_t).line_startend, predicate_t.Type.lookahead); |
|---|
| | 1441 | frags ~= constructChars(cc_t.line_startend, predicate_t.Type.lookahead); |
|---|
| 1308 | 1442 | break; |
|---|
| 1309 | 1443 | case '^': |
|---|
| … | … | |
| 1312 | 1446 | implicit_concat = true; |
|---|
| 1313 | 1447 | |
|---|
| 1314 | | frags ~= constructChars(CharClass!(char_t).line_startend, predicate_t.Type.lookbehind); |
|---|
| | 1448 | frags ~= constructChars(cc_t.line_startend, predicate_t.Type.lookbehind); |
|---|
| 1315 | 1449 | break; |
|---|
| 1316 | 1450 | case '>': |
|---|
| … | … | |
| 1325 | 1459 | else |
|---|
| 1326 | 1460 | goto default; |
|---|
| 1327 | | break; |
|---|
| 1328 | 1461 | case '<': |
|---|
| 1329 | 1462 | c = readPattern; |
|---|
| … | … | |
| 1337 | 1470 | else |
|---|
| 1338 | 1471 | goto default; |
|---|
| 1339 | | break; |
|---|
| 1340 | 1472 | case '\\': |
|---|
| 1341 | 1473 | c = readPattern; |
|---|
| … | … | |
| 1357 | 1489 | break; |
|---|
| 1358 | 1490 | case 'w': // alphanumeric and _ |
|---|
| 1359 | | frags ~= constructChars(CharClass!(char_t).alphanum_, pred_type); |
|---|
| | 1491 | frags ~= constructChars(cc_t.alphanum_, pred_type); |
|---|
| 1360 | 1492 | break; |
|---|
| 1361 | 1493 | case 'W': // non-(alphanum and _) |
|---|
| 1362 | | auto cc = CharClass!(char_t).alphanum_; |
|---|
| | 1494 | auto cc = cc_t(cc_t.alphanum_); |
|---|
| 1363 | 1495 | cc.negate; |
|---|
| 1364 | 1496 | frags ~= constructChars(cc, pred_type); |
|---|
| 1365 | 1497 | break; |
|---|
| 1366 | 1498 | case 's': // whitespace |
|---|
| 1367 | | frags ~= constructChars(CharClass!(char_t).whitespace, pred_type); |
|---|
| | 1499 | frags ~= constructChars(cc_t.whitespace, pred_type); |
|---|
| 1368 | 1500 | break; |
|---|
| 1369 | 1501 | case 'S': // non-whitespace |
|---|
| 1370 | | auto cc = CharClass!(char_t).whitespace; |
|---|
| | 1502 | auto cc = cc_t(cc_t.whitespace); |
|---|
| 1371 | 1503 | cc.negate; |
|---|
| 1372 | 1504 | frags ~= constructChars(cc, pred_type); |
|---|
| 1373 | 1505 | break; |
|---|
| 1374 | 1506 | case 'd': // digit |
|---|
| 1375 | | frags ~= constructChars(CharClass!(char_t).digit, pred_type); |
|---|
| | 1507 | frags ~= constructChars(cc_t.digit, pred_type); |
|---|
| 1376 | 1508 | break; |
|---|
| 1377 | 1509 | case 'D': // non-digit |
|---|
| 1378 | | auto cc = CharClass!(char_t).digit; |
|---|
| | 1510 | auto cc = cc_t(cc_t.digit); |
|---|
| 1379 | 1511 | cc.negate; |
|---|
| 1380 | 1512 | frags ~= constructChars(cc, pred_type); |
|---|
| … | … | |
| 1385 | 1517 | |
|---|
| 1386 | 1518 | // create (?<\S>\s|<\s>\S) |
|---|
| 1387 | | auto cc = CharClass!(char_t).whitespace; |
|---|
| | 1519 | auto cc = cc_t(cc_t.whitespace); |
|---|
| 1388 | 1520 | cc.negate; |
|---|
| 1389 | 1521 | |
|---|
| … | … | |
| 1392 | 1524 | frags ~= constructChars(cc, predicate_t.Type.lookbehind); |
|---|
| 1393 | 1525 | perform(Operator.concat, false); |
|---|
| 1394 | | frags ~= constructChars(CharClass!(char_t).whitespace, predicate_t.Type.lookahead); |
|---|
| | 1526 | frags ~= constructChars(cc_t.whitespace, predicate_t.Type.lookahead); |
|---|
| 1395 | 1527 | perform(Operator.altern, false); |
|---|
| 1396 | | frags ~= constructChars(CharClass!(char_t).whitespace, predicate_t.Type.lookbehind); |
|---|
| | 1528 | frags ~= constructChars(cc_t.whitespace, predicate_t.Type.lookbehind); |
|---|
| 1397 | 1529 | perform(Operator.concat, false); |
|---|
| 1398 | 1530 | frags ~= constructChars(cc, predicate_t.Type.lookahead); |
|---|
| 1399 | | |
|---|
| | 1531 | |
|---|
| 1400 | 1532 | perform(Operator.close_par, false); |
|---|
| 1401 | 1533 | break; |
|---|
| … | … | |
| 1405 | 1537 | |
|---|
| 1406 | 1538 | // create (?<\S>\S|<\s>\s) |
|---|
| 1407 | | auto cc = CharClass!(char_t).whitespace; |
|---|
| | 1539 | auto cc = cc_t(cc_t.whitespace); |
|---|
| 1408 | 1540 | cc.negate; |
|---|
| 1409 | 1541 | |
|---|
| … | … | |
| 1414 | 1546 | frags ~= constructChars(cc, predicate_t.Type.lookahead); |
|---|
| 1415 | 1547 | perform(Operator.altern, false); |
|---|
| 1416 | | frags ~= constructChars(CharClass!(char_t).whitespace, predicate_t.Type.lookbehind); |
|---|
| | 1548 | frags ~= constructChars(cc_t.whitespace, predicate_t.Type.lookbehind); |
|---|
| 1417 | 1549 | perform(Operator.concat, false); |
|---|
| 1418 | | frags ~= constructChars(CharClass!(char_t).whitespace, predicate_t.Type.lookahead); |
|---|
| 1419 | | |
|---|
| | 1550 | frags ~= constructChars(cc_t.whitespace, predicate_t.Type.lookahead); |
|---|
| | 1551 | |
|---|
| 1420 | 1552 | perform(Operator.close_par, false); |
|---|
| 1421 | 1553 | break; |
|---|
| … | … | |
| 1436 | 1568 | case '<': |
|---|
| 1437 | 1569 | case '>': |
|---|
| | 1570 | default: |
|---|
| 1438 | 1571 | frags ~= constructSingleChar(c, pred_type); |
|---|
| 1439 | 1572 | break; |
|---|
| 1440 | | default: |
|---|
| 1441 | | throw new RegExpException(layout.convert("Unknown escape sequence \\{}", c)); |
|---|
| | 1573 | // throw new RegExpException(layout.convert("Unknown escape sequence \\{}", c)); |
|---|
| 1442 | 1574 | } |
|---|
| 1443 | 1575 | break; |
|---|
| … | … | |
| 1457 | 1589 | if ( implicit_concat ) |
|---|
| 1458 | 1590 | perform(Operator.concat, false); |
|---|
| 1459 | | frags ~= constructChars(CharClass!(char_t).dot_oper, predicate_t.Type.consume); |
|---|
| | 1591 | frags ~= constructChars(cc_t.dot_oper, predicate_t.Type.consume); |
|---|
| 1460 | 1592 | perform(Operator.zero_more_ng, false); |
|---|
| 1461 | 1593 | } |
|---|
| … | … | |
| 1463 | 1595 | // empty operator stack |
|---|
| 1464 | 1596 | while ( !perform(Operator.eos) ) {} |
|---|
| 1465 | | |
|---|
| | 1597 | |
|---|
| 1466 | 1598 | // set start and finish states |
|---|
| 1467 | 1599 | start = addState; |
|---|
| … | … | |
| 1703 | 1835 | frag_t constructChars(string_t chars, predicate_t.Type type) |
|---|
| 1704 | 1836 | { |
|---|
| 1705 | | CharClass!(char_t) cc; |
|---|
| | 1837 | cc_t cc; |
|---|
| 1706 | 1838 | for ( int i = 0; i < chars.length; ++i ) |
|---|
| 1707 | 1839 | cc.add(chars[i]); |
|---|
| … | … | |
| 1710 | 1842 | } |
|---|
| 1711 | 1843 | |
|---|
| 1712 | | frag_t constructChars(CharClass!(char_t) charclass, predicate_t.Type type) |
|---|
| 1713 | | { |
|---|
| 1714 | | debug(tnfa) Stdout.format("constructChars"); |
|---|
| | 1844 | frag_t constructChars(cc_t charclass, predicate_t.Type type) |
|---|
| | 1845 | { |
|---|
| | 1846 | debug(tnfa) Stdout.format("constructChars type={}", type); |
|---|
| 1715 | 1847 | |
|---|
| 1716 | 1848 | trans_t trans = addTransition; |
|---|
| 1717 | 1849 | trans.predicate.type = type; |
|---|
| 1718 | 1850 | |
|---|
| 1719 | | trans.predicate.setInput(charclass); |
|---|
| | 1851 | trans.predicate.setInput(cc_t(charclass)); |
|---|
| 1720 | 1852 | |
|---|
| 1721 | 1853 | trans.predicate.optimize; |
|---|
| … | … | |
| 1730 | 1862 | frag_t constructCharClass(predicate_t.Type type) |
|---|
| 1731 | 1863 | { |
|---|
| 1732 | | debug(tnfa) Stdout.format("constructCharClass"); |
|---|
| | 1864 | debug(tnfa) Stdout.format("constructCharClass type={}", type); |
|---|
| 1733 | 1865 | auto oldCursor = cursor; |
|---|
| 1734 | 1866 | |
|---|
| … | … | |
| 1941 | 2073 | prev = f; |
|---|
| 1942 | 2074 | } |
|---|
| 1943 | | |
|---|
| | 2075 | |
|---|
| 1944 | 2076 | if ( maxOccur == 0 ) |
|---|
| 1945 | 2077 | { |
|---|
| … | … | |
| 1967 | 2099 | prev = f; |
|---|
| 1968 | 2100 | } |
|---|
| 1969 | | |
|---|
| | 2101 | |
|---|
| 1970 | 2102 | for ( int i = minOccur; i < maxOccur; ++i ) |
|---|
| 1971 | 2103 | { |
|---|
| … | … | |
| 2054 | 2186 | { |
|---|
| 2055 | 2187 | alias Predicate!(char_t) predicate_t; |
|---|
| | 2188 | alias CharRange!(char_t) range_t; |
|---|
| | 2189 | alias CharClass!(char_t) charclass_t; |
|---|
| 2056 | 2190 | alias char_t[] string_t; |
|---|
| 2057 | 2191 | |
|---|
| … | … | |
| 2159 | 2293 | se.nfa_state = tnfa.start; |
|---|
| 2160 | 2294 | subset_start.elms ~= se; |
|---|
| 2161 | | subset_start = epsilonClosure(subset_start, subset_start); |
|---|
| 2162 | 2295 | |
|---|
| 2163 | 2296 | // apply lookbehind closure for string/line start |
|---|
| 2164 | | predicate_t pred; |
|---|
| 2165 | | pred.setInput(CharClass!(char_t).line_startend); |
|---|
| 2166 | | subset_start = lookbehindClosure(subset_start, pred); |
|---|
| | 2297 | predicate_t tmp_pred; |
|---|
| | 2298 | tmp_pred.setInput(CharClass!(char_t).line_startend); |
|---|
| | 2299 | subset_start = epsilonClosure(lookbehindClosure(epsilonClosure(subset_start, subset_start), tmp_pred), subset_start); |
|---|
| 2167 | 2300 | |
|---|
| 2168 | 2301 | start = addState; |
|---|
| … | … | |
| 2186 | 2319 | |
|---|
| 2187 | 2320 | // create transitions for each class, creating new states when necessary |
|---|
| 2188 | | foreach ( pred; disjointPredicates(state) ) |
|---|
| | 2321 | foreach ( disjoint; disjointPredicates(state) ) |
|---|
| 2189 | 2322 | { |
|---|
| 2190 | 2323 | // find NFA state we reach with pred |
|---|
| | 2324 | predicate_t pred; |
|---|
| | 2325 | pred.appendInput(disjoint); |
|---|
| | 2326 | |
|---|
| | 2327 | // reach will set predicate type correctly |
|---|
| | 2328 | debug(tdfa) Stdout.format("from {} with {} reach", state.dfa_state.index, pred.toString); |
|---|
| 2191 | 2329 | SubsetState target = reach(state, pred); |
|---|
| 2192 | | if ( target is null ) |
|---|
| 2193 | | { |
|---|
| 2194 | | debug(tdfa) { |
|---|
| 2195 | | Stdout.formatln("from {} with {} - lookbehind at beginning", state.dfa_state.index, pred.toString); |
|---|
| 2196 | | } |
|---|
| 2197 | | throw new Exception("Lookbehind at beginning of expression"); |
|---|
| | 2330 | if ( target is null ) { |
|---|
| | 2331 | continue; |
|---|
| | 2332 | debug(tdfa) Stdout.formatln(" nothing - lookbehind at beginning, skipping"); |
|---|
| 2198 | 2333 | } |
|---|
| 2199 | | debug(tdfa) { |
|---|
| 2200 | | Stdout.formatln("from {} with {} reach {}", state.dfa_state.index, pred.toString, target.toString); |
|---|
| 2201 | | } |
|---|
| 2202 | | target = epsilonClosure(target, state); |
|---|
| 2203 | | target = lookbehindClosure(target, pred); |
|---|
| | 2334 | debug(tdfa) Stdout.formatln(" {}", target.toString); |
|---|
| | 2335 | target = epsilonClosure(lookbehindClosure(epsilonClosure(target, state), pred), state); |
|---|
| 2204 | 2336 | |
|---|
| 2205 | 2337 | Transition trans = new Transition; |
|---|
| … | … | |
| 2324 | 2456 | } |
|---|
| 2325 | 2457 | |
|---|
| 2326 | | // TODO: add lookahead for string end somewhere |
|---|
| 2327 | | // TODO: minimize DFA |
|---|
| | 2458 | // TODO: add lookahead for string-end somewhere |
|---|
| 2328 | 2459 | // TODO: mark dead-end states (not leaving a non-finishing susbet) |
|---|
| 2329 | 2460 | // TODO: mark states that can leave the finishing subset of DFA states or use a greedy transition |
|---|
| … | … | |
| 2393 | 2524 | TNFAState!(char_t) nfa_state; |
|---|
| 2394 | 2525 | int[uint] tags; |
|---|
| | 2526 | // use place-value priority with 2 places, value(maxPrio) > value(lastPrio) |
|---|
| 2395 | 2527 | uint maxPriority, |
|---|
| 2396 | 2528 | lastPriority; |
|---|
| … | … | |
| 2463 | 2595 | } |
|---|
| 2464 | 2596 | return str~" ]"; |
|---|
| | 2597 | } |
|---|
| | 2598 | } |
|---|
| | 2599 | |
|---|
| | 2600 | /* ******************************************************************************************** |
|---|
| | 2601 | Temporary structure used for disjoint predicate computation |
|---|
| | 2602 | **********************************************************************************************/ |
|---|
| | 2603 | struct Mark |
|---|
| | 2604 | { |
|---|
| | 2605 | char_t c; |
|---|
| | 2606 | bool end; /// false = start of range |
|---|
| | 2607 | |
|---|
| | 2608 | int opCmp(Mark m) |
|---|
| | 2609 | { |
|---|
| | 2610 | if ( c < m.c ) |
|---|
| | 2611 | return -1; |
|---|
| | 2612 | if ( c == m.c ) |
|---|
| | 2613 | { |
|---|
| | 2614 | if ( !end ) |
|---|
| | 2615 | return -1; |
|---|
| | 2616 | if ( !m.end ) |
|---|
| | 2617 | return 1; |
|---|
| | 2618 | return 0; |
|---|
| | 2619 | } |
|---|
| | 2620 | return 1; |
|---|
| 2465 | 2621 | } |
|---|
| 2466 | 2622 | } |
|---|
| … | … | |
| 2490 | 2646 | } |
|---|
| 2491 | 2647 | |
|---|
| | 2648 | Mark[] marks_; |
|---|
| | 2649 | |
|---|
| 2492 | 2650 | /* ******************************************************************************************** < |
|---|
|