root/orange/xml/PhobosXML.d

Revision 35:511d1ef4e299, 69.6 kB (checked in by Jacob Carlborg <doob@me.com>, 3 years ago)

Now all unit tests pass on latest DMD2 compiler.

Line 
1 // Written in the D programming language.
2
3 /**
4 Classes and functions for creating and parsing XML
5
6 The basic architecture of this module is that there are standalone functions,
7 classes for constructing an XML document from scratch (Tag, Element and
8 Document), and also classes for parsing a pre-existing XML file (ElementParser
9 and DocumentParser). The parsing classes <i>may</i> be used to build a
10 Document, but that is not their primary purpose. The handling capabilities of
11 DocumentParser and ElementParser are sufficiently customizable that you can
12 make them do pretty much whatever you want.
13
14 Example: This example creates a DOM (Document Object Model) tree
15     from an XML file.
16 ------------------------------------------------------------------------------
17 import std.xml;
18 import std.stdio;
19 import std.string;
20
21 // books.xml is used in various samples throughout the Microsoft XML Core
22 // Services (MSXML) SDK.
23 //
24 // See http://msdn2.microsoft.com/en-us/library/ms762271(VS.85).aspx
25
26 void main()
27 {
28     string s = cast(string)std.file.read("books.xml");
29
30     // Check for well-formedness
31     check(s);
32
33     // Make a DOM tree
34     auto doc = new Document(s);
35
36     // Plain-print it
37     writeln(doc);
38 }
39 ------------------------------------------------------------------------------
40
41 Example: This example does much the same thing, except that the file is
42     deconstructed and reconstructed by hand. This is more work, but the
43     techniques involved offer vastly more power.
44 ------------------------------------------------------------------------------
45 import std.xml;
46 import std.stdio;
47 import std.string;
48
49 struct Book
50 {
51     string id;
52     string author;
53     string title;
54     string genre;
55     string price;
56     string pubDate;
57     string description;
58 }
59
60 void main()
61 {
62     string s = cast(string)std.file.read("books.xml");
63
64     // Check for well-formedness
65     check(s);
66
67     // Take it apart
68     Book[] books;
69
70     auto xml = new DocumentParser(s);
71     xml.onStartTag["book"] = (ElementParser xml)
72     {
73         Book book;
74         book.id = xml.tag.attr["id"];
75
76         xml.onEndTag["author"]     = (in Element e) { book.author     = e.text; };
77         xml.onEndTag["title"]       = (in Element e) { book.title      = e.text; };
78         xml.onEndTag["genre"]       = (in Element e) { book.genre      = e.text; };
79         xml.onEndTag["price"]       = (in Element e) { book.price      = e.text; };
80         xml.onEndTag["publish-date"] = (in Element e) { book.pubDate     = e.text; };
81         xml.onEndTag["description"]  = (in Element e) { book.description = e.text; };
82
83         xml.parse();
84
85         books ~= book;
86     };
87     xml.parse();
88
89     // Put it back together again
90     auto doc = new Document(new Tag("catalog"));
91     foreach(book;books)
92     {
93         auto element = new Element("book");
94         element.tag.attr["id"] = book.id;
95
96         element ~= new Element("author",      book.author);
97         element ~= new Element("title",    book.title);
98         element ~= new Element("genre",    book.genre);
99         element ~= new Element("price",    book.price);
100         element ~= new Element("publish-date",book.pubDate);
101         element ~= new Element("description", book.description);
102
103         doc ~= element;
104     }
105
106     // Pretty-print it
107     writefln(join(doc.pretty(3),"\n"));
108 }
109 -------------------------------------------------------------------------------
110 Macros:
111     WIKI=Phobos/StdXml
112
113 Copyright: Copyright Janice Caron 2008 - 2009.
114 License:   <a href="http://www.boost.org/LICENSE_1_0.txt">Boost License 1.0</a>.
115 Authors:   Janice Caron
116
117          Copyright Janice Caron 2008 - 2009.
118 Distributed under the Boost Software License, Version 1.0.
119    (See accompanying file LICENSE_1_0.txt or copy at
120          http://www.boost.org/LICENSE_1_0.txt)
121 */
122 module orange.xml.PhobosXML;
123
124 version (Tango) {}
125 else
126     version = Phobos;
127
128 version (Phobos):
129
130 mixin(`import std.array;
131 import std.string;
132 import std.encoding;
133
134 enum cdata = "<![CDATA[";
135
/**
 * A single XML attribute presented through the Element interface.
 *
 * The attribute name is also passed to the Element constructor as the tag
 * name; the value is kept separately and is not added as a text item.
 */
final class Attribute : Element
{
    private alias string tstring;
    private tstring name_;
    private tstring value_;

    /**
     * Constructs an Attribute.
     *
     * Params:
     *    name = the attribute's name
     *    value = the attribute's value
     *    parent = the element this attribute belongs to
     */
    this (tstring name, tstring value, Element parent)
    {
        super(name);
        name_ = name;
        value_ = value;
        parent_ = parent;
    }

    /// Returns the attribute's name (replaces Element.name).
    override tstring name ()
    {
        return name_;
    }

    /// Returns the attribute's value (replaces Element.value).
    override tstring value ()
    {
        return value_;
    }
}
160
161 /*struct TagProxy
162 {
163     private alias string tstring;
164     private tstring name_;
165
166     private static TagProxy opCall (tstring name)
167     {
168         TagProxy tp;
169         tp.name_ = name;
170
171         return tp;
172     }
173
174     tstring name ()
175     {
176         return name_;
177     }
178 }*/
179
180 /**
181  * Returns true if the character is a character according to the XML standard
182  *
183  * Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
184  *
185  * Params:
186  *  c = the character to be tested
187  */
bool isChar(dchar c) // rule 2
{
    // The only characters below U+0020 that are accepted.
    if (c == 0x9 || c == 0xA || c == 0xD)
        return true;

    // Every other character below U+0020 is rejected.
    if (c < 0x20)
        return false;

    // U+0020 .. U+D7FF is accepted in full.
    if (c <= 0xD7FF)
        return true;

    // U+D800 .. U+DFFF and anything above U+10FFFF are rejected.
    if (c < 0xE000 || c > 0x10FFFF)
        return false;

    // Within U+E000 .. U+10FFFF only U+FFFE and U+FFFF are rejected.
    return c != 0xFFFE && c != 0xFFFF;
}
211
unittest
{
//  const CharTable=[0x9,0x9,0xA,0xA,0xD,0xD,0x20,0xD7FF,0xE000,0xFFFD,
//      0x10000,0x10FFFF];

    // Code points that isChar must accept.
    uint[] accepted = [0x9, 0xA, 0xD, 0x20, 0xD7FF, 0xE000, 0xFFFD,
        0x10000, 0x10FFFF];
    // Code points that isChar must reject.
    uint[] rejected = [0x8, 0xB, 0xC, 0xE, 0x1F, 0xD800, 0xDFFF, 0xFFFE,
        0xFFFF, 0x110000];

    foreach (code; accepted)
        assert( isChar(cast(dchar)code));
    foreach (code; rejected)
        assert(!isChar(cast(dchar)code));

    // An ordinary letter, passed as a character literal.
    assert( isChar('J'));

    debug (stdxml_TestHardcodedChecks)
    {
        foreach (c; 0 .. dchar.max + 1)
            assert(isChar(c) == lookup(CharTable, c));
    }
}
243
244 /**
245  * Returns true if the character is whitespace according to the XML standard
246  *
247  * Only the following characters are considered whitespace in XML - space, tab,
248  * carriage return and linefeed
249  *
250  * Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
251  *
252  * Params:
253  *  c = the character to be tested
254  */
bool isSpace(dchar c)
{
    switch (c)
    {
    case '\u0020': // space
    case '\u0009': // tab
    case '\u000A': // linefeed
    case '\u000D': // carriage return
        return true;
    default:
        return false;
    }
}
259
260 /**
261  * Returns true if the character is a digit according to the XML standard
262  *
263  * Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
264  *
265  * Params:
266  *  c = the character to be tested
267  */
bool isDigit(dchar c)
{
    // U+0030 .. U+0039 is answered directly; every other character is
    // looked up in DigitTable.
    const inAsciiRange = (c >= 0x0030) && (c <= 0x0039);
    return inAsciiRange || lookup(DigitTable,c);
}
275
unittest
{
    // Exhaustive cross-check of the shortcut in isDigit against the table;
    // only compiled in when the debug identifier is set.
    debug (stdxml_TestHardcodedChecks)
    {
        foreach (c; 0 .. dchar.max + 1)
        {
            assert(isDigit(c) == lookup(DigitTable, c));
        }
    }
}
284
285 /**
286  * Returns true if the character is a letter according to the XML standard
287  *
288  * Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
289  *
290  * Params:
291  *  c = the character to be tested
292  */
bool isLetter(dchar c) // rule 84
{
    // A letter is either an ideographic character or a base character;
    // the ideographic test runs first.
    if (isIdeographic(c))
        return true;
    return isBaseChar(c);
}
297
298 /**
299  * Returns true if the character is an ideographic character according to the
300  * XML standard
301  *
302  * Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
303  *
304  * Params:
305  *  c = the character to be tested
306  */
bool isIdeographic(dchar c)
{
    // One isolated code point plus two inclusive ranges.
    const isSingle    = (c == 0x3007);
    const inLowRange  = (0x3021 <= c) && (c <= 0x3029);
    const inHighRange = (0x4E00 <= c) && (c <= 0x9FA5);

    return isSingle || inLowRange || inHighRange;
}
317
unittest
{
    // Both ends of each accepted range, plus the isolated code point.
    foreach (code; [0x4E00, 0x9FA5, 0x3007, 0x3021, 0x3029])
        assert(isIdeographic(cast(dchar)code));

    debug (stdxml_TestHardcodedChecks)
    {
        foreach (c; 0 .. dchar.max + 1)
            assert(isIdeographic(c) == lookup(IdeographicTable, c));
    }
}
332
333 /**
334  * Returns true if the character is a base character according to the XML
335  * standard
336  *
337  * Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
338  *
339  * Params:
340  *  c = the character to be tested
341  */
bool isBaseChar(dchar c)
{
    // Decided entirely by a lookup of c in BaseCharTable.
    bool found = lookup(BaseCharTable,c);
    return found;
}
346
347 /**
348  * Returns true if the character is a combining character according to the
349  * XML standard
350  *
351  * Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
352  *
353  * Params:
354  *  c = the character to be tested
355  */
bool isCombiningChar(dchar c)
{
    // Decided entirely by a lookup of c in CombiningCharTable.
    bool found = lookup(CombiningCharTable,c);
    return found;
}
360
361 /**
362  * Returns true if the character is an extender according to the XML standard
363  *
364  * Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
365  *
366  * Params:
367  *  c = the character to be tested
368  */
bool isExtender(dchar c)
{
    // Decided entirely by a lookup of c in ExtenderTable.
    bool found = lookup(ExtenderTable,c);
    return found;
}
373
374 /**
375  * Encodes a string by replacing all characters which need to be escaped with
376  * appropriate predefined XML entities.
377  *
378  * encode() escapes certain characters (ampersand, quote, apostrophe, less-than
379  * and greater-than), and similarly, decode() unescapes them. These functions
380  * are provided for convenience only. You do not need to use them when using
381  * the std.xml classes, because then all the encoding and decoding will be done
382  * for you automatically.
383  *
384  * If the string is not modified, the original will be returned.
385  *
386  * Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
387  *
388  * Params:
389  *    s = The string to be encoded
390  *
391  * Returns: The encoded string
392  *
393  * Examples:
394  * --------------
395  * writefln(encode("a > b")); // writes "a &gt; b"
396  * --------------
397  */
S encode(S)(S s, S buffer = null)
{
    string entity;       // replacement text for the current character
    size_t copiedUpTo;   // index in s of the first character not yet copied

    // A caller-supplied buffer is emptied and then used as the output sink.
    if (buffer) buffer.length = 0;
    auto sink = appender(&buffer);

    foreach (pos, ch; s)
    {
        if      (ch == '&')  entity = "&amp;";
        else if (ch == '"')  entity = "&quot;";
        else if (ch == '\'') entity = "&apos;";
        else if (ch == '<')  entity = "&lt;";
        else if (ch == '>')  entity = "&gt;";
        else continue; // ordinary character: copied later as part of a run

        // Flush the untouched run before this character, then the entity.
        sink.put(s[copiedUpTo .. pos]);
        sink.put(entity);
        copiedUpTo = pos + 1;
    }

    // Nothing was written, so hand back the input itself.
    if (!sink.data) return s;

    sink.put(s[copiedUpTo .. $]);
    return sink.data;
}
426
unittest
{
    // Checks one escaping case; the actual output is the failure message.
    void expect(string plain, string escaped)
    {
        assert(encode(plain) == escaped, encode(plain));
    }

    // A string with nothing to escape must come back as the same slice.
    assert(encode("hello") is "hello");

    expect("a > b",     "a &gt; b");
    expect("a < b",     "a &lt; b");
    expect("don't",     "don&apos;t");
    expect("\"hi\"",    "&quot;hi&quot;");
    expect("cat & dog", "cat &amp; dog");
}
436
437 /**
438  * Mode to use for decoding.
439  *
440  * $(DDOC_ENUM_MEMBERS NONE) Do not decode
441  * $(DDOC_ENUM_MEMBERS LOOSE) Decode, but ignore errors
442  * $(DDOC_ENUM_MEMBERS STRICT) Decode, and throw exception on error
443  */
enum DecodeMode
{
    NONE,   // do not decode
    LOOSE,  // decode, but ignore errors
    STRICT  // decode, and throw an exception on error
}
448
449 /**
450  * Decodes a string by unescaping all predefined XML entities.
451  *
452  * encode() escapes certain characters (ampersand, quote, apostrophe, less-than
453  * and greater-than), and similarly, decode() unescapes them. These functions
454  * are provided for convenience only. You do not need to use them when using
455  * the std.xml classes, because then all the encoding and decoding will be done
456  * for you automatically.
457  *
458  * This function decodes the entities &amp;amp;, &amp;quot;, &amp;apos;,
459  * &amp;lt; and &amp;gt;,
460  * as well as decimal and hexadecimal entities such as &amp;#x20AC;
461  *
462  * If the string does not contain an ampersand, the original will be returned.
463  *
464  * Note that the "mode" parameter can be one of DecodeMode.NONE (do not
465  * decode), DecodeMode.LOOSE (decode, but ignore errors), or DecodeMode.STRICT
466  * (decode, and throw a DecodeException in the event of an error).
467  *
468  * Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
469  *
470  * Params:
471  *    s = The string to be decoded
472  *    mode = (optional) Mode to use for decoding. (Defaults to LOOSE).
473  *
474  * Throws: DecodeException if mode == DecodeMode.STRICT and decode fails
475  *
476  * Returns: The decoded string
477  *
478  * Examples:
479  * --------------
480  * writefln(decode("a &gt; b")); // writes "a > b"
481  * --------------
482  */
string decode(string s, DecodeMode mode=DecodeMode.LOOSE)
{
    if (mode == DecodeMode.NONE) return s;

    // The output buffer is created lazily, on the first ampersand, so that
    // input without any ampersand is returned as-is without copying.
    char[] buffer;

    // The index is size_t, not int: it is compared with s.length and
    // assigned from a length difference below, both of which are size_t.
    for (size_t i = 0; i < s.length; ++i)
    {
        char c = s[i];
        if (c != '&')
        {
            // Only copy once a buffer exists; before that, the characters
            // are still covered by the original string.
            if (buffer.length != 0) buffer ~= c;
        }
        else
        {
            if (buffer.length == 0)
            {
                // First ampersand: start the buffer with everything so far.
                buffer = s[0 .. i].dup;
            }
            if (startsWith(s[i..$],"&#"))
            {
                // Numeric character reference.
                try
                {
                    dchar d;
                    string t = s[i..$];
                    checkCharRef(t, d);
                    char[4] temp;
                    buffer ~= temp[0 .. std.utf.encode(temp, d)];
                    // t now holds what follows the reference; step to the
                    // last character consumed (the loop's ++i moves past it).
                    i = s.length - t.length - 1;
                }
                catch(Err e)
                {
                    if (mode == DecodeMode.STRICT)
                        throw new DecodeException("Unescaped &");
                    // LOOSE: keep the ampersand literally and carry on.
                    buffer ~= '&';
                }
            }
            // Predefined entities: skip all but the last character of the
            // entity here; the loop's ++i steps over the last one.
            else if (startsWith(s[i..$],"&amp;" )) { buffer ~= '&';  i += 4; }
            else if (startsWith(s[i..$],"&quot;")) { buffer ~= '"';  i += 5; }
            else if (startsWith(s[i..$],"&apos;")) { buffer ~= '\''; i += 5; }
            else if (startsWith(s[i..$],"&lt;"  )) { buffer ~= '<';  i += 3; }
            else if (startsWith(s[i..$],"&gt;"  )) { buffer ~= '>';  i += 3; }
            else
            {
                if (mode == DecodeMode.STRICT)
                    throw new DecodeException("Unescaped &");
                // LOOSE: keep the ampersand literally and carry on.
                buffer ~= '&';
            }
        }
    }
    return (buffer.length == 0) ? s : cast(string)buffer;
}
534
unittest
{
    // True when strict decoding of s throws a DecodeException.
    bool rejected(string s)
    {
        try { decode(s,DecodeMode.STRICT); }
        catch (DecodeException e) { return true; }
        return false;
    }

    // Input without an ampersand must come back as the same slice.
    assert(decode("hello", DecodeMode.STRICT) is "hello");

    // Well-formed entities decode in strict mode: [input, expected].
    string[2][] wellFormed = [
        ["a &gt; b",        "a > b"],
        ["a &lt; b",        "a < b"],
        ["don&apos;t",      "don't"],
        ["&quot;hi&quot;",  "\"hi\""],
        ["cat &amp; dog",   "cat & dog"],
        ["&#42;",           "*"],
        ["&#x2A;",          "*"]
    ];
    foreach (pair; wellFormed)
        assert(decode(pair[0], DecodeMode.STRICT) == pair[1]);

    // Malformed input passes through unchanged in loose mode and is
    // rejected in strict mode.
    string[] malformed = [
        "cat & dog", "a &gt b", "&#;", "&#x;", "&#2G;", "&#x2G;"
    ];
    foreach (text; malformed)
        assert(decode(text, DecodeMode.LOOSE) == text);
    foreach (text; malformed)
        assert(rejected(text), text);
}
569
570 /**
571  * Class representing an XML document.
572  *
573  * Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
574  *
575  */
class Document : Element
{
    /**
     * Contains all text which occurs before the root element.
     * Defaults to &lt;?xml version="1.0"?&gt;
     */
    string prolog = "<?xml version=\"1.0\"?>";

    /**
     * Contains all text which occurs after the root element.
     * Defaults to the empty string
     */
    string epilog;

    /**
     * Builds a complete DOM (Document Object Model) tree from XML text.
     *
     * The input to this function MUST be valid XML.
     * This is enforced by DocumentParser's in contract.
     *
     * Params:
     *    s = the complete XML text; must not be empty.
     */
    this(string s)
    in
    {
        assert(s.length != 0);
    }
    body
    {
        auto parser = new DocumentParser(s);
        string rootTagText = parser.tag.tagString;

        this(parser.tag);

        // The pointer difference gives the offset of the root tag within s
        // (this relies on tagString being a slice of s); everything before
        // that offset is the prolog.
        prolog = s[0 .. rootTagText.ptr - s.ptr];

        parse(parser);

        // Whatever the parser's string holds after parsing is the epilog.
        epilog = *parser.s;
    }

    /**
     * Builds an otherwise empty Document from its root tag.
     *
     * Params:
     *    tag = the start tag of the document.
     */
    this(const(Tag) tag)
    {
        super(tag);
    }

    const
    {
        /**
         * Tests two Documents for equality: prolog, element content and
         * epilog must all match.
         *
         * Examples:
         * --------------
         * Document d1,d2;
         * if (d1 == d2) { }
         * --------------
         */
        override bool opEquals(Object o)
        {
            const doc = toType!(const Document)(o);

            if (prolog != doc.prolog)
                return false;
            if (super != cast(const Element)doc)
                return false;
            if (epilog != doc.epilog)
                return false;
            return true;
        }

        /**
         * Orders two Documents by prolog, then element content, then
         * epilog.
         *
         * You should rarely need to call this function. It exists so that
         * Documents can be used as associative array keys.
         *
         * Examples:
         * --------------
         * Document d1,d2;
         * if (d1 < d2) { }
         * --------------
         */
        override int opCmp(Object o)
        {
            const doc = toType!(const Document)(o);

            // The first part that differs decides the ordering.
            if (prolog != doc.prolog)
                return prolog < doc.prolog ? -1 : 1;
            if (super != cast(const Element)doc)
                return super < cast(const Element)doc ? -1 : 1;
            if (epilog != doc.epilog)
                return epilog < doc.epilog ? -1 : 1;
            return 0;
        }

        /**
         * Returns the hash of a Document
         *
         * You should rarely need to call this function. It exists so that
         * Documents can be used as associative array keys.
         */
        override hash_t toHash()
        {
            return hash(prolog,hash(epilog,super.toHash));
        }

        /**
         * Returns the complete XML of the document: prolog, then the root
         * element, then epilog.
         */
        override string toString()
        {
            return prolog ~ super.toString ~ epilog;
        }
    }
}
694
695 /**
696  * Class representing an XML element.
697  *
698  * Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
699  */
class Element : Item
{
    Tag tag; /// The start tag of the element
    Item[] items; /// The element's items
    Text[] texts; /// The element's text items
    CData[] cdatas; /// The element's CData items
    Comment[] comments; /// The element's comments
    ProcessingInstruction[] pis; /// The element's processing instructions
    Element[] elements; /// The element's child elements
    Element parent_; /// The parent element, when one has been assigned

    /**
     * Constructs an Element given a name and a string to be used as a Text
     * interior.
     *
     * Params:
     *    name = the name of the element.
     *    interior = (optional) the string interior.
     *
     * Examples:
     * -------------------------------------------------------
     * auto element = new Element("title","Serenity")
     *   // constructs the element <title>Serenity</title>
     * -------------------------------------------------------
     */
    this(string name, string interior=null)
    {
        this(new Tag(name));
        if (interior.length != 0) opCatAssign(new Text(interior));
    }

    /**
     * Constructs an Element from a Tag.
     *
     * The tag is copied (name, attributes and tag string). The copy starts
     * out as an EMPTY tag and is switched to START by appendItem once a
     * non-empty item is appended.
     *
     * Params:
     *    tag = the start or empty tag of the element.
     */
    this(const(Tag) tag_)
    {
        this.tag = new Tag(tag_.name);
        tag.type = TagType.EMPTY;
        foreach(k,v;tag_.attr) tag.attr[k] = v;
        tag.tagString = tag_.tagString;
    }

    /// Returns the parent element previously stored in this element.
    Element parent ()
    {
        return parent_;
    }

    /// Stores the given parent element and returns it.
    Element parent (Element parent)
    {
        return parent_ = parent;
    }

    /// Returns the name of this element's tag.
    string name ()
    {
        return tag.name;
    }

    /// Returns the decoded text of this element (see text).
    string value ()
    {
        return text;
    }

    alias elements children;

    /**
     * Returns a newly built array holding one Attribute object for every
     * entry in the tag's attribute table, each with this element as parent.
     */
    Attribute[] attributes ()
    {
        auto attrs = new Attribute[tag.attr.length];
        // Restart from an empty slice so that the appends below fill the
        // result from index 0.
        attrs = attrs[0 .. 0];

        foreach (k, v ; tag.attr)
            attrs ~= new Attribute(k, v, this);

        return attrs;
    }

    /// Returns this element itself.
    Element query ()
    {
        return this;
    }

    /**
     * Sets an attribute on this element's tag.
     *
     * Params:
     *    prefix = accepted but not used by this implementation.
     *    name = the attribute name.
     *    value = (optional) the attribute value.
     *
     * Returns: this element, so that calls can be chained.
     */
    Element attribute (string prefix, string name, string value = null)
    {
        tag.attr[name] = value;

        return this;
    }

    /**
     * Append a text item to the interior of this element
     *
     * Params:
     *    item = the item you wish to append.
     *
     * Examples:
     * --------------
     * Element element;
     * element ~= new Text("hello");
     * --------------
     */
    void opCatAssign(Text item)
    {
        texts ~= item;
        appendItem(item);
    }

    /**
     * Append a CData item to the interior of this element
     *
     * Params:
     *    item = the item you wish to append.
     *
     * Examples:
     * --------------
     * Element element;
     * element ~= new CData("hello");
     * --------------
     */
    void opCatAssign(CData item)
    {
        cdatas ~= item;
        appendItem(item);
    }

    /**
     * Append a comment to the interior of this element
     *
     * Params:
     *    item = the item you wish to append.
     *
     * Examples:
     * --------------
     * Element element;
     * element ~= new Comment("hello");
     * --------------
     */
    void opCatAssign(Comment item)
    {
        comments ~= item;
        appendItem(item);
    }

    /**
     * Append a processing instruction to the interior of this element
     *
     * Params:
     *    item = the item you wish to append.
     *
     * Examples:
     * --------------
     * Element element;
     * element ~= new ProcessingInstruction("hello");
     * --------------
     */
    void opCatAssign(ProcessingInstruction item)
    {
        pis ~= item;
        appendItem(item);
    }

    /**
     * Append a complete element to the interior of this element
     *
     * Params:
     *    item = the item you wish to append.
     *
     * Examples:
     * --------------
     * Element element;
     * Element other = new Element("br");
     * element ~= other;
     *  // appends element representing <br />
     * --------------
     */
    void opCatAssign(Element item)
    {
        elements ~= item;
        appendItem(item);
    }

    // Adds the item to the ordered list of all items and turns an EMPTY
    // tag into a START tag as soon as a non-empty item is appended.
    private void appendItem(Item item)
    {
        items ~= item;
        if (tag.type == TagType.EMPTY && !item.isEmptyXML)
            tag.type = TagType.START;
    }

    // Installs handlers on the parser that append every text, CData,
    // comment, processing instruction and (recursively parsed) child
    // element to this element, then runs the parser.
    private void parse(ElementParser xml)
    {
        xml.onText = (string s) { opCatAssign(new Text(s)); };
        xml.onCData = (string s) { opCatAssign(new CData(s)); };
        xml.onComment = (string s) { opCatAssign(new Comment(s)); };
        xml.onPI = (string s) { opCatAssign(new ProcessingInstruction(s)); };

        xml.onStartTag[null] = (ElementParser xml)
        {
            auto e = new Element(xml.tag);
            e.parse(xml);
            opCatAssign(e);
        };

        xml.parse();
    }

    /**
     * Compares two Elements for equality
     *
     * Two elements are equal when they have the same number of items and
     * every pair of corresponding items is equal.
     *
     * Examples:
     * --------------
     * Element e1,e2;
     * if (e1 == e2) { }
     * --------------
     */
    override bool opEquals(Object o)
    {
        const element = toType!(const Element)(o);
        // size_t, not uint: array lengths are size_t.
        size_t len = items.length;
        if (len != element.items.length) return false;
        for (size_t i=0; i<len; ++i)
        {
            if (!items[i].opEquals(element.items[i])) return false;
        }
        return true;
    }

    /**
     * Compares two Elements
     *
     * Items are compared pairwise; the first unequal pair decides. If one
     * item list is a prefix of the other, the shorter one orders first.
     *
     * You should rarely need to call this function. It exists so that Elements
     * can be used as associative array keys.
     *
     * Examples:
     * --------------
     * Element e1,e2;
     * if (e1 < e2) { }
     * --------------
     */
    override int opCmp(Object o)
    {
        const element = toType!(const Element)(o);
        // size_t, not uint: the index is compared with array lengths.
        for (size_t i=0; ; ++i)
        {
            if (i == items.length && i == element.items.length) return 0;
            if (i == items.length) return -1;
            if (i == element.items.length) return 1;
            if (items[i] != element.items[i])
                return items[i].opCmp(element.items[i]);
        }
    }

    /**
     * Returns the hash of an Element
     *
     * The hash is the tag's hash plus the sum of the hashes of all items.
     *
     * You should rarely need to call this function. It exists so that Elements
     * can be used as associative array keys.
     */
    override hash_t toHash()
    {
        hash_t hash = tag.toHash;
        foreach(item;items) hash += item.toHash();
        return hash;
    }

    const
    {
        /**
         * Returns the decoded interior of an element.
         *
         * The element is assumed to contain text <i>only</i>. So, for
         * example, given XML such as "&lt;title&gt;Good &amp;amp;
         * Bad&lt;/title&gt;", will return "Good &amp; Bad".
         *
         * Params:
         *    mode = (optional) Mode to use for decoding. (Defaults to LOOSE).
         *
         * Throws: DecodeException if decode fails, or if the element
         *    contains an item that is not a Text
         */
        string text(DecodeMode mode=DecodeMode.LOOSE)
        {
            string buffer;
            foreach(item;items)
            {
                Text t = cast(Text)item;
                if (t is null) throw new DecodeException(item.toString);
                buffer ~= decode(t.toString,mode);
            }
            return buffer;
        }

        /**
         * Returns an indented string representation of this item
         *
         * Params:
         *    indent = (optional) number of spaces by which to indent this
         *        element. Defaults to 2.
         */
        override string[] pretty(uint indent=2)
        {

            if (isEmptyXML || tag.isEmpty) return [ tag.toEmptyString ];

            // An element holding a single text item is kept on one line.
            if (items.length == 1)
            {
                Text t = cast(Text)(items[0]);
                if (t !is null)
                {
                    return [tag.toStartString ~ t.toString ~ tag.toEndString];
                }
            }

            string[] a = [ tag.toStartString ];
            foreach(item;items)
            {
                string[] b = item.pretty(indent);
                foreach(s;b)
                {
                    // Widening the field by indent pads the line on the
                    // left with that many spaces.
                    a ~= rightJustify(s,s.length + indent);
                }
            }
            a ~= tag.toEndString;
            return a;
        }

        /**
         * Returns the string representation of an Element
         *
         * Examples:
         * --------------
         * auto element = new Element("br");
         * writefln(element.toString); // writes "<br />"
         * --------------
         */
        override string toString()
        {
            if (isEmptyXML || tag.isEmpty) return tag.toEmptyString;

            string buffer = tag.toStartString;
            foreach(item;items) { buffer ~= item.toString; }
            buffer ~= tag.toEndString;
            return buffer;
        }

        override bool isEmptyXML() { return false; } /// Returns false always
    }
}
1047
/**
 * Tag types.
 *
 * $(DDOC_ENUM_MEMBERS START) Used for start tags
 * $(DDOC_ENUM_MEMBERS END) Used for end tags
 * $(DDOC_ENUM_MEMBERS EMPTY) Used for empty tags
 *
 */
enum TagType
{
    START,
    END,
    EMPTY
}
1057
/**
 * Class representing an XML tag.
 *
 * Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
 *
 * The class invariant guarantees
 * <ul>
 * <li> that $(B type) is a valid enum TagType value</li>
 * <li> that $(B name) consists of valid characters</li>
 * <li> that each attribute name consists of valid characters</li>
 * </ul>
 */
class Tag
{
    TagType type = TagType.START;   /// Type of tag
    string name;                    /// Tag name
    string[string] attr;            /// Associative array of attributes
    // Source text of the tag; only set by the private parsing constructor.
    private string tagString;

    invariant()
    {
        string s;
        string t;

        assert(type == TagType.START
            || type == TagType.END
            || type == TagType.EMPTY);

        // checkName takes its input by reference, so work on a copy.
        s = name;
        try { checkName(s,t); }
        catch(Err e) { assert(false,"Invalid tag name:" ~ e.toString); }

        foreach(k,v;attr)
        {
            s = k;
            try { checkName(s,t); }
            catch(Err e)
                { assert(false,"Invalid atrribute name:" ~ e.toString); }
        }
    }

    /**
     * Constructs an instance of Tag with a specified name and type
     *
     * The constructor does not initialize the attributes. To initialize the
     * attributes, you access the $(B attr) member variable.
     *
     * Params:
     *    name = the Tag's name
     *    type = (optional) the Tag's type. If omitted, defaults to
     *        TagType.START.
     *
     * Examples:
     * --------------
     * auto tag = new Tag("img",TagType.EMPTY);
     * tag.attr["src"] = "http://example.com/example.jpg";
     * --------------
     */
    this(string name, TagType type=TagType.START)
    {
        this.name = name;
        this.type = type;
    }

    /* Private constructor (so don't ddoc this!)
     *
     * Constructs a Tag by parsing the string representation, e.g. "<html>".
     *
     * The string is passed by reference, and is advanced over all characters
     * consumed.
     *
     * The second parameter is a dummy parameter only, required solely to
     * distinguish this constructor from the public one.
     *
     * Any XMLException raised while parsing is replaced by a TagException
     * carrying the text consumed so far.
     */
    private this(ref string s, bool dummy)
    {
        // Remember where the tag starts; the slice is trimmed to the
        // consumed portion once parsing stops (successfully or not).
        tagString = s;
        try
        {
            reqc(s,'<');
            if (optc(s,'/')) type = TagType.END;
            name = munch(s,"^/>"~whitespace);
            munch(s,whitespace);
            // Attributes: key="value" pairs up to '>' or '/'.
            while(s.length > 0 && s[0] != '>' && s[0] != '/')
            {
                string key = munch(s,"^="~whitespace);
                munch(s,whitespace);
                reqc(s,'=');
                munch(s,whitespace);
                reqc(s,'"');
                // Values are stored decoded.
                string val = decode(munch(s,"^\""), DecodeMode.LOOSE);
                reqc(s,'"');
                munch(s,whitespace);
                attr[key] = val;
            }
            if (optc(s,'/'))
            {
                // A tag cannot be both an end tag and an empty tag.
                if (type == TagType.END) throw new TagException("");
                type = TagType.EMPTY;
            }
            reqc(s,'>');
            tagString.length = (s.ptr - tagString.ptr);
        }
        catch(XMLException e)
        {
            tagString.length = (s.ptr - tagString.ptr);
            throw new TagException(tagString);
        }
    }

    const
    {
        /**
         * Compares two Tags for equality
         *
         * Two Tags are equal when their name, attributes and type are all
         * equal.
         *
         * You should rarely need to call this function. It exists so that Tags
         * can be used as associative array keys.
         *
         * Examples:
         * --------------
         * Tag tag1,tag2
         * if (tag1 == tag2) { }
         * --------------
         */
        override bool opEquals(Object o)
        {
            const tag = toType!(const Tag)(o);
            return
                (name != tag.name) ? false : (
                (attr != tag.attr) ? false : (
                (type != tag.type) ? false : (
            true )));
        }

        /**
         * Compares two Tags
         *
         * Tags are ordered by name first, then by attributes, then by type.
         *
         * Examples:
         * --------------
         * Tag tag1,tag2
         * if (tag1 < tag2) { }
         * --------------
         */
        override int opCmp(Object o)
        {
            const tag = toType!(const Tag)(o);
            return
                ((name != tag.name) ? ( name < tag.name ? -1 : 1 ) :
                ((attr != tag.attr) ? ( attr < tag.attr ? -1 : 1 ) :
                ((type != tag.type) ? ( type < tag.type ? -1 : 1 ) :
            0 )));
        }

        /**
         * Returns the hash of a Tag
         *
         * Only the tag name contributes to the hash.
         *
         * You should rarely need to call this function. It exists so that Tags
         * can be used as associative array keys.
         */
        override hash_t toHash()
        {
            hash_t hash = 0;
            foreach(dchar c;name) hash = hash * 11 + c;
            return hash;
        }

        /**
         * Returns the string representation of a Tag
         *
         * Examples:
         * --------------
         * auto tag = new Tag("book",TagType.START);
         * writefln(tag.toString); // writes "<book>"
         * --------------
         */
        override string toString()
        {
            if (isEmpty) return toEmptyString();
            return (isEnd) ? toEndString() : toStartString();
        }

        private
        {
            // Renders "<name attr="value" ..." without the closing bracket.
            string toNonEndString()
            {
                string s = "<" ~ name;
                // Attribute values are held decoded (the parsing constructor
                // decodes them), so they have to be encoded again when the
                // tag is written out. Decoding here a second time would let
                // markup characters in a value leak into the output
                // unescaped.
                foreach(key,val;attr)
                    s ~= format(" %s=\"%s\"",key,encode(val));
                return s;
            }

            string toStartString() { return toNonEndString() ~ ">"; }

            string toEndString() { return "</" ~ name ~ ">"; }

            string toEmptyString() { return toNonEndString() ~ "/>"; }
        }

        /**
         * Returns true if the Tag is a start tag
         *
         * Examples:
         * --------------
         * if (tag.isStart) { }
         * --------------
         */
        bool isStart() { return type == TagType.START; }

        /**
         * Returns true if the Tag is an end tag
         *
         * Examples:
         * --------------
         * if (tag.isEnd) { }
         * --------------
         */
        bool isEnd()   { return type == TagType.END;   }

        /**
         * Returns true if the Tag is an empty tag
         *
         * Examples:
         * --------------
         * if (tag.isEmpty) { }
         * --------------
         */
        bool isEmpty() { return type == TagType.EMPTY; }
    }
}
1287
/**
 * Class representing a comment
 */
class Comment : Item
{
    private string content;

    /**
     * Construct a comment
     *
     * Params:
     *    content = the body of the comment
     *
     * Throws: CommentException if the comment body is illegal (contains "--"
     * or exactly equals "-")
     *
     * Examples:
     * --------------
     * auto item = new Comment("This is a comment");
     *  // constructs <!--This is a comment-->
     * --------------
     */
    this(string content)
    {
        // XML forbids "--" inside a comment body; a body of "-" would
        // produce "<!--" followed by "--->", which contains "--" as well.
        if (content == "-" || content.indexOf("--") != -1)
            throw new CommentException(content);
        this.content = content;
    }

    /**
     * Compares two comments for equality
     *
     * Returns: true if o is a Comment with the same body
     *
     * Examples:
     * --------------
     * Comment item1,item2;
     * if (item1 == item2) { }
     * --------------
     */
    override bool opEquals(Object o)
    {
        const item = toType!(const Item)(o);
        const t = cast(Comment)item;
        return t !is null && content == t.content;
    }

    /**
     * Compares two comments
     *
     * You should rarely need to call this function. It exists so that Comments
     * can be used as associative array keys.
     *
     * Returns: -1, 0 or 1 according to the ordering of the two bodies.
     * 0 is also returned when o is not a Comment.
     *
     * Examples:
     * --------------
     * Comment item1,item2;
     * if (item1 < item2) { }
     * --------------
     */
    override int opCmp(Object o)
    {
        const item = toType!(const Item)(o);
        const t = cast(Comment)item;
        // Written as statements on purpose: joining the null check and the
        // ordering with && yields a bool, which can only become 0 or 1, so
        // "less than" (-1) could never be reported.
        if (t is null) return 0;
        if (content == t.content) return 0;
        return content < t.content ? -1 : 1;
    }

    /**
     * Returns the hash of a Comment
     *
     * You should rarely need to call this function. It exists so that Comments
     * can be used as associative array keys.
     */
    override hash_t toHash() { return hash(content); }

    /**
     * Returns a string representation of this comment
     */
    override const string toString() { return "<!--" ~ content ~ "-->"; }

    override const bool isEmptyXML() { return false; } /// Returns false always
}
1368
/**
 * Class representing a Character Data section
 */
class CData : Item
{
    private string content;

    /**
     * Construct a character data section
     *
     * Params:
     *    content = the body of the character data segment
     *
     * Throws: CDataException if the segment body is illegal (contains "]]>")
     *
     * Examples:
     * --------------
     * auto item = new CData("<b>hello</b>");
     *  // constructs <![CDATA[<b>hello</b>]]>
     * --------------
     */
    this(string content)
    {
        if (content.indexOf("]]>") != -1) throw new CDataException(content);
        this.content = content;
    }

    /**
     * Compares two CDatas for equality
     *
     * Returns: true if o is a CData with the same body
     *
     * Examples:
     * --------------
     * CData item1,item2;
     * if (item1 == item2) { }
     * --------------
     */
    override bool opEquals(Object o)
    {
        const item = toType!(const Item)(o);
        const t = cast(CData)item;
        return t !is null && content == t.content;
    }

    /**
     * Compares two CDatas
     *
     * You should rarely need to call this function. It exists so that CDatas
     * can be used as associative array keys.
     *
     * Returns: -1, 0 or 1 according to the ordering of the two bodies.
     * 0 is also returned when o is not a CData.
     *
     * Examples:
     * --------------
     * CData item1,item2;
     * if (item1 < item2) { }
     * --------------
     */
    override int opCmp(Object o)
    {
        const item = toType!(const Item)(o);
        const t = cast(CData)item;
        // Written as statements on purpose: joining the null check and the
        // ordering with && yields a bool, which can only become 0 or 1, so
        // "less than" (-1) could never be reported.
        if (t is null) return 0;
        if (content == t.content) return 0;
        return content < t.content ? -1 : 1;
    }

    /**
     * Returns the hash of a CData
     *
     * You should rarely need to call this function. It exists so that CDatas
     * can be used as associative array keys.
     */
    override hash_t toHash() { return hash(content); }

    /**
     * Returns a string representation of this CData section
     */
    override const string toString() { return cdata ~ content ~ "]]>"; }

    override const bool isEmptyXML() { return false; } /// Returns false always
}
1447
/**
 * Class representing a text (aka Parsed Character Data) section
 */
class Text : Item
{
    // Held in encoded form (see the constructor).
    private string content;

    /**
     * Construct a text (aka PCData) section
     *
     * Params:
     *    content = the text. This function encodes the text before
     *    insertion, so it is safe to insert any text
     *
     * Examples:
     * --------------
     * auto item = new Text("a < b");
     *  // constructs a &lt; b
     * --------------
     */
    this(string content)
    {
        this.content = encode(content);
    }

    /**
     * Compares two text sections for equality
     *
     * Returns: true if o is a Text with the same (encoded) content
     *
     * Examples:
     * --------------
     * Text item1,item2;
     * if (item1 == item2) { }
     * --------------
     */
    override bool opEquals(Object o)
    {
        const item = toType!(const Item)(o);
        const t = cast(Text)item;
        return t !is null && content == t.content;
    }

    /**
     * Compares two text sections
     *
     * You should rarely need to call this function. It exists so that Texts
     * can be used as associative array keys.
     *
     * Returns: -1, 0 or 1 according to the ordering of the two contents.
     * 0 is also returned when o is not a Text.
     *
     * Examples:
     * --------------
     * Text item1,item2;
     * if (item1 < item2) { }
     * --------------
     */
    override int opCmp(Object o)
    {
        const item = toType!(const Item)(o);
        const t = cast(Text)item;
        // Written as statements on purpose: joining the null check and the
        // ordering with && yields a bool, which can only become 0 or 1, so
        // "less than" (-1) could never be reported.
        if (t is null) return 0;
        if (content == t.content) return 0;
        return content < t.content ? -1 : 1;
    }

    /**
     * Returns the hash of a text section
     *
     * You should rarely need to call this function. It exists so that Texts
     * can be used as associative array keys.
     */
    override hash_t toHash() { return hash(content); }

    /**
     * Returns a string representation of this Text section
     */
    override const string toString() { return content; }

    /**
     * Returns true if the content is the empty string
     */
    override const bool isEmptyXML() { return content.length == 0; }
}
1527
/**
 * Class representing an XML Instruction section
 */
class XMLInstruction : Item
{
    private string content;

    /**
     * Construct an XML Instruction section
     *
     * Params:
     *    content = the body of the instruction segment
     *
     * Throws: XIException if the segment body is illegal (contains ">")
     *
     * Examples:
     * --------------
     * auto item = new XMLInstruction("ATTLIST");
     *  // constructs <!ATTLIST>
     * --------------
     */
    this(string content)
    {
        if (content.indexOf(">") != -1) throw new XIException(content);
        this.content = content;
    }

    /**
     * Compares two XML instructions for equality
     *
     * Returns: true if o is an XMLInstruction with the same body
     *
     * Examples:
     * --------------
     * XMLInstruction item1,item2;
     * if (item1 == item2) { }
     * --------------
     */
    override bool opEquals(Object o)
    {
        const item = toType!(const Item)(o);
        const t = cast(XMLInstruction)item;
        return t !is null && content == t.content;
    }

    /**
     * Compares two XML instructions
     *
     * You should rarely need to call this function. It exists so that
     * XmlInstructions can be used as associative array keys.
     *
     * Returns: -1, 0 or 1 according to the ordering of the two bodies.
     * 0 is also returned when o is not an XMLInstruction.
     *
     * Examples:
     * --------------
     * XMLInstruction item1,item2;
     * if (item1 < item2) { }
     * --------------
     */
    override int opCmp(Object o)
    {
        const item = toType!(const Item)(o);
        const t = cast(XMLInstruction)item;
        // Written as statements on purpose: joining the null check and the
        // ordering with && yields a bool, which can only become 0 or 1, so
        // "less than" (-1) could never be reported.
        if (t is null) return 0;
        if (content == t.content) return 0;
        return content < t.content ? -1 : 1;
    }

    /**
     * Returns the hash of an XMLInstruction
     *
     * You should rarely need to call this function. It exists so that
     * XmlInstructions can be used as associative array keys.
     */
    override hash_t toHash() { return hash(content); }

    /**
     * Returns a string representation of this XmlInstruction
     */
    override const string toString() { return "<!" ~ content ~ ">"; }

    override const bool isEmptyXML() { return false; } /// Returns false always
}
1606
/**
 * Class representing a Processing Instruction section
 */
class ProcessingInstruction : Item
{
    private string content;

    /**
     * Construct a Processing Instruction section
     *
     * Params:
     *    content = the body of the instruction segment
     *
     * Throws: PIException if the segment body is illegal (contains "?>")
     *
     * Examples:
     * --------------
     * auto item = new ProcessingInstruction("php");
     *  // constructs <?php?>
     * --------------
     */
    this(string content)
    {
        if (content.indexOf("?>") != -1) throw new PIException(content);
        this.content = content;
    }

    /**
     * Compares two processing instructions for equality
     *
     * Returns: true if o is a ProcessingInstruction with the same body
     *
     * Examples:
     * --------------
     * ProcessingInstruction item1,item2;
     * if (item1 == item2) { }
     * --------------
     */
    override bool opEquals(Object o)
    {
        const item = toType!(const Item)(o);
        const t = cast(ProcessingInstruction)item;
        return t !is null && content == t.content;
    }

    /**
     * Compares two processing instructions
     *
     * You should rarely need to call this function. It exists so that
     * ProcessingInstructions can be used as associative array keys.
     *
     * Returns: -1, 0 or 1 according to the ordering of the two bodies.
     * 0 is also returned when o is not a ProcessingInstruction.
     *
     * Examples:
     * --------------
     * ProcessingInstruction item1,item2;
     * if (item1 < item2) { }
     * --------------
     */
    override int opCmp(Object o)
    {
        const item = toType!(const Item)(o);
        const t = cast(ProcessingInstruction)item;
        // Written as statements on purpose: joining the null check and the
        // ordering with && yields a bool, which can only become 0 or 1, so
        // "less than" (-1) could never be reported.
        if (t is null) return 0;
        if (content == t.content) return 0;
        return content < t.content ? -1 : 1;
    }

    /**
     * Returns the hash of a ProcessingInstruction
     *
     * You should rarely need to call this function. It exists so that
     * ProcessingInstructions can be used as associative array keys.
     */
    override hash_t toHash() { return hash(content); }

    /**
     * Returns a string representation of this ProcessingInstruction
     */
    override const string toString() { return "<?" ~ content ~ "?>"; }

    override const bool isEmptyXML() { return false; } /// Returns false always
}
1685
/**
 * Abstract base class for XML items
 */
abstract class Item
{
    /// Compares with another Item of same type for equality
    abstract override bool opEquals(Object o);

    /// Compares with another Item of same type
    abstract override int opCmp(Object o);

    /// Returns the hash of this item
    abstract override hash_t toHash();

    /// Returns a string representation of this item
    abstract override const string toString();

    /**
     * Returns an indented string representation of this item
     *
     * This default implementation strips the item's string form and returns
     * it as a single line, or returns no lines at all when nothing is left
     * after stripping. It does not use $(B indent) itself.
     *
     * Params:
     *    indent = number of spaces by which to indent child elements
     */
    const string[] pretty(uint indent)
    {
        string trimmed = strip(toString());
        if (trimmed.length == 0)
        {
            return [];
        }
        return [ trimmed ];
    }

    /// Returns true if the item represents empty XML text
    abstract const bool isEmptyXML();
}
1718
/**
 * Class for parsing an XML Document.
 *
 * This is a subclass of ElementParser. Most of the useful functions are
 * documented there.
 *
 * Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
 *
 * Bugs:
 *    Currently only supports UTF documents.
 *
 *    If there is an encoding attribute in the prolog, it is ignored.
 *
 */
class DocumentParser : ElementParser
{
    string xmlText;

    /**
     * Constructs a DocumentParser.
     *
     * The input to this function MUST be valid XML.
     * This is enforced by the function's in contract, which runs check()
     * on the text and fails an assertion carrying the CheckException's
     * message when the check does not pass.
     *
     * Params:
     *    xmlText_ = the entire XML document as text
     *
     */
    this(string xmlText_)
    in
    {
        assert(xmlText_.length > 0);
        try
        {
            // Well-formedness check of the whole document
            check(xmlText_);
        }
        catch (CheckException invalid)
        {
            // Surface the reason the document was rejected
            assert(false, "\n" ~ invalid.toString());
        }
    }
    body
    {
        this.xmlText = xmlText_;
        // The base constructor reads through s, so it has to point at the
        // document text before super() runs.
        this.s = &this.xmlText;
        super();    // Initialize everything
        parse();    // Parse through the root tag (but not beyond)
    }
}
1770
1771 /**
1772  * Class for parsing an XML element.
1773  *
1774  * Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
1775  *
1776  * Note that you cannot construct instances of this class directly. You can
1777  * construct a DocumentParser (which is a subclass of ElementParser), but
1778  * otherwise, Instances of ElementParser will be created for you by the
1779  * library, and passed your way via onStartTag handlers.
1780  *
1781  */
1782 class ElementParser
1783 {
1784     alias void delegate(string) Handler;
1785     alias void delegate(in Element element) ElementHandler;
1786     alias void delegate(ElementParser parser) ParserHandler;
1787
1788     private
1789     {
1790         Tag tag_;
1791         string elementStart;
1792         string* s;
1793
1794         Handler commentHandler = null;
1795         Handler cdataHandler = null;
1796         Handler xiHandler = null;
1797         Handler piHandler = null;
1798         Handler rawTextHandler = null;
1799         Handler textHandler = null;
1800
1801         // Private constructor for start tags
1802         this(ElementParser parent)
1803         {
1804             s = parent.s;
1805             this();
1806             tag_ = parent.tag_;
1807         }
1808         
1809         // Private constructor for empty tags
1810         this(Tag tag, string* t)
1811         {
1812             s = t;
1813             this();
1814             tag_ = tag;
1815         }
1816     }
1817
1818     /**
1819      * The Tag at the start of the element being parsed. You can read this to
1820      * determine the tag's name and attributes.
1821      */
1822     const const(Tag) tag() { return tag_; }
1823
1824     /**
1825      * Register a handler which will be called whenever a start tag is
1826      * encountered which matches the specified name. You can also pass null as
1827      * the name, in which case the handler will be called for any unmatched
1828      * start tag.
1829      *
1830      * Examples:
1831      * --------------
1832      * // Call this function whenever a <podcast> start tag is encountered
1833      * onStartTag["podcast"] = (ElementParser xml)
1834      * {
1835      *   // Your code here
1836      *   //
     *   // This is a closure, so code here may reference
1838      *   // variables which are outside of this scope
1839      * };
1840      *
1841      * // call myEpisodeStartHandler (defined elsewhere) whenever an <episode>
1842      * // start tag is encountered
1843      * onStartTag["episode"] = &myEpisodeStartHandler;
1844      *
1845      * // call delegate dg for all other start tags
1846      * onStartTag[null] = dg;
1847      * --------------
1848      *
     * This library will supply your function with a new instance of
     * ElementParser, which may be used to parse inside the element whose
     * start tag was just found, or to identify the tag attributes of the
     * element, etc.
1853      *
1854      * Note that your function will be called for both start tags and empty
1855      * tags. That is, we make no distinction between &lt;br&gt;&lt;/br&gt;
1856      * and &lt;br/&gt;.
1857      */
1858     ParserHandler[string] onStartTag;
1859
1860     /**
1861      * Register a handler which will be called whenever an end tag is
1862      * encountered which matches the specified name. You can also pass null as
1863      * the name, in which case the handler will be called for any unmatched
1864      * end tag.
1865      *
1866      * Examples:
1867      * --------------
1868      * // Call this function whenever a </podcast> end tag is encountered
1869      * onEndTag["podcast"] = (in Element e)
1870      * {
1871      *   // Your code here
1872      *   //
     *   // This is a closure, so code here may reference
1874      *   // variables which are outside of this scope
1875      * };
1876      *
1877      * // call myEpisodeEndHandler (defined elsewhere) whenever an </episode>
1878      * // end tag is encountered
1879      * onEndTag["episode"] = &myEpisodeEndHandler;
1880      *
1881      * // call delegate dg for all other end tags
1882      * onEndTag[null] = dg;
1883      * --------------
1884      *
1885      * Note that your function will be called for both start tags and empty
1886      * tags. That is, we make no distinction between &lt;br&gt;&lt;/br&gt;
1887      * and &lt;br/&gt;.
1888      */
1889     ElementHandler[string] onEndTag;
1890
1891     protected this()
1892     {
1893         elementStart = *s;
1894     }
1895
1896     /**
1897      * Register a handler which will be called whenever text is encountered.
1898      *
1899      * Examples:
1900      * --------------
1901      * // Call this function whenever text is encountered
1902      * onText = (string s)
1903      * {
1904      *   // Your code here
1905      *
1906      *   // The passed parameter s will have been decoded by the time you see
1907      *   // it, and so may contain any character.
1908      *   //
1909      *   // This is a a closure, so code here may reference
1910      *   // variables which are outside of this scope
1911      * };
1912      * --------------
1913      */
1914     void onText(Handler handler) { textHandler = handler; }
1915
1916     /**
1917      * Register an alternative handler which will be called whenever text
1918      * is encountered. This differs from onText in that onText will decode
1919      * the text, wheras onTextRaw will not. This allows you to make design
1920      * choices, since onText will be more accurate, but slower, while
1921      * onTextRaw will be faster, but less accurate. Of course, you can
1922      * still call decode() within your handler, if you want, but you'd
1923      * probably want to use onTextRaw only in circumstances where you
1924      * know that decoding is unnecessary.
1925      *
1926      * Examples:
1927      * --------------
1928      * // Call this function whenever text is encountered
1929      * onText = (string s)
1930      * {
1931      *   // Your code here
1932      *
1933      *   // The passed parameter s will NOT have been decoded.
1934      *   //
1935      *   // This is a a closure, so code here may reference
1936      *   // variables which are outside of this scope
1937      * };
1938      * --------------
1939      */
1940     void onTextRaw(Handler handler) { rawTextHandler = handler; }
1941
1942     /**
1943      * Register a handler which will be called whenever a character data
1944      * segement is encountered.
1945      *
1946      * Examples:
1947      * --------------
1948      * // Call this function whenever a CData section is encountered
1949      * onCData = (string s)
1950      * {
1951      *   // Your code here
1952      *
1953      *   // The passed parameter s does not include the opening <![CDATA[
1954      *   // nor closing ]]>
1955      *   //
1956      *   // This is a a closure, so code here may reference
1957      *   // variables which are outside of this scope
1958      * };
1959      * --------------
1960      */
1961     void onCData(Handler handler) { cdataHandler = handler; }
1962
1963     /**
1964      * Register a handler which will be called whenever a comment is
1965      * encountered.
1966      *
1967      * Examples:
1968      * --------------
1969      * // Call this function whenever a comment is encountered
1970      * onComment = (string s)
1971      * {
1972      *   // Your code here
1973      *
1974      *   // The passed parameter s does not include the opening <!-- nor
1975      *   // closing -->
1976      *   //
1977      *   // This is a a closure, so code here may reference
1978      *   // variables which are outside of this scope
1979      * };
1980      * --------------
1981      */
1982     void onComment(Handler handler) { commentHandler = handler; }
1983
1984     /**
1985      * Register a handler which will be called whenever a processing
1986      * instruction is encountered.
1987      *
1988      * Examples:
1989      * --------------
1990      * // Call this function whenever a processing instruction is encountered
1991      * onPI = (string s)
1992      * {
1993      *   // Your code here
1994      *
1995      *   // The passed parameter s does not include the opening <? nor
1996      *   // closing ?>
1997      *   //
1998      *   // This is a a closure, so code here may reference
1999      *   // variables which are outside of this scope
2000      * };
2001      * --------------
2002      */
2003     void onPI(Handler handler) { piHandler = handler; }
2004
2005     /**
2006      * Register a handler which will be called whenever an XML instruction is
2007      * encountered.
2008      *
2009      * Examples:
2010      * --------------
2011      * // Call this function whenever an XML instruction is encountered
2012      * // (Note: XML instructions may only occur preceeding the root tag of a
2013      * // document).
2014      * onPI = (string s)
2015      * {
2016      *   // Your code here
2017      *
2018      *   // The passed parameter s does not include the opening <! nor
2019      *   // closing >
2020      *   //
2021      *   // This is a a closure, so code here may reference
2022      *   // variables which are outside of this scope
2023      * };
2024      * --------------
2025      */
2026     void onXI(Handler handler) { xiHandler = handler; }
2027
/**
 * Parse an XML element.
 *
 * Parsing will continue until the end of the current element. Any items
 * encountered for which a handler has been registered will invoke that
 * handler.
 *
 * Throws: various kinds of XMLException. In particular an XMLException is
 * thrown when a comment, CDATA section, XML instruction or processing
 * instruction is not terminated, or when an end tag is met for which no
 * start tag was seen, instead of slicing past the end of the input.
 */
void parse()
{
    string t;
    Tag root = tag_;
    Tag[string] startTags;
    if (tag_ !is null) startTags[tag_.name] = tag_;

    // Offset of `terminator` in the unparsed input. Without this check a
    // missing terminator made chop() swallow everything that was left and
    // the following chop(*s,k) then sliced beyond the end of the string.
    int offsetOf(string terminator)
    {
        auto idx = indexOf(*s, terminator);
        if (idx == -1)
            throw new XMLException(
                "Unable to find terminating \"" ~ terminator ~ "\"");
        return cast(int) idx;
    }

    while(s.length != 0)
    {
        if (startsWith(*s,"<!--"))
        {
            chop(*s,4);
            t = chop(*s,offsetOf("-->"));
            if (commentHandler.funcptr !is null) commentHandler(t);
            chop(*s,3);
        }
        else if (startsWith(*s,"<![CDATA["))
        {
            chop(*s,9);
            t = chop(*s,offsetOf("]]>"));
            if (cdataHandler.funcptr !is null) cdataHandler(t);
            chop(*s,3);
        }
        else if (startsWith(*s,"<!"))
        {
            chop(*s,2);
            t = chop(*s,offsetOf(">"));
            if (xiHandler.funcptr !is null) xiHandler(t);
            chop(*s,1);
        }
        else if (startsWith(*s,"<?"))
        {
            chop(*s,2);
            t = chop(*s,offsetOf("?>"));
            if (piHandler.funcptr !is null) piHandler(t);
            chop(*s,2);
        }
        else if (startsWith(*s,"<"))
        {
            tag_ = new Tag(*s,true);
            if (root is null)
                return; // Return to constructor of derived class

            if (tag_.isStart)
            {
                // Remembered so the matching end tag can recover the
                // element's text and attributes.
                startTags[tag_.name] = tag_;

                auto parser = new ElementParser(this);

                // A handler registered under the tag's name wins; the
                // handler stored under the null key is the fallback.
                auto handler = tag_.name in onStartTag;
                if (handler !is null) (*handler)(parser);
                else
                {
                    handler = null in onStartTag;
                    if (handler !is null) (*handler)(parser);
                }
            }
            else if (tag_.isEnd)
            {
                auto known = tag_.name in startTags;
                if (known is null)
                    throw new XMLException("end tag \"" ~ tag_.name
                        ~ "\" has no matching start tag");
                auto startTag = *known;
                string text;

                // The element's raw content is whatever lies between the
                // end of the start tag and the beginning of this end tag.
                immutable(char)* p = startTag.tagString.ptr
                    + startTag.tagString.length;
                immutable(char)* q = tag_.tagString.ptr;
                text = decode(p[0..(q-p)], DecodeMode.LOOSE);

                auto element = new Element(startTag);
                if (text.length != 0) element ~= new Text(text);

                auto handler = tag_.name in onEndTag;
                if (handler !is null) (*handler)(element);
                else
                {
                    handler = null in onEndTag;
                    if (handler !is null) (*handler)(element);
                }

                // Closing the element this parser was created for ends
                // the parse.
                if (tag_.name == root.name) return;
            }
            else if (tag_.isEmpty)
            {
                Tag startTag = new Tag(tag_.name);

                // FIX by hed010gy, for bug 2979
                // http://d.puremagic.com/issues/show_bug.cgi?id=2979
                if (tag_.attr.length > 0)
                      foreach(tn,tv; tag_.attr) startTag.attr[tn]=tv;
                // END FIX

                // Handle the pretend start tag
                string s2;
                auto parser = new ElementParser(startTag,&s2);
                auto handler1 = startTag.name in onStartTag;
                if (handler1 !is null) (*handler1)(parser);
                else
                {
                    handler1 = null in onStartTag;
                    if (handler1 !is null) (*handler1)(parser);
                }

                // Handle the pretend end tag
                auto element = new Element(startTag);
                auto handler2 = tag_.name in onEndTag;
                if (handler2 !is null) (*handler2)(element);
                else
                {
                    handler2 = null in onEndTag;
                    if (handler2 !is null) (*handler2)(element);
                }
            }
        }
        else
        {
            // Character data: everything up to the next '<', or all of
            // the remaining input when there is none (chop treats -1 as
            // "take the rest").
            t = chop(*s,indexOf(*s,"<"));
            if (rawTextHandler.funcptr !is null)
                rawTextHandler(t);
            else if (textHandler.funcptr !is null)
                textHandler(decode(t,DecodeMode.LOOSE));
        }
    }
}
2159
/**
 * Returns that part of the element which has already been parsed
 */
const override string toString()
{
    // Lengths are size_t; keeping the difference in the same type avoids
    // the narrowing conversion to int, which does not compile on 64-bit
    // targets.
    immutable consumed = elementStart.length - s.length;
    return elementStart[0..consumed];
}
2168
2169 }
2170
2171 private
2172 {
// Boilerplate mixed into the check* routines. It expects a `string s`
// (the unparsed input) to be visible at the mixin site, snapshots it in
// `old`, and supplies three fail() overloads which rewind `s` to that
// snapshot and throw an Err labelled with `msg`.
template Check(string msg)
{
    string old = s;

    // Rewind, then report this rule with no nested cause.
    void fail()
    {
        s = old;
        throw new Err(s,msg);
    }

    // Rewind, then report this rule with `e` chained as the cause.
    void fail(Err e)
    {
        s = old;
        throw new Err(s,msg,e);
    }

    // Build a detail error at the current (not yet rewound) position and
    // chain it beneath this rule's own error.
    void fail(string msg2)
    {
        Err detail = new Err(s,msg2);
        fail(detail);
    }
}
2194
// Misc: a comment, a processing instruction, or a run of whitespace.
void checkMisc(ref string s) // rule 27
{
    mixin Check!("Misc");

    try
    {
        if (s.startsWith("<!--"))
            checkComment(s);
        else if (s.startsWith("<?"))
            checkPI(s);
        else
            checkSpace(s);
    }
    catch (Err cause)
    {
        fail(cause);
    }
}
2207
// Document: prolog, then one element, then any number of Misc items.
void checkDocument(ref string s) // rule 1
{
    mixin Check!("Document");

    try
    {
        checkProlog(s);
        checkElement(s);
        star!(checkMisc)(s);
    }
    catch (Err cause)
    {
        fail(cause);
    }
}
2219
// Verifies that every code point of `s` satisfies isChar(). On the first
// offender it fails with a message naming that code point; the nested
// error is positioned at the offending character.
void checkChars(ref string s) // rule 2
{
    // TO DO - Fix std.utf stride and decode functions, then use those
    // instead

    mixin Check!("Chars");

    dchar offender;
    // The foreach index is a size_t; it is kept in a pointer-sized signed
    // variable (with -1 meaning "none found") instead of being squeezed
    // into an int.
    ptrdiff_t where = -1;
    foreach (size_t i, dchar d; s)
    {
        if (!isChar(d))
        {
            offender = d;
            where = i;
            break;
        }
    }
    if (where != -1)
    {
        s = s[where..$];
        fail(format("invalid character: U+%04X",offender));
    }
}
2244
// Whitespace: consumes a run of space, tab, LF and CR characters and
// fails when the input slice was left untouched.
void checkSpace(ref string s) // rule 3
{
    mixin Check!("Whitespace");

    munch(s,"\u0020\u0009\u000A\u000D");

    immutable bool untouched = s is old;
    if (untouched)
        fail();
}
2251
// Name: consumes a name from the front of `s` and returns it in `name`.
// The first character must be '_', ':' or a letter; later characters may
// also be '-', '.', digits, combining characters or extenders.
void checkName(ref string s, out string name) // rule 5
{
    mixin Check!("Name");

    if (s.length == 0) fail();

    // Default to the whole input: when every character is a name
    // character the loop never hits `break`. Starting from 0 here used to
    // yield an empty name and leave `s` unconsumed in that case.
    size_t n = s.length;
    foreach (size_t i, dchar c; s)
    {
        if (c == '_' || c == ':' || isLetter(c)) continue;
        if (i == 0) fail();
        if (c == '-' || c == '.' || isDigit(c)
            || isCombiningChar(c) || isExtender(c)) continue;
        n = i;
        break;
    }
    name = s[0..n];
    s = s[n..$];
}
2270
// AttValue: a single- or double-quoted value that contains no '<' and in
// which every '&' begins a valid reference.
void checkAttValue(ref string s) // rule 10
{
    mixin Check!("AttValue");

    if (s.length == 0) fail();

    char quote = s[0];
    if (!(quote == '\u0022' || quote == '\u0027'))
        fail("attribute value requires quotes");
    s = s[1..$];

    // Skip everything except '<', '&' and the closing quote.
    string skipPattern = "^<&"~quote;
    while (true)
    {
        munch(s,skipPattern);
        if (s.length == 0) fail("unterminated attribute value");
        if (s[0] == '<') fail("< found in attribute value");
        if (s[0] == quote) break;
        try
        {
            checkReference(s);
        }
        catch (Err cause)
        {
            fail(cause);
        }
    }
    s = s[1..$];
}
2290
// CharData: advances over plain text, stopping in front of the next '&'
// or '<'. Meeting "]]>" on the way is an error.
void checkCharData(ref string s) // rule 14
{
    mixin Check!("CharData");

    for (; s.length != 0; s = s[1..$])
    {
        if (s.startsWith("&") || s.startsWith("<"))
            break;
        if (s.startsWith("]]>"))
            fail("]]> found within char data");
    }
}
2303
// Comment: "<!--", then text up to the first "--", which must be the
// start of the closing "-->".
void checkComment(ref string s) // rule 15
{
    mixin Check!("Comment");

    try { checkLiteral("<!--",s); } catch(Err e) { fail(e); }

    // indexOf yields a pointer-sized signed value; storing it in an int
    // is a narrowing conversion that 64-bit builds reject.
    auto n = s.indexOf("--");
    if (n == -1) fail("unterminated comment");
    s = s[n..$];

    try { checkLiteral("-->",s); } catch(Err e) { fail(e); }
}
2314
// PI: "<?" followed by anything up to and including the next "?>".
void checkPI(ref string s) // rule 16
{
    mixin Check!("PI");

    try
    {
        checkLiteral("<?",s);
        checkEnd("?>",s);
    }
    catch (Err cause)
    {
        fail(cause);
    }
}
2326
// CDSect: the `cdata` opening literal followed by anything up to and
// including the next "]]>".
void checkCDSect(ref string s) // rule 18
{
    mixin Check!("CDSect");

    try
    {
        checkLiteral(cdata,s);
        checkEnd("]]>",s);
    }
    catch (Err cause)
    {
        fail(cause);
    }
}
2338
// Prolog: XML declaration, any number of Misc items, then optionally a
// DOCTYPE declaration followed by further Misc items.
void checkProlog(ref string s) // rule 22
{
    mixin Check!("Prolog");

    try
    {
        checkXMLDecl(s);
        star!(checkMisc)(s);
        opt!(seq!(checkDocTypeDecl,star!(checkMisc)))(s);
    }
    catch (Err cause)
    {
        fail(cause);
    }
}
2351
// XMLDecl: "<?xml", version info, optional encoding and standalone
// declarations, optional whitespace, "?>".
void checkXMLDecl(ref string s) // rule 23
{
    mixin Check!("XMLDecl");

    try
    {
        checkLiteral("<?xml",s);
        checkVersionInfo(s);
        opt!(checkEncodingDecl)(s);
        opt!(checkSDDecl)(s);
        opt!(checkSpace)(s);
        checkLiteral("?>",s);
    }
    catch (Err cause)
    {
        fail(cause);
    }
}
2367
// VersionInfo: whitespace, "version", '=', then a quoted version number.
void checkVersionInfo(ref string s) // rule 24
{
    mixin Check!("VersionInfo");

    try
    {
        checkSpace(s);
        checkLiteral("version",s);
        checkEq(s);
        quoted!(checkVersionNum)(s);
    }
    catch (Err cause)
    {
        fail(cause);
    }
}
2381
// Eq: '=' with optional whitespace on either side.
void checkEq(ref string s) // rule 25
{
    mixin Check!("Eq");

    try
    {
        opt!(checkSpace)(s);
        checkLiteral("=",s);
        opt!(checkSpace)(s);
    }
    catch (Err cause)
    {
        fail(cause);
    }
}
2394
// VersionNum: one or more characters from the pattern below; fails when
// the input slice was left untouched.
void checkVersionNum(ref string s) // rule 26
{
    mixin Check!("VersionNum");

    munch(s,"a-zA-Z0-9_.:-");

    immutable bool untouched = s is old;
    if (untouched)
        fail();
}
2402
// DocTypeDecl: "<!DOCTYPE" followed by anything up to and including the
// next '>'. The interior of the declaration is not examined.
void checkDocTypeDecl(ref string s) // rule 28
{
    mixin Check!("DocTypeDecl");

    try
    {
        checkLiteral("<!DOCTYPE",s);
        //
        // TO DO -- ensure DOCTYPE is well formed
        // (But not yet. That's one of our "future directions")
        //
        checkEnd(">",s);
    }
    catch (Err cause)
    {
        fail(cause);
    }
}
2418
// SDDecl: whitespace, "standalone", '=', then a quoted yes or no.
void checkSDDecl(ref string s) // rule 32
{
    mixin Check!("SDDecl");

    try
    {
        checkSpace(s);
        checkLiteral("standalone",s);
        checkEq(s);
    }
    catch (Err cause)
    {
        fail(cause);
    }

    // Drop the quoted value; fail() throws, so the last branch never
    // falls through.
    if (s.startsWith("'yes'") || s.startsWith("\"yes\""))
        s = s[5..$];
    else if (s.startsWith("'no'") || s.startsWith("\"no\""))
        s = s[4..$];
    else
        fail("standalone attribute value must be 'yes', \"yes\","
            ~ " 'no' or \"no\"");
}
2438
// Element: either a self-closing tag, or a start tag, content and an end
// tag whose name equals the start tag's name.
void checkElement(ref string s) // rule 39
{
    mixin Check!("Element");

    string sname,ename,t;
    try
    {
        checkTag(s,t,sname);
    }
    catch (Err cause)
    {
        fail(cause);
    }

    // checkTag reports anything other than "STag" for a self-closing
    // tag, which is a complete element on its own.
    if (t != "STag")
        return;

    try
    {
        checkContent(s);
        t = s; // where the end tag begins, for error reporting
        checkETag(s,ename);
    }
    catch (Err cause)
    {
        fail(cause);
    }

    if (sname == ename)
        return;

    s = t;
    fail("end tag name \"" ~ ename
        ~ "\" differs from start tag name \""~sname~"\"");
}
2464
// rules 40 and 44
// Consumes '<', a name, attributes and '>'. `type` is set to "STag", or
// to "ETag" when a '/' precedes the closing '>'.
void checkTag(ref string s, out string type, out string name)
{
    mixin Check!("Tag");

    try
    {
        type = "STag";
        checkLiteral("<",s);
        checkName(s,name);
        star!(seq!(checkSpace,checkAttribute))(s);
        opt!(checkSpace)(s);

        immutable bool slashNext = s.length != 0 && s[0] == '/';
        if (slashNext)
        {
            s = s[1..$];
            type = "ETag";
        }
        checkLiteral(">",s);
    }
    catch (Err cause)
    {
        fail(cause);
    }
}
2486
// Attribute: a name, '=', and a quoted attribute value.
void checkAttribute(ref string s) // rule 41
{
    mixin Check!("Attribute");

    try
    {
        string ignoredName;
        checkName(s,ignoredName);
        checkEq(s);
        checkAttValue(s);
    }
    catch (Err cause)
    {
        fail(cause);
    }
}
2500
// ETag: "</", a name (returned in `name`), optional whitespace, '>'.
void checkETag(ref string s, out string name) // rule 42
{
    mixin Check!("ETag");

    try
    {
        checkLiteral("</",s);
        checkName(s,name);
        opt!(checkSpace)(s);
        checkLiteral(">",s);
    }
    catch (Err cause)
    {
        fail(cause);
    }
}
2514
// Content: any mix of references, comments, processing instructions,
// CDATA sections, child elements and character data, ending in front of
// the next "</" or at the end of the input.
void checkContent(ref string s) // rule 43
{
    mixin Check!("Content");

    try
    {
        while (s.length != 0)
        {
            // Move the rewind point forward so that a failure is
            // reported at the item that caused it.
            old = s;

            if (s.startsWith("&"))
                checkReference(s);
            else if (s.startsWith("<!--"))
                checkComment(s);
            else if (s.startsWith("<?"))
                checkPI(s);
            else if (s.startsWith(cdata))
                checkCDSect(s);
            else if (s.startsWith("</"))
                break;
            else if (s.startsWith("<"))
                checkElement(s);
            else
                checkCharData(s);
        }
    }
    catch (Err cause)
    {
        fail(cause);
    }
}
2535
// CharRef: "&#" + decimal digits + ';' or "&#x" + hex digits + ';'.
// The referenced code point is returned in `c` and must satisfy isChar().
void checkCharRef(ref string s, out dchar c) // rule 66
{
    mixin Check!("CharRef");

    c = 0;
    try { checkLiteral("&#",s); } catch(Err e) { fail(e); }

    uint radix = 10;
    if (s.length != 0 && s[0] == 'x')
    {
        s = s[1..$];
        radix = 16;
    }
    if (s.length == 0) fail("unterminated character reference");
    if (s[0] == ';')
        fail("character reference must have at least one digit");

    // Values are accumulated with saturation. The previous code let the
    // 32-bit accumulator wrap, so an over-long digit run such as
    // "&#x100000041;" was accepted as U+0041.
    // NOTE(review): relies on isChar() rejecting 0x110000 - confirm.
    enum uint beyondUnicode = 0x110000;
    uint value = 0;
    while (s.length != 0)
    {
        immutable char d = s[0];
        uint digit;
        if (d >= '0' && d <= '9')      digit = d - '0';
        else if (d >= 'a' && d <= 'f') digit = d - 'a' + 10;
        else if (d >= 'A' && d <= 'F') digit = d - 'A' + 10;
        else break;                    // not a digit in any radix
        if (digit >= radix) break;     // hex letter in a decimal reference

        if (value < beyondUnicode)
        {
            value = value * radix + digit;
            if (value > beyondUnicode) value = beyondUnicode;
        }
        s = s[1..$];
    }
    c = cast(dchar) value;

    if (!isChar(c)) fail(format("U+%04X is not a legal character",c));
    if (s.length == 0 || s[0] != ';') fail("expected ;");
    else s = s[1..$];
}
2584
// Reference: a character reference when the input starts with "&#",
// otherwise an entity reference.
void checkReference(ref string s) // rule 67
{
    mixin Check!("Reference");

    try
    {
        if (s.startsWith("&#"))
        {
            dchar ignored;
            checkCharRef(s,ignored);
        }
        else
        {
            checkEntityRef(s);
        }
    }
    catch (Err cause)
    {
        fail(cause);
    }
}
2597
// EntityRef: '&', a name, ';'.
void checkEntityRef(ref string s) // rule 68
{
    mixin Check!("EntityRef");

    try
    {
        string ignoredName;
        checkLiteral("&",s);
        checkName(s,ignoredName);
        checkLiteral(";",s);
    }
    catch (Err cause)
    {
        fail(cause);
    }
}
2611
// EncName: at least one ASCII letter, then any run of letters, digits,
// '_', '.' and '-'.
void checkEncName(ref string s) // rule 81
{
    mixin Check!("EncName");

    munch(s,"a-zA-Z");
    immutable bool noLeadingLetter = s is old;
    if (noLeadingLetter)
        fail();

    munch(s,"a-zA-Z0-9_.-");
}
2620
// EncodingDecl: whitespace, "encoding", '=', then a quoted encoding name.
void checkEncodingDecl(ref string s) // rule 80
{
    mixin Check!("EncodingDecl");

    try
    {
        checkSpace(s);
        checkLiteral("encoding",s);
        checkEq(s);
        quoted!(checkEncName)(s);
    }
    catch (Err cause)
    {
        fail(cause);
    }
}
2634
2635     // Helper functions
2636
// Consumes `literal` from the front of `s`, or fails when `s` does not
// start with it.
void checkLiteral(string literal,ref string s)
{
    mixin Check!("Literal");

    if (s.startsWith(literal))
    {
        s = s[literal.length..$];
        return;
    }
    fail("Expected literal \""~literal~"\"");
}
2644
// Skips forward to the first occurrence of `end` and consumes it; throws
// an Err positioned at the current input when `end` does not occur.
void checkEnd(string end,ref string s)
{
    // Deliberately no mixin Check here.

    // indexOf yields a pointer-sized signed value; storing it in an int
    // is a narrowing conversion that 64-bit builds reject.
    auto n = s.indexOf(end);
    if (n == -1) throw new Err(s,"Unable to find terminating \""~end~"\"");
    s = s[n..$];
    checkLiteral(end,s);
}
2654
2655     // Metafunctions -- none of these use mixin Check
2656
// Zero or one: runs `f` and discards any Err it throws.
void opt(alias f)(ref string s)
{
    try
        f(s);
    catch (Err)
    {
        // The item is optional, so a failed match is not an error.
    }
}
2661
// One or more: a mandatory first match, then as many more as star finds.
void plus(alias f)(ref string s)
{
    f(s);        // a failure here propagates to the caller
    star!f(s);
}
2667
// Zero or more: applies `f` repeatedly until the input is exhausted or
// `f` throws an Err; that Err is swallowed.
void star(alias f)(ref string s)
{
    for (;;)
    {
        if (s.length == 0)
            return;
        try
            f(s);
        catch (Err)
            return;
    }
}
2676
// Applies `f` between a matching pair of quotes. A leading apostrophe
// selects single quotes; anything else must be a double quote.
void quoted(alias f)(ref string s)
{
    string quote = s.startsWith("'") ? "'" : "\"";

    checkLiteral(quote,s);
    f(s);
    checkLiteral(quote,s);
}
2692
// Sequence: `f` then `g`, both on the same input. A failure in either
// propagates to the caller.
void seq(alias f,alias g)(ref string s)
{
    f(s); // first item
    g(s); // second item, starting where the first one stopped
}
2698 }
2699
/**
 * Check an entire XML document for well-formedness
 *
 * Params:
 *    s = the document to be checked, passed as a string
 *
 * Throws: CheckException if the document is not well formed
 *
 * CheckException's toString() method will yield the complete hierarchy of
 * parse failure (the XML equivalent of a stack trace), giving the line and
 * column number of every failure at every level.
 */
void check(string s)
{
    // The checkers advance `s` as they consume input. Line and column
    // numbers must be computed against the untouched document; otherwise
    // "Junk found after document" is always reported at line 1, column 1.
    string document = s;

    try
    {
        checkChars(s);
        checkDocument(s);
        if (s.length != 0) throw new Err(s,"Junk found after document");
    }
    catch(Err e)
    {
        e.complete(document);
        throw e;
    }
}
2726
// Feeds check() a catalog in which one <genre> is closed by </genres>
// and expects the resulting error text to name both tags.
unittest
{
  version (none) // WHY ARE WE NOT RUNNING THIS UNIT TEST?
  {
    try
    {
        check(q"[<?xml version="1.0"?>
        <catalog>
           <book id="bk101">
              <author>Gambardella, Matthew</author>
              <title>XML Developer's Guide</title>
              <genre>Computer</genre>
              <price>44.95</price>
              <publish_date>2000-10-01</publish_date>
              <description>An in-depth look at creating applications
              with XML.</description>
           </book>
           <book id="bk102">
              <author>Ralls, Kim</author>
              <title>Midnight Rain</title>
              <genre>Fantasy</genres>
              <price>5.95</price>
              <publish_date>2000-12-16</publish_date>
              <description>A former architect battles corporate zombies,
              an evil sorceress, and her own childhood to become queen
              of the world.</description>
           </book>
           <book id="bk103">
              <author>Corets, Eva</author>
              <title>Maeve Ascendant</title>
              <genre>Fantasy</genre>
              <price>5.95</price>
              <publish_date>2000-11-17</publish_date>
              <description>After the collapse of a nanotechnology
              society in England, the young survivors lay the
              foundation for a new society.</description>
           </book>
        </catalog>
        ]");
        // check() must have thrown before control gets here.
        assert(false);
    }
    catch(CheckException e)
    {
        assert(e.toString().indexOf("end tag name \"genres\" differs"
            ~ " from start tag name \"genre\"") != -1);
    }
  }
}
2776
// A small well-formed document containing a comment must pass check().
unittest
{
    string document = q"EOS
<?xml version="1.0"?>
<set>
    <one>A</one>
    <!-- comment -->
    <two>B</two>
</set>
EOS";
    try
        check(document);
    catch (CheckException failure)
        assert(0, failure.toString());
}
2796
// Entities must arrive decoded both in attribute values handed to start
// tag handlers and in element text handed to end tag handlers.
unittest
{
    string document = q"EOS
<?xml version="1.0" encoding="utf-8"?> <Tests>
    <Test thing="What &amp; Up">What &amp; Up Second</Test>
</Tests>
EOS";
    auto doc = new DocumentParser(document);

    doc.onStartTag["Test"] = (ElementParser child) {
        assert(child.tag.attr["thing"] == "What & Up");
    };

    doc.onEndTag["Test"] = (in Element e) {
        assert(e.text == "What & Up Second");
    };

    doc.parse();
}
2815
/** The base class for exceptions thrown by this module */
class XMLException : Exception
{
    this(string msg)
    {
        super(msg);
    }
}
2818
2819 // Other exceptions
2820
/// Thrown during Comment constructor
class CommentException : XMLException
{
    private this(string msg)
    {
        super(msg);
    }
}
2824
/// Thrown during CData constructor
class CDataException : XMLException
{
    private this(string msg)
    {
        super(msg);
    }
}
2828
/// Thrown during XMLInstruction constructor
class XIException : XMLException
{
    private this(string msg)
    {
        super(msg);
    }
}
2832
/// Thrown during ProcessingInstruction constructor
class PIException : XMLException
{
    private this(string msg)
    {
        super(msg);
    }
}
2836
/// Thrown during Text constructor
class TextException : XMLException
{
    private this(string msg)
    {
        super(msg);
    }
}
2840
/// Thrown during decode()
class DecodeException : XMLException
{
    private this(string msg)
    {
        super(msg);
    }
}
2844
/// Thrown if comparing with wrong type
class InvalidTypeException : XMLException
{
    private this(string msg)
    {
        super(msg);
    }
}
2848
/// Thrown when parsing for Tags
class TagException : XMLException
{
    private this(string msg)
    {
        super(msg);
    }
}
2852
/**
 * Thrown during check()
 */
class CheckException : XMLException
{
    CheckException err; /// Parent in hierarchy
    private string tail;
    /**
     * Name of production rule which failed to parse,
     * or specific error message
     */
    string msg;
    uint line = 0; /// Line number at which parse failure occurred
    uint column = 0; /// Column number at which parse failure occurred

    // tail = the input that was still unparsed at the point of failure
    // msg  = rule name or specific message
    // err  = nested failure, if any
    private this(string tail,string msg,Err err=null)
    {
        super(null);
        this.tail = tail;
        this.msg = msg;
        this.err = err;
    }

    // Fills in line and column for this error and for every nested one.
    // `entire` must be the complete document of which `tail` is the
    // trailing part, since the position is derived from the difference
    // in length.
    private void complete(string entire)
    {
        string head = entire[0..$-tail.length];

        // lastIndexOf gives -1 when there is no newline, so the current
        // line then starts at offset 0.
        auto lineStart = head.lastIndexOf('\n') + 1;

        // Counts are size_t; the explicit casts replace implicit
        // narrowing conversions that 64-bit builds reject.
        line = cast(uint)(head.count("\n") + 1);

        // The column is counted in code points, not in UTF-8 code units.
        dstring t;
        transcode(head[lineStart..$],t);
        column = cast(uint)(t.length + 1);

        if (err !is null) err.complete(entire);
    }

    // Nested errors are printed first, one per line; the position prefix
    // only appears once complete() has run.
    override const string toString()
    {
        string s;
        if (line != 0) s = format("Line %d, column %d: ",line,column);
        s ~= msg;
        s ~= '\n';
        if (err !is null) s = err.toString ~ s;
        return s;
    }
}
2897
2898 private alias CheckException Err;
2899
2900 // Private helper functions
2901
2902 private
2903 {
// Downcasts `o` to T, throwing InvalidTypeException when the cast
// yields null.
T toType(T)(Object o)
{
    if (auto converted = cast(T)(o))
        return converted;

    throw new InvalidTypeException("Attempt to compare a "
        ~ T.stringof ~ " with an instance of another type");
}
2914
// Removes the first `n` characters from `s` and returns them.
// n == -1 (an unsuccessful indexOf) means "take everything".
//
// `n` is pointer-sized so that indexOf results and string lengths can be
// passed and stored without narrowing on 64-bit targets; int arguments
// still convert implicitly.
string chop(ref string s, ptrdiff_t n)
{
    if (n == -1) n = s.length;
    string t = s[0..n];
    s = s[n..$];
    return t;
}
2922
// Consumes `c` from the front of `s` if it is there and reports whether
// it did.
bool optc(ref string s, char c)
{
    if (s.length == 0 || s[0] != c)
        return false;

    s = s[1..$];
    return true;
}
2929
// Consumes `c` from the front of `s`; throws a TagException with an
// empty message when `s` is empty or starts with something else.
void reqc(ref string s, char c)
{
    immutable bool present = s.length != 0 && s[0] == c;
    if (!present)
        throw new TagException("");

    s = s[1..$];
}
2935
// Folds the code points of `s` into `h`: each step multiplies the
// running value by 11 and adds the next code point.
hash_t hash(string s,hash_t h=0)
{
    hash_t acc = h;
    foreach (dchar codePoint; s)
    {
        acc = acc * 11 + codePoint;
    }
    return acc;
}
2941
2942     // Definitions from the XML specification
2943     immutable CharTable=[0x9,0x9,0xA,0xA,0xD,0xD,0x20,0xD7FF,0xE000,0xFFFD,
2944         0x10000,0x10FFFF];
2945     immutable BaseCharTable=[0x0041,0x005A,0x0061,0x007A,0x00C0,0x00D6,0x00D8,
2946         0x00F6,0x00F8,0x00FF,0x0100,0x0131,0x0134,0x013E,0x0141,0x0148,0x014A,
2947         0x017E,0x0180,0x01C3,0x01CD,0x01F0,0x01F4,0x01F5,0x01FA,0x0217,0x0250,
2948         0x02A8,0x02BB,0x02C1,0x0386,0x0386,0x0388,0x038A,0x038C,0x038C,0x038E,
2949         0x03A1,0x03A3,0x03CE,0x03D0,0x03D6,0x03DA,0x03DA,0x03DC,0x03DC,0x03DE,
2950         0x03DE,0x03E0,0x03E0,0x03E2,0x03F3,0x0401,0x040C,0x040E,0x044F,0x0451,
2951         0x045C,0x045E,0x0481,0x0490,0x04C4,0x04C7,0x04C8,0x04CB,0x04CC,0x04D0,
2952         0x04EB,0x04EE,0x04F5,0x04F8,0x04F9,0x0531,0x0556,0x0559,0x0559,0x0561,
2953         0x0586,0x05D0,0x05EA,0x05F0,0x05F2,0x0621,0x063A,0x0641,0x064A,0x0671,
2954         0x06B7,0x06BA,0x06BE,0x06C0,0x06CE,0x06D0,0x06D3,0x06D5,0x06D5,0x06E5,
2955         0x06E6,0x0905,0x0939,0x093D,0x093D,0x0958,0x0961,0x0985,0x098C,0x098F,
2956         0x0990,0x0993,0x09A8,0x09AA,0x09B0,0x09B2,0x09B2,0x09B6,0x09B9,0x09DC,
2957         0x09DD,0x09DF,0x09E1,0x09F0,0x09F1,0x0A05,0x0A0A,0x0A0F,0x0A10,0x0A13,
2958         0x0A28,0x0A2A,0x0A30,0x0A32,0x0A33,0x0A35,0x0A36,0x0A38,0x0A39,0x0A59,
2959         0x0A5C,0x0A5E,0x0A5E,0x0A72,0x0A74,0x0A85,0x0A8B,0x0A8D,0x0A8D,0x0A8F,
2960         0x0A91,0x0A93,0x0AA8,0x0AAA,0x0AB0,0x0AB2,0x0AB3,0x0AB5,0x0AB9,0x0ABD,
2961         0x0ABD,0x0AE0,0x0AE0,0x0B05,0x0B0C,0x0B0F,0x0B10,0x0B13,0x0B28,0x0B2A,
2962         0x0B30,0x0B32,0x0B33,0x0B36,0x0B39,0x0B3D,0x0B3D,0x0B5C,0x0B5D,0x0B5F,
2963         0x0B61,0x0B85,0x0B8A,0x0B8E,0x0B90,0x0B92,0x0B95,0x0B99,0x0B9A,0x0B9C,
2964         0x0B9C,0x0B9E,0x0B9F,0x0BA3,0x0BA4,0x0BA8,0x0BAA,0x0BAE,0x0BB5,0x0BB7,
2965         0x0BB9,0x0C05,0x0C0C,0x0C0E,0x0C10,0x0C12,0x0C28,0x0C2A,0x0C33,0x0C35,
2966         0x0C39,0x0C60,0x0C61,0x0C85,0x0C8C,0x0C8E,0x0C90,0x0C92,0x0CA8,0x0CAA,
2967         0x0CB3,0x0CB5,0x0CB9,0x0CDE,0x0CDE,0x0CE0,0x0CE1,0x0D05,0x0D0C,0x0D0E,
2968         0x0D10,0x0D12,0x0D28,0x0D2A,0x0D39,0x0D60,0x0D61,0x0E01,0x0E2E,0x0E30,
2969         0x0E30,0x0E32,0x0E33,0x0E40,0x0E45,0x0E81,0x0E82,0x0E84,0x0E84,0x0E87,
2970         0x0E88,0x0E8A,0x0E8A,0x0E8D,0x0E8D,0x0E94,0x0E97,0x0E99,0x0E9F,0x0EA1,
2971         0x0EA3,0x0EA5,0x0EA5,0x0EA7,0x0EA7,0x0EAA,0x0EAB,0x0EAD,0x0EAE,0x0EB0,
2972         0x0EB0,0x0EB2,0x0EB3,0x0EBD,0x0EBD,0x0EC0,0x0EC4,0x0F40,0x0F47,0x0F49,
2973         0x0F69,0x10A0,0x10C5,0x10D0,0x10F6,0x1100,0x1100,0x1102,0x1103,0x1105,
2974         0x1107,0x1109,0x1109,0x110B,0x110C,0x110E,0x1112,0x113C,0x113C,0x113E,
2975         0x113E,0x1140,0x1140,0x114C,0x114C,0x114E,0x114E,0x1150,0x1150,0x1154,
2976         0x1155,0x1159,0x1159,0x115F,0x1161,0x1163,0x1163,0x1165,0x1165,0x1167,
2977         0x1167,0x1169,0x1169,0x116D,0x116E,0x1172,0x1173,0x1175,0x1175,0x119E,
2978         0x119E,0x11A8,0x11A8,0x11AB,0x11AB,0x11AE,0x11AF,0x11B7,0x11B8,0x11BA,
2979         0x11BA,0x11BC,0x11C2,0x11EB,0x11EB,0x11F0,0x11F0,0x11F9,0x11F9,0x1E00,
2980         0x1E9B,0x1EA0,0x1EF9,0x1F00,0x1F15,0x1F18,0x1F1D,0x1F20,0x1F45,0x1F48,
2981         0x1F4D,0x1F50,0x1F57,0x1F59,0x1F59,0x1F5B,0x1F5B,0x1F5D,0x1F5D,0x1F5F,
2982         0x1F7D,0x1F80,0x1FB4,0x1FB6,0x1FBC,0x1FBE,0x1FBE,0x1FC2,0x1FC4,0x1FC6,
2983         0x1FCC,0x1FD0,0x1FD3,0x1FD6,0x1FDB,0x1FE0,0x1FEC,0x1FF2,0x1FF4,0x1FF6,
2984         0x1FFC,0x2126,0x2126,0x212A,0x212B,0x212E,0x212E,0x2180,0x2182,0x3041,
2985         0x3094,0x30A1,0x30FA,0x3105,0x312C,0xAC00,0xD7A3];
2986     immutable IdeographicTable=[0x3007,0x3007,0x3021,0x3029,0x4E00,0x9FA5];
// Combining character ranges, stored as a flat list of (low, high) bounds.
// A single value is written with low == high (e.g. 0x05BF,0x05BF).
// NOTE(review): presumably searched with lookup(), which treats each pair as
// an inclusive range and needs the pairs in ascending order -- confirm at the
// call site. Values look like the CombiningChar class of XML 1.0 Appendix B;
// verify against the spec before editing.
2987     immutable CombiningCharTable=[0x0300,0x0345,0x0360,0x0361,0x0483,0x0486,
2988         0x0591,0x05A1,0x05A3,0x05B9,0x05BB,0x05BD,0x05BF,0x05BF,0x05C1,0x05C2,
2989         0x05C4,0x05C4,0x064B,0x0652,0x0670,0x0670,0x06D6,0x06DC,0x06DD,0x06DF,
2990         0x06E0,0x06E4,0x06E7,0x06E8,0x06EA,0x06ED,0x0901,0x0903,0x093C,0x093C,
2991         0x093E,0x094C,0x094D,0x094D,0x0951,0x0954,0x0962,0x0963,0x0981,0x0983,
2992         0x09BC,0x09BC,0x09BE,0x09BE,0x09BF,0x09BF,0x09C0,0x09C4,0x09C7,0x09C8,
2993         0x09CB,0x09CD,0x09D7,0x09D7,0x09E2,0x09E3,0x0A02,0x0A02,0x0A3C,0x0A3C,
2994         0x0A3E,0x0A3E,0x0A3F,0x0A3F,0x0A40,0x0A42,0x0A47,0x0A48,0x0A4B,0x0A4D,
2995         0x0A70,0x0A71,0x0A81,0x0A83,0x0ABC,0x0ABC,0x0ABE,0x0AC5,0x0AC7,0x0AC9,
2996         0x0ACB,0x0ACD,0x0B01,0x0B03,0x0B3C,0x0B3C,0x0B3E,0x0B43,0x0B47,0x0B48,
2997         0x0B4B,0x0B4D,0x0B56,0x0B57,0x0B82,0x0B83,0x0BBE,0x0BC2,0x0BC6,0x0BC8,
2998         0x0BCA,0x0BCD,0x0BD7,0x0BD7,0x0C01,0x0C03,0x0C3E,0x0C44,0x0C46,0x0C48,
2999         0x0C4A,0x0C4D,0x0C55,0x0C56,0x0C82,0x0C83,0x0CBE,0x0CC4,0x0CC6,0x0CC8,
3000         0x0CCA,0x0CCD,0x0CD5,0x0CD6,0x0D02,0x0D03,0x0D3E,0x0D43,0x0D46,0x0D48,
3001         0x0D4A,0x0D4D,0x0D57,0x0D57,0x0E31,0x0E31,0x0E34,0x0E3A,0x0E47,0x0E4E,
3002         0x0EB1,0x0EB1,0x0EB4,0x0EB9,0x0EBB,0x0EBC,0x0EC8,0x0ECD,0x0F18,0x0F19,
3003         0x0F35,0x0F35,0x0F37,0x0F37,0x0F39,0x0F39,0x0F3E,0x0F3E,0x0F3F,0x0F3F,
3004         0x0F71,0x0F84,0x0F86,0x0F8B,0x0F90,0x0F95,0x0F97,0x0F97,0x0F99,0x0FAD,
3005         0x0FB1,0x0FB7,0x0FB9,0x0FB9,0x20D0,0x20DC,0x20E1,0x20E1,0x302A,0x302F,
3006         0x3099,0x3099,0x309A,0x309A];
// Digit ranges, stored as a flat list of (low, high) bounds. The first pair,
// 0x0030..0x0039, is the ASCII digits '0'..'9'.
// NOTE(review): presumably searched with lookup(), which treats each pair as
// an inclusive range and needs the pairs in ascending order -- confirm at the
// call site. Values look like the Digit class of XML 1.0 Appendix B; verify
// against the spec before editing.
3007     immutable DigitTable=[0x0030,0x0039,0x0660,0x0669,0x06F0,0x06F9,0x0966,
3008         0x096F,0x09E6,0x09EF,0x0A66,0x0A6F,0x0AE6,0x0AEF,0x0B66,0x0B6F,0x0BE7,
3009         0x0BEF,0x0C66,0x0C6F,0x0CE6,0x0CEF,0x0D66,0x0D6F,0x0E50,0x0E59,0x0ED0,
3010         0x0ED9,0x0F20,0x0F29];
// Extender ranges, stored as a flat list of (low, high) bounds. A single
// value is written with low == high (e.g. 0x00B7,0x00B7).
// NOTE(review): presumably searched with lookup(), which treats each pair as
// an inclusive range and needs the pairs in ascending order -- confirm at the
// call site. Values look like the Extender class of XML 1.0 Appendix B;
// verify against the spec before editing.
3011     immutable ExtenderTable=[0x00B7,0x00B7,0x02D0,0x02D0,0x02D1,0x02D1,0x0387,
3012         0x0387,0x0640,0x0640,0x0E46,0x0E46,0x0EC6,0x0EC6,0x3005,0x3005,0x3031,
3013         0x3035,0x309D,0x309E,0x30FC,0x30FE];
3014
/**
 * Tests whether a value lies inside one of the ranges listed in a table.
 *
 * The table is read as consecutive (low, high) pairs, each describing an
 * inclusive range. The pairs are binary-searched, so they must be sorted in
 * ascending order and the table must hold an even number of entries.
 *
 * Params:
 *     table = flat array of range bounds: low0, high0, low1, high1, ...
 *     c     = value to search for
 *
 * Returns: true if low <= c <= high for some pair; false otherwise, which
 *          includes the case of an empty table.
 */
bool lookup(const(int)[] table, int c)
{
    while (table.length != 0)
    {
        // Halve the length and clear the low bit so that m always points at
        // the low bound of a pair. The index is a size_t because
        // table.length is a size_t, which does not implicitly narrow to int
        // on 64-bit targets.
        immutable size_t m = (table.length >> 1) & ~cast(size_t) 1;
        if (c < table[m])
        {
            // Below this pair: keep only the pairs in front of it.
            table = table[0 .. m];
        }
        else if (c > table[m + 1])
        {
            // Above this pair: keep only the pairs behind it.
            table = table[m + 2 .. $];
        }
        else
        {
            return true;
        }
    }
    return false;
}
3032
/**
 * Builds a short, printable preview of the beginning of a string.
 *
 * The input is walked one code unit at a time. Code units below 0x20 or
 * above 0x7F are replaced by '.', all others are copied unchanged. As soon
 * as the preview holds 40 characters, "___" is appended and scanning stops;
 * this also happens when the input is exactly 40 code units long.
 *
 * Params:
 *     s = text to preview
 *
 * Returns: the preview, at most 43 characters long
 */
string startOf(string s)
{
    enum size_t limit = 40;
    string preview;
    for (size_t i = 0; i < s.length; ++i)
    {
        immutable char unit = s[i];
        if (unit >= 0x20 && unit <= 0x7F)
        {
            preview ~= unit;
        }
        else
        {
            preview ~= '.';
        }
        if (preview.length >= limit)
        {
            preview ~= "___";
            break;
        }
    }
    return preview;
}
3043
/**
 * Raises an XMLException; this function never returns normally.
 *
 * Params:
 *     s = message handed to the XMLException constructor (null by default)
 *
 * Throws: XMLException, always
 */
void exit(string s=null)
{
    auto failure = new XMLException(s);
    throw failure;
}
3048 }`);
Note: See TracBrowser for help on using the browser.