Note: This website is archived. For up-to-date information about D projects and development, please visit wiki.dlang.org.

WikiStart: xml.d

File xml.d, 83.5 kB (added by y0uf00bar, 15 years ago)

std2.xml

Line 
1 // Written in the D programming language.
2
3 module std2.xml;
4 import std2.encoding;
5
6 import std.string;
7
8 private alias find indexOf;
9 private alias rfind lastIndexOf;
10
11
12
13 /**
14 Classes and functions for creating and parsing XML
15
16 The basic architecture of this module is that there are standalone functions,
17 classes for constructing an XML document from scratch (Tag, Element and
18 Document), and also classes for parsing a pre-existing XML file (ElementParser
19 and DocumentParser). The parsing classes <i>may</i> be used to build a
20 Document, but that is not their primary purpose. The handling capabilities of
21 DocumentParser and ElementParser are sufficiently customizable that you can
22 make them do pretty much whatever you want.
23
24 Authors: Janice Caron
25
26 Date: 2008.02.12 - 2008.05.07
27
28 License: Public Domain
29
30 Example: This example creates a DOM (Document Object Model) tree
31     from an XML file.
32 ------------------------------------------------------------------------------
33 import std2.xml;
34 import std.stdio;
35 import std.string;
36
37 // books.xml is used in various samples throughout the Microsoft XML Core
38 // Services (MSXML) SDK.
39 //
40 // See http://msdn2.microsoft.com/en-us/library/ms762271(VS.85).aspx
41
42 void main()
43 {
44     string s = cast(string)std.file.read("books.xml");
45
46     // Check for well-formedness
47     check(s);
48
49     // Make a DOM tree
50     auto doc = new Document(s);
51
52     // Plain-print it
53     writefln(doc);
54 }
55 ------------------------------------------------------------------------------
56
57 Example: This example does much the same thing, except that the file is
58     deconstructed and reconstructed by hand. This is more work, but the
59     techniques involved offer vastly more power.
60 ------------------------------------------------------------------------------
61 import std2.xml;
62 import std.stdio;
63 import std.string;
64
65 struct Book
66 {
67     string id;
68     string author;
69     string title;
70     string genre;
71     string price;
72     string pubDate;
73     string description;
74 }
75
76 void main()
77 {
78     string s = cast(string)std.file.read("books.xml");
79
80     // Check for well-formedness
81     check(s);
82
83     // Take it apart
84     Book[] books;
85
86     auto xml = new DocumentParser(s);
87     xml.onStartTag["book"] = (ElementParser xml)
88     {
89         Book book;
90         book.id = xml.tag.attr["id"];
91
92         xml.onEndTag["author"]       = (in Element e) { book.author      = e.text; };
93         xml.onEndTag["title"]        = (in Element e) { book.title       = e.text; };
94         xml.onEndTag["genre"]        = (in Element e) { book.genre       = e.text; };
95         xml.onEndTag["price"]        = (in Element e) { book.price       = e.text; };
96         xml.onEndTag["publish-date"] = (in Element e) { book.pubDate     = e.text; };
97         xml.onEndTag["description"]  = (in Element e) { book.description = e.text; };
98
99         xml.parse();
100
101         books ~= book;
102     };
103     xml.parse();
104
105     // Put it back together again;
106     auto doc = new Document(new Tag("catalog"));
107     foreach(book;books)
108     {
109         auto element = new Element("book");
110         element.tag.attr["id"] = book.id;
111
112         element ~= new Element("author",      book.author);
113         element ~= new Element("title",       book.title);
114         element ~= new Element("genre",       book.genre);
115         element ~= new Element("price",       book.price);
116         element ~= new Element("publish-date",book.pubDate);
117         element ~= new Element("description", book.description);
118
119         doc ~= element;
120     }
121
122     // Pretty-print it
123     writefln(join(doc.pretty(3),"\n"));
124 }
125 -------------------------------------------------------------------------------
126  * Macros:
127  *  WIKI=Phobos/StdXml
128  */
129
/**
 * Abstract base class shared by the XML node types of this module
 * (Element, for example, derives from it).
 */
abstract class Item
{
    /// Equality test against another Item of the same type
    abstract override int opEquals(Object o);

    /// Ordering comparison against another Item of the same type
    abstract override int opCmp(Object o);

    /// Hash value of this item
    abstract override hash_t toHash();

    /// String representation of this item
    abstract override string toString();

    /**
     * Returns an indented string representation of this item, one array
     * entry per output line.
     *
     * This default implementation strips the result of toString() and
     * returns it as a single line, or returns no lines at all when the
     * stripped text is empty.
     *
     * Params:
     *      indent = number of spaces by which to indent child elements
     *               (not used by this default implementation)
     */
    string[] pretty(uint indent)
    {
        string stripped = strip(toString());

        string[] lines;
        if (stripped.length != 0)
            lines ~= stripped;
        return lines;
    }

    /// Returns true if the item represents empty XML text
    abstract bool isEmptyXML();
}
162
163
// Opening delimiter of an XML CDATA section.
// NOTE(review): this is a mutable module-level variable and its users are
// outside this section - confirm nothing reassigns it before making it constant.
string cdata = "<![CDATA[";
165
/**
 * Returns true if the character is a character according to the XML standard
 * (production [2], Char).
 *
 * The test is delegated to lookup() over CharTable; both are declared outside
 * this section of the file.
 *
 * Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
 *
 * Params:
 *    c = the character to be tested
 */
bool isChar(dchar c) // rule 2
{
    return lookup(CharTable,c);
}
178
unittest
{
    // Boundary probes for isChar. The expected ranges are those listed in
    // the table this test was written against:
    //  const CharTable=[0x9,0x9,0xA,0xA,0xD,0xD,0x20,0xD7FF,0xE000,0xFFFD,
    //        0x10000,0x10FFFF];
    static uint[] accepted =
    [
        0x9, 0xA, 0xD, 0x20,
        0x4A,               // 'J'
        0xD7FF, 0xE000, 0xFFFD, 0x10000, 0x10FFFF
    ];
    static uint[] rejected =
    [
        0x8, 0xB, 0xC, 0xE, 0x1F,
        0xD800, 0xDFFF,     // surrogate range
        0xFFFE, 0xFFFF,
        0x110000            // one past the last code point
    ];

    foreach (code; accepted)
        assert( isChar(cast(dchar)code));
    foreach (code; rejected)
        assert(!isChar(cast(dchar)code));
}
/**
 * Splits off the longest leading run of s whose elements all satisfy
 * inPattern(element, pattern).
 *
 * Params:
 *    s       = the text to consume from; on return it holds only the part
 *              that follows the matched run
 *    pattern = the pattern handed to inPattern for every element
 *
 * Returns: the matched leading run (empty when the first element does not
 *          match)
 */
S1 munch(S1, S2)(ref S1 s, S2 pattern)
{
    // Advance to the first element that is not in the pattern
    // (or to the end of s when every element matches).
    size_t cut = 0;
    while (cut < s.length && inPattern(s[cut], pattern))
        ++cut;

    S1 head = s[0 .. cut];
    s = s[cut .. $];
    return head;
}
/**
 * Returns true if the character is whitespace according to the XML standard
 *
 * XML recognises exactly four whitespace characters: space (U+0020),
 * tab (U+0009), linefeed (U+000A) and carriage return (U+000D). Every other
 * character yields false.
 *
 * Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
 *
 * Params:
 *    c = the character to be tested
 */
bool isSpace(dchar c)
{
    switch (c)
    {
        case '\u0020':  // space
        case '\u0009':  // tab
        case '\u000A':  // linefeed
        case '\u000D':  // carriage return
            return true;

        default:
            return false;
    }
}
233
/**
 * Returns true if the character is a digit according to the XML standard
 * (production [88], Digit).
 *
 * The test is delegated to lookup() over DigitTable; both are declared
 * outside this section of the file.
 *
 * Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
 *
 * Params:
 *    c = the character to be tested
 */
bool isDigit(dchar c)
{
    return lookup(DigitTable,c);
}
246
/**
 * Returns true if the character is a letter according to the XML standard
 *
 * A letter is any character accepted by isIdeographic or by isBaseChar.
 *
 * Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
 *
 * Params:
 *    c = the character to be tested
 */
bool isLetter(dchar c) // rule 84
{
    // Same evaluation order as a short-circuit "||": the base-character
    // test only runs when the ideographic test fails.
    if (isIdeographic(c))
        return true;
    return isBaseChar(c);
}
259
/**
 * Returns true if the character is an ideographic character according to the
 * XML standard (production [86], Ideographic).
 *
 * The test is delegated to lookup() over IdeographicTable; both are declared
 * outside this section of the file.
 *
 * Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
 *
 * Params:
 *    c = the character to be tested
 */
bool isIdeographic(dchar c)
{
    return lookup(IdeographicTable,c);
}
273
/**
 * Returns true if the character is a base character according to the XML
 * standard (production [85], BaseChar).
 *
 * The test is delegated to lookup() over BaseCharTable; both are declared
 * outside this section of the file.
 *
 * Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
 *
 * Params:
 *    c = the character to be tested
 */
bool isBaseChar(dchar c)
{
    return lookup(BaseCharTable,c);
}
287
/**
 * Returns true if the character is a combining character according to the
 * XML standard (production [87], CombiningChar).
 *
 * The test is delegated to lookup() over CombiningCharTable; both are
 * declared outside this section of the file.
 *
 * Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
 *
 * Params:
 *    c = the character to be tested
 */
bool isCombiningChar(dchar c)
{
    return lookup(CombiningCharTable,c);
}
301
/**
 * Returns true if the character is an extender according to the XML standard
 * (production [89], Extender).
 *
 * The test is delegated to lookup() over ExtenderTable; both are declared
 * outside this section of the file.
 *
 * Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
 *
 * Params:
 *    c = the character to be tested
 */
bool isExtender(dchar c)
{
    return lookup(ExtenderTable,c);
}
314
315 /**
316  * Encodes a string by replacing all characters which need to be escaped with
317  * appropriate predefined XML entities.
318  *
319  * encode() escapes certain characters (ampersand, quote, apostrophe, less-than
320  * and greater-than), and similarly, decode() unescapes them. These functions
321  * are provided for convenience only. You do not need to use them when using
322  * the std.xml classes, because then all the encoding and decoding will be done
323  * for you automatically.
324  *
325  * If the string is not modified, the original will be returned.
326  *
327  * Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
328  *
329  * Params:
330  *      s = The string to be encoded
331  *
332  * Returns: The encoded string
333  *
334  * Examples:
335  * --------------
336  * writefln(encode("a > b")); // writes "a &gt; b"
337  * --------------
338  */
339 /*
340 //
341 //Specialized version of replace, to avoid the boring predictability of std.string.replace always returning a new array.
342 //Replace all occurrences of from with to.
343 //Return original array if from not found.
344 //
345
346 string replaceIf(string s, dchar from, string to)
347 {
348     char[] p;
349     int i;
350     size_t istart;
351
352     istart = 0;
353     while (istart < s.length)
354     {
355     i = find(s[istart .. s.length], from);
356     if (i == -1)
357     {
358         if (istart == 0)
359         return s;
360         p ~= s[istart .. s.length];
361         break;
362     }
363     p ~= s[istart .. istart + i];
364     p ~= to;
365     istart += i + 1;
366     }
367     return p;
368 }
369
370 string encodeStdEntity(string s)
371 {
372     s = replaceIf(s,'&',"&amp;");
373     s = replaceIf(s,'\"',"&quot;");
374     s = replaceIf(s,'\'',"&apos;");
375     s = replaceIf(s,'<',"&lt;");
376     s = replaceIf(s,'>',"&gt;");
377     return s;
378 }
379 */
380
/**
Module-private prefix test.

Returns true only when s2 is non-empty and the first s2.length characters of
s1 are exactly s2. An empty s2 never matches, and neither does an s2 that is
longer than s1.
*/
private bool startsWith(string s1, string s2)
{
    if (s2.length == 0 || s1.length < s2.length)
        return false;

    return s1[0 .. s2.length] == s2;
}
395
/* encodeStdEntity suggestion as copied from Digital Mars bug reports issue 3218 */

/**
 * Replaces the five characters that have predefined XML entities
 * (", >, <, & and ') with &quot; &gt; &lt; &amp; and &apos; respectively.
 *
 * If src contains none of these characters, src itself is returned (no copy
 * is made). Otherwise the result is assembled in dst, which is grown when it
 * is too small, and a slice of dst is returned.
 *
 * Params:
 *      src = the text to be encoded
 *      dst = (optional) buffer to build the result in; its existing content
 *            is overwritten from index 0. Must not overlap src.
 *
 * Returns: src when nothing needed escaping, otherwise the encoded text as a
 *          slice of (the possibly reallocated) dst
 */
T[] encodeStdEntity(T) (T[] src, T[] dst = null)
{
    // size_t throughout: these values are used as slice bounds and are
    // compared with array lengths.
    size_t written = 0;   // number of elements of dst filled so far
    size_t pending = 0;   // start of the run of src not yet copied to dst

    foreach (pos, ch; src)
    {
        T[] entity;
        switch (ch)
        {
            case '"':  entity = "&quot;"; break;
            case '>':  entity = "&gt;";   break;
            case '<':  entity = "&lt;";   break;
            case '&':  entity = "&amp;";  break;
            case '\'': entity = "&apos;"; break;
            default:   continue;    // ordinary character: extends the pending run
        }

        // Flush the unescaped run that precedes this character, then the
        // entity that replaces the character itself.
        size_t run = pos - pending;
        if (dst.length <= written + run + entity.length)
            dst.length = (dst.length + run + entity.length) + dst.length / 2;

        dst[written .. written + run] = src[pending .. pos];
        written += run;

        dst[written .. written + entity.length] = entity;
        written += entity.length;

        pending = pos + 1;
    }

    // Nothing was escaped: hand back the caller's own array.
    if (written == 0)
        return src;

    // Copy the tail that follows the last escaped character.
    size_t tail = src.length - pending;
    if (dst.length <= written + tail)
        dst.length = written + tail;

    dst[written .. written + tail] = src[pending .. $];
    return dst[0 .. written + tail];
}
463
464
465
unittest
{
    // Text without any escapable character must come back as the very same
    // array (identity test with "is"), not as an equal copy.
    assert(encodeStdEntity("hello") is "hello");
    // One case for each of the five predefined entities.
    assert(encodeStdEntity("a > b") == "a &gt; b");
    assert(encodeStdEntity("a < b") == "a &lt; b");
    assert(encodeStdEntity("don't") == "don&apos;t");
    assert(encodeStdEntity("\"hi\"") == "&quot;hi&quot;");
    assert(encodeStdEntity("cat & dog") == "cat &amp; dog");
}
475
/**
 * Mode to use for decoding.
 *
 * $(DDOC_ENUM_MEMBERS NONE) Do not decode
 * $(DDOC_ENUM_MEMBERS LOOSE) Decode, but ignore errors
 * $(DDOC_ENUM_MEMBERS STRICT) Decode, and throw exception on error
 */
enum DecodeMode
{
    NONE,   // leave the text untouched
    LOOSE,  // decode; malformed entities are kept as written
    STRICT  // decode; malformed entities raise an exception
}
487
488 /**
489  * Decodes a string by unescaping all predefined XML entities.
490  *
491  * encode() escapes certain characters (ampersand, quote, apostrophe, less-than
492  * and greater-than), and similarly, decode() unescapes them. These functions
493  * are provided for convenience only. You do not need to use them when using
494  * the std.xml classes, because then all the encoding and decoding will be done
495  * for you automatically.
496  *
497  * This function decodes the entities &amp;amp;, &amp;quot;, &amp;apos;,
498  * &amp;lt; and &amp;gt;,
499  * as well as decimal and hexadecimal entities such as &amp;#x20AC;
500  *
501  * If the string does not contain an ampersand, the original will be returned.
502  *
503  * Note that the "mode" parameter can be one of DecodeMode.NONE (do not
504  * decode), DecodeMode.LOOSE (decode, but ignore errors), or DecodeMode.STRICT
505  * (decode, and throw a DecodeException in the event of an error).
506  *
507  * Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
508  *
509  * Params:
510  *      s = The string to be decoded
511  *      mode = (optional) Mode to use for decoding. (Defaults to LOOSE).
512  *
513  * Throws: DecodeException if mode == DecodeMode.STRICT and decode fails
514  *
515  * Returns: The decoded string
516  *
517  * Examples:
518  * --------------
519  * writefln(decode("a &gt; b")); // writes "a > b"
520  * --------------
521  */
string decode(string s, DecodeMode mode=DecodeMode.LOOSE)
{
    if (mode == DecodeMode.NONE) return s;

    // Output buffer. It stays empty until the first '&' is met, so text
    // without any ampersand is returned as-is instead of being copied.
    char[] buffer;

    // The index is size_t because it is compared with s.length and assigned
    // from length arithmetic below; a signed int would be a narrowing
    // conversion on 64-bit targets.
    for (size_t i = 0; i < s.length; ++i)
    {
        char c = s[i];
        if (c != '&')
        {
            // Only copy once a buffer has been started.
            if (buffer.length != 0) buffer ~= c;
            continue;
        }

        if (buffer.length == 0)
        {
            // First ampersand: begin the output with a private copy of
            // everything that precedes it.
            buffer = s.dup;
            buffer.length = i;
        }

        string rest = s[i .. $];

        if (startsWith(rest, "&#"))
        {
            // Numeric character reference.
            try
            {
                dchar d;
                string t = rest;
                // checkCharRef is declared outside this section; it is
                // expected to store the code point in d and to leave t
                // holding the text after the reference - TODO confirm.
                checkCharRef(t, d);
                std.utf.encode(buffer, d);
                // Step to the last character of the reference; the loop's
                // ++i then moves past it.
                i = s.length - t.length - 1;
            }
            catch (Err e)
            {
                if (mode == DecodeMode.STRICT)
                    throw new DecodeException("Unescaped &");
                buffer ~= '&';
            }
        }
        // Named entities: i is advanced to the terminating ';'.
        else if (startsWith(rest, "&amp;" )) { buffer ~= '&';  i += 4; }
        else if (startsWith(rest, "&quot;")) { buffer ~= '"';  i += 5; }
        else if (startsWith(rest, "&apos;")) { buffer ~= '\''; i += 5; }
        else if (startsWith(rest, "&lt;"  )) { buffer ~= '<';  i += 3; }
        else if (startsWith(rest, "&gt;"  )) { buffer ~= '>';  i += 3; }
        else
        {
            // A bare ampersand: an error in STRICT mode, kept literally
            // in LOOSE mode.
            if (mode == DecodeMode.STRICT)
                throw new DecodeException("Unescaped &");
            buffer ~= '&';
        }
    }
    return (buffer.length == 0) ? s : cast(string)buffer;
}
574
unittest
{
    // Helper: passes only if decoding s in STRICT mode throws a
    // DecodeException; s itself is used as the assertion message.
    void assertNot(string s)
    {
        bool b = false;
        try { decode(s,DecodeMode.STRICT); }
        catch (DecodeException e) { b = true; }
        assert(b,s);
    }

    // Assert that things that should work, do
    // (the first case also checks that unchanged input is returned as the
    // same array, via "is")
    assert(decode("hello",          DecodeMode.STRICT) is "hello");
    assert(decode("a &gt; b",       DecodeMode.STRICT) == "a > b");
    assert(decode("a &lt; b",       DecodeMode.STRICT) == "a < b");
    assert(decode("don&apos;t",     DecodeMode.STRICT) == "don't");
    assert(decode("&quot;hi&quot;", DecodeMode.STRICT) == "\"hi\"");
    assert(decode("cat &amp; dog",  DecodeMode.STRICT) == "cat & dog");
    assert(decode("&#42;",          DecodeMode.STRICT) == "*");
    assert(decode("&#x2A;",         DecodeMode.STRICT) == "*");
    // LOOSE mode leaves malformed input exactly as written
    assert(decode("cat & dog",      DecodeMode.LOOSE) == "cat & dog");
    assert(decode("a &gt b",        DecodeMode.LOOSE) == "a &gt b");
    assert(decode("&#;",            DecodeMode.LOOSE) == "&#;");
    assert(decode("&#x;",           DecodeMode.LOOSE) == "&#x;");
    assert(decode("&#2G;",          DecodeMode.LOOSE) == "&#2G;");
    assert(decode("&#x2G;",         DecodeMode.LOOSE) == "&#x2G;");

    // Assert that things that shouldn't work, don't
    // (the same malformed inputs, this time in STRICT mode)
    assertNot("cat & dog");
    assertNot("a &gt b");
    assertNot("&#;");
    assertNot("&#x;");
    assertNot("&#2G;");
    assertNot("&#x2G;");
}
609
/**
 * Class representing an XML document: a root Element together with the text
 * that surrounds it.
 *
 * Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
 *
 */
class Document : Element
{
    /**
     * Contains all text which occurs before the root element.
     * Defaults to &lt;?xml version="1.0"?&gt;
     */
    string prolog = "<?xml version=\"1.0\"?>";

    /**
     * Contains all text which occurs after the root element.
     * Defaults to the empty string
     */
    string epilog;

    /**
     * Constructs a Document by parsing XML text.
     *
     * This function creates a complete DOM (Document Object Model) tree.
     *
     * The input to this function MUST be valid XML.
     * This is enforced by DocumentParser's in contract.
     *
     * Params:
     *      s = the complete XML text.
     */
    this(string s)
    in
    {
        assert(s.length != 0);
    }
    body
    {
        auto parser = new DocumentParser(s);
        string rootTagText = parser.tag.tagString;

        this(parser.tag);

        // The root tag's text is located inside s by pointer arithmetic;
        // whatever lies before that position becomes the prolog.
        prolog = s[0 .. rootTagText.ptr - s.ptr];

        parse(parser);

        // The parser's remaining input after the parse is kept as the epilog.
        epilog = *parser.s;
    }

    /**
     * Constructs a Document from a Tag.
     *
     * Params:
     *      tag = the start tag of the document.
     */
    this(Tag tag)
    {
        super(tag);
    }

    /**
     * Compares two Documents for equality: prolog, element content and
     * epilog must all match.
     *
     * Examples:
     * --------------
     * Document d1,d2;
     * if (d1 == d2) { }
     * --------------
     */
    override int opEquals(Object o)
    {
        auto doc = toType!( Document)(o);

        if (prolog != doc.prolog) return false;
        if (super != cast( Element)doc) return false;
        if (epilog != doc.epilog) return false;
        return true;
    }

    /**
     * Compares two Documents
     *
     * The prolog is compared first, then the element content, then the
     * epilog; the first part that differs decides the result.
     *
     * You should rarely need to call this function. It exists so that
     * Documents can be used as associative array keys.
     *
     * Examples:
     * --------------
     * Document d1,d2;
     * if (d1 < d2) { }
     * --------------
     */
    override int opCmp(Object o)
    {
        auto doc = toType!(Document)(o);

        if (prolog != doc.prolog)
            return prolog < doc.prolog ? -1 : 1;

        if (super != cast( Element)doc)
            return super < cast( Element)doc ? -1 : 1;

        if (epilog != doc.epilog)
            return epilog < doc.epilog ? -1 : 1;

        return 0;
    }

    /**
     * Returns the hash of a Document
     *
     * You should rarely need to call this function. It exists so that
     * Documents can be used as associative array keys.
     */
    override hash_t toHash()
    {
        // Chain the three parts: element hash, then epilog, then prolog.
        auto tailHash = hash(epilog, super.toHash);
        return hash(prolog, tailHash);
    }

    /**
     * Returns the string representation of a Document. (That is, the
     * complete XML of a document).
     */
    override string toString()
    {
        return prolog ~ super.toString ~ epilog;
    }
}
732
733
734 /**
735  * Class representing an XML element.
736  *
737  * Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
738  */
739 class Element : Item
740 {
741     Tag tag; /// The start tag of the element
742     Item[] items; /// The element's items
743     Text[] texts; /// The element's text items
744     CData[] cdatas; /// The element's CData items
745     Comment[] comments; /// The element's comments
746     ProcessingInstruction[] pis; /// The element's processing instructions
747     Element[] elements; /// The element's child elements
748
749     /**
750      * Constructs an Element given a name and a string to be used as a Text
751      * interior.
752      *
753      * Params:
754      *      name = the name of the element.
755      *      interior = (optional) the string interior.
756      *
757      * Examples:
758      * -------------------------------------------------------
759      * auto element = new Element("title","Serenity")
760      *     // constructs the element <title>Serenity</title>
761      * -------------------------------------------------------
762      */
763     this(string name, string interior=null)
764     {
765         this(new Tag(name));
766         if (interior.length != 0) opCatAssign(new Text(interior));
767     }
768
769     /**
770      * Constructs an Element from a Tag.
771      *
772      * Params:
773      *      tag = the start or empty tag of the element.
774      */
775     this(Tag tag_)
776     {
777         this.tag = new Tag(tag_.name);
778         tag.type = TagType.EMPTY;
779         foreach(k,v;tag_.attr) tag.attr[k] = v;
780         tag.tagString = tag_.tagString;
781     }
782
783     /**
784      * Append a text item to the interior of this element
785      *
786      * Params:
787      *      item = the item you wish to append.
788      *
789      * Examples:
790      * --------------
791      * Element element;
792      * element ~= new Text("hello");
793      * --------------
794      */
795     void opCatAssign(Text item)
796     {
797         texts ~= item;
798         appendItem(item);
799     }
800
801     /**
802      * Append a CData item to the interior of this element
803      *
804      * Params:
805      *      item = the item you wish to append.
806      *
807      * Examples:
808      * --------------
809      * Element element;
810      * element ~= new CData("hello");
811      * --------------
812      */
813     void opCatAssign(CData item)
814     {
815         cdatas ~= item;
816         appendItem(item);
817     }
818
819     /**
820      * Append a comment to the interior of this element
821      *
822      * Params:
823      *      item = the item you wish to append.
824      *
825      * Examples:
826      * --------------
827      * Element element;
828      * element ~= new Comment("hello");
829      * --------------
830      */
831     void opCatAssign(Comment item)
832     {
833         comments ~= item;
834         appendItem(item);
835     }
836
837     /**
838      * Append a processing instruction to the interior of this element
839      *
840      * Params:
841      *      item = the item you wish to append.
842      *
843      * Examples:
844      * --------------
845      * Element element;
846      * element ~= new ProcessingInstruction("hello");
847      * --------------
848      */
849     void opCatAssign(ProcessingInstruction item)
850     {
851         pis ~= item;
852         appendItem(item);
853     }
854
855     /**
856      * Append a complete element to the interior of this element
857      *
858      * Params:
859      *      item = the item you wish to append.
860      *
861      * Examples:
862      * --------------
863      * Element element;
864      * Element other = new Element("br");
865      * element ~= other;
866      *    // appends element representing <br />
867      * --------------
868      */
869     void opCatAssign(Element item)
870     {
871         elements ~= item;
872         appendItem(item);
873     }
874
875     private void appendItem(Item item)
876     {
877         items ~= item;
878         if (tag.type == TagType.EMPTY && !item.isEmptyXML)
879             tag.type = TagType.START;
880     }
881
882     private void parse(ElementParser xml)
883     {
884         xml.onText = (string s) { opCatAssign(new Text(s)); };
885         xml.onCData = (string s) { opCatAssign(new CData(s)); };
886         xml.onComment = (string s) { opCatAssign(new Comment(s)); };
887         xml.onPI = (string s) { opCatAssign(new ProcessingInstruction(s)); };
888
889         xml.onStartTag[null] = (ElementParser xml)
890         {
891             auto e = new Element(xml.tag);
892             e.parse(xml);
893             opCatAssign(e);
894         };
895
896         xml.parse();
897     }
898
899     /**
900      * Compares two Elements for equality
901      *
902      * Examples:
903      * --------------
904      * Element e1,e2;
905      * if (e1 == e2) { }
906      * --------------
907      */
908     override int opEquals(Object o)
909     {
910         auto element = toType!( Element)(o);
911         uint len = items.length;
912         if (len != element.items.length) return false;
913         for (uint i=0; i<len; ++i)
914         {
915             if (!items[i].opEquals(element.items[i])) return false;
916         }
917         return true;
918     }
919
920     /**
921      * Compares two Elements
922      *
923      * You should rarely need to call this function. It exists so that Elements
924      * can be used as associative array keys.
925      *
926      * Examples:
927      * --------------
928      * Element e1,e2;
929      * if (e1 < e2) { }
930      * --------------
931      */
932     override int opCmp(Object o)
933     {
934         auto element = toType!( Element)(o);
935         for (uint i=0; ; ++i)
936         {
937             if (i == items.length && i == element.items.length) return 0;
938             if (i == items.length) return -1;
939             if (i == element.items.length) return 1;
940             if (items[i] != element.items[i])
941                 return items[i].opCmp(element.items[i]);
942         }
943     }
944
945     /**
946      * Returns the hash of an Element
947      *
948      * You should rarely need to call this function. It exists so that Elements
949      * can be used as associative array keys.
950      */
951     override hash_t toHash()
952     {
953         hash_t hash = tag.toHash;
954         foreach(item;items) hash += item.toHash();
955         return hash;
956     }
957
958         /**
959          * Returns the decoded interior of an element.
960          *
961          * The element is assumed to containt text <i>only</i>. So, for
962          * example, given XML such as "&lt;title&gt;Good &amp;amp;
963          * Bad&lt;/title&gt;", will return "Good &amp; Bad".
964          *
965          * Params:
966          *      mode = (optional) Mode to use for decoding. (Defaults to LOOSE).
967          *
968          * Throws: DecodeException if decode fails
969          */
970         string text(DecodeMode mode=DecodeMode.LOOSE)
971         {
972             string buffer;
973             foreach(item;items)
974             {
975                 Text t = cast(Text)item;
976                 if (t is null) throw new DecodeException(item.toString);
977                 buffer ~= decode(t.toString,mode);
978             }
979             return buffer;
980         }
981
982         /**
983          * Returns an indented string representation of this item
984          *
985          * Params:
986          *      indent = (optional) number of spaces by which to indent this
987          *          element. Defaults to 2.
988          */
989         override string[] pretty(uint indent=2)
990         {
991
992             if (isEmptyXML) return [ tag.toEmptyString ];
993
994             if (items.length == 1)
995             {
996                 Text t = cast(Text)(items[0]);
997                 if (t !is null)
998                 {
999                     return [tag.toStartString ~ t.toString ~ tag.toEndString];
1000                 }
1001             }
1002
1003             string[] a = [ tag.toStartString ];
1004             foreach(item;items)
1005             {
1006                 string[] b = item.pretty(indent);
1007                 foreach(s;b)
1008                 {
1009                     a ~= rjustify(s,s.length + indent);
1010                 }
1011             }
1012             a ~= tag.toEndString;
1013             return a;
1014         }
1015
1016         /**
1017          * Returns the string form of this element: start tag, every item, end tag
1018          *
1019          * Examples:
1020          * --------------
1021          * auto element = new Element("br");
1022          * writefln(element.toString); // "<br />" when isEmptyXML is true
1023          * --------------
1024          */
1025         override string toString()
1026         {
1027             // Empty elements use the self-closing form
1028             if (isEmptyXML) return tag.toEmptyString;
1029             string xml = tag.toStartString;
1030             foreach(child;items)
1031                 xml ~= child.toString;
1032             return xml ~ tag.toEndString;
1033         }
1034
1035         override bool isEmptyXML() { return false; } /// Always returns false. NOTE(review): this keeps pretty() and toString() from taking their empty-tag shortcut unless a subclass overrides it - confirm intended
1036     }
1037
1038
1039 /**
1040  * The kind of an XML tag, as stored in Tag.type.
1041  *
1042  * $(DDOC_ENUM_MEMBERS START) Used for start tags, e.g. &lt;book&gt;
1043  * $(DDOC_ENUM_MEMBERS END) Used for end tags, e.g. &lt;/book&gt;
1044  * $(DDOC_ENUM_MEMBERS EMPTY) Used for empty tags, e.g. &lt;book /&gt;
1045  *
1046  */
1047 enum TagType { START, END, EMPTY };
1048
1049 /**
1050  * Class representing an XML tag.
1051  *
1052  * Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
1053  *
1054  * The class invariant guarantees
1055  * <ul>
1056  * <li> that $(B type) is a valid enum TagType value</li>
1057  * <li> that $(B name) consists of valid characters</li>
1058  * <li> that each attribute name consists of valid characters</li>
1059  * </ul>
1060  */
1061 class Tag
1062 {
1063     TagType type = TagType.START;   /// Type of tag
1064     string name;                    /// Tag name
1065     string[string] attr;            /// Associative array of attributes
1066     private string tagString;       // source text of the tag; set only by the parsing constructor
1067
1068     invariant()
1069     {
1070         string s;
1071         string t;
1072
1073         assert(type == TagType.START
1074             || type == TagType.END
1075             || type == TagType.EMPTY);
1076
1077         s = name;
1078         try { checkName(s,t); }
1079         catch(Err e) { assert(false,"Invalid tag name:" ~ e.toString); }
1080
1081         foreach(k,v;attr)
1082         {
1083             s = k;
1084             try { checkName(s,t); }
1085             catch(Err e)
1086                 { assert(false,"Invalid attribute name:" ~ e.toString); }
1087         }
1088     }
1089
1090     /**
1091      * Constructs an instance of Tag with a specified name and type
1092      *
1093      * The constructor does not initialize the attributes. To initialize the
1094      * attributes, you access the $(B attr) member variable.
1095      *
1096      * Params:
1097      *      name = the Tag's name
1098      *      type = (optional) the Tag's type. If omitted, defaults to
1099      *          TagType.START.
1100      *
1101      * Examples:
1102      * --------------
1103      * auto tag = new Tag("img",TagType.EMPTY);
1104      * tag.attr["src"] = "http://example.com/example.jpg";
1105      * --------------
1106      */
1107     this(string name, TagType type=TagType.START)
1108     {
1109         this.name = name;
1110         this.type = type;
1111     }
1112
1113     /* Private constructor (so don't ddoc this!)
1114      *
1115      * Constructs a Tag by parsing the string representation, e.g. "<html>".
1116      *
1117      * The string is passed by reference, and is advanced over all characters
1118      * consumed.
1119      *
1120      * The second parameter is a dummy parameter only, required solely to
1121      * distinguish this constructor from the public one.
1122      */
1123     private this(ref string s, bool dummy)
1124     {
1125         tagString = s;      // starts at the '<'; trimmed to the consumed text below
1126         try
1127         {
1128             reqc(s,'<');
1129             if (optc(s,'/')) type = TagType.END;
1130             name = munch(s,"^/>"~whitespace);
1131             munch(s,whitespace);
1132             while(s.length > 0 && s[0] != '>' && s[0] != '/')
1133             {
1134                 string key = munch(s,"^="~whitespace);
1135                 munch(s,whitespace);
1136                 reqc(s,'=');
1137                 munch(s,whitespace);
1138                 reqc(s,'"');
1139                 string val = encodeStdEntity(munch(s,"^\""));
1140                 reqc(s,'"');
1141                 munch(s,whitespace);
1142                 attr[key] = val;
1143             }
1144             if (optc(s,'/'))
1145             {
1146                 if (type == TagType.END) throw new TagException(""); // "</name/>" is not a tag
1147                 type = TagType.EMPTY;
1148             }
1149             reqc(s,'>');
1150             tagString.length = (s.ptr - tagString.ptr); // keep only the characters consumed
1151         }
1152         catch(XMLException e)
1153         {
1154             tagString.length = (s.ptr - tagString.ptr); // report the text read so far
1155             throw new TagException(tagString);
1156         }
1157     }
1158
1159
1160         /**
1161          * Compares two Tags for equality
1162          *
1163          * You should rarely need to call this function. It exists so that Tags
1164          * can be used as associative array keys.
1165          *
1166          * Examples:
1167          * --------------
1168          * Tag tag1,tag2;
1169          * if (tag1 == tag2) { }
1170          * --------------
1171          */
1172         override int opEquals(Object o)
1173         {
1174             auto tag = toType!( Tag)(o);
1175             return
1176                 (name != tag.name) ? false : (
1177                 (attr != tag.attr) ? false : (
1178                 (type != tag.type) ? false : (
1179             true )));
1180         }
1181
1182         /**
1183          * Compares two Tags, ordering by name, then attributes, then type
1184          *
1185          * Examples:
1186          * --------------
1187          * Tag tag1,tag2;
1188          * if (tag1 < tag2) { }
1189          * --------------
1190          */
1191         override int opCmp(Object o)
1192         {
1193             auto tag = toType!( Tag)(o);
1194             return
1195                 ((name != tag.name) ? ( name < tag.name ? -1 : 1 ) :
1196                 ((attr != tag.attr) ? ( attr < tag.attr ? -1 : 1 ) :
1197                 ((type != tag.type) ? ( type < tag.type ? -1 : 1 ) :
1198             0 )));
1199         }
1200
1201         /**
1202          * Returns the hash of a Tag. Only the name contributes to the hash.
1203          *
1204          * You should rarely need to call this function. It exists so that Tags
1205          * can be used as associative array keys.
1206          */
1207         override hash_t toHash()
1208         {
1209             hash_t hash = 0;
1210             foreach(dchar c;name) hash = hash * 11 + c;
1211             return hash;
1212         }
1213
1214         /**
1215          * Returns the string representation of a Tag
1216          *
1217          * Examples:
1218          * --------------
1219          * auto tag = new Tag("book",TagType.START);
1220          * writefln(tag.toString); // writes "<book>"
1221          * --------------
1222          */
1223         override string toString()
1224         {
1225             if (isEmpty) return toEmptyString();
1226             return (isEnd) ? toEndString() : toStartString();
1227         }
1228
1229         private
1230         {
1231             string toNonEndString() // "<name key="value" ..." without the closing bracket
1232             {
1233                 string s = "<" ~ name;
1234                 foreach(key,val;attr)
1235                     s ~= format(" %s=\"%s\"",key,decode(val,DecodeMode.LOOSE));
1236                 return s;
1237             }
1238
1239             string toStartString() { return toNonEndString() ~ ">"; }
1240
1241             string toEndString() { return "</" ~ name ~ ">"; }
1242
1243             string toEmptyString() { return toNonEndString() ~ " />"; }
1244         }
1245
1246         /**
1247          * Returns true if the Tag is a start tag
1248          *
1249          * Examples:
1250          * --------------
1251          * if (tag.isStart) { }
1252          * --------------
1253          */
1254         bool isStart() { return type == TagType.START; }
1255
1256         /**
1257          * Returns true if the Tag is an end tag
1258          *
1259          * Examples:
1260          * --------------
1261          * if (tag.isEnd) { }
1262          * --------------
1263          */
1264         bool isEnd()   { return type == TagType.END;   }
1265
1266         /**
1267          * Returns true if the Tag is an empty tag
1268          *
1269          * Examples:
1270          * --------------
1271          * if (tag.isEmpty) { }
1272          * --------------
1273          */
1274         bool isEmpty() { return type == TagType.EMPTY; }
1275     }
1276
1277
1278 /**
1279  * Class representing a comment
1280  */
1281 class Comment : Item
1282 {
1283     private string content;
1284
1285     /**
1286      * Construct a comment
1287      *
1288      * Params:
1289      *      content = the body of the comment
1290      *
1291      * Throws: CommentException if the comment body is illegal (contains "--"
1292      * or exactly equals "-")
1293      *
1294      * Examples:
1295      * --------------
1296      * auto item = new Comment("This is a comment");
1297      *    // constructs <!--This is a comment-->
1298      * --------------
1299      */
1300     this(string content)
1301     {
1302         if (content == "-" || content.indexOf("--") != -1) // "--" may not occur inside an XML comment
1303             throw new CommentException(content);
1304         this.content = content;
1305     }
1306
1307     /**
1308      * Compares two comments for equality
1309      *
1310      * Examples:
1311      * --------------
1312      * Comment item1,item2;
1313      * if (item1 == item2) { }
1314      * --------------
1315      */
1316     override int opEquals(Object o)
1317     {
1318         auto item = toType!( Item)(o);
1319         auto  t = cast(Comment)item;
1320         return t !is null && content == t.content;
1321     }
1322
1323     /**
1324      * Compares two comments by content
1325      *
1326      * You should rarely need to call this function. It exists so that Comments
1327      * can be used as associative array keys.
1328      * Returns: -1, 0 or 1; 0 is also returned when o is not a Comment.
1329      * Examples:
1330      * --------------
1331      * Comment item1,item2;
1332      * if (item1 < item2) { }
1333      * --------------
1334      */
1335     override int opCmp(Object o)
1336     {
1337         auto t = cast(Comment)toType!( Item)(o);
1338         if (t is null || content == t.content) return 0;
1339         // Return the sign itself; combining it with && would turn -1 into 1
1340         return content < t.content ? -1 : 1;
1341     }
1342
1343     /**
1344      * Returns the hash of a Comment
1345      *
1346      * You should rarely need to call this function. It exists so that Comments
1347      * can be used as associative array keys.
1348      */
1349     override hash_t toHash() { return hash(content); }
1350
1351     /**
1352      * Returns a string representation of this comment
1353      */
1354     override string toString() { return "<!--" ~ content ~ "-->"; }
1355
1356     override bool isEmptyXML() { return false; } /// Returns false always
1357 }
1358
1359 /**
1360  * Class representing a Character Data section
1361  */
1362 class CData : Item
1363 {
1364     private string content;
1365
1366     /**
1367      * Construct a character data section
1368      *
1369      * Params:
1370      *      content = the body of the character data segment
1371      *
1372      * Throws: CDataException if the segment body is illegal (contains "]]>")
1373      *
1374      * Examples:
1375      * --------------
1376      * auto item = new CData("<b>hello</b>");
1377      *    // constructs <![CDATA[<b>hello</b>]]>
1378      * --------------
1379      */
1380     this(string content)
1381     {
1382         if (content.indexOf("]]>") != -1) throw new CDataException(content);
1383         this.content = content;
1384     }
1385
1386     /**
1387      * Compares two CDatas for equality
1388      *
1389      * Examples:
1390      * --------------
1391      * CData item1,item2;
1392      * if (item1 == item2) { }
1393      * --------------
1394      */
1395     override int opEquals(Object o)
1396     {
1397         auto item = toType!( Item)(o);
1398         auto t = cast(CData)item;
1399         return t !is null && content == t.content;
1400     }
1401
1402     /**
1403      * Compares two CDatas by content
1404      *
1405      * You should rarely need to call this function. It exists so that CDatas
1406      * can be used as associative array keys.
1407      * Returns: -1, 0 or 1; 0 is also returned when o is not a CData.
1408      * Examples:
1409      * --------------
1410      * CData item1,item2;
1411      * if (item1 < item2) { }
1412      * --------------
1413      */
1414     override int opCmp(Object o)
1415     {
1416         auto t = cast(CData)toType!( Item)(o);
1417         if (t is null || content == t.content) return 0;
1418         // Return the sign itself; combining it with && would turn -1 into 1
1419         return content < t.content ? -1 : 1;
1420     }
1421
1422     /**
1423      * Returns the hash of a CData
1424      *
1425      * You should rarely need to call this function. It exists so that CDatas
1426      * can be used as associative array keys.
1427      */
1428     override hash_t toHash() { return hash(content); }
1429
1430     /**
1431      * Returns a string representation of this CData section
1432      */
1433     override string toString() { return cdata ~ content ~ "]]>"; }
1434
1435     override bool isEmptyXML() { return false; } /// Returns false always
1436 }
1437
1438 /**
1439  * Class representing a text (aka Parsed Character Data) section
1440  */
1441 class Text : Item
1442 {
1443     private string content;     // stored in encoded form (see the constructor)
1444
1445     /**
1446      * Construct a text (aka PCData) section
1447      *
1448      * Params:
1449      *      content = the text. This function encodes the text before
1450      *      insertion, so it is safe to insert any text
1451      *
1452      * Examples:
1453      * --------------
1454      * auto item = new Text("a < b");
1455      *    // constructs a &lt; b
1456      * --------------
1457      */
1458     this(string content)
1459     {
1460         this.content = encodeStdEntity(content);
1461     }
1462
1463     /**
1464      * Compares two text sections for equality
1465      *
1466      * Examples:
1467      * --------------
1468      * Text item1,item2;
1469      * if (item1 == item2) { }
1470      * --------------
1471      */
1472     override int opEquals(Object o)
1473     {
1474         auto item = toType!( Item)(o);
1475         auto t = cast(Text)item;
1476         return t !is null && content == t.content;
1477     }
1478
1479     /**
1480      * Compares two text sections by (encoded) content
1481      *
1482      * You should rarely need to call this function. It exists so that Texts
1483      * can be used as associative array keys.
1484      * Returns: -1, 0 or 1; 0 is also returned when o is not a Text.
1485      * Examples:
1486      * --------------
1487      * Text item1,item2;
1488      * if (item1 < item2) { }
1489      * --------------
1490      */
1491     override int opCmp(Object o)
1492     {
1493         auto t = cast(Text)toType!( Item)(o);
1494         if (t is null || content == t.content) return 0;
1495         // Return the sign itself; combining it with && would turn -1 into 1
1496         return content < t.content ? -1 : 1;
1497     }
1498
1499     /**
1500      * Returns the hash of a text section
1501      *
1502      * You should rarely need to call this function. It exists so that Texts
1503      * can be used as associative array keys.
1504      */
1505     override hash_t toHash() { return hash(content); }
1506
1507     /**
1508      * Returns a string representation of this Text section
1509      */
1510     override string toString() { return content; }
1511
1512     /**
1513      * Returns true if the content is the empty string
1514      */
1515     override bool isEmptyXML() { return content.length == 0; }
1516 }
1517
1518 /**
1519  * Class representing an XML Instruction section
1520  */
1521 class XMLInstruction : Item
1522 {
1523     private string content;
1524
1525     /**
1526      * Construct an XML Instruction section
1527      *
1528      * Params:
1529      *      content = the body of the instruction segment
1530      *
1531      * Throws: XIException if the segment body is illegal (contains ">")
1532      *
1533      * Examples:
1534      * --------------
1535      * auto item = new XMLInstruction("ATTLIST");
1536      *    // constructs <!ATTLIST>
1537      * --------------
1538      */
1539     this(string content)
1540     {
1541         if (content.indexOf(">") != -1) throw new XIException(content);
1542         this.content = content;
1543     }
1544
1545     /**
1546      * Compares two XML instructions for equality
1547      *
1548      * Examples:
1549      * --------------
1550      * XMLInstruction item1,item2;
1551      * if (item1 == item2) { }
1552      * --------------
1553      */
1554     override int opEquals(Object o)
1555     {
1556         auto item = toType!( Item)(o);
1557         auto t = cast(XMLInstruction)item;
1558         return t !is null && content == t.content;
1559     }
1560
1561     /**
1562      * Compares two XML instructions by content
1563      *
1564      * You should rarely need to call this function. It exists so that
1565      * XMLInstructions can be used as associative array keys.
1566      * Returns: -1, 0 or 1; 0 is also returned when o is not an XMLInstruction.
1567      * Examples:
1568      * --------------
1569      * XMLInstruction item1,item2;
1570      * if (item1 < item2) { }
1571      * --------------
1572      */
1573     override int opCmp(Object o)
1574     {
1575         auto t = cast(XMLInstruction)toType!( Item)(o);
1576         if (t is null || content == t.content) return 0;
1577         // Return the sign itself; combining it with && would turn -1 into 1
1578         return content < t.content ? -1 : 1;
1579     }
1580
1581     /**
1582      * Returns the hash of an XMLInstruction
1583      *
1584      * You should rarely need to call this function. It exists so that
1585      * XMLInstructions can be used as associative array keys.
1586      */
1587     override hash_t toHash() { return hash(content); }
1588
1589     /**
1590      * Returns a string representation of this XMLInstruction
1591      */
1592     override string toString() { return "<!" ~ content ~ ">"; }
1593
1594     override bool isEmptyXML() { return false; } /// Returns false always
1595 }
1596
1597 /**
1598  * Class representing a Processing Instruction section
1599  */
1600 class ProcessingInstruction : Item
1601 {
1602     private string content;
1603
1604     /**
1605      * Construct a Processing Instruction section
1606      *
1607      * Params:
1608      *      content = the body of the instruction segment
1609      *
1610      * Throws: PIException if the segment body is illegal (contains "?>")
1611      *
1612      * Examples:
1613      * --------------
1614      * auto item = new ProcessingInstruction("php");
1615      *    // constructs <?php?>
1616      * --------------
1617      */
1618     this(string content)
1619     {
1620         if (content.indexOf("?>") != -1) throw new PIException(content);
1621         this.content = content;
1622     }
1623
1624     /**
1625      * Compares two processing instructions for equality
1626      *
1627      * Examples:
1628      * --------------
1629      * ProcessingInstruction item1,item2;
1630      * if (item1 == item2) { }
1631      * --------------
1632      */
1633     override int opEquals(Object o)
1634     {
1635         auto item = toType!( Item)(o);
1636         auto t = cast(ProcessingInstruction)item;
1637         return t !is null && content == t.content;
1638     }
1639
1640     /**
1641      * Compares two processing instructions by content
1642      *
1643      * You should rarely need to call this function. It exists so that
1644      * ProcessingInstructions can be used as associative array keys.
1645      * Returns: -1, 0 or 1; 0 is also returned when o is not a ProcessingInstruction.
1646      * Examples:
1647      * --------------
1648      * ProcessingInstruction item1,item2;
1649      * if (item1 < item2) { }
1650      * --------------
1651      */
1652     override int opCmp(Object o)
1653     {
1654         auto t = cast(ProcessingInstruction)toType!( Item)(o);
1655         if (t is null || content == t.content) return 0;
1656         // Return the sign itself; combining it with && would turn -1 into 1
1657         return content < t.content ? -1 : 1;
1658     }
1659
1660     /**
1661      * Returns the hash of a ProcessingInstruction
1662      *
1663      * You should rarely need to call this function. It exists so that
1664      * ProcessingInstructions can be used as associative array keys.
1665      */
1666     override hash_t toHash() { return hash(content); }
1667
1668     /**
1669      * Returns a string representation of this ProcessingInstruction
1670      *
1671      */
1672     override string toString() { return "<?" ~ content ~ "?>"; }
1673
1674     override bool isEmptyXML() { return false; } /// Returns false always
1675 }
1676
1677
1678 /**
1679  * Class for parsing an XML Document.
1680  *
1681  * This is a subclass of ElementParser. Most of the useful functions are
1682  * documented there.
1683  *
1684  * Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
1685  *
1686  * Bugs:
1687  *      Currently only supports UTF documents.
1688  *
1689  *      If there is an encoding attribute in the prolog, it is ignored.
1690  *
1691  */
1692 class DocumentParser : ElementParser
1693 {
1694     string xmlText;     /// The document text; the inherited cursor s points at this field
1695
1696     /**
1697      * Constructs a DocumentParser.
1698      *
1699      * The input to this function MUST be valid XML.
1700      * This is enforced by the function's in contract.
1701      *
1702      * Params:
1703      *      xmlText_ = the entire XML document as text
1704      *
1705      */
1706     this(string xmlText_)
1707     in
1708     {
1709         assert(xmlText_.length != 0);
1710         try
1711         {
1712             // Confirm that the input is valid XML
1713             check(xmlText_);
1714         }
1715         catch (CheckException e)
1716         {
1717             // And if it's not, tell the user why not
1718             assert(false, "\n" ~ e.toString());
1719         }
1720     }
1721     body
1722     {
1723         xmlText = xmlText_;
1724         s = &xmlText;   // must be set before super(), which dereferences s
1725         super();    // Initialize everything
1726         parse();    // Parse through the root tag (but not beyond)
1727     }
1728 }
1729
1730 /**
1731  * Class for parsing an XML element.
1732  *
1733  * Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
1734  *
1735  * Note that you cannot construct instances of this class directly. You can
1736  * construct a DocumentParser (which is a subclass of ElementParser), but
1737  * otherwise, Instances of ElementParser will be created for you by the
1738  * library, and passed your way via onStartTag handlers.
1739  *
1740  */
1741 class ElementParser
1742 {
1743     alias void delegate(string) Handler;
1744     alias void delegate(in Element element) ElementHandler;
1745     alias void delegate(ElementParser parser) ParserHandler;
1746
1747     private
1748     {
1749         Tag tag_;
1750         string elementStart;
1751         string* s;
1752
1753         Handler commentHandler = null;
1754         Handler cdataHandler = null;
1755         Handler xiHandler = null;
1756         Handler piHandler = null;
1757         Handler rawTextHandler = null;
1758         Handler textHandler = null;
1759
1760         // Private constructor for start tags: shares the parent's text cursor
1761         this(ElementParser parent)
1762         {
1763             s = parent.s;       // must be set first: this() dereferences s
1764             this();
1765             tag_ = parent.tag_; // the tag currently held by the parent
1766         }
1767        
1768         // Private constructor for empty tags: takes the tag and text cursor directly
1769         this(Tag tag, string* t)
1770         {
1771             s = t;              // must be set first: this() dereferences s
1772             this();
1773             tag_ = tag;
1774         }
1775     }
1776
1777     /**
1778      * The Tag at the start of the element being parsed. You can read this to
1779      * determine the tag's name and attributes. Returns the stored tag_ field.
1780      */
1781     Tag tag() { return tag_; }
1782
1783     /**
1784      * Register a handler which will be called whenever a start tag is
1785      * encountered which matches the specified name. You can also pass null as
1786      * the name, in which case the handler will be called for any unmatched
1787      * start tag.
1788      *
1789      * Examples:
1790      * --------------
1791      * // Call this function whenever a <podcast> start tag is encountered
1792      * onStartTag["podcast"] = (ElementParser xml)
1793      * {
1794      *     // Your code here
1795      *     //
1796      *     // This is a closure, so code here may reference
1797      *     // variables which are outside of this scope
1798      * };
1799      *
1800      * // call myEpisodeStartHandler (defined elsewhere) whenever an <episode>
1801      * // start tag is encountered
1802      * onStartTag["episode"] = &myEpisodeStartHandler;
1803      *
1804      * // call delegate dg for all other start tags
1805      * onStartTag[null] = dg;
1806      * --------------
1807      *
1808      * This library will supply your function with a new instance of
1809      * ElementParser, which may be used to parse inside the element whose
1810      * start tag was just found, or to identify the tag attributes of the
1811      * element, etc.
1812      *
1813      * Note that your function will be called for both start tags and empty
1814      * tags. That is, we make no distinction between &lt;br&gt;&lt;/br&gt;
1815      * and &lt;br/&gt;.
1816      */
1817     ParserHandler[string] onStartTag;
1818
1819     /**
1820      * Register a handler which will be called whenever an end tag is
1821      * encountered which matches the specified name. You can also pass null as
1822      * the name, in which case the handler will be called for any unmatched
1823      * end tag.
1824      *
1825      * Examples:
1826      * --------------
1827      * // Call this function whenever a </podcast> end tag is encountered
1828      * onEndTag["podcast"] = (in Element e)
1829      * {
1830      *     // Your code here
1831      *     //
1832      *     // This is a closure, so code here may reference
1833      *     // variables which are outside of this scope
1834      * };
1835      *
1836      * // call myEpisodeEndHandler (defined elsewhere) whenever an </episode>
1837      * // end tag is encountered
1838      * onEndTag["episode"] = &myEpisodeEndHandler;
1839      *
1840      * // call delegate dg for all other end tags
1841      * onEndTag[null] = dg;
1842      * --------------
1843      *
1844      * Note that your function will be called for both start tags and empty
1845      * tags. That is, we make no distinction between &lt;br&gt;&lt;/br&gt;
1846      * and &lt;br/&gt;.
1847      */
1848     ElementHandler[string] onEndTag;
1849
1850     protected this()    // callers must have assigned s before invoking this
1851     {
1852         elementStart = *s;  // remember the text at the cursor as the element's start
1853     }
1854
1855     /**
1856      * Register a handler which will be called whenever text is encountered.
1857      *
1858      * Examples:
1859      * --------------
1860      * // Call this function whenever text is encountered
1861      * onText = (string s)
1862      * {
1863      *     // Your code here
1864      *
1865      *     // The passed parameter s will have been decoded by the time you see
1866      *     // it, and so may contain any character.
1867      *     //
1868      *     // This is a closure, so code here may reference
1869      *     // variables which are outside of this scope
1870      * };
1871      * --------------
1872      */
1873     void onText(Handler handler) { textHandler = handler; } // replaces any earlier onText handler
1874
1875     /**
1876      * Register an alternative handler which will be called whenever text
1877      * is encountered. This differs from onText in that onText will decode
1878      * the text, whereas onTextRaw will not. This allows you to make design
1879      * choices, since onText will be more accurate, but slower, while
1880      * onTextRaw will be faster, but less accurate. Of course, you can
1881      * still call decode() within your handler, if you want, but you'd
1882      * probably want to use onTextRaw only in circumstances where you
1883      * know that decoding is unnecessary.
1884      *
1885      * Examples:
1886      * --------------
1887      * // Call this function whenever text is encountered
1888      * onTextRaw = (string s)
1889      * {
1890      *     // Your code here
1891      *
1892      *     // The passed parameter s will NOT have been decoded.
1893      *     //
1894      *     // This is a closure, so code here may reference
1895      *     // variables which are outside of this scope
1896      * };
1897      * --------------
1898      */
1899     void onTextRaw(Handler handler) { rawTextHandler = handler; } // replaces any earlier onTextRaw handler
1900
1901     /**
1902      * Register a handler which will be called whenever a character data
1903      * segment is encountered.
1904      *
1905      * Examples:
1906      * --------------
1907      * // Call this function whenever a CData section is encountered
1908      * onCData = (string s)
1909      * {
1910      *     // Your code here
1911      *
1912      *     // The passed parameter s does not include the opening <![CDATA[
1913      *     // nor closing ]]>
1914      *     //
1915      *     // This is a closure, so code here may reference
1916      *     // variables which are outside of this scope
1917      * };
1918      * --------------
1919      */
1920     void onCData(Handler handler) { cdataHandler = handler; } // replaces any earlier onCData handler
1921
1922     /**
1923      * Register a handler which will be called whenever a comment is
1924      * encountered.
1925      *
1926      * Examples:
1927      * --------------
1928      * // Call this function whenever a comment is encountered
1929      * onComment = (string s)
1930      * {
1931      *     // Your code here
1932      *
1933      *     // The passed parameter s does not include the opening <!-- nor
1934      *     // closing -->
1935      *     //
1936      *     // This is a closure, so code here may reference
1937      *     // variables which are outside of this scope
1938      * };
1939      * --------------
1940      */
1941     void onComment(Handler handler) { commentHandler = handler; } // replaces any earlier onComment handler
1942
1943     /**
1944      * Register a handler which will be called whenever a processing
1945      * instruction is encountered.
1946      *
1947      * Examples:
1948      * --------------
1949      * // Call this function whenever a processing instruction is encountered
1950      * onPI = (string s)
1951      * {
1952      *     // Your code here
1953      *
1954      *     // The passed parameter s does not include the opening <? nor
1955      *     // closing ?>
1956      *     //
1957      *     // This is a closure, so code here may reference
1958      *     // variables which are outside of this scope
1959      * };
1960      * --------------
1961      */
1962     void onPI(Handler handler) { piHandler = handler; } // replaces any earlier onPI handler
1963
1964     /**
1965      * Register a handler which will be called whenever an XML instruction is
1966      * encountered.
1967      *
1968      * Examples:
1969      * --------------
1970      * // Call this function whenever an XML instruction is encountered
1971      * // (Note: XML instructions may only occur preceding the root tag of a
1972      * // document).
1973      * onXI = (string s)
1974      * {
1975      *     // Your code here
1976      *
1977      *     // The passed parameter s does not include the opening <! nor
1978      *     // closing >
1979      *     //
1980      *     // This is a closure, so code here may reference
1981      *     // variables which are outside of this scope
1982      * };
1983      * --------------
1984      */
1985     void onXI(Handler handler) { xiHandler = handler; } // replaces any earlier onXI handler
1986
/**
 * Parse an XML element.
 *
 * Parsing will continue until the end of the current element. Any items
 * encountered for which a handler has been registered will invoke that
 * handler.
 *
 * The input is consumed from the front of *s as each item is recognised:
 * comments, CDATA sections, XML instructions ("<!...>"), processing
 * instructions ("<?...?>"), tags and raw text.
 *
 * Throws: various kinds of XMLException
 */
void parse()
{
    string t;
    Tag root = tag_;

    // Most recent start tag seen for each element name, so that an end tag
    // can be paired with the start tag it closes.
    Tag[string] startTags;
    if (tag_ !is null) startTags[tag_.name] = tag_;

    while(s.length != 0)
    {
        if (startsWith(*s,"<!--"))
        {
            chop(*s,4);
            t = chop(*s,indexOf(*s,"-->"));
            if (commentHandler.funcptr !is null) commentHandler(t);
            chop(*s,3);
        }
        else if (startsWith(*s,"<![CDATA["))
        {
            chop(*s,9);
            t = chop(*s,indexOf(*s,"]]>"));
            if (cdataHandler.funcptr !is null) cdataHandler(t);
            chop(*s,3);
        }
        else if (startsWith(*s,"<!"))
        {
            chop(*s,2);
            t = chop(*s,indexOf(*s,">"));
            if (xiHandler.funcptr !is null) xiHandler(t);
            chop(*s,1);
        }
        else if (startsWith(*s,"<?"))
        {
            chop(*s,2);
            t = chop(*s,indexOf(*s,"?>"));
            if (piHandler.funcptr !is null) piHandler(t);
            chop(*s,2);
        }
        else if (startsWith(*s,"<"))
        {
            tag_ = new Tag(*s,true);
            if (root is null)
                return; // Return to constructor of derived class

            if (tag_.isStart)
            {
                startTags[tag_.name] = tag_;

                auto parser = new ElementParser(this);

                // A handler registered for this tag name wins; otherwise
                // fall back to the handler registered under the null key.
                auto handler = tag_.name in onStartTag;
                if (handler !is null) (*handler)(parser);
                else
                {
                    handler = null in onStartTag;
                    if (handler !is null) (*handler)(parser);
                }
            }
            else if (tag_.isEnd)
            {
                auto startTag = startTags[tag_.name];
                string text;

                // The element's raw text is whatever lies between the end
                // of the start tag and the beginning of the end tag. This
                // relies on both tagString slices referring to the same
                // underlying document buffer.
                char* p = startTag.tagString.ptr
                    + startTag.tagString.length;
                char* q = tag_.tagString.ptr;
                text = p[0..(q-p)];

                auto element = new Element(startTag);
                if (text.length != 0) element ~= new Text(text);

                auto handler = tag_.name in onEndTag;
                if (handler !is null) (*handler)(element);
                else
                {
                    handler = null in onEndTag;
                    if (handler !is null) (*handler)(element);
                }

                // Closing the element this parser was created for ends
                // the parse.
                if (tag_.name == root.name) return;
            }
            else if (tag_.isEmpty)
            {
                // An empty-element tag is reported as a start tag
                // immediately followed by an end tag.
                Tag startTag = new Tag(tag_.name);

                // Carry the attributes over to the synthetic start tag;
                // without this, handlers for <x a="1"/> see no attributes.
                foreach (attrName, attrValue; tag_.attr)
                    startTag.attr[attrName] = attrValue;

                // Handle the pretend start tag
                string s2;
                auto parser = new ElementParser(startTag,&s2);
                auto handler1 = startTag.name in onStartTag;
                if (handler1 !is null) (*handler1)(parser);
                else
                {
                    handler1 = null in onStartTag;
                    if (handler1 !is null) (*handler1)(parser);
                }

                // Handle the pretend end tag
                auto element = new Element(startTag);
                auto handler2 = tag_.name in onEndTag;
                if (handler2 !is null) (*handler2)(element);
                else
                {
                    handler2 = null in onEndTag;
                    if (handler2 !is null) (*handler2)(element);
                }
            }
        }
        else
        {
            // Character data up to the next '<' (or to the end of input).
            // The raw handler, when set, takes precedence and receives
            // the text undecoded.
            t = chop(*s,indexOf(*s,"<"));
            if (rawTextHandler.funcptr !is null)
                rawTextHandler(t);
            else if (textHandler.funcptr !is null)
                textHandler(decode(t,DecodeMode.LOOSE));
        }
    }
}
2112
/**
 * Returns the portion of the element that has been consumed so far, i.e.
 * elementStart minus the text still pending in s.
 */
override string toString()
{
    return elementStart[0..(elementStart.length - s.length)];
}
2121
2122 }
2123
2124 private
2125 {
/**
 * Mixin used by the checkXxx rule functions.
 *
 * It expects a variable named s (the remaining input) to be in scope at
 * the point of mixing in. It remembers the value of s on entry and offers
 * three fail() overloads, each of which rewinds s to that value and throws
 * an Err labelled with msg.
 */
template Check(string msg)
{
    // Value of s when the rule was entered
    string old = s;

    // Fail, recording cause as the nested error
    void fail(Err cause)
    {
        s = old;
        throw new Err(s,msg,cause);
    }

    // Fail with a specific message; the nested error is created before s
    // is rewound, so it carries the position at which the problem was seen
    void fail(string detail)
    {
        Err cause = new Err(s,detail);
        fail(cause);
    }

    // Fail with no nested error
    void fail()
    {
        s = old;
        throw new Err(s,msg);
    }
}
2147
// Misc: a comment, a processing instruction, or whitespace.
void checkMisc(ref string s) // rule 27
{
    mixin Check!("Misc");

    try
    {
        if (s.startsWith("<!--"))
        {
            checkComment(s);
        }
        else if (s.startsWith("<?"))
        {
            checkPI(s);
        }
        else
        {
            checkSpace(s);
        }
    }
    catch(Err cause)
    {
        fail(cause);
    }
}
2160
// Document: prolog, then the root element, then any number of Misc items.
void checkDocument(ref string s) // rule 1
{
    mixin Check!("Document");

    try
    {
        checkProlog(s);
        checkElement(s);
        star!(checkMisc)(s);
    }
    catch(Err cause)
    {
        fail(cause);
    }
}
2172
// Chars: every code point of s must satisfy isChar. On failure the nested
// error is positioned at the first offending character.
void checkChars(ref string s) // rule 2
{
    // TO DO - Fix std.utf stride and decode functions, then use those
    // instead

    mixin Check!("Chars");

    bool found = false;
    size_t where;
    dchar offender;
    foreach(size_t i,dchar d; s)
    {
        if (isChar(d)) continue;
        offender = d;
        where = i;
        found = true;
        break;
    }
    if (!found) return;

    // Move to the bad character so the nested error reports its position
    s = s[where..$];
    fail(format("invalid character: U+%04X",offender));
}
2197
// Whitespace: one or more of space, tab, line feed, carriage return.
void checkSpace(ref string s) // rule 3
{
    mixin Check!("Whitespace");

    munch(s," \t\n\r");

    // Nothing consumed means there was no whitespace at all
    if (s is old)
    {
        fail();
    }
}
2204
/**
 * Name: consumes an XML name from the front of s.
 *
 * The first character must be '_', ':' or a letter; later characters may
 * also be '-', '.', digits, combining characters or extenders.
 *
 * Params:
 *      s = remaining input; advanced past the name on success
 *      name = receives the name that was consumed
 *
 * Throws: Err if s is empty or does not begin with a name start character
 */
void checkName(ref string s, out string name) // rule 5
{
    mixin Check!("Name");

    if (s.length == 0) fail();

    // Default to "the whole input is the name": if the loop below never
    // meets a terminating character, the name extends to the end of s.
    // (Starting from 0 here returned an empty name and consumed nothing.)
    size_t n = s.length;
    foreach(size_t i,dchar c;s)
    {
        if (c == '_' || c == ':' || isLetter(c)) continue;
        if (i == 0) fail();
        if (c == '-' || c == '.' || isDigit(c)
            || isCombiningChar(c) || isExtender(c)) continue;
        n = i;
        break;
    }
    name = s[0..n];
    s = s[n..$];
}
2223
// AttValue: a quoted value containing no '<', in which every '&' begins a
// well-formed reference. The closing quote must match the opening one.
void checkAttValue(ref string s) // rule 10
{
    mixin Check!("AttValue");

    if (s.length == 0) fail();

    char quote = s[0];
    if (quote != '"' && quote != '\'')
        fail("attribute value requires quotes");
    s = s[1..$];

    while (true)
    {
        // Skip everything that is not '<', '&' or the closing quote
        munch(s,"^<&"~quote);
        if (s.length == 0) fail("unterminated attribute value");
        if (s[0] == '<') fail("< found in attribute value");
        if (s[0] == quote) break;

        // Only '&' can be left at this point
        try
        {
            checkReference(s);
        }
        catch(Err cause)
        {
            fail(cause);
        }
    }

    // Drop the closing quote
    s = s[1..$];
}
2243
// CharData: consumes text up to the next '&' or '<' (or the end of input),
// rejecting the sequence "]]>".
void checkCharData(ref string s) // rule 14
{
    mixin Check!("CharData");

    while (s.length != 0 && s[0] != '&' && s[0] != '<')
    {
        if (s.startsWith("]]>"))
        {
            fail("]]> found within char data");
        }
        s = s[1..$];
    }
}
2256
/**
 * Comment: "<!--", text containing no "--", then "-->".
 *
 * Params:
 *      s = remaining input; advanced past the comment on success
 *
 * Throws: Err if the comment is unterminated or contains "--" anywhere
 *      other than in its closing "-->"
 */
void checkComment(ref string s) // rule 15
{
    mixin Check!("Comment");

    try { checkLiteral("<!--",s); } catch(Err e) { fail(e); }

    // The first "--" after the opener must be the start of the closing "-->"
    auto n = s.indexOf("--");
    if (n == -1) fail("unterminated comment");

    // Skip the comment text, keeping everything from "--" onwards.
    // (Keeping s[0..n] instead would discard the terminator and the rest
    // of the document, so the check below could never succeed.)
    s = s[n..$];
    try { checkLiteral("-->",s); } catch(Err e) { fail(e); }
}
2267
// PI: "<?" followed by anything up to and including the next "?>".
void checkPI(ref string s) // rule 16
{
    mixin Check!("PI");

    try
    {
        checkLiteral("<?",s);
        checkEnd("?>",s);
    }
    catch(Err cause)
    {
        fail(cause);
    }
}
2279
// CDSect: the cdata opener followed by anything up to and including the
// next "]]>".
void checkCDSect(ref string s) // rule 18
{
    mixin Check!("CDSect");

    try
    {
        checkLiteral(cdata,s);
        checkEnd("]]>",s);
    }
    catch(Err cause)
    {
        fail(cause);
    }
}
2291
/**
 * Prolog: XMLDecl? Misc* (doctypedecl Misc*)?
 *
 * Params:
 *      s = remaining input; advanced past the prolog
 *
 * Throws: Err, wrapping any failure raised by the rules applied here
 */
void checkProlog(ref string s) // rule 22
{
    mixin Check!("Prolog");

    try
    {
        // The XML declaration is optional in the grammar, so a document
        // that starts directly with its root element is well formed.
        opt!(checkXMLDecl)(s);
        star!(checkMisc)(s);
        opt!(seq!(checkDocTypeDecl,star!(checkMisc)))(s);
    }
    catch(Err e) { fail(e); }
}
2304
// XMLDecl: "<?xml", version info, optional encoding declaration, optional
// standalone declaration, optional whitespace, "?>".
void checkXMLDecl(ref string s) // rule 23
{
    mixin Check!("XMLDecl");

    try
    {
        checkLiteral("<?xml",s);
        checkVersionInfo(s);
        opt!(checkEncodingDecl)(s);
        opt!(checkSDDecl)(s);
        opt!(checkSpace)(s);
        checkLiteral("?>",s);
    }
    catch(Err cause)
    {
        fail(cause);
    }
}
2320
// VersionInfo: whitespace, "version", '=', then a quoted version number.
void checkVersionInfo(ref string s) // rule 24
{
    mixin Check!("VersionInfo");

    try
    {
        checkSpace(s);
        checkLiteral("version",s);
        checkEq(s);
        quoted!(checkVersionNum)(s);
    }
    catch(Err cause)
    {
        fail(cause);
    }
}
2334
// Eq: '=' with optional whitespace on either side.
void checkEq(ref string s) // rule 25
{
    mixin Check!("Eq");

    try
    {
        opt!(checkSpace)(s);
        checkLiteral("=",s);
        opt!(checkSpace)(s);
    }
    catch(Err cause)
    {
        fail(cause);
    }
}
2347
// VersionNum: one or more of letters, digits, '_', '.', ':' and '-'.
void checkVersionNum(ref string s) // rule 26
{
    mixin Check!("VersionNum");

    munch(s,"a-zA-Z0-9_.:-");

    // Nothing consumed means there was no version number
    if (s is old)
    {
        fail();
    }
}
2355
// DocTypeDecl: "<!DOCTYPE" followed by anything up to and including the
// next '>'. The declaration's contents are not examined.
void checkDocTypeDecl(ref string s) // rule 28
{
    mixin Check!("DocTypeDecl");

    try
    {
        checkLiteral("<!DOCTYPE",s);

        // TO DO -- ensure DOCTYPE is well formed
        // (But not yet. That's one of our "future directions")

        checkEnd(">",s);
    }
    catch(Err cause)
    {
        fail(cause);
    }
}
2371
// SDDecl: whitespace, "standalone", '=', then yes or no in single or
// double quotes.
void checkSDDecl(ref string s) // rule 32
{
    mixin Check!("SDDecl");

    try
    {
        checkSpace(s);
        checkLiteral("standalone",s);
        checkEq(s);
    }
    catch(Err cause)
    {
        fail(cause);
    }

    if (s.startsWith("'yes'") || s.startsWith("\"yes\""))
    {
        s = s[5..$];
    }
    else if (s.startsWith("'no'") || s.startsWith("\"no\""))
    {
        s = s[4..$];
    }
    else
    {
        fail("standalone attribute value must be 'yes', \"yes\","
            ~ " 'no' or \"no\"");
    }
}
2391
// Element: either an empty-element tag, or a start tag, content, and an
// end tag whose name equals the start tag's name.
void checkElement(ref string s) // rule 39
{
    mixin Check!("Element");

    string startName, endName, kind;
    try
    {
        checkTag(s,kind,startName);
    }
    catch(Err cause)
    {
        fail(cause);
    }

    // Anything other than a start tag is complete already
    if (kind != "STag") return;

    string atEndTag;
    try
    {
        checkContent(s);
        atEndTag = s;
        checkETag(s,endName);
    }
    catch(Err cause)
    {
        fail(cause);
    }

    if (startName == endName) return;

    // Report the mismatch at the position of the end tag
    s = atEndTag;
    fail("end tag name \"" ~ endName
        ~ "\" differs from start tag name \""~startName~"\"");
}
2417
// rules 40 and 44
// Consumes "<", a name, attributes and ">" or "/>". type is set to "STag"
// for a plain start tag and to "ETag" when the tag ends in "/>".
void checkTag(ref string s, out string type, out string name)
{
    mixin Check!("Tag");

    try
    {
        type = "STag";
        checkLiteral("<",s);
        checkName(s,name);
        star!(seq!(checkSpace,checkAttribute))(s);
        opt!(checkSpace)(s);

        bool selfClosing = (s.length != 0 && s[0] == '/');
        if (selfClosing)
        {
            type = "ETag";
            s = s[1..$];
        }

        checkLiteral(">",s);
    }
    catch(Err cause)
    {
        fail(cause);
    }
}
2439
// Attribute: a name, '=', and a quoted attribute value.
void checkAttribute(ref string s) // rule 41
{
    mixin Check!("Attribute");

    try
    {
        string ignoredName;
        checkName(s,ignoredName);
        checkEq(s);
        checkAttValue(s);
    }
    catch(Err cause)
    {
        fail(cause);
    }
}
2453
// ETag: "</", a name (returned through name), optional whitespace, ">".
void checkETag(ref string s, out string name) // rule 42
{
    mixin Check!("ETag");

    try
    {
        checkLiteral("</",s);
        checkName(s,name);
        opt!(checkSpace)(s);
        checkLiteral(">",s);
    }
    catch(Err cause)
    {
        fail(cause);
    }
}
2467
// Content: a run of references, comments, processing instructions, CDATA
// sections, child elements and character data, ending at "</" or at the
// end of input.
void checkContent(ref string s) // rule 43
{
    mixin Check!("Content");

    try
    {
        while (s.length != 0)
        {
            // Rewind point moves forward with every item consumed
            old = s;

            if (s.startsWith("&"))
                checkReference(s);
            else if (s.startsWith("<!--"))
                checkComment(s);
            else if (s.startsWith("<?"))
                checkPI(s);
            else if (s.startsWith(cdata))
                checkCDSect(s);
            else if (s.startsWith("</"))
                break;
            else if (s.startsWith("<"))
                checkElement(s);
            else
                checkCharData(s);
        }
    }
    catch(Err cause)
    {
        fail(cause);
    }
}
2488
// CharRef: "&#" decimal-digits ";" or "&#x" hex-digits ";". The referenced
// code point is returned through c and must satisfy isChar.
void checkCharRef(ref string s, out dchar c) // rule 66
{
    mixin Check!("CharRef");

    c = 0;
    try
    {
        checkLiteral("&#",s);
    }
    catch(Err cause)
    {
        fail(cause);
    }

    int radix = 10;
    if (s.length != 0 && s[0] == 'x')
    {
        radix = 16;
        s = s[1..$];
    }

    if (s.length == 0) fail("unterminated character reference");
    if (s[0] == ';')
        fail("character reference must have at least one digit");

    while (s.length != 0)
    {
        char d = s[0];

        // Numeric value of d; 100 marks "not a digit in any radix"
        int value;
        if (d >= '0' && d <= '9')      value = d - '0';
        else if (d >= 'A' && d <= 'F') value = d - 'A' + 10;
        else if (d >= 'a' && d <= 'f') value = d - 'a' + 10;
        else                           value = 100;

        // Stop at the first character that is not a digit of this radix
        if (value >= radix) break;

        c *= radix;
        c += value;
        s = s[1..$];
    }

    if (!isChar(c)) fail(format("U+%04X is not a legal character",c));

    if (s.length != 0 && s[0] == ';')
        s = s[1..$];
    else
        fail("expected ;");
}
2537
// Reference: a character reference when the input starts with "&#",
// otherwise an entity reference.
void checkReference(ref string s) // rule 67
{
    mixin Check!("Reference");

    try
    {
        if (s.startsWith("&#"))
        {
            dchar ignored;
            checkCharRef(s,ignored);
        }
        else
        {
            checkEntityRef(s);
        }
    }
    catch(Err cause)
    {
        fail(cause);
    }
}
2550
// EntityRef: '&', a name, ';'.
void checkEntityRef(ref string s) // rule 68
{
    mixin Check!("EntityRef");

    try
    {
        string ignoredName;
        checkLiteral("&",s);
        checkName(s,ignoredName);
        checkLiteral(";",s);
    }
    catch(Err cause)
    {
        fail(cause);
    }
}
2564
// EncName: at least one ASCII letter, followed by any number of letters,
// digits, '_', '.' and '-'.
void checkEncName(ref string s) // rule 81
{
    mixin Check!("EncName");

    // Leading letters; at least one is required
    munch(s,"a-zA-Z");
    if (s is old)
    {
        fail();
    }

    // Remainder of the name
    munch(s,"a-zA-Z0-9_.-");
}
2573
// EncodingDecl: whitespace, "encoding", '=', then a quoted encoding name.
void checkEncodingDecl(ref string s) // rule 80
{
    mixin Check!("EncodingDecl");

    try
    {
        checkSpace(s);
        checkLiteral("encoding",s);
        checkEq(s);
        quoted!(checkEncName)(s);
    }
    catch(Err cause)
    {
        fail(cause);
    }
}
2587
2588     // Helper functions
2589
// Requires s to begin with literal and consumes it.
void checkLiteral(string literal,ref string s)
{
    mixin Check!("Literal");

    if (s.startsWith(literal))
    {
        s = s[literal.length..$];
    }
    else
    {
        fail("Expected literal \""~literal~"\"");
    }
}
2597
// Skips forward to the first occurrence of end and consumes it; throws an
// Err positioned at the current input if end does not occur.
void checkEnd(string end,ref string s)
{
    // Deliberately no mixin Check here.

    auto at = s.indexOf(end);
    if (at == -1)
    {
        throw new Err(s,"Unable to find terminating \""~end~"\"");
    }

    s = s[at..$];
    checkLiteral(end,s);
}
2607
2608     // Metafunctions -- none of these use mixin Check
2609
// Optional item: applies f and ignores an Err thrown by it.
void opt(alias f)(ref string s)
{
    try
    {
        f(s);
    }
    catch(Err ignored)
    {
        // f did not match; that is acceptable for an optional item
    }
}
2614
// One or more: f must succeed once, then is repeated as by star.
void plus(alias f)(ref string s)
{
    seq!(f,star!(f))(s);
}
2620
// Zero or more: applies f repeatedly until it throws an Err or the input
// is exhausted.
void star(alias f)(ref string s)
{
    try
    {
        while (s.length != 0)
        {
            f(s);
        }
    }
    catch(Err ignored)
    {
        // First failure ends the repetition
    }
}
2629
// Applies f to a value enclosed in matching quotes: single quotes when the
// input starts with one, double quotes otherwise.
void quoted(alias f)(ref string s)
{
    string quote = s.startsWith("'") ? "'" : "\"";

    checkLiteral(quote,s);
    f(s);
    checkLiteral(quote,s);
}
2645
// Sequence: applies f, then g, to the same input.
void seq(alias f,alias g)(ref string s)
{
    f(s);

    g(s);
}
2651 }
2652
/**
 * Check an entire XML document for well-formedness
 *
 * Params:
 *      s = the document to be checked, passed as a string
 *
 * Throws: CheckException if the document is not well formed
 *
 * CheckException's toString() method will yield the complete hierarchy of
 * parse failure (the XML equivalent of a stack trace), giving the line and
 * column number of every failure at every level.
 */
void check(string s)
{
    // The checkXxx functions advance s as they consume input, so keep the
    // untouched document for computing line and column numbers. Using s
    // itself would place "Junk found after document" at line 1, column 1.
    string entire = s;

    try
    {
        checkChars(s);
        checkDocument(s);
        if (s.length != 0) throw new Err(s,"Junk found after document");
    }
    catch(Err e)
    {
        e.complete(entire);
        throw e;
    }
}
2679
// check() must reject a document whose <genre> element is closed by
// </genres>, and the report must name both tags.
unittest
{
    string document = `<?xml version="1.0"?>
        <catalog>
           <book id="bk101">
              <author>Gambardella, Matthew</author>
              <title>XML Developer's Guide</title>
              <genre>Computer</genre>
              <price>44.95</price>
              <publish_date>2000-10-01</publish_date>
              <description>An in-depth look at creating applications
              with XML.</description>
           </book>
           <book id="bk102">
              <author>Ralls, Kim</author>
              <title>Midnight Rain</title>
              <genre>Fantasy</genres>
              <price>5.95</price>
              <publish_date>2000-12-16</publish_date>
              <description>A former architect battles corporate zombies,
              an evil sorceress, and her own childhood to become queen
              of the world.</description>
           </book>
           <book id="bk103">
              <author>Corets, Eva</author>
              <title>Maeve Ascendant</title>
              <genre>Fantasy</genre>
              <price>5.95</price>
              <publish_date>2000-11-17</publish_date>
              <description>After the collapse of a nanotechnology
              society in England, the young survivors lay the
              foundation for a new society.</description>
           </book>
        </catalog>`;

    bool rejected = false;
    try
    {
        check(document);
    }
    catch(CheckException e)
    {
        rejected = true;
        auto found = e.toString().indexOf("end tag name \"genres\" differs"
            ~ " from start tag name \"genre\"");
        assert(found != -1);
    }

    // Reaching this point without an exception means check() accepted
    // the malformed document
    assert(rejected);
}
2725
/** The base class for exceptions thrown by this module */
class XMLException : Exception
{
    this(string msg)
    {
        super(msg);
    }
}
2728
2729 // Other exceptions
2730
/// Thrown during Comment constructor
class CommentException : XMLException
{
    private this(string msg)
    {
        super(msg);
    }
}
2734
/// Thrown during CData constructor
class CDataException : XMLException
{
    private this(string msg)
    {
        super(msg);
    }
}
2738
/// Thrown during XMLInstruction constructor
class XIException : XMLException
{
    private this(string msg)
    {
        super(msg);
    }
}
2742
/// Thrown during ProcessingInstruction constructor
class PIException : XMLException
{
    private this(string msg)
    {
        super(msg);
    }
}
2746
/// Thrown during Text constructor
class TextException : XMLException
{
    private this(string msg)
    {
        super(msg);
    }
}
2750
/// Thrown during decode()
class DecodeException : XMLException
{
    private this(string msg)
    {
        super(msg);
    }
}
2754
/// Thrown if comparing with wrong type
class InvalidTypeException : XMLException
{
    private this(string msg)
    {
        super(msg);
    }
}
2758
/// Thrown when parsing for Tags
class TagException : XMLException
{
    private this(string msg)
    {
        super(msg);
    }
}
2762
/**
 * Thrown during check()
 *
 * Instances are chained through err, one per grammar rule that failed.
 */
class CheckException : XMLException
{
    CheckException err; /// Parent in hierarchy
    private string tail; // Input remaining at the point of failure
    /**
     * Name of production rule which failed to parse,
     * or specific error message
     */
    string msg;
    uint line = 0; /// Line number at which parse failure occurred
    uint column = 0; /// Column number at which parse failure occurred

    private this(string tail,string msg,Err err=null)
    {
        super(null);
        this.err = err;
        this.msg = msg;
        this.tail = tail;
    }

    // Fills in line and column for this exception and for every exception
    // chained through err. entire is the complete document; tail is taken
    // to be its final tail.length characters.
    private void complete(string entire)
    {
        // Everything that precedes the failure position
        string consumed = entire[0..$-tail.length];

        // Offset just past the last newline, i.e. start of the failing line
        auto lineStart = consumed.lastIndexOf('\n') + 1;
        line = consumed.count("\n") + 1;

        // Transcode to dchar so the column counts characters, not bytes
        dstring currentLine;
        transcode(consumed[lineStart..$],currentLine);
        column = currentLine.length + 1;

        if (err !is null)
        {
            err.complete(entire);
        }
    }

    // One line of text per exception in the chain; the text of the chained
    // exception (err) is placed before this one's own line.
    override string toString()
    {
        string own;
        if (line != 0)
        {
            own = format("Line %d, column %d: ",line,column);
        }
        own ~= msg;
        own ~= '\n';

        if (err is null) return own;
        return err.toString() ~ own;
    }
}
2807
2808 private alias CheckException Err;
2809
2810 // Private helper functions
2811
2812 private
2813 {
// Casts o to T, throwing InvalidTypeException when the cast yields null.
T toType(T)(Object o)
{
    T converted = cast(T)(o);
    if (converted !is null)
    {
        return converted;
    }

    throw new InvalidTypeException("Attempt to compare a "
        ~ T.stringof ~ " with an instance of another type");
}
2824
// Removes the first n characters from s and returns them. An n of -1
// means "everything", leaving s empty.
string chop(ref string s, int n)
{
    size_t cut = (n == -1) ? s.length : n;

    string head = s[0..cut];
    s = s[cut..$];
    return head;
}
2832
// If s starts with c, consumes it and returns true; otherwise leaves s
// untouched and returns false.
bool optc(ref string s, char c)
{
    if (s.length == 0 || s[0] != c)
    {
        return false;
    }

    s = s[1..$];
    return true;
}
2839
// Consumes the required character c from the front of s, throwing a
// TagException (with an empty message) when s does not start with it.
void reqc(ref string s, char c)
{
    bool present = (s.length != 0 && s[0] == c);
    if (!present)
    {
        throw new TagException("");
    }

    s = s[1..$];
}
2845
// Folds the code points of s into h using h = h * 11 + c and returns the
// result.
hash_t hash(string s,hash_t h=0)
{
    hash_t acc = h;
    foreach(dchar c;s)
    {
        acc = acc * 11 + c;
    }
    return acc;
}
2851
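2852     // Definitions from the XML specification. Each table is a flat list of inclusive (first,last) code point pairs; lookup() binary-searches the pairs, so they must be listed in ascending order.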
2853      dchar[] CharTable=[0x9,0x9,0xA,0xA,0xD,0xD,0x20,0xD7FF,0xE000,0xFFFD,
2854         0x10000,0x10FFFF];
2855      dchar[] BaseCharTable=[0x0041,0x005A,0x0061,0x007A,0x00C0,0x00D6,0x00D8,
2856         0x00F6,0x00F8,0x00FF,0x0100,0x0131,0x0134,0x013E,0x0141,0x0148,0x014A,
2857         0x017E,0x0180,0x01C3,0x01CD,0x01F0,0x01F4,0x01F5,0x01FA,0x0217,0x0250,
2858         0x02A8,0x02BB,0x02C1,0x0386,0x0386,0x0388,0x038A,0x038C,0x038C,0x038E,
2859         0x03A1,0x03A3,0x03CE,0x03D0,0x03D6,0x03DA,0x03DA,0x03DC,0x03DC,0x03DE,
2860         0x03DE,0x03E0,0x03E0,0x03E2,0x03F3,0x0401,0x040C,0x040E,0x044F,0x0451,
2861         0x045C,0x045E,0x0481,0x0490,0x04C4,0x04C7,0x04C8,0x04CB,0x04CC,0x04D0,
2862         0x04EB,0x04EE,0x04F5,0x04F8,0x04F9,0x0531,0x0556,0x0559,0x0559,0x0561,
2863         0x0586,0x05D0,0x05EA,0x05F0,0x05F2,0x0621,0x063A,0x0641,0x064A,0x0671,
2864         0x06B7,0x06BA,0x06BE,0x06C0,0x06CE,0x06D0,0x06D3,0x06D5,0x06D5,0x06E5,
2865         0x06E6,0x0905,0x0939,0x093D,0x093D,0x0958,0x0961,0x0985,0x098C,0x098F,
2866         0x0990,0x0993,0x09A8,0x09AA,0x09B0,0x09B2,0x09B2,0x09B6,0x09B9,0x09DC,
2867         0x09DD,0x09DF,0x09E1,0x09F0,0x09F1,0x0A05,0x0A0A,0x0A0F,0x0A10,0x0A13,
2868         0x0A28,0x0A2A,0x0A30,0x0A32,0x0A33,0x0A35,0x0A36,0x0A38,0x0A39,0x0A59,
2869         0x0A5C,0x0A5E,0x0A5E,0x0A72,0x0A74,0x0A85,0x0A8B,0x0A8D,0x0A8D,0x0A8F,
2870         0x0A91,0x0A93,0x0AA8,0x0AAA,0x0AB0,0x0AB2,0x0AB3,0x0AB5,0x0AB9,0x0ABD,
2871         0x0ABD,0x0AE0,0x0AE0,0x0B05,0x0B0C,0x0B0F,0x0B10,0x0B13,0x0B28,0x0B2A,
2872         0x0B30,0x0B32,0x0B33,0x0B36,0x0B39,0x0B3D,0x0B3D,0x0B5C,0x0B5D,0x0B5F,
2873         0x0B61,0x0B85,0x0B8A,0x0B8E,0x0B90,0x0B92,0x0B95,0x0B99,0x0B9A,0x0B9C,
2874         0x0B9C,0x0B9E,0x0B9F,0x0BA3,0x0BA4,0x0BA8,0x0BAA,0x0BAE,0x0BB5,0x0BB7,
2875         0x0BB9,0x0C05,0x0C0C,0x0C0E,0x0C10,0x0C12,0x0C28,0x0C2A,0x0C33,0x0C35,
2876         0x0C39,0x0C60,0x0C61,0x0C85,0x0C8C,0x0C8E,0x0C90,0x0C92,0x0CA8,0x0CAA,
2877         0x0CB3,0x0CB5,0x0CB9,0x0CDE,0x0CDE,0x0CE0,0x0CE1,0x0D05,0x0D0C,0x0D0E,
2878         0x0D10,0x0D12,0x0D28,0x0D2A,0x0D39,0x0D60,0x0D61,0x0E01,0x0E2E,0x0E30,
2879         0x0E30,0x0E32,0x0E33,0x0E40,0x0E45,0x0E81,0x0E82,0x0E84,0x0E84,0x0E87,
2880         0x0E88,0x0E8A,0x0E8A,0x0E8D,0x0E8D,0x0E94,0x0E97,0x0E99,0x0E9F,0x0EA1,
2881         0x0EA3,0x0EA5,0x0EA5,0x0EA7,0x0EA7,0x0EAA,0x0EAB,0x0EAD,0x0EAE,0x0EB0,
2882         0x0EB0,0x0EB2,0x0EB3,0x0EBD,0x0EBD,0x0EC0,0x0EC4,0x0F40,0x0F47,0x0F49,
2883         0x0F69,0x10A0,0x10C5,0x10D0,0x10F6,0x1100,0x1100,0x1102,0x1103,0x1105,
2884         0x1107,0x1109,0x1109,0x110B,0x110C,0x110E,0x1112,0x113C,0x113C,0x113E,
2885         0x113E,0x1140,0x1140,0x114C,0x114C,0x114E,0x114E,0x1150,0x1150,0x1154,
2886         0x1155,0x1159,0x1159,0x115F,0x1161,0x1163,0x1163,0x1165,0x1165,0x1167,
2887         0x1167,0x1169,0x1169,0x116D,0x116E,0x1172,0x1173,0x1175,0x1175,0x119E,
2888         0x119E,0x11A8,0x11A8,0x11AB,0x11AB,0x11AE,0x11AF,0x11B7,0x11B8,0x11BA,
2889         0x11BA,0x11BC,0x11C2,0x11EB,0x11EB,0x11F0,0x11F0,0x11F9,0x11F9,0x1E00,
2890         0x1E9B,0x1EA0,0x1EF9,0x1F00,0x1F15,0x1F18,0x1F1D,0x1F20,0x1F45,0x1F48,
2891         0x1F4D,0x1F50,0x1F57,0x1F59,0x1F59,0x1F5B,0x1F5B,0x1F5D,0x1F5D,0x1F5F,
2892         0x1F7D,0x1F80,0x1FB4,0x1FB6,0x1FBC,0x1FBE,0x1FBE,0x1FC2,0x1FC4,0x1FC6,
2893         0x1FCC,0x1FD0,0x1FD3,0x1FD6,0x1FDB,0x1FE0,0x1FEC,0x1FF2,0x1FF4,0x1FF6,
2894         0x1FFC,0x2126,0x2126,0x212A,0x212B,0x212E,0x212E,0x2180,0x2182,0x3041,
2895         0x3094,0x30A1,0x30FA,0x3105,0x312C,0xAC00,0xD7A3];
// Ideographic ranges as inclusive (first,last) pairs. The pairs are listed
// in ascending order because lookup() binary-searches the table; with
// 0x4E00-0x9FA5 listed first, that range could never be found.
dchar[] IdeographicTable=[0x3007,0x3007,0x3021,0x3029,0x4E00,0x9FA5];
2897     dchar[] CombiningCharTable=[0x0300,0x0345,0x0360,0x0361,0x0483,0x0486,
2898         0x0591,0x05A1,0x05A3,0x05B9,0x05BB,0x05BD,0x05BF,0x05BF,0x05C1,0x05C2,
2899         0x05C4,0x05C4,0x064B,0x0652,0x0670,0x0670,0x06D6,0x06DC,0x06DD,0x06DF,
2900         0x06E0,0x06E4,0x06E7,0x06E8,0x06EA,0x06ED,0x0901,0x0903,0x093C,0x093C,
2901         0x093E,0x094C,0x094D,0x094D,0x0951,0x0954,0x0962,0x0963,0x0981,0x0983,
2902         0x09BC,0x09BC,0x09BE,0x09BE,0x09BF,0x09BF,0x09C0,0x09C4,0x09C7,0x09C8,
2903         0x09CB,0x09CD,0x09D7,0x09D7,0x09E2,0x09E3,0x0A02,0x0A02,0x0A3C,0x0A3C,
2904         0x0A3E,0x0A3E,0x0A3F,0x0A3F,0x0A40,0x0A42,0x0A47,0x0A48,0x0A4B,0x0A4D,
2905         0x0A70,0x0A71,0x0A81,0x0A83,0x0ABC,0x0ABC,0x0ABE,0x0AC5,0x0AC7,0x0AC9,
2906         0x0ACB,0x0ACD,0x0B01,0x0B03,0x0B3C,0x0B3C,0x0B3E,0x0B43,0x0B47,0x0B48,
2907         0x0B4B,0x0B4D,0x0B56,0x0B57,0x0B82,0x0B83,0x0BBE,0x0BC2,0x0BC6,0x0BC8,
2908         0x0BCA,0x0BCD,0x0BD7,0x0BD7,0x0C01,0x0C03,0x0C3E,0x0C44,0x0C46,0x0C48,
2909         0x0C4A,0x0C4D,0x0C55,0x0C56,0x0C82,0x0C83,0x0CBE,0x0CC4,0x0CC6,0x0CC8,
2910         0x0CCA,0x0CCD,0x0CD5,0x0CD6,0x0D02,0x0D03,0x0D3E,0x0D43,0x0D46,0x0D48,
2911         0x0D4A,0x0D4D,0x0D57,0x0D57,0x0E31,0x0E31,0x0E34,0x0E3A,0x0E47,0x0E4E,
2912         0x0EB1,0x0EB1,0x0EB4,0x0EB9,0x0EBB,0x0EBC,0x0EC8,0x0ECD,0x0F18,0x0F19,
2913         0x0F35,0x0F35,0x0F37,0x0F37,0x0F39,0x0F39,0x0F3E,0x0F3E,0x0F3F,0x0F3F,
2914         0x0F71,0x0F84,0x0F86,0x0F8B,0x0F90,0x0F95,0x0F97,0x0F97,0x0F99,0x0FAD,
2915         0x0FB1,0x0FB7,0x0FB9,0x0FB9,0x20D0,0x20DC,0x20E1,0x20E1,0x302A,0x302F,
2916         0x3099,0x3099,0x309A,0x309A];
2917     dchar[] DigitTable=[0x0030,0x0039,0x0660,0x0669,0x06F0,0x06F9,0x0966,
2918         0x096F,0x09E6,0x09EF,0x0A66,0x0A6F,0x0AE6,0x0AEF,0x0B66,0x0B6F,0x0BE7,
2919         0x0BEF,0x0C66,0x0C6F,0x0CE6,0x0CEF,0x0D66,0x0D6F,0x0E50,0x0E59,0x0ED0,
2920         0x0ED9,0x0F20,0x0F29];
2921     dchar[] ExtenderTable=[0x00B7,0x00B7,0x02D0,0x02D0,0x02D1,0x02D1,0x0387,
2922         0x0387,0x0640,0x0640,0x0E46,0x0E46,0x0EC6,0x0EC6,0x3005,0x3005,0x3031,
2923         0x3035,0x309D,0x309E,0x30FC,0x30FE];
2924
// Binary search over a table of inclusive (first,last) pairs stored flat:
// returns true when c lies within one of the pairs. The search narrows the
// half-open index window [lo,hi), always probing at an even index so that
// table[m] and table[m+1] form one pair.
bool lookup(dchar[] table, int c)
{
    size_t lo = 0;
    size_t hi = table.length;

    while (lo != hi)
    {
        size_t m = lo + (((hi - lo) >> 1) & ~cast(size_t)1);

        if (c < table[m])
        {
            hi = m;
        }
        else if (c > table[m+1])
        {
            lo = m + 2;
        }
        else
        {
            return true;
        }
    }
    return false;
}
2942
// Builds a short preview of s: bytes below 0x20 or above 0x7F become '.',
// and once 40 characters have been produced "___" is appended and the
// rest of s is dropped.
string startOf(string s)
{
    string preview;
    foreach(char c;s)
    {
        if (c < 0x20 || c > 0x7F)
            preview ~= '.';
        else
            preview ~= c;

        if (preview.length < 40) continue;

        preview ~= "___";
        break;
    }
    return preview;
}
2953
// Throws an XMLException carrying s as its message; never returns.
void exit(string s=null)
{
    auto failure = new XMLException(s);
    throw failure;
}
2958 }
2959
version (unittest_report)
{
    import std.stdio;

    // Prints a confirmation line when this unittest block is run
    unittest
    {
        writefln("unittest std2.xml passed");
    }
}