WikiStart: xml.d

File xml.d, 83.5 kB (added by y0uf00bar, 15 years ago)
std2.xml

Line
1	// Written in the D programming language.
2
3	module std2.xml;
4	import std2.encoding;
5
6	import std.string;
7
8	private alias find indexOf;
9	private alias rfind lastIndexOf;
10
11
12
13	/**
14	Classes and functions for creating and parsing XML
15
16	The basic architecture of this module is that there are standalone functions,
17	classes for constructing an XML document from scratch (Tag, Element and
18	Document), and also classes for parsing a pre-existing XML file (ElementParser
19	and DocumentParser). The parsing classes <i>may</i> be used to build a
20	Document, but that is not their primary purpose. The handling capabilities of
21	DocumentParser and ElementParser are sufficiently customizable that you can
22	make them do pretty much whatever you want.
23
24	Authors: Janice Caron
25
26	Date: 2008.02.12 - 2008.05.07
27
28	License: Public Domain
29
30	Example: This example creates a DOM (Document Object Model) tree
31	from an XML file.
32	------------------------------------------------------------------------------
33	import std2.xml;
34	import std.stdio;
35	import std.string;
36
37	// books.xml is used in various samples throughout the Microsoft XML Core
38	// Services (MSXML) SDK.
39	//
40	// See http://msdn2.microsoft.com/en-us/library/ms762271(VS.85).aspx
41
42	void main()
43	{
44	string s = cast(string)std.file.read("books.xml");
45
46	// Check for well-formedness
47	check(s);
48
49	// Make a DOM tree
50	auto doc = new Document(s);
51
52	// Plain-print it
53	writefln(doc);
54	}
55	------------------------------------------------------------------------------
56
57	Example: This example does much the same thing, except that the file is
58	deconstructed and reconstructed by hand. This is more work, but the
59	techniques involved offer vastly more power.
60	------------------------------------------------------------------------------
61	import std2.xml;
62	import std.stdio;
63	import std.string;
64
65	struct Book
66	{
67	string id;
68	string author;
69	string title;
70	string genre;
71	string price;
72	string pubDate;
73	string description;
74	}
75
76	void main()
77	{
78	string s = cast(string)std.file.read("books.xml");
79
80	// Check for well-formedness
81	check(s);
82
83	// Take it apart
84	Book[] books;
85
86	auto xml = new DocumentParser(s);
87	xml.onStartTag["book"] = (ElementParser xml)
88	{
89	Book book;
90	book.id = xml.tag.attr["id"];
91
92	xml.onEndTag["author"] = (in Element e) { book.author = e.text; };
93	xml.onEndTag["title"] = (in Element e) { book.title = e.text; };
94	xml.onEndTag["genre"] = (in Element e) { book.genre = e.text; };
95	xml.onEndTag["price"] = (in Element e) { book.price = e.text; };
96	xml.onEndTag["publish-date"] = (in Element e) { book.pubDate = e.text; };
97	xml.onEndTag["description"] = (in Element e) { book.description = e.text; };
98
99	xml.parse();
100
101	books ~= book;
102	};
103	xml.parse();
104
105	// Put it back together again;
106	auto doc = new Document(new Tag("catalog"));
107	foreach(book;books)
108	{
109	auto element = new Element("book");
110	element.tag.attr["id"] = book.id;
111
112	element ~= new Element("author", book.author);
113	element ~= new Element("title", book.title);
114	element ~= new Element("genre", book.genre);
115	element ~= new Element("price", book.price);
116	element ~= new Element("publish-date",book.pubDate);
117	element ~= new Element("description", book.description);
118
119	doc ~= element;
120	}
121
122	// Pretty-print it
123	writefln(join(doc.pretty(3),"\n"));
124	}
125	-------------------------------------------------------------------------------
126	* Macros:
127	* WIKI=Phobos/StdXml
128	*/
129
/**
 * Abstract base class shared by every kind of XML item.
 *
 * Subclasses must supply equality, ordering, hashing, string conversion and
 * the emptiness test; only pretty() has a default implementation.
 */
abstract class Item
{
    /// Tests this item for equality with another Item of the same type
    abstract override int opEquals(Object o);

    /// Orders this item relative to another Item of the same type
    abstract override int opCmp(Object o);

    /// Computes the hash of this item
    abstract override hash_t toHash();

    /// Produces the string representation of this item
    abstract override string toString();

    /**
     * Produces the item as a list of output lines.
     *
     * This default implementation strips surrounding whitespace from
     * toString() and yields the result as a single line, or no lines at all
     * when nothing remains after stripping.
     *
     * Params:
     *      indent = number of spaces by which to indent child elements
     *               (not used by this default implementation)
     */
    string[] pretty(uint indent)
    {
        string trimmed = strip(toString());
        return trimmed.length != 0 ? [ trimmed ] : [];
    }

    /// Reports whether the item represents empty XML text
    abstract bool isEmptyXML();
}
162
163
/// The character sequence that opens a CDATA section.
string cdata = "<![CDATA[";

/**
 * Tests whether a code point is a character according to the XML standard.
 *
 * The decision is delegated to a range lookup in CharTable.
 *
 * Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
 *
 * Params:
 *      c = the character to be tested
 *
 * Returns: true if c is accepted by the lookup in CharTable
 */
bool isChar(dchar c) // rule 2
{
    return lookup(CharTable, c);
}

unittest
{
    // Probes sit on and just outside the boundaries of each range in
    // CharTable=[0x9,0x9,0xA,0xA,0xD,0xD,0x20,0xD7FF,0xE000,0xFFFD,
    //            0x10000,0x10FFFF];
    dchar[] accepted =
    [
        cast(dchar)0x9,     cast(dchar)0xA,     cast(dchar)0xD,
        cast(dchar)0x20,    cast(dchar)'J',     cast(dchar)0xD7FF,
        cast(dchar)0xE000,  cast(dchar)0xFFFD,  cast(dchar)0x10000,
        cast(dchar)0x10FFFF
    ];
    dchar[] rejected =
    [
        cast(dchar)0x8,     cast(dchar)0xB,     cast(dchar)0xC,
        cast(dchar)0xE,     cast(dchar)0x1F,    cast(dchar)0xD800,
        cast(dchar)0xDFFF,  cast(dchar)0xFFFE,  cast(dchar)0xFFFF,
        cast(dchar)0x110000
    ];

    foreach (dchar probe; accepted) assert( isChar(probe));
    foreach (dchar probe; rejected) assert(!isChar(probe));
}
/**
 * Removes and returns the longest leading run of s whose elements all match
 * pattern (as decided by inPattern).
 *
 * Params:
 *      s       = the text to consume from; on return it holds only the part
 *                that follows the matched run
 *      pattern = the character pattern handed to inPattern
 *
 * Returns: the matched leading slice (empty when the first element does not
 * match or s is empty)
 */
S1 munch(S1, S2)(ref S1 s, S2 pattern)
{
    // Advance past every leading element accepted by the pattern.
    size_t split = 0;
    while (split < s.length && inPattern(s[split], pattern))
        ++split;

    S1 head = s[0 .. split];
    s = s[split .. $];
    return head;
}
/**
 * Returns true if the character is whitespace according to the XML standard
 *
 * Only the following characters are considered whitespace in XML - space
 * (U+0020), tab (U+0009), linefeed (U+000A) and carriage return (U+000D).
 *
 * Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
 *
 * Params:
 *      c = the character to be tested
 *
 * Returns: true for exactly the four characters listed above
 */
bool isSpace(dchar c)
{
    return c == '\u0020' || c == '\u0009' || c == '\u000A' || c == '\u000D';
}
233
/**
 * Tests whether a code point is a digit according to the XML standard.
 *
 * The decision is delegated to a range lookup in DigitTable.
 *
 * Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
 *
 * Params:
 *      c = the character to be tested
 *
 * Returns: true if c is accepted by the lookup in DigitTable
 */
bool isDigit(dchar c)
{
    return lookup(DigitTable, c);
}
246
/**
 * Returns true if the character is a letter according to the XML standard
 *
 * A letter is any character that is either ideographic or a base character.
 *
 * Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
 *
 * Params:
 *      c = the character to be tested
 *
 * Returns: isIdeographic(c) || isBaseChar(c)
 */
bool isLetter(dchar c) // rule 84
{
    return isIdeographic(c) || isBaseChar(c);
}
259
/**
 * Tests whether a code point is an ideographic character according to the
 * XML standard.
 *
 * The decision is delegated to a range lookup in IdeographicTable.
 *
 * Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
 *
 * Params:
 *      c = the character to be tested
 *
 * Returns: true if c is accepted by the lookup in IdeographicTable
 */
bool isIdeographic(dchar c)
{
    return lookup(IdeographicTable, c);
}
273
/**
 * Tests whether a code point is a base character according to the XML
 * standard.
 *
 * The decision is delegated to a range lookup in BaseCharTable.
 *
 * Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
 *
 * Params:
 *      c = the character to be tested
 *
 * Returns: true if c is accepted by the lookup in BaseCharTable
 */
bool isBaseChar(dchar c)
{
    return lookup(BaseCharTable, c);
}
287
/**
 * Tests whether a code point is a combining character according to the XML
 * standard.
 *
 * The decision is delegated to a range lookup in CombiningCharTable.
 *
 * Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
 *
 * Params:
 *      c = the character to be tested
 *
 * Returns: true if c is accepted by the lookup in CombiningCharTable
 */
bool isCombiningChar(dchar c)
{
    return lookup(CombiningCharTable, c);
}
301
/**
 * Tests whether a code point is an extender according to the XML standard.
 *
 * The decision is delegated to a range lookup in ExtenderTable.
 *
 * Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
 *
 * Params:
 *      c = the character to be tested
 *
 * Returns: true if c is accepted by the lookup in ExtenderTable
 */
bool isExtender(dchar c)
{
    return lookup(ExtenderTable, c);
}
314
315	/**
316	* Encodes a string by replacing all characters which need to be escaped with
317	* appropriate predefined XML entities.
318	*
319	* encode() escapes certain characters (ampersand, quote, apostrophe, less-than
320	* and greater-than), and similarly, decode() unescapes them. These functions
321	* are provided for convenience only. You do not need to use them when using
322	* the std.xml classes, because then all the encoding and decoding will be done
323	* for you automatically.
324	*
325	* If the string is not modified, the original will be returned.
326	*
327	* Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
328	*
329	* Params:
330	* s = The string to be encoded
331	*
332	* Returns: The encoded string
333	*
334	* Examples:
335	* --------------
336	* writefln(encode("a > b")); // writes "a &gt; b"
337	* --------------
338	*/
339	/*
340	//
341	//Specialized version of replace, to avoid the boring predictability of std.string.replace always returning a new array.
342	//Replace all occurances of from with to.
343	//Return original array if from not found.
344	//
345
346	string replaceIf(string s, dchar from, string to)
347	{
348	char[] p;
349	int i;
350	size_t istart;
351
352	istart = 0;
353	while (istart < s.length)
354	{
355	i = find(s[istart .. s.length], from);
356	if (i == -1)
357	{
358	if (istart == 0)
359	return s;
360	p ~= s[istart .. s.length];
361	break;
362	}
363	p ~= s[istart .. istart + i];
364	p ~= to;
365	istart += i + 1;
366	}
367	return p;
368	}
369
370	string encodeStdEntity(string s)
371	{
372	s = replaceIf(s,'&',"&amp;");
373	s = replaceIf(s,'\"',"&quot;");
374	s = replaceIf(s,'\'',"&apos;");
375	s = replaceIf(s,'<',"&lt;");
376	s = replaceIf(s,'>',"&gt;");
377	return s;
378	}
379	*/
380
/**
 * Specialized prefix test.
 *
 * Returns true only when s2 is non-empty, is no longer than s1, and matches
 * the first s2.length characters of s1 exactly. An empty s2 therefore never
 * matches.
 *
 * Params:
 *      s1 = the string to examine
 *      s2 = the candidate prefix
 */
private bool startsWith(string s1, string s2)
{
    // An empty prefix is defined not to match; an over-long one cannot.
    if (s2.length == 0 || s2.length > s1.length)
        return false;

    return s1[0 .. s2.length] == s2;
}
395
/* encodeStdEntity suggestion as copied from Digital Mars bug reports issue 3218 */

/**
 * Encodes a string by replacing the five characters that have predefined XML
 * entities with those entities:
 *
 *      "  becomes  &quot;
 *      >  becomes  &gt;
 *      <  becomes  &lt;
 *      &  becomes  &amp;
 *      '  becomes  &apos;
 *
 * If src contains none of these characters it is returned unchanged (the very
 * same slice, no copy is made).
 *
 * Params:
 *      src = the text to encode
 *      dst = (optional) scratch buffer to build the result in; it is grown as
 *            needed, so the returned slice may or may not alias the buffer
 *            that was passed in
 *
 * Returns: src itself when nothing needed escaping, otherwise a slice of the
 * (possibly reallocated) dst holding the encoded text
 */
T[] encodeStdEntity(T) (T[] src, T[] dst = null)
{
    size_t pending = 0;     // first index of src not yet copied into dst
    size_t index = 0;       // number of elements written to dst so far

    for (size_t pos = 0; pos < src.length; ++pos)
    {
        T[] entity;
        switch (src[pos])
        {
            case '"':  entity = "&quot;"; break;
            case '>':  entity = "&gt;";   break;
            case '<':  entity = "&lt;";   break;
            case '&':  entity = "&amp;";  break;
            case '\'': entity = "&apos;"; break;
            default:   continue;    // ordinary character: copied later in bulk
        }

        // Make room for the unescaped run plus the entity, over-allocating by
        // half the current size so repeated growth stays cheap.
        size_t len = pos - pending;
        if (dst.length <= index + len + entity.length)
            dst.length = (dst.length + len + entity.length) + dst.length / 2;

        dst[index .. index + len] = src[pending .. pos];
        index += len;

        dst[index .. index + entity.length] = entity;
        index += entity.length;

        pending = pos + 1;
    }

    // index is non-zero exactly when at least one entity was written
    if (index)
    {
        // copy the tail that follows the last escaped character
        size_t len = src.length - pending;
        if (dst.length <= index + len)
            dst.length = index + len;

        dst[index .. index + len] = src[pending .. $];
        return dst[0 .. index + len];
    }

    return src;
}



unittest
{
    assert(encodeStdEntity("hello") is "hello");
    assert(encodeStdEntity("a > b") == "a &gt; b");
    assert(encodeStdEntity("a < b") == "a &lt; b");
    assert(encodeStdEntity("don't") == "don&apos;t");
    assert(encodeStdEntity("\"hi\"") == "&quot;hi&quot;");
    assert(encodeStdEntity("cat & dog") == "cat &amp; dog");
}
475
/**
 * Selects how decode() treats its input.
 *
 * $(DDOC_ENUM_MEMBERS NONE) Do not decode
 * $(DDOC_ENUM_MEMBERS LOOSE) Decode, but ignore errors
 * $(DDOC_ENUM_MEMBERS STRICT) Decode, and throw exception on error
 */
enum DecodeMode
{
    NONE,   // leave the text untouched
    LOOSE,  // decode what can be decoded, pass the rest through
    STRICT  // decode, failing with an exception on anything malformed
}
487
/**
 * Decodes a string by unescaping all predefined XML entities.
 *
 * encodeStdEntity() escapes certain characters (ampersand, quote, apostrophe,
 * less-than and greater-than), and similarly, decode() unescapes them. These
 * functions are provided for convenience only. You do not need to use them
 * when using the XML classes of this module, because then all the encoding
 * and decoding will be done for you automatically.
 *
 * This function decodes the entities &amp;amp;, &amp;quot;, &amp;apos;,
 * &amp;lt; and &amp;gt;, as well as decimal and hexadecimal character
 * references such as &amp;#x20AC;
 *
 * If the string does not contain an ampersand, the original will be returned.
 *
 * Note that the "mode" parameter can be one of DecodeMode.NONE (do not
 * decode), DecodeMode.LOOSE (decode, but ignore errors), or DecodeMode.STRICT
 * (decode, and throw a DecodeException in the event of an error).
 *
 * Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
 *
 * Params:
 *      s = The string to be decoded
 *      mode = (optional) Mode to use for decoding. (Defaults to LOOSE).
 *
 * Throws: DecodeException if mode == DecodeMode.STRICT and decode fails
 *
 * Returns: The decoded string
 *
 * Examples:
 * --------------
 * writefln(decode("a &gt; b")); // writes "a > b"
 * --------------
 */
string decode(string s, DecodeMode mode=DecodeMode.LOOSE)
{
    if (mode == DecodeMode.NONE) return s;

    // Stays empty until the first '&' is met, so ampersand-free input is
    // returned without copying.
    char[] buffer;

    for (size_t i = 0; i < s.length; ++i)
    {
        char c = s[i];
        if (c != '&')
        {
            if (buffer.length != 0) buffer ~= c;
        }
        else
        {
            if (buffer.length == 0)
            {
                // First ampersand: start the copy with everything before it.
                buffer = s.dup;
                buffer.length = i;
            }
            if (startsWith(s[i..$],"&#"))
            {
                try
                {
                    dchar d;
                    string t = s[i..$];
                    checkCharRef(t, d);
                    std.utf.encode(buffer, d);
                    // t now begins just past the reference; step i onto the
                    // reference's last character (the loop's ++i moves on).
                    i = s.length - t.length - 1;
                }
                catch(Err e)
                {
                    if (mode == DecodeMode.STRICT)
                        throw new DecodeException("Unescaped &");
                    buffer ~= '&';
                }
            }
            // For each named entity, i is advanced by (entity length - 1).
            else if (startsWith(s[i..$],"&amp;" )) { buffer ~= '&';  i += 4; }
            else if (startsWith(s[i..$],"&quot;")) { buffer ~= '"';  i += 5; }
            else if (startsWith(s[i..$],"&apos;")) { buffer ~= '\''; i += 5; }
            else if (startsWith(s[i..$],"&lt;"  )) { buffer ~= '<';  i += 3; }
            else if (startsWith(s[i..$],"&gt;"  )) { buffer ~= '>';  i += 3; }
            else
            {
                if (mode == DecodeMode.STRICT)
                    throw new DecodeException("Unescaped &");
                buffer ~= '&';
            }
        }
    }
    return (buffer.length == 0) ? s : cast(string)buffer;
}

unittest
{
    void assertNot(string s)
    {
        bool b = false;
        try { decode(s,DecodeMode.STRICT); }
        catch (DecodeException e) { b = true; }
        assert(b,s);
    }

    // Assert that things that should work, do
    assert(decode("hello",          DecodeMode.STRICT) is "hello");
    assert(decode("a &gt; b",       DecodeMode.STRICT) == "a > b");
    assert(decode("a &lt; b",       DecodeMode.STRICT) == "a < b");
    assert(decode("don&apos;t",     DecodeMode.STRICT) == "don't");
    assert(decode("&quot;hi&quot;", DecodeMode.STRICT) == "\"hi\"");
    assert(decode("cat &amp; dog",  DecodeMode.STRICT) == "cat & dog");
    assert(decode("",               DecodeMode.STRICT) == "");
    // decimal and hexadecimal character references
    assert(decode("&#42;",          DecodeMode.STRICT) == "*");
    assert(decode("&#x2A;",         DecodeMode.STRICT) == "*");
    assert(decode("cat & dog",      DecodeMode.LOOSE) == "cat & dog");
    assert(decode("a &gt b",        DecodeMode.LOOSE) == "a &gt b");
    assert(decode("&#;",            DecodeMode.LOOSE) == "&#;");
    assert(decode("&#x;",           DecodeMode.LOOSE) == "&#x;");
    assert(decode("&#2G;",          DecodeMode.LOOSE) == "&#2G;");
    assert(decode("&#x2G;",         DecodeMode.LOOSE) == "&#x2G;");

    // Assert that things that shouldn't work, don't
    assertNot("cat & dog");
    assertNot("a &gt b");
    assertNot("&#;");
    assertNot("&#x;");
    assertNot("&#2G;");
    assertNot("&#x2G;");
}
609
/**
 * Class representing an XML document: a root Element together with the text
 * that surrounds it.
 *
 * Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
 *
 */
class Document : Element
{
    /**
     * Contains all text which occurs before the root element.
     * Defaults to &lt;?xml version="1.0"?&gt;
     */
    string prolog = "<?xml version=\"1.0\"?>";

    /**
     * Contains all text which occurs after the root element.
     * Defaults to the empty string
     */
    string epilog;

    /**
     * Constructs a Document by parsing XML text.
     *
     * This function creates a complete DOM (Document Object Model) tree.
     *
     * The input to this function MUST be valid XML.
     * This is enforced by DocumentParser's in contract.
     *
     * Params:
     *      s = the complete XML text (must not be empty).
     */
    this(string s)
    in
    {
        assert(s.length != 0);
    }
    body
    {
        auto parser = new DocumentParser(s);
        string rootText = parser.tag.tagString;

        this(parser.tag);
        // Everything in front of the root tag's text is the prolog.
        prolog = s[0 .. rootText.ptr - s.ptr];
        parse(parser);
        // Whatever the parser has left unread becomes the epilog.
        epilog = *parser.s;
    }

    /**
     * Constructs a Document from a Tag.
     *
     * Params:
     *      tag = the start tag of the document.
     */
    this(Tag tag)
    {
        super(tag);
    }


    /**
     * Compares two Documents for equality: prolog, element content and
     * epilog must all match.
     *
     * Examples:
     * --------------
     * Document d1,d2;
     * if (d1 == d2) { }
     * --------------
     */
    override int opEquals(Object o)
    {
        auto other = toType!(Document)(o);

        if (prolog != other.prolog) return false;
        if (super != cast(Element)other) return false;
        if (epilog != other.epilog) return false;
        return true;
    }

    /**
     * Compares two Documents, ordering by prolog, then element content, then
     * epilog.
     *
     * You should rarely need to call this function. It exists so that
     * Documents can be used as associative array keys.
     *
     * Examples:
     * --------------
     * Document d1,d2;
     * if (d1 < d2) { }
     * --------------
     */
    override int opCmp(Object o)
    {
        auto other = toType!(Document)(o);

        if (prolog != other.prolog)
            return prolog < other.prolog ? -1 : 1;

        if (super != cast(Element)other)
            return super < cast(Element)other ? -1 : 1;

        if (epilog != other.epilog)
            return epilog < other.epilog ? -1 : 1;

        return 0;
    }

    /**
     * Returns the hash of a Document, combining the prolog, the epilog and
     * the Element hash.
     *
     * You should rarely need to call this function. It exists so that
     * Documents can be used as associative array keys.
     */
    override hash_t toHash()
    {
        hash_t inner = hash(epilog, super.toHash);
        return hash(prolog, inner);
    }

    /**
     * Returns the string representation of a Document. (That is, the
     * complete XML of a document: prolog, root element, epilog).
     */
    override string toString()
    {
        return prolog ~ super.toString ~ epilog;
    }
}
732
733
/**
 * Class representing an XML element.
 *
 * An element owns a start Tag and an ordered list of items. Every appended
 * item is recorded twice: once in $(B items) (document order, all kinds mixed)
 * and once in the typed array matching its kind.
 *
 * Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
 */
class Element : Item
{
    Tag tag; /// The start tag of the element
    Item[] items; /// The element's items
    Text[] texts; /// The element's text items
    CData[] cdatas; /// The element's CData items
    Comment[] comments; /// The element's comments
    ProcessingInstruction[] pis; /// The element's processing instructions
    Element[] elements; /// The element's child elements

    /**
     * Constructs an Element given a name and a string to be used as a Text
     * interior.
     *
     * Params:
     *      name = the name of the element.
     *      interior = (optional) the string interior. When empty or omitted,
     *                 no Text item is added.
     *
     * Examples:
     * -------------------------------------------------------
     * auto element = new Element("title","Serenity")
     *     // constructs the element <title>Serenity</title>
     * -------------------------------------------------------
     */
    this(string name, string interior=null)
    {
        this(new Tag(name));
        if (interior.length != 0) opCatAssign(new Text(interior));
    }

    /**
     * Constructs an Element from a Tag.
     *
     * The tag is copied (name, attributes and source text); the copy starts
     * out as an EMPTY tag and becomes a START tag once content is appended.
     *
     * Params:
     *      tag_ = the start or empty tag of the element.
     */
    this(Tag tag_)
    {
        this.tag = new Tag(tag_.name);
        tag.type = TagType.EMPTY;
        foreach(k,v;tag_.attr) tag.attr[k] = v;
        tag.tagString = tag_.tagString;
    }

    /**
     * Append a text item to the interior of this element
     *
     * Params:
     *      item = the item you wish to append.
     *
     * Examples:
     * --------------
     * Element element;
     * element ~= new Text("hello");
     * --------------
     */
    void opCatAssign(Text item)
    {
        texts ~= item;
        appendItem(item);
    }

    /**
     * Append a CData item to the interior of this element
     *
     * Params:
     *      item = the item you wish to append.
     *
     * Examples:
     * --------------
     * Element element;
     * element ~= new CData("hello");
     * --------------
     */
    void opCatAssign(CData item)
    {
        cdatas ~= item;
        appendItem(item);
    }

    /**
     * Append a comment to the interior of this element
     *
     * Params:
     *      item = the item you wish to append.
     *
     * Examples:
     * --------------
     * Element element;
     * element ~= new Comment("hello");
     * --------------
     */
    void opCatAssign(Comment item)
    {
        comments ~= item;
        appendItem(item);
    }

    /**
     * Append a processing instruction to the interior of this element
     *
     * Params:
     *      item = the item you wish to append.
     *
     * Examples:
     * --------------
     * Element element;
     * element ~= new ProcessingInstruction("hello");
     * --------------
     */
    void opCatAssign(ProcessingInstruction item)
    {
        pis ~= item;
        appendItem(item);
    }

    /**
     * Append a complete element to the interior of this element
     *
     * Params:
     *      item = the item you wish to append.
     *
     * Examples:
     * --------------
     * Element element;
     * Element other = new Element("br");
     * element ~= other;
     *     // appends element representing <br />
     * --------------
     */
    void opCatAssign(Element item)
    {
        elements ~= item;
        appendItem(item);
    }

    /*
     * Records an item in document order and promotes the tag from EMPTY to
     * START when the item contributes markup. A child element always counts
     * as markup, even when that child has no content of its own.
     */
    private void appendItem(Item item)
    {
        items ~= item;
        if (tag.type != TagType.EMPTY) return;

        if (cast(Element)item !is null || !item.isEmptyXML)
            tag.type = TagType.START;
    }

    /*
     * Fills this element from a parser: every text, CDATA, comment and
     * processing instruction is appended as it is reported, and each start
     * tag produces a child element that is parsed recursively before being
     * appended.
     */
    private void parse(ElementParser xml)
    {
        xml.onText = (string s) { opCatAssign(new Text(s)); };
        xml.onCData = (string s) { opCatAssign(new CData(s)); };
        xml.onComment = (string s) { opCatAssign(new Comment(s)); };
        xml.onPI = (string s) { opCatAssign(new ProcessingInstruction(s)); };

        xml.onStartTag[null] = (ElementParser xml)
        {
            auto e = new Element(xml.tag);
            e.parse(xml);
            opCatAssign(e);
        };

        xml.parse();
    }

    /**
     * Compares two Elements for equality
     *
     * Two elements are equal when they hold the same number of items and
     * the items are pairwise equal.
     *
     * NOTE(review): the tag is not compared here although toHash() includes
     * the tag's hash, so two elements that compare equal can hash
     * differently. Confirm whether tags were meant to take part in equality.
     *
     * Examples:
     * --------------
     * Element e1,e2;
     * if (e1 == e2) { }
     * --------------
     */
    override int opEquals(Object o)
    {
        auto element = toType!( Element)(o);
        size_t len = items.length;
        if (len != element.items.length) return false;
        for (size_t i = 0; i < len; ++i)
        {
            if (!items[i].opEquals(element.items[i])) return false;
        }
        return true;
    }

    /**
     * Compares two Elements
     *
     * Items are compared pairwise in order; the first unequal pair decides.
     * If one item list is a prefix of the other, the shorter one sorts first.
     *
     * You should rarely need to call this function. It exists so that Elements
     * can be used as associative array keys.
     *
     * Examples:
     * --------------
     * Element e1,e2;
     * if (e1 < e2) { }
     * --------------
     */
    override int opCmp(Object o)
    {
        auto element = toType!( Element)(o);
        for (size_t i = 0; ; ++i)
        {
            if (i == items.length && i == element.items.length) return 0;
            if (i == items.length) return -1;
            if (i == element.items.length) return 1;
            if (items[i] != element.items[i])
                return items[i].opCmp(element.items[i]);
        }
    }

    /**
     * Returns the hash of an Element: the tag's hash plus the hash of every
     * item.
     *
     * You should rarely need to call this function. It exists so that Elements
     * can be used as associative array keys.
     */
    override hash_t toHash()
    {
        hash_t h = tag.toHash;
        foreach(item;items) h += item.toHash();
        return h;
    }

    /**
     * Returns the decoded interior of an element.
     *
     * The element is assumed to contain text <i>only</i>. So, for
     * example, given XML such as "&lt;title&gt;Good &amp;amp;
     * Bad&lt;/title&gt;", will return "Good &amp; Bad".
     *
     * Params:
     *      mode = (optional) Mode to use for decoding. (Defaults to LOOSE).
     *
     * Throws: DecodeException if decode fails, or if the element contains
     * any item that is not a Text
     */
    string text(DecodeMode mode=DecodeMode.LOOSE)
    {
        string buffer;
        foreach(item;items)
        {
            Text t = cast(Text)item;
            if (t is null) throw new DecodeException(item.toString);
            buffer ~= decode(t.toString,mode);
        }
        return buffer;
    }

    /**
     * Returns an indented string representation of this item
     *
     * An element without items yields its empty tag on a single line. An
     * element whose only item is a Text is kept on a single line. Otherwise
     * the start tag, every line of every item (shifted right by indent
     * spaces), and the end tag are returned as separate lines.
     *
     * Params:
     *      indent = (optional) number of spaces by which to indent this
     *          element. Defaults to 2.
     */
    override string[] pretty(uint indent=2)
    {

        if (isEmptyXML) return [ tag.toEmptyString ];

        if (items.length == 1)
        {
            Text t = cast(Text)(items[0]);
            if (t !is null)
            {
                return [tag.toStartString ~ t.toString ~ tag.toEndString];
            }
        }

        string[] a = [ tag.toStartString ];
        foreach(item;items)
        {
            string[] b = item.pretty(indent);
            foreach(s;b)
            {
                a ~= rjustify(s,s.length + indent);
            }
        }
        a ~= tag.toEndString;
        return a;
    }

    /**
     * Returns the string representation of an Element
     *
     * Examples:
     * --------------
     * auto element = new Element("br");
     * writefln(element.toString); // writes "<br />"
     * --------------
     */
    override string toString()
    {
        if (isEmptyXML) return tag.toEmptyString;

        string buffer = tag.toStartString;
        foreach(item;items) { buffer ~= item.toString; }
        buffer ~= tag.toEndString;
        return buffer;
    }

    /// Returns true if the element has no items, i.e. it is written as an
    /// empty tag such as &lt;br /&gt;
    override bool isEmptyXML() { return items.length == 0; }
}
1037
1038
/**
 * The kinds of tag an XML Tag object can represent.
 *
 * $(DDOC_ENUM_MEMBERS START) Used for start tags
 * $(DDOC_ENUM_MEMBERS END) Used for end tags
 * $(DDOC_ENUM_MEMBERS EMPTY) Used for empty tags
 *
 */
enum TagType
{
    START,  // <name ...>
    END,    // </name>
    EMPTY   // <name ... />
}
1048
1049	/**
1050	* Class representing an XML tag.
1051	*
1052	* Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
1053	*
1054	* The class invariant guarantees
1055	* <ul>
1056	* <li> that $(B type) is a valid enum TagType value</li>
1057	* <li> that $(B name) consists of valid characters</li>
1058	* <li> that each attribute name consists of valid characters</li>
1059	* </ul>
1060	*/
1061	class Tag
1062	{
1063	TagType type = TagType.START; /// Type of tag
1064	string name; /// Tag name
1065	string[string] attr; /// Associative array of attributes
1066	private string tagString;
1067
1068	invariant()
1069	{
1070	string s;
1071	string t;
1072
1073	assert(type == TagType.START
1074	\|\| type == TagType.END
1075	\|\| type == TagType.EMPTY);
1076
1077	s = name;
1078	try { checkName(s,t); }
1079	catch(Err e) { assert(false,"Invalid tag name:" ~ e.toString); }
1080
1081	foreach(k,v;attr)
1082	{
1083	s = k;
1084	try { checkName(s,t); }
1085	catch(Err e)
1086	{ assert(false,"Invalid atrribute name:" ~ e.toString); }
1087	}
1088	}
1089
1090	/**
1091	* Constructs an instance of Tag with a specified name and type
1092	*
1093	* The constructor does not initialize the attributes. To initialize the
1094	* attributes, you access the $(B attr) member variable.
1095	*
1096	* Params:
1097	* name = the Tag's name
1098	* type = (optional) the Tag's type. If omitted, defaults to
1099	* TagType.START.
1100	*
1101	* Examples:
1102	* --------------
1103	* auto tag = new Tag("img",Tag.EMPTY);
1104	* tag.attr["src"] = "http://example.com/example.jpg";
1105	* --------------
1106	*/
1107	this(string name, TagType type=TagType.START)
1108	{
1109	this.name = name;
1110	this.type = type;
1111	}
1112
1113	/* Private constructor (so don't ddoc this!)
1114	*
1115	* Constructs a Tag by parsing the string representation, e.g. "<html>".
1116	*
1117	* The string is passed by reference, and is advanced over all characters
1118	* consumed.
1119	*
1120	* The second parameter is a dummy parameter only, required solely to
1121	* distinguish this constructor from the public one.
1122	*/
1123	private this(ref string s, bool dummy)
1124	{
1125	tagString = s;
1126	try
1127	{
1128	reqc(s,'<');
1129	if (optc(s,'/')) type = TagType.END;
1130	name = munch(s,"^/>"~whitespace);
1131	munch(s,whitespace);
1132	while(s.length > 0 && s[0] != '>' && s[0] != '/')
1133	{
1134	string key = munch(s,"^="~whitespace);
1135	munch(s,whitespace);
1136	reqc(s,'=');
1137	munch(s,whitespace);
1138	reqc(s,'"');
1139	string val = encodeStdEntity(munch(s,"^\""));
1140	reqc(s,'"');
1141	munch(s,whitespace);
1142	attr[key] = val;
1143	}
1144	if (optc(s,'/'))
1145	{
1146	if (type == TagType.END) throw new TagException("");
1147	type = TagType.EMPTY;
1148	}
1149	reqc(s,'>');
1150	tagString.length = (s.ptr - tagString.ptr);
1151	}
1152	catch(XMLException e)
1153	{
1154	tagString.length = (s.ptr - tagString.ptr);
1155	throw new TagException(tagString);
1156	}
1157	}
1158
1159
1160	/**
1161	* Compares two Tags for equality
1162	*
1163	* You should rarely need to call this function. It exists so that Tags
1164	* can be used as associative array keys.
1165	*
1166	* Examples:
1167	* --------------
1168	* Tag tag1,tag2
1169	* if (tag1 == tag2) { }
1170	* --------------
1171	*/
1172	override int opEquals(Object o)
1173	{
1174	auto tag = toType!( Tag)(o);
1175	return
1176	(name != tag.name) ? false : (
1177	(attr != tag.attr) ? false : (
1178	(type != tag.type) ? false : (
1179	true )));
1180	}
1181
1182	/**
1183	* Compares two Tags
1184	*
1185	* Examples:
1186	* --------------
1187	* Tag tag1,tag2
1188	* if (tag1 < tag2) { }
1189	* --------------
1190	*/
1191	override int opCmp(Object o)
1192	{
1193	auto tag = toType!( Tag)(o);
1194	return
1195	((name != tag.name) ? ( name < tag.name ? -1 : 1 ) :
1196	((attr != tag.attr) ? ( attr < tag.attr ? -1 : 1 ) :
1197	((type != tag.type) ? ( type < tag.type ? -1 : 1 ) :
1198	0 )));
1199	}
1200
1201	/**
1202	* Returns the hash of a Tag
1203	*
1204	* You should rarely need to call this function. It exists so that Tags
1205	* can be used as associative array keys.
1206	*/
1207	override hash_t toHash()
1208	{
1209	hash_t hash = 0;
1210	foreach(dchar c;name) hash = hash * 11 + c;
1211	return hash;
1212	}
1213
1214	/**
1215	* Returns the string representation of a Tag
1216	*
1217	* Examples:
1218	* --------------
1219	* auto tag = new Tag("book",TagType.START);
1220	* writefln(tag.toString); // writes "<book>"
1221	* --------------
1222	*/
1223	override string toString()
1224	{
1225	if (isEmpty) return toEmptyString();
1226	return (isEnd) ? toEndString() : toStartString();
1227	}
1228
1229	private
1230	{
1231	string toNonEndString()
1232	{
1233	string s = "<" ~ name;
1234	foreach(key,val;attr)
1235	s ~= format(" %s=\"%s\"",key,decode(val,DecodeMode.LOOSE));
1236	return s;
1237	}
1238
1239	string toStartString() { return toNonEndString() ~ ">"; }
1240
1241	string toEndString() { return "</" ~ name ~ ">"; }
1242
1243	string toEmptyString() { return toNonEndString() ~ " />"; }
1244	}
1245
1246	/**
1247	* Returns true if the Tag is a start tag
1248	*
1249	* Examples:
1250	* --------------
1251	* if (tag.isStart) { }
1252	* --------------
1253	*/
1254	bool isStart() { return type == TagType.START; }
1255
/**
 * Tests whether this Tag is an end tag (its type is TagType.END)
 *
 * Examples:
 * --------------
 * if (tag.isEnd) { }
 * --------------
 */
bool isEnd()
{
    return type == TagType.END;
}
1265
/**
 * Tests whether this Tag is an empty-element tag (its type is TagType.EMPTY)
 *
 * Examples:
 * --------------
 * if (tag.isEmpty) { }
 * --------------
 */
bool isEmpty()
{
    return type == TagType.EMPTY;
}
1275	}
1276
1277
/**
 * Class representing a comment
 */
class Comment : Item
{
    private string content;

    /**
     * Construct a comment
     *
     * Params:
     *      content = the body of the comment
     *
     * Throws: CommentException if the comment body is illegal (contains "--"
     * or exactly equals "-")
     *
     * Examples:
     * --------------
     * auto item = new Comment("This is a comment");
     * // constructs <!--This is a comment-->
     * --------------
     */
    this(string content)
    {
        // The forbidden sequence inside a comment is "--" (it would be read
        // as the start of the "-->" terminator), not "==".
        if (content == "-" || content.indexOf("--") != -1)
            throw new CommentException(content);
        this.content = content;
    }

    /**
     * Compares two comments for equality
     *
     * Returns: non-zero only if o is a Comment with the same body
     *
     * Examples:
     * --------------
     * Comment item1,item2;
     * if (item1 == item2) { }
     * --------------
     */
    override int opEquals(Object o)
    {
        auto item = toType!(Item)(o);
        auto t = cast(Comment)item;
        return t !is null && content == t.content;
    }

    /**
     * Compares two comments
     *
     * You should rarely need to call this function. It exists so that Comments
     * can be used as associative array keys.
     *
     * Returns: -1, 0 or 1 according to how the comment bodies compare; 0 if
     * o is an Item that is not a Comment
     *
     * Examples:
     * --------------
     * Comment item1,item2;
     * if (item1 < item2) { }
     * --------------
     */
    override int opCmp(Object o)
    {
        auto item = toType!(Item)(o);
        auto t = cast(Comment)item;

        // Written as explicit branches: "t !is null && (...)" yields a bool,
        // which would turn a -1 result into 1.
        if (t is null) return 0;
        if (content == t.content) return 0;
        return (content < t.content) ? -1 : 1;
    }

    /**
     * Returns the hash of a Comment
     *
     * You should rarely need to call this function. It exists so that Comments
     * can be used as associative array keys.
     */
    override hash_t toHash() { return hash(content); }

    /**
     * Returns a string representation of this comment
     */
    override string toString() { return "<!--" ~ content ~ "-->"; }

    override bool isEmptyXML() { return false; } /// Returns false always
}
1358
/**
 * Class representing a Character Data section
 */
class CData : Item
{
    private string content;

    /**
     * Construct a character data section
     *
     * Params:
     *      content = the body of the character data segment
     *
     * Throws: CDataException if the segment body is illegal (contains "]]>")
     *
     * Examples:
     * --------------
     * auto item = new CData("<b>hello</b>");
     * // constructs <![CDATA[<b>hello</b>]]>
     * --------------
     */
    this(string content)
    {
        if (content.indexOf("]]>") != -1) throw new CDataException(content);
        this.content = content;
    }

    /**
     * Compares two CDatas for equality
     *
     * Returns: non-zero only if o is a CData with the same body
     *
     * Examples:
     * --------------
     * CData item1,item2;
     * if (item1 == item2) { }
     * --------------
     */
    override int opEquals(Object o)
    {
        auto item = toType!(Item)(o);
        auto t = cast(CData)item;
        return t !is null && content == t.content;
    }

    /**
     * Compares two CDatas
     *
     * You should rarely need to call this function. It exists so that CDatas
     * can be used as associative array keys.
     *
     * Returns: -1, 0 or 1 according to how the bodies compare; 0 if o is an
     * Item that is not a CData
     *
     * Examples:
     * --------------
     * CData item1,item2;
     * if (item1 < item2) { }
     * --------------
     */
    override int opCmp(Object o)
    {
        auto item = toType!(Item)(o);
        auto t = cast(CData)item;

        // Written as explicit branches: "t !is null && (...)" yields a bool,
        // which would turn a -1 result into 1.
        if (t is null) return 0;
        if (content == t.content) return 0;
        return (content < t.content) ? -1 : 1;
    }

    /**
     * Returns the hash of a CData
     *
     * You should rarely need to call this function. It exists so that CDatas
     * can be used as associative array keys.
     */
    override hash_t toHash() { return hash(content); }

    /**
     * Returns a string representation of this CData section
     */
    override string toString() { return cdata ~ content ~ "]]>"; }

    override bool isEmptyXML() { return false; } /// Returns false always
}
1437
/**
 * Class representing a text (aka Parsed Character Data) section
 */
class Text : Item
{
    private string content;

    /**
     * Construct a text (aka PCData) section
     *
     * Params:
     *      content = the text. This function encodes the text before
     *      insertion, so it is safe to insert any text
     *
     * Examples:
     * --------------
     * auto item = new Text("a < b");
     * // constructs a &lt; b
     * --------------
     */
    this(string content)
    {
        this.content = encodeStdEntity(content);
    }

    /**
     * Compares two text sections for equality
     *
     * Returns: non-zero only if o is a Text with the same (encoded) content
     *
     * Examples:
     * --------------
     * Text item1,item2;
     * if (item1 == item2) { }
     * --------------
     */
    override int opEquals(Object o)
    {
        auto item = toType!(Item)(o);
        auto t = cast(Text)item;
        return t !is null && content == t.content;
    }

    /**
     * Compares two text sections
     *
     * You should rarely need to call this function. It exists so that Texts
     * can be used as associative array keys.
     *
     * Returns: -1, 0 or 1 according to how the contents compare; 0 if o is
     * an Item that is not a Text
     *
     * Examples:
     * --------------
     * Text item1,item2;
     * if (item1 < item2) { }
     * --------------
     */
    override int opCmp(Object o)
    {
        auto item = toType!(Item)(o);
        auto t = cast(Text)item;

        // Written as explicit branches: "t !is null && (...)" yields a bool,
        // which would turn a -1 result into 1.
        if (t is null) return 0;
        if (content == t.content) return 0;
        return (content < t.content) ? -1 : 1;
    }

    /**
     * Returns the hash of a text section
     *
     * You should rarely need to call this function. It exists so that Texts
     * can be used as associative array keys.
     */
    override hash_t toHash() { return hash(content); }

    /**
     * Returns a string representation of this Text section
     */
    override string toString() { return content; }

    /**
     * Returns true if the content is the empty string
     */
    override bool isEmptyXML() { return content.length == 0; }
}
1517
/**
 * Class representing an XML Instruction section
 */
class XMLInstruction : Item
{
    private string content;

    /**
     * Construct an XML Instruction section
     *
     * Params:
     *      content = the body of the instruction segment
     *
     * Throws: XIException if the segment body is illegal (contains ">")
     *
     * Examples:
     * --------------
     * auto item = new XMLInstruction("ATTLIST");
     * // constructs <!ATTLIST>
     * --------------
     */
    this(string content)
    {
        if (content.indexOf(">") != -1) throw new XIException(content);
        this.content = content;
    }

    /**
     * Compares two XML instructions for equality
     *
     * Returns: non-zero only if o is an XMLInstruction with the same body
     *
     * Examples:
     * --------------
     * XMLInstruction item1,item2;
     * if (item1 == item2) { }
     * --------------
     */
    override int opEquals(Object o)
    {
        auto item = toType!(Item)(o);
        auto t = cast(XMLInstruction)item;
        return t !is null && content == t.content;
    }

    /**
     * Compares two XML instructions
     *
     * You should rarely need to call this function. It exists so that
     * XMLInstructions can be used as associative array keys.
     *
     * Returns: -1, 0 or 1 according to how the bodies compare; 0 if o is an
     * Item that is not an XMLInstruction
     *
     * Examples:
     * --------------
     * XMLInstruction item1,item2;
     * if (item1 < item2) { }
     * --------------
     */
    override int opCmp(Object o)
    {
        auto item = toType!(Item)(o);
        auto t = cast(XMLInstruction)item;

        // Written as explicit branches: "t !is null && (...)" yields a bool,
        // which would turn a -1 result into 1.
        if (t is null) return 0;
        if (content == t.content) return 0;
        return (content < t.content) ? -1 : 1;
    }

    /**
     * Returns the hash of an XMLInstruction
     *
     * You should rarely need to call this function. It exists so that
     * XMLInstructions can be used as associative array keys.
     */
    override hash_t toHash() { return hash(content); }

    /**
     * Returns a string representation of this XMLInstruction
     */
    override string toString() { return "<!" ~ content ~ ">"; }

    override bool isEmptyXML() { return false; } /// Returns false always
}
1596
/**
 * Class representing a Processing Instruction section
 */
class ProcessingInstruction : Item
{
    private string content;

    /**
     * Construct a Processing Instruction section
     *
     * Params:
     *      content = the body of the instruction segment
     *
     * Throws: PIException if the segment body is illegal (contains "?>")
     *
     * Examples:
     * --------------
     * auto item = new ProcessingInstruction("php");
     * // constructs <?php?>
     * --------------
     */
    this(string content)
    {
        if (content.indexOf("?>") != -1) throw new PIException(content);
        this.content = content;
    }

    /**
     * Compares two processing instructions for equality
     *
     * Returns: non-zero only if o is a ProcessingInstruction with the same
     * body
     *
     * Examples:
     * --------------
     * ProcessingInstruction item1,item2;
     * if (item1 == item2) { }
     * --------------
     */
    override int opEquals(Object o)
    {
        auto item = toType!(Item)(o);
        auto t = cast(ProcessingInstruction)item;
        return t !is null && content == t.content;
    }

    /**
     * Compares two processing instructions
     *
     * You should rarely need to call this function. It exists so that
     * ProcessingInstructions can be used as associative array keys.
     *
     * Returns: -1, 0 or 1 according to how the bodies compare; 0 if o is an
     * Item that is not a ProcessingInstruction
     *
     * Examples:
     * --------------
     * ProcessingInstruction item1,item2;
     * if (item1 < item2) { }
     * --------------
     */
    override int opCmp(Object o)
    {
        auto item = toType!(Item)(o);
        auto t = cast(ProcessingInstruction)item;

        // Written as explicit branches: "t !is null && (...)" yields a bool,
        // which would turn a -1 result into 1.
        if (t is null) return 0;
        if (content == t.content) return 0;
        return (content < t.content) ? -1 : 1;
    }

    /**
     * Returns the hash of a ProcessingInstruction
     *
     * You should rarely need to call this function. It exists so that
     * ProcessingInstructions can be used as associative array keys.
     */
    override hash_t toHash() { return hash(content); }

    /**
     * Returns a string representation of this ProcessingInstruction
     */
    override string toString() { return "<?" ~ content ~ "?>"; }

    override bool isEmptyXML() { return false; } /// Returns false always
}
1676
1677
/**
 * Class for parsing an XML Document.
 *
 * This is a subclass of ElementParser. Most of the useful functions are
 * documented there.
 *
 * Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
 *
 * Bugs:
 *      Currently only supports UTF documents.
 *
 *      If there is an encoding attribute in the prolog, it is ignored.
 */
class DocumentParser : ElementParser
{
    string xmlText;

    /**
     * Constructs a DocumentParser.
     *
     * The input to this function MUST be valid XML.
     * This is enforced by the function's in contract.
     *
     * Params:
     *      xmlText_ = the entire XML document as text
     */
    this(string xmlText_)
    in
    {
        assert(xmlText_.length != 0);
        try
        {
            // Run the well-formedness checker over the whole input
            check(xmlText_);
        }
        catch (CheckException failure)
        {
            // Surface the checker's explanation in the assertion message
            assert(false, "\n" ~ failure.toString());
        }
    }
    body
    {
        xmlText = xmlText_;

        // The base class reads the document through this pointer, so it has
        // to be set before the base constructor runs.
        s = &xmlText;

        super();    // Initialize everything
        parse();    // Parse through the root tag (but not beyond)
    }
}
1729
/**
 * Class for parsing an XML element.
 *
 * Standards: $(LINK2 http://www.w3.org/TR/1998/REC-xml-19980210, XML 1.0)
 *
 * Note that you cannot construct instances of this class directly. You can
 * construct a DocumentParser (which is a subclass of ElementParser), but
 * otherwise, instances of ElementParser will be created for you by the
 * library, and passed your way via onStartTag handlers.
 */
class ElementParser
{
    alias void delegate(string) Handler;
    alias void delegate(in Element element) ElementHandler;
    alias void delegate(ElementParser parser) ParserHandler;

    private
    {
        Tag tag_;               // tag most recently associated with this parser
        string elementStart;    // copy of *s taken when the parser was constructed
        string* s;              // text still to be parsed; shared with the parent parser

        Handler commentHandler = null;
        Handler cdataHandler = null;
        Handler xiHandler = null;
        Handler piHandler = null;
        Handler rawTextHandler = null;
        Handler textHandler = null;

        // Private constructor for start tags
        this(ElementParser parent)
        {
            s = parent.s;
            this();
            tag_ = parent.tag_;
        }

        // Private constructor for empty tags
        this(Tag tag, string* t)
        {
            s = t;
            this();
            tag_ = tag;
        }

        // Calls the onStartTag handler registered for name, falling back to
        // the catch-all handler registered under null. Does nothing if
        // neither exists.
        void fireStartTag(string name, ElementParser parser)
        {
            auto handler = name in onStartTag;
            if (handler is null) handler = null in onStartTag;
            if (handler !is null) (*handler)(parser);
        }

        // Calls the onEndTag handler registered for name, falling back to
        // the catch-all handler registered under null. Does nothing if
        // neither exists.
        void fireEndTag(string name, Element element)
        {
            auto handler = name in onEndTag;
            if (handler is null) handler = null in onEndTag;
            if (handler !is null) (*handler)(element);
        }
    }

    /**
     * The Tag at the start of the element being parsed. You can read this to
     * determine the tag's name and attributes.
     */
    Tag tag() { return tag_; }

    /**
     * Register a handler which will be called whenever a start tag is
     * encountered which matches the specified name. You can also pass null as
     * the name, in which case the handler will be called for any unmatched
     * start tag.
     *
     * Examples:
     * --------------
     * // Call this function whenever a <podcast> start tag is encountered
     * onStartTag["podcast"] = (ElementParser xml)
     * {
     *     // Your code here
     *     //
     *     // This is a closure, so code here may reference
     *     // variables which are outside of this scope
     * };
     *
     * // call myEpisodeStartHandler (defined elsewhere) whenever an <episode>
     * // start tag is encountered
     * onStartTag["episode"] = &myEpisodeStartHandler;
     *
     * // call delegate dg for all other start tags
     * onStartTag[null] = dg;
     * --------------
     *
     * This library will supply your function with a new instance of
     * ElementParser, which may be used to parse inside the element whose
     * start tag was just found, or to identify the tag attributes of the
     * element, etc.
     *
     * Note that your function will be called for both start tags and empty
     * tags. That is, we make no distinction between <br></br>
     * and <br/>.
     */
    ParserHandler[string] onStartTag;

    /**
     * Register a handler which will be called whenever an end tag is
     * encountered which matches the specified name. You can also pass null as
     * the name, in which case the handler will be called for any unmatched
     * end tag.
     *
     * Examples:
     * --------------
     * // Call this function whenever a </podcast> end tag is encountered
     * onEndTag["podcast"] = (in Element e)
     * {
     *     // Your code here
     *     //
     *     // This is a closure, so code here may reference
     *     // variables which are outside of this scope
     * };
     *
     * // call myEpisodeEndHandler (defined elsewhere) whenever an </episode>
     * // end tag is encountered
     * onEndTag["episode"] = &myEpisodeEndHandler;
     *
     * // call delegate dg for all other end tags
     * onEndTag[null] = dg;
     * --------------
     *
     * Note that your function will be called for both end tags and empty
     * tags. That is, we make no distinction between <br></br>
     * and <br/>.
     */
    ElementHandler[string] onEndTag;

    protected this()
    {
        // s must already point at the text to parse
        elementStart = *s;
    }

    /**
     * Register a handler which will be called whenever text is encountered.
     *
     * Examples:
     * --------------
     * // Call this function whenever text is encountered
     * onText = (string s)
     * {
     *     // Your code here
     *
     *     // The passed parameter s will have been decoded by the time you see
     *     // it, and so may contain any character.
     *     //
     *     // This is a closure, so code here may reference
     *     // variables which are outside of this scope
     * };
     * --------------
     */
    void onText(Handler handler) { textHandler = handler; }

    /**
     * Register an alternative handler which will be called whenever text
     * is encountered. This differs from onText in that onText will decode
     * the text, whereas onTextRaw will not. This allows you to make design
     * choices, since onText will be more accurate, but slower, while
     * onTextRaw will be faster, but less accurate. Of course, you can
     * still call decode() within your handler, if you want, but you'd
     * probably want to use onTextRaw only in circumstances where you
     * know that decoding is unnecessary.
     *
     * If both handlers are registered, only the raw handler is called.
     *
     * Examples:
     * --------------
     * // Call this function whenever text is encountered
     * onTextRaw = (string s)
     * {
     *     // Your code here
     *
     *     // The passed parameter s will NOT have been decoded.
     *     //
     *     // This is a closure, so code here may reference
     *     // variables which are outside of this scope
     * };
     * --------------
     */
    void onTextRaw(Handler handler) { rawTextHandler = handler; }

    /**
     * Register a handler which will be called whenever a character data
     * segment is encountered.
     *
     * Examples:
     * --------------
     * // Call this function whenever a CData section is encountered
     * onCData = (string s)
     * {
     *     // Your code here
     *
     *     // The passed parameter s does not include the opening <![CDATA[
     *     // nor closing ]]>
     *     //
     *     // This is a closure, so code here may reference
     *     // variables which are outside of this scope
     * };
     * --------------
     */
    void onCData(Handler handler) { cdataHandler = handler; }

    /**
     * Register a handler which will be called whenever a comment is
     * encountered.
     *
     * Examples:
     * --------------
     * // Call this function whenever a comment is encountered
     * onComment = (string s)
     * {
     *     // Your code here
     *
     *     // The passed parameter s does not include the opening <!-- nor
     *     // closing -->
     *     //
     *     // This is a closure, so code here may reference
     *     // variables which are outside of this scope
     * };
     * --------------
     */
    void onComment(Handler handler) { commentHandler = handler; }

    /**
     * Register a handler which will be called whenever a processing
     * instruction is encountered.
     *
     * Examples:
     * --------------
     * // Call this function whenever a processing instruction is encountered
     * onPI = (string s)
     * {
     *     // Your code here
     *
     *     // The passed parameter s does not include the opening <? nor
     *     // closing ?>
     *     //
     *     // This is a closure, so code here may reference
     *     // variables which are outside of this scope
     * };
     * --------------
     */
    void onPI(Handler handler) { piHandler = handler; }

    /**
     * Register a handler which will be called whenever an XML instruction is
     * encountered.
     *
     * Examples:
     * --------------
     * // Call this function whenever an XML instruction is encountered
     * // (Note: XML instructions may only occur preceding the root tag of a
     * // document).
     * onXI = (string s)
     * {
     *     // Your code here
     *
     *     // The passed parameter s does not include the opening <! nor
     *     // closing >
     *     //
     *     // This is a closure, so code here may reference
     *     // variables which are outside of this scope
     * };
     * --------------
     */
    void onXI(Handler handler) { xiHandler = handler; }

    /**
     * Parse an XML element.
     *
     * Parsing will continue until the end of the current element. Any items
     * encountered for which a handler has been registered will invoke that
     * handler.
     *
     * Throws: various kinds of XMLException
     */
    void parse()
    {
        string t;
        Tag root = tag_;
        Tag[string] startTags;  // most recent start tag seen for each name
        if (tag_ !is null) startTags[tag_.name] = tag_;

        while(s.length != 0)
        {
            if (startsWith(*s,"<!--"))
            {
                chop(*s,4);
                t = chop(*s,indexOf(*s,"-->"));
                if (commentHandler.funcptr !is null) commentHandler(t);
                chop(*s,3);
            }
            else if (startsWith(*s,"<![CDATA["))
            {
                chop(*s,9);
                t = chop(*s,indexOf(*s,"]]>"));
                if (cdataHandler.funcptr !is null) cdataHandler(t);
                chop(*s,3);
            }
            else if (startsWith(*s,"<!"))
            {
                chop(*s,2);
                t = chop(*s,indexOf(*s,">"));
                if (xiHandler.funcptr !is null) xiHandler(t);
                chop(*s,1);
            }
            else if (startsWith(*s,"<?"))
            {
                chop(*s,2);
                t = chop(*s,indexOf(*s,"?>"));
                if (piHandler.funcptr !is null) piHandler(t);
                chop(*s,2);
            }
            else if (startsWith(*s,"<"))
            {
                tag_ = new Tag(*s,true);
                if (root is null)
                    return; // Return to constructor of derived class

                if (tag_.isStart)
                {
                    startTags[tag_.name] = tag_;

                    auto parser = new ElementParser(this);
                    fireStartTag(tag_.name, parser);
                }
                else if (tag_.isEnd)
                {
                    auto startTag = startTags[tag_.name];
                    string text;

                    // The element's raw content is whatever lies between the
                    // end of the start tag and the beginning of the end tag
                    // in the underlying document text.
                    char* p = startTag.tagString.ptr
                        + startTag.tagString.length;
                    char* q = tag_.tagString.ptr;
                    text = p[0..(q-p)];

                    auto element = new Element(startTag);
                    if (text.length != 0) element ~= new Text(text);

                    fireEndTag(tag_.name, element);

                    if (tag_.name == root.name) return;
                }
                else if (tag_.isEmpty)
                {
                    Tag startTag = new Tag(tag_.name);

                    // Carry the attributes over to the pretend start tag so
                    // that handlers see them for <x a="1"/> exactly as they
                    // would for <x a="1"></x>.
                    foreach (attrName, attrValue; tag_.attr)
                        startTag.attr[attrName] = attrValue;

                    // Handle the pretend start tag
                    string s2;
                    auto parser = new ElementParser(startTag,&s2);
                    fireStartTag(startTag.name, parser);

                    // Handle the pretend end tag
                    auto element = new Element(startTag);
                    fireEndTag(tag_.name, element);
                }
            }
            else
            {
                t = chop(*s,indexOf(*s,"<"));
                if (rawTextHandler.funcptr !is null)
                    rawTextHandler(t);
                else if (textHandler.funcptr !is null)
                    textHandler(decode(t,DecodeMode.LOOSE));
            }
        }
    }

    /**
     * Returns that part of the element which has already been parsed
     */
    override string toString()
    {
        // Everything consumed since construction: the text present at
        // construction time minus what is still left to parse.
        int n = elementStart.length - s.length;
        return elementStart[0..n];
    }

}
2123
2124	private
2125	{
/*
 * Mixin giving a checker function uniform failure handling.
 *
 * It must be mixed into a scope that has a variable named s (the text
 * being checked). It records the value of s at the point of the mixin in
 * old, and provides three fail overloads, each of which restores s to old
 * and throws an Err labelled with msg.
 */
template Check(string msg)
{
    string old = s;

    // Fail with no further detail
    void fail()
    {
        s = old;
        throw new Err(s,msg);
    }

    // Fail, chaining the error e that caused the failure
    void fail(Err e)
    {
        s = old;
        throw new Err(s,msg,e);
    }

    // Fail with an explanatory message. The inner error is created before
    // s is restored, so it records the position at which the problem was
    // detected.
    void fail(string msg2)
    {
        Err detail = new Err(s,msg2);
        fail(detail);
    }
}
2147
/*
 * Checks a single Misc item at the front of s: a comment if s begins
 * with "<!--", a processing instruction if it begins with "<?", and
 * whitespace otherwise.
 *
 * Throws: Err labelled "Misc", with s restored, if the chosen check fails
 */
void checkMisc(ref string s) // rule 27
{
    mixin Check!("Misc");

    try
    {
        if (s.startsWith("<!--"))
        {
            checkComment(s);
        }
        else if (s.startsWith("<?"))
        {
            checkPI(s);
        }
        else
        {
            checkSpace(s);
        }
    }
    catch (Err cause)
    {
        fail(cause);
    }
}
2160
/*
 * Checks a whole document: a prolog, then one element, then any number of
 * Misc items.
 *
 * Throws: Err labelled "Document", with s restored, if the prolog or the
 * element fails its check
 */
void checkDocument(ref string s) // rule 1
{
    mixin Check!("Document");

    try
    {
        checkProlog(s);
        checkElement(s);
        star!(checkMisc)(s);
    }
    catch (Err cause)
    {
        fail(cause);
    }
}
2172
/*
 * Checks that every code point in s satisfies isChar.
 *
 * Throws: Err labelled "Chars" on the first offending code point. The
 * chained inner error is positioned at that code point; s itself is
 * restored to its value on entry.
 */
void checkChars(ref string s) // rule 2
{
    // TO DO - Fix std.utf stride and decode functions, then use those
    // instead

    mixin Check!("Chars");

    dchar offender;
    int offset = -1;    // index of the first illegal code point, or -1
    foreach (int i, dchar d; s)
    {
        if (isChar(d)) continue;
        offender = d;
        offset = i;
        break;
    }

    if (offset == -1) return;

    // Move s to the bad character so the error reports its position
    s = s[offset..$];
    fail(format("invalid character: U+%04X",offender));
}
2197
/*
 * Consumes a run of space, tab, line feed and carriage return characters
 * from the front of s.
 *
 * Throws: Err labelled "Whitespace" if s was left unchanged
 */
void checkSpace(ref string s) // rule 3
{
    mixin Check!("Whitespace");

    munch(s,"\u0020\u0009\u000A\u000D");

    // Identity comparison: s still being the original slice means nothing
    // was removed
    if (s is old)
    {
        fail();
    }
}
2204
/*
 * Consumes a Name from the front of s.
 *
 * The first character must be '_', ':' or a letter; later characters may
 * also be '-', '.', digits, combining characters or extenders.
 *
 * Params:
 *      s    = text to check; advanced past the name on success
 *      name = receives the name that was consumed
 *
 * Throws: Err labelled "Name", with s restored, if s is empty or does not
 * begin with a legal first character
 */
void checkName(ref string s, out string name) // rule 5
{
    mixin Check!("Name");

    if (s.length == 0) fail();

    // Start from "the name extends to the end of the input"; the loop
    // shortens this when it meets a character that cannot be part of a
    // name. (Starting from 0 would yield an empty name whenever the name
    // is the last thing in s.)
    int n = s.length;
    foreach(int i,dchar c;s)
    {
        if (c == '_' || c == ':' || isLetter(c)) continue;
        if (i == 0) fail();
        if (c == '-' || c == '.' || isDigit(c)
            || isCombiningChar(c) || isExtender(c)) continue;
        n = i;
        break;
    }
    name = s[0..n];
    s = s[n..$];
}
2223
/*
 * Consumes a quoted attribute value from the front of s.
 *
 * The value must open with a double or single quote and close with the
 * same quote. It may not contain '<', and every '&' must begin a valid
 * reference.
 *
 * Throws: Err labelled "AttValue", with s restored, on any violation
 */
void checkAttValue(ref string s) // rule 10
{
    mixin Check!("AttValue");

    if (s.length == 0) fail();

    char quote = s[0];
    if (quote != '\u0022' && quote != '\u0027')
        fail("attribute value requires quotes");
    s = s[1..$];

    while (true)
    {
        // Skip everything up to the next '<', '&' or closing quote
        munch(s,"^<&"~quote);

        if (s.length == 0) fail("unterminated attribute value");
        if (s[0] == '<') fail("< found in attribute value");
        if (s[0] == quote) break;

        // Only '&' can be left at this point
        try { checkReference(s); } catch (Err cause) { fail(cause); }
    }

    // Drop the closing quote
    s = s[1..$];
}
2243
/*
 * Consumes character data from the front of s, stopping (without error)
 * at the first '&' or '<' or at the end of the input.
 *
 * Throws: Err labelled "CharData", with s restored, if "]]>" occurs in
 * the data
 */
void checkCharData(ref string s) // rule 14
{
    mixin Check!("CharData");

    while (s.length != 0)
    {
        if (s.startsWith("&") || s.startsWith("<")) break;
        if (s.startsWith("]]>")) fail("]]> found within char data");

        // Advance one code unit at a time
        s = s[1..$];
    }
}
2256
/*
 * Consumes a comment from the front of s.
 *
 * The comment must open with "<!--", and the first "--" after that must
 * be the start of the closing "-->".
 *
 * Throws: Err labelled "Comment", with s restored, if the opening is
 * missing, no "--" follows it, or the first "--" is not part of "-->"
 */
void checkComment(ref string s) // rule 15
{
    mixin Check!("Comment");

    try { checkLiteral("<!--",s); } catch(Err e) { fail(e); }

    int n = s.indexOf("--");
    if (n == -1) fail("unterminated comment");

    // Skip the comment body so that s starts at the "--". Keeping the body
    // (s[0..n]) instead would make the terminator check below fail for
    // every comment.
    s = s[n..$];

    try { checkLiteral("-->",s); } catch(Err e) { fail(e); }
}
2267
/*
 * Consumes a processing instruction from the front of s: the literal
 * "<?" followed by everything up to and including the next "?>".
 *
 * Throws: Err labelled "PI", with s restored, if either part is missing
 */
void checkPI(ref string s) // rule 16
{
    mixin Check!("PI");

    try
    {
        checkLiteral("<?",s);
        checkEnd("?>",s);
    }
    catch (Err cause)
    {
        fail(cause);
    }
}
2279
/*
 * Consumes a CDATA section from the front of s: the opening marker held
 * in cdata followed by everything up to and including the next "]]>".
 *
 * Throws: Err labelled "CDSect", with s restored, if either part is
 * missing
 */
void checkCDSect(ref string s) // rule 18
{
    mixin Check!("CDSect");

    try
    {
        checkLiteral(cdata,s);
        checkEnd("]]>",s);
    }
    catch (Err cause)
    {
        fail(cause);
    }
}
2291
/*
 * Consumes the document prolog from the front of s: an optional XML
 * declaration, any number of Misc items, and optionally a document type
 * declaration followed by more Misc items.
 *
 * Every part of the prolog is optional in the XML 1.0 grammar
 * (prolog ::= XMLDecl? Misc* (doctypedecl Misc*)?), so a document without
 * an XML declaration is accepted.
 *
 * Throws: Err labelled "Prolog", with s restored, if a sub-check raises an
 * error that is not absorbed by opt or star
 */
void checkProlog(ref string s) // rule 22
{
    mixin Check!("Prolog");

    try
    {
        opt!(checkXMLDecl)(s);
        star!(checkMisc)(s);
        opt!(seq!(checkDocTypeDecl,star!(checkMisc)))(s);
    }
    catch(Err e) { fail(e); }
}
2304
/*
 * Consumes an XML declaration from the front of s: "<?xml", the version
 * info, then optionally an encoding declaration, a standalone declaration
 * and whitespace, and finally "?>".
 *
 * Throws: Err labelled "XMLDecl", with s restored, if a mandatory part is
 * missing or malformed
 */
void checkXMLDecl(ref string s) // rule 23
{
    mixin Check!("XMLDecl");

    try
    {
        checkLiteral("<?xml",s);
        checkVersionInfo(s);

        // Optional parts
        opt!(checkEncodingDecl)(s);
        opt!(checkSDDecl)(s);
        opt!(checkSpace)(s);

        checkLiteral("?>",s);
    }
    catch (Err cause)
    {
        fail(cause);
    }
}
2320
/*
 * Consumes the version info of an XML declaration from the front of s:
 * whitespace, the word "version", an equals sign and a quoted version
 * number.
 *
 * Throws: Err labelled "VersionInfo", with s restored, if any part is
 * missing or malformed
 */
void checkVersionInfo(ref string s) // rule 24
{
    mixin Check!("VersionInfo");

    try
    {
        checkSpace(s);
        checkLiteral("version",s);
        checkEq(s);
        quoted!(checkVersionNum)(s);
    }
    catch (Err cause)
    {
        fail(cause);
    }
}
2334
/*
 * Consumes an equals sign from the front of s, together with any
 * whitespace on either side of it.
 *
 * Throws: Err labelled "Eq", with s restored, if the "=" is missing
 */
void checkEq(ref string s) // rule 25
{
    mixin Check!("Eq");

    try
    {
        opt!(checkSpace)(s);
        checkLiteral("=",s);
        opt!(checkSpace)(s);
    }
    catch (Err cause)
    {
        fail(cause);
    }
}
2347
/*
 * Consumes a version number from the front of s: a run of letters, digits
 * and the characters '_', '.', ':' and '-'.
 *
 * Throws: Err labelled "VersionNum" if s was left unchanged
 */
void checkVersionNum(ref string s) // rule 26
{
    mixin Check!("VersionNum");

    munch(s,"a-zA-Z0-9_.:-");

    // Identity comparison: s still being the original slice means nothing
    // was removed
    if (s is old)
    {
        fail();
    }
}
2355
/*
 * Consumes a document type declaration from the front of s: the literal
 * "<!DOCTYPE" followed by everything up to and including the next ">".
 * The interior of the declaration is not examined.
 *
 * Throws: Err labelled "DocTypeDecl", with s restored, if either part is
 * missing
 */
void checkDocTypeDecl(ref string s) // rule 28
{
    mixin Check!("DocTypeDecl");

    try
    {
        checkLiteral("<!DOCTYPE",s);

        //
        // TO DO -- ensure DOCTYPE is well formed
        // (But not yet. That's one of our "future directions")
        //

        checkEnd(">",s);
    }
    catch (Err cause)
    {
        fail(cause);
    }
}
2371
/*
 * Consumes a standalone declaration from the front of s: whitespace, the
 * word "standalone", an equals sign, and the value yes or no in single or
 * double quotes.
 *
 * Throws: Err labelled "SDDecl", with s restored, if any part is missing
 * or the value is not one of the four accepted spellings
 */
void checkSDDecl(ref string s) // rule 32
{
    mixin Check!("SDDecl");

    try
    {
        checkSpace(s);
        checkLiteral("standalone",s);
        checkEq(s);
    }
    catch (Err cause)
    {
        fail(cause);
    }

    // Length of the quoted value, including both quotes
    int consumed = 0;
    if (s.startsWith("'yes'") || s.startsWith("\"yes\""))
    {
        consumed = 5;
    }
    else if (s.startsWith("'no'") || s.startsWith("\"no\""))
    {
        consumed = 4;
    }
    else
    {
        fail("standalone attribute value must be 'yes', \"yes\","
            " 'no' or \"no\"");
    }
    s = s[consumed..$];
}
2391
/*
 * Consumes one element from the front of s.
 *
 * A tag is checked first. If checkTag reports it as "STag", the element's
 * content and a closing tag must follow, and the closing tag's name must
 * equal the opening tag's name. Any other tag kind completes the element
 * by itself.
 *
 * Throws: Err labelled "Element", with s restored, if any part fails or
 * the tag names differ
 */
void checkElement(ref string s) // rule 39
{
    mixin Check!("Element");

    string sname, ename;
    string kind;        // tag kind reported by checkTag
    string atEndTag;    // s as it was immediately before the end tag

    try { checkTag(s,kind,sname); } catch (Err cause) { fail(cause); }

    if (kind != "STag") return;

    try
    {
        checkContent(s);
        atEndTag = s;
        checkETag(s,ename);
    }
    catch (Err cause) { fail(cause); }

    if (sname != ename)
    {
        // Rewind to the end tag so the error is positioned there
        s = atEndTag;
        fail("end tag name \"" ~ ename
            ~ "\" differs from start tag name \""~sname~"\"");
    }
}
2417
/*
 * Consumes a start tag or an empty-element tag from the front of s:
 * "<", a name, any number of whitespace-separated attributes, optional
 * whitespace, an optional "/" and ">".
 *
 * Params:
 *      s    = text to check; advanced past the tag on success
 *      type = receives "ETag" if a "/" preceded the closing ">", and
 *             "STag" otherwise
 *      name = receives the tag name
 *
 * Throws: Err labelled "Tag", with s restored, if the tag is malformed
 */
// rules 40 and 44
void checkTag(ref string s, out string type, out string name)
{
    mixin Check!("Tag");

    try
    {
        type = "STag";
        checkLiteral("<",s);
        checkName(s,name);
        star!(seq!(checkSpace,checkAttribute))(s);
        opt!(checkSpace)(s);

        bool selfClosing = (s.length != 0 && s[0] == '/');
        if (selfClosing)
        {
            s = s[1..$];
            type = "ETag";
        }

        checkLiteral(">",s);
    }
    catch (Err cause)
    {
        fail(cause);
    }
}
2439
/*
 * Consumes one attribute from the front of s: a name, an equals sign and
 * a quoted value. The name is checked but not returned.
 *
 * Throws: Err labelled "Attribute", with s restored, if any part is
 * missing or malformed
 */
void checkAttribute(ref string s) // rule 41
{
    mixin Check!("Attribute");

    try
    {
        string ignoredName;
        checkName(s,ignoredName);
        checkEq(s);
        checkAttValue(s);
    }
    catch (Err cause)
    {
        fail(cause);
    }
}
2453
/*
 * Consumes an end tag from the front of s: "</", a name, optional
 * whitespace and ">".
 *
 * Params:
 *      s    = text to check; advanced past the tag on success
 *      name = receives the tag name
 *
 * Throws: Err labelled "ETag", with s restored, if the tag is malformed
 */
void checkETag(ref string s, out string name) // rule 42
{
    mixin Check!("ETag");

    try
    {
        checkLiteral("</",s);
        checkName(s,name);
        opt!(checkSpace)(s);
        checkLiteral(">",s);
    }
    catch (Err cause)
    {
        fail(cause);
    }
}
2467
/*
 * Consumes element content from the front of s: any mixture of
 * references, comments, processing instructions, CDATA sections, child
 * elements and character data. Stops, without consuming it, at the first
 * "</" or at the end of the input.
 *
 * Throws: Err labelled "Content" if an item fails its check; s is then
 * restored to the start of the item that failed
 */
void checkContent(ref string s) // rule 43
{
    mixin Check!("Content");

    try
    {
        while (s.length != 0)
        {
            // Remember where this item starts, so that a failure is
            // reported against the item rather than the whole content
            old = s;

            if (s.startsWith("&"))
            {
                checkReference(s);
            }
            else if (s.startsWith("<!--"))
            {
                checkComment(s);
            }
            else if (s.startsWith("<?"))
            {
                checkPI(s);
            }
            else if (s.startsWith(cdata))
            {
                checkCDSect(s);
            }
            else if (s.startsWith("</"))
            {
                break;
            }
            else if (s.startsWith("<"))
            {
                checkElement(s);
            }
            else
            {
                checkCharData(s);
            }
        }
    }
    catch (Err cause)
    {
        fail(cause);
    }
}
2488
/*
 * Consumes a numeric character reference from the front of s: "&#"
 * followed by decimal digits, or "&#x" followed by hexadecimal digits,
 * and a terminating ';'.
 *
 * Params:
 *      s = text to check; advanced past the reference on success
 *      c = receives the referenced code point
 *
 * Throws: Err labelled "CharRef", with s restored, if the reference is
 * malformed, has no digits, lacks the ';', or names a value that is above
 * U+10FFFF or rejected by isChar
 */
void checkCharRef(ref string s, out dchar c) // rule 66
{
    mixin Check!("CharRef");

    c = 0;
    try { checkLiteral("&#",s); } catch(Err e) { fail(e); }
    int radix = 10;
    if (s.length != 0 && s[0] == 'x')
    {
        s = s[1..$];
        radix = 16;
    }
    if (s.length == 0) fail("unterminated character reference");
    if (s[0] == ';')
        fail("character reference must have at least one digit");
    while (s.length != 0)
    {
        char d = s[0];

        // Value of d as a digit; 100 marks "not a digit in any radix"
        int n;
        if (d >= '0' && d <= '9') n = d - '0';
        else if (d >= 'a' && d <= 'f') n = d - 'a' + 10;
        else if (d >= 'A' && d <= 'F') n = d - 'A' + 10;
        else n = 100;

        if (n >= radix) break;
        c *= radix;
        c += n;

        // Stop as soon as the value leaves the Unicode range. Without
        // this, a long run of digits could wrap around the 32-bit
        // accumulator and come back as a value that looks legal.
        if (c > 0x10FFFF)
            fail(format("U+%04X is not a legal character",c));

        s = s[1..$];
    }
    if (!isChar(c)) fail(format("U+%04X is not a legal character",c));
    if (s.length == 0 || s[0] != ';') fail("expected ;");
    else s = s[1..$];
}
2537
/**
 * Checks a reference: a character reference when the input starts with
 * "&#", otherwise an entity reference.
 *
 * Params:
 *  s = remaining input; advanced past the reference on success
 */
void checkReference(ref string s) // rule 67
{
    mixin Check!("Reference");

    try
    {
        if (!s.startsWith("&#"))
        {
            checkEntityRef(s);
        }
        else
        {
            // The decoded code point is not needed here; only validity matters.
            dchar ignored;
            checkCharRef(s, ignored);
        }
    }
    catch (Err cause)
    {
        fail(cause);
    }
}
2550
/**
 * Checks an entity reference: "&", a name, then ";".
 *
 * Params:
 *  s = remaining input; advanced past the reference on success
 */
void checkEntityRef(ref string s) // rule 68
{
    mixin Check!("EntityRef");

    try
    {
        checkLiteral("&", s);

        // The entity name is parsed but not used any further here.
        string entityName;
        checkName(s, entityName);

        checkLiteral(";", s);
    }
    catch (Err cause)
    {
        fail(cause);
    }
}
2564
/**
 * Checks an encoding name: at least one ASCII letter, followed by any mix
 * of ASCII letters, digits, '_', '.' and '-'.
 *
 * Params:
 *  s = remaining input; advanced past the name on success
 */
void checkEncName(ref string s) // rule 81
{
    mixin Check!("EncName");

    // munch() strips the leading run of matching characters from s and
    // returns it; an empty run means the name does not start with a letter.
    string leadingLetters = munch(s, "a-zA-Z");
    if (leadingLetters.length == 0) fail();

    // The trailing '-' in the pattern is a literal hyphen, not a range.
    munch(s, "a-zA-Z0-9_.-");
}
2573
/**
 * Checks an encoding declaration: white space, the keyword "encoding",
 * an equals sign (via checkEq), then a quoted encoding name.
 *
 * Params:
 *  s = remaining input; advanced past the declaration on success
 */
void checkEncodingDecl(ref string s) // rule 80
{
    mixin Check!("EncodingDecl");

    try
    {
        checkSpace(s);                 // leading white space is mandatory
        checkLiteral("encoding", s);
        checkEq(s);
        quoted!(checkEncName)(s);      // single or double quotes
    }
    catch (Err cause)
    {
        fail(cause);
    }
}
2587
2588	// Helper functions
2589
/**
 * Requires the input to begin with an exact string and consumes it.
 *
 * Params:
 *  literal = text that must appear at the front of s
 *  s       = remaining input; advanced past literal on success
 */
void checkLiteral(string literal,ref string s)
{
    mixin Check!("Literal");

    bool matches = s.startsWith(literal);
    if (!matches) fail("Expected literal \""~literal~"\"");

    s = s[literal.length..$];
}
2597
/**
 * Skips forward to the first occurrence of a terminator and consumes it.
 *
 * Params:
 *  end = terminator to search for
 *  s   = remaining input; advanced past the terminator on success
 *
 * Throws: Err directly (not via fail) when the terminator is absent.
 */
void checkEnd(string end,ref string s)
{
    // Deliberately no mixin Check here.

    int pos = s.indexOf(end);
    if (pos == -1)
    {
        throw new Err(s,"Unable to find terminating \""~end~"\"");
    }

    // Drop everything before the terminator, then consume the terminator.
    s = s[pos..$];
    checkLiteral(end,s);
}
2607
2608	// Metafunctions -- none of these use mixin Check
2609
/**
 * Applies checker f once and ignores an Err it throws (zero or one match).
 *
 * NOTE(review): presumably f leaves s unchanged when it fails - confirm
 * against mixin Check, which is outside this excerpt.
 */
void opt(alias f)(ref string s)
{
    try
    {
        f(s);
    }
    catch (Err ignored)
    {
        // A failed optional match is not an error.
    }
}
2614
/**
 * Applies checker f one or more times.
 *
 * The first application is mandatory and its Err propagates. After that, f
 * is repeated until the input is empty or f throws an Err, which is
 * swallowed.
 */
void plus(alias f)(ref string s)
{
    // Mandatory first match.
    f(s);

    // Optional further matches, the same loop that star!(f) performs.
    while (s.length != 0)
    {
        try { f(s); }
        catch (Err stop) { return; }
    }
}
2620
/**
 * Applies checker f zero or more times, stopping when the input is
 * exhausted or f throws an Err, which is swallowed.
 */
void star(alias f)(ref string s)
{
    for (;;)
    {
        if (s.length == 0) return;

        try
        {
            f(s);
        }
        catch (Err stop)
        {
            return;
        }
    }
}
2629
/**
 * Applies checker f to text enclosed in matching quotes.
 *
 * If the input starts with an apostrophe, both delimiters must be
 * apostrophes. Otherwise both must be double quotes.
 */
void quoted(alias f)(ref string s)
{
    // Pick the delimiter once; the same one must open and close.
    string delimiter = s.startsWith("'") ? "'" : "\"";

    checkLiteral(delimiter, s);
    f(s);
    checkLiteral(delimiter, s);
}
2645
/**
 * Applies checker f and then checker g to the same input, in that order.
 * Nothing is caught here: an error from either one propagates.
 */
void seq(alias f,alias g)(ref string s)
{
    f(s);   // first component
    g(s);   // second component, starting where f stopped
}
2651	}
2652
/**
 * Check an entire XML document for well-formedness
 *
 * Params:
 * s = the document to be checked, passed as a string
 *
 * Throws: CheckException if the document is not well formed
 *
 * CheckException's toString() method will yield the complete hierarchy of
 * parse failure (the XML equivalent of a stack trace), giving the line and
 * column number of every failure at every level.
 */
void check(string s)
{
    // The checkers take s by reference and advance it, so keep the whole
    // document separately. Line and column numbers are computed from the
    // complete text, and without this copy a "Junk found after document"
    // error was always reported at line 1, column 1.
    string entire = s;

    try
    {
        checkChars(s);
        checkDocument(s);
        if (s.length != 0) throw new Err(s,"Junk found after document");
    }
    catch(Err e)
    {
        e.complete(entire);
        throw e;
    }
}
2679
// Feeds check() a catalog whose second <book> closes <genre> with
// </genres>, and expects the resulting CheckException to name both tags.
unittest
{
    string malformed = `<?xml version="1.0"?>
<catalog>
<book id="bk101">
<author>Gambardella, Matthew</author>
<title>XML Developer's Guide</title>
<genre>Computer</genre>
<price>44.95</price>
<publish_date>2000-10-01</publish_date>
<description>An in-depth look at creating applications
with XML.</description>
</book>
<book id="bk102">
<author>Ralls, Kim</author>
<title>Midnight Rain</title>
<genre>Fantasy</genres>
<price>5.95</price>
<publish_date>2000-12-16</publish_date>
<description>A former architect battles corporate zombies,
an evil sorceress, and her own childhood to become queen
of the world.</description>
</book>
<book id="bk103">
<author>Corets, Eva</author>
<title>Maeve Ascendant</title>
<genre>Fantasy</genre>
<price>5.95</price>
<publish_date>2000-11-17</publish_date>
<description>After the collapse of a nanotechnology
society in England, the young survivors lay the
foundation for a new society.</description>
</book>
</catalog>`;

    string expected = "end tag name \"genres\" differs"
        ~ " from start tag name \"genre\"";

    try
    {
        check(malformed);
        // Reaching this line means the malformed document was accepted.
        assert(false);
    }
    catch (CheckException e)
    {
        int found = e.toString().indexOf(expected);
        assert(found != -1);
    }
}
2725
/** The base class for exceptions thrown by this module */
class XMLException : Exception
{
    /// Constructs the exception with the given message.
    this(string msg)
    {
        super(msg);
    }
}
2728
2729	// Other exceptions
2730
/// Thrown during Comment constructor
class CommentException : XMLException
{
    // Private constructor: only code in this module can create instances.
    private this(string msg)
    {
        super(msg);
    }
}
2734
/// Thrown during CData constructor
class CDataException : XMLException
{
    // Private constructor: only code in this module can create instances.
    private this(string msg)
    {
        super(msg);
    }
}
2738
/// Thrown during XMLInstruction constructor
class XIException : XMLException
{
    // Private constructor: only code in this module can create instances.
    private this(string msg)
    {
        super(msg);
    }
}
2742
/// Thrown during ProcessingInstruction constructor
class PIException : XMLException
{
    // Private constructor: only code in this module can create instances.
    private this(string msg)
    {
        super(msg);
    }
}
2746
/// Thrown during Text constructor
class TextException : XMLException
{
    // Private constructor: only code in this module can create instances.
    private this(string msg)
    {
        super(msg);
    }
}
2750
/// Thrown during decode()
class DecodeException : XMLException
{
    // Private constructor: only code in this module can create instances.
    private this(string msg)
    {
        super(msg);
    }
}
2754
/// Thrown if comparing with wrong type
class InvalidTypeException : XMLException
{
    // Private constructor: only code in this module can create instances.
    private this(string msg)
    {
        super(msg);
    }
}
2758
/// Thrown when parsing for Tags
class TagException : XMLException
{
    // Private constructor: only code in this module can create instances.
    private this(string msg)
    {
        super(msg);
    }
}
2762
/**
 * Thrown during check()
 *
 * Instances form a chain through err. toString() prints the chained
 * exception's text first, followed by this one's.
 */
class CheckException : XMLException
{
    CheckException err; /// Parent in hierarchy
    private string tail; // unparsed remainder of the input at the failure
    /**
     * Name of production rule which failed to parse,
     * or specific error message
     */
    string msg;
    uint line = 0; /// Line number at which parse failure occurred
    uint column = 0; /// Column number at which parse failure occurred

    /*
     * tail = input remaining at the point of failure
     * msg  = rule name or error text
     * err  = chained exception, if any
     */
    private this(string tail,string msg,Err err=null)
    {
        super(null);
        this.err = err;
        this.msg = msg;
        this.tail = tail;
    }

    /*
     * Fills in line and column for this exception and for every exception
     * chained through err. `entire` is the full text; the failure position
     * is taken to be entire.length - tail.length.
     */
    private void complete(string entire)
    {
        for (CheckException link = this; link !is null; link = link.err)
        {
            string consumed = entire[0..$-link.tail.length];

            // Index of the first character of the line holding the failure.
            int lineStart = consumed.lastIndexOf('\n') + 1;
            link.line = consumed.count("\n") + 1;

            // Transcode to dstring so the column is a count of dchar
            // elements rather than of UTF-8 code units.
            dstring currentLine;
            transcode(consumed[lineStart..$],currentLine);
            link.column = currentLine.length + 1;
        }
    }

    /**
     * Returns one line of text per exception in the chain. The
     * "Line %d, column %d: " prefix is present only when line is non-zero.
     */
    override string toString()
    {
        string text;
        if (line != 0)
        {
            text = format("Line %d, column %d: ",line,column);
        }
        text ~= msg;
        text ~= '\n';

        if (err is null) return text;
        return err.toString ~ text;
    }
}
2807
2808	private alias CheckException Err;
2809
2810	// Private helper functions
2811
2812	private
2813	{
/*
 * Casts o to T and returns the result.
 *
 * Throws: InvalidTypeException when the cast yields null.
 */
T toType(T)(Object o)
{
    if (auto converted = cast(T)(o))
    {
        return converted;
    }

    throw new InvalidTypeException("Attempt to compare a "
        ~ T.stringof ~ " with an instance of another type");
}
2824
/*
 * Removes the first n characters of s and returns them.
 * Passing n == -1 removes and returns all of s.
 */
string chop(ref string s, int n)
{
    // -1 is the "take everything" sentinel.
    size_t cut = (n == -1) ? s.length : n;

    string front = s[0..cut];
    s = s[cut..$];
    return front;
}
2832
/*
 * Consumes one leading character c from s if it is present.
 * Returns true when a character was consumed, false otherwise.
 */
bool optc(ref string s, char c)
{
    if (s.length == 0 || s[0] != c)
    {
        return false;
    }

    s = s[1..$];
    return true;
}
2839
/*
 * Consumes one leading character c from s.
 *
 * Throws: TagException (with an empty message) when s is empty or does
 * not start with c.
 */
void reqc(ref string s, char c)
{
    bool present = s.length != 0 && s[0] == c;
    if (!present) throw new TagException("");

    s = s[1..$];
}
2845
/*
 * Folds the characters of s into a hash value, starting from h.
 * For each dchar decoded from s the value is multiplied by 11 and the
 * character is added.
 */
hash_t hash(string s,hash_t h=0)
{
    hash_t acc = h;
    foreach (dchar codePoint; s)
    {
        acc = acc * 11 + codePoint;
    }
    return acc;
}
2851
2852	// Definitions from the XML specification
2853	dchar[] CharTable=[0x9,0x9,0xA,0xA,0xD,0xD,0x20,0xD7FF,0xE000,0xFFFD,
2854	0x10000,0x10FFFF];
2855	dchar[] BaseCharTable=[0x0041,0x005A,0x0061,0x007A,0x00C0,0x00D6,0x00D8,
2856	0x00F6,0x00F8,0x00FF,0x0100,0x0131,0x0134,0x013E,0x0141,0x0148,0x014A,
2857	0x017E,0x0180,0x01C3,0x01CD,0x01F0,0x01F4,0x01F5,0x01FA,0x0217,0x0250,
2858	0x02A8,0x02BB,0x02C1,0x0386,0x0386,0x0388,0x038A,0x038C,0x038C,0x038E,
2859	0x03A1,0x03A3,0x03CE,0x03D0,0x03D6,0x03DA,0x03DA,0x03DC,0x03DC,0x03DE,
2860	0x03DE,0x03E0,0x03E0,0x03E2,0x03F3,0x0401,0x040C,0x040E,0x044F,0x0451,
2861	0x045C,0x045E,0x0481,0x0490,0x04C4,0x04C7,0x04C8,0x04CB,0x04CC,0x04D0,
2862	0x04EB,0x04EE,0x04F5,0x04F8,0x04F9,0x0531,0x0556,0x0559,0x0559,0x0561,
2863	0x0586,0x05D0,0x05EA,0x05F0,0x05F2,0x0621,0x063A,0x0641,0x064A,0x0671,
2864	0x06B7,0x06BA,0x06BE,0x06C0,0x06CE,0x06D0,0x06D3,0x06D5,0x06D5,0x06E5,
2865	0x06E6,0x0905,0x0939,0x093D,0x093D,0x0958,0x0961,0x0985,0x098C,0x098F,
2866	0x0990,0x0993,0x09A8,0x09AA,0x09B0,0x09B2,0x09B2,0x09B6,0x09B9,0x09DC,
2867	0x09DD,0x09DF,0x09E1,0x09F0,0x09F1,0x0A05,0x0A0A,0x0A0F,0x0A10,0x0A13,
2868	0x0A28,0x0A2A,0x0A30,0x0A32,0x0A33,0x0A35,0x0A36,0x0A38,0x0A39,0x0A59,
2869	0x0A5C,0x0A5E,0x0A5E,0x0A72,0x0A74,0x0A85,0x0A8B,0x0A8D,0x0A8D,0x0A8F,
2870	0x0A91,0x0A93,0x0AA8,0x0AAA,0x0AB0,0x0AB2,0x0AB3,0x0AB5,0x0AB9,0x0ABD,
2871	0x0ABD,0x0AE0,0x0AE0,0x0B05,0x0B0C,0x0B0F,0x0B10,0x0B13,0x0B28,0x0B2A,
2872	0x0B30,0x0B32,0x0B33,0x0B36,0x0B39,0x0B3D,0x0B3D,0x0B5C,0x0B5D,0x0B5F,
2873	0x0B61,0x0B85,0x0B8A,0x0B8E,0x0B90,0x0B92,0x0B95,0x0B99,0x0B9A,0x0B9C,
2874	0x0B9C,0x0B9E,0x0B9F,0x0BA3,0x0BA4,0x0BA8,0x0BAA,0x0BAE,0x0BB5,0x0BB7,
2875	0x0BB9,0x0C05,0x0C0C,0x0C0E,0x0C10,0x0C12,0x0C28,0x0C2A,0x0C33,0x0C35,
2876	0x0C39,0x0C60,0x0C61,0x0C85,0x0C8C,0x0C8E,0x0C90,0x0C92,0x0CA8,0x0CAA,
2877	0x0CB3,0x0CB5,0x0CB9,0x0CDE,0x0CDE,0x0CE0,0x0CE1,0x0D05,0x0D0C,0x0D0E,
2878	0x0D10,0x0D12,0x0D28,0x0D2A,0x0D39,0x0D60,0x0D61,0x0E01,0x0E2E,0x0E30,
2879	0x0E30,0x0E32,0x0E33,0x0E40,0x0E45,0x0E81,0x0E82,0x0E84,0x0E84,0x0E87,
2880	0x0E88,0x0E8A,0x0E8A,0x0E8D,0x0E8D,0x0E94,0x0E97,0x0E99,0x0E9F,0x0EA1,
2881	0x0EA3,0x0EA5,0x0EA5,0x0EA7,0x0EA7,0x0EAA,0x0EAB,0x0EAD,0x0EAE,0x0EB0,
2882	0x0EB0,0x0EB2,0x0EB3,0x0EBD,0x0EBD,0x0EC0,0x0EC4,0x0F40,0x0F47,0x0F49,
2883	0x0F69,0x10A0,0x10C5,0x10D0,0x10F6,0x1100,0x1100,0x1102,0x1103,0x1105,
2884	0x1107,0x1109,0x1109,0x110B,0x110C,0x110E,0x1112,0x113C,0x113C,0x113E,
2885	0x113E,0x1140,0x1140,0x114C,0x114C,0x114E,0x114E,0x1150,0x1150,0x1154,
2886	0x1155,0x1159,0x1159,0x115F,0x1161,0x1163,0x1163,0x1165,0x1165,0x1167,
2887	0x1167,0x1169,0x1169,0x116D,0x116E,0x1172,0x1173,0x1175,0x1175,0x119E,
2888	0x119E,0x11A8,0x11A8,0x11AB,0x11AB,0x11AE,0x11AF,0x11B7,0x11B8,0x11BA,
2889	0x11BA,0x11BC,0x11C2,0x11EB,0x11EB,0x11F0,0x11F0,0x11F9,0x11F9,0x1E00,
2890	0x1E9B,0x1EA0,0x1EF9,0x1F00,0x1F15,0x1F18,0x1F1D,0x1F20,0x1F45,0x1F48,
2891	0x1F4D,0x1F50,0x1F57,0x1F59,0x1F59,0x1F5B,0x1F5B,0x1F5D,0x1F5D,0x1F5F,
2892	0x1F7D,0x1F80,0x1FB4,0x1FB6,0x1FBC,0x1FBE,0x1FBE,0x1FC2,0x1FC4,0x1FC6,
2893	0x1FCC,0x1FD0,0x1FD3,0x1FD6,0x1FDB,0x1FE0,0x1FEC,0x1FF2,0x1FF4,0x1FF6,
2894	0x1FFC,0x2126,0x2126,0x212A,0x212B,0x212E,0x212E,0x2180,0x2182,0x3041,
2895	0x3094,0x30A1,0x30FA,0x3105,0x312C,0xAC00,0xD7A3];
// Pairs of inclusive [low, high] bounds. The pairs must be in ascending
// order because lookup() binary-searches them. With 0x4E00-0x9FA5 listed
// first, the search never reached that range, so no CJK ideograph in
// U+4E00..U+9FA5 was ever found.
dchar[] IdeographicTable=[0x3007,0x3007,0x3021,0x3029,0x4E00,0x9FA5];
2897	dchar[] CombiningCharTable=[0x0300,0x0345,0x0360,0x0361,0x0483,0x0486,
2898	0x0591,0x05A1,0x05A3,0x05B9,0x05BB,0x05BD,0x05BF,0x05BF,0x05C1,0x05C2,
2899	0x05C4,0x05C4,0x064B,0x0652,0x0670,0x0670,0x06D6,0x06DC,0x06DD,0x06DF,
2900	0x06E0,0x06E4,0x06E7,0x06E8,0x06EA,0x06ED,0x0901,0x0903,0x093C,0x093C,
2901	0x093E,0x094C,0x094D,0x094D,0x0951,0x0954,0x0962,0x0963,0x0981,0x0983,
2902	0x09BC,0x09BC,0x09BE,0x09BE,0x09BF,0x09BF,0x09C0,0x09C4,0x09C7,0x09C8,
2903	0x09CB,0x09CD,0x09D7,0x09D7,0x09E2,0x09E3,0x0A02,0x0A02,0x0A3C,0x0A3C,
2904	0x0A3E,0x0A3E,0x0A3F,0x0A3F,0x0A40,0x0A42,0x0A47,0x0A48,0x0A4B,0x0A4D,
2905	0x0A70,0x0A71,0x0A81,0x0A83,0x0ABC,0x0ABC,0x0ABE,0x0AC5,0x0AC7,0x0AC9,
2906	0x0ACB,0x0ACD,0x0B01,0x0B03,0x0B3C,0x0B3C,0x0B3E,0x0B43,0x0B47,0x0B48,
2907	0x0B4B,0x0B4D,0x0B56,0x0B57,0x0B82,0x0B83,0x0BBE,0x0BC2,0x0BC6,0x0BC8,
2908	0x0BCA,0x0BCD,0x0BD7,0x0BD7,0x0C01,0x0C03,0x0C3E,0x0C44,0x0C46,0x0C48,
2909	0x0C4A,0x0C4D,0x0C55,0x0C56,0x0C82,0x0C83,0x0CBE,0x0CC4,0x0CC6,0x0CC8,
2910	0x0CCA,0x0CCD,0x0CD5,0x0CD6,0x0D02,0x0D03,0x0D3E,0x0D43,0x0D46,0x0D48,
2911	0x0D4A,0x0D4D,0x0D57,0x0D57,0x0E31,0x0E31,0x0E34,0x0E3A,0x0E47,0x0E4E,
2912	0x0EB1,0x0EB1,0x0EB4,0x0EB9,0x0EBB,0x0EBC,0x0EC8,0x0ECD,0x0F18,0x0F19,
2913	0x0F35,0x0F35,0x0F37,0x0F37,0x0F39,0x0F39,0x0F3E,0x0F3E,0x0F3F,0x0F3F,
2914	0x0F71,0x0F84,0x0F86,0x0F8B,0x0F90,0x0F95,0x0F97,0x0F97,0x0F99,0x0FAD,
2915	0x0FB1,0x0FB7,0x0FB9,0x0FB9,0x20D0,0x20DC,0x20E1,0x20E1,0x302A,0x302F,
2916	0x3099,0x3099,0x309A,0x309A];
2917	dchar[] DigitTable=[0x0030,0x0039,0x0660,0x0669,0x06F0,0x06F9,0x0966,
2918	0x096F,0x09E6,0x09EF,0x0A66,0x0A6F,0x0AE6,0x0AEF,0x0B66,0x0B6F,0x0BE7,
2919	0x0BEF,0x0C66,0x0C6F,0x0CE6,0x0CEF,0x0D66,0x0D6F,0x0E50,0x0E59,0x0ED0,
2920	0x0ED9,0x0F20,0x0F29];
2921	dchar[] ExtenderTable=[0x00B7,0x00B7,0x02D0,0x02D0,0x02D1,0x02D1,0x0387,
2922	0x0387,0x0640,0x0640,0x0E46,0x0E46,0x0EC6,0x0EC6,0x3005,0x3005,0x3031,
2923	0x3035,0x309D,0x309E,0x30FC,0x30FE];
2924
/*
 * Tests whether c falls inside any range of a range table.
 *
 * table holds consecutive [low, high] pairs with inclusive bounds. The
 * search halves the table on every step, so the pairs must be sorted in
 * ascending order and must not overlap.
 */
bool lookup(dchar[] table, int c)
{
    size_t lo = 0;              // first element of the remaining window
    size_t hi = table.length;   // one past the last element of the window

    while (hi != lo)
    {
        // Middle of the window, rounded down to the start of a pair.
        size_t mid = lo + (((hi - lo) >> 1) & ~cast(size_t)1);

        if (c < table[mid])
        {
            hi = mid;           // continue in the pairs below
        }
        else if (c > table[mid+1])
        {
            lo = mid + 2;       // continue in the pairs above
        }
        else
        {
            return true;        // low <= c <= high
        }
    }
    return false;
}
2942
/*
 * Builds a short preview of s.
 *
 * Each char below 0x20 or above 0x7F is replaced by '.'. Once 40
 * characters have been produced, "___" is appended and the rest of s is
 * ignored.
 */
string startOf(string s)
{
    string preview;
    foreach (char ch; s)
    {
        bool keep = ch >= 0x20 && ch <= 0x7F;
        preview ~= keep ? ch : '.';

        if (preview.length >= 40)
        {
            preview ~= "___";
            break;
        }
    }
    return preview;
}
2953
/*
 * Throws an XMLException whose message is s (null by default).
 */
void exit(string s=null)
{
    auto failure = new XMLException(s);
    throw failure;
}
2958	}
2959
// Compiled only when the unittest_report version identifier is set:
// prints a line when this unittest block runs.
version (unittest_report)
{
    import std.stdio;

    unittest
    {
        writefln("unittest std2.xml passed");
    }
}

Download in other formats:

Original Format

std2

WikiStart: xml.d

Download in other formats: