WikiStart: encoding.d

File encoding.d, 71.6 kB (added by y0uf00bar, 15 years ago)
std2.encoding

Line
1	// Written in the D programming language.
2
3	/**
4	Classes and functions for handling and transcoding between various encodings.
5
6	For cases where the _encoding is known at compile-time, functions are provided
7	for arbitrary _encoding and decoding of characters, arbitrary transcoding
8	between strings of different type, as well as validation and sanitization.
9
10	Encodings currently supported are UTF-8, UTF-16, UTF-32, ASCII, ISO-8859-1
11	(also known as LATIN-1), and WINDOWS-1252.
12
13	$(UL
14	$(LI The type $(D AsciiChar) represents an ASCII character.)
15	$(LI The type $(D AsciiString) represents an ASCII string.)
16	$(LI The type $(D Latin1Char) represents an ISO-8859-1 character.)
17	$(LI The type $(D Latin1String) represents an ISO-8859-1 string.)
18	$(LI The type $(D Windows1252Char) represents a Windows-1252 character.)
19	$(LI The type $(D Windows1252String) represents a Windows-1252 string.))
20
21	For cases where the _encoding is not known at compile-time, but is
22	known at run-time, we provide the abstract class $(D EncodingScheme)
23	and its subclasses. To construct a run-time encoder/decoder, one does
24	e.g.
25
26	----------------------------------------------------
27	auto e = EncodingScheme.create("utf-8");
28	----------------------------------------------------
29
30	This library supplies $(D EncodingScheme) subclasses for ASCII,
31	ISO-8859-1 (also known as LATIN-1), WINDOWS-1252, UTF-8, and (on
32	little-endian architectures) UTF-16LE and UTF-32LE; or (on big-endian
33	architectures) UTF-16BE and UTF-32BE.
34
35	This library provides a mechanism whereby other modules may add $(D
36	EncodingScheme) subclasses for any other _encoding.
37
38	Authors: Janice Caron
39
40	Date: 2008.02.27 - 2008.05.07
41
42	License: Public Domain
43
44	Macros:
45	WIKI=Phobos/StdEncoding
46	*/
47
48	module std2.encoding;
49	import std.string;
50	import std.traits;
51
52
53	//=============================================================================
54
/**
Sentinel returned by $(D safeDecode) when the code units at the front of the
input do not form a valid sequence. All 32 bits are set, which is outside the
range accepted by $(D isValidCodePoint).
*/
const dchar INVALID_SEQUENCE = cast(dchar) 0xFFFF_FFFF;
57
// Encoding-independent plumbing shared by every EncoderInstance.
//
// The scope that mixes this template in must already provide the code unit
// type E and the per-encoding primitives encodeViaWrite, skipViaRead,
// decodeViaRead, safeDecodeViaRead and decodeReverseViaRead. Those primitives
// are written against three hooks -- read/peek/canRead and write -- which the
// small templates below bind to a concrete source or destination.
template EncoderFunctions()
{
    // ---------------------------------------------------------------------
    // Sources: pull code units out of a slice named s in the enclosing scope
    // ---------------------------------------------------------------------

    // Consumes from the front of s.
    template ReadFromString()
    {
        bool canRead()
        {
            return s.length != 0;
        }

        E peek()
        {
            return s[0];
        }

        E read()
        {
            E head = s[0];
            s = s[1 .. $];
            return head;
        }
    }

    // Consumes from the back of s.
    template ReverseReadFromString()
    {
        bool canRead()
        {
            return s.length != 0;
        }

        E peek()
        {
            return s[$ - 1];
        }

        E read()
        {
            E last = s[$ - 1];
            s = s[0 .. $ - 1];
            return last;
        }
    }

    // ---------------------------------------------------------------------
    // Destinations: where write(E) puts each code unit
    // ---------------------------------------------------------------------

    // Appends to a freshly declared array s owned by the mixin.
    template WriteToString()
    {
        E[] s;

        void write(E c)
        {
            s ~= c;
        }
    }

    // Stores into the first slot of the enclosing scope's array, then
    // advances that slice by one element.
    template WriteToArray()
    {
        void write(E c)
        {
            array[0] = c;
            array = array[1 .. $];
        }
    }

    // Appends to the enclosing scope's buffer.
    deprecated template WriteToBuffer()
    {
        void write(E c)
        {
            buffer ~= c;
        }
    }

    // Hands each code unit to the enclosing scope's delegate dg.
    template WriteToDelegate()
    {
        void write(E c)
        {
            dg(c);
        }
    }

    // ---------------------------------------------------------------------
    // Adapters: give each *ViaRead / *ViaWrite primitive its public name
    // ---------------------------------------------------------------------

    template EncodeViaWrite()
    {
        mixin encodeViaWrite;

        void encode(dchar c)
        {
            encodeViaWrite(c);
        }
    }

    template SkipViaRead()
    {
        mixin skipViaRead;

        void skip()
        {
            skipViaRead();
        }
    }

    template DecodeViaRead()
    {
        mixin decodeViaRead;

        dchar decode()
        {
            return decodeViaRead();
        }
    }

    template SafeDecodeViaRead()
    {
        mixin safeDecodeViaRead;

        dchar safeDecode()
        {
            return safeDecodeViaRead();
        }
    }

    template DecodeReverseViaRead()
    {
        mixin decodeReverseViaRead;

        dchar decodeReverse()
        {
            return decodeReverseViaRead();
        }
    }

    // ---------------------------------------------------------------------
    // Encoders: one destination + the encode adapter
    // ---------------------------------------------------------------------

    template EncodeToString()
    {
        mixin WriteToString;
        mixin EncodeViaWrite;
    }

    template EncodeToArray()
    {
        mixin WriteToArray;
        mixin EncodeViaWrite;
    }

    deprecated template EncodeToBuffer()
    {
        mixin WriteToBuffer;
        mixin EncodeViaWrite;
    }

    template EncodeToDelegate()
    {
        mixin WriteToDelegate;
        mixin EncodeViaWrite;
    }

    // ---------------------------------------------------------------------
    // Decoders: one source + the matching adapter
    // ---------------------------------------------------------------------

    template SkipFromString()
    {
        mixin ReadFromString;
        mixin SkipViaRead;
    }

    template DecodeFromString()
    {
        mixin ReadFromString;
        mixin DecodeViaRead;
    }

    template SafeDecodeFromString()
    {
        mixin ReadFromString;
        mixin SafeDecodeViaRead;
    }

    template DecodeReverseFromString()
    {
        mixin ReverseReadFromString;
        mixin DecodeReverseViaRead;
    }

    //=========================================================================
    // Entry points. Each one mixes the appropriate bundle into its own body so
    // the hooks bind to that function's parameters (s, array, dg).
    //=========================================================================

    // Encodes c and returns the code units as a new array.
    E[] encode(dchar c)
    {
        mixin EncodeToString sink;
        sink.encode(c);
        return sink.s;
    }

    // Encodes c into the front of array; array is advanced past what was
    // written.
    void encode(dchar c, ref E[] array)
    {
        mixin EncodeToArray sink;
        sink.encode(c);
    }

    // Encodes c, passing every code unit to dg.
    void encode(dchar c, void delegate(E) dg)
    {
        mixin EncodeToDelegate sink;
        sink.encode(c);
    }

    // Drops one encoded sequence from the front of s.
    void skip(ref E[] s)
    {
        mixin SkipFromString source;
        source.skip();
    }

    // Removes and decodes one sequence from the front of s.
    dchar decode(S)(ref S s)
    {
        mixin DecodeFromString source;
        return source.decode();
    }

    // As decode, but goes through the encoding's safeDecodeViaRead primitive.
    dchar safeDecode(S)(ref S s)
    {
        mixin SafeDecodeFromString source;
        return source.safeDecode();
    }

    // Removes and decodes one sequence from the back of s.
    dchar decodeReverse(ref E[] s)
    {
        mixin DecodeReverseFromString source;
        return source.decodeReverse();
    }
}
230
231	//=========================================================================
232
// Foreach-able view over a validly encoded string that yields decoded code
// points, optionally together with the code unit index at which each begins.
//
// Note: every opApply/opApplyReverse overload decodes directly from the member
// slice s, so iterating shortens s; after a complete loop s is empty.
struct CodePoints(E)
{
    E[] s;

    // Wraps s. The in-contract requires s to be validly encoded.
    static CodePoints opCall(E[] s)
    in
    {
        assert(isValid(s));
    }
    body
    {
        CodePoints view;
        view.s = s;
        return view;
    }

    // foreach (dchar c; ...)
    int opApply(int delegate(ref dchar) dg)
    {
        while (s.length != 0)
        {
            dchar c = decode(s);
            int rc = dg(c);
            if (rc != 0) return rc;
        }
        return 0;
    }

    // foreach (uint i, dchar c; ...) -- i is the offset of c's first code unit
    int opApply(int delegate(ref uint, ref dchar) dg)
    {
        uint pos = 0;
        while (s.length != 0)
        {
            uint before = s.length;
            dchar c = decode(s);
            // Hand out a copy so the delegate cannot corrupt our running offset.
            uint shown = pos;
            int rc = dg(shown, c);
            if (rc != 0) return rc;
            pos += before - s.length;
        }
        return 0;
    }

    // foreach_reverse (dchar c; ...)
    int opApplyReverse(int delegate(ref dchar) dg)
    {
        while (s.length != 0)
        {
            dchar c = decodeReverse(s);
            int rc = dg(c);
            if (rc != 0) return rc;
        }
        return 0;
    }

    // foreach_reverse (uint i, dchar c; ...) -- after the sequence has been
    // removed from the back, the remaining length is where it started.
    int opApplyReverse(int delegate(ref uint, ref dchar) dg)
    {
        while (s.length != 0)
        {
            dchar c = decodeReverse(s);
            uint start = s.length;
            int rc = dg(start, c);
            if (rc != 0) return rc;
        }
        return 0;
    }
}
302
// Foreach-able view over the code units that encode a single code point in
// the encoding whose code unit type is E.
struct CodeUnits(E)
{
    E[] s;

    // Encodes d once, up front. The in-contract requires a valid code point.
    static CodeUnits opCall(dchar d)
    in
    {
        assert(isValidCodePoint(d));
    }
    body
    {
        CodeUnits units;
        units.s = encode!(E)(d);
        return units;
    }

    // foreach (E c; ...) -- the delegate receives a copy of each element.
    int opApply(int delegate(ref E) dg)
    {
        foreach (E unit; s)
        {
            int rc = dg(unit);
            if (rc != 0) return rc;
        }
        return 0;
    }

    // foreach_reverse (E c; ...)
    int opApplyReverse(int delegate(ref E) dg)
    {
        foreach_reverse (E unit; s)
        {
            int rc = dg(unit);
            if (rc != 0) return rc;
        }
        return 0;
    }
}
341
342	//=============================================================================
343
// Catch-all: reached only when no specialisation of EncoderInstance matches E,
// in which case instantiation is rejected at compile time with a message
// naming the offending type.
template EncoderInstance(E)
{
    static assert(false,
        "Cannot instantiate EncoderInstance for type " ~ E.stringof);
}
349
350	//=============================================================================
351	// ASCII
352	//=============================================================================
353
/** Defines an ASCII-encoded character (a distinct type based on $(D ubyte)). */
typedef ubyte AsciiChar;
/** Defines an ASCII-encoded string (an array of $(D AsciiChar)). */
alias AsciiChar[] AsciiString;

// ASCII codec: one code unit per code point, code points 0x00..0x7F only.
template EncoderInstance(CharType : AsciiChar)
{
    alias AsciiChar E;
    alias AsciiString EString;

    string encodingName()
    {
        return "ASCII";
    }

    // Only the 7-bit range is representable.
    bool canEncode(dchar c)
    {
        return c <= 0x7F;
    }

    bool isValidCodeUnit(AsciiChar c)
    {
        return c <= 0x7F;
    }

    // Always a single code unit.
    uint encodedLength(dchar c)
    in
    {
        assert(canEncode(c));
    }
    body
    {
        return 1;
    }

    // Writes c to r, substituting '?' when c is not representable.
    void encodeX(Range)(dchar c, Range r)
    {
        AsciiChar unit = canEncode(c) ? cast(AsciiChar) c
                                      : cast(AsciiChar) '?';
        r.write(unit);
    }

    // Writes c through the write hook, substituting '?' when c is not
    // representable.
    void encodeViaWrite()(dchar c)
    {
        AsciiChar unit = canEncode(c) ? cast(AsciiChar) c
                                      : cast(AsciiChar) '?';
        write(unit);
    }

    void skipViaRead()()
    {
        read();
    }

    dchar decodeViaRead()()
    {
        return read();
    }

    // Consumes one code unit; anything outside the 7-bit range is reported as
    // INVALID_SEQUENCE.
    dchar safeDecodeViaRead()()
    {
        dchar c = read();
        if (canEncode(c))
            return c;
        return INVALID_SEQUENCE;
    }

    dchar decodeReverseViaRead()()
    {
        return read();
    }

    EString replacementSequence()
    {
        return cast(EString)("?");
    }

    mixin EncoderFunctions;
}
429
430	//=============================================================================
431	// ISO-8859-1
432	//=============================================================================
433
/** Defines a Latin1-encoded character (a distinct type based on $(D ubyte)). */
typedef ubyte Latin1Char;
/** Defines a Latin1-encoded string (an array of $(D Latin1Char)). */
alias Latin1Char[] Latin1String;

// ISO-8859-1 codec: one code unit per code point, code points 0x00..0xFF.
template EncoderInstance(CharType : Latin1Char)
{
    alias Latin1Char E;
    alias Latin1String EString;

    string encodingName()
    {
        return "ISO-8859-1";
    }

    // Exactly the code points that fit in one byte.
    bool canEncode(dchar c)
    {
        return c <= 0xFF;
    }

    // Every byte value is a legal code unit.
    bool isValidCodeUnit(Latin1Char c)
    {
        return true;
    }

    // Always a single code unit.
    uint encodedLength(dchar c)
    in
    {
        assert(canEncode(c));
    }
    body
    {
        return 1;
    }

    // Writes c through the write hook, substituting '?' when c is not
    // representable.
    void encodeViaWrite()(dchar c)
    {
        Latin1Char unit = canEncode(c) ? cast(Latin1Char) c
                                       : cast(Latin1Char) '?';
        write(unit);
    }

    void skipViaRead()()
    {
        read();
    }

    // The code unit value is the code point.
    dchar decodeViaRead()()
    {
        return read();
    }

    // Nothing to validate: every code unit is legal.
    dchar safeDecodeViaRead()()
    {
        return read();
    }

    dchar decodeReverseViaRead()()
    {
        return read();
    }

    EString replacementSequence()
    {
        return cast(EString)("?");
    }

    mixin EncoderFunctions;
}
505
506	//=============================================================================
507	// WINDOWS-1252
508	//=============================================================================
509
/** Defines a Windows1252-encoded character (a distinct type based on $(D ubyte)). */
typedef ubyte Windows1252Char;
/** Defines a Windows1252-encoded string (an array of $(D Windows1252Char)). */
alias Windows1252Char[] Windows1252String;

// Windows-1252 codec: one code unit per code point. Bytes 0x00..0x7F and
// 0xA0..0xFF map to the code point of the same value; bytes 0x80..0x9F are
// translated through charMap.
template EncoderInstance(CharType : Windows1252Char)
{
    alias Windows1252Char E;
    alias Windows1252String EString;

    string encodingName()
    {
        return "windows-1252";
    }

    // Code points for bytes 0x80..0x9F, indexed by (byte - 0x80).
    // U+FFFD marks the bytes that have no assignment (0x81, 0x8D, 0x8F, 0x90,
    // 0x9D). Byte 0x96 is U+2013 EN DASH; it was previously mistyped as U+2103.
    static const wstring charMap =
        "\u20AC\uFFFD\u201A\u0192\u201E\u2026\u2020\u2021"
        "\u02C6\u2030\u0160\u2039\u0152\uFFFD\u017D\uFFFD"
        "\uFFFD\u2018\u2019\u201C\u201D\u2022\u2013\u2014"
        "\u02DC\u2122\u0161\u203A\u0153\uFFFD\u017E\u0178";

    // True for the identity-mapped ranges and for every assigned charMap entry.
    bool canEncode(dchar c)
    {
        if (c < 0x80 || (c >= 0xA0 && c < 0x100)) return true;
        // U+FFFD is the "unassigned" marker inside charMap, so it (and anything
        // above it) must never be reported as encodable.
        if (c >= 0xFFFD) return false;
        foreach (wchar d; charMap) { if (c == d) return true; }
        return false;
    }

    // A byte in 0x80..0x9F is legal only if charMap assigns it a code point.
    bool isValidCodeUnit(Windows1252Char c)
    {
        if (c < 0x80 || c >= 0xA0) return true;
        return (charMap[c-0x80] != 0xFFFD);
    }

    // Always a single code unit.
    uint encodedLength(dchar c)
    in
    {
        assert(canEncode(c));
    }
    body
    {
        return 1;
    }

    // Writes c through the write hook; code points that cannot be represented
    // are written as '?'.
    void encodeViaWrite()(dchar c)
    {
        if (c < 0x80 || (c >= 0xA0 && c < 0x100)) {}
        else if (c >= 0xFFFD) { c = '?'; }
        else
        {
            // Reverse lookup: find the byte in 0x80..0x9F that maps to c.
            int n = -1;
            foreach (i, wchar d; charMap)
            {
                if (c == d)
                {
                    n = i;
                    break;
                }
            }
            c = n == -1 ? '?' : 0x80 + n;
        }
        write(cast(Windows1252Char)c);
    }

    void skipViaRead()()
    {
        read();
    }

    // Unassigned bytes decode to U+FFFD here; use safeDecodeViaRead to have
    // them reported as INVALID_SEQUENCE instead.
    dchar decodeViaRead()()
    {
        Windows1252Char c = read;
        return (c >= 0x80 && c < 0xA0) ? charMap[c-0x80] : c;
    }

    dchar safeDecodeViaRead()()
    {
        Windows1252Char c = read;
        dchar d = (c >= 0x80 && c < 0xA0) ? charMap[c-0x80] : c;
        return d == 0xFFFD ? INVALID_SEQUENCE : d;
    }

    dchar decodeReverseViaRead()()
    {
        Windows1252Char c = read;
        return (c >= 0x80 && c < 0xA0) ? charMap[c-0x80] : c;
    }

    EString replacementSequence()
    {
        return cast(EString)("?");
    }

    mixin EncoderFunctions;
}
609
610	//=============================================================================
611	// UTF-8
612	//=============================================================================
613
// UTF-8 codec: one to four code units per code point.
template EncoderInstance(CharType : char)
{
    alias char E;
    alias char[] EString;

    string encodingName()
    {
        return "UTF-8";
    }

    bool canEncode(dchar c)
    {
        return isValidCodePoint(c);
    }

    // Rejects 0xC0 and 0xC1 (which could only start overlong 2-byte forms) and
    // everything from 0xF5 upwards.
    bool isValidCodeUnit(char c)
    {
        return (c < 0xC0 || (c >= 0xC2 && c < 0xF5));
    }

    // Number of trailing bytes announced by a byte, indexed by (byte - 0x80).
    // Continuation bytes 0x80..0xBF and 0xFF yield 0. The entries for
    // 0xF8..0xFE (4, 5, 6) describe forms that safeDecodeViaRead rejects.
    static const ubyte[128] tailTable =
    [
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
        2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
        3,3,3,3,3,3,3,3,4,4,4,4,5,5,6,0,
    ];

    // tailTable lookup; only defined for non-ASCII bytes.
    private int tails(char c)
    in
    {
        assert(c >= 0x80);
    }
    body
    {
        return tailTable[c-0x80];
    }

    uint encodedLength(dchar c)
    in
    {
        assert(canEncode(c));
    }
    body
    {
        if (c < 0x80) return 1;
        if (c < 0x800) return 2;
        if (c < 0x10000) return 3;
        return 4;
    }

    // Emits the 1-, 2-, 3- or 4-byte form of c through the write hook.
    // No validation is done here; callers guard with their in-contracts.
    void encodeViaWrite()(dchar c)
    {
        if (c < 0x80)
        {
            write(cast(char)c);
        }
        else if (c < 0x800)
        {
            write(cast(char)((c >> 6) + 0xC0));
            write(cast(char)((c & 0x3F) + 0x80));
        }
        else if (c < 0x10000)
        {
            write(cast(char)((c >> 12) + 0xE0));
            write(cast(char)(((c >> 6) & 0x3F) + 0x80));
            write(cast(char)((c & 0x3F) + 0x80));
        }
        else
        {
            write(cast(char)((c >> 18) + 0xF0));
            write(cast(char)(((c >> 12) & 0x3F) + 0x80));
            write(cast(char)(((c >> 6) & 0x3F) + 0x80));
            write(cast(char)((c & 0x3F) + 0x80));
        }
    }

    // Consumes one sequence without decoding it. Input must be valid.
    void skipViaRead()()
    {
        auto c = read;
        if (c < 0xC0) return;
        int n = tails(cast(char) c);
        for (uint i=0; i<n; ++i)
        {
            read();
        }
    }

    // Decodes one sequence. Input must be valid: no range or continuation
    // checks are performed.
    dchar decodeViaRead()()
    {
        dchar c = read;
        if (c < 0xC0) return c;
        int n = tails(cast(char) c);
        // Keep only the payload bits of the lead byte (5, 4 or 3 of them).
        c &= (1 << (6 - n)) - 1;
        for (uint i=0; i<n; ++i)
        {
            c = (c << 6) + (read & 0x3F);
        }
        return c;
    }

    // Decodes one sequence from possibly malformed input. Returns
    // INVALID_SEQUENCE for a stray continuation byte, a truncated sequence, a
    // non-continuation byte where one is required, an overlong form, an
    // encoded surrogate, or a value above U+10FFFF.
    dchar safeDecodeViaRead()()
    {
        dchar c = read;
        if (c < 0x80) return c;
        int n = tails(cast(char) c);
        if (n == 0) return INVALID_SEQUENCE;

        if (!canRead) return INVALID_SEQUENCE;
        uint d = peek;
        // These conditions depend only on the lead byte and the first trailing
        // byte. The verdict is remembered rather than returned immediately so
        // that the trailing bytes are still consumed by the loop below.
        bool err =
        (
            (c < 0xC2)                              // fail overlong 2-byte sequences
         || (c > 0xF4)                              // fail overlong 4-6-byte sequences
         || (c == 0xE0 && ((d & 0xE0) == 0x80))     // fail overlong 3-byte sequences
         || (c == 0xED && ((d & 0xE0) == 0xA0))     // fail surrogates
         || (c == 0xF0 && ((d & 0xF0) == 0x80))     // fail overlong 4-byte sequences
         || (c == 0xF4 && ((d & 0xF0) >= 0x90))     // fail code points > 0x10FFFF
        );

        c &= (1 << (6 - n)) - 1;
        for (uint i=0; i<n; ++i)
        {
            if (!canRead) return INVALID_SEQUENCE;
            d = peek;
            // Peek first so a byte that is not a continuation is left in the
            // input for the next call.
            if ((d & 0xC0) != 0x80) return INVALID_SEQUENCE;
            c = (c << 6) + (read & 0x3F);
        }

        return err ? INVALID_SEQUENCE : c;
    }

    // Decodes the last sequence of valid input; here read takes code units
    // from the back, so the bytes arrive least significant first.
    dchar decodeReverseViaRead()()
    {
        dchar c = read;
        if (c < 0x80) return c;
        uint shift = 0;
        c &= 0x3F;
        for (uint i=0; i<4; ++i)
        {
            shift += 6;
            auto d = read;
            uint n = tails(cast(char) d);
            // n == 0: another continuation byte (6 payload bits).
            // n != 0: the lead byte, whose payload width depends on n.
            uint mask = n == 0 ? 0x3F : (1 << (6 - n)) - 1;
            c += ((d & mask) << shift);
            if (n != 0) break;
        }
        return c;
    }

    EString replacementSequence()
    {
        return "\uFFFD";
    }

    mixin EncoderFunctions;
}
777
778	//=============================================================================
779	// UTF-16
780	//=============================================================================
781
// UTF-16 codec: one code unit for code points below 0x10000, otherwise a
// surrogate pair.
template EncoderInstance(CharType : wchar)
{
    alias wchar E;
    alias wchar[] EString;

    string encodingName()
    {
        return "UTF-16";
    }

    bool canEncode(dchar c)
    {
        return isValidCodePoint(c);
    }

    // Every 16-bit value is accepted as a code unit (surrogates included).
    bool isValidCodeUnit(wchar c)
    {
        return true;
    }

    uint encodedLength(dchar c)
    in
    {
        assert(canEncode(c));
    }
    body
    {
        return (c < 0x10000) ? 1 : 2;
    }

    // Writes c as one code unit, or as a high/low surrogate pair.
    void encodeViaWrite()(dchar c)
    {
        if (c < 0x10000)
        {
            write(cast(wchar)c);
        }
        else
        {
            uint n = c - 0x10000;
            write(cast(wchar)(0xD800 + (n >> 10)));
            write(cast(wchar)(0xDC00 + (n & 0x3FF)));
        }
    }

    // Consumes one code unit, plus a second one if the first is a surrogate.
    // Input must be valid.
    void skipViaRead()()
    {
        wchar c = read;
        if (c < 0xD800 || c >= 0xE000) return;
        read();
    }

    // Decodes one sequence. Input must be valid: a surrogate is assumed to be
    // the high half of a well-formed pair.
    dchar decodeViaRead()()
    {
        wchar c = read;
        if (c < 0xD800 || c >= 0xE000) return cast(dchar)c;
        wchar d = read;
        c &= 0x3FF;
        d &= 0x3FF;
        return 0x10000 + (c << 10) + d;
    }

    // Decodes one sequence from possibly malformed input. Returns
    // INVALID_SEQUENCE for a leading low surrogate, or for a high surrogate
    // that is not followed by a low surrogate (which is then left unread).
    dchar safeDecodeViaRead()()
    {
        wchar c = read;
        if (c < 0xD800 || c >= 0xE000) return cast(dchar)c;
        if (c >= 0xDC00) return INVALID_SEQUENCE;
        if (!canRead) return INVALID_SEQUENCE;
        wchar d = peek;
        if (d < 0xDC00 || d >= 0xE000) return INVALID_SEQUENCE;
        d = read;
        c &= 0x3FF;
        d &= 0x3FF;
        return 0x10000 + (c << 10) + d;
    }

    // Decodes the last sequence of valid input; read takes code units from the
    // back, so for a pair c is the low surrogate and d the high one.
    dchar decodeReverseViaRead()()
    {
        wchar c = read;
        if (c < 0xD800 || c >= 0xE000) return cast(dchar)c;
        wchar d = read;
        c &= 0x3FF;
        d &= 0x3FF;
        return 0x10000 + (d << 10) + c;
    }

    EString replacementSequence()
    {
        return "\uFFFD"w;
    }

    mixin EncoderFunctions;
}
874
875	//=============================================================================
876	// UTF-32
877	//=============================================================================
878
// UTF-32 codec: the code unit is the code point.
template EncoderInstance(CharType : dchar)
{
    alias dchar E;
    alias dchar[] EString;

    string encodingName()
    {
        return "UTF-32";
    }

    bool canEncode(dchar c)
    {
        return isValidCodePoint(c);
    }

    // A code unit is legal exactly when it is a valid code point.
    bool isValidCodeUnit(dchar c)
    {
        return isValidCodePoint(c);
    }

    // Always a single code unit.
    uint encodedLength(dchar c)
    in
    {
        assert(canEncode(c));
    }
    body
    {
        return 1;
    }

    // Passes c straight to the write hook, unchecked.
    void encodeViaWrite()(dchar c)
    {
        write(c);
    }

    void skipViaRead()()
    {
        read();
    }

    dchar decodeViaRead()()
    {
        return cast(dchar) read();
    }

    // Consumes one code unit; values that are not valid code points are
    // reported as INVALID_SEQUENCE.
    dchar safeDecodeViaRead()()
    {
        dchar unit = read();
        if (isValidCodePoint(unit))
            return unit;
        return INVALID_SEQUENCE;
    }

    dchar decodeReverseViaRead()()
    {
        return cast(dchar) read();
    }

    EString replacementSequence()
    {
        return "\uFFFD"d;
    }

    mixin EncoderFunctions;
}
942
943	//=============================================================================
944	// Below are forwarding functions which expose the function to the user
945
/**
Returns true if c is a valid code point, i.e. it lies in the range
0 .. 0x10FFFF and is not a surrogate (0xD800 .. 0xDFFF).

Note that this includes the non-character code points U+FFFE and U+FFFF,
since these are valid code points (even though they are not valid
characters).

Supersedes:
This function supersedes $(D std.utf.startsValidDchar()).

Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

Params:
c = the code point to be tested
*/
bool isValidCodePoint(dchar c)
{
    return c < 0xD800 || (c >= 0xE000 && c < 0x110000);
}
965
/**
Returns the name of the encoding whose code unit type is T.

There is no function argument from which T could be deduced, so it must be
given explicitly.

Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

Examples:
-----------------------------------
assert(encodingName!(Latin1Char) == "ISO-8859-1");
-----------------------------------
*/
string encodingName(T)()
{
    alias EncoderInstance!(T) Codec;
    return Codec.encodingName();
}

unittest
{
    // Single-byte encodings
    assert(encodingName!(AsciiChar) == "ASCII");
    assert(encodingName!(Latin1Char) == "ISO-8859-1");
    assert(encodingName!(Windows1252Char) == "windows-1252");
    // Unicode transformation formats
    assert(encodingName!(char) == "UTF-8");
    assert(encodingName!(wchar) == "UTF-16");
    assert(encodingName!(dchar) == "UTF-32");
}
993
/**
Returns true iff the specified code point can be represented in the
encoding whose code unit type is E.

There is no function argument from which E could be deduced, so it must be
given explicitly.

Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

Params:
c = the code point to be tested

Examples:
-----------------------------------
assert(canEncode!(Latin1Char)('A'));
-----------------------------------
*/
bool canEncode(E)(dchar c)
{
    alias EncoderInstance!(E) Codec;
    return Codec.canEncode(c);
}

unittest
{
    // U+00A0 is outside ASCII but inside Latin-1
    assert(!canEncode!(AsciiChar)('\u00A0'));
    assert(canEncode!(Latin1Char)('\u00A0'));
    // Windows-1252: mapped, unmapped, and the internal "unassigned" marker
    assert(canEncode!(Windows1252Char)('\u20AC'));
    assert(!canEncode!(Windows1252Char)('\u20AD'));
    assert(!canEncode!(Windows1252Char)('\uFFFD'));
    // One past the last code point
    assert(!canEncode!(char)(cast(dchar)0x110000));
}
1022
/**
Returns true if the code unit is legal in its encoding. For example, the
byte 0x80 is not legal in ASCII, because ASCII code units always lie in the
range 0x00 to 0x7F.

The encoding is selected by the type of the argument.

Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

Params:
c = the code unit to be tested
*/
bool isValidCodeUnit(E)(E c)
{
    alias EncoderInstance!(E) Codec;
    return Codec.isValidCodeUnit(c);
}

unittest
{
    assert(!isValidCodeUnit(cast(AsciiChar)0xA0));

    assert( isValidCodeUnit(cast(Windows1252Char)0x80));
    assert(!isValidCodeUnit(cast(Windows1252Char)0x81));

    assert(!isValidCodeUnit(cast(char)0xC0));
    assert(!isValidCodeUnit(cast(char)0xFF));

    // A surrogate is a legal UTF-16 code unit but not a legal UTF-32 one
    assert( isValidCodeUnit(cast(wchar)0xD800));
    assert(!isValidCodeUnit(cast(dchar)0xD800));
}
1048
/**
Returns true if the whole string is encoded correctly, that is, if its
longest valid prefix (see $(D validLength)) is the string itself.

Supersedes:
This function supersedes std.utf.validate(), however note that this
function returns a bool indicating whether the input was valid or not,
whereas the older function would throw an exception.

Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

Params:
s = the string to be tested
*/
bool isValid(E)(E[] s)
{
    auto validPrefix = validLength(s);
    return validPrefix == s.length;
}

unittest
{
    assert(isValid("\u20AC100"));
}
1071
/**
Returns the length, in code units, of the longest prefix of s that is
validly encoded. Decoding stops at the first sequence for which
$(D safeDecode) reports $(D INVALID_SEQUENCE).

Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

Params:
s = the string to be tested
*/
uint validLength(E)(E[] s)
{
    uint total = 0;
    while (s.length != 0)
    {
        uint remaining = s.length;
        if (EncoderInstance!(E).safeDecode(s) == INVALID_SEQUENCE)
            return total;
        // safeDecode advanced s; credit the code units it consumed.
        total += remaining - s.length;
    }
    return total;
}
1092
/**
Sanitizes a string by replacing malformed code unit sequences with valid
code unit sequences. The result is guaranteed to be valid for this encoding.

If the input string is already valid, this function returns the original.
Otherwise it constructs a new string in which every illegal code unit
sequence has been replaced by the encoding's replacement sequence: the
Unicode replacement character (U+FFFD) if the character repertoire contains
it, otherwise '?'.

Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

Params:
s = the string to be sanitized
*/
E[] sanitize(E)(E[] s)
{
    alias EncoderInstance!(E) Codec;

    uint good = validLength(s);
    if (good == s.length) return s;

    auto repSeq = Codec.replacementSequence();

    // Pass 1: work out an upper bound for the output length. Each invalid
    // sequence contributes one replacement sequence; the code units it
    // occupied are not subtracted, so this can only overestimate.
    uint capacity = s.length;
    for (E[] tail = s[good .. $]; tail.length != 0;
         tail = tail[validLength(tail) .. $])
    {
        dchar c = Codec.safeDecode(tail);
        assert(c == INVALID_SEQUENCE);
        capacity += repSeq.length;
    }

    // Pass 2: copy the valid prefix, then alternate between emitting a
    // replacement for one invalid sequence and copying the valid run after it.
    E[] result = new E[capacity];
    result[0 .. good] = s[0 .. good];
    uint offset = good;

    E[] rest = s[good .. $];
    while (rest.length != 0)
    {
        dchar c = Codec.safeDecode(rest);
        assert(c == INVALID_SEQUENCE);

        uint afterRep = offset + repSeq.length;
        result[offset .. afterRep] = repSeq[];
        offset = afterRep;

        uint run = validLength(rest);
        result[offset .. offset + run] = rest[0 .. run];
        offset += run;
        rest = rest[run .. $];
    }
    return cast(E[]) result[0 .. offset];
}

unittest
{
    assert(sanitize("hello \xF0\x80world") == "hello \xEF\xBF\xBDworld");
}
1153
/**
Returns the length, in code units, of the first encoded sequence.

The string must be non-empty and its first sequence MUST be validly encoded.
This is enforced by the function's in-contract.

Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

Params:
s = the string to be sliced
*/
uint firstSequence(E)(E[] s)
in
{
    assert(s.length != 0);
    E[] u = s;
    assert(safeDecode(u) != INVALID_SEQUENCE);
}
body
{
    E[] rest = s;
    EncoderInstance!(E).skip(rest);
    return s.length - rest.length;
}

unittest
{
    assert(firstSequence("\u20AC1000") == "\u20AC".length);
}
1183
/**
Returns the length, in code units, of the last encoded sequence.

The input to this function MUST be non-empty and validly encoded.
This is enforced by the function's in-contract.

Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

Params:
s = the string to be sliced
*/
uint lastSequence(E)(E[] s)
in
{
    assert(s.length != 0);
    assert(isValid(s));
}
body
{
    auto total = s.length;
    // decodeReverse shortens s from the back; the decoded value is not needed.
    EncoderInstance!(E).decodeReverse(s);
    return total - s.length;
}

unittest
{
    assert(lastSequence("1000\u20AC") == "\u20AC".length);
}
1212
/**
Returns the total number of code points encoded in a string.

The input to this function MUST be validly encoded. This is enforced
by the function's in-contract.

Supersedes: This function supersedes $(D std.utf.toUCSindex()).

Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

Params:
s = the string to be counted
*/
uint codepoints_count(E)(E[] s)
in
{
    assert(isValid(s));
}
body
{
    uint n = 0;
    // Each skip removes exactly one encoded sequence from the front of s.
    for (; s.length != 0; ++n)
    {
        EncoderInstance!(E).skip(s);
    }
    return n;
}

unittest
{
    assert(codepoints_count("\u20AC100") == 4);
}
1246
/**
Returns the array index at which the (n+1)th code point begins.

The input to this function MUST be validly encoded and n must not be
negative. This is enforced by the function's in-contract. The contract does
not check that s actually contains n code points.

Supersedes:
This function supersedes std.utf.toUTFindex().

Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

Params:
s = the string to be indexed
n = the number of code points to step over
*/
int index(E)(E[] s,int n)
in
{
    assert(isValid(s));
    assert(n >= 0);
}
body
{
    auto total = s.length;
    uint skipped = 0;
    while (skipped < n)
    {
        EncoderInstance!(E).skip(s);
        ++skipped;
    }
    // Whatever was removed from the front is the offset we are looking for.
    return total - s.length;
}

unittest
{
    assert(index("\u20AC100",1) == 3);
}
1278
/**
Decodes a single code point.

This function removes one or more code units from the start of a string,
and returns the decoded code point which those code units represent.

The string must be non-empty and its first sequence MUST be validly encoded.
This is enforced by the function's in-contract.

Supersedes:
This function supersedes std.utf.decode(), however, note that the
function codePoints() supersedes it more conveniently.

Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

Params:
s = the string whose first code point is to be decoded
*/
dchar decode(S)(ref S s)
in
{
    assert(s.length != 0);
    auto u = s;
    assert(safeDecode(u) != INVALID_SEQUENCE);
}
body
{
    // The encoding is selected by the element type of s.
    alias EncoderInstance!(typeof(s[0])) Codec;
    return Codec.decode(s);
}
1308
/**
Decodes a single code point from the end of a string.

This function removes one or more code units from the end of a string,
and returns the decoded code point which those code units represent.

The input to this function MUST be non-empty and validly encoded.
This is enforced by the function's in-contract.

Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

Params:
s = the string whose last code point is to be decoded
*/
dchar decodeReverse(E)(ref E[] s)
in
{
    assert(s.length != 0);
    assert(isValid(s));
}
body
{
    alias EncoderInstance!(E) Codec;
    return Codec.decodeReverse(s);
}
1333
/**
Decodes a single code point. The input does not have to be valid.

This function removes one or more code units from the start of a string,
and returns the decoded code point which those code units represent.

This function will accept an invalidly encoded string as input.
If an invalid sequence is found at the start of the string, this
function will remove it, and return the value INVALID_SEQUENCE.

The string must be non-empty. This is enforced by the function's
in-contract.

Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

Params:
s = the string whose first code point is to be decoded
*/
dchar safeDecode(S)(ref S s)
in
{
    assert(s.length != 0);
}
body
{
    // The encoding is selected by the element type of s.
    alias EncoderInstance!(typeof(s[0])) Codec;
    return Codec.safeDecode(s);
}
1358
/**
Returns the number of code units required to encode a single code point.

The input to this function MUST be a valid code point.
This is enforced by the function's in-contract.

The type of the output cannot be deduced. Therefore, it is necessary to
explicitly specify the encoding as a template parameter.

Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

Params:
c = the code point to be encoded
*/
uint encodedLength(E)(dchar c)
in
{
    assert(isValidCodePoint(c));
}
body
{
    alias EncoderInstance!(E) Codec;
    return Codec.encodedLength(c);
}
1382
/**
Encodes a single code point.

This function encodes a single code point into one or more code units.
It returns a string containing those code units.

The input to this function MUST be a valid code point.
This is enforced by the function's in-contract.

The type of the output cannot be deduced. Therefore, it is necessary to
explicitly specify the encoding as a template parameter.

Supersedes:
This function supersedes std.utf.encode(), however, note that the
function codeUnits() supersedes it more conveniently.

Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

Params:
c = the code point to be encoded

Returns:
an array holding the code units that encode c
*/
E[] encode(E)(dchar c)
in
{
    assert(isValidCodePoint(c));
}
body
{
    alias EncoderInstance!(E) Codec;
    return Codec.encode(c);
}
1413
1414	/**
1415	Encodes a single code point into an array.
1416
1417	This function encodes a single code point into one or more code units
1418	The code units are stored in a user-supplied fixed-size array,
1419	which must be passed by reference.
1420
1421	The input to this function MUST be a valid code point.
1422	This is enforced by the function's in-contract.
1423
1424	The type of the output cannot be deduced. Therefore, it is necessary to
1425	explicitly specify the encoding as a template parameter.
1426
1427	Supercedes:
1428	This function supercedes std.utf.encode(), however, note that the
1429	function codeUnits() supercedes it more conveniently.
1430
1431	Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
1432
1433	Params:
1434	c = the code point to be encoded
1435
1436	Returns:
1437	the number of code units written to the array
1438	*/
uint encode(E)(dchar c, E[] array)
in
{
    // Only valid code points may be encoded.
    assert(isValidCodePoint(c));
}
body
{
    // Hand the encoder a second view of the caller's array. The number of
    // units written is reported as the amount by which that view shrank.
    E[] remaining = array;
    EncoderInstance!(E).encode(c, remaining);
    return array.length - remaining.length;
}
1450
1451	// /**
1452	// * Encodes a single code point into a Buffer.
1453	// *
1454	// * This function encodes a single code point into one or more code units
1455	// * The code units are stored in a growable buffer.
1456	// *
1457	// * The input to this function MUST be a valid code point.
1458	// * This is enforced by the function's in-contract.
1459	// *
1460	// * The type of the output cannot be deduced. Therefore, it is necessary to
1461	// * explicitly specify the encoding as a template parameter.
1462	// *
1463	// * Supercedes:
1464	// * This function supercedes std.utf.encode(), however, note that the
1465	// * function codeUnits() supercedes it more conveniently.
1466	// *
1467	// * Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
1468	// *
1469	// * Params:
1470	// * c = the code point to be encoded
1471	// */
1472	// deprecated void encode(E)(dchar c, ref Buffer!(E) buffer)
1473	// in
1474	// {
1475	// assert(isValidCodePoint(c));
1476	// }
1477	// body
1478	// {
1479	// EncoderInstance!(E).encode(c,buffer);
1480	// }
1481
/// Sink that receives one UTF-8 code unit per call.
alias void delegate(char c) encode_putchar;
/// Sink that receives one UTF-16 code unit per call.
alias void delegate(wchar c) encode_putwchar;

/**
Encodes $(D c) as UTF-8 and passes the resulting code units, in order, to
$(D putc).

No check for the surrogate range is made here; only values above U+10FFFF
are rejected (by assertion).

Params:
c = the code point to be encoded
putc = delegate called once for every code unit produced

Returns:
the number of code units passed to $(D putc) (1 to 4)
*/
size_t encode_char(dchar c, encode_putchar putc)
{
    if (c <= 0x7F)
    {
        // Single byte: 0xxxxxxx
        putc(cast(char) c);
        return 1;
    }
    if (c <= 0x7FF)
    {
        // Two bytes: 110xxxxx 10xxxxxx
        putc(cast(char)(0xC0 | (c >> 6)));
        putc(cast(char)(0x80 | (c & 0x3F)));
        return 2;
    }
    if (c <= 0xFFFF)
    {
        // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
        putc(cast(char)(0xE0 | (c >> 12)));
        putc(cast(char)(0x80 | ((c >> 6) & 0x3F)));
        putc(cast(char)(0x80 | (c & 0x3F)));
        return 3;
    }
    if (c <= 0x10FFFF)
    {
        // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        putc(cast(char)(0xF0 | (c >> 18)));
        putc(cast(char)(0x80 | ((c >> 12) & 0x3F)));
        putc(cast(char)(0x80 | ((c >> 6) & 0x3F)));
        putc(cast(char)(0x80 | (c & 0x3F)));
        return 4;
    }
    else
    {
        // Values above U+10FFFF are not code points.
        assert(0);
    }
}

/**
Encodes $(D c) as UTF-16 and passes the resulting code units, in order, to
$(D putw).

The template parameters $(D E) and $(D R) are not used by the body. They
are kept so that existing instantiations keep compiling, and because they
cannot be deduced they must be supplied explicitly.

Params:
c = the code point to be encoded
putw = delegate called once for every code unit produced

Returns:
the number of code units passed to $(D putw) (1 or 2)
*/
size_t encode_wchar(E, R)(dchar c, encode_putwchar putw)
{
    if (c <= 0xFFFF)
    {
        // Basic Multilingual Plane: one unit, value unchanged.
        putw(cast(wchar) c);
        return 1;
    }
    // Supplementary plane: split the 20-bit offset from U+10000 into a
    // high surrogate (top 10 bits) and a low surrogate (bottom 10 bits).
    uint offset = c - 0x10000;
    putw(cast(wchar) (((offset >> 10) & 0x3FF) + 0xD800));
    putw(cast(wchar) ((offset & 0x3FF) + 0xDC00));
    return 2;
}
1534
1535	/**
1536	Encodes a single code point to a delegate.
1537
1538	This function encodes a single code point into one or more code units.
1539	The code units are passed one at a time to the supplied delegate.
1540
1541	The input to this function MUST be a valid code point.
1542	This is enforced by the function's in-contract.
1543
1544	The type of the output cannot be deduced. Therefore, it is necessary to
1545	explicitly specify the encoding as a template parameter.
1546
1547	Supercedes:
1548	This function supercedes std.utf.encode(), however, note that the
1549	function codeUnits() supercedes it more conveniently.
1550
1551	Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
1552
1553	Params:
1554	c = the code point to be encoded
1555	*/
void encode(E)(dchar c, void delegate(E) dg)
in
{
    // Only valid code points may be encoded.
    assert(isValidCodePoint(c));
}
body
{
    // The matching EncoderInstance feeds the code units to dg.
    alias EncoderInstance!(E) Encoder;
    Encoder.encode(c, dg);
}
1565
1566	/**
1567	Returns a foreachable struct which can bidirectionally iterate over all
1568	code points in a string.
1569
1570	The input to this function MUST be validly encoded.
1571	This is enforced by the function's in-contract.
1572
1573	You can foreach either
1574	with or without an index. If an index is specified, it will be initialized
1575	at each iteration with the offset into the string at which the code point
1576	begins.
1577
1578	Supercedes:
1579	This function supercedes std.utf.decode().
1580
1581	Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
1582
1583	Params:
1584	s = the string to be decoded
1585
1586	Examples:
1587	--------------------------------------------------------
1588	string s = "hello world";
1589	foreach(c;codePoints(s))
1590	{
1591	// do something with c (which will always be a dchar)
1592	}
1593	--------------------------------------------------------
1594
1595	Note that, currently, foreach(c;codePoints(s)) is superior to foreach(c;s)
1596	in that the latter will fall over on encountering U+FFFF.
1597	*/
CodePoints!(E) codePoints(E)(E[] s)
in
{
    // The string must be validly encoded.
    assert(isValid(s));
}
body
{
    // Wrap the string in the foreach-able struct and hand it back.
    CodePoints!(E) iterable = CodePoints!(E)(s);
    return iterable;
}
1607
unittest
{
    // Walking the code points of an ASCII string and narrowing each one
    // back to char must reproduce the original string.
    string original = "hello";
    string rebuilt;
    foreach (d; codePoints(original))
    {
        rebuilt ~= cast(char) d;
    }
    assert(original == rebuilt);
}
1618
1619	/**
1620	Returns a foreachable struct which can bidirectionally iterate over all
1621	code units in a code point.
1622
1623	The input to this function MUST be a valid code point.
1624	This is enforced by the function's in-contract.
1625
1626	The type of the output cannot be deduced. Therefore, it is necessary to
1627	explicitly specify the encoding type in the template parameter.
1628
1629	Supercedes:
1630	This function supercedes std.utf.encode().
1631
1632	Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
1633
1634	Params:
1635	d = the code point to be encoded
1636
1637	Examples:
1638	--------------------------------------------------------
1639	dchar d = '\u20AC';
1640	foreach(c;codeUnits!(char)(d))
1641	{
1642	writefln("%X",c);
1643	}
1644	// will print
1645	// E2
1646	// 82
1647	// AC
1648	--------------------------------------------------------
1649	*/
CodeUnits!(E) codeUnits(E)(dchar c)
in
{
    // Only valid code points may be encoded.
    assert(isValidCodePoint(c));
}
body
{
    // Wrap the code point in the foreach-able struct for encoding E.
    CodeUnits!(E) iterable = CodeUnits!(E)(c);
    return iterable;
}
1659
unittest
{
    // U+20AC (euro sign) is the three bytes E2 82 AC in UTF-8.
    char[] units;
    foreach (u; codeUnits!(char)(cast(dchar)'\u20AC'))
    {
        units ~= u;
    }
    assert(units.length == 3);
    assert(units[0] == 0xE2);
    assert(units[1] == 0x82);
    assert(units[2] == 0xAC);
}
1672
/**
Encodes every code point of the string $(D s) in units of type $(D Tgt)
and writes the result to $(D range).

The source is decoded one code point at a time with decode(), so it is
expected to be validly encoded.

Params:
s = the source string
range = the destination; when it is a $(D Tgt[]) slice it must be large
enough to hold the whole result

Returns:
the total number of $(D Tgt) units written
*/
uint encode(Tgt, Src, R)(in Src[] s, R range)
{
    uint result;
    // decode() takes its argument by reference and shortens it, so walk a
    // local slice over the same data (the same pattern transcode() uses).
    Src[] rest = cast(Src[]) s;
    while (rest.length != 0)
    {
        // One whole code point per iteration, not one code unit.
        dchar c = decode(rest);
        static if (is(R == Tgt[]))
        {
            // The array overload always writes at the front of the slice
            // it is given, so step past what was just written.
            uint n = encode!(Tgt)(c, range);
            range = range[n .. $];
            result += n;
        }
        else
        {
            result += encode!(Tgt)(c, range);
        }
    }
    return result;
}
1687
1688	/**
1689	Convert a string from one encoding to another. (See also to!() below).
1690
1691	The input to this function MUST be validly encoded.
1692	This is enforced by the function's in-contract.
1693
1694	Supercedes:
1695	This function supercedes std.utf.toUTF8(), std.utf.toUTF16() and
1696	std.utf.toUTF32()
1697	(but note that to!() supercedes it more conveniently).
1698
1699	Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
1700
1701	Params:
1702	s = the source string
1703	r = the destination string
1704
1705	Examples:
1706	--------------------------------------------------------
1707	wstring ws;
1708	transcode("hello world",ws);
1709	// transcode from UTF-8 to UTF-16
1710
1711	Latin1String ls;
1712	transcode(ws, ls);
1713	// transcode from UTF-16 to ISO-8859-1
1714	--------------------------------------------------------
1715	*/
void transcode(Src,Dst)(Src[] s,out Dst[] r)
in
{
    // The source must be validly encoded.
    assert(isValid(s));
}
body
{
    static if (is(Src == Dst))
    {
        // Same encoding on both sides: the result refers to the source.
        r = s;
    }
    else static if (is(Src == AsciiChar))
    {
        // ASCII input is routed through the char (UTF-8) path.
        transcode!(char,Dst)(cast(string)s,r);
    }
    else
    {
        // Each decode() call removes one code point from the front of
        // `rest`; its encoded form is appended to the result.
        for (Src[] rest = s; rest.length != 0; )
        {
            dchar d = decode(rest);
            r ~= encode!(Dst)(d);
        }
    }
}
1740
1741	/*
1742	Convert a string from one encoding to another. (See also transcode() above).
1743
1744	The input to this function MUST be validly encoded.
1745	This is enforced by the function's in-contract.
1746
1747	Supercedes:
1748	This function supercedes std.utf.toUTF8(), std.utf.toUTF16() and
1749	std.utf.toUTF32().
1750
1751	Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
1752
1753	Params:
1754	Dst = the destination encoding type
1755	s = the source string
1756
1757	Examples:
1758	-----------------------------------------------------------------------------
1759	auto ws = to!(wchar)("hello world"); // transcode from UTF-8 to UTF-16
1760	auto ls = to!(Latin1Char)(ws); // transcode from UTF-16 to ISO-8859-1
1761	-----------------------------------------------------------------------------
1762	*/
1763	// TODO: Commented out for now - to be moved to std.conv
1764	// Dst to(Dst,Src)(immutable(Src)[] s)
1765	// in
1766	// {
1767	// assert(isValid(s));
1768	// }
1769	// body
1770	// {
1771	// Dst r;
1772	// transcode(s,r);
1773	// return r;
1774	// }
1775
1776	//=============================================================================
1777
/** The base class for exceptions thrown by this module */
class EncodingException : Exception
{
    /// Creates the exception with the given message.
    this(string msg)
    {
        super(msg);
    }
}
1780
/**
Specialisation of EncodingException. Going by its name it is meant for
encoding names that no scheme recognises. The constructor is private, so
only code in this module can create one.
*/
class UnrecognizedEncodingException : EncodingException
{
    private this(string msg)
    {
        super(msg);
    }
}
1785
/** Abstract base class of all encoding schemes */
abstract class EncodingScheme
{
    /**
     * Registers a subclass of EncodingScheme.
     *
     * This function allows user-defined subclasses of EncodingScheme to
     * be declared in other modules. Every name returned by the subclass's
     * names() is mapped, in lower case, to the class name.
     *
     * Params:
     *    className = the fully qualified name of the subclass
     *
     * Throws:
     *    EncodingException if the class cannot be found or instantiated,
     *    or is not an EncodingScheme.
     *
     * Examples:
     * ----------------------------------------------
     * class Amiga1251 : EncodingScheme
     * {
     *     static this()
     *     {
     *         EncodingScheme.register("path.to.Amiga1251");
     *     }
     * }
     * ----------------------------------------------
     */
    static void register(string className)
    {
        auto scheme = instantiate(className);
        foreach (encodingName; scheme.names())
        {
            // create() looks names up in lower case, so store them that way.
            supported[std.string.tolower(encodingName)] = className;
        }
    }

    /**
     * Obtains a subclass of EncodingScheme which is capable of encoding
     * and decoding the named encoding scheme.
     *
     * This function is only aware of EncodingSchemes which have been
     * registered with the register() function. The name is matched
     * case-insensitively.
     *
     * Throws:
     *    UnrecognizedEncodingException (an EncodingException) if no scheme
     *    is registered under the name; EncodingException if the registered
     *    class cannot be instantiated.
     *
     * Examples:
     * ---------------------------------------------------
     * auto scheme = EncodingScheme.create("Amiga-1251");
     * ---------------------------------------------------
     */
    static EncodingScheme create(string encodingName)
    {
        auto p = std.string.tolower(encodingName) in supported;
        if (p is null)
            throw new UnrecognizedEncodingException("Unrecognized Encoding: "~encodingName);
        return instantiate(*p);
    }

    // Creates an instance of the named class. Throws EncodingException if
    // the class is unknown, cannot be created, or is not an EncodingScheme.
    private static EncodingScheme instantiate(string className)
    {
        // ClassInfo.find yields null for an unknown name, so it must be
        // checked before create() is called on it.
        auto info = ClassInfo.find(className);
        Object instance = (info is null) ? null : info.create();
        auto scheme = cast(EncodingScheme) instance;
        if (scheme is null)
            throw new EncodingException("Unable to create class "~className);
        return scheme;
    }

    /**
     * Returns the standard name of the encoding scheme
     */
    abstract override string toString();

    /**
     * Returns an array of all known names for this encoding scheme
     */
    abstract string[] names();

    /**
     * Returns true if the character c can be represented
     * in this encoding scheme.
     */
    abstract bool canEncode(dchar c);

    /**
     * Returns the number of ubytes required to encode this code point.
     *
     * The input to this function MUST be a valid code point.
     *
     * Params:
     *    c = the code point to be encoded
     *
     * Returns:
     *    the number of ubytes required.
     */
    abstract uint encodedLength(dchar c);

    /**
     * Encodes a single code point into a user-supplied, fixed-size buffer.
     *
     * This function encodes a single code point into one or more ubytes.
     * The supplied buffer must be code unit aligned.
     * (For example, UTF-16LE or UTF-16BE must be wchar-aligned,
     * UTF-32LE or UTF-32BE must be dchar-aligned, etc.)
     *
     * The input to this function MUST be a valid code point.
     *
     * Params:
     *    c = the code point to be encoded
     *    buffer = the destination for the encoded ubytes
     *
     * Returns:
     *    the number of ubytes written.
     */
    abstract uint encode(dchar c, ubyte[] buffer);

    /**
     * Decodes a single code point.
     *
     * This function removes one or more ubytes from the start of an array,
     * and returns the decoded code point which those ubytes represent.
     *
     * The input to this function MUST be validly encoded.
     *
     * Params:
     *    s = the array whose first code point is to be decoded
     */
    abstract dchar decode(ref ubyte[] s);

    /**
     * Decodes a single code point. The input does not have to be valid.
     *
     * This function removes one or more ubytes from the start of an array,
     * and returns the decoded code point which those ubytes represent.
     *
     * This function will accept an invalidly encoded array as input.
     * If an invalid sequence is found at the start of the string, this
     * function will remove it, and return the value INVALID_SEQUENCE.
     *
     * Params:
     *    s = the array whose first code point is to be decoded
     */
    abstract dchar safeDecode(ref ubyte[] s);

    /**
     * Returns the sequence of ubytes to be used to represent
     * any character which cannot be represented in the encoding scheme.
     *
     * Normally this will be a representation of some substitution
     * character, such as U+FFFD or '?'.
     */
    abstract ubyte[] replacementSequence();

    /**
     * Returns true if the array is encoded correctly
     *
     * Params:
     *    s = the array to be tested
     */
    bool isValid(ubyte[] s)
    {
        // s is a by-value slice, so consuming it leaves the caller's intact.
        while (s.length != 0)
        {
            dchar d = safeDecode(s);
            if (d == INVALID_SEQUENCE)
                return false;
        }
        return true;
    }

    /**
     * Returns the length of the longest possible substring, starting from
     * the first element, which is validly encoded.
     *
     * Params:
     *    s = the array to be tested
     */
    uint validLength(ubyte[] s)
    {
        ubyte[] r = s;
        // t tracks what is left after the last successfully decoded sequence.
        ubyte[] t = s;
        while (s.length != 0)
        {
            if (safeDecode(s) == INVALID_SEQUENCE) break;
            t = s;
        }
        return r.length - t.length;
    }

    /**
     * Sanitizes an array by replacing malformed ubyte sequences with valid
     * ubyte sequences. The result is guaranteed to be valid for this
     * encoding scheme.
     *
     * If the input array is already valid, this function returns the
     * original, otherwise it constructs a new array by replacing all illegal
     * sequences with the encoding scheme's replacement sequence.
     *
     * Params:
     *    s = the string to be sanitized
     */
    ubyte[] sanitize(ubyte[] s)
    {
        uint n = validLength(s);
        if (n == s.length) return s;

        auto repSeq = replacementSequence();

        // Count how long the string needs to be.
        // Overestimating is not a problem
        uint len = s.length;
        ubyte[] t = s[n..$];
        while (t.length != 0)
        {
            // t always starts at an invalid sequence here: skip it, reserve
            // room for its replacement, then skip the valid run after it.
            dchar c = safeDecode(t);
            assert(c == INVALID_SEQUENCE);
            len += repSeq.length;
            t = t[validLength(t)..$];
        }

        // Now do the write
        ubyte[] array = new ubyte[len];
        array[0..n] = s[0..n];
        uint offset = n;

        t = s[n..$];
        while (t.length != 0)
        {
            // Replace the invalid sequence at the front of t ...
            dchar c = safeDecode(t);
            assert(c == INVALID_SEQUENCE);
            array[offset .. offset+repSeq.length] = repSeq[];
            offset += repSeq.length;
            // ... then copy the valid run that follows it unchanged.
            n = validLength(t);
            array[offset..offset+n] = t[0..n];
            offset += n;
            t = t[n..$];
        }
        // Trim the overestimate.
        return array[0..offset];
    }

    /**
     * Returns the length of the first encoded sequence.
     *
     * The input to this function MUST be validly encoded.
     * This is enforced by the function's in-contract.
     *
     * Params:
     *    s = the array to be sliced
     */
    uint firstSequence(ubyte[] s)
    in
    {
        assert(s.length != 0);
        ubyte[] u = s;
        assert(safeDecode(u) != INVALID_SEQUENCE);
    }
    body
    {
        ubyte[] t = s;
        decode(s);
        return t.length - s.length;
    }

    /**
     * Returns the total number of code points encoded in a ubyte array.
     *
     * The input to this function MUST be validly encoded.
     * This is enforced by the function's in-contract.
     *
     * Params:
     *    s = the string to be counted
     */
    uint count(ubyte[] s)
    in
    {
        assert(isValid(s));
    }
    body
    {
        uint n = 0;
        while (s.length != 0)
        {
            decode(s);
            ++n;
        }
        return n;
    }

    /**
     * Returns the array index at which the (n+1)th code point begins.
     *
     * The input to this function MUST be validly encoded.
     * This is enforced by the function's in-contract.
     *
     * Params:
     *    s = the string to be counted
     *    n = the number of code points to skip; must not be negative
     */
    int index(ubyte[] s,int n)
    in
    {
        assert(isValid(s));
        assert(n >= 0);
    }
    body
    {
        ubyte[] t = s;
        for (uint i=0; i<n; ++i) decode(s);
        return t.length - s.length;
    }

    // Maps lower-cased encoding names to the class name registered for them.
    static string[string] supported;
}
2084
/**
EncodingScheme to handle ASCII

This scheme recognises the following names:
"ANSI_X3.4-1968",
"ANSI_X3.4-1986",
"ASCII",
"IBM367",
"ISO646-US",
"ISO_646.irv:1991",
"US-ASCII",
"cp367",
"csASCII",
"iso-ir-6",
"us"
*/
class EncodingSchemeASCII : EncodingScheme
{
    // Makes the scheme available through EncodingScheme.create().
    static this()
    {
        EncodingScheme.register("std2.encoding.EncodingSchemeASCII");
    }

    /// Returns every name under which this scheme is registered.
    override string[] names()
    {
        // Every name must be its own element: adjacent string literals
        // without a comma between them are concatenated into one string.
        return
        [
            cast(string)
            "ANSI_X3.4-1968",
            "ANSI_X3.4-1986",
            "ASCII",
            "IBM367",
            "ISO646-US",
            "ISO_646.irv:1991",
            "US-ASCII",
            "cp367",
            "csASCII",
            "iso-ir-6",
            "us"
        ];
    }

    /// Returns the standard name of the scheme.
    override string toString()
    {
        return "ASCII";
    }

    override bool canEncode(dchar c)
    {
        return std2.encoding.canEncode!(AsciiChar)(c);
    }

    override uint encodedLength(dchar c)
    {
        return std2.encoding.encodedLength!(AsciiChar)(c);
    }

    override uint encode(dchar c, ubyte[] buffer)
    {
        // Reinterpret the byte buffer as ASCII code units.
        auto r = cast(AsciiChar[])buffer;
        return std2.encoding.encode(c,r);
    }

    override dchar decode(ref ubyte[] s)
    {
        // Decode from a typed view, then drop the consumed bytes from s.
        // One AsciiChar is one byte, so the lengths are comparable.
        auto t = cast(AsciiChar[]) s;
        dchar c = std2.encoding.decode(t);
        s = s[$-t.length..$];
        return c;
    }

    override dchar safeDecode(ref ubyte[] s)
    {
        auto t = cast(AsciiChar[]) s;
        dchar c = std2.encoding.safeDecode(t);
        s = s[$-t.length..$];
        return c;
    }

    /// Unrepresentable characters are replaced by a question mark.
    override ubyte[] replacementSequence()
    {
        return cast(ubyte[])"?";
    }

}
2171
2172	/**
2173	EncodingScheme to handle Latin-1
2174
2175	This scheme recognises the following names:
2176	"CP819",
2177	"IBM819",
2178	"ISO-8859-1",
2179	"ISO_8859-1",
2180	"ISO_8859-1:1987",
2181	"csISOLatin1",
2182	"iso-ir-100",
2183	"l1",
2184	"latin1"
2185	*/
class EncodingSchemeLatin1 : EncodingScheme
{
    // Makes the scheme available through EncodingScheme.create().
    static this()
    {
        EncodingScheme.register("std2.encoding.EncodingSchemeLatin1");
    }

    /// Returns every name under which this scheme is registered.
    override string[] names()
    {
        string[] aliases =
        [
            cast(string)
            "CP819",
            "IBM819",
            "ISO-8859-1",
            "ISO_8859-1",
            "ISO_8859-1:1987",
            "csISOLatin1",
            "iso-ir-100",
            "l1",
            "latin1"
        ];
        return aliases;
    }

    /// Returns the standard name of the scheme.
    override string toString()
    {
        return "ISO-8859-1";
    }

    override bool canEncode(dchar c)
    {
        return std2.encoding.canEncode!(Latin1Char)(c);
    }

    override uint encodedLength(dchar c)
    {
        return std2.encoding.encodedLength!(Latin1Char)(c);
    }

    override uint encode(dchar c, ubyte[] buffer)
    {
        // Reinterpret the byte buffer as Latin-1 code units.
        auto dest = cast(Latin1Char[]) buffer;
        return std2.encoding.encode(c, dest);
    }

    override dchar decode(ref ubyte[] s)
    {
        // Decode from a typed view of the bytes, then keep only the tail
        // of s that the view still covers.
        auto units = cast(Latin1Char[]) s;
        dchar decoded = std2.encoding.decode(units);
        s = s[s.length - units.length .. s.length];
        return decoded;
    }

    override dchar safeDecode(ref ubyte[] s)
    {
        // Same as decode(), but malformed input is tolerated.
        auto units = cast(Latin1Char[]) s;
        dchar decoded = std2.encoding.safeDecode(units);
        s = s[s.length - units.length .. s.length];
        return decoded;
    }

    /// Unrepresentable characters are replaced by a question mark.
    override ubyte[] replacementSequence()
    {
        return cast(ubyte[])"?";
    }
}
2252
2253	/**
2254	EncodingScheme to handle Windows-1252
2255
2256	This scheme recognises the following names:
2257	"windows-1252"
2258	*/
class EncodingSchemeWindows1252 : EncodingScheme
{
    // Makes the scheme available through EncodingScheme.create().
    static this()
    {
        EncodingScheme.register("std2.encoding.EncodingSchemeWindows1252");
    }

    /// Returns every name under which this scheme is registered.
    override string[] names()
    {
        string[] aliases =
        [
            cast(string)
            "windows-1252"
        ];
        return aliases;
    }

    /// Returns the standard name of the scheme.
    override string toString()
    {
        return "windows-1252";
    }

    override bool canEncode(dchar c)
    {
        return std2.encoding.canEncode!(Windows1252Char)(c);
    }

    override uint encodedLength(dchar c)
    {
        return std2.encoding.encodedLength!(Windows1252Char)(c);
    }

    override uint encode(dchar c, ubyte[] buffer)
    {
        // Reinterpret the byte buffer as Windows-1252 code units.
        auto dest = cast(Windows1252Char[]) buffer;
        return std2.encoding.encode(c, dest);
    }

    override dchar decode(ref ubyte[] s)
    {
        // Decode from a typed view of the bytes, then keep only the tail
        // of s that the view still covers.
        auto units = cast(Windows1252Char[]) s;
        dchar decoded = std2.encoding.decode(units);
        s = s[s.length - units.length .. s.length];
        return decoded;
    }

    override dchar safeDecode(ref ubyte[] s)
    {
        // Same as decode(), but malformed input is tolerated.
        auto units = cast(Windows1252Char[]) s;
        dchar decoded = std2.encoding.safeDecode(units);
        s = s[s.length - units.length .. s.length];
        return decoded;
    }

    /// Unrepresentable characters are replaced by a question mark.
    override ubyte[] replacementSequence()
    {
        return cast(ubyte[])"?";
    }
}
2317
2318	/**
2319	EncodingScheme to handle UTF-8
2320
2321	This scheme recognises the following names:
2322	"UTF-8"
2323	*/
class EncodingSchemeUtf8 : EncodingScheme
{
    // Makes the scheme available through EncodingScheme.create().
    static this()
    {
        EncodingScheme.register("std2.encoding.EncodingSchemeUtf8");
    }

    /// Returns every name under which this scheme is registered.
    override string[] names()
    {
        string[] aliases =
        [
            cast(string)
            "UTF-8"
        ];
        return aliases;
    }

    /// Returns the standard name of the scheme.
    override string toString()
    {
        return "UTF-8";
    }

    override bool canEncode(dchar c)
    {
        return std2.encoding.canEncode!(char)(c);
    }

    override uint encodedLength(dchar c)
    {
        return std2.encoding.encodedLength!(char)(c);
    }

    override uint encode(dchar c, ubyte[] buffer)
    {
        // Reinterpret the byte buffer as UTF-8 code units.
        auto dest = cast(char[]) buffer;
        return std2.encoding.encode(c, dest);
    }

    override dchar decode(ref ubyte[] s)
    {
        // Decode from a typed view of the bytes, then keep only the tail
        // of s that the view still covers (one char is one byte).
        auto units = cast(char[]) s;
        dchar decoded = std2.encoding.decode(units);
        s = s[s.length - units.length .. s.length];
        return decoded;
    }

    override dchar safeDecode(ref ubyte[] s)
    {
        // Same as decode(), but malformed input is tolerated.
        auto units = cast(char[]) s;
        dchar decoded = std2.encoding.safeDecode(units);
        s = s[s.length - units.length .. s.length];
        return decoded;
    }

    /// Unrepresentable characters are replaced by U+FFFD, encoded as UTF-8.
    override ubyte[] replacementSequence()
    {
        return cast(ubyte[])"\uFFFD";
    }

}
2383
2384	/**
2385	EncodingScheme to handle UTF-16 in native byte order
2386
2387	This scheme recognises the following names:
2388	"UTF-16LE" (little-endian architecture only)
2389	"UTF-16BE" (big-endian architecture only)
2390	*/
class EncodingSchemeUtf16Native : EncodingScheme
{
    // Makes the scheme available through EncodingScheme.create().
    static this()
    {
        EncodingScheme.register("std2.encoding.EncodingSchemeUtf16Native");
    }

    // The registered name depends on the byte order of the target.
    version(LittleEndian) { string NAME = "UTF-16LE"; }
    version(BigEndian) { string NAME = "UTF-16BE"; }

    /// Returns every name under which this scheme is registered.
    override string[] names()
    {
        return [ NAME ];
    }

    /// Returns the standard name of the scheme.
    override string toString()
    {
        return NAME;
    }

    override bool canEncode(dchar c)
    {
        return std2.encoding.canEncode!(wchar)(c);
    }

    override uint encodedLength(dchar c)
    {
        return std2.encoding.encodedLength!(wchar)(c);
    }

    override uint encode(dchar c, ubyte[] buffer)
    {
        // The encoder counts wchars; this interface reports ubytes.
        auto r = cast(wchar[])buffer;
        return wchar.sizeof * std2.encoding.encode(c,r);
    }

    override dchar decode(ref ubyte[] s)
    in
    {
        // The byte array must hold a whole number of wchars.
        assert((s.length & 1) == 0);
    }
    body
    {
        auto t = cast(wchar[]) s;
        dchar c = std2.encoding.decode(t);
        // t.length counts wchars while s is indexed in ubytes, so the
        // remaining length has to be scaled before slicing s.
        s = s[$ - t.length * wchar.sizeof .. $];
        return c;
    }

    override dchar safeDecode(ref ubyte[] s)
    in
    {
        // The byte array must hold a whole number of wchars.
        assert((s.length & 1) == 0);
    }
    body
    {
        auto t = cast(wchar[]) s;
        dchar c = std2.encoding.safeDecode(t);
        // Scale the remaining wchar count to ubytes, as in decode().
        s = s[$ - t.length * wchar.sizeof .. $];
        return c;
    }

    /// Unrepresentable characters are replaced by U+FFFD, as UTF-16.
    override ubyte[] replacementSequence()
    {
        return cast(ubyte[])"\uFFFD"w;
    }
}
2458
2459	/**
2460	EncodingScheme to handle UTF-32 in native byte order
2461
2462	This scheme recognises the following names:
2463	"UTF-32LE" (little-endian architecture only)
2464	"UTF-32BE" (big-endian architecture only)
2465	*/
class EncodingSchemeUtf32Native : EncodingScheme
{
    // Makes the scheme available through EncodingScheme.create().
    static this()
    {
        EncodingScheme.register("std2.encoding.EncodingSchemeUtf32Native");
    }

    // The registered name depends on the byte order of the target.
    version(LittleEndian) { string NAME = "UTF-32LE"; }
    version(BigEndian) { string NAME = "UTF-32BE"; }

    /// Returns every name under which this scheme is registered.
    override string[] names()
    {
        return [ NAME ];
    }

    /// Returns the standard name of the scheme.
    override string toString()
    {
        return NAME;
    }

    override bool canEncode(dchar c)
    {
        return std2.encoding.canEncode!(dchar)(c);
    }

    override uint encodedLength(dchar c)
    {
        return std2.encoding.encodedLength!(dchar)(c);
    }

    override uint encode(dchar c, ubyte[] buffer)
    {
        // The encoder counts dchars; this interface reports ubytes.
        auto r = cast(dchar[])buffer;
        return dchar.sizeof * std2.encoding.encode(c,r);
    }

    override dchar decode(ref ubyte[] s)
    in
    {
        // The byte array must hold a whole number of dchars.
        assert((s.length & 3) == 0);
    }
    body
    {
        auto t = cast(dchar[]) s;
        dchar c = std2.encoding.decode(t);
        // t.length counts dchars while s is indexed in ubytes, so the
        // remaining length has to be scaled before slicing s.
        s = s[$ - t.length * dchar.sizeof .. $];
        return c;
    }

    override dchar safeDecode(ref ubyte[] s)
    in
    {
        // The byte array must hold a whole number of dchars.
        assert((s.length & 3) == 0);
    }
    body
    {
        auto t = cast(dchar[]) s;
        dchar c = std2.encoding.safeDecode(t);
        // Scale the remaining dchar count to ubytes, as in decode().
        s = s[$ - t.length * dchar.sizeof .. $];
        return c;
    }

    /// Unrepresentable characters are replaced by U+FFFD, as UTF-32.
    override ubyte[] replacementSequence()
    {
        return cast(ubyte[])"\uFFFD"d;
    }
}
2534
/*
Transcodes s into r like transcode(), but builds the result back to front:
the code points of s and the code units of each code point are visited in
reverse order, and every unit is prepended to r.

Params:
s = the source string
r = receives the transcoded string
*/
void transcodeReverse(Src,Dst)(Src[] s, out Dst[] r)
{
    static if(is(Src==Dst))
    {
        // Same encoding: the result is the source itself. The function
        // returns void, so the string goes out through r.
        r = s;
    }
    else static if(is(Src==AsciiChar))
    {
        // ASCII input is routed through the char (UTF-8) path.
        transcodeReverse!(char,Dst)(cast(string)s,r);
    }
    else
    {
        foreach_reverse(d;codePoints(s))
        {
            foreach_reverse(c;codeUnits!(Dst)(d))
            {
                // Prepending while walking backwards restores the
                // forward order.
                r = c ~ r;
            }
        }
    }
}
2556
// Renders a UTF-8 string as a double-quoted literal. Code units in the
// range 0x20 to 0x7F are copied as they are; every other unit is written
// as \x followed by two hex digits.
string makeReadable(string s)
{
    string quoted = "\"";
    foreach (char unit; s)
    {
        bool copyVerbatim = (unit >= 0x20 && unit < 0x80);
        if (!copyVerbatim)
        {
            quoted ~= "\\x";
            quoted ~= toHexDigit(unit >> 4);
            quoted ~= toHexDigit(unit);
            continue;
        }
        quoted ~= unit;
    }
    quoted ~= "\"";
    return quoted;
}
2576
// Renders a UTF-16 string as a double-quoted literal with a w suffix.
// Code units in the range 0x20 to 0x7F are copied as chars; every other
// unit is written as \u followed by four hex digits.
string makeReadable(wstring s)
{
    string quoted = "\"";
    foreach (wchar unit; s)
    {
        bool copyVerbatim = (unit >= 0x20 && unit < 0x80);
        if (!copyVerbatim)
        {
            quoted ~= "\\u";
            // Most significant nibble first.
            for (int shift = 12; shift >= 0; shift -= 4)
            {
                quoted ~= toHexDigit(unit >> shift);
            }
            continue;
        }
        quoted ~= cast(char) unit;
    }
    quoted ~= "\"w";
    return quoted;
}
2598
// Renders a UTF-32 string as a double-quoted literal with a d suffix.
// Code points in the range 0x20 to 0x7F are copied as chars, other code
// points below 0x10000 are written as \u plus four hex digits, and the
// rest as \U00 plus six hex digits.
string makeReadable(dstring s)
{
    string quoted = "\"";
    foreach (dchar point; s)
    {
        if (point >= 0x20 && point < 0x80)
        {
            quoted ~= cast(char) point;
            continue;
        }

        // Pick the escape prefix and the shift of the leading hex digit.
        int topShift;
        if (point < 0x10000)
        {
            quoted ~= "\\u";
            topShift = 12;
        }
        else
        {
            quoted ~= "\\U00";
            topShift = 20;
        }
        // Most significant nibble first.
        for (int shift = topShift; shift >= 0; shift -= 4)
        {
            quoted ~= toHexDigit(point >> shift);
        }
    }
    quoted ~= "\"d";
    return quoted;
}
2630
// Returns the upper-case hex digit for the low four bits of n; all
// higher bits are ignored.
char toHexDigit(int n)
{
    int nibble = n & 0xF;
    return "0123456789ABCDEF"[nibble];
}
2635
2636
2637	unittest
2638	{
2639	void TestEncoding()
2640	{
2641	ubyte[][] validStrings =
2642	[
2643	// Plain ASCII
2644	cast(ubyte[])"hello",
2645
2646	// First possible sequence of a certain length
2647	[ 0x00 ], // U+00000000 one byte
2648	[ 0xC2, 0x80 ], // U+00000080 two bytes
2649	[ 0xE0, 0xA0, 0x80 ], // U+00000800 three bytes
2650	[ 0xF0, 0x90, 0x80, 0x80 ], // U+00010000 three bytes
2651
2652	// Last possible sequence of a certain length
2653	[ 0x7F ], // U+0000007F one byte
2654	[ 0xDF, 0xBF ], // U+000007FF two bytes
2655	[ 0xEF, 0xBF, 0xBF ], // U+0000FFFF three bytes
2656
2657	// Other boundary conditions
2658	[ 0xED, 0x9F, 0xBF ],
2659	// U+0000D7FF Last character before surrogates
2660	[ 0xEE, 0x80, 0x80 ],
2661	// U+0000E000 First character after surrogates
2662	[ 0xEF, 0xBF, 0xBD ],
2663	// U+0000FFFD Unicode replacement character
2664	[ 0xF4, 0x8F, 0xBF, 0xBF ],
2665	// U+0010FFFF Very last character
2666
2667	// Non-character code points
2668	/* NOTE: These are legal in UTF, and may be converted from
2669	one UTF to another, however they do not represent Unicode
2670	characters. These code points have been reserved by
2671	Unicode as non-character code points. They are permissible
2672	for data exchange within an application, but they are
2673	not permitted to be used as characters. Since this module
2674	deals with UTF, and not with Unicode per se, we choose to
2675	accept them here. */
2676	[ 0xDF, 0xBE ], // U+0000FFFE
2677	[ 0xDF, 0xBF ], // U+0000FFFF
2678	];
2679
2680
2681	ubyte[][] invalidStrings =
2682	[
2683	// First possible sequence of a certain length, but greater
2684	// than U+10FFFF
2685	[ 0xF8, 0x88, 0x80, 0x80, 0x80 ], // U+00200000 five bytes
2686	[ 0xFC, 0x84, 0x80, 0x80, 0x80, 0x80 ], // U+04000000 six bytes
2687
2688	// Last possible sequence of a certain length, but greater than U+10FFFF
2689	[ 0xF7, 0xBF, 0xBF, 0xBF ], // U+001FFFFF four bytes
2690	[ 0xFB, 0xBF, 0xBF, 0xBF, 0xBF ], // U+03FFFFFF five bytes
2691	[ 0xFD, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF ], // U+7FFFFFFF six bytes
2692
2693	// Other boundary conditions
2694	[ 0xF4, 0x90, 0x80, 0x80 ], // U+00110000
2695	// First code
2696	// point after
2697	// last character
2698
2699	// Unexpected continuation bytes
2700	[ 0x80 ],
2701	[ 0xBF ],
2702	[ 0x20, 0x80, 0x20 ],
2703	[ 0x20, 0xBF, 0x20 ],
2704	[ 0x80, 0x9F, 0xA0 ],
2705
2706	// Lonely start bytes
2707	[ 0xC0 ],
2708	[ 0xCF ],
2709	[ 0x20, 0xC0, 0x20 ],
2710	[ 0x20, 0xCF, 0x20 ],
2711	[ 0xD0 ],
2712	[ 0xDF ],
2713	[ 0x20, 0xD0, 0x20 ],
2714	[ 0x20, 0xDF, 0x20 ],
2715	[ 0xE0 ],
2716	[ 0xEF ],
2717	[ 0x20, 0xE0, 0x20 ],
2718	[ 0x20, 0xEF, 0x20 ],
2719	[ 0xF0 ],
2720	[ 0xF1 ],
2721	[ 0xF2 ],
2722	[ 0xF3 ],
2723	[ 0xF4 ],
2724	[ 0xF5 ], // If this were legal it would start a character > U+10FFFF
2725	[ 0xF6 ], // If this were legal it would start a character > U+10FFFF
2726	[ 0xF7 ], // If this were legal it would start a character > U+10FFFF
2727
2728	[ 0xEF, 0xBF ], // Three byte sequence with third byte missing
2729	[ 0xF7, 0xBF, 0xBF ], // Four byte sequence with fourth byte missing
2730	[ 0xEF, 0xBF, 0xF7, 0xBF, 0xBF ], // Concatenation of the above
2731
2732	// Impossible bytes
2733	[ 0xF8 ],
2734	[ 0xF9 ],
2735	[ 0xFA ],
2736	[ 0xFB ],
2737	[ 0xFC ],
2738	[ 0xFD ],
2739	[ 0xFE ],
2740	[ 0xFF ],
2741	[ 0x20, 0xF8, 0x20 ],
2742	[ 0x20, 0xF9, 0x20 ],
2743	[ 0x20, 0xFA, 0x20 ],
2744	[ 0x20, 0xFB, 0x20 ],
2745	[ 0x20, 0xFC, 0x20 ],
2746	[ 0x20, 0xFD, 0x20 ],
2747	[ 0x20, 0xFE, 0x20 ],
2748	[ 0x20, 0xFF, 0x20 ],
2749
2750	// Overlong sequences, all representing U+002F
2751	/* With a safe UTF-8 decoder, all of the following five overlong
2752	representations of the ASCII character slash ("/") should be
2753	rejected like a malformed UTF-8 sequence */
2754	[ 0xC0, 0xAF ],
2755	[ 0xE0, 0x80, 0xAF ],
2756	[ 0xF0, 0x80, 0x80, 0xAF ],
2757	[ 0xF8, 0x80, 0x80, 0x80, 0xAF ],
2758	[ 0xFC, 0x80, 0x80, 0x80, 0x80, 0xAF ],
2759
2760	// Maximum overlong sequences
2761	/* Below you see the highest Unicode value that is still resulting in
2762	an overlong sequence if represented with the given number of bytes.
2763	This is a boundary test for safe UTF-8 decoders. All five
2764	characters should be rejected like malformed UTF-8 sequences. */
2765	[ 0xC1, 0xBF ], // U+0000007F
2766	[ 0xE0, 0x9F, 0xBF ], // U+000007FF
2767	[ 0xF0, 0x8F, 0xBF, 0xBF ], // U+0000FFFF
2768	[ 0xF8, 0x87, 0xBF, 0xBF, 0xBF ], // U+001FFFFF
2769	[ 0xFC, 0x83, 0xBF, 0xBF, 0xBF, 0xBF ], // U+03FFFFFF
2770
2771	// Overlong representation of the NUL character
2772	/* The following five sequences should also be rejected like malformed
2773	UTF-8 sequences and should not be treated like the ASCII NUL
2774	character. */
2775	[ 0xC0, 0x80 ],
2776	[ 0xE0, 0x80, 0x80 ],
2777	[ 0xF0, 0x80, 0x80, 0x80 ],
2778	[ 0xF8, 0x80, 0x80, 0x80, 0x80 ],
2779	[ 0xFC, 0x80, 0x80, 0x80, 0x80, 0x80 ],
2780
2781	// Illegal code positions
2782	/* The following UTF-8 sequences should be rejected like malformed
2783	sequences, because they never represent valid ISO 10646 characters
2784	and a UTF-8 decoder that accepts them might introduce security
2785	problems comparable to overlong UTF-8 sequences. */
2786	[ 0xED, 0xA0, 0x80 ], // U+D800
2787	[ 0xED, 0xAD, 0xBF ], // U+DB7F
2788	[ 0xED, 0xAE, 0x80 ], // U+DB80
2789	[ 0xED, 0xAF, 0xBF ], // U+DBFF
2790	[ 0xED, 0xB0, 0x80 ], // U+DC00
2791	[ 0xED, 0xBE, 0x80 ], // U+DF80
2792	[ 0xED, 0xBF, 0xBF ], // U+DFFF
2793	];
2794
2795	string[] sanitizedStrings =
2796	[
2797	"\uFFFD","\uFFFD",
2798	"\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD"," \uFFFD ",
2799	" \uFFFD ","\uFFFD\uFFFD\uFFFD","\uFFFD","\uFFFD"," \uFFFD "," \uFFFD ",
2800	"\uFFFD","\uFFFD"," \uFFFD "," \uFFFD ","\uFFFD","\uFFFD"," \uFFFD ",
2801	" \uFFFD ","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
2802	"\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD\uFFFD","\uFFFD","\uFFFD",
2803	"\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD"," \uFFFD ",
2804	" \uFFFD "," \uFFFD "," \uFFFD "," \uFFFD "," \uFFFD "," \uFFFD ",
2805	" \uFFFD ","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
2806	"\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
2807	"\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
2808	];
2809
2810	// Make sure everything that should be valid, is
2811	foreach(a;validStrings)
2812	{
2813	string s = cast(string)a;
2814	assert(isValid(s),"Failed to validate: "~makeReadable(s));
2815	}
2816
2817	// Make sure everything that shouldn't be valid, isn't
2818	foreach(a;invalidStrings)
2819	{
2820	string s = cast(string)a;
2821	assert(!isValid(s),"Incorrectly validated: "~makeReadable(s));
2822	}
2823
2824	// Make sure we can sanitize everything bad
2825	assert(invalidStrings.length == sanitizedStrings.length);
2826	for(int i=0; i<invalidStrings.length; ++i)
2827	{
2828	string s = cast(string)invalidStrings[i];
2829	string t = sanitize(s);
2830	assert(isValid(t));
2831	assert(t == sanitizedStrings[i]);
2832	ubyte[] u = cast(ubyte[])t;
2833	validStrings ~= u;
2834	}
2835
2836	// Make sure all transcodings work in both directions, using both forward
2837	// and reverse iteration
2838	foreach(i,a; validStrings)
2839	{
2840	string s = cast(string)a;
2841	string s2;
2842	wstring ws, ws2;
2843	dstring ds, ds2;
2844
2845	transcode(s,ws);
2846	assert(isValid(ws));
2847	transcode(ws,s2);
2848	assert(s == s2);
2849
2850	transcode(s,ds);
2851	assert(isValid(ds));
2852	transcode(ds,s2);
2853	assert(s == s2);
2854
2855	transcode(ws,s);
2856	assert(isValid(s));
2857	transcode(s,ws2);
2858	assert(ws == ws2);
2859
2860	transcode(ws,ds);
2861	assert(isValid(ds));
2862	transcode(ds,ws2);
2863	assert(ws == ws2);
2864
2865	transcode(ds,s);
2866	assert(isValid(s));
2867	transcode(s,ds2);
2868	assert(ds == ds2);
2869
2870	transcode(ds,ws);
2871	assert(isValid(ws));
2872	transcode(ws,ds2);
2873	assert(ds == ds2);
2874
2875	transcodeReverse(s,ws);
2876	assert(isValid(ws));
2877	transcodeReverse(ws,s2);
2878	assert(s == s2);
2879
2880	transcodeReverse(s,ds);
2881	assert(isValid(ds));
2882	transcodeReverse(ds,s2);
2883	assert(s == s2);
2884
2885	transcodeReverse(ws,s);
2886	assert(isValid(s));
2887	transcodeReverse(s,ws2);
2888	assert(ws == ws2);
2889
2890	transcodeReverse(ws,ds);
2891	assert(isValid(ds));
2892	transcodeReverse(ds,ws2);
2893	assert(ws == ws2);
2894
2895	transcodeReverse(ds,s);
2896	assert(isValid(s));
2897	transcodeReverse(s,ds2);
2898	assert(ds == ds2);
2899
2900	transcodeReverse(ds,ws);
2901	assert(isValid(ws));
2902	transcodeReverse(ws,ds2);
2903	assert(ds == ds2);
2904	}
2905
2906	// Make sure the non-UTF encodings work too
2907	{
2908	auto s = "\u20AC100";
2909	Windows1252String t;
2910	transcode(s,t);
2911	assert(t == [cast(Windows1252Char)0x80, '1', '0', '0']);
2912	string u;
2913	transcode(s,u);
2914	assert(s == u);
2915	Latin1String v;
2916	transcode(s,v);
2917	assert(cast(string)v == "?100");
2918	AsciiString w;
2919	transcode(v,w);
2920	assert(cast(string)w == "?100");
2921	}
2922
2923	// Make sure we can count properly
2924	{
2925	assert(encodedLength!(char)('A') == 1);
2926	assert(encodedLength!(char)('\u00E3') == 2);
2927	assert(encodedLength!(char)('\u2028') == 3);
2928	assert(encodedLength!(char)('\U0010FFF0') == 4);
2929	assert(encodedLength!(wchar)('A') == 1);
2930	assert(encodedLength!(wchar)('\U0010FFF0') == 2);
2931	}
2932
2933	// Make sure we can write into mutable arrays
2934	{
2935	char[4] buffer;
2936	uint n = encode(cast(dchar)'\u00E3',buffer);
2937	assert(n == 2);
2938	assert(buffer[0] == 0xC3);
2939	assert(buffer[1] == 0xA3);
2940	}
2941	}
2942	TestEncoding();
2943
2944	}
2945	version (unittest_report) // optional reporting: compiled in only when the unittest_report version identifier is set
2946	{
2947	import std.stdio; // supplies writefln, used by the reporting unittest below
2948	unittest { // separate unittest block whose only job is to print a confirmation line
2949	writefln("unittest std2.encoding passed"); // fixed message naming this module
2950	} // NOTE(review): presumably reached only after the preceding unittest block succeeded - confirm test ordering
2951	}
2952	//=============================================================================

Download in other formats:

Original Format

std2

WikiStart: encoding.d

Download in other formats: