Note: This website is archived. For up-to-date information about D projects and development, please visit wiki.dlang.org.

WikiStart: encoding.d

File encoding.d, 71.6 kB (added by y0uf00bar, 15 years ago)

std2.encoding

Line 
1 // Written in the D programming language.
2
3 /**
4 Classes and functions for handling and transcoding between various encodings.
5
6 For cases where the _encoding is known at compile-time, functions are provided
7 for arbitrary _encoding and decoding of characters, arbitrary transcoding
8 between strings of different type, as well as validation and sanitization.
9
10 Encodings currently supported are UTF-8, UTF-16, UTF-32, ASCII, ISO-8859-1
11 (also known as LATIN-1), and WINDOWS-1252.
12
13 $(UL
14 $(LI The type $(D AsciiChar) represents an ASCII character.)
15 $(LI The type $(D AsciiString) represents an ASCII string.)
16 $(LI The type $(D Latin1Char) represents an ISO-8859-1 character.)
17 $(LI The type $(D Latin1String) represents an ISO-8859-1 string.)
18 $(LI The type $(D Windows1252Char) represents a Windows-1252 character.)
19 $(LI The type $(D Windows1252String) represents a Windows-1252 string.))
20
21 For cases where the _encoding is not known at compile-time, but is
22 known at run-time, we provide the abstract class $(D EncodingScheme)
23 and its subclasses.  To construct a run-time encoder/decoder, one does
24 e.g.
25
26 ----------------------------------------------------
27     auto e = EncodingScheme.create("utf-8");
28 ----------------------------------------------------
29
30 This library supplies $(D EncodingScheme) subclasses for ASCII,
31 ISO-8859-1 (also known as LATIN-1), WINDOWS-1252, UTF-8, and (on
32 little-endian architectures) UTF-16LE and UTF-32LE; or (on big-endian
33 architectures) UTF-16BE and UTF-32BE.
34
35 This library provides a mechanism whereby other modules may add $(D
36 EncodingScheme) subclasses for any other _encoding.
37
38 Authors: Janice Caron
39
40 Date: 2008.02.27 - 2008.05.07
41
42 License: Public Domain
43
44 Macros:
45     WIKI=Phobos/StdEncoding
46 */
47
48 module std2.encoding;
49 import std.string;
50 import std.traits;
51
52
53 //=============================================================================
54
55 /** Special value returned by $(D safeDecode) to signal malformed input; 0xFFFFFFFF is never a valid code point. */
56 const dchar INVALID_SEQUENCE =  cast(dchar)0xFFFFFFFF;
57
58 template EncoderFunctions()
59 {
60     // Various forms of read. These helpers do not declare E or s themselves;
61     // both must already exist in the scope they are mixed into.
62     template ReadFromString()
63     {
64         bool canRead() { return s.length != 0; }
65         E peek() { return s[0]; }
66         E read() { E t = s[0]; s = s[1..$]; return t; } // removes the front code unit from s
67     }
68     // Same interface as ReadFromString, but works on the back of s.
69     template ReverseReadFromString()
70     {
71         bool canRead() { return s.length != 0; }
72         E peek() { return s[$-1]; }
73         E read() { E t = s[$-1]; s = s[0..$-1]; return t; } // removes the last code unit from s
74     }
75
76     // Various forms of Write
77     // WriteToString owns its output; the other writers use a name from the enclosing scope.
78     template WriteToString()
79     {
80         E[] s; // collects every code unit passed to write
81         void write(E c) { s ~= c; }
82     }
83     // Expects a slice named array in the enclosing scope.
84     template WriteToArray()
85     {
86         void write(E c) { array[0] = c; array = array[1..$]; } // store c, then advance array past it
87     }
88     // Expects an appendable buffer in the enclosing scope.
89     deprecated template WriteToBuffer()
90     {
91         void write(E c) { buffer ~= c; }
92     }
93     // Expects a callable named dg in the enclosing scope.
94     template WriteToDelegate()
95     {
96         void write(E c) { dg(c); }
97     }
98
99     // Functions we will export. Each wrapper mixes in the primitive of the
100     // matching name (encodeViaWrite, skipViaRead, ...) and forwards to it.
101     template EncodeViaWrite()
102     {
103         mixin encodeViaWrite;
104         void encode(dchar c) { encodeViaWrite(c); }
105     }
106
107     template SkipViaRead()
108     {
109         mixin skipViaRead;
110         void skip() { skipViaRead(); }
111     }
112
113     template DecodeViaRead()
114     {
115         mixin decodeViaRead;
116         dchar decode() { return decodeViaRead(); }
117     }
118
119     template SafeDecodeViaRead()
120     {
121         mixin safeDecodeViaRead;
122         dchar safeDecode() { return safeDecodeViaRead(); }
123     }
124
125     template DecodeReverseViaRead()
126     {
127         mixin decodeReverseViaRead;
128         dchar decodeReverse() { return decodeReverseViaRead(); }
129     }
130
131     // Encoding to different destinations: a writer combined with EncodeViaWrite
132
133     template EncodeToString()
134     {
135         mixin WriteToString;
136         mixin EncodeViaWrite;
137     }
138    
139     template EncodeToArray()
140     {
141         mixin WriteToArray;
142         mixin EncodeViaWrite;
143     }
144
145     deprecated template EncodeToBuffer()
146     {
147         mixin WriteToBuffer;
148         mixin EncodeViaWrite;
149     }
150
151     template EncodeToDelegate()
152     {
153         mixin WriteToDelegate;
154         mixin EncodeViaWrite;
155     }
156
157     // Decoding functions: a reader combined with the matching *ViaRead wrapper
158
159     template SkipFromString()
160     {
161         mixin ReadFromString;
162         mixin SkipViaRead;
163     }
164
165     template DecodeFromString()
166     {
167         mixin ReadFromString;
168         mixin DecodeViaRead;
169     }
170
171     template SafeDecodeFromString()
172     {
173         mixin ReadFromString;
174         mixin SafeDecodeViaRead;
175     }
176
177     template DecodeReverseFromString()
178     {
179         mixin ReverseReadFromString;
180         mixin DecodeReverseViaRead;
181     }
182
183     //=========================================================================
184
185     // Below are the functions we will ultimately expose to the user
186     // Encodes c and returns the code units that were written.
187     E[] encode(dchar c)
188     {
189         mixin EncodeToString e;
190         e.encode(c);
191         return e.s;
192     }
193     // Encodes c into the front of array and advances array past the written units.
194     void encode(dchar c, ref E[] array)
195     {
196         mixin EncodeToArray e;
197         e.encode(c);
198     }
199     // Encodes c, handing each resulting code unit to dg.
200     void encode(dchar c, void delegate(E) dg)
201     {
202         mixin EncodeToDelegate e;
203         e.encode(c);
204     }
205     // Advances s past whatever the encoding's skipViaRead consumes from its front.
206     void skip(ref E[] s)
207     {
208         mixin SkipFromString e;
209         e.skip();
210     }
211     // Decodes one code point from the front of s, advancing s.
212     dchar decode(S)(ref S s)
213     {
214         mixin DecodeFromString e;
215         return e.decode();
216     }
217     // Like decode, but forwards to safeDecodeViaRead, which reports rejected input as INVALID_SEQUENCE.
218     dchar safeDecode(S)(ref S s)
219     {
220         mixin SafeDecodeFromString e;
221         return e.safeDecode();
222     }
223     // Decodes one code point from the back of s, shrinking s from its end.
224     dchar decodeReverse(ref E[] s)
225     {
226         mixin DecodeReverseFromString e;
227         return e.decodeReverse();
228     }
229 }
230
231 //=========================================================================
232
233 struct CodePoints(E)
234 {
235     E[] s; // remaining input; the opApply overloads consume it as they iterate
236     // Wraps s for foreach; the in-contract requires s to be validly encoded.
237     static CodePoints opCall(E[] s)
238     in
239     {
240         assert(isValid(s));
241     }
242     body
243     {
244         CodePoints codePoints;
245         codePoints.s = s;
246         return codePoints;
247     }
248     // Passes each decoded code point to dg, front to back; stops when dg returns non-zero.
249     int opApply(int delegate(ref dchar) dg)
250     {
251         int result = 0;
252         while (s.length != 0)
253         {
254             dchar c = decode(s);
255             result = dg(c);
256             if (result != 0) break;
257         }
258         return result;
259     }
260     // As above, also passing the code-unit offset at which each code point starts.
261     int opApply(int delegate(ref uint, ref dchar) dg)
262     {
263         uint i = 0;
264         int result = 0;
265         while (s.length != 0)
266         {
267             uint len = s.length;
268             dchar c = decode(s);
269             uint j = i; // We don't want the delegate corrupting i
270             result = dg(j,c);
271             if (result != 0) break;
272             i += len - s.length; // advance by the number of code units just decoded
273         }
274         return result;
275     }
276     // Passes each code point to dg using decodeReverse; stops when dg returns non-zero.
277     int opApplyReverse(int delegate(ref dchar) dg)
278     {
279         int result = 0;
280         while (s.length != 0)
281         {
282             dchar c = decodeReverse(s);
283             result = dg(c);
284             if (result != 0) break;
285         }
286         return result;
287     }
288     // Reverse iteration with an index: dg receives the length of s left after each decodeReverse call.
289     int opApplyReverse(int delegate(ref uint, ref dchar) dg)
290     {
291         int result = 0;
292         while (s.length != 0)
293         {
294             dchar c = decodeReverse(s);
295             uint i = s.length;
296             result = dg(i,c);
297             if (result != 0) break;
298         }
299         return result;
300     }
301 }
302
303 struct CodeUnits(E)
304 {
305     E[] s; // the code units produced by encoding one code point
306     // Encodes d as code units of type E; the in-contract requires d to be a valid code point.
307     static CodeUnits opCall(dchar d)
308     in
309     {
310         assert(isValidCodePoint(d));
311     }
312     body
313     {
314         CodeUnits codeUnits;
315         codeUnits.s = encode!(E)(d);
316         return codeUnits;
317     }
318     // Passes each stored code unit to dg in order; stops when dg returns non-zero.
319     int opApply(int delegate(ref E) dg)
320     {
321         int result = 0;
322         foreach(E c;s)
323         {
324             result = dg(c);
325             if (result != 0) break;
326         }
327         return result;
328     }
329     // Same as opApply, but visits the code units from last to first.
330     int opApplyReverse(int delegate(ref E) dg)
331     {
332         int result = 0;
333         foreach_reverse(E c;s)
334         {
335             result = dg(c);
336             if (result != 0) break;
337         }
338         return result;
339     }
340 }
341
342 //=============================================================================
343
344 template EncoderInstance(E)
345 { // Fallback for types without a specialization: instantiating it fails at compile time.
346     static assert(false,"Cannot instantiate EncoderInstance for type "
347         ~ E.stringof);
348 }
349
350 //=============================================================================
351 //          ASCII
352 //=============================================================================
353
354 /** Defines an ASCII-encoded character (a typedef of ubyte). */
355 typedef ubyte AsciiChar;
356 /** Defines an ASCII-encoded string (as an array of $(D AsciiChar)). */
357 alias AsciiChar[] AsciiString;
358
359 template EncoderInstance(CharType : AsciiChar)
360 {
361     alias AsciiChar E;
362     alias AsciiString EString;
363
364     string encodingName()
365     {
366         return "ASCII";
367     }
368     // Only code points below 0x80 are representable.
369     bool canEncode(dchar c)
370     {
371         return c < 0x80;
372     }
373     // Code units 0x80 and above are not legal ASCII.
374     bool isValidCodeUnit(AsciiChar c)
375     {
376         return c < 0x80;
377     }
378     // Every encodable code point takes exactly one code unit.
379     uint encodedLength(dchar c)
380     in
381     {
382         assert(canEncode(c));
383     }
384     body
385     {
386         return 1;
387     }
388     // Writes c to r via r.write, substituting '?' when c cannot be encoded.
389     void encodeX(Range)(dchar c, Range r)
390     {
391         if (!canEncode(c)) c = '?';
392         r.write(cast(AsciiChar) c);
393     }
394     // Same substitution rule, but writes through the write() found at the mixin site.
395     void encodeViaWrite()(dchar c)
396     {
397         if (!canEncode(c)) c = '?';
398         write(cast(AsciiChar)c);
399     }
400     // The *ViaRead primitives below use the read() found at the mixin site.
401     void skipViaRead()()
402     {
403         read();
404     }
405
406     dchar decodeViaRead()()
407     {
408         return read;
409     }
410     // Returns INVALID_SEQUENCE for any code unit of 0x80 or above.
411     dchar safeDecodeViaRead()()
412     {
413         dchar c = read;
414         return canEncode(c) ? c : INVALID_SEQUENCE;
415     }
416
417     dchar decodeReverseViaRead()()
418     {
419         return read;
420     }
421     // Code units used in place of input that cannot be represented.
422     EString replacementSequence()
423     {
424         return cast(EString)("?");
425     }
426     // Adds the user-facing encode/skip/decode functions built on the primitives above.
427     mixin EncoderFunctions;
428 }
429
430 //=============================================================================
431 //          ISO-8859-1
432 //=============================================================================
433
434 /** Defines a Latin1-encoded character. */
435 typedef ubyte Latin1Char;
436 /**
437 Defines a Latin1-encoded string (as a mutable array of $(D
438 Latin1Char)).
439  */
440 alias Latin1Char[] Latin1String; ///
441
442 template EncoderInstance(CharType : Latin1Char)
443 {
444     alias Latin1Char E;
445     alias Latin1String EString;
446
447     string encodingName()
448     {
449         return "ISO-8859-1";
450     }
451     // Code points below 0x100 are representable.
452     bool canEncode(dchar c)
453     {
454         return c < 0x100;
455     }
456     // Every code unit value is accepted.
457     bool isValidCodeUnit(Latin1Char c)
458     {
459         return true;
460     }
461     // Every encodable code point takes exactly one code unit.
462     uint encodedLength(dchar c)
463     in
464     {
465         assert(canEncode(c));
466     }
467     body
468     {
469         return 1;
470     }
471     // Writes c through the write() found at the mixin site, substituting '?' when c cannot be encoded.
472     void encodeViaWrite()(dchar c)
473     {
474         if (!canEncode(c)) c = '?';
475         write(cast(Latin1Char)c);
476     }
477     // The *ViaRead primitives below use the read() found at the mixin site.
478     void skipViaRead()()
479     {
480         read();
481     }
482     // The code unit value is returned unchanged as the code point.
483     dchar decodeViaRead()()
484     {
485         return read;
486     }
487     // Never returns INVALID_SEQUENCE: the unit read is returned as-is.
488     dchar safeDecodeViaRead()()
489     {
490         return read;
491     }
492
493     dchar decodeReverseViaRead()()
494     {
495         return read;
496     }
497     // Code units used in place of input that cannot be represented.
498     EString replacementSequence()
499     {
500         return cast(EString)("?");
501     }
502     // Adds the user-facing encode/skip/decode functions built on the primitives above.
503     mixin EncoderFunctions;
504 }
505
506 //=============================================================================
507 //          WINDOWS-1252
508 //=============================================================================
509
510 /** Defines a Windows1252-encoded character. */
511 typedef ubyte Windows1252Char;
512 /**
513 Defines a Windows1252-encoded string (as a mutable array of $(D
514 Windows1252Char)).
515  */
516 alias Windows1252Char[] Windows1252String; ///
517
518 template EncoderInstance(CharType : Windows1252Char)
519 {
520     alias Windows1252Char E;
521     alias Windows1252String EString;
522
523     string encodingName()
524     {
525         return "windows-1252";
526     }
527     // Code points for code units 0x80..0x9F; U+FFFD marks an unassigned code unit.
528     static const wstring charMap =
529         "\u20AC\uFFFD\u201A\u0192\u201E\u2026\u2020\u2021" // 0x80-0x87
530         "\u02C6\u2030\u0160\u2039\u0152\uFFFD\u017D\uFFFD" // 0x88-0x8F
531         "\uFFFD\u2018\u2019\u201C\u201D\u2022\u2013\u2014" // 0x90-0x97 (0x96 is EN DASH, U+2013)
532         "\u02DC\u2122\u0161\u203A\u0153\uFFFD\u017E\u0178"; // 0x98-0x9F
533     // True for code points below 0x80, for 0xA0..0xFF, and for those listed in charMap.
534     bool canEncode(dchar c)
535     {
536         if (c < 0x80 || (c >= 0xA0 && c < 0x100)) return true;
537         if (c >= 0xFFFD) return false; // keeps the U+FFFD placeholder from matching charMap
538         foreach(wchar d;charMap) { if (c == d) return true; }
539         return false;
540     }
541     // Code units in 0x80..0x9F are valid only when charMap assigns them a code point.
542     bool isValidCodeUnit(Windows1252Char c)
543     {
544         if (c < 0x80 || c >= 0xA0) return true;
545         return (charMap[c-0x80] != 0xFFFD);
546     }
547     // Every encodable code point takes exactly one code unit.
548     uint encodedLength(dchar c)
549     in
550     {
551         assert(canEncode(c));
552     }
553     body
554     {
555         return 1;
556     }
557     // Writes c through the write() found at the mixin site; unencodable code points become '?'.
558     void encodeViaWrite()(dchar c)
559     {
560         if (c < 0x80 || (c >= 0xA0 && c < 0x100)) {}
561         else if (c >= 0xFFFD) { c = '?'; }
562         else
563         {
564             int n = -1; // position of c in charMap, or -1 when absent
565             foreach(i,wchar d;charMap)
566             {
567                 if (c == d)
568                 {
569                     n = i;
570                     break;
571                 }
572             }
573             c = n == -1 ? '?' : 0x80 + n;
574         }
575         write(cast(Windows1252Char)c);
576     }
577     // The *ViaRead primitives below use the read() found at the mixin site.
578     void skipViaRead()()
579     {
580         read();
581     }
582     // Code units 0x80..0x9F are translated through charMap; all others map to themselves.
583     dchar decodeViaRead()()
584     {
585         Windows1252Char c = read;
586         return (c >= 0x80 && c < 0xA0) ? charMap[c-0x80] : c;
587     }
588     // As decodeViaRead, but unassigned code units yield INVALID_SEQUENCE.
589     dchar safeDecodeViaRead()()
590     {
591         Windows1252Char c = read;
592         dchar d = (c >= 0x80 && c < 0xA0) ? charMap[c-0x80] : c;
593         return d == 0xFFFD ? INVALID_SEQUENCE : d;
594     }
595
596     dchar decodeReverseViaRead()()
597     {
598         Windows1252Char c = read;
599         return (c >= 0x80 && c < 0xA0) ? charMap[c-0x80] : c;
600     }
601     // Code units used in place of input that cannot be represented.
602     EString replacementSequence()
603     {
604         return cast(EString)("?");
605     }
606     // Adds the user-facing encode/skip/decode functions built on the primitives above.
607     mixin EncoderFunctions;
608 }
609
610 //=============================================================================
611 //          UTF-8
612 //=============================================================================
613
614 template EncoderInstance(CharType : char)
615 {
616     alias char E;
617     alias char[] EString;
618
619     string encodingName()
620     {
621         return "UTF-8";
622     }
623
624     bool canEncode(dchar c)
625     {
626         return isValidCodePoint(c);
627     }
628     // Rejects 0xC0, 0xC1 and everything from 0xF5 upward.
629     bool isValidCodeUnit(char c)
630     {
631         return (c < 0xC0 || (c >= 0xC2 && c < 0xF5));
632     }
633     // Indexed by (code unit - 0x80): the number of tail bytes that follow a lead byte, 0 otherwise.
634     static const ubyte[128] tailTable =
635     [
636         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x80-0x8F
637         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x90-0x9F
638         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xA0-0xAF
639         0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xB0-0xBF
640         1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0xC0-0xCF
641         1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0xD0-0xDF
642         2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xE0-0xEF
643         3,3,3,3,3,3,3,3,4,4,4,4,5,5,6,0, // 0xF0-0xFF
644     ];
645     // Looks up tailTable for c; the in-contract requires c >= 0x80.
646     private int tails(char c)
647     in
648     {
649         assert(c >= 0x80);
650     }
651     body
652     {
653         return tailTable[c-0x80];
654     }
655     // Number of code units (1 to 4) needed to encode c.
656     uint encodedLength(dchar c)
657     in
658     {
659         assert(canEncode(c));
660     }
661     body
662     {
663         if (c < 0x80) return 1;
664         if (c < 0x800) return 2;
665         if (c < 0x10000) return 3;
666         return 4;
667     }
668     // Emits the 1- to 4-byte form of c through the write() found at the mixin site; c is not validated here.
669     void encodeViaWrite()(dchar c)
670     {
671         if (c < 0x80)
672         {
673             write(cast(char)c);
674         }
675         else if (c < 0x800)
676         {
677             write(cast(char)((c >> 6) + 0xC0));
678             write(cast(char)((c & 0x3F) + 0x80));
679         }
680         else if (c < 0x10000)
681         {
682             write(cast(char)((c >> 12) + 0xE0));
683             write(cast(char)(((c >> 6) & 0x3F) + 0x80));
684             write(cast(char)((c & 0x3F) + 0x80));
685         }
686         else
687         {
688             write(cast(char)((c >> 18) + 0xF0));
689             write(cast(char)(((c >> 12) & 0x3F) + 0x80));
690             write(cast(char)(((c >> 6) & 0x3F) + 0x80));
691             write(cast(char)((c & 0x3F) + 0x80));
692         }
693     }
694     // Reads one lead byte plus the number of tail bytes tailTable gives for it.
695     void skipViaRead()()
696     {
697         auto c = read;
698         if (c < 0xC0) return;
699         int n = tails(cast(char) c);
700         for (uint i=0; i<n; ++i)
701         {
702             read();
703         }
704     }
705     // Decodes one sequence without checking the tail bytes it reads.
706     dchar decodeViaRead()()
707     {
708         dchar c = read;
709         if (c < 0xC0) return c;
710         int n = tails(cast(char) c);
711         c &= (1 << (6 - n)) - 1; // keep only the payload bits of the lead byte
712         for (uint i=0; i<n; ++i)
713         {
714             c = (c << 6) + (read & 0x3F);
715         }
716         return c;
717     }
718     // Validating decode: returns INVALID_SEQUENCE for truncated, malformed or disallowed sequences.
719     dchar safeDecodeViaRead()()
720     {
721         dchar c = read;
722         if (c < 0x80) return c;
723         int n = tails(cast(char) c);
724         if (n == 0) return INVALID_SEQUENCE;
725         // err is computed from the lead byte and the first tail byte, but returned only after the tails are read.
726         if (!canRead) return INVALID_SEQUENCE;
727         uint d = peek;
728         bool err =
729         (
730             (c < 0xC2)                              // fail overlong 2-byte sequences
731         ||  (c > 0xF4)                              // fail lead bytes 0xF5-0xFF (beyond U+10FFFF, 5/6-byte forms)
732         ||  (c == 0xE0 && ((d & 0xE0) == 0x80))     // fail overlong 3-byte sequences
733         ||  (c == 0xED && ((d & 0xE0) == 0xA0))     // fail surrogates
734         ||  (c == 0xF0 && ((d & 0xF0) == 0x80))     // fail overlong 4-byte sequences
735         ||  (c == 0xF4 && ((d & 0xF0) >= 0x90))     // fail code points > 0x10FFFF
736         );
737
738         c &= (1 << (6 - n)) - 1;
739         for (uint i=0; i<n; ++i)
740         {
741             if (!canRead) return INVALID_SEQUENCE;
742             d = peek;
743             if ((d & 0xC0) != 0x80) return INVALID_SEQUENCE; // not a tail byte; left unread
744             c = (c << 6) + (read & 0x3F);
745         }
746
747         return err ? INVALID_SEQUENCE : c;
748     }
749     // Decodes one sequence via read(); callers in this file mix it in with the reverse reader.
750     dchar decodeReverseViaRead()()
751     {
752         // The first unit read is returned directly when below 0x80; otherwise
753         // it is treated as a tail byte and earlier bytes are folded in below.
754     dchar c = read;
755         if (c < 0x80) return c;
756         uint shift = 0;
757         c &= 0x3F;
758         for (uint i=0; i<4; ++i)
759         {
760             shift += 6;
761             auto d = read;
762             uint n = tails(cast(char) d);
763             uint mask = n == 0 ? 0x3F : (1 << (6 - n)) - 1;
764             c += ((d & mask) << shift);
765             if (n != 0) break; // a non-zero tail count means d was the lead byte
766         }
767         return c;
768     }
769     // U+FFFD is used in place of invalid input.
770     EString replacementSequence()
771     {
772         return "\uFFFD";
773     }
774     // Adds the user-facing encode/skip/decode functions built on the primitives above.
775     mixin EncoderFunctions;
776 }
777
778 //=============================================================================
779 //          UTF-16
780 //=============================================================================
781
782 template EncoderInstance(CharType : wchar)
783 {
784     alias wchar E;
785     alias wchar[] EString;
786
787     string encodingName()
788     {
789         return "UTF-16";
790     }
791
792     bool canEncode(dchar c)
793     {
794         return isValidCodePoint(c);
795     }
796     // Every code unit value is accepted, surrogates included.
797     bool isValidCodeUnit(wchar c)
798     {
799         return true;
800     }
801     // One code unit below 0x10000, otherwise two.
802     uint encodedLength(dchar c)
803     in
804     {
805         assert(canEncode(c));
806     }
807     body
808     {
809         return (c < 0x10000) ? 1 : 2;
810     }
811     // Writes c as a single unit, or as a 0xD800-based and a 0xDC00-based unit when c >= 0x10000.
812     void encodeViaWrite()(dchar c)
813     {
814         if (c < 0x10000)
815         {
816             write(cast(wchar)c);
817         }
818         else
819         {
820             uint n = c - 0x10000;
821             write(cast(wchar)(0xD800 + (n >> 10)));
822             write(cast(wchar)(0xDC00 + (n & 0x3FF)));
823         }
824     }
825     // Reads one unit, plus a second one when the first lies in 0xD800..0xDFFF.
826     void skipViaRead()()
827     {
828         wchar c = read;
829         if (c < 0xD800 || c >= 0xE000) return;
830         read();
831     }
832     // Decodes one sequence without validating the surrogate pair.
833     dchar decodeViaRead()()
834     {
835         wchar c = read;
836         if (c < 0xD800 || c >= 0xE000) return cast(dchar)c;
837         wchar d = read;
838         c &= 0x3FF;
839         d &= 0x3FF;
840         return 0x10000 + (c << 10) + d;
841     }
842     // Validating decode: rejects a first unit in 0xDC00..0xDFFF, a missing second unit, or one outside 0xDC00..0xDFFF.
843     dchar safeDecodeViaRead()()
844     {
845         wchar c = read;
846         if (c < 0xD800 || c >= 0xE000) return cast(dchar)c;
847         if (c >= 0xDC00) return INVALID_SEQUENCE;
848         if (!canRead) return INVALID_SEQUENCE;
849         wchar d = peek;
850         if (d < 0xDC00 || d >= 0xE000) return INVALID_SEQUENCE;
851         d = read;
852         c &= 0x3FF;
853         d &= 0x3FF;
854         return 0x10000 + (c << 10) + d;
855     }
856     // Like decodeViaRead, but the second unit read supplies the high ten bits.
857     dchar decodeReverseViaRead()()
858     {
859         wchar c = read;
860         if (c < 0xD800 || c >= 0xE000) return cast(dchar)c;
861         wchar d = read;
862         c &= 0x3FF;
863         d &= 0x3FF;
864         return 0x10000 + (d << 10) + c;
865     }
866     // U+FFFD is used in place of invalid input.
867     EString replacementSequence()
868     {
869         return "\uFFFD"w;
870     }
871     // Adds the user-facing encode/skip/decode functions built on the primitives above.
872     mixin EncoderFunctions;
873 }
874
875 //=============================================================================
876 //          UTF-32
877 //=============================================================================
878
879 template EncoderInstance(CharType : dchar)
880 {
881     alias dchar E;
882     alias dchar[] EString;
883
884     string encodingName()
885     {
886         return "UTF-32";
887     }
888
889     bool canEncode(dchar c)
890     {
891         return isValidCodePoint(c);
892     }
893     // A code unit is valid exactly when it is a valid code point.
894     bool isValidCodeUnit(dchar c)
895     {
896         return isValidCodePoint(c);
897     }
898     // Every encodable code point takes exactly one code unit.
899     uint encodedLength(dchar c)
900     in
901     {
902         assert(canEncode(c));
903     }
904     body
905     {
906         return 1;
907     }
908     // Writes c unchanged through the write() found at the mixin site.
909     void encodeViaWrite()(dchar c)
910     {
911         write(c);
912     }
913     // The *ViaRead primitives below use the read() found at the mixin site.
914     void skipViaRead()()
915     {
916         read();
917     }
918
919     dchar decodeViaRead()()
920     {
921         return cast(dchar)read;
922     }
923     // Returns INVALID_SEQUENCE when the unit read is not a valid code point.
924     dchar safeDecodeViaRead()()
925     {
926         dchar c = read;
927         return isValidCodePoint(c) ? c : INVALID_SEQUENCE;
928     }
929
930     dchar decodeReverseViaRead()()
931     {
932         return cast(dchar)read;
933     }
934     // U+FFFD is used in place of invalid input.
935     EString replacementSequence()
936     {
937         return "\uFFFD"d;
938     }
939     // Adds the user-facing encode/skip/decode functions built on the primitives above.
940     mixin EncoderFunctions;
941 }
942
943 //=============================================================================
944 // Below are forwarding functions which expose the function to the user
945
946 /**
947 Tests whether c is a valid code point.
948
949  Values below 0x110000 are accepted, except for the range 0xD800-0xDFFF.
950  The non-character code points U+FFFE and U+FFFF are accepted, since they
951  are valid code points (even though they are not valid characters).
952
953  Supersedes:
954  This function supersedes $(D std.utf.startsValidDchar()).
955
956  Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
957
958  Params:
959     c = the code point to be tested
960  */
961 bool isValidCodePoint(dchar c)
962 {
963     return c < 0x110000 && (c < 0xD800 || c >= 0xE000);
964 }
965
966 /**
967  Returns the name of an encoding.
968
969  The type of encoding cannot be deduced. Therefore, it is necessary to
970  explicitly specify the encoding type.
971
972  Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
973
974  Examples:
975  -----------------------------------
976  assert(encodingName!(Latin1Char) == "ISO-8859-1");
977  -----------------------------------
978  */
979 string encodingName(T)()
980 {
981     return EncoderInstance!(T).encodingName; // forwards to the specialization for T
982 }
983
984 unittest
985 {
986     assert(encodingName!(char) == "UTF-8");
987     assert(encodingName!(wchar) == "UTF-16");
988     assert(encodingName!(dchar) == "UTF-32");
989     assert(encodingName!(AsciiChar) == "ASCII");
990     assert(encodingName!(Latin1Char) == "ISO-8859-1");
991     assert(encodingName!(Windows1252Char) == "windows-1252");
992 }
993
994 /**
995  Returns true iff it is possible to represent the specified code point
996  in the encoding.
997
998  The type of encoding cannot be deduced. Therefore, it is necessary to
999  explicitly specify the encoding type.
1000
1001  Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
1002
1003  Examples:
1004  -----------------------------------
1005  assert(canEncode!(Latin1Char)('A'));
1006  -----------------------------------
1007  */
1008 bool canEncode(E)(dchar c)
1009 {
1010     return EncoderInstance!(E).canEncode(c); // forwards to the specialization for E
1011 }
1012
1013 unittest
1014 {
1015     assert(!canEncode!(AsciiChar)('\u00A0'));
1016     assert(canEncode!(Latin1Char)('\u00A0'));
1017     assert(canEncode!(Windows1252Char)('\u20AC'));
1018     assert(!canEncode!(Windows1252Char)('\u20AD'));
1019     assert(!canEncode!(Windows1252Char)('\uFFFD'));
1020     assert(!canEncode!(char)(cast(dchar)0x110000));
1021 }
1022
1023 /**
1024  Returns true if the code unit is legal. For example, the byte 0x80 would
1025  not be legal in ASCII, because ASCII code units must always be in the range
1026  0x00 to 0x7F.
1027
1028  Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
1029
1030  Params:
1031     c = the code unit to be tested
1032  */
1033 bool isValidCodeUnit(E)(E c)
1034 {
1035     return EncoderInstance!(E).isValidCodeUnit(c); // forwards to the specialization for E
1036 }
1037
1038 unittest
1039 {
1040     assert(!isValidCodeUnit(cast(AsciiChar)0xA0));
1041     assert( isValidCodeUnit(cast(Windows1252Char)0x80));
1042     assert(!isValidCodeUnit(cast(Windows1252Char)0x81));
1043     assert(!isValidCodeUnit(cast(char)0xC0));
1044     assert(!isValidCodeUnit(cast(char)0xFF));
1045     assert( isValidCodeUnit(cast(wchar)0xD800));
1046     assert(!isValidCodeUnit(cast(dchar)0xD800));
1047 }
1048
1049 /**
1050  Returns true if the string is encoded correctly
1051
1052  Supersedes:
1053  This function supersedes std.utf.validate(), however note that this
1054  function returns a bool indicating whether the input was valid or not,
1055  whereas the older function would throw an exception.
1056
1057  Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
1058
1059  Params:
1060     s = the string to be tested
1061  */
1062 bool isValid(E)(E[] s)
1063 {
1064     return s.length == validLength(s); // valid iff the valid prefix covers the whole string
1065 }
1066
1067 unittest
1068 {
1069     assert(isValid("\u20AC100"));
1070 }
1071
1072 /**
1073  Measures the validly encoded prefix of a string: the number of code
1074  units, counted from the first one, that decode without error.
1075
1076  Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
1077
1078  Params:
1079     s = the string to be tested
1080  */
1081 uint validLength(E)(E[] s)
1082 {
1083     uint total = 0;
1084     for (uint remaining = s.length; remaining > 0; remaining = s.length)
1085     {
1086         if (EncoderInstance!(E).safeDecode(s) == INVALID_SEQUENCE)
1087             break;
1088         total += remaining - s.length;
1089     }
1090     return total;
1091 }
1092
1093 /**
1094  Sanitizes a string by replacing malformed code unit sequences with valid
1095  code unit sequences. The result is guaranteed to be valid for this encoding.
1096
1097  If the input string is already valid, this function returns the original,
1098  otherwise it constructs a new string by replacing all illegal code unit
1099  sequences with the encoding's replacement character. Invalid sequences will
1100  be replaced with the Unicode replacement character (U+FFFD) if the
1101  character repertoire contains it, otherwise invalid sequences will be
1102  replaced with '?'.
1103
1104  Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
1105
1106  Params:
1107     s = the string to be sanitized
1108  */
1109
1110 E[] sanitize(E)(E[] s)
1111 {
1112     uint n = validLength(s);
1113     if (n == s.length) return s;
1114
1115     auto repSeq = EncoderInstance!(E).replacementSequence;
1116
1117     // Count how long the string needs to be.
1118     // Overestimating is not a problem
1119     uint len = s.length;
1120     E[] t = s[n..$];
1121     while (t.length != 0)
1122     {
1123         dchar c = EncoderInstance!(E).safeDecode(t); // t starts at an invalid sequence; this consumes it
1124         assert(c == INVALID_SEQUENCE);
1125         len += repSeq.length;
1126         t = t[validLength(t)..$]; // skip the valid run that follows
1127     }
1128
1129     // Now do the write
1130     E[] array = new E[len];
1131     array[0..n] = s[0..n];
1132     uint offset = n;
1133     // Second pass: alternate one replacement sequence with the valid run that follows it.
1134     t = s[n..$];
1135     while (t.length != 0)
1136     {
1137         dchar c = EncoderInstance!(E).safeDecode(t);
1138         assert(c == INVALID_SEQUENCE);
1139         array[offset..offset+repSeq.length] = repSeq[];
1140         offset += repSeq.length;
1141         n = validLength(t);
1142         array[offset..offset+n] = t[0..n];
1143         offset += n;
1144         t = t[n..$];
1145     }
1146     return cast(E[])array[0..offset]; // trim the overestimated buffer to what was written
1147 }
1148
1149 unittest
1150 {
1151     assert(sanitize("hello \xF0\x80world") == "hello \xEF\xBF\xBDworld");
1152 }
1153
1154 /**
1155 Returns the length of the first encoded sequence.
1156
1157 The input must be non-empty and its first sequence MUST be validly encoded.
1158 This is enforced by the function's in-contract.
1159
1160 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
1161
1162 Params:
1163 s = the string to be sliced
1164  */
1165 uint firstSequence(E)(E[] s)
1166 in
1167 {
1168     assert(s.length != 0);
1169     E[] u = s; // decode a copy so that s itself is left untouched by the check
1170     assert(safeDecode(u) != INVALID_SEQUENCE);
1171 }
1172 body
1173 {
1174     auto before = s.length;
1175     EncoderInstance!(E).skip(s);
1176     return before - s.length; // number of code units that skip consumed
1177 }
1178
1179 unittest
1180 {
1181     assert(firstSequence("\u20AC1000") == "\u20AC".length);
1182 }
1183
1184 /**
1185  Returns the length of the last encoded sequence.
1186
1187  The input to this function MUST be non-empty and validly encoded.
1188  This is enforced by the function's in-contract.
1189
1190  Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
1191
1192  Params:
1193     s = the string to be sliced
1194  */
1195 uint lastSequence(E)(E[] s)
1196 in
1197 {
1198     assert(s.length != 0);
1199     assert(isValid(s));
1200 }
1201 body
1202 {
1203     E[] t = s; // remember the original extent
1204     EncoderInstance!(E).decodeReverse(s);
1205     return t.length - s.length; // number of code units removed from the end
1206 }
1207
1208 unittest
1209 {
1210     assert(lastSequence("1000\u20AC") == "\u20AC".length);
1211 }
1212
1213 /**
1214 Counts the code points encoded in a string.
1215
1216 The input to this function MUST be validly encoded.  This is enforced
1217 by the function's in-contract.
1218
1219 Supersedes: This function supersedes $(D std.utf.toUCSindex()).
1220
1221 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
1222
1223 Params:
1224 s = the string to be counted
1225  */
1226 uint codepoints_count(E)(E[] s)
1227 in
1228 {
1229     assert(isValid(s));
1230 }
1231 body
1232 {
1233     uint total = 0;
1234     // Each skip() call removes one encoded sequence from the front of s.
1235     for (; s.length != 0; ++total)
1236     {
1237         EncoderInstance!(E).skip(s);
1238     }
1239     return total;
1240 }
1241
1242 unittest
1243 {
1244     assert(codepoints_count("\u20AC100") == 4);
1245 }
1246
/**
 Returns the array index at which the (n+1)th code point begins.

 The input to this function MUST be validly encoded and n MUST NOT be
 negative. This is enforced by the function's in-contract.

 Supercedes:
 This function supercedes std.utf.toUTFindex().

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    s = the string to be indexed
    n = the number of code points to step over from the start of s

 Returns:
    the offset, in code units, reached after skipping n code points
 */
int index(E)(E[] s,int n)
in
{
    assert(isValid(s));
    assert(n >= 0);
}
body
{
    auto startLength = s.length;
    uint skipped = 0;
    while (skipped < n)
    {
        EncoderInstance!(E).skip(s);
        ++skipped;
    }
    // What has been consumed from the front is the wanted offset.
    return startLength - s.length;
}

unittest
{
    // The euro sign takes three code units, so code point #1 starts at 3.
    auto offset = index("\u20AC100",1);
    assert(offset == 3);
}
1278
/**
 Decodes a single code point from the front of a string.

 One or more code units are removed from the start of s (which is taken
 by reference), and the code point they represent is returned.

 The input to this function MUST be non-empty and MUST start with a validly
 encoded sequence. This is enforced by the function's in-contract.

 Supercedes:
 This function supercedes std.utf.decode(), however, note that the
 function codePoints() supercedes it more conveniently.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    s = the string whose first code point is to be decoded

 Returns:
    the decoded code point
 */
dchar decode(S)(ref S s)
in
{
    assert(s.length != 0);
    // Validate on a copy so the contract leaves s untouched.
    auto probe = s;
    assert(safeDecode(probe) != INVALID_SEQUENCE);
}
body
{
    // The encoder is selected by the element type of the string.
    alias typeof(s[0]) Unit;
    return EncoderInstance!(Unit).decode(s);
}
1308
/**
 Decodes a single code point from the end of a string.

 One or more code units are removed from the end of s (which is taken
 by reference), and the code point they represent is returned.

 The input to this function MUST be non-empty and validly encoded.
 This is enforced by the function's in-contract.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    s = the string whose last code point is to be decoded

 Returns:
    the decoded code point
 */
dchar decodeReverse(E)(ref E[] s)
in
{
    assert(s.length != 0);
    assert(isValid(s));
}
body
{
    alias EncoderInstance!(E) Encoder;
    return Encoder.decodeReverse(s);
}
1333
/**
 Decodes a single code point without requiring valid input.

 One or more code units are removed from the start of s (which is taken
 by reference), and the code point they represent is returned.

 An invalidly encoded string is accepted. If an invalid sequence is found
 at the start of the string, it is removed and the value INVALID_SEQUENCE
 is returned instead of a code point.

 The input MUST be non-empty; this is enforced by the in-contract.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    s = the string whose first code point is to be decoded

 Returns:
    the decoded code point, or INVALID_SEQUENCE
 */
dchar safeDecode(S)(ref S s)
in
{
    assert(s.length != 0);
}
body
{
    // The encoder is selected by the element type of the string.
    alias typeof(s[0]) Unit;
    return EncoderInstance!(Unit).safeDecode(s);
}
1358
/**
 Returns the number of code units required to encode a single code point.

 The input to this function MUST be a valid code point.
 This is enforced by the function's in-contract.

 The target encoding cannot be deduced from the argument, so it must be
 given explicitly as the template parameter E.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    c = the code point to be encoded

 Returns:
    the number of code units of type E needed for c
 */
uint encodedLength(E)(dchar c)
in
{
    assert(isValidCodePoint(c));
}
body
{
    alias EncoderInstance!(E) Encoder;
    return Encoder.encodedLength(c);
}
1382
/**
 Encodes a single code point and returns the resulting code units.

 The code point is encoded into one or more code units of type E, and
 an array containing those code units is returned.

 The input to this function MUST be a valid code point.
 This is enforced by the function's in-contract.

 The target encoding cannot be deduced from the argument, so it must be
 given explicitly as the template parameter E.

 Supercedes:
 This function supercedes std.utf.encode(), however, note that the
 function codeUnits() supercedes it more conveniently.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    c = the code point to be encoded

 Returns:
    the code units which encode c
 */
E[] encode(E)(dchar c)
in
{
    assert(isValidCodePoint(c));
}
body
{
    alias EncoderInstance!(E) Encoder;
    return Encoder.encode(c);
}
1413
/**
 Encodes a single code point into a user-supplied array.

 The code point is encoded into one or more code units, which are stored
 at the start of array. The array slice itself is received by value, so
 the caller's slice is not advanced; use the return value to find out how
 many elements were filled in.

 The input to this function MUST be a valid code point.
 This is enforced by the function's in-contract.

 Supercedes:
 This function supercedes std.utf.encode(), however, note that the
 function codeUnits() supercedes it more conveniently.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    c = the code point to be encoded
    array = the destination for the code units

 Returns:
      the number of code units written to the array
 */
uint encode(E)(dchar c, E[] array)
in
{
    assert(isValidCodePoint(c));
}
body
{
    // The encoder shrinks the slice it is handed by the amount written;
    // array is a by-value copy here, so only the local view is affected.
    auto capacity = array.length;
    EncoderInstance!(E).encode(c, array);
    return capacity - array.length;
}
1450
1451 // /**
1452 //  * Encodes a single code point into a Buffer.
1453 //  *
1454 //  * This function encodes a single code point into one or more code units
1455 //  * The code units are stored in a growable buffer.
1456 //  *
1457 //  * The input to this function MUST be a valid code point.
1458 //  * This is enforced by the function's in-contract.
1459 //  *
1460 //  * The type of the output cannot be deduced. Therefore, it is necessary to
1461 //  * explicitly specify the encoding as a template parameter.
1462 //  *
1463 //  * Supercedes:
1464 //  * This function supercedes std.utf.encode(), however, note that the
1465 //  * function codeUnits() supercedes it more conveniently.
1466 //  *
1467 //  * Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
1468 //  *
1469 //  * Params:
1470 //  *    c = the code point to be encoded
1471 //  */
1472 // deprecated void encode(E)(dchar c, ref Buffer!(E) buffer)
1473 // in
1474 // {
1475 //     assert(isValidCodePoint(c));
1476 // }
1477 // body
1478 // {
1479 //     EncoderInstance!(E).encode(c,buffer);
1480 // }
1481
/// Sink delegate type taken by encode_char; called once per UTF-8 code unit.
alias void delegate(char) encode_putchar;
/// Sink delegate type taken by encode_wchar; called once per UTF-16 code unit.
alias void delegate(wchar) encode_putwchar;
1489
/**
 Encodes a code point as UTF-8, handing each code unit to a delegate.

 The code units are passed to putc one at a time, lead byte first.
 A value above 0x10FFFF halts the program via assert(0). No other
 validity check (for example for surrogates) is made here.

 Params:
    c = the code point to be encoded
    putc = the delegate which receives each UTF-8 code unit

 Returns:
    the number of code units passed to putc (1 to 4)
 */
size_t encode_char(dchar  c, encode_putchar putc)
{
    // Emits one continuation byte (10xxxxxx) carrying the six bits of c
    // that start at bit position shift.
    void trail(uint shift)
    {
        putc(cast(char)(0x80 | ((c >> shift) & 0x3F)));
    }

    if (c > 0x10FFFF)
    {
        // Outside the Unicode range: nothing sensible can be emitted.
        assert(0);
    }

    size_t units;
    if (c <= 0x7F)
    {
        // Single byte: the value is its own encoding.
        putc(cast(char) c);
        units = 1;
    }
    else if (c <= 0x7FF)
    {
        putc(cast(char)(0xC0 | (c >> 6)));
        trail(0);
        units = 2;
    }
    else if (c <= 0xFFFF)
    {
        putc(cast(char)(0xE0 | (c >> 12)));
        trail(6);
        trail(0);
        units = 3;
    }
    else
    {
        putc(cast(char)(0xF0 | (c >> 18)));
        trail(12);
        trail(6);
        trail(0);
        units = 4;
    }
    return units;
}
/**
 Encodes a code point as UTF-16, handing each code unit to a delegate.

 Code points up to 0xFFFF are passed to putw as a single code unit.
 Anything larger is split into a high surrogate (0xD800 + top ten bits of
 c - 0x10000) followed by a low surrogate (0xDC00 + bottom ten bits).
 No validity check is made on c.

 This is a plain function, parallel to encode_char. The previous template
 parameters were unused and could not be deduced, and the body called the
 delegate through an undeclared name, so it could never be instantiated.

 Params:
    c = the code point to be encoded
    putw = the delegate which receives each UTF-16 code unit

 Returns:
    the number of code units passed to putw (1 or 2)
 */
size_t encode_wchar(dchar c, encode_putwchar putw)
{
    if (c <= 0xFFFF)
    {
        putw(cast(wchar) c);
        return 1;
    }

    // Supplementary plane: emit a surrogate pair, high half first.
    putw(cast(wchar) ((((c - 0x10000) >> 10) & 0x3FF) + 0xD800));
    putw(cast(wchar) (((c - 0x10000) & 0x3FF) + 0xDC00));
    return 2;
}
1534
/**
 Encodes a single code point, delivering the code units to a delegate.

 The code point is encoded into one or more code units of type E, and
 each code unit is passed in turn to the supplied delegate.

 The input to this function MUST be a valid code point.
 This is enforced by the function's in-contract.

 Supercedes:
 This function supercedes std.utf.encode(), however, note that the
 function codeUnits() supercedes it more conveniently.

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    c = the code point to be encoded
    dg = the delegate which receives each code unit
 */
void encode(E)(dchar c, void delegate(E) dg)
in
{
    assert(isValidCodePoint(c));
}
body
{
    alias EncoderInstance!(E) Encoder;
    Encoder.encode(c, dg);
}
1565
/**
 Returns a foreachable struct which can bidirectionally iterate over all
 code points in a string.

 The input to this function MUST be validly encoded.
 This is enforced by the function's in-contract.

 The result may be used in foreach either with or without an index. If an
 index is specified, it is set at each iteration to the offset into the
 string at which the current code point begins.

 Supercedes:
 This function supercedes std.utf.decode().

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    s = the string to be decoded

 Examples:
 --------------------------------------------------------
 string s = "hello world";
 foreach(c;codePoints(s))
 {
     // do something with c (which will always be a dchar)
 }
 --------------------------------------------------------

 Note that, currently, foreach(c;codePoints(s)) is superior to foreach(c;s)
 in that the latter will fall over on encountering U+FFFF.
 */
CodePoints!(E) codePoints(E)(E[] s)
in
{
    assert(isValid(s));
}
body
{
    alias CodePoints!(E) Result;
    return Result(s);
}

unittest
{
    // Round-trip an ASCII string through the code point iterator.
    string source = "hello";
    string rebuilt;
    foreach (c; codePoints(source))
    {
        rebuilt ~= cast(char)c;
    }
    assert(source == rebuilt);
}
1618
/**
 Returns a foreachable struct which can bidirectionally iterate over all
 code units in a code point.

 The input to this function MUST be a valid code point.
 This is enforced by the function's in-contract.

 The target encoding cannot be deduced from the argument, so it must be
 given explicitly as the template parameter E.

 Supercedes:
 This function supercedes std.utf.encode().

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    c = the code point to be encoded

 Examples:
 --------------------------------------------------------
 dchar d = '\u20AC';
 foreach(c;codeUnits!(char)(d))
 {
     writefln("%X",c);
 }
 // will print
 // E2
 // 82
 // AC
 --------------------------------------------------------
 */
CodeUnits!(E) codeUnits(E)(dchar c)
in
{
    assert(isValidCodePoint(c));
}
body
{
    alias CodeUnits!(E) Result;
    return Result(c);
}

unittest
{
    // U+20AC in UTF-8 is the three bytes E2 82 AC.
    char[] collected;
    foreach (c; codeUnits!(char)(cast(dchar)'\u20AC'))
    {
        collected ~= c;
    }
    assert(collected.length == 3);
    assert(collected[0] == 0xE2);
    assert(collected[1] == 0x82);
    assert(collected[2] == 0xAC);
}
1672
/**
 Encodes every element of $(D s) in units of type $(D Tgt), forwarding each
 one together with $(D range) to $(D encode!(Tgt)).

 NOTE(review): the elements of s are passed on one by one as they are,
 without being decoded first; this looks correct only when each element of
 s is a whole code point - confirm against callers.

 Params:
    s = the source elements
    range = the destination handed to each encode!(Tgt) call

 Returns:
    the sum of the values returned by the individual encode!(Tgt) calls
 */
uint encode(Tgt, Src, R)(in Src[] s, R range)
{
    uint written;
    for (size_t i = 0; i < s.length; ++i)
    {
        written += encode!(Tgt)(s[i], range);
    }
    return written;
}
1687
/**
 Convert a string from one encoding to another. (See also to!() below).

 The input to this function MUST be validly encoded.
 This is enforced by the function's in-contract.

 When source and destination element types are identical, r simply refers
 to s. An AsciiChar source is reinterpreted as a string and transcoded
 from there. Otherwise s is decoded one code point at a time and each
 code point is re-encoded and appended to r.

 Supercedes:
 This function supercedes std.utf.toUTF8(), std.utf.toUTF16() and
 std.utf.toUTF32()
 (but note that to!() supercedes it more conveniently).

 Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

 Params:
    s = the source string
    r = the destination string

 Examples:
 --------------------------------------------------------
 wstring ws;
 transcode("hello world",ws);
     // transcode from UTF-8 to UTF-16

 Latin1String ls;
 transcode(ws, ls);
     // transcode from UTF-16 to ISO-8859-1
  --------------------------------------------------------
 */
void transcode(Src,Dst)(Src[] s,out Dst[] r)
in
{
    assert(isValid(s));
}
body
{
    static if (is(Src == Dst))
    {
        // Same encoding: nothing to convert.
        r = s;
    }
    else static if (is(Src == AsciiChar))
    {
        transcode!(char,Dst)(cast(string)s,r);
    }
    else
    {
        // Consume the source front to back, re-encoding as we go.
        for (Src[] pending = s; pending.length != 0; )
        {
            dchar codePoint = decode(pending);
            r ~= encode!(Dst)(codePoint);
        }
    }
}
1740
1741 /*
1742  Convert a string from one encoding to another. (See also transcode() above).
1743
1744  The input to this function MUST be validly encoded.
1745  This is enforced by the function's in-contract.
1746
1747  Supercedes:
1748  This function supercedes std.utf.toUTF8(), std.utf.toUTF16() and
1749  std.utf.toUTF32().
1750
1751  Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252
1752
1753  Params:
1754     Dst = the destination encoding type
1755     s = the source string
1756
1757  Examples:
1758  -----------------------------------------------------------------------------
1759  auto ws = to!(wchar)("hello world");  // transcode from UTF-8 to UTF-16
1760  auto ls = to!(Latin1Char)(ws);            // transcode from UTF-16 to ISO-8859-1
1761  -----------------------------------------------------------------------------
1762  */
1763 // TODO: Commented out for now - to be moved to std.conv
1764 // Dst to(Dst,Src)(immutable(Src)[] s)
1765 // in
1766 // {
1767 //  assert(isValid(s));
1768 // }
1769 // body
1770 // {
1771 //  Dst r;
1772 //  transcode(s,r);
1773 //  return r;
1774 // }
1775
1776 //=============================================================================
1777
/** The base class for exceptions thrown by this module */
class EncodingException : Exception
{
    /// Constructs the exception, forwarding msg to Exception.
    this(string msg)
    {
        super(msg);
    }
}
1780
/**
 An EncodingException subclass for unrecognized encodings.

 The constructor is private, so instances cannot be created from outside
 this module.
 */
class UnrecognizedEncodingException : EncodingException
{
    private this(string msg)
    {
        super(msg);
    }
}
1785
/** Abstract base class of all encoding schemes */
abstract class EncodingScheme
{
    /**
     * Registers a subclass of EncodingScheme.
     *
     * This function allows user-defined subclasses of EncodingScheme to
     * be declared in other modules. An instance of the class is created
     * once, and every name it reports through names() is mapped
     * (lower-cased) to className.
     *
     * Params:
     *    className = the fully qualified name of the subclass
     *
     * Throws:
     *    EncodingException if the class cannot be found or instantiated
     *
     * Examples:
     * ----------------------------------------------
     * class Amiga1251 : EncodingScheme
     * {
     *     static this()
     *     {
     *         EncodingScheme.register("path.to.Amiga1251");
     *     }
     * }
     * ----------------------------------------------
     */
    static void register(string className)
    {
        auto scheme = instantiate(className);
        foreach(encodingName;scheme.names())
        {
            supported[tolower(encodingName)] = className;
        }
    }

    /**
     * Obtains a subclass of EncodingScheme which is capable of encoding
     * and decoding the named encoding scheme.
     *
     * This function is only aware of EncodingSchemes which have been
     * registered with the register() function. The lookup is done on the
     * lower-cased name.
     *
     * Params:
     *    encodingName = the name of the wanted encoding
     *
     * Throws:
     *    EncodingException if the name is not registered, or if the
     *    registered class cannot be found or instantiated
     *
     * Examples:
     * ---------------------------------------------------
     * auto scheme = EncodingScheme.create("Amiga-1251");
     * ---------------------------------------------------
     */
    static EncodingScheme create(string encodingName)
    {
        auto p = std.string.tolower(encodingName) in supported;
        if (p is null)
            throw new EncodingException("Unrecognized Encoding: "~encodingName);
        return instantiate(*p);
    }

    /*
     * Creates an instance of the named class as an EncodingScheme.
     *
     * ClassInfo.find yields null for an unknown class name, so its result
     * is tested before create() is called; every failure (unknown class,
     * class that cannot be created, class that is not an EncodingScheme)
     * is reported as an EncodingException instead of a null dereference.
     */
    private static EncodingScheme instantiate(string className)
    {
        EncodingScheme scheme = null;
        auto info = ClassInfo.find(className);
        if (info !is null)
            scheme = cast(EncodingScheme) info.create();
        if (scheme is null)
            throw new EncodingException("Unable to create class "~className);
        return scheme;
    }

    /**
     * Returns the standard name of the encoding scheme
     */
    abstract override string toString();

    /**
     * Returns an array of all known names for this encoding scheme
     */
    abstract string[] names();

    /**
     * Returns true if the character c can be represented
     * in this encoding scheme.
     */
    abstract bool canEncode(dchar c);

    /**
     * Returns the number of ubytes required to encode this code point.
     *
     * The input to this function MUST be a valid code point.
     *
     * Params:
     *    c = the code point to be encoded
     *
     * Returns:
     *    the number of ubytes required.
     */
    abstract uint encodedLength(dchar c);

    /**
     * Encodes a single code point into a user-supplied, fixed-size buffer.
     *
     * This function encodes a single code point into one or more ubytes.
     * The supplied buffer must be code unit aligned.
     * (For example, UTF-16LE or UTF-16BE must be wchar-aligned,
     * UTF-32LE or UTF-32BE must be dchar-aligned, etc.)
     *
     * The input to this function MUST be a valid code point.
     *
     * Params:
     *    c = the code point to be encoded
     *    buffer = the destination for the encoded ubytes
     *
     * Returns:
     *    the number of ubytes written.
     */
    abstract uint encode(dchar c, ubyte[] buffer);

    /**
     * Decodes a single code point.
     *
     * This function removes one or more ubytes from the start of an array,
     * and returns the decoded code point which those ubytes represent.
     *
     * The input to this function MUST be validly encoded.
     *
     * Params:
     *    s = the array whose first code point is to be decoded
     */
    abstract dchar decode(ref ubyte[] s);

    /**
     * Decodes a single code point. The input does not have to be valid.
     *
     * This function removes one or more ubytes from the start of an array,
     * and returns the decoded code point which those ubytes represent.
     *
     * This function will accept an invalidly encoded array as input.
     * If an invalid sequence is found at the start of the string, this
     * function will remove it, and return the value INVALID_SEQUENCE.
     *
     * Params:
     *    s = the array whose first code point is to be decoded
     */
    abstract dchar safeDecode(ref ubyte[] s);

    /**
     * Returns the sequence of ubytes to be used to represent
     * any character which cannot be represented in the encoding scheme.
     *
     * Normally this will be a representation of some substitution
     * character, such as U+FFFD or '?'.
     */
    abstract ubyte[] replacementSequence();

    /**
     * Returns true if the array is encoded correctly
     *
     * Params:
     *    s = the array to be tested
     */
    bool isValid(ubyte[] s)
    {
        while (s.length != 0)
        {
            dchar d = safeDecode(s);
            if (d == INVALID_SEQUENCE)
                return false;
        }
        return true;
    }

    /**
     * Returns the length of the longest possible substring, starting from
     * the first element, which is validly encoded.
     *
     * Params:
     *    s = the array to be tested
     */
    uint validLength(ubyte[] s)
    {
        ubyte[] r = s;
        // t trails s: it only advances past sequences that decoded cleanly.
        ubyte[] t = s;
        while (s.length != 0)
        {
            if (safeDecode(s) == INVALID_SEQUENCE) break;
            t = s;
        }
        return r.length - t.length;
    }

    /**
     * Sanitizes an array by replacing malformed ubyte sequences with valid
     * ubyte sequences. The result is guaranteed to be valid for this
     * encoding scheme.
     *
     * If the input array is already valid, this function returns the
     * original, otherwise it constructs a new array by replacing all illegal
     * sequences with the encoding scheme's replacement sequence.
     *
     * Params:
     *    s = the string to be sanitized
     */
    ubyte[] sanitize(ubyte[] s)
    {
        uint n = validLength(s);
        if (n == s.length) return s;

        auto repSeq = replacementSequence;

        // Count how long the string needs to be.
        // Overestimating is not a problem
        uint len = s.length;
        ubyte[] t = s[n..$];
        while (t.length != 0)
        {
            // t always starts at an invalid sequence here: step over it,
            // budget one replacement, then skip the valid run that follows.
            dchar c = safeDecode(t);
            assert(c == INVALID_SEQUENCE);
            len += repSeq.length;
            t = t[validLength(t)..$];
        }

        // Now do the write
        ubyte[] array = new ubyte[len];
        array[0..n] = s[0..n];
        uint offset = n;

        t = s[n..$];
        while (t.length != 0)
        {
            dchar c = safeDecode(t);
            assert(c == INVALID_SEQUENCE);
            array[offset .. offset+repSeq.length] = repSeq[];
            offset += repSeq.length;
            n = validLength(t);
            array[offset..offset+n] = t[0..n];
            offset += n;
            t = t[n..$];
        }
        // Trim the over-allocated tail.
        return cast(ubyte[])array[0..offset];
    }

    /**
     * Returns the length of the first encoded sequence.
     *
     * The input to this function MUST be validly encoded.
     * This is enforced by the function's in-contract.
     *
     * Params:
     *    s = the array to be sliced
     */
    uint firstSequence(ubyte[] s)
    in
    {
        assert(s.length != 0);
        ubyte[] u = s;
        assert(safeDecode(u) != INVALID_SEQUENCE);
    }
    body
    {
        ubyte[] t = s;
        decode(s);
        return t.length - s.length;
    }

    /**
     * Returns the total number of code points encoded in a ubyte array.
     *
     * The input to this function MUST be validly encoded.
     * This is enforced by the function's in-contract.
     *
     * Params:
     *    s = the string to be counted
     */
    uint count(ubyte[] s)
    in
    {
        assert(isValid(s));
    }
    body
    {
        uint n = 0;
        while (s.length != 0)
        {
            decode(s);
            ++n;
        }
        return n;
    }

    /**
     * Returns the array index at which the (n+1)th code point begins.
     *
     * The input to this function MUST be validly encoded.
     * This is enforced by the function's in-contract.
     *
     * Params:
     *    s = the string to be indexed
     *    n = the number of code points to step over from the start of s
     */
    int index(ubyte[] s,int n)
    in
    {
        assert(isValid(s));
        assert(n >= 0);
    }
    body
    {
        ubyte[] t = s;
        for (uint i=0; i<n; ++i) decode(s);
        return t.length - s.length;
    }

    /// Maps each lower-cased encoding name to the class name registered for it.
    static string[string] supported;
}
2084
/**
 EncodingScheme to handle ASCII

 This scheme recognises the following names:
                 "ANSI_X3.4-1968",
                 "ANSI_X3.4-1986",
                 "ASCII",
                 "IBM367",
                 "ISO646-US",
                 "ISO_646.irv:1991",
                 "US-ASCII",
                 "cp367",
                 "csASCII",
                 "iso-ir-6",
                 "us"
 */
class EncodingSchemeASCII : EncodingScheme
{
    /// Registers this scheme under all of its names at module start-up.
    static this()
    {
        EncodingScheme.register("std2.encoding.EncodingSchemeASCII");
    }

    /// Returns every name by which this scheme is known.
    override string[] names()
    {
        return
        [
            cast(string)
            "ANSI_X3.4-1968",
            "ANSI_X3.4-1986",
            "ASCII",
            "IBM367",
            "ISO646-US",
            "ISO_646.irv:1991",
            "US-ASCII",
            "cp367",
            // The comma after "csASCII" matters: without it the two
            // adjacent literals merge into the single bogus name
            // "csASCIIiso-ir-6" and neither alias gets registered.
            "csASCII",
            "iso-ir-6",
            "us"
        ];
    }

    /// Returns the standard name of the scheme.
    override string toString()
    {
        return "ASCII";
    }

    /// Returns true if c can be represented as an AsciiChar.
    override bool canEncode(dchar c)
    {
        return std2.encoding.canEncode!(AsciiChar)(c);
    }

    /// Returns the number of ubytes needed to encode c.
    override uint encodedLength(dchar c)
    {
        return std2.encoding.encodedLength!(AsciiChar)(c);
    }

    /// Encodes c into buffer and returns the number of ubytes written.
    override uint encode(dchar c, ubyte[] buffer)
    {
        auto r = cast(AsciiChar[])buffer;
        return std2.encoding.encode(c,r);
    }

    /// Removes one code point from the front of s and returns it.
    override dchar decode(ref ubyte[] s)
    {
        auto t = cast(AsciiChar[]) s;
        dchar c = std2.encoding.decode(t);
        // Keep only the tail that the typed view has not consumed.
        s = s[$-t.length..$];
        return c;
    }

    /// As decode(), but yields INVALID_SEQUENCE for invalid input.
    override dchar safeDecode(ref ubyte[] s)
    {
        auto t = cast(AsciiChar[]) s;
        dchar c = std2.encoding.safeDecode(t);
        s = s[$-t.length..$];
        return c;
    }

    /// Returns "?" as the substitute for unrepresentable characters.
    override ubyte[] replacementSequence()
    {
        return cast(ubyte[])"?";
    }
}
2171
/**
 EncodingScheme to handle Latin-1

 This scheme recognises the following names:
                 "CP819",
                 "IBM819",
                 "ISO-8859-1",
                 "ISO_8859-1",
                 "ISO_8859-1:1987",
                 "csISOLatin1",
                 "iso-ir-100",
                 "l1",
                 "latin1"
 */
class EncodingSchemeLatin1 : EncodingScheme
{
    /// Registers this scheme under all of its names at module start-up.
    static this()
    {
        EncodingScheme.register("std2.encoding.EncodingSchemeLatin1");
    }

    /// Returns every name by which this scheme is known.
    override string[] names()
    {
        string[] known =
        [
            cast(string) "CP819", "IBM819",
            "ISO-8859-1", "ISO_8859-1", "ISO_8859-1:1987",
            "csISOLatin1", "iso-ir-100",
            "l1", "latin1"
        ];
        return known;
    }

    /// Returns the standard name of the scheme.
    override string toString()
    {
        return "ISO-8859-1";
    }

    /// Returns true if c can be represented as a Latin1Char.
    override bool canEncode(dchar c)
    {
        bool representable = std2.encoding.canEncode!(Latin1Char)(c);
        return representable;
    }

    /// Returns the number of ubytes needed to encode c.
    override uint encodedLength(dchar c)
    {
        uint needed = std2.encoding.encodedLength!(Latin1Char)(c);
        return needed;
    }

    /// Encodes c into buffer and returns the number of ubytes written.
    override uint encode(dchar c, ubyte[] buffer)
    {
        return std2.encoding.encode(c, cast(Latin1Char[]) buffer);
    }

    /// Removes one code point from the front of s and returns it.
    override dchar decode(ref ubyte[] s)
    {
        auto rest = cast(Latin1Char[]) s;
        dchar decoded = std2.encoding.decode(rest);
        // Drop from s exactly what the typed view consumed.
        s = s[s.length - rest.length .. s.length];
        return decoded;
    }

    /// As decode(), but yields INVALID_SEQUENCE for invalid input.
    override dchar safeDecode(ref ubyte[] s)
    {
        auto rest = cast(Latin1Char[]) s;
        dchar decoded = std2.encoding.safeDecode(rest);
        s = s[s.length - rest.length .. s.length];
        return decoded;
    }

    /// Returns "?" as the substitute for unrepresentable characters.
    override ubyte[] replacementSequence()
    {
        return cast(ubyte[])"?";
    }
}
2252
/**
 EncodingScheme to handle Windows-1252

 This scheme recognises the following names:
                 "windows-1252"
 */
class EncodingSchemeWindows1252 : EncodingScheme
{
    /// Registers this scheme under its name at module start-up.
    static this()
    {
        EncodingScheme.register("std2.encoding.EncodingSchemeWindows1252");
    }

    /// Returns every name by which this scheme is known.
    override string[] names()
    {
        string[] known = [ cast(string) "windows-1252" ];
        return known;
    }

    /// Returns the standard name of the scheme.
    override string toString()
    {
        return "windows-1252";
    }

    /// Returns true if c can be represented as a Windows1252Char.
    override bool canEncode(dchar c)
    {
        bool representable = std2.encoding.canEncode!(Windows1252Char)(c);
        return representable;
    }

    /// Returns the number of ubytes needed to encode c.
    override uint encodedLength(dchar c)
    {
        uint needed = std2.encoding.encodedLength!(Windows1252Char)(c);
        return needed;
    }

    /// Encodes c into buffer and returns the number of ubytes written.
    override uint encode(dchar c, ubyte[] buffer)
    {
        return std2.encoding.encode(c, cast(Windows1252Char[]) buffer);
    }

    /// Removes one code point from the front of s and returns it.
    override dchar decode(ref ubyte[] s)
    {
        auto rest = cast(Windows1252Char[]) s;
        dchar decoded = std2.encoding.decode(rest);
        // Drop from s exactly what the typed view consumed.
        s = s[s.length - rest.length .. s.length];
        return decoded;
    }

    /// As decode(), but yields INVALID_SEQUENCE for invalid input.
    override dchar safeDecode(ref ubyte[] s)
    {
        auto rest = cast(Windows1252Char[]) s;
        dchar decoded = std2.encoding.safeDecode(rest);
        s = s[s.length - rest.length .. s.length];
        return decoded;
    }

    /// Returns "?" as the substitute for unrepresentable characters.
    override ubyte[] replacementSequence()
    {
        return cast(ubyte[])"?";
    }
}
2317
/**
 Run-time EncodingScheme for UTF-8.

 Names recognised by this scheme:
                 "UTF-8"
 */
class EncodingSchemeUtf8 : EncodingScheme
{
    // Registers this class, by fully qualified name, with EncodingScheme.
    static this()
    {
        EncodingScheme.register("std2.encoding.EncodingSchemeUtf8");
    }

    /// The list of names this scheme answers to.
    override string[] names()
    {
        return [ cast(string) "UTF-8" ];
    }

    /// The scheme's display name.
    override string toString()
    {
        return "UTF-8";
    }

    /// Forwards to the compile-time canEncode for char.
    override bool canEncode(dchar c)
    {
        bool ok = std2.encoding.canEncode!(char)(c);
        return ok;
    }

    /// Forwards to the compile-time encodedLength for char.
    override uint encodedLength(dchar c)
    {
        uint units = std2.encoding.encodedLength!(char)(c);
        return units;
    }

    /// Encodes c into buffer and returns the count reported by the encoder.
    override uint encode(dchar c, ubyte[] buffer)
    {
        // Reinterpret the raw bytes as UTF-8 code units.
        char[] target = cast(char[]) buffer;
        return std2.encoding.encode(c, target);
    }

    /// Decodes one character from the front of s and advances s past it.
    override dchar decode(ref ubyte[] s)
    {
        char[] rest = cast(char[]) s;
        dchar decoded = std2.encoding.decode(rest);
        // char is one byte wide, so rest.length is also the remaining byte count.
        s = s[$ - rest.length .. $];
        return decoded;
    }

    /// Same as decode, but goes through the module-level safeDecode.
    override dchar safeDecode(ref ubyte[] s)
    {
        char[] rest = cast(char[]) s;
        dchar decoded = std2.encoding.safeDecode(rest);
        // char is one byte wide, so rest.length is also the remaining byte count.
        s = s[$ - rest.length .. $];
        return decoded;
    }

    /// Returns the UTF-8 bytes of U+FFFD as the replacement sequence.
    override ubyte[] replacementSequence()
    {
        auto fallback = cast(ubyte[]) "\uFFFD";
        return fallback;
    }
}
2383
/**
 EncodingScheme to handle UTF-16 in native byte order

 This scheme recognises the following names:
                 "UTF-16LE" (little-endian architecture only)
                 "UTF-16BE" (big-endian architecture only)
 */
class EncodingSchemeUtf16Native : EncodingScheme
{
    // Registers this class, by fully qualified name, with EncodingScheme.
    static this()
    {
        EncodingScheme.register("std2.encoding.EncodingSchemeUtf16Native");
    }

    // The single name of this scheme, selected by the target's byte order.
    version(LittleEndian) { string NAME = "UTF-16LE"; }
    version(BigEndian)    { string NAME = "UTF-16BE"; }

    /// The list of names this scheme answers to.
    override string[] names()
    {
        return [ NAME ];
    }

    /// The scheme's display name.
    override string toString()
    {
        return NAME;
    }

    /// Forwards to the compile-time canEncode for wchar.
    override bool canEncode(dchar c)
    {
        return std2.encoding.canEncode!(wchar)(c);
    }

    /// Forwards to the compile-time encodedLength for wchar.
    override uint encodedLength(dchar c)
    {
        return std2.encoding.encodedLength!(wchar)(c);
    }

    /// Encodes c into buffer and returns the encoder's wchar count
    /// scaled by wchar.sizeof.
    override uint encode(dchar c, ubyte[] buffer)
    {
        auto r = cast(wchar[]) buffer;
        // .sizeof is size_t; cast explicitly so the product also narrows
        // to uint on targets where size_t is wider than uint.
        return cast(uint)(wchar.sizeof * std2.encoding.encode(c, r));
    }

    /// Decodes one character from the front of s and advances s past it.
    /// s must hold a whole number of wchar code units.
    override dchar decode(ref ubyte[] s)
    in
    {
        assert((s.length & 1) == 0);
    }
    body
    {
        auto t = cast(wchar[]) s;
        dchar c = std2.encoding.decode(t);
        // t.length counts wchar units but s is indexed in bytes, so the
        // remaining tail must be scaled by the code unit size.
        s = s[$ - t.length * wchar.sizeof .. $];
        return c;
    }

    /// Same as decode, but goes through the module-level safeDecode.
    /// s must hold a whole number of wchar code units.
    override dchar safeDecode(ref ubyte[] s)
    in
    {
        assert((s.length & 1) == 0);
    }
    body
    {
        auto t = cast(wchar[]) s;
        dchar c = std2.encoding.safeDecode(t);
        // t.length counts wchar units but s is indexed in bytes, so the
        // remaining tail must be scaled by the code unit size.
        s = s[$ - t.length * wchar.sizeof .. $];
        return c;
    }

    /// Returns the bytes of the UTF-16 string "\uFFFD" as the replacement sequence.
    override ubyte[] replacementSequence()
    {
        return cast(ubyte[])"\uFFFD"w;
    }
}
2458
/**
 EncodingScheme to handle UTF-32 in native byte order

 This scheme recognises the following names:
                 "UTF-32LE" (little-endian architecture only)
                 "UTF-32BE" (big-endian architecture only)
 */
class EncodingSchemeUtf32Native : EncodingScheme
{
    // Registers this class, by fully qualified name, with EncodingScheme.
    static this()
    {
        EncodingScheme.register("std2.encoding.EncodingSchemeUtf32Native");
    }

    // The single name of this scheme, selected by the target's byte order.
    version(LittleEndian) { string NAME = "UTF-32LE"; }
    version(BigEndian)    { string NAME = "UTF-32BE"; }

    /// The list of names this scheme answers to.
    override string[] names()
    {
        return [ NAME ];
    }

    /// The scheme's display name.
    override string toString()
    {
        return NAME;
    }

    /// Forwards to the compile-time canEncode for dchar.
    override bool canEncode(dchar c)
    {
        return std2.encoding.canEncode!(dchar)(c);
    }

    /// Forwards to the compile-time encodedLength for dchar.
    override uint encodedLength(dchar c)
    {
        return std2.encoding.encodedLength!(dchar)(c);
    }

    /// Encodes c into buffer and returns the encoder's dchar count
    /// scaled by dchar.sizeof.
    override uint encode(dchar c, ubyte[] buffer)
    {
        auto r = cast(dchar[]) buffer;
        // .sizeof is size_t; cast explicitly so the product also narrows
        // to uint on targets where size_t is wider than uint.
        return cast(uint)(dchar.sizeof * std2.encoding.encode(c, r));
    }

    /// Decodes one character from the front of s and advances s past it.
    /// s must hold a whole number of dchar code units.
    override dchar decode(ref ubyte[] s)
    in
    {
        assert((s.length & 3) == 0);
    }
    body
    {
        auto t = cast(dchar[]) s;
        dchar c = std2.encoding.decode(t);
        // t.length counts dchar units but s is indexed in bytes, so the
        // remaining tail must be scaled by the code unit size.
        s = s[$ - t.length * dchar.sizeof .. $];
        return c;
    }

    /// Same as decode, but goes through the module-level safeDecode.
    /// s must hold a whole number of dchar code units.
    override dchar safeDecode(ref ubyte[] s)
    in
    {
        assert((s.length & 3) == 0);
    }
    body
    {
        auto t = cast(dchar[]) s;
        dchar c = std2.encoding.safeDecode(t);
        // t.length counts dchar units but s is indexed in bytes, so the
        // remaining tail must be scaled by the code unit size.
        s = s[$ - t.length * dchar.sizeof .. $];
        return c;
    }

    /// Returns the bytes of the UTF-32 string "\uFFFD" as the replacement sequence.
    override ubyte[] replacementSequence()
    {
        return cast(ubyte[])"\uFFFD"d;
    }
}
2534
/*
 Transcodes s into r by walking the input backwards.

 When Src and Dst are the same type the input is handed back unchanged
 through r. AsciiChar input is re-dispatched as char. Otherwise the code
 points of s are visited in reverse, and the Dst code units of each one
 are prepended to r in reverse.

 Params:
    s = the string to transcode
    r = receives the transcoded string
 */
void transcodeReverse(Src,Dst)(Src[] s, out Dst[] r)
{
    static if(is(Src==Dst))
    {
        // The function is void: the result must go through the out
        // parameter rather than a return value.
        r = s;
    }
    else static if(is(Src==AsciiChar))
    {
        transcodeReverse!(char,Dst)(cast(string)s,r);
    }
    else
    {
        foreach_reverse(d;codePoints(s))
        {
            foreach_reverse(c;codeUnits!(Dst)(d))
            {
                // Prepending while iterating backwards rebuilds the
                // output in forward order.
                r = c ~ r;
            }
        }
    }
}
2556
/*
 Renders a UTF-8 string as a double-quoted literal for diagnostics.
 Code units in the range 0x20 to 0x7F are copied as-is; every other
 code unit becomes \x followed by two upper-case hex digits.
 */
string makeReadable(string s)
{
    string quoted = "\"";
    foreach(char unit; s)
    {
        if (unit < 0x20 || unit >= 0x80)
        {
            quoted ~= "\\x";
            // High nibble first, then low nibble.
            for (int shift = 4; shift >= 0; shift -= 4)
            {
                quoted ~= toHexDigit(unit >> shift);
            }
        }
        else
        {
            quoted ~= unit;
        }
    }
    quoted ~= "\"";
    return quoted;
}
2576
/*
 Renders a UTF-16 string as a double-quoted literal with a trailing w.
 Code units in the range 0x20 to 0x7F are copied as single chars; every
 other code unit becomes \u followed by four upper-case hex digits.
 */
string makeReadable(wstring s)
{
    string quoted = "\"";
    foreach(wchar unit; s)
    {
        if (unit < 0x20 || unit >= 0x80)
        {
            quoted ~= "\\u";
            // Most significant nibble first.
            for (int shift = 12; shift >= 0; shift -= 4)
            {
                quoted ~= toHexDigit(unit >> shift);
            }
        }
        else
        {
            quoted ~= cast(char) unit;
        }
    }
    quoted ~= "\"w";
    return quoted;
}
2598
/*
 Renders a UTF-32 string as a double-quoted literal with a trailing d.
 Values in the range 0x20 to 0x7F are copied as single chars, other
 values below 0x10000 become \u plus four hex digits, and everything
 else becomes \U00 plus six hex digits.
 */
string makeReadable(dstring s)
{
    string quoted = "\"";
    foreach(dchar unit; s)
    {
        if (unit >= 0x20 && unit < 0x80)
        {
            quoted ~= cast(char) unit;
            continue;
        }

        // Pick the escape prefix and the shift of the leading hex digit.
        int topShift;
        if (unit < 0x10000)
        {
            quoted ~= "\\u";
            topShift = 12;
        }
        else
        {
            quoted ~= "\\U00";
            topShift = 20;
        }

        // Most significant nibble first.
        for (int shift = topShift; shift >= 0; shift -= 4)
        {
            quoted ~= toHexDigit(unit >> shift);
        }
    }
    quoted ~= "\"d";
    return quoted;
}
2630
/*
 Returns the upper-case hexadecimal digit for the low four bits of n;
 all higher bits are ignored.
 */
char toHexDigit(int n)
{
    int nibble = n & 0xF;
    if (nibble < 10)
    {
        return cast(char)('0' + nibble);
    }
    return cast(char)('A' + (nibble - 10));
}
2635
2636
/*
 Module self-test: validates and sanitizes a table of well-formed and
 malformed UTF-8 byte sequences, round-trips every valid string through
 all UTF transcodings (forward and reverse), and spot-checks the non-UTF
 encodings, encodedLength and encode.
 */
unittest
{
    void TestEncoding()
    {
        ubyte[][] validStrings =
        [
            // Plain ASCII
            cast(ubyte[])"hello",

            // First possible sequence of a certain length
            [ 0x00 ],                       // U+00000000   one byte
            [ 0xC2, 0x80 ],                 // U+00000080   two bytes
            [ 0xE0, 0xA0, 0x80 ],           // U+00000800   three bytes
            [ 0xF0, 0x90, 0x80, 0x80 ],     // U+00010000   four bytes

            // Last possible sequence of a certain length
            [ 0x7F ],                       // U+0000007F   one byte
            [ 0xDF, 0xBF ],                 // U+000007FF   two bytes
            [ 0xEF, 0xBF, 0xBF ],           // U+0000FFFF   three bytes

            // Other boundary conditions
            [ 0xED, 0x9F, 0xBF ],
            // U+0000D7FF   Last character before surrogates
            [ 0xEE, 0x80, 0x80 ],
            // U+0000E000   First character after surrogates
            [ 0xEF, 0xBF, 0xBD ],
            // U+0000FFFD   Unicode replacement character
            [ 0xF4, 0x8F, 0xBF, 0xBF ],
            // U+0010FFFF   Very last character

            // Non-character code points
            /*  NOTE: These are legal in UTF, and may be converted from
                one UTF to another, however they do not represent Unicode
                characters. These code points have been reserved by
                Unicode as non-character code points. They are permissible
                for data exchange within an application, but they are
                not permitted to be used as characters. Since this module
                deals with UTF, and not with Unicode per se, we choose to
                accept them here. */
            [ 0xEF, 0xBF, 0xBE ],           // U+0000FFFE
            [ 0xEF, 0xBF, 0xBF ],           // U+0000FFFF
        ];


        ubyte[][] invalidStrings =
        [
            // First possible sequence of a certain length, but greater
            // than U+10FFFF
            [ 0xF8, 0x88, 0x80, 0x80, 0x80 ],           // U+00200000   five bytes
            [ 0xFC, 0x84, 0x80, 0x80, 0x80, 0x80 ],     // U+04000000   six bytes

            // Last possible sequence of a certain length, but greater than U+10FFFF
            [ 0xF7, 0xBF, 0xBF, 0xBF ],                 // U+001FFFFF   four bytes
            [ 0xFB, 0xBF, 0xBF, 0xBF, 0xBF ],           // U+03FFFFFF   five bytes
            [ 0xFD, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF ],     // U+7FFFFFFF   six bytes

            // Other boundary conditions
            [ 0xF4, 0x90, 0x80, 0x80 ],                 // U+00110000
                                                        // First code
                                                        // point after
                                                        // last character

            // Unexpected continuation bytes
            [ 0x80 ],
            [ 0xBF ],
            [ 0x20, 0x80, 0x20 ],
            [ 0x20, 0xBF, 0x20 ],
            [ 0x80, 0x9F, 0xA0 ],

            // Lonely start bytes
            [ 0xC0 ],
            [ 0xCF ],
            [ 0x20, 0xC0, 0x20 ],
            [ 0x20, 0xCF, 0x20 ],
            [ 0xD0 ],
            [ 0xDF ],
            [ 0x20, 0xD0, 0x20 ],
            [ 0x20, 0xDF, 0x20 ],
            [ 0xE0 ],
            [ 0xEF ],
            [ 0x20, 0xE0, 0x20 ],
            [ 0x20, 0xEF, 0x20 ],
            [ 0xF0 ],
            [ 0xF1 ],
            [ 0xF2 ],
            [ 0xF3 ],
            [ 0xF4 ],
            [ 0xF5 ],   // If this were legal it would start a character > U+10FFFF
            [ 0xF6 ],   // If this were legal it would start a character > U+10FFFF
            [ 0xF7 ],   // If this were legal it would start a character > U+10FFFF

            [ 0xEF, 0xBF ],             // Three byte sequence with third byte missing
            [ 0xF7, 0xBF, 0xBF ],       // Four byte sequence with fourth byte missing
            [ 0xEF, 0xBF, 0xF7, 0xBF, 0xBF ],   // Concatenation of the above

            // Impossible bytes
            [ 0xF8 ],
            [ 0xF9 ],
            [ 0xFA ],
            [ 0xFB ],
            [ 0xFC ],
            [ 0xFD ],
            [ 0xFE ],
            [ 0xFF ],
            [ 0x20, 0xF8, 0x20 ],
            [ 0x20, 0xF9, 0x20 ],
            [ 0x20, 0xFA, 0x20 ],
            [ 0x20, 0xFB, 0x20 ],
            [ 0x20, 0xFC, 0x20 ],
            [ 0x20, 0xFD, 0x20 ],
            [ 0x20, 0xFE, 0x20 ],
            [ 0x20, 0xFF, 0x20 ],

            // Overlong sequences, all representing U+002F
            /*  With a safe UTF-8 decoder, all of the following five overlong
                representations of the ASCII character slash ("/") should be
                rejected like a malformed UTF-8 sequence */
            [ 0xC0, 0xAF ],
            [ 0xE0, 0x80, 0xAF ],
            [ 0xF0, 0x80, 0x80, 0xAF ],
            [ 0xF8, 0x80, 0x80, 0x80, 0xAF ],
            [ 0xFC, 0x80, 0x80, 0x80, 0x80, 0xAF ],

            // Maximum overlong sequences
            /*  Below you see the highest Unicode value that is still resulting in
                an overlong sequence if represented with the given number of bytes.
                This is a boundary test for safe UTF-8 decoders. All five
                characters should be rejected like malformed UTF-8 sequences. */
            [ 0xC1, 0xBF ],                             // U+0000007F
            [ 0xE0, 0x9F, 0xBF ],                       // U+000007FF
            [ 0xF0, 0x8F, 0xBF, 0xBF ],                 // U+0000FFFF
            [ 0xF8, 0x87, 0xBF, 0xBF, 0xBF ],           // U+001FFFFF
            [ 0xFC, 0x83, 0xBF, 0xBF, 0xBF, 0xBF ],     // U+03FFFFFF

            // Overlong representation of the NUL character
            /*  The following five sequences should also be rejected like malformed
                UTF-8 sequences and should not be treated like the ASCII NUL
                character. */
            [ 0xC0, 0x80 ],
            [ 0xE0, 0x80, 0x80 ],
            [ 0xF0, 0x80, 0x80, 0x80 ],
            [ 0xF8, 0x80, 0x80, 0x80, 0x80 ],
            [ 0xFC, 0x80, 0x80, 0x80, 0x80, 0x80 ],

            // Illegal code positions
            /*  The following UTF-8 sequences should be rejected like malformed
                sequences, because they never represent valid ISO 10646 characters
                and a UTF-8 decoder that accepts them might introduce security
                problems comparable to overlong UTF-8 sequences. */
            [ 0xED, 0xA0, 0x80 ],       // U+D800
            [ 0xED, 0xAD, 0xBF ],       // U+DB7F
            [ 0xED, 0xAE, 0x80 ],       // U+DB80
            [ 0xED, 0xAF, 0xBF ],       // U+DBFF
            [ 0xED, 0xB0, 0x80 ],       // U+DC00
            [ 0xED, 0xBE, 0x80 ],       // U+DF80
            [ 0xED, 0xBF, 0xBF ],       // U+DFFF
        ];

        // Expected result of sanitize() for each entry of invalidStrings,
        // in the same order and grouped the same way.
        string[] sanitizedStrings =
        [
            // First possible sequence, greater than U+10FFFF (2)
            "\uFFFD","\uFFFD",
            // Last possible sequence, greater than U+10FFFF (3)
            "\uFFFD","\uFFFD","\uFFFD",
            // Other boundary conditions (1)
            "\uFFFD",
            // Unexpected continuation bytes (5)
            "\uFFFD","\uFFFD"," \uFFFD "," \uFFFD ","\uFFFD\uFFFD\uFFFD",
            // Lonely start bytes (20)
            "\uFFFD","\uFFFD"," \uFFFD "," \uFFFD ",
            "\uFFFD","\uFFFD"," \uFFFD "," \uFFFD ",
            "\uFFFD","\uFFFD"," \uFFFD "," \uFFFD ",
            "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
            // Truncated sequences and their concatenation (3)
            "\uFFFD","\uFFFD","\uFFFD\uFFFD",
            // Impossible bytes (16)
            "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
            " \uFFFD "," \uFFFD "," \uFFFD "," \uFFFD ",
            " \uFFFD "," \uFFFD "," \uFFFD "," \uFFFD ",
            // Overlong sequences representing U+002F (5)
            "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
            // Maximum overlong sequences (5)
            "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
            // Overlong representation of the NUL character (5)
            "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
            // Illegal code positions (7)
            "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
        ];

        // Make sure everything that should be valid, is
        foreach(a;validStrings)
        {
            string s = cast(string)a;
            assert(isValid(s),"Failed to validate: "~makeReadable(s));
        }

        // Make sure everything that shouldn't be valid, isn't
        foreach(a;invalidStrings)
        {
            string s = cast(string)a;
            assert(!isValid(s),"Incorrectly validated: "~makeReadable(s));
        }

        // Make sure we can sanitize everything bad
        assert(invalidStrings.length == sanitizedStrings.length);
        foreach(i, bad; invalidStrings)
        {
            string s = cast(string)bad;
            string t = sanitize(s);
            assert(isValid(t));
            assert(t == sanitizedStrings[i]);
            // Sanitized output is valid, so it also takes part in the
            // transcoding round-trips below.
            ubyte[] u = cast(ubyte[])t;
            validStrings ~= u;
        }

        // Make sure all transcodings work in both directions, using both forward
        // and reverse iteration
        foreach(a; validStrings)
        {
            string s = cast(string)a;
            string s2;
            wstring ws, ws2;
            dstring ds, ds2;

            transcode(s,ws);
            assert(isValid(ws));
            transcode(ws,s2);
            assert(s == s2);

            transcode(s,ds);
            assert(isValid(ds));
            transcode(ds,s2);
            assert(s == s2);

            transcode(ws,s);
            assert(isValid(s));
            transcode(s,ws2);
            assert(ws == ws2);

            transcode(ws,ds);
            assert(isValid(ds));
            transcode(ds,ws2);
            assert(ws == ws2);

            transcode(ds,s);
            assert(isValid(s));
            transcode(s,ds2);
            assert(ds == ds2);

            transcode(ds,ws);
            assert(isValid(ws));
            transcode(ws,ds2);
            assert(ds == ds2);

            transcodeReverse(s,ws);
            assert(isValid(ws));
            transcodeReverse(ws,s2);
            assert(s == s2);

            transcodeReverse(s,ds);
            assert(isValid(ds));
            transcodeReverse(ds,s2);
            assert(s == s2);

            transcodeReverse(ws,s);
            assert(isValid(s));
            transcodeReverse(s,ws2);
            assert(ws == ws2);

            transcodeReverse(ws,ds);
            assert(isValid(ds));
            transcodeReverse(ds,ws2);
            assert(ws == ws2);

            transcodeReverse(ds,s);
            assert(isValid(s));
            transcodeReverse(s,ds2);
            assert(ds == ds2);

            transcodeReverse(ds,ws);
            assert(isValid(ws));
            transcodeReverse(ws,ds2);
            assert(ds == ds2);
        }

        // Make sure the non-UTF encodings work too
        {
            auto s = "\u20AC100";
            Windows1252String t;
            transcode(s,t);
            assert(t == [cast(Windows1252Char)0x80, '1', '0', '0']);
            string u;
            transcode(s,u);
            assert(s == u);
            Latin1String v;
            transcode(s,v);
            assert(cast(string)v == "?100");
            AsciiString w;
            transcode(v,w);
            assert(cast(string)w == "?100");
        }

        // Make sure we can count properly
        {
            assert(encodedLength!(char)('A') == 1);
            assert(encodedLength!(char)('\u00E3') == 2);
            assert(encodedLength!(char)('\u2028') == 3);
            assert(encodedLength!(char)('\U0010FFF0') == 4);
            assert(encodedLength!(wchar)('A') == 1);
            assert(encodedLength!(wchar)('\U0010FFF0') == 2);
        }

        // Make sure we can write into mutable arrays
        {
            char[4] buffer;
            uint n = encode(cast(dchar)'\u00E3',buffer);
            assert(n == 2);
            assert(buffer[0] == 0xC3);
            assert(buffer[1] == 0xA3);
        }
    }
    TestEncoding();

}
// Compiled only when the unittest_report version identifier is set:
// adds a unittest that prints a line to standard output.
version (unittest_report)
{
    import std.stdio;

    unittest
    {
        writefln("unittest std2.encoding passed");
    }
}
2952 //=============================================================================