1 |
// Written in the D programming language. |
---|
2 |
|
---|
3 |
/** |
---|
4 |
Classes and functions for handling and transcoding between various encodings. |
---|
5 |
|
---|
6 |
For cases where the _encoding is known at compile-time, functions are provided |
---|
7 |
for arbitrary _encoding and decoding of characters, arbitrary transcoding |
---|
8 |
between strings of different type, as well as validation and sanitization. |
---|
9 |
|
---|
10 |
Encodings currently supported are UTF-8, UTF-16, UTF-32, ASCII, ISO-8859-1 |
---|
11 |
(also known as LATIN-1), and WINDOWS-1252. |
---|
12 |
|
---|
13 |
$(UL |
---|
14 |
$(LI The type $(D AsciiChar) represents an ASCII character.) |
---|
15 |
$(LI The type $(D AsciiString) represents an ASCII string.) |
---|
16 |
$(LI The type $(D Latin1Char) represents an ISO-8859-1 character.) |
---|
17 |
$(LI The type $(D Latin1String) represents an ISO-8859-1 string.) |
---|
18 |
$(LI The type $(D Windows1252Char) represents a Windows-1252 character.) |
---|
19 |
$(LI The type $(D Windows1252String) represents a Windows-1252 string.)) |
---|
20 |
|
---|
21 |
For cases where the _encoding is not known at compile-time, but is |
---|
22 |
known at run-time, we provide the abstract class $(D EncodingScheme) |
---|
23 |
and its subclasses. To construct a run-time encoder/decoder, one does |
---|
24 |
e.g. |
---|
25 |
|
---|
26 |
---------------------------------------------------- |
---|
27 |
auto e = EncodingScheme.create("utf-8"); |
---|
28 |
---------------------------------------------------- |
---|
29 |
|
---|
30 |
This library supplies $(D EncodingScheme) subclasses for ASCII, |
---|
31 |
ISO-8859-1 (also known as LATIN-1), WINDOWS-1252, UTF-8, and (on |
---|
32 |
little-endian architectures) UTF-16LE and UTF-32LE; or (on big-endian |
---|
33 |
architectures) UTF-16BE and UTF-32BE. |
---|
34 |
|
---|
35 |
This library provides a mechanism whereby other modules may add $(D |
---|
36 |
EncodingScheme) subclasses for any other _encoding. |
---|
37 |
|
---|
38 |
Authors: Janice Caron |
---|
39 |
|
---|
40 |
Date: 2008.02.27 - 2008.05.07 |
---|
41 |
|
---|
42 |
License: Public Domain |
---|
43 |
|
---|
44 |
Macros: |
---|
45 |
WIKI=Phobos/StdEncoding |
---|
46 |
*/ |
---|
47 |
|
---|
48 |
module std2.encoding; |
---|
49 |
import std.string; |
---|
50 |
import std.traits; |
---|
51 |
|
---|
52 |
|
---|
53 |
//============================================================================= |
---|
54 |
|
---|
55 |
/**
Sentinel returned by $(D safeDecode) when the input does not begin with a
well-formed code unit sequence.

The value 0xFFFFFFFF is rejected by $(D isValidCodePoint), so it cannot be
confused with a successfully decoded code point.
*/
const dchar INVALID_SEQUENCE = cast(dchar) 0xFFFF_FFFF;
---|
57 |
|
---|
58 |
/*
Generic front end shared by the EncoderInstance specializations.

This template is mixed into a scope that already defines the code unit type
E and the mixable primitives encodeViaWrite, skipViaRead, decodeViaRead,
safeDecodeViaRead and decodeReverseViaRead. Those primitives are written
against the abstract operations canRead / peek / read / write; the nested
templates below bind these operations to a concrete source or sink, and the
functions at the bottom pair one binding with one primitive.
*/
template EncoderFunctions()
{
    //---------------------------------------------------------------------
    // Sources: provide canRead, peek and read over a slice named s
    //---------------------------------------------------------------------

    // Consumes code units from the front of s.
    template ReadFromString()
    {
        bool canRead() { return s.length != 0; }
        E peek() { return s[0]; }
        E read()
        {
            E front = s[0];
            s = s[1..$];
            return front;
        }
    }

    // Consumes code units from the back of s.
    template ReverseReadFromString()
    {
        bool canRead() { return s.length != 0; }
        E peek() { return s[$-1]; }
        E read()
        {
            E back = s[$-1];
            s = s[0..$-1];
            return back;
        }
    }

    //---------------------------------------------------------------------
    // Sinks: provide write
    //---------------------------------------------------------------------

    // Appends to a freshly declared array s.
    template WriteToString()
    {
        E[] s;
        void write(E c) { s ~= c; }
    }

    // Stores into the first slot of a slice named array, then advances it.
    template WriteToArray()
    {
        void write(E c)
        {
            array[0] = c;
            array = array[1..$];
        }
    }

    // Appends to a variable named buffer supplied by the mixin site.
    deprecated template WriteToBuffer()
    {
        void write(E c) { buffer ~= c; }
    }

    // Forwards each code unit to a delegate named dg.
    template WriteToDelegate()
    {
        void write(E c) { dg(c); }
    }

    //---------------------------------------------------------------------
    // Adapters: expose each encoding primitive under its public name
    //---------------------------------------------------------------------

    template EncodeViaWrite()
    {
        mixin encodeViaWrite;
        void encode(dchar c) { encodeViaWrite(c); }
    }

    template SkipViaRead()
    {
        mixin skipViaRead;
        void skip() { skipViaRead(); }
    }

    template DecodeViaRead()
    {
        mixin decodeViaRead;
        dchar decode() { return decodeViaRead(); }
    }

    template SafeDecodeViaRead()
    {
        mixin safeDecodeViaRead;
        dchar safeDecode() { return safeDecodeViaRead(); }
    }

    template DecodeReverseViaRead()
    {
        mixin decodeReverseViaRead;
        dchar decodeReverse() { return decodeReverseViaRead(); }
    }

    //---------------------------------------------------------------------
    // Encoders: one sink combined with the encode adapter
    //---------------------------------------------------------------------

    template EncodeToString()
    {
        mixin WriteToString;
        mixin EncodeViaWrite;
    }

    template EncodeToArray()
    {
        mixin WriteToArray;
        mixin EncodeViaWrite;
    }

    deprecated template EncodeToBuffer()
    {
        mixin WriteToBuffer;
        mixin EncodeViaWrite;
    }

    template EncodeToDelegate()
    {
        mixin WriteToDelegate;
        mixin EncodeViaWrite;
    }

    //---------------------------------------------------------------------
    // Decoders: one source combined with a read adapter
    //---------------------------------------------------------------------

    template SkipFromString()
    {
        mixin ReadFromString;
        mixin SkipViaRead;
    }

    template DecodeFromString()
    {
        mixin ReadFromString;
        mixin DecodeViaRead;
    }

    template SafeDecodeFromString()
    {
        mixin ReadFromString;
        mixin SafeDecodeViaRead;
    }

    template DecodeReverseFromString()
    {
        mixin ReverseReadFromString;
        mixin DecodeReverseViaRead;
    }

    //=========================================================================
    // Functions ultimately exposed to the user
    //=========================================================================

    // Encodes c and returns the resulting code units as a new array.
    E[] encode(dchar c)
    {
        mixin EncodeToString sink;
        sink.encode(c);
        return sink.s;
    }

    // Encodes c into the front of array and advances array past the output.
    void encode(dchar c, ref E[] array)
    {
        mixin EncodeToArray sink;
        sink.encode(c);
    }

    // Encodes c, handing each code unit to dg.
    void encode(dchar c, void delegate(E) dg)
    {
        mixin EncodeToDelegate sink;
        sink.encode(c);
    }

    // Drops one encoded character from the front of s.
    void skip(ref E[] s)
    {
        mixin SkipFromString source;
        source.skip();
    }

    // Decodes one character from the front of s, advancing s.
    dchar decode(S)(ref S s)
    {
        mixin DecodeFromString source;
        return source.decode();
    }

    // Like decode, but uses the validating safeDecodeViaRead primitive.
    dchar safeDecode(S)(ref S s)
    {
        mixin SafeDecodeFromString source;
        return source.safeDecode();
    }

    // Decodes one character from the back of s, shrinking s from the end.
    dchar decodeReverse(ref E[] s)
    {
        mixin DecodeReverseFromString source;
        return source.decodeReverse();
    }
}
---|
230 |
|
---|
231 |
//========================================================================= |
---|
232 |
|
---|
233 |
/*
Foreach-able view of an encoded string that yields one decoded code point
per iteration, either forwards or backwards, optionally together with the
code unit offset of that code point.

Iterating consumes the stored slice s.
*/
struct CodePoints(E)
{
    E[] s;

    // Wraps s; the input is required to be validly encoded.
    static CodePoints opCall(E[] s)
    in
    {
        assert(isValid(s));
    }
    body
    {
        CodePoints view;
        view.s = s;
        return view;
    }

    // Forward iteration over code points.
    int opApply(int delegate(ref dchar) dg)
    {
        while (s.length != 0)
        {
            dchar decoded = decode(s);
            int status = dg(decoded);
            if (status != 0) return status;
        }
        return 0;
    }

    // Forward iteration, also passing the offset at which each code point
    // starts.
    int opApply(int delegate(ref uint, ref dchar) dg)
    {
        uint offset = 0;
        while (s.length != 0)
        {
            uint before = s.length;
            dchar decoded = decode(s);
            uint reported = offset; // private copy: dg receives it by ref
            int status = dg(reported, decoded);
            if (status != 0) return status;
            offset += before - s.length;
        }
        return 0;
    }

    // Backward iteration over code points.
    int opApplyReverse(int delegate(ref dchar) dg)
    {
        while (s.length != 0)
        {
            dchar decoded = decodeReverse(s);
            int status = dg(decoded);
            if (status != 0) return status;
        }
        return 0;
    }

    // Backward iteration; the offset passed is the length left after the
    // code point has been removed from the end.
    int opApplyReverse(int delegate(ref uint, ref dchar) dg)
    {
        while (s.length != 0)
        {
            dchar decoded = decodeReverse(s);
            uint offset = s.length;
            int status = dg(offset, decoded);
            if (status != 0) return status;
        }
        return 0;
    }
}
---|
302 |
|
---|
303 |
/*
Foreach-able view of the code units that encode a single code point in the
encoding identified by E.
*/
struct CodeUnits(E)
{
    E[] s;

    // Encodes d once and stores the result; d must be a valid code point.
    static CodeUnits opCall(dchar d)
    in
    {
        assert(isValidCodePoint(d));
    }
    body
    {
        CodeUnits units;
        units.s = encode!(E)(d);
        return units;
    }

    // Visits the code units first to last.
    int opApply(int delegate(ref E) dg)
    {
        foreach (E unit; s)
        {
            int status = dg(unit);
            if (status != 0) return status;
        }
        return 0;
    }

    // Visits the code units last to first.
    int opApplyReverse(int delegate(ref E) dg)
    {
        foreach_reverse (E unit; s)
        {
            int status = dg(unit);
            if (status != 0) return status;
        }
        return 0;
    }
}
---|
341 |
|
---|
342 |
//============================================================================= |
---|
343 |
|
---|
344 |
/*
Catch-all EncoderInstance for code unit types that have no dedicated
specialization. Instantiating it always fails at compile time with a
message naming the offending type.
*/
template EncoderInstance(E)
{
    static assert(false,
        "Cannot instantiate EncoderInstance for type " ~ E.stringof);
}
---|
349 |
|
---|
350 |
//============================================================================= |
---|
351 |
// ASCII |
---|
352 |
//============================================================================= |
---|
353 |
|
---|
354 |
/** Defines an ASCII-encoded character (one byte per code unit). */
typedef ubyte AsciiChar;
/** Defines an ASCII-encoded string (as an array of $(D AsciiChar)). */
alias AsciiChar[] AsciiString;

/*
ASCII codec. Every character occupies exactly one code unit, and only values
below 0x80 are accepted; anything else is encoded as '?'.
*/
template EncoderInstance(CharType : AsciiChar)
{
    alias AsciiChar E;
    alias AsciiString EString;

    string encodingName()
    {
        return "ASCII";
    }

    // Only code points 0x00..0x7F exist in ASCII.
    bool canEncode(dchar c)
    {
        return c <= 0x7F;
    }

    // Only code units 0x00..0x7F are legal.
    bool isValidCodeUnit(AsciiChar c)
    {
        return c <= 0x7F;
    }

    // Always one code unit per character.
    uint encodedLength(dchar c)
    in
    {
        assert(canEncode(c));
    }
    body
    {
        return 1;
    }

    // Writes c (or '?' when it is not representable) to r.
    void encodeX(Range)(dchar c, Range r)
    {
        r.write(cast(AsciiChar)(canEncode(c) ? c : '?'));
    }

    // Writes c (or '?' when it is not representable) via the host's write.
    void encodeViaWrite()(dchar c)
    {
        write(cast(AsciiChar)(canEncode(c) ? c : '?'));
    }

    void skipViaRead()()
    {
        read();
    }

    // No validation: the code unit is returned as-is.
    dchar decodeViaRead()()
    {
        return read();
    }

    // Code units of 0x80 and above are reported as INVALID_SEQUENCE.
    dchar safeDecodeViaRead()()
    {
        dchar unit = read();
        if (canEncode(unit)) return unit;
        return INVALID_SEQUENCE;
    }

    dchar decodeReverseViaRead()()
    {
        return read();
    }

    EString replacementSequence()
    {
        return cast(EString)("?");
    }

    mixin EncoderFunctions;
}
---|
429 |
|
---|
430 |
//============================================================================= |
---|
431 |
// ISO-8859-1 |
---|
432 |
//============================================================================= |
---|
433 |
|
---|
434 |
/** Defines a Latin1-encoded character. */
typedef ubyte Latin1Char;
/**
Defines a Latin1-encoded string (as an array of $(D Latin1Char)).
*/
alias Latin1Char[] Latin1String;

/*
ISO-8859-1 codec. Code units map one-to-one onto code points 0x00..0xFF, so
every byte is valid; code points of 0x100 and above are encoded as '?'.
*/
template EncoderInstance(CharType : Latin1Char)
{
    alias Latin1Char E;
    alias Latin1String EString;

    string encodingName()
    {
        return "ISO-8859-1";
    }

    // Latin-1 covers exactly the first 256 code points.
    bool canEncode(dchar c)
    {
        return c <= 0xFF;
    }

    // Every byte value is a legal Latin-1 code unit.
    bool isValidCodeUnit(Latin1Char c)
    {
        return true;
    }

    // Always one code unit per character.
    uint encodedLength(dchar c)
    in
    {
        assert(canEncode(c));
    }
    body
    {
        return 1;
    }

    // Writes c, or '?' when c is not representable.
    void encodeViaWrite()(dchar c)
    {
        write(cast(Latin1Char)(canEncode(c) ? c : '?'));
    }

    void skipViaRead()()
    {
        read();
    }

    dchar decodeViaRead()()
    {
        return read();
    }

    // Identical to decodeViaRead: no byte is ever rejected here.
    dchar safeDecodeViaRead()()
    {
        return read();
    }

    dchar decodeReverseViaRead()()
    {
        return read();
    }

    EString replacementSequence()
    {
        return cast(EString)("?");
    }

    mixin EncoderFunctions;
}
---|
505 |
|
---|
506 |
//============================================================================= |
---|
507 |
// WINDOWS-1252 |
---|
508 |
//============================================================================= |
---|
509 |
|
---|
510 |
/** Defines a Windows1252-encoded character. */
typedef ubyte Windows1252Char;
/**
Defines a Windows1252-encoded string (as an array of
$(D Windows1252Char)).
*/
alias Windows1252Char[] Windows1252String;

/*
Windows-1252 codec. Bytes 0x00..0x7F and 0xA0..0xFF map straight onto the
code points of the same value; bytes 0x80..0x9F are translated through
charMap. Characters that cannot be represented are encoded as '?'.
*/
template EncoderInstance(CharType : Windows1252Char)
{
    alias Windows1252Char E;
    alias Windows1252String EString;

    string encodingName()
    {
        return "windows-1252";
    }

    // Code points for bytes 0x80..0x9F, indexed by (byte - 0x80).
    // U+FFFD marks the five bytes Windows-1252 leaves undefined
    // (0x81, 0x8D, 0x8F, 0x90 and 0x9D).
    static const wstring charMap =
        "\u20AC\uFFFD\u201A\u0192\u201E\u2026\u2020\u2021"
        "\u02C6\u2030\u0160\u2039\u0152\uFFFD\u017D\uFFFD"
        // 0x96 is U+2013 EN DASH (it was previously mistyped as U+2103).
        "\uFFFD\u2018\u2019\u201C\u201D\u2022\u2013\u2014"
        "\u02DC\u2122\u0161\u203A\u0153\uFFFD\u017E\u0178";

    // True when c has a Windows-1252 code unit: either it lies in one of
    // the directly mapped ranges or it appears in charMap.
    bool canEncode(dchar c)
    {
        if (c < 0x80 || (c >= 0xA0 && c < 0x100)) return true;
        if (c >= 0xFFFD) return false; // keeps the U+FFFD placeholder out
        foreach(wchar d;charMap) { if (c == d) return true; }
        return false;
    }

    // True unless c is one of the undefined bytes in 0x80..0x9F.
    bool isValidCodeUnit(Windows1252Char c)
    {
        if (c < 0x80 || c >= 0xA0) return true;
        return (charMap[c-0x80] != 0xFFFD);
    }

    // Always one code unit per character.
    uint encodedLength(dchar c)
    in
    {
        assert(canEncode(c));
    }
    body
    {
        return 1;
    }

    // Writes the code unit for c, or '?' when c is not representable.
    void encodeViaWrite()(dchar c)
    {
        if (c < 0x80 || (c >= 0xA0 && c < 0x100)) {}
        else if (c >= 0xFFFD) { c = '?'; }
        else
        {
            // Reverse lookup: the position in charMap gives the byte.
            int n = -1;
            foreach(i,wchar d;charMap)
            {
                if (c == d)
                {
                    n = i;
                    break;
                }
            }
            c = n == -1 ? '?' : 0x80 + n;
        }
        write(cast(Windows1252Char)c);
    }

    void skipViaRead()()
    {
        read();
    }

    // Undefined bytes decode to U+FFFD here; use safeDecodeViaRead to
    // have them reported instead.
    dchar decodeViaRead()()
    {
        Windows1252Char c = read;
        return (c >= 0x80 && c < 0xA0) ? charMap[c-0x80] : c;
    }

    // Like decodeViaRead, but undefined bytes yield INVALID_SEQUENCE.
    dchar safeDecodeViaRead()()
    {
        Windows1252Char c = read;
        dchar d = (c >= 0x80 && c < 0xA0) ? charMap[c-0x80] : c;
        return d == 0xFFFD ? INVALID_SEQUENCE : d;
    }

    dchar decodeReverseViaRead()()
    {
        Windows1252Char c = read;
        return (c >= 0x80 && c < 0xA0) ? charMap[c-0x80] : c;
    }

    EString replacementSequence()
    {
        return cast(EString)("?");
    }

    mixin EncoderFunctions;
}
---|
609 |
|
---|
610 |
//============================================================================= |
---|
611 |
// UTF-8 |
---|
612 |
//============================================================================= |
---|
613 |
|
---|
614 |
/*
UTF-8 codec. A character occupies one to four code units when encoded;
safeDecodeViaRead additionally rejects overlong forms, surrogates and values
above 0x10FFFF.
*/
template EncoderInstance(CharType : char)
{
    alias char E;
    alias char[] EString;

    string encodingName()
    {
        return "UTF-8";
    }

    bool canEncode(dchar c)
    {
        return isValidCodePoint(c);
    }

    // Rejects 0xC0, 0xC1 and 0xF5..0xFF.
    bool isValidCodeUnit(char c)
    {
        return !(c >= 0xC0 && (c < 0xC2 || c >= 0xF5));
    }

    // Indexed by (code unit - 0x80): how many trailing code units that
    // code unit announces.
    static const ubyte[128] tailTable =
    [
        // 0x80..0xBF
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
        // 0xC0..0xDF
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
        // 0xE0..0xEF
        2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
        // 0xF0..0xFF
        3,3,3,3,3,3,3,3,4,4,4,4,5,5,6,0,
    ];

    // Table lookup; only defined for code units of 0x80 and above.
    private int tails(char c)
    in
    {
        assert(c >= 0x80);
    }
    body
    {
        return tailTable[c-0x80];
    }

    // Number of code units needed to encode c.
    uint encodedLength(dchar c)
    in
    {
        assert(canEncode(c));
    }
    body
    {
        if (c >= 0x10000) return 4;
        if (c >= 0x800) return 3;
        if (c >= 0x80) return 2;
        return 1;
    }

    // Emits the 1- to 4-unit encoding of c, lead unit first.
    void encodeViaWrite()(dchar c)
    {
        if (c < 0x80)
        {
            write(cast(char)c);
            return;
        }
        if (c < 0x800)
        {
            write(cast(char)((c >> 6) + 0xC0));
            write(cast(char)((c & 0x3F) + 0x80));
            return;
        }
        if (c < 0x10000)
        {
            write(cast(char)((c >> 12) + 0xE0));
            write(cast(char)(((c >> 6) & 0x3F) + 0x80));
            write(cast(char)((c & 0x3F) + 0x80));
            return;
        }
        write(cast(char)((c >> 18) + 0xF0));
        write(cast(char)(((c >> 12) & 0x3F) + 0x80));
        write(cast(char)(((c >> 6) & 0x3F) + 0x80));
        write(cast(char)((c & 0x3F) + 0x80));
    }

    // Reads one code unit plus however many trailing units it announces.
    void skipViaRead()()
    {
        auto lead = read();
        if (lead < 0xC0) return;
        int remaining = tails(cast(char) lead);
        while (remaining-- > 0)
        {
            read();
        }
    }

    // Decodes without validating.
    dchar decodeViaRead()()
    {
        dchar value = read();
        if (value < 0xC0) return value;
        int tailCount = tails(cast(char) value);
        // Keep only the payload bits of the lead unit.
        value &= (1 << (6 - tailCount)) - 1;
        for (int left = tailCount; left > 0; --left)
        {
            value = (value << 6) + (read() & 0x3F);
        }
        return value;
    }

    // Validating decode: returns INVALID_SEQUENCE for truncated or
    // ill-formed input.
    dchar safeDecodeViaRead()()
    {
        dchar c = read();
        if (c < 0x80) return c;
        int tailCount = tails(cast(char) c);
        if (tailCount == 0) return INVALID_SEQUENCE;

        if (!canRead()) return INVALID_SEQUENCE;
        uint next = peek();

        // Checks that depend only on the lead unit and the first trailing
        // unit; the verdict is applied after the trailing units are read.
        bool malformed;
        if (c < 0xC2 || c > 0xF4)
            malformed = true;                      // overlong 2-byte form, or lead above 0xF4
        else if (c == 0xE0)
            malformed = ((next & 0xE0) == 0x80);   // overlong 3-byte form
        else if (c == 0xED)
            malformed = ((next & 0xE0) == 0xA0);   // surrogates
        else if (c == 0xF0)
            malformed = ((next & 0xF0) == 0x80);   // overlong 4-byte form
        else if (c == 0xF4)
            malformed = ((next & 0xF0) >= 0x90);   // code points > 0x10FFFF
        else
            malformed = false;

        c &= (1 << (6 - tailCount)) - 1;
        for (int k = 0; k < tailCount; ++k)
        {
            if (!canRead()) return INVALID_SEQUENCE;
            next = peek();
            if ((next & 0xC0) != 0x80) return INVALID_SEQUENCE;
            c = (c << 6) + (read() & 0x3F);
        }

        return malformed ? INVALID_SEQUENCE : c;
    }

    // Decodes backwards: read() yields the last code unit first, and
    // reading stops at the first unit that announces trailing units.
    dchar decodeReverseViaRead()()
    {
        dchar c = read();
        if (c < 0x80) return c;
        c &= 0x3F;
        uint shift = 0;
        for (uint step = 0; step < 4; ++step)
        {
            shift += 6;
            auto unit = read();
            uint n = tails(cast(char) unit);
            uint mask = (n != 0) ? (1 << (6 - n)) - 1 : 0x3F;
            c += ((unit & mask) << shift);
            if (n != 0) break;
        }
        return c;
    }

    EString replacementSequence()
    {
        return "\uFFFD";
    }

    mixin EncoderFunctions;
}
---|
777 |
|
---|
778 |
//============================================================================= |
---|
779 |
// UTF-16 |
---|
780 |
//============================================================================= |
---|
781 |
|
---|
782 |
/*
UTF-16 codec. Code points below 0x10000 occupy one code unit; all others are
written as a pair of code units in the 0xD800 and 0xDC00 ranges.
*/
template EncoderInstance(CharType : wchar)
{
    alias wchar E;
    alias wchar[] EString;

    string encodingName()
    {
        return "UTF-16";
    }

    bool canEncode(dchar c)
    {
        return isValidCodePoint(c);
    }

    // Every 16-bit value is accepted as a code unit.
    bool isValidCodeUnit(wchar c)
    {
        return true;
    }

    // One code unit below 0x10000, otherwise two.
    uint encodedLength(dchar c)
    in
    {
        assert(canEncode(c));
    }
    body
    {
        if (c < 0x10000) return 1;
        return 2;
    }

    void encodeViaWrite()(dchar c)
    {
        if (c < 0x10000)
        {
            write(cast(wchar)c);
            return;
        }
        uint offset = c - 0x10000;
        write(cast(wchar)(0xD800 + (offset >> 10)));   // upper 10 bits
        write(cast(wchar)(0xDC00 + (offset & 0x3FF))); // lower 10 bits
    }

    // Reads one code unit, plus a second one when the first lies in
    // 0xD800..0xDFFF.
    void skipViaRead()()
    {
        wchar first = read();
        if (first >= 0xD800 && first < 0xE000)
        {
            read();
        }
    }

    // Decodes without validating.
    dchar decodeViaRead()()
    {
        wchar lead = read();
        if (lead < 0xD800 || lead >= 0xE000) return cast(dchar)lead;
        wchar trail = read();
        lead &= 0x3FF;
        trail &= 0x3FF;
        return 0x10000 + (lead << 10) + trail;
    }

    // Validating decode: a unit in 0xDC00..0xDFFF with nothing before it,
    // or a unit in 0xD800..0xDBFF not followed by one in 0xDC00..0xDFFF,
    // yields INVALID_SEQUENCE.
    dchar safeDecodeViaRead()()
    {
        wchar lead = read();
        if (lead < 0xD800 || lead >= 0xE000) return cast(dchar)lead;
        if (lead >= 0xDC00) return INVALID_SEQUENCE;
        if (!canRead()) return INVALID_SEQUENCE;
        wchar trail = peek();
        if (trail < 0xDC00 || trail >= 0xE000) return INVALID_SEQUENCE;
        trail = read();
        lead &= 0x3FF;
        trail &= 0x3FF;
        return 0x10000 + (lead << 10) + trail;
    }

    // Decodes backwards: the first unit read supplies the low 10 bits and
    // the second the high 10 bits.
    dchar decodeReverseViaRead()()
    {
        wchar last = read();
        if (last < 0xD800 || last >= 0xE000) return cast(dchar)last;
        wchar prev = read();
        last &= 0x3FF;
        prev &= 0x3FF;
        return 0x10000 + (prev << 10) + last;
    }

    EString replacementSequence()
    {
        return "\uFFFD"w;
    }

    mixin EncoderFunctions;
}
---|
874 |
|
---|
875 |
//============================================================================= |
---|
876 |
// UTF-32 |
---|
877 |
//============================================================================= |
---|
878 |
|
---|
879 |
/*
UTF-32 codec. Each code unit is a code point, so encoding and decoding are
plain copies; only safeDecodeViaRead checks the value.
*/
template EncoderInstance(CharType : dchar)
{
    alias dchar E;
    alias dchar[] EString;

    string encodingName()
    {
        return "UTF-32";
    }

    bool canEncode(dchar c)
    {
        return isValidCodePoint(c);
    }

    // A code unit is valid exactly when it is a valid code point.
    bool isValidCodeUnit(dchar c)
    {
        return isValidCodePoint(c);
    }

    // Always one code unit per character.
    uint encodedLength(dchar c)
    in
    {
        assert(canEncode(c));
    }
    body
    {
        return 1;
    }

    void encodeViaWrite()(dchar c)
    {
        write(c);
    }

    void skipViaRead()()
    {
        read();
    }

    dchar decodeViaRead()()
    {
        return cast(dchar) read();
    }

    // Returns INVALID_SEQUENCE for values isValidCodePoint rejects.
    dchar safeDecodeViaRead()()
    {
        dchar unit = read();
        if (isValidCodePoint(unit)) return unit;
        return INVALID_SEQUENCE;
    }

    dchar decodeReverseViaRead()()
    {
        return cast(dchar) read();
    }

    EString replacementSequence()
    {
        return "\uFFFD"d;
    }

    mixin EncoderFunctions;
}
---|
942 |
|
---|
943 |
//============================================================================= |
---|
944 |
// Below are forwarding functions which expose the function to the user |
---|
945 |
|
---|
946 |
/**
Returns true if c is a valid code point, i.e. it lies below 0x110000 and
outside the range 0xD800 to 0xDFFF.

Note that this includes the non-character code points U+FFFE and U+FFFF,
since these are valid code points (even though they are not valid
characters).

Supersedes:
This function supersedes $(D std.utf.startsValidDchar()).

Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

Params:
    c = the code point to be tested
 */
bool isValidCodePoint(dchar c)
{
    if (c >= 0x110000) return false;    // above the highest code point
    return c < 0xD800 || c >= 0xE000;   // not in 0xD800..0xDFFF
}
---|
965 |
|
---|
966 |
/**
Returns the name of an encoding.

The encoding is selected by the code unit type T. Since the function takes
no arguments, T cannot be deduced and must be given explicitly.

Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

Examples:
-----------------------------------
assert(encodingName!(Latin1Char) == "ISO-8859-1");
-----------------------------------
 */
string encodingName(T)()
{
    alias EncoderInstance!(T) Codec;
    return Codec.encodingName();
}
---|
983 |
|
---|
984 |
unittest
{
    // Single-byte encodings.
    assert(encodingName!(AsciiChar) == "ASCII");
    assert(encodingName!(Latin1Char) == "ISO-8859-1");
    assert(encodingName!(Windows1252Char) == "windows-1252");

    // Unicode transformation formats.
    assert(encodingName!(char) == "UTF-8");
    assert(encodingName!(wchar) == "UTF-16");
    assert(encodingName!(dchar) == "UTF-32");
}
---|
993 |
|
---|
994 |
/**
Returns true iff it is possible to represent the specified code point
in the encoding.

The encoding is selected by the code unit type E. It cannot be deduced
from the argument and must be given explicitly.

Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

Examples:
-----------------------------------
assert(canEncode!(Latin1Char)('A'));
-----------------------------------
 */
bool canEncode(E)(dchar c)
{
    alias EncoderInstance!(E) Codec;
    return Codec.canEncode(c);
}
---|
1012 |
|
---|
1013 |
unittest
{
    // U+00A0 is outside ASCII but inside Latin-1.
    assert(!canEncode!(AsciiChar)('\u00A0'));
    assert( canEncode!(Latin1Char)('\u00A0'));

    // Windows-1252: U+20AC is encodable, U+20AD and U+FFFD are not.
    assert( canEncode!(Windows1252Char)('\u20AC'));
    assert(!canEncode!(Windows1252Char)('\u20AD'));
    assert(!canEncode!(Windows1252Char)('\uFFFD'));

    // 0x110000 is not encodable as UTF-8.
    assert(!canEncode!(char)(cast(dchar)0x110000));
}
---|
1022 |
|
---|
1023 |
/**
Returns true if the code unit is legal. For example, the byte 0x80 would
not be legal in ASCII, because ASCII code units must always be in the range
0x00 to 0x7F.

The encoding is selected by the type E of the argument.

Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252

Params:
    c = the code unit to be tested
 */
bool isValidCodeUnit(E)(E c)
{
    alias EncoderInstance!(E) Codec;
    return Codec.isValidCodeUnit(c);
}
---|
1037 |
|
---|
1038 |
unittest
{
    // ASCII stops at 0x7F.
    assert(!isValidCodeUnit(cast(AsciiChar)0xA0));

    // Windows-1252: 0x80 is legal, 0x81 is not.
    assert( isValidCodeUnit(cast(Windows1252Char)0x80));
    assert(!isValidCodeUnit(cast(Windows1252Char)0x81));

    // UTF-8: 0xC0 and 0xFF are not legal code units.
    assert(!isValidCodeUnit(cast(char)0xC0));
    assert(!isValidCodeUnit(cast(char)0xFF));

    // 0xD800 is a legal UTF-16 code unit but not a legal UTF-32 one.
    assert( isValidCodeUnit(cast(wchar)0xD800));
    assert(!isValidCodeUnit(cast(dchar)0xD800));
}
---|
1048 |
|
---|
1049 |
/** |
---|
1050 |
Returns true if the string is encoded correctly |
---|
1051 |
|
---|
1052 |
Supercedes: |
---|
1053 |
This function supercedes std.utf.validate(), however note that this |
---|
1054 |
function returns a bool indicating whether the input was valid or not, |
---|
1055 |
whereas the older function would throw an exception. |
---|
1056 |
|
---|
1057 |
Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252 |
---|
1058 |
|
---|
1059 |
Params: |
---|
1060 |
s = the string to be tested |
---|
1061 |
*/ |
---|
1062 |
bool isValid(E)(E[] s)
{
    // The string is valid exactly when its valid prefix spans every code unit.
    auto validPrefix = validLength(s);
    return validPrefix == s.length;
}
---|
1066 |
|
---|
1067 |
unittest |
---|
1068 |
{ |
---|
1069 |
assert(isValid("\u20AC100")); |
---|
1070 |
} |
---|
1071 |
|
---|
1072 |
/** |
---|
1073 |
Returns the length of the longest possible substring, starting from |
---|
1074 |
the first code unit, which is validly encoded. |
---|
1075 |
|
---|
1076 |
Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252 |
---|
1077 |
|
---|
1078 |
Params: |
---|
1079 |
s = the string to be tested |
---|
1080 |
*/ |
---|
1081 |
size_t validLength(E)(E[] s)
{
    // Lengths are size_t: on 32-bit targets that is uint (unchanged), on
    // 64-bit targets it avoids an illegal narrowing from s.length.
    size_t validUnits = 0;
    while (s.length > 0)
    {
        immutable size_t before = s.length;
        // safeDecode advances s past the sequence it examined.
        if (EncoderInstance!(E).safeDecode(s) == INVALID_SEQUENCE)
            break;
        validUnits += before - s.length;
    }
    return validUnits;
}
---|
1092 |
|
---|
1093 |
/** |
---|
1094 |
Sanitizes a string by replacing malformed code unit sequences with valid |
---|
1095 |
code unit sequences. The result is guaranteed to be valid for this encoding. |
---|
1096 |
|
---|
1097 |
If the input string is already valid, this function returns the original, |
---|
1098 |
otherwise it constructs a new string by replacing all illegal code unit |
---|
1099 |
sequences with the encoding's replacement character. Invalid sequences will |
---|
1100 |
be replaced with the Unicode replacement character (U+FFFD) if the |
---|
1101 |
character repertoire contains it, otherwise invalid sequences will be |
---|
1102 |
replaced with '?'. |
---|
1103 |
|
---|
1104 |
Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252 |
---|
1105 |
|
---|
1106 |
Params: |
---|
1107 |
s = the string to be sanitized |
---|
1108 |
*/ |
---|
1109 |
|
---|
1110 |
E[] sanitize(E)(E[] s)
{
    // size_t throughout: array lengths and slice bounds are size_t, and
    // narrowing them to uint does not compile (or would truncate) on
    // 64-bit targets.
    size_t n = validLength(s);
    if (n == s.length) return s;

    auto repSeq = EncoderInstance!(E).replacementSequence;

    // Pass 1: compute an upper bound for the output length.
    // Overestimating is not a problem; the result is sliced at the end.
    size_t len = s.length;
    E[] t = s[n..$];
    while (t.length != 0)
    {
        // t always starts at an invalid sequence here; consume it.
        dchar c = EncoderInstance!(E).safeDecode(t);
        assert(c == INVALID_SEQUENCE);
        len += repSeq.length;
        // Skip the valid run that follows.
        t = t[validLength(t)..$];
    }

    // Pass 2: copy valid runs and substitute each invalid sequence.
    E[] array = new E[len];
    array[0..n] = s[0..n];
    size_t offset = n;

    t = s[n..$];
    while (t.length != 0)
    {
        dchar c = EncoderInstance!(E).safeDecode(t);
        assert(c == INVALID_SEQUENCE);
        array[offset..offset+repSeq.length] = repSeq[];
        offset += repSeq.length;
        n = validLength(t);
        array[offset..offset+n] = t[0..n];
        offset += n;
        t = t[n..$];
    }
    return cast(E[])array[0..offset];
}
---|
1148 |
|
---|
1149 |
unittest |
---|
1150 |
{ |
---|
1151 |
assert(sanitize("hello \xF0\x80world") == "hello \xEF\xBF\xBDworld"); |
---|
1152 |
} |
---|
1153 |
|
---|
1154 |
/** |
---|
1155 |
Returns the length of the first encoded sequence. |
---|
1156 |
|
---|
1157 |
The input to this function MUST be validly encoded. |
---|
1158 |
This is enforced by the function's in-contract. |
---|
1159 |
|
---|
1160 |
Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252 |
---|
1161 |
|
---|
1162 |
Params: |
---|
1163 |
s = the string to be sliced |
---|
1164 |
*/ |
---|
1165 |
size_t firstSequence(E)(E[] s)
in
{
    assert(s.length != 0);
    E[] u = s;
    assert(safeDecode(u) != INVALID_SEQUENCE);
}
body
{
    // The result is a difference of array lengths, hence size_t
    // (identical to uint on 32-bit targets).
    immutable size_t before = s.length;
    // skip() advances the local slice past the first encoded sequence.
    EncoderInstance!(E).skip(s);
    return before - s.length;
}
---|
1178 |
|
---|
1179 |
unittest |
---|
1180 |
{ |
---|
1181 |
assert(firstSequence("\u20AC1000") == "\u20AC".length); |
---|
1182 |
} |
---|
1183 |
|
---|
1184 |
/** |
---|
1185 |
Returns the length of the last encoded sequence. |
---|
1186 |
|
---|
1187 |
The input to this function MUST be validly encoded. |
---|
1188 |
This is enforced by the function's in-contract. |
---|
1189 |
|
---|
1190 |
Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252 |
---|
1191 |
|
---|
1192 |
Params: |
---|
1193 |
s = the string to be sliced |
---|
1194 |
*/ |
---|
1195 |
size_t lastSequence(E)(E[] s)
in
{
    assert(s.length != 0);
    assert(isValid(s));
}
body
{
    // The result is a difference of array lengths, hence size_t
    // (identical to uint on 32-bit targets).
    immutable size_t before = s.length;
    // decodeReverse() removes the final encoded sequence from the local slice.
    EncoderInstance!(E).decodeReverse(s);
    return before - s.length;
}
---|
1207 |
|
---|
1208 |
unittest |
---|
1209 |
{ |
---|
1210 |
assert(lastSequence("1000\u20AC") == "\u20AC".length); |
---|
1211 |
} |
---|
1212 |
|
---|
1213 |
/** |
---|
1214 |
Returns the total number of code points encoded in a string. |
---|
1215 |
|
---|
1216 |
The input to this function MUST be validly encoded. This is enforced |
---|
1217 |
by the function's in-contract. |
---|
1218 |
|
---|
1219 |
Supercedes: This function supercedes $(D std.utf.toUCSindex()). |
---|
1220 |
|
---|
1221 |
Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252 |
---|
1222 |
|
---|
1223 |
Params: |
---|
1224 |
s = the string to be counted |
---|
1225 |
*/ |
---|
1226 |
uint codepoints_count(E)(E[] s)
in
{
    assert(isValid(s));
}
body
{
    // Each skip() consumes exactly one encoded code point from the front of s.
    uint total = 0;
    for (; s.length != 0; ++total)
    {
        EncoderInstance!(E).skip(s);
    }
    return total;
}
---|
1241 |
|
---|
1242 |
unittest |
---|
1243 |
{ |
---|
1244 |
assert(codepoints_count("\u20AC100") == 4); |
---|
1245 |
} |
---|
1246 |
|
---|
1247 |
/** |
---|
1248 |
Returns the array index at which the (n+1)th code point begins. |
---|
1249 |
|
---|
1250 |
The input to this function MUST be validly encoded. |
---|
1251 |
This is enforced by the function's in-contract. |
---|
1252 |
|
---|
1253 |
Supercedes: |
---|
1254 |
This function supercedes std.utf.toUTFindex(). |
---|
1255 |
|
---|
1256 |
Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252 |
---|
1257 |
|
---|
1258 |
Params: |
---|
1259 |
s = the string to be counted |
---|
1260 |
*/ |
---|
1261 |
ptrdiff_t index(E)(E[] s,int n)
in
{
    assert(isValid(s));
    assert(n >= 0);
}
body
{
    // ptrdiff_t is int on 32-bit targets (unchanged) and wide enough to hold
    // an array offset on 64-bit targets.
    immutable size_t total = s.length;
    // Signed counter: avoids a mixed signed/unsigned comparison with n.
    for (int i = 0; i < n; ++i)
        EncoderInstance!(E).skip(s);
    return total - s.length;
}
---|
1273 |
|
---|
1274 |
unittest |
---|
1275 |
{ |
---|
1276 |
assert(index("\u20AC100",1) == 3); |
---|
1277 |
} |
---|
1278 |
|
---|
1279 |
/** |
---|
1280 |
Decodes a single code point. |
---|
1281 |
|
---|
1282 |
This function removes one or more code units from the start of a string, |
---|
1283 |
and returns the decoded code point which those code units represent. |
---|
1284 |
|
---|
1285 |
The input to this function MUST be validly encoded. |
---|
1286 |
This is enforced by the function's in-contract. |
---|
1287 |
|
---|
1288 |
Supercedes: |
---|
1289 |
This function supercedes std.utf.decode(), however, note that the |
---|
1290 |
function codePoints() supercedes it more conveniently. |
---|
1291 |
|
---|
1292 |
Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252 |
---|
1293 |
|
---|
1294 |
Params: |
---|
1295 |
s = the string whose first code point is to be decoded |
---|
1296 |
*/ |
---|
1297 |
dchar decode(S)(ref S s)
in
{
    assert(s.length != 0);
    // Probe a copy so the caller's string is not consumed by the check.
    auto probe = s;
    assert(safeDecode(probe) != INVALID_SEQUENCE);
}
body
{
    // The encoder is selected by the string's code-unit type.
    alias EncoderInstance!(typeof(s[0])) Encoder;
    return Encoder.decode(s);
}
---|
1308 |
|
---|
1309 |
/** |
---|
1310 |
Decodes a single code point from the end of a string. |
---|
1311 |
|
---|
1312 |
This function removes one or more code units from the end of a string, |
---|
1313 |
and returns the decoded code point which those code units represent. |
---|
1314 |
|
---|
1315 |
The input to this function MUST be validly encoded. |
---|
1316 |
This is enforced by the function's in-contract. |
---|
1317 |
|
---|
1318 |
Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252 |
---|
1319 |
|
---|
1320 |
Params: |
---|
1321 |
s = the string whose last code point is to be decoded |
---|
1322 |
*/ |
---|
1323 |
dchar decodeReverse(E)(ref E[] s)
in
{
    assert(s.length != 0);
    assert(isValid(s));
}
body
{
    // Delegate to the encoder specialised for code-unit type E.
    alias EncoderInstance!(E) Encoder;
    return Encoder.decodeReverse(s);
}
---|
1333 |
|
---|
1334 |
/** |
---|
1335 |
Decodes a single code point. The input does not have to be valid. |
---|
1336 |
|
---|
1337 |
This function removes one or more code units from the start of a string, |
---|
1338 |
and returns the decoded code point which those code units represent. |
---|
1339 |
|
---|
1340 |
This function will accept an invalidly encoded string as input. |
---|
1341 |
If an invalid sequence is found at the start of the string, this |
---|
1342 |
function will remove it, and return the value INVALID_SEQUENCE. |
---|
1343 |
|
---|
1344 |
Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252 |
---|
1345 |
|
---|
1346 |
Params: |
---|
1347 |
s = the string whose first code point is to be decoded |
---|
1348 |
*/ |
---|
1349 |
dchar safeDecode(S)(ref S s)
in
{
    assert(s.length != 0);
}
body
{
    // The encoder is selected by the string's code-unit type.
    alias EncoderInstance!(typeof(s[0])) Encoder;
    return Encoder.safeDecode(s);
}
---|
1358 |
|
---|
1359 |
/** |
---|
1360 |
Returns the number of code units required to encode a single code point. |
---|
1361 |
|
---|
1362 |
The input to this function MUST be a valid code point. |
---|
1363 |
This is enforced by the function's in-contract. |
---|
1364 |
|
---|
1365 |
The type of the output cannot be deduced. Therefore, it is necessary to |
---|
1366 |
explicitly specify the encoding as a template parameter. |
---|
1367 |
|
---|
1368 |
Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252 |
---|
1369 |
|
---|
1370 |
Params: |
---|
1371 |
c = the code point to be encoded |
---|
1372 |
*/ |
---|
1373 |
uint encodedLength(E)(dchar c)
in
{
    assert(isValidCodePoint(c));
}
body
{
    // Delegate to the encoder specialised for code-unit type E.
    alias EncoderInstance!(E) Encoder;
    return Encoder.encodedLength(c);
}
---|
1382 |
|
---|
1383 |
/** |
---|
1384 |
Encodes a single code point. |
---|
1385 |
|
---|
1386 |
This function encodes a single code point into one or more code units. |
---|
1387 |
It returns a string containing those code units. |
---|
1388 |
|
---|
1389 |
The input to this function MUST be a valid code point. |
---|
1390 |
This is enforced by the function's in-contract. |
---|
1391 |
|
---|
1392 |
The type of the output cannot be deduced. Therefore, it is necessary to |
---|
1393 |
explicitly specify the encoding as a template parameter. |
---|
1394 |
|
---|
1395 |
Supercedes: |
---|
1396 |
This function supercedes std.utf.encode(), however, note that the |
---|
1397 |
function codeUnits() supercedes it more conveniently. |
---|
1398 |
|
---|
1399 |
Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252 |
---|
1400 |
|
---|
1401 |
Params: |
---|
1402 |
c = the code point to be encoded |
---|
1403 |
*/ |
---|
1404 |
E[] encode(E)(dchar c)
in
{
    assert(isValidCodePoint(c));
}
body
{
    // Delegate to the encoder specialised for code-unit type E.
    alias EncoderInstance!(E) Encoder;
    return Encoder.encode(c);
}
---|
1413 |
|
---|
1414 |
/** |
---|
1415 |
Encodes a single code point into an array. |
---|
1416 |
|
---|
1417 |
This function encodes a single code point into one or more code units |
---|
1418 |
The code units are stored in a user-supplied fixed-size array, |
---|
1419 |
which must be passed by reference. |
---|
1420 |
|
---|
1421 |
The input to this function MUST be a valid code point. |
---|
1422 |
This is enforced by the function's in-contract. |
---|
1423 |
|
---|
1424 |
The type of the output cannot be deduced. Therefore, it is necessary to |
---|
1425 |
explicitly specify the encoding as a template parameter. |
---|
1426 |
|
---|
1427 |
Supercedes: |
---|
1428 |
This function supercedes std.utf.encode(), however, note that the |
---|
1429 |
function codeUnits() supercedes it more conveniently. |
---|
1430 |
|
---|
1431 |
Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252 |
---|
1432 |
|
---|
1433 |
Params: |
---|
1434 |
c = the code point to be encoded |
---|
1435 |
|
---|
1436 |
Returns: |
---|
1437 |
the number of code units written to the array |
---|
1438 |
*/ |
---|
1439 |
size_t encode(E)(dchar c, E[] array)
in
{
    assert(isValidCodePoint(c));
}
body
{
    // The count is a difference of array lengths, hence size_t
    // (identical to uint on 32-bit targets).
    // The encoder is handed a second slice over the same storage; the count
    // is how much shorter that slice is afterwards.
    E[] remaining = array;
    EncoderInstance!(E).encode(c,remaining);
    return array.length - remaining.length;
}
---|
1450 |
|
---|
1451 |
// /** |
---|
1452 |
// * Encodes a single code point into a Buffer. |
---|
1453 |
// * |
---|
1454 |
// * This function encodes a single code point into one or more code units |
---|
1455 |
// * The code units are stored in a growable buffer. |
---|
1456 |
// * |
---|
1457 |
// * The input to this function MUST be a valid code point. |
---|
1458 |
// * This is enforced by the function's in-contract. |
---|
1459 |
// * |
---|
1460 |
// * The type of the output cannot be deduced. Therefore, it is necessary to |
---|
1461 |
// * explicitly specify the encoding as a template parameter. |
---|
1462 |
// * |
---|
1463 |
// * Supercedes: |
---|
1464 |
// * This function supercedes std.utf.encode(), however, note that the |
---|
1465 |
// * function codeUnits() supercedes it more conveniently. |
---|
1466 |
// * |
---|
1467 |
// * Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252 |
---|
1468 |
// * |
---|
1469 |
// * Params: |
---|
1470 |
// * c = the code point to be encoded |
---|
1471 |
// */ |
---|
1472 |
// deprecated void encode(E)(dchar c, ref Buffer!(E) buffer) |
---|
1473 |
// in |
---|
1474 |
// { |
---|
1475 |
// assert(isValidCodePoint(c)); |
---|
1476 |
// } |
---|
1477 |
// body |
---|
1478 |
// { |
---|
1479 |
// EncoderInstance!(E).encode(c,buffer); |
---|
1480 |
// } |
---|
1481 |
|
---|
1482 |
/**
 Delegate types that receive the code units produced by $(D encode_char)
 and $(D encode_wchar), one code unit per call.
 */
alias void delegate(char c) encode_putchar;
/// ditto
alias void delegate(wchar c) encode_putwchar;

/**
 Encodes $(D c) as UTF-8 and passes each resulting $(D char) to $(D putc).

 Params:
    c = the code point to be encoded; values above U+10FFFF hit $(D assert(0))
    putc = delegate called once per code unit, in order

 Returns:
    the number of code units passed to $(D putc) (1 to 4)
 */
size_t encode_char(dchar c, encode_putchar putc)
{
    if (c <= 0x7F)
    {
        // Single byte: 0xxxxxxx
        putc(cast(char) c);
        return 1;
    }
    if (c <= 0x7FF)
    {
        // Two bytes: 110xxxxx 10xxxxxx
        putc(cast(char)(0xC0 | (c >> 6)));
        putc(cast(char)(0x80 | (c & 0x3F)));
        return 2;
    }
    if (c <= 0xFFFF)
    {
        // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
        putc(cast(char)(0xE0 | (c >> 12)));
        putc(cast(char)(0x80 | ((c >> 6) & 0x3F)));
        putc(cast(char)(0x80 | (c & 0x3F)));
        return 3;
    }
    if (c <= 0x10FFFF)
    {
        // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        putc(cast(char)(0xF0 | (c >> 18)));
        putc(cast(char)(0x80 | ((c >> 12) & 0x3F)));
        putc(cast(char)(0x80 | ((c >> 6) & 0x3F)));
        putc(cast(char)(0x80 | (c & 0x3F)));
        return 4;
    }
    else
    {
        assert(0);
    }
}

/**
 Encodes $(D c) as UTF-16 and passes each resulting $(D wchar) to $(D putw).

 Values up to U+FFFF are emitted as a single code unit; larger values are
 emitted as a high/low surrogate pair.

 The template parameters $(D E) and $(D R) are not used by the body. They
 are kept, with defaults, so that both $(D encode_wchar(c, dg)) and an
 explicit $(D encode_wchar!(E, R)(c, dg)) compile.

 Params:
    c = the code point to be encoded
    putw = delegate called once per code unit, in order

 Returns:
    the number of code units passed to $(D putw) (1 or 2)
 */
size_t encode_wchar(E = wchar, R = void)(dchar c, encode_putwchar putw)
{
    if (c <= 0xFFFF)
    {
        putw(cast(wchar) c);
        return 1;
    }
    // Offset into the supplementary planes, split into two 10-bit halves.
    immutable uint offset = c - 0x10000;
    putw(cast(wchar) (((offset >> 10) & 0x3FF) + 0xD800));
    putw(cast(wchar) ((offset & 0x3FF) + 0xDC00));
    return 2;
}
---|
1534 |
|
---|
1535 |
/** |
---|
1536 |
Encodes a single code point to a delegate. |
---|
1537 |
|
---|
1538 |
This function encodes a single code point into one or more code units. |
---|
1539 |
The code units are passed one at a time to the supplied delegate. |
---|
1540 |
|
---|
1541 |
The input to this function MUST be a valid code point. |
---|
1542 |
This is enforced by the function's in-contract. |
---|
1543 |
|
---|
1544 |
The type of the output cannot be deduced. Therefore, it is necessary to |
---|
1545 |
explicitly specify the encoding as a template parameter. |
---|
1546 |
|
---|
1547 |
Supercedes: |
---|
1548 |
This function supercedes std.utf.encode(), however, note that the |
---|
1549 |
function codeUnits() supercedes it more conveniently. |
---|
1550 |
|
---|
1551 |
Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252 |
---|
1552 |
|
---|
1553 |
Params: |
---|
1554 |
c = the code point to be encoded |
---|
1555 |
*/ |
---|
1556 |
void encode(E)(dchar c, void delegate(E) dg)
in
{
    assert(isValidCodePoint(c));
}
body
{
    // Delegate to the encoder specialised for code-unit type E.
    alias EncoderInstance!(E) Encoder;
    Encoder.encode(c, dg);
}
---|
1565 |
|
---|
1566 |
/** |
---|
1567 |
Returns a foreachable struct which can bidirectionally iterate over all |
---|
1568 |
code points in a string. |
---|
1569 |
|
---|
1570 |
The input to this function MUST be validly encoded. |
---|
1571 |
This is enforced by the function's in-contract. |
---|
1572 |
|
---|
1573 |
You can foreach either |
---|
1574 |
with or without an index. If an index is specified, it will be initialized |
---|
1575 |
at each iteration with the offset into the string at which the code point |
---|
1576 |
begins. |
---|
1577 |
|
---|
1578 |
Supercedes: |
---|
1579 |
This function supercedes std.utf.decode(). |
---|
1580 |
|
---|
1581 |
Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252 |
---|
1582 |
|
---|
1583 |
Params: |
---|
1584 |
s = the string to be decoded |
---|
1585 |
|
---|
1586 |
Examples: |
---|
1587 |
-------------------------------------------------------- |
---|
1588 |
string s = "hello world"; |
---|
1589 |
foreach(c;codePoints(s)) |
---|
1590 |
{ |
---|
1591 |
// do something with c (which will always be a dchar) |
---|
1592 |
} |
---|
1593 |
-------------------------------------------------------- |
---|
1594 |
|
---|
1595 |
Note that, currently, foreach(c;codePoints(s)) is superior to foreach(c;s) |
---|
1596 |
in that the latter will fall over on encountering U+FFFF. |
---|
1597 |
*/ |
---|
1598 |
CodePoints!(E) codePoints(E)(E[] s)
in
{
    assert(isValid(s));
}
body
{
    // Wrap the string in the iterable struct for its code-unit type.
    alias CodePoints!(E) Iterable;
    return Iterable(s);
}
---|
1607 |
|
---|
1608 |
unittest |
---|
1609 |
{ |
---|
1610 |
string s = "hello"; |
---|
1611 |
string t; |
---|
1612 |
foreach(c;codePoints(s)) |
---|
1613 |
{ |
---|
1614 |
t ~= cast(char)c; |
---|
1615 |
} |
---|
1616 |
assert(s == t); |
---|
1617 |
} |
---|
1618 |
|
---|
1619 |
/** |
---|
1620 |
Returns a foreachable struct which can bidirectionally iterate over all |
---|
1621 |
code units in a code point. |
---|
1622 |
|
---|
1623 |
The input to this function MUST be a valid code point. |
---|
1624 |
This is enforced by the function's in-contract. |
---|
1625 |
|
---|
1626 |
The type of the output cannot be deduced. Therefore, it is necessary to |
---|
1627 |
explicitly specify the encoding type in the template parameter. |
---|
1628 |
|
---|
1629 |
Supercedes: |
---|
1630 |
This function supercedes std.utf.encode(). |
---|
1631 |
|
---|
1632 |
Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252 |
---|
1633 |
|
---|
1634 |
Params: |
---|
1635 |
d = the code point to be encoded |
---|
1636 |
|
---|
1637 |
Examples: |
---|
1638 |
-------------------------------------------------------- |
---|
1639 |
dchar d = '\u20AC'; |
---|
1640 |
foreach(c;codeUnits!(char)(d)) |
---|
1641 |
{ |
---|
1642 |
writefln("%X",c); |
---|
1643 |
} |
---|
1644 |
// will print |
---|
1645 |
// E2 |
---|
1646 |
// 82 |
---|
1647 |
// AC |
---|
1648 |
-------------------------------------------------------- |
---|
1649 |
*/ |
---|
1650 |
CodeUnits!(E) codeUnits(E)(dchar c)
in
{
    assert(isValidCodePoint(c));
}
body
{
    // Wrap the code point in the iterable struct for code-unit type E.
    alias CodeUnits!(E) Iterable;
    return Iterable(c);
}
---|
1659 |
|
---|
1660 |
unittest |
---|
1661 |
{ |
---|
1662 |
char[] a; |
---|
1663 |
foreach(c;codeUnits!(char)(cast(dchar)'\u20AC')) |
---|
1664 |
{ |
---|
1665 |
a ~= c; |
---|
1666 |
} |
---|
1667 |
assert(a.length == 3); |
---|
1668 |
assert(a[0] == 0xE2); |
---|
1669 |
assert(a[1] == 0x82); |
---|
1670 |
assert(a[2] == 0xAC); |
---|
1671 |
} |
---|
1672 |
|
---|
1673 |
/** |
---|
1674 |
Encodes each element of $(D s) in units of type $(D Tgt) and writes the result to |
---|
1675 |
$(D range). Returns the total number of $(D Tgt)s written. |
---|
1676 |
*/ |
---|
1677 |
|
---|
1678 |
uint encode(Tgt, Src, R)(in Src[] s, R range)
{
    // Sum the per-element counts reported by the single-element encode().
    uint written = 0;
    for (size_t i = 0; i < s.length; ++i)
    {
        written += encode!(Tgt)(s[i], range);
    }
    return written;
}
---|
1687 |
|
---|
1688 |
/** |
---|
1689 |
Convert a string from one encoding to another. (See also to!() below). |
---|
1690 |
|
---|
1691 |
The input to this function MUST be validly encoded. |
---|
1692 |
This is enforced by the function's in-contract. |
---|
1693 |
|
---|
1694 |
Supercedes: |
---|
1695 |
This function supercedes std.utf.toUTF8(), std.utf.toUTF16() and |
---|
1696 |
std.utf.toUTF32() |
---|
1697 |
(but note that to!() supercedes it more conveniently). |
---|
1698 |
|
---|
1699 |
Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252 |
---|
1700 |
|
---|
1701 |
Params: |
---|
1702 |
s = the source string |
---|
1703 |
r = the destination string |
---|
1704 |
|
---|
1705 |
Examples: |
---|
1706 |
-------------------------------------------------------- |
---|
1707 |
wstring ws; |
---|
1708 |
transcode("hello world",ws); |
---|
1709 |
// transcode from UTF-8 to UTF-16 |
---|
1710 |
|
---|
1711 |
Latin1String ls; |
---|
1712 |
transcode(ws, ls); |
---|
1713 |
// transcode from UTF-16 to ISO-8859-1 |
---|
1714 |
-------------------------------------------------------- |
---|
1715 |
*/ |
---|
1716 |
void transcode(Src,Dst)(Src[] s,out Dst[] r)
in
{
    assert(isValid(s));
}
body
{
    static if(is(Src==Dst))
    {
        // Same encoding: hand back the source slice itself.
        r = s;
    }
    else static if(is(Src==AsciiChar))
    {
        // Reinterpret the ASCII source as a string and transcode it from char.
        transcode!(char,Dst)(cast(string)s,r);
    }
    else
    {
        // General case: decode one code point at a time and append its
        // encoding in the destination type.
        for (Src[] rest = s; rest.length != 0; )
        {
            dchar codePoint = decode(rest);
            r ~= encode!(Dst)(codePoint);
        }
    }
}
---|
1740 |
|
---|
1741 |
/* |
---|
1742 |
Convert a string from one encoding to another. (See also transcode() above). |
---|
1743 |
|
---|
1744 |
The input to this function MUST be validly encoded. |
---|
1745 |
This is enforced by the function's in-contract. |
---|
1746 |
|
---|
1747 |
Supercedes: |
---|
1748 |
This function supercedes std.utf.toUTF8(), std.utf.toUTF16() and |
---|
1749 |
std.utf.toUTF32(). |
---|
1750 |
|
---|
1751 |
Standards: Unicode 5.0, ASCII, ISO-8859-1, WINDOWS-1252 |
---|
1752 |
|
---|
1753 |
Params: |
---|
1754 |
Dst = the destination encoding type |
---|
1755 |
s = the source string |
---|
1756 |
|
---|
1757 |
Examples: |
---|
1758 |
----------------------------------------------------------------------------- |
---|
1759 |
auto ws = to!(wchar)("hello world"); // transcode from UTF-8 to UTF-16 |
---|
1760 |
auto ls = to!(Latin1Char)(ws); // transcode from UTF-16 to ISO-8859-1 |
---|
1761 |
----------------------------------------------------------------------------- |
---|
1762 |
*/ |
---|
1763 |
// TODO: Commented out for now - to be moved to std.conv |
---|
1764 |
// Dst to(Dst,Src)(immutable(Src)[] s) |
---|
1765 |
// in |
---|
1766 |
// { |
---|
1767 |
// assert(isValid(s)); |
---|
1768 |
// } |
---|
1769 |
// body |
---|
1770 |
// { |
---|
1771 |
// Dst r; |
---|
1772 |
// transcode(s,r); |
---|
1773 |
// return r; |
---|
1774 |
// } |
---|
1775 |
|
---|
1776 |
//============================================================================= |
---|
1777 |
|
---|
1778 |
/** The base class for exceptions thrown by this module */ |
---|
1779 |
class EncodingException : Exception
{
    // Passes the message straight through to Exception.
    this(string msg)
    {
        super(msg);
    }
}
---|
1780 |
|
---|
1781 |
class UnrecognizedEncodingException : EncodingException
{
    // Private constructor: the class can only be instantiated from within
    // this module.
    private this(string msg)
    {
        super(msg);
    }
}
---|
1785 |
|
---|
1786 |
/** Abstract base class of all encoding schemes */ |
---|
1787 |
abstract class EncodingScheme |
---|
1788 |
{ |
---|
1789 |
/** |
---|
1790 |
* Registers a subclass of EncodingScheme. |
---|
1791 |
* |
---|
1792 |
* This function allows user-defined subclasses of EncodingScheme to |
---|
1793 |
* be declared in other modules. |
---|
1794 |
* |
---|
1795 |
* Examples: |
---|
1796 |
* ---------------------------------------------- |
---|
1797 |
* class Amiga1251 : EncodingScheme |
---|
1798 |
* { |
---|
1799 |
* static this() |
---|
1800 |
* { |
---|
1801 |
* EncodingScheme.register("path.to.Amiga1251"); |
---|
1802 |
* } |
---|
1803 |
* } |
---|
1804 |
* ---------------------------------------------- |
---|
1805 |
*/ |
---|
1806 |
static void register(string className)
{
    // ClassInfo.find returns null for an unknown class name; check it
    // before calling create() so the caller gets an EncodingException
    // rather than a null dereference.
    auto info = ClassInfo.find(className);
    if (info is null)
        throw new EncodingException("Unable to create class "~className);

    // create() gives back an Object; the cast yields null if it is not an
    // EncodingScheme (or if no instance could be created).
    auto scheme = cast(EncodingScheme)info.create();
    if (scheme is null)
        throw new EncodingException("Unable to create class "~className);

    // Map every lower-cased name of the scheme to the class that implements it.
    foreach(encodingName;scheme.names())
    {
        supported[tolower(encodingName)] = className;
    }
}
---|
1816 |
|
---|
1817 |
/** |
---|
1818 |
* Obtains a subclass of EncodingScheme which is capable of encoding |
---|
1819 |
* and decoding the named encoding scheme. |
---|
1820 |
* |
---|
1821 |
* This function is only aware of EncodingSchemes which have been |
---|
1822 |
* registered with the register() function. |
---|
1823 |
* |
---|
1824 |
* Examples: |
---|
1825 |
* --------------------------------------------------- |
---|
1826 |
* auto scheme = EncodingScheme.create("Amiga-1251"); |
---|
1827 |
* --------------------------------------------------- |
---|
1828 |
*/ |
---|
1829 |
static EncodingScheme create(string encodingName)
{
    // Look the name up case-insensitively in the registry filled by register().
    auto p = std.string.tolower(encodingName) in supported;
    if (p is null)
        throw new EncodingException("Unrecognized Encoding: "~encodingName);
    string className = *p;

    // ClassInfo.find returns null for an unknown class name; check it
    // before calling create() so the caller gets an EncodingException
    // rather than a null dereference.
    auto info = ClassInfo.find(className);
    if (info is null)
        throw new EncodingException("Unable to create class "~className);

    auto scheme = cast(EncodingScheme)info.create();
    if (scheme is null) throw new EncodingException("Unable to create class "~className);
    return scheme;
}
---|
1839 |
|
---|
1840 |
/** |
---|
1841 |
* Returns the standard name of the encoding scheme |
---|
1842 |
*/ |
---|
1843 |
abstract override string toString(); |
---|
1844 |
|
---|
1845 |
/** |
---|
1846 |
* Returns an array of all known names for this encoding scheme |
---|
1847 |
*/ |
---|
1848 |
abstract string[] names(); |
---|
1849 |
|
---|
1850 |
/** |
---|
1851 |
* Returns true if the character c can be represented |
---|
1852 |
* in this encoding scheme. |
---|
1853 |
*/ |
---|
1854 |
abstract bool canEncode(dchar c); |
---|
1855 |
|
---|
1856 |
/** |
---|
1857 |
* Returns the number of ubytes required to encode this code point. |
---|
1858 |
* |
---|
1859 |
* The input to this function MUST be a valid code point. |
---|
1860 |
* |
---|
1861 |
* Params: |
---|
1862 |
* c = the code point to be encoded |
---|
1863 |
* |
---|
1864 |
* Returns: |
---|
1865 |
* the number of ubytes required. |
---|
1866 |
*/ |
---|
1867 |
abstract uint encodedLength(dchar c); |
---|
1868 |
|
---|
1869 |
/** |
---|
1870 |
* Encodes a single code point into a user-supplied, fixed-size buffer. |
---|
1871 |
* |
---|
1872 |
* This function encodes a single code point into one or more ubytes. |
---|
1873 |
* The supplied buffer must be code unit aligned. |
---|
1874 |
* (For example, UTF-16LE or UTF-16BE must be wchar-aligned, |
---|
1875 |
* UTF-32LE or UTF-32BE must be dchar-aligned, etc.) |
---|
1876 |
* |
---|
1877 |
* The input to this function MUST be a valid code point. |
---|
1878 |
* |
---|
1879 |
* Params: |
---|
1880 |
* c = the code point to be encoded |
---|
1881 |
* |
---|
1882 |
* Returns: |
---|
1883 |
* the number of ubytes written. |
---|
1884 |
*/ |
---|
1885 |
abstract uint encode(dchar c, ubyte[] buffer); |
---|
1886 |
|
---|
1887 |
/** |
---|
1888 |
* Decodes a single code point. |
---|
1889 |
* |
---|
1890 |
* This function removes one or more ubytes from the start of an array, |
---|
1891 |
* and returns the decoded code point which those ubytes represent. |
---|
1892 |
* |
---|
1893 |
* The input to this function MUST be validly encoded. |
---|
1894 |
* |
---|
1895 |
* Params: |
---|
1896 |
* s = the array whose first code point is to be decoded |
---|
1897 |
*/ |
---|
1898 |
abstract dchar decode(ref ubyte[] s); |
---|
1899 |
|
---|
1900 |
/** |
---|
1901 |
* Decodes a single code point. The input does not have to be valid. |
---|
1902 |
* |
---|
1903 |
* This function removes one or more ubytes from the start of an array, |
---|
1904 |
* and returns the decoded code point which those ubytes represent. |
---|
1905 |
* |
---|
1906 |
* This function will accept an invalidly encoded array as input. |
---|
1907 |
* If an invalid sequence is found at the start of the string, this |
---|
1908 |
* function will remove it, and return the value INVALID_SEQUENCE. |
---|
1909 |
* |
---|
1910 |
* Params: |
---|
1911 |
* s = the array whose first code point is to be decoded |
---|
1912 |
*/ |
---|
1913 |
abstract dchar safeDecode(ref ubyte[] s); |
---|
1914 |
|
---|
1915 |
/** |
---|
1916 |
* Returns the sequence of ubytes to be used to represent |
---|
1917 |
* any character which cannot be represented in the encoding scheme. |
---|
1918 |
* |
---|
1919 |
* Normally this will be a representation of some substitution |
---|
1920 |
* character, such as U+FFFD or '?'. |
---|
1921 |
*/ |
---|
1922 |
abstract ubyte[] replacementSequence(); |
---|
1923 |
|
---|
1924 |
|
---|
1925 |
/**
 * Reports whether an array is correctly encoded in this scheme.
 *
 * The array is decoded sequence by sequence with safeDecode; the first
 * INVALID_SEQUENCE result makes the whole array invalid. An empty array
 * is valid.
 *
 * Params:
 *    s = the array to be tested
 *
 * Returns: true if every sequence decodes successfully, false otherwise
 */
bool isValid(ubyte[] s)
{
    bool wellFormed = true;
    // s is a by-value slice, so consuming it here does not affect the caller.
    while (wellFormed && s.length != 0)
    {
        wellFormed = (safeDecode(s) != INVALID_SEQUENCE);
    }
    return wellFormed;
}
---|
1941 |
|
---|
1942 |
/**
 * Measures the longest validly encoded prefix of an array.
 *
 * Params:
 *    s = the array to be tested
 *
 * Returns: the length, in ubytes, of the longest substring that starts at
 *    the first element and is validly encoded
 */
uint validLength(ubyte[] s)
{
    auto originalLength = s.length;
    // Length of what is left after the last successfully decoded sequence.
    auto remainingAfterGood = s.length;

    // safeDecode consumes one sequence per call; stop at the first bad one.
    while (s.length != 0 && safeDecode(s) != INVALID_SEQUENCE)
    {
        remainingAfterGood = s.length;
    }
    return originalLength - remainingAfterGood;
}
---|
1960 |
|
---|
1961 |
/**
 * Produces a validly encoded version of an array by substituting every
 * malformed ubyte sequence with this scheme's replacement sequence.
 *
 * When the input is already valid it is returned as is (no copy is made).
 * Otherwise a new array is built in which each illegal sequence has been
 * replaced by replacementSequence.
 *
 * Params:
 *    s = the string to be sanitized
 *
 * Returns: the original array if it was valid, else a newly allocated,
 *    sanitized array
 */
ubyte[] sanitize(ubyte[] s)
{
    uint goodPrefix = validLength(s);
    if (goodPrefix == s.length)
        return s;

    auto substitute = replacementSequence;

    // Pass 1: compute an upper bound for the output size. Each invalid
    // sequence adds one replacement; the removed bytes are not subtracted,
    // so this overestimates, which is harmless.
    uint capacity = s.length;
    for (ubyte[] scan = s[goodPrefix..$];
         scan.length != 0;
         scan = scan[validLength(scan)..$])
    {
        // scan always starts at an invalid sequence here.
        dchar bad = safeDecode(scan);
        assert(bad == INVALID_SEQUENCE);
        capacity += substitute.length;
    }

    // Pass 2: copy the valid prefix, then alternate between writing a
    // replacement and copying the valid run that follows it.
    ubyte[] result = new ubyte[capacity];
    result[0..goodPrefix] = s[0..goodPrefix];
    uint pos = goodPrefix;

    ubyte[] rest = s[goodPrefix..$];
    while (rest.length != 0)
    {
        dchar bad = safeDecode(rest);
        assert(bad == INVALID_SEQUENCE);

        result[pos .. pos+substitute.length] = substitute[];
        pos += substitute.length;

        uint run = validLength(rest);
        result[pos..pos+run] = rest[0..run];
        pos += run;
        rest = rest[run..$];
    }

    // Trim the overestimated buffer to what was actually written.
    return cast(ubyte[])result[0..pos];
}
---|
2011 |
|
---|
2012 |
/**
 * Measures the first encoded sequence of an array.
 *
 * The array MUST be non-empty and MUST begin with a validly encoded
 * sequence; the in-contract checks both conditions.
 *
 * Params:
 *    s = the array to be sliced
 *
 * Returns: the number of ubytes occupied by the first encoded sequence
 */
uint firstSequence(ubyte[] s)
in
{
    assert(s.length != 0);
    // Probe a copy of the slice so the check does not consume the input.
    ubyte[] probe = s;
    assert(safeDecode(probe) != INVALID_SEQUENCE);
}
body
{
    auto lengthBefore = s.length;
    decode(s);
    // decode shortened s by exactly the size of the first sequence.
    return lengthBefore - s.length;
}
---|
2034 |
|
---|
2035 |
/**
 * Counts the code points encoded in a ubyte array.
 *
 * The array MUST be validly encoded; the in-contract checks this with
 * isValid.
 *
 * Params:
 *    s = the string to be counted
 *
 * Returns: the total number of code points in s
 */
uint count(ubyte[] s)
in
{
    assert(isValid(s));
}
body
{
    uint total = 0;
    // Each decode call consumes one code point from the front of s.
    for (; s.length != 0; ++total)
    {
        decode(s);
    }
    return total;
}
---|
2059 |
|
---|
2060 |
/**
 * Finds the array index at which the (n+1)th code point begins.
 *
 * The array MUST be validly encoded and n MUST NOT be negative; the
 * in-contract checks both conditions.
 *
 * Params:
 *    s = the string to be indexed
 *    n = the number of code points to step over from the start of s
 *
 * Returns: the offset, in ubytes, reached after decoding n code points
 */
int index(ubyte[] s,int n)
in
{
    assert(isValid(s));
    assert(n >= 0);
}
body
{
    auto lengthBefore = s.length;
    uint decoded = 0;
    while (decoded < n)
    {
        decode(s);
        ++decoded;
    }
    // The number of ubytes consumed is the index of the next code point.
    return lengthBefore - s.length;
}
---|
2081 |
|
---|
2082 |
static string[string] supported; |
---|
2083 |
} |
---|
2084 |
|
---|
2085 |
/**
 EncodingScheme to handle ASCII

 This scheme recognises the following names:
                 "ANSI_X3.4-1968",
                 "ANSI_X3.4-1986",
                 "ASCII",
                 "IBM367",
                 "ISO646-US",
                 "ISO_646.irv:1991",
                 "US-ASCII",
                 "cp367",
                 "csASCII",
                 "iso-ir-6",
                 "us"
 */
class EncodingSchemeASCII : EncodingScheme
{
    /// Registers this class with EncodingScheme under its qualified name.
    static this()
    {
        EncodingScheme.register("std2.encoding.EncodingSchemeASCII");
    }

    /// Returns the names this scheme recognises.
    override string[] names()
    {
        return
        [
            // The cast on the first element fixes the element type of the
            // array literal to string.
            cast(string)
            "ANSI_X3.4-1968",
            "ANSI_X3.4-1986",
            "ASCII",
            "IBM367",
            "ISO646-US",
            "ISO_646.irv:1991",
            "US-ASCII",
            "cp367",
            // The comma after "csASCII" is required: without it the two
            // adjacent literals are joined into "csASCIIiso-ir-6".
            "csASCII",
            "iso-ir-6",
            "us"
        ];
    }

    /// Returns "ASCII".
    override string toString()
    {
        return "ASCII";
    }

    /// Forwards to the compile-time canEncode for AsciiChar.
    override bool canEncode(dchar c)
    {
        return std2.encoding.canEncode!(AsciiChar)(c);
    }

    /// Forwards to the compile-time encodedLength for AsciiChar.
    override uint encodedLength(dchar c)
    {
        return std2.encoding.encodedLength!(AsciiChar)(c);
    }

    /// Encodes c into buffer, viewed as AsciiChar[], and returns the
    /// result of the compile-time encode.
    override uint encode(dchar c, ubyte[] buffer)
    {
        auto r = cast(AsciiChar[])buffer;
        return std2.encoding.encode(c,r);
    }

    /// Decodes one code point from the front of s; s is then re-sliced
    /// to the length the decoder left unconsumed.
    override dchar decode(ref ubyte[] s)
    {
        auto t = cast(AsciiChar[]) s;
        dchar c = std2.encoding.decode(t);
        // One AsciiChar is one ubyte, so t.length is also a byte count.
        s = s[$-t.length..$];
        return c;
    }

    /// Same as decode, but goes through the compile-time safeDecode.
    override dchar safeDecode(ref ubyte[] s)
    {
        auto t = cast(AsciiChar[]) s;
        dchar c = std2.encoding.safeDecode(t);
        s = s[$-t.length..$];
        return c;
    }

    /// Returns the single ubyte '?' as the replacement sequence.
    override ubyte[] replacementSequence()
    {
        return cast(ubyte[])"?";
    }

}
---|
2171 |
|
---|
2172 |
/**
 EncodingScheme to handle Latin-1

 This scheme recognises the following names:
                 "CP819",
                 "IBM819",
                 "ISO-8859-1",
                 "ISO_8859-1",
                 "ISO_8859-1:1987",
                 "csISOLatin1",
                 "iso-ir-100",
                 "l1",
                 "latin1"
 */
class EncodingSchemeLatin1 : EncodingScheme
{
    /// Registers this class with EncodingScheme under its qualified name.
    static this()
    {
        EncodingScheme.register("std2.encoding.EncodingSchemeLatin1");
    }

    /// Returns the names this scheme recognises.
    override string[] names()
    {
        // The cast on the first element fixes the literal's element type.
        return
        [
            cast(string) "CP819",
            "IBM819",
            "ISO-8859-1",
            "ISO_8859-1",
            "ISO_8859-1:1987",
            "csISOLatin1",
            "iso-ir-100",
            "l1",
            "latin1"
        ];
    }

    /// Returns "ISO-8859-1".
    override string toString()
    {
        return "ISO-8859-1";
    }

    /// Forwards to the compile-time canEncode for Latin1Char.
    override bool canEncode(dchar c)
    {
        return std2.encoding.canEncode!(Latin1Char)(c);
    }

    /// Forwards to the compile-time encodedLength for Latin1Char.
    override uint encodedLength(dchar c)
    {
        return std2.encoding.encodedLength!(Latin1Char)(c);
    }

    /// Encodes c into buffer, viewed as Latin1Char[].
    override uint encode(dchar c, ubyte[] buffer)
    {
        auto target = cast(Latin1Char[])buffer;
        return std2.encoding.encode(c,target);
    }

    /// Decodes one code point from the front of s; s is then re-sliced
    /// to the length the decoder left unconsumed.
    override dchar decode(ref ubyte[] s)
    {
        auto units = cast(Latin1Char[]) s;
        dchar codePoint = std2.encoding.decode(units);
        s = s[s.length - units.length .. s.length];
        return codePoint;
    }

    /// Same as decode, but goes through the compile-time safeDecode.
    override dchar safeDecode(ref ubyte[] s)
    {
        auto units = cast(Latin1Char[]) s;
        dchar codePoint = std2.encoding.safeDecode(units);
        s = s[s.length - units.length .. s.length];
        return codePoint;
    }

    /// Returns the single ubyte '?' as the replacement sequence.
    override ubyte[] replacementSequence()
    {
        return cast(ubyte[])"?";
    }
}
---|
2252 |
|
---|
2253 |
/**
 EncodingScheme to handle Windows-1252

 This scheme recognises the following names:
                 "windows-1252"
 */
class EncodingSchemeWindows1252 : EncodingScheme
{
    /// Registers this class with EncodingScheme under its qualified name.
    static this()
    {
        EncodingScheme.register("std2.encoding.EncodingSchemeWindows1252");
    }

    /// Returns the names this scheme recognises.
    override string[] names()
    {
        return [ cast(string) "windows-1252" ];
    }

    /// Returns "windows-1252".
    override string toString()
    {
        return "windows-1252";
    }

    /// Forwards to the compile-time canEncode for Windows1252Char.
    override bool canEncode(dchar c)
    {
        return std2.encoding.canEncode!(Windows1252Char)(c);
    }

    /// Forwards to the compile-time encodedLength for Windows1252Char.
    override uint encodedLength(dchar c)
    {
        return std2.encoding.encodedLength!(Windows1252Char)(c);
    }

    /// Encodes c into buffer, viewed as Windows1252Char[].
    override uint encode(dchar c, ubyte[] buffer)
    {
        auto target = cast(Windows1252Char[])buffer;
        return std2.encoding.encode(c,target);
    }

    /// Decodes one code point from the front of s; s is then re-sliced
    /// to the length the decoder left unconsumed.
    override dchar decode(ref ubyte[] s)
    {
        auto units = cast(Windows1252Char[]) s;
        dchar codePoint = std2.encoding.decode(units);
        s = s[s.length - units.length .. s.length];
        return codePoint;
    }

    /// Same as decode, but goes through the compile-time safeDecode.
    override dchar safeDecode(ref ubyte[] s)
    {
        auto units = cast(Windows1252Char[]) s;
        dchar codePoint = std2.encoding.safeDecode(units);
        s = s[s.length - units.length .. s.length];
        return codePoint;
    }

    /// Returns the single ubyte '?' as the replacement sequence.
    override ubyte[] replacementSequence()
    {
        return cast(ubyte[])"?";
    }
}
---|
2317 |
|
---|
2318 |
/**
 EncodingScheme to handle UTF-8

 This scheme recognises the following names:
                 "UTF-8"
 */
class EncodingSchemeUtf8 : EncodingScheme
{
    /// Registers this class with EncodingScheme under its qualified name.
    static this()
    {
        EncodingScheme.register("std2.encoding.EncodingSchemeUtf8");
    }

    /// Returns the names this scheme recognises.
    override string[] names()
    {
        return [ cast(string) "UTF-8" ];
    }

    /// Returns "UTF-8".
    override string toString()
    {
        return "UTF-8";
    }

    /// Forwards to the compile-time canEncode for char.
    override bool canEncode(dchar c)
    {
        return std2.encoding.canEncode!(char)(c);
    }

    /// Forwards to the compile-time encodedLength for char.
    override uint encodedLength(dchar c)
    {
        return std2.encoding.encodedLength!(char)(c);
    }

    /// Encodes c into buffer, viewed as char[].
    override uint encode(dchar c, ubyte[] buffer)
    {
        auto target = cast(char[])buffer;
        return std2.encoding.encode(c,target);
    }

    /// Decodes one code point from the front of s; s is then re-sliced
    /// to the length the decoder left unconsumed.
    override dchar decode(ref ubyte[] s)
    {
        auto units = cast(char[]) s;
        dchar codePoint = std2.encoding.decode(units);
        s = s[s.length - units.length .. s.length];
        return codePoint;
    }

    /// Same as decode, but goes through the compile-time safeDecode.
    override dchar safeDecode(ref ubyte[] s)
    {
        auto units = cast(char[]) s;
        dchar codePoint = std2.encoding.safeDecode(units);
        s = s[s.length - units.length .. s.length];
        return codePoint;
    }

    /// Returns the bytes of the string literal "\uFFFD" as the
    /// replacement sequence.
    override ubyte[] replacementSequence()
    {
        return cast(ubyte[])"\uFFFD";
    }

}
---|
2383 |
|
---|
2384 |
/**
 EncodingScheme to handle UTF-16 in native byte order

 This scheme recognises the following names:
                 "UTF-16LE" (little-endian architecture only)
                 "UTF-16BE" (big-endian architecture only)
 */
class EncodingSchemeUtf16Native : EncodingScheme
{
    /// Registers this class with EncodingScheme under its qualified name.
    static this()
    {
        EncodingScheme.register("std2.encoding.EncodingSchemeUtf16Native");
    }

    // The scheme's name depends on the byte order of the target.
    version(LittleEndian) { string NAME = "UTF-16LE"; }
    version(BigEndian)    { string NAME = "UTF-16BE"; }

    /// Returns the single name this scheme recognises on this platform.
    override string[] names()
    {
        return [ NAME ];
    }

    /// Returns NAME.
    override string toString()
    {
        return NAME;
    }

    /// Forwards to the compile-time canEncode for wchar.
    override bool canEncode(dchar c)
    {
        return std2.encoding.canEncode!(wchar)(c);
    }

    /// Forwards to the compile-time encodedLength for wchar.
    override uint encodedLength(dchar c)
    {
        return std2.encoding.encodedLength!(wchar)(c);
    }

    /// Encodes c into buffer, viewed as wchar[]. The result of the
    /// compile-time encode is scaled by wchar.sizeof to give ubytes.
    override uint encode(dchar c, ubyte[] buffer)
    {
        auto r = cast(wchar[])buffer;
        return wchar.sizeof * std2.encoding.encode(c,r);
    }

    /// Decodes one code point from the front of s and removes the
    /// consumed ubytes from s. The length of s must be even.
    override dchar decode(ref ubyte[] s)
    in
    {
        assert((s.length & 1) == 0);
    }
    body
    {
        auto t = cast(wchar[]) s;
        dchar c = std2.encoding.decode(t);
        // t.length counts wchars while s is indexed in ubytes, so the
        // remaining length must be scaled before re-slicing s.
        s = s[$ - t.length * wchar.sizeof .. $];
        return c;
    }

    /// Same as decode, but goes through the compile-time safeDecode.
    /// The length of s must be even.
    override dchar safeDecode(ref ubyte[] s)
    in
    {
        assert((s.length & 1) == 0);
    }
    body
    {
        auto t = cast(wchar[]) s;
        dchar c = std2.encoding.safeDecode(t);
        // Convert the remaining wchar count to a ubyte count (see decode).
        s = s[$ - t.length * wchar.sizeof .. $];
        return c;
    }

    /// Returns the bytes of the wchar string literal "\uFFFD"w as the
    /// replacement sequence.
    override ubyte[] replacementSequence()
    {
        return cast(ubyte[])"\uFFFD"w;
    }
}
---|
2458 |
|
---|
2459 |
/**
 EncodingScheme to handle UTF-32 in native byte order

 This scheme recognises the following names:
                 "UTF-32LE" (little-endian architecture only)
                 "UTF-32BE" (big-endian architecture only)
 */
class EncodingSchemeUtf32Native : EncodingScheme
{
    /// Registers this class with EncodingScheme under its qualified name.
    static this()
    {
        EncodingScheme.register("std2.encoding.EncodingSchemeUtf32Native");
    }


    // The scheme's name depends on the byte order of the target.
    version(LittleEndian) { string NAME = "UTF-32LE"; }
    version(BigEndian)    { string NAME = "UTF-32BE"; }

    /// Returns the single name this scheme recognises on this platform.
    override string[] names()
    {
        return [ NAME ];
    }

    /// Returns NAME.
    override string toString()
    {
        return NAME;
    }

    /// Forwards to the compile-time canEncode for dchar.
    override bool canEncode(dchar c)
    {
        return std2.encoding.canEncode!(dchar)(c);
    }

    /// Forwards to the compile-time encodedLength for dchar.
    override uint encodedLength(dchar c)
    {
        return std2.encoding.encodedLength!(dchar)(c);
    }

    /// Encodes c into buffer, viewed as dchar[]. The result of the
    /// compile-time encode is scaled by dchar.sizeof to give ubytes.
    override uint encode(dchar c, ubyte[] buffer)
    {
        auto r = cast(dchar[])buffer;
        return dchar.sizeof * std2.encoding.encode(c,r);
    }

    /// Decodes one code point from the front of s and removes the
    /// consumed ubytes from s. The length of s must be a multiple of 4.
    override dchar decode(ref ubyte[] s)
    in
    {
        assert((s.length & 3) == 0);
    }
    body
    {
        auto t = cast(dchar[]) s;
        dchar c = std2.encoding.decode(t);
        // t.length counts dchars while s is indexed in ubytes, so the
        // remaining length must be scaled before re-slicing s.
        s = s[$ - t.length * dchar.sizeof .. $];
        return c;
    }

    /// Same as decode, but goes through the compile-time safeDecode.
    /// The length of s must be a multiple of 4.
    override dchar safeDecode(ref ubyte[] s)
    in
    {
        assert((s.length & 3) == 0);
    }
    body
    {
        auto t = cast(dchar[]) s;
        dchar c = std2.encoding.safeDecode(t);
        // Convert the remaining dchar count to a ubyte count (see decode).
        s = s[$ - t.length * dchar.sizeof .. $];
        return c;
    }

    /// Returns the bytes of the dchar string literal "\uFFFD"d as the
    /// replacement sequence.
    override ubyte[] replacementSequence()
    {
        return cast(ubyte[])"\uFFFD"d;
    }
}
---|
2534 |
|
---|
2535 |
/*
 * Test helper: transcodes s into r while iterating over the source from
 * its last code point to its first.
 *
 * Params:
 *    s = the source string
 *    r = receives the transcoded string (reset on entry, as it is an
 *        out parameter)
 */
void transcodeReverse(Src,Dst)(Src[] s, out Dst[] r)
{
    static if(is(Src==Dst))
    {
        // Source and destination encodings are the same, so nothing needs
        // converting. The function returns void, so the result has to be
        // delivered through r rather than with a return statement.
        r = s;
    }
    else static if(is(Src==AsciiChar))
    {
        // Handle ASCII input by treating it as a char string.
        transcodeReverse!(char,Dst)(cast(string)s,r);
    }
    else
    {
        // Both loops run backwards and each code unit is prepended, so
        // the units end up in r in their original forward order.
        foreach_reverse(d;codePoints(s))
        {
            foreach_reverse(c;codeUnits!(Dst)(d))
            {
                r = c ~ r;
            }
        }
    }
}
---|
2556 |
|
---|
2557 |
/*
 * Renders a char string as a double-quoted literal for diagnostics.
 * Code units from 0x20 up to (but excluding) 0x80 are copied as they are;
 * every other code unit is written as \x followed by two hex digits.
 */
string makeReadable(string s)
{
    string text = "\"";
    foreach(char unit;s)
    {
        bool needsEscape = (unit < 0x20 || unit >= 0x80);
        if (needsEscape)
        {
            text ~= "\\x";
            // High nibble first, then low nibble.
            for (int shift = 4; shift >= 0; shift -= 4)
            {
                text ~= toHexDigit(unit >> shift);
            }
        }
        else
        {
            text ~= unit;
        }
    }
    text ~= "\"";
    return text;
}
---|
2576 |
|
---|
2577 |
/*
 * Renders a wchar string as a double-quoted literal with a w suffix.
 * Code units from 0x20 up to (but excluding) 0x80 are copied as chars;
 * every other code unit is written as \u followed by four hex digits.
 */
string makeReadable(wstring s)
{
    string text = "\"";
    foreach(wchar unit;s)
    {
        bool needsEscape = (unit < 0x20 || unit >= 0x80);
        if (needsEscape)
        {
            text ~= "\\u";
            // Most significant nibble first.
            for (int shift = 12; shift >= 0; shift -= 4)
            {
                text ~= toHexDigit(unit >> shift);
            }
        }
        else
        {
            text ~= cast(char) unit;
        }
    }
    text ~= "\"w";
    return text;
}
---|
2598 |
|
---|
2599 |
/*
 * Renders a dchar string as a double-quoted literal with a d suffix.
 * Code points from 0x20 up to (but excluding) 0x80 are copied as chars,
 * other code points below 0x10000 are written as \u plus four hex digits,
 * and the remainder as \U00 plus six hex digits.
 */
string makeReadable(dstring s)
{
    string text = "\"";
    foreach(dchar point; s)
    {
        if (point >= 0x10000)
        {
            // Eight digits in total: a fixed "00" and six computed ones.
            text ~= "\\U00";
            for (int shift = 20; shift >= 0; shift -= 4)
            {
                text ~= toHexDigit(point >> shift);
            }
        }
        else if (point < 0x20 || point >= 0x80)
        {
            text ~= "\\u";
            for (int shift = 12; shift >= 0; shift -= 4)
            {
                text ~= toHexDigit(point >> shift);
            }
        }
        else
        {
            text ~= cast(char) point;
        }
    }
    text ~= "\"d";
    return text;
}
---|
2630 |
|
---|
2631 |
/*
 * Converts the low four bits of n to an upper-case hexadecimal digit.
 * All higher bits of n are ignored.
 */
char toHexDigit(int n)
{
    int nibble = n & 0xF;
    if (nibble < 10)
        return cast(char)('0' + nibble);
    return cast(char)('A' + (nibble - 10));
}
---|
2635 |
|
---|
2636 |
|
---|
2637 |
// Exercises validation, sanitization, transcoding (forward and reverse),
// the non-UTF encodings, encodedLength and encode.
unittest
{
    void TestEncoding()
    {
        ubyte[][] validStrings =
        [
            // Plain ASCII
            cast(ubyte[])"hello",

            // First possible sequence of a certain length
            [ 0x00 ],                       // U+00000000   one byte
            [ 0xC2, 0x80 ],                 // U+00000080   two bytes
            [ 0xE0, 0xA0, 0x80 ],           // U+00000800   three bytes
            [ 0xF0, 0x90, 0x80, 0x80 ],     // U+00010000   four bytes

            // Last possible sequence of a certain length
            [ 0x7F ],                       // U+0000007F   one byte
            [ 0xDF, 0xBF ],                 // U+000007FF   two bytes
            [ 0xEF, 0xBF, 0xBF ],           // U+0000FFFF   three bytes

            // Other boundary conditions
            [ 0xED, 0x9F, 0xBF ],
            // U+0000D7FF   Last character before surrogates
            [ 0xEE, 0x80, 0x80 ],
            // U+0000E000   First character after surrogates
            [ 0xEF, 0xBF, 0xBD ],
            // U+0000FFFD   Unicode replacement character
            [ 0xF4, 0x8F, 0xBF, 0xBF ],
            // U+0010FFFF   Very last character

            // Non-character code points
            /*  NOTE: These are legal in UTF, and may be converted from
                one UTF to another, however they do not represent Unicode
                characters. These code points have been reserved by
                Unicode as non-character code points. They are permissible
                for data exchange within an application, but they are
                not permitted to be used as characters. Since this module
                deals with UTF, and not with Unicode per se, we choose to
                accept them here. */
            [ 0xEF, 0xBF, 0xBE ],           // U+0000FFFE
            [ 0xEF, 0xBF, 0xBF ],           // U+0000FFFF
        ];


        ubyte[][] invalidStrings =
        [
            // First possible sequence of a certain length, but greater
            // than U+10FFFF
            [ 0xF8, 0x88, 0x80, 0x80, 0x80 ],           // U+00200000   five bytes
            [ 0xFC, 0x84, 0x80, 0x80, 0x80, 0x80 ],     // U+04000000   six bytes

            // Last possible sequence of a certain length, but greater than U+10FFFF
            [ 0xF7, 0xBF, 0xBF, 0xBF ],                 // U+001FFFFF   four bytes
            [ 0xFB, 0xBF, 0xBF, 0xBF, 0xBF ],           // U+03FFFFFF   five bytes
            [ 0xFD, 0xBF, 0xBF, 0xBF, 0xBF, 0xBF ],     // U+7FFFFFFF   six bytes

            // Other boundary conditions
            [ 0xF4, 0x90, 0x80, 0x80 ],                 // U+00110000
                                                        // First code
                                                        // point after
                                                        // last character

            // Unexpected continuation bytes
            [ 0x80 ],
            [ 0xBF ],
            [ 0x20, 0x80, 0x20 ],
            [ 0x20, 0xBF, 0x20 ],
            [ 0x80, 0x9F, 0xA0 ],

            // Lonely start bytes
            [ 0xC0 ],
            [ 0xCF ],
            [ 0x20, 0xC0, 0x20 ],
            [ 0x20, 0xCF, 0x20 ],
            [ 0xD0 ],
            [ 0xDF ],
            [ 0x20, 0xD0, 0x20 ],
            [ 0x20, 0xDF, 0x20 ],
            [ 0xE0 ],
            [ 0xEF ],
            [ 0x20, 0xE0, 0x20 ],
            [ 0x20, 0xEF, 0x20 ],
            [ 0xF0 ],
            [ 0xF1 ],
            [ 0xF2 ],
            [ 0xF3 ],
            [ 0xF4 ],
            [ 0xF5 ],   // If this were legal it would start a character > U+10FFFF
            [ 0xF6 ],   // If this were legal it would start a character > U+10FFFF
            [ 0xF7 ],   // If this were legal it would start a character > U+10FFFF

            [ 0xEF, 0xBF ],             // Three byte sequence with third byte missing
            [ 0xF7, 0xBF, 0xBF ],       // Four byte sequence with fourth byte missing
            [ 0xEF, 0xBF, 0xF7, 0xBF, 0xBF ],   // Concatenation of the above

            // Impossible bytes
            [ 0xF8 ],
            [ 0xF9 ],
            [ 0xFA ],
            [ 0xFB ],
            [ 0xFC ],
            [ 0xFD ],
            [ 0xFE ],
            [ 0xFF ],
            [ 0x20, 0xF8, 0x20 ],
            [ 0x20, 0xF9, 0x20 ],
            [ 0x20, 0xFA, 0x20 ],
            [ 0x20, 0xFB, 0x20 ],
            [ 0x20, 0xFC, 0x20 ],
            [ 0x20, 0xFD, 0x20 ],
            [ 0x20, 0xFE, 0x20 ],
            [ 0x20, 0xFF, 0x20 ],

            // Overlong sequences, all representing U+002F
            /*  With a safe UTF-8 decoder, all of the following five overlong
                representations of the ASCII character slash ("/") should be
                rejected like a malformed UTF-8 sequence */
            [ 0xC0, 0xAF ],
            [ 0xE0, 0x80, 0xAF ],
            [ 0xF0, 0x80, 0x80, 0xAF ],
            [ 0xF8, 0x80, 0x80, 0x80, 0xAF ],
            [ 0xFC, 0x80, 0x80, 0x80, 0x80, 0xAF ],

            // Maximum overlong sequences
            /*  Below you see the highest Unicode value that is still resulting in
                an overlong sequence if represented with the given number of bytes.
                This is a boundary test for safe UTF-8 decoders. All five
                characters should be rejected like malformed UTF-8 sequences. */
            [ 0xC1, 0xBF ],                             // U+0000007F
            [ 0xE0, 0x9F, 0xBF ],                       // U+000007FF
            [ 0xF0, 0x8F, 0xBF, 0xBF ],                 // U+0000FFFF
            [ 0xF8, 0x87, 0xBF, 0xBF, 0xBF ],           // U+001FFFFF
            [ 0xFC, 0x83, 0xBF, 0xBF, 0xBF, 0xBF ],     // U+03FFFFFF

            // Overlong representation of the NUL character
            /*  The following five sequences should also be rejected like malformed
                UTF-8 sequences and should not be treated like the ASCII NUL
                character. */
            [ 0xC0, 0x80 ],
            [ 0xE0, 0x80, 0x80 ],
            [ 0xF0, 0x80, 0x80, 0x80 ],
            [ 0xF8, 0x80, 0x80, 0x80, 0x80 ],
            [ 0xFC, 0x80, 0x80, 0x80, 0x80, 0x80 ],

            // Illegal code positions
            /*  The following UTF-8 sequences should be rejected like malformed
                sequences, because they never represent valid ISO 10646 characters
                and a UTF-8 decoder that accepts them might introduce security
                problems comparable to overlong UTF-8 sequences. */
            [ 0xED, 0xA0, 0x80 ],       // U+D800
            [ 0xED, 0xAD, 0xBF ],       // U+DB7F
            [ 0xED, 0xAE, 0x80 ],       // U+DB80
            [ 0xED, 0xAF, 0xBF ],       // U+DBFF
            [ 0xED, 0xB0, 0x80 ],       // U+DC00
            [ 0xED, 0xBE, 0x80 ],       // U+DF80
            [ 0xED, 0xBF, 0xBF ],       // U+DFFF
        ];

        // Expected result of sanitize for each entry of invalidStrings,
        // in the same order.
        string[] sanitizedStrings =
        [
            "\uFFFD","\uFFFD",
            "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD"," \uFFFD ",
            " \uFFFD ","\uFFFD\uFFFD\uFFFD","\uFFFD","\uFFFD"," \uFFFD "," \uFFFD ",
            "\uFFFD","\uFFFD"," \uFFFD "," \uFFFD ","\uFFFD","\uFFFD"," \uFFFD ",
            " \uFFFD ","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
            "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD\uFFFD","\uFFFD","\uFFFD",
            "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD"," \uFFFD ",
            " \uFFFD "," \uFFFD "," \uFFFD "," \uFFFD "," \uFFFD "," \uFFFD ",
            " \uFFFD ","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
            "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
            "\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD","\uFFFD",
        ];

        // Make sure everything that should be valid, is
        foreach(a;validStrings)
        {
            string s = cast(string)a;
            assert(isValid(s),"Failed to validate: "~makeReadable(s));
        }

        // Make sure everything that shouldn't be valid, isn't
        foreach(a;invalidStrings)
        {
            string s = cast(string)a;
            assert(!isValid(s),"Incorrectly validated: "~makeReadable(s));
        }

        // Make sure we can sanitize everything bad
        assert(invalidStrings.length == sanitizedStrings.length);
        for(int i=0; i<invalidStrings.length; ++i)
        {
            string s = cast(string)invalidStrings[i];
            string t = sanitize(s);
            assert(isValid(t));
            assert(t == sanitizedStrings[i]);
            // Sanitized output is valid, so it joins the transcoding tests.
            ubyte[] u = cast(ubyte[])t;
            validStrings ~= u;
        }

        // Make sure all transcodings work in both directions, using both forward
        // and reverse iteration
        foreach(i,a; validStrings)
        {
            string s = cast(string)a;
            string s2;
            wstring ws, ws2;
            dstring ds, ds2;

            transcode(s,ws);
            assert(isValid(ws));
            transcode(ws,s2);
            assert(s == s2);

            transcode(s,ds);
            assert(isValid(ds));
            transcode(ds,s2);
            assert(s == s2);

            transcode(ws,s);
            assert(isValid(s));
            transcode(s,ws2);
            assert(ws == ws2);

            transcode(ws,ds);
            assert(isValid(ds));
            transcode(ds,ws2);
            assert(ws == ws2);

            transcode(ds,s);
            assert(isValid(s));
            transcode(s,ds2);
            assert(ds == ds2);

            transcode(ds,ws);
            assert(isValid(ws));
            transcode(ws,ds2);
            assert(ds == ds2);

            transcodeReverse(s,ws);
            assert(isValid(ws));
            transcodeReverse(ws,s2);
            assert(s == s2);

            transcodeReverse(s,ds);
            assert(isValid(ds));
            transcodeReverse(ds,s2);
            assert(s == s2);

            transcodeReverse(ws,s);
            assert(isValid(s));
            transcodeReverse(s,ws2);
            assert(ws == ws2);

            transcodeReverse(ws,ds);
            assert(isValid(ds));
            transcodeReverse(ds,ws2);
            assert(ws == ws2);

            transcodeReverse(ds,s);
            assert(isValid(s));
            transcodeReverse(s,ds2);
            assert(ds == ds2);

            transcodeReverse(ds,ws);
            assert(isValid(ws));
            transcodeReverse(ws,ds2);
            assert(ds == ds2);
        }

        // Make sure the non-UTF encodings work too
        {
            auto s = "\u20AC100";
            Windows1252String t;
            transcode(s,t);
            assert(t == [cast(Windows1252Char)0x80, '1', '0', '0']);
            string u;
            transcode(s,u);
            assert(s == u);
            Latin1String v;
            transcode(s,v);
            assert(cast(string)v == "?100");
            AsciiString w;
            transcode(v,w);
            assert(cast(string)w == "?100");
        }

        // Make sure we can count properly
        {
            assert(encodedLength!(char)('A') == 1);
            assert(encodedLength!(char)('\u00E3') == 2);
            assert(encodedLength!(char)('\u2028') == 3);
            assert(encodedLength!(char)('\U0010FFF0') == 4);
            assert(encodedLength!(wchar)('A') == 1);
            assert(encodedLength!(wchar)('\U0010FFF0') == 2);
        }

        // Make sure we can write into mutable arrays
        {
            char[4] buffer;
            uint n = encode(cast(dchar)'\u00E3',buffer);
            assert(n == 2);
            assert(buffer[0] == 0xC3);
            assert(buffer[1] == 0xA3);
        }
    }
    TestEncoding();

}
---|
2945 |
// Only compiled when the unittest_report version identifier is set:
// a unittest block that prints a confirmation line for this module.
version (unittest_report)
{
    import std.stdio;

    unittest
    {
        writefln("unittest std2.encoding passed");
    }
}
---|
2952 |
//============================================================================= |
---|