Utf.inl
1 
2 //
3 // SFML - Simple and Fast Multimedia Library
4 // Copyright (C) 2007-2012 Laurent Gomila (laurent.gom@gmail.com)
5 //
6 // This software is provided 'as-is', without any express or implied warranty.
7 // In no event will the authors be held liable for any damages arising from the use of this software.
8 //
9 // Permission is granted to anyone to use this software for any purpose,
10 // including commercial applications, and to alter it and redistribute it freely,
11 // subject to the following restrictions:
12 //
13 // 1. The origin of this software must not be misrepresented;
14 // you must not claim that you wrote the original software.
15 // If you use this software in a product, an acknowledgment
16 // in the product documentation would be appreciated but is not required.
17 //
18 // 2. Altered source versions must be plainly marked as such,
19 // and must not be misrepresented as being the original software.
20 //
21 // 3. This notice may not be removed or altered from any source distribution.
22 //
24 
25 
27 // References :
28 //
29 // http://www.unicode.org/
30 // http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
31 // http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.h
32 // http://people.w3.org/rishida/scripts/uniview/conversion
33 //
35 
36 
38 template <typename In>
39 In Utf<8>::decode(In begin, In end, Uint32& output, Uint32 replacement)
40 {
41  // Some useful precomputed data
42  static const int trailing[256] =
43  {
44  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
45  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
46  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
47  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
48  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
49  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
50  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
51  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
52  };
53  static const Uint32 offsets[6] =
54  {
55  0x00000000, 0x00003080, 0x000E2080, 0x03C82080, 0xFA082080, 0x82082080
56  };
57 
58  // decode the character
59  int trailingBytes = trailing[static_cast<Uint8>(*begin)];
60  if (begin + trailingBytes < end)
61  {
62  output = 0;
63  switch (trailingBytes)
64  {
65  case 5 : output += static_cast<Uint8>(*begin++); output <<= 6;
66  case 4 : output += static_cast<Uint8>(*begin++); output <<= 6;
67  case 3 : output += static_cast<Uint8>(*begin++); output <<= 6;
68  case 2 : output += static_cast<Uint8>(*begin++); output <<= 6;
69  case 1 : output += static_cast<Uint8>(*begin++); output <<= 6;
70  case 0 : output += static_cast<Uint8>(*begin++);
71  }
72  output -= offsets[trailingBytes];
73  }
74  else
75  {
76  // Incomplete character
77  begin = end;
78  output = replacement;
79  }
80 
81  return begin;
82 }
83 
84 
86 template <typename Out>
87 Out Utf<8>::encode(Uint32 input, Out output, Uint8 replacement)
88 {
89  // Some useful precomputed data
90  static const Uint8 firstBytes[7] =
91  {
92  0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
93  };
94 
95  // encode the character
96  if ((input > 0x0010FFFF) || ((input >= 0xD800) && (input <= 0xDBFF)))
97  {
98  // Invalid character
99  if (replacement)
100  *output++ = replacement;
101  }
102  else
103  {
104  // Valid character
105 
106  // Get the number of bytes to write
107  std::size_t bytestoWrite = 1;
108  if (input < 0x80) bytestoWrite = 1;
109  else if (input < 0x800) bytestoWrite = 2;
110  else if (input < 0x10000) bytestoWrite = 3;
111  else if (input <= 0x0010FFFF) bytestoWrite = 4;
112 
113  // Extract the bytes to write
114  Uint8 bytes[4];
115  switch (bytestoWrite)
116  {
117  case 4 : bytes[3] = static_cast<Uint8>((input | 0x80) & 0xBF); input >>= 6;
118  case 3 : bytes[2] = static_cast<Uint8>((input | 0x80) & 0xBF); input >>= 6;
119  case 2 : bytes[1] = static_cast<Uint8>((input | 0x80) & 0xBF); input >>= 6;
120  case 1 : bytes[0] = static_cast<Uint8> (input | firstBytes[bytestoWrite]);
121  }
122 
123  // Add them to the output
124  output = std::copy(bytes, bytes + bytestoWrite, output);
125  }
126 
127  return output;
128 }
129 
130 
132 template <typename In>
133 In Utf<8>::next(In begin, In end)
134 {
135  Uint32 codepoint;
136  return decode(begin, end, codepoint);
137 }
138 
139 
141 template <typename In>
142 std::size_t Utf<8>::count(In begin, In end)
143 {
144  std::size_t length = 0;
145  while (begin < end)
146  {
147  begin = next(begin, end);
148  ++length;
149  }
150 
151  return length;
152 }
153 
154 
156 template <typename In, typename Out>
157 Out Utf<8>::fromAnsi(In begin, In end, Out output, const std::locale& locale)
158 {
159  while (begin < end)
160  {
161  Uint32 codepoint = Utf<32>::decodeAnsi(*begin++, locale);
162  output = encode(codepoint, output);
163  }
164 
165  return output;
166 }
167 
168 
170 template <typename In, typename Out>
171 Out Utf<8>::fromWide(In begin, In end, Out output)
172 {
173  while (begin < end)
174  {
175  Uint32 codepoint = Utf<32>::decodeWide(*begin++);
176  output = encode(codepoint, output);
177  }
178 
179  return output;
180 }
181 
182 
184 template <typename In, typename Out>
185 Out Utf<8>::fromLatin1(In begin, In end, Out output)
186 {
187  // Latin-1 is directly compatible with Unicode encodings,
188  // and can thus be treated as (a sub-range of) UTF-32
189  while (begin < end)
190  output = encode(*begin++, output);
191 
192  return output;
193 }
194 
195 
197 template <typename In, typename Out>
198 Out Utf<8>::toAnsi(In begin, In end, Out output, char replacement, const std::locale& locale)
199 {
200  while (begin < end)
201  {
202  Uint32 codepoint;
203  begin = decode(begin, end, codepoint);
204  output = Utf<32>::encodeAnsi(codepoint, output, replacement, locale);
205  }
206 
207  return output;
208 }
209 
210 
212 template <typename In, typename Out>
213 Out Utf<8>::toWide(In begin, In end, Out output, wchar_t replacement)
214 {
215  while (begin < end)
216  {
217  Uint32 codepoint;
218  begin = decode(begin, end, codepoint);
219  output = Utf<32>::encodeWide(codepoint, output, replacement);
220  }
221 
222  return output;
223 }
224 
225 
227 template <typename In, typename Out>
228 Out Utf<8>::toLatin1(In begin, In end, Out output, char replacement)
229 {
230  // Latin-1 is directly compatible with Unicode encodings,
231  // and can thus be treated as (a sub-range of) UTF-32
232  while (begin < end)
233  {
234  Uint32 codepoint;
235  begin = decode(begin, end, codepoint);
236  *output++ = codepoint < 256 ? static_cast<char>(codepoint) : replacement;
237  }
238 
239  return output;
240 }
241 
242 
244 template <typename In, typename Out>
245 Out Utf<8>::toUtf8(In begin, In end, Out output)
246 {
247  return std::copy(begin, end, output);
248 }
249 
250 
252 template <typename In, typename Out>
253 Out Utf<8>::toUtf16(In begin, In end, Out output)
254 {
255  while (begin < end)
256  {
257  Uint32 codepoint;
258  begin = decode(begin, end, codepoint);
259  output = Utf<16>::encode(codepoint, output);
260  }
261 
262  return output;
263 }
264 
265 
267 template <typename In, typename Out>
268 Out Utf<8>::toUtf32(In begin, In end, Out output)
269 {
270  while (begin < end)
271  {
272  Uint32 codepoint;
273  begin = decode(begin, end, codepoint);
274  *output++ = codepoint;
275  }
276 
277  return output;
278 }
279 
280 
282 template <typename In>
283 In Utf<16>::decode(In begin, In end, Uint32& output, Uint32 replacement)
284 {
285  Uint16 first = *begin++;
286 
287  // If it's a surrogate pair, first convert to a single UTF-32 character
288  if ((first >= 0xD800) && (first <= 0xDBFF))
289  {
290  if (begin < end)
291  {
292  Uint32 second = *begin++;
293  if ((second >= 0xDC00) && (second <= 0xDFFF))
294  {
295  // The second element is valid: convert the two elements to a UTF-32 character
296  output = static_cast<Uint32>(((first - 0xD800) << 10) + (second - 0xDC00) + 0x0010000);
297  }
298  else
299  {
300  // Invalid character
301  output = replacement;
302  }
303  }
304  else
305  {
306  // Invalid character
307  begin = end;
308  output = replacement;
309  }
310  }
311  else
312  {
313  // We can make a direct copy
314  output = first;
315  }
316 
317  return begin;
318 }
319 
320 
322 template <typename Out>
323 Out Utf<16>::encode(Uint32 input, Out output, Uint16 replacement)
324 {
325  if (input < 0xFFFF)
326  {
327  // The character can be copied directly, we just need to check if it's in the valid range
328  if ((input >= 0xD800) && (input <= 0xDFFF))
329  {
330  // Invalid character (this range is reserved)
331  if (replacement)
332  *output++ = replacement;
333  }
334  else
335  {
336  // Valid character directly convertible to a single UTF-16 character
337  *output++ = static_cast<Uint16>(input);
338  }
339  }
340  else if (input > 0x0010FFFF)
341  {
342  // Invalid character (greater than the maximum unicode value)
343  if (replacement)
344  *output++ = replacement;
345  }
346  else
347  {
348  // The input character will be converted to two UTF-16 elements
349  input -= 0x0010000;
350  *output++ = static_cast<Uint16>((input >> 10) + 0xD800);
351  *output++ = static_cast<Uint16>((input & 0x3FFUL) + 0xDC00);
352  }
353 
354  return output;
355 }
356 
357 
359 template <typename In>
360 In Utf<16>::next(In begin, In end)
361 {
362  Uint32 codepoint;
363  return decode(begin, end, codepoint);
364 }
365 
366 
368 template <typename In>
369 std::size_t Utf<16>::count(In begin, In end)
370 {
371  std::size_t length = 0;
372  while (begin < end)
373  {
374  begin = next(begin, end);
375  ++length;
376  }
377 
378  return length;
379 }
380 
381 
383 template <typename In, typename Out>
384 Out Utf<16>::fromAnsi(In begin, In end, Out output, const std::locale& locale)
385 {
386  while (begin < end)
387  {
388  Uint32 codepoint = Utf<32>::decodeAnsi(*begin++, locale);
389  output = encode(codepoint, output);
390  }
391 
392  return output;
393 }
394 
395 
397 template <typename In, typename Out>
398 Out Utf<16>::fromWide(In begin, In end, Out output)
399 {
400  while (begin < end)
401  {
402  Uint32 codepoint = Utf<32>::decodeWide(*begin++);
403  output = encode(codepoint, output);
404  }
405 
406  return output;
407 }
408 
409 
411 template <typename In, typename Out>
412 Out Utf<16>::fromLatin1(In begin, In end, Out output)
413 {
414  // Latin-1 is directly compatible with Unicode encodings,
415  // and can thus be treated as (a sub-range of) UTF-32
416  return std::copy(begin, end, output);
417 }
418 
419 
421 template <typename In, typename Out>
422 Out Utf<16>::toAnsi(In begin, In end, Out output, char replacement, const std::locale& locale)
423 {
424  while (begin < end)
425  {
426  Uint32 codepoint;
427  begin = decode(begin, end, codepoint);
428  output = Utf<32>::encodeAnsi(codepoint, output, replacement, locale);
429  }
430 
431  return output;
432 }
433 
434 
436 template <typename In, typename Out>
437 Out Utf<16>::toWide(In begin, In end, Out output, wchar_t replacement)
438 {
439  while (begin < end)
440  {
441  Uint32 codepoint;
442  begin = decode(begin, end, codepoint);
443  output = Utf<32>::encodeWide(codepoint, output, replacement);
444  }
445 
446  return output;
447 }
448 
449 
451 template <typename In, typename Out>
452 Out Utf<16>::toLatin1(In begin, In end, Out output, char replacement)
453 {
454  // Latin-1 is directly compatible with Unicode encodings,
455  // and can thus be treated as (a sub-range of) UTF-32
456  while (begin < end)
457  {
458  *output++ = *begin < 256 ? static_cast<char>(*begin) : replacement;
459  begin++;
460  }
461 
462  return output;
463 }
464 
465 
467 template <typename In, typename Out>
468 Out Utf<16>::toUtf8(In begin, In end, Out output)
469 {
470  while (begin < end)
471  {
472  Uint32 codepoint;
473  begin = decode(begin, end, codepoint);
474  output = Utf<8>::encode(codepoint, output);
475  }
476 
477  return output;
478 }
479 
480 
482 template <typename In, typename Out>
483 Out Utf<16>::toUtf16(In begin, In end, Out output)
484 {
485  return std::copy(begin, end, output);
486 }
487 
488 
490 template <typename In, typename Out>
491 Out Utf<16>::toUtf32(In begin, In end, Out output)
492 {
493  while (begin < end)
494  {
495  Uint32 codepoint;
496  begin = decode(begin, end, codepoint);
497  *output++ = codepoint;
498  }
499 
500  return output;
501 }
502 
503 
505 template <typename In>
506 In Utf<32>::decode(In begin, In /*end*/, Uint32& output, Uint32 /*replacement*/)
507 {
508  output = *begin++;
509  return begin;
510 }
511 
512 
514 template <typename Out>
515 Out Utf<32>::encode(Uint32 input, Out output, Uint32 /*replacement*/)
516 {
517  *output++ = input;
518  return output;
519 }
520 
521 
523 template <typename In>
524 In Utf<32>::next(In begin, In /*end*/)
525 {
526  return ++begin;
527 }
528 
529 
531 template <typename In>
532 std::size_t Utf<32>::count(In begin, In end)
533 {
534  return begin - end;
535 }
536 
537 
539 template <typename In, typename Out>
540 Out Utf<32>::fromAnsi(In begin, In end, Out output, const std::locale& locale)
541 {
542  while (begin < end)
543  *output++ = decodeAnsi(*begin++, locale);
544 
545  return output;
546 }
547 
548 
550 template <typename In, typename Out>
551 Out Utf<32>::fromWide(In begin, In end, Out output)
552 {
553  while (begin < end)
554  *output++ = decodeWide(*begin++);
555 
556  return output;
557 }
558 
559 
561 template <typename In, typename Out>
562 Out Utf<32>::fromLatin1(In begin, In end, Out output)
563 {
564  // Latin-1 is directly compatible with Unicode encodings,
565  // and can thus be treated as (a sub-range of) UTF-32
566  return std::copy(begin, end, output);
567 }
568 
569 
571 template <typename In, typename Out>
572 Out Utf<32>::toAnsi(In begin, In end, Out output, char replacement, const std::locale& locale)
573 {
574  while (begin < end)
575  output = encodeAnsi(*begin++, output, replacement, locale);
576 
577  return output;
578 }
579 
580 
582 template <typename In, typename Out>
583 Out Utf<32>::toWide(In begin, In end, Out output, wchar_t replacement)
584 {
585  while (begin < end)
586  output = encodeWide(*begin++, output, replacement);
587 
588  return output;
589 }
590 
591 
593 template <typename In, typename Out>
594 Out Utf<32>::toLatin1(In begin, In end, Out output, char replacement)
595 {
596  // Latin-1 is directly compatible with Unicode encodings,
597  // and can thus be treated as (a sub-range of) UTF-32
598  while (begin < end)
599  {
600  *output++ = *begin < 256 ? static_cast<char>(*begin) : replacement;
601  begin++;
602  }
603 
604  return output;
605 }
606 
607 
609 template <typename In, typename Out>
610 Out Utf<32>::toUtf8(In begin, In end, Out output)
611 {
612  while (begin < end)
613  output = Utf<8>::encode(*begin++, output);
614 
615  return output;
616 }
617 
619 template <typename In, typename Out>
620 Out Utf<32>::toUtf16(In begin, In end, Out output)
621 {
622  while (begin < end)
623  output = Utf<16>::encode(*begin++, output);
624 
625  return output;
626 }
627 
628 
630 template <typename In, typename Out>
631 Out Utf<32>::toUtf32(In begin, In end, Out output)
632 {
633  return std::copy(begin, end, output);
634 }
635 
636 
638 template <typename In>
639 Uint32 Utf<32>::decodeAnsi(In input, const std::locale& locale)
640 {
641  // On Windows, gcc's standard library (glibc++) has almost
642  // no support for Unicode stuff. As a consequence, in this
643  // context we can only use the default locale and ignore
644  // the one passed as parameter.
645 
646  #if defined(SFML_SYSTEM_WINDOWS) && /* if Windows ... */ \
647  (defined(__GLIBCPP__) || defined (__GLIBCXX__)) && /* ... and standard library is glibc++ ... */ \
648  !(defined(__SGI_STL_PORT) || defined(_STLPORT_VERSION)) /* ... and STLPort is not used on top of it */
649 
650  (void)locale; // to avoid warnings
651 
652  wchar_t character = 0;
653  mbtowc(&character, &input, 1);
654  return static_cast<Uint32>(character);
655 
656  #else
657 
658  // Get the facet of the locale which deals with character conversion
659  const std::ctype<wchar_t>& facet = std::use_facet< std::ctype<wchar_t> >(locale);
660 
661  // Use the facet to convert each character of the input string
662  return static_cast<Uint32>(facet.widen(input));
663 
664  #endif
665 }
666 
667 
669 template <typename In>
670 Uint32 Utf<32>::decodeWide(In input)
671 {
672  // The encoding of wide characters is not well defined and is left to the system;
673  // however we can safely assume that it is UCS-2 on Windows and
674  // UCS-4 on Unix systems.
675  // In both cases, a simple copy is enough (UCS-2 is a subset of UCS-4,
676  // and UCS-4 *is* UTF-32).
677 
678  return input;
679 }
680 
681 
683 template <typename Out>
684 Out Utf<32>::encodeAnsi(Uint32 codepoint, Out output, char replacement, const std::locale& locale)
685 {
686  // On Windows, gcc's standard library (glibc++) has almost
687  // no support for Unicode stuff. As a consequence, in this
688  // context we can only use the default locale and ignore
689  // the one passed as parameter.
690 
691  #if defined(SFML_SYSTEM_WINDOWS) && /* if Windows ... */ \
692  (defined(__GLIBCPP__) || defined (__GLIBCXX__)) && /* ... and standard library is glibc++ ... */ \
693  !(defined(__SGI_STL_PORT) || defined(_STLPORT_VERSION)) /* ... and STLPort is not used on top of it */
694 
695  (void)locale; // to avoid warnings
696 
697  char character = 0;
698  if (wctomb(&character, static_cast<wchar_t>(codepoint)) >= 0)
699  *output++ = character;
700  else if (replacement)
701  *output++ = replacement;
702 
703  return output;
704 
705  #else
706 
707  // Get the facet of the locale which deals with character conversion
708  const std::ctype<wchar_t>& facet = std::use_facet< std::ctype<wchar_t> >(locale);
709 
710  // Use the facet to convert each character of the input string
711  *output++ = facet.narrow(static_cast<wchar_t>(codepoint), replacement);
712 
713  return output;
714 
715  #endif
716 }
717 
718 
720 template <typename Out>
721 Out Utf<32>::encodeWide(Uint32 codepoint, Out output, wchar_t replacement)
722 {
723  // The encoding of wide characters is not well defined and is left to the system;
724  // however we can safely assume that it is UCS-2 on Windows and
725  // UCS-4 on Unix systems.
726  // For UCS-2 we need to check if the source characters fits in (UCS-2 is a subset of UCS-4).
727  // For UCS-4 we can do a direct copy (UCS-4 *is* UTF-32).
728 
729  switch (sizeof(wchar_t))
730  {
731  case 4:
732  {
733  *output++ = static_cast<wchar_t>(codepoint);
734  break;
735  }
736 
737  default:
738  {
739  if ((codepoint <= 0xFFFF) && ((codepoint < 0xD800) || (codepoint > 0xDFFF)))
740  {
741  *output++ = static_cast<wchar_t>(codepoint);
742  }
743  else if (replacement)
744  {
745  *output++ = replacement;
746  }
747  break;
748  }
749  }
750 
751  return output;
752 }
Page rendered in 1.92794s using 6 queries.