utf_8.php 3.4 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495
  1. <?php
  2. /**
  3. * functions/decode/utf-8.php - utf-8 decoding functions
  4. *
  5. * Copyright (c) 2003-2005 The SquirrelMail Project Team
  6. * Licensed under the GNU GPL. For full terms see the file COPYING.
  7. *
  8. * This file contains the utf-8 decoding function that is needed to read
  9. * utf-8 encoded mails in a non-utf-8 locale.
  10. *
  11. * Every decoded character consists of n bytes. First byte is octal
  12. * 300-375, other bytes - always octals 200-277.
  13. *<pre>
  14. * Ranges (first byte):
  15. * oct dec hex
  16. * Two byte - 300-337 192-223 C0-DF
  17. * Three byte - 340-357 224-239 E0-EF
  18. * Four byte - 360-367 240-247 F0-F7
  19. * Five byte - 370-373 248-251 F8-FB
  20. * Six byte - 374-375 252-253 FC-FD
  21. *
  22. * \a\b characters are decoded to html code calculated with formula:
  23. * octdec(a-300)*64 + octdec(b-200)
  24. *
  25. * \a\b\c characters are decoded to html code calculated with formula:
  26. * octdec(a-340)*64^2 + octdec(b-200)*64 + octdec(c-200)
  27. *
  28. * \a\b\c\d characters are decoded to html code calculated with formula:
  29. * octdec(a-360)*64^3 + octdec(b-200)*64^2 +
  30. * + octdec(c-200)*64 + octdec(d-200)
  31. *
  32. * \a\b\c\d\e characters are decoded to html code calculated with formula:
  33. * octdec(a-370)*64^4 + octdec(b-200)*64^3 +
  34. * + octdec(c-200)*64^2 + octdec(d-200)*64 + octdec(e-200)
  35. *
  36. * \a\b\c\d\e\f characters are decoded to html code calculated with formula:
  37. * octdec(a-374)*64^5 + octdec(b-200)*64^4 + octdec(c-200)*64^3 +
  38. * + octdec(d-200)*64^2 + octdec(e-200)*64 + octdec(f-200)
  39. *</pre>
  40. * @version $Id$
  41. * @package squirrelmail
  42. * @subpackage decode
  43. */
  44. /**
  45. * Decode utf-8 strings
  46. * @param string $string Encoded string
  47. * @return string Decoded string
  48. */
  49. function charset_decode_utf_8 ($string) {
  50. global $squirrelmail_language;
  51. // Japanese translation uses mbstring function to read utf-8
  52. if ($squirrelmail_language == 'ja_JP')
  53. return $string;
  54. // don't do decoding when there are no 8bit symbols
  55. if (! sq_is8bit($string,'utf-8'))
  56. return $string;
  57. // decode six byte unicode characters
  58. /* (i think currently there is no such symbol)
  59. $string = preg_replace("/([\374-\375])([\200-\277])([\200-\277])([\200-\277])([\200-\277])([\200-\277])/e",
  60. "'&#'.((ord('\\1')-252)*1073741824+(ord('\\2')-200)*16777216+(ord('\\3')-200)*262144+(ord('\\4')-128)*4096+(ord('\\5')-128)*64+(ord('\\6')-128)).';'",
  61. $string);
  62. */
  63. // decode five byte unicode characters
  64. /* (i think currently there is no such symbol)
  65. $string = preg_replace("/([\370-\373])([\200-\277])([\200-\277])([\200-\277])([\200-\277])/e",
  66. "'&#'.((ord('\\1')-248)*16777216+(ord('\\2')-200)*262144+(ord('\\3')-128)*4096+(ord('\\4')-128)*64+(ord('\\5')-128)).';'",
  67. $string);
  68. */
  69. // decode four byte unicode characters
  70. $string = preg_replace("/([\360-\367])([\200-\277])([\200-\277])([\200-\277])/e",
  71. "'&#'.((ord('\\1')-240)*262144+(ord('\\2')-128)*4096+(ord('\\3')-128)*64+(ord('\\4')-128)).';'",
  72. $string);
  73. // decode three byte unicode characters
  74. $string = preg_replace("/([\340-\357])([\200-\277])([\200-\277])/e",
  75. "'&#'.((ord('\\1')-224)*4096+(ord('\\2')-128)*64+(ord('\\3')-128)).';'",
  76. $string);
  77. // decode two byte unicode characters
  78. $string = preg_replace("/([\300-\337])([\200-\277])/e",
  79. "'&#'.((ord('\\1')-192)*64+(ord('\\2')-128)).';'",
  80. $string);
  81. // remove broken unicode
  82. $string = preg_replace("/[\200-\237]|\240|[\241-\377]/",'?',$string);
  83. return $string;
  84. }
  85. ?>