utf_8.php 4.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103
  1. <?php
  2. /**
  3. * functions/decode/utf_8.php - utf-8 decoding functions
  4. *
  5. * This file contains the utf-8 decoding function that is needed to read
  6. * utf-8 encoded mails in a non-utf-8 locale.
  7. *
  8. * Every encoded multibyte character consists of n bytes. The first byte is
  9. * octal 300-375; all other bytes are always octal 200-277.
  10. *<pre>
  11. * Ranges (first byte):
  12. * oct dec hex
  13. * Two byte - 300-337 192-223 C0-DF
  14. * Three byte - 340-357 224-239 E0-EF
  15. * Four byte - 360-367 240-247 F0-F7
  16. * Five byte - 370-373 248-251 F8-FB
  17. * Six byte - 374-375 252-253 FC-FD
  18. *
  19. * \a\b characters are decoded to html code calculated with formula:
  20. * octdec(a-300)*64 + octdec(b-200)
  21. *
  22. * \a\b\c characters are decoded to html code calculated with formula:
  23. * octdec(a-340)*64^2 + octdec(b-200)*64 + octdec(c-200)
  24. *
  25. * \a\b\c\d characters are decoded to html code calculated with formula:
  26. * octdec(a-360)*64^3 + octdec(b-200)*64^2 +
  27. * + octdec(c-200)*64 + octdec(d-200)
  28. *
  29. * \a\b\c\d\e characters are decoded to html code calculated with formula:
  30. * octdec(a-370)*64^4 + octdec(b-200)*64^3 +
  31. * + octdec(c-200)*64^2 + octdec(d-200)*64 + octdec(e-200)
  32. *
  33. * \a\b\c\d\e\f characters are decoded to html code calculated with formula:
  34. * octdec(a-374)*64^5 + octdec(b-200)*64^4 + octdec(c-200)*64^3 +
  35. * + octdec(d-200)*64^2 + octdec(e-200)*64 + octdec(f-200)
  36. *</pre>
  37. * @copyright 2003-2025 The SquirrelMail Project Team
  38. * @license http://opensource.org/licenses/gpl-license.php GNU Public License
  39. * @version $Id$
  40. * @package squirrelmail
  41. * @subpackage decode
  42. */
  /**
   * Decode utf-8 strings
   *
   * Every well-formed two, three or four byte utf-8 sequence is replaced
   * with the matching html numeric character reference (&#NNNN;). Any 8bit
   * byte that is left afterwards (stray continuation bytes, five and six
   * byte forms, overlong encodings, utf-16 surrogates, values above
   * U+10FFFF) is replaced with '?'.
   *
   * The string is returned untouched when the interface language is ja_JP
   * or when sq_is8bit() reports no 8bit symbols.
   *
   * @param string $string Encoded string
   * @return string Decoded string
   */
  function charset_decode_utf_8 ($string) {
      global $squirrelmail_language;

      // Japanese translation uses mbstring function to read utf-8
      if ($squirrelmail_language === 'ja_JP') {
          return $string;
      }

      // don't do decoding when there are no 8bit symbols
      if (! sq_is8bit($string, 'utf-8')) {
          return $string;
      }

      // Decode four, three and two byte sequences in a single pass. A match
      // always starts on a lead byte and only consumes continuation bytes,
      // so matches can never overlap and the order of the alternatives does
      // not influence the result. The callback is passed by name, which
      // works on every PHP version (no closures, no create_function()).
      $string = preg_replace_callback(
          "/[\360-\367][\200-\277]{3}|[\340-\357][\200-\277]{2}|[\300-\337][\200-\277]/",
          'sq_utf8_sequence_to_entity',
          $string
      );

      // remove broken unicode: every remaining byte in the 0x80-0xFF range,
      // including sequences the callback refused to decode. The pattern is
      // kept exactly as it was (three alternatives instead of one range).
      $string = preg_replace("/[\200-\237]|\240|[\241-\377]/", '?', $string);

      return $string;
  }

  /**
   * Convert one matched utf-8 byte sequence to a html numeric entity
   *
   * Callback for preg_replace_callback() in charset_decode_utf_8().
   * The sequence is returned unchanged when it does not encode a valid
   * unicode scalar value, i.e. when it is an overlong encoding, a utf-16
   * surrogate (U+D800-U+DFFF) or a value above U+10FFFF. The caller then
   * replaces the untouched bytes with '?'.
   *
   * @param array $matches match data; element 0 holds a lead byte followed
   *                       by one to three continuation bytes
   * @return string '&#NNNN;' or the unchanged byte sequence
   * @access private
   */
  function sq_utf8_sequence_to_entity ($matches) {
      $bytes = $matches[0];
      $lead  = ord($bytes[0]);

      // The sequence length is derived from the lead byte instead of
      // strlen(), so it does not depend on mbstring function overloading.
      if ($lead >= 240) {
          // 11110xxx - four bytes, smallest legal value is U+10000
          $length  = 4;
          $code    = $lead - 240;
          $minimum = 0x10000;
      } elseif ($lead >= 224) {
          // 1110xxxx - three bytes, smallest legal value is U+0800
          $length  = 3;
          $code    = $lead - 224;
          $minimum = 0x800;
      } else {
          // 110xxxxx - two bytes, smallest legal value is U+0080
          $length  = 2;
          $code    = $lead - 192;
          $minimum = 0x80;
      }

      // every continuation byte (10xxxxxx) adds six payload bits
      for ($i = 1; $i < $length; $i++) {
          $code = $code * 64 + (ord($bytes[$i]) - 128);
      }

      $is_overlong  = ($code < $minimum);
      $is_surrogate = ($code >= 0xD800 && $code <= 0xDFFF);
      if ($is_overlong || $is_surrogate || $code > 0x10FFFF) {
          return $bytes;
      }

      return '&#' . $code . ';';
  }