|
@@ -1,87 +1,119 @@
|
|
|
<?php
|
|
|
/**
|
|
|
- * utf-8 encoding function
|
|
|
+ * utf-8 encoding functions
|
|
|
*
|
|
|
* takes a string of unicode entities and converts it to a utf-8 encoded string
|
|
|
* each unicode entity has the form &#nnn(nn); n={0..9} and can be displayed by utf-8 supporting
|
|
|
- * browsers. Ascii will not be modified.
|
|
|
+ * browsers. Ascii will not be modified.
|
|
|
*
|
|
|
- * code is taken from www.php.net manual comments
|
|
|
- * Author: ronen at greyzone dot com
|
|
|
+ * Original code is taken from www.php.net manual comments
|
|
|
+ * Original author: ronen at greyzone dot com
|
|
|
*
|
|
|
* @version $Id$
|
|
|
+ * @copyright Copyright © SquirrelMail Development Team, 2004
|
|
|
* @package squirrelmail
|
|
|
* @subpackage encode
|
|
|
*/
|
|
|
|
|
|
/**
 * Converts numeric unicode entities in a string to utf-8
 *
 * Every decimal entity of the form &#nnn; is replaced with the byte
 * sequence returned by unicodetoutf8(). All other text, including
 * hexadecimal entities (&#xHHHH;), is returned unchanged.
 *
 * @param string $string text with numeric unicode entities
 * @return string utf-8 encoded text
 */
function charset_encode_utf_8 ($string) {
    // don't run the encoding step if there are no encoded characters
    if (! preg_match('/&#[0-9]+;/', $string) ) return $string;

    // The /e (eval) pattern modifier was removed in PHP 7; with it
    // preg_replace() fails and returns null. A callback does the same
    // job without evaluating generated code.
    $encoded = preg_replace_callback(
        '/&#([0-9]+);/',
        function ($match) {
            // $match[1] holds the decimal digits between "&#" and ";"
            return unicodetoutf8($match[1]);
        },
        $string
    );

    // preg_replace_callback() returns null on a PCRE error; fall back to
    // the unmodified text instead of handing null to the caller
    return ($encoded === null) ? $string : $encoded;
}
|
|
|
+
|
|
|
/**
 * Return utf8 symbol when unicode character number is provided
 *
 * This function is used internally by charset_encode_utf_8
 * function. It might be unavailable to other squirrelmail functions.
 * Don't use it, or make sure that functions/encode/utf_8.php is
 * included.
 *
 * Only Unicode scalar values are encoded: 0 .. 0x10FFFF, excluding the
 * surrogate range 0xD800 .. 0xDFFF. UTF-8 as defined by RFC 3629 has no
 * representation for anything else (the historic five and six byte
 * forms are not valid), so '?' is returned for such values as well as
 * for negative or non-numeric input.
 *
 * @param int $var decimal unicode value (numeric strings are accepted)
 * @return string utf8 character, or '?' if the value can't be encoded
 */
function unicodetoutf8($var) {
    if (! is_numeric($var)) {
        return '?';
    }

    // Range check happens before the integer cast, so that very long
    // digit strings can't wrap around into the valid range.
    if ($var < 0 || $var > 0x10FFFF) {
        // there is no such symbol in utf-8
        return '?';
    }

    $code = (int) $var;

    if ($code >= 0xD800 && $code <= 0xDFFF) {
        // surrogate halves are not characters and must not be encoded
        return '?';
    }

    if ($code < 0x80) {
        // One byte (ascii): 0xxxxxxx
        return chr($code);
    }

    if ($code < 0x800) {
        // Two byte utf-8: 110xxxxx 10xxxxxx
        return chr(0xC0 | ($code >> 6))
             . chr(0x80 | ($code & 0x3F));
    }

    if ($code < 0x10000) {
        // Three byte utf-8: 1110xxxx 10xxxxxx 10xxxxxx
        return chr(0xE0 | ($code >> 12))
             . chr(0x80 | (($code >> 6) & 0x3F))
             . chr(0x80 | ($code & 0x3F));
    }

    // Four byte utf-8: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    return chr(0xF0 | ($code >> 18))
         . chr(0x80 | (($code >> 12) & 0x3F))
         . chr(0x80 | (($code >> 6) & 0x3F))
         . chr(0x80 | ($code & 0x3F));
}
|
|
|
?>
|