Quellcode durchsuchen

Fix certain messages whose headers are in unknown charsets and encoded as quoted-printable showing up with a blank sender/subject in the message list. There are two ways to fix it; the more accurate but costlier one is left unfinished, since the easy fix seems to have helped in every case I know of.

pdontthink vor 3 Jahren
Ursprung
Commit
96a713c8e4
3 geänderte Dateien mit 125 neuen und 19 gelöschten Zeilen
  1. 8 0
      config/config_local.example.php
  2. 108 12
      functions/strings.php
  3. 9 7
      include/languages.php

+ 8 - 0
config/config_local.example.php

@@ -61,6 +61,14 @@
  * part (beginning with "@") will be stripped before
  * calculating the CRC or MD5.
  *
+ * $default_htmlspecialchars_encoding (string) is used to
+ * specify the charset that is used for htmlspecialchars()
+ * calls when an invalid charset was requested (PHP's
+ * htmlspecialchars() only supports a limited number of
+ * encodings).  SquirrelMail defaults to iso-8859-1, but if
+ * you want to change the default to something like utf-8,
+ * you can use this setting for that.
+ *
  * $smtp_stream_options allows more control over the SSL context
  * used when connecting to the SMTP server over SSL/TLS.  See:
  * http://www.php.net/manual/context.php and in particular

+ 108 - 12
functions/strings.php

@@ -1729,21 +1729,75 @@ function sm_validate_security_token($token, $validity_period=0, $show_error=FALS
   * attempts to add the correct character encoding
   *
   * @param string $string The string to be converted
-  * @param int $flags A bitmask that controls the behavior of htmlspecialchars()
+  * @param int $flags A bitmask that controls the behavior of
+  *                   htmlspecialchars() -- NOTE that this parameter
+  *                   should only be used to dictate handling of
+  *                   quotes; handling invalid code sequences is done
+  *                   using the $invalid_sequence_flag parameter below
   *                   (See http://php.net/manual/function.htmlspecialchars.php )
-  *                   (OPTIONAL; default ENT_COMPAT, ENT_COMPAT | ENT_SUBSTITUTE for PHP >=5.4)
+  *                   (OPTIONAL; default ENT_COMPAT)
   * @param string $encoding The character encoding to use in the conversion
-  *                         (OPTIONAL; default automatic detection)
+  *                         (if not one of the character sets supported
+  *                         by PHP's htmlspecialchars(), then $encoding
+  *                         will be ignored and iso-8859-1 will be used,
+  *                         unless a default has been specified in
+  *                         $default_htmlspecialchars_encoding in
+  *                         config_local.php) (OPTIONAL; default automatic
+  *                         detection)
   * @param boolean $double_encode Whether or not to convert entities that are
   *                               already in the string (only supported in
   *                               PHP 5.2.3+) (OPTIONAL; default TRUE)
+  * @param mixed $invalid_sequence_flag A bitmask that controls how invalid
+  *                                     code sequences should be handled;
+  *                                     When calling htmlspecialchars(),
+  *                                     this value will be combined with
+  *                                     the $flags parameter above
+  *                                     (See http://php.net/manual/function.htmlspecialchars.php )
+  *                                     (OPTIONAL; defaults to the string
+  *                                     "ent_substitute" that, for PHP 5.4+,
+  *                                     is converted to the ENT_SUBSTITUTE
+  *                                     constant, otherwise empty)
   *
   * @return string The converted text
   *
   */
 function sm_encode_html_special_chars($string, $flags=ENT_COMPAT,
-                                      $encoding=NULL, $double_encode=TRUE)
+                                      $encoding=NULL, $double_encode=TRUE,
+                                      $invalid_sequence_flag='ent_substitute')
 {
+   if ($invalid_sequence_flag === 'ent_substitute')
+   {
+      if (check_php_version(5, 4, 0))
+         $invalid_sequence_flag = ENT_SUBSTITUTE;
+      else
+         $invalid_sequence_flag = 0;
+   }
+
+
+   // charsets supported by PHP's htmlspecialchars
+   // (move this elsewhere if needed)
+   //
+   static $htmlspecialchars_charsets = array(
+      'iso-8859-1', 'iso8859-1',
+      'iso-8859-5', 'iso8859-5',
+      'iso-8859-15', 'iso8859-15',
+      'utf-8',
+      'cp866', 'ibm866', '866',
+      'cp1251', 'windows-1251', 'win-1251', '1251',
+      'cp1252', 'windows-1252', '1252',
+      'koi8-R', 'koi8-ru', 'koi8r',
+      'big5', '950',
+      'gb2312', '936',
+      'big5-hkscs',
+      'shift_jis', 'sjis', 'sjis-win', 'cp932', '932',
+      'euc-jp', 'eucjp', 'eucjp-win',
+      'macroman',
+   );
+
+
+   // if not given, set encoding to the charset being
+   // used by the current user interface language
+   //
    if (!$encoding)
    {
       global $default_charset;
@@ -1752,15 +1806,57 @@ function sm_encode_html_special_chars($string, $flags=ENT_COMPAT,
       $encoding = $default_charset;
    }
 
-   if (check_php_version(5, 2, 3)) {
-      // Replace invalid characters with a symbol instead of returning
-      // empty string for the entire to be encoded string.
-      if (check_php_version(5, 4, 0) && $flags == ENT_COMPAT) {
-         $flags = $flags | ENT_SUBSTITUTE;
+
+   // two ways to handle encodings not supported by htmlspecialchars() -
+   // one takes less CPU cycles but can munge characters in certain
+   // translations, the other is more exact but requires more resources
+   //
+   global $html_special_chars_extended_fix;
+//FIXME: need to document that the config switch above can be enabled in config_local... but first, we need to decide if we will implement the second option here -- currently there hasn't been a need for it (munged characters seem quite rare).... see tracker #2806 for some tips https://sourceforge.net/p/squirrelmail/bugs/2806
+   if (!in_array(strtolower($encoding), $htmlspecialchars_charsets))
+   {
+      if ($html_special_chars_extended_fix)
+      {
+         // convert to utf-8 first, run htmlspecialchars() and convert
+         // back to original encoding below
+         //
+//FIXME: try conversion functions in this order: recode_string(), iconv(), mbstring (with various charset checks: sq_mb_list_encodings(), mb_check_encoding) -- oh, first check for internal charset_decode_CHARSET() function?? or just use (does this put everything into HTML entities already? shouldn't, but if it does, return right here):
+         $string = charset_decode($encoding, $string, TRUE, TRUE);
+         $string = charset_encode($string, $encoding, TRUE);
+      }
+      else
+      {
+         // simply force use of an encoding that is supported (some
+         // characters may be munged)
+         //
+         // use default from configuration if provided or hard-coded fallback
+         //
+         global $default_htmlspecialchars_encoding;
+         if (!empty($default_htmlspecialchars_encoding))
+            $encoding = $default_htmlspecialchars_encoding;
+         else
+            $encoding = 'iso-8859-1';
       }
-      return htmlspecialchars($string, $flags, $encoding, $double_encode);
    }
 
-   return htmlspecialchars($string, $flags, $encoding);
-}
 
+// TODO: Is adding this check an unnecessary performance hit?
+   if (check_php_version(5, 2, 3))
+      $ret = htmlspecialchars($string, $flags | $invalid_sequence_flag,
+                              $encoding, $double_encode);
+   else
+      $ret = htmlspecialchars($string, $flags | $invalid_sequence_flag,
+                              $encoding);
+
+
+   // convert back to original encoding if needed (see above)
+   //
+   if ($html_special_chars_extended_fix
+    && !in_array(strtolower($encoding), $htmlspecialchars_charsets))
+   {
+//FIXME: NOT FINISHED - here, we'd convert from utf-8 back to original charset (if we obey $lossy_encoding and end up returning in utf-8 instead of original charset, does that screw up the caller?)
+   }
+
+
+   return $ret;
+}

+ 9 - 7
include/languages.php

@@ -180,16 +180,18 @@ function sq_setlocale($category,$locale) {
 }
 
 /**
- * Converts string from given charset to charset, that can be displayed by user translation.
+ * Converts a string from the given $charset to a character set that
+ * can be displayed by the current user interface language (translation)
  *
- * Function by default returns html encoded strings, if translation uses different encoding.
+ * Function by default returns html encoded strings if translation uses
+ * different encoding.
  * If Japanese translation is used - function returns string converted to euc-jp
  * If iconv or recode functions are enabled and translation uses utf-8 - function returns utf-8 encoded string.
  * If $charset is not supported - function returns unconverted string.
  *
  * sanitizing of html tags is also done by this function.
  *
- * @param string $charset
+ * @param string $charset The charset of the incoming string
  * @param string $string Text to be decoded
  * @param boolean $force_decode converts string to html without $charset!=$default_charset check.
  * Argument is available since 1.4.5 and 1.5.1.
@@ -218,7 +220,7 @@ function charset_decode ($charset, $string, $force_decode=false, $save_html=fals
 
     // Don't do conversion if charset is the same.
     if ( ! $force_decode && $charset == strtolower($default_charset) )
-        return ($save_html ? $string : sm_encode_html_special_chars($string));
+        return ($save_html ? $string : sm_encode_html_special_chars($string, ENT_COMPAT, $charset));
 
     // catch iso-8859-8-i thing
     if ( $charset == "iso-8859-8-i" )
@@ -234,7 +236,7 @@ function charset_decode ($charset, $string, $force_decode=false, $save_html=fals
             // other charsets can be converted to utf-8 without loss.
             // and output string is smaller
             $string = recode_string($charset . "..utf-8",$string);
-            return ($save_html ? $string : sm_encode_html_special_chars($string));
+            return ($save_html ? $string : sm_encode_html_special_chars($string, ENT_COMPAT, $charset));
         } else {
             $string = recode_string($charset . "..html",$string);
             // recode does not convert single quote, sm_encode_html_special_chars does.
@@ -250,13 +252,13 @@ function charset_decode ($charset, $string, $force_decode=false, $save_html=fals
     // iconv functions do not have an html target and can only be used with utf-8
     if ( $use_php_iconv && $default_charset=='utf-8') {
         $string = iconv($charset,$default_charset,$string);
-        return ($save_html ? $string : sm_encode_html_special_chars($string));
+        return ($save_html ? $string : sm_encode_html_special_chars($string, ENT_COMPAT, $charset));
     }
 
     // If we don't use recode and iconv, we'll do it old way.
 
     /* All HTML special characters are 7 bit and can be replaced first */
-    if (! $save_html) $string = sm_encode_html_special_chars ($string);
+    if (! $save_html) $string = sm_encode_html_special_chars ($string, ENT_COMPAT, $charset);
 
     /* controls cpu and memory intensive decoding cycles */
     if (! isset($aggressive_decoding) || $aggressive_decoding=="" ) {