| 1 | <?php |
|---|
| 2 | |
|---|
| 3 | /* |
|---|
| 4 | +-----------------------------------------------------------------------+ |
|---|
| 5 | | program/include/rcube_charset.php | |
|---|
| 6 | | | |
|---|
| 7 | | This file is part of the Roundcube Webmail client | |
|---|
| 8 | | Copyright (C) 2005-2012, The Roundcube Dev Team | |
|---|
| 9 | | Copyright (C) 2011-2012, Kolab Systems AG | |
|---|
| 10 | | Copyright (C) 2000 Edmund Grimley Evans <edmundo@rano.org> | |
|---|
| 11 | | | |
|---|
| 12 | | Licensed under the GNU General Public License version 3 or | |
|---|
| 13 | | any later version with exceptions for skins & plugins. | |
|---|
| 14 | | See the README file for a full license statement. | |
|---|
| 15 | | | |
|---|
| 16 | | PURPOSE: | |
|---|
| 17 | | Provide charset conversion functionality | |
|---|
| 18 | | | |
|---|
| 19 | +-----------------------------------------------------------------------+ |
|---|
| 20 | | Author: Thomas Bruederli <roundcube@gmail.com> | |
|---|
| 21 | | Author: Aleksander Machniak <alec@alec.pl> | |
|---|
| 22 | +-----------------------------------------------------------------------+ |
|---|
| 23 | |
|---|
| 24 | $Id$ |
|---|
| 25 | |
|---|
| 26 | */ |
|---|
| 27 | |
|---|
| 28 | /** |
|---|
| 29 | * Character sets conversion functionality |
|---|
| 30 | * |
|---|
| 31 | * @package Core |
|---|
| 32 | * @author Thomas Bruederli <roundcube@gmail.com> |
|---|
| 33 | * @author Aleksander Machniak <alec@alec.pl> |
|---|
| 34 | * @author Edmund Grimley Evans <edmundo@rano.org> |
|---|
| 35 | */ |
|---|
| 36 | class rcube_charset |
|---|
| 37 | { |
|---|
/**
 * Charset alias map, looked up by parse() after the name has been
 * upper-cased and reduced to A-Z and 0-9. Each key is such a reduced
 * name, each value the charset name to use instead.
 * Some of the aliases are taken from the HTML5 spec.
 *
 * @var array
 */
public static $aliases = array(
    'USASCII'       => 'WINDOWS-1252',
    'ANSIX31101983' => 'WINDOWS-1252',
    'ANSIX341968'   => 'WINDOWS-1252',
    'UNKNOWN8BIT'   => 'ISO-8859-15',
    'UNKNOWN'       => 'ISO-8859-15',
    'USERDEFINED'   => 'ISO-8859-15',
    'KSC56011987'   => 'EUC-KR',
    'GB2312'        => 'GBK',
    'GB231280'      => 'GBK',
    'UNICODE'       => 'UTF-8',
    'UTF7IMAP'      => 'UTF7-IMAP',
    'TIS620'        => 'WINDOWS-874',
    'ISO88599'      => 'WINDOWS-1254',
    'ISO885911'     => 'WINDOWS-874',
    'MACROMAN'      => 'MACINTOSH',
    '77'            => 'MAC',
    '128'           => 'SHIFT-JIS',
    '129'           => 'CP949',
    '130'           => 'CP1361',
    '134'           => 'GBK',
    '136'           => 'BIG5',
    '161'           => 'WINDOWS-1253',
    '162'           => 'WINDOWS-1254',
    '163'           => 'WINDOWS-1258',
    '177'           => 'WINDOWS-1255',
    '178'           => 'WINDOWS-1256',
    '186'           => 'WINDOWS-1257',
    '204'           => 'WINDOWS-1251',
    '222'           => 'WINDOWS-874',
    '238'           => 'WINDOWS-1250',
    'MS950'         => 'CP950',
    'WINDOWS949'    => 'UHC',
);
|---|
| 73 | |
|---|
| 74 | |
|---|
/**
 * Error handler callback which turns a PHP error into an exception.
 *
 * @param int    $errno  Level (severity) of the raised error
 * @param string $errstr Error message
 *
 * @throws ErrorException Always; carries the message, code 0 and the severity
 */
public static function error_handler($errno, $errstr)
{
    $code      = 0;
    $exception = new ErrorException($errstr, $code, $errno);

    throw $exception;
}
|---|
| 85 | |
|---|
| 86 | |
|---|
/**
 * Parse and validate charset name string (see #1485758).
 * Sometimes charset string is malformed, there are also charset aliases
 * but we need strict names for charset conversion (specially utf8 class).
 *
 * Results are memoized per input string for the lifetime of the request.
 *
 * @param string $input Input charset name
 *
 * @return string|null The validated charset name, null for 'BINARY'
 */
public static function parse($input)
{
    static $charsets = array();

    $key = (string) $input;

    // array_key_exists() instead of isset(): the cached result for
    // 'BINARY' is null and must still count as a cache hit
    if (array_key_exists($key, $charsets)) {
        return $charsets[$key];
    }

    $charset = strtoupper($key);

    $charset = preg_replace(array(
        '/^[^0-9A-Z]+/',    // e.g. _ISO-8859-JP$SIO
        '/\$.*$/',          // e.g. _ISO-8859-JP$SIO
        '/UNICODE-1-1-*/',  // RFC1641/1642
        '/^X-/',            // X- prefix (e.g. X-ROMAN8 => ROMAN8)
    ), '', $charset);

    if ($charset == 'BINARY') {
        return $charsets[$key] = null;
    }

    // allow A-Z and 0-9 only
    $str = preg_replace('/[^A-Z0-9]/', '', $charset);

    if (isset(self::$aliases[$str])) {
        $result = self::$aliases[$str];
    }
    // UTF
    else if (preg_match('/U[A-Z][A-Z](7|8|16|32)(BE|LE)*/', $str, $m)) {
        // the endianness suffix is optional; preg_match() leaves $m[2]
        // unset when that group did not take part in the match
        $result = 'UTF-' . $m[1] . (isset($m[2]) ? $m[2] : '');
    }
    // ISO-8859
    else if (preg_match('/ISO8859([0-9]{0,2})/', $str, $m)) {
        $iso = 'ISO-8859-' . ($m[1] ? $m[1] : 1);
        // some clients send windows-1252 text as latin1,
        // it is safe to use windows-1252 for all latin1
        $result = $iso == 'ISO-8859-1' ? 'WINDOWS-1252' : $iso;
    }
    // handle broken charset names e.g. WINDOWS-1250HTTP-EQUIVCONTENT-TYPE
    else if (preg_match('/(WIN|WINDOWS)([0-9]+)/', $str, $m)) {
        $result = 'WINDOWS-' . $m[2];
    }
    // LATIN
    else if (preg_match('/LATIN(.*)/', $str, $m)) {
        // LATIN<suffix> => part number of the matching ISO-8859-<n> charset
        $latin_map = array('2' => 2, '3' => 3, '4' => 4, '5' => 9, '6' => 10,
            '7' => 13, '8' => 14, '9' => 15, '10' => 16,
            'ARABIC' => 6, 'CYRILLIC' => 5, 'GREEK' => 7, 'GREEK1' => 7, 'HEBREW' => 8
        );

        // some clients send windows-1252 text as latin1,
        // it is safe to use windows-1252 for all latin1
        if ($m[1] == 1) {
            $result = 'WINDOWS-1252';
        }
        // if iconv is not supported we need ISO labels, it's also safe for iconv
        else if (!empty($latin_map[$m[1]])) {
            $result = 'ISO-8859-' . $latin_map[$m[1]];
        }
        // iconv requires conversion of e.g. LATIN-1 to LATIN1
        else {
            $result = $str;
        }
    }
    else {
        $result = $charset;
    }

    $charsets[$key] = $result;

    return $result;
}
|---|
| 166 | |
|---|
| 167 | |
|---|
/**
 * Convert a string from one charset to another.
 * Tries iconv first, then mbstring, then the bundled converters.
 * When none of them succeeds the input string is returned unchanged.
 *
 * @param string $str  Input string
 * @param string $from Suspected charset of the input string
 * @param string $to   Target charset to convert to; defaults to RCMAIL_CHARSET
 *
 * @return string Converted string
 */
public static function convert($str, $from, $to = null)
{
    static $iconv_options   = null;
    static $mbstring_loaded = null;
    static $mbstring_list   = null;
    static $conv            = null;

    $to   = empty($to) ? strtoupper(RCMAIL_CHARSET) : self::parse($to);
    $from = self::parse($from);

    // nothing to do: same charset, empty input or unknown/binary source
    if ($from == $to || empty($str) || empty($from)) {
        return $str;
    }

    // convert charset using iconv module
    if (function_exists('iconv') && $from != 'UTF7-IMAP' && $to != 'UTF7-IMAP') {
        if ($iconv_options === null) {
            // ignore characters not available in output charset
            $iconv_options = '//IGNORE';
            if (iconv('', $iconv_options, '') === false) {
                // iconv implementation does not support options
                $iconv_options = '';
            }
        }

        // throw an exception if iconv reports an illegal character in input
        // it means that input string has been truncated
        set_error_handler(array('rcube_charset', 'error_handler'), E_NOTICE);
        try {
            $_iconv = iconv($from, $to . $iconv_options, $str);
        } catch (ErrorException $e) {
            $_iconv = false;
        }
        restore_error_handler();

        if ($_iconv !== false) {
            return $_iconv;
        }
    }

    if ($mbstring_loaded === null) {
        $mbstring_loaded = extension_loaded('mbstring');
    }

    // convert charset using mbstring module
    if ($mbstring_loaded) {
        // charset names that have to be handed to mbstring under another name
        $mb_aliases = array('WINDOWS-1257' => 'ISO-8859-13');

        if ($mbstring_list === null) {
            $mbstring_list = mb_list_encodings();
            $mbstring_list = array_map('strtoupper', $mbstring_list);
        }

        // isset() guards: most charsets have no entry in the alias map
        $mb_from = isset($mb_aliases[$from]) ? $mb_aliases[$from] : $from;
        $mb_to   = isset($mb_aliases[$to]) ? $mb_aliases[$to] : $to;

        // return if encoding found, string matches encoding and convert succeeded
        if (in_array($mb_from, $mbstring_list) && in_array($mb_to, $mbstring_list)) {
            if (mb_check_encoding($str, $mb_from) && ($out = mb_convert_encoding($str, $mb_to, $mb_from))) {
                return $out;
            }
        }
    }

    // convert charset using bundled classes/functions
    if ($to == 'UTF-8') {
        if ($from == 'UTF7-IMAP') {
            if ($_str = self::utf7imap_to_utf8($str)) {
                return $_str;
            }
        }
        else if ($from == 'UTF-7') {
            if ($_str = self::utf7_to_utf8($str)) {
                return $_str;
            }
        }
        else if ($from == 'ISO-8859-1' && function_exists('utf8_encode')) {
            return utf8_encode($str);
        }
        else if (class_exists('utf8')) {
            // the converter object is shared between calls; (re)load the
            // table of the source charset
            if (!$conv) {
                $conv = new utf8($from);
            }
            else {
                $conv->loadCharset($from);
            }

            if ($_str = $conv->strToUtf8($str)) {
                return $_str;
            }
        }
    }

    // encode string for output
    if ($from == 'UTF-8') {
        // @TODO: we need a function for UTF-7 (RFC2152) conversion
        if ($to == 'UTF7-IMAP' || $to == 'UTF-7') {
            if ($_str = self::utf8_to_utf7imap($str)) {
                return $_str;
            }
        }
        else if ($to == 'ISO-8859-1' && function_exists('utf8_decode')) {
            return utf8_decode($str);
        }
        else if (class_exists('utf8')) {
            // here the input is UTF-8, so the table to load is the one of
            // the TARGET charset and the direction is UTF-8 => charset
            // (utf8ToStr() is the counterpart of strToUtf8() in the
            // bundled utf8 class)
            if (!$conv) {
                $conv = new utf8($to);
            }
            else {
                $conv->loadCharset($to);
            }

            if ($_str = $conv->utf8ToStr($str)) {
                return $_str;
            }
        }
    }

    // return original string
    return $str;
}
|---|
| 299 | |
|---|
| 300 | |
|---|
/**
 * Converts string from standard UTF-7 (RFC 2152) to UTF-8.
 *
 * A '+' starts a base64 run holding big-endian UTF-16 data. A '-' right
 * after the run is absorbed; any other terminating character is kept as
 * ordinary text. "+-" stands for a literal '+'. A '+' followed by neither
 * base64 data nor '-' is ill-formed and the '+' is dropped.
 *
 * @param string $str Input string (UTF-7)
 *
 * @return string Converted string (UTF-8)
 */
public static function utf7_to_utf8($str)
{
    // Indexed by ASCII code: 1 for characters of the base64 alphabet
    // (A-Z, a-z, 0-9, '+' and '/'), 0 for everything else
    $Index_64 = array(
        0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
        0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0,
        0,0,0,0, 0,0,0,0, 0,0,0,1, 0,0,0,1,
        1,1,1,1, 1,1,1,1, 1,1,0,0, 0,0,0,0,
        0,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1, 1,1,1,0, 0,0,0,0,
        0,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1, 1,1,1,0, 0,0,0,0,
    );

    $str = strval($str);
    $len = strlen($str);
    $res = '';
    $i   = 0;

    while ($i < $len) {
        $u7 = $str[$i++];

        // directly encoded character
        if ($u7 !== '+') {
            $res .= $u7;
            continue;
        }

        // collect the base64 run that follows the '+' shift character
        $start = $i;
        while ($i < $len) {
            $code = ord($str[$i]);
            // bytes >= 0x80 are outside of the table and never base64
            if ($code > 0x7F || !$Index_64[$code]) {
                break;
            }
            $i++;
        }
        $ch = (string) substr($str, $start, $i - $start);

        // only an explicit '-' terminator is consumed, any other
        // character is handled as plain text by the next iteration
        $dash = $i < $len && $str[$i] === '-';
        if ($dash) {
            $i++;
        }

        if ($ch === '') {
            // "+-" is the escape sequence of a literal '+'
            if ($dash) {
                $res .= '+';
            }

            continue;
        }

        $res .= self::utf16_to_utf8(base64_decode($ch));
    }

    return $res;
}
|---|
| 359 | |
|---|
| 360 | |
|---|
/**
 * Converts string from UTF-16 to UTF-8 (helper for utf-7 to utf-8 conversion)
 *
 * The input is read as big-endian 16-bit units. A surrogate pair is
 * combined into one 4-byte UTF-8 sequence; a trailing odd byte is ignored.
 * U+0000 is written as the two bytes C0 80 (not as a NUL byte).
 *
 * @param string $str Input string
 *
 * @return string The converted string
 */
public static function utf16_to_utf8($str)
{
    $len = strlen($str);
    $dec = '';

    // "$i + 1 < $len": decode complete 16-bit units only
    for ($i = 0; $i + 1 < $len; $i += 2) {
        $c = ord($str[$i]) << 8 | ord($str[$i + 1]);

        // high surrogate directly followed by a low surrogate:
        // merge both units into a single code point above U+FFFF
        if ($c >= 0xD800 && $c <= 0xDBFF && $i + 3 < $len) {
            $low = ord($str[$i + 2]) << 8 | ord($str[$i + 3]);
            if ($low >= 0xDC00 && $low <= 0xDFFF) {
                $c  = 0x10000 + (($c - 0xD800) << 10) + ($low - 0xDC00);
                $i += 2;
            }
        }

        if ($c >= 0x0001 && $c <= 0x007F) {
            $dec .= chr($c);
        }
        else if ($c > 0xFFFF) {
            $dec .= chr(0xF0 | (($c >> 18) & 0x07));
            $dec .= chr(0x80 | (($c >> 12) & 0x3F));
            $dec .= chr(0x80 | (($c >> 6) & 0x3F));
            $dec .= chr(0x80 | (($c >> 0) & 0x3F));
        }
        else if ($c > 0x07FF) {
            $dec .= chr(0xE0 | (($c >> 12) & 0x0F));
            $dec .= chr(0x80 | (($c >> 6) & 0x3F));
            $dec .= chr(0x80 | (($c >> 0) & 0x3F));
        }
        else {
            $dec .= chr(0xC0 | (($c >> 6) & 0x1F));
            $dec .= chr(0x80 | (($c >> 0) & 0x3F));
        }
    }

    return $dec;
}
|---|
| 391 | |
|---|
| 392 | |
|---|
/**
 * Convert the data ($str) from RFC 2060's UTF-7 to UTF-8.
 * If input data is invalid, return an empty string.
 * RFC 2060 obviously intends the encoding to be unique (see
 * point 5 in section 5.1.3), so we reject any non-canonical
 * form, such as &ACY- (instead of &-) or &AMA-&AMA- (instead
 * of &AMAAwA-).
 *
 * Translated from C to PHP by Thomas Bruederli <roundcube@gmail.com>
 *
 * @param string $str Input string (UTF7-IMAP)
 *
 * @return string Output string (UTF-8), empty string on invalid input
 */
public static function utf7imap_to_utf8($str)
{
    // Indexed by ASCII code: 6-bit value of the character in the
    // modified base64 alphabet (A-Z, a-z, 0-9, '+', ','), -1 otherwise
    $Index_64 = array(
        -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
        -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
        -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,62, 63,-1,-1,-1,
        52,53,54,55, 56,57,58,59, 60,61,-1,-1, -1,-1,-1,-1,
        -1, 0, 1, 2,  3, 4, 5, 6,  7, 8, 9,10, 11,12,13,14,
        15,16,17,18, 19,20,21,22, 23,24,25,-1, -1,-1,-1,-1,
        -1,26,27,28, 29,30,31,32, 33,34,35,36, 37,38,39,40,
        41,42,43,44, 45,46,47,48, 49,50,51,-1, -1,-1,-1,-1
    );

    $str   = strval($str);
    $u7len = strlen($str);
    $p     = '';
    $err   = '';

    // $i is the read position, $u7len the number of bytes left from $i on
    for ($i = 0; $u7len > 0; $i++, $u7len--) {
        $u7 = $str[$i];
        if ($u7 == '&') {
            $i++;
            $u7len--;

            // a '&' as the very last byte leaves nothing to read here;
            // the termination check below then rejects the input
            $u7 = $u7len > 0 ? $str[$i] : '';

            // "&-" is the escape sequence of a literal '&'
            if ($u7len && $u7 == '-') {
                $p .= '&';
                continue;
            }

            // $ch collects the bits of the current 16-bit unit,
            // $k is the shift for the next 6-bit group
            $ch = 0;
            $k  = 10;
            for (; $u7len > 0; $i++, $u7len--) {
                $u7 = $str[$i];

                if ((ord($u7) & 0x80) || ($b = $Index_64[ord($u7)]) == -1) {
                    break;
                }

                if ($k > 0) {
                    $ch |= $b << $k;
                    $k  -= 6;
                }
                else {
                    // this group completes the 16-bit unit
                    $ch |= $b >> (-$k);
                    if ($ch < 0x80) {
                        // Printable US-ASCII
                        if (0x20 <= $ch && $ch < 0x7f) {
                            return $err;
                        }
                        $p .= chr($ch);
                    }
                    else if ($ch < 0x800) {
                        $p .= chr(0xc0 | ($ch >> 6));
                        $p .= chr(0x80 | ($ch & 0x3f));
                    }
                    else {
                        $p .= chr(0xe0 | ($ch >> 12));
                        $p .= chr(0x80 | (($ch >> 6) & 0x3f));
                        $p .= chr(0x80 | ($ch & 0x3f));
                    }

                    // leftover bits of this group start the next unit
                    $ch = ($b << (16 + $k)) & 0xffff;
                    $k += 10;
                }
            }

            // Non-zero or too many extra bits
            if ($ch || $k < 6) {
                return $err;
            }

            // BASE64 not properly terminated
            if (!$u7len || $u7 != '-') {
                return $err;
            }

            // Adjacent BASE64 sections
            if ($u7len > 2 && $str[$i+1] == '&' && $str[$i+2] != '-') {
                return $err;
            }
        }
        // Not printable US-ASCII
        else if (ord($u7) < 0x20 || ord($u7) >= 0x7f) {
            return $err;
        }
        else {
            $p .= $u7;
        }
    }

    return $p;
}
|---|
| 500 | |
|---|
| 501 | |
|---|
/**
 * Convert the data ($str) from UTF-8 to RFC 2060's UTF-7.
 * Unicode characters above U+FFFF are replaced by U+FFFE.
 * If input data is invalid, return an empty string.
 *
 * Translated from C to PHP by Thomas Bruederli <roundcube@gmail.com>
 *
 * @param string $str Input string (UTF-8)
 *
 * @return string Output string (UTF7-IMAP)
 */
public static function utf8_to_utf7imap($str)
{
    // modified base64 alphabet: position in the string = 6-bit value
    $alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,';
    $failure  = '';
    $total    = strlen($str);
    $pos      = 0;
    $out      = '';
    $shifted  = false;

    while ($pos < $total) {
        $lead = ord($str[$pos]);

        // lead byte => payload bits and number of continuation bytes
        if ($lead < 0x80) {
            $cp    = $lead;
            $extra = 0;
        }
        else if ($lead < 0xc2) {
            return $failure;
        }
        else if ($lead < 0xe0) {
            $cp    = $lead & 0x1f;
            $extra = 1;
        }
        else if ($lead < 0xf0) {
            $cp    = $lead & 0x0f;
            $extra = 2;
        }
        else if ($lead < 0xf8) {
            $cp    = $lead & 0x07;
            $extra = 3;
        }
        else if ($lead < 0xfc) {
            $cp    = $lead & 0x03;
            $extra = 4;
        }
        else if ($lead < 0xfe) {
            $cp    = $lead & 0x01;
            $extra = 5;
        }
        else {
            return $failure;
        }

        $pos++;

        // sequence is cut off by the end of the string
        if ($extra > $total - $pos) {
            return $failure;
        }

        // fold the continuation bytes into the code point
        for ($end = $pos + $extra; $pos < $end; $pos++) {
            $byte = ord($str[$pos]);
            if (($byte & 0xc0) != 0x80) {
                return $failure;
            }
            $cp = ($cp << 6) | ($byte & 0x3f);
        }

        // value too small for the length of its sequence
        if ($extra > 1 && !($cp >> ($extra * 5 + 1))) {
            return $failure;
        }

        if ($cp >= 0x20 && $cp < 0x7f) {
            // printable US-ASCII: leave base64 mode first, if active
            if ($shifted) {
                if ($bits > 10) {
                    $out .= $alphabet[$carry];
                }
                $out    .= '-';
                $shifted = false;
            }

            $out .= chr($cp);

            // a literal '&' is written as "&-"
            if ($cp == 0x26) {
                $out .= '-';
            }

            continue;
        }

        // everything else goes into a base64 section
        if (!$shifted) {
            $out    .= '&';
            $shifted = true;
            $carry   = 0;
            $bits    = 10;
        }

        if ($cp & ~0xffff) {
            $cp = 0xfffe;
        }

        // first group takes the bits carried over from the previous unit
        $out .= $alphabet[($carry | $cp >> $bits)];
        for ($bits -= 6; $bits >= 0; $bits -= 6) {
            $out .= $alphabet[(($cp >> $bits) & 0x3f)];
        }

        // bits which did not fill a whole group wait for the next unit
        $carry = ($cp << (-$bits)) & 0x3f;
        $bits += 16;
    }

    // close a base64 section that is still open
    if ($shifted) {
        if ($bits > 10) {
            $out .= $alphabet[$carry];
        }
        $out .= '-';
    }

    return $out;
}
|---|
| 631 | |
|---|
| 632 | |
|---|
/**
 * Guess the character set of a string using mb_detect_encoding().
 *
 * @param string $string   String.
 * @param string $failover Result to return when mbstring is not
 *                         available or nothing was detected.
 *
 * @return string Charset name
 */
public static function detect($string, $failover='')
{
    if (function_exists('mb_detect_encoding')) {
        // FIXME: the order is important, because sometimes
        // iso string is detected as euc-jp and etc.
        $candidates = array(
            'UTF-8', 'SJIS', 'BIG5', 'GB2312',
            'ISO-8859-1', 'ISO-8859-2', 'ISO-8859-3', 'ISO-8859-4',
            'ISO-8859-5', 'ISO-8859-6', 'ISO-8859-7', 'ISO-8859-8', 'ISO-8859-9',
            'ISO-8859-10', 'ISO-8859-13', 'ISO-8859-14', 'ISO-8859-15', 'ISO-8859-16',
            'WINDOWS-1252', 'WINDOWS-1251', 'EUC-JP', 'EUC-TW', 'KOI8-R',
            'ISO-2022-KR', 'ISO-2022-JP'
        );

        $detected = mb_detect_encoding($string, implode(',', $candidates));

        if ($detected) {
            return $detected;
        }
    }

    return $failover;
}
|---|
| 662 | |
|---|
| 663 | |
|---|
/**
 * Removes non-unicode characters (byte sequences which are not valid
 * UTF-8) from input.
 *
 * Arrays are processed recursively, key by key. Values that are neither
 * array nor string, and the empty string, are returned untouched.
 *
 * @param mixed $input String or array.
 *
 * @return mixed String or array
 */
public static function clean($input)
{
    // handle input of type array
    if (is_array($input)) {
        foreach ($input as $idx => $val) {
            $input[$idx] = self::clean($val);
        }
        return $input;
    }

    if (!is_string($input) || $input == '') {
        return $input;
    }

    // iconv/mbstring are much faster (especially with long strings)
    if (function_exists('mb_convert_encoding')) {
        // mbstring replaces invalid bytes with a substitute character
        // ('?' by default); turn that off for this call so they are
        // removed, then restore the previous setting
        $msch = mb_substitute_character();
        mb_substitute_character('none');
        $res = mb_convert_encoding($input, 'UTF-8', 'UTF-8');
        mb_substitute_character($msch);

        if ($res !== false) {
            return $res;
        }
    }

    if (function_exists('iconv')) {
        if (($res = @iconv('UTF-8', 'UTF-8//IGNORE', $input)) !== false) {
            return $res;
        }
    }

    // pure PHP fallback: copy ASCII bytes directly, collect every
    // multibyte sequence in $seq and keep it only if it is well-formed
    $seq    = '';
    $out    = '';
    $regexp = '/^('.
        // 1-byte characters (UTF8-1) never get here, see the loop below
        '|[\xC2-\xDF][\x80-\xBF]'.                      // UTF8-2
        '|\xE0[\xA0-\xBF][\x80-\xBF]'.                  // UTF8-3
        '|[\xE1-\xEC][\x80-\xBF][\x80-\xBF]'.           // UTF8-3
        '|\xED[\x80-\x9F][\x80-\xBF]'.                  // UTF8-3
        '|[\xEE-\xEF][\x80-\xBF][\x80-\xBF]'.           // UTF8-3
        '|\xF0[\x90-\xBF][\x80-\xBF][\x80-\xBF]'.       // UTF8-4
        '|[\xF1-\xF3][\x80-\xBF][\x80-\xBF][\x80-\xBF]'.// UTF8-4
        '|\xF4[\x80-\x8F][\x80-\xBF][\x80-\xBF]'.       // UTF8-4
        ')$/';

    for ($i = 0, $len = strlen($input); $i < $len; $i++) {
        $chr = $input[$i];
        $ord = ord($chr);

        // 1-byte character
        if ($ord <= 0x7F) {
            if ($seq) {
                $out .= preg_match($regexp, $seq) ? $seq : '';
            }
            $seq  = '';
            $out .= $chr;
        }
        // first (or second) byte of multibyte sequence
        else if ($ord >= 0xC0) {
            if (strlen($seq) > 1) {
                $out .= preg_match($regexp, $seq) ? $seq : '';
                $seq  = '';
            }
            else if ($seq && ord($seq) < 0xC0) {
                $seq = '';
            }
            $seq .= $chr;
        }
        // next byte of multibyte sequence
        else if ($seq) {
            $seq .= $chr;
        }
    }

    // sequence still pending at the end of the input
    if ($seq) {
        $out .= preg_match($regexp, $seq) ? $seq : '';
    }

    return $out;
}
|---|
| 747 | |
|---|
| 748 | } |
|---|