// Copyright: Copyright (C) 2002-2008 Pristine Communications

// Set the include path
if (!defined("INCPATH_SET")) {
    require_once dirname(__FILE__) . "/incpath.inc.php";
}
// Referenced subroutines
require_once "monica/errhndl.inc.php";
require_once "monica/getlang.inc.php";
require_once "monica/htmlchar.inc.php";
require_once "monica/lninfo.inc.php";

// Settings
// The DBM directory defaults to a machine-type specific directory
// three levels above this file, unless DBMDIR is already defined
if (!defined("DBMDIR")) {
    define("DBMDIR", dirname(dirname(dirname(__FILE__))) . "/" . php_uname("m"));
}
// Regular expression fragment (without delimiters) that matches
// an HTML character entity reference, like "&nbsp;" or "&sup2;"
define("UNICODE_HTML_CHAR_ENT_REF", "&[A-Za-z]{2,8}\\d{0,2};");
define("_UNICODE_ENT2U8_DB", DBMDIR . "/ent2u8.db");
define("_UNICODE_U82ENT_DB", DBMDIR . "/u82ent.db");
define("_UNICODE_HNC2HCE_DB", DBMDIR . "/hnc2hce.db");
define("UNICODE_NO_HCEREF", true);
// Character set conversion maps -
//   Obtain them with the above mb_encode_numericentity_cnvtmap().
$CNVTMAP = array();
// Character set conversion map definitions are in separate files
// and are loaded only when they are used, to reduce the size of data
// to be loaded into the memory.  Look into the cnvtmap/ subdirectory
// for these maps.

// in_charset: If a piece of text is in a certain character set
//   Input:  $text, the text to check.  It is converted from UTF-8.
//           $charset, the name of the target character set.
//   Return: true if no error message was recorded in
//           $GLOBALS["php_errormsg"] while converting, false otherwise.
//   NOTE(review): this relies on null_error_handler() (not defined in
//   this file) recording the error in $GLOBALS["php_errormsg"] -- confirm.
function in_charset($text, $charset)
{
    $GLOBALS["php_errormsg"] = null;
    set_error_handler("null_error_handler");
    // Only the success of the conversion matters; the result is discarded
    iconv("UTF-8", $charset, $text);
    restore_error_handler();
    return is_null($GLOBALS["php_errormsg"]);
}

// is_usascii: If a piece of text is US-ASCII
//   Return: false if the text contains any byte in 0x80-0xFF,
//           true otherwise.
function is_usascii($text)
{
    return preg_match("/[\\x80-\\xFF]/", $text)? false: true;
}

// is_usascii_printable: If a piece of text is US-ASCII printable
//   Input:  $a, a piece of text, or an array.  For an array, every key
//           and every value is checked recursively.
//   Return: true if the text is non-empty and consists only of bytes
//           in 0x20-0x7E, false otherwise.  An empty array is true.
function is_usascii_printable($a)
{
    // An array
    if (is_array($a)) {
        foreach ($a as $k => $v) {
            if (!is_usascii_printable($k) || !is_usascii_printable($v)) {
                return false;
            }
        }
        return true;
    }

    // A piece of text
    // The "D" modifier anchors "$" at the very end of the text.  Without it
    // "$" also matches before a trailing newline, so that "abc\n" would be
    // wrongly reported as printable.
    return preg_match("/^[\\x20-\\x7E]+$/D", $a)? true: false;
}

// is_usascii_printable_text: If a piece of multi-line text is US-ASCII printable
//   Input:  $a, a piece of text, or an array.  For an array, every key
//           and every value is checked recursively.
//   Return: true if the text is non-empty and consists only of bytes
//           in 0x20-0x7E and white spaces, false otherwise.
//           An empty array is true.
function is_usascii_printable_text($a)
{
    // An array
    if (is_array($a)) {
        foreach ($a as $k => $v) {
            if (!is_usascii_printable_text($k) || !is_usascii_printable_text($v)) {
                return false;
            }
        }
        return true;
    }

    // A piece of text
    return preg_match("/^[\\x20-\\x7E\s]+$/", $a)? true: false;
}

// is_valid_unicode: If a piece of text is valid in unicode
//   Return: true if encoding the text with the "invalid" conversion map
//           leaves it unchanged, false otherwise.
function is_valid_unicode($text)
{
    // Try to encode the invalid characters and see if there is any
    global $CNVTMAP;
    require_once "monica/cnvtmap/invalid.inc.php";
    return $text == mb_encode_numericentity($text, $CNVTMAP["invalid"], "UTF-8");
}

/////////////////////////
// Traditional-Simplified Chinese conversion
/////////////////////////
// trad_to_simp: Convert Traditional Chinese to Simplified Chinese
//   The input is treated as UTF-8.  It goes through Big5 and GB2312 and
//   is decoded by h_decode() at the end.
function trad_to_simp($a)
{
    // Encode Big5 characters that are not available in GB2312
    global $CNVTMAP;
    require_once "monica/cnvtmap/trad_to_simp.inc.php";
    $a = mb_encode_numericentity($a, $CNVTMAP["trad_to_simp"], "UTF-8");
    // UTF-8 -> Big5 octets -> GB2312 octets -> UTF-8
    $a = h_encode($a, "Big5");
    $a = iconv("Big5", "GB2312", $a);
    $a = h_decode($a, "GB2312");
    return $a;
}

// simp_to_trad: Convert Simplified Chinese to Traditional Chinese
//   The input is treated as UTF-8.  It goes through GB2312 and Big5 and
//   is decoded by h_decode() at the end.
function simp_to_trad($a)
{
    // Encode GB2312 characters that are not available in Big5
    global $CNVTMAP;
    require_once "monica/cnvtmap/simp_to_trad.inc.php";
    $a = mb_encode_numericentity($a, $CNVTMAP["simp_to_trad"], "UTF-8");
    // UTF-8 -> GB2312 octets -> Big5 octets -> UTF-8
    $a = h_encode($a, "GB2312");
    $a = iconv("GB2312", "Big5", $a);
    $a = h_decode($a, "Big5");
    return $a;
}

/////////////////////////
// Dealing with encodings and HTML character references
//   Refer to HTML 4.01 specification
/////////////////////////
// h_encode: Encode UTF-8 text to octets with HTML character entity references
//   Input:  $html, the UTF-8 text.
//           $charset, the destination character set.  Default to
//           getlang(LN_CHARSET).
//           $nohce, true to keep the numeric character references
//           instead of converting them with text_hnc2hce().
//   Return: the text unchanged if the destination is UTF-8, or the
//           result of iconv() otherwise.
//   NOTE(review): $charset is used to build the path of the conversion
//   map to be loaded.  It must never come from untrusted input.
function h_encode($html, $charset = null, $nohce = false)
{
    // Default destination character set to the current character set
    if (is_null($charset)) {
        $charset = getlang(LN_CHARSET);
    }
    // Destination is UTF-8 -- Conversion is not needed
    if ($charset == "UTF-8") {
        return $html;
    }

    // Load the conversion map
    global $CNVTMAP;
    require_once "monica/cnvtmap/$charset.inc.php";

    // Convert it and return
    // Preserve the original timeout, and disable it during the conversion
    $timeout = ini_get("max_execution_time");
    ini_set("max_execution_time", 0);
    // Encode the characters listed in the conversion map
    // as numeric character references
    $encoded = mb_encode_numericentity($html, $CNVTMAP[$charset], "UTF-8");
    // Prefer character entity references unless told not to
    if (!$nohce) {
        $encoded = text_hnc2hce($encoded);
    }
    $r = iconv("UTF-8", $charset, $encoded);
    // Restore the timeout
    ini_set("max_execution_time", $timeout);

    return $r;
}

// h_decode: Decode octets and HTML character references to UTF-8 text
//   The output is in UTF-8
//   Input:  $html, the octets to decode.
//           $charset, the source character set.  Default to
//           getlang(LN_CHARSET).
//   Return: the decoded UTF-8 text, or null if an error message was
//           recorded in $GLOBALS["php_errormsg"] while converting.
function h_decode($html, $charset = null)
{
    // Default to the current character set
    if (is_null($charset)) {
        $charset = getlang(LN_CHARSET);
    }

    // Whether source is in UTF-8 or not does not matter.
    // We still have to check if it is really in UTF-8, and decode
    // the HTML character references.

    // Try to decode with the specified encoding
    $GLOBALS["php_errormsg"] = null;
    set_error_handler("null_error_handler");
    $html = iconv($charset, "UTF-8", $html);
    restore_error_handler();
    // Wrong encoding
    if (!is_null($GLOBALS["php_errormsg"])) {
        return null;
    }

    // Decode the HTML character references
    return a_hcref2char($html);
}

// page_encode: h_encode() an HTML page
//   Input:  the same as h_encode().
//   Return: the encoded page, or null if h_encode() returned null.
function page_encode($html, $charset = null, $nohce = false)
{
    // Default to the current character set
    if (is_null($charset)) {
        $charset = getlang(LN_CHARSET);
    }
    $html = h_encode($html, $charset, $nohce);
    if (is_null($html)) {
        return null;
    }
    // NOTE(review): the search string is empty here, so that this
    // replacement changes nothing.  Presumably a character set placeholder
    // was lost from this source -- confirm against the original file.
    $html = str_replace("", h($charset), $html);
    return $html;
}

// a_hcref2char: Decode HTML character entity references in a piece of text
//               to its corresponding characters, in UTF-8
//   The input and output should both be in UTF-8
//   It preserves encoded US-ASCII characters.  US-ASCII characters do
//   not need to be encoded.  There must be some reason to encode them.
//   (like &#64; for @, &lt; for <, &gt; for >, etc.)
//   Numeric character references that do not denote a valid Unicode
//   character (above U+10FFFF, or a surrogate) are left untouched.
function a_hcref2char($a)
{
    // Named callbacks are used instead of create_function(), which was
    // deprecated in PHP 7.2 and removed in PHP 8.0.

    // Numeric character references (decimal)
    $a = preg_replace_callback("/&#(\d{1,10});/",
        "_unicode_dec_hncref2char", $a);
    // Numeric character references (hexadecimal)
    $a = preg_replace_callback("/&#x([0-9a-f]{1,8});/i",
        "_unicode_hex_hncref2char", $a);
    // Character entity references
    $a = preg_replace_callback("/" . UNICODE_HTML_CHAR_ENT_REF . "/",
        "_unicode_hceref2char_preserve", $a);
    return $a;
}

// _unicode_dec_hncref2char: preg_replace_callback() callback that decodes
//                           a decimal numeric character reference
//   Input:  $m, the matches.  $m[0] is the whole reference, and $m[1]
//           is the decimal digits.
//   Return: the decoded UTF-8 character, or $m[0] untouched.
function _unicode_dec_hncref2char($m)
{
    // Use a float, so that 10 digits do not overflow a 32-bit integer
    return _unicode_codepoint2char((float) $m[1], $m[0]);
}

// _unicode_hex_hncref2char: preg_replace_callback() callback that decodes
//                           a hexadecimal numeric character reference
//   Input:  $m, the matches.  $m[0] is the whole reference, and $m[1]
//           is the hexadecimal digits.
//   Return: the decoded UTF-8 character, or $m[0] untouched.
function _unicode_hex_hncref2char($m)
{
    return _unicode_codepoint2char(hexdec($m[1]), $m[0]);
}

// _unicode_hceref2char_preserve: preg_replace_callback() callback that
//                                decodes a character entity reference
//   Input:  $m, the matches.  $m[0] is the whole reference.
//   Return: the result of hceref2char(), or $m[0] untouched if that
//           result is a single US-ASCII printable character.
function _unicode_hceref2char_preserve($m)
{
    $c = hceref2char($m[0]);
    return (mb_strlen($c) > 1? $c: (!is_usascii_printable($c)? $c: $m[0]));
}

// _unicode_codepoint2char: Convert a code point to its UTF-8 character
//   Input:  $codepoint, the code point, as an integer or a float.
//           $ref, the original reference, to be returned when the
//           code point is not to be decoded.
//   Return: the UTF-8 character, or $ref when the code point is not
//           a valid Unicode character, when the conversion fails, or
//           when the character is US-ASCII printable.
function _unicode_codepoint2char($codepoint, $ref)
{
    // pack("V") keeps only the lowest 32 bits, so that a larger number
    // would be decoded as an unrelated character.  Surrogates cannot be
    // converted, and the reference would be replaced with nothing.
    if (    $codepoint > 0x10FFFF
            || ($codepoint >= 0xD800 && $codepoint <= 0xDFFF)) {
        return $ref;
    }
    $c = iconv("UTF-32LE", "UTF-8", pack("V", (int) $codepoint));
    // Conversion failed -- do not lose the reference
    if ($c === false) {
        return $ref;
    }
    return !is_usascii_printable($c)? $c: $ref;
}

// hceref2char: Decode an HTML character entity reference
//              to its corresponding character, in UTF-8
//   The output is in UTF-8
//   Input:  $hceref, the character entity reference, used as the key
//           to look up in the database.
//   Return: the value found in the database, or $hceref untouched
//           if it is not found.
//   NOTE(review): the result of dba_open() is not checked for failure.
function hceref2char($hceref)
{
    // Cache the result
    static $cache = array();
    // Return the cache
    if (array_key_exists($hceref, $cache)) {
        return $cache[$hceref];
    }

    // The database is opened only once, on the first cache miss
    static $ENT2U8;
    // Open the character entity reference mapping database
    if (!isset($ENT2U8)) {
        $ENT2U8 = dba_open(_UNICODE_ENT2U8_DB, "r", "gdbm");
    }
    // Look for it
    $cache[$hceref] = dba_exists($hceref, $ENT2U8)?
        dba_fetch($hceref, $ENT2U8): $hceref;
    return $cache[$hceref];
}

// text_hnc2hce: Convert HTML numeric character references
//               to HTML character entity references
//               in a piece of text
//   Both decimal and hexadecimal numeric character references are
//   handed to _unicode_char_hnc2hce().
function text_hnc2hce($text)
{
    return preg_replace_callback("/&#(?:\d{1,10}|x[0-9a-f]{1,8});/i",
        "_unicode_char_hnc2hce", $text);
}

// _unicode_char_hnc2hce: Convert an HTML numeric character reference
//                        to an HTML character entity reference
//   Input:  $m, the matches.  $m[0] is the whole reference, used as
//           the key to look up in the database.
//   Return: the value found in the database, or $m[0] untouched
//           if it is not found.
//   NOTE(review): the result of dba_open() is not checked for failure.
function _unicode_char_hnc2hce($m)
{
    // The database is opened only once, on the first call
    static $HNC2HCE;
    // Open the character entity reference mapping database
    if (!isset($HNC2HCE)) {
        $HNC2HCE = dba_open(_UNICODE_HNC2HCE_DB, "r", "gdbm");
    }
    // Found
    if (dba_exists($m[0], $HNC2HCE)) {
        return dba_fetch($m[0], $HNC2HCE);
    }
    // Not found -- return untouched
    return $m[0];
}

?>