287 lines
8.9 KiB
PHP
287 lines
8.9 KiB
PHP
<?php
|
|
// File name: unicode.inc.php
|
|
// Description: PHP subroutine to handle unicode related functions
|
|
// Date: 2002-04-17
|
|
// Author: imacat <imacat@pristine.com.tw>
|
|
// Copyright: Copyright (C) 2002-2008 Pristine Communications
|
|
|
|
// Set the include path
// Load the include path setup that lives next to this file, unless the
// INCPATH_SET constant is already defined.
if (defined("INCPATH_SET") === false) {
    require_once dirname(__FILE__) . "/incpath.inc.php";
}
|
|
// Referenced subroutines
|
|
require_once "monica/errhndl.inc.php";
|
|
require_once "monica/getlang.inc.php";
|
|
require_once "monica/htmlchar.inc.php";
|
|
require_once "monica/lninfo.inc.php";
|
|
|
|
// Settings
// DBMDIR may be defined beforehand by the including script.  Otherwise it
// defaults to the machine-type subdirectory (php_uname("m")) of the
// directory two levels above the directory that contains this file.
defined("DBMDIR") or define("DBMDIR", dirname(dirname(dirname(__FILE__))) . "/" . php_uname("m"));
// Paths of the lookup databases kept under DBMDIR
define("_UNICODE_ENT2U8_DB", DBMDIR . "/ent2u8.db");
define("_UNICODE_U82ENT_DB", DBMDIR . "/u82ent.db");
define("_UNICODE_HNC2HCE_DB", DBMDIR . "/hnc2hce.db");
// Regular expression fragment (without delimiters) that matches one
// HTML character entity reference
define("UNICODE_HTML_CHAR_ENT_REF", "&[A-Za-z]{2,8}\\d{0,2};");
// Flag constant; it is not referenced inside this file.
// NOTE(review): presumably meant to be passed as the $nohce argument of
// h_encode() / page_encode() -- confirm against the callers.
define("UNICODE_NO_HCEREF", true);
|
|
|
|
// Character set conversion maps -
|
|
// Obtain them with the above mb_encode_numericentity_cnvtmap().
|
|
// Registry of conversion maps, indexed by map name.  The functions in this
// file read entries such as $CNVTMAP["invalid"], $CNVTMAP["trad_to_simp"],
// $CNVTMAP["simp_to_trad"] and $CNVTMAP[$charset] right after they
// require_once the matching file under monica/cnvtmap/ (presumably those
// files fill in the entry -- they are not visible here).
$CNVTMAP = array();
|
|
// Character set conversion map definitions are in separate files
|
|
// and are loaded only when it is used, to reduce the size of data
|
|
// to be loaded into the memory. Look into the cnvtmap/ subdirectory
|
|
// for these maps.
|
|
|
|
// in_charset: If a piece of text is in a certain character set
|
|
function in_charset($text, $charset)
{
    // in_charset: Tell whether a piece of UTF-8 text can be represented
    //             in a certain character set
    // Parameters:
    //   $text:    The text, in UTF-8
    //   $charset: The name of the target character set, as understood
    //             by iconv()
    // Return: true if iconv() converts the text without complaint,
    //         false otherwise

    // Forget any stale message, so that only this conversion counts
    $GLOBALS["php_errormsg"] = null;
    // Route the conversion notices to null_error_handler() (defined
    // elsewhere; presumably it records the message in
    // $GLOBALS["php_errormsg"] -- confirm in errhndl.inc.php)
    set_error_handler("null_error_handler");
    $converted = iconv("UTF-8", $charset, $text);
    restore_error_handler();
    // iconv() returns false when the conversion fails.  Check it
    // explicitly, so that the verdict does not depend only on the
    // error handler having recorded a message.
    if ($converted === false) {
        return false;
    }
    return is_null($GLOBALS["php_errormsg"]);
}
|
|
|
|
// is_usascii: If a piece of text is US-ASCII
|
|
function is_usascii($text)
{
    // Return: true if the text has no byte in the range 0x80-0xFF,
    //         false if there is at least one such byte

    // Any byte with the high bit set is outside of US-ASCII
    if (preg_match("/[\\x80-\\xFF]/", $text)) {
        return false;
    }
    return true;
}
|
|
|
|
// is_usascii_printable: If a piece of text is US-ASCII printable
|
|
function is_usascii_printable($a)
{
    // is_usascii_printable: Tell whether a piece of text consists only
    //                       of US-ASCII printable characters (0x20-0x7E)
    // Parameters:
    //   $a: A piece of text, or an array.  For an array, every key and
    //       every value is checked recursively.
    // Return: true if everything is printable, false otherwise.  An
    //         empty string is not printable; an empty array is.

    // An array
    if (is_array($a)) {
        foreach ($a as $k => $v) {
            if (!is_usascii_printable($k) || !is_usascii_printable($v)) {
                return false;
            }
        }
        return true;
    }

    // A piece of text
    // The D modifier makes "$" match only at the very end of the text.
    // Without it "$" also matches before a trailing newline, and text
    // like "abc\n" would be reported as printable.
    return preg_match("/^[\\x20-\\x7E]+$/D", $a) === 1;
}
|
|
|
|
// is_usascii_printable_text: If a piece of multi-line text is US-ASCII printable
|
|
function is_usascii_printable_text($a)
{
    // Parameters:
    //   $a: A piece of multi-line text, or an array.  For an array,
    //       every key and every value is checked recursively.
    // Return: true if the text consists only of US-ASCII printable
    //         characters (0x20-0x7E) and white space, false otherwise.
    //         An empty string gives false; an empty array gives true.

    // A piece of text
    if (!is_array($a)) {
        return preg_match("/^[\\x20-\\x7E\s]+$/", $a) === 1;
    }

    // An array -- fail on the first key or value that does not pass
    foreach ($a as $key => $value) {
        if (!is_usascii_printable_text($key)) {
            return false;
        }
        if (!is_usascii_printable_text($value)) {
            return false;
        }
    }
    return true;
}
|
|
|
|
// is_valid_unicode: If a piece of text is valid in unicode
|
|
function is_valid_unicode($text)
{
    // Return: true if encoding the text with the "invalid" conversion
    //         map leaves it unchanged, false if anything was encoded

    // The map is loaded on first use only.  It is required from inside
    // this function, hence the global declaration beforehand.
    global $CNVTMAP;
    require_once "monica/cnvtmap/invalid.inc.php";
    // Turn every character covered by the map into a numeric character
    // reference, and see whether that changed anything
    $encoded = mb_encode_numericentity($text, $CNVTMAP["invalid"], "UTF-8");
    return $encoded == $text;
}
|
|
|
|
|
|
/////////////////////////
|
|
// Traditional-Simplified Chinese conversion
|
|
/////////////////////////
|
|
// trad_to_simp: Convert Traditional Chinese to Simplified Chinese
|
|
function trad_to_simp($a)
{
    // Parameters:
    //   $a: The text in Traditional Chinese, in UTF-8
    // Return: The result of h_decode() on the converted text

    // Encode Big5 characters that are not available in GB2312 as
    // numeric character references, using the "trad_to_simp" map
    global $CNVTMAP;
    require_once "monica/cnvtmap/trad_to_simp.inc.php";
    $escaped = mb_encode_numericentity($a, $CNVTMAP["trad_to_simp"], "UTF-8");

    // UTF-8 -> Big5 octets -> GB2312 octets -> back to UTF-8
    $big5 = h_encode($escaped, "Big5");
    $gb2312 = iconv("Big5", "GB2312", $big5);
    return h_decode($gb2312, "GB2312");
}
|
|
|
|
// simp_to_trad: Convert Simplified Chinese to Traditional Chinese
|
|
function simp_to_trad($a)
{
    // Parameters:
    //   $a: The text in Simplified Chinese, in UTF-8
    // Return: The result of h_decode() on the converted text

    // Encode GB2312 characters that are not available in Big5 as
    // numeric character references, using the "simp_to_trad" map
    global $CNVTMAP;
    require_once "monica/cnvtmap/simp_to_trad.inc.php";
    $escaped = mb_encode_numericentity($a, $CNVTMAP["simp_to_trad"], "UTF-8");

    // UTF-8 -> GB2312 octets -> Big5 octets -> back to UTF-8
    $gb2312 = h_encode($escaped, "GB2312");
    $big5 = iconv("GB2312", "Big5", $gb2312);
    return h_decode($big5, "Big5");
}
|
|
|
|
|
|
/////////////////////////
|
|
// Dealing with encodings and HTML character references
|
|
// Refer to HTML 4.01 specification
|
|
/////////////////////////
|
|
// h_encode: Encode UTF-8 text to octets with HTML character entity references
|
|
function h_encode($html, $charset = null, $nohce = false)
{
    // Parameters:
    //   $html:    The text, in UTF-8
    //   $charset: The destination character set.  Defaults to
    //             getlang(LN_CHARSET).
    //   $nohce:   If true, the numeric character references are kept
    //             as they are.  If false (the default), they are passed
    //             through text_hnc2hce() first.
    // Return: The octets in the destination character set, or the
    //         text itself when the destination is UTF-8

    // Default destination character set to the current character set
    if (is_null($charset)) {
        $charset = getlang(LN_CHARSET);
    }
    // Destination is UTF-8 -- Conversion is not needed
    if ($charset == "UTF-8") {
        return $html;
    }

    // Load the conversion map of the destination character set
    global $CNVTMAP;
    require_once "monica/cnvtmap/$charset.inc.php";

    // Lift the execution time limit during the conversion, and
    // remember the original one in order to put it back afterwards
    $saved_limit = ini_get("max_execution_time");
    ini_set("max_execution_time", 0);

    // Turn the characters covered by the map into numeric character
    // references, optionally turn those into character entity
    // references, and convert the result at last
    $encoded = mb_encode_numericentity($html, $CNVTMAP[$charset], "UTF-8");
    if (!$nohce) {
        $encoded = text_hnc2hce($encoded);
    }
    $r = iconv("UTF-8", $charset, $encoded);

    // Restore the timeout
    ini_set("max_execution_time", $saved_limit);
    return $r;
}
|
|
|
|
// h_decode: Decode octets and HTML character references to UTF-8 text
|
|
// The output is in UTF-8
|
|
function h_decode($html, $charset = null)
{
    // Parameters:
    //   $html:    The octets to decode
    //   $charset: The character set of the octets.  Defaults to
    //             getlang(LN_CHARSET).
    // Return: The text in UTF-8 with its HTML character references
    //         decoded by a_hcref2char(), or null if the octets cannot
    //         be converted from the character set

    // Default to the current character set
    if (is_null($charset)) {
        $charset = getlang(LN_CHARSET);
    }
    // Whether source is in UTF-8 or not does not matter.
    // We still have to check if it is really in UTF-8, and decode
    // the HTML character references.

    // Try to decode with the specified encoding.  The notices go to
    // null_error_handler() (defined elsewhere; presumably it records
    // the message in $GLOBALS["php_errormsg"] -- confirm in
    // errhndl.inc.php).
    $GLOBALS["php_errormsg"] = null;
    set_error_handler("null_error_handler");
    $utf8 = iconv($charset, "UTF-8", $html);
    restore_error_handler();

    // Wrong encoding -- iconv() returns false on failure.  Check it as
    // well as the recorded message, so that a failed conversion never
    // reaches a_hcref2char() as false and comes back as an empty text.
    if ($utf8 === false || !is_null($GLOBALS["php_errormsg"])) {
        return null;
    }

    // Decode the HTML character references
    return a_hcref2char($utf8);
}
|
|
|
|
// page_encode: h_encode() an HTML page
|
|
function page_encode($html, $charset = null, $nohce = false)
{
    // Parameters:
    //   $html:    The HTML page, in UTF-8
    //   $charset: The destination character set.  Defaults to
    //             getlang(LN_CHARSET).
    //   $nohce:   Passed to h_encode() as it is
    // Return: The encoded page with every "<!--monica:charset-->"
    //         placeholder replaced with h($charset), or null if
    //         h_encode() returned null

    // Default to the current character set.  It is resolved here
    // because the placeholder needs the actual name.
    $charset = is_null($charset)? getlang(LN_CHARSET): $charset;

    $encoded = h_encode($html, $charset, $nohce);
    if (is_null($encoded)) {
        return null;
    }
    return str_replace("<!--monica:charset-->", h($charset), $encoded);
}
|
|
|
|
// a_hcref2char: Decode HTML character entity references in a piece of text
|
|
// to its corresponding characters, in UTF-8
|
|
// The input and output should both be in UTF-8
|
|
// It preserves encoded US-ASCII characters. US-ASCII characters do
|
|
// not need to be encoded. There must be some reason to encode them.
|
|
// (like &#64;/@, &lt;/<, &gt;/>, etc.)
|
|
function a_hcref2char($a)
{
    // Parameters:
    //   $a: The text, in UTF-8
    // Return: The text with its decimal and hexadecimal numeric
    //         character references and its character entity references
    //         decoded to UTF-8.  References to US-ASCII printable
    //         characters are kept encoded.
    //
    // Named callbacks are used, like text_hnc2hce() does, instead of
    // create_function().  create_function() defines a new function on
    // every single call, and no longer exists as of PHP 8.

    // Numeric character references (decimal)
    $a = preg_replace_callback("/&#(\d{1,10});/",
        "_unicode_decref2char", $a);

    // Numeric character references (hexadecimal)
    $a = preg_replace_callback("/&#x([0-9a-f]{1,8});/i",
        "_unicode_hexref2char", $a);

    // Character entity references
    $a = preg_replace_callback("/" . UNICODE_HTML_CHAR_ENT_REF . "/",
        "_unicode_entref2char", $a);

    return $a;
}

// _unicode_decref2char: Callback that decodes one decimal numeric
//                       character reference match
function _unicode_decref2char($m)
{
    // "+ 0" gives a number even when the digits overflow an integer
    return _unicode_codepoint2char($m[1] + 0, $m[0]);
}

// _unicode_hexref2char: Callback that decodes one hexadecimal numeric
//                       character reference match
function _unicode_hexref2char($m)
{
    return _unicode_codepoint2char(hexdec($m[1]), $m[0]);
}

// _unicode_entref2char: Callback that decodes one character entity
//                       reference match
function _unicode_entref2char($m)
{
    $c = hceref2char($m[0]);
    // More than one character -- return what hceref2char() gave
    if (mb_strlen($c) > 1) {
        return $c;
    }
    // Keep the encoded US-ASCII printable characters encoded
    return !is_usascii_printable($c)? $c: $m[0];
}

// _unicode_codepoint2char: Convert a code point to its UTF-8 character,
//                          or return the original reference when it
//                          should stay as it is
function _unicode_codepoint2char($codepoint, $ref)
{
    // Numbers beyond U+10FFFF and the surrogates U+D800-U+DFFF are not
    // characters.  Leave such references untouched, rather than let
    // iconv() fail on them and have the reference silently removed
    // from the text.
    if ($codepoint > 0x10FFFF
        || ($codepoint >= 0xD800 && $codepoint <= 0xDFFF)) {
        return $ref;
    }
    // "V" packs the code point as 32-bit little-endian, that is UTF-32LE
    $c = iconv("UTF-32LE", "UTF-8", pack("V", (int) $codepoint));
    if ($c === false) {
        return $ref;
    }
    // Keep the encoded US-ASCII printable characters encoded
    return !is_usascii_printable($c)? $c: $ref;
}
|
|
|
|
// hceref2char: Decode an HTML character entity reference
|
|
// to its corresponding character, in UTF-8
|
|
// The output is in UTF-8
|
|
function hceref2char($hceref)
{
    // Parameters:
    //   $hceref: The HTML character entity reference, which is used
    //            as it is as the key in the database
    // Return: The value stored in the database under that key, or
    //         the reference itself if the key is not there or the
    //         database cannot be opened

    // Cache the result
    static $cache = array();
    // The database handle, opened once on the first lookup
    static $ENT2U8;

    // Return the cache
    if (array_key_exists($hceref, $cache)) {
        return $cache[$hceref];
    }

    // Open the character entity reference mapping database
    if (!isset($ENT2U8)) {
        $ENT2U8 = dba_open(_UNICODE_ENT2U8_DB, "r", "gdbm");
    }
    // The database could not be opened.  Return the reference
    // untouched, instead of passing an invalid handle to the dba
    // functions.  The result is not cached in this case.
    if ($ENT2U8 === false) {
        return $hceref;
    }

    // Look for it -- dba_fetch() returns false for a missing key, so
    // a separate dba_exists() lookup is not needed
    $found = dba_fetch($hceref, $ENT2U8);
    $cache[$hceref] = ($found === false)? $hceref: $found;

    return $cache[$hceref];
}
|
|
|
|
// text_hnc2hce: Convert HTML numeric character references
|
|
// to HTML character entity references
|
|
// in a piece of text
|
|
function text_hnc2hce($text)
{
    // Parameters:
    //   $text: The text to convert
    // Return: The text with every decimal or hexadecimal numeric
    //         character reference replaced with whatever
    //         _unicode_char_hnc2hce() returns for it

    // Matches both the decimal form and the hexadecimal form,
    // case-insensitively
    $pattern = "/&#(?:\d{1,10}|x[0-9a-f]{1,8});/i";
    $converted = preg_replace_callback($pattern, "_unicode_char_hnc2hce", $text);
    return $converted;
}
|
|
|
|
// _unicode_char_hnc2hce: Convert a HTML numeric character reference
|
|
// to a HTML character entity reference
|
|
function _unicode_char_hnc2hce($m)
{
    // Parameters:
    //   $m: The match array from preg_replace_callback().  $m[0] is
    //       the whole numeric character reference, which is used as
    //       it is as the key in the database.
    // Return: The value stored in the database under that key, or
    //         the reference itself if the key is not there or the
    //         database cannot be opened

    // The database handle, opened once on the first lookup
    static $HNC2HCE;
    // Open the character entity reference mapping database
    if (!isset($HNC2HCE)) {
        $HNC2HCE = dba_open(_UNICODE_HNC2HCE_DB, "r", "gdbm");
    }
    // The database could not be opened.  Return the reference
    // untouched, instead of passing an invalid handle to the dba
    // functions.
    if ($HNC2HCE === false) {
        return $m[0];
    }

    // dba_fetch() returns false for a missing key, so a separate
    // dba_exists() lookup is not needed
    $found = dba_fetch($m[0], $HNC2HCE);
    // Not found -- return untouched
    if ($found === false) {
        return $m[0];
    }
    // Found
    return $found;
}
|
|
|
|
?>
|