Files
selima-perl/lib/php/monica/unicode.inc.php
2026-03-10 21:31:43 +08:00

287 lines
8.9 KiB
PHP

<?php
// File name: unicode.inc.php
// Description: PHP subroutine to handle unicode related functions
// Date: 2002-04-17
// Author: imacat <imacat@pristine.com.tw>
// Copyright: Copyright (C) 2002-2008 Pristine Communications
// Set the include path
if (!defined("INCPATH_SET")) {
require_once dirname(__FILE__) . "/incpath.inc.php";
}
// Referenced subroutines
require_once "monica/errhndl.inc.php";
require_once "monica/getlang.inc.php";
require_once "monica/htmlchar.inc.php";
require_once "monica/lninfo.inc.php";
// Settings
// DBMDIR: directory holding the DBM mapping databases.  Unless the
// includer has already defined it, it defaults to the directory three
// levels above this file, followed by the machine type reported by
// php_uname("m").
if (!defined("DBMDIR")) {
define("DBMDIR", dirname(dirname(dirname(__FILE__))) . "/" . php_uname("m"));
}
// Regular expression fragment (without delimiters) that matches one HTML
// character entity reference: "&", 2 to 8 letters, 0 to 2 digits, ";".
define("UNICODE_HTML_CHAR_ENT_REF", "&[A-Za-z]{2,8}\\d{0,2};");
// Paths of the mapping databases under DBMDIR.  In this file,
// _UNICODE_ENT2U8_DB is opened by hceref2char() and _UNICODE_HNC2HCE_DB
// by _unicode_char_hnc2hce(); _UNICODE_U82ENT_DB is not referenced here.
define("_UNICODE_ENT2U8_DB", DBMDIR . "/ent2u8.db");
define("_UNICODE_U82ENT_DB", DBMDIR . "/u82ent.db");
define("_UNICODE_HNC2HCE_DB", DBMDIR . "/hnc2hce.db");
// NOTE(review): not referenced in this file; presumably a readable value
// for the $nohce argument of h_encode()/page_encode() -- confirm.
define("UNICODE_NO_HCEREF", true);
// Character set conversion maps -
// Obtain them with the above mb_encode_numericentity_cnvtmap().
// NOTE(review): no function of that name is defined in the visible part
// of this file; the remark above may be stale -- confirm.
$CNVTMAP = array();
// Character set conversion map definitions are in separate files
// and are loaded only when they are used, to reduce the size of data
// to be loaded into the memory. Look into the cnvtmap/ subdirectory
// for these maps.
// in_charset: Check whether a piece of text can be converted to a
//             certain character set
//   $text:    the text to check; it is handed to iconv() as UTF-8
//   $charset: the name of the target character set
//   return:   true if no error message was recorded during the trial
//             conversion, false otherwise
function in_charset($text, $charset)
{
    // Clear the error slot, then run a throw-away conversion with the
    // error handler swapped out.
    // NOTE(review): "null_error_handler" is defined elsewhere; this code
    // relies on it leaving a message in $GLOBALS["php_errormsg"] when
    // iconv() complains -- confirm.
    $GLOBALS["php_errormsg"] = null;
    set_error_handler("null_error_handler");
    iconv("UTF-8", $charset, $text);
    restore_error_handler();

    $recorded = $GLOBALS["php_errormsg"];
    return $recorded === null;
}
// is_usascii: Tell whether a piece of text consists of US-ASCII octets only
//   $text:  the text to check
//   return: false if any octet in the range 0x80-0xFF is present,
//           true otherwise (an empty string counts as US-ASCII)
function is_usascii($text)
{
    $has_high_octet = preg_match("/[\\x80-\\xFF]/", $text);
    if ($has_high_octet) {
        return false;
    }
    return true;
}
// is_usascii_printable: Tell whether a piece of text is US-ASCII printable
//   $a:     a piece of text, or an array (possibly nested) of texts
//   return: for a text, true only if it is non-empty and every octet is
//           in the printable range 0x20-0x7E; for an array, true only if
//           every key and every value passes the same check (an empty
//           array passes)
function is_usascii_printable($a)
{
    // A piece of text
    if (!is_array($a)) {
        // The D modifier anchors "$" at the very end of the subject.
        // Without it "$" also matches just before a final newline, so
        // text like "abc\n" would wrongly be reported as printable.
        return preg_match("/^[\\x20-\\x7E]+$/D", $a) === 1;
    }
    // An array: check the keys as well as the values, recursively
    foreach ($a as $key => $value) {
        if (!is_usascii_printable($key) || !is_usascii_printable($value)) {
            return false;
        }
    }
    return true;
}
// is_usascii_printable_text: Tell whether a piece of multi-line text is
//                            US-ASCII printable
//   $a:     a piece of text, or an array (possibly nested) of texts
//   return: for a text, true only if it is non-empty and made up solely
//           of octets 0x20-0x7E and white space; for an array, true only
//           if every key and every value passes the same check
function is_usascii_printable_text($a)
{
    // A piece of text
    if (!is_array($a)) {
        $matched = preg_match("/^[\\x20-\\x7E\s]+$/", $a);
        return $matched ? true : false;
    }
    // An array: both the keys and the values are checked, recursively
    foreach ($a as $key => $value) {
        if (!(is_usascii_printable_text($key) && is_usascii_printable_text($value))) {
            return false;
        }
    }
    return true;
}
// is_valid_unicode: Tell whether a piece of text is valid in Unicode
//   $text:  the text to check; it is processed as UTF-8
//   return: true if encoding the characters selected by the "invalid"
//           conversion map leaves the text unchanged, false otherwise
function is_valid_unicode($text)
{
    // $CNVTMAP is declared global before the map file is pulled in;
    // presumably that file fills in $CNVTMAP["invalid"], which is read
    // right below.
    global $CNVTMAP;
    require_once "monica/cnvtmap/invalid.inc.php";

    // Any character covered by the map turns into a numeric character
    // reference, which makes the result differ from the input.
    $encoded = mb_encode_numericentity($text, $CNVTMAP["invalid"], "UTF-8");
    return $text == $encoded;
}
/////////////////////////
// Traditional-Simplified Chinese conversion
/////////////////////////
// trad_to_simp: Convert Traditional Chinese to Simplified Chinese
//   $a:     the text to convert; it is processed as UTF-8
//   return: whatever h_decode() yields for the converted GB2312 octets
function trad_to_simp($a)
{
    // $CNVTMAP is declared global before the map file is pulled in;
    // presumably that file fills in $CNVTMAP["trad_to_simp"].
    global $CNVTMAP;
    require_once "monica/cnvtmap/trad_to_simp.inc.php";

    // Encode Big5 characters that are not available in GB2312
    $escaped = mb_encode_numericentity($a, $CNVTMAP["trad_to_simp"], "UTF-8");
    // UTF-8 -> Big5 octets -> GB2312 octets -> UTF-8
    $big5 = h_encode($escaped, "Big5");
    $gb2312 = iconv("Big5", "GB2312", $big5);
    return h_decode($gb2312, "GB2312");
}
// simp_to_trad: Convert Simplified Chinese to Traditional Chinese
//   $a:     the text to convert; it is processed as UTF-8
//   return: whatever h_decode() yields for the converted Big5 octets
function simp_to_trad($a)
{
    // $CNVTMAP is declared global before the map file is pulled in;
    // presumably that file fills in $CNVTMAP["simp_to_trad"].
    global $CNVTMAP;
    require_once "monica/cnvtmap/simp_to_trad.inc.php";

    // Encode GB2312 characters that are not available in Big5
    $escaped = mb_encode_numericentity($a, $CNVTMAP["simp_to_trad"], "UTF-8");
    // UTF-8 -> GB2312 octets -> Big5 octets -> UTF-8
    $gb2312 = h_encode($escaped, "GB2312");
    $big5 = iconv("GB2312", "Big5", $gb2312);
    return h_decode($big5, "Big5");
}
/////////////////////////
// Dealing with encodings and HTML character references
// Refer to HTML 4.01 specification
/////////////////////////
// h_encode: Encode UTF-8 text to octets with HTML character entity references
//   $html:    the UTF-8 text to encode
//   $charset: the destination character set; when null, the value of
//             getlang(LN_CHARSET) is used
//   $nohce:   when true, numeric character references are left as they
//             are instead of being passed through text_hnc2hce()
//   return:   $html itself when the destination is UTF-8; otherwise the
//             result of iconv() into the destination character set
function h_encode($html, $charset = null, $nohce = false)
{
    // Default destination character set to the current character set
    if (is_null($charset)) {
        $charset = getlang(LN_CHARSET);
    }
    // Destination is UTF-8 -- nothing to convert
    if ($charset == "UTF-8") {
        return $html;
    }

    // Load the conversion map for the destination character set.
    // NOTE(review): the included path is built from $charset; confirm
    // that callers never pass an untrusted character set name.
    global $CNVTMAP;
    require_once "monica/cnvtmap/$charset.inc.php";

    // Lift the execution time limit for the conversion, remembering the
    // original setting
    $timeout = ini_get("max_execution_time");
    ini_set("max_execution_time", 0);

    // Turn the characters selected by the map into numeric character
    // references, optionally rewrite those through text_hnc2hce(), then
    // convert the octets
    $encoded = mb_encode_numericentity($html, $CNVTMAP[$charset], "UTF-8");
    if (!$nohce) {
        $encoded = text_hnc2hce($encoded);
    }
    $r = iconv("UTF-8", $charset, $encoded);

    // Put the original time limit back
    ini_set("max_execution_time", $timeout);
    return $r;
}
// h_decode: Decode octets and HTML character references to UTF-8 text
//   $html:    the octets to decode
//   $charset: the character set of $html; when null, the value of
//             getlang(LN_CHARSET) is used
//   return:   the decoded UTF-8 text, or null when an error message was
//             recorded during the character set conversion
function h_decode($html, $charset = null)
{
    // Default to the current character set
    if (is_null($charset)) {
        $charset = getlang(LN_CHARSET);
    }

    // Even a UTF-8 source goes through iconv(): it still has to be
    // verified as real UTF-8, and its HTML character references still
    // have to be decoded.
    // NOTE(review): "null_error_handler" is defined elsewhere; this code
    // relies on it leaving a message in $GLOBALS["php_errormsg"] when
    // iconv() complains -- confirm.
    $GLOBALS["php_errormsg"] = null;
    set_error_handler("null_error_handler");
    $utf8 = iconv($charset, "UTF-8", $html);
    restore_error_handler();

    // Decode the HTML character references only if the encoding was right
    if (is_null($GLOBALS["php_errormsg"])) {
        return a_hcref2char($utf8);
    }
    // Wrong encoding
    return null;
}
// page_encode: h_encode() an HTML page
//   $html:    the UTF-8 page to encode
//   $charset: the destination character set; when null, the value of
//             getlang(LN_CHARSET) is used
//   $nohce:   passed through to h_encode()
//   return:   the encoded page with every "<!--monica:charset-->" marker
//             replaced by h($charset), or null if h_encode() gave null
function page_encode($html, $charset = null, $nohce = false)
{
    // Default to the current character set
    if (is_null($charset)) {
        $charset = getlang(LN_CHARSET);
    }

    $encoded = h_encode($html, $charset, $nohce);
    if (is_null($encoded)) {
        return null;
    }

    // Fill in the character set placeholder.
    // NOTE(review): h() is defined elsewhere, presumably an HTML
    // escaping helper -- confirm.
    return str_replace("<!--monica:charset-->", h($charset), $encoded);
}
// a_hcref2char: Decode HTML character references in a piece of text
//               to their corresponding characters, in UTF-8
//   $a:     the text to decode, in UTF-8
//   return: the decoded text, in UTF-8
// It preserves encoded US-ASCII characters.  US-ASCII characters do
// not need to be encoded.  There must be some reason to encode them.
// (like &#64;/@, &lt;/<, &gt;/>, etc.)
// Numeric references that do not name a Unicode character (above
// U+10FFFF, or in the surrogate range) are left untouched.
function a_hcref2char($a)
{
    // The callbacks are named functions passed by name, the same way
    // text_hnc2hce() does it.  create_function() is not used: it was
    // removed in PHP 8.0 and created a new function on every call.
    // Numeric character references (decimal)
    $a = preg_replace_callback("/&#(\d{1,10});/",
        "_unicode_dec_ncref2char", $a);
    // Numeric character references (hexadecimal)
    $a = preg_replace_callback("/&#x([0-9a-f]{1,8});/i",
        "_unicode_hex_ncref2char", $a);
    // Character entity references
    $a = preg_replace_callback("/" . UNICODE_HTML_CHAR_ENT_REF . "/",
        "_unicode_ent_ref2char", $a);
    return $a;
}
// _unicode_dec_ncref2char: preg_replace_callback() callback that decodes
//                          one decimal numeric character reference
function _unicode_dec_ncref2char($m)
{
    // intval() saturates on overflow, so an over-long number still
    // fails the range check in _unicode_codepoint2char()
    return _unicode_codepoint2char(intval($m[1], 10), $m[0]);
}
// _unicode_hex_ncref2char: preg_replace_callback() callback that decodes
//                          one hexadecimal numeric character reference
function _unicode_hex_ncref2char($m)
{
    return _unicode_codepoint2char(hexdec($m[1]), $m[0]);
}
// _unicode_ent_ref2char: preg_replace_callback() callback that decodes
//                        one character entity reference
function _unicode_ent_ref2char($m)
{
    $c = hceref2char($m[0]);
    // Anything longer than one character is returned as is; that
    // includes an unknown reference, which hceref2char() hands back
    // unchanged
    if (mb_strlen($c) > 1) {
        return $c;
    }
    // A single US-ASCII printable character stays encoded
    return is_usascii_printable($c)? $m[0]: $c;
}
// _unicode_codepoint2char: Turn a code point into its UTF-8 character
//   $code:  the code point number (an integer, or a float from hexdec())
//   $ref:   the original reference text, returned when the code point
//           cannot or should not be decoded
//   return: the UTF-8 character, or $ref
function _unicode_codepoint2char($code, $ref)
{
    // Not a Unicode character: beyond the code space, or a surrogate.
    // iconv() would fail on these and the reference would be lost.
    if ($code > 0x10FFFF || ($code >= 0xD800 && $code <= 0xDFFF)) {
        return $ref;
    }
    $c = iconv("UTF-32LE", "UTF-8", pack("V", (int) $code));
    if ($c === false) {
        return $ref;
    }
    // A US-ASCII printable character stays encoded
    return is_usascii_printable($c)? $ref: $c;
}
// hceref2char: Decode an HTML character entity reference
//              to its corresponding character, in UTF-8
//   $hceref: the character entity reference to look up
//   return:  the value stored for $hceref in the _UNICODE_ENT2U8_DB
//            database, or $hceref itself when there is no such entry
function hceref2char($hceref)
{
    // Results of earlier look-ups, and the database handle; both are
    // kept across calls
    static $cache = array();
    static $ENT2U8;

    if (!array_key_exists($hceref, $cache)) {
        // Open the character entity reference mapping database on the
        // first real look-up
        if (!isset($ENT2U8)) {
            $ENT2U8 = dba_open(_UNICODE_ENT2U8_DB, "r", "gdbm");
        }
        // Look for it, and remember the outcome either way
        if (dba_exists($hceref, $ENT2U8)) {
            $cache[$hceref] = dba_fetch($hceref, $ENT2U8);
        } else {
            $cache[$hceref] = $hceref;
        }
    }
    return $cache[$hceref];
}
// text_hnc2hce: Convert HTML numeric character references
//               to HTML character entity references
//               in a piece of text
//   $text:  the text to process
//   return: the text with every decimal or hexadecimal numeric character
//           reference passed through _unicode_char_hnc2hce()
function text_hnc2hce($text)
{
    $numeric_ref = "/&#(?:\d{1,10}|x[0-9a-f]{1,8});/i";
    $converted = preg_replace_callback($numeric_ref,
        "_unicode_char_hnc2hce", $text);
    return $converted;
}
// _unicode_char_hnc2hce: Convert an HTML numeric character reference
//                        to an HTML character entity reference
//   $m:     the match array from preg_replace_callback(); $m[0] is the
//           whole numeric character reference
//   return: the value stored for $m[0] in the _UNICODE_HNC2HCE_DB
//           database, or $m[0] untouched when there is no such entry
function _unicode_char_hnc2hce($m)
{
    // The database handle is kept across calls
    static $HNC2HCE;
    // Open the character entity reference mapping database
    if (!isset($HNC2HCE)) {
        $HNC2HCE = dba_open(_UNICODE_HNC2HCE_DB, "r", "gdbm");
    }
    $ref = $m[0];
    // Found -- use the mapped value; not found -- return untouched
    return dba_exists($ref, $HNC2HCE)? dba_fetch($ref, $HNC2HCE): $ref;
}
?>