Initial commit.
This commit is contained in:
286
lib/php/monica/unicode.inc.php
Normal file
286
lib/php/monica/unicode.inc.php
Normal file
@@ -0,0 +1,286 @@
|
||||
<?php
|
||||
// File name: unicode.inc.php
|
||||
// Description: PHP subroutine to handle unicode related functions
|
||||
// Date: 2002-04-17
|
||||
// Author: imacat <imacat@pristine.com.tw>
|
||||
// Copyright: Copyright (C) 2002-2008 Pristine Communications
|
||||
|
||||
// Set the include path
|
||||
if (!defined("INCPATH_SET")) {
|
||||
require_once dirname(__FILE__) . "/incpath.inc.php";
|
||||
}
|
||||
// Referenced subroutines
|
||||
require_once "monica/errhndl.inc.php";
|
||||
require_once "monica/getlang.inc.php";
|
||||
require_once "monica/htmlchar.inc.php";
|
||||
require_once "monica/lninfo.inc.php";
|
||||
|
||||
// Settings
// DBMDIR: the directory of the DBM lookup databases.  A value defined
// by the including script wins; otherwise it is the machine-type
// subdirectory (php_uname("m")) two directories above the directory
// that contains this file.
defined("DBMDIR")
    or define("DBMDIR", dirname(dirname(dirname(__FILE__))) . "/" . php_uname("m"));
// Locations of the DBM lookup databases
define("_UNICODE_ENT2U8_DB", DBMDIR . "/ent2u8.db");
define("_UNICODE_U82ENT_DB", DBMDIR . "/u82ent.db");
define("_UNICODE_HNC2HCE_DB", DBMDIR . "/hnc2hce.db");
// A regular expression fragment (without delimiters) that matches
// one HTML character entity reference
define("UNICODE_HTML_CHAR_ENT_REF", "&[A-Za-z]{2,8}\\d{0,2};");
// NOTE(review): presumably meant to be passed as the $nohce argument
// of h_encode() and page_encode() -- confirm against the callers.
define("UNICODE_NO_HCEREF", true);

// Character set conversion maps, keyed by map name.
// NOTE(review): the original comment says they are obtained with
// mb_encode_numericentity_cnvtmap(), which is not defined in this file.
$CNVTMAP = array();
// The character set conversion map definitions are kept in separate
// files and are loaded only when they are used, to reduce the size of
// the data loaded into memory.  Look into the cnvtmap/ subdirectory
// for these maps.
|
||||
|
||||
// in_charset: Check if a piece of text can be converted to a certain
//             character set
//   Input:  $text     The text to check.  It is handed to iconv() as UTF-8.
//           $charset  The name of the target character set.
//   Return: true if no error message was recorded during the
//           conversion, false otherwise.
function in_charset($text, $charset)
{
    // Forget any earlier error, so that only this conversion counts
    $GLOBALS["php_errormsg"] = null;

    // Run a trial conversion.  The converted text itself is discarded.
    // NOTE(review): null_error_handler() is defined elsewhere;
    // presumably it records the message in $GLOBALS["php_errormsg"].
    set_error_handler("null_error_handler");
    iconv("UTF-8", $charset, $text);
    restore_error_handler();

    // The text fits only if the conversion recorded no error
    $failed = !is_null($GLOBALS["php_errormsg"]);
    return !$failed;
}
|
||||
|
||||
// is_usascii: Check if a piece of text is US-ASCII
//   Input:  $text  The text to check.
//   Return: true if the text has no octet in the range 0x80-0xFF,
//           false otherwise.  Empty text counts as US-ASCII.
function is_usascii($text)
{
    // Any octet with the high bit set disqualifies the text
    $has_high_octet = preg_match("/[\\x80-\\xFF]/", $text);
    return !$has_high_octet;
}
|
||||
|
||||
// is_usascii_printable: Check if a piece of text is US-ASCII printable
//   Input:  $a  A piece of text, or an array (possibly nested) of them.
//   Return: For text: true if it is not empty and consists solely of
//           octets in the range 0x20-0x7E, false otherwise.
//           For an array: true if every key and every value passes
//           this check, false otherwise.  An empty array passes.
function is_usascii_printable($a)
{
    // An array
    if (is_array($a)) {
        foreach (array_keys($a) as $k) {
            if (!is_usascii_printable($k) || !is_usascii_printable($a[$k])) {
                return false;
            }
        }
        return true;

    // A piece of text
    } else {
        // The "D" modifier is essential: without it "$" also matches
        // before a trailing newline, so that "abc\n" would be accepted
        // although the newline is not printable.
        return preg_match("/^[\\x20-\\x7E]+$/D", $a)? true: false;
    }
}
|
||||
|
||||
// is_usascii_printable_text: Check if a piece of multi-line text is
//                            US-ASCII printable
//   Input:  $a  A piece of text, or an array (possibly nested) of them.
//   Return: For text: true if it is not empty and consists solely of
//           octets in the range 0x20-0x7E and whitespace characters,
//           false otherwise.
//           For an array: true if every key and every value passes
//           this check, false otherwise.  An empty array passes.
function is_usascii_printable_text($a)
{
    // A piece of text
    if (!is_array($a)) {
        return (bool) preg_match("/^[\\x20-\\x7E\s]+$/", $a);
    }

    // An array -- the first failing key or value settles it
    foreach ($a as $key => $value) {
        if (!(is_usascii_printable_text($key) && is_usascii_printable_text($value))) {
            return false;
        }
    }
    return true;
}
|
||||
|
||||
// is_valid_unicode: Check if a piece of text is valid in Unicode
//   Input:  $text  The text to check, handled as UTF-8.
//   Return: true if encoding the text with the "invalid" conversion
//           map leaves it unchanged (loose comparison), false otherwise.
function is_valid_unicode($text)
{
    // The map file fills in $CNVTMAP["invalid"], so $CNVTMAP must be
    // bound to the global before the file is included.
    // NOTE(review): presumably the map covers the code points regarded
    // as invalid -- confirm in cnvtmap/invalid.inc.php.
    global $CNVTMAP;
    require_once "monica/cnvtmap/invalid.inc.php";

    // Try to encode the invalid characters and see if there is any
    $encoded = mb_encode_numericentity($text, $CNVTMAP["invalid"], "UTF-8");
    return $text == $encoded;
}
|
||||
|
||||
|
||||
/////////////////////////
|
||||
// Traditional-Simplified Chinese conversion
|
||||
/////////////////////////
|
||||
// trad_to_simp: Convert Traditional Chinese to Simplified Chinese
//   Input:  $a  The Traditional Chinese text, handled as UTF-8.
//   Return: Whatever h_decode() returns for the converted text.
function trad_to_simp($a)
{
    // The map file fills in $CNVTMAP["trad_to_simp"], so $CNVTMAP must
    // be bound to the global before the file is included.
    global $CNVTMAP;
    require_once "monica/cnvtmap/trad_to_simp.inc.php";

    // Encode Big5 characters that are not available in GB2312
    $escaped = mb_encode_numericentity($a, $CNVTMAP["trad_to_simp"], "UTF-8");

    // UTF-8 -> Big5 octets -> GB2312 octets -> UTF-8
    $big5 = h_encode($escaped, "Big5");
    $gb2312 = iconv("Big5", "GB2312", $big5);
    return h_decode($gb2312, "GB2312");
}
|
||||
|
||||
// simp_to_trad: Convert Simplified Chinese to Traditional Chinese
//   Input:  $a  The Simplified Chinese text, handled as UTF-8.
//   Return: Whatever h_decode() returns for the converted text.
function simp_to_trad($a)
{
    // The map file fills in $CNVTMAP["simp_to_trad"], so $CNVTMAP must
    // be bound to the global before the file is included.
    global $CNVTMAP;
    require_once "monica/cnvtmap/simp_to_trad.inc.php";

    // Encode GB2312 characters that are not available in Big5
    $escaped = mb_encode_numericentity($a, $CNVTMAP["simp_to_trad"], "UTF-8");

    // UTF-8 -> GB2312 octets -> Big5 octets -> UTF-8
    $gb2312 = h_encode($escaped, "GB2312");
    $big5 = iconv("GB2312", "Big5", $gb2312);
    return h_decode($big5, "Big5");
}
|
||||
|
||||
|
||||
/////////////////////////
|
||||
// Dealing with encodings and HTML character references
|
||||
// Refer to HTML 4.01 specification
|
||||
/////////////////////////
|
||||
// h_encode: Encode UTF-8 text to octets with HTML character entity references
//   Input:  $html     The text to encode, in UTF-8.
//           $charset  The destination character set.  When null, the
//                     result of getlang(LN_CHARSET) is used.
//           $nohce    When true, characters that had to be escaped are
//                     left as numeric character references; otherwise
//                     those are passed through text_hnc2hce().
//   Return: The text untouched when the destination is "UTF-8",
//           otherwise the result of iconv() to the destination.
function h_encode($html, $charset = null, $nohce = false)
{
    // Default destination character set to the current character set
    if (is_null($charset)) {
        $charset = getlang(LN_CHARSET);
    }
    // Destination is UTF-8 -- Conversion is not needed
    if ($charset == "UTF-8") {
        return $html;
    }

    // Load the conversion map.  The map file fills in
    // $CNVTMAP[$charset], so $CNVTMAP must be bound to the global first.
    global $CNVTMAP;
    require_once "monica/cnvtmap/$charset.inc.php";

    // Lift the execution time limit during the conversion,
    // and remember the original one
    $saved_timeout = ini_get("max_execution_time");
    ini_set("max_execution_time", 0);

    // Escape the characters listed in the conversion map
    $escaped = mb_encode_numericentity($html, $CNVTMAP[$charset], "UTF-8");
    // Prefer character entity references, unless told otherwise
    if (!$nohce) {
        $escaped = text_hnc2hce($escaped);
    }
    $octets = iconv("UTF-8", $charset, $escaped);

    // Restore the timeout
    ini_set("max_execution_time", $saved_timeout);
    return $octets;
}
|
||||
|
||||
// h_decode: Decode octets and HTML character references to UTF-8 text
//   Input:  $html     The octets to decode.
//           $charset  The character set of the octets.  When null, the
//                     result of getlang(LN_CHARSET) is used.
//   Return: The decoded text in UTF-8, or null if an error message
//           was recorded while converting from $charset.
function h_decode($html, $charset = null)
{
    // Default to the current character set
    if (is_null($charset)) {
        $charset = getlang(LN_CHARSET);
    }

    // Whether the source is in UTF-8 or not does not matter.
    // We still have to check if it is really in UTF-8, and decode
    // the HTML character references.

    // Try to decode with the specified encoding.
    // NOTE(review): null_error_handler() is defined elsewhere;
    // presumably it records the message in $GLOBALS["php_errormsg"].
    $GLOBALS["php_errormsg"] = null;
    set_error_handler("null_error_handler");
    $decoded = iconv($charset, "UTF-8", $html);
    restore_error_handler();

    // Wrong encoding
    $failed = !is_null($GLOBALS["php_errormsg"]);
    if ($failed) {
        return null;
    }

    // Decode the HTML character references
    return a_hcref2char($decoded);
}
|
||||
|
||||
// page_encode: h_encode() an HTML page
//   Input:  $html     The HTML page, in UTF-8.
//           $charset  The destination character set.  When null, the
//                     result of getlang(LN_CHARSET) is used.
//           $nohce    Passed on to h_encode() as is.
//   Return: The encoded page, with every "<!--monica:charset-->"
//           marker replaced with h($charset), or null if h_encode()
//           returned null.
function page_encode($html, $charset = null, $nohce = false)
{
    // Default to the current character set.  It is resolved here,
    // too, because the name is needed for the marker below.
    if (is_null($charset)) {
        $charset = getlang(LN_CHARSET);
    }

    $encoded = h_encode($html, $charset, $nohce);
    // Fill in the character set name, unless encoding produced nothing
    return is_null($encoded)? null:
        str_replace("<!--monica:charset-->", h($charset), $encoded);
}
|
||||
|
||||
// a_hcref2char: Decode HTML character references in a piece of text
//               to their corresponding characters, in UTF-8
//   The input and output should both be in UTF-8.
//   It preserves encoded US-ASCII characters.  US-ASCII characters do
//   not need to be encoded.  There must be some reason to encode them.
//   (like &#64;/@, &lt;/<, &gt;/>, etc.)
//   Numeric references that are not Unicode characters (above U+10FFFF,
//   or in the surrogate range) are left untouched, too.
//   Input:  $a  The text to decode.
//   Return: The decoded text.
function a_hcref2char($a)
{
    // Named callbacks are used rather than create_function(), which
    // defines a new global function on every call and is removed
    // as of PHP 8.0.

    // Numeric character references (decimal)
    $a = preg_replace_callback("/&#(\d{1,10});/",
        "_unicode_hncref_dec2char", $a);

    // Numeric character references (hexadecimal)
    $a = preg_replace_callback("/&#x([0-9a-f]{1,8});/i",
        "_unicode_hncref_hex2char", $a);

    // Character entity references
    $a = preg_replace_callback("/" . UNICODE_HTML_CHAR_ENT_REF . "/",
        "_unicode_hceref_match2char", $a);

    return $a;
}

// _unicode_hncref_dec2char: Callback of a_hcref2char() to decode
//                           a decimal numeric character reference
function _unicode_hncref_dec2char($m)
{
    // "+ 0" turns the digits into a number, even when they overflow an integer
    return _unicode_codepoint2char($m[1] + 0, $m[0]);
}

// _unicode_hncref_hex2char: Callback of a_hcref2char() to decode
//                           a hexadecimal numeric character reference
function _unicode_hncref_hex2char($m)
{
    return _unicode_codepoint2char(hexdec($m[1]), $m[0]);
}

// _unicode_hceref_match2char: Callback of a_hcref2char() to decode
//                             a character entity reference
function _unicode_hceref_match2char($m)
{
    $c = hceref2char($m[0]);
    // Longer than one character -- take it as is.  This includes an
    // unknown reference, which hceref2char() returns untouched.
    if (mb_strlen($c) > 1) {
        return $c;
    }
    // Keep the reference of a printable US-ASCII character
    return !is_usascii_printable($c)? $c: $m[0];
}

// _unicode_codepoint2char: Turn a code point into its UTF-8 character
//   Input:  $cp   The code point, as a number.
//           $ref  The original reference, which is returned when the
//                 code point is not to be decoded.
//   Return: The UTF-8 character, or $ref.
function _unicode_codepoint2char($cp, $ref)
{
    // Not a Unicode character -- iconv() would fail, and the
    // reference would be lost
    if ($cp > 0x10FFFF || ($cp >= 0xD800 && $cp <= 0xDFFF)) {
        return $ref;
    }
    $c = iconv("UTF-32LE", "UTF-8", pack("V", (int) $cp));
    if ($c === false) {
        return $ref;
    }
    // Keep the reference of a printable US-ASCII character
    return !is_usascii_printable($c)? $c: $ref;
}
|
||||
|
||||
// hceref2char: Decode an HTML character entity reference
//              to its corresponding character
//   Input:  $hceref  The character entity reference, used as the key
//                    to look up in the ent2u8 database.
//   Return: The value stored for the reference (in UTF-8, according
//           to the original description), or the reference itself if
//           the database has no such key.
function hceref2char($hceref)
{
    // The results found so far, and the database handle.  Both live
    // on between calls.
    static $cache = array();
    static $ENT2U8;

    // Not looked up before
    if (!array_key_exists($hceref, $cache)) {
        // Open the character entity reference mapping database
        if (!isset($ENT2U8)) {
            $ENT2U8 = dba_open(_UNICODE_ENT2U8_DB, "r", "gdbm");
        }
        // Look for it, and cache the result
        if (dba_exists($hceref, $ENT2U8)) {
            $cache[$hceref] = dba_fetch($hceref, $ENT2U8);
        } else {
            $cache[$hceref] = $hceref;
        }
    }

    return $cache[$hceref];
}
|
||||
|
||||
// text_hnc2hce: Convert HTML numeric character references
//               to HTML character entity references
//               in a piece of text
//   Input:  $text  The text to convert.
//   Return: The text, with each decimal or hexadecimal numeric
//           character reference replaced with what
//           _unicode_char_hnc2hce() returns for it.
function text_hnc2hce($text)
{
    // Either form of numeric character reference, case-insensitive
    $pattern = "/&#(?:\d{1,10}|x[0-9a-f]{1,8});/i";
    $converted = preg_replace_callback($pattern, "_unicode_char_hnc2hce", $text);
    return $converted;
}
|
||||
|
||||
// _unicode_char_hnc2hce: Convert an HTML numeric character reference
//                        to an HTML character entity reference
//   Input:  $m  The match of preg_replace_callback().  $m[0] is the
//               numeric character reference, used as the key to look
//               up in the hnc2hce database.
//   Return: The value stored for the reference, or the reference
//           itself if the database has no such key.
function _unicode_char_hnc2hce($m)
{
    // The database handle, which lives on between calls
    static $HNC2HCE;
    // Open the character entity reference mapping database
    if (!isset($HNC2HCE)) {
        $HNC2HCE = dba_open(_UNICODE_HNC2HCE_DB, "r", "gdbm");
    }

    $ref = $m[0];
    // Found -- use it.  Not found -- return untouched.
    return dba_exists($ref, $HNC2HCE)? dba_fetch($ref, $HNC2HCE): $ref;
}
|
||||
|
||||
?>
|
||||
Reference in New Issue
Block a user