selima-perl/lib/php/monica/unicode.inc.php

<?php
// File name:	unicode.inc.php
// Description:	PHP subroutine to handle unicode related functions
// Date:	2002-04-17
// Author:	imacat <imacat@pristine.com.tw>
// Copyright:	Copyright (C) 2002-2008 Pristine Communications

// Set the include path
if (!defined("INCPATH_SET")) {
    require_once dirname(__FILE__) . "/incpath.inc.php";
}
// Referenced subroutines
require_once "monica/errhndl.inc.php";
require_once "monica/getlang.inc.php";
require_once "monica/htmlchar.inc.php";
require_once "monica/lninfo.inc.php";

// Settings
// DBMDIR: The directory where the mapping databases below are looked
//   for.  Unless it is defined already by the time this file is
//   loaded, it defaults to the result of applying dirname() three times
//   to the path of this file, followed by a subdirectory named after
//   the machine type reported by php_uname("m").
if (!defined("DBMDIR")) {
    define("DBMDIR", dirname(dirname(dirname(__FILE__))) . "/" . php_uname("m"));
}
// UNICODE_HTML_CHAR_ENT_REF: A regular expression fragment, without
//   delimiters, for one HTML character entity reference: an ampersand,
//   2 to 8 ASCII letters, up to 2 digits, and a semicolon.
//   a_hcref2char() wraps it in "/" delimiters before using it.
define("UNICODE_HTML_CHAR_ENT_REF", "&[A-Za-z]{2,8}\\d{0,2};");
// Paths of the mapping databases under DBMDIR.
//   _UNICODE_ENT2U8_DB is opened by hceref2char(), and
//   _UNICODE_HNC2HCE_DB is opened by _unicode_char_hnc2hce().
//   _UNICODE_U82ENT_DB is not referenced anywhere else in this file.
define("_UNICODE_ENT2U8_DB", DBMDIR . "/ent2u8.db");
define("_UNICODE_U82ENT_DB", DBMDIR . "/u82ent.db");
define("_UNICODE_HNC2HCE_DB", DBMDIR . "/hnc2hce.db");
// UNICODE_NO_HCEREF: Not referenced anywhere else in this file.
//   NOTE(review): presumably a readable value for the $nohce argument
//   of h_encode() and page_encode() -- confirm against the callers.
define("UNICODE_NO_HCEREF", true);

// Character set conversion maps, indexed by the name of the map.
//   Each map is passed as the conversion map argument of
//   mb_encode_numericentity().
//   NOTE(review): The maps are said to be obtained with
//   mb_encode_numericentity_cnvtmap(), but no such subroutine is
//   defined in this file -- confirm where it lives.
$CNVTMAP = array();
// Character set conversion map definitions are in separate files
//   and are loaded only when they are used, to reduce the size of data
//   to be loaded into the memory.  Look into the cnvtmap/ subdirectory
//   for these maps.

// in_charset: If a piece of text is in a certain character set
//   $text is handed to iconv() as UTF-8 text, with $charset as the
//     destination character set.
//   Returns true if $GLOBALS["php_errormsg"] is still null after the
//     trial conversion, or false otherwise.
//   NOTE(review): null_error_handler() is not defined in this file.
//     Presumably it records the error message in
//     $GLOBALS["php_errormsg"] -- confirm in monica/errhndl.inc.php.
function in_charset($text, $charset)
{
    // Clear the error slot first, so that anything found in it
    //   afterwards comes from the trial conversion below
    $GLOBALS["php_errormsg"] = null;

    // Try the conversion under our own error handler.  Only the error
    //   state matters.  The converted text is thrown away.
    set_error_handler("null_error_handler");
    iconv("UTF-8", $charset, $text);
    restore_error_handler();

    if (is_null($GLOBALS["php_errormsg"])) {
        return true;
    }
    return false;
}

// is_usascii: If a piece of text is US-ASCII
//   Returns false if the text contains any octet from 0x80 to 0xFF,
//     or true otherwise.  Empty text counts as US-ASCII.
function is_usascii($text)
{
    // Look for an octet with the highest bit set
    $has_high_octet = preg_match("/[\\x80-\\xFF]/", $text);
    if ($has_high_octet) {
        return false;
    }
    return true;
}

// is_usascii_printable: If a piece of text is US-ASCII printable
//   $a is either a piece of text or an array.  An array is checked
//     recursively, and both its keys and its values must be printable.
//   Returns true if the text consists of one or more octets from 0x20
//     (space) to 0x7E (tilde) and nothing else, or false otherwise.
//     Empty text is not printable.  An empty array is printable.
function is_usascii_printable($a)
{
    // An array
    if (is_array($a)) {
        foreach ($a as $k => $v) {
            if (!is_usascii_printable($k) || !is_usascii_printable($v)) {
                return false;
            }
        }
        return true;
    }

    // A piece of text
    // Anchor with \A and \z instead of ^ and $.  A plain "$" also
    //   matches right before a line feed at the very end of the text,
    //   which would let text like "abc\n" pass as printable.
    return preg_match('/\A[\x20-\x7E]+\z/', $a) === 1;
}

// is_usascii_printable_text: If a piece of multi-line text is US-ASCII printable
//   $a is either a piece of text or an array.  An array is checked
//     recursively, and both its keys and its values must pass.
//   Text passes if it is not empty and consists only of octets from
//     0x20 to 0x7E and of what the regular expression engine counts as
//     white space (\s), so that line breaks and tabs are allowed.
function is_usascii_printable_text($a)
{
    // A piece of text
    if (!is_array($a)) {
        return preg_match("/^[\\x20-\\x7E\s]+$/", $a) === 1;
    }

    // An array -- the first key or value that fails settles it
    foreach ($a as $key => $value) {
        if (!(is_usascii_printable_text($key) && is_usascii_printable_text($value))) {
            return false;
        }
    }
    return true;
}

// is_valid_unicode: If a piece of text is valid in unicode
//   The text is run through mb_encode_numericentity() as UTF-8 with the
//     "invalid" conversion map.  It is valid if it comes out the same.
//   NOTE(review): The "invalid" map is defined in
//     monica/cnvtmap/invalid.inc.php, which is not visible here.
//     Presumably it covers the invalid code points -- confirm there.
function is_valid_unicode($text)
{
    // $CNVTMAP must be declared global before the map file is loaded
    global $CNVTMAP;
    require_once "monica/cnvtmap/invalid.inc.php";

    // Encode whatever the "invalid" map covers
    $encoded = mb_encode_numericentity($text, $CNVTMAP["invalid"], "UTF-8");

    // Nothing was encoded -- nothing in the text is covered by the map
    return $text == $encoded;
}


/////////////////////////
// Traditional-Simplified Chinese conversion
/////////////////////////
// trad_to_simp: Convert Traditional Chinese to Simplified Chinese
//   The text goes through h_encode() with Big5, iconv() from Big5 to
//     GB2312, and h_decode() with GB2312.  The result is whatever
//     h_decode() returns.
function trad_to_simp($a)
{
    // $CNVTMAP must be declared global before the map file is loaded
    global $CNVTMAP;
    require_once "monica/cnvtmap/trad_to_simp.inc.php";

    // Step 1: Encode Big5 characters that are not available in GB2312
    //   as numeric character references
    $escaped = mb_encode_numericentity($a, $CNVTMAP["trad_to_simp"], "UTF-8");

    // Step 2: Turn the text into Big5 octets
    $big5 = h_encode($escaped, "Big5");

    // Step 3: Convert the octets from Big5 to GB2312
    $gb2312 = iconv("Big5", "GB2312", $big5);

    // Step 4: Decode the GB2312 octets and the character references
    return h_decode($gb2312, "GB2312");
}

// simp_to_trad: Convert Simplified Chinese to Traditional Chinese
//   The text goes through h_encode() with GB2312, iconv() from GB2312
//     to Big5, and h_decode() with Big5.  The result is whatever
//     h_decode() returns.
function simp_to_trad($a)
{
    // $CNVTMAP must be declared global before the map file is loaded
    global $CNVTMAP;
    require_once "monica/cnvtmap/simp_to_trad.inc.php";

    // Step 1: Encode GB2312 characters that are not available in Big5
    //   as numeric character references
    $escaped = mb_encode_numericentity($a, $CNVTMAP["simp_to_trad"], "UTF-8");

    // Step 2: Turn the text into GB2312 octets
    $gb2312 = h_encode($escaped, "GB2312");

    // Step 3: Convert the octets from GB2312 to Big5
    $big5 = iconv("GB2312", "Big5", $gb2312);

    // Step 4: Decode the Big5 octets and the character references
    return h_decode($big5, "Big5");
}


/////////////////////////
// Dealing with encodings and HTML character references
// Refer to HTML 4.01 specification
/////////////////////////
// h_encode: Encode UTF-8 text to octets with HTML character entity references
//   $html is the UTF-8 text to be encoded.
//   $charset is the destination character set.  It defaults to
//     getlang(LN_CHARSET).
//   $nohce, when true, skips text_hnc2hce(), so that the numeric
//     character references are left as they are.
//   Returns $html untouched when the destination is "UTF-8", or
//     the result of iconv() otherwise.
function h_encode($html, $charset = null, $nohce = false)
{
    // Default destination character set to the current character set
    if (is_null($charset)) {
        $charset = getlang(LN_CHARSET);
    }
    // Destination is UTF-8 -- Conversion is not needed
    if ($charset == "UTF-8") {
        return $html;
    }

    // Load the conversion map of the destination character set
    global $CNVTMAP;
    require_once "monica/cnvtmap/$charset.inc.php";

    // Lift the execution time limit during the conversion, and remember
    //   the original limit
    $saved_timeout = ini_get("max_execution_time");
    ini_set("max_execution_time", 0);

    // Encode what the conversion map covers as numeric character references
    $encoded = mb_encode_numericentity($html, $CNVTMAP[$charset], "UTF-8");
    // Turn them into character entity references, unless told not to
    if (!$nohce) {
        $encoded = text_hnc2hce($encoded);
    }
    // Convert to the octets of the destination character set
    $octets = iconv("UTF-8", $charset, $encoded);

    // Put the original execution time limit back
    ini_set("max_execution_time", $saved_timeout);
    return $octets;
}

// h_decode: Decode octets and HTML character references to UTF-8 text
//   $html is the octets to be decoded.
//   $charset is the character set the octets are in.  It defaults to
//     getlang(LN_CHARSET).
//   Returns the decoded text in UTF-8, or null if
//     $GLOBALS["php_errormsg"] is set after the conversion.
//   NOTE(review): null_error_handler() is not defined in this file.
//     Presumably it records the error message in
//     $GLOBALS["php_errormsg"] -- confirm in monica/errhndl.inc.php.
function h_decode($html, $charset = null)
{
    // Default to the current character set
    if (is_null($charset)) {
        $charset = getlang(LN_CHARSET);
    }

    // UTF-8 sources are not skipped.  They still have to be checked
    //   whether they are really in UTF-8, and their HTML character
    //   references still have to be decoded.

    // Try to decode with the specified encoding, with the error slot
    //   cleared first and our own error handler in place
    $GLOBALS["php_errormsg"] = null;
    set_error_handler("null_error_handler");
    $utf8 = iconv($charset, "UTF-8", $html);
    restore_error_handler();

    // Decoded cleanly -- go on with the HTML character references
    if (is_null($GLOBALS["php_errormsg"])) {
        return a_hcref2char($utf8);
    }
    // Wrong encoding
    return null;
}

// page_encode: h_encode() an HTML page
//   The arguments are the same as those of h_encode().
//   After encoding, every "<!--monica:charset-->" placeholder in the
//     page is replaced with h($charset).
//   Returns the encoded page, or null if h_encode() returned null.
//   NOTE(review): h_encode() in this file returns the result of
//     iconv(), which is false rather than null on failure -- confirm
//     whether the null check below is meant to catch that, too.
function page_encode($html, $charset = null, $nohce = false)
{
    // Default to the current character set
    if (is_null($charset)) {
        $charset = getlang(LN_CHARSET);
    }

    $encoded = h_encode($html, $charset, $nohce);
    if (!is_null($encoded)) {
        // Fill in the name of the character set
        return str_replace("<!--monica:charset-->", h($charset), $encoded);
    }
    return null;
}

// a_hcref2char: Decode HTML character entity references in a piece of text
//               to its corresponding characters, in UTF-8
//   The input and output should both be in UTF-8
//   It preserves encoded US-ASCII characters.  US-ASCII characters do
//     not need to be encoded.  There must be some reason to encode them.
//     (like &#64;/@, &lt;/<, &gt;/>, etc.)
//   Numeric character references that do not stand for a character
//     (above U+10FFFF, or a UTF-16 surrogate) are left untouched.
//   The callbacks are named subroutines instead of create_function()
//     lambdas.  create_function() is deprecated as of PHP 7.2 and
//     removed in PHP 8.0, and it creates a new lambda on each call
//     that is never released.
function a_hcref2char($a)
{
    // Numeric character references (decimal)
    $a = preg_replace_callback("/&#(\d{1,10});/",
        "_unicode_dec_hnc2char", $a);

    // Numeric character references (hexadecimal)
    $a = preg_replace_callback("/&#x([0-9a-f]{1,8});/i",
        "_unicode_hex_hnc2char", $a);

    // Character entity references
    $a = preg_replace_callback("/" . UNICODE_HTML_CHAR_ENT_REF . "/",
        "_unicode_hce2char", $a);

    return $a;
}

// _unicode_dec_hnc2char: Decode a matched decimal numeric character reference
//   $m is the match, where $m[0] is the whole reference, and $m[1] is
//     its decimal digits.
function _unicode_dec_hnc2char($m)
{
    // Up to 10 digits may exceed the integer range.  The (int) cast
    //   stays far above U+10FFFF in that case, so it is still rejected.
    return _unicode_codepoint2char((int) $m[1], $m[0]);
}

// _unicode_hex_hnc2char: Decode a matched hexadecimal numeric character reference
//   $m is the match, where $m[0] is the whole reference, and $m[1] is
//     its hexadecimal digits.
function _unicode_hex_hnc2char($m)
{
    return _unicode_codepoint2char(hexdec($m[1]), $m[0]);
}

// _unicode_hce2char: Decode a matched character entity reference
//   $m is the match, where $m[0] is the whole reference.
function _unicode_hce2char($m)
{
    $c = hceref2char($m[0]);
    // Anything longer than one character is returned as is, which
    //   includes unknown references that hceref2char() returned
    //   untouched.  A single US-ASCII printable character stays encoded.
    return (mb_strlen($c) > 1? $c: (!is_usascii_printable($c)? $c: $m[0]));
}

// _unicode_codepoint2char: Convert a code point to its character, in UTF-8
//   $codepoint is the code point number.
//   $ref is the original reference, which is returned instead when the
//     code point is not a character, when the conversion fails,
//     or when the character is US-ASCII printable.
function _unicode_codepoint2char($codepoint, $ref)
{
    // Outside of the Unicode code space, or a UTF-16 surrogate.
    //   Without this check pack("V") keeps only the lowest 32 bits,
    //   so that an overlarge number may turn into a different
    //   character, and iconv() fails on the rest.
    if ($codepoint > 0x10FFFF
            || ($codepoint >= 0xD800 && $codepoint <= 0xDFFF)) {
        return $ref;
    }
    $c = iconv("UTF-32LE", "UTF-8", pack("V", $codepoint));
    // Failed to convert -- return untouched
    if ($c === false) {
        return $ref;
    }
    return !is_usascii_printable($c)? $c: $ref;
}

// hceref2char: Decode an HTML character entity reference
//              to its corresponding character, in UTF-8
//   $hceref is the whole reference, used as the key to the database
//     as it is.
//   Returns what the database holds for the reference, or the
//     reference itself if it is not in the database.
//   The database at _UNICODE_ENT2U8_DB is opened once and kept open.
//     The results are cached, too.
function hceref2char($hceref)
{
    // The results so far, and the database handle
    static $resolved = array();
    static $dbh;

    // Not in the cache yet -- look it up
    if (!array_key_exists($hceref, $resolved)) {
        // Open the character entity reference mapping database
        if (!isset($dbh)) {
            $dbh = dba_open(_UNICODE_ENT2U8_DB, "r", "gdbm");
        }
        if (dba_exists($hceref, $dbh)) {
            $resolved[$hceref] = dba_fetch($hceref, $dbh);
        } else {
            // Not found -- keep the reference untouched
            $resolved[$hceref] = $hceref;
        }
    }

    return $resolved[$hceref];
}

// text_hnc2hce: Convert HTML numeric character references
//               to HTML character entity references
//               in a piece of text
//   Each decimal or hexadecimal numeric character reference in the
//     text is handed to _unicode_char_hnc2hce(), and is replaced with
//     what that returns.
function text_hnc2hce($text)
{
    // Either up to 10 decimal digits, or "x" and up to 8 hexadecimal
    //   digits, matched case-insensitively
    $pattern = "/&#(?:\d{1,10}|x[0-9a-f]{1,8});/i";
    $converted = preg_replace_callback($pattern, "_unicode_char_hnc2hce", $text);
    return $converted;
}

// _unicode_char_hnc2hce: Convert an HTML numeric character reference
//                        to an HTML character entity reference
//   $m is a match from preg_replace_callback(), where $m[0] is the
//     whole numeric character reference.  It is used as the key to the
//     database exactly as it is written in the text.
//   Returns what the database holds for the reference, or the
//     reference itself if it is not in the database.
//   The database at _UNICODE_HNC2HCE_DB is opened once and kept open.
function _unicode_char_hnc2hce($m)
{
    static $dbh;
    // Open the character entity reference mapping database
    if (!isset($dbh)) {
        $dbh = dba_open(_UNICODE_HNC2HCE_DB, "r", "gdbm");
    }

    $ref = $m[0];
    // Found -- use the mapping; not found -- return untouched
    return dba_exists($ref, $dbh)? dba_fetch($ref, $dbh): $ref;
}

?>