selima-perl/lib/php/monica/zh2py.inc.php

<?php
// File name:	zh2py.inc.php
// Description:	PHP subroutines to convert Chinese to pinyin
// Date:	2004-05-15
// Author:	imacat <imacat@pristine.com.tw>
// Copyright:	Copyright (C) 2004-2007 Pristine Communications

// We use GDBM rather than SQLite.  SQLite is cross-platform, but it is a lot
// slower than GDBM.  SQLite only beats GDBM in connection open speed.

// Set the include path
// incpath.inc.php is pulled in only when INCPATH_SET is not defined yet
// (presumably that file defines it -- confirm there).
if (!defined("INCPATH_SET")) {
    require_once dirname(__FILE__) . "/incpath.inc.php";
}
// Referenced subroutines
// (addslashes_re_php(), used by pinyin_match_chinese(), is presumably
// provided by this file -- confirm there)
require_once "monica/addslash.inc.php";

// Settings
// DBMDIR may be defined by the includer beforehand.  The default is the
// directory three dirname() levels up from this file's path, followed by
// the machine type reported by php_uname("m").
if (!defined("DBMDIR")) {
    define("DBMDIR", dirname(dirname(dirname(__FILE__))) . "/" . php_uname("m"));
}
// Path of the GDBM file that is opened by _zh2py_split_text()
define("_ZH2PY_DB", DBMDIR . "/zh2py.db");
// Regular expression fragment for the pinyin of one Chinese character:
// 1 to 6 units, each a lowercase ASCII letter or one of the byte pairs
// \xC3\xAA / \xC3\xBC (the UTF-8 encodings of "ê" and "ü"), followed by
// one digit from 1 to 5 (presumably the tone number).  The backslashes are
// doubled so that PCRE, not PHP, interprets the \x escapes.
define("_ZH2PY_CHARPAT", "(?:[a-z]|\\xC3[\\xAA\\xBC]){1,6}[1-5]");
// Shared database handle; stays null until _zh2py_split_text() opens it
$_ZH2PY = null;

// zh2py: Convert Chinese to pinyin
//   $chinese: the text to convert
//   Returns the first candidate found by zh2pys(), or "" for empty text.
function zh2py($chinese)
{
    // Use first result from zh2pys()
    $pinyins = zh2pys($chinese);
    // zh2pys() bounces empty text with "" instead of an array.  Indexing
    // that empty string with [0] raises an "Uninitialized string offset"
    // notice/warning, so answer with the empty string directly.
    if (!is_array($pinyins)) {
        return "";
    }
    return $pinyins[0];
}

// zh2pys: Convert Chinese to pinyin, return all possible pinyins
//   $chinese: the text to convert
//   Returns an array with one converted string for every combination of
//   the readings of the Chinese characters, or "" if the text is empty.
function zh2pys($chinese)
{
    // Empty text has nothing to convert
    if ($chinese == "") {
        return "";
    }

    // Build one printf format per piece of the text, and collect the
    // Chinese characters in their order of appearance
    $chars = array();
    $formats = array();
    foreach (_zh2py_split_text($chinese) as $piece) {
        // A non-Chinese piece is kept literally, so only the printf
        // metacharacter has to be escaped
        if (!$piece["is_chinese"]) {
            $formats[] = str_replace("%", "%%", $piece["text"]);
            continue;
        }
        // A Chinese piece becomes one "%s" slot per character
        $slots = array();
        $length = mb_strlen($piece["text"]);
        for ($k = 0; $k < $length; $k++) {
            $chars[] = mb_substr($piece["text"], $k, 1);
            $slots[] = "%s";
        }
        $formats[] = implode(" ", $slots);
    }

    // Join the formats, separating two neighbours with a space unless
    // there is already whitespace where they meet
    $format = "";
    $previous = null;
    foreach ($formats as $current) {
        if (    !is_null($previous)
                && !(preg_match("/\s$/", $previous)
                    || preg_match("/^\s/", $current))) {
            $format .= " ";
        }
        $format .= $current;
        $previous = $current;
    }

    // Fill the slots with every possible combination of readings
    $pinyins = array();
    foreach (_zh2py_chars2py($chars) as $readings) {
        $pinyins[] = vsprintf($format, $readings);
    }

    return $pinyins;
}

// pinyin_match_chinese: If the pinyin matches the Chinese
//   $chinese: the original text, Chinese and non-Chinese mixed
//   $pinyin:  the pinyin text to check against it
//   Returns true if $pinyin has the shape expected for $chinese, false
//   otherwise.  Only the shape is checked: each Chinese character may be
//   any syllable that fits _ZH2PY_CHARPAT, not just its own readings.
function pinyin_match_chinese($chinese, $pinyin)
{
    // Turn every piece of the text into a regular expression fragment
    $fragments = array();
    foreach (_zh2py_split_text($chinese) as $piece) {
        if ($piece["is_chinese"]) {
            // One syllable pattern per Chinese character
            $syllables = array();
            for ($left = mb_strlen($piece["text"]); $left > 0; $left--) {
                $syllables[] = _ZH2PY_CHARPAT;
            }
            $fragments[] = implode(" ", $syllables);
        } else {
            // Non-Chinese text must appear literally, so escape the
            // regular expression metacharacters
            $fragments[] = addslashes_re_php($piece["text"]);
        }
    }

    // Join the fragments, separating two neighbours with a space unless
    // one of them already has whitespace (literal, or escaped as \r, \n
    // or \t) where they meet
    $pattern = array_shift($fragments);
    $previous = $pattern;
    foreach ($fragments as $fragment) {
        $needs_space = !preg_match("/(?:\s|\\\\[rnt])$/", $previous)
            && !preg_match("/^(?:\s|\\\\[rnt])/", $fragment);
        if ($needs_space) {
            $pattern .= " ";
        }
        $pattern .= $fragment;
        $previous = $fragment;
    }

    return (bool) preg_match("/^$pattern$/", $pinyin);
}

// _zh2py_split_text: Split text into Chinese or non-Chinese pieces
//   $text: the text to split
//   Returns a list of pieces in their original order.  Each piece is an
//   array with "is_chinese" (whether its characters are keys of the
//   pinyin database) and "text" (the run of consecutive characters
//   sharing that status).  Empty text yields a single empty non-Chinese
//   piece, so that callers can always read $pieces[0].
function _zh2py_split_text($text)
{
    global $_ZH2PY;

    $nothing = array(array(
        "is_chinese"    => false,
        "text"          => "",
    ));
    // Bounce the empty text before looking at its first character, which
    // does not exist.  No database lookup is needed for it, either.
    if ((string) $text === "") {
        return $nothing;
    }

    // Start the database
    if (is_null($_ZH2PY)) {
        $_ZH2PY = dba_open(_ZH2PY_DB, "r", "gdbm");
    }

    $pieces = array();
    // Index of the piece being built, -1 before the first character
    $last = -1;
    // The length does not change, so measure it only once
    $length = mb_strlen($text);
    for ($i = 0; $i < $length; $i++) {
        $char = mb_substr($text, $i, 1);
        // Look up each character only once
        $is_chinese = dba_exists($char, $_ZH2PY);
        if ($last < 0 || $is_chinese != $pieces[$last]["is_chinese"]) {
            // First character, or Chinese status changed: start a new piece
            $pieces[] = array(
                "is_chinese"    => $is_chinese,
                "text"          => $char,
            );
            $last++;
        } else {
            // Append to the current piece
            $pieces[$last]["text"] .= $char;
        }
    }

    // No character could be extracted from the text after all
    return ($last < 0)? $nothing: $pieces;
}

// _zh2py_chars2py: Look up a series of Chinese characters
//   and return all possible pinyins
//   $chars: the list of characters to look up
//   Returns a list of combinations.  Each combination is a list holding
//   one reading per character, in the order of $chars.  The readings of
//   the earlier characters vary slowest.  An empty $chars yields one
//   empty combination.  The database must have been opened already.
function _zh2py_chars2py($chars)
{
    global $_ZH2PY;

    // Start from the single empty combination, and extend every
    // combination found so far with each reading of the next character
    $results = array(array());
    foreach ($chars as $char) {
        // The readings of a character are stored separated by "|"
        $readings = explode("|", dba_fetch($char, $_ZH2PY));
        $extended = array();
        foreach ($results as $prefix) {
            foreach ($readings as $reading) {
                $combination = $prefix;
                $combination[] = $reading;
                $extended[] = $combination;
            }
        }
        $results = $extended;
    }

    return $results;
}

// The SQLite version - we are not using it
// SQLite is not as efficient as GDBM in this case.
// See unused.inc.php

?>