// Copyright: Copyright (C) 2004-2007 Pristine Communications

// We use GDBM but not SQLite.  SQLite is cross-platform, but it is a lot
// slower than GDBM.  SQLite only beats GDBM in connection open speed.

// Set the include path
if (!defined("INCPATH_SET")) {
    require_once dirname(__FILE__) . "/incpath.inc.php";
}
// Referenced subroutines
require_once "monica/addslash.inc.php";

// Settings
if (!defined("DBMDIR")) {
    // Default: the php_uname("m") subdirectory of the directory two levels
    // above the one that holds this file
    define("DBMDIR", dirname(dirname(dirname(__FILE__))) . "/" . php_uname("m"));
}
// Path of the character-to-pinyin GDBM database
define("_ZH2PY_DB", DBMDIR . "/zh2py.db");
// Regular expression fragment for the pinyin of one character: 1 to 6 units,
// each a lowercase ASCII letter or the byte pair C3 AA / C3 BC (UTF-8 for
// "ê" / "ü"), followed by a digit 1-5 (presumably the tone number).
// It is byte-oriented, so it must not be combined with the "u" modifier.
define("_ZH2PY_CHARPAT", "(?:[a-z]|\\xC3[\\xAA\\xBC]){1,6}[1-5]");
// The shared database handle, opened lazily by _zh2py_split_text()
$_ZH2PY = null;

// zh2py: Convert Chinese to pinyin
//   $chinese: the text to convert
//   Returns the first candidate from zh2pys(), or "" for empty text.
function zh2py($chinese)
{
    // zh2pys() bounces empty text with the string "" rather than a list.
    // Taking [0] of that empty string raises an "Uninitialized string
    // offset" notice/warning, so answer the empty case here directly.
    if ($chinese == "") {
        return "";
    }
    // Use the first result from zh2pys()
    $pinyins = zh2pys($chinese);
    return $pinyins[0];
}

// zh2pys: Convert Chinese to pinyin, return all possible pinyins
//   $chinese: the text to convert
//   Returns a list of strings, one for every combination of the pinyins
//   that _zh2py_chars2py() reports for the Chinese characters in the text.
//   Non-Chinese text is copied through unchanged.  Chinese characters are
//   separated by single spaces, and a space is inserted between two
//   adjacent pieces when neither side already has whitespace there.
//   Empty text is bounced with the string "" (not a list); this is kept
//   as-is for the existing callers.
function zh2pys($chinese)
{
    // Bounce the empty text
    if ($chinese == "") {
        return "";
    }

    // Split text into Chinese or non-Chinese pieces
    $pieces = _zh2py_split_text($chinese);

    // Turn each piece into a fragment of a printf format, collecting the
    // Chinese characters in order of appearance
    $chars = array();
    $fragments = array();
    foreach ($pieces as $piece) {
        if ($piece["is_chinese"]) {
            // A Chinese piece: one "%s" placeholder per character
            $length = mb_strlen($piece["text"]);
            $slots = array();
            for ($j = 0; $j < $length; $j++) {
                $chars[] = mb_substr($piece["text"], $j, 1);
                $slots[] = "%s";
            }
            $fragments[] = implode(" ", $slots);
        } else {
            // A non-Chinese piece: escape the printf metacharacter
            $fragments[] = str_replace("%", "%%", $piece["text"]);
        }
    }

    // Concatenate the fragments into the complete format
    $format = $fragments[0];
    $total = count($fragments);
    for ($i = 1; $i < $total; $i++) {
        // Insert a space unless one side already supplies whitespace
        if (    !preg_match("/\s$/", $fragments[$i-1])
                && !preg_match("/^\s/", $fragments[$i])) {
            $format .= " ";
        }
        $format .= $fragments[$i];
    }

    // Fill the format with every possible combination of pinyins
    $pinyins = array();
    foreach (_zh2py_chars2py($chars) as $combination) {
        $pinyins[] = vsprintf($format, $combination);
    }
    return $pinyins;
}

//
// pinyin_match_chinese: If the pinyin matches the Chinese
//   $chinese: the original text, possibly mixing Chinese and other text
//   $pinyin:  the candidate pinyin text to check
//   Returns true when $pinyin has the shape expected for $chinese: every
//   Chinese character replaced by something matching _ZH2PY_CHARPAT, the
//   non-Chinese text reproduced literally, and single spaces placed the
//   same way zh2pys() places them.  Returns false otherwise.
//   Only the shape is checked; the characters are not looked up for their
//   actual pinyins here.
function pinyin_match_chinese($chinese, $pinyin)
{
    // Split text into Chinese or non-Chinese pieces
    $pieces = _zh2py_split_text($chinese);

    // Convert each piece into a fragment of a Perl regular expression
    $fragments = array();
    foreach ($pieces as $piece) {
        if ($piece["is_chinese"]) {
            // A Chinese piece: one syllable pattern per character.
            // A Chinese piece always holds at least one character.
            $fragments[] = implode(" ",
                array_fill(0, mb_strlen($piece["text"]), _ZH2PY_CHARPAT));
        } else {
            // A non-Chinese piece: escape the regular expression
            // metacharacters
            $fragments[] = addslashes_re_php($piece["text"]);
        }
    }

    // Concatenate the fragments into the complete pattern
    $pattern = $fragments[0];
    $total = count($fragments);
    for ($i = 1; $i < $total; $i++) {
        // Insert a space unless one side already supplies whitespace,
        // either literally or as an escaped \r, \n or \t sequence
        if (    !preg_match("/(?:\s|\\\\[rnt])$/", $fragments[$i-1])
                && !preg_match("/^(?:\s|\\\\[rnt])/", $fragments[$i])) {
            $pattern .= " ";
        }
        $pattern .= $fragments[$i];
    }

    return (bool) preg_match("/^$pattern$/", $pinyin);
}

// _zh2py_split_text: Split text into Chinese or non-Chinese pieces
//   $text: the text to split
//   Returns a list of pieces in their original order.  Each piece is an
//   array with "is_chinese" (whether its characters exist as keys in the
//   pinyin database) and "text" (the run of consecutive characters that
//   share that status).  Empty text yields one empty non-Chinese piece.
//   Opens the shared database handle $_ZH2PY on first use.
function _zh2py_split_text($text)
{
    global $_ZH2PY;
    // Start the database
    if (is_null($_ZH2PY)) {
        $_ZH2PY = dba_open(_ZH2PY_DB, "r", "gdbm");
    }

    $length = mb_strlen($text);
    // Nothing to look up.  Answer with the single empty piece directly
    // instead of probing the database with a nonexistent first character.
    if ($length == 0) {
        return array(array(
            "is_chinese" => false,
            "text" => "",
        ));
    }

    $pieces = array();
    $last = -1;
    for ($i = 0; $i < $length; $i++) {
        $char = mb_substr($text, $i, 1);
        // One database lookup per character
        $is_chinese = dba_exists($char, $_ZH2PY);
        if ($last < 0 || ($is_chinese xor $pieces[$last]["is_chinese"])) {
            // First character, or the Chinese status changed:
            // start a new piece
            $pieces[] = array(
                "is_chinese" => $is_chinese,
                "text" => $char,
            );
            $last++;
        } else {
            // Append to the current piece
            $pieces[$last]["text"] .= $char;
        }
    }
    return $pieces;
}

// _zh2py_chars2py: Look up a series of Chinese characters
//   and return all possible pinyins
//   $chars: a list of single characters that exist in the database
//   Returns a list of combinations.  Each combination is a list with one
//   pinyin per character, in the order of $chars.  The pinyins of earlier
//   characters vary slowest.  An empty $chars yields one empty combination.
//   The database record of a character is read as pinyins separated by "|".
//   Uses the shared handle $_ZH2PY without opening it; the caller is
//   expected to have gone through _zh2py_split_text() first.
function _zh2py_chars2py($chars)
{
    global $_ZH2PY;

    // Start from the single empty combination and extend every combination
    // with every pinyin of each following character
    $results = array(array());
    foreach ($chars as $char) {
        $pinyins = explode("|", dba_fetch($char, $_ZH2PY));
        $extended = array();
        foreach ($results as $prefix) {
            foreach ($pinyins as $pinyin) {
                $combination = $prefix;
                $combination[] = $pinyin;
                $extended[] = $combination;
            }
        }
        $results = $extended;
    }
    return $results;
}

// The SQLite version - we are not using it
// SQLite is not as efficient as GDBM in this case.
// See unused.inc.php

?>