// Copyright: Copyright (C) 2007-2008 Pristine Communications

// Set the include path
if (!defined("INCPATH_SET")) {
    require_once dirname(__FILE__) . "/incpath.inc.php";
}
// Referenced subroutines
require_once "monica/addcol.inc.php";
require_once "monica/errhndl.inc.php";
require_once "monica/hires.inc.php";
require_once "monica/sql.inc.php";
require_once "monica/unicode.inc.php";
require_once "monica/zh2py.inc.php";

//
// cnvtmap.inc.php
//

// cnvtmap.inc.php is removed now.  It contained only this unused
// mb_encode_numericentity_cnvtmap().

// mb_encode_numericentity_cnvtmap: Obtain the cnvtmap of a character set
//      to be used in mb_encode_numericentity()
//  This is an occasional maintenance subroutine.  Do not call it
//  regularly.  The result should be saved.
//
//  Input:   $charset  The target character set name, as understood by
//                     iconv().
//           $archive  If true, also print the map as PHP source
//                     ($CNVTMAP["<charset>"] = array(...);).
//  Return:  A flat array of 4-integer groups (start, end, offset, mask)
//           covering every code point that iconv() fails to convert from
//           UTF-8 to $charset.
function mb_encode_numericentity_cnvtmap($charset, $archive = false)
{
    // Preserve the original timeout; the scan below is very long
    $timeout = ini_get("max_execution_time");
    ini_set("max_execution_time", 0);

    // 65536 * 32 = 0x200000, that is, every 21-bit value
    $limit = 65536 * 32;

    // Silence conversion errors for the whole scan.  The handler is
    // installed once, and it covers the UTF-32LE decoding too, so that
    // code points without a UTF-8 form do not flood the output with notices.
    set_error_handler("null_error_handler");
    // Find the characters that do not fit into that character set
    $ords = array();
    for ($i = 0; $i < $limit; $i++) {
        $c = iconv("UTF-32LE", "UTF-8", pack("V", $i));
        // No UTF-8 form at all: it can never appear in a UTF-8 input
        // string, so it is not recorded.
        if ($c === false) {
            continue;
        }
        $GLOBALS["php_errormsg"] = null;
        $converted = iconv("UTF-8", $charset, $c);
        // iconv() returns false on failure.  The php_errormsg check is
        // kept for the case where null_error_handler() records the
        // message there -- NOTE(review): confirm what the handler does.
        if ($converted === false || !is_null($GLOBALS["php_errormsg"])) {
            $ords[] = $i;
        }
    }
    restore_error_handler();

    // Convert the code points to cnvtmap for mb_encode_numericentity()
    $cnvtmap = _cnvtmap_from_ords($ords);

    // Output it in a format suitable to be archived
    if ($archive) {
        _cnvtmap_archive($charset, $cnvtmap);
    }

    // Restore the timeout
    ini_set("max_execution_time", $timeout);
    return $cnvtmap;
}

// mb_encode_numericentity_invalid_cnvtmap: Obtain the cnvtmap of invalid
//      unicode characters
//  This is an occasional maintenance subroutine.  Do not call it
//  regularly.  The result should be saved.
//
//  A code point in 0..65535 is recorded when pg_query() fails on a
//  "SELECT '<character>';" statement on the default PostgreSQL connection.
//
//  Input:   $archive  If true, also print the map as PHP source
//                     ($CNVTMAP["invalid"] = array(...);).
//  Return:  A flat array of 4-integer groups (start, end, offset, mask).
function mb_encode_numericentity_invalid_cnvtmap($archive = false)
{
    // Preserve the original timeout
    $timeout = ini_get("max_execution_time");
    ini_set("max_execution_time", 0);

    // Silence errors for the whole scan, including the UTF-32LE decoding
    set_error_handler("null_error_handler");
    // Find the characters that the database refuses
    $ords = array();
    for ($i = 0; $i < 65536; $i++) {
        // NOTE(review): $c is false when the code point has no UTF-8 form
        // (iconv() failed).  It is still passed on as before, so whether
        // such a code point is recorded depends on sql_esctext() and the
        // database -- confirm this is intended.
        $c = iconv("UTF-32LE", "UTF-8", pack("V", $i));
        $result = pg_query("SELECT '" . sql_esctext($c) . "';\n");
        if ($result === false) {
            $ords[] = $i;
        }
    }
    restore_error_handler();

    // Convert the code points to cnvtmap for mb_encode_numericentity()
    $cnvtmap = _cnvtmap_from_ords($ords);

    // Output it in a format suitable to be archived
    if ($archive) {
        _cnvtmap_archive("invalid", $cnvtmap);
    }

    // Restore the timeout
    ini_set("max_execution_time", $timeout);
    return $cnvtmap;
}

// _cnvtmap_from_ords: Compress an ascending list of code points into
//      a cnvtmap for mb_encode_numericentity()
//  Each run of consecutive code points becomes one group of 4 integers:
//  first code point, last code point, 0x000000 and 0xFFFFFF.
function _cnvtmap_from_ords($ords)
{
    $cnvtmap = array();
    $count = count($ords);
    if ($count == 0) {
        return $cnvtmap;
    }

    $start = $ords[0];
    $prev = $ords[0];
    for ($i = 1; $i < $count; $i++) {
        // The run is broken: close it and start a new one
        if ($ords[$i] != $prev + 1) {
            $cnvtmap[] = $start;
            $cnvtmap[] = $prev;
            $cnvtmap[] = 0x000000;
            $cnvtmap[] = 0xFFFFFF;
            $start = $ords[$i];
        }
        $prev = $ords[$i];
    }
    // Close the last run
    $cnvtmap[] = $start;
    $cnvtmap[] = $prev;
    $cnvtmap[] = 0x000000;
    $cnvtmap[] = 0xFFFFFF;

    return $cnvtmap;
}

// _cnvtmap_archive: Print a cnvtmap as PHP source, one group per line,
//      as an assignment to $CNVTMAP["<name>"]
function _cnvtmap_archive($name, $cnvtmap)
{
    $count = count($cnvtmap);
    echo "\$CNVTMAP[\"$name\"] = array(\n";
    for ($i = 0; $i < $count; $i += 4) {
        printf(" 0x%06X, 0x%06X, 0x%06X, 0x%06X,\n",
            $cnvtmap[$i], $cnvtmap[$i+1], $cnvtmap[$i+2], $cnvtmap[$i+3]);
    }
    echo ");\n";
}

//
// unicode.inc.php
//

// rest_hcerefs: Restore HTML character entities references in the database
//  This is an occasional maintenance subroutine.  Do not call it
//  regularly.
//  Also this will replace all HTML character entities references.
//  Stop if you want to preserve any of them.
//
//  Every string column of every record of every table returned by
//  sql_tables() is passed through a_hcref2char().  The records that
//  changed are printed ("table - sn") and updated, keyed on their "sn"
//  column, in a single transaction at the end.
//  Input:   none
//  Return:  none
function rest_hcerefs()
{
    // Preserve the original timeout
    $timeout = ini_get("max_execution_time");
    ini_set("max_execution_time", 0);
    $t0 = time_hires();

    // Lock the tables
    // NOTE(review): the locks are never explicitly released in this
    // subroutine -- confirm that sql_commit() or the end of the script
    // releases them.
    $tables = sql_tables();
    $locks = array();
    foreach ($tables as $table) {
        $locks[$table] = LOCK_EX;
    }
    sql_lock($locks);

    // Collect the UPDATE statements first
    $sqls = array();
    // Loop each table
    foreach ($tables as $table) {
        $select = "SELECT * FROM $table;\n";
        $result = sql_query($select);
        $count = sql_num_rows($result);
        // The column list is the same for every record of the table.
        // Obtain it once per table, and only when there are records.
        $colnames = ($count > 0)? sql_cols($table): array();
        // Loop each record
        for ($i = 0; $i < $count; $i++) {
            $cur = sql_fetch_assoc($result);
            $cols = new AddCol($table, ADDCOL_UPDATE);
            // Loop each column
            foreach ($colnames as $col) {
                // Skip non-string (numbers, boolean) columns
                if (!is_string($cur[$col])) {
                    continue;
                }
                // Read the character references with a_hcref2char()
                $restored = a_hcref2char($cur[$col]);
                $cols->addstr($col, $restored, $cur[$col]);
            }
            if ($cols->modified()) {
                printf("%s - %s\n", $table, $cur["sn"]);
                $sqls[] = "UPDATE $table " . $cols->ret()
                    . " WHERE sn=" . $cur["sn"] . ";\n";
            }
        }
    }

    // Update it
    sql_begin();
    foreach ($sqls as $sql) {
        sql_query($sql);
    }
    sql_commit();

    // Restore the timeout
    ini_set("max_execution_time", $timeout);
    $t1 = time_hires();
    printf("[%s] Done. %0.10f seconds elapsed\n",
        date("Y-m-d H:i:s"), $t1-$t0);
    return;
}

//
// zh2py.inc.php
//

// The SQLite version - we are not using it

// test_zh2py_sqlite: Run tests on the speed of GDBM vs.
//      SQLite
//  Times zh2pys() against zh2pys_sqlite() on randomly chosen phrases.
//  The SQLite database is built from the GDBM database first if the
//  database file does not exist.
function test_zh2py_sqlite()
{
    // Settings
    if (!defined("_ZH2PY_SQLITE_DB")) {
        define("_ZH2PY_SQLITE_DB", "/tmp/zh2py.db");
    }
    $GLOBALS["_ZH2PY_SQLITE"] = null;
    if (!file_exists(_ZH2PY_SQLITE_DB)) {
        zh2pydb_gdbm2sqlite();
    }

    $phrases = explode(" ", "臺北大塞車 我是依瑪貓 廚王爭霸戰 甜心酥餅 一口接一口 蒙大拿牛仔妹 恐龍入侵台灣 綠巨人玉米醬 小魚的故事 玉山銀行 我的一顆心 牛伯伯沙茶醬 王建民大勝利");
    $last = count($phrases) - 1;

    // Open the connections first, with a phrase that is not timed
    $used = array();
    $idx = rand(0, $last);
    $used[] = $idx;
    zh2pys($phrases[$idx]);
    zh2pys_sqlite($phrases[$idx]);

    // Each suite: (number of phrases, number of repetitions).
    // A phrase is never used twice, so the suites and the warm-up above
    // need 1 + 1 + 5 + 1 + 5 = 13 distinct phrases.  That is exactly the
    // size of the list; adding a suite without adding phrases would make
    // the selection loop below spin forever.
    $suites = array(
        array(1, 1),
        array(5, 1),
        array(1, 4),
        array(5, 4),
    );
    foreach ($suites as $suite) {
        // Pick the unused phrases of this suite
        $testphrases = array();
        while (count($testphrases) < $suite[0]) {
            $idx = rand(0, $last);
            if (!in_array($idx, $used)) {
                $used[] = $idx;
                $testphrases[] = $phrases[$idx];
            }
        }
        test_zh2py_sqlite_onetest($testphrases, $suite[1]);
    }
    return;
}

// test_zh2py_sqlite_onetest: Run one GDBM vs. SQLite test suite
//  Input:   $phrases  The array of phrases to convert.
//           $count    The number of times to convert them all.
//  Return:  none.  The elapsed time of each version is printed.
function test_zh2py_sqlite_onetest($phrases, $count)
{
    printf("=== Phrase %s for %d times ...\n", join(", ", $phrases), $count);

    $t0 = time_hires();
    for ($i = 0; $i < $count; $i++) {
        foreach ($phrases as $phrase) {
            zh2pys($phrase);
        }
    }
    printf("%-16s %0.10f seconds elapsed.\n", "zh2pys():", time_hires()-$t0);

    $t0 = time_hires();
    for ($i = 0; $i < $count; $i++) {
        foreach ($phrases as $phrase) {
            zh2pys_sqlite($phrase);
        }
    }
    printf("%-16s %0.10f seconds elapsed.\n", "zh2pys_sqlite():", time_hires()-$t0);
    return;
}

// zh2pydb_gdbm2sqlite: Initialize the zh2py SQLite database from the GDBM database
//  An existing SQLite database file is removed and rebuilt from scratch.
//  Each GDBM key becomes one row per "|"-separated pinyin of its value,
//  with "ord" being the position of that pinyin in the value.
function zh2pydb_gdbm2sqlite()
{
    global $_ZH2PY, $_ZH2PY_SQLITE;

    // Start the GDBM database
    if (is_null($_ZH2PY)) {
        $_ZH2PY = dba_open(_ZH2PY_DB, "r", "gdbm");
        if ($_ZH2PY === false) {
            trigger_error("Failed dba_open().\n" . _ZH2PY_DB, E_USER_ERROR);
        }
    }

    // Start the SQLite database afresh
    if (!is_null($_ZH2PY_SQLITE)) {
        sqlite_close($_ZH2PY_SQLITE);
        // Assign null instead of unset().  unset() on a variable imported
        // with "global" only destroys the local alias: the global would
        // keep the closed handle, and the handle opened below would be
        // stored in a mere local variable.
        $_ZH2PY_SQLITE = null;
    }
    if (file_exists(_ZH2PY_SQLITE_DB)) {
        unlink(_ZH2PY_SQLITE_DB);
    }
    _zh2py_sqlite_connect();
    _zh2py_sqlite_exec("CREATE TABLE zh2py (ch varchar(3) NOT NULL, ord int NOT NULL, pinyin varchar(7) NOT NULL);\n");

    // Copy the records.  One transaction for all the rows, so that
    // SQLite does not commit after every single INSERT.
    _zh2py_sqlite_exec("BEGIN TRANSACTION;\n");
    $char = dba_firstkey($_ZH2PY);
    while ($char !== false) {
        $pinyins = explode("|", dba_fetch($char, $_ZH2PY));
        foreach ($pinyins as $ord => $pinyin) {
            _zh2py_sqlite_exec("INSERT INTO zh2py (ch, ord, pinyin)"
                . " VALUES ('" . sqlite_escape_string($char) . "', $ord, '"
                . sqlite_escape_string($pinyin) . "');\n");
        }
        $char = dba_nextkey($_ZH2PY);
    }
    _zh2py_sqlite_exec("COMMIT TRANSACTION;\n");

    _zh2py_sqlite_exec("CREATE INDEX zh2py_char ON zh2py (ch);\n");
    return;
}

// zh2pys_sqlite: Convert Chinese to pinyin, return all possible pinyins
//  Input:   $chinese  The text to convert.
//  Return:  An array of strings, one for each combination of the pinyins
//           of the Chinese characters in the text, or an empty string
//           if the text is empty.
function zh2pys_sqlite($chinese)
{
    // Bounce the empty text
    if ($chinese == "") {
        return "";
    }

    // Split text into Chinese or non-Chinese pieces
    $pieces = _zh2py_sqlite_split_text($chinese);

    // Convert each piece into a proper printf pattern
    $chars = array();
    $patterns = array();
    foreach ($pieces as $piece) {
        // A Chinese piece: one "%s" for each character
        if ($piece["is_chinese"]) {
            $len = mb_strlen($piece["text"]);
            $slots = array();
            for ($j = 0; $j < $len; $j++) {
                $chars[] = mb_substr($piece["text"], $j, 1);
                $slots[] = "%s";
            }
            $patterns[] = implode(" ", $slots);
        // A non-Chinese piece
        } else {
            // Escape the printf metacharacter
            $patterns[] = str_replace("%", "%%", $piece["text"]);
        }
    }

    // Concatenate the patterns
    $format = $patterns[0];
    for ($i = 1; $i < count($patterns); $i++) {
        // Insert a space, unless there is already one at the boundary
        if (    !preg_match("/\s$/", $patterns[$i-1])
                && !preg_match("/^\s/", $patterns[$i])) {
            $format .= " ";
        }
        $format .= $patterns[$i];
    }

    // Get all the possible pinyins
    $combinations = _zh2py_sqlite_chars2py($chars);
    $pinyins = array();
    foreach ($combinations as $combination) {
        $pinyins[] = vsprintf($format, $combination);
    }
    return $pinyins;
}

// _zh2py_sqlite_split_text: Split text into Chinese or non-Chinese pieces
//  A character is Chinese if it has a row in the zh2py table.
//  Input:   $text  The text to split.
//  Return:  An array of pieces, in order.  Each piece is an array with
//           the keys "is_chinese" (boolean) and "text" (string).
//           For an empty text, one empty non-Chinese piece is returned.
function _zh2py_sqlite_split_text($text)
{
    // Start the database
    _zh2py_sqlite_connect();

    $len = mb_strlen($text);
    if ($len == 0) {
        return array(array(
            "is_chinese"    => false,
            "text"          => "",
        ));
    }

    $pieces = array();
    for ($i = 0; $i < $len; $i++) {
        $char = mb_substr($text, $i, 1);
        $is_chinese = _zh2py_sqlite_is_chinese($char);
        $cur = count($pieces) - 1;
        // Chinese status unchanged: append to the current piece
        if ($cur >= 0 && $pieces[$cur]["is_chinese"] == $is_chinese) {
            $pieces[$cur]["text"] .= $char;
        // The first character, or Chinese status changed: start a new piece
        } else {
            $pieces[] = array(
                "is_chinese"    => $is_chinese,
                "text"          => $char,
            );
        }
    }
    return $pieces;
}

// _zh2py_sqlite_chars2py: Look up a series of Chinese characters
//      and return all possible pinyins
//  Input:   $chars  The array of characters.
//  Return:  An array of combinations.  Each combination is an array with
//           one pinyin for each character, in order.  Note that the
//           number of combinations is the product of the numbers of
//           pinyins of each character.
function _zh2py_sqlite_chars2py($chars)
{
    // No more characters to work with
    if (count($chars) == 0) {
        return array(array());
    }

    // The pinyins of the first character
    $char = array_shift($chars);
    $result = _zh2py_sqlite_query("SELECT pinyin FROM zh2py"
        . " WHERE ch='" . sqlite_escape_string($char) . "'"
        . " ORDER BY ord;\n");
    $count = sqlite_num_rows($result);
    $pinyins = array();
    for ($i = 0; $i < $count; $i++) {
        $row = sqlite_fetch_array($result, SQLITE_ASSOC);
        $pinyins[] = $row["pinyin"];
    }

    // Combine them with the combinations of the rest characters
    $follows = _zh2py_sqlite_chars2py($chars);
    $results = array();
    foreach ($pinyins as $pinyin) {
        foreach ($follows as $follow) {
            $results[] = array_merge(array($pinyin), $follow);
        }
    }
    return $results;
}

// _zh2py_sqlite_connect: Open the SQLite database if it is not opened yet,
//      and return the handle kept in the global $_ZH2PY_SQLITE
function _zh2py_sqlite_connect()
{
    global $_ZH2PY_SQLITE;
    if (is_null($_ZH2PY_SQLITE)) {
        $error = null;
        $_ZH2PY_SQLITE = sqlite_open(_ZH2PY_SQLITE_DB, 0666, $error);
        if ($_ZH2PY_SQLITE === false) {
            trigger_error("Failed sqlite_open().\n$error", E_USER_ERROR);
        }
    }
    return $_ZH2PY_SQLITE;
}

// _zh2py_sqlite_exec: Run a result-less SQL statement on the SQLite
//      database, raise an E_USER_ERROR on failure
function _zh2py_sqlite_exec($sql)
{
    $error = null;
    $r = sqlite_exec(_zh2py_sqlite_connect(), $sql, $error);
    if ($r === false) {
        trigger_error("Failed sqlite_exec().\n$sql\n$error", E_USER_ERROR);
    }
    return $r;
}

// _zh2py_sqlite_query: Run an SQL query on the SQLite database and
//      return its result, raise an E_USER_ERROR on failure
function _zh2py_sqlite_query($select)
{
    $error = null;
    $result = sqlite_query($select, _zh2py_sqlite_connect(), SQLITE_ASSOC, $error);
    if ($result === false) {
        trigger_error("Failed sqlite_query().\n$select\n$error", E_USER_ERROR);
    }
    return $result;
}

// _zh2py_sqlite_is_chinese: Whether a character has a pinyin in the
//      zh2py table
function _zh2py_sqlite_is_chinese($char)
{
    $result = _zh2py_sqlite_query("SELECT pinyin FROM zh2py"
        . " WHERE ch='" . sqlite_escape_string($char) . "'"
        . " LIMIT 1;\n");
    return sqlite_num_rows($result) > 0;
}

?>