Initial commit.

This commit is contained in:
2026-03-10 21:25:26 +08:00
commit 78739bf725
3089 changed files with 472990 additions and 0 deletions

View File

@@ -0,0 +1,185 @@
<?php
// File name: zh2py.inc.php
// Description: PHP subroutines to convert Chinese to pinyin
// Date: 2004-05-15
// Author: imacat <imacat@pristine.com.tw>
// Copyright: Copyright (C) 2004-2007 Pristine Communications
// We use GDBM but not SQLite. SQLite is cross-platform, but it is a lot
// slower than GDBM. SQLite only wins GDBM in connection open speed.
// Set the include path
// incpath.inc.php is loaded only while INCPATH_SET is undefined;
// presumably that file defines INCPATH_SET itself -- TODO confirm.
if (!defined("INCPATH_SET")) {
require_once dirname(__FILE__) . "/incpath.inc.php";
}
// Referenced subroutines
// NOTE(review): presumably the source of addslashes_re_php(), which
// pinyin_match_chinese() calls -- confirm.
require_once "monica/addslash.inc.php";
// Settings
// DBMDIR may be defined by the including script. The default is the
// directory three levels above this file, followed by a subdirectory
// named after the machine type reported by php_uname("m").
if (!defined("DBMDIR")) {
define("DBMDIR", dirname(dirname(dirname(__FILE__))) . "/" . php_uname("m"));
}
// Path of the database file that maps Chinese characters to pinyin
define("_ZH2PY_DB", DBMDIR . "/zh2py.db");
// Regular expression fragment for the pinyin of one character:
// 1 to 6 units, each a lowercase ASCII letter or one of the byte pairs
// C3 AA / C3 BC (the UTF-8 encodings of U+00EA and U+00FC), followed
// by a single digit 1-5 (presumably the tone number).
define("_ZH2PY_CHARPAT", "(?:[a-z]|\\xC3[\\xAA\\xBC]){1,6}[1-5]");
// Shared database handle; null until _zh2py_split_text() opens it
$_ZH2PY = null;
// zh2py: Convert Chinese to pinyin
//
// Parameters:
//   $chinese  The text to convert.
// Returns:
//   The first of the candidate pinyin strings produced by zh2pys(),
//   or "" when the input text is empty.
function zh2py($chinese)
{
    $pinyins = zh2pys($chinese);

    // zh2pys() answers empty input with the string "" instead of an
    // array.  Indexing that string with [0] raises an "uninitialized
    // string offset" notice/warning, so return it as it is.
    if (!is_array($pinyins)) {
        return "";
    }

    // Use the first result from zh2pys()
    return $pinyins[0];
}
// zh2pys: Convert Chinese to pinyin, returning every possible pinyin
//
// The text is cut into Chinese and non-Chinese runs.  Each Chinese
// character becomes a "%s" slot in a printf format, non-Chinese text
// is kept literally, and the format is then filled in once for each
// combination of readings returned by _zh2py_chars2py().
//
// Parameters:
//   $chinese  The text to convert.
// Returns:
//   A list of pinyin strings, one per combination of readings.
//   Empty input returns the string "" (not an array).
function zh2pys($chinese)
{
    // Nothing to convert
    if ($chinese == "") {
        return "";
    }

    // Turn every run of text into a fragment of the printf format
    $hanzi = array();
    $fragments = array();
    foreach (_zh2py_split_text($chinese) as $segment) {
        // Non-Chinese text is kept as it is, with "%" escaped for printf
        if (!$segment["is_chinese"]) {
            $fragments[] = str_replace("%", "%%", $segment["text"]);
            continue;
        }
        // Chinese text: one space-separated "%s" slot per character
        $slots = array();
        for ($pos = 0; $pos < mb_strlen($segment["text"]); $pos++) {
            $hanzi[] = mb_substr($segment["text"], $pos, 1);
            $slots[] = "%s";
        }
        $fragments[] = implode(" ", $slots);
    }

    // Join the fragments, separating neighbours with a space unless
    // one of them already provides whitespace at the boundary
    $format = $fragments[0];
    $total = count($fragments);
    for ($k = 1; $k < $total; $k++) {
        if (!(preg_match("/\s$/", $fragments[$k - 1])
                || preg_match("/^\s/", $fragments[$k]))) {
            $format .= " ";
        }
        $format .= $fragments[$k];
    }

    // Fill in the format once for every combination of readings
    $results = array();
    foreach (_zh2py_chars2py($hanzi) as $readings) {
        $results[] = vsprintf($format, $readings);
    }
    return $results;
}
// pinyin_match_chinese: Check whether a pinyin text matches a Chinese text
//
// A regular expression is built from $chinese: every Chinese character
// becomes _ZH2PY_CHARPAT, non-Chinese text is escaped with
// addslashes_re_php() and has to appear literally.  $pinyin must match
// that expression from its first to its very last byte.
//
// Parameters:
//   $chinese  The original text.
//   $pinyin   The pinyin text to check against it.
// Returns:
//   true if $pinyin has the expected shape, false otherwise.
function pinyin_match_chinese($chinese, $pinyin)
{
    // Split text into Chinese or non-Chinese pieces
    $pieces = _zh2py_split_text($chinese);
    $total = count($pieces);

    // Convert each piece into a regular expression fragment
    for ($i = 0; $i < $total; $i++) {
        if ($pieces[$i]["is_chinese"]) {
            // A Chinese piece: one syllable pattern per character,
            // separated by single spaces
            $length = mb_strlen($pieces[$i]["text"]);
            $patterns = array();
            for ($j = 0; $j < $length; $j++) {
                $patterns[] = _ZH2PY_CHARPAT;
            }
            $pieces[$i]["text"] = implode(" ", $patterns);
        } else {
            // A non-Chinese piece: escape the regular expression
            // metacharacters.
            // NOTE(review): assumes addslashes_re_php() also escapes the
            // "/" delimiter used below -- confirm against that helper.
            $pieces[$i]["text"] = addslashes_re_php($pieces[$i]["text"]);
        }
    }

    // Concatenate the fragments.  A space is inserted between two
    // neighbours unless one of them already has whitespace (literal,
    // or written as \r, \n or \t) at the boundary.
    $pattern = $pieces[0]["text"];
    for ($i = 1; $i < $total; $i++) {
        if (   !preg_match("/(?:\s|\\\\[rnt])$/", $pieces[$i-1]["text"])
            && !preg_match("/^(?:\s|\\\\[rnt])/", $pieces[$i]["text"])) {
            $pattern .= " ";
        }
        $pattern .= $pieces[$i]["text"];
    }

    // The D modifier makes "$" match only at the very end of the
    // subject.  Without it "$" also matches before a final newline,
    // so a pinyin with a stray trailing "\n" would be accepted.
    return preg_match('/^' . $pattern . '$/D', $pinyin)? true: false;
}
// _zh2py_split_text: Split text into Chinese and non-Chinese pieces
//
// A character counts as Chinese when it exists as a key in the
// database _ZH2PY_DB.  The database is opened read-only the first
// time it is needed, and the handle is kept in the global $_ZH2PY.
//
// Parameters:
//   $text  The text to split.
// Returns:
//   A list of pieces in input order.  Each piece is an array with
//     "is_chinese" => whether the piece consists of characters found
//                     in the database
//     "text"       => the characters of the piece
//   Neighbouring pieces always differ in "is_chinese".  Empty text
//   yields a single non-Chinese piece with empty text.
function _zh2py_split_text($text)
{
    global $_ZH2PY;

    // Empty text has no first character to classify.  Return the
    // single empty piece directly instead of looking up an undefined
    // character; the database is not needed for this.
    if ((string) $text === "") {
        return array(array("is_chinese" => false, "text" => ""));
    }

    // Start the database
    if (is_null($_ZH2PY)) {
        $_ZH2PY = dba_open(_ZH2PY_DB, "r", "gdbm");
    }

    $pieces = array();
    $last = -1;
    $length = mb_strlen($text);
    for ($i = 0; $i < $length; $i++) {
        $char = mb_substr($text, $i, 1);
        // Look up each character only once
        $is_chinese = dba_exists($char, $_ZH2PY);
        if ($last < 0 || $is_chinese !== $pieces[$last]["is_chinese"]) {
            // First character, or the Chinese status changed:
            // start a new piece
            $pieces[] = array(
                "is_chinese" => $is_chinese,
                "text" => $char,
            );
            $last++;
        } else {
            // Same status: append to the current piece
            $pieces[$last]["text"] .= $char;
        }
    }
    return $pieces;
}
// _zh2py_chars2py: Look up a series of Chinese characters
// and return all possible pinyins
//
// The readings of each character are fetched from the database handle
// in the global $_ZH2PY, where they are stored as one "|"-separated
// string.  The result holds every combination of one reading per
// character, with the readings of earlier characters varying slowest.
//
// Parameters:
//   $chars  A list of single Chinese characters.
// Returns:
//   A list of combinations; each combination is a list with one
//   reading per character, in the order of $chars.  An empty $chars
//   yields one empty combination.
function _zh2py_chars2py($chars)
{
    global $_ZH2PY;

    // No characters: the only combination is the empty one
    if (count($chars) == 0) {
        return array(array());
    }

    // Grow the combinations one character at a time
    $combinations = array(array());
    foreach ($chars as $hanzi) {
        $readings = explode("|", dba_fetch($hanzi, $_ZH2PY));
        $extended = array();
        foreach ($combinations as $prefix) {
            foreach ($readings as $reading) {
                $candidate = $prefix;
                $candidate[] = $reading;
                $extended[] = $candidate;
            }
        }
        $combinations = $extended;
    }
    return $combinations;
}
// The SQLite version - we are not using it
// SQLite is not as efficient as GDBM in this case.
// See unused.inc.php
?>