Bgbot — a MediaWiki robot Code
Status: Beta
Brought to you by:
bmanolov
--- a/trunk/utils.php +++ b/trunk/utils.php @@ -20,8 +20,8 @@ # 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. # http://www.gnu.org/copyleft/gpl.html # -# Author: Borislav Manolov <b.manolov at web.de> -# http://purl.oclc.org/NET/manolov/ +# Author: Borislav Manolov <b.manolov at gmail dot com> +# http://purl.org/NET/borislav/ ############################################################################# @@ -36,150 +36,160 @@ $lats = array( - 'a', 'b', 'v', 'g', 'd', 'e', 'zh', 'z', 'i', 'j', - 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', - 'f', 'h', 'c', 'ch', 'sh', 'sht', 'j', 'y', 'ju', 'ja', - 'A', 'B', 'V', 'G', 'D', 'E', 'Zh', 'Z', 'I', 'J', - 'K', 'L', 'M', 'N', 'O', 'P', 'R', 'S', 'T', 'U', - 'F', 'H', 'C', 'Ch', 'Sh', 'Sht', 'J', 'Y', 'Ju', 'Ja', - ' '); +'a', 'b', 'v', 'g', 'd', 'e', 'zh', 'z', 'i', 'j', +'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', +'f', 'h', 'c', 'ch', 'sh', 'sht', 'j', 'y', 'ju', 'ja', +'A', 'B', 'V', 'G', 'D', 'E', 'Zh', 'Z', 'I', 'J', +'K', 'L', 'M', 'N', 'O', 'P', 'R', 'S', 'T', 'U', +'F', 'H', 'C', 'Ch', 'Sh', 'Sht', 'J', 'Y', 'Ju', 'Ja', +' '); # change the string encoding from windows-1251 to UTF-8 function utf8($str) { - return iconv('windows-1251', 'UTF-8', $str); + return iconv('windows-1251', 'UTF-8', $str); } function cyr2hex($str) { - return urlencode( iconv('windows-1251', 'UTF-8', $str) ); + return urlencode( iconv('windows-1251', 'UTF-8', $str) ); } function cyr2lat($str) { - global $cyrs, $lats; - return str_replace($cyrs, $lats, $str); + global $cyrs, $lats; + return str_replace($cyrs, $lats, $str); } # read a line from the console function my_readline($str = 'Enter required data: ') { - echo "\n$str"; + echo "\n$str"; - $fp = fopen('php://stdin', 'r'); - $resp = trim( fgets($fp) ); - fclose($fp); - return $resp; + $fp = fopen('php://stdin', 'r'); + $resp = trim( fgets($fp) ); + fclose($fp); + return $resp; } function my_fwrite($file, $text, $mode='a+') { - $myFile = @fopen($file, $mode); - if (! $myFile) return 0; - if (! @fputs($myFile, $text)) return 0; - if (! @fclose($myFile)) return 0; - return 1; + $myFile = @fopen($file, $mode); + if (! $myFile) return false; + flock($myFile, LOCK_EX); + if (! @fputs($myFile, $text)) return false; + flock($myFile, LOCK_UN); + if (! @fclose($myFile)) return false; + return true; } function my_urlencode($str) { - $len = strlen($str); + $len = strlen($str); for ($i = 0; $i < $len; $i++) { $nstr .= $str{$i} != '%' ? urlencode($str{$i}) : '%'; } - return $nstr; + return $nstr; } # returns the text inside the $element element function getCDATA($element, $data) { - preg_match("/<$element>(.+)<\/$element>/s", $data, $matches); - return $matches[1]; + preg_match("/<$element.*>(.+)<\/$element>/Us", $data, $matches); + return $matches[1]; } # returns the text inside the $element elements function getCDATAs($element, $data) { - preg_match_all("/<$element>(.+)<\/$element>/s", $data, $matches); - return $matches[1]; + preg_match_all("/<$element.*>(.+)<\/$element>/Us", $data, $matches); + return $matches[1]; } # original source: http://www.randomchaos.com/document.php?source=php_and_unicode function utf82unicode($str) { - $unicode = ''; - $values = array(); - $lookingFor = 1; + $unicode = ''; + $values = array(); + $lookingFor = 1; - for ($i = 0; $i < strlen($str); $i++ ) { - $thisValue = ord( $str{$i} ); - if ( $thisValue < 128 ) { - $unicode .= $str{$i}; # don't convert ASCII characters - } else { - if ( count($values) == 0 ) - $lookingFor = ( $thisValue < 224 ) ? 2 : 3; + for ($i = 0; $i < strlen($str); $i++ ) { + $thisValue = ord( $str{$i} ); + if ( $thisValue < 128 ) { + $unicode .= $str{$i}; # don't convert ASCII characters + } else { + if ( count($values) == 0 ) { + $lookingFor = ( $thisValue < 224 ) ? 2 : 3; + } - $values[] = $thisValue; + $values[] = $thisValue; - if ( count( $values ) == $lookingFor ) { - $number = ( $lookingFor == 3 ) - ? - ( ( $values[0] % 16 ) * 4096 ) + ( ( $values[1] % 64 ) * 64 ) + ( $values[2] % 64 ) - : - ( ( $values[0] % 32 ) * 64 ) + ( $values[1] % 64 ); + if ( count( $values ) == $lookingFor ) { + $number = ( $lookingFor == 3 ) + ? + ( ( $values[0] % 16 ) * 4096 ) + ( ( $values[1] % 64 ) * 64 ) + ( $values[2] % 64 ) + : + ( ( $values[0] % 32 ) * 64 ) + ( $values[1] % 64 ); - $unicode .= "&#$number;"; - $values = array(); - $lookingFor = 1; - } // if - } // if - } // for - return $unicode; -} // utf82unicode + $unicode .= "&#$number;"; + $values = array(); + $lookingFor = 1; + } + } + } + return $unicode; +} # encodes a unicode sequence as UTF-8 function unicode2utf8($unicode_str) { - $base = 10; - if ($unicode_str[0] == 'x') { - $unicode_str = substr($unicode_str, 1); - $base = 16; - } - $bin = base_convert($unicode_str, $base, 2); - $len = strlen($bin); - if ( $len < 8 ) { - // 0xxxxxxx - $bin = '0' . str_pad($bin, 7, '0', STR_PAD_LEFT); - } elseif ( $len < 12 ) { - // 110xxxxx 10xxxxxx - $bin = str_pad($bin, 11, '0', STR_PAD_LEFT); - $bin = substr_replace($bin, '10', 5, 0); - $bin = '110' . $bin; - } elseif ( $len < 17 ) { - // 1110xxxx 10xxxxxx 10xxxxxx - $bin = str_pad($bin, 16, '0', STR_PAD_LEFT); - $bin = substr_replace($bin, '10', 4, 0); - $bin = substr_replace($bin, '10', 12, 0); - $bin = '1110' . $bin; - } else { - // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - $bin = str_pad($bin, 21, '0', STR_PAD_LEFT); - $bin = substr_replace($bin, '10', 3, 0); - $bin = substr_replace($bin, '10', 11, 0); - $bin = substr_replace($bin, '10', 19, 0); - $bin = '11110' . $bin; - } + $base = 10; + if ($unicode_str[0] == 'x') { + $unicode_str = substr($unicode_str, 1); + $base = 16; + } + $bin = base_convert($unicode_str, $base, 2); + $len = strlen($bin); + if ( $len < 8 ) { + // 0xxxxxxx + $bin = '0' . str_pad($bin, 7, '0', STR_PAD_LEFT); + } elseif ( $len < 12 ) { + // 110xxxxx 10xxxxxx + $bin = str_pad($bin, 11, '0', STR_PAD_LEFT); + $bin = substr_replace($bin, '10', 5, 0); + $bin = '110' . $bin; + } elseif ( $len < 17 ) { + // 1110xxxx 10xxxxxx 10xxxxxx + $bin = str_pad($bin, 16, '0', STR_PAD_LEFT); + $bin = substr_replace($bin, '10', 4, 0); + $bin = substr_replace($bin, '10', 12, 0); + $bin = '1110' . $bin; + } else { + // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + $bin = str_pad($bin, 21, '0', STR_PAD_LEFT); + $bin = substr_replace($bin, '10', 3, 0); + $bin = substr_replace($bin, '10', 11, 0); + $bin = substr_replace($bin, '10', 19, 0); + $bin = '11110' . $bin; + } - $hex = base_convert($bin, 2, 16); - $hex = strtoupper($hex); - $len = strlen($hex); - if ($len == 1) { - $utf8 = '%0' . $hex; - } else { - for ($i = 0; $i < strlen($hex); $i+=2) { - $utf8 .= '%' . $hex[$i] . $hex[$i+1]; - } - } + $hex = base_convert($bin, 2, 16); + $hex = strtoupper($hex); + $len = strlen($hex); + if ($len == 1) { + $utf8 = '%0' . $hex; + } else { + for ($i = 0; $i < strlen($hex); $i+=2) { + $utf8 .= '%' . $hex[$i] . $hex[$i+1]; + } + } - return $utf8; + return $utf8; } -?> + +# escape all regexp meta-characters in a string +function escape_regexp($str) { + return str_replace( + array('\\', '^', '$', '.', '[', ']', '|', '(', ')', '?', '*', '+', '{', ']', '/'), + array('\\\\', '\^', '\$', '\.', '\[', '\]', '\|', '\(', '\)', '\?', '\*', '\+', '\{', '\]', '\/'), + $str); +}