Bgbot — a MediaWiki robot Code

Status: Beta
Brought to you by: bmanolov
[r41]: / trunk / utils.php Maximize Restore History
291 lines (232 with data), 7.4 kB

<?php
#
#  utils
#
#  Copyright (C) 2004 Borislav Manolov
#
#  This program is free software; you can redistribute it and/or
#  modify it under the terms of the GNU General Public License
#  as published by the Free Software Foundation; either version 2
#  of the License, or (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License along
#  with this program; if not, write to the Free Software Foundation, Inc.,
#  59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#  http://www.gnu.org/copyleft/gpl.html
#
#  Author: Borislav Manolov <b.manolov at gmail dot com>
#          http://purl.org/NET/borislav/
#############################################################################


# Search strings for cyr2lat(): lower-case Cyrillic letters, then upper-case
# ones, then one trailing entry. cyr2lat() hands this list to str_replace()
# together with $lats, so the entry at index N here is replaced by the entry
# at index N in $lats.
#
# NOTE(review): as listed, this array holds 59 entries while $lats holds 61
# (each letter row ending in 'я'/'Я' has 9 entries here versus 10 in $lats).
# If that is what the real file contains, every pair from 'ъ' onwards is
# shifted by one. Presumably 'ь' and 'Ь' are missing - confirm against the
# original windows-1251 file before relying on this table.
# NOTE(review): every literal starts with a space-like character; this may be
# an encoding artifact rather than intended data - confirm.
$cyrs = array(
' а',' б',' в',' г',' д',' е',' ж',' з',' и',' й',
' к',' л',' м',' н',' о',' п',' р',' с',' т',' у',
' ф',' х',' ц',' ч',' ш',' щ',' ъ',' ю',' я',
' А',' Б',' В',' Г',' Д',' Е',' Ж',' З',' И',' Й',
' К',' Л',' М',' Н',' О',' П',' Р',' С',' Т',' У',
' Ф',' Х',' Ц',' Ч',' Ш',' Щ',' Ъ',' Ю',' Я',
' ');


# Replacement strings for cyr2lat(): the Latin transliteration that
# str_replace() substitutes for the entry at the same index in $cyrs.
# Rows are lower-case letters, upper-case letters, then one trailing entry.
#
# NOTE(review): this array holds 61 entries while $cyrs, as listed, holds 59,
# so the two tables do not line up one-to-one from the seventh entry of the
# third row onwards - confirm against the original file.
$lats = array(
'a', 'b', 'v', 'g', 'd', 'e', 'zh', 'z', 'i', 'j',
'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u',
'f', 'h', 'c', 'ch', 'sh', 'sht', 'j', 'y', 'ju', 'ja',
'A', 'B', 'V', 'G', 'D', 'E', 'Zh', 'Z', 'I', 'J',
'K', 'L', 'M', 'N', 'O', 'P', 'R', 'S', 'T', 'U',
'F', 'H', 'C', 'Ch', 'Sh', 'Sht', 'J', 'Y', 'Ju', 'Ja',
' ');


# Re-encode a windows-1251 string as UTF-8.
# Returns the result of iconv() for that conversion.
function utf8($str) {
	$converted = iconv('windows-1251', 'UTF-8', $str);
	return $converted;
}


# Re-encode a windows-1251 string as UTF-8 and URL-encode the resulting bytes.
function cyr2hex($str) {
	$utf = iconv('windows-1251', 'UTF-8', $str);
	return urlencode($utf);
}


# Transliterate $str: every entry of the global $cyrs list found in the
# string is replaced by the entry at the same index of the global $lats list
# (str_replace() applies the pairs one after another, in list order).
function cyr2lat($str) {
	global $cyrs, $lats;
	$transliterated = str_replace($cyrs, $lats, $str);
	return $transliterated;
}


# Print a prompt (preceded by a newline) and read one line from the console.
#   $str - prompt text to display
# Returns the line read from STDIN with surrounding whitespace trimmed.
function my_readline($str = 'Enter required data: ') {
	echo "\n", $str;
	return trim( fgets(STDIN) );
}


# Write $text to $file while holding an exclusive lock.
#   $file - path of the file to write
#   $text - data to write
#   $mode - fopen() mode; the default 'a+' appends
# Returns true when the file was opened, written and closed successfully,
# false otherwise. Writing an empty string counts as success.
function my_fwrite($file, $text, $mode='a+') {
	$myFile = @fopen($file, $mode);
	if (! $myFile) return false;

	flock($myFile, LOCK_EX);
	$written = @fputs($myFile, $text);
	# push buffered data to disk while the lock is still held
	@fflush($myFile);
	flock($myFile, LOCK_UN);

	# always close the handle, even when the write failed, so it is not leaked
	$closed = @fclose($myFile);

	# fputs() returns 0 for an empty string; only false signals failure
	return $written !== false && $closed;
}


# URL-encode a string byte by byte, leaving every '%' untouched so that
# escapes already present in $str are not encoded a second time.
# Each other byte is passed through my_char_urlencode().
# Returns the encoded string ('' for empty input).
function my_urlencode($str) {
	$nstr = '';
	$len = strlen($str);
	for ($i = 0; $i < $len; $i++) {
		# square brackets: the $str{$i} offset syntax no longer parses on PHP 8
		$ch = $str[$i];
		$nstr .= $ch !== '%' ? my_char_urlencode($ch) : '%';
	}
	return $nstr;
}

# URL-encode a single character.
# '+' and '?' are taken from a fixed table; anything else goes through
# urlencode().
function my_char_urlencode($ch) {
	$fixed = array('+'=>'%2B', '?'=>'%3F');
	return isset( $fixed[$ch] ) ? $fixed[$ch] : urlencode($ch);
}

# Returns the text inside the first $element element of $data.
#   $element - tag name, e.g. 'title'
#   $data    - markup to search
# The opening tag is located by the prefix "<$element" (so it may carry
# attributes), the content starts after the next '>' and ends at the first
# "</$element>" that follows. Returns '' when the opening tag, its closing
# '>' or the closing tag cannot be found.
#
# Plain string functions are used on purpose: an earlier preg_match() version
# was taken out by Grigor Gatchev because PCRE seemed unable to handle
# subpatterns longer than 64K.
function getCDATA($element, $data) {
	$open = strpos ( $data, "<$element" );
	if ( $open === false ) {
		return '';
	}

	$start = strpos ( $data, ">", $open );
	if ( $start === false ) {
		return '';
	}
	$start++; # first byte after the opening tag

	$end = strpos ( $data, "</$element>", $start );
	if ( $end === false ) {
		return '';
	}

	return substr ( $data, $start, $end - $start );
}

# Returns a list with the text inside every $element element found in $data.
# $element is inserted into the regular expression as-is (not escaped).
function getCDATAs($element, $data) {
	$pattern = '/<' . $element . '.*>(.+)<\/' . $element . '>/Us';
	preg_match_all($pattern, $data, $found);
	return $found[1];
}

# Replace every multi-byte UTF-8 sequence in $str by a decimal numeric
# character reference ("&#NNNN;"); ASCII bytes are copied unchanged.
# Handles 2-, 3- and 4-byte sequences. The input is not validated: the lead
# byte alone decides how many bytes are collected for one character.
# original source: http://www.randomchaos.com/document.php?source=php_and_unicode
function utf82unicode($str) {

	$unicode = '';
	$values = array();
	$lookingFor = 1;
	$len = strlen($str);

	for ($i = 0; $i < $len; $i++ ) {
		$thisValue = ord( $str[$i] );

		if ( $thisValue < 128 ) {
			$unicode .= $str[$i]; # don't convert ASCII characters
			continue;
		}

		if ( count($values) == 0 ) {
			# lead byte: 110xxxxx -> 2 bytes, 1110xxxx -> 3, 11110xxx -> 4
			if ( $thisValue < 224 ) {
				$lookingFor = 2;
			} elseif ( $thisValue < 240 ) {
				$lookingFor = 3;
			} else {
				$lookingFor = 4;
			}
		}

		$values[] = $thisValue;

		if ( count( $values ) == $lookingFor ) {
			# keep only the payload bits of each byte and join them
			if ( $lookingFor == 4 ) {
				$number = ( ( $values[0] % 8 ) * 262144 ) + ( ( $values[1] % 64 ) * 4096 )
					+ ( ( $values[2] % 64 ) * 64 ) + ( $values[3] % 64 );
			} elseif ( $lookingFor == 3 ) {
				$number = ( ( $values[0] % 16 ) * 4096 ) + ( ( $values[1] % 64 ) * 64 )
					+ ( $values[2] % 64 );
			} else {
				$number = ( ( $values[0] % 32 ) * 64 ) + ( $values[1] % 64 );
			}

			$unicode .= "&#$number;";
			$values = array();
			$lookingFor = 1;
		}
	}
	return $unicode;
}


# Encode one Unicode code point as percent-escaped UTF-8 bytes.
#   $unicode_str - the code point as a decimal string ('1072'), or as a
#                  hexadecimal string prefixed with 'x' or 'X' ('x430')
# Returns the UTF-8 encoding with every byte written as '%XX' (upper-case
# hex), e.g. '%D0%B0'. Code points up to 21 bits (4 UTF-8 bytes) are handled.
function unicode2utf8($unicode_str) {
	$unicode_str = (string) $unicode_str;
	$base = 10;
	if ( $unicode_str !== '' && ( $unicode_str[0] == 'x' || $unicode_str[0] == 'X' ) ) {
		$unicode_str = substr($unicode_str, 1);
		$base = 16;
	}
	# base_convert() keeps the digit parsing of the original implementation
	$code = (int) base_convert($unicode_str, $base, 10);

	if ( $code < 0x80 ) {
		// 0xxxxxxx
		$bytes = array( $code );
	} elseif ( $code < 0x800 ) {
		// 110xxxxx 10xxxxxx
		$bytes = array(
			0xC0 | ( $code >> 6 ),
			0x80 | ( $code & 0x3F )
		);
	} elseif ( $code < 0x10000 ) {
		// 1110xxxx 10xxxxxx 10xxxxxx
		$bytes = array(
			0xE0 | ( $code >> 12 ),
			0x80 | ( ( $code >> 6 ) & 0x3F ),
			0x80 | ( $code & 0x3F )
		);
	} else {
		// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
		$bytes = array(
			0xF0 | ( ( $code >> 18 ) & 0x07 ),
			0x80 | ( ( $code >> 12 ) & 0x3F ),
			0x80 | ( ( $code >> 6 ) & 0x3F ),
			0x80 | ( $code & 0x3F )
		);
	}

	$utf8 = '';
	foreach ( $bytes as $byte ) {
		$utf8 .= sprintf('%%%02X', $byte);
	}

	return $utf8;
}

# Escape all regexp meta-characters in a string by putting a backslash in
# front of each of:  \ ^ $ . [ ] | ( ) ? * + { } /
# ('/' is included because it is the pattern delimiter used in this file).
# Escaping happens in a single pass, so a backslash added for one character
# is never escaped again.
function escape_regexp($str) {
	return addcslashes($str, '\\^$.[]|()?*+{}/');
}

# -----  Smuggled in by Grigor Gatchev  ----- #


# Returns the text between the first "(" and the last ")" of $text.
# Returns '' when $text has no "(" or no ")" after it, instead of cutting
# arbitrary characters off the string.
function strip_parentheses ( $text ) {
	$start = strpos ( $text, "(" );
	$end  = strrpos ( $text, ")" );
	if ( $start === false || $end === false || $end < $start ) {
		return '';
	}
	return substr ( $text, $start + 1, ( $end - $start - 1 ) );
}



# Run $regex against $string and return its first capture group.
#   $regex  - complete PCRE pattern, delimiters included
#   $string - text to search
#   $flags  - preg_match() flags; the historical default '' means none
#   $offset - byte offset at which the search starts
# Returns the content of capture group 1 (in the shape the given flags
# produce), or null when the pattern does not match or has no such group.
function extract_element ( $regex, $string, $flags = '', $offset = 0 ) {
	$matches = array();
	# '' (the default) casts to 0, i.e. no flags
	preg_match ( $regex, $string, $matches, (int) $flags, (int) $offset );
	return isset ( $matches[1] ) ? $matches[1] : null;
}

# Returns the content of the first <span ...>...</span> element in $text,
# as extracted by extract_element().
function extract_span_content ( $text ) {
	$span_regex = '/<span .*>(.*)<\/span>/Us';
	return extract_element ( $span_regex, $text );
}


# Returns the first piece of $line matched by the regex fragment $element
# (no delimiters; it is wrapped in a capture group and run with the U and s
# modifiers), or false when nothing matches.
function extract_first_element ( $element, $line ) {

	$pattern = '/(' . $element . ')/Us';
	$hits = preg_match ( $pattern, $line, $captured );

	if ( $hits > 0 ) {
		return $captured[1];
	}
	return false;

}

# Remove the first match of the regex fragment $element from the start of
# $line and return what follows.
#   $element          - regex fragment, as taken by extract_first_element()
#   $line             - text to cut
#   $things_before_it - when true, everything in front of the match is
#                       dropped as well; when false, only as many bytes as
#                       the match is long are cut from the start of $line
# Returns the remaining text, or false when nothing (or only an empty string)
# was matched.
function strip_first_element ( $element, $line, $things_before_it = true ) {

	$found = extract_first_element ( $element, $line );

	# compare explicitly: a matched '0' is a valid hit although it is falsy
	if ( ! is_string ( $found ) || $found === '' ) {
		return false;
	}

	$offset = strlen ( $found );
	if ( $things_before_it ) {
		$offset += strpos ( $line, $found );
	}
	return substr ( $line, $offset );

}


# Parse the first <a href=...>...</a> element found in $line.
# Returns false when $line contains no such element. Otherwise returns an
# array with the keys 'url' (href value), 'title' (title value) and 'text'
# (link text). When the anchor does not have the expected
# href="..." ... title="..." shape, all three values are null.
function parse_first_href ( $line ) {

	$href = extract_first_element ( '<a href=.*<\/a>', $line );

	if ( ! $href ) {
		return false;
	}

	$matches = array();
	preg_match ( '/<a href="(.*)".*title="(.*)">(.*)<\/a>/Us', $href, $matches );

	# guard each group so an anchor without title does not raise warnings
	$parts = array();
	$parts['url'] = isset ( $matches[1] ) ? $matches[1] : null;
	$parts['title'] = isset ( $matches[2] ) ? $matches[2] : null;
	$parts['text'] = isset ( $matches[3] ) ? $matches[3] : null;

	return $parts;

}

# Cut the first <a href=...>...</a> element out of $line via
# strip_first_element() and return its result; $things_before_it is passed
# through unchanged.
function strip_first_href ( $line, $things_before_it = true ) {
	$anchor_regex = '<a href=.*<\/a>';
	return strip_first_element ( $anchor_regex, $line, $things_before_it );
}


# Parse $href with parse_first_href(); when that yields nothing, fall back
# to an array whose only key, 'text', holds $href itself.
function parse_first_maybe_href ( $href ) {
	$parsed = parse_first_href ( $href );
	if ( $parsed ) {
		return $parsed;
	}
	return array ( 'text' => $href );
}