User:FairuseBot/Pearle.pm

From Wikipedia, the free encyclopedia
### IMPORTANT ###

# This code is released into the public domain.

### RECENT CHANGES ###

# 30 Nov 2005: Created, based off of the  12 Nov 2005 version of Pearle Wisebot
# 15 Feb 2006: Modifed "retry" to work with any function that signals failure by dying, modified to use a simple exponential backoff formula
#              Simplified "limit" code, modified to take an optional parameter
#              Added "config" function as a clean interface to change internal parameters
#              Modified Wiki-access functions for use with the new "retry" function
#              Cleanup of boolean config vars to use standard Perl boolean conventions
# 28 Feb 2006: Added checkLogin bottleneck, option to allow editing while logged out
#              Added support for proxy servers
#  8 Mar 2006: Added support for getting a user's contributions
#              Added support for retrieving logs
#              Separated out some common regex parts into variables
# 29 Mar 2006: Added protection against Unicode in URLs
#              Made thrown exceptions consistent
#              Sanity-checking on postPage: talkpage without article, userpage or user talkpage without user
# 17 May 2005: Improved log retrieval
# 12 Jul 2007: Started support for api.php
#              Log retrieval converted to api.php, added timestamps to retrieval
#              Modified to work with any wiki
#              Modified to use index.php rather than wiki.phtml
#              Converted GetLogArticles to use named parameters
# 14 Jul 2007: Modified logging to use message levels.  Removed "print" and "myPrint" functions
#  6 Aug 2007: Added the "WikiPage" class
#              Modified getPage and putPage to only work with WikiPage objects
#              Renamed to "Pearle.pm"
#              Made a proper module
# 10 Aug 2007: Changed the default XML parser, reduced memory usage when parsing
# 17 Oct 2007: Removed nullEdit() -- MediaWiki hasn't required them in some time.
#              Modified getCategoryArticles, getCategoryImages, and getSubcategories to use api.php
# 21 Oct 2007: Proper Unicode support
# 29 Oct 2007: Made edit summaries mandatory
# 23 Mar 2008: Changed "minor" flag from text to boolean
# 29 Mar 2008: Improved UTF-8 support
# 17 Oct 2008: Added a second login check, to handle non-Monobook skins
#  1 Dec 2008: Added a screen-scraping function to get Special:UncategorizedFiles
#              Removed some hard-coded references to enwiki
# 27 Apr 2010: Fixed the login function to work with the new enwiki login page
#              The HTML version of wikitext can validly contain ">"; fixed to handle this
# 12 Aug 2010: Added read-only and test modes
# 13 Dec 2010: Updated to use the API for editing.  This removes the last coherent piece of Pearle Wisebot code.
# 31 Aug 2011: Added option for bot-flagging an edit to postPage()
# 24 Sep 2011: Added namespace-handling routines
#              Added getPageList()
#              Re-wrote parseHistory as getArticleHistory. Breaking change.
# 17 Jan 2012: Updated login and logout to use the API.
#              Added APIPost()
#              Refactored APIQuery(), APIEdit(), and APIPost() to move common code into a helper function.
# 20 Feb 2012: Added getToken() and appendToPage() to permit efficient logging.
#              Removed the protection check from getPage(): it's simpler to just try the edit than to figure out how the wiki's protection rules apply.
#              Refactored postPage and appendToPage to move common code into a helper function.
#  9 Nov 2012: Fixed a bug where getCategoryContents would only return the last set of items if the category had more than 500 items of that type.
#              Added getPageImages()
# 11 Jun 2015: Modified all API access functions to use 'rawcontinue'

# Errors thrown by this package always begin with a three-digit number
#     4xx: HTTP client errors
#     505: Server error: HTTP version not supported
#     509: Server error: Bandwidth exceeded
#
#     900: Unspecified internal error.
#     901: Library not initialized.  You didn't call Pearle::init() before calling this function.
#     902: Parameter error.  You made a function call, but forgot a mandatory parameter, or provided an invalid one.
#     903: Attempted write in read-only mode.
#
#     920: Unexpected response.  The MediaWiki site returned something unexpected.
#     921: Unexpected logout.  The MediaWiki site logged us out unexpectedly.
#     922: Edit conflict.  Someone edited the article while we were.
#     923: Deleted article conflict.  Someone deleted the article while we were editing.
#     924: Spam filter.  A link in the page tripped the spam filter.
#     925: Protected page.  The page is protected, and the bot doesn't have the rights to edit it.

package Pearle;

use strict;
use warnings;

use Time::HiRes;
use Encode;

use LWP::UserAgent;
use HTTP::Cookies;
use HTTP::Request::Common qw(POST);
use HTML::Entities;
use XML::Simple;
use Data::Dumper;		# For debugging
use URI::Escape;
use Digest::MD5 qw(md5_hex);

use Pearle::WikiPage;

# Standard regex parts
$Pearle::regex_timestamp = '(\d\d):(\d\d), (\d\d?) (\w+) (\d\d\d\d)';				# Match and capture a Wikipedia timestamp
$Pearle::regex_timestamp_nc = '\d\d:\d\d, \d\d? \w+ \d\d\d\d';						# Match a Wikipedia timestamp
$Pearle::regex_timestamp_ISO = '(\d\d\d\d)-(\d\d)-(\d\d)T(\d\d):(\d\d):(\d\d)';     # Match and capture a timestamp of the form 2007-07-13T04:21:39Z
$Pearle::regex_timestamp_ISO_nc = '\d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\d';              # Match a timestamp of the form 2007-07-13T04:21:39Z

#<a href="/w/index.php?title=User:Angel_dunn&amp;action=edit" class="new" title="User:Angel dunn">
#<a href="/wiki/User:Jimbo_Wales" title="User:Jimbo Wales">
$Pearle::regex_pagelink = '<a href="[^"]*"(?: class="new"|) title="([^"]*)">';	# Match and capture any page
$Pearle::regex_redpagelink = '<a href="[^"]*" class="new" title="([^"]*)">';	# Match and capture nonexistant pages only
$Pearle::regex_bluepagelink = '<a href="[^"]*" title="([^"]*)">';				# Match and capture existing pages only
$Pearle::regex_pagelink_nc = '<a href="[^"]*"(?: class="new"|) title="[^"]*">';	# Match any page
$Pearle::regex_redpagelink_nc = '<a href="[^"]*" class="new" title="[^"]*">';	# Match nonexistant pages only
$Pearle::regex_bluepagelink_nc = '<a href="[^"]*" title="[^"]*">';				# Match existing pages only

# Namespaces available on this wiki
@Pearle::namespaces = ();

$Pearle::logfile = "";
$Pearle::_inited = 0;
$Pearle::username = "";
$Pearle::password = "";
$Pearle::testmode = 0;		# Read-only mode: do not actually make wiki-modifying calls, but return as if they succeeded
$Pearle::readonly = 0;		# Read-only mode: wiki-modifying calls are errors.
$Pearle::speedLimit = 10;	# Seconds to wait by default when limit() is called
$Pearle::_speedMult = 1;	# Multiplier for default wait time if the wiki is being slow
$Pearle::roughMode = 0;		# Ignore most errors
$Pearle::nullOK = 0;		# Permit editing non-existent pages
$Pearle::sanityCheck = 0;	# Sanity checking on edits
$Pearle::loglevel = 2;		# Level of message to write to file
$Pearle::printlevel = 3;	# Level of message to print to stdout
$Pearle::logoutOK = 0;		# Permit editing while logged out
$Pearle::proxy = undef;		# Proxy to use
$Pearle::wiki = 'https://en.wikipedia.org/w/';	# URL of the directory containing index.php and api.php
$XML::Simple::PREFERRED_PARSER = "XML::Parser";		# Much faster than the default XML::SAX parser
$Pearle::xml_parser = XML::Simple->new();


########## Accessors #########################################################

sub getXMLParser
{
	return $Pearle::xml_parser;
}


########## Other functions ###################################################

# This must be the first function from the library called
sub init
{
	$Pearle::username = $_[0] or die("902 No username provided!\n");
	$Pearle::password = $_[1] or die("902 No password provided!\n");
	$Pearle::logfile = $_[2] or die("902 No logfile name provided!\n");
	$Pearle::cookies = $_[3] or die("902 No cookie file provided!\n");
	$Pearle::useragent = $_[4] or $Pearle::useragent = "PearleLib/0.2 (User:${Pearle::username})";
	

	$Pearle::ua = LWP::UserAgent->new(timeout => 300);
	$Pearle::ua->agent($Pearle::useragent);
	$Pearle::ua->cookie_jar(HTTP::Cookies->new(file => $Pearle::cookies, autosave => 1));
	$Pearle::ua->cookie_jar->load();

	$Pearle::roughMode = "no";

	$Pearle::_inited = 1;
}

sub config
{
	my %params = @_;
	$Pearle::readonly = $params{readonly} if(defined($params{readonly}));
	$Pearle::testmode = $params{testmode} if(defined($params{testmode}));
	$Pearle::speedLimit = $params{speedLimit} if(defined($params{speedLimit}));
	$Pearle::roughMode = $params{roughMode} if(defined($params{roughMode}));
	$Pearle::nullOK = $params{nullOK} if(defined($params{nullOK}));
	$Pearle::loglevel = $params{loglevel} if(defined($params{loglevel}));
	$Pearle::printlevel = $params{printlevel} if(defined($params{printlevel}));
	$Pearle::logfile = $params{logfile} if(defined($params{logfile}));
	$Pearle::logoutOK = $params{logoutOK} if(defined($params{logoutOK}));
	$Pearle::sanityCheck = $params{sanityCheck} if(defined($params{sanityCheck}));
	if(defined($params{wiki}) and $params{wiki} ne $Pearle::wiki)
	{
		$params{wiki} .= '/' if($params{wiki} !~ /\/$/);	# Add a trailing slash if needed
		$Pearle::wiki = $params{wiki};
	}
	
	if(exists($params{proxy}))
	{
		if(defined($params{proxy}))
		{
			myLog(3, "Proxying: $params{proxy}\n");
			$Pearle::ua->proxy('http', $params{proxy});
			$Pearle::proxy = $params{proxy};
		}
		else
		{
			myLog(3, "Not proxying\n");
			$Pearle::ua->no_proxy();
			$Pearle::proxy = undef;
		}
	}
}

# Logging levels:
# 0: Immediately fatal errors.  Call to myLog will be followed by a call to die()
# 1: Non-fatal errors.  The library can recover, turn the function call into a no-op, and return an error indicator
# 2: Serious warning.  The library can still complete the action
# 3: Status messages.  Messages useful for tracing library execution.
# 4: Debugging messages.
sub myLog
{
	my $level = shift;
	my @message = @_;
	my $message = join "", @message;
	
	if($level <= $Pearle::loglevel)
	{
		die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);

		open (LOG, ">>:utf8", $Pearle::logfile) || die "900 Could not append to log!";
		print LOG $message;
		close (LOG);
	}
	
	if($level <= $Pearle::printlevel)
	{
		print $message;
	}
}

# Rate-limiting.  Can be sensibly run even if libPearle isn't initialized
sub limit
{
	my ($i);
	$i = ($_[0] or ($Pearle::speedLimit * $Pearle::_speedMult));
	$i = 10 if($i < 10);

	# Rate-limiting to avoid hosing the wiki server
	# Min 30 sec unmarked
	# Min 10 sec marked
	# May be raised by retry() if load is heavy

	### ATTENTION ###
	# Increasing the speed of the bot to faster than 1 edit every 10
	# seconds violates English Wikipedia rules as of April, 2005, and
	# will cause your bot to be banned.  So don't change $normalDelay
	# unless you know what you are doing.  Other sites may have
	# similar policies, and you are advised to check before using your
	# bot at the default speed.
	#################

	while ($i >= 0)
	{
		sleep (1);
		print "Sleeping $i seconds...\r";
		$i--;
	}
	print "                                   \r";
}

sub login 
{
	die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);
	
	my $xml = APIPost(action => 'login', lgname => $Pearle::username, lgpassword => $Pearle::password);
	my $parsed_xml = $Pearle::xml_parser->XMLin($xml);
	print Dumper($parsed_xml);
	if($parsed_xml->{login}->{result} eq 'NeedToken')
	{
		$xml = APIPost(action => 'login', lgname => $Pearle::username, lgpassword => $Pearle::password, lgtoken => $parsed_xml->{login}->{token});
		$parsed_xml = $Pearle::xml_parser->XMLin($xml);
		print Dumper($parsed_xml);
	}
	if($parsed_xml->{login}->{result} eq 'Success')
	{
		return 1;
	}
	else
	{
		return 0;
	}
}

sub logout {
	APIQuery(action => 'logout');
	return 1;
}

sub checkLogin
{
	my ($reply_text);
	$reply_text = $_[0];
	
	if ($reply_text !~ m/>My talk<\/a>/ and !($Pearle::logoutOK))
	{
		if($reply_text !~ /var wgUserName = "$Pearle::username"/)
		{
			# We've lost our identity.
			myLog(0, "Wiki server is not recognizing me (1).\n---\n${reply_text}\n---\n");
			die ("921 Wiki server is not recognizing me (1).\n");
		}
	}
}

# Make an HTTP request, performing basic error checking and handling.  Suitable for use with the "retry" function
sub httpRequest
{
	my ($request, $response, $attemptStartTime, $attemptEndTime);
	$request = $_[0];

	$response = $Pearle::ua->request($request);

	# Monitor wiki server responsiveness
	$attemptStartTime = Time::HiRes::time();

	if ($response->is_success or $response->is_redirect)
	{
		return $response
	} 
	else 
	{
		# 50X HTTP errors mean there is a problem connecting to the wiki server.  Can be remedied by waiting and trying again
		if (500 <= $response->code and 504 >= $response->code)
		{
			myLog(2, "HTTP ERR (".$response->status_line.")\n");
			die("retry:".$response->status_line);
		}
		else
		{
			# Unhandled HTTP response.  Waiting probably won't fix it
			myLog(0, "HTTP ERR (".$response->status_line.")\n".$response->decoded_content."\n");
			die($response->status_line."\n");
		}
	}
	# Monitor wiki server responsiveness
	$attemptEndTime = Time::HiRes::time();

	if($request->method() eq "POST")
	{
		if (($attemptEndTime - $attemptStartTime) > 20)
		{
			$Pearle::_speedMult = 60;

			myLog(3, "Wiki is very slow.  Increasing minimum wait to " . $Pearle::speedLimit * $Pearle::_speedMult . " sec...\n");
		}

		# If the response time is between 10 and 20 seconds...
		elsif (($attemptEndTime - $attemptStartTime) > 10)
		{
			$Pearle::_speedMult = 6;

			myLog(3, "Wiki is somewhat slow.  Setting minimum wait to " . $Pearle::speedLimit * $Pearle::_speedMult . " sec...\n");
		}

		# If the response time is less than 10 seconds...
		else
		{
			if ($Pearle::_speedMult != 1)
			{
				$Pearle::_speedMult = 1;

				myLog(3, "Returning to normal minimum wait time.\n");
			}
		}
	}
}

# Check out a page for editing.
sub getPage
{
	die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);

	my ($target, $xml, $parsed_xml, %query_params, $text, $editTime, $startTime, $token);

	$target = $_[0];

	if ($target =~ m/^\s*$/)
	{
		myLog(0, "getPage: Null target.");
		die("902 getPage: Null target.");
	}

	$query_params{prop} = ['revisions', 'info'];
	$query_params{rvprop} = ['timestamp', 'content'];
	$query_params{inprop} = 'protection';
	$query_params{intoken} = 'edit';
	$query_params{titles} = $target;

	$xml = APIQuery(%query_params);

	if(!defined($xml))
	{
		myLog(0, "Unknown error requesting page contents\n");
		die "900 Unknown error requesting page contents";
	}

	$parsed_xml = $Pearle::xml_parser->XMLin($xml);

	myLog(4, Dumper($parsed_xml));
	$xml = undef;

	# Check for errors
	if(exists($parsed_xml->{query}->{pages}->{page}->{invalid}))
	{
		myLog(0, "Invalid page title: $target\n");
		die("902 Invalid page title: $target");
	}
		
	# See if the page is blank
	if(!$Pearle::nullOK && exists($parsed_xml->{query}->{pages}->{page}->{missing})) 
	{
		myLog (1, "Empty page: $target\n");
		if (!$Pearle::roughMode)
		{
			die ("920 Empty page: $target\n");
		}
	}

	$text = $parsed_xml->{query}->{pages}->{page}->{revisions}->{rev}->{content};
	$startTime = $parsed_xml->{query}->{pages}->{page}->{starttimestamp};
	$editTime = $parsed_xml->{query}->{pages}->{page}->{revisions}->{rev}->{timestamp};
	$token = $parsed_xml->{query}->{pages}->{page}->{edittoken};

	return Pearle::WikiPage->new(text => $text, editTime => $editTime, startTime => $startTime, editToken => $token, title => $target);
}

sub getToken
{
	die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);

	my ($target, $xml, $parsed_xml, %query_params);
	
	$target = $_[0];
	
	if ($target =~ m/^\s*$/)
	{
		myLog(0, "getPage: Null target.");
		die("902 getPage: Null target.");
	}
	
	$query_params{prop} = ['info'];
	$query_params{intoken} = 'edit';
	$query_params{titles} = $target;
	$query_params{inprop} = '';

	$xml = APIQuery(%query_params);
	
	$parsed_xml = $Pearle::xml_parser->XMLin($xml);

	myLog(4, Dumper($parsed_xml));
	$xml = undef;
	
	# Check for errors
	if(exists($parsed_xml->{query}->{pages}->{page}->{invalid}))
	{
		myLog(0, "Invalid page title: $target\n");
		die("902 Invalid page title: $target");
	}

	# See if the page has been protected
	if(exists($parsed_xml->{query}->{pages}->{page}->{protection}->{pr}))
	{
		myLog(0, "Page $target is protected\n");
		die("925 Protected");
	}
	
	return $parsed_xml->{query}->{pages}->{page}->{edittoken};
}

# The common elements of postPage and appendToPage.  Not for external consumption.
sub _editPage
{
	die "903 Library in read-only mode!\n" if($Pearle::readonly);
	my ($xml, $parsed_xml, $pageName, %params);

	%params = @_;
	$pageName = $params{title};

	$params{assert} = 'user' if(!$Pearle::logoutOK);
	
EDITRETRY:
	$xml = APIEdit(%params);
	
	if(!defined($xml))
	{
		myLog(0, "Unknown error posting edit\n");
		die "900 Unknown error posting edit";
	}

	$parsed_xml = $Pearle::xml_parser->XMLin($xml);

	myLog(4, Dumper($parsed_xml));
	$xml = undef;
	
	# Check for errors
	if(exists($parsed_xml->{edit}) && $parsed_xml->{edit}->{result} eq 'Failure' && $parsed_xml->{edit}->{assert} eq 'user')
	{
			myLog(0, "Wiki server is not recognizing me\n");
			die ("921 Not logged in");
	}
	
	if(exists($parsed_xml->{error}))
	{
		if($parsed_xml->{error}->{code} eq 'blocked')
		{
			myLog(0, "Blocked\n");
			die ("900 Blocked");
		}
		elsif($parsed_xml->{error}->{code} eq 'protectedpage' || $parsed_xml->{error}->{code} eq 'cascadeprotected')
		{
			myLog(0, "Page $pageName is protected\n");
			die ("925 Protected");
		}
		elsif($parsed_xml->{error}->{code} eq 'pagedeleted')
		{
			myLog(0, "Deleted article conflict on $pageName\n");
			die ("923 Deleted article conflict");
		}
		elsif($parsed_xml->{error}->{code} eq 'editconflict')
		{
			myLog(0, "Edit conflict on $pageName\n");
			die ("922 Edit conflict on $pageName");
		}
		elsif($parsed_xml->{error}->{code} eq 'spamdetected')
		{
			myLog(0, "Spam link on $pageName: $parsed_xml->{error}->{info}\n");
			die ("924 Spam filter");
		}
		elsif($parsed_xml->{error}->{code} eq 'readonly')
		{
			myLog(1, "Wiki is in readonly mode.  Waiting before retry\n");
			sleep 60;
			goto EDITRETRY;
		}
		else
		{
			myLog(0, "Unexpected error. Code: $parsed_xml->{error}->{code} Info: $parsed_xml->{error}->{info}\n");
			die ("920 Server error");
		}
	}
	return $parsed_xml->{edit}->{result};
}

sub postPage
{
	die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);

	my ($page, $pageName, $summaryEntry, $minor, $bot, $xml, %params, $parsed_xml);

	$page = $_[0];
	$summaryEntry = $_[1];
	$minor = $_[2];
	$bot = $_[3];
	$bot = 1 if(!defined($bot));

	if(!defined($minor))
	{
		myLog(0, "postPage(): Not enough parameters.\n");
		die "902 postPage(): Not enough parameters!\n";
	}
	
	if(!$page->isa("Pearle::WikiPage"))
	{
		myLog(0, "postPage(): First parameter is not a WikiPage object\n");
		die "902 postPage(): First parameter is not a WikiPage object\n";
	}

	if ($summaryEntry eq "")
	{
		myLog(0, "postPage(): No edit summary provided\n");
		die "902 postPage(): No edit summary provided\n";
	}

	return "Success" if($Pearle::testmode);

	$pageName = $page->getTitle();

	$params{title} = $pageName;
	$params{text} = $page->getWikiText();
	$params{token} = $page->getEditToken();
	$params{summary} = $summaryEntry;
	$params{minor} = 1 if($minor);
	$params{starttimestamp} = $page->getStartTime();
	$params{basetimestamp} = $page->getEditTime();
	$params{bot} = 1 if($bot);
	
	return _editPage(%params);
}

sub appendToPage
{
	die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);

	my ($page, $token, $text, $summary, $minor, $bot, %params, $xml, $parsed_xml);
	
	$page = $_[0];
	$token = $_[1];
	$text = $_[2];
	$summary = $_[3];
	$minor = $_[4];
	$bot = $_[5];
	$bot = 1 if(!defined($bot));

	if(!defined($minor))
	{
		myLog(0, "appendToPage(): Not enough parameters.\n");
		die "902 appendToPage(): Not enough parameters!\n";
	}
	
	return "Success" if($Pearle::testmode);
	
	$params{title} = $page;
	$params{appendtext} = $text;
	$params{token} = $token;
	$params{summary} = $summary;
	$params{minor} = 1 if($minor);
	$params{bot} = 1 if($bot);

	return _editPage(%params);;
}


# Get a list of the contents in a given category, filtered by namespace
sub getCategoryContents
{
	die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);

	my ($target, $category_offset, @articles, $xml, $parsed_xml, %query_params,
	    $numberOfArticles, @namespaces);


	$target = shift;
	
	@namespaces = @_;

	# Category: prefix is mandatory
	if($target !~ /^[Cc]ategory:/)
	{
		$target = "Category:" . $target;
	}
	
	$query_params{list} = 'categorymembers';
	$query_params{cmprop} = 'title';
	$query_params{cmtitle} = $target;
	$query_params{cmlimit} = 500;	# If you're a flagged bot, this could be 5000, but we default to 500 for compatibility and to keep memory usage down
	$query_params{rawcontinue} = "";
	
	foreach my $namespace (@namespaces)
	{
		$query_params{cmnamespace} .= "${namespace}|";
	}
	if(exists($query_params{cmnamespace}) and defined($query_params{cmnamespace}))
	{
		chop $query_params{cmnamespace};
	}
		
	do
	{
		$xml = APIQuery(%query_params);

		if(!defined($xml))
		{
			myLog(0, "Unknown error accessing wiki\n");
			die "900 Unknown error accessing wiki";
		}

		$parsed_xml = $Pearle::xml_parser->XMLin($xml, ForceArray => ['cm']);

		myLog(4, Dumper($parsed_xml));
		$xml = undef;

		if(exists($parsed_xml->{query}->{categorymembers}->{cm}) and defined($parsed_xml->{query}->{categorymembers}->{cm}))
		{
			my @set_articles = map {$_->{title}} @{$parsed_xml->{query}->{categorymembers}->{cm}};
			push @articles, @set_articles;
		}
		
		if(exists($parsed_xml->{'query-continue'}->{categorymembers}->{cmcontinue}))
		{
			$category_offset = $parsed_xml->{'query-continue'}->{categorymembers}->{cmcontinue};
			$category_offset =~ s/&/%26/;
			$query_params{cmcontinue} = $category_offset;
		}
		else
		{
			$category_offset = undef;
		}

		sleep (1); # Throttle GETs
	}
	while(defined($category_offset));

	$numberOfArticles = scalar(@articles);
	myLog(4, "Got $numberOfArticles articles.\n");

	return @articles;
}

sub getCategoryArticles
{
	return getCategoryContents($_[0], 0);	# Namespace 0: Articles
}

sub getCategoryImages
{
	return getCategoryContents($_[0], 6);	#Namespace 6: Images
}

sub getSubcategories
{
	return getCategoryContents($_[0], 14);	# Namespace 14: Categories
}

sub getPageImages
{
	die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);
	
	my ($target, $list_offset, @images, $xml, $parsed_xml, %query_params);
	$target = shift;
	die "902 No article provided for getPageImages\n" if(!defined($target));
	
	$query_params{prop} = "images";
	$query_params{titles} = $target;
	$query_params{imlimit} = 500;
	$query_params{rawcontinue} = "";
	
	do
	{
		$xml = APIQuery(%query_params);

		if(!defined($xml))
		{
			myLog(0, "Unknown error accessing wiki\n");
			die "900 Unknown error accessing wiki";
		}

		$parsed_xml = $Pearle::xml_parser->XMLin($xml, ForceArray => ['im']);

		myLog(4, Dumper($parsed_xml));
		$xml = undef;

		if(exists($parsed_xml->{query}->{pages}->{page}->{images}->{im}) and defined($parsed_xml->{query}->{pages}->{page}->{images}->{im}))
		{
			my @set_images = map {$_->{title}} @{$parsed_xml->{query}->{pages}->{page}->{images}->{im}};
			push @images, @set_images;
		}
		
		if(exists($parsed_xml->{'query-continue'}->{images}->{imcontinue}))
		{
			$list_offset = $parsed_xml->{'query-continue'}->{images}->{imcontinue};
			$list_offset =~ s/&/%26/;
			$query_params{imcontinue} = $list_offset;
		}
		else
		{
			$list_offset = undef;
		}

		sleep (1); # Throttle GETs
	}
	while(defined($list_offset));

	return @images;	
}

# Get up to $max most recent articles edited by a user
sub getUserArticles
{
	die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);

	my ($url, $request, $response, $reply, @contribs,
	    $target, $namespace, $max, $offset);
	
	$target = $_[0];
	$max = $_[1];
	$offset = $_[2];
	$namespace = namespaceToNumber($_[3]);

	# Create a request-object
	if(defined($namespace))
	{
		$url = "${Pearle::wiki}index.php?title=Special%3AContributions&limit=${max}&offset=${offset}&target=${target}&namespace=$namespace";
	}
	else
	{
		$url = "${Pearle::wiki}index.php?title=Special%3AContributions&limit=${max}&offset=${offset}&target=${target}";
	}

	myLog(3, "GET $url\n");
	$request = HTTP::Request->new(GET => "$url");
	$response = startRetry(\&httpRequest, $request);
	$reply = $response->decoded_content;

	# This detects whether or not we're logged in.
	checkLogin($reply);
		
	# Extract the contributions
	# <li>23:18, 6 March 2006 (<a href="/w/index.php?title=User_talk:OrphanBot&amp;action=history" title="User talk:OrphanBot">
	while($reply =~ /<li>$Pearle::regex_timestamp_nc \($Pearle::regex_bluepagelink/g)
	{
		push @contribs, $1;
	}
	
	# Remove duplicates	
#	@contribs = uniquify(@contribs);
	return @contribs;
}

# Gets a list of (page, id, namespace) tuples
#
# Takes the following named parameters:
#	prefix: Filter to only include pages that start with this string
#	namespace: a reference to a list of namespaces to get pages from.  If not provided, gets only pages from namespace 0.
#	redirects: one of "yes", "no", "both"
sub getPageList
{
	die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);

	my %params = @_;
	my %base_query_params = (list => 'allpages', aplimit => 500);
	my %full_query_params;
	my @namespaces;
	my @articles = ();
	
	foreach my $key (keys(%params))
	{
		if($key eq 'prefix')
		{
			$base_query_params{apprefix} = $params{prefix};
		}
		elsif($key eq 'redirects')
		{
			if($params{redirects} eq 'both')
			{
				$base_query_params{apfilterredir} = "all";
			}
			elsif($params{redirects} eq 'yes')
			{
				$base_query_params{apfilterredir} = "redirects";
			}
			elsif($params{redirects} eq 'no')
			{
				$base_query_params{apfilterredir} = "nonredirects";
			}
			else
			{
				myLog(2, "Unrecognized redirect option in getPageList: $params{redirects}.  Ignoring.\n");
			}
		}
		elsif($key eq 'namespace')
		{
			if(ref($params{namespace}) eq 'ARRAY')
			{
				@namespaces = @{$params{namespace}};
			}
			elsif(!ref($params{namespace}))
			{
				push @namespaces, $params{namespace};
			}
			else
			{
				myLog(0, "Namespace list in getPageList must be a scalar or an array reference\n");
				die("902 Namespace list in getPageList must be a scalar or an array reference");
			}
		}
	}
	
	push @namespaces, 0 if(!scalar(@namespaces));
	
	foreach my $namespace (@namespaces)
	{
		%full_query_params = %base_query_params;
		$full_query_params{apnamespace} = $namespace;
		my $offset;
		
		do
		{
			my $xml = APIQuery(%full_query_params);

			if(!defined($xml))
			{
				myLog(0, "Unknown error accessing wiki\n");
				die "900 Unknown error accessing wiki";
			}

			my $parsed_xml = $Pearle::xml_parser->XMLin($xml, ForceArray => ['p']);

			myLog(4, Dumper($parsed_xml));
			$xml = undef;

			if(exists($parsed_xml->{query}->{allpages}->{p}) and defined($parsed_xml->{query}->{allpages}->{p}))
			{
					push(@articles, map({[$_->{title}, $_->{ns}]} @{$parsed_xml->{query}->{allpages}->{p}}));
			}

			if(exists($parsed_xml->{'query-continue'}->{allpages}->{apfrom}))
			{
				$offset = $parsed_xml->{'query-continue'}->{allpages}->{apfrom};
				$offset =~ s/&/%26/;
				$full_query_params{apfrom} = $offset;
			}
			else
			{
				$offset = undef;
			}

			sleep (1); # Throttle GETs
		}
		while(defined($offset));
	}
	return @articles;
}

# Gets a list of (articles, actor, summary, timestamp) tuples from the specified log (upload, delete, move, protect).  The list is sorted by timestamp
# with the newest entry first
#
# Takes the following named parameters:
#	user: Filter "actor" to include only actions by this user
#	log: Filter to include only actions in this log (upload, delete, move, protect).
#	limit: Include this many items.  Defaults to 50 items.
#	time: Start checking the log at this time.  Timestamp in ISO 8601 format.
#	dir: Check the log in this direction (newer or older) from the timestamp
sub getLogArticles
{
	die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);

	my %params = @_;
	my %query_params = (list => 'logevents', lelimit => 50);
	my @articles = ();

	foreach my $key (keys(%params))
	{
		if($key eq 'user')
		{
			$query_params{leuser} = $params{user};
			$query_params{leuser} =~ s/^User://i;	# Strip namespace prefix, if it's there.
		}
		elsif($key eq 'log')
		{
			$query_params{letype} = $params{log};
		}
		elsif($key eq 'limit')
		{
			$query_params{lelimit} = $params{limit};
		}
		elsif($key eq 'time')
		{
			$query_params{lestart} = $params{time};
		}
		elsif($key eq 'dir')
		{
			$query_params{ledir} = $params{dir};
		}
		else
		{
			myLog(2, "Error: Unknown parameter $key in getLogArticles\n");
		}
	}

	my $xml = APIQuery(%query_params);
	
	if(!defined($xml))
	{
		myLog(0, "Unknown error accessing wiki\n");
		die "920 Unknown error accessing wiki";
	}
	
	my $parsed_xml = $Pearle::xml_parser->XMLin($xml, ForceArray => ['item']);

	myLog(4, Dumper($parsed_xml));
	$xml = undef;

	if(exists($parsed_xml->{query}->{logevents}->{item}) and defined($parsed_xml->{query}->{logevents}->{item}))
	{
#		foreach my $item (@{$parsed_xml->{query}->{logevents}->{item}})
#		{
#			push @articles, [$item->{title}, $item->{user}, $item->{comment}, $item->{timestamp}];
#		}
		@articles = map {[$_->{title}, $_->{user}, $_->{comment}, $_->{timestamp}]} @{$parsed_xml->{query}->{logevents}->{item}};
		$parsed_xml = undef;
		
		@articles = uniquify_ref(0, @articles);
		@articles = sort {$b->[3] cmp $a->[3]} @articles;
	}
	return @articles;
}

# Gets a list of all files at [[Special:UncategorizedFiles]]
#
# Uses screen-scraping because there isn't an API call for this.  May not work for non-Wikipedia wikis.
sub getUncatFiles
{
	my $offset = shift || 0;
	my @files = ();
	
	my $content = getURL("${Pearle::wiki}index.php?title=Special:UncategorizedFiles&limit=500&offset=$offset");
	
	if($content !~ /There are no results for this report./)
	{
		while($content =~ /<div class="thumb".*?<a[^>]*href="\/wiki\/([^"]*)"/g)
		{
			my $file = decode('utf8', urlDecode($1));
			myLog(4, "Found file $file\n");
			push @files, $file;
		}
		
		if($content =~ /class="mw-nextlink"/)
		{
			myLog(4, "More files\n");
			push @files, getUncatFiles($offset + 500);
		}
	}
	return @files;
}

sub fixupURLFragment
{
	my $url_fragment = shift;
	$url_fragment =~ s/%/%25/g;
	$url_fragment =~ s/&/%26/g;
	$url_fragment =~ s/\+/%2B/g;
	$url_fragment =~ s/#/%23/g;
	$url_fragment =~ s/\?/%3F/g;
	$url_fragment =~ s/\\/%5C/g;
	return $url_fragment;
}

# Use the api.php interface to query the wiki using a GET request
#
# Takes a hash of parameter,value pairs
#
# Returns raw the XML blob from the wiki, or undef on error
sub APIQuery
{
	my %params = @_;
	my $url = "${Pearle::wiki}api.php?action=query&format=xml";
	my $reply = undef;
	
	foreach my $key (keys(%params))
	{
		my $val;
		if(ref($params{$key}) eq 'ARRAY')	# We've got a list of values
		{
			$val = join '|', @{$params{$key}};
		}
		else
		{
			$val = $params{$key};
		}
		$val =~ s/ /_/g if($key eq 'titles');
		$val = fixupURLFragment($val);
		$key = fixupURLFragment($key);
		$url .= "&${key}=$val";
	}
	
	myLog(3, "API query: $url\n");
	
	$url = encode("utf8", $url);

	my $request = HTTP::Request->new(GET => "$url");

	$reply = _APIguts($request);
	
	return $reply;
}

# Use the api.php interface to query the wiki using a POST request
#
# Takes a hash of parameter,value pairs
#
# Returns raw the XML blob from the wiki (possibly including an API error message), or undef on server error
sub APIPost
{
	my %params = @_;
	my $url = "${Pearle::wiki}api.php?format=xml";
	my $data = "";
	my $reply = undef;
	
	$data = join('&', map {fixupURLFragment($_) . "=" . fixupURLFragment((ref($params{$_}) eq 'ARRAY')?(join '|', @{$params{$_}}):($params{$_}))} keys(%params));
	
	myLog(3, "API URL: $url\n");
	myLog(3, "API query: $data\n");
	
	my $request = HTTP::Request->new('POST', $url, 
		HTTP::Headers->new(Content_Type => "application/x-www-form-urlencoded"), encode("utf8", $data));

	$reply = _APIguts($request);
	
	return $reply;
}

# Use the api.php interface to query the wiki
#
# Takes a hash of parameter,value pairs
#
# Returns raw the XML blob from the wiki (possibly including an API error message), or undef on server error
sub APIEdit
{
	my %params = @_;
	my $url = "${Pearle::wiki}api.php?action=edit&format=xml";
	my $data = "";
	my $reply = undef;
	
	$data = join('&', map {fixupURLFragment($_) . "=" . fixupURLFragment((ref($params{$_}) eq 'ARRAY')?(join '|', @{$params{$_}}):($params{$_}))} keys(%params));
	
	myLog(3, "API query: $data\n");
	
	my $request = HTTP::Request->new('POST', $url, 
		HTTP::Headers->new(Content_Type => "application/x-www-form-urlencoded"), encode("utf8", $data));

	$reply = _APIguts($request);
	
	return $reply;
}

# The common elements of APIQuery and APIPost.  Not for external consumption.
sub _APIguts
{
	my $request = shift;
	die "902 request is not an HTTP::Request object" if(!$request->isa('HTTP::Request'));

	my $reply;
	
APIretry:
	my $response = startRetry(\&httpRequest, $request);
	if($response->is_success)
	{
		$Pearle::ua->cookie_jar->save();
		
		$reply = $response->decoded_content;
		if(!defined($reply))
		{
			myLog(1, "Failed to decode response\n");
			#return undef;
			$reply = decode("utf8", $response->content);
		}
		if($reply =~ /<error code="([^"]*)"/)
		{
			# Errors that can be fixed by trying again later
			if($1 eq 'internal_api_error_DBConnectionError')
			{
				myLog(1, "Error $1 querying server.  Retrying after 60 seconds.\n");
				sleep(60);
				goto APIretry;
			}
			else
			{
				# Format error
				myLog(1, "Error $1 querying server\n");
			}
		}
	}
	else
	{
		myLog(1, "HTTP error accessing server\n");
		$reply = undef;
	}
	return $reply;
}

# Use the Special:Export interface to get the wikitext of one or more articles
sub Export
{
	my ($request, $response, $reply, $articles);
	
	$articles = join "\n", @_;
	
	$request = POST "${Pearle::wiki}index.php?title=Special:Export&action=submit", [action => 'submit', pages => $articles, curonly => 1];
	$response = startRetry(\&httpRequest, $request);
	$reply = $response->decoded_content;

	return $reply;
}

# Get the history of an article as a set of (id, timestamp, user, comment, minor) or (id, timestamp, user, comment, minor, content) tuples.
#
# Takes the following named parameters:
#  title: the title of the article to retrieve.  Mandatory.
#  content: a boolean indicating if article content should be retrieved.
#  limit: the maximum number of revisions to fetch.  Must be smaller than the wiki's limit on how many revisions can be fetched at one time.
sub getArticleHistory
{
	die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);
	my %params = @_;
	
	my @history;
	my $offset;
	
	my $title = $params{title};
	my $content = $params{content};
	$content = 0 if(!defined($content));
	my $limit = $params{limit};
	
	die "902 Must specify title when calling getArticleHistory()" if(!defined($title));
	
	my %query_params = (prop => 'revisions', rvlimit => 500, rvprop => ['ids', 'timestamp', 'user', 'comment', 'flags']);
	$query_params{rvprop} = ['ids', 'timestamp', 'user', 'comment', 'flags', 'content' ] if($content);
	$query_params{titles} = $title;
	$query_params{rvlimit} = $limit if(defined($limit));
	
	do
	{
		my $xml = APIQuery(%query_params);
		if(!defined($xml))
		{
			myLog(0, "Unknown error accessing wiki\n");
			die "920 Unknown error accessing wiki";
		}

		my $parsed_xml = $Pearle::xml_parser->XMLin($xml, ForceArray => ['rev']);

		myLog(4, Dumper($parsed_xml));
		$xml = undef;

		if(exists($parsed_xml->{query}->{pages}->{page}->{revisions}->{rev}) and defined($parsed_xml->{query}->{pages}->{page}->{revisions}->{rev}))
		{
			foreach my $item (@{$parsed_xml->{query}->{pages}->{page}->{revisions}->{rev}})
			{
				my $comment = $item->{comment};
				$comment = "" if(!defined($comment));
				my $minor = 0;
				$minor = 1 if(exists($item->{minor}) and defined($item->{minor}));
				if($content)
				{
					push @history, [$item->{revid}, $item->{timestamp}, $item->{user}, $comment, $minor, $item->{content}];
				}
				else
				{
					push @history, [$item->{revid}, $item->{timestamp}, $item->{user}, $comment, $minor];
				}
			}
		}
		if(!defined($limit) and exists($parsed_xml->{'query-continue'}->{revisions}->{rvstartid}))
		{
			$offset = $parsed_xml->{'query-continue'}->{revisions}->{rvstartid};
			$offset =~ s/&/%26/;
			$query_params{rvstartid} = $offset;
			sleep(10);
		}
		else
		{
			$offset = undef;
		}
	}
	while(defined($offset));
	
	return @history;
}


sub getURL #($target)
{
	die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);
    # Read throttle!
    sleep (1);

    my ($request, $response, $reply, $url);
    
    $url = $_[0];

    # Create a request-object
    myLog(3, "GET ${url}\n");
    $request = HTTP::Request->new(GET => "${url}");
    $response = startRetry(\&httpRequest, $request);

	$reply = $response->decoded_content;
	
	# This may or may not actually work
	$Pearle::ua->cookie_jar->save();

	return ($reply);
}


# Retries a given function repeatedly, with an exponential backoff rate
# The function should throw an exception beginning with "retry:" (case insensitive) if the call should be retried
sub startRetry
{
	my ($call_fn, @args) = @_;
	return retry($Pearle::speedLimit, $call_fn, @args);
}

sub retry
{
	my ($call_fn, @args, $delay, @result, $result);
	
	($delay, $call_fn, @args) = @_;
	
	if(wantarray())
	{
		@result = eval{ $call_fn->(@args) };
		if($@ =~ /^retry:/i)
		{
			limit($delay);
			@result = retry($delay * 2, $call_fn, @args);
		}
		elsif($@)
		{
			die;
		}
		return @result;
	}
	else
	{
		$result = eval{ &{$call_fn}(@args) };
		if($@ =~ /^retry:/i)
		{
			limit($delay);
			$result = retry($delay * 2, $call_fn, @args);
		}
		elsif($@)
		{
			die;
		}
		return $result;
	}
}

sub initNamespaceList
{
	die "901 Pearle library not initialized!\n" if(!$Pearle::_inited);

	my $xml = APIQuery(meta => 'siteinfo', siprop => 'namespaces');
	if(!defined($xml))
	{
		myLog(0, "Unknown error accessing wiki\n");
		die "920 Unknown error accessing wiki";
	}

	my $parsed_xml = $Pearle::xml_parser->XMLin($xml, KeyAttr => []);
	
	@Pearle::namespaces = map({[$_->{id}, $_->{content}]} grep({$_->{id} >= 0} @{$parsed_xml->{query}->{namespaces}->{ns}}));
}

sub getNamespaceNames
{
	return map {$_->[1]} @Pearle::namespaces;
}

sub getNamespaceNumbers
{
	return map {$_->[0]} @Pearle::namespaces;
}

sub namespaceToNumber
{
	my $namespace = $_[0];
	my $i = 0;
	my $name;
	if(scalar(@Pearle::namespaces) > 0)
	{
		if(defined($namespace))
		{
			my @val = grep( {lc($_->[1]) eq lc($namespace)} @Pearle::namespaces);
			return undef if(!scalar(@val));
			return $val[0]->[0];
		}
		else
		{
			return undef;
		}
	}
	else
	{
		myLog(1, "Namespace array not initialized\n");
		return undef;
	}
}

sub numberToNamespace
{
	my $i = shift;
	if(scalar(@Pearle::namespaces) > 0)
	{
		if(defined($i))
		{
			my @val = grep( {$_->[0] == $i} @Pearle::namespaces);
			return undef if(!scalar(@val));
			return $val[0]->[1];
		}
		else
		{
			return undef;
		}
	}
	else
	{
		myLog(1, "Namespace array not initialized\n");
		return undef;
	}
}


# Translate from HTTP URL encoding to the native character set.
sub urlDecode
{
	my ($input);

	$input = $_[0];

	$input =~ s/\%([a-f|A-F|0-9][a-f|A-F|0-9])/chr(hex($1))/eg;

	return ($input);
}

sub decodeArray
{
	return map {urlDecode($_)} @_;
}

# Remove duplicates from a list
sub uniquify
{
	my @list = @_;
	@list = sort @list;
	my $last = undef;
	my @new_list;
	my $item;
	
	foreach $item (@list)
	{
		push @new_list, $item if(!defined($last) or ($item ne $last));
		$last = $item;
	}
	return @new_list;
}

# Remove duplicates from a list of array references, grouping on the specified subelement
sub uniquify_ref
{
	my $element = shift;
	my @list = @_;
	@list = sort {$a->[$element] cmp $b->[$element]} @list;
	my $last = undef;
	my @new_list;
	my $item;

	foreach $item (@list)
	{
		push @new_list, $item if(!defined($last) or ($item->[$element] ne $last));
		$last = $item->[$element];
	}
	return @new_list;
}

1;