User:MIMEStatBot/Source

From Wikimedia Commons, the free media repository
Jump to navigation Jump to search

This is a copy of the source code of MIMEStatBot copied from the actual script on the toolserver.

#!/usr/bin/perl -w

# Copyright (c) 2008-2010 Ilmari Karonen <vyznev@toolserver.org>.
#
# Permission to use, copy, modify, and/or distribute this
# software for any purpose with or without fee is hereby granted,
# provided that the above copyright notice and this permission
# notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
# WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL
# THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR
# CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
# LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT,
# NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.

# The following lines are for running this script as an SGE job:
#
#$ -N commons-mime-statistics
#$ -S /usr/bin/perl
#$ -o public_html/stats/commons_mime_statistics_log.txt -j y
#$ -M vyznev@toolserver.org -m ea
#$ -l sql-s4-rr=1
# Typical runtimes as of 2012 are under 30 mins, but it's been more in the past
#$ -l h_rt=2:05:00
#$ -l s_rt=2:00:00
# Typical peak memory usage on Solaris seems to be around 22M, but Linux reports 200M for some reason???
#$ -l virtual_free=200M
# This script should work on both Linux and Solaris
#$ -l arch=*

use strict;
use utf8;  # not really needed for now, but for the sake of consistency
use Time::HiRes 'time';
use POSIX 'strftime';
use DBI;
use LWP::UserAgent;
use XML::Simple;
use Digest::MD5 'md5_hex';
use Data::Dumper 'Dumper';
use URI;
use Getopt::Long;

use constant TIMEOUT => 1200;  # stop db retries after this many secs

# Command line parsing (currently used only for debug flag):

my $debug;
GetOptions('debug' => \$debug)
    or die "Usage: $0 [--debug]\n";

# Configuration and general setup:
# TODO: read these from the env or command line

my $dbname = "commonswiki";
my $server = "commons.wikimedia.org";
my $botname = "MIMEStatBot";
my $pwfile = "mimestatbot.pass";
my $dumpfile = "public_html/stats/commons_mime_statistics_data.txt";

my $sitename = "Commons";
my $pageprefix = ($debug ? "User:$botname/test/" : "$sitename:");
my $statpage = $pageprefix . "MIME type statistics";
my $listpage = "$statpage/Unusual types";

my $interval = ($debug ? "test" : "weekly");
my $list_max = 500;

my @now = gmtime;
my $date = strftime "%Y-%m-%d", @now;
my $time = strftime "%H:%M:%S", @now;

my ($t0, $t1, $dt, $sql, $rows);  # common vars for queries and timeouts

my $botsummary = "bot updating $interval statistics at $date $time";
my $testcond = ($debug ? strftime("img_timestamp >= '%Y%m%d%H%M%S'", gmtime(time-7*24*60*60)) : "1=1");

# Start script:

warn "\n--- Generating $interval MIME statistics for $dbname at $date $time (UTC) ---\n";

# Knowing the system we're running on should help debugging:
system qw(uname -a);
$| = 1;

# SGE should start us in home dir anyway, but better safe than sorry...
chdir or die "Error changing to home directory: $!\n";

# Connect to database:

my $data_source = "DBI:mysql:database=${dbname}_p;host=${dbname}-p.rrdb.toolserver.org;mysql_read_default_group=client";
my $dbh;
$t0 = time;
until ($dbh = DBI->connect($data_source)) {
    warn "SQL connect failed: $DBI::errstr\n";
    my $sleep = time - $t0 + 1;
    die "Giving up after $sleep seconds.\n" if $sleep > TIMEOUT;
    warn "Waiting $sleep seconds before retry...\n";
    sleep $sleep;
}
$dbh->{RaiseError} = 1;  # all DB errors should be fatal
$dbh->do("SET SESSION TRANSACTION ISOLATION LEVEL READ UNCOMMITTED");

# Set up LWP:

my $ua = LWP::UserAgent->new(
                             agent => "Mozilla/4.0 (compatible; $0)",
                             from => 'vyznev@toolserver.org',
                             cookie_jar => {},
                             parse_head => 0,
                            );

# Generic MediaWiki API request handler:

my $apiURI = "http://$server/w/api.php";

sub apireq {
    my $query = [format => 'xml', @_];
    my $sleep = 5;
    if ($URI::VERSION < 1.36) {
	# Handling of Unicode strings changed in URI.pm v1.36, which $ua->post() calls internally
	utf8::encode($_) for @$query;
    }
    while (1) {
        my $res = $ua->post($apiURI, $query);
        my $err = $res->header('MediaWiki-API-Error') || "";

        return XMLin( $res->content ) if $res->is_success and $err ne 'maxlag';

        print STDERR "API request failed, ", ($err || $res->status_line), "...";
	if ($sleep > 3*60*60) {
	    warn "giving up\n";
	    return XMLin( $res->content );
	}
	warn "sleeping $sleep seconds\n";
        sleep $sleep;
        $sleep *= 2;
    }
}

# Subroutine to post data to the wiki:

sub postpage {
    my ($title, $content, $summary) = @_;
    # XXX: inputs are assumed to be Unicode strings; use utf8::decode() on data from MySQL first!

    my $md5 = md5_hex(do { my $x = $content; utf8::encode($x); $x });  # md5_hex() wants octets

    warn "Getting edit token for [[$title]]\n";
    my $data = apireq(
                      action => 'query',
                      prop => 'info',
                      intoken => 'edit',
                      titles => $title,
                     );
    my $token = $data->{query}{pages}{page}{edittoken}
        or die "Failed to get token, got:\n", Dumper($data), "\n";

    # warn "pretending to save $title ($summary):\n$content\n"; return;  ## DEBUG

    warn "Editing [[$title]] ($summary)\n";
    my $edit = eval { apireq(
			     maxlag => 5,
			     action => 'edit',
			     title => $title,
			     summary => $summary,
			     recreate => 1,
			     md5 => $md5,
			     text => $content,
			     token => $token,
			    ) };
    warn $@ if $@;

    if (ref $edit ne 'HASH' or $edit->{error} or $edit->{edit}{result} ne 'Success') {
	if (ref $edit ne 'HASH') {
	    warn "Got unexpected result:\n", Dumper($edit), "\n";
	} elsif ($edit->{error}) {
	    warn "Editing $title failed ($edit->{error}{code}): $edit->{error}{info}\n";
	} elsif ($edit->{edit}{result} ne 'Success') {
	    warn "Editing $title did not succeed ($edit->{edit}{result}):\n", Dumper($edit), "\n";
	} else {
	    warn "Qweebl zzyzx bleep blort?\n", Dumper($edit), "\n"; # should be impossible
	}
	require File::Temp;
	my $dump = File::Temp->new( UNLINK => 0, SUFFIX => "-$botname-dump-$date.txt", DIR => "public_html/temp" );
	warn "Error detected, dumping content to ", $dump->filename, "\n";
	binmode $dump, ":utf8" or warn "binmode failed: $!\n";
	print $dump $content or warn "print failed: $!\n";
	close $dump or warn "close failed: $!\n";
	chmod 0644, $dump->filename or warn "chmod failed: $!\n";
	warn "Dump complete.\n";
	return 0;
    }
    warn "Page [[$title]] successfully saved.\n";
    return 1;
}

# Run stat query:
$t0 = time;
warn "Starting stat query at ".gmtime($t0)."\n";
$sql = <<"END";
  SELECT img_major_mime, img_minor_mime, img_media_type,
         COUNT(*) AS files, SUM(img_size) AS bytes
    FROM image
   WHERE $testcond
   GROUP BY img_major_mime, img_minor_mime, img_media_type
  /* SLOW_OK LIMIT:3600 */
END
warn "Running query:\n$sql" if $debug;
{
    $rows = eval { $dbh->selectall_arrayref($sql) };
    if ($@) {
	warn "Stat query failed: $@\n";
	my $sleep = time - $t0 + 1;
	die "Giving up after $sleep seconds.\n" if $sleep > TIMEOUT;
	warn "Waiting $sleep seconds before retry...\n";
	sleep $sleep;
	redo;
    }
}
$t1 = time;
$dt = sprintf "%.1f", $t1 - $t0;
warn "Stat query done in $dt seconds at ".gmtime($t1)."\n";

# Dump stats to file:

if (open my $dump, ">>", $dumpfile) {
    if ($debug) {
	warn "Debug mode, not actually writing ".@$rows." rows to $dumpfile\n";
    }
    else {
	print $dump "\n$date\n";
	for my $row (@$rows) {
	    my ($major, $minor, $media, $files, $bytes) = @$row;
	    print $dump "$major/$minor\t$media\t$files\t$bytes\n";
	}
	warn @$rows." rows dumped to $dumpfile\n";
    }
    close $dump or warn "ERROR writing to $dumpfile: $!\n";
}
else {
    warn "ERROR opening $dumpfile for append: $!\n";
}

# Generate stat page:

my $stat_text = <<"END";
<noinclude>
This page is updated $interval by [[User:$botname|]]. Any other edits made to this page will be lost on next update.
</noinclude>
'''Files on $sitename by [[w:MIME type|]] as of $date $time (UTC)'''

See also: [[Commons:Project scope/Allowable file types]]

{| class="wikitable sortable"
! MIME type !! Media type !! Files !! Bytes
END

my $total_f = 0;
my $total_b = 0;
my %rare_types;
my $total_rare = 0;

for my $row (@$rows) {
    my ($major, $minor, $media, $files, $bytes) = @$row;
    $total_f += $files;
    $total_b += $bytes;
    my $type = "$major/$minor || $media";
    if ($files <= $list_max) {
        $type = "[[$listpage#$major/$minor ($media)|$major/$minor]] || $media";
        $_ = $dbh->quote($_) for $major, $minor, $media;
        $rare_types{$media}{$major}{$minor}++;
        $total_rare += $files;
    }
    $_ = reverse($_), s/(\d{3})\B/$1,/g, $_ = reverse($_) for $files, $bytes;
    utf8::decode($type);  # should not be necessary, but let's be consistent
    $stat_text .= <<"END";
|-
| $type
| align="right" | $files
| align="right" | $bytes
END
}

$_ = reverse($_), s/(\d{3})\B/$1,/g, $_ = reverse($_) for $total_f, $total_b;
$stat_text .= <<"END";
|- class="sortbottom"
! style="border-right:0" | '''Total'''
! style="border-left:0" | &nbsp;<!-- workaround for regression in table sort JS -->
| align="right" | $total_f
| align="right" | $total_b
|}

[[Category:Commons statistics|MIME type statistics]]

<!-- Generated in $dt seconds. -->
END

# Run list query:

my @cond;
for my $q_media (sort keys %rare_types) {
    my $major_types = $rare_types{$q_media};
    for my $q_major (sort keys %$major_types) {
        my $minor_types = $major_types->{$q_major};
        my $q_minor = join ", ", sort keys %$minor_types;
        push @cond, "(img_major_mime = $q_major AND img_media_type = $q_media AND img_minor_mime IN ($q_minor))";
    }
}
my $cond = join "\n      OR ", @cond;

$t0 = time;
warn "Starting list query at ".gmtime($t0).", expecting $total_rare rows\n";
$sql = <<"END";
  SELECT img_major_mime, img_minor_mime, img_media_type, img_name
    FROM image
   WHERE ($cond) AND ($testcond)
  /* SLOW_OK LIMIT:3600 */
END
warn "Running query:\n$sql" if $debug;
{
    $rows = eval { $dbh->selectall_arrayref($sql) };
    if ($@) {
	warn "List query failed: $@\n";
	my $sleep = time - $t0 + 1;
	die "Giving up after $sleep seconds.\n" if $sleep > TIMEOUT;
	warn "Waiting $sleep seconds before retry...\n";
	sleep $sleep;
	redo;
    }
}
$t1 = time;
$dt = sprintf "%.1f", $t1 - $t0;
warn "List query done in $dt seconds at ".gmtime($t1)."\n";

my %list;
for my $row (@$rows) {
    my ($major, $minor, $media, $title) = @$row;
    $title =~ tr/_/ /;
    push @{ $list{"$major/$minor ($media)"} }, $title;
}

# Generate list page:

my $list_text = <<"END";
<noinclude>
This page is updated $interval by [[User:$botname|]]. Any other edits made to this page will be lost on next update.
</noinclude>
This page lists all files on $sitename for [[w:MIME type|]]s that have less than $list_max files each, as of $date $time (UTC).  For a list of currently permitted types, see [[Commons:Project scope/Allowable file types]].
END

for my $type (sort keys %list) {
    my $heading = $type;
    utf8::decode($heading);  # should not be necessary, but let's be consistent
    $list_text .= "\n== $heading ==\n\n";
    for my $title (sort @{ $list{$type} }) {
	utf8::decode($title);
        $title =~ s/([&<>\[\]{}|\x27])/sprintf "&#%d;", ord $1/eg;  # \x27 = single quote
        $title =~ s/\xA0/&nbsp;/g;
        $list_text .= "* [[:File:$title|$title]]\n";
    }
}

$list_text .= "\n<!-- Generated in $dt seconds. -->\n";

# Disconnect from database:

$dbh->disconnect;

# Log in to wiki:

open PW, "<", $pwfile or die "Error opening $pwfile: $!";
my $botpass = <PW>;
chomp $botpass;
close PW or warn "Error reading $pwfile: $!\n";

warn "Logging in to $server as $botname, will post to [[$statpage]]\n";
my $login = apireq( action => 'login', lgname => $botname, lgpassword => $botpass );
$login = apireq( action => 'login', lgname => $botname, lgpassword => $botpass, lgtoken => $login->{login}{token} )
    if ($login->{login}{result} || '') eq 'NeedToken';
$login->{error} and die "Login as $botname failed ($login->{error}{code}): $login->{error}{info}\n";
$login->{login}{result} eq 'Success' or die "Login as $botname failed: $login->{login}{result}\n";

# Post pages to wiki:

postpage($statpage, $stat_text, $botsummary);
postpage($listpage, $list_text, $botsummary);

warn "All done, exiting.\n";

__END__