User:Pfctdayelise/file age/code

To get this data:

  • Use the API to get the upload-log entries for a particular day and copy them into a file by hand. (You can only get 500 results at a time, so you have to adjust the "lestart" parameter by hand, or write a script to do it; a sketch follows this list. Since it is only half a dozen requests, it is not too onerous to do by hand.)
  • Use Python to prettify the API results.
  • Use Python, urllib and the MediaWiki API to fetch deletion-log entries for these files, and use a timedelta to calculate the age of each deleted file.
  • Use a Python dictionary (or any number of other methods...) to tally how many images were deleted after a particular number of days; a sketch is at the end of the page.
  • Insert the standard disclaimer: one day's worth of data may or may not reflect reality; beware of outliers; etc.
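
The lestart paging could also be scripted. A minimal sketch, not the script actually used: it assumes list=logevents with letype=upload and the lestart/leend/ledir/lelimit parameters, and simply continues from the timestamp of the last entry returned, so the entry at each batch boundary may be fetched twice.

import re
import urllib

# fetch the upload log for 2006-08-01 in batches of 500 <item ... /> entries
api = (r"http://commons.wikimedia.org/w/api.php?action=query&format=xml"
       r"&list=logevents&letype=upload&ledir=newer&lelimit=500"
       r"&leend=2006-08-02T00:00:00Z&lestart=")

ts = re.compile('timestamp="([^"]*)"')

start = "2006-08-01T00:00:00Z"
out = open("uploads-2006-08-01.txt", 'w')

while True:
    page = urllib.urlopen(api + start).read()
    items = re.findall(r'<item .*?/>', page)
    for it in items:
        out.write(it + '\n')
    if len(items) < 500:
        break    # last batch reached
    # continue from the timestamp of the last entry in this batch
    start = ts.search(items[-1]).group(1)

out.close()
print "done"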

prettify

import re

# pull the title and timestamp attributes out of each copied <item ... /> line
filename = re.compile('title="([^"]*)"')
timestamp = re.compile('timestamp="(?P<year>[0-9]*)-(?P<month>[0-9][0-9])-(?P<day>[0-9][0-9])T(?P<hh>[0-9][0-9]):(?P<mm>[0-9][0-9]):(?P<ss>[0-9][0-9])Z"')

fi = "uploads-2006-08-01.txt"    # raw API results for one day's uploads, copied by hand

f = open(fi,'r')
files = f.readlines()
f.close()


images = []

for im in files:
    image = filename.search(im)
    times = timestamp.search(im)
    if not (image and times):
        continue    # skip any stray non-<item> lines copied into the file

#    realtimes = [times.group(t) for t in ["year","month","day","hh","mm","ss"]]

    newim = (image.group(1), times.groups())
    images.append(newim)

#print times.groups()

# write one "title<TAB>year month day hh mm ss" line per upload
g = open("list-uploads-2006-08-01.txt",'w')
for im in images:
    g.write(im[0])
    g.write('\t')
    g.write(' '.join(im[1]))
    g.write('\n')

g.close()
print "done"

deletion log

import re
import urllib
from datetime import date
# deletion-log query for a single title; the (underscore-form) title is appended to this URL
api = r"http://commons.wikimedia.org/w/api.php?action=query&format=xml&list=logevents&letype=delete&leprop=title|user|comment|timestamp&letitle="

user = re.compile('user="([^"]*)"')
comment = re.compile('comment="([^"]*)"')
timestamp = re.compile('timestamp="(?P<year>[0-9]*)-(?P<month>[0-9][0-9])-(?P<day>[0-9][0-9])T(?P<hh>[0-9][0-9]):(?P<mm>[0-9][0-9]):(?P<ss>[0-9][0-9])Z"')
item = re.compile(r'<item ns="6" .*?/>')

f = open("list-uploads-2006-08-01.txt",'r')
fs = f.readlines()
f.close()

# each line of the prettified list is "title<TAB>year month day hh mm ss"
filenames = [l.split('\t')[0] for l in fs]
uploaddates = [[int(i) for i in l.split('\t')[1].strip().split(' ')] for l in fs]

notdeleted = 0

g = open('list-deletions-from-2006-08-01.txt','w')

for (i, fn) in enumerate(filenames):
    if i%50==0:
        print i    # progress indicator
    # percent-encode the title so special characters don't break the query URL
    add = api + urllib.quote(fn.replace(" ","_"))
    u = urllib.urlopen(add)
#    print add
    logs = u.read()
    if '<item ns="6"' not in logs:
        # not deleted yet
        notdeleted += 1
    else:
        # just take first deletion, even if was deleted multiple times
        entry = item.findall(logs)[0]
        deleter = user.findall(entry)[0]
        delcomment = comment.findall(entry)[0]
        deltimestamp = timestamp.search(entry)

        # compare the upload date with the (first) deletion date, in whole days
        update = date(uploaddates[i][0],uploaddates[i][1],uploaddates[i][2])
        deldate = date(int(deltimestamp.group('year')),
                       int(deltimestamp.group('month')),
                       int(deltimestamp.group('day')))

        life = (deldate-update).days

        
        g.write(fn)
        g.write('\t')
        g.write(str(life))
        g.write('\n')

        
g.close()

print "not deleted:",notdeleted
print "total:",len(filenames)