User:Pfctdayelise/file age/code
Jump to navigation
Jump to search
to get this data:
- use the API and copy by hand all the entries to the upload file for a particular day. (You can only get 500 results at a time so you have to adjust the "lestart" property by hand, or write a script to do it. Since it's only half a dozen times it is not too onerous to do it by hand.)
- use python to prettify API results.
- use Python + urllib + the MediaWiki API to fetch deletion-log results for these files, and Python's timedelta to calculate the age of each deleted file.
- use a python dictionary (or a number of other methods...) to calculate the total number of images deleted after a particular number of days.
- standard disclaimer: one day's worth of data may or may not reflect reality; beware of outliers, etc.
prettify[edit]
import re

# Attribute extractors for the raw XML the MediaWiki API returns:
# the page title, and the ISO-8601 upload timestamp split into named parts.
filename = re.compile('title="([^"]*)"')
timestamp = re.compile('timestamp="(?P<year>[0-9]*)-(?P<month>[0-9][0-9])-(?P<day>[0-9][0-9])T(?P<hh>[0-9][0-9]):(?P<mm>[0-9][0-9]):(?P<ss>[0-9][0-9])Z"')


def parse_upload_lines(lines):
    """Extract (title, (year, month, day, hh, mm, ss)) tuples from raw API XML lines.

    Lines that carry no title or timestamp attribute (XML boilerplate,
    blank lines) are skipped instead of crashing.
    """
    images = []
    for im in lines:
        image = filename.search(im)
        times = timestamp.search(im)
        if image is None or times is None:
            continue  # not an upload entry
        images.append((image.group(1), times.groups()))
    return images


def main():
    """Read one day's upload dump and write a tab-separated title/timestamp list."""
    fi = "uploads-2006-08-01.txt"
    with open(fi, 'r') as f:
        images = parse_upload_lines(f.readlines())
    with open("list-uploads-2006-08-01.txt", 'w') as g:
        for name, ts in images:
            # Same record layout as before: title TAB "Y M D h m s" NEWLINE
            g.write(name + '\t' + ' '.join(ts) + '\n')
    print("done")


if __name__ == "__main__":
    main()
deletion log[edit]
import re
import urllib.request
from datetime import date

# Deletion-log query for a single title; the title is appended to this URL.
api = r"http://commons.wikimedia.org/w/api.php?action=query&format=xml&list=logevents&letype=delete&leprop=title|user|comment|timestamp&letitle="
user = re.compile('user="([^"]*)"')
comment = re.compile('comment="([^"]*)"')
timestamp = re.compile('timestamp="(?P<year>[0-9]*)-(?P<month>[0-9][0-9])-(?P<day>[0-9][0-9])T(?P<hh>[0-9][0-9]):(?P<mm>[0-9][0-9]):(?P<ss>[0-9][0-9])Z"')
item = re.compile(r'<item ns="6" .*?/>')


def parse_list_line(line):
    """Split one tab-separated list line into (title, [y, m, d, hh, mm, ss])."""
    name, ts = line.split('\t')[:2]
    return name, [int(p) for p in ts.strip().split(' ')]


def deletion_age_days(logs, upload_date):
    """Return days from *upload_date* to first deletion in *logs* XML, or None if never deleted.

    Only the first deletion entry is considered, even if the file was
    deleted multiple times.
    """
    if '<item ns="6"' not in logs:
        return None
    entry = item.findall(logs)[0]
    m = timestamp.search(entry)
    deldate = date(int(m.group('year')), int(m.group('month')), int(m.group('day')))
    return (deldate - upload_date).days


def main():
    """Query the deletion log for each uploaded file and write its lifespan in days."""
    with open("list-uploads-2006-08-01.txt", 'r') as f:
        records = [parse_list_line(l) for l in f.readlines()]
    notdeleted = 0
    with open('list-deletions-from-2006-08-01.txt', 'w') as g:
        for i, (fn, upload) in enumerate(records):
            if i % 50 == 0:
                print(i)  # progress marker
            add = api + fn.replace(" ", "_")
            # API responses are bytes in Python 3; decode before regex matching.
            logs = urllib.request.urlopen(add).read().decode('utf-8')
            life = deletion_age_days(logs, date(upload[0], upload[1], upload[2]))
            if life is None:
                notdeleted += 1
            else:
                g.write(fn + '\t' + str(life) + '\n')
    print("not deleted:", notdeleted)
    print("total:", len(records))


if __name__ == "__main__":
    main()