User:Fæ/code/reportRedlinksLACMA.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
# reportRedlinksLACMA.py
#
# Quick fix for LACMA uploads with red links
# Grab matching files from catscan (JSON query), use the API to check for red links, and repost the trimmed text.
#
# Date: August 2013
# Author: Fae http://j.mp/faewm
# Copyright: CC-BY-SA
'''

import wikipedia, urllib2, urllib, re, time, json
from colorama import Fore, init
init()

site = wikipedia.getSite('commons', 'commons')

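# urltry: fetch a URL, retrying on any failure with a pause that grows with
# each consecutive error (capped at 20 seconds) so the server is not hammered.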
def urltry(u):
    headers = {'User-Agent': 'Mozilla/5.0'} # Spoof header
    countErr = 0
    x = ''
    while x == '':
        try:
            req = urllib2.Request(u, None, headers)
            x = urllib2.urlopen(req)
            time.sleep(1)
        except:
            x = ''
            countErr += 1
            if countErr > 20:
                countErr = 20
            print Fore.CYAN, '** ERROR', countErr, '\n ** Failed to read from ' + u + '\n ** Pause for ' + str(countErr) + ' seconds and try again' + Fore.WHITE
            time.sleep(countErr)
    return x

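# htmltry: read the body of an open URL handle, re-opening the URL and backing
# off (up to 300 seconds) whenever the read fails.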
def htmltry(x, u):
    countErr = 0
    while True:
        try:
            return x.read()
        except:
            x = urltry(u)
            countErr += 1
            p = min(countErr * 2, 300) # Cap the pause at 300 seconds
            print Fore.CYAN, '** ERROR', countErr, '\n ** Failed to read xml'
            if countErr == 1:
                print Fore.BLUE + 'xml =' + str(x)
                print 'url =' + u + Fore.CYAN
            print ' ** Pause for ' + str(p) + ' seconds and try again' + Fore.WHITE
            time.sleep(p)

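# exists: query the Commons API for basic page info; a missing="" attribute in
# the XML reply means the title has no page, i.e. it is a red link.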
def exists(page):
    # Encode to UTF-8 before quoting: urllib.quote fails on non-ASCII unicode
    url = "http://commons.wikimedia.org/w/api.php?action=query&prop=info&format=xml&titles=" + urllib.quote(page.encode('utf-8'))
    xml = htmltry(urltry(url), url)
    if re.search('missing=""', xml):
        return False
    else:
        return True

'''
*** MAIN ***
'''

# Get list of articles with redlinks
url="http://tools.wmflabs.org/catscan2/quick_intersection.php?lang=commons&project=commons&cats=Files+with+broken+file+links%0D%0AImages+from+LACMA+uploaded+by+F%C3%A6&ns=6&depth=-1&max=1000&start=0&format=json"
uri = urltry(url)
data = json.loads(htmltry(uri, url)) # "data", to avoid shadowing the json module
print Fore.GREEN + "Intersection of:\n" + Fore.YELLOW, data['cats']
print Fore.GREEN + "Total pages returned:", Fore.YELLOW, data['pagecount'], Fore.WHITE

pages = [p['page_title'] for p in data['pages']]
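# The trimming below assumes other_versions holds one image link per line in
# exactly this form (a sketch of the assumed layout, inferred from the code):
#   other_versions=
#   [[File:Example one.jpg|220px|left]]
#   [[File:Example two.jpg|220px|left]]
#   }}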
count=0
for p in pages:
    count += 1
    #if count>1:continue
    print Fore.CYAN + str(count), p, Fore.WHITE
    page = wikipedia.Page(site, "File:" + p)
    html = page.get()
    if "other_versions=" not in html:
        continue # Nothing to trim on this page
    old = html
    # Collect the link targets inside other_versions=, stripping any "|params"
    # and trailing "]]" so bare [[File:X]] links are handled too.
    links = [l.split("|")[0].split("]]")[0] for l in html.split("other_versions=")[1].split("}}")[0].split("[[")]
    links.pop(0) # Drop the text before the first "[["
    for l in links:
        if not exists(l):
            print Fore.YELLOW, l, Fore.WHITE
            sl = "[[" + l + "|220px|left]]\n"
            # replace() is a no-op when the line is not in this exact form,
            # where the old split()[0]+split()[1] crashed or dropped text
            html = html.replace(sl, "")
    if html != old:
        wikipedia.setAction("Trim other_versions not uploaded")
        page.put(html)
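
# Usage (assuming the old pywikipedia framework and Python 2, as the imports
# above imply): python reportRedlinksLACMA.py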