User:Tpt/djvuocr.py

From Wikimedia Commons, the free media repository
Jump to navigation Jump to search

This script for pywikipedia on GNU/Linux performs OCR on a DjVu file by adding a text layer to it. It is fully automated: it downloads the file, runs the OCR and uploads the new version. It uses Tesseract 3.00 and, if you use a language other than English, the corresponding language pack (available in all good repositories such as Debian, Ubuntu or Fedora). Example: python djvuocr.py -filelang:fra -djvu:"Name of the djvu.djvu"

djvuocr.py[edit]

#!/usr/bin/python
# -*- coding: utf-8  -*-
"""
This bot does the OCR of a DjVu file.  It is intended to be used for Commons.

The following parameters are supported:

    -djvu:...      Name of the djvu file on commons
    -filelang:...  The language of the text, like eng or fra
                   (Default: eng)
"""
#
# (C) Pywikipedia bot team, 2008-2010
#
# Distributed under the terms of the MIT license.
#
__version__ = '$Id: djvuocr.py 9246 2011-07-29 15:42:46Z xqt $'
import codecs
import os
import subprocess
import sys
import urllib

import config
import query
import wikipedia as pywikibot

class AppURLopener(urllib.FancyURLopener):
    """URL opener that identifies itself as the bot instead of as urllib.

    Only the ``version`` attribute is overridden; urllib uses it as the
    user agent string of the opener.
    """
    version = 'Pywikipediabot/1.0'
# Install the opener as urllib's module-level default, so that the
# urllib.urlretrieve() call made when downloading the file goes through it.
urllib._urlopener = AppURLopener()

# This is required for the text that is shown when you run this script
# with the parameter -help.
# It is empty here: this script defines no replacements.
# NOTE(review): presumably read by pywikibot.showHelp() to substitute
# placeholders in the module docstring - confirm against pywikibot.
docuReplacements = {
}


class DjVuOCRBot:
    """Bot that adds an OCR text layer to a DjVu file and uploads the result.

    The file is downloaded to /tmp, every page is rendered to TIFF with
    ddjvu, recognised with tesseract, and the recognised text is stored in
    the DjVu file with djvused.  The modified file is then uploaded to the
    target site under the same name.
    """

    # Edit summary message that should be used.
    # NOTE: Put a good description here, and add translations, if possible!
    msg = {
        'en': u'Robot: OCR of the file with Tesseract',
        'fr': u'Bot: OCR du fichier avec Tesseract',
    }

    # Suffixes of the working files created in /tmp for one run:
    # the DjVu file itself, tesseract's output, the rendered page and the
    # djvused text description.
    _work_suffixes = (u'', u'.txt', u'.tiff', u'.djvu-txt')

    def __init__(self, djvu, filelang='eng', ask=False, debug=False):
        """
        Constructor. Parameters:
        djvu     : filename of the DjVu file on the wiki
        filelang : language code handed to tesseract with -l
        ask      : stored as self.ask (not used by this class)
        debug    : dry run; when true the OCRed file is not uploaded
        """
        self.djvu = djvu
        self.dry = debug
        self.ask = ask
        self.filelang = filelang
        # Optional page selection read by PagesGenerator: "N", "N-" or
        # "N-M".  None means every page.
        self.pages = None
        self.targetSite = pywikibot.getSite()

    def _tmp_path(self, suffix=u''):
        """Return the path of a working file: /tmp/<file name><suffix>."""
        return u"/tmp/%s%s" % (self.djvu, suffix)

    def _command(self, args):
        """Encode the unicode argument list *args* for the operating system.

        The console encoding is used, falling back to UTF-8 when stdout has
        no encoding (for example when the output is piped).
        """
        encoding = sys.stdout.encoding or 'utf-8'
        return [arg.encode(encoding) for arg in args]

    def _run(self, args):
        """Run an external tool and return its exit status.

        The arguments are passed as a list, without a shell, so that
        characters such as quotes, '$' or backticks in the file name cannot
        break or alter the command.  A tool that can't be started is
        reported and yields -1 instead of raising.
        """
        try:
            return subprocess.call(self._command(args))
        except OSError:
            pywikibot.output(u"Unable to run %s" % args[0])
            return -1

    def _text_layer(self, lines):
        """Return the djvused text description of a page made of *lines*.

        Every line is stripped and loses its double quotes and backslashes,
        which would otherwise end or escape the quoted string.
        """
        parts = [u'(page 0 0 1 1\n']
        for line in lines:
            text = line.strip().replace('"', '').replace('\\', '')
            parts.append(u"(line 0 0 1 1 \"%s\")\n" % text)
        parts.append(u')\n')
        return u''.join(parts)

    def _cleanup(self):
        """Remove the working files of this run that exist."""
        for suffix in self._work_suffixes:
            path = self._tmp_path(suffix)
            if os.path.exists(path):
                os.remove(path)

    def NoOfImages(self):
        """Return the number of pages of the downloaded file.

        The count is the first line printed by "djvused -e n".  Raises
        ValueError when that line is not a number and OSError when djvused
        can't be started.
        """
        cmd = self._command([u'djvused', u'-e', u'n', self._tmp_path()])
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)
        first_line = proc.communicate()[0].split(b'\n', 1)[0]
        count = int(first_line.strip())
        pywikibot.output("page count = %d" % count)
        return count

    def PagesGenerator(self):
        """Return the page numbers to process, honouring self.pages.

        self.pages may be empty/None (all pages), "N" (one page), "N-"
        (from N to the last page) or "N-M".
        """
        start = 1
        end = self.NoOfImages()

        if self.pages:
            first, dash, last = self.pages.partition('-')
            start = int(first)
            if not dash:
                end = start
            elif last:
                end = int(last)
        pywikibot.output(u"Processing pages %d-%d" % (start, end))
        return range(start, end + 1)

    def run(self):
        """Download the file, OCR its pages and upload the new version.

        The working files in /tmp are removed when the method ends, whether
        it succeeded or not.
        """
        self.djvu = self.djvu.replace(' ', '_')
        if not self.djvu.endswith('.djvu'):
            pywikibot.output("It isn't a djvu file")
            return

        local = self._tmp_path()
        try:
            #download the djvu
            djvuPage = pywikibot.ImagePage(self.targetSite,
                                           'File:%s' % self.djvu)
            if not djvuPage:
                pywikibot.output("The djvu file can't be downloaded")
                return
            pywikibot.output('Download of the djvu file')
            try:
                urllib.urlretrieve(djvuPage.fileUrl(), local)
            except Exception:
                # Not a bare except: Ctrl-C must still stop the bot.
                pywikibot.output("The djvu file can't be downloaded")
                return
            if not os.path.exists(local):
                pywikibot.output("The djvu file can't be downloaded")
                return

            #OCR
            pywikibot.output(u"OCRing text from %s" % (self.djvu))
            for pageno in self.PagesGenerator():
                pywikibot.output("Processing page %d" % pageno)
                self.ocr_page(pageno)

            #upload
            if self.dry:
                pywikibot.output(u"Dry run: the file is not uploaded")
                return
            # The DjVu file is binary data, so read it as bytes.
            with open(local, 'rb') as f:
                contents = f.read()
            self.upload(contents)
        finally:
            self._cleanup()

    def ocr_page(self, pageno):
        """OCR page *pageno* and store the text in the downloaded file.

        The page is skipped, with its existing text left untouched, when
        tesseract produced no output for it.
        """
        djvu_path = self._tmp_path()
        tiff_path = self._tmp_path(u'.tiff')
        txt_path = self._tmp_path(u'.txt')
        layer_path = self._tmp_path(u'.djvu-txt')

        # Drop the results of the previous page: if a tool fails now, its
        # old output must not be attached to this page.
        for stale in (tiff_path, txt_path):
            if os.path.exists(stale):
                os.remove(stale)

        self._run([u'ddjvu', u'-format=tiff', u'-page=%d' % pageno,
                   djvu_path, tiff_path])
        # tesseract appends ".txt" to the output base name itself.
        self._run([u'tesseract', tiff_path, djvu_path, u'-l', self.filelang])
        if not os.path.exists(txt_path):
            pywikibot.output(u"No OCR result for page %d" % pageno)
            return

        with codecs.open(txt_path, 'r', config.textfile_encoding,
                         'replace') as f:
            lines = f.readlines()
        if not lines:
            return

        with codecs.open(layer_path, 'w', config.textfile_encoding,
                         'replace') as ft:
            ft.write(self._text_layer(lines))

        # NOTE: the path is quoted inside the djvused script, so a double
        # quote in the file name would still break the set-txt command.
        self._run([u'djvused', djvu_path, u'-e',
                   u'select %d; remove-txt' % pageno, u'-s'])
        self._run([u'djvused', djvu_path, u'-e',
                   u'select %d; set-txt "%s"' % (pageno, layer_path), u'-s'])

    # function from upload.py of pywikipedia
    # (C) Rob W.W. Hooft, Andre Engels 2003-2004
    # (C) Pywikipedia bot team, 2003-2010
    def upload(self, djvu):
        """Upload *djvu*, the contents of the file, to the target wiki.

        The file is sent through the API under the name self.djvu, with
        warnings ignored.  The outcome is reported with pywikibot.output;
        nothing is returned.
        """
        if not self.targetSite.has_api() or self.targetSite.versionnumber() < 16:
            pywikibot.output("The file can't be uploaded : the wiki have a too old configuration")
            return

        params = {
            'action': 'upload',
            'token': self.targetSite.getToken(),
            'comment': pywikibot.translate(pywikibot.getSite(), self.msg),
            'ignorewarnings': 1,
            'filename': self.djvu,
            'file': djvu
        }

        pywikibot.output(u'Uploading file to %s via API....' % self.targetSite)

        data = query.GetData(params, self.targetSite)

        if pywikibot.verbose:
            pywikibot.output("%s" % data)

        if 'error' in data:
            pywikibot.output("%s" % data)
            return

        result = data['upload']
        if result['result'] == 'Success':
            pywikibot.output("Upload successful.")
        else:
            # Don't let an unsuccessful upload pass silently.
            pywikibot.output("%s" % result)

def main():
    """Parse the command line and run the OCR bot on the requested file.

    Recognised arguments are -djvu:NAME, -filelang:LANG (default: eng),
    -dry and -ask; any other argument is reported as unknown.  The help
    text is shown when no -djvu: argument is given.
    """
    djvu = None
    # Default OCR language, as announced in the help text; without it the
    # name was unbound whenever -filelang: was not passed.
    filelang = 'eng'
    # If dry is True, doesn't do any real changes, but only shows
    # what would have been changed.
    dry = False
    ask = False

    # Parse command line arguments
    for arg in pywikibot.handleArgs():
        if arg.startswith("-dry"):
            dry = True
        elif arg.startswith("-ask"):
            ask = True
        elif arg.startswith("-djvu:"):
            djvu = arg[6:]
        elif arg.startswith("-filelang:"):
            filelang = arg[10:]
        else:
            pywikibot.output(u"Unknown argument %s" % arg)

    if djvu:
        bot = DjVuOCRBot(djvu, filelang, ask, dry)
        bot.run()
    else:
        pywikibot.showHelp()

# Script entry point: only runs when the file is executed directly.
if __name__ == "__main__":
    try:
        main()
    finally:
        # Always called, even when main() raises.
        pywikibot.stopme()