Utente:Wisbot/coordbot.py

# -*- coding: utf-8 -*-
"""
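This bot adds the {{Coord}} template to pages by importing coordinates from
the corresponding article on the English Wikipedia. For every page it
processes, it follows the interwiki link to en:, looks for a known coordinate
template there and, after confirmation, prepends the converted {{Coord}}
template to the page.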

You can run the bot with the following commandline parameters:

-file        - Work on all pages given in a local text file.
               Will read any [[wiki link]] and use these articles.
               Argument can also be given as "-file:filename".
-cat         - Work on all pages which are in a specific category.
               Argument can also be given as "-cat:categoryname".
-page        - Only edit a specific page.
               Argument can also be given as "-page:pagetitle". You can give this
               parameter multiple times to edit multiple pages.
-ref         - Work on all pages that link to a certain page.
               Argument can also be given as "-ref:referredpagetitle".
-filelinks   - Works on all pages that link to a certain image.
               Argument can also be given as "-filelinks:ImageName".
-links       - Work on all pages that are linked to from a certain page.
               Argument can also be given as "-links:linkingpagetitle".
-start       - Work on all pages in the wiki, starting at a given page. Choose
               "-start:!" to start at the beginning.
               NOTE: You are advised to use -xml instead of this option; this is
               meant for cases where there is no recent XML dump.
-except:XYZ  - Ignore pages which contain XYZ. If the -regex argument is given,
               XYZ will be regarded as a regular expression.
-summary:XYZ - Set the summary message text for the edit to XYZ, bypassing the
               predefined message texts with original and replacements inserted.
-template:XYZ - Accepted on the command line, but currently has no effect.
-namespace:n - Number of namespace to process. The parameter can be used
               multiple times. It works in combination with all other
               parameters, except for the -start parameter. If you e.g. want to
               iterate over all user pages starting at User:M, use
               -start:User:M.
-always      - Don't prompt you for each replacement
other:       - Any other argument is passed to pagegenerators.GeneratorFactory
               to select the pages to work on.

NOTE: Only use either -xml or -file or -page, but don't mix them.

Examples:

"""
#
# [[Utente:Wiso]] 2007
#
# Distributed under the terms of the GPL licence
#

from __future__ import generators
import sys, re
import wikipedia, pagegenerators,catlib, config

__version__='$Id: coordbot.py,v 0.1 $'

# Edit summary (Italian). The %s placeholder is filled with a link to the
# English page the coordinates were taken from.
msg = u'robot Aggiungo [[Template:Coord]] dalla pagina %s'

# Conversion tables: each entry is a (pattern, replacement) pair that rewrites
# one coordinate template found on the English page into a {{Coord}} call.
# The pattern strings are compiled in place by CoordRobot.compileregex().
# Only the 'safe' table is used by CoordRobot.run().
templates = {
    'safe': [
    (r'\{\{ ?[Cc]oord(.*?)\}\}',   r"{{Coord\1}}\n"),
    (r'{{coor[_ ]title[_ ]d\|([0-9\.-]+)\|([NS])\|([0-9\.-]+)\|([EW])\|?([^}]*?)}}', r"{{coord|\1|\2|\3|\4|\5|display=title}}\n"),
    (r'{{coor[_ ]title[_ ]dm\|([0-9\.-]+)\|([0-9\.-]+)\|([NS])\|([0-9\.-]+)\|([0-9\.-]+)\|([EW])\|?([^\}]*?)\}\}', r"{{coord|\1|\2|\3|\4|\5|\6|\7|display=title}}\n"),
    (r'{{coor[_ ]title[_ ]dms\|([0-9\.-]+)\|([0-9\.-]+)\|([0-9\.-]+)\|([NS])\|([0-9\.-]+)\|([0-9\.-]+)\|([0-9\.-]+)\|([EW])\|?([^}]*?)}}', r"{{coord|\1|\2|\3|\4|\5|\6|\7|\8|\9|display=title}}\n"),
    # The longitude group needs '+': with a single-character group a
    # multi-digit longitude followed by extra parameters never matched.
    (r'\{\{ ?[Cc]oor[ _]d\|([0-9\.+-]+)\|([0-9\.+-]+)(\|?[^\|]*)\}\}',    r"{{Coord|\1|\2\3|display=title}}\n"),
    ],
    'notsafe': [
    # 'coord?' accepts both "coor dm" (the spelling used by every sibling
    # pattern) and the "coord dm" spelling this entry originally required.
    (r'\{\{ ?[Cc]oord?[ _]dm\|([0-9]+)\|([0-9\.]+)\|([NS])\|([0-9\.]+)\|([0-9\.]+)\|([EW])(\|?[^\|]*)\}\}',       r"{{Coord|\1|\2|\3|\4|\5|\6\7|display=title}}\n"),
    (r'\{\{ ?[Cc]oor[ _]dms\|([0-9]+)\|([0-9\.]+)\|([0-9\.]+)\|([NS])\|([0-9\.]+)\|([0-9\.]+)\|([0-9\.]+)\|([EW])(\|?[^\|]*)\}\}',       r"{{Coord|\1|\2|\3|\4|\5|\6|\7|\8\9|display=title}}\n"),
    (r'\{\{.*latd *= *([0-9\.]+).*longd ?= ?([0-9\.]+)',    r"{{Coord|\1|\2|display=title}}\n")
    ]
    }

# Pages whose text matches any of these patterns are skipped (see
# CoordRobot.checkExceptions). The strings are compiled in place by
# CoordRobot.compileregex().
#
# The accented 'a' in "Citta"/"Municipalita" is written as the regex escape
# \xe0 rather than as a literal character: inside a Python 2 byte-string
# pattern the UTF-8 bytes of the literal would be compared one by one against
# unicode page text and could never match.
exceptions = [ r'\{\{ *?Geobox',
               r'\{\{ ?[Cc]oord',
               r'\{\{ ?Template:[Cc]oord',
               r'\{\{ ?[mM]ontagna',
               r'\{\{ ?(Template:)?[cC]omune',
               r'\{\{ ?[cC]itt\xe0',
               r'\{\{ ?[mM]unicipalit\xe0',
               r'\{\{ ?[aA]eroporto\|',
               r'\{\{ ?[Mm]unicipi',
               r'\{\{ ?[iI]nfobox[ _]Azienda\|',
               r'\{\{ ?[Ss]\|aziende',
               r'\{\{ ?[Dd]isambigua\|',
               r'\{\{ ?[Ff]razione',
               r'\{\{ ?[Ss]quadra',
               r'\{\{ ?[Pp]asso ?(\||\n)',
               r'\{\{ ?[Bb]undesland[ _]tedesco'
               ]
    

class CoordRobot:
    """
    A bot that imports coordinates from the English Wikipedia.

    For every page yielded by the generator, the bot looks for an interwiki
    link to the 'en' site, searches the linked page for one of the 'safe'
    coordinate templates and, after interactive confirmation, saves the local
    page with the converted {{Coord}} template prepended.
    """
    def __init__(self, generator, autoTitle = False, autoText = False):
        """
        generator -- iterable yielding the pages to work on.
        autoTitle, autoText -- accepted for interface compatibility; they are
                               not used by the bot.
        """
        self.generator = generator
        self.compileregex()

    def compileregex(self):
        """
        Compile, in place, the module-level 'templates' and 'exceptions'
        tables.

        Entries that are already compiled are left alone, so the method may
        run more than once (e.g. when a second CoordRobot is created):
        re.compile() refuses a compiled pattern combined with a flags
        argument.
        """
        for pairs in templates.values():
            for i, (old, new) in enumerate(pairs):
                # Compiled pattern objects have a search() method, plain
                # strings do not.
                if not hasattr(old, 'search'):
                    pairs[i] = re.compile(old, re.UNICODE), new
        for i, pattern in enumerate(exceptions):
            if not hasattr(pattern, 'search'):
                exceptions[i] = re.compile(pattern)

    def checkExceptions(self, text):
        """
        Return the first piece of text matched by one of the 'exceptions'
        patterns, or None if no pattern matches.
        """
        for exception in exceptions:
            hit = exception.search(text)
            if hit:
                return hit.group(0)
        return None

    def change(self, page, new_text):
        """
        Save new_text as the new content of page.

        Edit conflicts and spam-blacklist rejections are reported and
        swallowed so that the run can go on with the next page.
        """
        try:
            page.put(new_text)
        except wikipedia.EditConflict:
            wikipedia.output(u'Skipping %s because of edit conflict' % (page.title()))
        except wikipedia.SpamfilterError as url:
            wikipedia.output(u'Cannot change %s because of blacklist entry %s' % (page.title(), url))

    def _findSitePage(self, pages, site):
        """Return the first page in pages that belongs to site, else None."""
        for candidate in pages:
            if candidate.site() == site:
                return candidate
        return None

    def run(self):
        """
        Process every page of the generator.

        A page is skipped when it is locked, missing or a redirect, when it
        has no interwiki link to the 'en' site, or when its text matches one
        of the 'exceptions' patterns. Otherwise each 'safe' template found on
        the English page is converted and offered to the operator; the first
        accepted proposal is saved and ends the work on that page.
        """
        sen = wikipedia.Site('en')
        for page in self.generator:
            try:
                if not page.canBeEdited():
                    wikipedia.output(u'Skipping locked page %s' % page.title())
                    continue
                interwiki_list = page.interwiki()
            except wikipedia.NoPage:
                wikipedia.output(u'Page %s not found' % page.title())
                continue
            except wikipedia.IsRedirectPage:
                wikipedia.output(u'Page %s is a redirect, skip' % page.title())
                continue

            page_en = self._findSitePage(interwiki_list, sen)
            if page_en is None:
                continue
            wikipedia.output(page.title())
            wikipedia.output(u'en: %s' % page_en.title())

            text_it = page.get()
            # Skip all pages that contain certain texts.
            match = self.checkExceptions(text_it)
            if match:
                colors = [None] * 9 + [None] * len(page.title()) + [None] * 21 + [10] * len(match)
                wikipedia.output(u'Skipping %s because it contains %s' % (page.title(), match), colors = colors)
                continue

            try:
                text_en = page_en.get()
            except wikipedia.NoPage:
                wikipedia.output(u'Page %s not found' % page_en.title())
                continue
            except wikipedia.IsRedirectPage:
                wikipedia.output(u'Page %s is a redirect, follow redirect' % page_en.title())
                # NOTE(review): this reads the redirect page itself; confirm
                # whether the redirect target was meant instead.
                text_en = page_en.get(get_redirect=True)

            for old, new in templates['safe']:
                match = old.search(text_en)
                if not match:
                    continue
                found = match.group(0)
                colors = [None] * 5 + [13] * len(page.title()) + [None] * 4
                wikipedia.output(u'\n>>> %s <<<' % page.title(), colors = colors)
                wikipedia.output(u'Trovato %s: ' % found)
                template_new = old.sub(new, found)
                wikipedia.output(template_new)
                new_text_it = template_new + text_it

                choice = wikipedia.inputChoice(u'Do you want to accept these changes?', ['Yes', 'No'], ['y', 'N'], 'N')
                if choice in ['y', 'Y']:
                    # This replaces whatever edit summary was set before.
                    wikipedia.setAction(msg % page_en.aslink())
                    self.change(page, new_text_it)
                    # The page has been saved: proposing further templates
                    # would build a second edit on the stale text_it.
                    break
    

        
def main():
    """
    Parse the command line, build the page generator and run the bot.

    Arguments handled here:
      -autotitle, -autotext  forwarded to CoordRobot
      -page[:title]          work on the given page (asks for the title when
                             none is given); may be repeated
      -except:XYZ            additionally skip pages containing the text XYZ
      -template:XYZ          accepted, currently without effect
      -namespace:n           restrict the work to namespace n; may be repeated
      -summary:XYZ           set the edit summary to XYZ
    Any other argument is handed to pagegenerators.GeneratorFactory. When no
    page source results, the help text is shown and the script exits.
    """
    gen = None
    # Which namespaces should be processed?
    # Default to [], which means all namespaces will be processed.
    namespaces = []
    PageTitles = []
    autoText = False
    autoTitle = False
    # This factory is responsible for processing command line arguments
    # that are also used by other scripts and that determine on which pages
    # to work on.
    genFactory = pagegenerators.GeneratorFactory()
    # Load default summary message.
    # BUG WARNING: This is probably incompatible with the -lang parameter.
    wikipedia.setAction(msg)

    # Read commandline parameters.
    for arg in wikipedia.handleArgs():
        if arg == '-autotitle':
            autoTitle = True
        elif arg == '-autotext':
            autoText = True
        elif arg.startswith('-page'):
            if len(arg) == 5:
                PageTitles.append(wikipedia.input(u'Which page do you want to change?'))
            else:
                PageTitles.append(arg[6:])
        elif arg.startswith('-except:'):
            # Extend the module-level table that CoordRobot compiles and
            # checks; a local list here would never reach the bot. The text
            # is escaped so that it is matched literally.
            exceptions.append(re.escape(arg[8:]))
        elif arg.startswith('-template:'):
            # Recognised so that it is not passed on to the generator
            # factory; the value is not used.
            pass
        elif arg.startswith('-namespace:'):
            namespaces.append(int(arg[11:]))
        elif arg.startswith('-summary:'):
            wikipedia.setAction(arg[9:])
        else:
            generator = genFactory.handleArg(arg)
            if generator:
                gen = generator

    # Explicit -page arguments take precedence over any other page source.
    if PageTitles:
        pages = [wikipedia.Page(wikipedia.getSite(), PageTitle) for PageTitle in PageTitles]
        gen = iter(pages)
    if not gen:
        # syntax error, show help text from the top of this file
        wikipedia.showHelp('coordbot')
        wikipedia.stopme()
        sys.exit()
    if namespaces:
        gen = pagegenerators.NamespaceFilterPageGenerator(gen, namespaces)
    # Redirects are not filtered out here; CoordRobot.run() reports and skips
    # them itself.
    preloadingGen = pagegenerators.PreloadingGenerator(gen, pageNumber = 20)
    bot = CoordRobot(preloadingGen, autoTitle, autoText)
    bot.run()
                

# Script entry point: run the bot and always call wikipedia.stopme()
# afterwards, whether main() returns normally or raises.
if __name__ == "__main__":
    try:
        main()
    finally:
        wikipedia.stopme()