Vés al contingut

Usuari:Anskarbot/Codis/Carrega de commons

De Viquitexts
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
#  viquitext-0.1.py
#

u"""
Author: Anskar
Look for me in catalan wikipedia

Script to find new .djvu files on Commons and create their index pages on Wikisource

Global Variables:

Category in which to look for new .djvu files on Commons
  site_comm = pwb.Site("wikimedia", "commons")
  cat_comm = u'Category:DjVu files in Catalan'

File used to save the last timestamp, so that the next run starts listing pages from there
  it is written by calling save_obj(_file, obj)

How to get the Wikisource template
  print some_book_page.get() and manually copy the content to build ws_tmplt

Format to ws_tmplt
  getting commons {{Book}} template and put params as values in ws_template
  dict(re_info) construct the key, value pairs to put in ws_template.format(**dict) important **
  and set the text to save in "{{%s}}" % ws_tmplt
  Catalan example:
    ws_template = u'''
      :MediaWiki:Proofreadpage_index_template -- Name ws_tmplt it exists as [[MediaWiki:Proofreadpage_index_template]] page
      |Títol=''{Title}'' -- Títol is param ws_tmplt,  {Title} would be param commons {{Book}} template
      |Autor={Author}        -- so construct dict in form
      |Editor={Publisher}    -- data[comm_param] = comm_value
      |Lloc={City}           -- data["City"] = "Barcelona"
      |Any={Date}            -- would be "Lloc=Barcelona" in text to save
      |Font=[[:commons:{Image}|Commons]] -- As can see, not always common param is the same wikisource param
      |Imatge=1
      |Pàgines=<pagelist/>
      |Sumari=
      |Nivell= I'''
    comm_data = re.findall(ur".*?[|] ?(\w+) *= ?(.+)\n", comm_page.get()) to get commons data
    data = ws_tmplt.format(**dict(comm_data))
    ws_page.put("{{%s}}" % data)

  !¡!¡!¡!¡!¡!¡!¡!¡!
  !!! IMPORTANT ¡¡¡
  !¡!¡!¡!¡!¡!¡!¡!¡!

* All created pages should be reviewed
* Commons pages that do not use the {{Book}} template are not processed satisfactorily
  * If an {{Information}} template is found, the whole Commons description is put in the title parameter of ws_tmplt
  * For any other kind of data, the page name is saved on the bot page and nothing else is done
* All pwb.input prompts only appear in test mode; set test = False to run without prompts
* There is a limit variable to test with a small number of pages
* If you find a better way to write this code, please share it.

  !¡!¡!¡!¡!¡!¡!¡!¡!
  !!! IMPORTANT ¡¡¡
  !¡!¡!¡!¡!¡!¡!¡!¡!


!¡!¡!¡!¡!¡!¡!¡!¡!¡!¡!¡!¡!¡!!!!!
!!! MORE GREATEST IMPORTANT ¡¡¡
!¡!¡!¡!¡!¡!¡!¡!¡!¡!¡!¡!¡!¡!!!!!

     Enjoy it, be fun ;)

"""
import sys, os

import datetime
import traceback
import codecs
import bz2, pickle
import re
from collections import Counter

import wikipedia as pwb
from query import GetData


def backtrace_error(func):
  """Log the traceback of the exception being handled so the script can go on.

  func: context passed by the caller (a page object or a module name);
        it is accepted but not used.
  """
  output(traceback.format_exc())

def dont_uploaded(page):
  """Append a notice about a Commons page that could not be processed.

  The notice is added at the end of the current text of the module-level
  errors_page, which is then saved back.

  page: the Commons page that had no processable template; it is
        interpolated into the notice text.
  """
  current_text = errors_page.get()
  notice = u"""
== No s'ha pogut carregat la pàgina ==
El llibre [[:commons:%s]] no s'ha pogut carregar automàticament perque la pàgina de commons no té plantilles processables --~~~~
""" % page
  errors_page.put(current_text + notice, comment=u"Error d'Anskarbot")



def save_obj(file, obj, test=False):
  """Pickle obj into a bz2-compressed file under the temp/ directory.

  file: base name; the data goes to temp/.<file>.bin
  obj:  object to store. The original author notes it should be a list or
        a dict rather than a bare string.
  test: when true, write to temp/.<file>_test.bin instead.

  The temp/ directory must already exist.
  """
  suffix = "_test" if test else ""
  f = bz2.BZ2File("temp/.%s%s.bin" % (file, suffix), 'w')
  # try/finally instead of "with": BZ2File is not a context manager on
  # every Python 2 release, and the handle must be closed even if
  # pickling fails.
  try:
    pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)
  finally:
    f.close()


def read_obj(file, test=False):
  """Load and return the object pickled in a bz2 file under temp/.

  file: base name; the data is read from temp/.<file>.bin
  test: when true, read temp/.<file>_test.bin instead.

  Raises IOError if the file does not exist.

  NOTE: unpickling can execute arbitrary code, so this must only be used
  on files this script wrote itself.
  """
  suffix = "_test" if test else ""
  f = bz2.BZ2File("temp/.%s%s.bin" % (file, suffix), 'r')
  # Close the handle even when unpickling fails.
  try:
    return pickle.load(f)
  finally:
    f.close()


def output(msg, bot_op=True):
  """Append msg to the run log and echo it on the console.

  msg:    text to record. Values that are not unicode are converted; byte
          strings that are not plain ASCII are decoded as UTF-8, replacing
          undecodable bytes.
  bot_op: when true, echo the message through pwb.output; otherwise print
          a short "done" line with its first 25 characters.

  The log file is ./temp/<log>.txt, where log is a module-level name.
  """
  # The conversion result must be kept: a non-ASCII byte string (for
  # instance a traceback) would otherwise make the log write fail.
  if not isinstance(msg, unicode):
    try:
      msg = unicode(msg)
    except UnicodeError:
      msg = str(msg).decode("utf-8", "replace")
  try:
    with codecs.open("./temp/%s.txt" % log, "a", "utf-8") as f:
      f.write(u"%s\n" % msg)
  except EnvironmentError:
    # Logging is best effort: an unwritable log file must not stop the
    # bot, and the message is still echoed below.
    pass
  if bot_op:
    pwb.output(msg)
  else:
    print(u"done %s" % msg[:25])


def find_author(comm_author):
  """Return the author value to use for a Commons author string.

  Each word of comm_author (except the connectives "i", "y" and "de") is
  looked up as a substring of every title in the module-level gen_autors
  list. The title matched by the most words is returned, provided at least
  two words matched it. Otherwise a link of the form
  "[[Autor:<comm_author>|<comm_author>]]" is returned.

  In test mode (module-level test flag) the function waits for a key press
  before returning.
  """
  # split() without an argument drops the empty tokens produced by
  # repeated spaces; an empty token is a substring of every title and
  # would count as a match for all authors.
  words = [w for w in comm_author.split() if w not in ("i", "y", "de")]
  counter = Counter(a for a in gen_autors for w in words if w in a)
  print(comm_author)
  print(counter.items())
  # most_common picks the candidate with the highest count; sorting the
  # items would pick the alphabetically first title instead.
  best = counter.most_common(1)
  if best and best[0][1] > 1:
    author_t = best[0][0]
  else:
    author_t = "[[Autor:%s|%s]]" % (comm_author, comm_author)
  print(author_t)
  if test:
    pwb.input("")
  return author_t

def get_data(comm_text, page, j, book=True):
  """Build the wikitext of a Wikisource index page from a Commons description.

  comm_text: wikitext of the Commons file page.
  page:      the Commons page; its unicode() form minus the first ten
             characters is used as the {Image} value.
  j:         position of the page in the page generator, used to pick the
             timestamp offered for saving in test mode.
  book:      True when comm_text uses the {{Book}} template, False when it
             uses {{Information}}.

  Returns the text "{{...}}" obtained by filling ws_tmplt with the
  "|name = value" pairs found by re_info.
  """
  data = dict(re_info.findall(comm_text))
  print(data)
  # Every placeholder of ws_tmplt must have a value on both paths,
  # otherwise format() raises KeyError for a template that omits e.g. City.
  for key in ws_keys:
    data.setdefault(key, "")
  if not book:
    description = data.get("Description", "")
    # Take the text inside a "{{xx|1=...}}" style wrapper when there is
    # one; fall back to the whole description otherwise.
    wrapped = re.findall(u"\\{{2}.*?=(.+)\\}{2}", description)
    data["Title"] = wrapped[0] if wrapped else description
  elif data["Title"]:
    data["Title"] = "[[%s]]" % data["Title"]
  # NOTE(review): presumably strips a leading "[[commons:" from the page's
  # text form; confirm that no trailing "]]" is left in the value.
  data["Image"] = unicode(page)[10:]
  if data["Author"]:
    data["Author"] = find_author(data["Author"])

  text = "{{%s}}" % ws_tmplt.format(**data)
  if test:
    resp = pwb.input("desem timestamp:\n[y|N]\n")
    if resp in ("s", "si", "y"):
      # NOTE(review): pages_comm keeps the starting timestamp at index 0,
      # so timestamp[j] looks like the entry before this page's own one;
      # confirm this is intended.
      save_obj("viquitext_timestamp", [timestamp[j]])
  return text


def _index_text_for(page, j):
  """Return the index wikitext for a Commons page, or None if unsupported.

  None means the description contains neither the {{Book}} nor the
  {{Information}} layout that get_data can process.
  """
  comm_text = page.get()
  print(comm_text)
  if "== {{int:filedesc}} ==\n{{Book" in comm_text:
    return get_data(comm_text, page, j)
  if "{{Information\n|Description=" in comm_text:
    return get_data(comm_text, page, j, book=False)
  return None


def main():
  """Create the missing Wikisource index pages for the Commons files.

  Iterates the module-level gen_djvu generator. For each file whose
  "Llibre:" page does not exist yet, the index text is built and saved;
  in test mode the save happens only after confirmation. Files with an
  unsupported description are reported through dont_uploaded. Files that
  already have an index page are skipped; in test mode their text is still
  built and logged.

  Processing stops once limit pages have been handled, when limit is set.
  An error on one page is logged and the loop continues with the next.

  Returns 0.
  """
  n = 1  # number of the missing page being handled, compared with limit
  for j, page in enumerate(gen_djvu):
    titol_comm = page.title()
    # The first five characters of the Commons title are dropped,
    # presumably the "File:" prefix.
    pag_wt = pwb.Page(site, u"Llibre:%s" % titol_comm[5:])
    if pag_wt.exists():
      print("ja el tenim")
      if test:
        try:
          text_final = _index_text_for(page, j)
          # Nothing to show when no supported template was found.
          if text_final is not None:
            output(text_final)
        except Exception:
          backtrace_error(pag_wt)
      continue
    # "except Exception" lets KeyboardInterrupt stop the bot, while any
    # failure on a single page is only logged.
    try:
      text_final = _index_text_for(page, j)
      if text_final is None:
        dont_uploaded(page)
        continue
      output(text_final)
      if test:
        resp = pwb.input("Desem\n[y/N]")
        save = resp in ("s", "si", "y")
      else:
        save = True
      if save:
        pag_wt.put(text_final, comment="Proves de BOT")
      if limit and n >= limit:
        break
      n += 1
    except Exception:
      backtrace_error(pag_wt)
  return 0


def pages_comm(cats):
  """Yield a page object for every file in the given Commons categories.

  cats: one category title, or a list of titles. Subcategories
        (namespace 14) found on the way are visited too; files
        (namespace 6) are collected.

  The module-level timestamp list is rebuilt: its first item is the start
  timestamp (a fixed default in test mode, otherwise the value read with
  read_obj), followed by one timestamp per collected file. Outside test
  mode the last collected timestamp is saved for the next run.

  This is a generator: nothing is fetched until the first item is
  requested, and all API queries run before the first page is yielded.
  """
  global timestamp
  if isinstance(cats, basestring): # a single category must be listed
    cats = [cats]
  else:
    cats = list(cats) # the list grows below; leave the caller's list alone
  if test:
    timestamp = ["2010-11-07T18:07:23Z"] # default start timestamp; override it for other languages
  else:
    timestamp = read_obj("viquitext_timestamp") # last timestamp used
  pages = []
  visited = set()
  for cat in cats: # subcategories appended below are picked up by this loop
    if cat in visited:
      continue # guards against category cycles and repeated listings
    visited.add(cat)
    print(cat)
    params = {
      "action":"query",
      "list":"categorymembers",
      "cmtitle" : cat,
      "cmlimit" : 500,
      "cmprop" : "title|timestamp",
      "cmsort" : "timestamp",
      "cmstart" : timestamp[0]
    }
    while True:
      data = GetData(params, site=site_comm)
      for x in data["query"]["categorymembers"]:
        if x["ns"] == 14:
          cats.append(x["title"])
        elif x["ns"] == 6:
          pages.append(x["title"])
          timestamp.append(x["timestamp"])
      if "query-continue" not in data:
        break
      # Merge whatever continuation values the server returned: the
      # parameter name is not guaranteed to be "cmcontinue" when sorting
      # by timestamp.
      params.update(data["query-continue"]["categorymembers"])
  if not test:
    save_obj("viquitext_timestamp", [timestamp[-1]])
  for page in pages:
    print(page)
    yield pwb.Page(site_comm, page)


def authors():
  """Return the titles of all pages in the Author namespace of the wiki.

  The namespace number is hard-coded: 106 is the Autor namespace on the
  Catalan Wikisource (102 is Author on the English one). To find the right
  number for another language, put the language code in <lang> and look
  for an existing <Author:page> in the titles returned by
    https://<lang>.wikisource.org/wiki/Special:ApiSandbox#action=query&prop=info&format=json

  The query runs against the module-level site and follows the API
  continuation until every page has been listed.
  """
  pages = []
  params = {
    "action":"query",
    "list":"allpages",
    "aplimit" : 500,
    "apnamespace" : 106
  }
  while True:
    # Every batch is requested from the same wiki with the same call form.
    data = GetData(params, site=site)
    pages.extend(x["title"] for x in data["query"]["allpages"])
    if "query-continue" not in data:
      break
    # Merge the continuation values returned for the allpages list; they
    # use the "ap" prefix, not the categorymembers "cmcontinue" key.
    params.update(data["query-continue"]["allpages"])
  return pages


# Script entry point: sets the module-level configuration read by the
# functions above, then runs the upload loop.
if __name__ == '__main__':
  timestamp = []
  # While test is true the functions above ask for confirmation through
  # pwb.input before saving anything.
  test = True
  # Body of the Wikisource index page; the {Name} placeholders are filled
  # by get_data through str.format.
  ws_tmplt = u""":MediaWiki:Proofreadpage_index_template
|Títol=''{Title}''
|Autor={Author}
|Editor={Publisher}
|Lloc={City}
|Any={Date}
|Font=[[:commons:{Image}|Commons]]
|Imatge=1
|Pàgines=<pagelist/>
|Sumari=
|Nivell= I
"""

  # Placeholder names used in ws_tmplt.
  ws_keys = ("Title", "Author", "Publisher", "City", "Date", "Image")
  # Base name of the log file ./temp/<log>.txt written by output().
  log = "carrega-viquitext"
  # NOTE(review): a different family name is used when /data exists,
  # presumably on the bot's hosting environment; confirm.
  if os.path.exists("/data"):
    site = pwb.Site("ca", "wikisource_ba")
  else:
    site = pwb.Site("ca", "wikisource")
  # Bot page where dont_uploaded() appends its notices.
  page = u"Usuari:Anskarbot/proves"
  errors_page = pwb.Page(site, page)
  # None disables the page limit checked in main().
  limit = None
  try:
    # Mode "w" starts a fresh log for this run, headed by the current time.
    with codecs.open("./temp/%s.txt" % log, "w", "utf-8") as f:
      f.write(unicode(datetime.datetime.now()))
#   re_info = re.compile(r"\{{2}Information.*.\}{2}", re.DOTALL)
    # Captures the (name, value) pairs of "|name = value" template lines.
    re_info = re.compile(ur".*?[|] ?(\w+) *= ?(.+)\n")
    site_comm = pwb.Site("wikimedia", "commons")
    cat_comm = u'Category:DjVu files in Catalan'
    # pages_comm is a generator: no query is sent until main() iterates it.
    gen_djvu = pages_comm(cat_comm)
    gen_autors = authors()
    main()
  except:
    # Top-level boundary: log the traceback instead of letting it escape.
    backtrace_error(__name__)
  finally:
    pwb.stopme()