openclonk/docs/tools/xml2po.py

775 lines
25 KiB
Python

#!/usr/bin/env python
# Copyright (c) 2004 Danilo Segan <danilo@kvota.net>.
#
# This file is part of xml2po.
#
# xml2po is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# xml2po is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with xml2po; if not, write to the Free Software Foundation, Inc.,
# 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
# xml2po -- translate XML documents
VERSION = "0.5.2"
# Versioning system (I have used this for a long time, so let's explain it to
# those Linux-versioning-scheme addicts):
# 1.0.* are unstable, development versions
# 1.1 will be first stable release (release 1), and 1.1.* bugfix releases
# 2.0.* will be unstable-feature-development stage (milestone 1)
# 2.1.* unstable development betas (milestone 2)
# 2.2 second stable release (release 2), and 2.2.* bugfix releases
# ...
#
import sys
import libxml2
import gettext
import os
import re
class MessageOutput:
    """Collects extracted messages and writes them out in PO format.

    Messages are stored already escaped for PO output.  When constructed
    with a true with_translations, every message is kept (duplicates
    included) so that they can be paired positionally with the strings
    recorded after translationsFollow() has been called.
    """

    def __init__(self, with_translations = 0):
        self.messages = []      # escaped msgids, in output order
        self.comments = {}      # msgid -> translator comment
        self.linenos = {}       # msgid -> list of (filename, tag, lineno)
        self.nowrap = {}        # msgid -> 1 when whitespace must be preserved
        # Always defined, so that translationsFollow()/outputMessage() cannot
        # hit an AttributeError on an instance built without translations.
        self.translations = []
        # Set by setFilename(); None until then, so that outputMessage() can
        # be called before any file name is known.
        self.filename = None
        self.do_translations = with_translations
        self.output_msgstr = 0 # this is msgid mode for outputMessage; 1 is for msgstr mode

    def translationsFollow(self):
        """Indicate that what follows are translations."""
        self.output_msgstr = 1

    def setFilename(self, filename):
        """Set the file name recorded in the references of later messages."""
        self.filename = filename

    def outputMessage(self, text, lineno = 0, comment = None, spacepreserve = 0, tag = None):
        """Adds a string to the list of messages.

        text is normalized (unless spacepreserve is set) and PO-escaped
        first; blank text is ignored.  In msgstr mode the string is only
        appended to the translations.  Otherwise a (filename, tag, lineno)
        reference is recorded, and the message itself is appended when it
        is new or when translations are being collected.
        """
        if text.strip() == '':
            return
        t = escapePoString(normalizeString(text, not spacepreserve))
        if self.output_msgstr:
            self.translations.append(t)
            return

        # linenos has exactly one key per distinct message recorded here, so
        # the dict lookup replaces a linear scan of self.messages.
        is_new = self.do_translations or (t not in self.linenos)
        if is_new:
            self.messages.append(t)
            if spacepreserve:
                self.nowrap[t] = 1
        self.linenos.setdefault(t, []).append((self.filename, tag, lineno))
        # Comments are only kept when no translations are being collected.
        if (not self.do_translations) and comment and t not in self.comments:
            self.comments[t] = comment

    def outputHeader(self, out):
        """Write the PO header entry, stamped with the current local time, to out."""
        import time
        out.write("""msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\\n"
"POT-Creation-Date: %s\\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
"Language-Team: LANGUAGE <LL@li.org>\\n"
"MIME-Version: 1.0\\n"
"Content-Type: text/plain; charset=UTF-8\\n"
"Content-Transfer-Encoding: 8bit\\n"
""" % (time.strftime("%Y-%m-%d %H:%M%z")))

    def outputAll(self, out):
        """Write the header and every collected message to out.

        Each entry consists of the optional "#." comment, the "#:"
        references, an optional "#, no-wrap" flag, the msgid and a msgstr.
        When translations were collected they are consumed from the front
        of self.translations, one per message; otherwise msgstr is empty.
        """
        self.outputHeader(out)
        for k in self.messages:
            if k in self.comments:
                out.write("#. %s\n" % (self.comments[k].replace("\n","\n#. ")))
            references = "".join(["%s:%d(%s) " % (fname, lineno, tag)
                                  for (fname, tag, lineno) in self.linenos[k]])
            out.write("#: %s\n" % (references))
            if self.nowrap.get(k):
                out.write("#, no-wrap\n")
            out.write("msgid \"%s\"\n" % (k))
            translation = ""
            if self.do_translations and self.translations:
                translation = self.translations.pop(0)
            out.write("msgstr \"%s\"\n\n" % (translation))
def normalizeNode(node):
    """Collapse whitespace in node and its descendants, in place.

    Nothing is done for a false node or for a node that
    isSpacePreserveNode() reports as space-preserving.  A blank text node
    gets empty content; any other text node has each run of whitespace
    replaced by one space.  Element nodes are processed recursively.
    """
    if not node or isSpacePreserveNode(node):
        return
    if node.isText():
        if node.isBlankNode():
            node.setContent('')
        else:
            # Raw string: '\s' in a plain literal is an invalid escape sequence.
            node.setContent(re.sub(r'\s+', ' ', node.content))
    elif node.children and node.type == 'element':
        child = node.children
        while child:
            normalizeNode(child)
            child = child.next
def normalizeString(text, ignorewhitespace = 1):
    """Normalizes string to be used as key for gettext lookup.

    Removes all unnecessary whitespace.  With a false ignorewhitespace the
    text is returned untouched.  Otherwise text is wrapped in a <norm>
    element (preceded by the internal DTD subset of the global document
    when that is available), parsed, normalized with normalizeNode() and
    serialized again, with one leading and one trailing space removed.
    If the wrapped text cannot be parsed, an error is written to stderr
    and text is returned unchanged.
    """
    if not ignorewhitespace:
        return text

    wrapped = '<norm>%s</norm>' % text
    try:
        # Lets add document DTD so entities are resolved
        tmp = doc.intSubset().serialize('utf-8') + wrapped
    except Exception:
        # No document or no internal subset: parse the bare snippet.
        tmp = wrapped

    try:
        ctxt = libxml2.createDocParserCtxt(tmp)
        if expand_entities:
            ctxt.replaceEntities(1)
        ctxt.parseDocument()
        tree = ctxt.doc()
        newnode = tree.getRootElement()
    except Exception:
        # Exception, not a bare except: Ctrl-C and sys.exit() must get through.
        sys.stderr.write("""Error while normalizing string as XML:\n"%s"\n""" % (text) + "\n")
        return text

    try:
        normalizeNode(newnode)
        pieces = []
        child = newnode.children
        while child:
            pieces.append(child.serialize('utf-8'))
            child = child.next
    finally:
        # libxml2 documents are not garbage collected; release the scratch
        # document once everything needed has been serialized to strings.
        tree.freeDoc()

    result = ''.join(pieces)
    result = re.sub('^ ','', result)
    result = re.sub(' $','', result)
    return result
def stringForEntity(node):
    """Replaces entities in the node.

    The serialized node is wrapped in a <norm> element, preceded by the
    internal DTD subset of the node's document when one can be obtained,
    and parsed again (with entity replacement when the global
    expand_entities is set).  Returns the serialized children of the
    re-parsed <norm> element.
    """
    text = node.serialize('utf-8')
    wrapped = '<norm>%s</norm>' % text
    try:
        # Lets add document DTD so entities are resolved
        tmp = node.doc.intSubset().serialize('utf-8') + wrapped
        has_dtd = 1
    except Exception:
        tmp = wrapped
        has_dtd = 0

    ctxt = libxml2.createDocParserCtxt(tmp)
    if expand_entities:
        ctxt.replaceEntities(1)
    ctxt.parseDocument()
    tree = ctxt.doc()
    try:
        # With a DTD in front, the <norm> element is the second child.
        if has_dtd:
            newnode = tree.children.next
        else:
            newnode = tree.children

        pieces = []
        child = newnode.children
        while child:
            pieces.append(child.serialize('utf-8'))
            child = child.next
    finally:
        # libxml2 documents are not garbage collected; release the scratch
        # document once its content has been serialized to strings.
        tree.freeDoc()

    return ''.join(pieces)
def escapePoString(text):
    """Escape backslash, double quote, newline and tab for a PO string."""
    escaped = text
    # Backslash goes first so the backslashes added later are not doubled.
    for raw, quoted in (('\\', '\\\\'), ('"', '\\"'), ("\n", '\\n'), ("\t", '\\t')):
        escaped = escaped.replace(raw, quoted)
    return escaped
def unEscapePoString(text):
    """Undo the escaping applied by escapePoString().

    Handles the sequences backslash-backslash, backslash-quote,
    backslash-n and backslash-t.  The text is scanned in a single pass, so
    an escaped backslash followed by "n" or "t" stays a backslash plus
    letter and does not turn into a newline or tab.
    """
    unescapes = {'\\': '\\', '"': '"', 'n': '\n', 't': '\t'}
    return re.sub(r'\\([\\"nt])', lambda m: unescapes[m.group(1)], text)
def getTranslation(text, spacepreserve = 0):
    """Look up the translation of an XML snippet in the loaded catalog.

    text is normalized first (whitespace is kept when spacepreserve is
    set).  Blank text, or any text when the global catalog gt is false,
    is returned in its normalized form; otherwise the result of
    gt.ugettext() on the UTF-8 decoded text is returned.
    """
    normalized = normalizeString(text, not spacepreserve)
    if normalized.strip() != '' and gt:
        return gt.ugettext(normalized.decode('utf-8'))
    return normalized
def startTagForNode(node):
    """Return the node name followed by its serialized attributes.

    Returns 0 for a false node.  Only properties whose type is
    'attribute' contribute to the result.
    """
    if not node:
        return 0

    serialized = []
    if node.properties:
        # FIXME: This part sucks
        serialized = [p.serialize('utf-8') for p in node.properties
                      if p.type == 'attribute']
    return node.name + ''.join(serialized)
def endTagForNode(node):
    """Return the name used in the closing tag of node, or 0 for a false node."""
    if node:
        return node.name
    return 0
def isFinalNode(node):
    """Tell whether node is a "final" node.

    In automatic mode the node is final when autoNodeIsFinal() accepts it
    and rejects every one of its ancestors.  Otherwise an element listed
    in ultimate_tags is final, and so is any node that has children all of
    which are final themselves.
    """
    if automatic:
        verdict = autoNodeIsFinal(node)
        # An ancestor that is itself auto-final takes precedence over this node.
        ancestor = node.parent
        while ancestor and verdict:
            verdict = not autoNodeIsFinal(ancestor)
            ancestor = ancestor.parent
        return verdict

    if node.type == 'element' and node.name in ultimate_tags:
        return 1
    if not node.children:
        return 0

    kid = node.children
    while kid:
        if not isFinalNode(kid):
            return 0
        kid = kid.next
    return 1
def isSwallowNode(node):
    """Return 1 when node matches an entry of swallow_tags, else 0.

    An entry matches either the node name alone or "parentname/nodename".
    A node without a name never matches.
    """
    name = node.name
    if not name:
        return 0
    if name in swallow_tags:
        return 1
    parent = node.parent
    if parent and parent.name and ((parent.name + '/' + name) in swallow_tags):
        return 1
    return 0
def ignoreNode(node):
    """Return 1 when node must be left out of the serialized output.

    DTD, comment and CDATA nodes are ignored.  Outside automatic mode,
    nodes named in ignored_tags are ignored as well, unless the node is
    final or a swallow node.
    """
    skipped_types = ('dtd', 'comment', 'cdata')
    if automatic:
        if node.type in skipped_types:
            return 1
        return 0

    if isFinalNode(node) or isSwallowNode(node):
        return 0
    if node.name in ignored_tags or node.type in skipped_types:
        return 1
    return 0
def isSpacePreserveNode(node):
    """Return 1 when whitespace inside node has to be preserved.

    That is the case when node.getSpacePreserve() returns 1, or when the
    active XML mode lists the node name among its space-preserving tags.
    """
    if node.getSpacePreserve() == 1:
        return 1
    if CurrentXmlMode and (node.name in CurrentXmlMode.getSpacePreserveTags()):
        return 1
    return 0
def getCommentForNode(node):
    """Return the stripped text of the comment just before node, if any.

    Blank text nodes between the comment and node are skipped; any other
    sibling in between (or no comment at all) gives None.
    """
    sibling = node.prev
    while sibling:
        if not (sibling.type == 'text' and sibling.content.strip() == ''):
            break
        sibling = sibling.prev

    if sibling and sibling.type == 'comment':
        return sibling.content.strip()
    return None
def replaceNodeContentsWithText(node,text):
    """Replaces all subnodes of a node with contents of text treated as XML.

    For a node with children, text is wrapped in an element named like
    node, parsed as a separate document, and the children of the parsed
    root element take the place of the existing children.  If parsing
    fails an error is printed to stderr and node is left untouched; if the
    parsed element has no children node is left untouched as well.
    For a node without children, text simply becomes its content.
    """
    if node.children:
        starttag = node.name #startTagForNode(node)
        endtag = endTagForNode(node)
        try:
            # Lets add document DTD so entities are resolved
            dtd = doc.intSubset()
            tmp = ''
            if expand_entities: # FIXME: we get a "Segmentation fault" in libxml2.parseMemory() when we include DTD otherwise
                tmp = dtd.serialize('utf-8')
            tmp = tmp + '<%s>%s</%s>' % (starttag, text, endtag)
        except:
            # Any failure above (e.g. no usable internal subset): fall back
            # to the bare wrapped text.
            tmp = '<%s>%s</%s>' % (starttag, text, endtag)
        try:
            # NOTE(review): the encode() calls suggest text is expected to be
            # a unicode string here -- confirm against the callers.
            ctxt = libxml2.createDocParserCtxt(tmp.encode('utf-8'))
            # Entities are deliberately left unexpanded in the translation.
            ctxt.replaceEntities(0)
            ctxt.parseDocument()
            newnode = ctxt.doc()
        except:
            print >> sys.stderr, """Error while parsing translation as XML:\n"%s"\n""" % (text.encode('utf-8'))
            return
        newelem = newnode.getRootElement()
        if newelem and newelem.children:
            # Detach every existing child; the next sibling is fetched
            # before unlinking the current one.
            free = node.children
            while free:
                next = free.next
                free.unlinkNode()
                free = next
            # The parsed children are attached to node; the document they
            # were parsed into (newnode) is not freed here.
            node.addChildList(newelem.children)
        else:
            # In practice, this happens with tags such as "<para> </para>" (only whitespace in between)
            pass
    else:
        node.setContent(text)
def autoNodeIsFinal(node):
    """Return 1 when node carries non-blank text, otherwise 0.

    A node qualifies when it is a non-blank text node itself, or when one
    of its direct children is a text node with non-blank content.
    """
    if node.isText() and node.content.strip() != '':
        return 1

    current = node.children
    while current:
        if current.type == 'text' and current.content.strip() != '':
            return 1
        current = current.next
    return 0
def worthOutputting(node):
    """Returns 1 if node is "worth outputting", otherwise 0.

    A node is not worth outputting when it is not itself a final,
    non-ignored node and its nearest final ancestor is a non-ignored node
    that is worth outputting.  In every other case the answer is whatever
    autoNodeIsFinal() says about the node.
    """
    if not (isFinalNode(node) and node.name not in ignored_tags):
        ancestor = node.parent
        while ancestor:
            if isFinalNode(ancestor):
                # Only the nearest final ancestor is consulted.
                if (ancestor.name not in ignored_tags) and worthOutputting(ancestor):
                    return 0
                break
            ancestor = ancestor.parent
    return autoNodeIsFinal(node)
def processElementTag(node, replacements, restart = 0):
    """Process node with node.type == 'element'.

    Builds the message text for the element, where every child that is
    final or worth outputting on its own is represented by a numbered
    <placeholder-N/> tag and processed as a separate message.

    node is the element to process.  replacements is the list that
    collects the result tuples of placeholder children; it is used (and
    extended) only when restart is false, otherwise a fresh list is
    started.  In 'merge' mode the translated text replaces the contents of
    node; otherwise the text is handed to msg.outputMessage() unless node
    is a swallow node.

    Returns the tuple (starttag, outtxt, endtag, translation).
    Raises Exception when node is not an element.
    """
    if node.type == 'element':
        outtxt = ''
        if restart:
            myrepl = []
        else:
            myrepl = replacements
        submsgs = []  # not used anywhere else in this function
        child = node.children
        while child:
            if (isFinalNode(child)) or (child.type == 'element' and worthOutputting(child)):
                # The child becomes a message of its own; only a placeholder
                # numbered by its (1-based) position in myrepl is emitted here.
                myrepl.append(processElementTag(child, myrepl, 1))
                outtxt += '<placeholder-%d/>' % (len(myrepl))
            else:
                if child.type == 'element':
                    # Inline element: stays part of this message and shares
                    # this message's replacement list.
                    (starttag, content, endtag, translation) = processElementTag(child, myrepl, 0)
                    outtxt += '<%s>%s</%s>' % (starttag, content, endtag)
                else:
                    outtxt += doSerialize(child)
            child = child.next
        if mode == 'merge':
            if isSwallowNode(node):
                # Swallow nodes are not looked up in the catalog.
                translation = outtxt.decode('utf-8')
            else:
                translation = getTranslation(outtxt, isSpacePreserveNode(node))
        else:
            translation = outtxt
        starttag = startTagForNode(node)
        endtag = endTagForNode(node)
        if restart or worthOutputting(node):
            # Substitute each placeholder in the translation by the
            # translated text of the child it stands for.
            i = 0
            while i < len(myrepl):
                replacement = '<%s>%s</%s>' % (myrepl[i][0], myrepl[i][3], myrepl[i][2])
                i += 1
                translation = translation.replace('<placeholder-%d/>' % (i), replacement)
            if worthOutputting(node):
                if mode == 'merge':
                    replaceNodeContentsWithText(node, translation)
                elif not isSwallowNode(node):
                    msg.outputMessage(outtxt, node.lineNo(), getCommentForNode(node), isSpacePreserveNode(node), tag = node.name)
        return (starttag, outtxt, endtag, translation)
    else:
        raise Exception("You must pass node with node.type=='element'.")
def isExternalGeneralParsedEntity(node):
    """Return 1 when node is a reference to an external general parsed entity.

    The answer is taken from the debug dump of the node, which is written
    to the scratch file ".xml2po-entitychecking" in the current directory
    and read back.  Returns 0 for a false node, for anything but an
    'entity_ref' node, and whenever the scratch file cannot be used.
    """
    if not (node and node.type == 'entity_ref'):
        return 0

    scratch = ".xml2po-entitychecking"
    try:
        # it would be nice if debugDumpNode could use StringIO, but it apparently cannot
        tmp = open(scratch, "w+")
        try:
            node.debugDumpNode(tmp, 0)
            tmp.seek(0)
            tmpstr = tmp.read()
        finally:
            # Close and remove the scratch file even when dumping failed, so
            # that neither an open handle nor a stray file is left behind.
            tmp.close()
            os.remove(scratch)
    except Exception:
        # We fail silently, and replace all entities if we cannot
        # write .xml2po-entitychecking
        # !!! This is not very nice thing to do, but I don't know if
        # raising an exception is any better
        return 0

    if tmpstr.find('EXTERNAL_GENERAL_PARSED_ENTITY') != -1:
        return 1
    return 0
def doSerialize(node):
    """Serializes a node and its children, emitting PO messages along the way.

    Ignored nodes give an empty string and childless nodes are serialized
    as they are.  Entity references are expanded through stringForEntity()
    unless they refer to an external general parsed entity.  Elements are
    handed to processElementTag(), which is where messages get emitted.
    For any other node type the serializations of the children are
    concatenated.
    """
    if ignoreNode(node):
        return ''
    if not node.children:
        return node.serialize("utf-8")

    kind = node.type
    if kind == 'entity_ref':
        if isExternalGeneralParsedEntity(node):
            return node.serialize('utf-8')
        return stringForEntity(node)
    if kind in ('entity_decl', 'text'):
        return node.serialize('utf-8')
    if kind == 'element':
        (starttag, content, endtag, translation) = processElementTag(node, [], 1)
        return '<%s>%s</%s>' % (starttag, content, endtag)

    pieces = []
    child = node.children
    while child:
        pieces.append(doSerialize(child))
        child = child.next
    return ''.join(pieces)
def read_finaltags(filelist):
    """Return the list of "final" tag names.

    The active XML mode supplies the list when there is one; otherwise a
    built-in default list is returned.  filelist is not used.
    """
    if not CurrentXmlMode:
        return ['para', 'title', 'releaseinfo', 'revnumber',
                'date', 'itemizedlist', 'orderedlist',
                'variablelist', 'varlistentry', 'term' ]
    return CurrentXmlMode.getFinalTags()
def read_ignoredtags(filelist):
    """Return the list of ignored tag names.

    The active XML mode supplies the list when there is one; otherwise a
    built-in default list is returned.  filelist is not used.
    """
    if not CurrentXmlMode:
        return ['itemizedlist', 'orderedlist', 'variablelist',
                'varlistentry' ]
    return CurrentXmlMode.getIgnoredTags()
def read_swallowtags():
    """Return the list of swallow tags of the active XML mode.

    An empty list is returned when no mode is active or when the active
    mode does not provide getSwallowTags().
    """
    # getattr with a default: a plain attribute access raises AttributeError
    # for modes that have no getSwallowTags() method at all.
    getter = getattr(CurrentXmlMode, 'getSwallowTags', None)
    if CurrentXmlMode and getter:
        return getter()
    return [ ]
def tryToUpdate(allargs, lang):
    """Update the PO file lang by re-running this program through msgmerge.

    allargs is the complete argument vector (program name first).  The
    supported options are copied into a new command line, whose output is
    piped into msgmerge; the merged result then replaces the PO file and
    is checked with msgfmt.  This function never returns: every path ends
    in sys.exit().

    Exit codes: 0 success or --version, 8 for -o, 9 for --help or any
    other unsupported option, 10 when the merge pipeline fails, 11 when
    the merged file cannot be moved into place.
    """
    # Remove "-u" and "--update-translation"
    command = allargs[0]
    args = allargs[1:]
    # Same option set as the main parser (the original spec here treated -m
    # as a flag and did not know --mode, --expand-all-entities, -r or -l).
    opts, args = getopt.getopt(args, 'avhkem:t:o:p:u:r:l:',
                               ['automatic-tags','version', 'help', 'keep-entities', 'expand-all-entities', 'mode=', 'translation=',
                                'output=', 'po-file=', 'update-translation=', 'reuse=', 'language=' ])
    for opt, arg in opts:
        if opt in ('-a', '--automatic-tags'):
            command += " -a"
        elif opt in ('-k', '--keep-entities'):
            command += " -k"
        elif opt in ('-e', '--expand-all-entities'):
            command += " -e"
        elif opt in ('-m', '--mode'):
            command += " -m %s" % arg
        elif opt in ('-o', '--output'):
            sys.stderr.write("Error: Option '-o' is not yet supported when updating translations directly.\n")
            sys.exit(8)
        elif opt in ('-v', '--version'):
            sys.stdout.write("%s\n" % VERSION)
            sys.exit(0)
        elif opt in ('-h', '--help'):
            sys.stderr.write("Error: If you want help, please use `%s --help' without '-u' option.\n" % (allargs[0]))
            sys.exit(9)
        elif opt in ('-u', '--update-translation'):
            pass
        else:
            sys.stderr.write("Error: Option `%s' is not supported with option `-u'.\n" % (opt))
            sys.exit(9)

    # Remaining arguments are the XML files (appended last to first).
    while args:
        command += " " + args.pop()

    pofile = lang

    # NOTE(review): the command lines below are handed to the shell without
    # any quoting, so file names containing spaces or shell metacharacters
    # will not work.
    sys.stderr.write("Merging translations for %s: " % (lang))
    result = os.system("%s | msgmerge -o .tmp.%s.po %s -" % (command, lang, pofile))
    if result:
        sys.exit(10)

    result = os.system("mv .tmp.%s.po %s" % (lang, pofile))
    if result:
        sys.stderr.write("Error: cannot rename file.\n")
        sys.exit(11)

    os.system("msgfmt -cv -o %s %s" % (NULL_STRING, pofile))
    sys.exit(0)
def load_mode(modename):
    """Import the mode module modename and return its "<modename>XmlMode" attribute.

    submodes_path is appended to sys.path before the import.
    """
    sys.path.append(submodes_path)
    return getattr(__import__(modename), '%sXmlMode' % modename)
def xml_error_handler(arg, ctxt):
    """Error handler that does nothing with the error it is given."""
    return None
#libxml2.registerErrorHandler(xml_error_handler, None)
# Main program start
# This file only works as a script: importing it raises NotImplementedError.
if __name__ != '__main__': raise NotImplementedError
# Parameters
# These module-level names are read as globals by the functions above and
# are overridden by the command-line options parsed further down.
submodes_path = "/opt/gnome/share/xml2po"  # directory added to sys.path by load_mode()
default_mode = 'docbook'                   # name of the XML mode module to load (-m)
filename = ''
origxml = ''                               # translated XML file to reuse (-r)
mofile = ''                                # MO file holding the translations (-t, or generated for -p)
ultimate = [ ]                             # passed to read_finaltags(), which does not use it
ignored = [ ]                              # passed to read_ignoredtags(), which does not use it
filenames = [ ]                            # XML input files
translationlanguage = ''                   # language of the translation (-l, or derived from the -t/-p file name)
mode = 'pot' # 'pot' or 'merge'
automatic = 0                              # 1: decide automatically which tags are final (-a)
expand_entities = 1                        # 0: keep entities unexpanded (-k)
expand_all_entities = 0                    # 1: expand entities while parsing the input files (-e)
output = '-' # this means to stdout
# Sink used for discarded tool output; 'NUL' where /dev/null does not exist.
NULL_STRING = '/dev/null'
if not os.path.exists('/dev/null'): NULL_STRING = 'NUL'
import getopt, fileinput
def usage (with_help = False):
    """Write the usage line to stderr, then exit with status 0.

    When with_help is true the option summary and the examples are
    written as well.
    """
    prog = sys.argv[0]
    sys.stderr.write("Usage: %s [OPTIONS] [XMLFILE]..." % (prog) + "\n")
    if with_help:
        help_text = """
OPTIONS may be some of:
-a --automatic-tags Automatically decides if tags are to be considered
"final" or not
-k --keep-entities Don't expand entities
-e --expand-all-entities Expand ALL entities (including SYSTEM ones)
-m --mode=TYPE Treat tags as type TYPE (default: docbook)
-o --output=FILE Print resulting text (XML or POT) to FILE
-p --po-file=FILE Specify PO file containing translation, and merge
Overwrites temporary file .xml2po.mo.
-r --reuse=FILE Specify translated XML file with the same structure
-t --translation=FILE Specify MO file containing translation, and merge
-u --update-translation=LANG.po Updates a PO file using msgmerge program
-l --language=LANG Set language of the translation to LANG
-v --version Output version of the xml2po program
-h --help Output this message
EXAMPLES:
To create a POTemplate book.pot from input files chapter1.xml and
chapter2.xml, run the following:
%s -o book.pot chapter1.xml chapter2.xml
After translating book.pot into de.po, merge the translations back,
using -p option for each XML file:
%s -p de.po chapter1.xml > chapter1.de.xml
%s -p de.po chapter2.xml > chapter2.de.xml
""" % (prog, prog, prog)
        sys.stderr.write(help_text + "\n")
    sys.exit(0)
# Without any argument just print the usage line (usage() exits).
if len(sys.argv) < 2: usage()
args = sys.argv[1:]
# Unknown options or missing option arguments print the full help and exit.
try: opts, args = getopt.getopt(args, 'avhkem:t:o:p:u:r:l:',
                                ['automatic-tags','version', 'help', 'keep-entities', 'expand-all-entities', 'mode=', 'translation=',
                                 'output=', 'po-file=', 'update-translation=', 'reuse=', 'language=' ])
except getopt.GetoptError: usage(True)
# Options take effect in command-line order; some branches below depend on
# what an earlier option has already set (see translationlanguage).
for opt, arg in opts:
    if opt in ('-m', '--mode'):
        default_mode = arg
    if opt in ('-a', '--automatic-tags'):
        automatic = 1
    elif opt in ('-k', '--keep-entities'):
        expand_entities = 0
    elif opt in ('-e', '--expand-all-entities'):
        expand_all_entities = 1
    elif opt in ('-l', '--language'):
        translationlanguage = arg
    elif opt in ('-t', '--translation'):
        mofile = arg
        mode = 'merge'
        # Unless a language was given earlier, use the MO file's base name
        # (without extension) as the language.
        if translationlanguage == '': translationlanguage = os.path.split(os.path.splitext(mofile)[0])[1]
    elif opt in ('-r', '--reuse'):
        origxml = arg
    elif opt in ('-u', '--update-translation'):
        # tryToUpdate() always ends in sys.exit(), so options that follow
        # -u are never handled by this loop.
        tryToUpdate(sys.argv, arg)
    elif opt in ('-p', '--po-file'):
        mofile = ".xml2po.mo"
        pofile = arg
        # Unless a language was given earlier, use the PO file's base name
        # (without extension) as the language.
        if translationlanguage == '': translationlanguage = os.path.split(os.path.splitext(pofile)[0])[1]
        # Compile the PO file into the temporary MO file; exit with status 7
        # when msgfmt reports a failure.
        os.system("msgfmt -o %s %s >%s" % (mofile, pofile, NULL_STRING)) and sys.exit(7)
        mode = 'merge'
    elif opt in ('-o', '--output'):
        output = arg
    elif opt in ('-v', '--version'):
        print VERSION
        sys.exit(0)
    elif opt in ('-h', '--help'):
        usage(True)
# Treat remaining arguments as XML files
# (they are taken from the end of the list, i.e. in reverse order)
while args:
    filenames.append(args.pop())

if len(filenames) > 1 and mode=='merge':
    sys.stderr.write("Error: You can merge translations with only one XML file at a time.\n")
    sys.exit(2)

# Load the requested XML mode; fall back to automatic tag detection when
# that fails.  Exception rather than a bare except, so that Ctrl-C and
# sys.exit() are not mistaken for a missing mode module.
try:
    CurrentXmlMode = load_mode(default_mode)()
except Exception:
    CurrentXmlMode = None
    sys.stderr.write("Warning: cannot load module '%s', using automatic detection (-a).\n" % (default_mode))
    automatic = 1

if mode=='merge' and mofile=='':
    sys.stderr.write("Error: You must specify MO file when merging translations.\n")
    sys.exit(3)

if mode=='merge':
    # GNUTranslations parses the whole catalog while it is constructed, so
    # the file can be closed right afterwards.
    openedfile = open(mofile, "rb")
    try:
        gt = gettext.GNUTranslations(openedfile)
    finally:
        openedfile.close()

ultimate_tags = read_finaltags(ultimate)
ignored_tags = read_ignoredtags(ignored)
swallow_tags = read_swallowtags()

# I'm not particularly happy about making any of these global,
# but I don't want to bother too much with it right now
semitrans = {}
PlaceHolder = 0

# With -r the already translated XML file is processed last; its strings are
# collected as translations of the messages gathered before it.
if origxml == '':
    msg = MessageOutput()
else:
    filenames.append(origxml)
    msg = MessageOutput(1)
# Parse every input file and walk it; in 'pot' mode this collects the
# messages, in 'merge' mode it puts the translations into the document.
for filename in filenames:
    try:
        if filename == origxml:
            # From the reused translated file on, strings are msgstrs.
            msg.translationsFollow()
        ctxt = libxml2.createFileParserCtxt(filename)
        ctxt.lineNumbers(1)
        if expand_all_entities:
            ctxt.replaceEntities(1)
        ctxt.parseDocument()
        doc = ctxt.doc()
        if doc.name != filename:
            sys.stderr.write("Error: I tried to open '%s' but got '%s' -- how did that happen?\n" % (filename, doc.name))
            sys.exit(4)
    except SystemExit:
        # The sys.exit(4) above must not be turned into the generic
        # "cannot open file" error with exit status 1.
        raise
    except Exception:
        sys.stderr.write("Error: cannot open file '%s'.\n" % (filename))
        sys.exit(1)

    msg.setFilename(filename)
    if CurrentXmlMode and origxml=='':
        CurrentXmlMode.preProcessXml(doc,msg)
    doSerialize(doc)
# Choose the output stream: stdout for '-', otherwise the named file.
if output == '-':
    out = sys.stdout
else:
    try:
        out = open(output, 'w')
    except Exception:
        sys.stderr.write("Error: cannot open file %s for writing.\n" % (output))
        sys.exit(5)

if mode != 'merge':
    # 'pot' mode: add the mode's translator-credits message (if any) and
    # write out all collected messages.
    if CurrentXmlMode:
        tcmsg = CurrentXmlMode.getStringForTranslators()
        tccom = CurrentXmlMode.getCommentForTranslators()
        if tcmsg:
            msg.outputMessage(tcmsg, 0, tccom)
    msg.outputAll(out)
else:
    # 'merge' mode: let the mode post-process the translated document, then
    # write the document itself.
    if CurrentXmlMode:
        tcmsg = CurrentXmlMode.getStringForTranslators()
        if tcmsg:
            outtxt = getTranslation(tcmsg)
        else:
            outtxt = ''
        CurrentXmlMode.postProcessXmlTranslation(doc, translationlanguage, outtxt)
    out.write(doc.serialize('utf-8', 1))

# Flush and close an output file we opened ourselves; stdout is left alone.
if out is not sys.stdout:
    out.close()