apostrophe/uberwriter/gtkspellcheck/oxt_extract.py

294 lines
12 KiB
Python

# -*- coding:utf-8 -*-
#
# Copyright (C) 2012, Carlos Jenkins <carlos@jenkins.co.cr>
# Copyright (C) 2012-2016, Maximilian Köhl <mail@koehlma.de>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
This module extracts the .dic and .aff (Hunspell) dictionaries from any given
.oxt extension.
Extensions can be found at:
http://extensions.services.openoffice.org/dictionary
"""
import functools
import gettext
import logging
import os
import shutil
import sys
import warnings
import xml.dom.minidom
import xml.parsers.expat
import zipfile
# always show deprecation warnings, so that callers of the deprecated
# functions of this module get told to migrate; note that this changes the
# process-wide warning filter as a side effect of importing this module
warnings.simplefilter('always', DeprecationWarning)

# public objects; every name listed here has to exist in this module,
# otherwise a star import of the module fails with an AttributeError
__all__ = ['extract', 'batch_extract', 'BadXml', 'BadExtensionFile',
           'ExtractPathIsNoDirectory', 'BATCH_SUCCESS', 'BATCH_ERROR',
           'BATCH_WARNING']

# logger
logger = logging.getLogger(__name__)

# translation; falls back to untranslated messages if no catalog is found
locale_name = 'py{}gtkspellcheck'.format(sys.version_info.major)
_ = gettext.translation(locale_name, fallback=True).gettext
class BadXml(Exception):
    """Raised when the XML dictionary registry is not valid XML."""
class BadExtensionFile(Exception):
    """Raised when the extension is not in the expected ZIP file format."""
class ExtractPathIsNoDirectory(Exception):
    """Raised when the given `extract_path` is not a directory."""
def find_dictionaries(registry):
    """
    Find the Hunspell spelling dictionaries listed in a dictionary registry.

    :param registry: parsed dictionary registry (``dictionaries.xcu``), a DOM
        node as returned by ``xml.dom.minidom.parse``
    :rtype: list with one entry per ``DICT_SPELL`` dictionary, each entry
        being the list of the files of this dictionary with the ``%origin%/``
        prefix removed

    Dictionary nodes without a ``Format`` or ``Locations`` property, or
    without any file in their locations, are skipped.
    """
    # node types whose ``data`` attribute holds character data
    text_types = (xml.dom.Node.TEXT_NODE, xml.dom.Node.CDATA_SECTION_NODE)

    def oor_name(name, element):
        # ``getAttribute`` returns '' for a missing attribute, hence elements
        # without an "oor:name" attribute just never match
        return element.getAttribute('oor:name').lower() == name

    def get_property(name, properties):
        # first "value" element of the property called `name`; None if there
        # is no such property or the property has no value
        for prop in properties:
            if oor_name(name, prop):
                values = prop.getElementsByTagName('value')
                return values[0] if values else None
        return None

    def text_of(node):
        # character data of the first child of `node`, '' if there is none
        child = None if node is None else node.firstChild
        if child is None or child.nodeType not in text_types:
            return ''
        return child.data

    def strip_origin(path):
        return path.replace('%origin%/', '').strip()

    result = []

    # find all "node" elements which have "dictionaries" as "oor:name" attribute
    for dictionaries in registry.getElementsByTagName('node'):
        if not oor_name('dictionaries', dictionaries):
            continue
        # for all "node" elements in this dictionaries node
        for dictionary in dictionaries.getElementsByTagName('node'):
            # get all "prop" elements
            properties = dictionary.getElementsByTagName('prop')
            # only spelling dictionaries are of interest
            kind = text_of(get_property('format', properties)).strip()
            if kind != 'DICT_SPELL':
                continue
            # find the locations property
            locations = get_property('locations', properties)
            if locations is None:
                continue
            # look for list items first, a pretty-printed value starts with
            # a whitespace text node even if it contains "it" elements:
            # <it>%origin%/dictionary.aff</it> <it>%origin%/dictionary.dic</it>
            items = locations.getElementsByTagName('it')
            if items:
                files = [strip_origin(text_of(item)) for item in items]
                files = [path for path in files if path]
            # otherwise the location property is text:
            # %origin%/dictionary.aff %origin%/dictionary.dic
            else:
                files = strip_origin(text_of(locations)).split()
            if files:
                result.append(files)

    return result
def extract(filename, target, override=False):
    """
    Extract Hunspell dictionaries out of LibreOffice ``.oxt`` extensions.

    :param filename: path to the ``.oxt`` extension
    :param target: path to extract Hunspell dictionaries to
    :param override: override existing files in the target directory
    :rtype: list of the extracted dictionaries (base names of the files
        written to ``target``), empty if the extension contains no
        dictionary registry
    :raises BadExtensionFile: if the extension is not a valid ZIP file
    :raises BadXml: if the dictionary registry is not valid XML

    This function extracts the Hunspell dictionaries (``.dic`` and ``.aff``
    files) from the given ``.oxt`` extension found to ``target``.

    Files are written directly into ``target`` using only the base name of
    the archive member. Existing files are kept unless ``override`` is true
    and the existing path is a regular file.

    Extensions can be found at:

        http://extensions.services.openoffice.org/dictionary
    """
    # TODO 5.0: remove this function
    warnings.warn(('call to deprecated function "{}", '
                   'moved to separate package "oxt_extract", '
                   'will be removed in pygtkspellcheck 5.0'
                   ).format(extract.__name__),
                  category=DeprecationWarning)

    default_registry = 'dictionaries.xcu'
    extracted = []

    try:
        with zipfile.ZipFile(filename, 'r') as extension:
            files = extension.namelist()

            registry = default_registry
            if registry not in files:
                # no registry at the top level of the archive, fall back to
                # the first member whose lower-cased name ends with the
                # default registry name
                registry = next((member for member in files
                                 if member.lower().endswith(default_registry)),
                                None)
            if registry is None:
                return extracted

            # close the member stream as soon as the registry is parsed
            with extension.open(registry) as registry_file:
                document = xml.dom.minidom.parse(registry_file)

            for dictionary in find_dictionaries(document):
                for member in dictionary:
                    name = os.path.basename(member)
                    dict_file = os.path.join(target, name)

                    if (os.path.exists(dict_file)
                            and not (override and os.path.isfile(dict_file))):
                        logger.warning(('dictionary file "{}" already exists '
                                        'and not overriding it'
                                        ).format(dict_file))
                        continue

                    if member not in files:
                        logger.warning('dictionary exists in registry '
                                       'but not in the extension zip')
                        continue

                    # read the member before creating the target file, so a
                    # failing read does not leave an empty file behind
                    with extension.open(member, 'r') as _source:
                        data = _source.read()
                    with open(dict_file, 'wb') as _target:
                        _target.write(data)
                    # only report files which have actually been written
                    extracted.append(name)
    except zipfile.BadZipfile:
        raise BadExtensionFile('extension is not a valid ZIP file')
    except xml.parsers.expat.ExpatError:
        raise BadXml('dictionary registry is not valid XML')

    return extracted
# result codes, yielded by ``batch_extract`` as first element of each tuple
BATCH_SUCCESS, BATCH_ERROR, BATCH_WARNING = 'success', 'error', 'warning'
def batch_extract(oxt_path, extract_path, override=False, move_path=None):
    """
    Uncompress, read and install LibreOffice ``.oxt`` dictionaries extensions.

    :param oxt_path: path to a directory containing the ``.oxt`` extensions
    :param extract_path: path to extract Hunspell dictionaries files to
    :param override: override already existing files
    :param move_path: optional path to move the ``.oxt`` files after processing
    :rtype: generator over all extensions, yielding result, extension name,
        error, extracted dictionaries and translated error message - result
        would be :const:`BATCH_SUCCESS` for success, :const:`BATCH_ERROR` if
        some error happened or :const:`BATCH_WARNING` which contain some warning
        messages instead of errors
    :raises ExtractPathIsNoDirectory: if ``extract_path`` exists but is not a
        directory

    This function extracts the Hunspell dictionaries (``.dic`` and ``.aff``
    files) from all the ``.oxt`` extensions found on ``oxt_path`` directory to
    the ``extract_path`` directory.

    As this is a generator, nothing happens (and nothing is raised) until it
    is iterated. If ``oxt_path`` is not a directory, nothing is yielded.

    Extensions can be found at:

        http://extensions.services.openoffice.org/dictionary

    In detail, this function does the following:

    1. find all the ``.oxt`` extension files within ``oxt_path``
    2. open (unzip) each extension
    3. find the dictionary definition file within (*dictionaries.xcu*)
    4. parse the dictionary definition file and locate the dictionaries files
    5. uncompress those files to ``extract_path``

    By default file overriding is disabled, set ``override`` parameter to True
    if you want to enable it. As additional option, each processed extension can
    be moved to ``move_path``; an already existing file with the same name
    within ``move_path`` is only replaced if ``override`` is true.

    Example::

        for result, name, error, dictionaries, message in oxt_extract.batch_extract(...):
            if result == oxt_extract.BATCH_SUCCESS:
                print('successfully extracted extension "{}"'.format(name))
            elif result == oxt_extract.BATCH_ERROR:
                print('could not extract extension "{}"'.format(name))
                print(message)
                print('error {}'.format(error))
            elif result == oxt_extract.BATCH_WARNING:
                print('warning during processing extension "{}"'.format(name))
                print(message)
                print(error)
    """
    # TODO 5.0: remove this function
    warnings.warn(('call to deprecated function "{}", '
                   'moved to separate package "oxt_extract", '
                   'will be removed in pygtkspellcheck 5.0'
                   ).format(batch_extract.__name__),
                  category=DeprecationWarning)

    # get the real, absolute and normalized path
    oxt_path = os.path.normpath(os.path.abspath(os.path.realpath(oxt_path)))

    # check that the input directory exists
    if not os.path.isdir(oxt_path):
        return

    # create extract directory if not exists
    if not os.path.exists(extract_path):
        os.makedirs(extract_path)

    # check that the extract path is a directory
    if not os.path.isdir(extract_path):
        raise ExtractPathIsNoDirectory('extract path is not a valid directory')

    # get all .oxt extension at given path
    oxt_files = [extension for extension in os.listdir(oxt_path)
                 if extension.lower().endswith('.oxt')]

    for extension_name in oxt_files:
        extension_path = os.path.join(oxt_path, extension_name)

        try:
            dictionaries = extract(extension_path, extract_path, override)
        except BadExtensionFile as error:
            logger.error(('extension "{}" is not a valid ZIP file'
                          ).format(extension_name))
            yield (BATCH_ERROR, extension_name, error, [],
                   _('extension "{}" is not a valid ZIP file'
                     ).format(extension_name))
        except BadXml as error:
            logger.error(('extension "{}" has no valid XML dictionary registry'
                          ).format(extension_name))
            yield (BATCH_ERROR, extension_name, error, [],
                   _('extension "{}" has no valid XML dictionary registry'
                     ).format(extension_name))
        else:
            # always hand out a list, even if nothing has been extracted
            yield BATCH_SUCCESS, extension_name, None, dictionaries or [], ''

        # move the extension after processing if user requires it
        if move_path is None:
            continue

        # create move path if it doesn't exists
        if not os.path.exists(move_path):
            os.makedirs(move_path)

        if not os.path.isdir(move_path):
            logger.warning(('unable to move extension, move_path is not a '
                            'directory'))
            yield (BATCH_WARNING, extension_name,
                   ('unable to move extension, move_path is not a '
                    'directory'), [],
                   _('unable to move extension, move_path is not a '
                     'directory'))
            continue

        # move to the full target file name, not just to the directory:
        # ``shutil.move`` refuses to move into a directory which already
        # contains an entry with the same name, which would make the
        # `override` case fail
        destination = os.path.join(move_path, extension_name)
        if override or not os.path.exists(destination):
            shutil.move(extension_path, destination)
        else:
            logger.warning(('unable to move extension, file with same '
                            'name exists within move_path'))
            yield (BATCH_WARNING, extension_name,
                   ('unable to move extension, file with same name '
                    'exists within move_path'), [],
                   _('unable to move extension, file with same name '
                     'exists within move_path'))