#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Original source: github.com/okfn/bibserver
# Authors:
# markmacgillivray
# Etienne Posthumus (epoz)
# Francois Boulogne <fboulogne at april dot org>
import sys
import logging
import io
import re
from .bibdatabase import BibDatabase
logger = logging.getLogger(__name__)
__all__ = ['BibTexParser']
if sys.version_info >= (3, 0):
from io import StringIO
ustr = str
else:
from StringIO import StringIO
ustr = unicode
class BibTexParser(object):
"""
A parser for reading BibTeX bibliographic data files.
Example::
import bibtexparser
from bibtexparser.bparser import BibTexParser
bibtex_str = ...
parser = BibTexParser()
parser.ignore_nonstandard_types = False
parser.homogenise_fields = False
bib_database = bibtexparser.loads(bibtex_str, parser)
"""
def __new__(cls, data=None,
customization=None,
ignore_nonstandard_types=True,
homogenise_fields=True):
"""
Catch the old API usage, in which instantiating the parser with data immediately parsed it and
returned the resulting database instead of the parser.
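Example of the old calling convention (a minimal sketch; `bibtex_str` is a placeholder
for a BibTeX string)::

    bib_database = BibTexParser(bibtex_str)  # returns a BibDatabase, not a parser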
"""
if data is None:
return super(BibTexParser, cls).__new__(cls)
else:
# For backwards compatibility: if data is given, parse and return the `BibDatabase` object instead of the
# parser.
parser = BibTexParser()
parser.customization = customization
parser.ignore_nonstandard_types = ignore_nonstandard_types
parser.homogenise_fields = homogenise_fields
return parser.parse(data)
def __init__(self):
"""
Creates a parser for reading BibTeX files.
:return: parser
:rtype: `BibTexParser`
"""
self.bib_database = BibDatabase()
#: Callback function to process BibTeX entries after parsing, for example to create a list from a string with
#: multiple values. By default all BibTeX values are treated as simple strings. Default: `None`.
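#: A minimal sketch of such a callback (illustrative; `split_authors` is a made-up name). It receives the
#: parsed entry dict and must return it::
#:
#:     def split_authors(record):
#:         if 'author' in record:
#:             record['author'] = [a.strip() for a in record['author'].split(' and ')]
#:         return record
#:
#:     parser.customization = split_authors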
self.customization = None
#: Ignore non-standard BibTeX entry types, i.e. anything other than the standard ones (`article`, `book`, etc.). Default: `True`.
self.ignore_nonstandard_types = True
#: Sanitise BibTeX field names, for example change `url` to `link`, etc. Field names are always converted
#: to lowercase. Default: `True`.
self.homogenise_fields = True
# On some sample data files, character encoding detection simply hangs.
# We therefore default to utf8 and mandate it.
self.encoding = 'utf8'
# pre-defined set of key changes
self.alt_dict = {
'keyw': 'keyword',
'keywords': 'keyword',
'authors': 'author',
'editors': 'editor',
'url': 'link',
'urls': 'link',
'links': 'link',
'subjects': 'subject'
}
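# Illustrative effect: with `homogenise_fields` left at `True`, a field written as
# `URL = {http://example.org}` is stored under the key `link`; with
# `homogenise_fields = False` it is only lowercased to `url`.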
self.replace_all_re = re.compile(r'((?P<pre>"?)\s*(#|^)\s*(?P<id>[^\d\W]\w*)\s*(#|$)\s*(?P<post>"?))', re.UNICODE)
def _bibtex_file_obj(self, bibtex_str):
# Some files have Byte-order marks inserted at the start
byte = '\xef\xbb\xbf'
if not isinstance(byte, ustr):
byte = ustr('\xef\xbb\xbf', self.encoding, 'ignore')
if bibtex_str[:3] == byte:
bibtex_str = bibtex_str[3:]
return StringIO(bibtex_str)
def parse(self, bibtex_str):
"""Parse a BibTeX string into an object
:param bibtex_str: BibTeX string
:type: str or unicode
:return: bibliographic database
:rtype: BibDatabase
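Example (a minimal sketch; the entry is made up)::

    parser = BibTexParser()
    bib_database = parser.parse('@article{Cesar2013,\n title = {A title}\n}')
    # bib_database.entries[0]['ID'] == 'Cesar2013'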
"""
self.bibtex_file_obj = self._bibtex_file_obj(bibtex_str)
self._parse_records(customization=self.customization)
return self.bib_database
def parse_file(self, file):
"""Parse a BibTeX file into an object
:param file: BibTeX file or file-like object
:type: file
:return: bibliographic database
:rtype: BibDatabase
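Example (a minimal sketch; `references.bib` is a placeholder path)::

    parser = BibTexParser()
    with open('references.bib') as bibtex_file:
        bib_database = parser.parse_file(bibtex_file)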
"""
return self.parse(file.read())
def _parse_records(self, customization=None):
"""Parse the bibtex into a list of records.
:param customization: a function
"""
def _add_parsed_record(record, records):
"""
Atomic function to parse a record
and append the result in records
"""
if record != "":
logger.debug('The record is not empty. Let\'s parse it.')
parsed = self._parse_record(record, customization=customization)
if parsed:
logger.debug('Store the result of the parsed record')
records.append(parsed)
else:
logger.debug('Nothing returned from the parsed record!')
else:
logger.debug('The record is empty')
records = []
record = ""
# read each line, bundle them up until they form an object, then send for parsing
for linenumber, line in enumerate(self.bibtex_file_obj):
logger.debug('Inspect line %s', linenumber)
if line.strip().startswith('@'):
# Remove leading whitespaces
line = line.lstrip()
logger.debug('Line starts with @')
# Parse previous record
_add_parsed_record(record, records)
# Start new record
logger.debug('The record is set to empty')
record = ""
# Keep adding lines to the record
record += line
# catch any remaining record and send it for parsing
_add_parsed_record(record, records)
logger.debug('Set the list of entries')
self.bib_database.entries = records
def _parse_record(self, record, customization=None):
"""Parse a record.
* tidy whitespace and other rubbish
* parse out the bibtype and citekey
* find all the key-value pairs it contains
:param record: a record
:param customization: a function
:returns: dict --
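Example (illustrative)::

    record = '@article{Cesar2013,\ntitle = {A title},\n}'
    self._parse_record(record)
    # -> {'title': 'A title', 'ENTRYTYPE': 'article', 'ID': 'Cesar2013'}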
"""
d = {}
if not record.startswith('@'):
logger.debug('The record does not start with @. Return empty dict.')
return {}
# if a comment record, add to bib_database.comments
if record.lower().startswith('@comment'):
logger.debug('The record startswith @comment')
logger.debug('Store comment in list of comments')
self.bib_database.comments.append(re.search(r'\{(.*)\}', record, re.DOTALL).group(1))
logger.debug('Return an empty dict')
return {}
# if a preamble record, add to bib_database.preambles
if record.lower().startswith('@preamble'):
logger.debug('The record startswith @preamble')
logger.debug('Store preamble in list of preambles')
self.bib_database.preambles.append(re.search(r'\{(.*)\}', record, re.DOTALL).group(1))
logger.debug('Return an empty dict')
return {}
# prepare record
record = '\n'.join([i.strip() for i in record.split('\n')])
if '}\n' in record:
logger.debug('}\\n detected in the record. Clean up.')
record = record.replace('\r\n', '\n').replace('\r', '\n').rstrip('\n')
# handle the case where the last line of the record
# does not have a comma
if record.endswith('}\n}') or record.endswith('}}'):
logger.debug('Missing comma in the last line of the record. Fix it.')
record = re.sub('}(\n|)}$', '},\n}', record)
# if a string record, put it in the replace_dict
if record.lower().startswith('@string'):
logger.debug('The record startswith @string')
key, val = [i.strip().strip('{').strip('}').replace('\n', ' ')
            for i in record.split('{', 1)[1].strip('}').strip('\n').strip(',').split('=')]
key = key.lower() # key is case insensitive
val = self._string_subst_partial(val)
if val.startswith('"') or val.lower() not in self.bib_database.strings:
self.bib_database.strings[key] = val.strip('"')
else:
self.bib_database.strings[key] = self.bib_database.strings[val.lower()]
logger.debug('Return a dict')
return d
# for each line in record
logger.debug('Split the record into its lines and treat them')
kvs = [i.strip() for i in re.split(r',\s*\n|\n\s*,', record)]
inkey = ""
inval = ""
for kv in kvs:
logger.debug('Inspect: %s', kv)
# TODO: We may check that the keyword belongs to a known type
if kv.startswith('@') and not inkey:
# it is the start of the record - set the bibtype and citekey (id)
logger.debug('Line starts with @ and the key is not stored yet.')
bibtype, id = kv.split('{', 1)
bibtype = self._add_key(bibtype)
id = id.lstrip().strip('}').strip(',')
logger.debug('bibtype = %s', bibtype)
logger.debug('id = %s', id)
if self.ignore_nonstandard_types and bibtype not in ('article',
'book',
'booklet',
'conference',
'inbook',
'incollection',
'inproceedings',
'manual',
'mastersthesis',
'misc',
'phdthesis',
'proceedings',
'techreport',
'unpublished'):
logger.warning('Entry type %s not standard. Not considered.', bibtype)
break
elif '=' in kv and not inkey:
# it is a line with a key-value pair on it
logger.debug('Line contains a key-value pair and the key is not stored yet.')
key, val = [i.strip() for i in kv.split('=', 1)]
key = self._add_key(key)
val = self._string_subst_partial(val)
# if it looks like the value spans lines, store details for next loop
if (val.count('{') != val.count('}')) or (val.startswith('"') and not val.replace('}', '').endswith('"')):
logger.debug('The line is not ending the record.')
inkey = key
inval = val
else:
logger.debug('The line is the end of the record.')
d[key] = self._add_val(val)
elif inkey:
logger.debug('This line continues the previous one to complete the key-value pair...')
# if this line continues the value from a previous line, append
inval += ', ' + kv
# if it looks like this line finishes the value, store it and clear for next loop
if (inval.startswith('{') and inval.endswith('}')) or (inval.startswith('"') and inval.endswith('"')):
logger.debug('This line represents the end of the current key-value pair')
d[inkey] = self._add_val(inval)
inkey = ""
inval = ""
else:
logger.debug('This line does NOT represent the end of the current key-value pair')
logger.debug('All lines have been treated')
if not d:
logger.debug('The dict is empty, return it.')
return d
d['ENTRYTYPE'] = bibtype
d['ID'] = id
if customization is None:
logger.debug('No customization to apply, return dict')
return d
else:
# apply any customizations to the record object then return it
logger.debug('Apply customizations and return dict')
return customization(d)
def _strip_quotes(self, val):
"""Strip double quotes enclosing string
:param val: a value
:type val: string
:returns: string -- value
"""
logger.debug('Strip quotes')
val = val.strip()
if val.startswith('"') and val.endswith('"'):
return val[1:-1]
return val
def _strip_braces(self, val):
"""Strip braces enclosing string
:param val: a value
:type val: string
:returns: string -- value
"""
logger.debug('Strip braces')
val = val.strip()
if val.startswith('{') and val.endswith('}') and self._full_span(val):
return val[1:-1]
return val
def _full_span(self, val):
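"""Check whether the brace opened at the start of `val` is closed only by its
final character (illustrative: '{ab}' -> True, '{a}{b}' -> False).
"""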
cnt = 0
for i in range(0, len(val)):
if val[i] == '{':
cnt += 1
elif val[i] == '}':
cnt -= 1
if cnt == 0:
break
if i == len(val) - 1:
return True
else:
return False
def _string_subst(self, val):
""" Substitute string definitions
:param val: a value
:type val: string
:returns: string -- value
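Example (illustrative; assumes `@string{jan = "January"}` was parsed earlier)::

    self._string_subst('jan')   # -> 'January'
    self._string_subst('feb')   # -> 'feb' (no definition, returned unchanged)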
"""
logger.debug('Substitute string definitions')
if not val:
return ''
for k in list(self.bib_database.strings.keys()):
if val.lower() == k:
val = self.bib_database.strings[k]
if not isinstance(val, ustr):
val = ustr(val, self.encoding, 'ignore')
return val
def _string_subst_partial(self, val):
""" Substitute string definitions inside larger expressions
:param val: a value
:type val: string
:returns: string -- value
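Example (illustrative; assumes `@string{jan = "January"}` was parsed earlier)::

    self._string_subst_partial('"Proc." # jan')   # -> '"Proc.January"'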
"""
def repl(m):
k = m.group('id')
replacement = self.bib_database.strings[k.lower()] if k.lower() in self.bib_database.strings else k
pre = '"' if m.group('pre') != '"' else ''
post = '"' if m.group('post') != '"' else ''
return pre + replacement + post
logger.debug('Substitute string definitions inside larger expressions')
if '#' not in val:
return val
# TODO?: Does not match two subsequent variables or strings, such as "start" # foo # bar # "end" or "start" # "end".
# TODO: Does not support braces instead of quotes, e.g.: {start} # foo # {bar}
# TODO: Does not support strings like: "te#s#t"
return self.replace_all_re.sub(repl, val)
def _add_val(self, val):
""" Clean instring before adding to dictionary
:param val: a value
:type val: string
:returns: string -- value
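Example (illustrative)::

    self._add_val('{A title}')   # -> 'A title'
    self._add_val('"A title"')   # -> 'A title'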
"""
if not val or val == "{}":
return ''
val = self._strip_braces(val)
val = self._strip_quotes(val)
val = self._strip_braces(val)
val = self._string_subst(val)
return val
def _add_key(self, key):
""" Add a key and homogeneize alternative forms.
:param key: a key
:type key: string
:returns: string -- key
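Example (illustrative, with `homogenise_fields` left at `True`)::

    self._add_key('@ARTICLE')   # -> 'article'
    self._add_key('URL')        # -> 'link'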
"""
key = key.strip().strip('@').lower()
if self.homogenise_fields:
if key in list(self.alt_dict.keys()):
key = self.alt_dict[key]
if not isinstance(key, ustr):
return ustr(key, 'utf-8')
else:
return key