#!/usr/bin/env python # -*- coding: utf-8 -*- # Original source: github.com/okfn/bibserver # Authors: # markmacgillivray # Etienne Posthumus (epoz) # Francois Boulogne import sys import logging import io import re from .bibdatabase import BibDatabase logger = logging.getLogger(__name__) __all__ = ['BibTexParser'] if sys.version_info >= (3, 0): from io import StringIO ustr = str else: from StringIO import StringIO ustr = unicode class BibTexParser(object): """ A parser for reading BibTeX bibliographic data files. Example:: from bibtexparser.bparser import BibTexParser bibtex_str = ... parser = BibTexParser() parser.ignore_nonstandard_types = False parser.homogenise_fields = False bib_database = bibtexparser.loads(bibtex_str, parser) """ def __new__(cls, data=None, customization=None, ignore_nonstandard_types=True, homogenise_fields=True): """ To catch the old API structure in which creating the parser would immediately parse and return data. """ if data is None: return super(BibTexParser, cls).__new__(cls) else: # For backwards compatibility: if data is given, parse and return the `BibDatabase` object instead of the # parser. parser = BibTexParser() parser.customization = customization parser.ignore_nonstandard_types = ignore_nonstandard_types parser.homogenise_fields = homogenise_fields return parser.parse(data) def __init__(self): """ Creates a parser for rading BibTeX files :return: parser :rtype: `BibTexParser` """ self.bib_database = BibDatabase() #: Callback function to process BibTeX entries after parsing, for example to create a list from a string with #: multiple values. By default all BibTeX values are treated as simple strings. Default: `None`. self.customization = None #: Ignore non-standard BibTeX types (`book`, `article`, etc). Default: `True`. self.ignore_nonstandard_types = True #: Sanitise BibTeX field names, for example change `url` to `link` etc. Field names are always converted to #: lowercase names. Default: `True`. self.homogenise_fields = True # On some sample data files, the character encoding detection simply # hangs We are going to default to utf8, and mandate it. self.encoding = 'utf8' # pre-defined set of key changes self.alt_dict = { 'keyw': 'keyword', 'keywords': 'keyword', 'authors': 'author', 'editors': 'editor', 'url': 'link', 'urls': 'link', 'links': 'link', 'subjects': 'subject' } self.replace_all_re = re.compile(r'((?P
"?)\s*(#|^)\s*(?P[^\d\W]\w*)\s*(#|$)\s*(?P"?))', re.UNICODE)

    def _bibtex_file_obj(self, bibtex_str):
        # Some files have Byte-order marks inserted at the start
        byte = '\xef\xbb\xbf'
        if not isinstance(byte, ustr):
            byte = ustr('\xef\xbb\xbf', self.encoding, 'ignore')
        if bibtex_str[:3] == byte:
            bibtex_str = bibtex_str[3:]
        return StringIO(bibtex_str)

    def parse(self, bibtex_str):
        """Parse a BibTeX string into an object

        :param bibtex_str: BibTeX string
        :type: str or unicode
        :return: bibliographic database
        :rtype: BibDatabase
        """
        self.bibtex_file_obj = self._bibtex_file_obj(bibtex_str)
        self._parse_records(customization=self.customization)
        return self.bib_database

    def parse_file(self, file):
        """Parse a BibTeX file into an object

        :param file: BibTeX file or file-like object
        :type: file
        :return: bibliographic database
        :rtype: BibDatabase
        """
        return self.parse(file.read())

    def _parse_records(self, customization=None):
        """Parse the bibtex into a list of records.

        :param customization: a function
        """
        def _add_parsed_record(record, records):
            """
            Atomic function to parse a record
            and append the result in records
            """
            if record != "":
                logger.debug('The record is not empty. Let\'s parse it.')
                parsed = self._parse_record(record, customization=customization)
                if parsed:
                    logger.debug('Store the result of the parsed record')
                    records.append(parsed)
                else:
                    logger.debug('Nothing returned from the parsed record!')
            else:
                logger.debug('The record is empty')

        records = []
        record = ""
        # read each line, bundle them up until they form an object, then send for parsing
        for linenumber, line in enumerate(self.bibtex_file_obj):
            logger.debug('Inspect line %s', linenumber)
            if line.strip().startswith('@'):
                # Remove leading whitespaces
                line = line.lstrip()
                logger.debug('Line starts with @')
                # Parse previous record
                _add_parsed_record(record, records)
                # Start new record
                logger.debug('The record is set to empty')
                record = ""
            # Keep adding lines to the record
            record += line

        # catch any remaining record and send it for parsing
        _add_parsed_record(record, records)
        logger.debug('Set the list of entries')
        self.bib_database.entries = records

    def _parse_record(self, record, customization=None):
        """Parse a record.

        * tidy whitespace and other rubbish
        * parse out the bibtype and citekey
        * find all the key-value pairs it contains

        :param record: a record
        :param customization: a function

        :returns: dict --
        """
        d = {}

        if not record.startswith('@'):
            logger.debug('The record does not start with @. Return empty dict.')
            return {}

        # if a comment record, add to bib_database.comments
        if record.lower().startswith('@comment'):
            logger.debug('The record startswith @comment')
            logger.debug('Store comment in list of comments')

            self.bib_database.comments.append(re.search('\{(.*)\}', record, re.DOTALL).group(1))

            logger.debug('Return an empty dict')
            return {}

        # if a preamble record, add to bib_database.preambles
        if record.lower().startswith('@preamble'):
            logger.debug('The record startswith @preamble')
            logger.debug('Store preamble in list of preambles')

            self.bib_database.preambles.append(re.search('\{(.*)\}', record, re.DOTALL).group(1))

            logger.debug('Return an empty dict')
            return {}

        # prepare record
        record = '\n'.join([i.strip() for i in record.split('\n')])
        if '}\n' in record:
            logger.debug('}\\n detected in the record. Clean up.')
            record = record.replace('\r\n', '\n').replace('\r', '\n').rstrip('\n')
            # treat the case for which the last line of the record
            # does not have a coma
            if record.endswith('}\n}') or record.endswith('}}'):
                logger.debug('Missing coma in the last line of the record. Fix it.')
                record = re.sub('}(\n|)}$', '},\n}', record)

        # if a string record, put it in the replace_dict
        if record.lower().startswith('@string'):
            logger.debug('The record startswith @string')
            key, val = [i.strip().strip('{').strip('}').replace('\n', ' ') for i in record.split('{', 1)[1].strip('}').strip('\n').strip(',').split('=')]
            key = key.lower()  # key is case insensitive
            val = self._string_subst_partial(val)
            if val.startswith('"') or val.lower() not in self.bib_database.strings:
                self.bib_database.strings[key] = val.strip('"')
            else:
                self.bib_database.strings[key] = self.bib_database.strings[val.lower()]
            logger.debug('Return a dict')
            return d

        # for each line in record
        logger.debug('Split the record of its lines and treat them')
        kvs = [i.strip() for i in re.split(',\s*\n|\n\s*,', record)]
        inkey = ""
        inval = ""
        for kv in kvs:
            logger.debug('Inspect: %s', kv)
            # TODO: We may check that the keyword belongs to a known type
            if kv.startswith('@') and not inkey:
                # it is the start of the record - set the bibtype and citekey (id)
                logger.debug('Line starts with @ and the key is not stored yet.')
                bibtype, id = kv.split('{', 1)
                bibtype = self._add_key(bibtype)
                id = id.lstrip().strip('}').strip(',')
                logger.debug('bibtype = %s', bibtype)
                logger.debug('id = %s', id)
                if self.ignore_nonstandard_types and bibtype not in ('article',
                                                                     'book',
                                                                     'booklet',
                                                                     'conference',
                                                                     'inbook',
                                                                     'incollection',
                                                                     'inproceedings',
                                                                     'manual',
                                                                     'mastersthesis',
                                                                     'misc',
                                                                     'phdthesis',
                                                                     'proceedings',
                                                                     'techreport',
                                                                     'unpublished'):
                    logger.warning('Entry type %s not standard. Not considered.', bibtype)
                    break
            elif '=' in kv and not inkey:
                # it is a line with a key value pair on it
                logger.debug('Line contains a key-pair value and the key is not stored yet.')
                key, val = [i.strip() for i in kv.split('=', 1)]
                key = self._add_key(key)
                val = self._string_subst_partial(val)
                # if it looks like the value spans lines, store details for next loop
                if (val.count('{') != val.count('}')) or (val.startswith('"') and not val.replace('}', '').endswith('"')):
                    logger.debug('The line is not ending the record.')
                    inkey = key
                    inval = val
                else:
                    logger.debug('The line is the end of the record.')
                    d[key] = self._add_val(val)
            elif inkey:
                logger.debug('Continues the previous line to complete the key pair value...')
                # if this line continues the value from a previous line, append
                inval += ', ' + kv
                # if it looks like this line finishes the value, store it and clear for next loop
                if (inval.startswith('{') and inval.endswith('}')) or (inval.startswith('"') and inval.endswith('"')):
                    logger.debug('This line represents the end of the current key-pair value')
                    d[inkey] = self._add_val(inval)
                    inkey = ""
                    inval = ""
                else:
                    logger.debug('This line does NOT represent the end of the current key-pair value')

        logger.debug('All lines have been treated')
        if not d:
            logger.debug('The dict is empty, return it.')
            return d

        d['ENTRYTYPE'] = bibtype
        d['ID'] = id

        if customization is None:
            logger.debug('No customization to apply, return dict')
            return d
        else:
            # apply any customizations to the record object then return it
            logger.debug('Apply customizations and return dict')
            return customization(d)

    def _strip_quotes(self, val):
        """Strip double quotes enclosing string

        :param val: a value
        :type val: string
        :returns: string -- value
        """
        logger.debug('Strip quotes')
        val = val.strip()
        if val.startswith('"') and val.endswith('"'):
            return val[1:-1]
        return val

    def _strip_braces(self, val):
        """Strip braces enclosing string

        :param val: a value
        :type val: string
        :returns: string -- value
        """
        logger.debug('Strip braces')
        val = val.strip()
        if val.startswith('{') and val.endswith('}') and self._full_span(val):
            return val[1:-1]
        return val

    def _full_span(self, val):
        cnt = 0
        for i in range(0, len(val)):
                if val[i] == '{':
                        cnt += 1
                elif val[i] == '}':
                        cnt -= 1
                if cnt == 0:
                        break
        if i == len(val) - 1:
                return True
        else:
                return False

    def _string_subst(self, val):
        """ Substitute string definitions

        :param val: a value
        :type val: string
        :returns: string -- value
        """
        logger.debug('Substitute string definitions')
        if not val:
            return ''
        for k in list(self.bib_database.strings.keys()):
            if val.lower() == k:
                val = self.bib_database.strings[k]
        if not isinstance(val, ustr):
            val = ustr(val, self.encoding, 'ignore')

        return val

    def _string_subst_partial(self, val):
        """ Substitute string definitions inside larger expressions

        :param val: a value
        :type val: string
        :returns: string -- value
        """
        def repl(m):
            k = m.group('id')
            replacement = self.bib_database.strings[k.lower()] if k.lower() in self.bib_database.strings else k
            pre = '"' if m.group('pre') != '"' else ''
            post = '"' if m.group('post') != '"' else ''
            return pre + replacement + post

        logger.debug('Substitute string definitions inside larger expressions')
        if '#' not in val:
            return val

        # TODO?: Does not match two subsequent variables or strings, such as  "start" # foo # bar # "end"  or  "start" # "end".
        # TODO:  Does not support braces instead of quotes, e.g.: {start} # foo # {bar}
        # TODO:  Does not support strings like: "te#s#t"
        return self.replace_all_re.sub(repl, val)

    def _add_val(self, val):
        """ Clean instring before adding to dictionary

        :param val: a value
        :type val: string
        :returns: string -- value
        """
        if not val or val == "{}":
            return ''
        val = self._strip_braces(val)
        val = self._strip_quotes(val)
        val = self._strip_braces(val)
        val = self._string_subst(val)
        return val

    def _add_key(self, key):
        """ Add a key and homogeneize alternative forms.

        :param key: a key
        :type key: string
        :returns: string -- value
        """
        key = key.strip().strip('@').lower()
        if self.homogenise_fields:
            if key in list(self.alt_dict.keys()):
                key = self.alt_dict[key]
        if not isinstance(key, ustr):
            return ustr(key, 'utf-8')
        else:
            return key