forked from Mirrors/apostrophe
423 lines
16 KiB
Python
423 lines
16 KiB
Python
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Original source: github.com/okfn/bibserver
# Authors:
# markmacgillivray
# Etienne Posthumus (epoz)
# Francois Boulogne <fboulogne at april dot org>

import sys
|
|
import logging
|
|
import io
|
|
import re
|
|
from .bibdatabase import BibDatabase
|
|
|
|
logger = logging.getLogger(__name__)

__all__ = ['BibTexParser']


# Python 2/3 compatibility: pick the text-mode StringIO implementation and
# the unicode text type (``ustr``) matching the running interpreter.
if sys.version_info >= (3, 0):
    from io import StringIO
    ustr = str
else:
    from StringIO import StringIO
    ustr = unicode  # noqa: F821 -- only evaluated on Python 2


class BibTexParser(object):
    """
    A parser for reading BibTeX bibliographic data files.

    Example::

        import bibtexparser
        from bibtexparser.bparser import BibTexParser

        bibtex_str = ...

        parser = BibTexParser()
        parser.ignore_nonstandard_types = False
        parser.homogenise_fields = False
        bib_database = bibtexparser.loads(bibtex_str, parser)
    """

    # Entry types kept when `ignore_nonstandard_types` is enabled.
    _STANDARD_TYPES = (
        'article',
        'book',
        'booklet',
        'conference',
        'inbook',
        'incollection',
        'inproceedings',
        'manual',
        'mastersthesis',
        'misc',
        'phdthesis',
        'proceedings',
        'techreport',
        'unpublished',
    )

    # Byte-order marks that may prefix a text string: the correctly decoded
    # U+FEFF, and the three characters obtained when the UTF-8 BOM bytes were
    # decoded with a single-byte codec such as latin-1.
    _TEXT_BOMS = (u'\ufeff', u'\xef\xbb\xbf')

    def __new__(cls, data=None,
                customization=None,
                ignore_nonstandard_types=True,
                homogenise_fields=True):
        """
        To catch the old API structure in which creating the parser would
        immediately parse and return data.

        :param data: BibTeX string; when given, it is parsed right away
        :param customization: callback applied to every parsed entry
        :param ignore_nonstandard_types: drop entries of non-standard type
        :param homogenise_fields: rename alternative field names
        :return: a new parser when `data` is None, otherwise the result of
            parsing `data`
        """
        if data is None:
            return super(BibTexParser, cls).__new__(cls)

        # For backwards compatibility: if data is given, parse and return the
        # `BibDatabase` object instead of the parser.
        parser = BibTexParser(
            customization=customization,
            ignore_nonstandard_types=ignore_nonstandard_types,
            homogenise_fields=homogenise_fields)
        return parser.parse(data)

    def __init__(self, data=None,
                 customization=None,
                 ignore_nonstandard_types=True,
                 homogenise_fields=True):
        """
        Creates a parser for reading BibTeX files

        The signature mirrors `__new__`, so that keyword arguments such as
        ``BibTexParser(customization=f)`` are accepted instead of raising a
        `TypeError`. `data` is not used here: `__new__` handles it.

        :param data: unused, see `__new__`
        :param customization: callback applied to every parsed entry
        :param ignore_nonstandard_types: drop entries of non-standard type
        :param homogenise_fields: rename alternative field names
        :return: parser
        :rtype: `BibTexParser`
        """
        self.bib_database = BibDatabase()

        #: Callback function to process BibTeX entries after parsing, for example to create a list from a string with
        #: multiple values. By default all BibTeX values are treated as simple strings. Default: `None`.
        self.customization = customization

        #: Ignore non-standard BibTeX types (`book`, `article`, etc). Default: `True`.
        self.ignore_nonstandard_types = ignore_nonstandard_types

        #: Sanitise BibTeX field names, for example change `url` to `link` etc. Field names are always converted to
        #: lowercase names. Default: `True`.
        self.homogenise_fields = homogenise_fields

        # On some sample data files, the character encoding detection simply
        # hangs. We are going to default to utf8, and mandate it.
        self.encoding = 'utf8'

        # pre-defined set of key changes
        self.alt_dict = {
            'keyw': 'keyword',
            'keywords': 'keyword',
            'authors': 'author',
            'editors': 'editor',
            'url': 'link',
            'urls': 'link',
            'links': 'link',
            'subjects': 'subject'
        }

        # Matches one identifier next to a `#` concatenation operator,
        # together with the optional double quotes around it.
        self.replace_all_re = re.compile(r'((?P<pre>"?)\s*(#|^)\s*(?P<id>[^\d\W]\w*)\s*(#|$)\s*(?P<post>"?))', re.UNICODE)

    def _bibtex_file_obj(self, bibtex_str):
        """Wrap a BibTeX string in a file-like object, without its BOM.

        :param bibtex_str: BibTeX string
        :type bibtex_str: str or unicode
        :returns: a `StringIO` over the string
        """
        # Some files have Byte-order marks inserted at the start
        if isinstance(bibtex_str, ustr):
            for bom in self._TEXT_BOMS:
                if bibtex_str.startswith(bom):
                    bibtex_str = bibtex_str[len(bom):]
                    break
        elif bibtex_str[:3] == b'\xef\xbb\xbf':
            # Python 2 byte string still carrying the raw UTF-8 BOM.
            bibtex_str = bibtex_str[3:]
        return StringIO(bibtex_str)

    def parse(self, bibtex_str):
        """Parse a BibTeX string into an object

        :param bibtex_str: BibTeX string
        :type: str or unicode
        :return: bibliographic database
        :rtype: BibDatabase
        """
        self.bibtex_file_obj = self._bibtex_file_obj(bibtex_str)
        self._parse_records(customization=self.customization)
        return self.bib_database

    def parse_file(self, file):
        """Parse a BibTeX file into an object

        :param file: BibTeX file or file-like object
        :type: file
        :return: bibliographic database
        :rtype: BibDatabase
        """
        return self.parse(file.read())

    def _parse_records(self, customization=None):
        """Parse the bibtex into a list of records.

        Lines are bundled until the next line starting with ``@``; each
        bundle is handed to `_parse_record` and the non-empty results are
        stored in ``self.bib_database.entries``.

        :param customization: a function
        """
        def _add_parsed_record(record, records):
            """
            Atomic function to parse a record
            and append the result in records
            """
            if record != "":
                logger.debug('The record is not empty. Let\'s parse it.')
                parsed = self._parse_record(record, customization=customization)
                if parsed:
                    logger.debug('Store the result of the parsed record')
                    records.append(parsed)
                else:
                    logger.debug('Nothing returned from the parsed record!')
            else:
                logger.debug('The record is empty')

        records = []
        pending = []  # lines of the record currently being bundled
        # read each line, bundle them up until they form an object, then send for parsing
        for linenumber, line in enumerate(self.bibtex_file_obj):
            logger.debug('Inspect line %s', linenumber)
            if line.strip().startswith('@'):
                # Remove leading whitespaces
                line = line.lstrip()
                logger.debug('Line starts with @')
                # Parse previous record
                _add_parsed_record(''.join(pending), records)
                # Start new record
                logger.debug('The record is set to empty')
                pending = []
            # Keep adding lines to the record
            pending.append(line)

        # catch any remaining record and send it for parsing
        _add_parsed_record(''.join(pending), records)
        logger.debug('Set the list of entries')
        self.bib_database.entries = records

    def _store_braced_content(self, record, target):
        """Append the text between the outermost braces of `record` to `target`.

        A record without a ``{...}`` pair is skipped instead of raising.

        :param record: a raw ``@comment`` or ``@preamble`` record
        :param target: list receiving the extracted text
        """
        match = re.search(r'\{(.*)\}', record, re.DOTALL)
        if match is None:
            logger.debug('No braces found in the record. Nothing stored.')
            return
        target.append(match.group(1))

    def _parse_string_definition(self, record):
        """Register the ``@string`` definition held by an already tidied record.

        Malformed definitions (no opening brace or no ``=``) are skipped with
        a warning.

        :param record: a tidied ``@string`` record
        """
        body = record.partition('{')[2]
        definition = body.strip('}').strip('\n').strip(',')
        # Split on the first `=` only: the value itself may contain `=`.
        key, separator, val = definition.partition('=')
        if not separator:
            logger.warning('Malformed @string definition ignored: %s', record)
            return
        key, val = [part.strip().strip('{').strip('}').replace('\n', ' ')
                    for part in (key, val)]
        key = key.lower()  # key is case insensitive
        val = self._string_subst_partial(val)
        strings = self.bib_database.strings
        if val.startswith('"') or val.lower() not in strings:
            strings[key] = val.strip('"')
        else:
            # The value is the name of an already defined string.
            strings[key] = strings[val.lower()]

    def _parse_record(self, record, customization=None):
        """Parse a record.

        * tidy whitespace and other rubbish
        * parse out the bibtype and citekey
        * find all the key-value pairs it contains

        ``@comment``, ``@preamble`` and ``@string`` records are stored on
        ``self.bib_database`` and yield an empty dict.

        :param record: a record
        :param customization: a function

        :returns: dict -- the entry (fields plus ``ENTRYTYPE`` and ``ID``),
            or the return value of `customization` when one is given; an
            empty dict when the record holds no entry
        """
        d = {}

        if not record.startswith('@'):
            logger.debug('The record does not start with @. Return empty dict.')
            return {}

        lowered = record.lower()

        # if a comment record, add to bib_database.comments
        if lowered.startswith('@comment'):
            logger.debug('The record startswith @comment')
            logger.debug('Store comment in list of comments')
            self._store_braced_content(record, self.bib_database.comments)
            logger.debug('Return an empty dict')
            return {}

        # if a preamble record, add to bib_database.preambles
        if lowered.startswith('@preamble'):
            logger.debug('The record startswith @preamble')
            logger.debug('Store preamble in list of preambles')
            self._store_braced_content(record, self.bib_database.preambles)
            logger.debug('Return an empty dict')
            return {}

        # prepare record
        record = '\n'.join([i.strip() for i in record.split('\n')])
        if '}\n' in record:
            logger.debug('}\\n detected in the record. Clean up.')
            record = record.replace('\r\n', '\n').replace('\r', '\n').rstrip('\n')
        # treat the case for which the last line of the record
        # does not have a comma
        if record.endswith('}\n}') or record.endswith('}}'):
            logger.debug('Missing coma in the last line of the record. Fix it.')
            record = re.sub(r'}(\n|)}$', '},\n}', record)

        # if a string record, put it in the replace_dict
        if record.lower().startswith('@string'):
            logger.debug('The record startswith @string')
            self._parse_string_definition(record)
            logger.debug('Return a dict')
            return d

        # for each line in record
        logger.debug('Split the record of its lines and treat them')
        kvs = [i.strip() for i in re.split(r',\s*\n|\n\s*,', record)]
        bibtype = None
        entry_id = None
        inkey = ""
        inval = ""
        for kv in kvs:
            logger.debug('Inspect: %s', kv)
            # TODO: We may check that the keyword belongs to a known type
            if kv.startswith('@') and not inkey:
                # it is the start of the record - set the bibtype and citekey (id)
                logger.debug('Line starts with @ and the key is not stored yet.')
                bibtype, separator, entry_id = kv.partition('{')
                if not separator:
                    logger.warning('Entry header %s has no opening brace. Not considered.', kv)
                    return {}
                bibtype = self._add_key(bibtype)
                entry_id = entry_id.lstrip().strip('}').strip(',')
                logger.debug('bibtype = %s', bibtype)
                logger.debug('id = %s', entry_id)
                if self.ignore_nonstandard_types and bibtype not in self._STANDARD_TYPES:
                    logger.warning('Entry type %s not standard. Not considered.', bibtype)
                    break
            elif '=' in kv and not inkey:
                # it is a line with a key value pair on it
                logger.debug('Line contains a key-pair value and the key is not stored yet.')
                key, val = [i.strip() for i in kv.split('=', 1)]
                key = self._add_key(key)
                val = self._string_subst_partial(val)
                # if it looks like the value spans lines, store details for next loop
                if (val.count('{') != val.count('}')) or (val.startswith('"') and not val.replace('}', '').endswith('"')):
                    logger.debug('The line is not ending the record.')
                    inkey = key
                    inval = val
                else:
                    logger.debug('The line is the end of the record.')
                    d[key] = self._add_val(val)
            elif inkey:
                logger.debug('Continues the previous line to complete the key pair value...')
                # if this line continues the value from a previous line, append
                inval += ', ' + kv
                # if it looks like this line finishes the value, store it and clear for next loop
                if (inval.startswith('{') and inval.endswith('}')) or (inval.startswith('"') and inval.endswith('"')):
                    logger.debug('This line represents the end of the current key-pair value')
                    d[inkey] = self._add_val(inval)
                    inkey = ""
                    inval = ""
                else:
                    logger.debug('This line does NOT represent the end of the current key-pair value')

        logger.debug('All lines have been treated')
        if not d:
            logger.debug('The dict is empty, return it.')
            return d

        d['ENTRYTYPE'] = bibtype
        d['ID'] = entry_id

        if customization is None:
            logger.debug('No customization to apply, return dict')
            return d

        # apply any customizations to the record object then return it
        logger.debug('Apply customizations and return dict')
        return customization(d)

    def _strip_quotes(self, val):
        """Strip double quotes enclosing string

        :param val: a value
        :type val: string
        :returns: string -- value
        """
        logger.debug('Strip quotes')
        val = val.strip()
        if val.startswith('"') and val.endswith('"'):
            return val[1:-1]
        return val

    def _strip_braces(self, val):
        """Strip braces enclosing string

        The braces are removed only when the opening brace is closed by the
        final character (see `_full_span`).

        :param val: a value
        :type val: string
        :returns: string -- value
        """
        logger.debug('Strip braces')
        val = val.strip()
        if val.startswith('{') and val.endswith('}') and self._full_span(val):
            return val[1:-1]
        return val

    def _full_span(self, val):
        """Tell whether the brace nesting first returns to zero at the last character.

        :param val: a value
        :type val: string
        :returns: bool -- False as soon as the nesting depth is zero before
            the last character, and False for an empty string; True
            otherwise, including when the depth never returns to zero
        """
        depth = 0
        last = len(val) - 1
        for index, char in enumerate(val):
            if char == '{':
                depth += 1
            elif char == '}':
                depth -= 1
            if depth == 0:
                return index == last
        # Depth never came back to zero: treated as spanning, unless empty.
        return bool(val)

    def _string_subst(self, val):
        """ Substitute string definitions

        The value is looked up once, case-insensitively, among the known
        ``@string`` definitions; the substituted text is not looked up again.

        :param val: a value
        :type val: string
        :returns: string -- value
        """
        logger.debug('Substitute string definitions')
        if not val:
            return ''
        val = self.bib_database.strings.get(val.lower(), val)
        if not isinstance(val, ustr):
            val = ustr(val, self.encoding, 'ignore')

        return val

    def _string_subst_partial(self, val):
        """ Substitute string definitions inside larger expressions

        :param val: a value
        :type val: string
        :returns: string -- value
        """
        def repl(m):
            k = m.group('id')
            replacement = self.bib_database.strings.get(k.lower(), k)
            # Add a quote on each side that does not already have one.
            pre = '"' if m.group('pre') != '"' else ''
            post = '"' if m.group('post') != '"' else ''
            return pre + replacement + post

        logger.debug('Substitute string definitions inside larger expressions')
        if '#' not in val:
            return val

        # TODO?: Does not match two subsequent variables or strings, such as "start" # foo # bar # "end" or "start" # "end".
        # TODO: Does not support braces instead of quotes, e.g.: {start} # foo # {bar}
        # TODO: Does not support strings like: "te#s#t"
        return self.replace_all_re.sub(repl, val)

    def _add_val(self, val):
        """ Clean instring before adding to dictionary

        :param val: a value
        :type val: string
        :returns: string -- value
        """
        if not val or val == "{}":
            return ''
        val = self._strip_braces(val)
        val = self._strip_quotes(val)
        val = self._strip_braces(val)
        val = self._string_subst(val)
        return val

    def _add_key(self, key):
        """ Add a key and homogenise alternative forms.

        The key is stripped of whitespace and ``@`` and lowercased; with
        `homogenise_fields` enabled it is also mapped through `alt_dict`.

        :param key: a key
        :type key: string
        :returns: string -- value
        """
        key = key.strip().strip('@').lower()
        if self.homogenise_fields:
            key = self.alt_dict.get(key, key)
        if not isinstance(key, ustr):
            return ustr(key, 'utf-8')
        return key