From 3f4f8292ca5cd84ffffdcc8cae2a01975e512c77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gon=C3=A7alo=20Silva?= Date: Sat, 8 Jun 2019 03:02:47 +0100 Subject: [PATCH] Use markup regexp for character count Deferring to Pandoc is not without its faults. It still requires processing (eg. horizontal rules turning into 72 dashes). It is significantly slower and resource hungry. On the reverse, the markup regexps have improved over time and are able to handle the task. --- uberwriter/markup_regex.py | 2 +- uberwriter/stats_counter.py | 28 ++++++++++++++++++++-------- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/uberwriter/markup_regex.py b/uberwriter/markup_regex.py index 8738843..7b2e9c0 100644 --- a/uberwriter/markup_regex.py +++ b/uberwriter/markup_regex.py @@ -31,6 +31,6 @@ TABLE = re.compile( MATH = re.compile( r"([$]{1,2})[^` ](?P<text>.+?)[^`\\ ]\1") FOOTNOTE_ID = re.compile( - r"[^\s]+\[\^(?P<id>[^\s]+)\]") + r"[^\s]+\[\^(?P<text>(?P<id>[^\s]+))\]") FOOTNOTE = re.compile( r"(?:^\n*|\n\n)\[\^(?P<id>[^\s]+)\]: (?P<text>(?:[^\n]+|\n+(?=(?:\t| {4})))+)(?:\n+|$)", re.M) diff --git a/uberwriter/stats_counter.py b/uberwriter/stats_counter.py index adeac08..ec54095 100644 --- a/uberwriter/stats_counter.py +++ b/uberwriter/stats_counter.py @@ -1,21 +1,19 @@ -import math import re from queue import Queue from threading import Thread from gi.repository import GLib -from uberwriter import helpers +from uberwriter.markup_regex import ITALIC, BOLD_ITALIC, BOLD, STRIKETHROUGH, IMAGE, LINK, \ + HORIZONTAL_RULE, LIST, MATH, TABLE, CODE_BLOCK, HEADER_UNDER, HEADER, BLOCK_QUOTE, ORDERED_LIST, \ + FOOTNOTE_ID, FOOTNOTE class StatsCounter: """Counts characters, words, sentences and read time using a background thread.""" - # Regexp that matches characters, with the following exceptions: - # * Newlines - # * Sequential spaces - # * Sequential dashes - CHARACTERS = re.compile(r"[^\s-]|(?:[^\S\n](?!\s)|-(?![-\n]))") + # Regexp that matches any character, except for newlines and subsequent spaces.
+ CHARACTERS = re.compile(r"[^\s]|(?:[^\S\n](?!\s))") # Regexp that matches Asian letters, general symbols and hieroglyphs, # as well as sequences of word characters optionally containing non-word characters in-between. @@ -28,6 +26,17 @@ class StatsCounter: # Regexp that matches paragraphs, ie. anything separated by newlines. PARAGRAPHS = re.compile(r".+\n?") + # List of regexp whose matches should be replaced by their "text" group. Order is important. + MARKUP_REGEXP_REPLACE = ( + BOLD_ITALIC, ITALIC, BOLD, STRIKETHROUGH, IMAGE, LINK, LIST, ORDERED_LIST, BLOCK_QUOTE, + HEADER, HEADER_UNDER, CODE_BLOCK, TABLE, MATH, FOOTNOTE_ID, FOOTNOTE + ) + + # List of regexp whose matches should be removed. Order is important. + MARKUP_REGEXP_REMOVE = ( + HORIZONTAL_RULE, + ) + def __init__(self): super().__init__() @@ -59,7 +68,10 @@ class StatsCounter: if self.queue.empty(): break - text = helpers.pandoc_convert(text, to="plain") + for regexp in self.MARKUP_REGEXP_REPLACE: + text = re.sub(regexp, r"\g<text>", text) + for regexp in self.MARKUP_REGEXP_REMOVE: + text = re.sub(regexp, "", text) character_count = len(re.findall(self.CHARACTERS, text))